求文件中单词出现的频数

来源：互联网　发布：linux查看dns　编辑：程序博客网　时间：2024/06/09 13:45

一、主体思路：

(1)建立一个hash表；

--(a) hash函数：先以31为乘子把单词的各个字符累加成一个无符号整数key，再用除留余数法取模，H(key) = key % size；

--(b) 解决冲突的方法：链地址法，将所有映射到相同索引的字符串用链接指针连接在一起。

(2)读取文本文件data.txt，每次读取一行，然后分隔出每个单词，插入到hash表，插入过程中会统计单词出现的次数；

(3)将整个hash表内容写到一个文本文件result.txt中。

二、数据结构及算法实现

(1)hash表

--(a) hash表大小(HASHNUMBER)：采用一个大质数作为表的总容量，本例中HASHNUMBER为29989；

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define HASHNUMBER 29989 // size of the hash table; 29989 is a prime number.
#define MULT 31 // multiplier used by the hash function.

/*
 * One node of a bucket's singly linked chain.
 *
 * Every word that hashes to the same table index is stored in its own node,
 * and the nodes of one bucket are linked through `next`.
 *
 * There is deliberately no constructor: nodes are created with malloc(),
 * which never runs one, and a member function is not valid C. The code that
 * allocates a node is responsible for setting every field.
 */
typedef struct hashnode
{
    char *word;            /* the word stored in this node (NUL-terminated) */
    int count;             /* number of times the word has been seen */
    struct hashnode *next; /* next node with the same hash index, or NULL */
} hashNode, *hashNodePtr;

hashNodePtr bin[HASHNUMBER] = { NULL};// the hash table: an array of HASHNUMBER chain-head pointers, all initially NULL.

--(b) hash函数：将每个单词映射为一个小于HASHNUMBER的非负整数；

     /*
      * Compute the hash-table index (a subscript into the bucket array) for a word.
      *
      * The characters are folded left to right into an unsigned accumulator,
      * multiplying by MULT before adding each character; unsigned overflow
      * simply wraps. The accumulator is then reduced modulo HASHNUMBER.
      *
      * pWord: NUL-terminated word; must not be NULL (checked with assert).
      * Returns a value in the range [0, HASHNUMBER).
      */
     unsigned int hashIndex(const char * pWord)
     {
         assert(pWord != NULL);

         unsigned int acc = 0;
         const char *cur = pWord;

         /* Walk the string up to, but not including, the terminating NUL. */
         while (*cur != '\0') {
             acc = MULT * acc + *cur;
             ++cur;
         }

         return acc % HASHNUMBER;
     }

--(c) 向hash表中插入单词。

void insertWord (const char * pWord )// 在hash 表中插入单词，如果已经存在了，则增加单词的出现次数count 。

{

assert (pWord != NULL );

hashNodePtr p ;

unsigned int index = hashIndex (pWord );//用(b)中的hash函数得到单词在hash表中的下标。

for (p =bin [index ];p != NULL ;p = p ->next )

{// 查找是否单词已经在hash 表中了。

if (strcmp (pWord ,p ->word ) == 0)

{// 找到，将单词出现次数加.

(p ->count )++;

return ;

}

p = (hashNodePtr )malloc (sizeof (hashNode ));//hash 表中不存在该单词，创建节点。

p ->count = 1;//出现次数设置为1。

p ->word = (char *)malloc (strlen (pWord )+1);

strcpy (p ->word ,pWord );

p ->next = bin [index ];//将新生成的节点插入到index为下标的链表中去。

bin [index ] = p ;

}

(2)读取data.txt中的单词，并将每个单词插入到(1)中设计好的hash表中。

/*
 * Read a text file and insert every word it contains into the hash table.
 *
 * The file is read line by line with fgets(). Each line is split with
 * strtok() on space, comma, tab, carriage return and newline, and every
 * resulting token is passed to insertWord().
 *
 * path: name of the text file to read.
 *
 * If the file cannot be opened, a message is printed and the program exits
 * with status -1.
 *
 * Note: a line longer than the buffer is read in several pieces, so a word
 * that straddles a piece boundary is counted as two tokens.
 */
void readWordToHashTable (const char *path )
{
    /*
     * strtok() treats every character of this string as a separator, so it
     * must list only the separator characters themselves. The newline left
     * in the buffer by fgets() is a separator too, which means the line does
     * not need to be trimmed and blank lines simply yield no tokens.
     */
    static const char delimiters[] = " ,\t\r\n";
    char buf[1024]; /* holds one line (or one piece of a long line) */

    FILE *fp = fopen(path, "r");
    if (fp == NULL) {
        printf("open file error!exit\n");
        exit(-1);
    }

    while (fgets(buf, sizeof(buf), fp) != NULL) {
        for (char *p = strtok(buf, delimiters);
             p != NULL;
             p = strtok(NULL, delimiters)) {
            insertWord(p);
        }
    }

    /* Close the file only once, after the whole file has been read. */
    fclose(fp);
}

(3) 将hash表中对每个单词的统计信息写入到文本文件中。

/*
 * Write the contents of the hash table to a text file.
 *
 * Buckets are visited in index order. Every node is written as
 * "Index <i>: <word,count>", and the nodes of one bucket share a single
 * output line: the newline is emitted after the last node of the chain.
 * Empty buckets produce no output.
 *
 * path: name of the file to create or overwrite.
 *
 * If the file cannot be opened, or closing it reports an error, a message is
 * printed and the program exits with status -1.
 */
void writeHashTable (const char *path )
{
    FILE *fp = fopen(path, "w");
    if (fp == NULL) {
        printf("write file error! exit");
        exit(-1);
    }

    for (int i = 0; i < HASHNUMBER; i++) {
        for (hashNodePtr p = bin[i]; p != NULL; p = p->next) {
            fprintf(fp, "Index %d: <%s,%d>", i, p->word, p->count);

            /* End the line after the last node of this bucket. */
            if (p->next == NULL) {
                fprintf(fp, "\n");
            }
        }
    }

    /*
     * Close the file once, after every bucket has been written. fclose()
     * flushes buffered output, so its result tells whether the data was
     * actually written.
     */
    if (fclose(fp) != 0) {
        printf("write file error! exit");
        exit(-1);
    }
}

(4)释放hash表中占用的内存

/*
 * Release all memory held by the hash table.
 *
 * For every bucket, each node's word string and then the node itself are
 * freed. The bucket head is reset to NULL afterwards so the table is left
 * empty and usable: calling this function again, or inserting new words,
 * does not touch freed memory.
 */
void freeHashTable()
  {
    for (int i = 0; i < HASHNUMBER; i++)
    {
        hashNodePtr p = bin[i];
        while (p != NULL)
        {
            hashNodePtr q = p;
            p = p->next; /* advance before the current node is freed */
            free(q->word);
            free(q);
        }
        bin[i] = NULL; /* do not leave a dangling chain head behind */
    }
  }

(5) main函数中。

/*
 * Program entry point: count the words of "data.txt", write the per-word
 * statistics to "result.txt", then release the hash table.
 */
int main (void )
{
    readWordToHashTable("data.txt");

    writeHashTable("result.txt");

    /* Free every node before exiting instead of leaking the table. */
    freeHashTable();

    return 0;
}