文本词频统计(hash)
文本单词词频统计
0x00 问题描述
0x01 代码实现
字符串为关键字的hash函数,线性探测的冲突解决方案,tablesize是给定的下一个素数
#include <stdio.h>
#include <stdlib.h>
#define ElementType char*
#define word_max_len 20
typedef struct hashtable* HashTable;
struct hashtable{
int size;
int* flag;
ElementType* data;
};
int nextprime(int a){
int i,j,k;
for(i=a;;i++){
int flag=1;
for(j=2;j<i;j++){
if(i%j==0){
flag=0;
break;
}
}
if(flag){
return i;
}
}
}
HashTable initTable(int tablesize){
HashTable hs=NULL;//局部变量不会自动初始化,所以必须置NULL;
if(tablesize<10){
printf("tablesize太小,建议用数组!\n");
}else{
hs=(HashTable)malloc(sizeof(struct hashtable));
if(hs){
hs->size=nextprime(tablesize);
hs->data=(ElementType*)malloc(sizeof(ElementType)*hs->size);
hs->flag=(int*)malloc(sizeof(int)*hs->size);
if(hs->data&&hs->flag){
int i;
for(i=0;i<hs->size;i++){
hs->flag[i]=0;
}
}else{
printf("空间溢出\n");
}
}else{
printf("空间溢出\n");
}
}
return hs;
}
int hash(int tablesize,ElementType data){
unsigned int sum=0;//必须是unsigned int 不然会变成负数。
while(*data){
sum=(sum<<5)+(int)(*data);
data=data+1;
}
return sum%tablesize;
}
int find(HashTable hs,ElementType data){
int hash_pos=hash(hs->size,data);
while(hs->flag[hash_pos]!=0&&strcmp(hs->data[hash_pos],data)){//strcmp
hash_pos++;//使用最简单的线性探测
hash_pos%=hs->size;
}
return hash_pos;
}
int findx(HashTable hs,ElementType data){
int pos=find(hs,data);
if(hs->flag[pos]>0){
return pos;
}else return -1;
}
void insert(HashTable hs,ElementType data){
int pos=find(hs,data);
if(hs->flag[pos]<1){
hs->data[pos]=data;
hs->flag[pos]=1;
}else{
hs->flag[pos]++;
}
}
int get_a_word(FILE*fp,char*buff,int size){
int i=0;
char c;
while( (c=fgetc(fp))!=EOF&&c!=95&&(c<65||(c>90&&c<97)||c>122) ){
}
if(c!=EOF){
buff[i++]=c;
while( (c=fgetc(fp))!=EOF&&( c==95||(c>=65&&c<=90)||(c>=97&&c<=122) ) ){
buff[i++]=c;
if(i==size-1){
break;
}
}
}
buff[i]=0;
return i;
}
void insert_sort(HashTable hs,int N){
int i,j;
for(i=1;i<N;i++){
ElementType tmp=hs->data[i];
int tmp1=hs->flag[i];
for(j=i;j>0&&tmp1>hs->flag[j-1];j-- ){
hs->data[j]=hs->data[j-1];
hs->flag[j]=hs->flag[j-1];
}
hs->data[j]=tmp;
hs->flag[j]=tmp1;
}
}
void print_hash(HashTable hs,int N){
int i=0;
for(i=0;i<N;i++){
if(hs->flag[i]){
printf("%s\t%d\n",hs->data[i],hs->flag[i]);
free(hs->data[i]);//打印完后即free掉空间
}
};
free(hs->data);
free(hs);
}
int main(void){
HashTable hs=initTable(2000);
FILE* fp=fopen("d://test.txt","r");
if(fp){
while(1){
char*str=(char*)malloc(sizeof(char)*word_max_len);
int len=get_a_word(fp,str,word_max_len);
if(len>=3) insert(hs,str);
else if(len==0){
free(str);
break;
}
}
insert_sort(hs,hs->size);
print_hash(hs,hs->size);
}
fclose(fp);
}
以上代码编译通过