文本词频统计(hash)

文本单词词频统计

0x00 问题描述

QQ截图20200218135217

0x01 代码实现

​ 字符串为关键字的hash函数,线性探测的冲突解决方案,tablesize是给定的下一个素数

#include <stdio.h>
#include <stdlib.h>
#define ElementType char*
#define word_max_len 20
typedef struct hashtable* HashTable;
struct hashtable{
	int size;
	int* flag;
	ElementType* data;
};
int nextprime(int a){
	int i,j,k;
	for(i=a;;i++){
		int flag=1;
		for(j=2;j<i;j++){
			if(i%j==0){
				flag=0;
				break;
			}
		}
		if(flag){
			return i;
		}
	}
}
HashTable initTable(int tablesize){
	HashTable hs=NULL;//局部变量不会自动初始化,所以必须置NULL; 
	if(tablesize<10){
		printf("tablesize太小,建议用数组!\n");
	}else{
		hs=(HashTable)malloc(sizeof(struct hashtable));
		if(hs){
			hs->size=nextprime(tablesize);
			hs->data=(ElementType*)malloc(sizeof(ElementType)*hs->size);
			hs->flag=(int*)malloc(sizeof(int)*hs->size);
			if(hs->data&&hs->flag){
				int i;
				for(i=0;i<hs->size;i++){
					hs->flag[i]=0;
				}					
			}else{
				printf("空间溢出\n");
			}
		}else{
			printf("空间溢出\n");
		}
	}
	return hs;
}
int hash(int tablesize,ElementType data){

	unsigned int sum=0;//必须是unsigned int 不然会变成负数。 
	while(*data){
		sum=(sum<<5)+(int)(*data);
		data=data+1;
	}
	return sum%tablesize;
}
int find(HashTable hs,ElementType data){
	int hash_pos=hash(hs->size,data);
	while(hs->flag[hash_pos]!=0&&strcmp(hs->data[hash_pos],data)){//strcmp
		hash_pos++;//使用最简单的线性探测
		hash_pos%=hs->size;
	}
	return hash_pos;
}
int findx(HashTable hs,ElementType data){
	int pos=find(hs,data);
	if(hs->flag[pos]>0){
		return pos;
	}else return -1;
}
void insert(HashTable hs,ElementType data){
	int pos=find(hs,data);
	if(hs->flag[pos]<1){
		hs->data[pos]=data;
		hs->flag[pos]=1;
	}else{
		hs->flag[pos]++;
	}
}
int get_a_word(FILE*fp,char*buff,int size){
	int i=0;
	char c;
	while( (c=fgetc(fp))!=EOF&&c!=95&&(c<65||(c>90&&c<97)||c>122) ){
		
	}
	if(c!=EOF){
		buff[i++]=c;
	
		while( (c=fgetc(fp))!=EOF&&( c==95||(c>=65&&c<=90)||(c>=97&&c<=122) ) ){
			buff[i++]=c;
			if(i==size-1){
				break;
			}
		}
	}
	buff[i]=0;
	return i;
}
void insert_sort(HashTable hs,int N){
	int i,j;
	for(i=1;i<N;i++){
		ElementType tmp=hs->data[i];
		int tmp1=hs->flag[i];
		for(j=i;j>0&&tmp1>hs->flag[j-1];j-- ){
			hs->data[j]=hs->data[j-1];
			hs->flag[j]=hs->flag[j-1];
		}
		hs->data[j]=tmp;
		hs->flag[j]=tmp1;
	} 


}
void print_hash(HashTable hs,int N){
	int i=0;
	for(i=0;i<N;i++){
		if(hs->flag[i]){
            printf("%s\t%d\n",hs->data[i],hs->flag[i]);
            free(hs->data[i]);//打印完后即free掉空间
        } 
	};
    free(hs->data);
    free(hs);
}
int main(void){
	HashTable hs=initTable(2000);
	FILE* fp=fopen("d://test.txt","r");
	if(fp){
		while(1){
			char*str=(char*)malloc(sizeof(char)*word_max_len);
			int len=get_a_word(fp,str,word_max_len);
			if(len>=3) insert(hs,str);
			else if(len==0){
				free(str);
				break;
			}
		}
		insert_sort(hs,hs->size);
		print_hash(hs,hs->size);
		 
	}
	fclose(fp);
} 

以上代码编译通过