distance.c

  1 #include "stdio.h"
  2 #include "string.h"
  3 #include "math.h"
  4 #include "malloc.h"
  5
  6 const long long Max_size = 2000;// maximum length of the typed query string (one word or several words); main also uses it to size its per-word buffers and the vec[] accumulator
  7 const long long N = 40;// number of closest words printed for a query
  8 const long long Max_w = 50;// maximum length of a single vocabulary entry; also the stride between entries in the vocab buffer
  9
 10 int main(int argc,char **argv)
 11 {
 12     FILE *f;//读取的文件指针
 13     char stemp[Max_size];//中间变量
 14     char *bestw[N];//存储与某个词最接近N个词条
 15     char ch;
 16     float *M;//存储所有词条的距离相关信息
 17     char *vocab;//存储所有词条的字符信息
 18 /*
 19 *file_name[Max_size];存放要读取内容的文件名;
 20 *st[100][Max_size];//二维数组,中间变量
 21 *dist:距离;len:长度;bestd[N]:存储与某个词最接近的N个词的距离
 22 *vec[Max_size]:存储Max_size个词与某个指定词的距离
 23 *words:词条的总数目;size:词条表示的维数;
 24 *key[100],存储某个词条在总词条中的位置下标
 25 */
 26     char file_name[Max_size],st[100][Max_size];
 27     float dist,len,bestd[N],vec[Max_size];
 28     long long words,size,i,j,k,l,num,key[100];
 29
 30     if(argc<2)//打印程序所在路径
 31     {
 32         printf("Usage:./distance<FILE>\nwhere FILE contains word projections in the  BINARY FORMAIN\n");
 33         return 0;
 34     }
 35
 36     strcpy(file_name,argv[1]);//argv[1]中存放的文件名赋给file_name
 37     f  = fopen(file_name,"rb");
 38
 39     if(f == NULL)//文件打开失败
 40     {
 41         printf("Input file not found\n");
 42         return -1;
 43     }
 44
 45     fscanf(f,"lld",&words);//输入总词条数组
 46     fscanf(f,"lld",&size);//输入用来表示每个词条的维数大小
 47
 48     vocab = (char *)malloc((long long)words * Max_w *sizeof(char));
 49
 50     //根据实际的维数分配存储块大小
 51     for(i = 0; i < N; i++)bestw[i] = (char *)malloc(Max_size *  sizeof(char));
 52
 53     M = (float *)malloc((long long)words * (long long)size *sizeof(float));
 54     if(M==NULL)//不能分配所需要大小的内存
 55     {
 56         printf("Cannot allocate memory:%lld MB %lld %lld\n",(long long )words *size*sizeof(float)/1024/1024,words,size);
 57         return -1;
 58     }
 59
 60     for(j = 0; j < words; j++)//words个词条每个词条坐标进行归一化
 61     {
 62         i = 0;
 63         while(true)
 64         {
 65             vocab[j*Max_w + i] = fgetc(f);
 66             if(feof(f) || vocab[j * Max_w + i] == ‘ ‘)break;
 67             if((i <Max_w)&&(vocab[j * Max_w + i] !=‘\n‘))i++;
 68         }
 69         vocab[j * Max_w + i] = 0;
 70         for(i = 0;i < size ; i++ ) fread(&M[j * size + i],sizeof(float),1,f);
 71         len = 0;
 72         for(i = 0 ; i < size ; i++ ) len += M[j * size + i]*M[j * size + i];
 73         len = sqrt(len);
 74         for(i = 0 ; i < size ; i++ )  M[j * size + i] = M[j * size + i] / len; //坐标归一化
 75     }
 76     fclose(f);
 77
 78     while(true)
 79     {
 80         for(i = 0 ; i < N ; i++ )
 81         {
 82             bestd[i] = 0;//N个词条距离初始为0
 83             bestw[i][0] = 0 ;//初始化
 84         }
 85
 86         printf("Entor word or sentence(EXIT to break:)");
 87
 88         i = 0;
 89         while(true)//键盘读入单个词条或多个词条  ;当字符数组超过Max_size -1 或者遇到回车符结束
 90         {
 91             stemp[i] = fgetc(stdin);
 92             if((stemp[i] == ‘\n‘)||(i >=Max_size -1))
 93             {
 94                 stemp[i] = 0;
 95                 break;
 96             }
 97         }
 98         if(!strcmp(stemp,"EXIT"))break;//如果输入的是"EXIT",则退出
 99
100         num = 0 ;//用来统计键盘输入词条的数目
101         j = 0;
102         k = 0;
103         while(true)
104         {
105             st[num][j++] = stemp[k++];
106             st[num][j] = 0;
107             if(stemp[k] ==0)break;//读完结束
108             if(stemp[k++] == ‘ ‘)//遇到空格 词条数目+1
109             {
110                 num++;//词条数目+1
111                 j = 0;
112             }
113         }
114
115         num++;//词条的总数目
116
117         //找出每个词条 最接近的N个词条
118         for(i = 0 ; i < num ; i++)
119         {
120             for(j = 0 ; j < words ;j++)if(!strcmp(&vocab[j * Max_w],st[j]))break;//在总词条中找到这个词条,获得这个词条在总词条中的位置
121             if(j == words)  j = -1;//这个词条不存在
122             key[i] = j;
123             printf("\nWord:%s Position in vocabulary: %lld\n",st[i],key[i]);
124
125             if(j == -1)//这个词条不在这个词汇表中
126             {
127                 printf("Out of dictionary word!\n");
128                 break;//终止循环
129             }
130         }
131
132         if(j==-1)continue;//继续执行
133
134          printf("\n                                              Word       Cosine distance\n");
135          printf("------------------------------------------------------------------------\n");
136
137          for(i = 0 ; i < size ; i++ ) vec[i] = 0;//距离初始化为0
138          for(j = 0 ; j < num ; j++ ) //遍历每个词,如果输入多个词向量vec[i]是各个词向量的累加和
139          {
140              if(key[j] == -1)continue;
141              for(i = 0 ; i < size ; i++ ) vec[i] += M[i + key[j]*size];
142          }
143
144          len = 0;
145          for(i = 0 ; i< size ; i++) len +=vec[i] * vec[i];
146          len = sqrt(len);
147
148          for(i = 0; i < size ; i++) vec[i] = vec[i] / len;//将vec归一化,当只输入一个词时,不起作用
149
150          for(i = 0 ; i < N ; i++)
151          {
152              bestd[i] = -1;
153              bestw[i][0] = 0;
154          }
155
156          //由于查询词和词汇表都做了归一化,所以余弦相似度等价于向量的内积,内机越大越相似
157          for(k = 0 ; k < words ; k++)//遍历词汇表
158          {
159              i = 0;
160              for(j = 0 ; j < num ; j++)//i的作用;如果遍历词和查询词相同,则跳过此词
161              {
162                  if(key[j] == k) i =1;
163              }
164              if(i == 1) continue;
165              dist = 0;
166
167              for(i = 0 ; i < N ; i++ )
168              {
169                  dist += vec[i] * M[k * size + i] ;
170              }
171
172              for(i = 0 ; i < N ; i++ )
173              {
174                  if(dist > bestd[i])
175                  {
176                      for(j = N-1 ; j > i ; j--)
177                      {
178                          bestd[j] = bestd[j - 1 ];
179                          strcpy(bestw[j],bestw[j-1]);
180                      }
181                      bestd[j] = dist;
182                      strcpy(bestw[i] , &vocab[k * Max_size]);
183                      break;
184                  }
185              }
186          }
187          for(i = 0 ; i < N ; i++ )printf("%50s\t\t%f\n",bestw[i],bestd[i]);
188     }
189     return 0;
190 }
时间: 2024-08-28 03:35:51

distance.c的相关文章

461.求两个数字转成二进制后的“汉明距离” Hamming Distance

public class Solution { public int HammingDistance(int x, int y) { int distance = 0; string sX = Convert.ToString(x, 2); string sY = Convert.ToString(y, 2); int maxLength = Math.Max(sX.Length, sY.Length); //填充0,使两个字符串右对齐 sX = sX.PadLeft(maxLength, '0

LeetCode 72 Edit Distance

Given two words word1 and word2, find the minimum number of steps required to convert word1 to word2. (each operation is counted as 1 step.) You have the following 3 operations permitted on a word: a) Insert a character b) Delete a character c) Repla

461. Hamming Distance

The Hamming distance between two integers is the number of positions at which the corresponding bits are different. Given two integers x and y, calculate the Hamming distance. Note:0 ≤ x, y < 2^31. Example: Input: x = 1, y = 4 Output: 2 Explanation:

搬土距离(Earth Mover's Distance)

搬土距离(The Earth Mover's Distance,EMD)最早由Y. Rubner在1999年的文章《A Metric for Distributions with Applications to Image Databases》中提出,它是归一化的从一个分布变为另一个分布的最小代价,因此可用于表征两个分布之间的距离。例如,对于图像而言,它可以看做是由色调、饱和度、亮度三个分量组成,每个分量的直方图就是一个分布。不同的图像对应的直方图不同,因此图像之间的距离可以用直方图的距离表

[Locked] One Edit Distance

One Edit Distance Given two strings S and T, determine if they are both one edit distance apart. 分析: 编辑距离复杂度为O(MN),而本题显然不能用这么高的复杂度:首先,可以通过判断两个字符串是否等长来决定用增一位.减一位.替换一位这三种方法之一来使得两个字符串等同,如果都不行,就return false:然后同时遍历S和T,第一次遇到不匹配的,就用刚才判断出的方法拯救一下:第二次还遇到不匹配的,就

I - Long Distance Racing(第二季水)

Description Bessie is training for her next race by running on a path that includes hills so that she will be prepared for any terrain. She has planned a straight path and wants to run as far as she can -- but she must be back to the farm within M se

[LeetCode] One Edit Distance 一个编辑距离

Given two strings S and T, determine if they are both one edit distance apart. 这道题是之前那道Edit Distance的拓展,然而这道题并没有那道题难,这道题只让我们判断两个字符串的编辑距离是否为1,那么我们只需分下列三种情况来考虑就行了: 1. 两个字符串的长度之差大于1,那么直接返回False 2. 两个字符串的长度之差等于1,那么长的那个字符串去掉一个字符,剩下的应该和短的字符串相同 3. 两个字符串的长度之

[ACM] POJ 2689 Prime Distance (筛选范围大素数)

Prime Distance Time Limit: 1000MS   Memory Limit: 65536K Total Submissions: 12811   Accepted: 3420 Description The branch of mathematics called number theory is about properties of numbers. One of the areas that has captured the interest of number th

HDU 4712 Hamming Distance (随机函数)

Hamming Distance Time Limit: 6000/3000 MS (Java/Others)    Memory Limit: 65535/65535 K (Java/Others)Total Submission(s): 1806    Accepted Submission(s): 714 Problem Description (From wikipedia) For binary strings a and b the Hamming distance is equal