├── README.md ├── distancecli ├── distancecli.c ├── makefile ├── phpword2vec.php └── w2v └── trunk ├── .svn ├── entries ├── format ├── pristine │ ├── 13 │ │ └── 13294f538c32fae2da1c85726695d77d70980247.svn-base │ ├── 21 │ │ └── 210681519593463cd6742bbec2abc1a253932bb1.svn-base │ ├── 23 │ │ └── 2334b431b808544014e14d0ddbb66ccb03d13277.svn-base │ ├── 51 │ │ └── 518a7ad549627c6ef8cf05b49408fcf0f6157460.svn-base │ ├── 72 │ │ └── 724bf0b7fd08d78098c1ccc622ada62ad58093ba.svn-base │ ├── 80 │ │ └── 80740eb5930e039b8002a6c2213cd152847a4169.svn-base │ ├── 83 │ │ └── 83a04fdb0a7cc66001a1abe29157acbe28321564.svn-base │ ├── 91 │ │ └── 91063b176c2f3543afd684071bf4677203917a52.svn-base │ ├── 2b │ │ └── 2b8b815229aa8a61e483fb4ba0588b8b6c491890.svn-base │ ├── 4e │ │ └── 4ea10e60b208f31ae965718f905268ac42fbf1ac.svn-base │ ├── 6f │ │ └── 6ffd58121b45291fcc42a5484d2e3f1ef1156b0d.svn-base │ ├── 8c │ │ └── 8ccd7b8850b84c7d306aebd933c2f1a26d264320.svn-base │ ├── 9a │ │ └── 9a7277255e393a35ce6a0738867c29304f43b55c.svn-base │ ├── c7 │ │ └── c7b37d6aa035fe7b53a54351b44ab577d2fd3337.svn-base │ ├── ea │ │ └── ea5f636000c445177e5f2f14af11f716b1e91bd0.svn-base │ ├── f4 │ │ └── f4f8420f4ff647df0f4196ceee895888fb7f63f7.svn-base │ └── fa │ │ └── fa92df4bbe788f2d51827c762c63bd8e470edf31.svn-base └── wc.db ├── LICENSE ├── README.txt ├── compute-accuracy.c ├── demo-analogy.sh ├── demo-classes.sh ├── demo-phrase-accuracy.sh ├── demo-phrases.sh ├── demo-train-big-model-v1.sh ├── demo-word-accuracy.sh ├── demo-word.sh ├── distance.c ├── makefile ├── questions-phrases.txt ├── questions-words.txt ├── vectors.bin ├── word-analogy.c ├── word2phrase.c └── word2vec.c /README.md: -------------------------------------------------------------------------------- 1 | # phpword2vec 2 | php调用word2vec实现机器学习 3 | ### 使用方法 4 | 执行make进行编译 5 | 执行phpphpword2vec.php可以得到当前关键词的文档向量(该工具是把300维向量转化文档向量的工具) 6 | php直接调用然后可以进行svm等分类操作 7 | 该工具在已经有训练数据后调用 8 | ### 项目地址 9 | github:https://github.com/qieangel2013/phpword2vec 10 | oschina:https://gitee.com/qieangel2013/phpword2vec 11 | ### 如果你对我的辛勤劳动给予肯定,请给我捐赠,你的捐赠是我最大的动力 12 | ![](https://github.com/qieangel2013/zys/blob/master/public/images/pw.jpg) 13 | ![](https://github.com/qieangel2013/zys/blob/master/public/images/pay.png) 14 | [项目捐赠列表](https://github.com/qieangel2013/zys/wiki/%E9%A1%B9%E7%9B%AE%E6%8D%90%E8%B5%A0) 15 | 16 | 17 | -------------------------------------------------------------------------------- /distancecli: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qieangel2013/phpword2vec/e12958b7bd6f5382ec1e14da3759f904dd5fe3e3/distancecli -------------------------------------------------------------------------------- /distancecli.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char *bestw[N]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn,mmn, bi[100]; 31 | //char ch; 32 | float *M; 33 | char *vocab; 34 | if (argc < 2) { 35 | printf("Usage: ./distance \nwhere FILE contains word projections in the BINARY FORMAT\n"); 36 | return 0; 37 | } 38 | strcpy(file_name, argv[1]); 39 | f = fopen(file_name, "rb"); 40 | if (f == NULL) { 41 | printf("Input file not found\n"); 42 | return -1; 43 | } 44 | fscanf(f, "%lld", &words); 45 | fscanf(f, "%lld", &size); 46 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 47 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 48 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 49 | if (M == NULL) { 50 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 51 | return -1; 52 | } 53 | for (b = 0; b < words; b++) { 54 | a = 0; 55 | while (1) { 56 | vocab[b * max_w + a] = fgetc(f); 57 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 58 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 59 | } 60 | vocab[b * max_w + a] = 0; 61 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 62 | len = 0; 63 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 64 | len = sqrt(len); 65 | for (a = 0; a < size; a++) M[a + b * size] /= len; 66 | } 67 | fclose(f); 68 | //while (1) { 69 | for (a = 0; a < N; a++) bestd[a] = 0; 70 | for (a = 0; a < N; a++) bestw[a][0] = 0; 71 | //printf("Enter word or sentence (EXIT to break): "); 72 | a = 0; 73 | //while (1) { 74 | //st1[a] = fgetc(stdin); 75 | for (mmn = 2; mmn < sizeof(argv); mmn++) { 76 | strcpy(st1,argv[2]); 77 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 78 | st1[a] = 0; 79 | break; 80 | } 81 | a++; 82 | } 83 | //if (!strcmp(st1, "EXIT")) break; 84 | cn = 0; 85 | b = 0; 86 | c = 0; 87 | while (1) { 88 | st[cn][b] = st1[c]; 89 | b++; 90 | c++; 91 | st[cn][b] = 0; 92 | if (st1[c] == 0) break; 93 | if (st1[c] == ' ') { 94 | cn++; 95 | b = 0; 96 | c++; 97 | } 98 | } 99 | cn++; 100 | for (a = 0; a < cn; a++) { 101 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 102 | if (b == words) b = -1; 103 | bi[a] = b; 104 | //printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 105 | if (b == -1) { 106 | //printf("Out of dictionary word!\n"); 107 | break; 108 | } 109 | } 110 | //if (b == -1) continue; 111 | //printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); 112 | for (a = 0; a < size; a++) vec[a] = 0; 113 | for (b = 0; b < cn; b++) { 114 | if (bi[b] == -1) continue; 115 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 116 | } 117 | len = 0; 118 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 119 | len = sqrt(len); 120 | for (a = 0; a < size; a++) vec[a] /= len; 121 | for (a = 0; a < N; a++) bestd[a] = -1; 122 | for (a = 0; a < N; a++) bestw[a][0] = 0; 123 | for (c = 0; c < words; c++) { 124 | a = 0; 125 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 126 | if (a == 1) 
continue; 127 | dist = 0; 128 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 129 | for (a = 0; a < N; a++) { 130 | if (dist > bestd[a]) { 131 | for (d = N - 1; d > a; d--) { 132 | bestd[d] = bestd[d - 1]; 133 | strcpy(bestw[d], bestw[d - 1]); 134 | } 135 | bestd[a] = dist; 136 | strcpy(bestw[a], &vocab[c * max_w]); 137 | break; 138 | } 139 | } 140 | } 141 | if(sizeof(bestw)>0 && strlen(bestw[0])!=0){ 142 | for (a = 0; a < N; a++) printf("%s,%f\n", bestw[a], bestd[a]); 143 | }else{ 144 | printf("%s\n", bestw[a]); 145 | } 146 | 147 | //} 148 | return 0; 149 | } 150 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions 3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result 4 | 5 | all: distancecli 6 | 7 | distancecli : distancecli.c 8 | $(CC) distancecli.c -o distancecli $(CFLAGS) 9 | 10 | clean: 11 | rm -rf distancecli -------------------------------------------------------------------------------- /phpword2vec.php: -------------------------------------------------------------------------------- 1 | $v) { 17 | $tmpdata=explode(",",$v); 18 | if($tmpdata[0]==$keyword){ 19 | $resultdata=$tmpdata; 20 | break; 21 | } 22 | } 23 | return $resultdata; 24 | } else { 25 | return array(); 26 | } 27 | } 28 | print_r(distance("警察")); 29 | ?> 30 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/entries: -------------------------------------------------------------------------------- 1 | 12 2 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/format: -------------------------------------------------------------------------------- 1 | 12 2 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/13/13294f538c32fae2da1c85726695d77d70980247.svn-base: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./distance vectors.bin 8 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/21/210681519593463cd6742bbec2abc1a253932bb1.svn-base: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char *bestw[N]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | char ch; 32 | float *M; 33 | char *vocab; 34 | if (argc < 2) { 35 | printf("Usage: ./distance \nwhere FILE contains word projections in the BINARY FORMAT\n"); 36 | return 0; 37 | } 38 | strcpy(file_name, argv[1]); 39 | f = fopen(file_name, "rb"); 40 | if (f == NULL) { 41 | printf("Input file not found\n"); 42 | return -1; 43 | } 44 | fscanf(f, "%lld", &words); 45 | fscanf(f, "%lld", &size); 46 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 47 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 48 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 49 | if (M == NULL) { 50 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 51 | return -1; 52 | } 53 | for (b = 0; b < words; b++) { 54 | a = 0; 55 | while (1) { 56 | vocab[b * max_w + a] = fgetc(f); 57 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 58 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 59 | } 60 | vocab[b * max_w + a] = 0; 61 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 62 | len = 0; 63 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 64 | len = sqrt(len); 65 | for (a = 0; a < size; a++) M[a + b * size] /= len; 66 | } 67 | fclose(f); 68 | while (1) { 69 | for (a = 0; a < N; a++) bestd[a] = 0; 70 | for (a = 0; a < N; a++) bestw[a][0] = 0; 71 | printf("Enter word or sentence (EXIT to break): "); 72 | a = 0; 73 | while (1) { 74 | st1[a] = fgetc(stdin); 75 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 76 | st1[a] = 0; 77 | break; 78 | } 79 | a++; 80 | } 81 | if (!strcmp(st1, "EXIT")) break; 82 | cn = 0; 83 | b = 0; 84 | c = 0; 85 | while (1) { 86 | st[cn][b] = st1[c]; 87 | b++; 88 | c++; 89 | st[cn][b] = 0; 90 | if (st1[c] == 0) break; 91 | if (st1[c] == ' ') { 92 | cn++; 93 | b = 0; 94 | c++; 95 | } 96 | } 97 | cn++; 98 | for (a = 0; a < cn; a++) { 99 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 100 | if (b == words) b = -1; 101 | bi[a] = b; 102 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 103 | if (b == -1) { 104 | printf("Out of dictionary word!\n"); 105 | break; 106 | } 107 | } 108 | if (b == -1) continue; 109 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); 110 | for (a = 0; a < size; a++) vec[a] = 0; 111 | for (b = 0; b < cn; b++) { 112 | if (bi[b] == -1) continue; 113 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 114 | } 115 | len = 0; 116 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 117 | len = sqrt(len); 118 | for (a = 0; a < size; a++) vec[a] /= len; 119 | for (a = 0; a < N; a++) bestd[a] = -1; 120 | for (a = 0; a < N; a++) bestw[a][0] = 0; 121 | for (c = 0; c < words; c++) { 122 | a = 0; 123 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 124 | if (a == 1) continue; 125 | dist = 0; 126 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 127 | 
for (a = 0; a < N; a++) { 128 | if (dist > bestd[a]) { 129 | for (d = N - 1; d > a; d--) { 130 | bestd[d] = bestd[d - 1]; 131 | strcpy(bestw[d], bestw[d - 1]); 132 | } 133 | bestd[a] = dist; 134 | strcpy(bestw[a], &vocab[c * max_w]); 135 | break; 136 | } 137 | } 138 | } 139 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 140 | } 141 | return 0; 142 | } 143 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/23/2334b431b808544014e14d0ddbb66ccb03d13277.svn-base: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | const long long max_size = 2000; // max length of strings 23 | const long long N = 1; // number of closest words 24 | const long long max_w = 50; // max length of vocabulary entries 25 | 26 | int main(int argc, char **argv) 27 | { 28 | FILE *f; 29 | char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch; 30 | float dist, len, bestd[N], vec[max_size]; 31 | long long words, size, a, b, c, d, b1, b2, b3, threshold = 0; 32 | float *M; 33 | char *vocab; 34 | int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0; 35 | if (argc < 2) { 36 | printf("Usage: ./compute-accuracy \nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n"); 37 | return 0; 38 | } 39 | strcpy(file_name, argv[1]); 40 | if (argc > 2) threshold = atoi(argv[2]); 41 | f = fopen(file_name, "rb"); 42 | if (f == NULL) { 43 | printf("Input file not found\n"); 44 | return -1; 45 | } 46 | fscanf(f, "%lld", &words); 47 | if (threshold) if (words > threshold) words = threshold; 48 | fscanf(f, "%lld", &size); 49 | vocab = (char *)malloc(words * max_w * sizeof(char)); 50 | M = (float *)malloc(words * size * sizeof(float)); 51 | if (M == NULL) { 52 | printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576); 53 | return -1; 54 | } 55 | for (b = 0; b < words; b++) { 56 | a = 0; 57 | while (1) { 58 | vocab[b * max_w + a] = fgetc(f); 59 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 60 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 61 | } 62 | vocab[b * max_w + a] = 0; 63 | for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]); 64 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 65 | len = 0; 66 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 67 | len = sqrt(len); 68 | for (a = 0; a < size; a++) M[a + b * size] /= len; 69 | } 70 | fclose(f); 71 | TCN = 0; 72 | while (1) { 73 | for (a = 0; a < N; a++) bestd[a] = 0; 74 | for (a = 0; a < N; a++) 
bestw[a][0] = 0; 75 | scanf("%s", st1); 76 | for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]); 77 | if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) { 78 | if (TCN == 0) TCN = 1; 79 | if (QID != 0) { 80 | printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN); 81 | printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100); 82 | } 83 | QID++; 84 | scanf("%s", st1); 85 | if (feof(stdin)) break; 86 | printf("%s:\n", st1); 87 | TCN = 0; 88 | CCN = 0; 89 | continue; 90 | } 91 | if (!strcmp(st1, "EXIT")) break; 92 | scanf("%s", st2); 93 | for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]); 94 | scanf("%s", st3); 95 | for (a = 0; a bestd[a]) { 122 | for (d = N - 1; d > a; d--) { 123 | bestd[d] = bestd[d - 1]; 124 | strcpy(bestw[d], bestw[d - 1]); 125 | } 126 | bestd[a] = dist; 127 | strcpy(bestw[a], &vocab[c * max_w]); 128 | break; 129 | } 130 | } 131 | } 132 | if (!strcmp(st4, bestw[0])) { 133 | CCN++; 134 | CACN++; 135 | if (QID <= 5) SEAC++; else SYAC++; 136 | } 137 | if (QID <= 5) SECN++; else SYCN++; 138 | TCN++; 139 | TACN++; 140 | } 141 | printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, TQS/(float)TQ*100); 142 | return 0; 143 | } 144 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/2b/2b8b815229aa8a61e483fb4ba0588b8b6c491890.svn-base: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/4e/4ea10e60b208f31ae965718f905268ac42fbf1ac.svn-base: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define MAX_STRING 60 22 | 23 | const int vocab_hash_size = 500000000; // Maximum 500M entries in the vocabulary 24 | 25 | typedef float real; // Precision of float numbers 26 | 27 | struct vocab_word { 28 | long long cn; 29 | char *word; 30 | }; 31 | 32 | char train_file[MAX_STRING], output_file[MAX_STRING]; 33 | struct vocab_word *vocab; 34 | int debug_mode = 2, min_count = 5, *vocab_hash, min_reduce = 1; 35 | long long vocab_max_size = 10000, vocab_size = 0; 36 | long long train_words = 0; 37 | real threshold = 100; 38 | 39 | unsigned long long next_random = 1; 40 | 41 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 42 | void ReadWord(char *word, FILE *fin) { 43 | int a = 0, ch; 44 | while (!feof(fin)) { 45 | ch = fgetc(fin); 46 | if (ch == 13) continue; 47 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 48 | if (a > 0) { 49 | if (ch == '\n') ungetc(ch, fin); 50 | break; 51 | } 52 | if (ch == '\n') { 53 | strcpy(word, (char *)""); 54 | return; 55 | } else continue; 56 | } 57 | word[a] = ch; 58 | a++; 59 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 60 | } 61 | word[a] = 0; 62 | } 63 | 64 | // Returns hash value of a word 65 | int GetWordHash(char *word) { 66 | unsigned long long a, hash = 1; 67 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 68 | hash = hash % vocab_hash_size; 69 | return hash; 70 | } 71 | 72 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 73 | int SearchVocab(char *word) { 74 | unsigned int hash = GetWordHash(word); 75 | while (1) { 76 | if (vocab_hash[hash] == -1) return -1; 77 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 78 | hash = (hash + 1) % vocab_hash_size; 79 | } 80 | return -1; 81 | } 82 | 83 | // Reads a word and returns its index in the vocabulary 84 | int ReadWordIndex(FILE *fin) { 85 | char word[MAX_STRING]; 86 | ReadWord(word, fin); 87 | if (feof(fin)) return -1; 88 | return SearchVocab(word); 89 | } 90 | 91 | // Adds a word to the vocabulary 92 | int AddWordToVocab(char *word) { 93 | unsigned int hash, length = strlen(word) + 1; 94 | if (length > MAX_STRING) length = MAX_STRING; 95 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 96 | strcpy(vocab[vocab_size].word, word); 97 | vocab[vocab_size].cn = 0; 98 | vocab_size++; 99 | // Reallocate memory if needed 100 | if (vocab_size + 2 >= vocab_max_size) { 101 | vocab_max_size += 10000; 102 | vocab=(struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 103 | } 104 | hash = GetWordHash(word); 105 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 106 | vocab_hash[hash]=vocab_size - 1; 107 | return vocab_size - 1; 108 | } 109 | 110 | // Used later for sorting by word counts 111 | int VocabCompare(const void *a, const void *b) { 112 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 113 | } 114 | 115 | // Sorts the vocabulary by frequency using word counts 116 
| void SortVocab() { 117 | int a; 118 | unsigned int hash; 119 | // Sort the vocabulary and keep at the first position 120 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 121 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 122 | for (a = 0; a < vocab_size; a++) { 123 | // Words occuring less than min_count times will be discarded from the vocab 124 | if (vocab[a].cn < min_count) { 125 | vocab_size--; 126 | free(vocab[vocab_size].word); 127 | } else { 128 | // Hash will be re-computed, as after the sorting it is not actual 129 | hash = GetWordHash(vocab[a].word); 130 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 131 | vocab_hash[hash] = a; 132 | } 133 | } 134 | vocab = (struct vocab_word *)realloc(vocab, vocab_size * sizeof(struct vocab_word)); 135 | } 136 | 137 | // Reduces the vocabulary by removing infrequent tokens 138 | void ReduceVocab() { 139 | int a, b = 0; 140 | unsigned int hash; 141 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 142 | vocab[b].cn = vocab[a].cn; 143 | vocab[b].word = vocab[a].word; 144 | b++; 145 | } else free(vocab[a].word); 146 | vocab_size = b; 147 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 148 | for (a = 0; a < vocab_size; a++) { 149 | // Hash will be re-computed, as it is not actual 150 | hash = GetWordHash(vocab[a].word); 151 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 152 | vocab_hash[hash] = a; 153 | } 154 | fflush(stdout); 155 | min_reduce++; 156 | } 157 | 158 | void LearnVocabFromTrainFile() { 159 | char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2]; 160 | FILE *fin; 161 | long long a, i, start = 1; 162 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 163 | fin = fopen(train_file, "rb"); 164 | if (fin == NULL) { 165 | printf("ERROR: training data file not found!\n"); 166 | exit(1); 167 | } 168 | vocab_size = 0; 169 | AddWordToVocab((char *)""); 170 | while (1) { 171 | ReadWord(word, fin); 172 | if (feof(fin)) break; 173 | if (!strcmp(word, "")) { 174 | start = 1; 175 | continue; 176 | } else start = 0; 177 | train_words++; 178 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 179 | printf("Words processed: %lldK Vocab size: %lldK %c", train_words / 1000, vocab_size / 1000, 13); 180 | fflush(stdout); 181 | } 182 | i = SearchVocab(word); 183 | if (i == -1) { 184 | a = AddWordToVocab(word); 185 | vocab[a].cn = 1; 186 | } else vocab[i].cn++; 187 | if (start) continue; 188 | sprintf(bigram_word, "%s_%s", last_word, word); 189 | bigram_word[MAX_STRING - 1] = 0; 190 | strcpy(last_word, word); 191 | i = SearchVocab(bigram_word); 192 | if (i == -1) { 193 | a = AddWordToVocab(bigram_word); 194 | vocab[a].cn = 1; 195 | } else vocab[i].cn++; 196 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 197 | } 198 | SortVocab(); 199 | if (debug_mode > 0) { 200 | printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size); 201 | printf("Words in train file: %lld\n", train_words); 202 | } 203 | fclose(fin); 204 | } 205 | 206 | void TrainModel() { 207 | long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0; 208 | char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2]; 209 | real score; 210 | FILE *fo, *fin; 211 | printf("Starting training using file %s\n", train_file); 212 | LearnVocabFromTrainFile(); 213 | fin = fopen(train_file, "rb"); 214 | fo = fopen(output_file, "wb"); 215 | word[0] = 0; 216 | while (1) { 217 | strcpy(last_word, word); 218 | ReadWord(word, fin); 219 | if 
(feof(fin)) break; 220 | if (!strcmp(word, "")) { 221 | fprintf(fo, "\n"); 222 | continue; 223 | } 224 | cn++; 225 | if ((debug_mode > 1) && (cn % 100000 == 0)) { 226 | printf("Words written: %lldK%c", cn / 1000, 13); 227 | fflush(stdout); 228 | } 229 | oov = 0; 230 | i = SearchVocab(word); 231 | if (i == -1) oov = 1; else pb = vocab[i].cn; 232 | if (li == -1) oov = 1; 233 | li = i; 234 | sprintf(bigram_word, "%s_%s", last_word, word); 235 | bigram_word[MAX_STRING - 1] = 0; 236 | i = SearchVocab(bigram_word); 237 | if (i == -1) oov = 1; else pab = vocab[i].cn; 238 | if (pa < min_count) oov = 1; 239 | if (pb < min_count) oov = 1; 240 | if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words; 241 | if (score > threshold) { 242 | fprintf(fo, "_%s", word); 243 | pb = 0; 244 | } else fprintf(fo, " %s", word); 245 | pa = pb; 246 | } 247 | fclose(fo); 248 | fclose(fin); 249 | } 250 | 251 | int ArgPos(char *str, int argc, char **argv) { 252 | int a; 253 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 254 | if (a == argc - 1) { 255 | printf("Argument missing for %s\n", str); 256 | exit(1); 257 | } 258 | return a; 259 | } 260 | return -1; 261 | } 262 | 263 | int main(int argc, char **argv) { 264 | int i; 265 | if (argc == 1) { 266 | printf("WORD2PHRASE tool v0.1a\n\n"); 267 | printf("Options:\n"); 268 | printf("Parameters for training:\n"); 269 | printf("\t-train \n"); 270 | printf("\t\tUse text data from to train the model\n"); 271 | printf("\t-output \n"); 272 | printf("\t\tUse to save the resulting word vectors / word clusters / phrases\n"); 273 | printf("\t-min-count \n"); 274 | printf("\t\tThis will discard words that appear less than times; default is 5\n"); 275 | printf("\t-threshold \n"); 276 | printf("\t\t The value represents threshold for forming the phrases (higher means less phrases); default 100\n"); 277 | printf("\t-debug \n"); 278 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 279 | printf("\nExamples:\n"); 280 | printf("./word2phrase -train text.txt -output phrases.txt -threshold 100 -debug 2\n\n"); 281 | return 0; 282 | } 283 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 284 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 285 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 286 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 287 | if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) threshold = atof(argv[i + 1]); 288 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 289 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 290 | TrainModel(); 291 | return 0; 292 | } 293 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/51/518a7ad549627c6ef8cf05b49408fcf0f6157460.svn-base: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./compute-accuracy vectors-phrase.bin < questions-phrases.txt 12 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/72/724bf0b7fd08d78098c1ccc622ada62ad58093ba.svn-base: -------------------------------------------------------------------------------- 1 | ############################################################################################### 2 | # 3 | # Script for training good word and phrase vector model using public corpora, version 1.0. 4 | # The training time will be from several hours to about a day. 5 | # 6 | # Downloads about 8 billion words, makes phrases using two runs of word2phrase, trains 7 | # a 500-dimensional vector model and evaluates it on word and phrase analogy tasks. 8 | # 9 | ############################################################################################### 10 | 11 | # This function will convert text to lowercase and remove special characters 12 | normalize_text() { 13 | awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \ 14 | -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/
/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \ 15 | -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \ 16 | -e 's/«/ /g' | tr 0-9 " " 17 | } 18 | 19 | mkdir word2vec 20 | cd word2vec 21 | 22 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 23 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz 24 | gzip -d news.2012.en.shuffled.gz 25 | gzip -d news.2013.en.shuffled.gz 26 | normalize_text < news.2012.en.shuffled > data.txt 27 | normalize_text < news.2013.en.shuffled >> data.txt 28 | 29 | wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz 30 | tar -xvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz 31 | for i in `ls 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled`; do 32 | normalize_text < 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/$i >> data.txt 33 | done 34 | 35 | wget http://ebiquity.umbc.edu/redirect/to/resource/id/351/UMBC-webbase-corpus 36 | tar -zxvf umbc_webbase_corpus.tar.gz webbase_all/*.txt 37 | for i in `ls webbase_all`; do 38 | normalize_text < webbase_all/$i >> data.txt 39 | done 40 | 41 | wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 42 | bzip2 -c -d enwiki-latest-pages-articles.xml.bz2 | awk '{print tolower($0);}' | perl -e ' 43 | # Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase 44 | # letters (a-z, converted from A-Z), and spaces (never consecutive)... 45 | # All other characters are converted to spaces. Only text which normally appears. 46 | # in the web browser is displayed. Tables are removed. Image captions are. 47 | # preserved. Links are converted to normal text. Digits are spelled out. 48 | # *** Modified to not spell digits or throw away non-ASCII characters *** 49 | 50 | # Written by Matt Mahoney, June 10, 2006. This program is released to the public domain. 51 | 52 | $/=">"; # input record separator 53 | while (<>) { 54 | if (/ ... 55 | if (/#redirect/i) {$text=0;} # remove #REDIRECT 56 | if ($text) { 57 | 58 | # Remove any text not normally visible 59 | if (/<\/text>/) {$text=0;} 60 | s/<.*>//; # remove xml tags 61 | s/&/&/g; # decode URL encoded chars 62 | s/<//g; 64 | s///g; # remove references ... 
65 | s/<[^>]*>//g; # remove xhtml tags 66 | s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text 67 | s/\|thumb//ig; # remove images links, preserve caption 68 | s/\|left//ig; 69 | s/\|right//ig; 70 | s/\|\d+px//ig; 71 | s/\[\[image:[^\[\]]*\|//ig; 72 | s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup 73 | s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages 74 | s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text 75 | s/{{[^}]*}}//g; # remove {{icons}} and {tables} 76 | s/{[^}]*}//g; 77 | s/\[//g; # remove [ and ] 78 | s/\]//g; 79 | s/&[^;]*;/ /g; # remove URL encoded chars 80 | 81 | $_=" $_ "; 82 | chop; 83 | print $_; 84 | } 85 | } 86 | ' | normalize_text | awk '{if (NF>1) print;}' >> data.txt 87 | 88 | wget http://word2vec.googlecode.com/svn/trunk/word2vec.c 89 | wget http://word2vec.googlecode.com/svn/trunk/word2phrase.c 90 | wget http://word2vec.googlecode.com/svn/trunk/compute-accuracy.c 91 | wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt 92 | wget http://word2vec.googlecode.com/svn/trunk/questions-phrases.txt 93 | gcc word2vec.c -o word2vec -lm -pthread -O3 -march=native -funroll-loops 94 | gcc word2phrase.c -o word2phrase -lm -pthread -O3 -march=native -funroll-loops 95 | gcc compute-accuracy.c -o compute-accuracy -lm -pthread -O3 -march=native -funroll-loops 96 | ./word2phrase -train data.txt -output data-phrase.txt -threshold 200 -debug 2 97 | ./word2phrase -train data-phrase.txt -output data-phrase2.txt -threshold 100 -debug 2 98 | ./word2vec -train data-phrase2.txt -output vectors.bin -cbow 1 -size 500 -window 10 -negative 10 -hs 0 -sample 1e-5 -threads 40 -binary 1 -iter 3 -min-count 10 99 | ./compute-accuracy vectors.bin 400000 < questions-words.txt # should get to almost 78% accuracy on 99.7% of questions 100 | ./compute-accuracy vectors.bin 1000000 < questions-phrases.txt # about 78% accuracy with 77% coverage 101 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/80/80740eb5930e039b8002a6c2213cd152847a4169.svn-base: -------------------------------------------------------------------------------- 1 | Tools for computing distributed representtion of words 2 | ------------------------------------------------------ 3 | 4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram model (SG), as well as several demo scripts. 5 | 6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous 7 | Bag-of-Words or the Skip-Gram neural network architectures. The user should to specify the following: 8 | - desired vector dimensionality 9 | - the size of the context window for either the Skip-Gram or the Continuous Bag-of-Words model 10 | - training algorithm: hierarchical softmax and / or negative sampling 11 | - threshold for downsampling the frequent words 12 | - number of threads to use 13 | - the format of the output word vector file (text or binary) 14 | 15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets. 16 | 17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training 18 | is finished, the user can interactively explore the similarity of the words. 
19 | 20 | More information about the scripts is provided at https://code.google.com/p/word2vec/ 21 | 22 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/83/83a04fdb0a7cc66001a1abe29157acbe28321564.svn-base: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./distance vectors-phrase.bin 12 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/8c/8ccd7b8850b84c7d306aebd933c2f1a26d264320.svn-base: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions 3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result 4 | 5 | all: word2vec word2phrase distance word-analogy compute-accuracy 6 | 7 | word2vec : word2vec.c 8 | $(CC) word2vec.c -o word2vec $(CFLAGS) 9 | word2phrase : word2phrase.c 10 | $(CC) word2phrase.c -o word2phrase $(CFLAGS) 11 | distance : distance.c 12 | $(CC) distance.c -o distance $(CFLAGS) 13 | word-analogy : word-analogy.c 14 | $(CC) word-analogy.c -o word-analogy $(CFLAGS) 15 | compute-accuracy : compute-accuracy.c 16 | $(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS) 17 | chmod +x *.sh 18 | 19 | clean: 20 | rm -rf word2vec word2phrase distance word-analogy compute-accuracy -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/91/91063b176c2f3543afd684071bf4677203917a52.svn-base: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./compute-accuracy vectors.bin 30000 < questions-words.txt 8 | # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt 9 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/9a/9a7277255e393a35ce6a0738867c29304f43b55c.svn-base: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char bestw[N][max_size]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | char ch; 32 | float *M; 33 | char *vocab; 34 | if (argc < 2) { 35 | printf("Usage: ./word-analogy \nwhere FILE contains word projections in the BINARY FORMAT\n"); 36 | return 0; 37 | } 38 | strcpy(file_name, argv[1]); 39 | f = fopen(file_name, "rb"); 40 | if (f == NULL) { 41 | printf("Input file not found\n"); 42 | return -1; 43 | } 44 | fscanf(f, "%lld", &words); 45 | fscanf(f, "%lld", &size); 46 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 47 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 48 | if (M == NULL) { 49 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 50 | return -1; 51 | } 52 | for (b = 0; b < words; b++) { 53 | a = 0; 54 | while (1) { 55 | vocab[b * max_w + a] = fgetc(f); 56 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 57 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 58 | } 59 | vocab[b * max_w + a] = 0; 60 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 61 | len = 0; 62 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 63 | len = sqrt(len); 64 | for (a = 0; a < size; a++) M[a + b * size] /= len; 65 | } 66 | fclose(f); 67 | while (1) { 68 | for (a = 0; a < N; a++) bestd[a] = 0; 69 | for (a = 0; a < N; a++) bestw[a][0] = 0; 70 | printf("Enter three words (EXIT to break): "); 71 | a = 0; 72 | while (1) { 73 | st1[a] = fgetc(stdin); 74 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 75 | st1[a] = 0; 76 | break; 77 | } 78 | a++; 79 | } 80 | if (!strcmp(st1, "EXIT")) break; 81 | cn = 0; 82 | b = 0; 83 | c = 0; 84 | while (1) { 85 | st[cn][b] = st1[c]; 86 | b++; 87 | c++; 88 | st[cn][b] = 0; 89 | if (st1[c] == 0) break; 90 | if (st1[c] == ' ') { 91 | cn++; 92 | b = 0; 93 | c++; 94 | } 95 | } 96 | cn++; 97 | if (cn < 3) { 98 | printf("Only %lld words were entered.. 
three words are needed at the input to perform the calculation\n", cn); 99 | continue; 100 | } 101 | for (a = 0; a < cn; a++) { 102 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 103 | if (b == words) b = 0; 104 | bi[a] = b; 105 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 106 | if (b == 0) { 107 | printf("Out of dictionary word!\n"); 108 | break; 109 | } 110 | } 111 | if (b == 0) continue; 112 | printf("\n Word Distance\n------------------------------------------------------------------------\n"); 113 | for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size]; 114 | len = 0; 115 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 116 | len = sqrt(len); 117 | for (a = 0; a < size; a++) vec[a] /= len; 118 | for (a = 0; a < N; a++) bestd[a] = 0; 119 | for (a = 0; a < N; a++) bestw[a][0] = 0; 120 | for (c = 0; c < words; c++) { 121 | if (c == bi[0]) continue; 122 | if (c == bi[1]) continue; 123 | if (c == bi[2]) continue; 124 | a = 0; 125 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 126 | if (a == 1) continue; 127 | dist = 0; 128 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 129 | for (a = 0; a < N; a++) { 130 | if (dist > bestd[a]) { 131 | for (d = N - 1; d > a; d--) { 132 | bestd[d] = bestd[d - 1]; 133 | strcpy(bestw[d], bestw[d - 1]); 134 | } 135 | bestd[a] = dist; 136 | strcpy(bestw[a], &vocab[c * max_w]); 137 | break; 138 | } 139 | } 140 | } 141 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 142 | } 143 | return 0; 144 | } 145 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/c7/c7b37d6aa035fe7b53a54351b44ab577d2fd3337.svn-base: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include <stdio.h> 16 | #include <stdlib.h> 17 | #include <string.h> 18 | #include <math.h> 19 | #include <pthread.h> 20 | 21 | #define MAX_STRING 100 22 | #define EXP_TABLE_SIZE 1000 23 | #define MAX_EXP 6 24 | #define MAX_SENTENCE_LENGTH 1000 25 | #define MAX_CODE_LENGTH 40 26 | 27 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 28 | 29 | typedef float real; // Precision of float numbers 30 | 31 | struct vocab_word { 32 | long long cn; 33 | int *point; 34 | char *word, *code, codelen; 35 | }; 36 | 37 | char train_file[MAX_STRING], output_file[MAX_STRING]; 38 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 39 | struct vocab_word *vocab; 40 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1; 41 | int *vocab_hash; 42 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100; 43 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0; 44 | real alpha = 0.025, starting_alpha, sample = 1e-3; 45 | real *syn0, *syn1, *syn1neg, *expTable; 46 | clock_t start; 47 | 48 | int hs = 0, negative = 5; 49 | const int table_size = 1e8; 50 | int *table; 51 | 52 | void InitUnigramTable() { 53 | int a, i; 54 | long long train_words_pow = 0; 55 | real d1, power = 0.75; 56 | table = (int *)malloc(table_size * sizeof(int)); 57 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 58 | i = 0; 59 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow; 60 | for (a = 0; a < table_size; a++) { 61 | table[a] = i; 62 | if (a / (real)table_size > d1) { 63 | i++; 64 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow; 65 | } 66 | if (i >= vocab_size) i = vocab_size - 1; 67 | } 68 | } 69 | 70 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 71 | void ReadWord(char *word, FILE *fin) { 72 | int a = 0, ch; 73 | while (!feof(fin)) { 74 | ch = fgetc(fin); 75 | if (ch == 13) continue; 76 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 77 | if (a > 0) { 78 | if (ch == '\n') ungetc(ch, fin); 79 | break; 80 | } 81 | if (ch == '\n') { 82 | strcpy(word, (char *)"</s>"); // newline acts as the end-of-sentence token 83 | return; 84 | } else continue; 85 | } 86 | word[a] = ch; 87 | a++; 88 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 89 | } 90 | word[a] = 0; 91 | } 92 | 93 | // Returns hash value of a word 94 | int GetWordHash(char *word) { 95 | unsigned long long a, hash = 0; 96 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 97 | hash = hash % vocab_hash_size; 98 | return hash; 99 | } 100 | 101 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 102 | int SearchVocab(char *word) { 103 | unsigned int hash = GetWordHash(word); 104 | while (1) { 105 | if (vocab_hash[hash] == -1) return -1; 106 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 107 | hash = (hash + 1) % vocab_hash_size; 108 | } 109 | return -1; 110 | } 111 | 112 | // Reads a word and returns its index in the vocabulary 113 | int ReadWordIndex(FILE *fin) { 114 | char word[MAX_STRING]; 115 | ReadWord(word, fin); 116 | if (feof(fin)) return -1; 117 | return SearchVocab(word); 118 | } 119 | 120 | // Adds a word to the vocabulary 121 | int AddWordToVocab(char *word) { 122 | unsigned int hash, length = strlen(word) + 1; 123 | if (length > MAX_STRING) length = MAX_STRING; 124 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 125 | strcpy(vocab[vocab_size].word, word); 126 | vocab[vocab_size].cn = 0; 127 | vocab_size++; 128 | // Reallocate 
memory if needed 129 | if (vocab_size + 2 >= vocab_max_size) { 130 | vocab_max_size += 1000; 131 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 132 | } 133 | hash = GetWordHash(word); 134 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 135 | vocab_hash[hash] = vocab_size - 1; 136 | return vocab_size - 1; 137 | } 138 | 139 | // Used later for sorting by word counts 140 | int VocabCompare(const void *a, const void *b) { 141 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 142 | } 143 | 144 | // Sorts the vocabulary by frequency using word counts 145 | void SortVocab() { 146 | int a, size; 147 | unsigned int hash; 148 | // Sort the vocabulary and keep </s> at the first position 149 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 150 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 151 | size = vocab_size; 152 | train_words = 0; 153 | for (a = 0; a < size; a++) { 154 | // Words occurring less than min_count times will be discarded from the vocab 155 | if ((vocab[a].cn < min_count) && (a != 0)) { 156 | vocab_size--; 157 | free(vocab[a].word); 158 | } else { 159 | // Hash will be re-computed, as it is no longer valid after the sorting 160 | hash = GetWordHash(vocab[a].word); 161 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 162 | vocab_hash[hash] = a; 163 | train_words += vocab[a].cn; 164 | } 165 | } 166 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 167 | // Allocate memory for the binary tree construction 168 | for (a = 0; a < vocab_size; a++) { 169 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 170 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 171 | } 172 | } 173 | 174 | // Reduces the vocabulary by removing infrequent tokens 175 | void ReduceVocab() { 176 | int a, b = 0; 177 | unsigned int hash; 178 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 179 | vocab[b].cn = vocab[a].cn; 180 | vocab[b].word = vocab[a].word; 181 | b++; 182 | } else free(vocab[a].word); 183 | vocab_size = b; 184 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 185 | for (a = 0; a < vocab_size; a++) { 186 | // Hash will be re-computed, as it is no longer valid 187 | hash = GetWordHash(vocab[a].word); 188 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 189 | vocab_hash[hash] = a; 190 | } 191 | fflush(stdout); 192 | min_reduce++; 193 | } 194 | 195 | // Create binary Huffman tree using the word counts 196 | // Frequent words will have short unique binary codes 197 | void CreateBinaryTree() { 198 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 199 | char code[MAX_CODE_LENGTH]; 200 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 201 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 202 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 203 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 204 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 205 | pos1 = vocab_size - 1; 206 | pos2 = vocab_size; 207 | // Following algorithm constructs the Huffman tree by adding one node at a time 208 | for (a = 0; a < vocab_size - 1; a++) { 209 | // First, find two smallest nodes 'min1, min2' 210 | if (pos1 >= 0) { 211 | if (count[pos1] < count[pos2]) { 212 | min1i = pos1; 213 | pos1--; 214 | } else { 215 | min1i = pos2; 216 | pos2++; 
217 | } 218 | } else { 219 | min1i = pos2; 220 | pos2++; 221 | } 222 | if (pos1 >= 0) { 223 | if (count[pos1] < count[pos2]) { 224 | min2i = pos1; 225 | pos1--; 226 | } else { 227 | min2i = pos2; 228 | pos2++; 229 | } 230 | } else { 231 | min2i = pos2; 232 | pos2++; 233 | } 234 | count[vocab_size + a] = count[min1i] + count[min2i]; 235 | parent_node[min1i] = vocab_size + a; 236 | parent_node[min2i] = vocab_size + a; 237 | binary[min2i] = 1; 238 | } 239 | // Now assign binary code to each vocabulary word 240 | for (a = 0; a < vocab_size; a++) { 241 | b = a; 242 | i = 0; 243 | while (1) { 244 | code[i] = binary[b]; 245 | point[i] = b; 246 | i++; 247 | b = parent_node[b]; 248 | if (b == vocab_size * 2 - 2) break; 249 | } 250 | vocab[a].codelen = i; 251 | vocab[a].point[0] = vocab_size - 2; 252 | for (b = 0; b < i; b++) { 253 | vocab[a].code[i - b - 1] = code[b]; 254 | vocab[a].point[i - b] = point[b] - vocab_size; 255 | } 256 | } 257 | free(count); 258 | free(binary); 259 | free(parent_node); 260 | } 261 | 262 | void LearnVocabFromTrainFile() { 263 | char word[MAX_STRING]; 264 | FILE *fin; 265 | long long a, i; 266 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 267 | fin = fopen(train_file, "rb"); 268 | if (fin == NULL) { 269 | printf("ERROR: training data file not found!\n"); 270 | exit(1); 271 | } 272 | vocab_size = 0; 273 | AddWordToVocab((char *)"</s>"); 274 | while (1) { 275 | ReadWord(word, fin); 276 | if (feof(fin)) break; 277 | train_words++; 278 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 279 | printf("%lldK%c", train_words / 1000, 13); 280 | fflush(stdout); 281 | } 282 | i = SearchVocab(word); 283 | if (i == -1) { 284 | a = AddWordToVocab(word); 285 | vocab[a].cn = 1; 286 | } else vocab[i].cn++; 287 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 288 | } 289 | SortVocab(); 290 | if (debug_mode > 0) { 291 | printf("Vocab size: %lld\n", vocab_size); 292 | printf("Words in train file: %lld\n", train_words); 293 | } 294 | file_size = ftell(fin); 295 | fclose(fin); 296 | } 297 | 298 | void SaveVocab() { 299 | long long i; 300 | FILE *fo = fopen(save_vocab_file, "wb"); 301 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 302 | fclose(fo); 303 | } 304 | 305 | void ReadVocab() { 306 | long long a, i = 0; 307 | char c; 308 | char word[MAX_STRING]; 309 | FILE *fin = fopen(read_vocab_file, "rb"); 310 | if (fin == NULL) { 311 | printf("Vocabulary file not found\n"); 312 | exit(1); 313 | } 314 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 315 | vocab_size = 0; 316 | while (1) { 317 | ReadWord(word, fin); 318 | if (feof(fin)) break; 319 | a = AddWordToVocab(word); 320 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); 321 | i++; 322 | } 323 | SortVocab(); 324 | if (debug_mode > 0) { 325 | printf("Vocab size: %lld\n", vocab_size); 326 | printf("Words in train file: %lld\n", train_words); 327 | } 328 | fin = fopen(train_file, "rb"); 329 | if (fin == NULL) { 330 | printf("ERROR: training data file not found!\n"); 331 | exit(1); 332 | } 333 | fseek(fin, 0, SEEK_END); 334 | file_size = ftell(fin); 335 | fclose(fin); 336 | } 337 | 338 | void InitNet() { 339 | long long a, b; 340 | unsigned long long next_random = 1; 341 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)); 342 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} 343 | if (hs) { 344 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); 345 | if (syn1 == NULL) 
{printf("Memory allocation failed\n"); exit(1);} 346 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 347 | syn1[a * layer1_size + b] = 0; 348 | } 349 | if (negative>0) { 350 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)); 351 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 352 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 353 | syn1neg[a * layer1_size + b] = 0; 354 | } 355 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) { 356 | next_random = next_random * (unsigned long long)25214903917 + 11; 357 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 358 | } 359 | CreateBinaryTree(); 360 | } 361 | 362 | void *TrainModelThread(void *id) { 363 | long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0; 364 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 365 | long long l1, l2, c, target, label, local_iter = iter; 366 | unsigned long long next_random = (long long)id; 367 | real f, g; 368 | clock_t now; 369 | real *neu1 = (real *)calloc(layer1_size, sizeof(real)); 370 | real *neu1e = (real *)calloc(layer1_size, sizeof(real)); 371 | FILE *fi = fopen(train_file, "rb"); 372 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 373 | while (1) { 374 | if (word_count - last_word_count > 10000) { 375 | word_count_actual += word_count - last_word_count; 376 | last_word_count = word_count; 377 | if ((debug_mode > 1)) { 378 | now=clock(); 379 | printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 380 | word_count_actual / (real)(iter * train_words + 1) * 100, 381 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 382 | fflush(stdout); 383 | } 384 | alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 385 | if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 386 | } 387 | if (sentence_length == 0) { 388 | while (1) { 389 | word = ReadWordIndex(fi); 390 | if (feof(fi)) break; 391 | if (word == -1) continue; 392 | word_count++; 393 | if (word == 0) break; 394 | // The subsampling randomly discards frequent words while keeping the ranking same 395 | if (sample > 0) { 396 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 397 | next_random = next_random * (unsigned long long)25214903917 + 11; 398 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 399 | } 400 | sen[sentence_length] = word; 401 | sentence_length++; 402 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 403 | } 404 | sentence_position = 0; 405 | } 406 | if (feof(fi) || (word_count > train_words / num_threads)) { 407 | word_count_actual += word_count - last_word_count; 408 | local_iter--; 409 | if (local_iter == 0) break; 410 | word_count = 0; 411 | last_word_count = 0; 412 | sentence_length = 0; 413 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 414 | continue; 415 | } 416 | word = sen[sentence_position]; 417 | if (word == -1) continue; 418 | for (c = 0; c < layer1_size; c++) neu1[c] = 0; 419 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 420 | next_random = next_random * (unsigned long long)25214903917 + 11; 421 | b = next_random % window; 422 | if (cbow) { //train the cbow architecture 423 | // in -> hidden 424 | cw = 0; 425 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 426 | c = 
sentence_position - window + a; 427 | if (c < 0) continue; 428 | if (c >= sentence_length) continue; 429 | last_word = sen[c]; 430 | if (last_word == -1) continue; 431 | for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size]; 432 | cw++; 433 | } 434 | if (cw) { 435 | for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 436 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 437 | f = 0; 438 | l2 = vocab[word].point[d] * layer1_size; 439 | // Propagate hidden -> output 440 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 441 | if (f <= -MAX_EXP) continue; 442 | else if (f >= MAX_EXP) continue; 443 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 444 | // 'g' is the gradient multiplied by the learning rate 445 | g = (1 - vocab[word].code[d] - f) * alpha; 446 | // Propagate errors output -> hidden 447 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 448 | // Learn weights hidden -> output 449 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 450 | } 451 | // NEGATIVE SAMPLING 452 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 453 | if (d == 0) { 454 | target = word; 455 | label = 1; 456 | } else { 457 | next_random = next_random * (unsigned long long)25214903917 + 11; 458 | target = table[(next_random >> 16) % table_size]; 459 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 460 | if (target == word) continue; 461 | label = 0; 462 | } 463 | l2 = target * layer1_size; 464 | f = 0; 465 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; 466 | if (f > MAX_EXP) g = (label - 1) * alpha; 467 | else if (f < -MAX_EXP) g = (label - 0) * alpha; 468 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; 469 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 470 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 471 | } 472 | // hidden -> in 473 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 474 | c = sentence_position - window + a; 475 | if (c < 0) continue; 476 | if (c >= sentence_length) continue; 477 | last_word = sen[c]; 478 | if (last_word == -1) continue; 479 | for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 480 | } 481 | } 482 | } else { //train skip-gram 483 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 484 | c = sentence_position - window + a; 485 | if (c < 0) continue; 486 | if (c >= sentence_length) continue; 487 | last_word = sen[c]; 488 | if (last_word == -1) continue; 489 | l1 = last_word * layer1_size; 490 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 491 | // HIERARCHICAL SOFTMAX 492 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 493 | f = 0; 494 | l2 = vocab[word].point[d] * layer1_size; 495 | // Propagate hidden -> output 496 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; 497 | if (f <= -MAX_EXP) continue; 498 | else if (f >= MAX_EXP) continue; 499 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 500 | // 'g' is the gradient multiplied by the learning rate 501 | g = (1 - vocab[word].code[d] - f) * alpha; 502 | // Propagate errors output -> hidden 503 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 504 | // Learn weights hidden -> output 505 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; 506 | } 507 | // NEGATIVE SAMPLING 508 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 509 | if (d == 0) { 510 | target = 
word; 511 | label = 1; 512 | } else { 513 | next_random = next_random * (unsigned long long)25214903917 + 11; 514 | target = table[(next_random >> 16) % table_size]; 515 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 516 | if (target == word) continue; 517 | label = 0; 518 | } 519 | l2 = target * layer1_size; 520 | f = 0; 521 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 522 | if (f > MAX_EXP) g = (label - 1) * alpha; 523 | else if (f < -MAX_EXP) g = (label - 0) * alpha; 524 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; 525 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 526 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1]; 527 | } 528 | // Learn weights input -> hidden 529 | for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c]; 530 | } 531 | } 532 | sentence_position++; 533 | if (sentence_position >= sentence_length) { 534 | sentence_length = 0; 535 | continue; 536 | } 537 | } 538 | fclose(fi); 539 | free(neu1); 540 | free(neu1e); 541 | pthread_exit(NULL); 542 | } 543 | 544 | void TrainModel() { 545 | long a, b, c, d; 546 | FILE *fo; 547 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 548 | printf("Starting training using file %s\n", train_file); 549 | starting_alpha = alpha; 550 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 551 | if (save_vocab_file[0] != 0) SaveVocab(); 552 | if (output_file[0] == 0) return; 553 | InitNet(); 554 | if (negative > 0) InitUnigramTable(); 555 | start = clock(); 556 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 557 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 558 | fo = fopen(output_file, "wb"); 559 | if (classes == 0) { 560 | // Save the word vectors 561 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 562 | for (a = 0; a < vocab_size; a++) { 563 | fprintf(fo, "%s ", vocab[a].word); 564 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 565 | else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 566 | fprintf(fo, "\n"); 567 | } 568 | } else { 569 | // Run K-means on the word vectors 570 | int clcn = classes, iter = 10, closeid; 571 | int *centcn = (int *)malloc(classes * sizeof(int)); 572 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 573 | real closev, x; 574 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 575 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 576 | for (a = 0; a < iter; a++) { 577 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 578 | for (b = 0; b < clcn; b++) centcn[b] = 1; 579 | for (c = 0; c < vocab_size; c++) { 580 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 581 | centcn[cl[c]]++; 582 | } 583 | for (b = 0; b < clcn; b++) { 584 | closev = 0; 585 | for (c = 0; c < layer1_size; c++) { 586 | cent[layer1_size * b + c] /= centcn[b]; 587 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 588 | } 589 | closev = sqrt(closev); 590 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 591 | } 592 | for (c = 0; c < vocab_size; c++) { 593 | closev = -10; 594 | closeid = 0; 595 | for (d = 0; d < clcn; d++) { 596 | x = 0; 597 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 598 | if (x > closev) { 599 | closev = x; 600 | closeid = d; 
601 | } 602 | } 603 | cl[c] = closeid; 604 | } 605 | } 606 | // Save the K-means classes 607 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 608 | free(centcn); 609 | free(cent); 610 | free(cl); 611 | } 612 | fclose(fo); 613 | } 614 | 615 | int ArgPos(char *str, int argc, char **argv) { 616 | int a; 617 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 618 | if (a == argc - 1) { 619 | printf("Argument missing for %s\n", str); 620 | exit(1); 621 | } 622 | return a; 623 | } 624 | return -1; 625 | } 626 | 627 | int main(int argc, char **argv) { 628 | int i; 629 | if (argc == 1) { 630 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 631 | printf("Options:\n"); 632 | printf("Parameters for training:\n"); 633 | printf("\t-train <file>\n"); 634 | printf("\t\tUse text data from <file> to train the model\n"); 635 | printf("\t-output <file>\n"); 636 | printf("\t\tUse <file> to save the resulting word vectors / word clusters\n"); 637 | printf("\t-size <int>\n"); 638 | printf("\t\tSet size of word vectors; default is 100\n"); 639 | printf("\t-window <int>\n"); 640 | printf("\t\tSet max skip length between words; default is 5\n"); 641 | printf("\t-sample <float>\n"); 642 | printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n"); 643 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n"); 644 | printf("\t-hs <int>\n"); 645 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 646 | printf("\t-negative <int>\n"); 647 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n"); 648 | printf("\t-threads <int>\n"); 649 | printf("\t\tUse <int> threads (default 12)\n"); 650 | printf("\t-iter <int>\n"); 651 | printf("\t\tRun more training iterations (default 5)\n"); 652 | printf("\t-min-count <int>\n"); 653 | printf("\t\tThis will discard words that appear less than <int> times; default is 5\n"); 654 | printf("\t-alpha <float>\n"); 655 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n"); 656 | printf("\t-classes <int>\n"); 657 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 658 | printf("\t-debug <int>\n"); 659 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 660 | printf("\t-binary <int>\n"); 661 | printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n"); 662 | printf("\t-save-vocab <file>\n"); 663 | printf("\t\tThe vocabulary will be saved to <file>\n"); 664 | printf("\t-read-vocab <file>\n"); 665 | printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n"); 666 | printf("\t-cbow <int>\n"); 667 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n"); 668 | printf("\nExamples:\n"); 669 | printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 670 | return 0; 671 | } 672 | output_file[0] = 0; 673 | save_vocab_file[0] = 0; 674 | read_vocab_file[0] = 0; 675 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 676 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 677 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 678 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 679 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 680 | 
if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 681 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 682 | if (cbow) alpha = 0.05; 683 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 684 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 685 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 686 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 687 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 688 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 689 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 690 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 691 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 692 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 693 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 694 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 695 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 696 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 697 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 698 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 699 | } 700 | TrainModel(); 701 | return 0; 702 | } 703 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/ea/ea5f636000c445177e5f2f14af11f716b1e91bd0.svn-base: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | echo --------------------------------------------------------------------------------------------------- 7 | echo Note that for the word analogy to perform well, the model should be trained on much larger data set 8 | echo Example input: paris france berlin 9 | echo --------------------------------------------------------------------------------------------------- 10 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 11 | ./word-analogy vectors.bin 12 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/f4/f4f8420f4ff647df0f4196ceee895888fb7f63f7.svn-base: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500 7 | sort classes.txt -k 2 -n > classes.sorted.txt 8 | echo The word classes were saved to file classes.sorted.txt 9 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/wc.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qieangel2013/phpword2vec/e12958b7bd6f5382ec1e14da3759f904dd5fe3e3/w2v/trunk/.svn/wc.db -------------------------------------------------------------------------------- /w2v/trunk/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /w2v/trunk/README.txt: -------------------------------------------------------------------------------- 1 | Tools for computing distributed representation of words 2 | ------------------------------------------------------ 3 | 4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram model (SG), as well as several demo scripts. 
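For example, a minimal end-to-end session looks like this (the text8 corpus and file names are illustrative; these are the same steps that the demo-word.sh script described below automates):

./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
./distance vectors.bin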
5 | 6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous 7 | Bag-of-Words or the Skip-Gram neural network architectures. The user should specify the following: 8 | - desired vector dimensionality 9 | - the size of the context window for either the Skip-Gram or the Continuous Bag-of-Words model 10 | - training algorithm: hierarchical softmax and / or negative sampling 11 | - threshold for downsampling the frequent words 12 | - number of threads to use 13 | - the format of the output word vector file (text or binary) 14 | 15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets. 16 | 17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training 18 | is finished, the user can interactively explore the similarity of the words. 19 | 20 | More information about the scripts is provided at https://code.google.com/p/word2vec/ 21 | 22 | -------------------------------------------------------------------------------- /w2v/trunk/compute-accuracy.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include <stdio.h> 16 | #include <stdlib.h> 17 | #include <string.h> 18 | #include <math.h> 19 | #include <malloc.h> 20 | #include <ctype.h> 21 | 22 | const long long max_size = 2000; // max length of strings 23 | const long long N = 1; // number of closest words 24 | const long long max_w = 50; // max length of vocabulary entries 25 | 26 | int main(int argc, char **argv) 27 | { 28 | FILE *f; 29 | char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch; 30 | float dist, len, bestd[N], vec[max_size]; 31 | long long words, size, a, b, c, d, b1, b2, b3, threshold = 0; 32 | float *M; 33 | char *vocab; 34 | int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0; 35 | if (argc < 2) { 36 | printf("Usage: ./compute-accuracy <FILE> <threshold>\nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n"); 37 | return 0; 38 | } 39 | strcpy(file_name, argv[1]); 40 | if (argc > 2) threshold = atoi(argv[2]); 41 | f = fopen(file_name, "rb"); 42 | if (f == NULL) { 43 | printf("Input file not found\n"); 44 | return -1; 45 | } 46 | fscanf(f, "%lld", &words); 47 | if (threshold) if (words > threshold) words = threshold; 48 | fscanf(f, "%lld", &size); 49 | vocab = (char *)malloc(words * max_w * sizeof(char)); 50 | M = (float *)malloc(words * size * sizeof(float)); 51 | if (M == NULL) { 52 | printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576); 53 | return -1; 54 | } 55 | for (b = 0; b < words; b++) { 56 | a = 0; 57 | while (1) { 58 | vocab[b * max_w + a] = fgetc(f); 59 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 60 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 61 | } 62 | vocab[b * max_w + a] = 0; 63 | for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]); 64 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 65 | len = 0; 66 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 67 | len = sqrt(len); 68 | for (a = 0; a < size; a++) M[a + b * size] /= len; 69 | } 70 | fclose(f); 71 | TCN = 0; 72 | while (1) { 73 | for (a = 0; a < N; a++) bestd[a] = 0; 74 | for (a = 0; a < N; a++) bestw[a][0] = 0; 75 | scanf("%s", st1); 76 | for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]); 77 | if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) { 78 | if (TCN == 0) TCN = 1; 79 | if (QID != 0) { 80 | printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN); 81 | printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100); 82 | } 83 | QID++; 84 | scanf("%s", st1); 85 | if (feof(stdin)) break; 86 | printf("%s:\n", st1); 87 | TCN = 0; 88 | CCN = 0; 89 | continue; 90 | } 91 | if (!strcmp(st1, "EXIT")) break; 92 | scanf("%s", st2); 93 | for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]); 94 | scanf("%s", st3); 95 | for (a = 0; a < strlen(st3); a++) st3[a] = toupper(st3[a]); 96 | scanf("%s", st4); 97 | for (a = 0; a < strlen(st4); a++) st4[a] = toupper(st4[a]); 98 | TQ++; 99 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st1)) break; 100 | b1 = b; 101 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st2)) break; 102 | b2 = b; 103 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st3)) break; 104 | b3 = b; 105 | for (a = 0; a < N; a++) bestd[a] = 0; 106 | for (a = 0; a < N; a++) bestw[a][0] = 0; 107 | if (b1 == words) continue; 108 | if (b2 == words) continue; 109 | if (b3 == words) continue; 110 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st4)) break; 111 | if (b == words) continue; 112 | for (a = 0; a < size; a++) vec[a] = (M[a + b2 * size] - M[a + b1 * size]) + M[a + b3 * size]; 113 | TQS++; 114 | for (c = 0; c < words; c++) { 115 | if (c == b1) continue; 116 | if (c == b2) continue; 117 | if (c == b3) continue; 118 | dist = 0; 119 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 120 | for (a = 0; a < N; a++) { 121 | if (dist > bestd[a]) { 122 | for (d = N - 1; d > a; d--) { 123 | bestd[d] = bestd[d - 1]; 124 | strcpy(bestw[d], bestw[d - 1]); 125 | } 126 | bestd[a] = dist; 127 | strcpy(bestw[a], &vocab[c * max_w]); 128 | break; 129 | } 130 | } 131 | } 132 | if (!strcmp(st4, bestw[0])) { 133 | CCN++; 134 | CACN++; 135 | if (QID <= 5) SEAC++; else SYAC++; 136 | } 137 | if (QID <= 5) SECN++; else SYCN++; 138 | TCN++; 139 | TACN++; 140 | } 141 | printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, 
TQS/(float)TQ*100); 142 | return 0; 143 | } 144 | -------------------------------------------------------------------------------- /w2v/trunk/demo-analogy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | echo --------------------------------------------------------------------------------------------------- 7 | echo Note that for the word analogy to perform well, the model should be trained on much larger data set 8 | echo Example input: paris france berlin 9 | echo --------------------------------------------------------------------------------------------------- 10 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 11 | ./word-analogy vectors.bin 12 | -------------------------------------------------------------------------------- /w2v/trunk/demo-classes.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500 7 | sort classes.txt -k 2 -n > classes.sorted.txt 8 | echo The word classes were saved to file classes.sorted.txt 9 | -------------------------------------------------------------------------------- /w2v/trunk/demo-phrase-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./compute-accuracy vectors-phrase.bin < questions-phrases.txt 12 | -------------------------------------------------------------------------------- /w2v/trunk/demo-phrases.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./distance vectors-phrase.bin 12 | -------------------------------------------------------------------------------- /w2v/trunk/demo-train-big-model-v1.sh: -------------------------------------------------------------------------------- 1 | ############################################################################################### 2 | # 3 | # Script for training good word and phrase vector model using public corpora, version 1.0. 4 | # The training time will be from several hours to about a day. 5 | # 6 | # Downloads about 8 billion words, makes phrases using two runs of word2phrase, trains 7 | # a 500-dimensional vector model and evaluates it on word and phrase analogy tasks. 8 | # 9 | ############################################################################################### 10 | 11 | # This function will convert text to lowercase and remove special characters 12 | normalize_text() { 13 | awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \ 14 | -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>
/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \ 15 | -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \ 16 | -e 's/«/ /g' | tr 0-9 " " 17 | } 18 | 19 | mkdir word2vec 20 | cd word2vec 21 | 22 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 23 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz 24 | gzip -d news.2012.en.shuffled.gz 25 | gzip -d news.2013.en.shuffled.gz 26 | normalize_text < news.2012.en.shuffled > data.txt 27 | normalize_text < news.2013.en.shuffled >> data.txt 28 | 29 | wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz 30 | tar -xvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz 31 | for i in `ls 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled`; do 32 | normalize_text < 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/$i >> data.txt 33 | done 34 | 35 | wget http://ebiquity.umbc.edu/redirect/to/resource/id/351/UMBC-webbase-corpus 36 | tar -zxvf umbc_webbase_corpus.tar.gz webbase_all/*.txt 37 | for i in `ls webbase_all`; do 38 | normalize_text < webbase_all/$i >> data.txt 39 | done 40 | 41 | wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 42 | bzip2 -c -d enwiki-latest-pages-articles.xml.bz2 | awk '{print tolower($0);}' | perl -e ' 43 | # Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase 44 | # letters (a-z, converted from A-Z), and spaces (never consecutive). 45 | # All other characters are converted to spaces. Only text which normally appears 46 | # in the web browser is displayed. Tables are removed. Image captions are 47 | # preserved. Links are converted to normal text. Digits are spelled out. 48 | # *** Modified to not spell digits or throw away non-ASCII characters *** 49 | 50 | # Written by Matt Mahoney, June 10, 2006. This program is released to the public domain. 51 | 52 | $/=">"; # input record separator 53 | while (<>) { 54 | if (/<text /) {$text=1;} # remove all but between <text> ... </text> 55 | if (/#redirect/i) {$text=0;} # remove #REDIRECT 56 | if ($text) { 57 | 58 | # Remove any text not normally visible 59 | if (/<\/text>/) {$text=0;} 60 | s/<.*>//; # remove xml tags 61 | s/&amp;/&/g; # decode URL encoded chars 62 | s/&lt;/</g; 63 | s/&gt;/>/g; 64 | s/<ref[^<]*<\/ref>//g; # remove references <ref...> ... </ref> 
65 | s/<[^>]*>//g; # remove xhtml tags 66 | s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text 67 | s/\|thumb//ig; # remove images links, preserve caption 68 | s/\|left//ig; 69 | s/\|right//ig; 70 | s/\|\d+px//ig; 71 | s/\[\[image:[^\[\]]*\|//ig; 72 | s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup 73 | s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages 74 | s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text 75 | s/{{[^}]*}}//g; # remove {{icons}} and {tables} 76 | s/{[^}]*}//g; 77 | s/\[//g; # remove [ and ] 78 | s/\]//g; 79 | s/&[^;]*;/ /g; # remove URL encoded chars 80 | 81 | $_=" $_ "; 82 | chop; 83 | print $_; 84 | } 85 | } 86 | ' | normalize_text | awk '{if (NF>1) print;}' >> data.txt 87 | 88 | wget http://word2vec.googlecode.com/svn/trunk/word2vec.c 89 | wget http://word2vec.googlecode.com/svn/trunk/word2phrase.c 90 | wget http://word2vec.googlecode.com/svn/trunk/compute-accuracy.c 91 | wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt 92 | wget http://word2vec.googlecode.com/svn/trunk/questions-phrases.txt 93 | gcc word2vec.c -o word2vec -lm -pthread -O3 -march=native -funroll-loops 94 | gcc word2phrase.c -o word2phrase -lm -pthread -O3 -march=native -funroll-loops 95 | gcc compute-accuracy.c -o compute-accuracy -lm -pthread -O3 -march=native -funroll-loops 96 | ./word2phrase -train data.txt -output data-phrase.txt -threshold 200 -debug 2 97 | ./word2phrase -train data-phrase.txt -output data-phrase2.txt -threshold 100 -debug 2 98 | ./word2vec -train data-phrase2.txt -output vectors.bin -cbow 1 -size 500 -window 10 -negative 10 -hs 0 -sample 1e-5 -threads 40 -binary 1 -iter 3 -min-count 10 99 | ./compute-accuracy vectors.bin 400000 < questions-words.txt # should get to almost 78% accuracy on 99.7% of questions 100 | ./compute-accuracy vectors.bin 1000000 < questions-phrases.txt # about 78% accuracy with 77% coverage 101 | -------------------------------------------------------------------------------- /w2v/trunk/demo-word-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./compute-accuracy vectors.bin 30000 < questions-words.txt 8 | # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt 9 | -------------------------------------------------------------------------------- /w2v/trunk/demo-word.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./distance vectors.bin 8 | -------------------------------------------------------------------------------- /w2v/trunk/distance.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include <stdio.h> 16 | #include <string.h> 17 | #include <math.h> 18 | #include <malloc.h> 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char *bestw[N]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | char ch; 32 | float *M; 33 | char *vocab; 34 | if (argc < 2) { 35 | printf("Usage: ./distance <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n"); 36 | return 0; 37 | } 38 | strcpy(file_name, argv[1]); 39 | f = fopen(file_name, "rb"); 40 | if (f == NULL) { 41 | printf("Input file not found\n"); 42 | return -1; 43 | } 44 | fscanf(f, "%lld", &words); 45 | fscanf(f, "%lld", &size); 46 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 47 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 48 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 49 | if (M == NULL) { 50 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 51 | return -1; 52 | } 53 | for (b = 0; b < words; b++) { 54 | a = 0; 55 | while (1) { 56 | vocab[b * max_w + a] = fgetc(f); 57 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 58 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 59 | } 60 | vocab[b * max_w + a] = 0; 61 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 62 | len = 0; 63 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 64 | len = sqrt(len); 65 | for (a = 0; a < size; a++) M[a + b * size] /= len; 66 | } 67 | fclose(f); 68 | while (1) { 69 | for (a = 0; a < N; a++) bestd[a] = 0; 70 | for (a = 0; a < N; a++) bestw[a][0] = 0; 71 | printf("Enter word or sentence (EXIT to break): "); 72 | a = 0; 73 | while (1) { 74 | st1[a] = fgetc(stdin); 75 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 76 | st1[a] = 0; 77 | break; 78 | } 79 | a++; 80 | } 81 | if (!strcmp(st1, "EXIT")) break; 82 | cn = 0; 83 | b = 0; 84 | c = 0; 85 | while (1) { 86 | st[cn][b] = st1[c]; 87 | b++; 88 | c++; 89 | st[cn][b] = 0; 90 | if (st1[c] == 0) break; 91 | if (st1[c] == ' ') { 92 | cn++; 93 | b = 0; 94 | c++; 95 | } 96 | } 97 | cn++; 98 | for (a = 0; a < cn; a++) { 99 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 100 | if (b == words) b = -1; 101 | bi[a] = b; 102 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 103 | if (b == -1) { 104 | printf("Out of dictionary word!\n"); 105 | break; 106 | } 107 | } 108 | if (b == -1) continue; 109 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); 110 | for (a = 0; a < size; a++) vec[a] = 0; 111 | for (b = 0; b < cn; b++) { 112 | if (bi[b] == -1) continue; 113 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 114 | } 
115 | len = 0;
116 | for (a = 0; a < size; a++) len += vec[a] * vec[a];
117 | len = sqrt(len);
118 | for (a = 0; a < size; a++) vec[a] /= len;
119 | for (a = 0; a < N; a++) bestd[a] = -1;
120 | for (a = 0; a < N; a++) bestw[a][0] = 0;
121 | for (c = 0; c < words; c++) {
122 | a = 0;
123 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
124 | if (a == 1) continue;
125 | dist = 0;
126 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
127 | for (a = 0; a < N; a++) {
128 | if (dist > bestd[a]) {
129 | for (d = N - 1; d > a; d--) {
130 | bestd[d] = bestd[d - 1];
131 | strcpy(bestw[d], bestw[d - 1]);
132 | }
133 | bestd[a] = dist;
134 | strcpy(bestw[a], &vocab[c * max_w]);
135 | break;
136 | }
137 | }
138 | }
139 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
140 | }
141 | return 0;
142 | }
143 | 
--------------------------------------------------------------------------------
/w2v/trunk/makefile:
--------------------------------------------------------------------------------
1 | CC = gcc
2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions
3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result
4 | 
5 | all: word2vec word2phrase distance word-analogy compute-accuracy
6 | 
7 | word2vec : word2vec.c
8 | 	$(CC) word2vec.c -o word2vec $(CFLAGS)
9 | word2phrase : word2phrase.c
10 | 	$(CC) word2phrase.c -o word2phrase $(CFLAGS)
11 | distance : distance.c
12 | 	$(CC) distance.c -o distance $(CFLAGS)
13 | word-analogy : word-analogy.c
14 | 	$(CC) word-analogy.c -o word-analogy $(CFLAGS)
15 | compute-accuracy : compute-accuracy.c
16 | 	$(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS)
17 | 	chmod +x *.sh
18 | 
19 | clean:
20 | 	rm -rf word2vec word2phrase distance word-analogy compute-accuracy
--------------------------------------------------------------------------------
/w2v/trunk/vectors.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qieangel2013/phpword2vec/e12958b7bd6f5382ec1e14da3759f904dd5fe3e3/w2v/trunk/vectors.bin
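A note on the vectors.bin layout shared by these tools: word2vec.c (run with -binary 1) writes an ASCII header "vocab_size vector_size\n", then for each vocabulary entry the word, one space, vector_size raw floats, and a newline; distance.c and word-analogy.c read exactly this. A minimal stand-alone reader, not part of the repository, sketching the same format:

#include <stdio.h>

/* Read the vectors.bin header and the first entry, to show the layout
   word2vec.c writes: "words size\n", then per word: token, ' ', floats. */
int main(void) {
  FILE *f = fopen("vectors.bin", "rb");
  long long words, size, i;
  char word[51];
  float v;
  if (f == NULL) return 1;
  fscanf(f, "%lld %lld", &words, &size);  /* ASCII header */
  fscanf(f, "%50s", word);                /* first vocabulary token */
  fgetc(f);                               /* consume the separating ' ' */
  printf("%lld words, %lld dims, first word: %s\n", words, size, word);
  for (i = 0; i < size; i++)              /* raw little-endian floats */
    if (fread(&v, sizeof(float), 1, f) != 1) break;
  fclose(f);
  return 0;
}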
--------------------------------------------------------------------------------
/w2v/trunk/word-analogy.c:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <stdio.h>
16 | #include <string.h>
17 | #include <math.h>
18 | #include <malloc.h>
19 | 
20 | const long long max_size = 2000; // max length of strings
21 | const long long N = 40; // number of closest words that will be shown
22 | const long long max_w = 50; // max length of vocabulary entries
23 | 
24 | int main(int argc, char **argv) {
25 | FILE *f;
26 | char st1[max_size];
27 | char bestw[N][max_size];
28 | char file_name[max_size], st[100][max_size];
29 | float dist, len, bestd[N], vec[max_size];
30 | long long words, size, a, b, c, d, cn, bi[100];
31 | char ch;
32 | float *M;
33 | char *vocab;
34 | if (argc < 2) {
35 | printf("Usage: ./word-analogy <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
36 | return 0;
37 | }
38 | strcpy(file_name, argv[1]);
39 | f = fopen(file_name, "rb");
40 | if (f == NULL) {
41 | printf("Input file not found\n");
42 | return -1;
43 | }
44 | fscanf(f, "%lld", &words);
45 | fscanf(f, "%lld", &size);
46 | vocab = (char *)malloc((long long)words * max_w * sizeof(char));
47 | M = (float *)malloc((long long)words * (long long)size * sizeof(float));
48 | if (M == NULL) {
49 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
50 | return -1;
51 | }
52 | for (b = 0; b < words; b++) {
53 | a = 0;
54 | while (1) {
55 | vocab[b * max_w + a] = fgetc(f);
56 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
57 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
58 | }
59 | vocab[b * max_w + a] = 0;
60 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
61 | len = 0;
62 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
63 | len = sqrt(len);
64 | for (a = 0; a < size; a++) M[a + b * size] /= len;
65 | }
66 | fclose(f);
67 | while (1) {
68 | for (a = 0; a < N; a++) bestd[a] = 0;
69 | for (a = 0; a < N; a++) bestw[a][0] = 0;
70 | printf("Enter three words (EXIT to break): ");
71 | a = 0;
72 | while (1) {
73 | st1[a] = fgetc(stdin);
74 | if ((st1[a] == '\n') || (a >= max_size - 1)) {
75 | st1[a] = 0;
76 | break;
77 | }
78 | a++;
79 | }
80 | if (!strcmp(st1, "EXIT")) break;
81 | cn = 0;
82 | b = 0;
83 | c = 0;
84 | while (1) {
85 | st[cn][b] = st1[c];
86 | b++;
87 | c++;
88 | st[cn][b] = 0;
89 | if (st1[c] == 0) break;
90 | if (st1[c] == ' ') {
91 | cn++;
92 | b = 0;
93 | c++;
94 | }
95 | }
96 | cn++;
97 | if (cn < 3) {
98 | printf("Only %lld words were entered.. three words are needed at the input to perform the calculation\n", cn);
99 | continue;
100 | }
101 | for (a = 0; a < cn; a++) {
102 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
103 | if (b == words) b = 0;
104 | bi[a] = b;
105 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]);
106 | if (b == 0) {
107 | printf("Out of dictionary word!\n");
108 | break;
109 | }
110 | }
111 | if (b == 0) continue;
112 | printf("\n Word Distance\n------------------------------------------------------------------------\n");
113 | for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size];
114 | len = 0;
115 | for (a = 0; a < size; a++) len += vec[a] * vec[a];
116 | len = sqrt(len);
117 | for (a = 0; a < size; a++) vec[a] /= len;
118 | for (a = 0; a < N; a++) bestd[a] = 0;
119 | for (a = 0; a < N; a++) bestw[a][0] = 0;
120 | for (c = 0; c < words; c++) {
121 | if (c == bi[0]) continue;
122 | if (c == bi[1]) continue;
123 | if (c == bi[2]) continue;
124 | a = 0;
125 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
126 | if (a == 1) continue;
127 | dist = 0;
128 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
129 | for (a = 0; a < N; a++) {
130 | if (dist > bestd[a]) {
131 | for (d = N - 1; d > a; d--) {
132 | bestd[d] = bestd[d - 1];
133 | strcpy(bestw[d], bestw[d - 1]);
134 | }
135 | bestd[a] = dist;
136 | strcpy(bestw[a], &vocab[c * max_w]);
137 | break;
138 | }
139 | }
140 | }
141 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
142 | }
143 | return 0;
144 | }
145 | 
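word-analogy.c answers "a is to b as c is to ?" by finding the vocabulary word whose unit vector has the largest dot product with vec(b) - vec(a) + vec(c); since every row of M is normalized at load time, and dividing the query by its (positive) norm does not change the ranking, the search is a plain dot-product scan. A condensed sketch of that core search, with a hypothetical helper that assumes M's rows are already unit length:

/* Return the best analogy answer for vocabulary indices a, b, c:
   argmax over w of dot(M[w], M[b] - M[a] + M[c]), excluding a, b, c. */
long long analogy(const float *M, long long words, long long size,
                  long long a, long long b, long long c) {
  long long w, i, besti = -1;
  float best = -1e30f;
  for (w = 0; w < words; w++) {
    float dot = 0;
    if (w == a || w == b || w == c) continue; /* skip the query words */
    for (i = 0; i < size; i++)
      dot += (M[b * size + i] - M[a * size + i] + M[c * size + i]) * M[w * size + i];
    if (dot > best) { best = dot; besti = w; }
  }
  return besti;
}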
--------------------------------------------------------------------------------
/w2v/trunk/word2phrase.c:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <stdio.h>
16 | #include <string.h>
17 | #include <math.h>
18 | #include <stdlib.h>
19 | #include <pthread.h>
20 | 
21 | #define MAX_STRING 60
22 | 
23 | const int vocab_hash_size = 500000000; // Maximum 500M entries in the vocabulary
24 | 
25 | typedef float real; // Precision of float numbers
26 | 
27 | struct vocab_word {
28 | long long cn;
29 | char *word;
30 | };
31 | 
32 | char train_file[MAX_STRING], output_file[MAX_STRING];
33 | struct vocab_word *vocab;
34 | int debug_mode = 2, min_count = 5, *vocab_hash, min_reduce = 1;
35 | long long vocab_max_size = 10000, vocab_size = 0;
36 | long long train_words = 0;
37 | real threshold = 100;
38 | 
39 | unsigned long long next_random = 1;
40 | 
41 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
42 | void ReadWord(char *word, FILE *fin) {
43 | int a = 0, ch;
44 | while (!feof(fin)) {
45 | ch = fgetc(fin);
46 | if (ch == 13) continue;
47 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
48 | if (a > 0) {
49 | if (ch == '\n') ungetc(ch, fin);
50 | break;
51 | }
52 | if (ch == '\n') {
53 | strcpy(word, (char *)"</s>");
54 | return;
55 | } else continue;
56 | }
57 | word[a] = ch;
58 | a++;
59 | if (a >= MAX_STRING - 1) a--; // Truncate too long words
60 | }
61 | word[a] = 0;
62 | }
63 | 
64 | // Returns hash value of a word
65 | int GetWordHash(char *word) {
66 | unsigned long long a, hash = 1;
67 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
68 | hash = hash % vocab_hash_size;
69 | return hash;
70 | }
71 | 
72 | // Returns position of a word in the vocabulary; if the word is not found, returns -1
73 | int SearchVocab(char *word) {
74 | unsigned int hash = GetWordHash(word);
75 | while (1) {
76 | if (vocab_hash[hash] == -1) return -1;
77 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
78 | hash = (hash + 1) % vocab_hash_size;
79 | }
80 | return -1;
81 | }
82 | 
83 | // Reads a word and returns its index in the vocabulary
84 | int ReadWordIndex(FILE *fin) {
85 | char word[MAX_STRING];
86 | ReadWord(word, fin);
87 | if (feof(fin)) return -1;
88 | return SearchVocab(word);
89 | }
90 | 
91 | // Adds a word to the vocabulary
92 | int AddWordToVocab(char *word) {
93 | unsigned int hash, length = strlen(word) + 1;
94 | if (length > MAX_STRING) length = MAX_STRING;
95 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
96 | strcpy(vocab[vocab_size].word, word);
97 | vocab[vocab_size].cn = 0;
98 | vocab_size++;
99 | // Reallocate memory if needed
100 | if (vocab_size + 2 >= vocab_max_size) {
101 | vocab_max_size += 10000;
102 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
103 | }
104 | hash = GetWordHash(word);
105 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
106 | vocab_hash[hash] = vocab_size - 1;
107 | return vocab_size - 1;
108 | }
109 | 
110 | // Used later for sorting by word counts
111 | int VocabCompare(const void *a, const void *b) {
112 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
113 | }
114 | 
115 | // Sorts the vocabulary by frequency using word counts
116 | void SortVocab() {
117 | int a;
118 | unsigned int hash;
119 | // Sort the vocabulary and keep </s> at the first position
120 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
121 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
122 | for (a = 0; a < vocab_size; a++) {
123 | // Words occurring less than min_count times will be discarded from the vocab
124 | if (vocab[a].cn < min_count) {
125 | vocab_size--;
126 | free(vocab[vocab_size].word);
127 | } else {
128 | // Hash will be recomputed, as it is no longer valid after sorting
129 | hash = GetWordHash(vocab[a].word);
130 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
131 | vocab_hash[hash] = a;
132 | }
133 | }
134 | vocab = (struct vocab_word *)realloc(vocab, vocab_size * sizeof(struct vocab_word));
135 | }
136 | 
137 | // Reduces the vocabulary by removing infrequent tokens
138 | void ReduceVocab() {
139 | int a, b = 0;
140 | unsigned int hash;
141 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
142 | vocab[b].cn = vocab[a].cn;
143 | vocab[b].word = vocab[a].word;
144 | b++;
145 | } else free(vocab[a].word);
146 | vocab_size = b;
147 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
148 | for (a = 0; a < vocab_size; a++) {
149 | // Hash will be recomputed, as it is no longer valid
150 | hash = GetWordHash(vocab[a].word);
151 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
152 | vocab_hash[hash] = a;
153 | }
154 | fflush(stdout);
155 | min_reduce++;
156 | }
157 | 
158 | void LearnVocabFromTrainFile() {
159 | char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
160 | FILE *fin;
161 | long long a, i, start = 1;
162 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
163 | fin = fopen(train_file, "rb");
164 | if (fin == NULL) {
165 | printf("ERROR: training data file not found!\n");
166 | exit(1);
167 | }
168 | vocab_size = 0;
169 | AddWordToVocab((char *)"</s>");
170 | while (1) {
171 | ReadWord(word, fin);
172 | if (feof(fin)) break;
173 | if (!strcmp(word, "</s>")) {
174 | start = 1;
175 | continue;
176 | } else start = 0;
177 | train_words++;
178 | if ((debug_mode > 1) && (train_words % 100000 == 0)) {
179 | printf("Words processed: %lldK Vocab size: %lldK %c", train_words / 1000, vocab_size / 1000, 13);
180 | fflush(stdout);
181 | }
182 | i = SearchVocab(word);
183 | if (i == -1) {
184 | a = AddWordToVocab(word);
185 | vocab[a].cn = 1;
186 | } else vocab[i].cn++;
187 | if (start) continue;
188 | sprintf(bigram_word, "%s_%s", last_word, word);
189 | bigram_word[MAX_STRING - 1] = 0;
190 | strcpy(last_word, word);
191 | i = SearchVocab(bigram_word);
192 | if (i == -1) {
193 | a = AddWordToVocab(bigram_word);
194 | vocab[a].cn = 1;
195 | } else vocab[i].cn++;
196 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
197 | }
198 | SortVocab();
199 | if (debug_mode > 0) {
200 | printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size);
201 | printf("Words in train file: %lld\n", train_words);
202 | }
203 | fclose(fin);
204 | }
205 | 
206 | void TrainModel() {
207 | long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0;
208 | char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
209 | real score;
210 | FILE *fo, *fin;
211 | printf("Starting training using file %s\n", train_file);
212 | LearnVocabFromTrainFile();
213 | fin = fopen(train_file, "rb");
214 | fo = fopen(output_file, "wb");
215 | word[0] = 0;
216 | while (1) {
217 | strcpy(last_word, word);
218 | ReadWord(word, fin);
219 | if (feof(fin)) break;
220 | if (!strcmp(word, "</s>")) {
221 | fprintf(fo, "\n");
222 | continue;
223 | }
224 | cn++;
225 | if ((debug_mode > 1) && (cn % 100000 == 0)) {
226 | printf("Words written: %lldK%c", cn / 1000, 13);
227 | fflush(stdout);
228 | }
229 | oov = 0;
230 | i = SearchVocab(word);
231 | if (i == -1) oov = 1; else pb = vocab[i].cn;
232 | if (li == -1) oov = 1;
233 | li = i;
234 | sprintf(bigram_word, "%s_%s", last_word, word);
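// The lines that follow score the bigram (last_word, word) against chance:
//   score = (count(bigram) - min_count) * train_words / (count(last_word) * count(word))
// Pairs whose score exceeds -threshold are glued into one token with '_';
// pb is then zeroed (and copied into pa) so the same word cannot
// immediately start another phrase.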
235 | bigram_word[MAX_STRING - 1] = 0;
236 | i = SearchVocab(bigram_word);
237 | if (i == -1) oov = 1; else pab = vocab[i].cn;
238 | if (pa < min_count) oov = 1;
239 | if (pb < min_count) oov = 1;
240 | if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words;
241 | if (score > threshold) {
242 | fprintf(fo, "_%s", word);
243 | pb = 0;
244 | } else fprintf(fo, " %s", word);
245 | pa = pb;
246 | }
247 | fclose(fo);
248 | fclose(fin);
249 | }
250 | 
251 | int ArgPos(char *str, int argc, char **argv) {
252 | int a;
253 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
254 | if (a == argc - 1) {
255 | printf("Argument missing for %s\n", str);
256 | exit(1);
257 | }
258 | return a;
259 | }
260 | return -1;
261 | }
262 | 
263 | int main(int argc, char **argv) {
264 | int i;
265 | if (argc == 1) {
266 | printf("WORD2PHRASE tool v0.1a\n\n");
267 | printf("Options:\n");
268 | printf("Parameters for training:\n");
269 | printf("\t-train <file>\n");
270 | printf("\t\tUse text data from <file> to train the model\n");
271 | printf("\t-output <file>\n");
272 | printf("\t\tUse <file> to save the resulting word vectors / word clusters / phrases\n");
273 | printf("\t-min-count <int>\n");
274 | printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
275 | printf("\t-threshold <float>\n");
276 | printf("\t\tThe <float> value represents the threshold for forming the phrases (higher means fewer phrases); default 100\n");
277 | printf("\t-debug <int>\n");
278 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
279 | printf("\nExamples:\n");
280 | printf("./word2phrase -train text.txt -output phrases.txt -threshold 100 -debug 2\n\n");
281 | return 0;
282 | }
283 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
284 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
285 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
286 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
287 | if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) threshold = atof(argv[i + 1]);
288 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
289 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
290 | TrainModel();
291 | return 0;
292 | }
293 | 
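To make the default threshold of 100 concrete, here is a small worked example of that score; the counts below are made up purely for illustration:

#include <stdio.h>

/* Worked example of the word2phrase score with the defaults
   (min_count = 5), using hypothetical corpus counts. */
int main(void) {
  double pab = 1000;          /* bigram seen 1000 times        */
  double pa = 100000;         /* first word seen 100000 times  */
  double pb = 10000;          /* second word seen 10000 times  */
  double train_words = 1e8;   /* corpus size                   */
  double score = (pab - 5) / pa / pb * train_words;
  printf("score = %.1f\n", score); /* 99.5: just below threshold 100 */
  return 0;
}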
--------------------------------------------------------------------------------
/w2v/trunk/word2vec.c:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <stdio.h>
16 | #include <string.h>
17 | #include <math.h>
18 | #include <stdlib.h>
19 | #include <pthread.h>
20 | 
21 | #define MAX_STRING 100
22 | #define EXP_TABLE_SIZE 1000
23 | #define MAX_EXP 6
24 | #define MAX_SENTENCE_LENGTH 1000
25 | #define MAX_CODE_LENGTH 40
26 | 
27 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
28 | 
29 | typedef float real; // Precision of float numbers
30 | 
31 | struct vocab_word {
32 | long long cn;
33 | int *point;
34 | char *word, *code, codelen;
35 | };
36 | 
37 | char train_file[MAX_STRING], output_file[MAX_STRING];
38 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
39 | struct vocab_word *vocab;
40 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
41 | int *vocab_hash;
42 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
43 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
44 | real alpha = 0.025, starting_alpha, sample = 1e-3;
45 | real *syn0, *syn1, *syn1neg, *expTable;
46 | clock_t start;
47 | 
48 | int hs = 0, negative = 5;
49 | const int table_size = 1e8;
50 | int *table;
51 | 
52 | void InitUnigramTable() {
53 | int a, i;
54 | long long train_words_pow = 0;
55 | real d1, power = 0.75;
56 | table = (int *)malloc(table_size * sizeof(int));
57 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
58 | i = 0;
59 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
60 | for (a = 0; a < table_size; a++) {
61 | table[a] = i;
62 | if (a / (real)table_size > d1) {
63 | i++;
64 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
65 | }
66 | if (i >= vocab_size) i = vocab_size - 1;
67 | }
68 | }
69 | 
70 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
71 | void ReadWord(char *word, FILE *fin) {
72 | int a = 0, ch;
73 | while (!feof(fin)) {
74 | ch = fgetc(fin);
75 | if (ch == 13) continue;
76 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
77 | if (a > 0) {
78 | if (ch == '\n') ungetc(ch, fin);
79 | break;
80 | }
81 | if (ch == '\n') {
82 | strcpy(word, (char *)"</s>");
83 | return;
84 | } else continue;
85 | }
86 | word[a] = ch;
87 | a++;
88 | if (a >= MAX_STRING - 1) a--; // Truncate too long words
89 | }
90 | word[a] = 0;
91 | }
92 | 
93 | // Returns hash value of a word
94 | int GetWordHash(char *word) {
95 | unsigned long long a, hash = 0;
96 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
97 | hash = hash % vocab_hash_size;
98 | return hash;
99 | }
100 | 
101 | // Returns position of a word in the vocabulary; if the word is not found, returns -1
102 | int SearchVocab(char *word) {
103 | unsigned int hash = GetWordHash(word);
104 | while (1) {
105 | if (vocab_hash[hash] == -1) return -1;
106 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
107 | hash = (hash + 1) % vocab_hash_size;
108 | }
109 | return -1;
110 | }
111 | 
112 | // Reads a word and returns its index in the vocabulary
113 | int ReadWordIndex(FILE *fin) {
114 | char word[MAX_STRING];
115 | ReadWord(word, fin);
116 | if (feof(fin)) return -1;
117 | return SearchVocab(word);
118 | }
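// Note: vocab_hash is an open-addressing table. GetWordHash picks a start
// slot and lookups probe linearly ((hash + 1) % vocab_hash_size) until they
// hit either an empty slot (-1) or the matching word; LearnVocabFromTrainFile
// keeps the load factor under 0.7 via ReduceVocab so probes stay short.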
119 | 
120 | // Adds a word to the vocabulary
121 | int AddWordToVocab(char *word) {
122 | unsigned int hash, length = strlen(word) + 1;
123 | if (length > MAX_STRING) length = MAX_STRING;
124 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
125 | strcpy(vocab[vocab_size].word, word);
126 | vocab[vocab_size].cn = 0;
127 | vocab_size++;
128 | // Reallocate memory if needed
129 | if (vocab_size + 2 >= vocab_max_size) {
130 | vocab_max_size += 1000;
131 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
132 | }
133 | hash = GetWordHash(word);
134 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
135 | vocab_hash[hash] = vocab_size - 1;
136 | return vocab_size - 1;
137 | }
138 | 
139 | // Used later for sorting by word counts
140 | int VocabCompare(const void *a, const void *b) {
141 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
142 | }
143 | 
144 | // Sorts the vocabulary by frequency using word counts
145 | void SortVocab() {
146 | int a, size;
147 | unsigned int hash;
148 | // Sort the vocabulary and keep </s> at the first position
149 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
150 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
151 | size = vocab_size;
152 | train_words = 0;
153 | for (a = 0; a < size; a++) {
154 | // Words occurring less than min_count times will be discarded from the vocab
155 | if ((vocab[a].cn < min_count) && (a != 0)) {
156 | vocab_size--;
157 | free(vocab[a].word);
158 | } else {
159 | // Hash will be recomputed, as it is no longer valid after sorting
160 | hash = GetWordHash(vocab[a].word);
161 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
162 | vocab_hash[hash] = a;
163 | train_words += vocab[a].cn;
164 | }
165 | }
166 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
167 | // Allocate memory for the binary tree construction
168 | for (a = 0; a < vocab_size; a++) {
169 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
170 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
171 | }
172 | }
173 | 
174 | // Reduces the vocabulary by removing infrequent tokens
175 | void ReduceVocab() {
176 | int a, b = 0;
177 | unsigned int hash;
178 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
179 | vocab[b].cn = vocab[a].cn;
180 | vocab[b].word = vocab[a].word;
181 | b++;
182 | } else free(vocab[a].word);
183 | vocab_size = b;
184 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
185 | for (a = 0; a < vocab_size; a++) {
186 | // Hash will be recomputed, as it is no longer valid
187 | hash = GetWordHash(vocab[a].word);
188 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
189 | vocab_hash[hash] = a;
190 | }
191 | fflush(stdout);
192 | min_reduce++;
193 | }
194 | 
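// Because SortVocab leaves the counts in descending order, the Huffman
// construction below runs in O(vocab_size) with two cursors instead of a
// priority queue: pos1 scans the leaf counts from the smallest upward,
// while pos2 scans the internal nodes, whose combined counts are produced
// in non-decreasing order.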
195 | // Create binary Huffman tree using the word counts
196 | // Frequent words will have short unique binary codes
197 | void CreateBinaryTree() {
198 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
199 | char code[MAX_CODE_LENGTH];
200 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
201 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
202 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
203 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
204 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
205 | pos1 = vocab_size - 1;
206 | pos2 = vocab_size;
207 | // Following algorithm constructs the Huffman tree by adding one node at a time
208 | for (a = 0; a < vocab_size - 1; a++) {
209 | // First, find two smallest nodes 'min1, min2'
210 | if (pos1 >= 0) {
211 | if (count[pos1] < count[pos2]) {
212 | min1i = pos1;
213 | pos1--;
214 | } else {
215 | min1i = pos2;
216 | pos2++;
217 | }
218 | } else {
219 | min1i = pos2;
220 | pos2++;
221 | }
222 | if (pos1 >= 0) {
223 | if (count[pos1] < count[pos2]) {
224 | min2i = pos1;
225 | pos1--;
226 | } else {
227 | min2i = pos2;
228 | pos2++;
229 | }
230 | } else {
231 | min2i = pos2;
232 | pos2++;
233 | }
234 | count[vocab_size + a] = count[min1i] + count[min2i];
235 | parent_node[min1i] = vocab_size + a;
236 | parent_node[min2i] = vocab_size + a;
237 | binary[min2i] = 1;
238 | }
239 | // Now assign binary code to each vocabulary word
240 | for (a = 0; a < vocab_size; a++) {
241 | b = a;
242 | i = 0;
243 | while (1) {
244 | code[i] = binary[b];
245 | point[i] = b;
246 | i++;
247 | b = parent_node[b];
248 | if (b == vocab_size * 2 - 2) break;
249 | }
250 | vocab[a].codelen = i;
251 | vocab[a].point[0] = vocab_size - 2;
252 | for (b = 0; b < i; b++) {
253 | vocab[a].code[i - b - 1] = code[b];
254 | vocab[a].point[i - b] = point[b] - vocab_size;
255 | }
256 | }
257 | free(count);
258 | free(binary);
259 | free(parent_node);
260 | }
261 | 
262 | void LearnVocabFromTrainFile() {
263 | char word[MAX_STRING];
264 | FILE *fin;
265 | long long a, i;
266 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
267 | fin = fopen(train_file, "rb");
268 | if (fin == NULL) {
269 | printf("ERROR: training data file not found!\n");
270 | exit(1);
271 | }
272 | vocab_size = 0;
273 | AddWordToVocab((char *)"</s>");
274 | while (1) {
275 | ReadWord(word, fin);
276 | if (feof(fin)) break;
277 | train_words++;
278 | if ((debug_mode > 1) && (train_words % 100000 == 0)) {
279 | printf("%lldK%c", train_words / 1000, 13);
280 | fflush(stdout);
281 | }
282 | i = SearchVocab(word);
283 | if (i == -1) {
284 | a = AddWordToVocab(word);
285 | vocab[a].cn = 1;
286 | } else vocab[i].cn++;
287 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
288 | }
289 | SortVocab();
290 | if (debug_mode > 0) {
291 | printf("Vocab size: %lld\n", vocab_size);
292 | printf("Words in train file: %lld\n", train_words);
293 | }
294 | file_size = ftell(fin);
295 | fclose(fin);
296 | }
297 | 
298 | void SaveVocab() {
299 | long long i;
300 | FILE *fo = fopen(save_vocab_file, "wb");
301 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
302 | fclose(fo);
303 | }
304 | 
305 | void ReadVocab() {
306 | long long a, i = 0;
307 | char c;
308 | char word[MAX_STRING];
309 | FILE *fin = fopen(read_vocab_file, "rb");
310 | if (fin == NULL) {
311 | printf("Vocabulary file not found\n");
312 | exit(1);
313 | }
314 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
315 | vocab_size = 0;
316 | while (1) {
317 | ReadWord(word, fin);
318 | if (feof(fin)) break;
319 | a = AddWordToVocab(word);
320 | fscanf(fin, "%lld%c", &vocab[a].cn, &c);
321 | i++;
322 | }
323 | SortVocab();
324 | if (debug_mode > 0) {
325 | printf("Vocab size: %lld\n", vocab_size);
326 | printf("Words in train file: %lld\n", train_words);
327 | }
328 | fin = fopen(train_file, "rb");
329 | if (fin == NULL) {
330 | printf("ERROR: training data file not found!\n");
331 | exit(1);
332 | }
333 | fseek(fin, 0, SEEK_END);
334 | file_size = ftell(fin);
335 | fclose(fin);
336 | }
337 | 
338 | void InitNet() {
339 | long long a, b;
340 | unsigned long long next_random = 1;
341 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
342 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
343 | if (hs) {
344 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
345 | if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
{printf("Memory allocation failed\n"); exit(1);} 346 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 347 | syn1[a * layer1_size + b] = 0; 348 | } 349 | if (negative>0) { 350 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)); 351 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 352 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 353 | syn1neg[a * layer1_size + b] = 0; 354 | } 355 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) { 356 | next_random = next_random * (unsigned long long)25214903917 + 11; 357 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 358 | } 359 | CreateBinaryTree(); 360 | } 361 | 362 | void *TrainModelThread(void *id) { 363 | long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0; 364 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 365 | long long l1, l2, c, target, label, local_iter = iter; 366 | unsigned long long next_random = (long long)id; 367 | real f, g; 368 | clock_t now; 369 | real *neu1 = (real *)calloc(layer1_size, sizeof(real)); 370 | real *neu1e = (real *)calloc(layer1_size, sizeof(real)); 371 | FILE *fi = fopen(train_file, "rb"); 372 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 373 | while (1) { 374 | if (word_count - last_word_count > 10000) { 375 | word_count_actual += word_count - last_word_count; 376 | last_word_count = word_count; 377 | if ((debug_mode > 1)) { 378 | now=clock(); 379 | printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 380 | word_count_actual / (real)(iter * train_words + 1) * 100, 381 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 382 | fflush(stdout); 383 | } 384 | alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 385 | if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 386 | } 387 | if (sentence_length == 0) { 388 | while (1) { 389 | word = ReadWordIndex(fi); 390 | if (feof(fi)) break; 391 | if (word == -1) continue; 392 | word_count++; 393 | if (word == 0) break; 394 | // The subsampling randomly discards frequent words while keeping the ranking same 395 | if (sample > 0) { 396 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 397 | next_random = next_random * (unsigned long long)25214903917 + 11; 398 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 399 | } 400 | sen[sentence_length] = word; 401 | sentence_length++; 402 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 403 | } 404 | sentence_position = 0; 405 | } 406 | if (feof(fi) || (word_count > train_words / num_threads)) { 407 | word_count_actual += word_count - last_word_count; 408 | local_iter--; 409 | if (local_iter == 0) break; 410 | word_count = 0; 411 | last_word_count = 0; 412 | sentence_length = 0; 413 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 414 | continue; 415 | } 416 | word = sen[sentence_position]; 417 | if (word == -1) continue; 418 | for (c = 0; c < layer1_size; c++) neu1[c] = 0; 419 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 420 | next_random = next_random * (unsigned long long)25214903917 + 11; 421 | b = next_random % window; 422 | if (cbow) { //train the cbow architecture 423 | // in -> hidden 424 | cw = 0; 425 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 426 | c = 
422 | if (cbow) { //train the cbow architecture
423 | // in -> hidden
424 | cw = 0;
425 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
426 | c = sentence_position - window + a;
427 | if (c < 0) continue;
428 | if (c >= sentence_length) continue;
429 | last_word = sen[c];
430 | if (last_word == -1) continue;
431 | for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
432 | cw++;
433 | }
434 | if (cw) {
435 | for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
436 | if (hs) for (d = 0; d < vocab[word].codelen; d++) {
437 | f = 0;
438 | l2 = vocab[word].point[d] * layer1_size;
439 | // Propagate hidden -> output
440 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
441 | if (f <= -MAX_EXP) continue;
442 | else if (f >= MAX_EXP) continue;
443 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
444 | // 'g' is the gradient multiplied by the learning rate
445 | g = (1 - vocab[word].code[d] - f) * alpha;
446 | // Propagate errors output -> hidden
447 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
448 | // Learn weights hidden -> output
449 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
450 | }
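// Negative sampling below trains one logistic regression per target: the
// true center word gets label 1 and `negative` draws from the unigram
// table get label 0; g = (label - sigmoid(f)) * alpha, with the sigmoid
// read from the precomputed expTable and saturated beyond +/-MAX_EXP.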
451 | // NEGATIVE SAMPLING
452 | if (negative > 0) for (d = 0; d < negative + 1; d++) {
453 | if (d == 0) {
454 | target = word;
455 | label = 1;
456 | } else {
457 | next_random = next_random * (unsigned long long)25214903917 + 11;
458 | target = table[(next_random >> 16) % table_size];
459 | if (target == 0) target = next_random % (vocab_size - 1) + 1;
460 | if (target == word) continue;
461 | label = 0;
462 | }
463 | l2 = target * layer1_size;
464 | f = 0;
465 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
466 | if (f > MAX_EXP) g = (label - 1) * alpha;
467 | else if (f < -MAX_EXP) g = (label - 0) * alpha;
468 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
469 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
470 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
471 | }
472 | // hidden -> in
473 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
474 | c = sentence_position - window + a;
475 | if (c < 0) continue;
476 | if (c >= sentence_length) continue;
477 | last_word = sen[c];
478 | if (last_word == -1) continue;
479 | for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
480 | }
481 | }
482 | } else { //train skip-gram
483 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
484 | c = sentence_position - window + a;
485 | if (c < 0) continue;
486 | if (c >= sentence_length) continue;
487 | last_word = sen[c];
488 | if (last_word == -1) continue;
489 | l1 = last_word * layer1_size;
490 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
491 | // HIERARCHICAL SOFTMAX
492 | if (hs) for (d = 0; d < vocab[word].codelen; d++) {
493 | f = 0;
494 | l2 = vocab[word].point[d] * layer1_size;
495 | // Propagate hidden -> output
496 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
497 | if (f <= -MAX_EXP) continue;
498 | else if (f >= MAX_EXP) continue;
499 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
500 | // 'g' is the gradient multiplied by the learning rate
501 | g = (1 - vocab[word].code[d] - f) * alpha;
502 | // Propagate errors output -> hidden
503 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
504 | // Learn weights hidden -> output
505 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
506 | }
507 | // NEGATIVE SAMPLING
508 | if (negative > 0) for (d = 0; d < negative + 1; d++) {
509 | if (d == 0) {
510 | target = word;
511 | label = 1;
512 | } else {
513 | next_random = next_random * (unsigned long long)25214903917 + 11;
514 | target = table[(next_random >> 16) % table_size];
515 | if (target == 0) target = next_random % (vocab_size - 1) + 1;
516 | if (target == word) continue;
517 | label = 0;
518 | }
519 | l2 = target * layer1_size;
520 | f = 0;
521 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
522 | if (f > MAX_EXP) g = (label - 1) * alpha;
523 | else if (f < -MAX_EXP) g = (label - 0) * alpha;
524 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
525 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
526 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
527 | }
528 | // Learn weights input -> hidden
529 | for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
530 | }
531 | }
532 | sentence_position++;
533 | if (sentence_position >= sentence_length) {
534 | sentence_length = 0;
535 | continue;
536 | }
537 | }
538 | fclose(fi);
539 | free(neu1);
540 | free(neu1e);
541 | pthread_exit(NULL);
542 | }
543 | 
544 | void TrainModel() {
545 | long a, b, c, d;
546 | FILE *fo;
547 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
548 | printf("Starting training using file %s\n", train_file);
549 | starting_alpha = alpha;
550 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
551 | if (save_vocab_file[0] != 0) SaveVocab();
552 | if (output_file[0] == 0) return;
553 | InitNet();
554 | if (negative > 0) InitUnigramTable();
555 | start = clock();
556 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
557 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
558 | fo = fopen(output_file, "wb");
559 | if (classes == 0) {
560 | // Save the word vectors
561 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
562 | for (a = 0; a < vocab_size; a++) {
563 | fprintf(fo, "%s ", vocab[a].word);
564 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
565 | else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
566 | fprintf(fo, "\n");
567 | }
568 | } else {
569 | // Run K-means on the word vectors
570 | int clcn = classes, iter = 10, closeid;
571 | int *centcn = (int *)malloc(classes * sizeof(int));
572 | int *cl = (int *)calloc(vocab_size, sizeof(int));
573 | real closev, x;
574 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
575 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
576 | for (a = 0; a < iter; a++) {
577 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
578 | for (b = 0; b < clcn; b++) centcn[b] = 1;
579 | for (c = 0; c < vocab_size; c++) {
580 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
581 | centcn[cl[c]]++;
582 | }
583 | for (b = 0; b < clcn; b++) {
584 | closev = 0;
585 | for (c = 0; c < layer1_size; c++) {
586 | cent[layer1_size * b + c] /= centcn[b];
587 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
588 | }
589 | closev = sqrt(closev);
590 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
591 | }
592 | for (c = 0; c < vocab_size; c++) {
593 | closev = -10;
594 | closeid = 0;
595 | for (d = 0; d < clcn; d++) {
596 | x = 0;
597 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
598 | if (x > closev) {
599 | closev = x;
600 | closeid = d;
601 | }
602 | }
603 | cl[c] = closeid;
604 | }
605 | }
606 | // Save the K-means classes
607 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
608 | free(centcn);
609 | free(cent);
610 | free(cl);
611 | }
612 | fclose(fo);
613 | }
614 | 
615 | int ArgPos(char *str, int argc, char **argv) {
616 | int a;
617 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
618 | if (a == argc - 1) {
619 | printf("Argument missing for %s\n", str);
620 | exit(1);
621 | }
622 | return a;
623 | }
624 | return -1;
625 | }
626 | 
627 | int main(int argc, char **argv) {
628 | int i;
629 | if (argc == 1) {
630 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
631 | printf("Options:\n");
632 | printf("Parameters for training:\n");
633 | printf("\t-train <file>\n");
634 | printf("\t\tUse text data from <file> to train the model\n");
635 | printf("\t-output <file>\n");
636 | printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
637 | printf("\t-size <int>\n");
638 | printf("\t\tSet size of word vectors; default is 100\n");
639 | printf("\t-window <int>\n");
640 | printf("\t\tSet max skip length between words; default is 5\n");
641 | printf("\t-sample <float>\n");
642 | printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
643 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
644 | printf("\t-hs <int>\n");
645 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
646 | printf("\t-negative <int>\n");
647 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
648 | printf("\t-threads <int>\n");
649 | printf("\t\tUse <int> threads (default 12)\n");
650 | printf("\t-iter <int>\n");
651 | printf("\t\tRun more training iterations (default 5)\n");
652 | printf("\t-min-count <int>\n");
653 | printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
654 | printf("\t-alpha <float>\n");
655 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
656 | printf("\t-classes <int>\n");
657 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
658 | printf("\t-debug <int>\n");
659 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
660 | printf("\t-binary <int>\n");
661 | printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
662 | printf("\t-save-vocab <file>\n");
663 | printf("\t\tThe vocabulary will be saved to <file>\n");
664 | printf("\t-read-vocab <file>\n");
665 | printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
666 | printf("\t-cbow <int>\n");
667 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n");
668 | printf("\nExamples:\n");
669 | printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n");
670 | return 0;
671 | }
672 | output_file[0] = 0;
673 | save_vocab_file[0] = 0;
674 | read_vocab_file[0] = 0;
675 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
676 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
677 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
678 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
679 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
680 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 681 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 682 | if (cbow) alpha = 0.05; 683 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 684 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 685 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 686 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 687 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 688 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 689 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 690 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 691 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 692 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 693 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 694 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 695 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 696 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 697 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 698 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 699 | } 700 | TrainModel(); 701 | return 0; 702 | } 703 | --------------------------------------------------------------------------------