├── README.md ├── embedding ├── cw.c ├── glove.c ├── lbl.c ├── nnlm.c ├── order.c └── word2vec.c └── evaluation ├── avg ├── README.md ├── avg.py ├── avg_embedding.cpp ├── imdb_test.txt └── imdb_train.txt ├── cnn ├── README.md ├── cnn.py ├── cnn_senna.cpp ├── fileutil.hpp ├── makefile ├── tree_dev.txt ├── tree_test.txt └── tree_train.txt ├── ner ├── README.md ├── default.config ├── ner.jar └── ner.py ├── pos ├── README.md ├── fileutil.hpp ├── makefile ├── pos.py ├── pos_test.txt ├── pos_train.txt ├── pos_valid.txt └── sennaseg.cpp ├── syn_sem ├── README.md ├── compute-accuracy-txt.c ├── king.py └── questions-words.txt ├── tfl ├── README.md ├── toefl.cpp ├── toefl.py └── toefl.txt └── ws ├── README.md ├── ws.cpp ├── ws.py ├── ws353.txt ├── ws353_relatedness.txt └── ws353_similarity.txt /README.md: -------------------------------------------------------------------------------- 1 | # compare 2 | 3 | This is the source code of [How to Generate a Good Word Embedding?](http://arxiv.org/abs/1507.05523). 4 | 5 | Folder **embedding** contains all embedding algorithms we used in this paper. 6 | 7 | Folder **evaluation** contains all evaluation tasks in the paper. 8 | 9 | The Chinese version of Introduction is available at [《How to Generate a Good Word Embedding?》导读](http://licstar.net/archives/620). 10 | -------------------------------------------------------------------------------- /embedding/glove.c: -------------------------------------------------------------------------------- 1 | // GloVe: Global Vectors for Word Representation 2 | // 3 | // Copyright (c) 2014 The Board of Trustees of 4 | // The Leland Stanford Junior University. All Rights Reserved. 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | // 18 | // 19 | // For more information, bug reports, fixes, contact: 20 | // Jeffrey Pennington (jpennin@stanford.edu) 21 | // GlobalVectors@googlegroups.com 22 | // http://www-nlp.stanford.edu/projects/glove/ 23 | 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #define _FILE_OFFSET_BITS 64 32 | #define MAX_STRING_LENGTH 1000 33 | 34 | typedef double real; 35 | 36 | typedef struct cooccur_rec { 37 | int word1; 38 | int word2; 39 | real val; 40 | } CREC; 41 | 42 | int verbose = 2; // 0, 1, or 2 43 | int num_threads = 8; // pthreads 44 | int num_iter = 25; // Number of full passes through cooccurrence matrix 45 | int vector_size = 50; // Word vector size 46 | int save_gradsq = 0; // By default don't save squared gradient values 47 | int use_binary = 1; // 0: save as text files; 1: save as binary; 2: both. For binary, save both word and context word vectors. 48 | int model = 2; // For text file output only. 0: concatenate word and context vectors (and biases) i.e. 
save everything; 1: Just save word vectors (no bias); 2: Save (word + context word) vectors (no biases) 49 | real eta = 0.05; // Initial learning rate 50 | real alpha = 0.75, x_max = 100.0; // Weighting function parameters, not extremely sensitive to corpus, though may need adjustment for very small or very large corpora 51 | real *W, *gradsq, *cost, *cost_valid; 52 | double train_percentage = 0.95; 53 | long long num_lines, *lines_per_thread, vocab_size; 54 | char *vocab_file, *input_file, *save_W_file, *save_gradsq_file; 55 | long long *word_cnt, train_words; 56 | real sample = 1e-4; 57 | 58 | /* Efficient string comparison */ 59 | int scmp( char *s1, char *s2 ) { 60 | while(*s1 != '\0' && *s1 == *s2) {s1++; s2++;} 61 | return(*s1 - *s2); 62 | } 63 | 64 | void initialize_parameters() { 65 | long long a, b; 66 | vector_size++; // Temporarily increment to allocate space for bias 67 | 68 | /* Allocate space for word vectors and context word vectors, and correspodning gradsq */ 69 | a = posix_memalign((void **)&W, 128, 2 * vocab_size * vector_size * sizeof(real)); // Might perform better than malloc 70 | if (W == NULL) { 71 | fprintf(stderr, "Error allocating memory for W\n"); 72 | exit(1); 73 | } 74 | a = posix_memalign((void **)&gradsq, 128, 2 * vocab_size * vector_size * sizeof(real)); // Might perform better than malloc 75 | if (gradsq == NULL) { 76 | fprintf(stderr, "Error allocating memory for gradsq\n"); 77 | exit(1); 78 | } 79 | for (b = 0; b < vector_size; b++) for (a = 0; a < 2 * vocab_size; a++) W[a * vector_size + b] = (rand() / (real)RAND_MAX - 0.5) / vector_size; 80 | for (b = 0; b < vector_size; b++) for (a = 0; a < 2 * vocab_size; a++) gradsq[a * vector_size + b] = 1.0; // So initial value of eta is equal to initial learning rate 81 | vector_size--; 82 | } 83 | 84 | /* Train the GloVe model */ 85 | void *glove_thread(void *vid) { 86 | long long a, b ,l1, l2; 87 | long long id = (long long) vid; 88 | long long thread_lines_train; 89 | long long thread_lines_total; 90 | CREC cr; 91 | real diff, fdiff, temp1, temp2; 92 | FILE *fin; 93 | fin = fopen(input_file, "rb"); 94 | fseeko(fin, (num_lines / num_threads * id) * (sizeof(CREC)), SEEK_SET); //Threads spaced roughly equally throughout file 95 | cost[id] = 0; 96 | cost_valid[id] = 0; 97 | 98 | thread_lines_train = (long long)(lines_per_thread[id] * train_percentage); 99 | thread_lines_total = lines_per_thread[id]; 100 | 101 | for(a = 0; a < thread_lines_train; a++) { 102 | fread(&cr, sizeof(CREC), 1, fin); 103 | if(feof(fin)) break; 104 | 105 | if (sample > 0) { 106 | real r = (sample * train_words) / word_cnt[cr.word1]; 107 | real keep = sqrt(r) + r; 108 | if(keep < 1) cr.val *= keep; 109 | } 110 | 111 | /* Get location of words in W & gradsq */ 112 | l1 = (cr.word1 - 1LL) * (vector_size + 1); // cr word indices start at 1 113 | l2 = ((cr.word2 - 1LL) + vocab_size) * (vector_size + 1); // shift by vocab_size to get separate vectors for context words 114 | 115 | /* Calculate cost, save diff for gradients */ 116 | diff = 0; 117 | for(b = 0; b < vector_size; b++) diff += W[b + l1] * W[b + l2]; // dot product of word and context word vector 118 | diff += W[vector_size + l1] + W[vector_size + l2] - log(cr.val); // add separate bias for each word 119 | fdiff = (cr.val > x_max) ? 
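/* weighting function f: f(x) = (x/x_max)^alpha for x < x_max and 1 otherwise,
   so very frequent co-occurrences are capped and rare ones are down-weighted */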
diff : pow(cr.val / x_max, alpha) * diff; // multiply weighting function (f) with diff 120 | //fdiff = diff; 121 | cost[id] += 0.5 * fdiff * diff; // weighted squared error 122 | 123 | /* Adaptive gradient updates */ 124 | fdiff *= eta; // for ease in calculating gradient 125 | for(b = 0; b < vector_size; b++) { 126 | // learning rate times gradient for word vectors 127 | temp1 = fdiff * W[b + l2]; 128 | temp2 = fdiff * W[b + l1]; 129 | // adaptive updates 130 | W[b + l1] -= temp1 / sqrt(gradsq[b + l1]); 131 | W[b + l2] -= temp2 / sqrt(gradsq[b + l2]); 132 | gradsq[b + l1] += temp1 * temp1; 133 | gradsq[b + l2] += temp2 * temp2; 134 | } 135 | // updates for bias terms 136 | W[vector_size + l1] -= fdiff / sqrt(gradsq[vector_size + l1]); 137 | W[vector_size + l2] -= fdiff / sqrt(gradsq[vector_size + l2]); 138 | fdiff *= fdiff; 139 | gradsq[vector_size + l1] += fdiff; 140 | gradsq[vector_size + l2] += fdiff; 141 | 142 | } 143 | 144 | 145 | for(a = thread_lines_train; a < thread_lines_total; a++) { 146 | fread(&cr, sizeof(CREC), 1, fin); 147 | if(feof(fin)) break; 148 | 149 | if (sample > 0) { 150 | real r = (sample * train_words) / word_cnt[cr.word1]; 151 | real keep = sqrt(r) + r; 152 | if(keep < 1) cr.val *= keep; 153 | } 154 | 155 | /* Get location of words in W & gradsq */ 156 | l1 = (cr.word1 - 1LL) * (vector_size + 1); // cr word indices start at 1 157 | l2 = ((cr.word2 - 1LL) + vocab_size) * (vector_size + 1); // shift by vocab_size to get separate vectors for context words 158 | 159 | /* Calculate cost, save diff for gradients */ 160 | diff = 0; 161 | for(b = 0; b < vector_size; b++) diff += W[b + l1] * W[b + l2]; // dot product of word and context word vector 162 | diff += W[vector_size + l1] + W[vector_size + l2] - log(cr.val); // add separate bias for each word 163 | fdiff = (cr.val > x_max) ? 
diff : pow(cr.val / x_max, alpha) * diff; // multiply weighting function (f) with diff 164 | //fdiff = diff; 165 | cost_valid[id] += 0.5 * fdiff * diff; // weighted squared error 166 | } 167 | 168 | fclose(fin); 169 | pthread_exit(NULL); 170 | } 171 | 172 | /* Save params to file */ 173 | int save_params() { 174 | long long a, b; 175 | char format[20]; 176 | char output_file[MAX_STRING_LENGTH], output_file_gsq[MAX_STRING_LENGTH]; 177 | char *word = malloc(sizeof(char) * MAX_STRING_LENGTH); 178 | FILE *fid, *fout, *fgs; 179 | 180 | if(use_binary > 0) { // Save parameters in binary file 181 | sprintf(output_file,"%s.bin",save_W_file); 182 | fout = fopen(output_file,"wb"); 183 | if(fout == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_W_file); return 1;} 184 | for(a = 0; a < 2 * (long long)vocab_size * (vector_size + 1); a++) fwrite(&W[a], sizeof(real), 1,fout); 185 | fclose(fout); 186 | if(save_gradsq > 0) { 187 | sprintf(output_file_gsq,"%s.bin",save_gradsq_file); 188 | fgs = fopen(output_file_gsq,"wb"); 189 | if(fgs == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_gradsq_file); return 1;} 190 | for(a = 0; a < 2 * (long long)vocab_size * (vector_size + 1); a++) fwrite(&gradsq[a], sizeof(real), 1,fgs); 191 | fclose(fgs); 192 | } 193 | } 194 | if(use_binary != 1) { // Save parameters in text file 195 | sprintf(output_file,"%s.txt",save_W_file); 196 | if(save_gradsq > 0) { 197 | sprintf(output_file_gsq,"%s.txt",save_gradsq_file); 198 | fgs = fopen(output_file_gsq,"wb"); 199 | if(fgs == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_gradsq_file); return 1;} 200 | } 201 | fout = fopen(output_file,"wb"); 202 | fprintf(fout, "%lld %d\n",vocab_size,vector_size); 203 | if(fout == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_W_file); return 1;} 204 | fid = fopen(vocab_file, "r"); 205 | sprintf(format,"%%%ds",MAX_STRING_LENGTH); 206 | if(fid == NULL) {fprintf(stderr, "Unable to open file %s.\n",vocab_file); return 1;} 207 | for(a = 0; a < vocab_size; a++) { 208 | if(fscanf(fid,format,word) == 0) return 1; 209 | fprintf(fout, "%s",word); 210 | if(model == 0) { // Save all parameters (including bias) 211 | for(b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b]); 212 | for(b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", W[(vocab_size + a) * (vector_size + 1) + b]); 213 | } 214 | if(model == 1) // Save only "word" vectors (without bias) 215 | for(b = 0; b < vector_size; b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b]); 216 | if(model == 2) // Save "word + context word" vectors (without bias) 217 | for(b = 0; b < vector_size; b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b] + W[(vocab_size + a) * (vector_size + 1) + b]); 218 | fprintf(fout,"\n"); 219 | if(save_gradsq > 0) { // Save gradsq 220 | fprintf(fgs, "%s",word); 221 | for(b = 0; b < (vector_size + 1); b++) fprintf(fgs," %lf", gradsq[a * (vector_size + 1) + b]); 222 | for(b = 0; b < (vector_size + 1); b++) fprintf(fgs," %lf", gradsq[(vocab_size + a) * (vector_size + 1) + b]); 223 | fprintf(fgs,"\n"); 224 | } 225 | if(fscanf(fid,format,word) == 0) return 1; // Eat irrelevant frequency entry 226 | } 227 | fclose(fid); 228 | fclose(fout); 229 | if(save_gradsq > 0) fclose(fgs); 230 | } 231 | return 0; 232 | } 233 | 234 | /* Train model */ 235 | int train_glove() { 236 | long long a, file_size; 237 | int b; 238 | char tfile[MAX_STRING_LENGTH]; 239 | FILE *fin; 240 | real total_cost = 0; 241 | real total_cost_valid = 0; 242 | fprintf(stderr, "TRAINING MODEL\n"); 
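/*
 * Each glove_thread pass above minimizes the weighted least-squares GloVe cost
 *     J = sum_ij 0.5 * f(X_ij) * (w_i . w~_j + b_i + b~_j - log X_ij)^2
 * with f(x) = (x/x_max)^alpha for x < x_max and 1 otherwise, using AdaGrad-style
 * per-parameter steps (schematically: W -= eta * grad / sqrt(gradsq); gradsq += grad^2,
 * with gradsq initialised to 1 so the first step uses the plain learning rate).
 * In this version the last (1 - train_percentage) of every thread's slice of the
 * co-occurrence file only accumulates cost_valid and never updates W, and when
 * sample > 0 co-occurrence values whose first word is very frequent are scaled
 * down (word2vec-style subsampling applied to cr.val).
 */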
243 | 244 | fin = fopen(input_file, "rb"); 245 | if(fin == NULL) {fprintf(stderr,"Unable to open cooccurrence file %s.\n",input_file); return 1;} 246 | fseeko(fin, 0, SEEK_END); 247 | file_size = ftello(fin); 248 | num_lines = file_size/(sizeof(CREC)); // Assuming the file isn't corrupt and consists only of CREC's 249 | fclose(fin); 250 | fprintf(stderr,"Read %lld lines.\n", num_lines); 251 | if(verbose > 1) fprintf(stderr,"Initializing parameters..."); 252 | initialize_parameters(); 253 | if(verbose > 1) fprintf(stderr,"done.\n"); 254 | if(verbose > 0) fprintf(stderr,"vector size: %d\n", vector_size); 255 | if(verbose > 0) fprintf(stderr,"vocab size: %lld\n", vocab_size); 256 | if(verbose > 0) fprintf(stderr,"x_max: %lf\n", x_max); 257 | if(verbose > 0) fprintf(stderr,"alpha: %lf\n", alpha); 258 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 259 | lines_per_thread = (long long *) malloc(num_threads * sizeof(long long)); 260 | 261 | // Lock-free asynchronous SGD 262 | 263 | strcpy(tfile, save_W_file); 264 | 265 | for(b = 0; b < num_iter; b++) { 266 | total_cost_valid = total_cost = 0; 267 | for (a = 0; a < num_threads - 1; a++) lines_per_thread[a] = num_lines / num_threads; 268 | lines_per_thread[a] = num_lines / num_threads + num_lines % num_threads; 269 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, glove_thread, (void *)a); 270 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 271 | for (a = 0; a < num_threads; a++) total_cost += cost[a]; 272 | for (a = 0; a < num_threads; a++) total_cost_valid += cost_valid[a]; 273 | fprintf(stdout,"iter: %03d cost: %lf valid: %lf\n", b+1, 274 | total_cost/(num_lines*train_percentage), 275 | total_cost_valid/(num_lines*(1-train_percentage))); 276 | fflush(stdout); 277 | sprintf(save_W_file, "%s_%d", tfile, b+1); 278 | //if(b % 10 == 9) 279 | save_params(); 280 | } 281 | //strcpy(save_W_file, tfile); 282 | return save_params(); 283 | } 284 | 285 | int find_arg(char *str, int argc, char **argv) { 286 | int i; 287 | for (i = 1; i < argc; i++) { 288 | if(!scmp(str, argv[i])) { 289 | if (i == argc - 1) { 290 | printf("No argument given for %s\n", str); 291 | exit(1); 292 | } 293 | return i; 294 | } 295 | } 296 | return -1; 297 | } 298 | 299 | int main(int argc, char **argv) { 300 | int i, j = 0; 301 | FILE *fid; 302 | char format[20], str[MAX_STRING_LENGTH + 1]; 303 | long long id; 304 | vocab_file = malloc(sizeof(char) * MAX_STRING_LENGTH); 305 | input_file = malloc(sizeof(char) * MAX_STRING_LENGTH); 306 | save_W_file = malloc(sizeof(char) * MAX_STRING_LENGTH); 307 | save_gradsq_file = malloc(sizeof(char) * MAX_STRING_LENGTH); 308 | 309 | if (argc == 1) { 310 | printf("GloVe: Global Vectors for Word Representation, v0.2\n"); 311 | printf("Author: Jeffrey Pennington (jpennin@stanford.edu)\n\n"); 312 | printf("Usage options:\n"); 313 | printf("\t-verbose \n"); 314 | printf("\t\tSet verbosity: 0, 1, or 2 (default)\n"); 315 | printf("\t-vector-size \n"); 316 | printf("\t\tDimension of word vector representations (excluding bias term); default 50\n"); 317 | printf("\t-threads \n"); 318 | printf("\t\tNumber of threads; default 8\n"); 319 | printf("\t-iter \n"); 320 | printf("\t\tNumber of training iterations; default 25\n"); 321 | printf("\t-eta \n"); 322 | printf("\t\tInitial learning rate; default 0.05\n"); 323 | printf("\t-alpha \n"); 324 | printf("\t\tParameter in exponent of weighting function; default 0.75\n"); 325 | printf("\t-x-max \n"); 326 | printf("\t\tParameter specifying cutoff in weighting 
function; default 100.0\n"); 327 | printf("\t-binary \n"); 328 | printf("\t\tSave output in binary format (0: text, 1: binary, 2: both); default 0\n"); 329 | printf("\t-model \n"); 330 | printf("\t\tModel for word vector output (for text output only); default 2\n"); 331 | printf("\t\t 0: output all data, for both word and context word vectors, including bias terms\n"); 332 | printf("\t\t 1: output word vectors, excluding bias terms\n"); 333 | printf("\t\t 2: output word vectors + context word vectors, excluding bias terms\n"); 334 | printf("\t-input-file \n"); 335 | printf("\t\tBinary input file of shuffled cooccurrence data (produced by 'cooccur' and 'shuffle'); default cooccurrence.shuf.bin\n"); 336 | printf("\t-vocab-file \n"); 337 | printf("\t\tFile containing vocabulary (truncated unigram counts, produced by 'vocab_count'); default vocab.txt\n"); 338 | printf("\t-save-file \n"); 339 | printf("\t\tFilename, excluding extension, for word vector output; default vectors\n"); 340 | printf("\t-gradsq-file \n"); 341 | printf("\t\tFilename, excluding extension, for squared gradient output; default gradsq\n"); 342 | printf("\t-save-gradsq \n"); 343 | printf("\t\tSave accumulated squared gradients; default 0 (off); ignored if gradsq-file is specified\n"); 344 | printf("\nExample usage:\n"); 345 | printf("./glove -input-file cooccurrence.shuf.bin -vocab-file vocab.txt -save-file vectors -gradsq-file gradsq -verbose 2 -vector-size 100 -threads 16 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 2 -model 2\n\n"); 346 | return 0; 347 | } 348 | 349 | 350 | if ((i = find_arg((char *)"-verbose", argc, argv)) > 0) verbose = atoi(argv[i + 1]); 351 | if ((i = find_arg((char *)"-vector-size", argc, argv)) > 0) vector_size = atoi(argv[i + 1]); 352 | if ((i = find_arg((char *)"-iter", argc, argv)) > 0) num_iter = atoi(argv[i + 1]); 353 | if ((i = find_arg((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 354 | cost = malloc(sizeof(real) * num_threads); 355 | cost_valid = malloc(sizeof(real) * num_threads); 356 | if ((i = find_arg((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 357 | if ((i = find_arg((char *)"-x-max", argc, argv)) > 0) x_max = atof(argv[i + 1]); 358 | if ((i = find_arg((char *)"-eta", argc, argv)) > 0) eta = atof(argv[i + 1]); 359 | if ((i = find_arg((char *)"-binary", argc, argv)) > 0) use_binary = atoi(argv[i + 1]); 360 | if ((i = find_arg((char *)"-model", argc, argv)) > 0) model = atoi(argv[i + 1]); 361 | if(model != 0 && model != 1) model = 2; 362 | if ((i = find_arg((char *)"-save-gradsq", argc, argv)) > 0) save_gradsq = atoi(argv[i + 1]); 363 | if ((i = find_arg((char *)"-vocab-file", argc, argv)) > 0) strcpy(vocab_file, argv[i + 1]); 364 | else strcpy(vocab_file, (char *)"vocab.txt"); 365 | if ((i = find_arg((char *)"-save-file", argc, argv)) > 0) strcpy(save_W_file, argv[i + 1]); 366 | else strcpy(save_W_file, (char *)"vectors"); 367 | if ((i = find_arg((char *)"-gradsq-file", argc, argv)) > 0) { 368 | strcpy(save_gradsq_file, argv[i + 1]); 369 | save_gradsq = 1; 370 | } 371 | else if(save_gradsq > 0) strcpy(save_gradsq_file, (char *)"gradsq"); 372 | if ((i = find_arg((char *)"-input-file", argc, argv)) > 0) strcpy(input_file, argv[i + 1]); 373 | else strcpy(input_file, (char *)"cooccurrence.shuf.bin"); 374 | 375 | // count vocab_size 376 | vocab_size = 0; 377 | fid = fopen(vocab_file, "r"); 378 | if(fid == NULL) {fprintf(stderr, "Unable to open vocab file %s.\n",vocab_file); return 1;} 379 | while ((i = getc(fid)) != EOF) if (i == '\n') vocab_size++; 
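/*
 * vocab_file is expected to contain one "word count" pair per line (the truncated
 * unigram counts written by the 'vocab_count' tool), for example (illustrative):
 *     the 1061396
 *     of  593677
 * The line count determines vocab_size; the counts are then read into word_cnt[]
 * and summed into train_words for the subsampling of co-occurrence values.
 */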
// Count number of entries in vocab_file 380 | fclose(fid); 381 | 382 | train_words = 0; 383 | word_cnt = (long long *)malloc(sizeof(long long) * (vocab_size+1)); // frequency for each word 384 | fid = fopen(vocab_file, "r"); 385 | sprintf(format,"%%%ds %%lld", MAX_STRING_LENGTH); // Format to read from vocab file, which has frequency data 386 | while(fscanf(fid, format, str, &id) != EOF) { 387 | word_cnt[++j] = id; 388 | train_words += id; 389 | } 390 | fclose(fid); 391 | 392 | return train_glove(); 393 | } 394 | -------------------------------------------------------------------------------- /embedding/lbl.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define MAX_STRING 100 22 | #define EXP_TABLE_SIZE 1000 23 | #define MAX_EXP 6 24 | #define MAX_SENTENCE_LENGTH 1000 25 | #define MAX_CODE_LENGTH 40 26 | 27 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 28 | 29 | typedef float real; // Precision of float numbers 30 | 31 | struct vocab_word { 32 | long long cn; 33 | int *point; 34 | char *word, *code, codelen; 35 | }; 36 | 37 | char train_file[MAX_STRING], output_file[MAX_STRING]; 38 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 39 | struct vocab_word *vocab; 40 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1; 41 | int *vocab_hash; 42 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100, input_size = 500, hidden_size = 50; 43 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0; 44 | real alpha = 0.025, starting_alpha, sample = 1e-3; 45 | real *syn0, *syn1, *syn1neg, *expTable, *syn1neg_gdsq, *syn0_gdsq, *hidden, *hidden_gdsq; 46 | double *loss, *lossV, sum_loss, sum_lossV; 47 | long long *loss_cnt, *lossV_cnt, sum_loss_cnt, sum_lossV_cnt; 48 | 49 | clock_t start; 50 | 51 | int hs = 0, negative = 5; 52 | const int table_size = 1e8; 53 | int *table; 54 | 55 | void InitUnigramTable() { 56 | int a, i; 57 | long long train_words_pow = 0; 58 | real d1, power = 0.75; 59 | table = (int *)malloc(table_size * sizeof(int)); 60 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 61 | i = 0; 62 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow; 63 | for (a = 0; a < table_size; a++) { 64 | table[a] = i; 65 | if (a / (real)table_size > d1) { 66 | i++; 67 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow; 68 | } 69 | if (i >= vocab_size) i = vocab_size - 1; 70 | } 71 | } 72 | 73 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 74 | void ReadWord(char *word, FILE *fin) { 75 | int a = 0, ch; 76 | while (!feof(fin)) { 77 | ch = fgetc(fin); 78 | if (ch == 13) continue; 79 | if ((ch == ' ') || (ch == 
'\t') || (ch == '\n')) { 80 | if (a > 0) { 81 | if (ch == '\n') ungetc(ch, fin); 82 | break; 83 | } 84 | if (ch == '\n') { 85 | strcpy(word, (char *)""); 86 | return; 87 | } else continue; 88 | } 89 | word[a] = ch; 90 | a++; 91 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 92 | } 93 | word[a] = 0; 94 | } 95 | 96 | // Returns hash value of a word 97 | int GetWordHash(char *word) { 98 | unsigned long long a, hash = 0; 99 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 100 | hash = hash % vocab_hash_size; 101 | return hash; 102 | } 103 | 104 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 105 | int SearchVocab(char *word) { 106 | unsigned int hash = GetWordHash(word); 107 | while (1) { 108 | if (vocab_hash[hash] == -1) return -1; 109 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 110 | hash = (hash + 1) % vocab_hash_size; 111 | } 112 | return -1; 113 | } 114 | 115 | // Reads a word and returns its index in the vocabulary 116 | int ReadWordIndex(FILE *fin) { 117 | char word[MAX_STRING]; 118 | ReadWord(word, fin); 119 | if (feof(fin)) return -1; 120 | return SearchVocab(word); 121 | } 122 | 123 | // Adds a word to the vocabulary 124 | int AddWordToVocab(char *word) { 125 | unsigned int hash, length = strlen(word) + 1; 126 | if (length > MAX_STRING) length = MAX_STRING; 127 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 128 | strcpy(vocab[vocab_size].word, word); 129 | vocab[vocab_size].cn = 0; 130 | vocab_size++; 131 | // Reallocate memory if needed 132 | if (vocab_size + 2 >= vocab_max_size) { 133 | vocab_max_size += 1000; 134 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 135 | } 136 | hash = GetWordHash(word); 137 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 138 | vocab_hash[hash] = vocab_size - 1; 139 | return vocab_size - 1; 140 | } 141 | 142 | // Used later for sorting by word counts 143 | int VocabCompare(const void *a, const void *b) { 144 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 145 | } 146 | 147 | // Sorts the vocabulary by frequency using word counts 148 | void SortVocab() { 149 | int a, size; 150 | unsigned int hash; 151 | // Sort the vocabulary and keep at the first position 152 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 153 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 154 | size = vocab_size; 155 | train_words = 0; 156 | for (a = 0; a < size; a++) { 157 | // Words occuring less than min_count times will be discarded from the vocab 158 | if ((vocab[a].cn < min_count) && (a != 0)) { 159 | vocab_size--; 160 | free(vocab[a].word); 161 | } else { 162 | // Hash will be re-computed, as after the sorting it is not actual 163 | hash=GetWordHash(vocab[a].word); 164 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 165 | vocab_hash[hash] = a; 166 | train_words += vocab[a].cn; 167 | } 168 | } 169 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 170 | // Allocate memory for the binary tree construction 171 | for (a = 0; a < vocab_size; a++) { 172 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 173 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 174 | } 175 | } 176 | 177 | // Reduces the vocabulary by removing infrequent tokens 178 | void ReduceVocab() { 179 | int a, b = 0; 180 | unsigned int hash; 181 | for (a = 0; a < vocab_size; a++) if 
(vocab[a].cn > min_reduce) { 182 | vocab[b].cn = vocab[a].cn; 183 | vocab[b].word = vocab[a].word; 184 | b++; 185 | } else free(vocab[a].word); 186 | vocab_size = b; 187 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 188 | for (a = 0; a < vocab_size; a++) { 189 | // Hash will be re-computed, as it is not actual 190 | hash = GetWordHash(vocab[a].word); 191 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 192 | vocab_hash[hash] = a; 193 | } 194 | fflush(stdout); 195 | min_reduce++; 196 | } 197 | 198 | // Create binary Huffman tree using the word counts 199 | // Frequent words will have short uniqe binary codes 200 | void CreateBinaryTree() { 201 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 202 | char code[MAX_CODE_LENGTH]; 203 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 204 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 205 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 206 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 207 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 208 | pos1 = vocab_size - 1; 209 | pos2 = vocab_size; 210 | // Following algorithm constructs the Huffman tree by adding one node at a time 211 | for (a = 0; a < vocab_size - 1; a++) { 212 | // First, find two smallest nodes 'min1, min2' 213 | if (pos1 >= 0) { 214 | if (count[pos1] < count[pos2]) { 215 | min1i = pos1; 216 | pos1--; 217 | } else { 218 | min1i = pos2; 219 | pos2++; 220 | } 221 | } else { 222 | min1i = pos2; 223 | pos2++; 224 | } 225 | if (pos1 >= 0) { 226 | if (count[pos1] < count[pos2]) { 227 | min2i = pos1; 228 | pos1--; 229 | } else { 230 | min2i = pos2; 231 | pos2++; 232 | } 233 | } else { 234 | min2i = pos2; 235 | pos2++; 236 | } 237 | count[vocab_size + a] = count[min1i] + count[min2i]; 238 | parent_node[min1i] = vocab_size + a; 239 | parent_node[min2i] = vocab_size + a; 240 | binary[min2i] = 1; 241 | } 242 | // Now assign binary code to each vocabulary word 243 | for (a = 0; a < vocab_size; a++) { 244 | b = a; 245 | i = 0; 246 | while (1) { 247 | code[i] = binary[b]; 248 | point[i] = b; 249 | i++; 250 | b = parent_node[b]; 251 | if (b == vocab_size * 2 - 2) break; 252 | } 253 | vocab[a].codelen = i; 254 | vocab[a].point[0] = vocab_size - 2; 255 | for (b = 0; b < i; b++) { 256 | vocab[a].code[i - b - 1] = code[b]; 257 | vocab[a].point[i - b] = point[b] - vocab_size; 258 | } 259 | } 260 | free(count); 261 | free(binary); 262 | free(parent_node); 263 | } 264 | 265 | void LearnVocabFromTrainFile() { 266 | char word[MAX_STRING]; 267 | FILE *fin; 268 | long long a, i; 269 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 270 | fin = fopen(train_file, "rb"); 271 | if (fin == NULL) { 272 | printf("ERROR: training data file not found!\n"); 273 | exit(1); 274 | } 275 | vocab_size = 0; 276 | AddWordToVocab((char *)""); 277 | while (1) { 278 | ReadWord(word, fin); 279 | if (feof(fin)) break; 280 | train_words++; 281 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 282 | printf("%lldK%c", train_words / 1000, 13); 283 | fflush(stdout); 284 | } 285 | i = SearchVocab(word); 286 | if (i == -1) { 287 | a = AddWordToVocab(word); 288 | vocab[a].cn = 1; 289 | } else vocab[i].cn++; 290 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 291 | } 292 | SortVocab(); 293 | if (debug_mode > 0) { 294 | printf("Vocab size: %lld\n", vocab_size); 295 | printf("Words in train file: %lld\n", train_words); 296 | } 
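/*
 * The vocabulary is an open-addressing hash table with linear probing:
 * GetWordHash() maps a word to (sum hash*257 + c) mod vocab_hash_size, and
 * SearchVocab()/AddWordToVocab() probe forward until an empty (-1) slot or a
 * matching word is found.  Once vocab_size exceeds 70% of vocab_hash_size,
 * ReduceVocab() prunes words with count <= min_reduce and then increments
 * min_reduce, so each pruning pass becomes progressively more aggressive.
 */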
297 | file_size = ftell(fin); 298 | fclose(fin); 299 | } 300 | 301 | void SaveVocab() { 302 | long long i; 303 | FILE *fo = fopen(save_vocab_file, "wb"); 304 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 305 | fclose(fo); 306 | } 307 | 308 | void ReadVocab() { 309 | long long a, i = 0; 310 | char c; 311 | char word[MAX_STRING]; 312 | FILE *fin = fopen(read_vocab_file, "rb"); 313 | if (fin == NULL) { 314 | printf("Vocabulary file not found\n"); 315 | exit(1); 316 | } 317 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 318 | vocab_size = 0; 319 | while (1) { 320 | ReadWord(word, fin); 321 | if (feof(fin)) break; 322 | a = AddWordToVocab(word); 323 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); 324 | i++; 325 | } 326 | SortVocab(); 327 | if (debug_mode > 0) { 328 | printf("Vocab size: %lld\n", vocab_size); 329 | printf("Words in train file: %lld\n", train_words); 330 | } 331 | fin = fopen(train_file, "rb"); 332 | if (fin == NULL) { 333 | printf("ERROR: training data file not found!\n"); 334 | exit(1); 335 | } 336 | fseek(fin, 0, SEEK_END); 337 | file_size = ftell(fin); 338 | fclose(fin); 339 | } 340 | 341 | void InitNet() { 342 | long long a, b; 343 | unsigned long long next_random = 1; 344 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)); 345 | a = posix_memalign((void **)&syn0_gdsq, 128, (long long)vocab_size * layer1_size * sizeof(real)); 346 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} 347 | if (hs) { 348 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); 349 | if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} 350 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 351 | syn1[a * layer1_size + b] = 0; 352 | } 353 | if (negative>0) { 354 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * hidden_size * sizeof(real)); 355 | a = posix_memalign((void **)&syn1neg_gdsq, 128, (long long)vocab_size * hidden_size * sizeof(real)); 356 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 357 | for (a = 0; a < vocab_size; a++) for (b = 0; b < hidden_size; b++){ 358 | syn1neg[a * hidden_size + b] = 0; 359 | syn1neg_gdsq[a * hidden_size + b] = 1e-8; 360 | } 361 | a = posix_memalign((void **)&hidden, 128, (long long)input_size * hidden_size * sizeof(real)); 362 | a = posix_memalign((void **)&hidden_gdsq, 128, (long long)input_size * hidden_size * sizeof(real)); 363 | for (a = 0; a < input_size; a++) for (b = 0; b < hidden_size; b++) { 364 | next_random = next_random * (unsigned long long)25214903917 + 11; 365 | hidden[a * hidden_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / hidden_size; 366 | hidden_gdsq[a * hidden_size + b] = 1e-8; 367 | } 368 | } 369 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) { 370 | next_random = next_random * (unsigned long long)25214903917 + 11; 371 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 372 | syn0_gdsq[a * layer1_size + b] = 1e-8; 373 | } 374 | CreateBinaryTree(); 375 | } 376 | 377 | void writeWV(char *output_file){ 378 | long long a, b; 379 | FILE *fo = fopen(output_file, "wb"); 380 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 381 | for (a = 0; a < vocab_size; a++) { 382 | fprintf(fo, "%s ", vocab[a].word); 383 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 384 | else for (b = 0; b < 
layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 385 | fprintf(fo, "\n"); 386 | } 387 | fclose(fo); 388 | } 389 | 390 | typedef unsigned long uint64_t; 391 | typedef unsigned int uint32_t; 392 | 393 | double rsqrt64(double number) { 394 | uint64_t i; 395 | double x2, y; 396 | x2 = number * 0.5; 397 | y = number; 398 | i = *(uint64_t *) &y; 399 | i = 0x5fe6eb50c7b537a9 - (i >> 1); 400 | y = *(double *) &i; 401 | y = y * (1.5 - (x2 * y * y)); 402 | return y; 403 | } 404 | 405 | float rsqrt(float number){ 406 | uint32_t i; 407 | float x2, y; 408 | x2 = number * 0.5F; 409 | y = number; 410 | i = *(uint32_t *) &y; 411 | i = 0x5f3759df - ( i >> 1 ); 412 | y = *(float *) &i; 413 | y = y * ( 1.5F - ( x2 * y * y ) ); 414 | return y; 415 | } 416 | 417 | 418 | //b = Ax 419 | void fastmult(real *A, real *x, real *b, int xlen, int blen){ 420 | real val1, val2, val3, val4; 421 | real val5, val6, val7, val8; 422 | int i, j; 423 | for (i=0; i 10000) { 482 | loss[id] += err; 483 | lossV[id] += errV; 484 | loss_cnt[id] += err_cnt; 485 | lossV_cnt[id] += errV_cnt; 486 | 487 | sum_loss += err; 488 | sum_lossV += errV; 489 | sum_loss_cnt += err_cnt; 490 | sum_lossV_cnt += errV_cnt; 491 | 492 | err = errV = 0; 493 | err_cnt = errV_cnt = 0; 494 | word_count_actual += word_count - last_word_count; 495 | last_word_count = word_count; 496 | if ((debug_mode > 1) && id == 0) { 497 | now=clock(); 498 | printf("%cAlpha: %f Err: %lf ErrV: %lf Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 499 | -sum_loss / sum_loss_cnt / (negative + 1), 500 | -sum_lossV / sum_lossV_cnt / (negative + 1), 501 | word_count_actual / (real)(train_words + 1) * 100, 502 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 503 | fflush(stdout); 504 | } 505 | //alpha = starting_alpha; 506 | //alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 507 | //if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 508 | } 509 | if (sentence_length == 0) { 510 | while (1) { 511 | word = ReadWordIndex(fi); 512 | if (feof(fi)) break; 513 | if (word == -1) continue; 514 | word_count++; 515 | if (word == 0) break; 516 | // The subsampling randomly discards frequent words while keeping the ranking same 517 | if (sample > 0) { 518 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 519 | next_random = next_random * (unsigned long long)25214903917 + 11; 520 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 521 | } 522 | sen[sentence_length] = word; 523 | sentence_length++; 524 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 525 | } 526 | sentence_position = 0; 527 | } 528 | int validSet = 0; 529 | if(word_count > (int)(0.95 * train_words / num_threads)) 530 | validSet = 1; 531 | if (feof(fi) || (word_count > train_words / num_threads)) { 532 | word_count_actual += word_count - last_word_count; 533 | break; 534 | } 535 | word = sen[sentence_position]; 536 | if (word == -1) continue; 537 | //for (c = 0; c < input_size; c++) neu1[c] = 0; 538 | for (c = 0; c < input_size; c++) neu1e[c] = 0; 539 | for (c = 0; c < hidden_size; c++) neu2[c] = 0; 540 | for (c = 0; c < hidden_size; c++) neu2e[c] = 0; 541 | next_random = next_random * (unsigned long long)25214903917 + 11; 542 | //b = next_random % window; 543 | b = 0; 544 | if (cbow) { //train the cbow architecture 545 | // in -> hidden 546 | cw = 0; 547 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 548 | c = sentence_position - window + a; 
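/* Gather the 2*window context words: out-of-range positions and unknown words fall
   back to index 0 (the sentence-boundary token), and each context embedding is copied
   into slot cw of neu1, so neu1 becomes the concatenated context (input_size =
   2*window*layer1_size) that fastmult() projects through `hidden` into neu2.
   Note that sen[c] is read before the c < 0 / c >= sentence_length checks below;
   those checks only reset last_word to 0 afterwards. */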
549 | last_word = sen[c]; 550 | if (c < 0) last_word = 0; 551 | if (c >= sentence_length) last_word = 0; 552 | if (last_word == -1) last_word = 0; 553 | for (c = 0; c < layer1_size; c++) neu1[cw * layer1_size + c] = syn0[c + last_word * layer1_size]; 554 | cw++; 555 | } 556 | fastmult(hidden, neu1, neu2, input_size, hidden_size); 557 | //for(a = 0; a < hidden_size; a++) 558 | // neu2[a] = tanh(neu2[a]); 559 | if (cw == window * 2) { 560 | //for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 561 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 562 | f = 0; 563 | l2 = vocab[word].point[d] * layer1_size; 564 | // Propagate hidden -> output 565 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 566 | if (f <= -MAX_EXP) continue; 567 | else if (f >= MAX_EXP) continue; 568 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 569 | // 'g' is the gradient multiplied by the learning rate 570 | g = (1 - vocab[word].code[d] - f) * alpha; 571 | // Propagate errors output -> hidden 572 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 573 | // Learn weights hidden -> output 574 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 575 | } 576 | // NEGATIVE SAMPLING 577 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 578 | if (d == 0) { 579 | target = word; 580 | label = 1; 581 | } else { 582 | next_random = next_random * (unsigned long long)25214903917 + 11; 583 | target = table[(next_random >> 16) % table_size]; 584 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 585 | if (target == word) continue; 586 | label = 0; 587 | } 588 | l2 = target * hidden_size; 589 | f = 0; 590 | for (c = 0; c < hidden_size; c++) f += neu2[c] * syn1neg[c + l2]; 591 | if (f > MAX_EXP) g = (label - 1); 592 | else if (f < -MAX_EXP) g = (label - 0); 593 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 594 | 595 | if(label) f = -f; 596 | if(validSet) errV += log(1/(1+exp(f))); 597 | else err += log(1/(1+exp(f))); 598 | 599 | //for (c = 0; c < input_size; c++) neu1e[c] += g * syn1neg[c + l2]; 600 | //for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 601 | if(!validSet) for (c = 0; c < hidden_size; c++) { 602 | neu2e[c] += g * syn1neg[c + l2];// *(1 - neu2[c] * neu2[c]); 603 | real diff = g * neu2[c]; 604 | syn1neg_gdsq[c + l2] += diff * diff; 605 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 606 | // 607 | } 608 | } 609 | 610 | if(validSet) errV_cnt++; 611 | else err_cnt++; 612 | // hidden -> in 613 | if(!validSet) { 614 | long long i, j; 615 | for(i = 0; i < hidden_size; i++){ 616 | for(j = 0; j < input_size; j++){ 617 | neu1e[j] += neu2e[i] * hidden[i*input_size+j]; 618 | } 619 | } 620 | 621 | for(i = 0; i < hidden_size; i++){ 622 | for(j = 0; j < input_size; j++){ 623 | int t = i*input_size+j; 624 | real diff = neu1[j] * neu2e[i]; 625 | hidden_gdsq[t] += diff * diff; 626 | hidden[t] += alpha * diff * rsqrt64(hidden_gdsq[t]); 627 | } 628 | } 629 | cw = 0; 630 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 631 | c = sentence_position - window + a; 632 | last_word = sen[c]; 633 | if (c < 0) last_word = 0; 634 | if (c >= sentence_length) last_word = 0; 635 | if (last_word == -1) last_word = 0; 636 | //for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 637 | for (c = 0; c < layer1_size; c++) { 638 | real diff = neu1e[cw * layer1_size + c]; 639 | long long p = c + last_word * layer1_size; 640 | syn0_gdsq[p] += diff * diff; 641 
| syn0[p] += alpha * diff * rsqrt(syn0_gdsq[p]); 642 | } 643 | cw++; 644 | } 645 | } 646 | } 647 | } else { //train skip-gram 648 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 649 | c = sentence_position - window + a; 650 | if (c < 0) continue; 651 | if (c >= sentence_length) continue; 652 | last_word = sen[c]; 653 | if (last_word == -1) continue; 654 | l1 = last_word * layer1_size; 655 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 656 | // HIERARCHICAL SOFTMAX 657 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 658 | f = 0; 659 | l2 = vocab[word].point[d] * layer1_size; 660 | // Propagate hidden -> output 661 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; 662 | if (f <= -MAX_EXP) continue; 663 | else if (f >= MAX_EXP) continue; 664 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 665 | // 'g' is the gradient multiplied by the learning rate 666 | g = (1 - vocab[word].code[d] - f) * alpha; 667 | // Propagate errors output -> hidden 668 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 669 | // Learn weights hidden -> output 670 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; 671 | } 672 | // NEGATIVE SAMPLING 673 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 674 | if (d == 0) { 675 | target = word; 676 | label = 1; 677 | } else { 678 | next_random = next_random * (unsigned long long)25214903917 + 11; 679 | target = table[(next_random >> 16) % table_size]; 680 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 681 | if (target == word) continue; 682 | label = 0; 683 | } 684 | l2 = target * layer1_size; 685 | f = 0; 686 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 687 | if (f > MAX_EXP) g = (label - 1); 688 | else if (f < -MAX_EXP) g = (label - 0); 689 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 690 | 691 | if(label) f = -f; 692 | if(validSet) errV += log(1/(1+exp(f))); 693 | else err += log(1/(1+exp(f))); 694 | 695 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 696 | for (c = 0; c < layer1_size; c++) { 697 | real diff = g * syn0[c + l1]; 698 | syn1neg_gdsq[c + l2] += diff * diff; 699 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 700 | // 701 | } 702 | } 703 | // Learn weights input -> hidden 704 | for (c = 0; c < layer1_size; c++) { 705 | real diff = neu1e[c]; 706 | syn0_gdsq[c + l1] += diff * diff; 707 | syn0[c + l1] += alpha * diff * rsqrt(syn0_gdsq[c + l1]); 708 | } 709 | if(validSet) errV_cnt++; 710 | else err_cnt++; 711 | } 712 | 713 | } 714 | sentence_position++; 715 | if (sentence_position >= sentence_length) { 716 | sentence_length = 0; 717 | continue; 718 | } 719 | } 720 | fclose(fi); 721 | free(neu1); 722 | free(neu1e); 723 | pthread_exit(NULL); 724 | } 725 | 726 | void writeFile(const char *name, double *A, long long size){ 727 | FILE *fout = fopen(name, "wb"); 728 | fwrite(A, sizeof(real), size, fout); 729 | fclose(fout); 730 | } 731 | 732 | void dump(){ 733 | writeFile("syn0", syn0, vocab_size * layer1_size); 734 | writeFile("syn0_gdsq", syn0_gdsq, vocab_size * layer1_size); 735 | writeFile("syn1neg", syn1neg, vocab_size * hidden_size); 736 | writeFile("syn1neg_gdsq", syn1neg_gdsq, vocab_size * hidden_size); 737 | writeFile("hidden", hidden, input_size * hidden_size); 738 | writeFile("hidden_gdsq", hidden_gdsq, input_size * hidden_size); 739 | } 740 | 741 | void TrainModel() { 742 | long a, b, c, d; 743 | FILE *fo; 744 | char 
ffname[100]; 745 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 746 | loss = (double *)malloc(num_threads * sizeof(double)); 747 | lossV = (double *)malloc(num_threads * sizeof(double)); 748 | loss_cnt = (long long *)malloc(num_threads * sizeof(long long)); 749 | lossV_cnt = (long long *)malloc(num_threads * sizeof(long long)); 750 | 751 | printf("Starting training using file %s\n", train_file); 752 | starting_alpha = alpha; 753 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 754 | if (save_vocab_file[0] != 0) SaveVocab(); 755 | if (output_file[0] == 0) return; 756 | InitNet(); 757 | if (negative > 0) InitUnigramTable(); 758 | 759 | for(b = 1; b <= iter; b++){ 760 | start = clock(); 761 | word_count_actual = 0; 762 | for (a = 0; a < num_threads; a++){ 763 | loss[a] = lossV[a] = 0; 764 | loss_cnt[a] = lossV_cnt[a] = 0; 765 | } 766 | sum_loss = sum_lossV = 0; 767 | sum_loss_cnt = sum_lossV_cnt = 0; 768 | 769 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 770 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 771 | 772 | sprintf(ffname, "%s_%ld", output_file, b); 773 | writeWV(ffname); 774 | dump(); 775 | printf("%c", 13); 776 | 777 | sum_loss = sum_lossV = 0; 778 | sum_loss_cnt = sum_lossV_cnt = 0; 779 | for (a = 0; a < num_threads; a++){ 780 | sum_loss += loss[a]; 781 | sum_lossV += lossV[a]; 782 | sum_loss_cnt += loss_cnt[a]; 783 | sum_lossV_cnt += lossV_cnt[a]; 784 | } 785 | fprintf(stderr, "Iter: %ld Err: %lf ErrV: %lf\n", b, 786 | -sum_loss / sum_loss_cnt / (negative + 1), 787 | -sum_lossV / sum_lossV_cnt / (negative + 1)); 788 | fflush(stderr); 789 | } 790 | 791 | if (classes == 0) { 792 | // Save the word vectors 793 | //writeWV(output_file); 794 | } else { 795 | fo = fopen(output_file, "wb"); 796 | // Run K-means on the word vectors 797 | int clcn = classes, iter = 10, closeid; 798 | int *centcn = (int *)malloc(classes * sizeof(int)); 799 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 800 | real closev, x; 801 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 802 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 803 | for (a = 0; a < iter; a++) { 804 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 805 | for (b = 0; b < clcn; b++) centcn[b] = 1; 806 | for (c = 0; c < vocab_size; c++) { 807 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 808 | centcn[cl[c]]++; 809 | } 810 | for (b = 0; b < clcn; b++) { 811 | closev = 0; 812 | for (c = 0; c < layer1_size; c++) { 813 | cent[layer1_size * b + c] /= centcn[b]; 814 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 815 | } 816 | closev = sqrt(closev); 817 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 818 | } 819 | for (c = 0; c < vocab_size; c++) { 820 | closev = -10; 821 | closeid = 0; 822 | for (d = 0; d < clcn; d++) { 823 | x = 0; 824 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 825 | if (x > closev) { 826 | closev = x; 827 | closeid = d; 828 | } 829 | } 830 | cl[c] = closeid; 831 | } 832 | } 833 | // Save the K-means classes 834 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 835 | free(centcn); 836 | free(cent); 837 | free(cl); 838 | fclose(fo); 839 | } 840 | 841 | } 842 | 843 | int ArgPos(char *str, int argc, char **argv) { 844 | int a; 845 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 846 | if (a == argc - 
1) { 847 | printf("Argument missing for %s\n", str); 848 | exit(1); 849 | } 850 | return a; 851 | } 852 | return -1; 853 | } 854 | 855 | int main(int argc, char **argv) { 856 | int i; 857 | if (argc == 1) { 858 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 859 | printf("Options:\n"); 860 | printf("Parameters for training:\n"); 861 | printf("\t-train \n"); 862 | printf("\t\tUse text data from to train the model\n"); 863 | printf("\t-output \n"); 864 | printf("\t\tUse to save the resulting word vectors / word clusters\n"); 865 | printf("\t-size \n"); 866 | printf("\t\tSet size of word vectors; default is 100\n"); 867 | printf("\t-window \n"); 868 | printf("\t\tSet max skip length between words; default is 5\n"); 869 | printf("\t-sample \n"); 870 | printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n"); 871 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n"); 872 | printf("\t-hs \n"); 873 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 874 | printf("\t-negative \n"); 875 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n"); 876 | printf("\t-threads \n"); 877 | printf("\t\tUse threads (default 12)\n"); 878 | printf("\t-iter \n"); 879 | printf("\t\tRun more training iterations (default 5)\n"); 880 | printf("\t-min-count \n"); 881 | printf("\t\tThis will discard words that appear less than times; default is 5\n"); 882 | printf("\t-alpha \n"); 883 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n"); 884 | printf("\t-classes \n"); 885 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 886 | printf("\t-debug \n"); 887 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 888 | printf("\t-binary \n"); 889 | printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"); 890 | printf("\t-save-vocab \n"); 891 | printf("\t\tThe vocabulary will be saved to \n"); 892 | printf("\t-read-vocab \n"); 893 | printf("\t\tThe vocabulary will be read from , not constructed from the training data\n"); 894 | printf("\t-cbow \n"); 895 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n"); 896 | printf("\nExamples:\n"); 897 | printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 898 | return 0; 899 | } 900 | output_file[0] = 0; 901 | save_vocab_file[0] = 0; 902 | read_vocab_file[0] = 0; 903 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 904 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 905 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 906 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 907 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 908 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 909 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 910 | if (cbow) alpha = 0.05; 911 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 912 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 913 | if ((i = ArgPos((char *)"-window", 
argc, argv)) > 0) window = atoi(argv[i + 1]); 914 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 915 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 916 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 917 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 918 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 919 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 920 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 921 | input_size = layer1_size * window * 2; 922 | hidden_size = layer1_size; 923 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 924 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 925 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 926 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 927 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 928 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 929 | } 930 | TrainModel(); 931 | return 0; 932 | } 933 | -------------------------------------------------------------------------------- /embedding/nnlm.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
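/*
 * nnlm.c shares its vocabulary handling, unigram table and optimizer machinery with
 * lbl.c above: every weight matrix has a squared-gradient accumulator (*_gdsq,
 * initialised to 1e-8 in InitNet) and, as in lbl.c, updates are AdaGrad-style, with
 * 1/sqrt(x) approximated by the Quake-style bit hacks rsqrt()/rsqrt64().
 * Schematically, for a parameter w[p] with gradient g:
 *
 *     gdsq[p] += g * g;
 *     w[p]    += alpha * g * rsqrt(gdsq[p]);   // i.e. alpha * g / sqrt(sum of g^2)
 */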
14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define MAX_STRING 100 22 | #define EXP_TABLE_SIZE 1000 23 | #define MAX_EXP 6 24 | #define MAX_SENTENCE_LENGTH 1000 25 | #define MAX_CODE_LENGTH 40 26 | 27 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 28 | 29 | typedef float real; // Precision of float numbers 30 | 31 | struct vocab_word { 32 | long long cn; 33 | int *point; 34 | char *word, *code, codelen; 35 | }; 36 | 37 | char train_file[MAX_STRING], output_file[MAX_STRING]; 38 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 39 | struct vocab_word *vocab; 40 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1; 41 | int *vocab_hash; 42 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100, input_size = 500, hidden_size = 50; 43 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0; 44 | real alpha = 0.025, starting_alpha, sample = 1e-3; 45 | real *syn0, *syn1, *syn1neg, *expTable, *syn1neg_gdsq, *syn0_gdsq, *hidden, *hidden_gdsq; 46 | double *loss, *lossV, sum_loss, sum_lossV; 47 | long long *loss_cnt, *lossV_cnt, sum_loss_cnt, sum_lossV_cnt; 48 | 49 | clock_t start; 50 | 51 | int hs = 0, negative = 5; 52 | const int table_size = 1e8; 53 | int *table; 54 | 55 | void InitUnigramTable() { 56 | int a, i; 57 | long long train_words_pow = 0; 58 | real d1, power = 0.75; 59 | table = (int *)malloc(table_size * sizeof(int)); 60 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 61 | i = 0; 62 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow; 63 | for (a = 0; a < table_size; a++) { 64 | table[a] = i; 65 | if (a / (real)table_size > d1) { 66 | i++; 67 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow; 68 | } 69 | if (i >= vocab_size) i = vocab_size - 1; 70 | } 71 | } 72 | 73 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 74 | void ReadWord(char *word, FILE *fin) { 75 | int a = 0, ch; 76 | while (!feof(fin)) { 77 | ch = fgetc(fin); 78 | if (ch == 13) continue; 79 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 80 | if (a > 0) { 81 | if (ch == '\n') ungetc(ch, fin); 82 | break; 83 | } 84 | if (ch == '\n') { 85 | strcpy(word, (char *)""); 86 | return; 87 | } else continue; 88 | } 89 | word[a] = ch; 90 | a++; 91 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 92 | } 93 | word[a] = 0; 94 | } 95 | 96 | // Returns hash value of a word 97 | int GetWordHash(char *word) { 98 | unsigned long long a, hash = 0; 99 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 100 | hash = hash % vocab_hash_size; 101 | return hash; 102 | } 103 | 104 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 105 | int SearchVocab(char *word) { 106 | unsigned int hash = GetWordHash(word); 107 | while (1) { 108 | if (vocab_hash[hash] == -1) return -1; 109 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 110 | hash = (hash + 1) % vocab_hash_size; 111 | } 112 | return -1; 113 | } 114 | 115 | // Reads a word and returns its index in the vocabulary 116 | int ReadWordIndex(FILE *fin) { 117 | char word[MAX_STRING]; 118 | ReadWord(word, fin); 119 | if (feof(fin)) return -1; 120 | return SearchVocab(word); 121 | } 122 | 123 | // Adds a word to the vocabulary 124 | int AddWordToVocab(char *word) { 125 | unsigned int hash, length = strlen(word) + 1; 126 | if (length > 
MAX_STRING) length = MAX_STRING; 127 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 128 | strcpy(vocab[vocab_size].word, word); 129 | vocab[vocab_size].cn = 0; 130 | vocab_size++; 131 | // Reallocate memory if needed 132 | if (vocab_size + 2 >= vocab_max_size) { 133 | vocab_max_size += 1000; 134 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 135 | } 136 | hash = GetWordHash(word); 137 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 138 | vocab_hash[hash] = vocab_size - 1; 139 | return vocab_size - 1; 140 | } 141 | 142 | // Used later for sorting by word counts 143 | int VocabCompare(const void *a, const void *b) { 144 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 145 | } 146 | 147 | // Sorts the vocabulary by frequency using word counts 148 | void SortVocab() { 149 | int a, size; 150 | unsigned int hash; 151 | // Sort the vocabulary and keep at the first position 152 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 153 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 154 | size = vocab_size; 155 | train_words = 0; 156 | for (a = 0; a < size; a++) { 157 | // Words occuring less than min_count times will be discarded from the vocab 158 | if ((vocab[a].cn < min_count) && (a != 0)) { 159 | vocab_size--; 160 | free(vocab[a].word); 161 | } else { 162 | // Hash will be re-computed, as after the sorting it is not actual 163 | hash=GetWordHash(vocab[a].word); 164 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 165 | vocab_hash[hash] = a; 166 | train_words += vocab[a].cn; 167 | } 168 | } 169 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 170 | // Allocate memory for the binary tree construction 171 | for (a = 0; a < vocab_size; a++) { 172 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 173 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 174 | } 175 | } 176 | 177 | // Reduces the vocabulary by removing infrequent tokens 178 | void ReduceVocab() { 179 | int a, b = 0; 180 | unsigned int hash; 181 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 182 | vocab[b].cn = vocab[a].cn; 183 | vocab[b].word = vocab[a].word; 184 | b++; 185 | } else free(vocab[a].word); 186 | vocab_size = b; 187 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 188 | for (a = 0; a < vocab_size; a++) { 189 | // Hash will be re-computed, as it is not actual 190 | hash = GetWordHash(vocab[a].word); 191 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 192 | vocab_hash[hash] = a; 193 | } 194 | fflush(stdout); 195 | min_reduce++; 196 | } 197 | 198 | // Create binary Huffman tree using the word counts 199 | // Frequent words will have short uniqe binary codes 200 | void CreateBinaryTree() { 201 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 202 | char code[MAX_CODE_LENGTH]; 203 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 204 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 205 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 206 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 207 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 208 | pos1 = vocab_size - 1; 209 | pos2 = vocab_size; 210 | // Following algorithm constructs the Huffman tree by adding one node at a time 211 | for (a = 0; a < 
vocab_size - 1; a++) { 212 | // First, find two smallest nodes 'min1, min2' 213 | if (pos1 >= 0) { 214 | if (count[pos1] < count[pos2]) { 215 | min1i = pos1; 216 | pos1--; 217 | } else { 218 | min1i = pos2; 219 | pos2++; 220 | } 221 | } else { 222 | min1i = pos2; 223 | pos2++; 224 | } 225 | if (pos1 >= 0) { 226 | if (count[pos1] < count[pos2]) { 227 | min2i = pos1; 228 | pos1--; 229 | } else { 230 | min2i = pos2; 231 | pos2++; 232 | } 233 | } else { 234 | min2i = pos2; 235 | pos2++; 236 | } 237 | count[vocab_size + a] = count[min1i] + count[min2i]; 238 | parent_node[min1i] = vocab_size + a; 239 | parent_node[min2i] = vocab_size + a; 240 | binary[min2i] = 1; 241 | } 242 | // Now assign binary code to each vocabulary word 243 | for (a = 0; a < vocab_size; a++) { 244 | b = a; 245 | i = 0; 246 | while (1) { 247 | code[i] = binary[b]; 248 | point[i] = b; 249 | i++; 250 | b = parent_node[b]; 251 | if (b == vocab_size * 2 - 2) break; 252 | } 253 | vocab[a].codelen = i; 254 | vocab[a].point[0] = vocab_size - 2; 255 | for (b = 0; b < i; b++) { 256 | vocab[a].code[i - b - 1] = code[b]; 257 | vocab[a].point[i - b] = point[b] - vocab_size; 258 | } 259 | } 260 | free(count); 261 | free(binary); 262 | free(parent_node); 263 | } 264 | 265 | void LearnVocabFromTrainFile() { 266 | char word[MAX_STRING]; 267 | FILE *fin; 268 | long long a, i; 269 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 270 | fin = fopen(train_file, "rb"); 271 | if (fin == NULL) { 272 | printf("ERROR: training data file not found!\n"); 273 | exit(1); 274 | } 275 | vocab_size = 0; 276 | AddWordToVocab((char *)""); 277 | while (1) { 278 | ReadWord(word, fin); 279 | if (feof(fin)) break; 280 | train_words++; 281 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 282 | printf("%lldK%c", train_words / 1000, 13); 283 | fflush(stdout); 284 | } 285 | i = SearchVocab(word); 286 | if (i == -1) { 287 | a = AddWordToVocab(word); 288 | vocab[a].cn = 1; 289 | } else vocab[i].cn++; 290 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 291 | } 292 | SortVocab(); 293 | if (debug_mode > 0) { 294 | printf("Vocab size: %lld\n", vocab_size); 295 | printf("Words in train file: %lld\n", train_words); 296 | } 297 | file_size = ftell(fin); 298 | fclose(fin); 299 | } 300 | 301 | void SaveVocab() { 302 | long long i; 303 | FILE *fo = fopen(save_vocab_file, "wb"); 304 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 305 | fclose(fo); 306 | } 307 | 308 | void ReadVocab() { 309 | long long a, i = 0; 310 | char c; 311 | char word[MAX_STRING]; 312 | FILE *fin = fopen(read_vocab_file, "rb"); 313 | if (fin == NULL) { 314 | printf("Vocabulary file not found\n"); 315 | exit(1); 316 | } 317 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 318 | vocab_size = 0; 319 | while (1) { 320 | ReadWord(word, fin); 321 | if (feof(fin)) break; 322 | a = AddWordToVocab(word); 323 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); 324 | i++; 325 | } 326 | SortVocab(); 327 | if (debug_mode > 0) { 328 | printf("Vocab size: %lld\n", vocab_size); 329 | printf("Words in train file: %lld\n", train_words); 330 | } 331 | fin = fopen(train_file, "rb"); 332 | if (fin == NULL) { 333 | printf("ERROR: training data file not found!\n"); 334 | exit(1); 335 | } 336 | fseek(fin, 0, SEEK_END); 337 | file_size = ftell(fin); 338 | fclose(fin); 339 | } 340 | 341 | void InitNet() { 342 | long long a, b; 343 | unsigned long long next_random = 1; 344 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * 
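/* InitNet pairs every weight matrix with a squared-gradient accumulator (*_gdsq)
   initialised to 1e-8, i.e. the AdaGrad scheme used throughout the training code.
   A sketch of the per-weight step (alpha is the global learning rate, rsqrt an
   approximate 1/sqrt defined further down in this file):
     gdsq[i] += grad * grad;
     w[i]    += alpha * grad * rsqrt(gdsq[i]);
   The word vectors themselves are seeded uniformly in [-0.5, 0.5] / layer1_size using
   the same 25214903917 linear congruential generator as the rest of the file. */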
sizeof(real)); 345 | a = posix_memalign((void **)&syn0_gdsq, 128, (long long)vocab_size * layer1_size * sizeof(real)); 346 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} 347 | if (hs) { 348 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); 349 | if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} 350 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 351 | syn1[a * layer1_size + b] = 0; 352 | } 353 | if (negative>0) { 354 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * hidden_size * sizeof(real)); 355 | a = posix_memalign((void **)&syn1neg_gdsq, 128, (long long)vocab_size * hidden_size * sizeof(real)); 356 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 357 | for (a = 0; a < vocab_size; a++) for (b = 0; b < hidden_size; b++){ 358 | syn1neg[a * hidden_size + b] = 0; 359 | syn1neg_gdsq[a * hidden_size + b] = 1e-8; 360 | } 361 | a = posix_memalign((void **)&hidden, 128, (long long)input_size * hidden_size * sizeof(real)); 362 | a = posix_memalign((void **)&hidden_gdsq, 128, (long long)input_size * hidden_size * sizeof(real)); 363 | for (a = 0; a < input_size; a++) for (b = 0; b < hidden_size; b++) { 364 | next_random = next_random * (unsigned long long)25214903917 + 11; 365 | hidden[a * hidden_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / hidden_size; 366 | hidden_gdsq[a * hidden_size + b] = 1e-8; 367 | } 368 | } 369 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) { 370 | next_random = next_random * (unsigned long long)25214903917 + 11; 371 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 372 | syn0_gdsq[a * layer1_size + b] = 1e-8; 373 | } 374 | CreateBinaryTree(); 375 | } 376 | 377 | void writeWV(char *output_file){ 378 | long long a, b; 379 | FILE *fo = fopen(output_file, "wb"); 380 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 381 | for (a = 0; a < vocab_size; a++) { 382 | fprintf(fo, "%s ", vocab[a].word); 383 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 384 | else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 385 | fprintf(fo, "\n"); 386 | } 387 | fclose(fo); 388 | } 389 | 390 | typedef unsigned long uint64_t; 391 | typedef unsigned int uint32_t; 392 | 393 | double rsqrt64(double number) { 394 | uint64_t i; 395 | double x2, y; 396 | x2 = number * 0.5; 397 | y = number; 398 | i = *(uint64_t *) &y; 399 | i = 0x5fe6eb50c7b537a9 - (i >> 1); 400 | y = *(double *) &i; 401 | y = y * (1.5 - (x2 * y * y)); 402 | return y; 403 | } 404 | 405 | float rsqrt(float number){ 406 | uint32_t i; 407 | float x2, y; 408 | x2 = number * 0.5F; 409 | y = number; 410 | i = *(uint32_t *) &y; 411 | i = 0x5f3759df - ( i >> 1 ); 412 | y = *(float *) &i; 413 | y = y * ( 1.5F - ( x2 * y * y ) ); 414 | return y; 415 | } 416 | 417 | 418 | //b = Ax 419 | void fastmult(real *A, real *x, real *b, int xlen, int blen){ 420 | real val1, val2, val3, val4; 421 | real val5, val6, val7, val8; 422 | int i, j; 423 | for (i=0; i 10000) { 482 | loss[id] += err; 483 | lossV[id] += errV; 484 | loss_cnt[id] += err_cnt; 485 | lossV_cnt[id] += errV_cnt; 486 | 487 | sum_loss += err; 488 | sum_lossV += errV; 489 | sum_loss_cnt += err_cnt; 490 | sum_lossV_cnt += errV_cnt; 491 | 492 | err = errV = 0; 493 | err_cnt = errV_cnt = 0; 494 | word_count_actual += word_count - last_word_count; 495 | last_word_count = 
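/* rsqrt/rsqrt64 above are the classic bit-level fast inverse square root (magic
   constants 0x5f3759df and 0x5fe6eb50c7b537a9 plus one Newton-Raphson refinement),
   accurate to roughly 0.2% relative error; they stand in for 1/sqrt(x) in the AdaGrad
   updates below to keep the per-weight step cheap. fastmult computes the matrix-vector
   product b = A x used for the hidden layer (neu2 = hidden * neu1). */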
word_count; 496 | if ((debug_mode > 1) && id == 0) { 497 | now=clock(); 498 | printf("%cAlpha: %f Err: %lf ErrV: %lf Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 499 | -sum_loss / sum_loss_cnt / (negative + 1), 500 | -sum_lossV / sum_lossV_cnt / (negative + 1), 501 | word_count_actual / (real)(train_words + 1) * 100, 502 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 503 | fflush(stdout); 504 | } 505 | //alpha = starting_alpha; 506 | //alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 507 | //if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 508 | } 509 | if (sentence_length == 0) { 510 | while (1) { 511 | word = ReadWordIndex(fi); 512 | if (feof(fi)) break; 513 | if (word == -1) continue; 514 | word_count++; 515 | if (word == 0) break; 516 | // The subsampling randomly discards frequent words while keeping the ranking same 517 | if (sample > 0) { 518 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 519 | next_random = next_random * (unsigned long long)25214903917 + 11; 520 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 521 | } 522 | sen[sentence_length] = word; 523 | sentence_length++; 524 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 525 | } 526 | sentence_position = 0; 527 | } 528 | int validSet = 0; 529 | if(word_count > (int)(0.95 * train_words / num_threads)) 530 | validSet = 1; 531 | if (feof(fi) || (word_count > train_words / num_threads)) { 532 | word_count_actual += word_count - last_word_count; 533 | break; 534 | } 535 | word = sen[sentence_position]; 536 | if (word == -1) continue; 537 | //for (c = 0; c < input_size; c++) neu1[c] = 0; 538 | for (c = 0; c < input_size; c++) neu1e[c] = 0; 539 | for (c = 0; c < hidden_size; c++) neu2[c] = 0; 540 | for (c = 0; c < hidden_size; c++) neu2e[c] = 0; 541 | next_random = next_random * (unsigned long long)25214903917 + 11; 542 | //b = next_random % window; 543 | b = 0; 544 | if (cbow) { //train the cbow architecture 545 | // in -> hidden 546 | cw = 0; 547 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 548 | c = sentence_position - window + a; 549 | last_word = sen[c]; 550 | if (c < 0) last_word = 0; 551 | if (c >= sentence_length) last_word = 0; 552 | if (last_word == -1) last_word = 0; 553 | for (c = 0; c < layer1_size; c++) neu1[cw * layer1_size + c] = syn0[c + last_word * layer1_size]; 554 | cw++; 555 | } 556 | fastmult(hidden, neu1, neu2, input_size, hidden_size); 557 | for(a = 0; a < hidden_size; a++) 558 | neu2[a] = tanh(neu2[a]); 559 | if (cw == window * 2) { 560 | //for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 561 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 562 | f = 0; 563 | l2 = vocab[word].point[d] * layer1_size; 564 | // Propagate hidden -> output 565 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 566 | if (f <= -MAX_EXP) continue; 567 | else if (f >= MAX_EXP) continue; 568 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 569 | // 'g' is the gradient multiplied by the learning rate 570 | g = (1 - vocab[word].code[d] - f) * alpha; 571 | // Propagate errors output -> hidden 572 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 573 | // Learn weights hidden -> output 574 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 575 | } 576 | // NEGATIVE SAMPLING 577 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 578 | if (d == 0) { 579 | target = word; 
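/* Negative targets in this loop are drawn from the precomputed unigram table
   (InitUnigramTable): each word w fills a share of the 1e8 slots proportional to
   count(w)^0.75, so P(w is sampled) ~ count(w)^0.75 / sum_v count(v)^0.75, a
   flattened version of the raw unigram distribution. A draw of index 0 (the
   sentence-boundary token added first to the vocabulary) is remapped to a random
   non-zero word, and draws equal to the positive target are skipped. */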
580 | label = 1; 581 | } else { 582 | next_random = next_random * (unsigned long long)25214903917 + 11; 583 | target = table[(next_random >> 16) % table_size]; 584 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 585 | if (target == word) continue; 586 | label = 0; 587 | } 588 | l2 = target * hidden_size; 589 | f = 0; 590 | for (c = 0; c < hidden_size; c++) f += neu2[c] * syn1neg[c + l2]; 591 | if (f > MAX_EXP) g = (label - 1); 592 | else if (f < -MAX_EXP) g = (label - 0); 593 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 594 | 595 | if(label) f = -f; 596 | if(validSet) errV += log(1/(1+exp(f))); 597 | else err += log(1/(1+exp(f))); 598 | 599 | //for (c = 0; c < input_size; c++) neu1e[c] += g * syn1neg[c + l2]; 600 | //for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 601 | if(!validSet) for (c = 0; c < hidden_size; c++) { 602 | neu2e[c] += g * syn1neg[c + l2] * (1-neu2[c]*neu2[c]); 603 | real diff = g * neu2[c]; 604 | syn1neg_gdsq[c + l2] += diff * diff; 605 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 606 | // 607 | } 608 | } 609 | 610 | if(validSet) errV_cnt++; 611 | else err_cnt++; 612 | // hidden -> in 613 | if(!validSet) { 614 | long long i, j; 615 | for(i = 0; i < hidden_size; i++){ 616 | for(j = 0; j < input_size; j++){ 617 | neu1e[j] += neu2e[i] * hidden[i*input_size+j]; 618 | } 619 | } 620 | 621 | for(i = 0; i < hidden_size; i++){ 622 | for(j = 0; j < input_size; j++){ 623 | int t = i*input_size+j; 624 | real diff = neu1[j] * neu2e[i]; 625 | hidden_gdsq[t] += diff * diff; 626 | hidden[t] += alpha * diff * rsqrt64(hidden_gdsq[t]); 627 | } 628 | } 629 | cw = 0; 630 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 631 | c = sentence_position - window + a; 632 | last_word = sen[c]; 633 | if (c < 0) last_word = 0; 634 | if (c >= sentence_length) last_word = 0; 635 | if (last_word == -1) last_word = 0; 636 | //for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 637 | for (c = 0; c < layer1_size; c++) { 638 | real diff = neu1e[cw * layer1_size + c]; 639 | long long p = c + last_word * layer1_size; 640 | syn0_gdsq[p] += diff * diff; 641 | syn0[p] += alpha * diff * rsqrt(syn0_gdsq[p]); 642 | } 643 | cw++; 644 | } 645 | } 646 | } 647 | } else { //train skip-gram 648 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 649 | c = sentence_position - window + a; 650 | if (c < 0) continue; 651 | if (c >= sentence_length) continue; 652 | last_word = sen[c]; 653 | if (last_word == -1) continue; 654 | l1 = last_word * layer1_size; 655 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 656 | // HIERARCHICAL SOFTMAX 657 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 658 | f = 0; 659 | l2 = vocab[word].point[d] * layer1_size; 660 | // Propagate hidden -> output 661 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; 662 | if (f <= -MAX_EXP) continue; 663 | else if (f >= MAX_EXP) continue; 664 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 665 | // 'g' is the gradient multiplied by the learning rate 666 | g = (1 - vocab[word].code[d] - f) * alpha; 667 | // Propagate errors output -> hidden 668 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 669 | // Learn weights hidden -> output 670 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; 671 | } 672 | // NEGATIVE SAMPLING 673 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 674 | if (d == 0) { 675 | target = word; 676 
| label = 1; 677 | } else { 678 | next_random = next_random * (unsigned long long)25214903917 + 11; 679 | target = table[(next_random >> 16) % table_size]; 680 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 681 | if (target == word) continue; 682 | label = 0; 683 | } 684 | l2 = target * layer1_size; 685 | f = 0; 686 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 687 | if (f > MAX_EXP) g = (label - 1); 688 | else if (f < -MAX_EXP) g = (label - 0); 689 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 690 | 691 | if(label) f = -f; 692 | if(validSet) errV += log(1/(1+exp(f))); 693 | else err += log(1/(1+exp(f))); 694 | 695 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 696 | for (c = 0; c < layer1_size; c++) { 697 | real diff = g * syn0[c + l1]; 698 | syn1neg_gdsq[c + l2] += diff * diff; 699 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 700 | // 701 | } 702 | } 703 | // Learn weights input -> hidden 704 | for (c = 0; c < layer1_size; c++) { 705 | real diff = neu1e[c]; 706 | syn0_gdsq[c + l1] += diff * diff; 707 | syn0[c + l1] += alpha * diff * rsqrt(syn0_gdsq[c + l1]); 708 | } 709 | if(validSet) errV_cnt++; 710 | else err_cnt++; 711 | } 712 | 713 | } 714 | sentence_position++; 715 | if (sentence_position >= sentence_length) { 716 | sentence_length = 0; 717 | continue; 718 | } 719 | } 720 | fclose(fi); 721 | free(neu1); 722 | free(neu1e); 723 | pthread_exit(NULL); 724 | } 725 | 726 | 727 | void TrainModel() { 728 | long a, b, c, d; 729 | FILE *fo; 730 | char ffname[100]; 731 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 732 | loss = (double *)malloc(num_threads * sizeof(double)); 733 | lossV = (double *)malloc(num_threads * sizeof(double)); 734 | loss_cnt = (long long *)malloc(num_threads * sizeof(long long)); 735 | lossV_cnt = (long long *)malloc(num_threads * sizeof(long long)); 736 | 737 | printf("Starting training using file %s\n", train_file); 738 | starting_alpha = alpha; 739 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 740 | if (save_vocab_file[0] != 0) SaveVocab(); 741 | if (output_file[0] == 0) return; 742 | InitNet(); 743 | if (negative > 0) InitUnigramTable(); 744 | 745 | for(b = 1; b <= iter; b++){ 746 | start = clock(); 747 | word_count_actual = 0; 748 | for (a = 0; a < num_threads; a++){ 749 | loss[a] = lossV[a] = 0; 750 | loss_cnt[a] = lossV_cnt[a] = 0; 751 | } 752 | sum_loss = sum_lossV = 0; 753 | sum_loss_cnt = sum_lossV_cnt = 0; 754 | 755 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 756 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 757 | 758 | sprintf(ffname, "%s_%ld", output_file, b); 759 | writeWV(ffname); 760 | printf("%c", 13); 761 | 762 | sum_loss = sum_lossV = 0; 763 | sum_loss_cnt = sum_lossV_cnt = 0; 764 | for (a = 0; a < num_threads; a++){ 765 | sum_loss += loss[a]; 766 | sum_lossV += lossV[a]; 767 | sum_loss_cnt += loss_cnt[a]; 768 | sum_lossV_cnt += lossV_cnt[a]; 769 | } 770 | fprintf(stderr, "Iter: %ld Err: %lf ErrV: %lf\n", b, 771 | -sum_loss / sum_loss_cnt / (negative + 1), 772 | -sum_lossV / sum_lossV_cnt / (negative + 1)); 773 | fflush(stderr); 774 | } 775 | 776 | if (classes == 0) { 777 | // Save the word vectors 778 | //writeWV(output_file); 779 | } else { 780 | fo = fopen(output_file, "wb"); 781 | // Run K-means on the word vectors 782 | int clcn = classes, iter = 10, closeid; 783 | int *centcn = (int *)malloc(classes 
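/* Training bookkeeping above: each thread treats the last 5% of its share of the
   corpus as a held-out set (validSet), accumulating errV instead of err and, in the
   CBOW path, skipping the gradient updates for those words. TrainModel then reports
   the averaged training loss (Err) and validation loss (ErrV) once per iteration and
   writes the current vectors to "<output_file>_<iteration>" via writeWV, so every
   pass leaves an embedding snapshot that can be evaluated separately. */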
* sizeof(int)); 784 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 785 | real closev, x; 786 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 787 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 788 | for (a = 0; a < iter; a++) { 789 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 790 | for (b = 0; b < clcn; b++) centcn[b] = 1; 791 | for (c = 0; c < vocab_size; c++) { 792 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 793 | centcn[cl[c]]++; 794 | } 795 | for (b = 0; b < clcn; b++) { 796 | closev = 0; 797 | for (c = 0; c < layer1_size; c++) { 798 | cent[layer1_size * b + c] /= centcn[b]; 799 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 800 | } 801 | closev = sqrt(closev); 802 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 803 | } 804 | for (c = 0; c < vocab_size; c++) { 805 | closev = -10; 806 | closeid = 0; 807 | for (d = 0; d < clcn; d++) { 808 | x = 0; 809 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 810 | if (x > closev) { 811 | closev = x; 812 | closeid = d; 813 | } 814 | } 815 | cl[c] = closeid; 816 | } 817 | } 818 | // Save the K-means classes 819 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 820 | free(centcn); 821 | free(cent); 822 | free(cl); 823 | fclose(fo); 824 | } 825 | 826 | } 827 | 828 | int ArgPos(char *str, int argc, char **argv) { 829 | int a; 830 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 831 | if (a == argc - 1) { 832 | printf("Argument missing for %s\n", str); 833 | exit(1); 834 | } 835 | return a; 836 | } 837 | return -1; 838 | } 839 | 840 | int main(int argc, char **argv) { 841 | int i; 842 | if (argc == 1) { 843 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 844 | printf("Options:\n"); 845 | printf("Parameters for training:\n"); 846 | printf("\t-train \n"); 847 | printf("\t\tUse text data from to train the model\n"); 848 | printf("\t-output \n"); 849 | printf("\t\tUse to save the resulting word vectors / word clusters\n"); 850 | printf("\t-size \n"); 851 | printf("\t\tSet size of word vectors; default is 100\n"); 852 | printf("\t-window \n"); 853 | printf("\t\tSet max skip length between words; default is 5\n"); 854 | printf("\t-sample \n"); 855 | printf("\t\tSet threshold for occurrence of words. 
Those that appear with higher frequency in the training data\n"); 856 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n"); 857 | printf("\t-hs \n"); 858 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 859 | printf("\t-negative \n"); 860 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n"); 861 | printf("\t-threads \n"); 862 | printf("\t\tUse threads (default 12)\n"); 863 | printf("\t-iter \n"); 864 | printf("\t\tRun more training iterations (default 5)\n"); 865 | printf("\t-min-count \n"); 866 | printf("\t\tThis will discard words that appear less than times; default is 5\n"); 867 | printf("\t-alpha \n"); 868 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n"); 869 | printf("\t-classes \n"); 870 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 871 | printf("\t-debug \n"); 872 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 873 | printf("\t-binary \n"); 874 | printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"); 875 | printf("\t-save-vocab \n"); 876 | printf("\t\tThe vocabulary will be saved to \n"); 877 | printf("\t-read-vocab \n"); 878 | printf("\t\tThe vocabulary will be read from , not constructed from the training data\n"); 879 | printf("\t-cbow \n"); 880 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n"); 881 | printf("\nExamples:\n"); 882 | printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 883 | return 0; 884 | } 885 | output_file[0] = 0; 886 | save_vocab_file[0] = 0; 887 | read_vocab_file[0] = 0; 888 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 889 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 890 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 891 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 892 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 893 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 894 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 895 | if (cbow) alpha = 0.05; 896 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 897 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 898 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 899 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 900 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 901 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 902 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 903 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 904 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 905 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 906 | input_size = layer1_size * window * 2; 907 | hidden_size = layer1_size; 908 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct 
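/* The expTable filled just below is a 1000-entry lookup table for the logistic
   function: entry i stores sigma(x_i) with x_i = (i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP,
   so x_i sweeps [-MAX_EXP, MAX_EXP). The training loops read it back with
     expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
   which approximates sigma(f) to table resolution; callers clamp or skip any f outside
   [-MAX_EXP, MAX_EXP] first. */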
vocab_word)); 909 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 910 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 911 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 912 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 913 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 914 | } 915 | TrainModel(); 916 | return 0; 917 | } 918 | -------------------------------------------------------------------------------- /embedding/order.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define MAX_STRING 100 22 | #define EXP_TABLE_SIZE 1000 23 | #define MAX_EXP 6 24 | #define MAX_SENTENCE_LENGTH 1000 25 | #define MAX_CODE_LENGTH 40 26 | 27 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 28 | 29 | typedef float real; // Precision of float numbers 30 | 31 | struct vocab_word { 32 | long long cn; 33 | int *point; 34 | char *word, *code, codelen; 35 | }; 36 | 37 | char train_file[MAX_STRING], output_file[MAX_STRING]; 38 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 39 | struct vocab_word *vocab; 40 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1; 41 | int *vocab_hash; 42 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100, input_size = 500; 43 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0; 44 | real alpha = 0.025, starting_alpha, sample = 1e-3; 45 | real *syn0, *syn1, *syn1neg, *expTable, *syn1neg_gdsq, *syn0_gdsq; 46 | double *loss, *lossV, sum_loss, sum_lossV; 47 | long long *loss_cnt, *lossV_cnt, sum_loss_cnt, sum_lossV_cnt; 48 | 49 | clock_t start; 50 | 51 | int hs = 0, negative = 5; 52 | const int table_size = 1e8; 53 | int *table; 54 | 55 | void InitUnigramTable() { 56 | int a, i; 57 | long long train_words_pow = 0; 58 | real d1, power = 0.75; 59 | table = (int *)malloc(table_size * sizeof(int)); 60 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 61 | i = 0; 62 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow; 63 | for (a = 0; a < table_size; a++) { 64 | table[a] = i; 65 | if (a / (real)table_size > d1) { 66 | i++; 67 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow; 68 | } 69 | if (i >= vocab_size) i = vocab_size - 1; 70 | } 71 | } 72 | 73 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 74 | void ReadWord(char *word, FILE *fin) { 75 | int a = 0, ch; 76 | while (!feof(fin)) { 77 | ch = fgetc(fin); 78 | if (ch == 13) continue; 79 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 80 | if (a > 0) { 81 | if (ch == '\n') ungetc(ch, fin); 82 | break; 83 | } 84 | if (ch == '\n') 
{ 85 | strcpy(word, (char *)""); 86 | return; 87 | } else continue; 88 | } 89 | word[a] = ch; 90 | a++; 91 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 92 | } 93 | word[a] = 0; 94 | } 95 | 96 | // Returns hash value of a word 97 | int GetWordHash(char *word) { 98 | unsigned long long a, hash = 0; 99 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 100 | hash = hash % vocab_hash_size; 101 | return hash; 102 | } 103 | 104 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 105 | int SearchVocab(char *word) { 106 | unsigned int hash = GetWordHash(word); 107 | while (1) { 108 | if (vocab_hash[hash] == -1) return -1; 109 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 110 | hash = (hash + 1) % vocab_hash_size; 111 | } 112 | return -1; 113 | } 114 | 115 | // Reads a word and returns its index in the vocabulary 116 | int ReadWordIndex(FILE *fin) { 117 | char word[MAX_STRING]; 118 | ReadWord(word, fin); 119 | if (feof(fin)) return -1; 120 | return SearchVocab(word); 121 | } 122 | 123 | // Adds a word to the vocabulary 124 | int AddWordToVocab(char *word) { 125 | unsigned int hash, length = strlen(word) + 1; 126 | if (length > MAX_STRING) length = MAX_STRING; 127 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 128 | strcpy(vocab[vocab_size].word, word); 129 | vocab[vocab_size].cn = 0; 130 | vocab_size++; 131 | // Reallocate memory if needed 132 | if (vocab_size + 2 >= vocab_max_size) { 133 | vocab_max_size += 1000; 134 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 135 | } 136 | hash = GetWordHash(word); 137 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 138 | vocab_hash[hash] = vocab_size - 1; 139 | return vocab_size - 1; 140 | } 141 | 142 | // Used later for sorting by word counts 143 | int VocabCompare(const void *a, const void *b) { 144 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 145 | } 146 | 147 | // Sorts the vocabulary by frequency using word counts 148 | void SortVocab() { 149 | int a, size; 150 | unsigned int hash; 151 | // Sort the vocabulary and keep at the first position 152 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 153 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 154 | size = vocab_size; 155 | train_words = 0; 156 | for (a = 0; a < size; a++) { 157 | // Words occuring less than min_count times will be discarded from the vocab 158 | if ((vocab[a].cn < min_count) && (a != 0)) { 159 | vocab_size--; 160 | free(vocab[a].word); 161 | } else { 162 | // Hash will be re-computed, as after the sorting it is not actual 163 | hash=GetWordHash(vocab[a].word); 164 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 165 | vocab_hash[hash] = a; 166 | train_words += vocab[a].cn; 167 | } 168 | } 169 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 170 | // Allocate memory for the binary tree construction 171 | for (a = 0; a < vocab_size; a++) { 172 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 173 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 174 | } 175 | } 176 | 177 | // Reduces the vocabulary by removing infrequent tokens 178 | void ReduceVocab() { 179 | int a, b = 0; 180 | unsigned int hash; 181 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 182 | vocab[b].cn = vocab[a].cn; 183 | vocab[b].word = vocab[a].word; 184 | b++; 185 | } 
else free(vocab[a].word); 186 | vocab_size = b; 187 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 188 | for (a = 0; a < vocab_size; a++) { 189 | // Hash will be re-computed, as it is not actual 190 | hash = GetWordHash(vocab[a].word); 191 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 192 | vocab_hash[hash] = a; 193 | } 194 | fflush(stdout); 195 | min_reduce++; 196 | } 197 | 198 | // Create binary Huffman tree using the word counts 199 | // Frequent words will have short uniqe binary codes 200 | void CreateBinaryTree() { 201 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 202 | char code[MAX_CODE_LENGTH]; 203 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 204 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 205 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 206 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 207 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 208 | pos1 = vocab_size - 1; 209 | pos2 = vocab_size; 210 | // Following algorithm constructs the Huffman tree by adding one node at a time 211 | for (a = 0; a < vocab_size - 1; a++) { 212 | // First, find two smallest nodes 'min1, min2' 213 | if (pos1 >= 0) { 214 | if (count[pos1] < count[pos2]) { 215 | min1i = pos1; 216 | pos1--; 217 | } else { 218 | min1i = pos2; 219 | pos2++; 220 | } 221 | } else { 222 | min1i = pos2; 223 | pos2++; 224 | } 225 | if (pos1 >= 0) { 226 | if (count[pos1] < count[pos2]) { 227 | min2i = pos1; 228 | pos1--; 229 | } else { 230 | min2i = pos2; 231 | pos2++; 232 | } 233 | } else { 234 | min2i = pos2; 235 | pos2++; 236 | } 237 | count[vocab_size + a] = count[min1i] + count[min2i]; 238 | parent_node[min1i] = vocab_size + a; 239 | parent_node[min2i] = vocab_size + a; 240 | binary[min2i] = 1; 241 | } 242 | // Now assign binary code to each vocabulary word 243 | for (a = 0; a < vocab_size; a++) { 244 | b = a; 245 | i = 0; 246 | while (1) { 247 | code[i] = binary[b]; 248 | point[i] = b; 249 | i++; 250 | b = parent_node[b]; 251 | if (b == vocab_size * 2 - 2) break; 252 | } 253 | vocab[a].codelen = i; 254 | vocab[a].point[0] = vocab_size - 2; 255 | for (b = 0; b < i; b++) { 256 | vocab[a].code[i - b - 1] = code[b]; 257 | vocab[a].point[i - b] = point[b] - vocab_size; 258 | } 259 | } 260 | free(count); 261 | free(binary); 262 | free(parent_node); 263 | } 264 | 265 | void LearnVocabFromTrainFile() { 266 | char word[MAX_STRING]; 267 | FILE *fin; 268 | long long a, i; 269 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 270 | fin = fopen(train_file, "rb"); 271 | if (fin == NULL) { 272 | printf("ERROR: training data file not found!\n"); 273 | exit(1); 274 | } 275 | vocab_size = 0; 276 | AddWordToVocab((char *)""); 277 | while (1) { 278 | ReadWord(word, fin); 279 | if (feof(fin)) break; 280 | train_words++; 281 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 282 | printf("%lldK%c", train_words / 1000, 13); 283 | fflush(stdout); 284 | } 285 | i = SearchVocab(word); 286 | if (i == -1) { 287 | a = AddWordToVocab(word); 288 | vocab[a].cn = 1; 289 | } else vocab[i].cn++; 290 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 291 | } 292 | SortVocab(); 293 | if (debug_mode > 0) { 294 | printf("Vocab size: %lld\n", vocab_size); 295 | printf("Words in train file: %lld\n", train_words); 296 | } 297 | file_size = ftell(fin); 298 | fclose(fin); 299 | } 300 | 301 | void SaveVocab() { 302 | long long i; 303 | FILE 
*fo = fopen(save_vocab_file, "wb"); 304 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 305 | fclose(fo); 306 | } 307 | 308 | void ReadVocab() { 309 | long long a, i = 0; 310 | char c; 311 | char word[MAX_STRING]; 312 | FILE *fin = fopen(read_vocab_file, "rb"); 313 | if (fin == NULL) { 314 | printf("Vocabulary file not found\n"); 315 | exit(1); 316 | } 317 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 318 | vocab_size = 0; 319 | while (1) { 320 | ReadWord(word, fin); 321 | if (feof(fin)) break; 322 | a = AddWordToVocab(word); 323 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); 324 | i++; 325 | } 326 | SortVocab(); 327 | if (debug_mode > 0) { 328 | printf("Vocab size: %lld\n", vocab_size); 329 | printf("Words in train file: %lld\n", train_words); 330 | } 331 | fin = fopen(train_file, "rb"); 332 | if (fin == NULL) { 333 | printf("ERROR: training data file not found!\n"); 334 | exit(1); 335 | } 336 | fseek(fin, 0, SEEK_END); 337 | file_size = ftell(fin); 338 | fclose(fin); 339 | } 340 | 341 | void InitNet() { 342 | long long a, b; 343 | unsigned long long next_random = 1; 344 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)); 345 | a = posix_memalign((void **)&syn0_gdsq, 128, (long long)vocab_size * layer1_size * sizeof(real)); 346 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} 347 | if (hs) { 348 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); 349 | if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} 350 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 351 | syn1[a * layer1_size + b] = 0; 352 | } 353 | if (negative>0) { 354 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * input_size * sizeof(real)); 355 | a = posix_memalign((void **)&syn1neg_gdsq, 128, (long long)vocab_size * input_size * sizeof(real)); 356 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 357 | for (a = 0; a < vocab_size; a++) for (b = 0; b < input_size; b++){ 358 | syn1neg[a * input_size + b] = 0; 359 | syn1neg_gdsq[a * input_size + b] = 1e-8; 360 | } 361 | 362 | } 363 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) { 364 | next_random = next_random * (unsigned long long)25214903917 + 11; 365 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 366 | syn0_gdsq[a * layer1_size + b] = 1e-8; 367 | } 368 | CreateBinaryTree(); 369 | } 370 | 371 | void writeWV(char *output_file){ 372 | long long a, b; 373 | FILE *fo = fopen(output_file, "wb"); 374 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 375 | for (a = 0; a < vocab_size; a++) { 376 | fprintf(fo, "%s ", vocab[a].word); 377 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 378 | else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 379 | fprintf(fo, "\n"); 380 | } 381 | fclose(fo); 382 | } 383 | 384 | typedef unsigned long uint64_t; 385 | typedef unsigned int uint32_t; 386 | 387 | double rsqrt64(double number) { 388 | uint64_t i; 389 | double x2, y; 390 | x2 = number * 0.5; 391 | y = number; 392 | i = *(uint64_t *) &y; 393 | i = 0x5fe6eb50c7b537a9 - (i >> 1); 394 | y = *(double *) &i; 395 | y = y * (1.5 - (x2 * y * y)); 396 | return y; 397 | } 398 | 399 | float rsqrt(float number){ 400 | uint32_t i; 401 | float x2, y; 402 | x2 = number * 0.5F; 403 | y = number; 404 | i = *(uint32_t *) 
&y; 405 | i = 0x5f3759df - ( i >> 1 ); 406 | y = *(float *) &i; 407 | y = y * ( 1.5F - ( x2 * y * y ) ); 408 | return y; 409 | } 410 | 411 | void *TrainModelThread(void *_id) { 412 | long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0; 413 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 414 | long long l1, l2, c, target, label; 415 | long long id = (long long)_id; 416 | unsigned long long next_random = id + clock(); //must add ``clock'' if using this kind of iter. 417 | real f, g; 418 | clock_t now; 419 | real *neu1 = (real *)calloc(input_size, sizeof(real)); 420 | real *neu1e = (real *)calloc(input_size, sizeof(real)); 421 | double err = 0, errV = 0; //loss 422 | long long err_cnt = 0, errV_cnt = 0; 423 | 424 | FILE *fi = fopen(train_file, "rb"); 425 | fseek(fi, file_size / (long long)num_threads * id, SEEK_SET); 426 | while (1) { 427 | if (word_count - last_word_count > 10000) { 428 | loss[id] += err; 429 | lossV[id] += errV; 430 | loss_cnt[id] += err_cnt; 431 | lossV_cnt[id] += errV_cnt; 432 | 433 | sum_loss += err; 434 | sum_lossV += errV; 435 | sum_loss_cnt += err_cnt; 436 | sum_lossV_cnt += errV_cnt; 437 | 438 | err = errV = 0; 439 | err_cnt = errV_cnt = 0; 440 | word_count_actual += word_count - last_word_count; 441 | last_word_count = word_count; 442 | if ((debug_mode > 1) && id == 0) { 443 | now=clock(); 444 | printf("%cAlpha: %f Err: %lf ErrV: %lf Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 445 | -sum_loss / sum_loss_cnt / (negative + 1), 446 | -sum_lossV / sum_lossV_cnt / (negative + 1), 447 | word_count_actual / (real)(train_words + 1) * 100, 448 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 449 | fflush(stdout); 450 | } 451 | //alpha = starting_alpha; 452 | //alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 453 | //if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 454 | } 455 | if (sentence_length == 0) { 456 | while (1) { 457 | word = ReadWordIndex(fi); 458 | if (feof(fi)) break; 459 | if (word == -1) continue; 460 | word_count++; 461 | if (word == 0) break; 462 | // The subsampling randomly discards frequent words while keeping the ranking same 463 | if (sample > 0) { 464 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 465 | next_random = next_random * (unsigned long long)25214903917 + 11; 466 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 467 | } 468 | sen[sentence_length] = word; 469 | sentence_length++; 470 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 471 | } 472 | sentence_position = 0; 473 | } 474 | int validSet = 0; 475 | if(word_count > (int)(0.95 * train_words / num_threads)) 476 | validSet = 1; 477 | if (feof(fi) || (word_count > train_words / num_threads)) { 478 | word_count_actual += word_count - last_word_count; 479 | break; 480 | } 481 | word = sen[sentence_position]; 482 | if (word == -1) continue; 483 | //for (c = 0; c < input_size; c++) neu1[c] = 0; 484 | for (c = 0; c < input_size; c++) neu1e[c] = 0; 485 | next_random = next_random * (unsigned long long)25214903917 + 11; 486 | //b = next_random % window; 487 | b = 0; 488 | if (cbow) { //train the cbow architecture 489 | // in -> hidden 490 | cw = 0; 491 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 492 | c = sentence_position - window + a; 493 | last_word = sen[c]; 494 | if (c < 0) last_word = 0; 495 | if (c >= sentence_length) last_word = 0; 496 | if 
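/* In this variant the context window is not averaged: the loop below copies each of
   the 2*window context vectors into its own slot of neu1 (neu1[cw*layer1_size + c]),
   so the input is the order-preserving concatenation of the window, and the
   negative-sampling output vectors (syn1neg) are allocated with
   input_size = layer1_size * window * 2 components to match. The score is then a
   direct dot product in that concatenated space, with no tanh hidden layer. */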
(last_word == -1) last_word = 0; 497 | for (c = 0; c < layer1_size; c++) neu1[cw * layer1_size + c] = syn0[c + last_word * layer1_size]; 498 | cw++; 499 | } 500 | if (cw == window * 2) { 501 | //for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 502 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 503 | f = 0; 504 | l2 = vocab[word].point[d] * layer1_size; 505 | // Propagate hidden -> output 506 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 507 | if (f <= -MAX_EXP) continue; 508 | else if (f >= MAX_EXP) continue; 509 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 510 | // 'g' is the gradient multiplied by the learning rate 511 | g = (1 - vocab[word].code[d] - f) * alpha; 512 | // Propagate errors output -> hidden 513 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 514 | // Learn weights hidden -> output 515 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 516 | } 517 | // NEGATIVE SAMPLING 518 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 519 | if (d == 0) { 520 | target = word; 521 | label = 1; 522 | } else { 523 | next_random = next_random * (unsigned long long)25214903917 + 11; 524 | target = table[(next_random >> 16) % table_size]; 525 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 526 | if (target == word) continue; 527 | label = 0; 528 | } 529 | l2 = target * input_size; 530 | f = 0; 531 | for (c = 0; c < input_size; c++) f += neu1[c] * syn1neg[c + l2]; 532 | if (f > MAX_EXP) g = (label - 1); 533 | else if (f < -MAX_EXP) g = (label - 0); 534 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 535 | 536 | if(label) f = -f; 537 | if(validSet) errV += log(1/(1+exp(f))); 538 | else err += log(1/(1+exp(f))); 539 | 540 | for (c = 0; c < input_size; c++) neu1e[c] += g * syn1neg[c + l2]; 541 | //for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 542 | if(!validSet) for (c = 0; c < input_size; c++) { 543 | real diff = g * neu1[c]; 544 | syn1neg_gdsq[c + l2] += diff * diff; 545 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 546 | // 547 | } 548 | } 549 | 550 | if(validSet) errV_cnt++; 551 | else err_cnt++; 552 | // hidden -> in 553 | cw = 0; 554 | if(!validSet) for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 555 | c = sentence_position - window + a; 556 | last_word = sen[c]; 557 | if (c < 0) last_word = 0; 558 | if (c >= sentence_length) last_word = 0; 559 | if (last_word == -1) last_word = 0; 560 | //for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 561 | for (c = 0; c < layer1_size; c++) { 562 | real diff = neu1e[cw * layer1_size + c]; 563 | long long p = c + last_word * layer1_size; 564 | syn0_gdsq[p] += diff * diff; 565 | syn0[p] += alpha * diff * rsqrt(syn0_gdsq[p]); 566 | } 567 | cw++; 568 | } 569 | } 570 | } else { //train skip-gram 571 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 572 | c = sentence_position - window + a; 573 | if (c < 0) continue; 574 | if (c >= sentence_length) continue; 575 | last_word = sen[c]; 576 | if (last_word == -1) continue; 577 | l1 = last_word * layer1_size; 578 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 579 | // HIERARCHICAL SOFTMAX 580 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 581 | f = 0; 582 | l2 = vocab[word].point[d] * layer1_size; 583 | // Propagate hidden -> output 584 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; 585 | if (f <= -MAX_EXP) continue; 586 | else if (f >= 
MAX_EXP) continue; 587 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 588 | // 'g' is the gradient multiplied by the learning rate 589 | g = (1 - vocab[word].code[d] - f) * alpha; 590 | // Propagate errors output -> hidden 591 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 592 | // Learn weights hidden -> output 593 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; 594 | } 595 | // NEGATIVE SAMPLING 596 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 597 | if (d == 0) { 598 | target = word; 599 | label = 1; 600 | } else { 601 | next_random = next_random * (unsigned long long)25214903917 + 11; 602 | target = table[(next_random >> 16) % table_size]; 603 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 604 | if (target == word) continue; 605 | label = 0; 606 | } 607 | l2 = target * layer1_size; 608 | f = 0; 609 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 610 | if (f > MAX_EXP) g = (label - 1); 611 | else if (f < -MAX_EXP) g = (label - 0); 612 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 613 | 614 | if(label) f = -f; 615 | if(validSet) errV += log(1/(1+exp(f))); 616 | else err += log(1/(1+exp(f))); 617 | 618 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 619 | for (c = 0; c < layer1_size; c++) { 620 | real diff = g * syn0[c + l1]; 621 | syn1neg_gdsq[c + l2] += diff * diff; 622 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 623 | // 624 | } 625 | } 626 | // Learn weights input -> hidden 627 | for (c = 0; c < layer1_size; c++) { 628 | real diff = neu1e[c]; 629 | syn0_gdsq[c + l1] += diff * diff; 630 | syn0[c + l1] += alpha * diff * rsqrt(syn0_gdsq[c + l1]); 631 | } 632 | if(validSet) errV_cnt++; 633 | else err_cnt++; 634 | } 635 | 636 | } 637 | sentence_position++; 638 | if (sentence_position >= sentence_length) { 639 | sentence_length = 0; 640 | continue; 641 | } 642 | } 643 | fclose(fi); 644 | free(neu1); 645 | free(neu1e); 646 | pthread_exit(NULL); 647 | } 648 | 649 | 650 | void TrainModel() { 651 | long a, b, c, d; 652 | FILE *fo; 653 | char ffname[100]; 654 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 655 | loss = (double *)malloc(num_threads * sizeof(double)); 656 | lossV = (double *)malloc(num_threads * sizeof(double)); 657 | loss_cnt = (long long *)malloc(num_threads * sizeof(long long)); 658 | lossV_cnt = (long long *)malloc(num_threads * sizeof(long long)); 659 | 660 | printf("Starting training using file %s\n", train_file); 661 | starting_alpha = alpha; 662 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 663 | if (save_vocab_file[0] != 0) SaveVocab(); 664 | if (output_file[0] == 0) return; 665 | InitNet(); 666 | if (negative > 0) InitUnigramTable(); 667 | 668 | for(b = 1; b <= iter; b++){ 669 | start = clock(); 670 | word_count_actual = 0; 671 | for (a = 0; a < num_threads; a++){ 672 | loss[a] = lossV[a] = 0; 673 | loss_cnt[a] = lossV_cnt[a] = 0; 674 | } 675 | sum_loss = sum_lossV = 0; 676 | sum_loss_cnt = sum_lossV_cnt = 0; 677 | 678 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 679 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 680 | 681 | sprintf(ffname, "%s_%ld", output_file, b); 682 | writeWV(ffname); 683 | printf("%c", 13); 684 | 685 | sum_loss = sum_lossV = 0; 686 | sum_loss_cnt = sum_lossV_cnt = 0; 687 | for (a = 0; a < num_threads; a++){ 688 | sum_loss += 
loss[a]; 689 | sum_lossV += lossV[a]; 690 | sum_loss_cnt += loss_cnt[a]; 691 | sum_lossV_cnt += lossV_cnt[a]; 692 | } 693 | fprintf(stderr, "Iter: %ld Err: %lf ErrV: %lf\n", b, 694 | -sum_loss / sum_loss_cnt / (negative + 1), 695 | -sum_lossV / sum_lossV_cnt / (negative + 1)); 696 | fflush(stderr); 697 | } 698 | 699 | if (classes == 0) { 700 | // Save the word vectors 701 | //writeWV(output_file); 702 | } else { 703 | fo = fopen(output_file, "wb"); 704 | // Run K-means on the word vectors 705 | int clcn = classes, iter = 10, closeid; 706 | int *centcn = (int *)malloc(classes * sizeof(int)); 707 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 708 | real closev, x; 709 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 710 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 711 | for (a = 0; a < iter; a++) { 712 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 713 | for (b = 0; b < clcn; b++) centcn[b] = 1; 714 | for (c = 0; c < vocab_size; c++) { 715 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 716 | centcn[cl[c]]++; 717 | } 718 | for (b = 0; b < clcn; b++) { 719 | closev = 0; 720 | for (c = 0; c < layer1_size; c++) { 721 | cent[layer1_size * b + c] /= centcn[b]; 722 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 723 | } 724 | closev = sqrt(closev); 725 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 726 | } 727 | for (c = 0; c < vocab_size; c++) { 728 | closev = -10; 729 | closeid = 0; 730 | for (d = 0; d < clcn; d++) { 731 | x = 0; 732 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 733 | if (x > closev) { 734 | closev = x; 735 | closeid = d; 736 | } 737 | } 738 | cl[c] = closeid; 739 | } 740 | } 741 | // Save the K-means classes 742 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 743 | free(centcn); 744 | free(cent); 745 | free(cl); 746 | fclose(fo); 747 | } 748 | 749 | } 750 | 751 | int ArgPos(char *str, int argc, char **argv) { 752 | int a; 753 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 754 | if (a == argc - 1) { 755 | printf("Argument missing for %s\n", str); 756 | exit(1); 757 | } 758 | return a; 759 | } 760 | return -1; 761 | } 762 | 763 | int main(int argc, char **argv) { 764 | int i; 765 | if (argc == 1) { 766 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 767 | printf("Options:\n"); 768 | printf("Parameters for training:\n"); 769 | printf("\t-train \n"); 770 | printf("\t\tUse text data from to train the model\n"); 771 | printf("\t-output \n"); 772 | printf("\t\tUse to save the resulting word vectors / word clusters\n"); 773 | printf("\t-size \n"); 774 | printf("\t\tSet size of word vectors; default is 100\n"); 775 | printf("\t-window \n"); 776 | printf("\t\tSet max skip length between words; default is 5\n"); 777 | printf("\t-sample \n"); 778 | printf("\t\tSet threshold for occurrence of words. 
Those that appear with higher frequency in the training data\n"); 779 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n"); 780 | printf("\t-hs \n"); 781 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 782 | printf("\t-negative \n"); 783 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n"); 784 | printf("\t-threads \n"); 785 | printf("\t\tUse threads (default 12)\n"); 786 | printf("\t-iter \n"); 787 | printf("\t\tRun more training iterations (default 5)\n"); 788 | printf("\t-min-count \n"); 789 | printf("\t\tThis will discard words that appear less than times; default is 5\n"); 790 | printf("\t-alpha \n"); 791 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n"); 792 | printf("\t-classes \n"); 793 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 794 | printf("\t-debug \n"); 795 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 796 | printf("\t-binary \n"); 797 | printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"); 798 | printf("\t-save-vocab \n"); 799 | printf("\t\tThe vocabulary will be saved to \n"); 800 | printf("\t-read-vocab \n"); 801 | printf("\t\tThe vocabulary will be read from , not constructed from the training data\n"); 802 | printf("\t-cbow \n"); 803 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n"); 804 | printf("\nExamples:\n"); 805 | printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 806 | return 0; 807 | } 808 | output_file[0] = 0; 809 | save_vocab_file[0] = 0; 810 | read_vocab_file[0] = 0; 811 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 812 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 813 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 814 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 815 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 816 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 817 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 818 | if (cbow) alpha = 0.05; 819 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 820 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 821 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 822 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 823 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 824 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 825 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 826 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 827 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 828 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 829 | input_size = layer1_size * window * 2; 830 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 831 | vocab_hash = (int 
*)calloc(vocab_hash_size, sizeof(int)); 832 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 833 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 834 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 835 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 836 | } 837 | TrainModel(); 838 | return 0; 839 | } 840 | -------------------------------------------------------------------------------- /embedding/word2vec.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define MAX_STRING 100 22 | #define EXP_TABLE_SIZE 1000 23 | #define MAX_EXP 6 24 | #define MAX_SENTENCE_LENGTH 1000 25 | #define MAX_CODE_LENGTH 40 26 | 27 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 28 | 29 | typedef float real; // Precision of float numbers 30 | 31 | struct vocab_word { 32 | long long cn; 33 | int *point; 34 | char *word, *code, codelen; 35 | }; 36 | 37 | char train_file[MAX_STRING], output_file[MAX_STRING]; 38 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 39 | struct vocab_word *vocab; 40 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1; 41 | int *vocab_hash; 42 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100; 43 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0; 44 | real alpha = 0.025, starting_alpha, sample = 1e-3; 45 | real *syn0, *syn1, *syn1neg, *expTable, *syn1neg_gdsq, *syn0_gdsq; 46 | double *loss, *lossV, sum_loss, sum_lossV; 47 | long long *loss_cnt, *lossV_cnt, sum_loss_cnt, sum_lossV_cnt; 48 | 49 | clock_t start; 50 | 51 | int hs = 0, negative = 5; 52 | const int table_size = 1e8; 53 | int *table; 54 | 55 | void InitUnigramTable() { 56 | int a, i; 57 | long long train_words_pow = 0; 58 | real d1, power = 0.75; 59 | table = (int *)malloc(table_size * sizeof(int)); 60 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 61 | i = 0; 62 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow; 63 | for (a = 0; a < table_size; a++) { 64 | table[a] = i; 65 | if (a / (real)table_size > d1) { 66 | i++; 67 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow; 68 | } 69 | if (i >= vocab_size) i = vocab_size - 1; 70 | } 71 | } 72 | 73 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 74 | void ReadWord(char *word, FILE *fin) { 75 | int a = 0, ch; 76 | while (!feof(fin)) { 77 | ch = fgetc(fin); 78 | if (ch == 13) continue; 79 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 80 | if (a > 0) { 81 | if (ch == '\n') ungetc(ch, fin); 82 | break; 83 | } 84 | if (ch == '\n') { 85 | strcpy(word, (char *)""); 86 | return; 87 | } 
else continue; 88 | } 89 | word[a] = ch; 90 | a++; 91 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 92 | } 93 | word[a] = 0; 94 | } 95 | 96 | // Returns hash value of a word 97 | int GetWordHash(char *word) { 98 | unsigned long long a, hash = 0; 99 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 100 | hash = hash % vocab_hash_size; 101 | return hash; 102 | } 103 | 104 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 105 | int SearchVocab(char *word) { 106 | unsigned int hash = GetWordHash(word); 107 | while (1) { 108 | if (vocab_hash[hash] == -1) return -1; 109 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 110 | hash = (hash + 1) % vocab_hash_size; 111 | } 112 | return -1; 113 | } 114 | 115 | // Reads a word and returns its index in the vocabulary 116 | int ReadWordIndex(FILE *fin) { 117 | char word[MAX_STRING]; 118 | ReadWord(word, fin); 119 | if (feof(fin)) return -1; 120 | return SearchVocab(word); 121 | } 122 | 123 | // Adds a word to the vocabulary 124 | int AddWordToVocab(char *word) { 125 | unsigned int hash, length = strlen(word) + 1; 126 | if (length > MAX_STRING) length = MAX_STRING; 127 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 128 | strcpy(vocab[vocab_size].word, word); 129 | vocab[vocab_size].cn = 0; 130 | vocab_size++; 131 | // Reallocate memory if needed 132 | if (vocab_size + 2 >= vocab_max_size) { 133 | vocab_max_size += 1000; 134 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 135 | } 136 | hash = GetWordHash(word); 137 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 138 | vocab_hash[hash] = vocab_size - 1; 139 | return vocab_size - 1; 140 | } 141 | 142 | // Used later for sorting by word counts 143 | int VocabCompare(const void *a, const void *b) { 144 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 145 | } 146 | 147 | // Sorts the vocabulary by frequency using word counts 148 | void SortVocab() { 149 | int a, size; 150 | unsigned int hash; 151 | // Sort the vocabulary and keep at the first position 152 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 153 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 154 | size = vocab_size; 155 | train_words = 0; 156 | for (a = 0; a < size; a++) { 157 | // Words occuring less than min_count times will be discarded from the vocab 158 | if ((vocab[a].cn < min_count) && (a != 0)) { 159 | vocab_size--; 160 | free(vocab[a].word); 161 | } else { 162 | // Hash will be re-computed, as after the sorting it is not actual 163 | hash=GetWordHash(vocab[a].word); 164 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 165 | vocab_hash[hash] = a; 166 | train_words += vocab[a].cn; 167 | } 168 | } 169 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 170 | // Allocate memory for the binary tree construction 171 | for (a = 0; a < vocab_size; a++) { 172 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 173 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 174 | } 175 | } 176 | 177 | // Reduces the vocabulary by removing infrequent tokens 178 | void ReduceVocab() { 179 | int a, b = 0; 180 | unsigned int hash; 181 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 182 | vocab[b].cn = vocab[a].cn; 183 | vocab[b].word = vocab[a].word; 184 | b++; 185 | } else free(vocab[a].word); 186 | vocab_size = b; 187 | 
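/* Note (annotation): the vocabulary is kept in an open-addressing hash table
 * with linear probing -- GetWordHash picks the starting slot and
 * SearchVocab/AddWordToVocab walk forward until they find the word or an
 * empty slot (-1). LearnVocabFromTrainFile calls ReduceVocab whenever
 * vocab_size exceeds 0.7 * vocab_hash_size, and each call raises min_reduce
 * by one, so successive prunings drop progressively more infrequent words.
 * The loop below then rebuilds the hash table, since the surviving words have
 * been compacted to new indices. */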
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 188 | for (a = 0; a < vocab_size; a++) { 189 | // Hash will be re-computed, as it is not actual 190 | hash = GetWordHash(vocab[a].word); 191 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 192 | vocab_hash[hash] = a; 193 | } 194 | fflush(stdout); 195 | min_reduce++; 196 | } 197 | 198 | // Create binary Huffman tree using the word counts 199 | // Frequent words will have short uniqe binary codes 200 | void CreateBinaryTree() { 201 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 202 | char code[MAX_CODE_LENGTH]; 203 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 204 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 205 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 206 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 207 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 208 | pos1 = vocab_size - 1; 209 | pos2 = vocab_size; 210 | // Following algorithm constructs the Huffman tree by adding one node at a time 211 | for (a = 0; a < vocab_size - 1; a++) { 212 | // First, find two smallest nodes 'min1, min2' 213 | if (pos1 >= 0) { 214 | if (count[pos1] < count[pos2]) { 215 | min1i = pos1; 216 | pos1--; 217 | } else { 218 | min1i = pos2; 219 | pos2++; 220 | } 221 | } else { 222 | min1i = pos2; 223 | pos2++; 224 | } 225 | if (pos1 >= 0) { 226 | if (count[pos1] < count[pos2]) { 227 | min2i = pos1; 228 | pos1--; 229 | } else { 230 | min2i = pos2; 231 | pos2++; 232 | } 233 | } else { 234 | min2i = pos2; 235 | pos2++; 236 | } 237 | count[vocab_size + a] = count[min1i] + count[min2i]; 238 | parent_node[min1i] = vocab_size + a; 239 | parent_node[min2i] = vocab_size + a; 240 | binary[min2i] = 1; 241 | } 242 | // Now assign binary code to each vocabulary word 243 | for (a = 0; a < vocab_size; a++) { 244 | b = a; 245 | i = 0; 246 | while (1) { 247 | code[i] = binary[b]; 248 | point[i] = b; 249 | i++; 250 | b = parent_node[b]; 251 | if (b == vocab_size * 2 - 2) break; 252 | } 253 | vocab[a].codelen = i; 254 | vocab[a].point[0] = vocab_size - 2; 255 | for (b = 0; b < i; b++) { 256 | vocab[a].code[i - b - 1] = code[b]; 257 | vocab[a].point[i - b] = point[b] - vocab_size; 258 | } 259 | } 260 | free(count); 261 | free(binary); 262 | free(parent_node); 263 | } 264 | 265 | void LearnVocabFromTrainFile() { 266 | char word[MAX_STRING]; 267 | FILE *fin; 268 | long long a, i; 269 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 270 | fin = fopen(train_file, "rb"); 271 | if (fin == NULL) { 272 | printf("ERROR: training data file not found!\n"); 273 | exit(1); 274 | } 275 | vocab_size = 0; 276 | AddWordToVocab((char *)""); 277 | while (1) { 278 | ReadWord(word, fin); 279 | if (feof(fin)) break; 280 | train_words++; 281 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 282 | printf("%lldK%c", train_words / 1000, 13); 283 | fflush(stdout); 284 | } 285 | i = SearchVocab(word); 286 | if (i == -1) { 287 | a = AddWordToVocab(word); 288 | vocab[a].cn = 1; 289 | } else vocab[i].cn++; 290 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 291 | } 292 | SortVocab(); 293 | if (debug_mode > 0) { 294 | printf("Vocab size: %lld\n", vocab_size); 295 | printf("Words in train file: %lld\n", train_words); 296 | } 297 | file_size = ftell(fin); 298 | fclose(fin); 299 | } 300 | 301 | void SaveVocab() { 302 | long long i; 303 | FILE *fo = fopen(save_vocab_file, "wb"); 304 | for (i = 0; i 
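/* Note (annotation): CreateBinaryTree above builds the Huffman tree used by
 * hierarchical softmax. It repeatedly merges the two smallest counts (pos1
 * walks the sorted leaf counts while pos2 walks the growing list of internal
 * nodes, so no heap is needed), then walks from each leaf up to the root to
 * record its binary code and the indices of the internal nodes on the path
 * (vocab[a].point). For example, with counts {5, 3, 1, 1} the merges are
 * (1,1)->2, (2,3)->5, (5,5)->10, so the most frequent word gets a 1-bit code
 * and the two rarest words get 3-bit codes. */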
< vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 305 | fclose(fo); 306 | } 307 | 308 | void ReadVocab() { 309 | long long a, i = 0; 310 | char c; 311 | char word[MAX_STRING]; 312 | FILE *fin = fopen(read_vocab_file, "rb"); 313 | if (fin == NULL) { 314 | printf("Vocabulary file not found\n"); 315 | exit(1); 316 | } 317 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 318 | vocab_size = 0; 319 | while (1) { 320 | ReadWord(word, fin); 321 | if (feof(fin)) break; 322 | a = AddWordToVocab(word); 323 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); 324 | i++; 325 | } 326 | SortVocab(); 327 | if (debug_mode > 0) { 328 | printf("Vocab size: %lld\n", vocab_size); 329 | printf("Words in train file: %lld\n", train_words); 330 | } 331 | fin = fopen(train_file, "rb"); 332 | if (fin == NULL) { 333 | printf("ERROR: training data file not found!\n"); 334 | exit(1); 335 | } 336 | fseek(fin, 0, SEEK_END); 337 | file_size = ftell(fin); 338 | fclose(fin); 339 | } 340 | 341 | void InitNet() { 342 | long long a, b; 343 | unsigned long long next_random = 1; 344 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)); 345 | a = posix_memalign((void **)&syn0_gdsq, 128, (long long)vocab_size * layer1_size * sizeof(real)); 346 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} 347 | if (hs) { 348 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); 349 | if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} 350 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 351 | syn1[a * layer1_size + b] = 0; 352 | } 353 | if (negative>0) { 354 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)); 355 | a = posix_memalign((void **)&syn1neg_gdsq, 128, (long long)vocab_size * layer1_size * sizeof(real)); 356 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 357 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++){ 358 | syn1neg[a * layer1_size + b] = 0; 359 | syn1neg_gdsq[a * layer1_size + b] = 1e-8; 360 | } 361 | 362 | } 363 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) { 364 | next_random = next_random * (unsigned long long)25214903917 + 11; 365 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 366 | syn0_gdsq[a * layer1_size + b] = 1e-8; 367 | } 368 | CreateBinaryTree(); 369 | } 370 | 371 | void writeWV(char *output_file){ 372 | long long a, b; 373 | FILE *fo = fopen(output_file, "wb"); 374 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 375 | for (a = 0; a < vocab_size; a++) { 376 | fprintf(fo, "%s ", vocab[a].word); 377 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 378 | else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 379 | fprintf(fo, "\n"); 380 | } 381 | fclose(fo); 382 | } 383 | 384 | typedef unsigned long uint64_t; 385 | typedef unsigned int uint32_t; 386 | 387 | double rsqrt64(double number) { 388 | uint64_t i; 389 | double x2, y; 390 | x2 = number * 0.5; 391 | y = number; 392 | i = *(uint64_t *) &y; 393 | i = 0x5fe6eb50c7b537a9 - (i >> 1); 394 | y = *(double *) &i; 395 | y = y * (1.5 - (x2 * y * y)); 396 | return y; 397 | } 398 | 399 | float rsqrt(float number){ 400 | uint32_t i; 401 | float x2, y; 402 | x2 = number * 0.5F; 403 | y = number; 404 | i = *(uint32_t *) &y; 405 | i = 0x5f3759df - ( i >> 1 ); 406 | y = 
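/* Note (annotation): rsqrt64 and rsqrt here implement the classic "fast
 * inverse square root" bit trick (magic constants 0x5fe6eb50c7b537a9 and
 * 0x5f3759df) followed by one Newton-Raphson step y = y * (1.5 - 0.5*x*y*y),
 * giving a cheap approximation of 1/sqrt(x). They are applied per weight in
 * the adaptive-gradient updates in TrainModelThread below; 1.0f / sqrtf(x)
 * would be a slower but more precise drop-in replacement. */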
*(float *) &i; 407 | y = y * ( 1.5F - ( x2 * y * y ) ); 408 | return y; 409 | } 410 | 411 | void *TrainModelThread(void *_id) { 412 | long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0; 413 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 414 | long long l1, l2, c, target, label; 415 | long long id = (long long)_id; 416 | unsigned long long next_random = id + clock(); //must add ``clock'' if using this kind of iter. 417 | real f, g; 418 | clock_t now; 419 | real *neu1 = (real *)calloc(layer1_size, sizeof(real)); 420 | real *neu1e = (real *)calloc(layer1_size, sizeof(real)); 421 | double err = 0, errV = 0; //loss 422 | long long err_cnt = 0, errV_cnt = 0; 423 | 424 | FILE *fi = fopen(train_file, "rb"); 425 | fseek(fi, file_size / (long long)num_threads * id, SEEK_SET); 426 | while (1) { 427 | if (word_count - last_word_count > 10000) { 428 | loss[id] += err; 429 | lossV[id] += errV; 430 | loss_cnt[id] += err_cnt; 431 | lossV_cnt[id] += errV_cnt; 432 | 433 | sum_loss += err; 434 | sum_lossV += errV; 435 | sum_loss_cnt += err_cnt; 436 | sum_lossV_cnt += errV_cnt; 437 | 438 | err = errV = 0; 439 | err_cnt = errV_cnt = 0; 440 | word_count_actual += word_count - last_word_count; 441 | last_word_count = word_count; 442 | if ((debug_mode > 1) && id == 0) { 443 | now=clock(); 444 | printf("%cAlpha: %f Err: %lf ErrV: %lf Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 445 | -sum_loss / sum_loss_cnt / (negative + 1), 446 | -sum_lossV / sum_lossV_cnt / (negative + 1), 447 | word_count_actual / (real)(train_words + 1) * 100, 448 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 449 | fflush(stdout); 450 | } 451 | //alpha = starting_alpha; 452 | //alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 453 | //if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 454 | } 455 | if (sentence_length == 0) { 456 | while (1) { 457 | word = ReadWordIndex(fi); 458 | if (feof(fi)) break; 459 | if (word == -1) continue; 460 | word_count++; 461 | if (word == 0) break; 462 | // The subsampling randomly discards frequent words while keeping the ranking same 463 | if (sample > 0) { 464 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 465 | next_random = next_random * (unsigned long long)25214903917 + 11; 466 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 467 | } 468 | sen[sentence_length] = word; 469 | sentence_length++; 470 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 471 | } 472 | sentence_position = 0; 473 | } 474 | int validSet = 0; 475 | if(word_count > (int)(0.95 * train_words / num_threads)) 476 | validSet = 1; 477 | if (feof(fi) || (word_count > train_words / num_threads)) { 478 | word_count_actual += word_count - last_word_count; 479 | break; 480 | } 481 | word = sen[sentence_position]; 482 | if (word == -1) continue; 483 | for (c = 0; c < layer1_size; c++) neu1[c] = 0; 484 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 485 | next_random = next_random * (unsigned long long)25214903917 + 11; 486 | //b = next_random % window; 487 | b = 0; 488 | if (cbow) { //train the cbow architecture 489 | // in -> hidden 490 | cw = 0; 491 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 492 | c = sentence_position - window + a; 493 | last_word = sen[c]; 494 | if (c < 0) last_word = 0; 495 | if (c >= sentence_length) last_word = 0; 496 | if (last_word == -1) last_word = 0; 497 | for (c = 0; 
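/* Note (annotation): the subsampling test above keeps a word w with
 * probability p(w) = (sqrt(f/t) + 1) * t / f, where f = cn(w)/train_words and
 * t = sample (default 1e-3). Words with f <= t are always kept, while very
 * frequent words are mostly skipped: a word making up 1% of the corpus is
 * kept only about (sqrt(10)+1)/10 of the time, i.e. roughly 42%. */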
c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size]; 498 | cw++; 499 | } 500 | if (cw == window * 2) { 501 | for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 502 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 503 | f = 0; 504 | l2 = vocab[word].point[d] * layer1_size; 505 | // Propagate hidden -> output 506 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 507 | if (f <= -MAX_EXP) continue; 508 | else if (f >= MAX_EXP) continue; 509 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 510 | // 'g' is the gradient multiplied by the learning rate 511 | g = (1 - vocab[word].code[d] - f) * alpha; 512 | // Propagate errors output -> hidden 513 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 514 | // Learn weights hidden -> output 515 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 516 | } 517 | // NEGATIVE SAMPLING 518 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 519 | if (d == 0) { 520 | target = word; 521 | label = 1; 522 | } else { 523 | next_random = next_random * (unsigned long long)25214903917 + 11; 524 | target = table[(next_random >> 16) % table_size]; 525 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 526 | if (target == word) continue; 527 | label = 0; 528 | } 529 | l2 = target * layer1_size; 530 | f = 0; 531 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; 532 | if (f > MAX_EXP) g = (label - 1); 533 | else if (f < -MAX_EXP) g = (label - 0); 534 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 535 | 536 | if(label) f = -f; 537 | if(validSet) errV += log(1/(1+exp(f))); 538 | else err += log(1/(1+exp(f))); 539 | 540 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 541 | //for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 542 | if(!validSet) for (c = 0; c < layer1_size; c++) { 543 | real diff = g * neu1[c]; 544 | syn1neg_gdsq[c + l2] += diff * diff; 545 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 546 | // 547 | } 548 | } 549 | 550 | if(validSet) errV_cnt++; 551 | else err_cnt++; 552 | // hidden -> in 553 | if(!validSet) for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 554 | c = sentence_position - window + a; 555 | last_word = sen[c]; 556 | if (c < 0) last_word = 0; 557 | if (c >= sentence_length) last_word = 0; 558 | if (last_word == -1) last_word = 0; 559 | //for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 560 | for (c = 0; c < layer1_size; c++) { 561 | real diff = neu1e[c]; 562 | long long p = c + last_word * layer1_size; 563 | syn0_gdsq[p] += diff * diff; 564 | syn0[p] += alpha * diff * rsqrt(syn0_gdsq[p]); 565 | } 566 | } 567 | } 568 | } else { //train skip-gram 569 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 570 | c = sentence_position - window + a; 571 | if (c < 0) continue; 572 | if (c >= sentence_length) continue; 573 | last_word = sen[c]; 574 | if (last_word == -1) continue; 575 | l1 = last_word * layer1_size; 576 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 577 | // HIERARCHICAL SOFTMAX 578 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 579 | f = 0; 580 | l2 = vocab[word].point[d] * layer1_size; 581 | // Propagate hidden -> output 582 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; 583 | if (f <= -MAX_EXP) continue; 584 | else if (f >= MAX_EXP) continue; 585 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 586 | // 'g' is 
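/* Note (annotation): unlike stock word2vec, which applies a plain SGD step
 * with a decaying learning rate, this variant keeps per-parameter
 * squared-gradient accumulators (syn0_gdsq, syn1neg_gdsq, initialized to
 * 1e-8) and updates each weight as
 *   theta += alpha * g / sqrt(G),  with  G += g * g,
 * an AdaGrad-style adaptive step with sqrt approximated by rsqrt(). The
 * commented-out lines show the original constant-alpha updates, and the
 * global alpha decay is likewise commented out near the progress printout. */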
the gradient multiplied by the learning rate 587 | g = (1 - vocab[word].code[d] - f) * alpha; 588 | // Propagate errors output -> hidden 589 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 590 | // Learn weights hidden -> output 591 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; 592 | } 593 | // NEGATIVE SAMPLING 594 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 595 | if (d == 0) { 596 | target = word; 597 | label = 1; 598 | } else { 599 | next_random = next_random * (unsigned long long)25214903917 + 11; 600 | target = table[(next_random >> 16) % table_size]; 601 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 602 | if (target == word) continue; 603 | label = 0; 604 | } 605 | l2 = target * layer1_size; 606 | f = 0; 607 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 608 | if (f > MAX_EXP) g = (label - 1); 609 | else if (f < -MAX_EXP) g = (label - 0); 610 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 611 | 612 | if(label) f = -f; 613 | if(validSet) errV += log(1/(1+exp(f))); 614 | else err += log(1/(1+exp(f))); 615 | 616 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 617 | for (c = 0; c < layer1_size; c++) { 618 | real diff = g * syn0[c + l1]; 619 | syn1neg_gdsq[c + l2] += diff * diff; 620 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 621 | // 622 | } 623 | } 624 | // Learn weights input -> hidden 625 | for (c = 0; c < layer1_size; c++) { 626 | real diff = neu1e[c]; 627 | syn0_gdsq[c + l1] += diff * diff; 628 | syn0[c + l1] += alpha * diff * rsqrt(syn0_gdsq[c + l1]); 629 | } 630 | if(validSet) errV_cnt++; 631 | else err_cnt++; 632 | } 633 | 634 | } 635 | sentence_position++; 636 | if (sentence_position >= sentence_length) { 637 | sentence_length = 0; 638 | continue; 639 | } 640 | } 641 | fclose(fi); 642 | free(neu1); 643 | free(neu1e); 644 | pthread_exit(NULL); 645 | } 646 | 647 | 648 | void TrainModel() { 649 | long a, b, c, d; 650 | FILE *fo; 651 | char ffname[100]; 652 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 653 | loss = (double *)malloc(num_threads * sizeof(double)); 654 | lossV = (double *)malloc(num_threads * sizeof(double)); 655 | loss_cnt = (long long *)malloc(num_threads * sizeof(long long)); 656 | lossV_cnt = (long long *)malloc(num_threads * sizeof(long long)); 657 | 658 | printf("Starting training using file %s\n", train_file); 659 | starting_alpha = alpha; 660 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 661 | if (save_vocab_file[0] != 0) SaveVocab(); 662 | if (output_file[0] == 0) return; 663 | InitNet(); 664 | if (negative > 0) InitUnigramTable(); 665 | 666 | for(b = 1; b <= iter; b++){ 667 | start = clock(); 668 | word_count_actual = 0; 669 | for (a = 0; a < num_threads; a++){ 670 | loss[a] = lossV[a] = 0; 671 | loss_cnt[a] = lossV_cnt[a] = 0; 672 | } 673 | sum_loss = sum_lossV = 0; 674 | sum_loss_cnt = sum_lossV_cnt = 0; 675 | 676 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 677 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 678 | 679 | if(b < 100 || 680 | b < 1000 && b % 10 == 0 || 681 | b % 100 == 0){ 682 | sprintf(ffname, "%s_%ld", output_file, b); 683 | writeWV(ffname); 684 | } 685 | 686 | printf("%c", 13); 687 | 688 | sum_loss = sum_lossV = 0; 689 | sum_loss_cnt = sum_lossV_cnt = 0; 690 | for (a = 0; a < num_threads; a++){ 691 | sum_loss += loss[a]; 692 | sum_lossV += 
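/* Note (annotation): each thread treats the last 5% of its share of the
 * corpus as a held-out set (validSet above) and accumulates a separate
 * validation loss (lossV), which is what the per-iteration "ErrV" figure
 * below reports. Intermediate embeddings are written to files named
 * "<output_file>_<iteration>" after every iteration below 100, every 10th
 * iteration below 1000, and every 100th iteration beyond that, which appears
 * to be the naming scheme the scripts under evaluation/ later parse. */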
lossV[a]; 693 | sum_loss_cnt += loss_cnt[a]; 694 | sum_lossV_cnt += lossV_cnt[a]; 695 | } 696 | fprintf(stderr, "Iter: %ld Err: %lf ErrV: %lf\n", b, 697 | -sum_loss / sum_loss_cnt / (negative + 1), 698 | -sum_lossV / sum_lossV_cnt / (negative + 1)); 699 | fflush(stderr); 700 | } 701 | 702 | if (classes == 0) { 703 | // Save the word vectors 704 | //writeWV(output_file); 705 | } else { 706 | fo = fopen(output_file, "wb"); 707 | // Run K-means on the word vectors 708 | int clcn = classes, iter = 10, closeid; 709 | int *centcn = (int *)malloc(classes * sizeof(int)); 710 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 711 | real closev, x; 712 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 713 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 714 | for (a = 0; a < iter; a++) { 715 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 716 | for (b = 0; b < clcn; b++) centcn[b] = 1; 717 | for (c = 0; c < vocab_size; c++) { 718 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 719 | centcn[cl[c]]++; 720 | } 721 | for (b = 0; b < clcn; b++) { 722 | closev = 0; 723 | for (c = 0; c < layer1_size; c++) { 724 | cent[layer1_size * b + c] /= centcn[b]; 725 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 726 | } 727 | closev = sqrt(closev); 728 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 729 | } 730 | for (c = 0; c < vocab_size; c++) { 731 | closev = -10; 732 | closeid = 0; 733 | for (d = 0; d < clcn; d++) { 734 | x = 0; 735 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 736 | if (x > closev) { 737 | closev = x; 738 | closeid = d; 739 | } 740 | } 741 | cl[c] = closeid; 742 | } 743 | } 744 | // Save the K-means classes 745 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 746 | free(centcn); 747 | free(cent); 748 | free(cl); 749 | fclose(fo); 750 | } 751 | 752 | } 753 | 754 | int ArgPos(char *str, int argc, char **argv) { 755 | int a; 756 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 757 | if (a == argc - 1) { 758 | printf("Argument missing for %s\n", str); 759 | exit(1); 760 | } 761 | return a; 762 | } 763 | return -1; 764 | } 765 | 766 | int main(int argc, char **argv) { 767 | int i; 768 | if (argc == 1) { 769 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 770 | printf("Options:\n"); 771 | printf("Parameters for training:\n"); 772 | printf("\t-train \n"); 773 | printf("\t\tUse text data from to train the model\n"); 774 | printf("\t-output \n"); 775 | printf("\t\tUse to save the resulting word vectors / word clusters\n"); 776 | printf("\t-size \n"); 777 | printf("\t\tSet size of word vectors; default is 100\n"); 778 | printf("\t-window \n"); 779 | printf("\t\tSet max skip length between words; default is 5\n"); 780 | printf("\t-sample \n"); 781 | printf("\t\tSet threshold for occurrence of words. 
Those that appear with higher frequency in the training data\n"); 782 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n"); 783 | printf("\t-hs \n"); 784 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 785 | printf("\t-negative \n"); 786 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n"); 787 | printf("\t-threads \n"); 788 | printf("\t\tUse threads (default 12)\n"); 789 | printf("\t-iter \n"); 790 | printf("\t\tRun more training iterations (default 5)\n"); 791 | printf("\t-min-count \n"); 792 | printf("\t\tThis will discard words that appear less than times; default is 5\n"); 793 | printf("\t-alpha \n"); 794 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n"); 795 | printf("\t-classes \n"); 796 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 797 | printf("\t-debug \n"); 798 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 799 | printf("\t-binary \n"); 800 | printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"); 801 | printf("\t-save-vocab \n"); 802 | printf("\t\tThe vocabulary will be saved to \n"); 803 | printf("\t-read-vocab \n"); 804 | printf("\t\tThe vocabulary will be read from , not constructed from the training data\n"); 805 | printf("\t-cbow \n"); 806 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n"); 807 | printf("\nExamples:\n"); 808 | printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 809 | return 0; 810 | } 811 | output_file[0] = 0; 812 | save_vocab_file[0] = 0; 813 | read_vocab_file[0] = 0; 814 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 815 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 816 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 817 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 818 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 819 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 820 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 821 | if (cbow) alpha = 0.05; 822 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 823 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 824 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 825 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 826 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 827 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 828 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 829 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 830 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 831 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 832 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 833 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 834 | 
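/* Note (annotation): the table filled in below precomputes the logistic
 * function: entry i holds sigmoid(x_i) = exp(x_i) / (exp(x_i) + 1) with
 * x_i = (2*i/EXP_TABLE_SIZE - 1) * MAX_EXP, i.e. 1000 samples of sigma(x) on
 * [-6, 6). During training a dot product f in that range is looked up as
 * expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))], and values
 * outside [-MAX_EXP, MAX_EXP] are handled specially (skipped in the
 * hierarchical-softmax branch, clamped to label - 1 or label - 0 in the
 * negative-sampling branch). */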
expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 835 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 836 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 837 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 838 | } 839 | TrainModel(); 840 | return 0; 841 | } 842 | -------------------------------------------------------------------------------- /evaluation/avg/README.md: -------------------------------------------------------------------------------- 1 | # avg 2 | 1. install liblinear 3 | 2. compile avg_embedding.cpp 4 | 3. run avg.py 5 | -------------------------------------------------------------------------------- /evaluation/avg/avg.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | 5 | #models = ["hidden", "lbl", "senna"] 6 | #models = ["rand"] 7 | models = ["ivlblskip", "ivlblcbow"] 8 | 9 | liblinear_dir = "liblinear-1.94" 10 | 11 | def func(msg, vec_dir, ret_dir): 12 | vec_file = "%s/%s" % (vec_dir, msg) 13 | train_file = "%s_%s_train.txt" % (vec_dir, msg) 14 | test_file = "%s_%s_test.txt" % (vec_dir, msg) 15 | model_file = "%s.model" % train_file 16 | tmp_file = "%s_%s_out" % (vec_dir, msg) 17 | out_file = "%s/%s" % (ret_dir, msg) 18 | os.system("./avg_embedding %s imdb_train.txt imdb_test.txt %s %s" % (vec_file, train_file, test_file)) 19 | os.system("%s/train %s" % (liblinear_dir, train_file)) 20 | os.system("%s/predict %s %s %s > %s" % (liblinear_dir, train_file, model_file, tmp_file, out_file)) 21 | os.system("%s/predict %s %s %s >> %s" % (liblinear_dir, test_file, model_file, tmp_file, out_file)) 22 | os.system("rm %s" % train_file) 23 | os.system("rm %s" % test_file) 24 | os.system("rm %s" % model_file) 25 | os.system("rm %s" % tmp_file) 26 | 27 | 28 | if __name__ == "__main__": 29 | pool = multiprocessing.Pool(processes=20) 30 | 31 | for model in models: 32 | vec_dir = "vec_%s" % model 33 | ret_dir = "ret_avg2_%s" % model 34 | 35 | if not os.path.exists(ret_dir): 36 | os.makedirs(ret_dir) 37 | 38 | for lists in os.listdir(vec_dir): 39 | if not os.path.exists(os.path.join(ret_dir, lists)): 40 | 41 | x = lists.replace('.txt','').replace('.bz2','').split('_') 42 | #if not "v50" in lists: 43 | # continue 44 | if int(x[-1]) > 10 and "10m" in lists and int(x[-1]) % 100 != 0: 45 | continue 46 | if int(x[-1]) > 10 and "100m" in lists and int(x[-1]) % 10 != 0: 47 | continue 48 | print lists 49 | pool.apply_async(func, (lists, vec_dir, ret_dir, )) 50 | pool.close() 51 | pool.join() 52 | print "Sub-process(es) done." 53 | 54 | 55 | -------------------------------------------------------------------------------- /evaluation/avg/avg_embedding.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/licstar/compare/e0983f7f7c87e5b653fa2d034e2e9b962583fb8d/evaluation/avg/avg_embedding.cpp -------------------------------------------------------------------------------- /evaluation/cnn/README.md: -------------------------------------------------------------------------------- 1 | # cnn 2 | 1. make 3 | 2. 
run cnn.py 4 | -------------------------------------------------------------------------------- /evaluation/cnn/cnn.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | 5 | #models = ["hidden", "lbl", "senna"] 6 | models = ["ivlblskip", "ivlblcbow"] 7 | #models = ["rand"] 8 | 9 | def func(msg, vec_dir, ret_dir): 10 | vec_file = "%s/%s" % (vec_dir, msg) 11 | out_file = "%s/%s" % (ret_dir, msg) 12 | for i in range(0, 5): 13 | os.system("./cnn_senna %s tree_train.txt tree_test.txt 5 %d tree_dev.txt 5 90 >> %s" % (vec_file, i, out_file)) 14 | #os.system("./cnn_senna %s tree_train.txt tree_test.txt 5 1 tree_dev.txt 5 90 >> %s" % (vec_file, out_file)) 15 | #os.system("./cnn_senna %s tree_train.txt tree_test.txt 5 2 tree_dev.txt 5 90 >> %s" % (vec_file, out_file)) 16 | #os.system("./cnn_senna %s tree_train.txt tree_test.txt 5 3 tree_dev.txt 5 90 >> %s" % (vec_file, out_file)) 17 | #os.system("./cnn_senna %s tree_train.txt tree_test.txt 5 4 tree_dev.txt 5 90 >> %s" % (vec_file, out_file)) 18 | 19 | 20 | 21 | 22 | if __name__ == "__main__": 23 | pool = multiprocessing.Pool(processes=1) 24 | 25 | for model in models: 26 | vec_dir = "vec_%s" % model 27 | ret_dir = "ret_cnn_%s" % model 28 | 29 | if not os.path.exists(ret_dir): 30 | os.makedirs(ret_dir) 31 | #func('50_2_ns5_16') 32 | 33 | for lists in os.listdir(vec_dir): 34 | if not os.path.exists(os.path.join(ret_dir, lists)): 35 | 36 | x = lists.replace('.txt','').replace('.bz2','').split('_') 37 | #if not "v50" in lists: 38 | # continue 39 | iter = int(x[-1]) 40 | if not (iter == 1 or iter == 3 or iter == 5 or iter == 20 or iter == 10 or iter == 33 or iter == 100 or iter == 1000 or iter == 10000): 41 | continue 42 | print lists 43 | pool.apply_async(func, (lists, vec_dir, ret_dir, )) 44 | pool.close() 45 | pool.join() 46 | print "Sub-process(es) done." 47 | 48 | 49 | -------------------------------------------------------------------------------- /evaluation/cnn/cnn_senna.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/licstar/compare/e0983f7f7c87e5b653fa2d034e2e9b962583fb8d/evaluation/cnn/cnn_senna.cpp -------------------------------------------------------------------------------- /evaluation/cnn/fileutil.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/licstar/compare/e0983f7f7c87e5b653fa2d034e2e9b962583fb8d/evaluation/cnn/fileutil.hpp -------------------------------------------------------------------------------- /evaluation/cnn/makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | GCC = gcc 3 | CFLAGS = -lm -O2 -Wall -funroll-loops -ffast-math 4 | #CFLAGS = -lm -O2 -Wall 5 | 6 | all: cnn_senna 7 | 8 | 9 | cnn_senna : cnn_senna.cpp 10 | $(CC) $(CFLAGS) $(OPT_DEF) cnn_senna.cpp -DLINUX -fopenmp -O2 -g -std=c++0x -o cnn_senna 11 | 12 | clean: 13 | rm -rf *.o cnn_senna 14 | -------------------------------------------------------------------------------- /evaluation/ner/README.md: -------------------------------------------------------------------------------- 1 | # ner 2 | 1. download http://cogcomp.cs.illinois.edu/experiments/ACL2010_NER_Experiments.zip. 3 | 2. unzip Data folder in ACL2010_NER_Experiments.zip. 4 | 3. 
run ner.py 5 | 6 | ner.jar is modified from http://cogcomp.cs.illinois.edu/Data/ACL2010_NER_Experiments.php -------------------------------------------------------------------------------- /evaluation/ner/default.config: -------------------------------------------------------------------------------- 1 | configFilename cwRcv50DimOverall0.3 2 | sortLexicallyFilesInFolders true 3 | treatAllFilesInFolderAsOneBigDocument false 4 | inferenceMethod GREEDY 5 | beamSize 5 6 | thresholdPrediction false 7 | predictionConfidenceThreshold -1 8 | labelTypes PER ORG LOC MISC 9 | logging true 10 | debuggingLogPath DebugLog/ 11 | taggingEncodingScheme BILOU 12 | pathToGazetteers Data/KnownLists 13 | pathsToBrownClusters Data/BrownHierarchicalWordClusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt Data/BrownHierarchicalWordClusters/brownBllipClusters Data/BrownHierarchicalWordClusters/rcv1.clean.tokenized-c1000-p1.paths.txt 14 | minWordAppThresholdsForBrownClusters 5 5 5 15 | isLowercaseBrownClusters false false false 16 | pathsToWordEmbeddings Data/WordEmbedding/model-2280000000.LEARNING_RATE=1e-08.EMBEDDING_LEARNING_RATE=1e-07.EMBEDDING_SIZE=50.txt 17 | embeddingDimensionalities 50 18 | minWordAppThresholdsForEmbeddings 0 19 | normalizationConstantsForEmbeddings 0.3 20 | normalizationMethodsForEmbeddings OVERALL 21 | isLowercaseWordEmbeddings false 22 | pathToModelFile Data/Models/ 23 | tokenizationScheme DualTokenizationScheme 24 | Forms 1 25 | Capitalization 1 26 | WordTypeInformation 1 27 | Affixes 1 28 | PreviousTag1 1 29 | PreviousTag2 1 30 | GazetteersFeatures 0 31 | WordEmbeddings 1 32 | BrownClusterPaths 0 33 | aggregateContext 0 34 | aggregateGazetteerMatches 0 35 | prevTagsForContext 0 36 | PredictionsLevel1 0 -------------------------------------------------------------------------------- /evaluation/ner/ner.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/licstar/compare/e0983f7f7c87e5b653fa2d034e2e9b962583fb8d/evaluation/ner/ner.jar -------------------------------------------------------------------------------- /evaluation/ner/ner.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | 5 | 6 | models = ["ivlblskip", "ivlblcbow"] 7 | #models = ["rand"] 8 | 9 | def func(msg, vec_dir, ret_dir): 10 | vec_file = "%s/%s" % (vec_dir, msg) 11 | out_file = "%s/%s" % (ret_dir, msg) 12 | arg = "java -jar ner.jar %s %s > %s" % (vec_file.replace("/","_").replace(":","_"), vec_file, out_file) 13 | if not os.path.exists(out_file): 14 | print arg 15 | os.system(arg) 16 | 17 | 18 | if __name__ == "__main__": 19 | pool = multiprocessing.Pool(processes=3) 20 | 21 | for model in models: 22 | vec_dir = "vec_%s" % model 23 | ret_dir = "ret_ner_%s" % model 24 | 25 | if not os.path.exists(ret_dir): 26 | os.makedirs(ret_dir) 27 | 28 | for lists in os.listdir(vec_dir): 29 | if not os.path.exists(os.path.join(ret_dir, lists)): 30 | 31 | x = lists.replace('.txt','').replace('.bz2','').split('_') 32 | #if not "all_1b" in lists: 33 | # continue 34 | iter = int(x[-1]) 35 | if not (iter == 1 or iter == 3 or iter == 5 or iter == 20 or iter == 10 or iter == 33 or iter == 100 or iter == 1000 or iter == 10000): 36 | continue 37 | print lists 38 | pool.apply_async(func, (lists, vec_dir, ret_dir, )) 39 | pool.close() 40 | pool.join() 41 | print "Sub-process(es) done." 
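# Note (annotation): the evaluation drivers (ner.py, pos.py, cnn.py, avg.py,
# king.py, toefl.py) are Python 2 scripts (print statements without
# parentheses) that all follow the same pattern: for each model, scan
# vec_<model>/ for embedding files whose names end in "_<iteration>" (as
# written by the training code), keep only iterations on the sampling
# schedule, and run one evaluation subprocess per file, writing its output
# into a ret_* results directory under the same file name. A minimal sketch
# of the filename filter, assuming a hypothetical name like "100m_v50_123.txt":
#
#   name = "100m_v50_123.txt"
#   iteration = int(name.replace('.txt', '').replace('.bz2', '').split('_')[-1])
#   keep = iteration in (1, 3, 5, 10, 20, 33, 100, 1000, 10000)  # ner/pos/cnn; avg and king use a modulo rule instead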
42 | 43 | 44 | -------------------------------------------------------------------------------- /evaluation/pos/README.md: -------------------------------------------------------------------------------- 1 | # pos 2 | 1. make 3 | 2. run pos.py 4 | -------------------------------------------------------------------------------- /evaluation/pos/fileutil.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/licstar/compare/e0983f7f7c87e5b653fa2d034e2e9b962583fb8d/evaluation/pos/fileutil.hpp -------------------------------------------------------------------------------- /evaluation/pos/makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | GCC = gcc 3 | CFLAGS = -lm -O2 -Wall -funroll-loops -ffast-math 4 | #CFLAGS = -lm -O2 -Wall 5 | 6 | all: senna_tag 7 | 8 | senna_tag : sennaseg.cpp 9 | $(CC) $(CFLAGS) $(OPT_DEF) sennaseg.cpp -fopenmp -DLINUX -o senna_tag 10 | 11 | clean: 12 | rm -rf *.o senna_tag 13 | -------------------------------------------------------------------------------- /evaluation/pos/pos.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | 5 | #models = ["hidden", "lbl", "senna"] 6 | #models = ["rand"] 7 | models = ["ivlblskip", "ivlblcbow"] 8 | 9 | def func(msg, vec_dir, ret_dir): 10 | vec_file = "%s/%s" % (vec_dir, msg) 11 | out_file = "%s/%s" % (ret_dir, msg) 12 | arg = "./senna_tag %s > %s" % (vec_file, out_file) 13 | print arg 14 | os.system(arg) 15 | 16 | 17 | if __name__ == "__main__": 18 | pool = multiprocessing.Pool(processes=8) 19 | 20 | for model in models: 21 | vec_dir = "vec_%s" % model 22 | ret_dir = "ret_pos_%s" % model 23 | 24 | if not os.path.exists(ret_dir): 25 | os.makedirs(ret_dir) 26 | 27 | for lists in os.listdir(vec_dir): 28 | if not os.path.exists(os.path.join(ret_dir, lists)): 29 | 30 | x = lists.replace('.txt','').replace('.bz2','').split('_') 31 | #if not "v50" in lists: 32 | # continue 33 | iter = int(x[-1]) 34 | if not (iter == 1 or iter == 3 or iter == 10 or iter == 5 or iter == 20 or iter == 33 or iter == 100 or iter == 1000 or iter == 10000): 35 | continue 36 | print lists 37 | pool.apply_async(func, (lists, vec_dir, ret_dir, )) 38 | pool.close() 39 | pool.join() 40 | print "Sub-process(es) done." 41 | 42 | 43 | -------------------------------------------------------------------------------- /evaluation/pos/sennaseg.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/licstar/compare/e0983f7f7c87e5b653fa2d034e2e9b962583fb8d/evaluation/pos/sennaseg.cpp -------------------------------------------------------------------------------- /evaluation/syn_sem/README.md: -------------------------------------------------------------------------------- 1 | # syn/sem 2 | 1. compile compute-accuracy-txt.c 3 | 2. run king.py 4 | -------------------------------------------------------------------------------- /evaluation/syn_sem/compute-accuracy-txt.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | const long long max_size = 2000; // max length of strings 23 | const long long N = 1; // number of closest words 24 | const long long max_w = 50; // max length of vocabulary entries 25 | 26 | int main(int argc, char **argv) 27 | { 28 | FILE *f; 29 | char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch; 30 | float dist, len, bestd[N], vec[max_size]; 31 | long long words, size, a, b, c, d, b1, b2, b3, threshold = 0; 32 | float *M; 33 | char *vocab; 34 | int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0; 35 | if (argc < 2) { 36 | printf("Usage: ./compute-accuracy \nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n"); 37 | return 0; 38 | } 39 | strcpy(file_name, argv[1]); 40 | if (argc > 2) threshold = atoi(argv[2]); 41 | f = fopen(file_name, "rb"); 42 | if (f == NULL) { 43 | printf("Input file not found\n"); 44 | return -1; 45 | } 46 | fscanf(f, "%lld", &words); 47 | if (threshold) if (words > threshold) words = threshold; 48 | fscanf(f, "%lld", &size); 49 | vocab = (char *)malloc(words * max_w * sizeof(char)); 50 | M = (float *)malloc(words * size * sizeof(float)); 51 | if (M == NULL) { 52 | printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576); 53 | return -1; 54 | } 55 | for (b = 0; b < words; b++) { 56 | fscanf(f, "%s%c", &vocab[b * max_w], &ch); 57 | for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]); 58 | for (a = 0; a < size; a++) fscanf(f, "%f", &M[a + b * size]);//fread(&M[a + b * size], sizeof(float), 1, f); 59 | len = 0; 60 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 61 | len = sqrt(len); 62 | for (a = 0; a < size; a++) M[a + b * size] /= len; 63 | } 64 | fclose(f); 65 | TCN = 0; 66 | while (1) { 67 | for (a = 0; a < N; a++) bestd[a] = 0; 68 | for (a = 0; a < N; a++) bestw[a][0] = 0; 69 | scanf("%s", st1); 70 | for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]); 71 | if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) { 72 | if (TCN == 0) TCN = 1; 73 | if (QID != 0) { 74 | printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN); 75 | printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100); 76 | } 77 | QID++; 78 | scanf("%s", st1); 79 | if (feof(stdin)) break; 80 | printf("%s:\n", st1); 81 | TCN = 0; 82 | CCN = 0; 83 | continue; 84 | } 85 | if (!strcmp(st1, "EXIT")) break; 86 | scanf("%s", st2); 87 | for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]); 88 | scanf("%s", st3); 89 | for (a = 0; a bestd[a]) { 116 | for (d = N - 1; d > a; d--) { 117 | bestd[d] = bestd[d - 1]; 118 | strcpy(bestw[d], bestw[d - 1]); 119 | } 120 | bestd[a] = dist; 121 | strcpy(bestw[a], 
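/* Note (annotation): this tool scores the questions-words.txt analogy set in
 * the standard word2vec compute-accuracy fashion -- all vectors are
 * length-normalized at load time, each question "a : b :: c : ?" is answered
 * by the vocabulary word whose vector has the highest dot product (cosine
 * similarity) with b - a + c, and the first five question sections count
 * toward semantic accuracy, the remaining sections toward syntactic accuracy.
 * N = 1 here, so only the top-ranked candidate is checked. */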
&vocab[c * max_w]); 122 | break; 123 | } 124 | } 125 | } 126 | if (!strcmp(st4, bestw[0])) { 127 | CCN++; 128 | CACN++; 129 | if (QID <= 5) SEAC++; else SYAC++; 130 | } 131 | if (QID <= 5) SECN++; else SYCN++; 132 | TCN++; 133 | TACN++; 134 | } 135 | printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, TQS/(float)TQ*100); 136 | return 0; 137 | } 138 | -------------------------------------------------------------------------------- /evaluation/syn_sem/king.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | 5 | #models = ["hidden", "lbl", "senna"] 6 | models = ["ivlblskip", "ivlblcbow"] 7 | 8 | def func(msg, vec_dir, ret_dir): 9 | arg = './compute-accuracy-txt %s/%s 0 < questions-words.txt > %s/%s' % (vec_dir, msg, ret_dir, msg) 10 | print arg 11 | os.system(arg) 12 | 13 | 14 | if __name__ == "__main__": 15 | pool = multiprocessing.Pool(processes=16) 16 | 17 | for model in models: 18 | vec_dir = "vec_%s" % model 19 | ret_dir = "ret_king_%s" % model 20 | 21 | if not os.path.exists(ret_dir): 22 | os.makedirs(ret_dir) 23 | 24 | for lists in os.listdir(vec_dir): 25 | if not os.path.exists(os.path.join(ret_dir, lists)): 26 | x = lists.replace('.txt','').replace('.bz2','').split('_') 27 | #if int(x[-1]) != 1 and int(x[-1]) % 5 != 0: 28 | # continue 29 | #if not "v50" in lists: 30 | # continue 31 | print model, lists 32 | if int(x[-1]) > 10 and "10m" in lists and int(x[-1]) % 100 != 0: 33 | continue 34 | if int(x[-1]) > 10 and "100m" in lists and int(x[-1]) % 10 != 0: 35 | continue 36 | 37 | pool.apply_async(func, (lists, vec_dir, ret_dir, )) 38 | pool.close() 39 | pool.join() 40 | print "Sub-process(es) done." 41 | 42 | 43 | -------------------------------------------------------------------------------- /evaluation/tfl/README.md: -------------------------------------------------------------------------------- 1 | # tfl 2 | 1. compile toefl.cpp 3 | 2. 
run toefl.py 4 | -------------------------------------------------------------------------------- /evaluation/tfl/toefl.cpp: -------------------------------------------------------------------------------- 1 | #define _CRT_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | using namespace std; 9 | struct node { 10 | string w; 11 | string c[4]; 12 | int ans; 13 | }; 14 | map dict; 15 | 16 | const int MAX_STRING = 1000; 17 | int size; 18 | int ReadEmbedding(const char *file_name) { 19 | FILE *f = fopen(file_name, "rb"); 20 | if (f == NULL) { 21 | printf("Embedding file not found\n"); 22 | return -1; 23 | } 24 | int wordNum; 25 | fscanf(f, "%d", &wordNum); 26 | fscanf(f, "%d", &size); 27 | 28 | char str[MAX_STRING]; 29 | double *tmp = new double[size]; 30 | for (int b = 0; b < wordNum; b++) { 31 | char ch; 32 | fscanf(f, "%s%c", str, &ch); 33 | /*for (int i = 0; str[i]; i++){ 34 | if (str[i] >= 'A' && str[i] <= 'Z'){ 35 | str[i] = str[i] - 'A' + 'a'; 36 | } 37 | }*/ 38 | map::iterator it = dict.find(str); 39 | double *v = tmp; 40 | if (it != dict.end()) { 41 | if (it->second == NULL) { 42 | it->second = new double[size]; 43 | v = it->second; 44 | } 45 | } 46 | for (int a = 0; a < size; a++) 47 | fscanf(f, "%lf", &v[a]); 48 | } 49 | fclose(f); 50 | return 0; 51 | } 52 | 53 | const double eps = 1e-8; 54 | double cosvec(double *a, double *b) { 55 | if (a == NULL || b == NULL) 56 | return 0; 57 | double t1 = 0, t2 = 0, t3 = 0; 58 | for (int i = 0; i < size; i++) { 59 | t1 += a[i] * b[i]; 60 | t2 += a[i] * a[i]; 61 | t3 += b[i] * b[i]; 62 | } 63 | return t1 / sqrt(t2 + eps) / sqrt(t3 + eps); 64 | } 65 | 66 | 67 | double pearson(vector &a, vector &b) { 68 | double avg_a = 0, avg_b = 0; 69 | int n = a.size(); 70 | for (int i = 0; i < n; i++) { 71 | avg_a += a[i]; 72 | avg_b += b[i]; 73 | } 74 | avg_a /= n; 75 | avg_b /= n; 76 | double v1 = 0, v2 = 0, v3 = 0; 77 | for (int i = 0; i < n; i++) { 78 | v1 += (a[i] - avg_a) * (b[i] - avg_b); 79 | v2 += (a[i] - avg_a) * (a[i] - avg_a); 80 | v3 += (b[i] - avg_b) * (b[i] - avg_b); 81 | } 82 | return v1 / sqrt(v2 + eps) / sqrt(v3 + eps); 83 | } 84 | 85 | void solve(const char *dataset, const char *embedding) { 86 | vector lst; 87 | 88 | char w1[MAX_STRING], w2[MAX_STRING]; 89 | double val; 90 | FILE *fd = fopen(dataset, "r"); 91 | while (fscanf(fd, "%s%s", w1, w2) != EOF) { 92 | node n; 93 | n.w = w2; 94 | dict[w2] = NULL; 95 | for (int i = 0; i < 4; i++) { 96 | fscanf(fd, "%s%s", w1, w2); 97 | n.c[i] = w2; 98 | dict[w2] = NULL; 99 | } 100 | fscanf(fd, "%s", w1); 101 | n.ans = w1[0] - 'a'; 102 | lst.push_back(n); 103 | } 104 | fclose(fd); 105 | 106 | if (embedding) 107 | ReadEmbedding(embedding); 108 | 109 | for (map::iterator it = dict.begin(); it != dict.end(); it++) { 110 | if (it->second == NULL) { 111 | fprintf(stderr, "cannot find word: %s\n", it->first.c_str()); 112 | } 113 | } 114 | 115 | int correct = 0; 116 | for (int i = 0; i < lst.size(); i++) { 117 | double *v1 = dict[lst[i].w]; 118 | int bestid = -1; 119 | double best = -1; 120 | for (int j = 0; j < 4; j++) { 121 | double *v2 = dict[lst[i].c[j]]; 122 | double s = cosvec(v1, v2); 123 | if (s > best) { 124 | best = s; 125 | bestid = j; 126 | } 127 | } 128 | if (bestid == lst[i].ans) { 129 | correct++; 130 | } 131 | } 132 | printf("%lf\n", 1.0*correct / lst.size()); 133 | } 134 | 135 | 136 | int main(int argc, char **argv) { 137 | if (argc != 2) { 138 | printf("Useage: ./toefl embedding > result\n"); 139 | return 0; 140 | } 141 | 142 | 
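/* Note (annotation): toefl.txt holds 80 synonym questions, each a target
 * word, four candidate words and the answer letter. solve() below loads the
 * embedding, scores every candidate by cosine similarity to the target
 * (cosvec returns 0 when either word is missing from the embedding; missing
 * words are reported on stderr), picks the highest-scoring candidate, and
 * prints the fraction of the 80 questions answered correctly. */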
solve("toefl.txt", argv[1]); 143 | 144 | return 0; 145 | } 146 | -------------------------------------------------------------------------------- /evaluation/tfl/toefl.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | 5 | #models = ["hidden", "lbl", "senna"] 6 | #models = ["turian"] 7 | models = ["ivlblskip", "ivlblcbow"] 8 | 9 | def func(msg, vec_dir, ret_dir): 10 | arg = './toefl %s/%s > %s/%s' % (vec_dir, msg, ret_dir, msg) 11 | print arg 12 | os.system(arg) 13 | 14 | 15 | if __name__ == "__main__": 16 | pool = multiprocessing.Pool(processes=16) 17 | 18 | for model in models: 19 | vec_dir = "vec_%s" % model 20 | ret_dir = "ret_toefl_%s" % model 21 | 22 | if not os.path.exists(ret_dir): 23 | os.makedirs(ret_dir) 24 | 25 | for lists in os.listdir(vec_dir): 26 | if not os.path.exists(os.path.join(ret_dir, lists)): 27 | 28 | x = lists.replace('.txt','').replace('.bz2','').split('_') 29 | #if not "v50" in lists: 30 | # continue 31 | #if int(x[-1]) > 10 and ("10m" in lists or "13m" in lists) and int(x[-1]) % 100 != 0: 32 | # continue 33 | #if int(x[-1]) > 10 and "100m" in lists and int(x[-1]) % 10 != 0: 34 | # continue 35 | print lists 36 | pool.apply_async(func, (lists, vec_dir, ret_dir, )) 37 | pool.close() 38 | pool.join() 39 | print "Sub-process(es) done." 40 | 41 | 42 | -------------------------------------------------------------------------------- /evaluation/tfl/toefl.txt: -------------------------------------------------------------------------------- 1 | 1. enormously 2 | a. appropriately 3 | b. uniquely 4 | c. tremendously 5 | d. decidedly 6 | c 7 | 2. provisions 8 | a. stipulations 9 | b. interrelations 10 | c. jurisdictions 11 | d. interpretations 12 | a 13 | 3. haphazardly 14 | a. dangerously 15 | b. densely 16 | c. randomly 17 | d. linearly 18 | c 19 | 4. prominent 20 | a. battered 21 | b. ancient 22 | c. mysterious 23 | d. conspicuous 24 | d 25 | 5. zenith 26 | a. completion 27 | b. pinnacle 28 | c. outset 29 | d. decline 30 | b 31 | 6. flawed 32 | a. tiny 33 | b. imperfect 34 | c. lustrous 35 | d. crude 36 | b 37 | 7. urgently 38 | a. typically 39 | b. conceivably 40 | c. tentatively 41 | d. desperately 42 | d 43 | 8. consumed 44 | a. bred 45 | b. caught 46 | c. eaten 47 | d. supplied 48 | c 49 | 9. advent 50 | a. coming 51 | b. arrest 52 | c. financing 53 | d. stability 54 | a 55 | 10. concisely 56 | a. powerfully 57 | b. positively 58 | c. freely 59 | d. succinctly 60 | d 61 | 11. salutes 62 | a. information 63 | b. ceremonies 64 | c. greetings 65 | d. privileges 66 | c 67 | 12. solitary 68 | a. alert 69 | b. restless 70 | c. alone 71 | d. fearless 72 | c 73 | 13. hasten 74 | a. permit 75 | b. determine 76 | c. accelerate 77 | d. accompany 78 | c 79 | 14. perseverance 80 | a. endurance 81 | b. skill 82 | c. generosity 83 | d. disturbance 84 | a 85 | 15. fanciful 86 | a. familiar 87 | b. imaginative 88 | c. apparent 89 | d. logical 90 | b 91 | 16. showed 92 | a. demonstrated 93 | b. published 94 | c. repeated 95 | d. postponed 96 | a 97 | 17. constantly 98 | a. instantly 99 | b. continually 100 | c. rapidly 101 | d. accidentally 102 | b 103 | 18. issues 104 | a. training 105 | b. salaries 106 | c. subjects 107 | d. benefits 108 | c 109 | 19. furnish 110 | a. supply 111 | b. impress 112 | c. protect 113 | d. advise 114 | a 115 | 20. costly 116 | a. expensive 117 | b. beautiful 118 | c. popular 119 | d. complicated 120 | a 121 | 21. recognized 122 | a. successful 123 | b. 
depicted 124 | c. acknowledged 125 | d. welcomed 126 | c 127 | 22. spot 128 | a. climate 129 | b. latitude 130 | c. sea 131 | d. location 132 | d 133 | 23. make 134 | a. earn 135 | b. print 136 | c. trade 137 | d. borrow 138 | a 139 | 24. often 140 | a. definitely 141 | b. frequently 142 | c. chemically 143 | d. hardly 144 | b 145 | 25. easygoing 146 | a. frontier 147 | b. boring 148 | c. farming 149 | d. relaxed 150 | d 151 | 26. debate 152 | a. war 153 | b. argument 154 | c. election 155 | d. competition 156 | b 157 | 27. narrow 158 | a. clear 159 | b. freezing 160 | c. thin 161 | d. poisonous 162 | c 163 | 28. arranged 164 | a. planned 165 | b. explained 166 | c. studied 167 | d. discarded 168 | a 169 | 29. infinite 170 | a. limitless 171 | b. relative 172 | c. unusual 173 | d. structural 174 | a 175 | 30. showy 176 | a. striking 177 | b. prickly 178 | c. entertaining 179 | d. incidental 180 | a 181 | 31. levied 182 | a. imposed 183 | b. believed 184 | c. requested 185 | d. correlated 186 | a 187 | 32. deftly 188 | a. skillfully 189 | b. prudently 190 | c. occasionally 191 | d. humorously 192 | a 193 | 33. distribute 194 | a. commercialize 195 | b. circulate 196 | c. research 197 | d. acknowledge 198 | b 199 | 34. discrepancies 200 | a. weights 201 | b. deposits 202 | c. wavelengths 203 | d. differences 204 | d 205 | 35. prolific 206 | a. productive 207 | b. serious 208 | c. capable 209 | d. promising 210 | a 211 | 36. unmatched 212 | a. unrecognized 213 | b. unequaled 214 | c. alienated 215 | d. emulated 216 | b 217 | 37. peculiarly 218 | a. partly 219 | b. uniquely 220 | c. patriotically 221 | d. suspiciously 222 | b 223 | 38. hue 224 | a. glare 225 | b. contrast 226 | c. color 227 | d. scent 228 | c 229 | 39. hind 230 | a. curved 231 | b. muscular 232 | c. hairy 233 | d. rear 234 | d 235 | 40. highlight 236 | a. alter 237 | b. imitate 238 | c. accentuate 239 | d. restore 240 | c 241 | 41. hastily 242 | a. hurriedly 243 | b. shrewdly 244 | c. habitually 245 | d. chronologically 246 | a 247 | 42. temperate 248 | a. cold 249 | b. mild 250 | c. short 251 | d. windy 252 | b 253 | 43. grin 254 | a. exercise 255 | b. rest 256 | c. joke 257 | d. smile 258 | d 259 | 44. verbally 260 | a. orally 261 | b. overtly 262 | c. fittingly 263 | d. verbosely 264 | a 265 | 45. physician 266 | a. chemist 267 | b. pharmacist 268 | c. nurse 269 | d. doctor 270 | d 271 | 46. essentially 272 | a. possibly 273 | b. eagerly 274 | c. basically 275 | d. ordinarily 276 | c 277 | 47. keen 278 | a. useful 279 | b. simple 280 | c. famous 281 | d. sharp 282 | d 283 | 48. situated 284 | a. rotating 285 | b. isolated 286 | c. emptying 287 | d. positioned 288 | d 289 | 49. principal 290 | a. most 291 | b. numerous 292 | c. major 293 | d. exceptional 294 | c 295 | 50. slowly 296 | a. rarely 297 | b. gradually 298 | c. effectively 299 | d. continuously 300 | b 301 | 51. built 302 | a. constructed 303 | b. proposed 304 | c. financed 305 | d. organized 306 | a 307 | 52. tasks 308 | a. customers 309 | b. materials 310 | c. shops 311 | d. jobs 312 | d 313 | 53. unlikely 314 | a. improbable 315 | b. disagreeable 316 | c. different 317 | d. unpopular 318 | a 319 | 54. halfheartedly 320 | a. customarily 321 | b. bipartisanly 322 | c. apathetically 323 | d. unconventionally 324 | c 325 | 55. annals 326 | a. homes 327 | b. trails 328 | c. chronicles 329 | d. songs 330 | c 331 | 56. wildly 332 | a. distinctively 333 | b. mysteriously 334 | c. abruptly 335 | d. furiously 336 | d 337 | 57. hailed 338 | a. judged 339 | b. 
acclaimed 340 | c. remembered 341 | d. addressed 342 | b 343 | 58. command 344 | a. observation 345 | b. love 346 | c. awareness 347 | d. mastery 348 | d 349 | 59. concocted 350 | a. devised 351 | b. cleaned 352 | c. requested 353 | d. supervised 354 | a 355 | 60. prospective 356 | a. particular 357 | b. prudent 358 | c. potential 359 | d. prominent 360 | c 361 | 61. generally 362 | a. descriptively 363 | b. broadly 364 | c. controversially 365 | d. accurately 366 | b 367 | 62. sustained 368 | a. prolonged 369 | b. refined 370 | c. lowered 371 | d. analyzed 372 | a 373 | 63. perilous 374 | a. binding 375 | b. exciting 376 | c. offensive 377 | d. dangerous 378 | d 379 | 64. tranquillity 380 | a. peacefulness 381 | b. harshness 382 | c. weariness 383 | d. happiness 384 | a 385 | 65. dissipate 386 | a. disperse 387 | b. isolate 388 | c. disguise 389 | d. photograph 390 | a 391 | 66. primarily 392 | a. occasionally 393 | b. cautiously 394 | c. consistently 395 | d. chiefly 396 | d 397 | 67. colloquial 398 | a. recorded 399 | b. misunderstood 400 | c. incorrect 401 | d. conversational 402 | d 403 | 68. resolved 404 | a. publicized 405 | b. forgotten 406 | c. settled 407 | d. examined 408 | c 409 | 69. feasible 410 | a. permitted 411 | b. possible 412 | c. equitable 413 | d. evident 414 | b 415 | 70. expeditiously 416 | a. frequently 417 | b. actually 418 | c. rapidly 419 | d. repeatedly 420 | c 421 | 71. percentage 422 | a. volume 423 | b. sample 424 | c. proportion 425 | d. profit 426 | c 427 | 72. terminated 428 | a. ended 429 | b. posed 430 | c. postponed 431 | d. evaluated 432 | a 433 | 73. uniform 434 | a. hard 435 | b. complex 436 | c. alike 437 | d. sharp 438 | c 439 | 74. figure 440 | a. list 441 | b. solve 442 | c. divide 443 | d. express 444 | b 445 | 75. sufficient 446 | a. recent 447 | b. physiological 448 | c. enough 449 | d. valuable 450 | c 451 | 76. fashion 452 | a. ration 453 | b. fathom 454 | c. craze 455 | d. manner 456 | d 457 | 77. marketed 458 | a. frozen 459 | b. sold 460 | c. sweetened 461 | d. diluted 462 | b 463 | 78. bigger 464 | a. steadier 465 | b. closer 466 | c. larger 467 | d. better 468 | c 469 | 79. roots 470 | a. origins 471 | b. rituals 472 | c. cure 473 | d. function 474 | a 475 | 80. normally 476 | a. haltingly 477 | b. ordinarily 478 | c. permanently 479 | d. periodically 480 | b 481 | -------------------------------------------------------------------------------- /evaluation/ws/README.md: -------------------------------------------------------------------------------- 1 | # ws 2 | 1. compile ws.cpp 3 | 2. 
run ws.py (see the usage sketch below) 4 | -------------------------------------------------------------------------------- /evaluation/ws/ws.cpp: -------------------------------------------------------------------------------- 1 | #define _CRT_SECURE_NO_WARNINGS 2 | #include <cstdio> 3 | #include <cmath> 4 | #include <cstring> 5 | #include <string> 6 | #include <map> 7 | #include <vector> 8 | using namespace std; 9 | struct node { 10 | string w1, w2; 11 | double val; 12 | }; 13 | map<string, double *> dict; // word -> embedding vector (NULL until loaded) 14 | 15 | const int MAX_STRING = 1000; 16 | int size; 17 | int ReadEmbedding(const char *file_name) { 18 | FILE *f = fopen(file_name, "rb"); 19 | if (f == NULL) { 20 | printf("Embedding file not found\n"); 21 | return -1; 22 | } 23 | int wordNum; 24 | fscanf(f, "%d", &wordNum); 25 | fscanf(f, "%d", &size); 26 | 27 | char str[MAX_STRING]; 28 | double *tmp = new double[size]; 29 | for (int b = 0; b < wordNum; b++) { 30 | char ch; 31 | fscanf(f, "%s%c", str, &ch); 32 | /*for (int i = 0; str[i]; i++){ 33 | if (str[i] >= 'A' && str[i] <= 'Z'){ 34 | str[i] = str[i] - 'A' + 'a'; 35 | } 36 | }*/ 37 | map<string, double *>::iterator it = dict.find(str); 38 | double *v = tmp; 39 | if (it != dict.end()) { 40 | if (it->second == NULL) { 41 | it->second = new double[size]; 42 | v = it->second; 43 | } 44 | } 45 | for (int a = 0; a < size; a++) 46 | fscanf(f, "%lf", &v[a]); 47 | } 48 | fclose(f); 49 | return 0; 50 | } 51 | 52 | const double eps = 1e-8; 53 | double cosvec(double *a, double *b) { // cosine similarity of two vectors 54 | if (a == NULL || b == NULL) 55 | return 0; 56 | double t1 = 0, t2 = 0, t3 = 0; 57 | for (int i = 0; i < size; i++) { 58 | t1 += a[i] * b[i]; 59 | t2 += a[i] * a[i]; 60 | t3 += b[i] * b[i]; 61 | } 62 | return t1 / sqrt(t2 + eps) / sqrt(t3 + eps); 63 | } 64 | 65 | 66 | double pearson(vector<double> &a, vector<double> &b) { // Pearson correlation coefficient 67 | double avg_a = 0, avg_b = 0; 68 | int n = a.size(); 69 | for (int i = 0; i < n; i++) { 70 | avg_a += a[i]; 71 | avg_b += b[i]; 72 | } 73 | avg_a /= n; 74 | avg_b /= n; 75 | double v1 = 0, v2 = 0, v3 = 0; 76 | for (int i = 0; i < n; i++) { 77 | v1 += (a[i] - avg_a) * (b[i] - avg_b); 78 | v2 += (a[i] - avg_a) * (a[i] - avg_a); 79 | v3 += (b[i] - avg_b) * (b[i] - avg_b); 80 | } 81 | return v1 / sqrt(v2 + eps) / sqrt(v3 + eps); 82 | } 83 | 84 | void solve(const char *dataset, const char *embedding) { 85 | vector<node> lst; 86 | 87 | char w1[MAX_STRING], w2[MAX_STRING]; 88 | double val; 89 | FILE *fd = fopen(dataset, "r"); 90 | while (fscanf(fd, "%s%s%lf", w1, w2, &val) != EOF) { 91 | node n; 92 | n.w1 = w1; 93 | n.w2 = w2; 94 | n.val = val; 95 | lst.push_back(n); 96 | if (embedding) { 97 | dict[n.w1] = NULL; 98 | dict[n.w2] = NULL; 99 | } 100 | 101 | } 102 | fclose(fd); 103 | 104 | if (embedding) 105 | ReadEmbedding(embedding); 106 | 107 | for (map<string, double *>::iterator it = dict.begin(); it != dict.end(); it++) { 108 | if (it->second == NULL) { 109 | fprintf(stderr, "cannot find word: %s\n", it->first.c_str()); 110 | } 111 | } 112 | 113 | vector<double> aa, bb; 114 | for (int i = 0; i < lst.size(); i++) { 115 | double *v1 = dict[lst[i].w1]; 116 | double *v2 = dict[lst[i].w2]; 117 | aa.push_back(lst[i].val); 118 | bb.push_back(cosvec(v1, v2)); 119 | } 120 | printf("%lf\n", pearson(aa, bb)); 121 | } 122 | 123 | 124 | int main(int argc, char **argv) { 125 | if (argc != 2) { 126 | printf("Usage: ./ws embedding > pearson\n"); 127 | return 0; 128 | } 129 | 130 | solve("ws353.txt", argv[1]); 131 | solve("ws353_relatedness.txt", NULL); 132 | solve("ws353_similarity.txt", NULL); 133 | 134 | return 0; 135 | } 136 | -------------------------------------------------------------------------------- /evaluation/ws/ws.py:
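A minimal sketch of running this evaluation by hand, assuming g++ is available and the embedding file is in the plain-text word2vec format that ws.cpp expects (first line: vocabulary size and vector dimension). The names vec_mymodel, ret_ws_mymodel and embedding.txt are placeholders; ws.py below simply automates this loop over every file in vec_<model>/ for each model in its models list:

    g++ -O2 -o ws ws.cpp                                             # build the scorer
    mkdir -p vec_mymodel ret_ws_mymodel                              # placeholder directories
    cp /path/to/embedding.txt vec_mymodel/                           # one embedding file to evaluate
    ./ws vec_mymodel/embedding.txt > ret_ws_mymodel/embedding.txt    # writes three Pearson scores: ws353, relatedness, similarity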
-------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | 5 | #models = ["hidden", "lbl", "senna"] 6 | #models = ["turian"] 7 | models = ["ivlblskip", "ivlblcbow"] 8 | 9 | def func(msg, vec_dir, ret_dir): 10 | arg = './ws %s/%s > %s/%s' % (vec_dir, msg, ret_dir, msg) 11 | print arg 12 | os.system(arg) 13 | 14 | 15 | if __name__ == "__main__": 16 | pool = multiprocessing.Pool(processes=16) 17 | 18 | for model in models: 19 | vec_dir = "vec_%s" % model 20 | ret_dir = "ret_ws_%s" % model 21 | 22 | if not os.path.exists(ret_dir): 23 | os.makedirs(ret_dir) 24 | 25 | for lists in os.listdir(vec_dir): 26 | if not os.path.exists(os.path.join(ret_dir, lists)): 27 | 28 | x = lists.replace('.txt','').replace('.bz2','').split('_') 29 | #if not "v50" in lists: 30 | # continue 31 | #if int(x[-1]) > 10 and ("10m" in lists or "13m" in lists) and int(x[-1]) % 100 != 0: 32 | # continue 33 | #if int(x[-1]) > 10 and "100m" in lists and int(x[-1]) % 10 != 0: 34 | # continue 35 | print lists 36 | pool.apply_async(func, (lists, vec_dir, ret_dir, )) 37 | pool.close() 38 | pool.join() 39 | print "Sub-process(es) done." 40 | 41 | 42 | -------------------------------------------------------------------------------- /evaluation/ws/ws353.txt: -------------------------------------------------------------------------------- 1 | love sex 6.77 2 | tiger cat 7.35 3 | tiger tiger 10.00 4 | book paper 7.46 5 | computer keyboard 7.62 6 | computer internet 7.58 7 | plane car 5.77 8 | train car 6.31 9 | telephone communication 7.50 10 | television radio 6.77 11 | media radio 7.42 12 | drug abuse 6.85 13 | bread butter 6.19 14 | cucumber potato 5.92 15 | doctor nurse 7.00 16 | professor doctor 6.62 17 | student professor 6.81 18 | smart student 4.62 19 | smart stupid 5.81 20 | company stock 7.08 21 | stock market 8.08 22 | stock phone 1.62 23 | stock CD 1.31 24 | stock jaguar 0.92 25 | stock egg 1.81 26 | fertility egg 6.69 27 | stock live 3.73 28 | stock life 0.92 29 | book library 7.46 30 | bank money 8.12 31 | wood forest 7.73 32 | money cash 9.15 33 | professor cucumber 0.31 34 | king cabbage 0.23 35 | king queen 8.58 36 | king rook 5.92 37 | bishop rabbi 6.69 38 | Jerusalem Israel 8.46 39 | Jerusalem Palestinian 7.65 40 | holy sex 1.62 41 | fuck sex 9.44 42 | Maradona football 8.62 43 | football soccer 9.03 44 | football basketball 6.81 45 | football tennis 6.63 46 | tennis racket 7.56 47 | Arafat peace 6.73 48 | Arafat terror 7.65 49 | Arafat Jackson 2.50 50 | law lawyer 8.38 51 | movie star 7.38 52 | movie popcorn 6.19 53 | movie critic 6.73 54 | movie theater 7.92 55 | physics proton 8.12 56 | physics chemistry 7.35 57 | space chemistry 4.88 58 | alcohol chemistry 5.54 59 | vodka gin 8.46 60 | vodka brandy 8.13 61 | drink car 3.04 62 | drink ear 1.31 63 | drink mouth 5.96 64 | drink eat 6.87 65 | baby mother 7.85 66 | drink mother 2.65 67 | car automobile 8.94 68 | gem jewel 8.96 69 | journey voyage 9.29 70 | boy lad 8.83 71 | coast shore 9.10 72 | asylum madhouse 8.87 73 | magician wizard 9.02 74 | midday noon 9.29 75 | furnace stove 8.79 76 | food fruit 7.52 77 | bird cock 7.10 78 | bird crane 7.38 79 | tool implement 6.46 80 | brother monk 6.27 81 | crane implement 2.69 82 | lad brother 4.46 83 | journey car 5.85 84 | monk oracle 5.00 85 | cemetery woodland 2.08 86 | food rooster 4.42 87 | coast hill 4.38 88 | forest graveyard 1.85 89 | shore woodland 3.08 90 | monk slave 0.92 91 | coast forest 3.15 92 | lad wizard 0.92 93 | chord 
smile 0.54 94 | glass magician 2.08 95 | noon string 0.54 96 | rooster voyage 0.62 97 | money dollar 8.42 98 | money cash 9.08 99 | money currency 9.04 100 | money wealth 8.27 101 | money property 7.57 102 | money possession 7.29 103 | money bank 8.50 104 | money deposit 7.73 105 | money withdrawal 6.88 106 | money laundering 5.65 107 | money operation 3.31 108 | tiger jaguar 8.00 109 | tiger feline 8.00 110 | tiger carnivore 7.08 111 | tiger mammal 6.85 112 | tiger animal 7.00 113 | tiger organism 4.77 114 | tiger fauna 5.62 115 | tiger zoo 5.87 116 | psychology psychiatry 8.08 117 | psychology anxiety 7.00 118 | psychology fear 6.85 119 | psychology depression 7.42 120 | psychology clinic 6.58 121 | psychology doctor 6.42 122 | psychology Freud 8.21 123 | psychology mind 7.69 124 | psychology health 7.23 125 | psychology science 6.71 126 | psychology discipline 5.58 127 | psychology cognition 7.48 128 | planet star 8.45 129 | planet constellation 8.06 130 | planet moon 8.08 131 | planet sun 8.02 132 | planet galaxy 8.11 133 | planet space 7.92 134 | planet astronomer 7.94 135 | precedent example 5.85 136 | precedent information 3.85 137 | precedent cognition 2.81 138 | precedent law 6.65 139 | precedent collection 2.50 140 | precedent group 1.77 141 | precedent antecedent 6.04 142 | cup coffee 6.58 143 | cup tableware 6.85 144 | cup article 2.40 145 | cup artifact 2.92 146 | cup object 3.69 147 | cup entity 2.15 148 | cup drink 7.25 149 | cup food 5.00 150 | cup substance 1.92 151 | cup liquid 5.90 152 | jaguar cat 7.42 153 | jaguar car 7.27 154 | energy secretary 1.81 155 | secretary senate 5.06 156 | energy laboratory 5.09 157 | computer laboratory 6.78 158 | weapon secret 6.06 159 | FBI fingerprint 6.94 160 | FBI investigation 8.31 161 | investigation effort 4.59 162 | Mars water 2.94 163 | Mars scientist 5.63 164 | news report 8.16 165 | canyon landscape 7.53 166 | image surface 4.56 167 | discovery space 6.34 168 | water seepage 6.56 169 | sign recess 2.38 170 | Wednesday news 2.22 171 | mile kilometer 8.66 172 | computer news 4.47 173 | territory surface 5.34 174 | atmosphere landscape 3.69 175 | president medal 3.00 176 | war troops 8.13 177 | record number 6.31 178 | skin eye 6.22 179 | Japanese American 6.50 180 | theater history 3.91 181 | volunteer motto 2.56 182 | prejudice recognition 3.00 183 | decoration valor 5.63 184 | century year 7.59 185 | century nation 3.16 186 | delay racism 1.19 187 | delay news 3.31 188 | minister party 6.63 189 | peace plan 4.75 190 | minority peace 3.69 191 | attempt peace 4.25 192 | government crisis 6.56 193 | deployment departure 4.25 194 | deployment withdrawal 5.88 195 | energy crisis 5.94 196 | announcement news 7.56 197 | announcement effort 2.75 198 | stroke hospital 7.03 199 | disability death 5.47 200 | victim emergency 6.47 201 | treatment recovery 7.91 202 | journal association 4.97 203 | doctor personnel 5.00 204 | doctor liability 5.19 205 | liability insurance 7.03 206 | school center 3.44 207 | reason hypertension 2.31 208 | reason criterion 5.91 209 | hundred percent 7.38 210 | Harvard Yale 8.13 211 | hospital infrastructure 4.63 212 | death row 5.25 213 | death inmate 5.03 214 | lawyer evidence 6.69 215 | life death 7.88 216 | life term 4.50 217 | word similarity 4.75 218 | board recommendation 4.47 219 | governor interview 3.25 220 | OPEC country 5.63 221 | peace atmosphere 3.69 222 | peace insurance 2.94 223 | territory kilometer 5.28 224 | travel activity 5.00 225 | competition price 6.44 226 | consumer confidence 4.13 227 
| consumer energy 4.75 228 | problem airport 2.38 229 | car flight 4.94 230 | credit card 8.06 231 | credit information 5.31 232 | hotel reservation 8.03 233 | grocery money 5.94 234 | registration arrangement 6.00 235 | arrangement accommodation 5.41 236 | month hotel 1.81 237 | type kind 8.97 238 | arrival hotel 6.00 239 | bed closet 6.72 240 | closet clothes 8.00 241 | situation conclusion 4.81 242 | situation isolation 3.88 243 | impartiality interest 5.16 244 | direction combination 2.25 245 | street place 6.44 246 | street avenue 8.88 247 | street block 6.88 248 | street children 4.94 249 | listing proximity 2.56 250 | listing category 6.38 251 | cell phone 7.81 252 | production hike 1.75 253 | benchmark index 4.25 254 | media trading 3.88 255 | media gain 2.88 256 | dividend payment 7.63 257 | dividend calculation 6.48 258 | calculation computation 8.44 259 | currency market 7.50 260 | OPEC oil 8.59 261 | oil stock 6.34 262 | announcement production 3.38 263 | announcement warning 6.00 264 | profit warning 3.88 265 | profit loss 7.63 266 | dollar yen 7.78 267 | dollar buck 9.22 268 | dollar profit 7.38 269 | dollar loss 6.09 270 | computer software 8.50 271 | network hardware 8.31 272 | phone equipment 7.13 273 | equipment maker 5.91 274 | luxury car 6.47 275 | five month 3.38 276 | report gain 3.63 277 | investor earning 7.13 278 | liquid water 7.89 279 | baseball season 5.97 280 | game victory 7.03 281 | game team 7.69 282 | marathon sprint 7.47 283 | game series 6.19 284 | game defeat 6.97 285 | seven series 3.56 286 | seafood sea 7.47 287 | seafood food 8.34 288 | seafood lobster 8.70 289 | lobster food 7.81 290 | lobster wine 5.70 291 | food preparation 6.22 292 | video archive 6.34 293 | start year 4.06 294 | start match 4.47 295 | game round 5.97 296 | boxing round 7.61 297 | championship tournament 8.36 298 | fighting defeating 7.41 299 | line insurance 2.69 300 | day summer 3.94 301 | summer drought 7.16 302 | summer nature 5.63 303 | day dawn 7.53 304 | nature environment 8.31 305 | environment ecology 8.81 306 | nature man 6.25 307 | man woman 8.30 308 | man governor 5.25 309 | murder manslaughter 8.53 310 | soap opera 7.94 311 | opera performance 6.88 312 | life lesson 5.94 313 | focus life 4.06 314 | production crew 6.25 315 | television film 7.72 316 | lover quarrel 6.19 317 | viewer serial 2.97 318 | possibility girl 1.94 319 | population development 3.75 320 | morality importance 3.31 321 | morality marriage 3.69 322 | Mexico Brazil 7.44 323 | gender equality 6.41 324 | change attitude 5.44 325 | family planning 6.25 326 | opera industry 2.63 327 | sugar approach 0.88 328 | practice institution 3.19 329 | ministry culture 4.69 330 | problem challenge 6.75 331 | size prominence 5.31 332 | country citizen 7.31 333 | planet people 5.75 334 | development issue 3.97 335 | experience music 3.47 336 | music project 3.63 337 | glass metal 5.56 338 | aluminum metal 7.83 339 | chance credibility 3.88 340 | exhibit memorabilia 5.31 341 | concert virtuoso 6.81 342 | rock jazz 7.59 343 | museum theater 7.19 344 | observation architecture 4.38 345 | space world 6.53 346 | preservation world 6.19 347 | admission ticket 7.69 348 | shower thunderstorm 6.31 349 | shower flood 6.03 350 | weather forecast 8.34 351 | disaster area 6.25 352 | governor office 6.34 353 | architecture century 3.78 354 | -------------------------------------------------------------------------------- /evaluation/ws/ws353_relatedness.txt: 
-------------------------------------------------------------------------------- 1 | computer keyboard 7.62 2 | Jerusalem Israel 8.46 3 | planet galaxy 8.11 4 | canyon landscape 7.53 5 | OPEC country 5.63 6 | day summer 3.94 7 | day dawn 7.53 8 | country citizen 7.31 9 | planet people 5.75 10 | environment ecology 8.81 11 | Maradona football 8.62 12 | OPEC oil 8.59 13 | money bank 8.50 14 | computer software 8.50 15 | law lawyer 8.38 16 | weather forecast 8.34 17 | network hardware 8.31 18 | nature environment 8.31 19 | FBI investigation 8.31 20 | money wealth 8.27 21 | psychology Freud 8.21 22 | news report 8.16 23 | war troops 8.13 24 | physics proton 8.12 25 | bank money 8.12 26 | stock market 8.08 27 | planet constellation 8.06 28 | credit card 8.06 29 | hotel reservation 8.03 30 | closet clothes 8.00 31 | soap opera 7.94 32 | planet astronomer 7.94 33 | planet space 7.92 34 | movie theater 7.92 35 | treatment recovery 7.91 36 | baby mother 7.85 37 | money deposit 7.73 38 | television film 7.72 39 | psychology mind 7.69 40 | game team 7.69 41 | admission ticket 7.69 42 | Jerusalem Palestinian 7.65 43 | Arafat terror 7.65 44 | boxing round 7.61 45 | computer internet 7.58 46 | money property 7.57 47 | tennis racket 7.56 48 | telephone communication 7.50 49 | currency market 7.50 50 | psychology cognition 7.48 51 | seafood sea 7.47 52 | book paper 7.46 53 | book library 7.46 54 | psychology depression 7.42 55 | fighting defeating 7.41 56 | movie star 7.38 57 | hundred percent 7.38 58 | dollar profit 7.38 59 | money possession 7.29 60 | cup drink 7.25 61 | psychology health 7.23 62 | summer drought 7.16 63 | investor earning 7.13 64 | company stock 7.08 65 | stroke hospital 7.03 66 | liability insurance 7.03 67 | game victory 7.03 68 | psychology anxiety 7.00 69 | game defeat 6.97 70 | FBI fingerprint 6.94 71 | money withdrawal 6.88 72 | psychology fear 6.85 73 | drug abuse 6.85 74 | concert virtuoso 6.81 75 | computer laboratory 6.78 76 | love sex 6.77 77 | problem challenge 6.75 78 | movie critic 6.73 79 | Arafat peace 6.73 80 | bed closet 6.72 81 | lawyer evidence 6.69 82 | fertility egg 6.69 83 | precedent law 6.65 84 | minister party 6.63 85 | psychology clinic 6.58 86 | cup coffee 6.58 87 | water seepage 6.56 88 | government crisis 6.56 89 | space world 6.53 90 | dividend calculation 6.48 91 | victim emergency 6.47 92 | luxury car 6.47 93 | tool implement 6.46 94 | competition price 6.44 95 | psychology doctor 6.42 96 | gender equality 6.41 97 | listing category 6.38 98 | video archive 6.34 99 | oil stock 6.34 100 | governor office 6.34 101 | discovery space 6.34 102 | record number 6.31 103 | brother monk 6.27 104 | production crew 6.25 105 | nature man 6.25 106 | family planning 6.25 107 | disaster area 6.25 108 | food preparation 6.22 109 | preservation world 6.19 110 | movie popcorn 6.19 111 | lover quarrel 6.19 112 | game series 6.19 113 | dollar loss 6.09 114 | weapon secret 6.06 115 | shower flood 6.03 116 | registration arrangement 6.00 117 | arrival hotel 6.00 118 | announcement warning 6.00 119 | game round 5.97 120 | baseball season 5.97 121 | drink mouth 5.96 122 | life lesson 5.94 123 | grocery money 5.94 124 | energy crisis 5.94 125 | reason criterion 5.91 126 | equipment maker 5.91 127 | cup liquid 5.90 128 | deployment withdrawal 5.88 129 | tiger zoo 5.87 130 | journey car 5.85 131 | money laundering 5.65 132 | summer nature 5.63 133 | decoration valor 5.63 134 | Mars scientist 5.63 135 | alcohol chemistry 5.54 136 | disability death 5.47 137 | change attitude 5.44 
138 | arrangement accommodation 5.41 139 | territory surface 5.34 140 | size prominence 5.31 141 | exhibit memorabilia 5.31 142 | credit information 5.31 143 | territory kilometer 5.28 144 | death row 5.25 145 | doctor liability 5.19 146 | impartiality interest 5.16 147 | energy laboratory 5.09 148 | secretary senate 5.06 149 | death inmate 5.03 150 | monk oracle 5.00 151 | cup food 5.00 152 | journal association 4.97 153 | street children 4.94 154 | car flight 4.94 155 | space chemistry 4.88 156 | situation conclusion 4.81 157 | word similarity 4.75 158 | peace plan 4.75 159 | consumer energy 4.75 160 | ministry culture 4.69 161 | smart student 4.62 162 | investigation effort 4.59 163 | image surface 4.56 164 | life term 4.50 165 | start match 4.47 166 | computer news 4.47 167 | board recommendation 4.47 168 | lad brother 4.46 169 | observation architecture 4.38 170 | coast hill 4.38 171 | deployment departure 4.25 172 | benchmark index 4.25 173 | attempt peace 4.25 174 | consumer confidence 4.13 175 | start year 4.06 176 | focus life 4.06 177 | development issue 3.97 178 | theater history 3.91 179 | situation isolation 3.88 180 | profit warning 3.88 181 | media trading 3.88 182 | chance credibility 3.88 183 | precedent information 3.85 184 | architecture century 3.78 185 | population development 3.75 186 | stock live 3.73 187 | peace atmosphere 3.69 188 | morality marriage 3.69 189 | minority peace 3.69 190 | atmosphere landscape 3.69 191 | report gain 3.63 192 | music project 3.63 193 | seven series 3.56 194 | experience music 3.47 195 | school center 3.44 196 | five month 3.38 197 | announcement production 3.38 198 | morality importance 3.31 199 | money operation 3.31 200 | delay news 3.31 201 | governor interview 3.25 202 | practice institution 3.19 203 | century nation 3.16 204 | coast forest 3.15 205 | shore woodland 3.08 206 | drink car 3.04 207 | president medal 3.00 208 | prejudice recognition 3.00 209 | viewer serial 2.97 210 | peace insurance 2.94 211 | Mars water 2.94 212 | media gain 2.88 213 | precedent cognition 2.81 214 | announcement effort 2.75 215 | line insurance 2.69 216 | crane implement 2.69 217 | drink mother 2.65 218 | opera industry 2.63 219 | volunteer motto 2.56 220 | listing proximity 2.56 221 | precedent collection 2.50 222 | cup article 2.40 223 | sign recess 2.38 224 | problem airport 2.38 225 | reason hypertension 2.31 226 | direction combination 2.25 227 | Wednesday news 2.22 228 | glass magician 2.08 229 | cemetery woodland 2.08 230 | possibility girl 1.94 231 | cup substance 1.92 232 | forest graveyard 1.85 233 | stock egg 1.81 234 | month hotel 1.81 235 | energy secretary 1.81 236 | precedent group 1.77 237 | production hike 1.75 238 | stock phone 1.62 239 | holy sex 1.62 240 | stock CD 1.31 241 | drink ear 1.31 242 | delay racism 1.19 243 | stock life 0.92 244 | stock jaguar 0.92 245 | monk slave 0.92 246 | lad wizard 0.92 247 | sugar approach 0.88 248 | rooster voyage 0.62 249 | noon string 0.54 250 | chord smile 0.54 251 | professor cucumber 0.31 252 | king cabbage 0.23 253 | -------------------------------------------------------------------------------- /evaluation/ws/ws353_similarity.txt: -------------------------------------------------------------------------------- 1 | tiger cat 7.35 2 | tiger tiger 10.00 3 | plane car 5.77 4 | train car 6.31 5 | television radio 6.77 6 | media radio 7.42 7 | bread butter 6.19 8 | cucumber potato 5.92 9 | doctor nurse 7.00 10 | professor doctor 6.62 11 | student professor 6.81 12 | smart stupid 5.81 13 | wood 
forest 7.73 14 | money cash 9.15 15 | king queen 8.58 16 | king rook 5.92 17 | bishop rabbi 6.69 18 | fuck sex 9.44 19 | football soccer 9.03 20 | football basketball 6.81 21 | football tennis 6.63 22 | Arafat Jackson 2.50 23 | physics chemistry 7.35 24 | vodka gin 8.46 25 | vodka brandy 8.13 26 | drink eat 6.87 27 | car automobile 8.94 28 | gem jewel 8.96 29 | journey voyage 9.29 30 | boy lad 8.83 31 | coast shore 9.10 32 | asylum madhouse 8.87 33 | magician wizard 9.02 34 | midday noon 9.29 35 | furnace stove 8.79 36 | food fruit 7.52 37 | bird cock 7.10 38 | bird crane 7.38 39 | food rooster 4.42 40 | money dollar 8.42 41 | money currency 9.04 42 | tiger jaguar 8.00 43 | tiger feline 8.00 44 | tiger carnivore 7.08 45 | tiger mammal 6.85 46 | tiger animal 7.00 47 | tiger organism 4.77 48 | tiger fauna 5.62 49 | psychology psychiatry 8.08 50 | psychology science 6.71 51 | psychology discipline 5.58 52 | planet star 8.45 53 | planet moon 8.08 54 | planet sun 8.02 55 | precedent example 5.85 56 | precedent antecedent 6.04 57 | cup tableware 6.85 58 | cup artifact 2.92 59 | cup object 3.69 60 | cup entity 2.15 61 | jaguar cat 7.42 62 | jaguar car 7.27 63 | mile kilometer 8.66 64 | skin eye 6.22 65 | Japanese American 6.50 66 | century year 7.59 67 | announcement news 7.56 68 | doctor personnel 5.00 69 | Harvard Yale 8.13 70 | hospital infrastructure 4.63 71 | life death 7.88 72 | travel activity 5.00 73 | type kind 8.97 74 | street place 6.44 75 | street avenue 8.88 76 | street block 6.88 77 | cell phone 7.81 78 | dividend payment 7.63 79 | calculation computation 8.44 80 | profit loss 7.63 81 | dollar yen 7.78 82 | dollar buck 9.22 83 | phone equipment 7.13 84 | liquid water 7.89 85 | marathon sprint 7.47 86 | seafood food 8.34 87 | seafood lobster 8.70 88 | lobster food 7.81 89 | lobster wine 5.70 90 | championship tournament 8.36 91 | man woman 8.30 92 | man governor 5.25 93 | murder manslaughter 8.53 94 | opera performance 6.88 95 | Mexico Brazil 7.44 96 | glass metal 5.56 97 | aluminum metal 7.83 98 | rock jazz 7.59 99 | museum theater 7.19 100 | shower thunderstorm 6.31 101 | monk oracle 5.00 102 | cup food 5.00 103 | journal association 4.97 104 | street children 4.94 105 | car flight 4.94 106 | space chemistry 4.88 107 | situation conclusion 4.81 108 | word similarity 4.75 109 | peace plan 4.75 110 | consumer energy 4.75 111 | ministry culture 4.69 112 | smart student 4.62 113 | investigation effort 4.59 114 | image surface 4.56 115 | life term 4.50 116 | start match 4.47 117 | computer news 4.47 118 | board recommendation 4.47 119 | lad brother 4.46 120 | observation architecture 4.38 121 | coast hill 4.38 122 | deployment departure 4.25 123 | benchmark index 4.25 124 | attempt peace 4.25 125 | consumer confidence 4.13 126 | start year 4.06 127 | focus life 4.06 128 | development issue 3.97 129 | theater history 3.91 130 | situation isolation 3.88 131 | profit warning 3.88 132 | media trading 3.88 133 | chance credibility 3.88 134 | precedent information 3.85 135 | architecture century 3.78 136 | population development 3.75 137 | stock live 3.73 138 | peace atmosphere 3.69 139 | morality marriage 3.69 140 | minority peace 3.69 141 | atmosphere landscape 3.69 142 | report gain 3.63 143 | music project 3.63 144 | seven series 3.56 145 | experience music 3.47 146 | school center 3.44 147 | five month 3.38 148 | announcement production 3.38 149 | morality importance 3.31 150 | money operation 3.31 151 | delay news 3.31 152 | governor interview 3.25 153 | practice institution 3.19 154 | 
century nation 3.16 155 | coast forest 3.15 156 | shore woodland 3.08 157 | drink car 3.04 158 | president medal 3.00 159 | prejudice recognition 3.00 160 | viewer serial 2.97 161 | peace insurance 2.94 162 | Mars water 2.94 163 | media gain 2.88 164 | precedent cognition 2.81 165 | announcement effort 2.75 166 | line insurance 2.69 167 | crane implement 2.69 168 | drink mother 2.65 169 | opera industry 2.63 170 | volunteer motto 2.56 171 | listing proximity 2.56 172 | precedent collection 2.50 173 | cup article 2.40 174 | sign recess 2.38 175 | problem airport 2.38 176 | reason hypertension 2.31 177 | direction combination 2.25 178 | Wednesday news 2.22 179 | glass magician 2.08 180 | cemetery woodland 2.08 181 | possibility girl 1.94 182 | cup substance 1.92 183 | forest graveyard 1.85 184 | stock egg 1.81 185 | month hotel 1.81 186 | energy secretary 1.81 187 | precedent group 1.77 188 | production hike 1.75 189 | stock phone 1.62 190 | holy sex 1.62 191 | stock CD 1.31 192 | drink ear 1.31 193 | delay racism 1.19 194 | stock life 0.92 195 | stock jaguar 0.92 196 | monk slave 0.92 197 | lad wizard 0.92 198 | sugar approach 0.88 199 | rooster voyage 0.62 200 | noon string 0.54 201 | chord smile 0.54 202 | professor cucumber 0.31 203 | king cabbage 0.23 204 | --------------------------------------------------------------------------------