├── README.md ├── embedding ├── cw.c ├── glove.c ├── lbl.c ├── nnlm.c ├── order.c └── word2vec.c └── evaluation ├── avg ├── README.md ├── avg.py ├── avg_embedding.cpp ├── imdb_test.txt └── imdb_train.txt ├── cnn ├── README.md ├── cnn.py ├── cnn_senna.cpp ├── fileutil.hpp ├── makefile ├── tree_dev.txt ├── tree_test.txt └── tree_train.txt ├── ner ├── README.md ├── default.config ├── ner.jar └── ner.py ├── pos ├── README.md ├── fileutil.hpp ├── makefile ├── pos.py ├── pos_test.txt ├── pos_train.txt ├── pos_valid.txt └── sennaseg.cpp ├── syn_sem ├── README.md ├── compute-accuracy-txt.c ├── king.py └── questions-words.txt ├── tfl ├── README.md ├── toefl.cpp ├── toefl.py └── toefl.txt └── ws ├── README.md ├── ws.cpp ├── ws.py ├── ws353.txt ├── ws353_relatedness.txt └── ws353_similarity.txt /README.md: -------------------------------------------------------------------------------- 1 | # compare 2 | 3 | This is the source code of [How to Generate a Good Word Embedding?](http://arxiv.org/abs/1507.05523). 4 | 5 | Folder **embedding** contains all embedding algorithms we used in this paper. 6 | 7 | Folder **evaluation** contains all evaluation tasks in the paper. 8 | 9 | The Chinese version of Introduction is available at [《How to Generate a Good Word Embedding?》导读](http://licstar.net/archives/620). 10 | -------------------------------------------------------------------------------- /embedding/glove.c: -------------------------------------------------------------------------------- 1 | // GloVe: Global Vectors for Word Representation 2 | // 3 | // Copyright (c) 2014 The Board of Trustees of 4 | // The Leland Stanford Junior University. All Rights Reserved. 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | // 18 | // 19 | // For more information, bug reports, fixes, contact: 20 | // Jeffrey Pennington (jpennin@stanford.edu) 21 | // GlobalVectors@googlegroups.com 22 | // http://www-nlp.stanford.edu/projects/glove/ 23 | 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #define _FILE_OFFSET_BITS 64 32 | #define MAX_STRING_LENGTH 1000 33 | 34 | typedef double real; 35 | 36 | typedef struct cooccur_rec { 37 | int word1; 38 | int word2; 39 | real val; 40 | } CREC; 41 | 42 | int verbose = 2; // 0, 1, or 2 43 | int num_threads = 8; // pthreads 44 | int num_iter = 25; // Number of full passes through cooccurrence matrix 45 | int vector_size = 50; // Word vector size 46 | int save_gradsq = 0; // By default don't save squared gradient values 47 | int use_binary = 1; // 0: save as text files; 1: save as binary; 2: both. For binary, save both word and context word vectors. 48 | int model = 2; // For text file output only. 0: concatenate word and context vectors (and biases) i.e. 
save everything; 1: Just save word vectors (no bias); 2: Save (word + context word) vectors (no biases) 49 | real eta = 0.05; // Initial learning rate 50 | real alpha = 0.75, x_max = 100.0; // Weighting function parameters, not extremely sensitive to corpus, though may need adjustment for very small or very large corpora 51 | real *W, *gradsq, *cost, *cost_valid; 52 | double train_percentage = 0.95; 53 | long long num_lines, *lines_per_thread, vocab_size; 54 | char *vocab_file, *input_file, *save_W_file, *save_gradsq_file; 55 | long long *word_cnt, train_words; 56 | real sample = 1e-4; 57 | 58 | /* Efficient string comparison */ 59 | int scmp( char *s1, char *s2 ) { 60 | while(*s1 != '\0' && *s1 == *s2) {s1++; s2++;} 61 | return(*s1 - *s2); 62 | } 63 | 64 | void initialize_parameters() { 65 | long long a, b; 66 | vector_size++; // Temporarily increment to allocate space for bias 67 | 68 | /* Allocate space for word vectors and context word vectors, and correspodning gradsq */ 69 | a = posix_memalign((void **)&W, 128, 2 * vocab_size * vector_size * sizeof(real)); // Might perform better than malloc 70 | if (W == NULL) { 71 | fprintf(stderr, "Error allocating memory for W\n"); 72 | exit(1); 73 | } 74 | a = posix_memalign((void **)&gradsq, 128, 2 * vocab_size * vector_size * sizeof(real)); // Might perform better than malloc 75 | if (gradsq == NULL) { 76 | fprintf(stderr, "Error allocating memory for gradsq\n"); 77 | exit(1); 78 | } 79 | for (b = 0; b < vector_size; b++) for (a = 0; a < 2 * vocab_size; a++) W[a * vector_size + b] = (rand() / (real)RAND_MAX - 0.5) / vector_size; 80 | for (b = 0; b < vector_size; b++) for (a = 0; a < 2 * vocab_size; a++) gradsq[a * vector_size + b] = 1.0; // So initial value of eta is equal to initial learning rate 81 | vector_size--; 82 | } 83 | 84 | /* Train the GloVe model */ 85 | void *glove_thread(void *vid) { 86 | long long a, b ,l1, l2; 87 | long long id = (long long) vid; 88 | long long thread_lines_train; 89 | long long thread_lines_total; 90 | CREC cr; 91 | real diff, fdiff, temp1, temp2; 92 | FILE *fin; 93 | fin = fopen(input_file, "rb"); 94 | fseeko(fin, (num_lines / num_threads * id) * (sizeof(CREC)), SEEK_SET); //Threads spaced roughly equally throughout file 95 | cost[id] = 0; 96 | cost_valid[id] = 0; 97 | 98 | thread_lines_train = (long long)(lines_per_thread[id] * train_percentage); 99 | thread_lines_total = lines_per_thread[id]; 100 | 101 | for(a = 0; a < thread_lines_train; a++) { 102 | fread(&cr, sizeof(CREC), 1, fin); 103 | if(feof(fin)) break; 104 | 105 | if (sample > 0) { 106 | real r = (sample * train_words) / word_cnt[cr.word1]; 107 | real keep = sqrt(r) + r; 108 | if(keep < 1) cr.val *= keep; 109 | } 110 | 111 | /* Get location of words in W & gradsq */ 112 | l1 = (cr.word1 - 1LL) * (vector_size + 1); // cr word indices start at 1 113 | l2 = ((cr.word2 - 1LL) + vocab_size) * (vector_size + 1); // shift by vocab_size to get separate vectors for context words 114 | 115 | /* Calculate cost, save diff for gradients */ 116 | diff = 0; 117 | for(b = 0; b < vector_size; b++) diff += W[b + l1] * W[b + l2]; // dot product of word and context word vector 118 | diff += W[vector_size + l1] + W[vector_size + l2] - log(cr.val); // add separate bias for each word 119 | fdiff = (cr.val > x_max) ? 
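/* weighting function f: f(x) = (x/x_max)^alpha for x < x_max and 1 otherwise,
   so very frequent co-occurrences are capped and rare ones are down-weighted */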
diff : pow(cr.val / x_max, alpha) * diff; // multiply weighting function (f) with diff 120 | //fdiff = diff; 121 | cost[id] += 0.5 * fdiff * diff; // weighted squared error 122 | 123 | /* Adaptive gradient updates */ 124 | fdiff *= eta; // for ease in calculating gradient 125 | for(b = 0; b < vector_size; b++) { 126 | // learning rate times gradient for word vectors 127 | temp1 = fdiff * W[b + l2]; 128 | temp2 = fdiff * W[b + l1]; 129 | // adaptive updates 130 | W[b + l1] -= temp1 / sqrt(gradsq[b + l1]); 131 | W[b + l2] -= temp2 / sqrt(gradsq[b + l2]); 132 | gradsq[b + l1] += temp1 * temp1; 133 | gradsq[b + l2] += temp2 * temp2; 134 | } 135 | // updates for bias terms 136 | W[vector_size + l1] -= fdiff / sqrt(gradsq[vector_size + l1]); 137 | W[vector_size + l2] -= fdiff / sqrt(gradsq[vector_size + l2]); 138 | fdiff *= fdiff; 139 | gradsq[vector_size + l1] += fdiff; 140 | gradsq[vector_size + l2] += fdiff; 141 | 142 | } 143 | 144 | 145 | for(a = thread_lines_train; a < thread_lines_total; a++) { 146 | fread(&cr, sizeof(CREC), 1, fin); 147 | if(feof(fin)) break; 148 | 149 | if (sample > 0) { 150 | real r = (sample * train_words) / word_cnt[cr.word1]; 151 | real keep = sqrt(r) + r; 152 | if(keep < 1) cr.val *= keep; 153 | } 154 | 155 | /* Get location of words in W & gradsq */ 156 | l1 = (cr.word1 - 1LL) * (vector_size + 1); // cr word indices start at 1 157 | l2 = ((cr.word2 - 1LL) + vocab_size) * (vector_size + 1); // shift by vocab_size to get separate vectors for context words 158 | 159 | /* Calculate cost, save diff for gradients */ 160 | diff = 0; 161 | for(b = 0; b < vector_size; b++) diff += W[b + l1] * W[b + l2]; // dot product of word and context word vector 162 | diff += W[vector_size + l1] + W[vector_size + l2] - log(cr.val); // add separate bias for each word 163 | fdiff = (cr.val > x_max) ? 
diff : pow(cr.val / x_max, alpha) * diff; // multiply weighting function (f) with diff 164 | //fdiff = diff; 165 | cost_valid[id] += 0.5 * fdiff * diff; // weighted squared error 166 | } 167 | 168 | fclose(fin); 169 | pthread_exit(NULL); 170 | } 171 | 172 | /* Save params to file */ 173 | int save_params() { 174 | long long a, b; 175 | char format[20]; 176 | char output_file[MAX_STRING_LENGTH], output_file_gsq[MAX_STRING_LENGTH]; 177 | char *word = malloc(sizeof(char) * MAX_STRING_LENGTH); 178 | FILE *fid, *fout, *fgs; 179 | 180 | if(use_binary > 0) { // Save parameters in binary file 181 | sprintf(output_file,"%s.bin",save_W_file); 182 | fout = fopen(output_file,"wb"); 183 | if(fout == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_W_file); return 1;} 184 | for(a = 0; a < 2 * (long long)vocab_size * (vector_size + 1); a++) fwrite(&W[a], sizeof(real), 1,fout); 185 | fclose(fout); 186 | if(save_gradsq > 0) { 187 | sprintf(output_file_gsq,"%s.bin",save_gradsq_file); 188 | fgs = fopen(output_file_gsq,"wb"); 189 | if(fgs == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_gradsq_file); return 1;} 190 | for(a = 0; a < 2 * (long long)vocab_size * (vector_size + 1); a++) fwrite(&gradsq[a], sizeof(real), 1,fgs); 191 | fclose(fgs); 192 | } 193 | } 194 | if(use_binary != 1) { // Save parameters in text file 195 | sprintf(output_file,"%s.txt",save_W_file); 196 | if(save_gradsq > 0) { 197 | sprintf(output_file_gsq,"%s.txt",save_gradsq_file); 198 | fgs = fopen(output_file_gsq,"wb"); 199 | if(fgs == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_gradsq_file); return 1;} 200 | } 201 | fout = fopen(output_file,"wb"); 202 | fprintf(fout, "%lld %d\n",vocab_size,vector_size); 203 | if(fout == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_W_file); return 1;} 204 | fid = fopen(vocab_file, "r"); 205 | sprintf(format,"%%%ds",MAX_STRING_LENGTH); 206 | if(fid == NULL) {fprintf(stderr, "Unable to open file %s.\n",vocab_file); return 1;} 207 | for(a = 0; a < vocab_size; a++) { 208 | if(fscanf(fid,format,word) == 0) return 1; 209 | fprintf(fout, "%s",word); 210 | if(model == 0) { // Save all parameters (including bias) 211 | for(b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b]); 212 | for(b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", W[(vocab_size + a) * (vector_size + 1) + b]); 213 | } 214 | if(model == 1) // Save only "word" vectors (without bias) 215 | for(b = 0; b < vector_size; b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b]); 216 | if(model == 2) // Save "word + context word" vectors (without bias) 217 | for(b = 0; b < vector_size; b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b] + W[(vocab_size + a) * (vector_size + 1) + b]); 218 | fprintf(fout,"\n"); 219 | if(save_gradsq > 0) { // Save gradsq 220 | fprintf(fgs, "%s",word); 221 | for(b = 0; b < (vector_size + 1); b++) fprintf(fgs," %lf", gradsq[a * (vector_size + 1) + b]); 222 | for(b = 0; b < (vector_size + 1); b++) fprintf(fgs," %lf", gradsq[(vocab_size + a) * (vector_size + 1) + b]); 223 | fprintf(fgs,"\n"); 224 | } 225 | if(fscanf(fid,format,word) == 0) return 1; // Eat irrelevant frequency entry 226 | } 227 | fclose(fid); 228 | fclose(fout); 229 | if(save_gradsq > 0) fclose(fgs); 230 | } 231 | return 0; 232 | } 233 | 234 | /* Train model */ 235 | int train_glove() { 236 | long long a, file_size; 237 | int b; 238 | char tfile[MAX_STRING_LENGTH]; 239 | FILE *fin; 240 | real total_cost = 0; 241 | real total_cost_valid = 0; 242 | fprintf(stderr, "TRAINING MODEL\n"); 
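/*
 * Each glove_thread pass above minimizes the weighted least-squares GloVe cost
 *     J = sum_ij 0.5 * f(X_ij) * (w_i . w~_j + b_i + b~_j - log X_ij)^2
 * with f(x) = (x/x_max)^alpha for x < x_max and 1 otherwise, using AdaGrad-style
 * per-parameter steps (schematically: W -= eta * grad / sqrt(gradsq); gradsq += grad^2,
 * with gradsq initialised to 1 so the first step uses the plain learning rate).
 * In this version the last (1 - train_percentage) of every thread's slice of the
 * co-occurrence file only accumulates cost_valid and never updates W, and when
 * sample > 0 co-occurrence values whose first word is very frequent are scaled
 * down (word2vec-style subsampling applied to cr.val).
 */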
243 | 244 | fin = fopen(input_file, "rb"); 245 | if(fin == NULL) {fprintf(stderr,"Unable to open cooccurrence file %s.\n",input_file); return 1;} 246 | fseeko(fin, 0, SEEK_END); 247 | file_size = ftello(fin); 248 | num_lines = file_size/(sizeof(CREC)); // Assuming the file isn't corrupt and consists only of CREC's 249 | fclose(fin); 250 | fprintf(stderr,"Read %lld lines.\n", num_lines); 251 | if(verbose > 1) fprintf(stderr,"Initializing parameters..."); 252 | initialize_parameters(); 253 | if(verbose > 1) fprintf(stderr,"done.\n"); 254 | if(verbose > 0) fprintf(stderr,"vector size: %d\n", vector_size); 255 | if(verbose > 0) fprintf(stderr,"vocab size: %lld\n", vocab_size); 256 | if(verbose > 0) fprintf(stderr,"x_max: %lf\n", x_max); 257 | if(verbose > 0) fprintf(stderr,"alpha: %lf\n", alpha); 258 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 259 | lines_per_thread = (long long *) malloc(num_threads * sizeof(long long)); 260 | 261 | // Lock-free asynchronous SGD 262 | 263 | strcpy(tfile, save_W_file); 264 | 265 | for(b = 0; b < num_iter; b++) { 266 | total_cost_valid = total_cost = 0; 267 | for (a = 0; a < num_threads - 1; a++) lines_per_thread[a] = num_lines / num_threads; 268 | lines_per_thread[a] = num_lines / num_threads + num_lines % num_threads; 269 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, glove_thread, (void *)a); 270 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 271 | for (a = 0; a < num_threads; a++) total_cost += cost[a]; 272 | for (a = 0; a < num_threads; a++) total_cost_valid += cost_valid[a]; 273 | fprintf(stdout,"iter: %03d cost: %lf valid: %lf\n", b+1, 274 | total_cost/(num_lines*train_percentage), 275 | total_cost_valid/(num_lines*(1-train_percentage))); 276 | fflush(stdout); 277 | sprintf(save_W_file, "%s_%d", tfile, b+1); 278 | //if(b % 10 == 9) 279 | save_params(); 280 | } 281 | //strcpy(save_W_file, tfile); 282 | return save_params(); 283 | } 284 | 285 | int find_arg(char *str, int argc, char **argv) { 286 | int i; 287 | for (i = 1; i < argc; i++) { 288 | if(!scmp(str, argv[i])) { 289 | if (i == argc - 1) { 290 | printf("No argument given for %s\n", str); 291 | exit(1); 292 | } 293 | return i; 294 | } 295 | } 296 | return -1; 297 | } 298 | 299 | int main(int argc, char **argv) { 300 | int i, j = 0; 301 | FILE *fid; 302 | char format[20], str[MAX_STRING_LENGTH + 1]; 303 | long long id; 304 | vocab_file = malloc(sizeof(char) * MAX_STRING_LENGTH); 305 | input_file = malloc(sizeof(char) * MAX_STRING_LENGTH); 306 | save_W_file = malloc(sizeof(char) * MAX_STRING_LENGTH); 307 | save_gradsq_file = malloc(sizeof(char) * MAX_STRING_LENGTH); 308 | 309 | if (argc == 1) { 310 | printf("GloVe: Global Vectors for Word Representation, v0.2\n"); 311 | printf("Author: Jeffrey Pennington (jpennin@stanford.edu)\n\n"); 312 | printf("Usage options:\n"); 313 | printf("\t-verbose \n"); 314 | printf("\t\tSet verbosity: 0, 1, or 2 (default)\n"); 315 | printf("\t-vector-size \n"); 316 | printf("\t\tDimension of word vector representations (excluding bias term); default 50\n"); 317 | printf("\t-threads \n"); 318 | printf("\t\tNumber of threads; default 8\n"); 319 | printf("\t-iter \n"); 320 | printf("\t\tNumber of training iterations; default 25\n"); 321 | printf("\t-eta \n"); 322 | printf("\t\tInitial learning rate; default 0.05\n"); 323 | printf("\t-alpha \n"); 324 | printf("\t\tParameter in exponent of weighting function; default 0.75\n"); 325 | printf("\t-x-max \n"); 326 | printf("\t\tParameter specifying cutoff in weighting 
function; default 100.0\n"); 327 | printf("\t-binary \n"); 328 | printf("\t\tSave output in binary format (0: text, 1: binary, 2: both); default 0\n"); 329 | printf("\t-model \n"); 330 | printf("\t\tModel for word vector output (for text output only); default 2\n"); 331 | printf("\t\t 0: output all data, for both word and context word vectors, including bias terms\n"); 332 | printf("\t\t 1: output word vectors, excluding bias terms\n"); 333 | printf("\t\t 2: output word vectors + context word vectors, excluding bias terms\n"); 334 | printf("\t-input-file \n"); 335 | printf("\t\tBinary input file of shuffled cooccurrence data (produced by 'cooccur' and 'shuffle'); default cooccurrence.shuf.bin\n"); 336 | printf("\t-vocab-file \n"); 337 | printf("\t\tFile containing vocabulary (truncated unigram counts, produced by 'vocab_count'); default vocab.txt\n"); 338 | printf("\t-save-file \n"); 339 | printf("\t\tFilename, excluding extension, for word vector output; default vectors\n"); 340 | printf("\t-gradsq-file \n"); 341 | printf("\t\tFilename, excluding extension, for squared gradient output; default gradsq\n"); 342 | printf("\t-save-gradsq \n"); 343 | printf("\t\tSave accumulated squared gradients; default 0 (off); ignored if gradsq-file is specified\n"); 344 | printf("\nExample usage:\n"); 345 | printf("./glove -input-file cooccurrence.shuf.bin -vocab-file vocab.txt -save-file vectors -gradsq-file gradsq -verbose 2 -vector-size 100 -threads 16 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 2 -model 2\n\n"); 346 | return 0; 347 | } 348 | 349 | 350 | if ((i = find_arg((char *)"-verbose", argc, argv)) > 0) verbose = atoi(argv[i + 1]); 351 | if ((i = find_arg((char *)"-vector-size", argc, argv)) > 0) vector_size = atoi(argv[i + 1]); 352 | if ((i = find_arg((char *)"-iter", argc, argv)) > 0) num_iter = atoi(argv[i + 1]); 353 | if ((i = find_arg((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 354 | cost = malloc(sizeof(real) * num_threads); 355 | cost_valid = malloc(sizeof(real) * num_threads); 356 | if ((i = find_arg((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 357 | if ((i = find_arg((char *)"-x-max", argc, argv)) > 0) x_max = atof(argv[i + 1]); 358 | if ((i = find_arg((char *)"-eta", argc, argv)) > 0) eta = atof(argv[i + 1]); 359 | if ((i = find_arg((char *)"-binary", argc, argv)) > 0) use_binary = atoi(argv[i + 1]); 360 | if ((i = find_arg((char *)"-model", argc, argv)) > 0) model = atoi(argv[i + 1]); 361 | if(model != 0 && model != 1) model = 2; 362 | if ((i = find_arg((char *)"-save-gradsq", argc, argv)) > 0) save_gradsq = atoi(argv[i + 1]); 363 | if ((i = find_arg((char *)"-vocab-file", argc, argv)) > 0) strcpy(vocab_file, argv[i + 1]); 364 | else strcpy(vocab_file, (char *)"vocab.txt"); 365 | if ((i = find_arg((char *)"-save-file", argc, argv)) > 0) strcpy(save_W_file, argv[i + 1]); 366 | else strcpy(save_W_file, (char *)"vectors"); 367 | if ((i = find_arg((char *)"-gradsq-file", argc, argv)) > 0) { 368 | strcpy(save_gradsq_file, argv[i + 1]); 369 | save_gradsq = 1; 370 | } 371 | else if(save_gradsq > 0) strcpy(save_gradsq_file, (char *)"gradsq"); 372 | if ((i = find_arg((char *)"-input-file", argc, argv)) > 0) strcpy(input_file, argv[i + 1]); 373 | else strcpy(input_file, (char *)"cooccurrence.shuf.bin"); 374 | 375 | // count vocab_size 376 | vocab_size = 0; 377 | fid = fopen(vocab_file, "r"); 378 | if(fid == NULL) {fprintf(stderr, "Unable to open vocab file %s.\n",vocab_file); return 1;} 379 | while ((i = getc(fid)) != EOF) if (i == '\n') vocab_size++; 
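/*
 * vocab_file is expected to contain one "word count" pair per line (the truncated
 * unigram counts written by the 'vocab_count' tool), for example (illustrative):
 *     the 1061396
 *     of  593677
 * The line count determines vocab_size; the counts are then read into word_cnt[]
 * and summed into train_words for the subsampling of co-occurrence values.
 */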
// Count number of entries in vocab_file 380 | fclose(fid); 381 | 382 | train_words = 0; 383 | word_cnt = (long long *)malloc(sizeof(long long) * (vocab_size+1)); // frequency for each word 384 | fid = fopen(vocab_file, "r"); 385 | sprintf(format,"%%%ds %%lld", MAX_STRING_LENGTH); // Format to read from vocab file, which has frequency data 386 | while(fscanf(fid, format, str, &id) != EOF) { 387 | word_cnt[++j] = id; 388 | train_words += id; 389 | } 390 | fclose(fid); 391 | 392 | return train_glove(); 393 | } 394 | -------------------------------------------------------------------------------- /embedding/lbl.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define MAX_STRING 100 22 | #define EXP_TABLE_SIZE 1000 23 | #define MAX_EXP 6 24 | #define MAX_SENTENCE_LENGTH 1000 25 | #define MAX_CODE_LENGTH 40 26 | 27 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 28 | 29 | typedef float real; // Precision of float numbers 30 | 31 | struct vocab_word { 32 | long long cn; 33 | int *point; 34 | char *word, *code, codelen; 35 | }; 36 | 37 | char train_file[MAX_STRING], output_file[MAX_STRING]; 38 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 39 | struct vocab_word *vocab; 40 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1; 41 | int *vocab_hash; 42 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100, input_size = 500, hidden_size = 50; 43 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0; 44 | real alpha = 0.025, starting_alpha, sample = 1e-3; 45 | real *syn0, *syn1, *syn1neg, *expTable, *syn1neg_gdsq, *syn0_gdsq, *hidden, *hidden_gdsq; 46 | double *loss, *lossV, sum_loss, sum_lossV; 47 | long long *loss_cnt, *lossV_cnt, sum_loss_cnt, sum_lossV_cnt; 48 | 49 | clock_t start; 50 | 51 | int hs = 0, negative = 5; 52 | const int table_size = 1e8; 53 | int *table; 54 | 55 | void InitUnigramTable() { 56 | int a, i; 57 | long long train_words_pow = 0; 58 | real d1, power = 0.75; 59 | table = (int *)malloc(table_size * sizeof(int)); 60 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 61 | i = 0; 62 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow; 63 | for (a = 0; a < table_size; a++) { 64 | table[a] = i; 65 | if (a / (real)table_size > d1) { 66 | i++; 67 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow; 68 | } 69 | if (i >= vocab_size) i = vocab_size - 1; 70 | } 71 | } 72 | 73 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 74 | void ReadWord(char *word, FILE *fin) { 75 | int a = 0, ch; 76 | while (!feof(fin)) { 77 | ch = fgetc(fin); 78 | if (ch == 13) continue; 79 | if ((ch == ' ') || (ch == 
'\t') || (ch == '\n')) { 80 | if (a > 0) { 81 | if (ch == '\n') ungetc(ch, fin); 82 | break; 83 | } 84 | if (ch == '\n') { 85 | strcpy(word, (char *)""); 86 | return; 87 | } else continue; 88 | } 89 | word[a] = ch; 90 | a++; 91 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 92 | } 93 | word[a] = 0; 94 | } 95 | 96 | // Returns hash value of a word 97 | int GetWordHash(char *word) { 98 | unsigned long long a, hash = 0; 99 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 100 | hash = hash % vocab_hash_size; 101 | return hash; 102 | } 103 | 104 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 105 | int SearchVocab(char *word) { 106 | unsigned int hash = GetWordHash(word); 107 | while (1) { 108 | if (vocab_hash[hash] == -1) return -1; 109 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 110 | hash = (hash + 1) % vocab_hash_size; 111 | } 112 | return -1; 113 | } 114 | 115 | // Reads a word and returns its index in the vocabulary 116 | int ReadWordIndex(FILE *fin) { 117 | char word[MAX_STRING]; 118 | ReadWord(word, fin); 119 | if (feof(fin)) return -1; 120 | return SearchVocab(word); 121 | } 122 | 123 | // Adds a word to the vocabulary 124 | int AddWordToVocab(char *word) { 125 | unsigned int hash, length = strlen(word) + 1; 126 | if (length > MAX_STRING) length = MAX_STRING; 127 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 128 | strcpy(vocab[vocab_size].word, word); 129 | vocab[vocab_size].cn = 0; 130 | vocab_size++; 131 | // Reallocate memory if needed 132 | if (vocab_size + 2 >= vocab_max_size) { 133 | vocab_max_size += 1000; 134 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 135 | } 136 | hash = GetWordHash(word); 137 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 138 | vocab_hash[hash] = vocab_size - 1; 139 | return vocab_size - 1; 140 | } 141 | 142 | // Used later for sorting by word counts 143 | int VocabCompare(const void *a, const void *b) { 144 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 145 | } 146 | 147 | // Sorts the vocabulary by frequency using word counts 148 | void SortVocab() { 149 | int a, size; 150 | unsigned int hash; 151 | // Sort the vocabulary and keep at the first position 152 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 153 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 154 | size = vocab_size; 155 | train_words = 0; 156 | for (a = 0; a < size; a++) { 157 | // Words occuring less than min_count times will be discarded from the vocab 158 | if ((vocab[a].cn < min_count) && (a != 0)) { 159 | vocab_size--; 160 | free(vocab[a].word); 161 | } else { 162 | // Hash will be re-computed, as after the sorting it is not actual 163 | hash=GetWordHash(vocab[a].word); 164 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 165 | vocab_hash[hash] = a; 166 | train_words += vocab[a].cn; 167 | } 168 | } 169 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 170 | // Allocate memory for the binary tree construction 171 | for (a = 0; a < vocab_size; a++) { 172 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 173 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 174 | } 175 | } 176 | 177 | // Reduces the vocabulary by removing infrequent tokens 178 | void ReduceVocab() { 179 | int a, b = 0; 180 | unsigned int hash; 181 | for (a = 0; a < vocab_size; a++) if 
(vocab[a].cn > min_reduce) { 182 | vocab[b].cn = vocab[a].cn; 183 | vocab[b].word = vocab[a].word; 184 | b++; 185 | } else free(vocab[a].word); 186 | vocab_size = b; 187 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 188 | for (a = 0; a < vocab_size; a++) { 189 | // Hash will be re-computed, as it is not actual 190 | hash = GetWordHash(vocab[a].word); 191 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 192 | vocab_hash[hash] = a; 193 | } 194 | fflush(stdout); 195 | min_reduce++; 196 | } 197 | 198 | // Create binary Huffman tree using the word counts 199 | // Frequent words will have short uniqe binary codes 200 | void CreateBinaryTree() { 201 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 202 | char code[MAX_CODE_LENGTH]; 203 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 204 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 205 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 206 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 207 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 208 | pos1 = vocab_size - 1; 209 | pos2 = vocab_size; 210 | // Following algorithm constructs the Huffman tree by adding one node at a time 211 | for (a = 0; a < vocab_size - 1; a++) { 212 | // First, find two smallest nodes 'min1, min2' 213 | if (pos1 >= 0) { 214 | if (count[pos1] < count[pos2]) { 215 | min1i = pos1; 216 | pos1--; 217 | } else { 218 | min1i = pos2; 219 | pos2++; 220 | } 221 | } else { 222 | min1i = pos2; 223 | pos2++; 224 | } 225 | if (pos1 >= 0) { 226 | if (count[pos1] < count[pos2]) { 227 | min2i = pos1; 228 | pos1--; 229 | } else { 230 | min2i = pos2; 231 | pos2++; 232 | } 233 | } else { 234 | min2i = pos2; 235 | pos2++; 236 | } 237 | count[vocab_size + a] = count[min1i] + count[min2i]; 238 | parent_node[min1i] = vocab_size + a; 239 | parent_node[min2i] = vocab_size + a; 240 | binary[min2i] = 1; 241 | } 242 | // Now assign binary code to each vocabulary word 243 | for (a = 0; a < vocab_size; a++) { 244 | b = a; 245 | i = 0; 246 | while (1) { 247 | code[i] = binary[b]; 248 | point[i] = b; 249 | i++; 250 | b = parent_node[b]; 251 | if (b == vocab_size * 2 - 2) break; 252 | } 253 | vocab[a].codelen = i; 254 | vocab[a].point[0] = vocab_size - 2; 255 | for (b = 0; b < i; b++) { 256 | vocab[a].code[i - b - 1] = code[b]; 257 | vocab[a].point[i - b] = point[b] - vocab_size; 258 | } 259 | } 260 | free(count); 261 | free(binary); 262 | free(parent_node); 263 | } 264 | 265 | void LearnVocabFromTrainFile() { 266 | char word[MAX_STRING]; 267 | FILE *fin; 268 | long long a, i; 269 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 270 | fin = fopen(train_file, "rb"); 271 | if (fin == NULL) { 272 | printf("ERROR: training data file not found!\n"); 273 | exit(1); 274 | } 275 | vocab_size = 0; 276 | AddWordToVocab((char *)""); 277 | while (1) { 278 | ReadWord(word, fin); 279 | if (feof(fin)) break; 280 | train_words++; 281 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 282 | printf("%lldK%c", train_words / 1000, 13); 283 | fflush(stdout); 284 | } 285 | i = SearchVocab(word); 286 | if (i == -1) { 287 | a = AddWordToVocab(word); 288 | vocab[a].cn = 1; 289 | } else vocab[i].cn++; 290 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 291 | } 292 | SortVocab(); 293 | if (debug_mode > 0) { 294 | printf("Vocab size: %lld\n", vocab_size); 295 | printf("Words in train file: %lld\n", train_words); 296 | } 
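/*
 * The vocabulary is an open-addressing hash table with linear probing:
 * GetWordHash() maps a word to (sum hash*257 + c) mod vocab_hash_size, and
 * SearchVocab()/AddWordToVocab() probe forward until an empty (-1) slot or a
 * matching word is found.  Once vocab_size exceeds 70% of vocab_hash_size,
 * ReduceVocab() prunes words with count <= min_reduce and then increments
 * min_reduce, so each pruning pass becomes progressively more aggressive.
 */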
297 | file_size = ftell(fin); 298 | fclose(fin); 299 | } 300 | 301 | void SaveVocab() { 302 | long long i; 303 | FILE *fo = fopen(save_vocab_file, "wb"); 304 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 305 | fclose(fo); 306 | } 307 | 308 | void ReadVocab() { 309 | long long a, i = 0; 310 | char c; 311 | char word[MAX_STRING]; 312 | FILE *fin = fopen(read_vocab_file, "rb"); 313 | if (fin == NULL) { 314 | printf("Vocabulary file not found\n"); 315 | exit(1); 316 | } 317 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 318 | vocab_size = 0; 319 | while (1) { 320 | ReadWord(word, fin); 321 | if (feof(fin)) break; 322 | a = AddWordToVocab(word); 323 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); 324 | i++; 325 | } 326 | SortVocab(); 327 | if (debug_mode > 0) { 328 | printf("Vocab size: %lld\n", vocab_size); 329 | printf("Words in train file: %lld\n", train_words); 330 | } 331 | fin = fopen(train_file, "rb"); 332 | if (fin == NULL) { 333 | printf("ERROR: training data file not found!\n"); 334 | exit(1); 335 | } 336 | fseek(fin, 0, SEEK_END); 337 | file_size = ftell(fin); 338 | fclose(fin); 339 | } 340 | 341 | void InitNet() { 342 | long long a, b; 343 | unsigned long long next_random = 1; 344 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)); 345 | a = posix_memalign((void **)&syn0_gdsq, 128, (long long)vocab_size * layer1_size * sizeof(real)); 346 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} 347 | if (hs) { 348 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); 349 | if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} 350 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 351 | syn1[a * layer1_size + b] = 0; 352 | } 353 | if (negative>0) { 354 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * hidden_size * sizeof(real)); 355 | a = posix_memalign((void **)&syn1neg_gdsq, 128, (long long)vocab_size * hidden_size * sizeof(real)); 356 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 357 | for (a = 0; a < vocab_size; a++) for (b = 0; b < hidden_size; b++){ 358 | syn1neg[a * hidden_size + b] = 0; 359 | syn1neg_gdsq[a * hidden_size + b] = 1e-8; 360 | } 361 | a = posix_memalign((void **)&hidden, 128, (long long)input_size * hidden_size * sizeof(real)); 362 | a = posix_memalign((void **)&hidden_gdsq, 128, (long long)input_size * hidden_size * sizeof(real)); 363 | for (a = 0; a < input_size; a++) for (b = 0; b < hidden_size; b++) { 364 | next_random = next_random * (unsigned long long)25214903917 + 11; 365 | hidden[a * hidden_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / hidden_size; 366 | hidden_gdsq[a * hidden_size + b] = 1e-8; 367 | } 368 | } 369 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) { 370 | next_random = next_random * (unsigned long long)25214903917 + 11; 371 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 372 | syn0_gdsq[a * layer1_size + b] = 1e-8; 373 | } 374 | CreateBinaryTree(); 375 | } 376 | 377 | void writeWV(char *output_file){ 378 | long long a, b; 379 | FILE *fo = fopen(output_file, "wb"); 380 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 381 | for (a = 0; a < vocab_size; a++) { 382 | fprintf(fo, "%s ", vocab[a].word); 383 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 384 | else for (b = 0; b < 
layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 385 | fprintf(fo, "\n"); 386 | } 387 | fclose(fo); 388 | } 389 | 390 | typedef unsigned long uint64_t; 391 | typedef unsigned int uint32_t; 392 | 393 | double rsqrt64(double number) { 394 | uint64_t i; 395 | double x2, y; 396 | x2 = number * 0.5; 397 | y = number; 398 | i = *(uint64_t *) &y; 399 | i = 0x5fe6eb50c7b537a9 - (i >> 1); 400 | y = *(double *) &i; 401 | y = y * (1.5 - (x2 * y * y)); 402 | return y; 403 | } 404 | 405 | float rsqrt(float number){ 406 | uint32_t i; 407 | float x2, y; 408 | x2 = number * 0.5F; 409 | y = number; 410 | i = *(uint32_t *) &y; 411 | i = 0x5f3759df - ( i >> 1 ); 412 | y = *(float *) &i; 413 | y = y * ( 1.5F - ( x2 * y * y ) ); 414 | return y; 415 | } 416 | 417 | 418 | //b = Ax 419 | void fastmult(real *A, real *x, real *b, int xlen, int blen){ 420 | real val1, val2, val3, val4; 421 | real val5, val6, val7, val8; 422 | int i, j; 423 | for (i=0; i 10000) { 482 | loss[id] += err; 483 | lossV[id] += errV; 484 | loss_cnt[id] += err_cnt; 485 | lossV_cnt[id] += errV_cnt; 486 | 487 | sum_loss += err; 488 | sum_lossV += errV; 489 | sum_loss_cnt += err_cnt; 490 | sum_lossV_cnt += errV_cnt; 491 | 492 | err = errV = 0; 493 | err_cnt = errV_cnt = 0; 494 | word_count_actual += word_count - last_word_count; 495 | last_word_count = word_count; 496 | if ((debug_mode > 1) && id == 0) { 497 | now=clock(); 498 | printf("%cAlpha: %f Err: %lf ErrV: %lf Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 499 | -sum_loss / sum_loss_cnt / (negative + 1), 500 | -sum_lossV / sum_lossV_cnt / (negative + 1), 501 | word_count_actual / (real)(train_words + 1) * 100, 502 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 503 | fflush(stdout); 504 | } 505 | //alpha = starting_alpha; 506 | //alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 507 | //if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 508 | } 509 | if (sentence_length == 0) { 510 | while (1) { 511 | word = ReadWordIndex(fi); 512 | if (feof(fi)) break; 513 | if (word == -1) continue; 514 | word_count++; 515 | if (word == 0) break; 516 | // The subsampling randomly discards frequent words while keeping the ranking same 517 | if (sample > 0) { 518 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 519 | next_random = next_random * (unsigned long long)25214903917 + 11; 520 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 521 | } 522 | sen[sentence_length] = word; 523 | sentence_length++; 524 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 525 | } 526 | sentence_position = 0; 527 | } 528 | int validSet = 0; 529 | if(word_count > (int)(0.95 * train_words / num_threads)) 530 | validSet = 1; 531 | if (feof(fi) || (word_count > train_words / num_threads)) { 532 | word_count_actual += word_count - last_word_count; 533 | break; 534 | } 535 | word = sen[sentence_position]; 536 | if (word == -1) continue; 537 | //for (c = 0; c < input_size; c++) neu1[c] = 0; 538 | for (c = 0; c < input_size; c++) neu1e[c] = 0; 539 | for (c = 0; c < hidden_size; c++) neu2[c] = 0; 540 | for (c = 0; c < hidden_size; c++) neu2e[c] = 0; 541 | next_random = next_random * (unsigned long long)25214903917 + 11; 542 | //b = next_random % window; 543 | b = 0; 544 | if (cbow) { //train the cbow architecture 545 | // in -> hidden 546 | cw = 0; 547 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 548 | c = sentence_position - window + a; 
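/* Gather the 2*window context words: out-of-range positions and unknown words fall
   back to index 0 (the sentence-boundary token), and each context embedding is copied
   into slot cw of neu1, so neu1 becomes the concatenated context (input_size =
   2*window*layer1_size) that fastmult() projects through `hidden` into neu2.
   Note that sen[c] is read before the c < 0 / c >= sentence_length checks below;
   those checks only reset last_word to 0 afterwards. */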
549 | last_word = sen[c]; 550 | if (c < 0) last_word = 0; 551 | if (c >= sentence_length) last_word = 0; 552 | if (last_word == -1) last_word = 0; 553 | for (c = 0; c < layer1_size; c++) neu1[cw * layer1_size + c] = syn0[c + last_word * layer1_size]; 554 | cw++; 555 | } 556 | fastmult(hidden, neu1, neu2, input_size, hidden_size); 557 | //for(a = 0; a < hidden_size; a++) 558 | // neu2[a] = tanh(neu2[a]); 559 | if (cw == window * 2) { 560 | //for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 561 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 562 | f = 0; 563 | l2 = vocab[word].point[d] * layer1_size; 564 | // Propagate hidden -> output 565 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 566 | if (f <= -MAX_EXP) continue; 567 | else if (f >= MAX_EXP) continue; 568 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 569 | // 'g' is the gradient multiplied by the learning rate 570 | g = (1 - vocab[word].code[d] - f) * alpha; 571 | // Propagate errors output -> hidden 572 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 573 | // Learn weights hidden -> output 574 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 575 | } 576 | // NEGATIVE SAMPLING 577 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 578 | if (d == 0) { 579 | target = word; 580 | label = 1; 581 | } else { 582 | next_random = next_random * (unsigned long long)25214903917 + 11; 583 | target = table[(next_random >> 16) % table_size]; 584 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 585 | if (target == word) continue; 586 | label = 0; 587 | } 588 | l2 = target * hidden_size; 589 | f = 0; 590 | for (c = 0; c < hidden_size; c++) f += neu2[c] * syn1neg[c + l2]; 591 | if (f > MAX_EXP) g = (label - 1); 592 | else if (f < -MAX_EXP) g = (label - 0); 593 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 594 | 595 | if(label) f = -f; 596 | if(validSet) errV += log(1/(1+exp(f))); 597 | else err += log(1/(1+exp(f))); 598 | 599 | //for (c = 0; c < input_size; c++) neu1e[c] += g * syn1neg[c + l2]; 600 | //for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 601 | if(!validSet) for (c = 0; c < hidden_size; c++) { 602 | neu2e[c] += g * syn1neg[c + l2];// *(1 - neu2[c] * neu2[c]); 603 | real diff = g * neu2[c]; 604 | syn1neg_gdsq[c + l2] += diff * diff; 605 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 606 | // 607 | } 608 | } 609 | 610 | if(validSet) errV_cnt++; 611 | else err_cnt++; 612 | // hidden -> in 613 | if(!validSet) { 614 | long long i, j; 615 | for(i = 0; i < hidden_size; i++){ 616 | for(j = 0; j < input_size; j++){ 617 | neu1e[j] += neu2e[i] * hidden[i*input_size+j]; 618 | } 619 | } 620 | 621 | for(i = 0; i < hidden_size; i++){ 622 | for(j = 0; j < input_size; j++){ 623 | int t = i*input_size+j; 624 | real diff = neu1[j] * neu2e[i]; 625 | hidden_gdsq[t] += diff * diff; 626 | hidden[t] += alpha * diff * rsqrt64(hidden_gdsq[t]); 627 | } 628 | } 629 | cw = 0; 630 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 631 | c = sentence_position - window + a; 632 | last_word = sen[c]; 633 | if (c < 0) last_word = 0; 634 | if (c >= sentence_length) last_word = 0; 635 | if (last_word == -1) last_word = 0; 636 | //for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 637 | for (c = 0; c < layer1_size; c++) { 638 | real diff = neu1e[cw * layer1_size + c]; 639 | long long p = c + last_word * layer1_size; 640 | syn0_gdsq[p] += diff * diff; 641 
| syn0[p] += alpha * diff * rsqrt(syn0_gdsq[p]); 642 | } 643 | cw++; 644 | } 645 | } 646 | } 647 | } else { //train skip-gram 648 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 649 | c = sentence_position - window + a; 650 | if (c < 0) continue; 651 | if (c >= sentence_length) continue; 652 | last_word = sen[c]; 653 | if (last_word == -1) continue; 654 | l1 = last_word * layer1_size; 655 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 656 | // HIERARCHICAL SOFTMAX 657 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 658 | f = 0; 659 | l2 = vocab[word].point[d] * layer1_size; 660 | // Propagate hidden -> output 661 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; 662 | if (f <= -MAX_EXP) continue; 663 | else if (f >= MAX_EXP) continue; 664 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 665 | // 'g' is the gradient multiplied by the learning rate 666 | g = (1 - vocab[word].code[d] - f) * alpha; 667 | // Propagate errors output -> hidden 668 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 669 | // Learn weights hidden -> output 670 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; 671 | } 672 | // NEGATIVE SAMPLING 673 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 674 | if (d == 0) { 675 | target = word; 676 | label = 1; 677 | } else { 678 | next_random = next_random * (unsigned long long)25214903917 + 11; 679 | target = table[(next_random >> 16) % table_size]; 680 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 681 | if (target == word) continue; 682 | label = 0; 683 | } 684 | l2 = target * layer1_size; 685 | f = 0; 686 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 687 | if (f > MAX_EXP) g = (label - 1); 688 | else if (f < -MAX_EXP) g = (label - 0); 689 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 690 | 691 | if(label) f = -f; 692 | if(validSet) errV += log(1/(1+exp(f))); 693 | else err += log(1/(1+exp(f))); 694 | 695 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 696 | for (c = 0; c < layer1_size; c++) { 697 | real diff = g * syn0[c + l1]; 698 | syn1neg_gdsq[c + l2] += diff * diff; 699 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 700 | // 701 | } 702 | } 703 | // Learn weights input -> hidden 704 | for (c = 0; c < layer1_size; c++) { 705 | real diff = neu1e[c]; 706 | syn0_gdsq[c + l1] += diff * diff; 707 | syn0[c + l1] += alpha * diff * rsqrt(syn0_gdsq[c + l1]); 708 | } 709 | if(validSet) errV_cnt++; 710 | else err_cnt++; 711 | } 712 | 713 | } 714 | sentence_position++; 715 | if (sentence_position >= sentence_length) { 716 | sentence_length = 0; 717 | continue; 718 | } 719 | } 720 | fclose(fi); 721 | free(neu1); 722 | free(neu1e); 723 | pthread_exit(NULL); 724 | } 725 | 726 | void writeFile(const char *name, double *A, long long size){ 727 | FILE *fout = fopen(name, "wb"); 728 | fwrite(A, sizeof(real), size, fout); 729 | fclose(fout); 730 | } 731 | 732 | void dump(){ 733 | writeFile("syn0", syn0, vocab_size * layer1_size); 734 | writeFile("syn0_gdsq", syn0_gdsq, vocab_size * layer1_size); 735 | writeFile("syn1neg", syn1neg, vocab_size * hidden_size); 736 | writeFile("syn1neg_gdsq", syn1neg_gdsq, vocab_size * hidden_size); 737 | writeFile("hidden", hidden, input_size * hidden_size); 738 | writeFile("hidden_gdsq", hidden_gdsq, input_size * hidden_size); 739 | } 740 | 741 | void TrainModel() { 742 | long a, b, c, d; 743 | FILE *fo; 744 | char 
ffname[100]; 745 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 746 | loss = (double *)malloc(num_threads * sizeof(double)); 747 | lossV = (double *)malloc(num_threads * sizeof(double)); 748 | loss_cnt = (long long *)malloc(num_threads * sizeof(long long)); 749 | lossV_cnt = (long long *)malloc(num_threads * sizeof(long long)); 750 | 751 | printf("Starting training using file %s\n", train_file); 752 | starting_alpha = alpha; 753 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 754 | if (save_vocab_file[0] != 0) SaveVocab(); 755 | if (output_file[0] == 0) return; 756 | InitNet(); 757 | if (negative > 0) InitUnigramTable(); 758 | 759 | for(b = 1; b <= iter; b++){ 760 | start = clock(); 761 | word_count_actual = 0; 762 | for (a = 0; a < num_threads; a++){ 763 | loss[a] = lossV[a] = 0; 764 | loss_cnt[a] = lossV_cnt[a] = 0; 765 | } 766 | sum_loss = sum_lossV = 0; 767 | sum_loss_cnt = sum_lossV_cnt = 0; 768 | 769 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 770 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 771 | 772 | sprintf(ffname, "%s_%ld", output_file, b); 773 | writeWV(ffname); 774 | dump(); 775 | printf("%c", 13); 776 | 777 | sum_loss = sum_lossV = 0; 778 | sum_loss_cnt = sum_lossV_cnt = 0; 779 | for (a = 0; a < num_threads; a++){ 780 | sum_loss += loss[a]; 781 | sum_lossV += lossV[a]; 782 | sum_loss_cnt += loss_cnt[a]; 783 | sum_lossV_cnt += lossV_cnt[a]; 784 | } 785 | fprintf(stderr, "Iter: %ld Err: %lf ErrV: %lf\n", b, 786 | -sum_loss / sum_loss_cnt / (negative + 1), 787 | -sum_lossV / sum_lossV_cnt / (negative + 1)); 788 | fflush(stderr); 789 | } 790 | 791 | if (classes == 0) { 792 | // Save the word vectors 793 | //writeWV(output_file); 794 | } else { 795 | fo = fopen(output_file, "wb"); 796 | // Run K-means on the word vectors 797 | int clcn = classes, iter = 10, closeid; 798 | int *centcn = (int *)malloc(classes * sizeof(int)); 799 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 800 | real closev, x; 801 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 802 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 803 | for (a = 0; a < iter; a++) { 804 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 805 | for (b = 0; b < clcn; b++) centcn[b] = 1; 806 | for (c = 0; c < vocab_size; c++) { 807 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 808 | centcn[cl[c]]++; 809 | } 810 | for (b = 0; b < clcn; b++) { 811 | closev = 0; 812 | for (c = 0; c < layer1_size; c++) { 813 | cent[layer1_size * b + c] /= centcn[b]; 814 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 815 | } 816 | closev = sqrt(closev); 817 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 818 | } 819 | for (c = 0; c < vocab_size; c++) { 820 | closev = -10; 821 | closeid = 0; 822 | for (d = 0; d < clcn; d++) { 823 | x = 0; 824 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 825 | if (x > closev) { 826 | closev = x; 827 | closeid = d; 828 | } 829 | } 830 | cl[c] = closeid; 831 | } 832 | } 833 | // Save the K-means classes 834 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 835 | free(centcn); 836 | free(cent); 837 | free(cl); 838 | fclose(fo); 839 | } 840 | 841 | } 842 | 843 | int ArgPos(char *str, int argc, char **argv) { 844 | int a; 845 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 846 | if (a == argc - 
1) { 847 | printf("Argument missing for %s\n", str); 848 | exit(1); 849 | } 850 | return a; 851 | } 852 | return -1; 853 | } 854 | 855 | int main(int argc, char **argv) { 856 | int i; 857 | if (argc == 1) { 858 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 859 | printf("Options:\n"); 860 | printf("Parameters for training:\n"); 861 | printf("\t-train \n"); 862 | printf("\t\tUse text data from to train the model\n"); 863 | printf("\t-output \n"); 864 | printf("\t\tUse to save the resulting word vectors / word clusters\n"); 865 | printf("\t-size \n"); 866 | printf("\t\tSet size of word vectors; default is 100\n"); 867 | printf("\t-window \n"); 868 | printf("\t\tSet max skip length between words; default is 5\n"); 869 | printf("\t-sample \n"); 870 | printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n"); 871 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n"); 872 | printf("\t-hs \n"); 873 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 874 | printf("\t-negative \n"); 875 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n"); 876 | printf("\t-threads \n"); 877 | printf("\t\tUse threads (default 12)\n"); 878 | printf("\t-iter \n"); 879 | printf("\t\tRun more training iterations (default 5)\n"); 880 | printf("\t-min-count \n"); 881 | printf("\t\tThis will discard words that appear less than times; default is 5\n"); 882 | printf("\t-alpha \n"); 883 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n"); 884 | printf("\t-classes \n"); 885 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 886 | printf("\t-debug \n"); 887 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 888 | printf("\t-binary \n"); 889 | printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"); 890 | printf("\t-save-vocab \n"); 891 | printf("\t\tThe vocabulary will be saved to \n"); 892 | printf("\t-read-vocab \n"); 893 | printf("\t\tThe vocabulary will be read from , not constructed from the training data\n"); 894 | printf("\t-cbow \n"); 895 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n"); 896 | printf("\nExamples:\n"); 897 | printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 898 | return 0; 899 | } 900 | output_file[0] = 0; 901 | save_vocab_file[0] = 0; 902 | read_vocab_file[0] = 0; 903 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 904 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 905 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 906 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 907 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 908 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 909 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 910 | if (cbow) alpha = 0.05; 911 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 912 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 913 | if ((i = ArgPos((char *)"-window", 
argc, argv)) > 0) window = atoi(argv[i + 1]); 914 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 915 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 916 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 917 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 918 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 919 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 920 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 921 | input_size = layer1_size * window * 2; 922 | hidden_size = layer1_size; 923 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 924 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 925 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 926 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 927 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 928 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 929 | } 930 | TrainModel(); 931 | return 0; 932 | } 933 | -------------------------------------------------------------------------------- /embedding/nnlm.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
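/*
 * nnlm.c shares its vocabulary handling, unigram table and optimizer machinery with
 * lbl.c above: every weight matrix has a squared-gradient accumulator (*_gdsq,
 * initialised to 1e-8 in InitNet) and, as in lbl.c, updates are AdaGrad-style, with
 * 1/sqrt(x) approximated by the Quake-style bit hacks rsqrt()/rsqrt64().
 * Schematically, for a parameter w[p] with gradient g:
 *
 *     gdsq[p] += g * g;
 *     w[p]    += alpha * g * rsqrt(gdsq[p]);   // i.e. alpha * g / sqrt(sum of g^2)
 */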
14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define MAX_STRING 100 22 | #define EXP_TABLE_SIZE 1000 23 | #define MAX_EXP 6 24 | #define MAX_SENTENCE_LENGTH 1000 25 | #define MAX_CODE_LENGTH 40 26 | 27 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 28 | 29 | typedef float real; // Precision of float numbers 30 | 31 | struct vocab_word { 32 | long long cn; 33 | int *point; 34 | char *word, *code, codelen; 35 | }; 36 | 37 | char train_file[MAX_STRING], output_file[MAX_STRING]; 38 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 39 | struct vocab_word *vocab; 40 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1; 41 | int *vocab_hash; 42 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100, input_size = 500, hidden_size = 50; 43 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0; 44 | real alpha = 0.025, starting_alpha, sample = 1e-3; 45 | real *syn0, *syn1, *syn1neg, *expTable, *syn1neg_gdsq, *syn0_gdsq, *hidden, *hidden_gdsq; 46 | double *loss, *lossV, sum_loss, sum_lossV; 47 | long long *loss_cnt, *lossV_cnt, sum_loss_cnt, sum_lossV_cnt; 48 | 49 | clock_t start; 50 | 51 | int hs = 0, negative = 5; 52 | const int table_size = 1e8; 53 | int *table; 54 | 55 | void InitUnigramTable() { 56 | int a, i; 57 | long long train_words_pow = 0; 58 | real d1, power = 0.75; 59 | table = (int *)malloc(table_size * sizeof(int)); 60 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 61 | i = 0; 62 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow; 63 | for (a = 0; a < table_size; a++) { 64 | table[a] = i; 65 | if (a / (real)table_size > d1) { 66 | i++; 67 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow; 68 | } 69 | if (i >= vocab_size) i = vocab_size - 1; 70 | } 71 | } 72 | 73 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 74 | void ReadWord(char *word, FILE *fin) { 75 | int a = 0, ch; 76 | while (!feof(fin)) { 77 | ch = fgetc(fin); 78 | if (ch == 13) continue; 79 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 80 | if (a > 0) { 81 | if (ch == '\n') ungetc(ch, fin); 82 | break; 83 | } 84 | if (ch == '\n') { 85 | strcpy(word, (char *)""); 86 | return; 87 | } else continue; 88 | } 89 | word[a] = ch; 90 | a++; 91 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 92 | } 93 | word[a] = 0; 94 | } 95 | 96 | // Returns hash value of a word 97 | int GetWordHash(char *word) { 98 | unsigned long long a, hash = 0; 99 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 100 | hash = hash % vocab_hash_size; 101 | return hash; 102 | } 103 | 104 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 105 | int SearchVocab(char *word) { 106 | unsigned int hash = GetWordHash(word); 107 | while (1) { 108 | if (vocab_hash[hash] == -1) return -1; 109 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 110 | hash = (hash + 1) % vocab_hash_size; 111 | } 112 | return -1; 113 | } 114 | 115 | // Reads a word and returns its index in the vocabulary 116 | int ReadWordIndex(FILE *fin) { 117 | char word[MAX_STRING]; 118 | ReadWord(word, fin); 119 | if (feof(fin)) return -1; 120 | return SearchVocab(word); 121 | } 122 | 123 | // Adds a word to the vocabulary 124 | int AddWordToVocab(char *word) { 125 | unsigned int hash, length = strlen(word) + 1; 126 | if (length > 
MAX_STRING) length = MAX_STRING; 127 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 128 | strcpy(vocab[vocab_size].word, word); 129 | vocab[vocab_size].cn = 0; 130 | vocab_size++; 131 | // Reallocate memory if needed 132 | if (vocab_size + 2 >= vocab_max_size) { 133 | vocab_max_size += 1000; 134 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 135 | } 136 | hash = GetWordHash(word); 137 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 138 | vocab_hash[hash] = vocab_size - 1; 139 | return vocab_size - 1; 140 | } 141 | 142 | // Used later for sorting by word counts 143 | int VocabCompare(const void *a, const void *b) { 144 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 145 | } 146 | 147 | // Sorts the vocabulary by frequency using word counts 148 | void SortVocab() { 149 | int a, size; 150 | unsigned int hash; 151 | // Sort the vocabulary and keep at the first position 152 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 153 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 154 | size = vocab_size; 155 | train_words = 0; 156 | for (a = 0; a < size; a++) { 157 | // Words occuring less than min_count times will be discarded from the vocab 158 | if ((vocab[a].cn < min_count) && (a != 0)) { 159 | vocab_size--; 160 | free(vocab[a].word); 161 | } else { 162 | // Hash will be re-computed, as after the sorting it is not actual 163 | hash=GetWordHash(vocab[a].word); 164 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 165 | vocab_hash[hash] = a; 166 | train_words += vocab[a].cn; 167 | } 168 | } 169 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 170 | // Allocate memory for the binary tree construction 171 | for (a = 0; a < vocab_size; a++) { 172 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 173 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 174 | } 175 | } 176 | 177 | // Reduces the vocabulary by removing infrequent tokens 178 | void ReduceVocab() { 179 | int a, b = 0; 180 | unsigned int hash; 181 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 182 | vocab[b].cn = vocab[a].cn; 183 | vocab[b].word = vocab[a].word; 184 | b++; 185 | } else free(vocab[a].word); 186 | vocab_size = b; 187 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 188 | for (a = 0; a < vocab_size; a++) { 189 | // Hash will be re-computed, as it is not actual 190 | hash = GetWordHash(vocab[a].word); 191 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 192 | vocab_hash[hash] = a; 193 | } 194 | fflush(stdout); 195 | min_reduce++; 196 | } 197 | 198 | // Create binary Huffman tree using the word counts 199 | // Frequent words will have short uniqe binary codes 200 | void CreateBinaryTree() { 201 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 202 | char code[MAX_CODE_LENGTH]; 203 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 204 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 205 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 206 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 207 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 208 | pos1 = vocab_size - 1; 209 | pos2 = vocab_size; 210 | // Following algorithm constructs the Huffman tree by adding one node at a time 211 | for (a = 0; a < 
vocab_size - 1; a++) { 212 | // First, find two smallest nodes 'min1, min2' 213 | if (pos1 >= 0) { 214 | if (count[pos1] < count[pos2]) { 215 | min1i = pos1; 216 | pos1--; 217 | } else { 218 | min1i = pos2; 219 | pos2++; 220 | } 221 | } else { 222 | min1i = pos2; 223 | pos2++; 224 | } 225 | if (pos1 >= 0) { 226 | if (count[pos1] < count[pos2]) { 227 | min2i = pos1; 228 | pos1--; 229 | } else { 230 | min2i = pos2; 231 | pos2++; 232 | } 233 | } else { 234 | min2i = pos2; 235 | pos2++; 236 | } 237 | count[vocab_size + a] = count[min1i] + count[min2i]; 238 | parent_node[min1i] = vocab_size + a; 239 | parent_node[min2i] = vocab_size + a; 240 | binary[min2i] = 1; 241 | } 242 | // Now assign binary code to each vocabulary word 243 | for (a = 0; a < vocab_size; a++) { 244 | b = a; 245 | i = 0; 246 | while (1) { 247 | code[i] = binary[b]; 248 | point[i] = b; 249 | i++; 250 | b = parent_node[b]; 251 | if (b == vocab_size * 2 - 2) break; 252 | } 253 | vocab[a].codelen = i; 254 | vocab[a].point[0] = vocab_size - 2; 255 | for (b = 0; b < i; b++) { 256 | vocab[a].code[i - b - 1] = code[b]; 257 | vocab[a].point[i - b] = point[b] - vocab_size; 258 | } 259 | } 260 | free(count); 261 | free(binary); 262 | free(parent_node); 263 | } 264 | 265 | void LearnVocabFromTrainFile() { 266 | char word[MAX_STRING]; 267 | FILE *fin; 268 | long long a, i; 269 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 270 | fin = fopen(train_file, "rb"); 271 | if (fin == NULL) { 272 | printf("ERROR: training data file not found!\n"); 273 | exit(1); 274 | } 275 | vocab_size = 0; 276 | AddWordToVocab((char *)""); 277 | while (1) { 278 | ReadWord(word, fin); 279 | if (feof(fin)) break; 280 | train_words++; 281 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 282 | printf("%lldK%c", train_words / 1000, 13); 283 | fflush(stdout); 284 | } 285 | i = SearchVocab(word); 286 | if (i == -1) { 287 | a = AddWordToVocab(word); 288 | vocab[a].cn = 1; 289 | } else vocab[i].cn++; 290 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 291 | } 292 | SortVocab(); 293 | if (debug_mode > 0) { 294 | printf("Vocab size: %lld\n", vocab_size); 295 | printf("Words in train file: %lld\n", train_words); 296 | } 297 | file_size = ftell(fin); 298 | fclose(fin); 299 | } 300 | 301 | void SaveVocab() { 302 | long long i; 303 | FILE *fo = fopen(save_vocab_file, "wb"); 304 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 305 | fclose(fo); 306 | } 307 | 308 | void ReadVocab() { 309 | long long a, i = 0; 310 | char c; 311 | char word[MAX_STRING]; 312 | FILE *fin = fopen(read_vocab_file, "rb"); 313 | if (fin == NULL) { 314 | printf("Vocabulary file not found\n"); 315 | exit(1); 316 | } 317 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 318 | vocab_size = 0; 319 | while (1) { 320 | ReadWord(word, fin); 321 | if (feof(fin)) break; 322 | a = AddWordToVocab(word); 323 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); 324 | i++; 325 | } 326 | SortVocab(); 327 | if (debug_mode > 0) { 328 | printf("Vocab size: %lld\n", vocab_size); 329 | printf("Words in train file: %lld\n", train_words); 330 | } 331 | fin = fopen(train_file, "rb"); 332 | if (fin == NULL) { 333 | printf("ERROR: training data file not found!\n"); 334 | exit(1); 335 | } 336 | fseek(fin, 0, SEEK_END); 337 | file_size = ftell(fin); 338 | fclose(fin); 339 | } 340 | 341 | void InitNet() { 342 | long long a, b; 343 | unsigned long long next_random = 1; 344 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * 
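/* InitNet pairs every weight matrix with a squared-gradient accumulator (*_gdsq)
   initialised to 1e-8, i.e. the AdaGrad scheme used throughout the training code.
   A sketch of the per-weight step (alpha is the global learning rate, rsqrt an
   approximate 1/sqrt defined further down in this file):
     gdsq[i] += grad * grad;
     w[i]    += alpha * grad * rsqrt(gdsq[i]);
   The word vectors themselves are seeded uniformly in [-0.5, 0.5] / layer1_size using
   the same 25214903917 linear congruential generator as the rest of the file. */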
sizeof(real)); 345 | a = posix_memalign((void **)&syn0_gdsq, 128, (long long)vocab_size * layer1_size * sizeof(real)); 346 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} 347 | if (hs) { 348 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); 349 | if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} 350 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 351 | syn1[a * layer1_size + b] = 0; 352 | } 353 | if (negative>0) { 354 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * hidden_size * sizeof(real)); 355 | a = posix_memalign((void **)&syn1neg_gdsq, 128, (long long)vocab_size * hidden_size * sizeof(real)); 356 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 357 | for (a = 0; a < vocab_size; a++) for (b = 0; b < hidden_size; b++){ 358 | syn1neg[a * hidden_size + b] = 0; 359 | syn1neg_gdsq[a * hidden_size + b] = 1e-8; 360 | } 361 | a = posix_memalign((void **)&hidden, 128, (long long)input_size * hidden_size * sizeof(real)); 362 | a = posix_memalign((void **)&hidden_gdsq, 128, (long long)input_size * hidden_size * sizeof(real)); 363 | for (a = 0; a < input_size; a++) for (b = 0; b < hidden_size; b++) { 364 | next_random = next_random * (unsigned long long)25214903917 + 11; 365 | hidden[a * hidden_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / hidden_size; 366 | hidden_gdsq[a * hidden_size + b] = 1e-8; 367 | } 368 | } 369 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) { 370 | next_random = next_random * (unsigned long long)25214903917 + 11; 371 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 372 | syn0_gdsq[a * layer1_size + b] = 1e-8; 373 | } 374 | CreateBinaryTree(); 375 | } 376 | 377 | void writeWV(char *output_file){ 378 | long long a, b; 379 | FILE *fo = fopen(output_file, "wb"); 380 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 381 | for (a = 0; a < vocab_size; a++) { 382 | fprintf(fo, "%s ", vocab[a].word); 383 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 384 | else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 385 | fprintf(fo, "\n"); 386 | } 387 | fclose(fo); 388 | } 389 | 390 | typedef unsigned long uint64_t; 391 | typedef unsigned int uint32_t; 392 | 393 | double rsqrt64(double number) { 394 | uint64_t i; 395 | double x2, y; 396 | x2 = number * 0.5; 397 | y = number; 398 | i = *(uint64_t *) &y; 399 | i = 0x5fe6eb50c7b537a9 - (i >> 1); 400 | y = *(double *) &i; 401 | y = y * (1.5 - (x2 * y * y)); 402 | return y; 403 | } 404 | 405 | float rsqrt(float number){ 406 | uint32_t i; 407 | float x2, y; 408 | x2 = number * 0.5F; 409 | y = number; 410 | i = *(uint32_t *) &y; 411 | i = 0x5f3759df - ( i >> 1 ); 412 | y = *(float *) &i; 413 | y = y * ( 1.5F - ( x2 * y * y ) ); 414 | return y; 415 | } 416 | 417 | 418 | //b = Ax 419 | void fastmult(real *A, real *x, real *b, int xlen, int blen){ 420 | real val1, val2, val3, val4; 421 | real val5, val6, val7, val8; 422 | int i, j; 423 | for (i=0; i 10000) { 482 | loss[id] += err; 483 | lossV[id] += errV; 484 | loss_cnt[id] += err_cnt; 485 | lossV_cnt[id] += errV_cnt; 486 | 487 | sum_loss += err; 488 | sum_lossV += errV; 489 | sum_loss_cnt += err_cnt; 490 | sum_lossV_cnt += errV_cnt; 491 | 492 | err = errV = 0; 493 | err_cnt = errV_cnt = 0; 494 | word_count_actual += word_count - last_word_count; 495 | last_word_count = 
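/* rsqrt/rsqrt64 above are the classic bit-level fast inverse square root (magic
   constants 0x5f3759df and 0x5fe6eb50c7b537a9 plus one Newton-Raphson refinement),
   accurate to roughly 0.2% relative error; they stand in for 1/sqrt(x) in the AdaGrad
   updates below to keep the per-weight step cheap. fastmult computes the matrix-vector
   product b = A x used for the hidden layer (neu2 = hidden * neu1). */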
word_count; 496 | if ((debug_mode > 1) && id == 0) { 497 | now=clock(); 498 | printf("%cAlpha: %f Err: %lf ErrV: %lf Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 499 | -sum_loss / sum_loss_cnt / (negative + 1), 500 | -sum_lossV / sum_lossV_cnt / (negative + 1), 501 | word_count_actual / (real)(train_words + 1) * 100, 502 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 503 | fflush(stdout); 504 | } 505 | //alpha = starting_alpha; 506 | //alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 507 | //if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 508 | } 509 | if (sentence_length == 0) { 510 | while (1) { 511 | word = ReadWordIndex(fi); 512 | if (feof(fi)) break; 513 | if (word == -1) continue; 514 | word_count++; 515 | if (word == 0) break; 516 | // The subsampling randomly discards frequent words while keeping the ranking same 517 | if (sample > 0) { 518 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 519 | next_random = next_random * (unsigned long long)25214903917 + 11; 520 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 521 | } 522 | sen[sentence_length] = word; 523 | sentence_length++; 524 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 525 | } 526 | sentence_position = 0; 527 | } 528 | int validSet = 0; 529 | if(word_count > (int)(0.95 * train_words / num_threads)) 530 | validSet = 1; 531 | if (feof(fi) || (word_count > train_words / num_threads)) { 532 | word_count_actual += word_count - last_word_count; 533 | break; 534 | } 535 | word = sen[sentence_position]; 536 | if (word == -1) continue; 537 | //for (c = 0; c < input_size; c++) neu1[c] = 0; 538 | for (c = 0; c < input_size; c++) neu1e[c] = 0; 539 | for (c = 0; c < hidden_size; c++) neu2[c] = 0; 540 | for (c = 0; c < hidden_size; c++) neu2e[c] = 0; 541 | next_random = next_random * (unsigned long long)25214903917 + 11; 542 | //b = next_random % window; 543 | b = 0; 544 | if (cbow) { //train the cbow architecture 545 | // in -> hidden 546 | cw = 0; 547 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 548 | c = sentence_position - window + a; 549 | last_word = sen[c]; 550 | if (c < 0) last_word = 0; 551 | if (c >= sentence_length) last_word = 0; 552 | if (last_word == -1) last_word = 0; 553 | for (c = 0; c < layer1_size; c++) neu1[cw * layer1_size + c] = syn0[c + last_word * layer1_size]; 554 | cw++; 555 | } 556 | fastmult(hidden, neu1, neu2, input_size, hidden_size); 557 | for(a = 0; a < hidden_size; a++) 558 | neu2[a] = tanh(neu2[a]); 559 | if (cw == window * 2) { 560 | //for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 561 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 562 | f = 0; 563 | l2 = vocab[word].point[d] * layer1_size; 564 | // Propagate hidden -> output 565 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 566 | if (f <= -MAX_EXP) continue; 567 | else if (f >= MAX_EXP) continue; 568 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 569 | // 'g' is the gradient multiplied by the learning rate 570 | g = (1 - vocab[word].code[d] - f) * alpha; 571 | // Propagate errors output -> hidden 572 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 573 | // Learn weights hidden -> output 574 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 575 | } 576 | // NEGATIVE SAMPLING 577 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 578 | if (d == 0) { 579 | target = word; 
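/* Negative targets in this loop are drawn from the precomputed unigram table
   (InitUnigramTable): each word w fills a share of the 1e8 slots proportional to
   count(w)^0.75, so P(w is sampled) ~ count(w)^0.75 / sum_v count(v)^0.75, a
   flattened version of the raw unigram distribution. A draw of index 0 (the
   sentence-boundary token added first to the vocabulary) is remapped to a random
   non-zero word, and draws equal to the positive target are skipped. */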
580 | label = 1; 581 | } else { 582 | next_random = next_random * (unsigned long long)25214903917 + 11; 583 | target = table[(next_random >> 16) % table_size]; 584 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 585 | if (target == word) continue; 586 | label = 0; 587 | } 588 | l2 = target * hidden_size; 589 | f = 0; 590 | for (c = 0; c < hidden_size; c++) f += neu2[c] * syn1neg[c + l2]; 591 | if (f > MAX_EXP) g = (label - 1); 592 | else if (f < -MAX_EXP) g = (label - 0); 593 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 594 | 595 | if(label) f = -f; 596 | if(validSet) errV += log(1/(1+exp(f))); 597 | else err += log(1/(1+exp(f))); 598 | 599 | //for (c = 0; c < input_size; c++) neu1e[c] += g * syn1neg[c + l2]; 600 | //for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 601 | if(!validSet) for (c = 0; c < hidden_size; c++) { 602 | neu2e[c] += g * syn1neg[c + l2] * (1-neu2[c]*neu2[c]); 603 | real diff = g * neu2[c]; 604 | syn1neg_gdsq[c + l2] += diff * diff; 605 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 606 | // 607 | } 608 | } 609 | 610 | if(validSet) errV_cnt++; 611 | else err_cnt++; 612 | // hidden -> in 613 | if(!validSet) { 614 | long long i, j; 615 | for(i = 0; i < hidden_size; i++){ 616 | for(j = 0; j < input_size; j++){ 617 | neu1e[j] += neu2e[i] * hidden[i*input_size+j]; 618 | } 619 | } 620 | 621 | for(i = 0; i < hidden_size; i++){ 622 | for(j = 0; j < input_size; j++){ 623 | int t = i*input_size+j; 624 | real diff = neu1[j] * neu2e[i]; 625 | hidden_gdsq[t] += diff * diff; 626 | hidden[t] += alpha * diff * rsqrt64(hidden_gdsq[t]); 627 | } 628 | } 629 | cw = 0; 630 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 631 | c = sentence_position - window + a; 632 | last_word = sen[c]; 633 | if (c < 0) last_word = 0; 634 | if (c >= sentence_length) last_word = 0; 635 | if (last_word == -1) last_word = 0; 636 | //for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 637 | for (c = 0; c < layer1_size; c++) { 638 | real diff = neu1e[cw * layer1_size + c]; 639 | long long p = c + last_word * layer1_size; 640 | syn0_gdsq[p] += diff * diff; 641 | syn0[p] += alpha * diff * rsqrt(syn0_gdsq[p]); 642 | } 643 | cw++; 644 | } 645 | } 646 | } 647 | } else { //train skip-gram 648 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 649 | c = sentence_position - window + a; 650 | if (c < 0) continue; 651 | if (c >= sentence_length) continue; 652 | last_word = sen[c]; 653 | if (last_word == -1) continue; 654 | l1 = last_word * layer1_size; 655 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 656 | // HIERARCHICAL SOFTMAX 657 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 658 | f = 0; 659 | l2 = vocab[word].point[d] * layer1_size; 660 | // Propagate hidden -> output 661 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; 662 | if (f <= -MAX_EXP) continue; 663 | else if (f >= MAX_EXP) continue; 664 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 665 | // 'g' is the gradient multiplied by the learning rate 666 | g = (1 - vocab[word].code[d] - f) * alpha; 667 | // Propagate errors output -> hidden 668 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 669 | // Learn weights hidden -> output 670 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; 671 | } 672 | // NEGATIVE SAMPLING 673 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 674 | if (d == 0) { 675 | target = word; 676 
| label = 1; 677 | } else { 678 | next_random = next_random * (unsigned long long)25214903917 + 11; 679 | target = table[(next_random >> 16) % table_size]; 680 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 681 | if (target == word) continue; 682 | label = 0; 683 | } 684 | l2 = target * layer1_size; 685 | f = 0; 686 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 687 | if (f > MAX_EXP) g = (label - 1); 688 | else if (f < -MAX_EXP) g = (label - 0); 689 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 690 | 691 | if(label) f = -f; 692 | if(validSet) errV += log(1/(1+exp(f))); 693 | else err += log(1/(1+exp(f))); 694 | 695 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 696 | for (c = 0; c < layer1_size; c++) { 697 | real diff = g * syn0[c + l1]; 698 | syn1neg_gdsq[c + l2] += diff * diff; 699 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 700 | // 701 | } 702 | } 703 | // Learn weights input -> hidden 704 | for (c = 0; c < layer1_size; c++) { 705 | real diff = neu1e[c]; 706 | syn0_gdsq[c + l1] += diff * diff; 707 | syn0[c + l1] += alpha * diff * rsqrt(syn0_gdsq[c + l1]); 708 | } 709 | if(validSet) errV_cnt++; 710 | else err_cnt++; 711 | } 712 | 713 | } 714 | sentence_position++; 715 | if (sentence_position >= sentence_length) { 716 | sentence_length = 0; 717 | continue; 718 | } 719 | } 720 | fclose(fi); 721 | free(neu1); 722 | free(neu1e); 723 | pthread_exit(NULL); 724 | } 725 | 726 | 727 | void TrainModel() { 728 | long a, b, c, d; 729 | FILE *fo; 730 | char ffname[100]; 731 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 732 | loss = (double *)malloc(num_threads * sizeof(double)); 733 | lossV = (double *)malloc(num_threads * sizeof(double)); 734 | loss_cnt = (long long *)malloc(num_threads * sizeof(long long)); 735 | lossV_cnt = (long long *)malloc(num_threads * sizeof(long long)); 736 | 737 | printf("Starting training using file %s\n", train_file); 738 | starting_alpha = alpha; 739 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 740 | if (save_vocab_file[0] != 0) SaveVocab(); 741 | if (output_file[0] == 0) return; 742 | InitNet(); 743 | if (negative > 0) InitUnigramTable(); 744 | 745 | for(b = 1; b <= iter; b++){ 746 | start = clock(); 747 | word_count_actual = 0; 748 | for (a = 0; a < num_threads; a++){ 749 | loss[a] = lossV[a] = 0; 750 | loss_cnt[a] = lossV_cnt[a] = 0; 751 | } 752 | sum_loss = sum_lossV = 0; 753 | sum_loss_cnt = sum_lossV_cnt = 0; 754 | 755 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 756 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 757 | 758 | sprintf(ffname, "%s_%ld", output_file, b); 759 | writeWV(ffname); 760 | printf("%c", 13); 761 | 762 | sum_loss = sum_lossV = 0; 763 | sum_loss_cnt = sum_lossV_cnt = 0; 764 | for (a = 0; a < num_threads; a++){ 765 | sum_loss += loss[a]; 766 | sum_lossV += lossV[a]; 767 | sum_loss_cnt += loss_cnt[a]; 768 | sum_lossV_cnt += lossV_cnt[a]; 769 | } 770 | fprintf(stderr, "Iter: %ld Err: %lf ErrV: %lf\n", b, 771 | -sum_loss / sum_loss_cnt / (negative + 1), 772 | -sum_lossV / sum_lossV_cnt / (negative + 1)); 773 | fflush(stderr); 774 | } 775 | 776 | if (classes == 0) { 777 | // Save the word vectors 778 | //writeWV(output_file); 779 | } else { 780 | fo = fopen(output_file, "wb"); 781 | // Run K-means on the word vectors 782 | int clcn = classes, iter = 10, closeid; 783 | int *centcn = (int *)malloc(classes 
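/* Training bookkeeping above: each thread treats the last 5% of its share of the
   corpus as a held-out set (validSet), accumulating errV instead of err and, in the
   CBOW path, skipping the gradient updates for those words. TrainModel then reports
   the averaged training loss (Err) and validation loss (ErrV) once per iteration and
   writes the current vectors to "<output_file>_<iteration>" via writeWV, so every
   pass leaves an embedding snapshot that can be evaluated separately. */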
* sizeof(int)); 784 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 785 | real closev, x; 786 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 787 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 788 | for (a = 0; a < iter; a++) { 789 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 790 | for (b = 0; b < clcn; b++) centcn[b] = 1; 791 | for (c = 0; c < vocab_size; c++) { 792 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 793 | centcn[cl[c]]++; 794 | } 795 | for (b = 0; b < clcn; b++) { 796 | closev = 0; 797 | for (c = 0; c < layer1_size; c++) { 798 | cent[layer1_size * b + c] /= centcn[b]; 799 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 800 | } 801 | closev = sqrt(closev); 802 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 803 | } 804 | for (c = 0; c < vocab_size; c++) { 805 | closev = -10; 806 | closeid = 0; 807 | for (d = 0; d < clcn; d++) { 808 | x = 0; 809 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 810 | if (x > closev) { 811 | closev = x; 812 | closeid = d; 813 | } 814 | } 815 | cl[c] = closeid; 816 | } 817 | } 818 | // Save the K-means classes 819 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 820 | free(centcn); 821 | free(cent); 822 | free(cl); 823 | fclose(fo); 824 | } 825 | 826 | } 827 | 828 | int ArgPos(char *str, int argc, char **argv) { 829 | int a; 830 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 831 | if (a == argc - 1) { 832 | printf("Argument missing for %s\n", str); 833 | exit(1); 834 | } 835 | return a; 836 | } 837 | return -1; 838 | } 839 | 840 | int main(int argc, char **argv) { 841 | int i; 842 | if (argc == 1) { 843 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 844 | printf("Options:\n"); 845 | printf("Parameters for training:\n"); 846 | printf("\t-train \n"); 847 | printf("\t\tUse text data from to train the model\n"); 848 | printf("\t-output \n"); 849 | printf("\t\tUse to save the resulting word vectors / word clusters\n"); 850 | printf("\t-size \n"); 851 | printf("\t\tSet size of word vectors; default is 100\n"); 852 | printf("\t-window \n"); 853 | printf("\t\tSet max skip length between words; default is 5\n"); 854 | printf("\t-sample \n"); 855 | printf("\t\tSet threshold for occurrence of words. 
Those that appear with higher frequency in the training data\n"); 856 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n"); 857 | printf("\t-hs \n"); 858 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 859 | printf("\t-negative \n"); 860 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n"); 861 | printf("\t-threads \n"); 862 | printf("\t\tUse threads (default 12)\n"); 863 | printf("\t-iter \n"); 864 | printf("\t\tRun more training iterations (default 5)\n"); 865 | printf("\t-min-count \n"); 866 | printf("\t\tThis will discard words that appear less than times; default is 5\n"); 867 | printf("\t-alpha \n"); 868 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n"); 869 | printf("\t-classes \n"); 870 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 871 | printf("\t-debug \n"); 872 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 873 | printf("\t-binary \n"); 874 | printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"); 875 | printf("\t-save-vocab \n"); 876 | printf("\t\tThe vocabulary will be saved to \n"); 877 | printf("\t-read-vocab \n"); 878 | printf("\t\tThe vocabulary will be read from , not constructed from the training data\n"); 879 | printf("\t-cbow \n"); 880 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n"); 881 | printf("\nExamples:\n"); 882 | printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 883 | return 0; 884 | } 885 | output_file[0] = 0; 886 | save_vocab_file[0] = 0; 887 | read_vocab_file[0] = 0; 888 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 889 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 890 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 891 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 892 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 893 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 894 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 895 | if (cbow) alpha = 0.05; 896 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 897 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 898 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 899 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 900 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 901 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 902 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 903 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 904 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 905 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 906 | input_size = layer1_size * window * 2; 907 | hidden_size = layer1_size; 908 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct 
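/* The expTable filled just below is a 1000-entry lookup table for the logistic
   function: entry i stores sigma(x_i) with x_i = (i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP,
   so x_i sweeps [-MAX_EXP, MAX_EXP). The training loops read it back with
     expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
   which approximates sigma(f) to table resolution; callers clamp or skip any f outside
   [-MAX_EXP, MAX_EXP] first. */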
vocab_word)); 909 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 910 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 911 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 912 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 913 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 914 | } 915 | TrainModel(); 916 | return 0; 917 | } 918 | -------------------------------------------------------------------------------- /embedding/order.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define MAX_STRING 100 22 | #define EXP_TABLE_SIZE 1000 23 | #define MAX_EXP 6 24 | #define MAX_SENTENCE_LENGTH 1000 25 | #define MAX_CODE_LENGTH 40 26 | 27 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 28 | 29 | typedef float real; // Precision of float numbers 30 | 31 | struct vocab_word { 32 | long long cn; 33 | int *point; 34 | char *word, *code, codelen; 35 | }; 36 | 37 | char train_file[MAX_STRING], output_file[MAX_STRING]; 38 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 39 | struct vocab_word *vocab; 40 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1; 41 | int *vocab_hash; 42 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100, input_size = 500; 43 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0; 44 | real alpha = 0.025, starting_alpha, sample = 1e-3; 45 | real *syn0, *syn1, *syn1neg, *expTable, *syn1neg_gdsq, *syn0_gdsq; 46 | double *loss, *lossV, sum_loss, sum_lossV; 47 | long long *loss_cnt, *lossV_cnt, sum_loss_cnt, sum_lossV_cnt; 48 | 49 | clock_t start; 50 | 51 | int hs = 0, negative = 5; 52 | const int table_size = 1e8; 53 | int *table; 54 | 55 | void InitUnigramTable() { 56 | int a, i; 57 | long long train_words_pow = 0; 58 | real d1, power = 0.75; 59 | table = (int *)malloc(table_size * sizeof(int)); 60 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 61 | i = 0; 62 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow; 63 | for (a = 0; a < table_size; a++) { 64 | table[a] = i; 65 | if (a / (real)table_size > d1) { 66 | i++; 67 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow; 68 | } 69 | if (i >= vocab_size) i = vocab_size - 1; 70 | } 71 | } 72 | 73 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 74 | void ReadWord(char *word, FILE *fin) { 75 | int a = 0, ch; 76 | while (!feof(fin)) { 77 | ch = fgetc(fin); 78 | if (ch == 13) continue; 79 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 80 | if (a > 0) { 81 | if (ch == '\n') ungetc(ch, fin); 82 | break; 83 | } 84 | if (ch == '\n') 
{ 85 | strcpy(word, (char *)""); 86 | return; 87 | } else continue; 88 | } 89 | word[a] = ch; 90 | a++; 91 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 92 | } 93 | word[a] = 0; 94 | } 95 | 96 | // Returns hash value of a word 97 | int GetWordHash(char *word) { 98 | unsigned long long a, hash = 0; 99 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 100 | hash = hash % vocab_hash_size; 101 | return hash; 102 | } 103 | 104 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 105 | int SearchVocab(char *word) { 106 | unsigned int hash = GetWordHash(word); 107 | while (1) { 108 | if (vocab_hash[hash] == -1) return -1; 109 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 110 | hash = (hash + 1) % vocab_hash_size; 111 | } 112 | return -1; 113 | } 114 | 115 | // Reads a word and returns its index in the vocabulary 116 | int ReadWordIndex(FILE *fin) { 117 | char word[MAX_STRING]; 118 | ReadWord(word, fin); 119 | if (feof(fin)) return -1; 120 | return SearchVocab(word); 121 | } 122 | 123 | // Adds a word to the vocabulary 124 | int AddWordToVocab(char *word) { 125 | unsigned int hash, length = strlen(word) + 1; 126 | if (length > MAX_STRING) length = MAX_STRING; 127 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 128 | strcpy(vocab[vocab_size].word, word); 129 | vocab[vocab_size].cn = 0; 130 | vocab_size++; 131 | // Reallocate memory if needed 132 | if (vocab_size + 2 >= vocab_max_size) { 133 | vocab_max_size += 1000; 134 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 135 | } 136 | hash = GetWordHash(word); 137 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 138 | vocab_hash[hash] = vocab_size - 1; 139 | return vocab_size - 1; 140 | } 141 | 142 | // Used later for sorting by word counts 143 | int VocabCompare(const void *a, const void *b) { 144 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 145 | } 146 | 147 | // Sorts the vocabulary by frequency using word counts 148 | void SortVocab() { 149 | int a, size; 150 | unsigned int hash; 151 | // Sort the vocabulary and keep at the first position 152 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 153 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 154 | size = vocab_size; 155 | train_words = 0; 156 | for (a = 0; a < size; a++) { 157 | // Words occuring less than min_count times will be discarded from the vocab 158 | if ((vocab[a].cn < min_count) && (a != 0)) { 159 | vocab_size--; 160 | free(vocab[a].word); 161 | } else { 162 | // Hash will be re-computed, as after the sorting it is not actual 163 | hash=GetWordHash(vocab[a].word); 164 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 165 | vocab_hash[hash] = a; 166 | train_words += vocab[a].cn; 167 | } 168 | } 169 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 170 | // Allocate memory for the binary tree construction 171 | for (a = 0; a < vocab_size; a++) { 172 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 173 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 174 | } 175 | } 176 | 177 | // Reduces the vocabulary by removing infrequent tokens 178 | void ReduceVocab() { 179 | int a, b = 0; 180 | unsigned int hash; 181 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 182 | vocab[b].cn = vocab[a].cn; 183 | vocab[b].word = vocab[a].word; 184 | b++; 185 | } 
else free(vocab[a].word); 186 | vocab_size = b; 187 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 188 | for (a = 0; a < vocab_size; a++) { 189 | // Hash will be re-computed, as it is not actual 190 | hash = GetWordHash(vocab[a].word); 191 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 192 | vocab_hash[hash] = a; 193 | } 194 | fflush(stdout); 195 | min_reduce++; 196 | } 197 | 198 | // Create binary Huffman tree using the word counts 199 | // Frequent words will have short uniqe binary codes 200 | void CreateBinaryTree() { 201 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 202 | char code[MAX_CODE_LENGTH]; 203 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 204 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 205 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 206 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 207 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 208 | pos1 = vocab_size - 1; 209 | pos2 = vocab_size; 210 | // Following algorithm constructs the Huffman tree by adding one node at a time 211 | for (a = 0; a < vocab_size - 1; a++) { 212 | // First, find two smallest nodes 'min1, min2' 213 | if (pos1 >= 0) { 214 | if (count[pos1] < count[pos2]) { 215 | min1i = pos1; 216 | pos1--; 217 | } else { 218 | min1i = pos2; 219 | pos2++; 220 | } 221 | } else { 222 | min1i = pos2; 223 | pos2++; 224 | } 225 | if (pos1 >= 0) { 226 | if (count[pos1] < count[pos2]) { 227 | min2i = pos1; 228 | pos1--; 229 | } else { 230 | min2i = pos2; 231 | pos2++; 232 | } 233 | } else { 234 | min2i = pos2; 235 | pos2++; 236 | } 237 | count[vocab_size + a] = count[min1i] + count[min2i]; 238 | parent_node[min1i] = vocab_size + a; 239 | parent_node[min2i] = vocab_size + a; 240 | binary[min2i] = 1; 241 | } 242 | // Now assign binary code to each vocabulary word 243 | for (a = 0; a < vocab_size; a++) { 244 | b = a; 245 | i = 0; 246 | while (1) { 247 | code[i] = binary[b]; 248 | point[i] = b; 249 | i++; 250 | b = parent_node[b]; 251 | if (b == vocab_size * 2 - 2) break; 252 | } 253 | vocab[a].codelen = i; 254 | vocab[a].point[0] = vocab_size - 2; 255 | for (b = 0; b < i; b++) { 256 | vocab[a].code[i - b - 1] = code[b]; 257 | vocab[a].point[i - b] = point[b] - vocab_size; 258 | } 259 | } 260 | free(count); 261 | free(binary); 262 | free(parent_node); 263 | } 264 | 265 | void LearnVocabFromTrainFile() { 266 | char word[MAX_STRING]; 267 | FILE *fin; 268 | long long a, i; 269 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 270 | fin = fopen(train_file, "rb"); 271 | if (fin == NULL) { 272 | printf("ERROR: training data file not found!\n"); 273 | exit(1); 274 | } 275 | vocab_size = 0; 276 | AddWordToVocab((char *)""); 277 | while (1) { 278 | ReadWord(word, fin); 279 | if (feof(fin)) break; 280 | train_words++; 281 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 282 | printf("%lldK%c", train_words / 1000, 13); 283 | fflush(stdout); 284 | } 285 | i = SearchVocab(word); 286 | if (i == -1) { 287 | a = AddWordToVocab(word); 288 | vocab[a].cn = 1; 289 | } else vocab[i].cn++; 290 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 291 | } 292 | SortVocab(); 293 | if (debug_mode > 0) { 294 | printf("Vocab size: %lld\n", vocab_size); 295 | printf("Words in train file: %lld\n", train_words); 296 | } 297 | file_size = ftell(fin); 298 | fclose(fin); 299 | } 300 | 301 | void SaveVocab() { 302 | long long i; 303 | FILE 
*fo = fopen(save_vocab_file, "wb"); 304 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 305 | fclose(fo); 306 | } 307 | 308 | void ReadVocab() { 309 | long long a, i = 0; 310 | char c; 311 | char word[MAX_STRING]; 312 | FILE *fin = fopen(read_vocab_file, "rb"); 313 | if (fin == NULL) { 314 | printf("Vocabulary file not found\n"); 315 | exit(1); 316 | } 317 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 318 | vocab_size = 0; 319 | while (1) { 320 | ReadWord(word, fin); 321 | if (feof(fin)) break; 322 | a = AddWordToVocab(word); 323 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); 324 | i++; 325 | } 326 | SortVocab(); 327 | if (debug_mode > 0) { 328 | printf("Vocab size: %lld\n", vocab_size); 329 | printf("Words in train file: %lld\n", train_words); 330 | } 331 | fin = fopen(train_file, "rb"); 332 | if (fin == NULL) { 333 | printf("ERROR: training data file not found!\n"); 334 | exit(1); 335 | } 336 | fseek(fin, 0, SEEK_END); 337 | file_size = ftell(fin); 338 | fclose(fin); 339 | } 340 | 341 | void InitNet() { 342 | long long a, b; 343 | unsigned long long next_random = 1; 344 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)); 345 | a = posix_memalign((void **)&syn0_gdsq, 128, (long long)vocab_size * layer1_size * sizeof(real)); 346 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} 347 | if (hs) { 348 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); 349 | if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} 350 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 351 | syn1[a * layer1_size + b] = 0; 352 | } 353 | if (negative>0) { 354 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * input_size * sizeof(real)); 355 | a = posix_memalign((void **)&syn1neg_gdsq, 128, (long long)vocab_size * input_size * sizeof(real)); 356 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 357 | for (a = 0; a < vocab_size; a++) for (b = 0; b < input_size; b++){ 358 | syn1neg[a * input_size + b] = 0; 359 | syn1neg_gdsq[a * input_size + b] = 1e-8; 360 | } 361 | 362 | } 363 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) { 364 | next_random = next_random * (unsigned long long)25214903917 + 11; 365 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 366 | syn0_gdsq[a * layer1_size + b] = 1e-8; 367 | } 368 | CreateBinaryTree(); 369 | } 370 | 371 | void writeWV(char *output_file){ 372 | long long a, b; 373 | FILE *fo = fopen(output_file, "wb"); 374 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 375 | for (a = 0; a < vocab_size; a++) { 376 | fprintf(fo, "%s ", vocab[a].word); 377 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 378 | else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 379 | fprintf(fo, "\n"); 380 | } 381 | fclose(fo); 382 | } 383 | 384 | typedef unsigned long uint64_t; 385 | typedef unsigned int uint32_t; 386 | 387 | double rsqrt64(double number) { 388 | uint64_t i; 389 | double x2, y; 390 | x2 = number * 0.5; 391 | y = number; 392 | i = *(uint64_t *) &y; 393 | i = 0x5fe6eb50c7b537a9 - (i >> 1); 394 | y = *(double *) &i; 395 | y = y * (1.5 - (x2 * y * y)); 396 | return y; 397 | } 398 | 399 | float rsqrt(float number){ 400 | uint32_t i; 401 | float x2, y; 402 | x2 = number * 0.5F; 403 | y = number; 404 | i = *(uint32_t *) 
&y; 405 | i = 0x5f3759df - ( i >> 1 ); 406 | y = *(float *) &i; 407 | y = y * ( 1.5F - ( x2 * y * y ) ); 408 | return y; 409 | } 410 | 411 | void *TrainModelThread(void *_id) { 412 | long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0; 413 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 414 | long long l1, l2, c, target, label; 415 | long long id = (long long)_id; 416 | unsigned long long next_random = id + clock(); //must add ``clock'' if using this kind of iter. 417 | real f, g; 418 | clock_t now; 419 | real *neu1 = (real *)calloc(input_size, sizeof(real)); 420 | real *neu1e = (real *)calloc(input_size, sizeof(real)); 421 | double err = 0, errV = 0; //loss 422 | long long err_cnt = 0, errV_cnt = 0; 423 | 424 | FILE *fi = fopen(train_file, "rb"); 425 | fseek(fi, file_size / (long long)num_threads * id, SEEK_SET); 426 | while (1) { 427 | if (word_count - last_word_count > 10000) { 428 | loss[id] += err; 429 | lossV[id] += errV; 430 | loss_cnt[id] += err_cnt; 431 | lossV_cnt[id] += errV_cnt; 432 | 433 | sum_loss += err; 434 | sum_lossV += errV; 435 | sum_loss_cnt += err_cnt; 436 | sum_lossV_cnt += errV_cnt; 437 | 438 | err = errV = 0; 439 | err_cnt = errV_cnt = 0; 440 | word_count_actual += word_count - last_word_count; 441 | last_word_count = word_count; 442 | if ((debug_mode > 1) && id == 0) { 443 | now=clock(); 444 | printf("%cAlpha: %f Err: %lf ErrV: %lf Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 445 | -sum_loss / sum_loss_cnt / (negative + 1), 446 | -sum_lossV / sum_lossV_cnt / (negative + 1), 447 | word_count_actual / (real)(train_words + 1) * 100, 448 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 449 | fflush(stdout); 450 | } 451 | //alpha = starting_alpha; 452 | //alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 453 | //if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 454 | } 455 | if (sentence_length == 0) { 456 | while (1) { 457 | word = ReadWordIndex(fi); 458 | if (feof(fi)) break; 459 | if (word == -1) continue; 460 | word_count++; 461 | if (word == 0) break; 462 | // The subsampling randomly discards frequent words while keeping the ranking same 463 | if (sample > 0) { 464 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 465 | next_random = next_random * (unsigned long long)25214903917 + 11; 466 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 467 | } 468 | sen[sentence_length] = word; 469 | sentence_length++; 470 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 471 | } 472 | sentence_position = 0; 473 | } 474 | int validSet = 0; 475 | if(word_count > (int)(0.95 * train_words / num_threads)) 476 | validSet = 1; 477 | if (feof(fi) || (word_count > train_words / num_threads)) { 478 | word_count_actual += word_count - last_word_count; 479 | break; 480 | } 481 | word = sen[sentence_position]; 482 | if (word == -1) continue; 483 | //for (c = 0; c < input_size; c++) neu1[c] = 0; 484 | for (c = 0; c < input_size; c++) neu1e[c] = 0; 485 | next_random = next_random * (unsigned long long)25214903917 + 11; 486 | //b = next_random % window; 487 | b = 0; 488 | if (cbow) { //train the cbow architecture 489 | // in -> hidden 490 | cw = 0; 491 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 492 | c = sentence_position - window + a; 493 | last_word = sen[c]; 494 | if (c < 0) last_word = 0; 495 | if (c >= sentence_length) last_word = 0; 496 | if 
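/* In this variant the context window is not averaged: the loop below copies each of
   the 2*window context vectors into its own slot of neu1 (neu1[cw*layer1_size + c]),
   so the input is the order-preserving concatenation of the window, and the
   negative-sampling output vectors (syn1neg) are allocated with
   input_size = layer1_size * window * 2 components to match. The score is then a
   direct dot product in that concatenated space, with no tanh hidden layer. */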
(last_word == -1) last_word = 0; 497 | for (c = 0; c < layer1_size; c++) neu1[cw * layer1_size + c] = syn0[c + last_word * layer1_size]; 498 | cw++; 499 | } 500 | if (cw == window * 2) { 501 | //for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 502 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 503 | f = 0; 504 | l2 = vocab[word].point[d] * layer1_size; 505 | // Propagate hidden -> output 506 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 507 | if (f <= -MAX_EXP) continue; 508 | else if (f >= MAX_EXP) continue; 509 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 510 | // 'g' is the gradient multiplied by the learning rate 511 | g = (1 - vocab[word].code[d] - f) * alpha; 512 | // Propagate errors output -> hidden 513 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 514 | // Learn weights hidden -> output 515 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 516 | } 517 | // NEGATIVE SAMPLING 518 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 519 | if (d == 0) { 520 | target = word; 521 | label = 1; 522 | } else { 523 | next_random = next_random * (unsigned long long)25214903917 + 11; 524 | target = table[(next_random >> 16) % table_size]; 525 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 526 | if (target == word) continue; 527 | label = 0; 528 | } 529 | l2 = target * input_size; 530 | f = 0; 531 | for (c = 0; c < input_size; c++) f += neu1[c] * syn1neg[c + l2]; 532 | if (f > MAX_EXP) g = (label - 1); 533 | else if (f < -MAX_EXP) g = (label - 0); 534 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 535 | 536 | if(label) f = -f; 537 | if(validSet) errV += log(1/(1+exp(f))); 538 | else err += log(1/(1+exp(f))); 539 | 540 | for (c = 0; c < input_size; c++) neu1e[c] += g * syn1neg[c + l2]; 541 | //for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 542 | if(!validSet) for (c = 0; c < input_size; c++) { 543 | real diff = g * neu1[c]; 544 | syn1neg_gdsq[c + l2] += diff * diff; 545 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 546 | // 547 | } 548 | } 549 | 550 | if(validSet) errV_cnt++; 551 | else err_cnt++; 552 | // hidden -> in 553 | cw = 0; 554 | if(!validSet) for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 555 | c = sentence_position - window + a; 556 | last_word = sen[c]; 557 | if (c < 0) last_word = 0; 558 | if (c >= sentence_length) last_word = 0; 559 | if (last_word == -1) last_word = 0; 560 | //for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 561 | for (c = 0; c < layer1_size; c++) { 562 | real diff = neu1e[cw * layer1_size + c]; 563 | long long p = c + last_word * layer1_size; 564 | syn0_gdsq[p] += diff * diff; 565 | syn0[p] += alpha * diff * rsqrt(syn0_gdsq[p]); 566 | } 567 | cw++; 568 | } 569 | } 570 | } else { //train skip-gram 571 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 572 | c = sentence_position - window + a; 573 | if (c < 0) continue; 574 | if (c >= sentence_length) continue; 575 | last_word = sen[c]; 576 | if (last_word == -1) continue; 577 | l1 = last_word * layer1_size; 578 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 579 | // HIERARCHICAL SOFTMAX 580 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 581 | f = 0; 582 | l2 = vocab[word].point[d] * layer1_size; 583 | // Propagate hidden -> output 584 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; 585 | if (f <= -MAX_EXP) continue; 586 | else if (f >= 
MAX_EXP) continue; 587 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 588 | // 'g' is the gradient multiplied by the learning rate 589 | g = (1 - vocab[word].code[d] - f) * alpha; 590 | // Propagate errors output -> hidden 591 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 592 | // Learn weights hidden -> output 593 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; 594 | } 595 | // NEGATIVE SAMPLING 596 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 597 | if (d == 0) { 598 | target = word; 599 | label = 1; 600 | } else { 601 | next_random = next_random * (unsigned long long)25214903917 + 11; 602 | target = table[(next_random >> 16) % table_size]; 603 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 604 | if (target == word) continue; 605 | label = 0; 606 | } 607 | l2 = target * layer1_size; 608 | f = 0; 609 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 610 | if (f > MAX_EXP) g = (label - 1); 611 | else if (f < -MAX_EXP) g = (label - 0); 612 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 613 | 614 | if(label) f = -f; 615 | if(validSet) errV += log(1/(1+exp(f))); 616 | else err += log(1/(1+exp(f))); 617 | 618 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 619 | for (c = 0; c < layer1_size; c++) { 620 | real diff = g * syn0[c + l1]; 621 | syn1neg_gdsq[c + l2] += diff * diff; 622 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 623 | // 624 | } 625 | } 626 | // Learn weights input -> hidden 627 | for (c = 0; c < layer1_size; c++) { 628 | real diff = neu1e[c]; 629 | syn0_gdsq[c + l1] += diff * diff; 630 | syn0[c + l1] += alpha * diff * rsqrt(syn0_gdsq[c + l1]); 631 | } 632 | if(validSet) errV_cnt++; 633 | else err_cnt++; 634 | } 635 | 636 | } 637 | sentence_position++; 638 | if (sentence_position >= sentence_length) { 639 | sentence_length = 0; 640 | continue; 641 | } 642 | } 643 | fclose(fi); 644 | free(neu1); 645 | free(neu1e); 646 | pthread_exit(NULL); 647 | } 648 | 649 | 650 | void TrainModel() { 651 | long a, b, c, d; 652 | FILE *fo; 653 | char ffname[100]; 654 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 655 | loss = (double *)malloc(num_threads * sizeof(double)); 656 | lossV = (double *)malloc(num_threads * sizeof(double)); 657 | loss_cnt = (long long *)malloc(num_threads * sizeof(long long)); 658 | lossV_cnt = (long long *)malloc(num_threads * sizeof(long long)); 659 | 660 | printf("Starting training using file %s\n", train_file); 661 | starting_alpha = alpha; 662 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 663 | if (save_vocab_file[0] != 0) SaveVocab(); 664 | if (output_file[0] == 0) return; 665 | InitNet(); 666 | if (negative > 0) InitUnigramTable(); 667 | 668 | for(b = 1; b <= iter; b++){ 669 | start = clock(); 670 | word_count_actual = 0; 671 | for (a = 0; a < num_threads; a++){ 672 | loss[a] = lossV[a] = 0; 673 | loss_cnt[a] = lossV_cnt[a] = 0; 674 | } 675 | sum_loss = sum_lossV = 0; 676 | sum_loss_cnt = sum_lossV_cnt = 0; 677 | 678 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 679 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 680 | 681 | sprintf(ffname, "%s_%ld", output_file, b); 682 | writeWV(ffname); 683 | printf("%c", 13); 684 | 685 | sum_loss = sum_lossV = 0; 686 | sum_loss_cnt = sum_lossV_cnt = 0; 687 | for (a = 0; a < num_threads; a++){ 688 | sum_loss += 
loss[a]; 689 | sum_lossV += lossV[a]; 690 | sum_loss_cnt += loss_cnt[a]; 691 | sum_lossV_cnt += lossV_cnt[a]; 692 | } 693 | fprintf(stderr, "Iter: %ld Err: %lf ErrV: %lf\n", b, 694 | -sum_loss / sum_loss_cnt / (negative + 1), 695 | -sum_lossV / sum_lossV_cnt / (negative + 1)); 696 | fflush(stderr); 697 | } 698 | 699 | if (classes == 0) { 700 | // Save the word vectors 701 | //writeWV(output_file); 702 | } else { 703 | fo = fopen(output_file, "wb"); 704 | // Run K-means on the word vectors 705 | int clcn = classes, iter = 10, closeid; 706 | int *centcn = (int *)malloc(classes * sizeof(int)); 707 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 708 | real closev, x; 709 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 710 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 711 | for (a = 0; a < iter; a++) { 712 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 713 | for (b = 0; b < clcn; b++) centcn[b] = 1; 714 | for (c = 0; c < vocab_size; c++) { 715 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 716 | centcn[cl[c]]++; 717 | } 718 | for (b = 0; b < clcn; b++) { 719 | closev = 0; 720 | for (c = 0; c < layer1_size; c++) { 721 | cent[layer1_size * b + c] /= centcn[b]; 722 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 723 | } 724 | closev = sqrt(closev); 725 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 726 | } 727 | for (c = 0; c < vocab_size; c++) { 728 | closev = -10; 729 | closeid = 0; 730 | for (d = 0; d < clcn; d++) { 731 | x = 0; 732 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 733 | if (x > closev) { 734 | closev = x; 735 | closeid = d; 736 | } 737 | } 738 | cl[c] = closeid; 739 | } 740 | } 741 | // Save the K-means classes 742 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 743 | free(centcn); 744 | free(cent); 745 | free(cl); 746 | fclose(fo); 747 | } 748 | 749 | } 750 | 751 | int ArgPos(char *str, int argc, char **argv) { 752 | int a; 753 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 754 | if (a == argc - 1) { 755 | printf("Argument missing for %s\n", str); 756 | exit(1); 757 | } 758 | return a; 759 | } 760 | return -1; 761 | } 762 | 763 | int main(int argc, char **argv) { 764 | int i; 765 | if (argc == 1) { 766 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 767 | printf("Options:\n"); 768 | printf("Parameters for training:\n"); 769 | printf("\t-train \n"); 770 | printf("\t\tUse text data from to train the model\n"); 771 | printf("\t-output \n"); 772 | printf("\t\tUse to save the resulting word vectors / word clusters\n"); 773 | printf("\t-size \n"); 774 | printf("\t\tSet size of word vectors; default is 100\n"); 775 | printf("\t-window \n"); 776 | printf("\t\tSet max skip length between words; default is 5\n"); 777 | printf("\t-sample \n"); 778 | printf("\t\tSet threshold for occurrence of words. 
Those that appear with higher frequency in the training data\n"); 779 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n"); 780 | printf("\t-hs \n"); 781 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 782 | printf("\t-negative \n"); 783 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n"); 784 | printf("\t-threads \n"); 785 | printf("\t\tUse threads (default 12)\n"); 786 | printf("\t-iter \n"); 787 | printf("\t\tRun more training iterations (default 5)\n"); 788 | printf("\t-min-count \n"); 789 | printf("\t\tThis will discard words that appear less than times; default is 5\n"); 790 | printf("\t-alpha \n"); 791 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n"); 792 | printf("\t-classes \n"); 793 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 794 | printf("\t-debug \n"); 795 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 796 | printf("\t-binary \n"); 797 | printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"); 798 | printf("\t-save-vocab \n"); 799 | printf("\t\tThe vocabulary will be saved to \n"); 800 | printf("\t-read-vocab \n"); 801 | printf("\t\tThe vocabulary will be read from , not constructed from the training data\n"); 802 | printf("\t-cbow \n"); 803 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n"); 804 | printf("\nExamples:\n"); 805 | printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 806 | return 0; 807 | } 808 | output_file[0] = 0; 809 | save_vocab_file[0] = 0; 810 | read_vocab_file[0] = 0; 811 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 812 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 813 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 814 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 815 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 816 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 817 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 818 | if (cbow) alpha = 0.05; 819 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 820 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 821 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 822 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 823 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 824 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 825 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 826 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 827 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 828 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 829 | input_size = layer1_size * window * 2; 830 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 831 | vocab_hash = (int 
*)calloc(vocab_hash_size, sizeof(int)); 832 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 833 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 834 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 835 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 836 | } 837 | TrainModel(); 838 | return 0; 839 | } 840 | -------------------------------------------------------------------------------- /embedding/word2vec.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define MAX_STRING 100 22 | #define EXP_TABLE_SIZE 1000 23 | #define MAX_EXP 6 24 | #define MAX_SENTENCE_LENGTH 1000 25 | #define MAX_CODE_LENGTH 40 26 | 27 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 28 | 29 | typedef float real; // Precision of float numbers 30 | 31 | struct vocab_word { 32 | long long cn; 33 | int *point; 34 | char *word, *code, codelen; 35 | }; 36 | 37 | char train_file[MAX_STRING], output_file[MAX_STRING]; 38 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 39 | struct vocab_word *vocab; 40 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1; 41 | int *vocab_hash; 42 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100; 43 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0; 44 | real alpha = 0.025, starting_alpha, sample = 1e-3; 45 | real *syn0, *syn1, *syn1neg, *expTable, *syn1neg_gdsq, *syn0_gdsq; 46 | double *loss, *lossV, sum_loss, sum_lossV; 47 | long long *loss_cnt, *lossV_cnt, sum_loss_cnt, sum_lossV_cnt; 48 | 49 | clock_t start; 50 | 51 | int hs = 0, negative = 5; 52 | const int table_size = 1e8; 53 | int *table; 54 | 55 | void InitUnigramTable() { 56 | int a, i; 57 | long long train_words_pow = 0; 58 | real d1, power = 0.75; 59 | table = (int *)malloc(table_size * sizeof(int)); 60 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 61 | i = 0; 62 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow; 63 | for (a = 0; a < table_size; a++) { 64 | table[a] = i; 65 | if (a / (real)table_size > d1) { 66 | i++; 67 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow; 68 | } 69 | if (i >= vocab_size) i = vocab_size - 1; 70 | } 71 | } 72 | 73 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 74 | void ReadWord(char *word, FILE *fin) { 75 | int a = 0, ch; 76 | while (!feof(fin)) { 77 | ch = fgetc(fin); 78 | if (ch == 13) continue; 79 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 80 | if (a > 0) { 81 | if (ch == '\n') ungetc(ch, fin); 82 | break; 83 | } 84 | if (ch == '\n') { 85 | strcpy(word, (char *)""); 86 | return; 87 | } 
else continue; 88 | } 89 | word[a] = ch; 90 | a++; 91 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 92 | } 93 | word[a] = 0; 94 | } 95 | 96 | // Returns hash value of a word 97 | int GetWordHash(char *word) { 98 | unsigned long long a, hash = 0; 99 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 100 | hash = hash % vocab_hash_size; 101 | return hash; 102 | } 103 | 104 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 105 | int SearchVocab(char *word) { 106 | unsigned int hash = GetWordHash(word); 107 | while (1) { 108 | if (vocab_hash[hash] == -1) return -1; 109 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 110 | hash = (hash + 1) % vocab_hash_size; 111 | } 112 | return -1; 113 | } 114 | 115 | // Reads a word and returns its index in the vocabulary 116 | int ReadWordIndex(FILE *fin) { 117 | char word[MAX_STRING]; 118 | ReadWord(word, fin); 119 | if (feof(fin)) return -1; 120 | return SearchVocab(word); 121 | } 122 | 123 | // Adds a word to the vocabulary 124 | int AddWordToVocab(char *word) { 125 | unsigned int hash, length = strlen(word) + 1; 126 | if (length > MAX_STRING) length = MAX_STRING; 127 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 128 | strcpy(vocab[vocab_size].word, word); 129 | vocab[vocab_size].cn = 0; 130 | vocab_size++; 131 | // Reallocate memory if needed 132 | if (vocab_size + 2 >= vocab_max_size) { 133 | vocab_max_size += 1000; 134 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 135 | } 136 | hash = GetWordHash(word); 137 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 138 | vocab_hash[hash] = vocab_size - 1; 139 | return vocab_size - 1; 140 | } 141 | 142 | // Used later for sorting by word counts 143 | int VocabCompare(const void *a, const void *b) { 144 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 145 | } 146 | 147 | // Sorts the vocabulary by frequency using word counts 148 | void SortVocab() { 149 | int a, size; 150 | unsigned int hash; 151 | // Sort the vocabulary and keep at the first position 152 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 153 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 154 | size = vocab_size; 155 | train_words = 0; 156 | for (a = 0; a < size; a++) { 157 | // Words occuring less than min_count times will be discarded from the vocab 158 | if ((vocab[a].cn < min_count) && (a != 0)) { 159 | vocab_size--; 160 | free(vocab[a].word); 161 | } else { 162 | // Hash will be re-computed, as after the sorting it is not actual 163 | hash=GetWordHash(vocab[a].word); 164 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 165 | vocab_hash[hash] = a; 166 | train_words += vocab[a].cn; 167 | } 168 | } 169 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 170 | // Allocate memory for the binary tree construction 171 | for (a = 0; a < vocab_size; a++) { 172 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 173 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 174 | } 175 | } 176 | 177 | // Reduces the vocabulary by removing infrequent tokens 178 | void ReduceVocab() { 179 | int a, b = 0; 180 | unsigned int hash; 181 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 182 | vocab[b].cn = vocab[a].cn; 183 | vocab[b].word = vocab[a].word; 184 | b++; 185 | } else free(vocab[a].word); 186 | vocab_size = b; 187 | 
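/* Note (annotation): the vocabulary is kept in an open-addressing hash table
 * with linear probing -- GetWordHash picks the starting slot and
 * SearchVocab/AddWordToVocab walk forward until they find the word or an
 * empty slot (-1). LearnVocabFromTrainFile calls ReduceVocab whenever
 * vocab_size exceeds 0.7 * vocab_hash_size, and each call raises min_reduce
 * by one, so successive prunings drop progressively more infrequent words.
 * The loop below then rebuilds the hash table, since the surviving words have
 * been compacted to new indices. */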
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 188 | for (a = 0; a < vocab_size; a++) { 189 | // Hash will be re-computed, as it is not actual 190 | hash = GetWordHash(vocab[a].word); 191 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 192 | vocab_hash[hash] = a; 193 | } 194 | fflush(stdout); 195 | min_reduce++; 196 | } 197 | 198 | // Create binary Huffman tree using the word counts 199 | // Frequent words will have short uniqe binary codes 200 | void CreateBinaryTree() { 201 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 202 | char code[MAX_CODE_LENGTH]; 203 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 204 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 205 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 206 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 207 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 208 | pos1 = vocab_size - 1; 209 | pos2 = vocab_size; 210 | // Following algorithm constructs the Huffman tree by adding one node at a time 211 | for (a = 0; a < vocab_size - 1; a++) { 212 | // First, find two smallest nodes 'min1, min2' 213 | if (pos1 >= 0) { 214 | if (count[pos1] < count[pos2]) { 215 | min1i = pos1; 216 | pos1--; 217 | } else { 218 | min1i = pos2; 219 | pos2++; 220 | } 221 | } else { 222 | min1i = pos2; 223 | pos2++; 224 | } 225 | if (pos1 >= 0) { 226 | if (count[pos1] < count[pos2]) { 227 | min2i = pos1; 228 | pos1--; 229 | } else { 230 | min2i = pos2; 231 | pos2++; 232 | } 233 | } else { 234 | min2i = pos2; 235 | pos2++; 236 | } 237 | count[vocab_size + a] = count[min1i] + count[min2i]; 238 | parent_node[min1i] = vocab_size + a; 239 | parent_node[min2i] = vocab_size + a; 240 | binary[min2i] = 1; 241 | } 242 | // Now assign binary code to each vocabulary word 243 | for (a = 0; a < vocab_size; a++) { 244 | b = a; 245 | i = 0; 246 | while (1) { 247 | code[i] = binary[b]; 248 | point[i] = b; 249 | i++; 250 | b = parent_node[b]; 251 | if (b == vocab_size * 2 - 2) break; 252 | } 253 | vocab[a].codelen = i; 254 | vocab[a].point[0] = vocab_size - 2; 255 | for (b = 0; b < i; b++) { 256 | vocab[a].code[i - b - 1] = code[b]; 257 | vocab[a].point[i - b] = point[b] - vocab_size; 258 | } 259 | } 260 | free(count); 261 | free(binary); 262 | free(parent_node); 263 | } 264 | 265 | void LearnVocabFromTrainFile() { 266 | char word[MAX_STRING]; 267 | FILE *fin; 268 | long long a, i; 269 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 270 | fin = fopen(train_file, "rb"); 271 | if (fin == NULL) { 272 | printf("ERROR: training data file not found!\n"); 273 | exit(1); 274 | } 275 | vocab_size = 0; 276 | AddWordToVocab((char *)""); 277 | while (1) { 278 | ReadWord(word, fin); 279 | if (feof(fin)) break; 280 | train_words++; 281 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 282 | printf("%lldK%c", train_words / 1000, 13); 283 | fflush(stdout); 284 | } 285 | i = SearchVocab(word); 286 | if (i == -1) { 287 | a = AddWordToVocab(word); 288 | vocab[a].cn = 1; 289 | } else vocab[i].cn++; 290 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 291 | } 292 | SortVocab(); 293 | if (debug_mode > 0) { 294 | printf("Vocab size: %lld\n", vocab_size); 295 | printf("Words in train file: %lld\n", train_words); 296 | } 297 | file_size = ftell(fin); 298 | fclose(fin); 299 | } 300 | 301 | void SaveVocab() { 302 | long long i; 303 | FILE *fo = fopen(save_vocab_file, "wb"); 304 | for (i = 0; i 
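/* Note (annotation): CreateBinaryTree above builds the Huffman tree used by
 * hierarchical softmax. It repeatedly merges the two smallest counts (pos1
 * walks the sorted leaf counts while pos2 walks the growing list of internal
 * nodes, so no heap is needed), then walks from each leaf up to the root to
 * record its binary code and the indices of the internal nodes on the path
 * (vocab[a].point). For example, with counts {5, 3, 1, 1} the merges are
 * (1,1)->2, (2,3)->5, (5,5)->10, so the most frequent word gets a 1-bit code
 * and the two rarest words get 3-bit codes. */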
< vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 305 | fclose(fo); 306 | } 307 | 308 | void ReadVocab() { 309 | long long a, i = 0; 310 | char c; 311 | char word[MAX_STRING]; 312 | FILE *fin = fopen(read_vocab_file, "rb"); 313 | if (fin == NULL) { 314 | printf("Vocabulary file not found\n"); 315 | exit(1); 316 | } 317 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 318 | vocab_size = 0; 319 | while (1) { 320 | ReadWord(word, fin); 321 | if (feof(fin)) break; 322 | a = AddWordToVocab(word); 323 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); 324 | i++; 325 | } 326 | SortVocab(); 327 | if (debug_mode > 0) { 328 | printf("Vocab size: %lld\n", vocab_size); 329 | printf("Words in train file: %lld\n", train_words); 330 | } 331 | fin = fopen(train_file, "rb"); 332 | if (fin == NULL) { 333 | printf("ERROR: training data file not found!\n"); 334 | exit(1); 335 | } 336 | fseek(fin, 0, SEEK_END); 337 | file_size = ftell(fin); 338 | fclose(fin); 339 | } 340 | 341 | void InitNet() { 342 | long long a, b; 343 | unsigned long long next_random = 1; 344 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)); 345 | a = posix_memalign((void **)&syn0_gdsq, 128, (long long)vocab_size * layer1_size * sizeof(real)); 346 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} 347 | if (hs) { 348 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); 349 | if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} 350 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 351 | syn1[a * layer1_size + b] = 0; 352 | } 353 | if (negative>0) { 354 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)); 355 | a = posix_memalign((void **)&syn1neg_gdsq, 128, (long long)vocab_size * layer1_size * sizeof(real)); 356 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 357 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++){ 358 | syn1neg[a * layer1_size + b] = 0; 359 | syn1neg_gdsq[a * layer1_size + b] = 1e-8; 360 | } 361 | 362 | } 363 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) { 364 | next_random = next_random * (unsigned long long)25214903917 + 11; 365 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 366 | syn0_gdsq[a * layer1_size + b] = 1e-8; 367 | } 368 | CreateBinaryTree(); 369 | } 370 | 371 | void writeWV(char *output_file){ 372 | long long a, b; 373 | FILE *fo = fopen(output_file, "wb"); 374 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 375 | for (a = 0; a < vocab_size; a++) { 376 | fprintf(fo, "%s ", vocab[a].word); 377 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 378 | else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 379 | fprintf(fo, "\n"); 380 | } 381 | fclose(fo); 382 | } 383 | 384 | typedef unsigned long uint64_t; 385 | typedef unsigned int uint32_t; 386 | 387 | double rsqrt64(double number) { 388 | uint64_t i; 389 | double x2, y; 390 | x2 = number * 0.5; 391 | y = number; 392 | i = *(uint64_t *) &y; 393 | i = 0x5fe6eb50c7b537a9 - (i >> 1); 394 | y = *(double *) &i; 395 | y = y * (1.5 - (x2 * y * y)); 396 | return y; 397 | } 398 | 399 | float rsqrt(float number){ 400 | uint32_t i; 401 | float x2, y; 402 | x2 = number * 0.5F; 403 | y = number; 404 | i = *(uint32_t *) &y; 405 | i = 0x5f3759df - ( i >> 1 ); 406 | y = 
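/* Note (annotation): rsqrt64 and rsqrt here implement the classic "fast
 * inverse square root" bit trick (magic constants 0x5fe6eb50c7b537a9 and
 * 0x5f3759df) followed by one Newton-Raphson step y = y * (1.5 - 0.5*x*y*y),
 * giving a cheap approximation of 1/sqrt(x). They are applied per weight in
 * the adaptive-gradient updates in TrainModelThread below; 1.0f / sqrtf(x)
 * would be a slower but more precise drop-in replacement. */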
*(float *) &i; 407 | y = y * ( 1.5F - ( x2 * y * y ) ); 408 | return y; 409 | } 410 | 411 | void *TrainModelThread(void *_id) { 412 | long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0; 413 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 414 | long long l1, l2, c, target, label; 415 | long long id = (long long)_id; 416 | unsigned long long next_random = id + clock(); //must add ``clock'' if using this kind of iter. 417 | real f, g; 418 | clock_t now; 419 | real *neu1 = (real *)calloc(layer1_size, sizeof(real)); 420 | real *neu1e = (real *)calloc(layer1_size, sizeof(real)); 421 | double err = 0, errV = 0; //loss 422 | long long err_cnt = 0, errV_cnt = 0; 423 | 424 | FILE *fi = fopen(train_file, "rb"); 425 | fseek(fi, file_size / (long long)num_threads * id, SEEK_SET); 426 | while (1) { 427 | if (word_count - last_word_count > 10000) { 428 | loss[id] += err; 429 | lossV[id] += errV; 430 | loss_cnt[id] += err_cnt; 431 | lossV_cnt[id] += errV_cnt; 432 | 433 | sum_loss += err; 434 | sum_lossV += errV; 435 | sum_loss_cnt += err_cnt; 436 | sum_lossV_cnt += errV_cnt; 437 | 438 | err = errV = 0; 439 | err_cnt = errV_cnt = 0; 440 | word_count_actual += word_count - last_word_count; 441 | last_word_count = word_count; 442 | if ((debug_mode > 1) && id == 0) { 443 | now=clock(); 444 | printf("%cAlpha: %f Err: %lf ErrV: %lf Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 445 | -sum_loss / sum_loss_cnt / (negative + 1), 446 | -sum_lossV / sum_lossV_cnt / (negative + 1), 447 | word_count_actual / (real)(train_words + 1) * 100, 448 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 449 | fflush(stdout); 450 | } 451 | //alpha = starting_alpha; 452 | //alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 453 | //if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 454 | } 455 | if (sentence_length == 0) { 456 | while (1) { 457 | word = ReadWordIndex(fi); 458 | if (feof(fi)) break; 459 | if (word == -1) continue; 460 | word_count++; 461 | if (word == 0) break; 462 | // The subsampling randomly discards frequent words while keeping the ranking same 463 | if (sample > 0) { 464 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 465 | next_random = next_random * (unsigned long long)25214903917 + 11; 466 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 467 | } 468 | sen[sentence_length] = word; 469 | sentence_length++; 470 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 471 | } 472 | sentence_position = 0; 473 | } 474 | int validSet = 0; 475 | if(word_count > (int)(0.95 * train_words / num_threads)) 476 | validSet = 1; 477 | if (feof(fi) || (word_count > train_words / num_threads)) { 478 | word_count_actual += word_count - last_word_count; 479 | break; 480 | } 481 | word = sen[sentence_position]; 482 | if (word == -1) continue; 483 | for (c = 0; c < layer1_size; c++) neu1[c] = 0; 484 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 485 | next_random = next_random * (unsigned long long)25214903917 + 11; 486 | //b = next_random % window; 487 | b = 0; 488 | if (cbow) { //train the cbow architecture 489 | // in -> hidden 490 | cw = 0; 491 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 492 | c = sentence_position - window + a; 493 | last_word = sen[c]; 494 | if (c < 0) last_word = 0; 495 | if (c >= sentence_length) last_word = 0; 496 | if (last_word == -1) last_word = 0; 497 | for (c = 0; 
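/* Note (annotation): the subsampling test above keeps a word w with
 * probability p(w) = (sqrt(f/t) + 1) * t / f, where f = cn(w)/train_words and
 * t = sample (default 1e-3). Words with f <= t are always kept, while very
 * frequent words are mostly skipped: a word making up 1% of the corpus is
 * kept only about (sqrt(10)+1)/10 of the time, i.e. roughly 42%. */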
c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size]; 498 | cw++; 499 | } 500 | if (cw == window * 2) { 501 | for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 502 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 503 | f = 0; 504 | l2 = vocab[word].point[d] * layer1_size; 505 | // Propagate hidden -> output 506 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 507 | if (f <= -MAX_EXP) continue; 508 | else if (f >= MAX_EXP) continue; 509 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 510 | // 'g' is the gradient multiplied by the learning rate 511 | g = (1 - vocab[word].code[d] - f) * alpha; 512 | // Propagate errors output -> hidden 513 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 514 | // Learn weights hidden -> output 515 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 516 | } 517 | // NEGATIVE SAMPLING 518 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 519 | if (d == 0) { 520 | target = word; 521 | label = 1; 522 | } else { 523 | next_random = next_random * (unsigned long long)25214903917 + 11; 524 | target = table[(next_random >> 16) % table_size]; 525 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 526 | if (target == word) continue; 527 | label = 0; 528 | } 529 | l2 = target * layer1_size; 530 | f = 0; 531 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; 532 | if (f > MAX_EXP) g = (label - 1); 533 | else if (f < -MAX_EXP) g = (label - 0); 534 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 535 | 536 | if(label) f = -f; 537 | if(validSet) errV += log(1/(1+exp(f))); 538 | else err += log(1/(1+exp(f))); 539 | 540 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 541 | //for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 542 | if(!validSet) for (c = 0; c < layer1_size; c++) { 543 | real diff = g * neu1[c]; 544 | syn1neg_gdsq[c + l2] += diff * diff; 545 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 546 | // 547 | } 548 | } 549 | 550 | if(validSet) errV_cnt++; 551 | else err_cnt++; 552 | // hidden -> in 553 | if(!validSet) for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 554 | c = sentence_position - window + a; 555 | last_word = sen[c]; 556 | if (c < 0) last_word = 0; 557 | if (c >= sentence_length) last_word = 0; 558 | if (last_word == -1) last_word = 0; 559 | //for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 560 | for (c = 0; c < layer1_size; c++) { 561 | real diff = neu1e[c]; 562 | long long p = c + last_word * layer1_size; 563 | syn0_gdsq[p] += diff * diff; 564 | syn0[p] += alpha * diff * rsqrt(syn0_gdsq[p]); 565 | } 566 | } 567 | } 568 | } else { //train skip-gram 569 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 570 | c = sentence_position - window + a; 571 | if (c < 0) continue; 572 | if (c >= sentence_length) continue; 573 | last_word = sen[c]; 574 | if (last_word == -1) continue; 575 | l1 = last_word * layer1_size; 576 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 577 | // HIERARCHICAL SOFTMAX 578 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 579 | f = 0; 580 | l2 = vocab[word].point[d] * layer1_size; 581 | // Propagate hidden -> output 582 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; 583 | if (f <= -MAX_EXP) continue; 584 | else if (f >= MAX_EXP) continue; 585 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 586 | // 'g' is 
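/* Note (annotation): unlike stock word2vec, which applies a plain SGD step
 * with a decaying learning rate, this variant keeps per-parameter
 * squared-gradient accumulators (syn0_gdsq, syn1neg_gdsq, initialized to
 * 1e-8) and updates each weight as
 *   theta += alpha * g / sqrt(G),  with  G += g * g,
 * an AdaGrad-style adaptive step with sqrt approximated by rsqrt(). The
 * commented-out lines show the original constant-alpha updates, and the
 * global alpha decay is likewise commented out near the progress printout. */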
the gradient multiplied by the learning rate 587 | g = (1 - vocab[word].code[d] - f) * alpha; 588 | // Propagate errors output -> hidden 589 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 590 | // Learn weights hidden -> output 591 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; 592 | } 593 | // NEGATIVE SAMPLING 594 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 595 | if (d == 0) { 596 | target = word; 597 | label = 1; 598 | } else { 599 | next_random = next_random * (unsigned long long)25214903917 + 11; 600 | target = table[(next_random >> 16) % table_size]; 601 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 602 | if (target == word) continue; 603 | label = 0; 604 | } 605 | l2 = target * layer1_size; 606 | f = 0; 607 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 608 | if (f > MAX_EXP) g = (label - 1); 609 | else if (f < -MAX_EXP) g = (label - 0); 610 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]); 611 | 612 | if(label) f = -f; 613 | if(validSet) errV += log(1/(1+exp(f))); 614 | else err += log(1/(1+exp(f))); 615 | 616 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 617 | for (c = 0; c < layer1_size; c++) { 618 | real diff = g * syn0[c + l1]; 619 | syn1neg_gdsq[c + l2] += diff * diff; 620 | syn1neg[c + l2] += alpha * diff * rsqrt(syn1neg_gdsq[c + l2]); 621 | // 622 | } 623 | } 624 | // Learn weights input -> hidden 625 | for (c = 0; c < layer1_size; c++) { 626 | real diff = neu1e[c]; 627 | syn0_gdsq[c + l1] += diff * diff; 628 | syn0[c + l1] += alpha * diff * rsqrt(syn0_gdsq[c + l1]); 629 | } 630 | if(validSet) errV_cnt++; 631 | else err_cnt++; 632 | } 633 | 634 | } 635 | sentence_position++; 636 | if (sentence_position >= sentence_length) { 637 | sentence_length = 0; 638 | continue; 639 | } 640 | } 641 | fclose(fi); 642 | free(neu1); 643 | free(neu1e); 644 | pthread_exit(NULL); 645 | } 646 | 647 | 648 | void TrainModel() { 649 | long a, b, c, d; 650 | FILE *fo; 651 | char ffname[100]; 652 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 653 | loss = (double *)malloc(num_threads * sizeof(double)); 654 | lossV = (double *)malloc(num_threads * sizeof(double)); 655 | loss_cnt = (long long *)malloc(num_threads * sizeof(long long)); 656 | lossV_cnt = (long long *)malloc(num_threads * sizeof(long long)); 657 | 658 | printf("Starting training using file %s\n", train_file); 659 | starting_alpha = alpha; 660 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 661 | if (save_vocab_file[0] != 0) SaveVocab(); 662 | if (output_file[0] == 0) return; 663 | InitNet(); 664 | if (negative > 0) InitUnigramTable(); 665 | 666 | for(b = 1; b <= iter; b++){ 667 | start = clock(); 668 | word_count_actual = 0; 669 | for (a = 0; a < num_threads; a++){ 670 | loss[a] = lossV[a] = 0; 671 | loss_cnt[a] = lossV_cnt[a] = 0; 672 | } 673 | sum_loss = sum_lossV = 0; 674 | sum_loss_cnt = sum_lossV_cnt = 0; 675 | 676 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 677 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 678 | 679 | if(b < 100 || 680 | b < 1000 && b % 10 == 0 || 681 | b % 100 == 0){ 682 | sprintf(ffname, "%s_%ld", output_file, b); 683 | writeWV(ffname); 684 | } 685 | 686 | printf("%c", 13); 687 | 688 | sum_loss = sum_lossV = 0; 689 | sum_loss_cnt = sum_lossV_cnt = 0; 690 | for (a = 0; a < num_threads; a++){ 691 | sum_loss += loss[a]; 692 | sum_lossV += 
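/* Note (annotation): each thread treats the last 5% of its share of the
 * corpus as a held-out set (validSet above) and accumulates a separate
 * validation loss (lossV), which is what the per-iteration "ErrV" figure
 * below reports. Intermediate embeddings are written to files named
 * "<output_file>_<iteration>" after every iteration below 100, every 10th
 * iteration below 1000, and every 100th iteration beyond that, which appears
 * to be the naming scheme the scripts under evaluation/ later parse. */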
lossV[a]; 693 | sum_loss_cnt += loss_cnt[a]; 694 | sum_lossV_cnt += lossV_cnt[a]; 695 | } 696 | fprintf(stderr, "Iter: %ld Err: %lf ErrV: %lf\n", b, 697 | -sum_loss / sum_loss_cnt / (negative + 1), 698 | -sum_lossV / sum_lossV_cnt / (negative + 1)); 699 | fflush(stderr); 700 | } 701 | 702 | if (classes == 0) { 703 | // Save the word vectors 704 | //writeWV(output_file); 705 | } else { 706 | fo = fopen(output_file, "wb"); 707 | // Run K-means on the word vectors 708 | int clcn = classes, iter = 10, closeid; 709 | int *centcn = (int *)malloc(classes * sizeof(int)); 710 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 711 | real closev, x; 712 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 713 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 714 | for (a = 0; a < iter; a++) { 715 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 716 | for (b = 0; b < clcn; b++) centcn[b] = 1; 717 | for (c = 0; c < vocab_size; c++) { 718 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 719 | centcn[cl[c]]++; 720 | } 721 | for (b = 0; b < clcn; b++) { 722 | closev = 0; 723 | for (c = 0; c < layer1_size; c++) { 724 | cent[layer1_size * b + c] /= centcn[b]; 725 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 726 | } 727 | closev = sqrt(closev); 728 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 729 | } 730 | for (c = 0; c < vocab_size; c++) { 731 | closev = -10; 732 | closeid = 0; 733 | for (d = 0; d < clcn; d++) { 734 | x = 0; 735 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 736 | if (x > closev) { 737 | closev = x; 738 | closeid = d; 739 | } 740 | } 741 | cl[c] = closeid; 742 | } 743 | } 744 | // Save the K-means classes 745 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 746 | free(centcn); 747 | free(cent); 748 | free(cl); 749 | fclose(fo); 750 | } 751 | 752 | } 753 | 754 | int ArgPos(char *str, int argc, char **argv) { 755 | int a; 756 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 757 | if (a == argc - 1) { 758 | printf("Argument missing for %s\n", str); 759 | exit(1); 760 | } 761 | return a; 762 | } 763 | return -1; 764 | } 765 | 766 | int main(int argc, char **argv) { 767 | int i; 768 | if (argc == 1) { 769 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 770 | printf("Options:\n"); 771 | printf("Parameters for training:\n"); 772 | printf("\t-train \n"); 773 | printf("\t\tUse text data from to train the model\n"); 774 | printf("\t-output \n"); 775 | printf("\t\tUse to save the resulting word vectors / word clusters\n"); 776 | printf("\t-size \n"); 777 | printf("\t\tSet size of word vectors; default is 100\n"); 778 | printf("\t-window \n"); 779 | printf("\t\tSet max skip length between words; default is 5\n"); 780 | printf("\t-sample \n"); 781 | printf("\t\tSet threshold for occurrence of words. 
Those that appear with higher frequency in the training data\n"); 782 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n"); 783 | printf("\t-hs \n"); 784 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 785 | printf("\t-negative \n"); 786 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n"); 787 | printf("\t-threads \n"); 788 | printf("\t\tUse threads (default 12)\n"); 789 | printf("\t-iter \n"); 790 | printf("\t\tRun more training iterations (default 5)\n"); 791 | printf("\t-min-count \n"); 792 | printf("\t\tThis will discard words that appear less than times; default is 5\n"); 793 | printf("\t-alpha \n"); 794 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n"); 795 | printf("\t-classes \n"); 796 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 797 | printf("\t-debug \n"); 798 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 799 | printf("\t-binary \n"); 800 | printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"); 801 | printf("\t-save-vocab \n"); 802 | printf("\t\tThe vocabulary will be saved to \n"); 803 | printf("\t-read-vocab \n"); 804 | printf("\t\tThe vocabulary will be read from , not constructed from the training data\n"); 805 | printf("\t-cbow \n"); 806 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n"); 807 | printf("\nExamples:\n"); 808 | printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 809 | return 0; 810 | } 811 | output_file[0] = 0; 812 | save_vocab_file[0] = 0; 813 | read_vocab_file[0] = 0; 814 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 815 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 816 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 817 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 818 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 819 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 820 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 821 | if (cbow) alpha = 0.05; 822 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 823 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 824 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 825 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 826 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 827 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 828 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 829 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 830 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 831 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 832 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 833 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 834 | 
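/* Note (annotation): the table filled in below precomputes the logistic
 * function: entry i holds sigmoid(x_i) = exp(x_i) / (exp(x_i) + 1) with
 * x_i = (2*i/EXP_TABLE_SIZE - 1) * MAX_EXP, i.e. 1000 samples of sigma(x) on
 * [-6, 6). During training a dot product f in that range is looked up as
 * expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))], and values
 * outside [-MAX_EXP, MAX_EXP] are handled specially (skipped in the
 * hierarchical-softmax branch, clamped to label - 1 or label - 0 in the
 * negative-sampling branch). */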
expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 835 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 836 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 837 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 838 | } 839 | TrainModel(); 840 | return 0; 841 | } 842 | -------------------------------------------------------------------------------- /evaluation/avg/README.md: -------------------------------------------------------------------------------- 1 | # avg 2 | 1. install liblinear 3 | 2. compile avg_embedding.cpp 4 | 3. run avg.py 5 | -------------------------------------------------------------------------------- /evaluation/avg/avg.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | 5 | #models = ["hidden", "lbl", "senna"] 6 | #models = ["rand"] 7 | models = ["ivlblskip", "ivlblcbow"] 8 | 9 | liblinear_dir = "liblinear-1.94" 10 | 11 | def func(msg, vec_dir, ret_dir): 12 | vec_file = "%s/%s" % (vec_dir, msg) 13 | train_file = "%s_%s_train.txt" % (vec_dir, msg) 14 | test_file = "%s_%s_test.txt" % (vec_dir, msg) 15 | model_file = "%s.model" % train_file 16 | tmp_file = "%s_%s_out" % (vec_dir, msg) 17 | out_file = "%s/%s" % (ret_dir, msg) 18 | os.system("./avg_embedding %s imdb_train.txt imdb_test.txt %s %s" % (vec_file, train_file, test_file)) 19 | os.system("%s/train %s" % (liblinear_dir, train_file)) 20 | os.system("%s/predict %s %s %s > %s" % (liblinear_dir, train_file, model_file, tmp_file, out_file)) 21 | os.system("%s/predict %s %s %s >> %s" % (liblinear_dir, test_file, model_file, tmp_file, out_file)) 22 | os.system("rm %s" % train_file) 23 | os.system("rm %s" % test_file) 24 | os.system("rm %s" % model_file) 25 | os.system("rm %s" % tmp_file) 26 | 27 | 28 | if __name__ == "__main__": 29 | pool = multiprocessing.Pool(processes=20) 30 | 31 | for model in models: 32 | vec_dir = "vec_%s" % model 33 | ret_dir = "ret_avg2_%s" % model 34 | 35 | if not os.path.exists(ret_dir): 36 | os.makedirs(ret_dir) 37 | 38 | for lists in os.listdir(vec_dir): 39 | if not os.path.exists(os.path.join(ret_dir, lists)): 40 | 41 | x = lists.replace('.txt','').replace('.bz2','').split('_') 42 | #if not "v50" in lists: 43 | # continue 44 | if int(x[-1]) > 10 and "10m" in lists and int(x[-1]) % 100 != 0: 45 | continue 46 | if int(x[-1]) > 10 and "100m" in lists and int(x[-1]) % 10 != 0: 47 | continue 48 | print lists 49 | pool.apply_async(func, (lists, vec_dir, ret_dir, )) 50 | pool.close() 51 | pool.join() 52 | print "Sub-process(es) done." 53 | 54 | 55 | -------------------------------------------------------------------------------- /evaluation/avg/avg_embedding.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/licstar/compare/e0983f7f7c87e5b653fa2d034e2e9b962583fb8d/evaluation/avg/avg_embedding.cpp -------------------------------------------------------------------------------- /evaluation/cnn/README.md: -------------------------------------------------------------------------------- 1 | # cnn 2 | 1. make 3 | 2. 
run cnn.py 4 | -------------------------------------------------------------------------------- /evaluation/cnn/cnn.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | 5 | #models = ["hidden", "lbl", "senna"] 6 | models = ["ivlblskip", "ivlblcbow"] 7 | #models = ["rand"] 8 | 9 | def func(msg, vec_dir, ret_dir): 10 | vec_file = "%s/%s" % (vec_dir, msg) 11 | out_file = "%s/%s" % (ret_dir, msg) 12 | for i in range(0, 5): 13 | os.system("./cnn_senna %s tree_train.txt tree_test.txt 5 %d tree_dev.txt 5 90 >> %s" % (vec_file, i, out_file)) 14 | #os.system("./cnn_senna %s tree_train.txt tree_test.txt 5 1 tree_dev.txt 5 90 >> %s" % (vec_file, out_file)) 15 | #os.system("./cnn_senna %s tree_train.txt tree_test.txt 5 2 tree_dev.txt 5 90 >> %s" % (vec_file, out_file)) 16 | #os.system("./cnn_senna %s tree_train.txt tree_test.txt 5 3 tree_dev.txt 5 90 >> %s" % (vec_file, out_file)) 17 | #os.system("./cnn_senna %s tree_train.txt tree_test.txt 5 4 tree_dev.txt 5 90 >> %s" % (vec_file, out_file)) 18 | 19 | 20 | 21 | 22 | if __name__ == "__main__": 23 | pool = multiprocessing.Pool(processes=1) 24 | 25 | for model in models: 26 | vec_dir = "vec_%s" % model 27 | ret_dir = "ret_cnn_%s" % model 28 | 29 | if not os.path.exists(ret_dir): 30 | os.makedirs(ret_dir) 31 | #func('50_2_ns5_16') 32 | 33 | for lists in os.listdir(vec_dir): 34 | if not os.path.exists(os.path.join(ret_dir, lists)): 35 | 36 | x = lists.replace('.txt','').replace('.bz2','').split('_') 37 | #if not "v50" in lists: 38 | # continue 39 | iter = int(x[-1]) 40 | if not (iter == 1 or iter == 3 or iter == 5 or iter == 20 or iter == 10 or iter == 33 or iter == 100 or iter == 1000 or iter == 10000): 41 | continue 42 | print lists 43 | pool.apply_async(func, (lists, vec_dir, ret_dir, )) 44 | pool.close() 45 | pool.join() 46 | print "Sub-process(es) done." 47 | 48 | 49 | -------------------------------------------------------------------------------- /evaluation/cnn/cnn_senna.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/licstar/compare/e0983f7f7c87e5b653fa2d034e2e9b962583fb8d/evaluation/cnn/cnn_senna.cpp -------------------------------------------------------------------------------- /evaluation/cnn/fileutil.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/licstar/compare/e0983f7f7c87e5b653fa2d034e2e9b962583fb8d/evaluation/cnn/fileutil.hpp -------------------------------------------------------------------------------- /evaluation/cnn/makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | GCC = gcc 3 | CFLAGS = -lm -O2 -Wall -funroll-loops -ffast-math 4 | #CFLAGS = -lm -O2 -Wall 5 | 6 | all: cnn_senna 7 | 8 | 9 | cnn_senna : cnn_senna.cpp 10 | $(CC) $(CFLAGS) $(OPT_DEF) cnn_senna.cpp -DLINUX -fopenmp -O2 -g -std=c++0x -o cnn_senna 11 | 12 | clean: 13 | rm -rf *.o cnn_senna 14 | -------------------------------------------------------------------------------- /evaluation/ner/README.md: -------------------------------------------------------------------------------- 1 | # ner 2 | 1. download http://cogcomp.cs.illinois.edu/experiments/ACL2010_NER_Experiments.zip. 3 | 2. unzip Data folder in ACL2010_NER_Experiments.zip. 4 | 3. 
run ner.py 5 | 6 | ner.jar is modified from http://cogcomp.cs.illinois.edu/Data/ACL2010_NER_Experiments.php -------------------------------------------------------------------------------- /evaluation/ner/default.config: -------------------------------------------------------------------------------- 1 | configFilename cwRcv50DimOverall0.3 2 | sortLexicallyFilesInFolders true 3 | treatAllFilesInFolderAsOneBigDocument false 4 | inferenceMethod GREEDY 5 | beamSize 5 6 | thresholdPrediction false 7 | predictionConfidenceThreshold -1 8 | labelTypes PER ORG LOC MISC 9 | logging true 10 | debuggingLogPath DebugLog/ 11 | taggingEncodingScheme BILOU 12 | pathToGazetteers Data/KnownLists 13 | pathsToBrownClusters Data/BrownHierarchicalWordClusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt Data/BrownHierarchicalWordClusters/brownBllipClusters Data/BrownHierarchicalWordClusters/rcv1.clean.tokenized-c1000-p1.paths.txt 14 | minWordAppThresholdsForBrownClusters 5 5 5 15 | isLowercaseBrownClusters false false false 16 | pathsToWordEmbeddings Data/WordEmbedding/model-2280000000.LEARNING_RATE=1e-08.EMBEDDING_LEARNING_RATE=1e-07.EMBEDDING_SIZE=50.txt 17 | embeddingDimensionalities 50 18 | minWordAppThresholdsForEmbeddings 0 19 | normalizationConstantsForEmbeddings 0.3 20 | normalizationMethodsForEmbeddings OVERALL 21 | isLowercaseWordEmbeddings false 22 | pathToModelFile Data/Models/ 23 | tokenizationScheme DualTokenizationScheme 24 | Forms 1 25 | Capitalization 1 26 | WordTypeInformation 1 27 | Affixes 1 28 | PreviousTag1 1 29 | PreviousTag2 1 30 | GazetteersFeatures 0 31 | WordEmbeddings 1 32 | BrownClusterPaths 0 33 | aggregateContext 0 34 | aggregateGazetteerMatches 0 35 | prevTagsForContext 0 36 | PredictionsLevel1 0 -------------------------------------------------------------------------------- /evaluation/ner/ner.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/licstar/compare/e0983f7f7c87e5b653fa2d034e2e9b962583fb8d/evaluation/ner/ner.jar -------------------------------------------------------------------------------- /evaluation/ner/ner.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | 5 | 6 | models = ["ivlblskip", "ivlblcbow"] 7 | #models = ["rand"] 8 | 9 | def func(msg, vec_dir, ret_dir): 10 | vec_file = "%s/%s" % (vec_dir, msg) 11 | out_file = "%s/%s" % (ret_dir, msg) 12 | arg = "java -jar ner.jar %s %s > %s" % (vec_file.replace("/","_").replace(":","_"), vec_file, out_file) 13 | if not os.path.exists(out_file): 14 | print arg 15 | os.system(arg) 16 | 17 | 18 | if __name__ == "__main__": 19 | pool = multiprocessing.Pool(processes=3) 20 | 21 | for model in models: 22 | vec_dir = "vec_%s" % model 23 | ret_dir = "ret_ner_%s" % model 24 | 25 | if not os.path.exists(ret_dir): 26 | os.makedirs(ret_dir) 27 | 28 | for lists in os.listdir(vec_dir): 29 | if not os.path.exists(os.path.join(ret_dir, lists)): 30 | 31 | x = lists.replace('.txt','').replace('.bz2','').split('_') 32 | #if not "all_1b" in lists: 33 | # continue 34 | iter = int(x[-1]) 35 | if not (iter == 1 or iter == 3 or iter == 5 or iter == 20 or iter == 10 or iter == 33 or iter == 100 or iter == 1000 or iter == 10000): 36 | continue 37 | print lists 38 | pool.apply_async(func, (lists, vec_dir, ret_dir, )) 39 | pool.close() 40 | pool.join() 41 | print "Sub-process(es) done." 
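# Note (annotation): the evaluation drivers (ner.py, pos.py, cnn.py, avg.py,
# king.py, toefl.py) are Python 2 scripts (print statements without
# parentheses) that all follow the same pattern: for each model, scan
# vec_<model>/ for embedding files whose names end in "_<iteration>" (as
# written by the training code), keep only iterations on the sampling
# schedule, and run one evaluation subprocess per file, writing its output
# into a ret_* results directory under the same file name. A minimal sketch
# of the filename filter, assuming a hypothetical name like "100m_v50_123.txt":
#
#   name = "100m_v50_123.txt"
#   iteration = int(name.replace('.txt', '').replace('.bz2', '').split('_')[-1])
#   keep = iteration in (1, 3, 5, 10, 20, 33, 100, 1000, 10000)  # ner/pos/cnn; avg and king use a modulo rule instead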
42 | 43 | 44 | -------------------------------------------------------------------------------- /evaluation/pos/README.md: -------------------------------------------------------------------------------- 1 | # pos 2 | 1. make 3 | 2. run pos.py 4 | -------------------------------------------------------------------------------- /evaluation/pos/fileutil.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/licstar/compare/e0983f7f7c87e5b653fa2d034e2e9b962583fb8d/evaluation/pos/fileutil.hpp -------------------------------------------------------------------------------- /evaluation/pos/makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | GCC = gcc 3 | CFLAGS = -lm -O2 -Wall -funroll-loops -ffast-math 4 | #CFLAGS = -lm -O2 -Wall 5 | 6 | all: senna_tag 7 | 8 | senna_tag : sennaseg.cpp 9 | $(CC) $(CFLAGS) $(OPT_DEF) sennaseg.cpp -fopenmp -DLINUX -o senna_tag 10 | 11 | clean: 12 | rm -rf *.o senna_tag 13 | -------------------------------------------------------------------------------- /evaluation/pos/pos.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | 5 | #models = ["hidden", "lbl", "senna"] 6 | #models = ["rand"] 7 | models = ["ivlblskip", "ivlblcbow"] 8 | 9 | def func(msg, vec_dir, ret_dir): 10 | vec_file = "%s/%s" % (vec_dir, msg) 11 | out_file = "%s/%s" % (ret_dir, msg) 12 | arg = "./senna_tag %s > %s" % (vec_file, out_file) 13 | print arg 14 | os.system(arg) 15 | 16 | 17 | if __name__ == "__main__": 18 | pool = multiprocessing.Pool(processes=8) 19 | 20 | for model in models: 21 | vec_dir = "vec_%s" % model 22 | ret_dir = "ret_pos_%s" % model 23 | 24 | if not os.path.exists(ret_dir): 25 | os.makedirs(ret_dir) 26 | 27 | for lists in os.listdir(vec_dir): 28 | if not os.path.exists(os.path.join(ret_dir, lists)): 29 | 30 | x = lists.replace('.txt','').replace('.bz2','').split('_') 31 | #if not "v50" in lists: 32 | # continue 33 | iter = int(x[-1]) 34 | if not (iter == 1 or iter == 3 or iter == 10 or iter == 5 or iter == 20 or iter == 33 or iter == 100 or iter == 1000 or iter == 10000): 35 | continue 36 | print lists 37 | pool.apply_async(func, (lists, vec_dir, ret_dir, )) 38 | pool.close() 39 | pool.join() 40 | print "Sub-process(es) done." 41 | 42 | 43 | -------------------------------------------------------------------------------- /evaluation/pos/sennaseg.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/licstar/compare/e0983f7f7c87e5b653fa2d034e2e9b962583fb8d/evaluation/pos/sennaseg.cpp -------------------------------------------------------------------------------- /evaluation/syn_sem/README.md: -------------------------------------------------------------------------------- 1 | # syn/sem 2 | 1. compile compute-accuracy-txt.c 3 | 2. run king.py 4 | -------------------------------------------------------------------------------- /evaluation/syn_sem/compute-accuracy-txt.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | const long long max_size = 2000; // max length of strings 23 | const long long N = 1; // number of closest words 24 | const long long max_w = 50; // max length of vocabulary entries 25 | 26 | int main(int argc, char **argv) 27 | { 28 | FILE *f; 29 | char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch; 30 | float dist, len, bestd[N], vec[max_size]; 31 | long long words, size, a, b, c, d, b1, b2, b3, threshold = 0; 32 | float *M; 33 | char *vocab; 34 | int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0; 35 | if (argc < 2) { 36 | printf("Usage: ./compute-accuracy \nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n"); 37 | return 0; 38 | } 39 | strcpy(file_name, argv[1]); 40 | if (argc > 2) threshold = atoi(argv[2]); 41 | f = fopen(file_name, "rb"); 42 | if (f == NULL) { 43 | printf("Input file not found\n"); 44 | return -1; 45 | } 46 | fscanf(f, "%lld", &words); 47 | if (threshold) if (words > threshold) words = threshold; 48 | fscanf(f, "%lld", &size); 49 | vocab = (char *)malloc(words * max_w * sizeof(char)); 50 | M = (float *)malloc(words * size * sizeof(float)); 51 | if (M == NULL) { 52 | printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576); 53 | return -1; 54 | } 55 | for (b = 0; b < words; b++) { 56 | fscanf(f, "%s%c", &vocab[b * max_w], &ch); 57 | for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]); 58 | for (a = 0; a < size; a++) fscanf(f, "%f", &M[a + b * size]);//fread(&M[a + b * size], sizeof(float), 1, f); 59 | len = 0; 60 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 61 | len = sqrt(len); 62 | for (a = 0; a < size; a++) M[a + b * size] /= len; 63 | } 64 | fclose(f); 65 | TCN = 0; 66 | while (1) { 67 | for (a = 0; a < N; a++) bestd[a] = 0; 68 | for (a = 0; a < N; a++) bestw[a][0] = 0; 69 | scanf("%s", st1); 70 | for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]); 71 | if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) { 72 | if (TCN == 0) TCN = 1; 73 | if (QID != 0) { 74 | printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN); 75 | printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100); 76 | } 77 | QID++; 78 | scanf("%s", st1); 79 | if (feof(stdin)) break; 80 | printf("%s:\n", st1); 81 | TCN = 0; 82 | CCN = 0; 83 | continue; 84 | } 85 | if (!strcmp(st1, "EXIT")) break; 86 | scanf("%s", st2); 87 | for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]); 88 | scanf("%s", st3); 89 | for (a = 0; a bestd[a]) { 116 | for (d = N - 1; d > a; d--) { 117 | bestd[d] = bestd[d - 1]; 118 | strcpy(bestw[d], bestw[d - 1]); 119 | } 120 | bestd[a] = dist; 121 | strcpy(bestw[a], 
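/* Note (annotation): this tool scores the questions-words.txt analogy set in
 * the standard word2vec compute-accuracy fashion -- all vectors are
 * length-normalized at load time, each question "a : b :: c : ?" is answered
 * by the vocabulary word whose vector has the highest dot product (cosine
 * similarity) with b - a + c, and the first five question sections count
 * toward semantic accuracy, the remaining sections toward syntactic accuracy.
 * N = 1 here, so only the top-ranked candidate is checked. */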
&vocab[c * max_w]); 122 | break; 123 | } 124 | } 125 | } 126 | if (!strcmp(st4, bestw[0])) { 127 | CCN++; 128 | CACN++; 129 | if (QID <= 5) SEAC++; else SYAC++; 130 | } 131 | if (QID <= 5) SECN++; else SYCN++; 132 | TCN++; 133 | TACN++; 134 | } 135 | printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, TQS/(float)TQ*100); 136 | return 0; 137 | } 138 | -------------------------------------------------------------------------------- /evaluation/syn_sem/king.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | 5 | #models = ["hidden", "lbl", "senna"] 6 | models = ["ivlblskip", "ivlblcbow"] 7 | 8 | def func(msg, vec_dir, ret_dir): 9 | arg = './compute-accuracy-txt %s/%s 0 < questions-words.txt > %s/%s' % (vec_dir, msg, ret_dir, msg) 10 | print arg 11 | os.system(arg) 12 | 13 | 14 | if __name__ == "__main__": 15 | pool = multiprocessing.Pool(processes=16) 16 | 17 | for model in models: 18 | vec_dir = "vec_%s" % model 19 | ret_dir = "ret_king_%s" % model 20 | 21 | if not os.path.exists(ret_dir): 22 | os.makedirs(ret_dir) 23 | 24 | for lists in os.listdir(vec_dir): 25 | if not os.path.exists(os.path.join(ret_dir, lists)): 26 | x = lists.replace('.txt','').replace('.bz2','').split('_') 27 | #if int(x[-1]) != 1 and int(x[-1]) % 5 != 0: 28 | # continue 29 | #if not "v50" in lists: 30 | # continue 31 | print model, lists 32 | if int(x[-1]) > 10 and "10m" in lists and int(x[-1]) % 100 != 0: 33 | continue 34 | if int(x[-1]) > 10 and "100m" in lists and int(x[-1]) % 10 != 0: 35 | continue 36 | 37 | pool.apply_async(func, (lists, vec_dir, ret_dir, )) 38 | pool.close() 39 | pool.join() 40 | print "Sub-process(es) done." 41 | 42 | 43 | -------------------------------------------------------------------------------- /evaluation/tfl/README.md: -------------------------------------------------------------------------------- 1 | # tfl 2 | 1. compile toefl.cpp 3 | 2. 
run toefl.py 4 | -------------------------------------------------------------------------------- /evaluation/tfl/toefl.cpp: -------------------------------------------------------------------------------- 1 | #define _CRT_SECURE_NO_WARNINGS 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | using namespace std; 9 | struct node { 10 | string w; 11 | string c[4]; 12 | int ans; 13 | }; 14 | map dict; 15 | 16 | const int MAX_STRING = 1000; 17 | int size; 18 | int ReadEmbedding(const char *file_name) { 19 | FILE *f = fopen(file_name, "rb"); 20 | if (f == NULL) { 21 | printf("Embedding file not found\n"); 22 | return -1; 23 | } 24 | int wordNum; 25 | fscanf(f, "%d", &wordNum); 26 | fscanf(f, "%d", &size); 27 | 28 | char str[MAX_STRING]; 29 | double *tmp = new double[size]; 30 | for (int b = 0; b < wordNum; b++) { 31 | char ch; 32 | fscanf(f, "%s%c", str, &ch); 33 | /*for (int i = 0; str[i]; i++){ 34 | if (str[i] >= 'A' && str[i] <= 'Z'){ 35 | str[i] = str[i] - 'A' + 'a'; 36 | } 37 | }*/ 38 | map::iterator it = dict.find(str); 39 | double *v = tmp; 40 | if (it != dict.end()) { 41 | if (it->second == NULL) { 42 | it->second = new double[size]; 43 | v = it->second; 44 | } 45 | } 46 | for (int a = 0; a < size; a++) 47 | fscanf(f, "%lf", &v[a]); 48 | } 49 | fclose(f); 50 | return 0; 51 | } 52 | 53 | const double eps = 1e-8; 54 | double cosvec(double *a, double *b) { 55 | if (a == NULL || b == NULL) 56 | return 0; 57 | double t1 = 0, t2 = 0, t3 = 0; 58 | for (int i = 0; i < size; i++) { 59 | t1 += a[i] * b[i]; 60 | t2 += a[i] * a[i]; 61 | t3 += b[i] * b[i]; 62 | } 63 | return t1 / sqrt(t2 + eps) / sqrt(t3 + eps); 64 | } 65 | 66 | 67 | double pearson(vector &a, vector &b) { 68 | double avg_a = 0, avg_b = 0; 69 | int n = a.size(); 70 | for (int i = 0; i < n; i++) { 71 | avg_a += a[i]; 72 | avg_b += b[i]; 73 | } 74 | avg_a /= n; 75 | avg_b /= n; 76 | double v1 = 0, v2 = 0, v3 = 0; 77 | for (int i = 0; i < n; i++) { 78 | v1 += (a[i] - avg_a) * (b[i] - avg_b); 79 | v2 += (a[i] - avg_a) * (a[i] - avg_a); 80 | v3 += (b[i] - avg_b) * (b[i] - avg_b); 81 | } 82 | return v1 / sqrt(v2 + eps) / sqrt(v3 + eps); 83 | } 84 | 85 | void solve(const char *dataset, const char *embedding) { 86 | vector lst; 87 | 88 | char w1[MAX_STRING], w2[MAX_STRING]; 89 | double val; 90 | FILE *fd = fopen(dataset, "r"); 91 | while (fscanf(fd, "%s%s", w1, w2) != EOF) { 92 | node n; 93 | n.w = w2; 94 | dict[w2] = NULL; 95 | for (int i = 0; i < 4; i++) { 96 | fscanf(fd, "%s%s", w1, w2); 97 | n.c[i] = w2; 98 | dict[w2] = NULL; 99 | } 100 | fscanf(fd, "%s", w1); 101 | n.ans = w1[0] - 'a'; 102 | lst.push_back(n); 103 | } 104 | fclose(fd); 105 | 106 | if (embedding) 107 | ReadEmbedding(embedding); 108 | 109 | for (map::iterator it = dict.begin(); it != dict.end(); it++) { 110 | if (it->second == NULL) { 111 | fprintf(stderr, "cannot find word: %s\n", it->first.c_str()); 112 | } 113 | } 114 | 115 | int correct = 0; 116 | for (int i = 0; i < lst.size(); i++) { 117 | double *v1 = dict[lst[i].w]; 118 | int bestid = -1; 119 | double best = -1; 120 | for (int j = 0; j < 4; j++) { 121 | double *v2 = dict[lst[i].c[j]]; 122 | double s = cosvec(v1, v2); 123 | if (s > best) { 124 | best = s; 125 | bestid = j; 126 | } 127 | } 128 | if (bestid == lst[i].ans) { 129 | correct++; 130 | } 131 | } 132 | printf("%lf\n", 1.0*correct / lst.size()); 133 | } 134 | 135 | 136 | int main(int argc, char **argv) { 137 | if (argc != 2) { 138 | printf("Useage: ./toefl embedding > result\n"); 139 | return 0; 140 | } 141 | 142 | 
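/* Note (annotation): toefl.txt holds 80 synonym questions, each a target
 * word, four candidate words and the answer letter. solve() below loads the
 * embedding, scores every candidate by cosine similarity to the target
 * (cosvec returns 0 when either word is missing from the embedding; missing
 * words are reported on stderr), picks the highest-scoring candidate, and
 * prints the fraction of the 80 questions answered correctly. */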
solve("toefl.txt", argv[1]); 143 | 144 | return 0; 145 | } 146 | -------------------------------------------------------------------------------- /evaluation/tfl/toefl.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | 5 | #models = ["hidden", "lbl", "senna"] 6 | #models = ["turian"] 7 | models = ["ivlblskip", "ivlblcbow"] 8 | 9 | def func(msg, vec_dir, ret_dir): 10 | arg = './toefl %s/%s > %s/%s' % (vec_dir, msg, ret_dir, msg) 11 | print arg 12 | os.system(arg) 13 | 14 | 15 | if __name__ == "__main__": 16 | pool = multiprocessing.Pool(processes=16) 17 | 18 | for model in models: 19 | vec_dir = "vec_%s" % model 20 | ret_dir = "ret_toefl_%s" % model 21 | 22 | if not os.path.exists(ret_dir): 23 | os.makedirs(ret_dir) 24 | 25 | for lists in os.listdir(vec_dir): 26 | if not os.path.exists(os.path.join(ret_dir, lists)): 27 | 28 | x = lists.replace('.txt','').replace('.bz2','').split('_') 29 | #if not "v50" in lists: 30 | # continue 31 | #if int(x[-1]) > 10 and ("10m" in lists or "13m" in lists) and int(x[-1]) % 100 != 0: 32 | # continue 33 | #if int(x[-1]) > 10 and "100m" in lists and int(x[-1]) % 10 != 0: 34 | # continue 35 | print lists 36 | pool.apply_async(func, (lists, vec_dir, ret_dir, )) 37 | pool.close() 38 | pool.join() 39 | print "Sub-process(es) done." 40 | 41 | 42 | -------------------------------------------------------------------------------- /evaluation/tfl/toefl.txt: -------------------------------------------------------------------------------- 1 | 1. enormously 2 | a. appropriately 3 | b. uniquely 4 | c. tremendously 5 | d. decidedly 6 | c 7 | 2. provisions 8 | a. stipulations 9 | b. interrelations 10 | c. jurisdictions 11 | d. interpretations 12 | a 13 | 3. haphazardly 14 | a. dangerously 15 | b. densely 16 | c. randomly 17 | d. linearly 18 | c 19 | 4. prominent 20 | a. battered 21 | b. ancient 22 | c. mysterious 23 | d. conspicuous 24 | d 25 | 5. zenith 26 | a. completion 27 | b. pinnacle 28 | c. outset 29 | d. decline 30 | b 31 | 6. flawed 32 | a. tiny 33 | b. imperfect 34 | c. lustrous 35 | d. crude 36 | b 37 | 7. urgently 38 | a. typically 39 | b. conceivably 40 | c. tentatively 41 | d. desperately 42 | d 43 | 8. consumed 44 | a. bred 45 | b. caught 46 | c. eaten 47 | d. supplied 48 | c 49 | 9. advent 50 | a. coming 51 | b. arrest 52 | c. financing 53 | d. stability 54 | a 55 | 10. concisely 56 | a. powerfully 57 | b. positively 58 | c. freely 59 | d. succinctly 60 | d 61 | 11. salutes 62 | a. information 63 | b. ceremonies 64 | c. greetings 65 | d. privileges 66 | c 67 | 12. solitary 68 | a. alert 69 | b. restless 70 | c. alone 71 | d. fearless 72 | c 73 | 13. hasten 74 | a. permit 75 | b. determine 76 | c. accelerate 77 | d. accompany 78 | c 79 | 14. perseverance 80 | a. endurance 81 | b. skill 82 | c. generosity 83 | d. disturbance 84 | a 85 | 15. fanciful 86 | a. familiar 87 | b. imaginative 88 | c. apparent 89 | d. logical 90 | b 91 | 16. showed 92 | a. demonstrated 93 | b. published 94 | c. repeated 95 | d. postponed 96 | a 97 | 17. constantly 98 | a. instantly 99 | b. continually 100 | c. rapidly 101 | d. accidentally 102 | b 103 | 18. issues 104 | a. training 105 | b. salaries 106 | c. subjects 107 | d. benefits 108 | c 109 | 19. furnish 110 | a. supply 111 | b. impress 112 | c. protect 113 | d. advise 114 | a 115 | 20. costly 116 | a. expensive 117 | b. beautiful 118 | c. popular 119 | d. complicated 120 | a 121 | 21. recognized 122 | a. successful 123 | b. 
depicted 124 | c. acknowledged 125 | d. welcomed 126 | c 127 | 22. spot 128 | a. climate 129 | b. latitude 130 | c. sea 131 | d. location 132 | d 133 | 23. make 134 | a. earn 135 | b. print 136 | c. trade 137 | d. borrow 138 | a 139 | 24. often 140 | a. definitely 141 | b. frequently 142 | c. chemically 143 | d. hardly 144 | b 145 | 25. easygoing 146 | a. frontier 147 | b. boring 148 | c. farming 149 | d. relaxed 150 | d 151 | 26. debate 152 | a. war 153 | b. argument 154 | c. election 155 | d. competition 156 | b 157 | 27. narrow 158 | a. clear 159 | b. freezing 160 | c. thin 161 | d. poisonous 162 | c 163 | 28. arranged 164 | a. planned 165 | b. explained 166 | c. studied 167 | d. discarded 168 | a 169 | 29. infinite 170 | a. limitless 171 | b. relative 172 | c. unusual 173 | d. structural 174 | a 175 | 30. showy 176 | a. striking 177 | b. prickly 178 | c. entertaining 179 | d. incidental 180 | a 181 | 31. levied 182 | a. imposed 183 | b. believed 184 | c. requested 185 | d. correlated 186 | a 187 | 32. deftly 188 | a. skillfully 189 | b. prudently 190 | c. occasionally 191 | d. humorously 192 | a 193 | 33. distribute 194 | a. commercialize 195 | b. circulate 196 | c. research 197 | d. acknowledge 198 | b 199 | 34. discrepancies 200 | a. weights 201 | b. deposits 202 | c. wavelengths 203 | d. differences 204 | d 205 | 35. prolific 206 | a. productive 207 | b. serious 208 | c. capable 209 | d. promising 210 | a 211 | 36. unmatched 212 | a. unrecognized 213 | b. unequaled 214 | c. alienated 215 | d. emulated 216 | b 217 | 37. peculiarly 218 | a. partly 219 | b. uniquely 220 | c. patriotically 221 | d. suspiciously 222 | b 223 | 38. hue 224 | a. glare 225 | b. contrast 226 | c. color 227 | d. scent 228 | c 229 | 39. hind 230 | a. curved 231 | b. muscular 232 | c. hairy 233 | d. rear 234 | d 235 | 40. highlight 236 | a. alter 237 | b. imitate 238 | c. accentuate 239 | d. restore 240 | c 241 | 41. hastily 242 | a. hurriedly 243 | b. shrewdly 244 | c. habitually 245 | d. chronologically 246 | a 247 | 42. temperate 248 | a. cold 249 | b. mild 250 | c. short 251 | d. windy 252 | b 253 | 43. grin 254 | a. exercise 255 | b. rest 256 | c. joke 257 | d. smile 258 | d 259 | 44. verbally 260 | a. orally 261 | b. overtly 262 | c. fittingly 263 | d. verbosely 264 | a 265 | 45. physician 266 | a. chemist 267 | b. pharmacist 268 | c. nurse 269 | d. doctor 270 | d 271 | 46. essentially 272 | a. possibly 273 | b. eagerly 274 | c. basically 275 | d. ordinarily 276 | c 277 | 47. keen 278 | a. useful 279 | b. simple 280 | c. famous 281 | d. sharp 282 | d 283 | 48. situated 284 | a. rotating 285 | b. isolated 286 | c. emptying 287 | d. positioned 288 | d 289 | 49. principal 290 | a. most 291 | b. numerous 292 | c. major 293 | d. exceptional 294 | c 295 | 50. slowly 296 | a. rarely 297 | b. gradually 298 | c. effectively 299 | d. continuously 300 | b 301 | 51. built 302 | a. constructed 303 | b. proposed 304 | c. financed 305 | d. organized 306 | a 307 | 52. tasks 308 | a. customers 309 | b. materials 310 | c. shops 311 | d. jobs 312 | d 313 | 53. unlikely 314 | a. improbable 315 | b. disagreeable 316 | c. different 317 | d. unpopular 318 | a 319 | 54. halfheartedly 320 | a. customarily 321 | b. bipartisanly 322 | c. apathetically 323 | d. unconventionally 324 | c 325 | 55. annals 326 | a. homes 327 | b. trails 328 | c. chronicles 329 | d. songs 330 | c 331 | 56. wildly 332 | a. distinctively 333 | b. mysteriously 334 | c. abruptly 335 | d. furiously 336 | d 337 | 57. hailed 338 | a. judged 339 | b. 
acclaimed 340 | c. remembered 341 | d. addressed 342 | b 343 | 58. command 344 | a. observation 345 | b. love 346 | c. awareness 347 | d. mastery 348 | d 349 | 59. concocted 350 | a. devised 351 | b. cleaned 352 | c. requested 353 | d. supervised 354 | a 355 | 60. prospective 356 | a. particular 357 | b. prudent 358 | c. potential 359 | d. prominent 360 | c 361 | 61. generally 362 | a. descriptively 363 | b. broadly 364 | c. controversially 365 | d. accurately 366 | b 367 | 62. sustained 368 | a. prolonged 369 | b. refined 370 | c. lowered 371 | d. analyzed 372 | a 373 | 63. perilous 374 | a. binding 375 | b. exciting 376 | c. offensive 377 | d. dangerous 378 | d 379 | 64. tranquillity 380 | a. peacefulness 381 | b. harshness 382 | c. weariness 383 | d. happiness 384 | a 385 | 65. dissipate 386 | a. disperse 387 | b. isolate 388 | c. disguise 389 | d. photograph 390 | a 391 | 66. primarily 392 | a. occasionally 393 | b. cautiously 394 | c. consistently 395 | d. chiefly 396 | d 397 | 67. colloquial 398 | a. recorded 399 | b. misunderstood 400 | c. incorrect 401 | d. conversational 402 | d 403 | 68. resolved 404 | a. publicized 405 | b. forgotten 406 | c. settled 407 | d. examined 408 | c 409 | 69. feasible 410 | a. permitted 411 | b. possible 412 | c. equitable 413 | d. evident 414 | b 415 | 70. expeditiously 416 | a. frequently 417 | b. actually 418 | c. rapidly 419 | d. repeatedly 420 | c 421 | 71. percentage 422 | a. volume 423 | b. sample 424 | c. proportion 425 | d. profit 426 | c 427 | 72. terminated 428 | a. ended 429 | b. posed 430 | c. postponed 431 | d. evaluated 432 | a 433 | 73. uniform 434 | a. hard 435 | b. complex 436 | c. alike 437 | d. sharp 438 | c 439 | 74. figure 440 | a. list 441 | b. solve 442 | c. divide 443 | d. express 444 | b 445 | 75. sufficient 446 | a. recent 447 | b. physiological 448 | c. enough 449 | d. valuable 450 | c 451 | 76. fashion 452 | a. ration 453 | b. fathom 454 | c. craze 455 | d. manner 456 | d 457 | 77. marketed 458 | a. frozen 459 | b. sold 460 | c. sweetened 461 | d. diluted 462 | b 463 | 78. bigger 464 | a. steadier 465 | b. closer 466 | c. larger 467 | d. better 468 | c 469 | 79. roots 470 | a. origins 471 | b. rituals 472 | c. cure 473 | d. function 474 | a 475 | 80. normally 476 | a. haltingly 477 | b. ordinarily 478 | c. permanently 479 | d. periodically 480 | b 481 | -------------------------------------------------------------------------------- /evaluation/ws/README.md: -------------------------------------------------------------------------------- 1 | # ws 2 | 1. compile ws.cpp 3 | 2. 
run ws.py (see the usage sketch below) 4 | -------------------------------------------------------------------------------- /evaluation/ws/ws.cpp: -------------------------------------------------------------------------------- 1 | #define _CRT_SECURE_NO_WARNINGS 2 | #include <cstdio> 3 | #include <cmath> 4 | #include <cstring> 5 | #include <string> 6 | #include <map> 7 | #include <vector> 8 | using namespace std; 9 | struct node { 10 | string w1, w2; 11 | double val; 12 | }; 13 | map<string, double *> dict; // word -> embedding vector (NULL until loaded) 14 | 15 | const int MAX_STRING = 1000; 16 | int size; 17 | int ReadEmbedding(const char *file_name) { 18 | FILE *f = fopen(file_name, "rb"); 19 | if (f == NULL) { 20 | printf("Embedding file not found\n"); 21 | return -1; 22 | } 23 | int wordNum; 24 | fscanf(f, "%d", &wordNum); 25 | fscanf(f, "%d", &size); 26 | 27 | char str[MAX_STRING]; 28 | double *tmp = new double[size]; 29 | for (int b = 0; b < wordNum; b++) { 30 | char ch; 31 | fscanf(f, "%s%c", str, &ch); 32 | /*for (int i = 0; str[i]; i++){ 33 | if (str[i] >= 'A' && str[i] <= 'Z'){ 34 | str[i] = str[i] - 'A' + 'a'; 35 | } 36 | }*/ 37 | map<string, double *>::iterator it = dict.find(str); 38 | double *v = tmp; 39 | if (it != dict.end()) { 40 | if (it->second == NULL) { 41 | it->second = new double[size]; 42 | v = it->second; 43 | } 44 | } 45 | for (int a = 0; a < size; a++) 46 | fscanf(f, "%lf", &v[a]); 47 | } 48 | fclose(f); 49 | return 0; 50 | } 51 | 52 | const double eps = 1e-8; 53 | double cosvec(double *a, double *b) { // cosine similarity of two vectors 54 | if (a == NULL || b == NULL) 55 | return 0; 56 | double t1 = 0, t2 = 0, t3 = 0; 57 | for (int i = 0; i < size; i++) { 58 | t1 += a[i] * b[i]; 59 | t2 += a[i] * a[i]; 60 | t3 += b[i] * b[i]; 61 | } 62 | return t1 / sqrt(t2 + eps) / sqrt(t3 + eps); 63 | } 64 | 65 | 66 | double pearson(vector<double> &a, vector<double> &b) { // Pearson correlation coefficient 67 | double avg_a = 0, avg_b = 0; 68 | int n = a.size(); 69 | for (int i = 0; i < n; i++) { 70 | avg_a += a[i]; 71 | avg_b += b[i]; 72 | } 73 | avg_a /= n; 74 | avg_b /= n; 75 | double v1 = 0, v2 = 0, v3 = 0; 76 | for (int i = 0; i < n; i++) { 77 | v1 += (a[i] - avg_a) * (b[i] - avg_b); 78 | v2 += (a[i] - avg_a) * (a[i] - avg_a); 79 | v3 += (b[i] - avg_b) * (b[i] - avg_b); 80 | } 81 | return v1 / sqrt(v2 + eps) / sqrt(v3 + eps); 82 | } 83 | 84 | void solve(const char *dataset, const char *embedding) { 85 | vector<node> lst; 86 | 87 | char w1[MAX_STRING], w2[MAX_STRING]; 88 | double val; 89 | FILE *fd = fopen(dataset, "r"); 90 | while (fscanf(fd, "%s%s%lf", w1, w2, &val) != EOF) { 91 | node n; 92 | n.w1 = w1; 93 | n.w2 = w2; 94 | n.val = val; 95 | lst.push_back(n); 96 | if (embedding) { 97 | dict[n.w1] = NULL; 98 | dict[n.w2] = NULL; 99 | } 100 | 101 | } 102 | fclose(fd); 103 | 104 | if (embedding) 105 | ReadEmbedding(embedding); 106 | 107 | for (map<string, double *>::iterator it = dict.begin(); it != dict.end(); it++) { 108 | if (it->second == NULL) { 109 | fprintf(stderr, "cannot find word: %s\n", it->first.c_str()); 110 | } 111 | } 112 | 113 | vector<double> aa, bb; 114 | for (int i = 0; i < lst.size(); i++) { 115 | double *v1 = dict[lst[i].w1]; 116 | double *v2 = dict[lst[i].w2]; 117 | aa.push_back(lst[i].val); 118 | bb.push_back(cosvec(v1, v2)); 119 | } 120 | printf("%lf\n", pearson(aa, bb)); 121 | } 122 | 123 | 124 | int main(int argc, char **argv) { 125 | if (argc != 2) { 126 | printf("Usage: ./ws embedding > pearson\n"); 127 | return 0; 128 | } 129 | 130 | solve("ws353.txt", argv[1]); 131 | solve("ws353_relatedness.txt", NULL); 132 | solve("ws353_similarity.txt", NULL); 133 | 134 | return 0; 135 | } 136 | -------------------------------------------------------------------------------- /evaluation/ws/ws.py:
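A minimal sketch of running this evaluation by hand, assuming g++ is available and the embedding file is in the plain-text word2vec format that ws.cpp expects (first line: vocabulary size and vector dimension). The names vec_mymodel, ret_ws_mymodel and embedding.txt are placeholders; ws.py below simply automates this loop over every file in vec_<model>/ for each model in its models list:

    g++ -O2 -o ws ws.cpp                                             # build the scorer
    mkdir -p vec_mymodel ret_ws_mymodel                              # placeholder directories
    cp /path/to/embedding.txt vec_mymodel/                           # one embedding file to evaluate
    ./ws vec_mymodel/embedding.txt > ret_ws_mymodel/embedding.txt    # writes three Pearson scores: ws353, relatedness, similarity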
-------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | 5 | #models = ["hidden", "lbl", "senna"] 6 | #models = ["turian"] 7 | models = ["ivlblskip", "ivlblcbow"] 8 | 9 | def func(msg, vec_dir, ret_dir): 10 | arg = './ws %s/%s > %s/%s' % (vec_dir, msg, ret_dir, msg) 11 | print arg 12 | os.system(arg) 13 | 14 | 15 | if __name__ == "__main__": 16 | pool = multiprocessing.Pool(processes=16) 17 | 18 | for model in models: 19 | vec_dir = "vec_%s" % model 20 | ret_dir = "ret_ws_%s" % model 21 | 22 | if not os.path.exists(ret_dir): 23 | os.makedirs(ret_dir) 24 | 25 | for lists in os.listdir(vec_dir): 26 | if not os.path.exists(os.path.join(ret_dir, lists)): 27 | 28 | x = lists.replace('.txt','').replace('.bz2','').split('_') 29 | #if not "v50" in lists: 30 | # continue 31 | #if int(x[-1]) > 10 and ("10m" in lists or "13m" in lists) and int(x[-1]) % 100 != 0: 32 | # continue 33 | #if int(x[-1]) > 10 and "100m" in lists and int(x[-1]) % 10 != 0: 34 | # continue 35 | print lists 36 | pool.apply_async(func, (lists, vec_dir, ret_dir, )) 37 | pool.close() 38 | pool.join() 39 | print "Sub-process(es) done." 40 | 41 | 42 | -------------------------------------------------------------------------------- /evaluation/ws/ws353.txt: -------------------------------------------------------------------------------- 1 | love sex 6.77 2 | tiger cat 7.35 3 | tiger tiger 10.00 4 | book paper 7.46 5 | computer keyboard 7.62 6 | computer internet 7.58 7 | plane car 5.77 8 | train car 6.31 9 | telephone communication 7.50 10 | television radio 6.77 11 | media radio 7.42 12 | drug abuse 6.85 13 | bread butter 6.19 14 | cucumber potato 5.92 15 | doctor nurse 7.00 16 | professor doctor 6.62 17 | student professor 6.81 18 | smart student 4.62 19 | smart stupid 5.81 20 | company stock 7.08 21 | stock market 8.08 22 | stock phone 1.62 23 | stock CD 1.31 24 | stock jaguar 0.92 25 | stock egg 1.81 26 | fertility egg 6.69 27 | stock live 3.73 28 | stock life 0.92 29 | book library 7.46 30 | bank money 8.12 31 | wood forest 7.73 32 | money cash 9.15 33 | professor cucumber 0.31 34 | king cabbage 0.23 35 | king queen 8.58 36 | king rook 5.92 37 | bishop rabbi 6.69 38 | Jerusalem Israel 8.46 39 | Jerusalem Palestinian 7.65 40 | holy sex 1.62 41 | fuck sex 9.44 42 | Maradona football 8.62 43 | football soccer 9.03 44 | football basketball 6.81 45 | football tennis 6.63 46 | tennis racket 7.56 47 | Arafat peace 6.73 48 | Arafat terror 7.65 49 | Arafat Jackson 2.50 50 | law lawyer 8.38 51 | movie star 7.38 52 | movie popcorn 6.19 53 | movie critic 6.73 54 | movie theater 7.92 55 | physics proton 8.12 56 | physics chemistry 7.35 57 | space chemistry 4.88 58 | alcohol chemistry 5.54 59 | vodka gin 8.46 60 | vodka brandy 8.13 61 | drink car 3.04 62 | drink ear 1.31 63 | drink mouth 5.96 64 | drink eat 6.87 65 | baby mother 7.85 66 | drink mother 2.65 67 | car automobile 8.94 68 | gem jewel 8.96 69 | journey voyage 9.29 70 | boy lad 8.83 71 | coast shore 9.10 72 | asylum madhouse 8.87 73 | magician wizard 9.02 74 | midday noon 9.29 75 | furnace stove 8.79 76 | food fruit 7.52 77 | bird cock 7.10 78 | bird crane 7.38 79 | tool implement 6.46 80 | brother monk 6.27 81 | crane implement 2.69 82 | lad brother 4.46 83 | journey car 5.85 84 | monk oracle 5.00 85 | cemetery woodland 2.08 86 | food rooster 4.42 87 | coast hill 4.38 88 | forest graveyard 1.85 89 | shore woodland 3.08 90 | monk slave 0.92 91 | coast forest 3.15 92 | lad wizard 0.92 93 | chord 
smile 0.54 94 | glass magician 2.08 95 | noon string 0.54 96 | rooster voyage 0.62 97 | money dollar 8.42 98 | money cash 9.08 99 | money currency 9.04 100 | money wealth 8.27 101 | money property 7.57 102 | money possession 7.29 103 | money bank 8.50 104 | money deposit 7.73 105 | money withdrawal 6.88 106 | money laundering 5.65 107 | money operation 3.31 108 | tiger jaguar 8.00 109 | tiger feline 8.00 110 | tiger carnivore 7.08 111 | tiger mammal 6.85 112 | tiger animal 7.00 113 | tiger organism 4.77 114 | tiger fauna 5.62 115 | tiger zoo 5.87 116 | psychology psychiatry 8.08 117 | psychology anxiety 7.00 118 | psychology fear 6.85 119 | psychology depression 7.42 120 | psychology clinic 6.58 121 | psychology doctor 6.42 122 | psychology Freud 8.21 123 | psychology mind 7.69 124 | psychology health 7.23 125 | psychology science 6.71 126 | psychology discipline 5.58 127 | psychology cognition 7.48 128 | planet star 8.45 129 | planet constellation 8.06 130 | planet moon 8.08 131 | planet sun 8.02 132 | planet galaxy 8.11 133 | planet space 7.92 134 | planet astronomer 7.94 135 | precedent example 5.85 136 | precedent information 3.85 137 | precedent cognition 2.81 138 | precedent law 6.65 139 | precedent collection 2.50 140 | precedent group 1.77 141 | precedent antecedent 6.04 142 | cup coffee 6.58 143 | cup tableware 6.85 144 | cup article 2.40 145 | cup artifact 2.92 146 | cup object 3.69 147 | cup entity 2.15 148 | cup drink 7.25 149 | cup food 5.00 150 | cup substance 1.92 151 | cup liquid 5.90 152 | jaguar cat 7.42 153 | jaguar car 7.27 154 | energy secretary 1.81 155 | secretary senate 5.06 156 | energy laboratory 5.09 157 | computer laboratory 6.78 158 | weapon secret 6.06 159 | FBI fingerprint 6.94 160 | FBI investigation 8.31 161 | investigation effort 4.59 162 | Mars water 2.94 163 | Mars scientist 5.63 164 | news report 8.16 165 | canyon landscape 7.53 166 | image surface 4.56 167 | discovery space 6.34 168 | water seepage 6.56 169 | sign recess 2.38 170 | Wednesday news 2.22 171 | mile kilometer 8.66 172 | computer news 4.47 173 | territory surface 5.34 174 | atmosphere landscape 3.69 175 | president medal 3.00 176 | war troops 8.13 177 | record number 6.31 178 | skin eye 6.22 179 | Japanese American 6.50 180 | theater history 3.91 181 | volunteer motto 2.56 182 | prejudice recognition 3.00 183 | decoration valor 5.63 184 | century year 7.59 185 | century nation 3.16 186 | delay racism 1.19 187 | delay news 3.31 188 | minister party 6.63 189 | peace plan 4.75 190 | minority peace 3.69 191 | attempt peace 4.25 192 | government crisis 6.56 193 | deployment departure 4.25 194 | deployment withdrawal 5.88 195 | energy crisis 5.94 196 | announcement news 7.56 197 | announcement effort 2.75 198 | stroke hospital 7.03 199 | disability death 5.47 200 | victim emergency 6.47 201 | treatment recovery 7.91 202 | journal association 4.97 203 | doctor personnel 5.00 204 | doctor liability 5.19 205 | liability insurance 7.03 206 | school center 3.44 207 | reason hypertension 2.31 208 | reason criterion 5.91 209 | hundred percent 7.38 210 | Harvard Yale 8.13 211 | hospital infrastructure 4.63 212 | death row 5.25 213 | death inmate 5.03 214 | lawyer evidence 6.69 215 | life death 7.88 216 | life term 4.50 217 | word similarity 4.75 218 | board recommendation 4.47 219 | governor interview 3.25 220 | OPEC country 5.63 221 | peace atmosphere 3.69 222 | peace insurance 2.94 223 | territory kilometer 5.28 224 | travel activity 5.00 225 | competition price 6.44 226 | consumer confidence 4.13 227 
| consumer energy 4.75 228 | problem airport 2.38 229 | car flight 4.94 230 | credit card 8.06 231 | credit information 5.31 232 | hotel reservation 8.03 233 | grocery money 5.94 234 | registration arrangement 6.00 235 | arrangement accommodation 5.41 236 | month hotel 1.81 237 | type kind 8.97 238 | arrival hotel 6.00 239 | bed closet 6.72 240 | closet clothes 8.00 241 | situation conclusion 4.81 242 | situation isolation 3.88 243 | impartiality interest 5.16 244 | direction combination 2.25 245 | street place 6.44 246 | street avenue 8.88 247 | street block 6.88 248 | street children 4.94 249 | listing proximity 2.56 250 | listing category 6.38 251 | cell phone 7.81 252 | production hike 1.75 253 | benchmark index 4.25 254 | media trading 3.88 255 | media gain 2.88 256 | dividend payment 7.63 257 | dividend calculation 6.48 258 | calculation computation 8.44 259 | currency market 7.50 260 | OPEC oil 8.59 261 | oil stock 6.34 262 | announcement production 3.38 263 | announcement warning 6.00 264 | profit warning 3.88 265 | profit loss 7.63 266 | dollar yen 7.78 267 | dollar buck 9.22 268 | dollar profit 7.38 269 | dollar loss 6.09 270 | computer software 8.50 271 | network hardware 8.31 272 | phone equipment 7.13 273 | equipment maker 5.91 274 | luxury car 6.47 275 | five month 3.38 276 | report gain 3.63 277 | investor earning 7.13 278 | liquid water 7.89 279 | baseball season 5.97 280 | game victory 7.03 281 | game team 7.69 282 | marathon sprint 7.47 283 | game series 6.19 284 | game defeat 6.97 285 | seven series 3.56 286 | seafood sea 7.47 287 | seafood food 8.34 288 | seafood lobster 8.70 289 | lobster food 7.81 290 | lobster wine 5.70 291 | food preparation 6.22 292 | video archive 6.34 293 | start year 4.06 294 | start match 4.47 295 | game round 5.97 296 | boxing round 7.61 297 | championship tournament 8.36 298 | fighting defeating 7.41 299 | line insurance 2.69 300 | day summer 3.94 301 | summer drought 7.16 302 | summer nature 5.63 303 | day dawn 7.53 304 | nature environment 8.31 305 | environment ecology 8.81 306 | nature man 6.25 307 | man woman 8.30 308 | man governor 5.25 309 | murder manslaughter 8.53 310 | soap opera 7.94 311 | opera performance 6.88 312 | life lesson 5.94 313 | focus life 4.06 314 | production crew 6.25 315 | television film 7.72 316 | lover quarrel 6.19 317 | viewer serial 2.97 318 | possibility girl 1.94 319 | population development 3.75 320 | morality importance 3.31 321 | morality marriage 3.69 322 | Mexico Brazil 7.44 323 | gender equality 6.41 324 | change attitude 5.44 325 | family planning 6.25 326 | opera industry 2.63 327 | sugar approach 0.88 328 | practice institution 3.19 329 | ministry culture 4.69 330 | problem challenge 6.75 331 | size prominence 5.31 332 | country citizen 7.31 333 | planet people 5.75 334 | development issue 3.97 335 | experience music 3.47 336 | music project 3.63 337 | glass metal 5.56 338 | aluminum metal 7.83 339 | chance credibility 3.88 340 | exhibit memorabilia 5.31 341 | concert virtuoso 6.81 342 | rock jazz 7.59 343 | museum theater 7.19 344 | observation architecture 4.38 345 | space world 6.53 346 | preservation world 6.19 347 | admission ticket 7.69 348 | shower thunderstorm 6.31 349 | shower flood 6.03 350 | weather forecast 8.34 351 | disaster area 6.25 352 | governor office 6.34 353 | architecture century 3.78 354 | -------------------------------------------------------------------------------- /evaluation/ws/ws353_relatedness.txt: 
-------------------------------------------------------------------------------- 1 | computer keyboard 7.62 2 | Jerusalem Israel 8.46 3 | planet galaxy 8.11 4 | canyon landscape 7.53 5 | OPEC country 5.63 6 | day summer 3.94 7 | day dawn 7.53 8 | country citizen 7.31 9 | planet people 5.75 10 | environment ecology 8.81 11 | Maradona football 8.62 12 | OPEC oil 8.59 13 | money bank 8.50 14 | computer software 8.50 15 | law lawyer 8.38 16 | weather forecast 8.34 17 | network hardware 8.31 18 | nature environment 8.31 19 | FBI investigation 8.31 20 | money wealth 8.27 21 | psychology Freud 8.21 22 | news report 8.16 23 | war troops 8.13 24 | physics proton 8.12 25 | bank money 8.12 26 | stock market 8.08 27 | planet constellation 8.06 28 | credit card 8.06 29 | hotel reservation 8.03 30 | closet clothes 8.00 31 | soap opera 7.94 32 | planet astronomer 7.94 33 | planet space 7.92 34 | movie theater 7.92 35 | treatment recovery 7.91 36 | baby mother 7.85 37 | money deposit 7.73 38 | television film 7.72 39 | psychology mind 7.69 40 | game team 7.69 41 | admission ticket 7.69 42 | Jerusalem Palestinian 7.65 43 | Arafat terror 7.65 44 | boxing round 7.61 45 | computer internet 7.58 46 | money property 7.57 47 | tennis racket 7.56 48 | telephone communication 7.50 49 | currency market 7.50 50 | psychology cognition 7.48 51 | seafood sea 7.47 52 | book paper 7.46 53 | book library 7.46 54 | psychology depression 7.42 55 | fighting defeating 7.41 56 | movie star 7.38 57 | hundred percent 7.38 58 | dollar profit 7.38 59 | money possession 7.29 60 | cup drink 7.25 61 | psychology health 7.23 62 | summer drought 7.16 63 | investor earning 7.13 64 | company stock 7.08 65 | stroke hospital 7.03 66 | liability insurance 7.03 67 | game victory 7.03 68 | psychology anxiety 7.00 69 | game defeat 6.97 70 | FBI fingerprint 6.94 71 | money withdrawal 6.88 72 | psychology fear 6.85 73 | drug abuse 6.85 74 | concert virtuoso 6.81 75 | computer laboratory 6.78 76 | love sex 6.77 77 | problem challenge 6.75 78 | movie critic 6.73 79 | Arafat peace 6.73 80 | bed closet 6.72 81 | lawyer evidence 6.69 82 | fertility egg 6.69 83 | precedent law 6.65 84 | minister party 6.63 85 | psychology clinic 6.58 86 | cup coffee 6.58 87 | water seepage 6.56 88 | government crisis 6.56 89 | space world 6.53 90 | dividend calculation 6.48 91 | victim emergency 6.47 92 | luxury car 6.47 93 | tool implement 6.46 94 | competition price 6.44 95 | psychology doctor 6.42 96 | gender equality 6.41 97 | listing category 6.38 98 | video archive 6.34 99 | oil stock 6.34 100 | governor office 6.34 101 | discovery space 6.34 102 | record number 6.31 103 | brother monk 6.27 104 | production crew 6.25 105 | nature man 6.25 106 | family planning 6.25 107 | disaster area 6.25 108 | food preparation 6.22 109 | preservation world 6.19 110 | movie popcorn 6.19 111 | lover quarrel 6.19 112 | game series 6.19 113 | dollar loss 6.09 114 | weapon secret 6.06 115 | shower flood 6.03 116 | registration arrangement 6.00 117 | arrival hotel 6.00 118 | announcement warning 6.00 119 | game round 5.97 120 | baseball season 5.97 121 | drink mouth 5.96 122 | life lesson 5.94 123 | grocery money 5.94 124 | energy crisis 5.94 125 | reason criterion 5.91 126 | equipment maker 5.91 127 | cup liquid 5.90 128 | deployment withdrawal 5.88 129 | tiger zoo 5.87 130 | journey car 5.85 131 | money laundering 5.65 132 | summer nature 5.63 133 | decoration valor 5.63 134 | Mars scientist 5.63 135 | alcohol chemistry 5.54 136 | disability death 5.47 137 | change attitude 5.44 
138 | arrangement accommodation 5.41 139 | territory surface 5.34 140 | size prominence 5.31 141 | exhibit memorabilia 5.31 142 | credit information 5.31 143 | territory kilometer 5.28 144 | death row 5.25 145 | doctor liability 5.19 146 | impartiality interest 5.16 147 | energy laboratory 5.09 148 | secretary senate 5.06 149 | death inmate 5.03 150 | monk oracle 5.00 151 | cup food 5.00 152 | journal association 4.97 153 | street children 4.94 154 | car flight 4.94 155 | space chemistry 4.88 156 | situation conclusion 4.81 157 | word similarity 4.75 158 | peace plan 4.75 159 | consumer energy 4.75 160 | ministry culture 4.69 161 | smart student 4.62 162 | investigation effort 4.59 163 | image surface 4.56 164 | life term 4.50 165 | start match 4.47 166 | computer news 4.47 167 | board recommendation 4.47 168 | lad brother 4.46 169 | observation architecture 4.38 170 | coast hill 4.38 171 | deployment departure 4.25 172 | benchmark index 4.25 173 | attempt peace 4.25 174 | consumer confidence 4.13 175 | start year 4.06 176 | focus life 4.06 177 | development issue 3.97 178 | theater history 3.91 179 | situation isolation 3.88 180 | profit warning 3.88 181 | media trading 3.88 182 | chance credibility 3.88 183 | precedent information 3.85 184 | architecture century 3.78 185 | population development 3.75 186 | stock live 3.73 187 | peace atmosphere 3.69 188 | morality marriage 3.69 189 | minority peace 3.69 190 | atmosphere landscape 3.69 191 | report gain 3.63 192 | music project 3.63 193 | seven series 3.56 194 | experience music 3.47 195 | school center 3.44 196 | five month 3.38 197 | announcement production 3.38 198 | morality importance 3.31 199 | money operation 3.31 200 | delay news 3.31 201 | governor interview 3.25 202 | practice institution 3.19 203 | century nation 3.16 204 | coast forest 3.15 205 | shore woodland 3.08 206 | drink car 3.04 207 | president medal 3.00 208 | prejudice recognition 3.00 209 | viewer serial 2.97 210 | peace insurance 2.94 211 | Mars water 2.94 212 | media gain 2.88 213 | precedent cognition 2.81 214 | announcement effort 2.75 215 | line insurance 2.69 216 | crane implement 2.69 217 | drink mother 2.65 218 | opera industry 2.63 219 | volunteer motto 2.56 220 | listing proximity 2.56 221 | precedent collection 2.50 222 | cup article 2.40 223 | sign recess 2.38 224 | problem airport 2.38 225 | reason hypertension 2.31 226 | direction combination 2.25 227 | Wednesday news 2.22 228 | glass magician 2.08 229 | cemetery woodland 2.08 230 | possibility girl 1.94 231 | cup substance 1.92 232 | forest graveyard 1.85 233 | stock egg 1.81 234 | month hotel 1.81 235 | energy secretary 1.81 236 | precedent group 1.77 237 | production hike 1.75 238 | stock phone 1.62 239 | holy sex 1.62 240 | stock CD 1.31 241 | drink ear 1.31 242 | delay racism 1.19 243 | stock life 0.92 244 | stock jaguar 0.92 245 | monk slave 0.92 246 | lad wizard 0.92 247 | sugar approach 0.88 248 | rooster voyage 0.62 249 | noon string 0.54 250 | chord smile 0.54 251 | professor cucumber 0.31 252 | king cabbage 0.23 253 | -------------------------------------------------------------------------------- /evaluation/ws/ws353_similarity.txt: -------------------------------------------------------------------------------- 1 | tiger cat 7.35 2 | tiger tiger 10.00 3 | plane car 5.77 4 | train car 6.31 5 | television radio 6.77 6 | media radio 7.42 7 | bread butter 6.19 8 | cucumber potato 5.92 9 | doctor nurse 7.00 10 | professor doctor 6.62 11 | student professor 6.81 12 | smart stupid 5.81 13 | wood 
forest 7.73 14 | money cash 9.15 15 | king queen 8.58 16 | king rook 5.92 17 | bishop rabbi 6.69 18 | fuck sex 9.44 19 | football soccer 9.03 20 | football basketball 6.81 21 | football tennis 6.63 22 | Arafat Jackson 2.50 23 | physics chemistry 7.35 24 | vodka gin 8.46 25 | vodka brandy 8.13 26 | drink eat 6.87 27 | car automobile 8.94 28 | gem jewel 8.96 29 | journey voyage 9.29 30 | boy lad 8.83 31 | coast shore 9.10 32 | asylum madhouse 8.87 33 | magician wizard 9.02 34 | midday noon 9.29 35 | furnace stove 8.79 36 | food fruit 7.52 37 | bird cock 7.10 38 | bird crane 7.38 39 | food rooster 4.42 40 | money dollar 8.42 41 | money currency 9.04 42 | tiger jaguar 8.00 43 | tiger feline 8.00 44 | tiger carnivore 7.08 45 | tiger mammal 6.85 46 | tiger animal 7.00 47 | tiger organism 4.77 48 | tiger fauna 5.62 49 | psychology psychiatry 8.08 50 | psychology science 6.71 51 | psychology discipline 5.58 52 | planet star 8.45 53 | planet moon 8.08 54 | planet sun 8.02 55 | precedent example 5.85 56 | precedent antecedent 6.04 57 | cup tableware 6.85 58 | cup artifact 2.92 59 | cup object 3.69 60 | cup entity 2.15 61 | jaguar cat 7.42 62 | jaguar car 7.27 63 | mile kilometer 8.66 64 | skin eye 6.22 65 | Japanese American 6.50 66 | century year 7.59 67 | announcement news 7.56 68 | doctor personnel 5.00 69 | Harvard Yale 8.13 70 | hospital infrastructure 4.63 71 | life death 7.88 72 | travel activity 5.00 73 | type kind 8.97 74 | street place 6.44 75 | street avenue 8.88 76 | street block 6.88 77 | cell phone 7.81 78 | dividend payment 7.63 79 | calculation computation 8.44 80 | profit loss 7.63 81 | dollar yen 7.78 82 | dollar buck 9.22 83 | phone equipment 7.13 84 | liquid water 7.89 85 | marathon sprint 7.47 86 | seafood food 8.34 87 | seafood lobster 8.70 88 | lobster food 7.81 89 | lobster wine 5.70 90 | championship tournament 8.36 91 | man woman 8.30 92 | man governor 5.25 93 | murder manslaughter 8.53 94 | opera performance 6.88 95 | Mexico Brazil 7.44 96 | glass metal 5.56 97 | aluminum metal 7.83 98 | rock jazz 7.59 99 | museum theater 7.19 100 | shower thunderstorm 6.31 101 | monk oracle 5.00 102 | cup food 5.00 103 | journal association 4.97 104 | street children 4.94 105 | car flight 4.94 106 | space chemistry 4.88 107 | situation conclusion 4.81 108 | word similarity 4.75 109 | peace plan 4.75 110 | consumer energy 4.75 111 | ministry culture 4.69 112 | smart student 4.62 113 | investigation effort 4.59 114 | image surface 4.56 115 | life term 4.50 116 | start match 4.47 117 | computer news 4.47 118 | board recommendation 4.47 119 | lad brother 4.46 120 | observation architecture 4.38 121 | coast hill 4.38 122 | deployment departure 4.25 123 | benchmark index 4.25 124 | attempt peace 4.25 125 | consumer confidence 4.13 126 | start year 4.06 127 | focus life 4.06 128 | development issue 3.97 129 | theater history 3.91 130 | situation isolation 3.88 131 | profit warning 3.88 132 | media trading 3.88 133 | chance credibility 3.88 134 | precedent information 3.85 135 | architecture century 3.78 136 | population development 3.75 137 | stock live 3.73 138 | peace atmosphere 3.69 139 | morality marriage 3.69 140 | minority peace 3.69 141 | atmosphere landscape 3.69 142 | report gain 3.63 143 | music project 3.63 144 | seven series 3.56 145 | experience music 3.47 146 | school center 3.44 147 | five month 3.38 148 | announcement production 3.38 149 | morality importance 3.31 150 | money operation 3.31 151 | delay news 3.31 152 | governor interview 3.25 153 | practice institution 3.19 154 | 
century nation 3.16 155 | coast forest 3.15 156 | shore woodland 3.08 157 | drink car 3.04 158 | president medal 3.00 159 | prejudice recognition 3.00 160 | viewer serial 2.97 161 | peace insurance 2.94 162 | Mars water 2.94 163 | media gain 2.88 164 | precedent cognition 2.81 165 | announcement effort 2.75 166 | line insurance 2.69 167 | crane implement 2.69 168 | drink mother 2.65 169 | opera industry 2.63 170 | volunteer motto 2.56 171 | listing proximity 2.56 172 | precedent collection 2.50 173 | cup article 2.40 174 | sign recess 2.38 175 | problem airport 2.38 176 | reason hypertension 2.31 177 | direction combination 2.25 178 | Wednesday news 2.22 179 | glass magician 2.08 180 | cemetery woodland 2.08 181 | possibility girl 1.94 182 | cup substance 1.92 183 | forest graveyard 1.85 184 | stock egg 1.81 185 | month hotel 1.81 186 | energy secretary 1.81 187 | precedent group 1.77 188 | production hike 1.75 189 | stock phone 1.62 190 | holy sex 1.62 191 | stock CD 1.31 192 | drink ear 1.31 193 | delay racism 1.19 194 | stock life 0.92 195 | stock jaguar 0.92 196 | monk slave 0.92 197 | lad wizard 0.92 198 | sugar approach 0.88 199 | rooster voyage 0.62 200 | noon string 0.54 201 | chord smile 0.54 202 | professor cucumber 0.31 203 | king cabbage 0.23 204 | --------------------------------------------------------------------------------