├── README.md ├── distancecli ├── distancecli.c ├── makefile ├── phpword2vec.php └── w2v └── trunk ├── .svn ├── entries ├── format ├── pristine │ ├── 13 │ │ └── 13294f538c32fae2da1c85726695d77d70980247.svn-base │ ├── 21 │ │ └── 210681519593463cd6742bbec2abc1a253932bb1.svn-base │ ├── 23 │ │ └── 2334b431b808544014e14d0ddbb66ccb03d13277.svn-base │ ├── 51 │ │ └── 518a7ad549627c6ef8cf05b49408fcf0f6157460.svn-base │ ├── 72 │ │ └── 724bf0b7fd08d78098c1ccc622ada62ad58093ba.svn-base │ ├── 80 │ │ └── 80740eb5930e039b8002a6c2213cd152847a4169.svn-base │ ├── 83 │ │ └── 83a04fdb0a7cc66001a1abe29157acbe28321564.svn-base │ ├── 91 │ │ └── 91063b176c2f3543afd684071bf4677203917a52.svn-base │ ├── 2b │ │ └── 2b8b815229aa8a61e483fb4ba0588b8b6c491890.svn-base │ ├── 4e │ │ └── 4ea10e60b208f31ae965718f905268ac42fbf1ac.svn-base │ ├── 6f │ │ └── 6ffd58121b45291fcc42a5484d2e3f1ef1156b0d.svn-base │ ├── 8c │ │ └── 8ccd7b8850b84c7d306aebd933c2f1a26d264320.svn-base │ ├── 9a │ │ └── 9a7277255e393a35ce6a0738867c29304f43b55c.svn-base │ ├── c7 │ │ └── c7b37d6aa035fe7b53a54351b44ab577d2fd3337.svn-base │ ├── ea │ │ └── ea5f636000c445177e5f2f14af11f716b1e91bd0.svn-base │ ├── f4 │ │ └── f4f8420f4ff647df0f4196ceee895888fb7f63f7.svn-base │ └── fa │ │ └── fa92df4bbe788f2d51827c762c63bd8e470edf31.svn-base └── wc.db ├── LICENSE ├── README.txt ├── compute-accuracy.c ├── demo-analogy.sh ├── demo-classes.sh ├── demo-phrase-accuracy.sh ├── demo-phrases.sh ├── demo-train-big-model-v1.sh ├── demo-word-accuracy.sh ├── demo-word.sh ├── distance.c ├── makefile ├── questions-phrases.txt ├── questions-words.txt ├── vectors.bin ├── word-analogy.c ├── word2phrase.c └── word2vec.c /README.md: -------------------------------------------------------------------------------- 1 | # phpword2vec 2 | php调用word2vec实现机器学习 3 | ### 使用方法 4 | 执行make进行编译 5 | 执行phpphpword2vec.php可以得到当前关键词的文档向量(该工具是把300维向量转化文档向量的工具) 6 | php直接调用然后可以进行svm等分类操作 7 | 该工具在已经有训练数据后调用 8 | ### 项目地址 9 | github:https://github.com/qieangel2013/phpword2vec 10 | oschina:https://gitee.com/qieangel2013/phpword2vec 11 | ### 如果你对我的辛勤劳动给予肯定,请给我捐赠,你的捐赠是我最大的动力 12 | ![](https://github.com/qieangel2013/zys/blob/master/public/images/pw.jpg) 13 | ![](https://github.com/qieangel2013/zys/blob/master/public/images/pay.png) 14 | [项目捐赠列表](https://github.com/qieangel2013/zys/wiki/%E9%A1%B9%E7%9B%AE%E6%8D%90%E8%B5%A0) 15 | 16 | 17 | -------------------------------------------------------------------------------- /distancecli: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qieangel2013/phpword2vec/e12958b7bd6f5382ec1e14da3759f904dd5fe3e3/distancecli -------------------------------------------------------------------------------- /distancecli.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char *bestw[N]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn,mmn, bi[100]; 31 | //char ch; 32 | float *M; 33 | char *vocab; 34 | if (argc < 2) { 35 | printf("Usage: ./distance \nwhere FILE contains word projections in the BINARY FORMAT\n"); 36 | return 0; 37 | } 38 | strcpy(file_name, argv[1]); 39 | f = fopen(file_name, "rb"); 40 | if (f == NULL) { 41 | printf("Input file not found\n"); 42 | return -1; 43 | } 44 | fscanf(f, "%lld", &words); 45 | fscanf(f, "%lld", &size); 46 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 47 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 48 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 49 | if (M == NULL) { 50 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 51 | return -1; 52 | } 53 | for (b = 0; b < words; b++) { 54 | a = 0; 55 | while (1) { 56 | vocab[b * max_w + a] = fgetc(f); 57 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 58 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 59 | } 60 | vocab[b * max_w + a] = 0; 61 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 62 | len = 0; 63 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 64 | len = sqrt(len); 65 | for (a = 0; a < size; a++) M[a + b * size] /= len; 66 | } 67 | fclose(f); 68 | //while (1) { 69 | for (a = 0; a < N; a++) bestd[a] = 0; 70 | for (a = 0; a < N; a++) bestw[a][0] = 0; 71 | //printf("Enter word or sentence (EXIT to break): "); 72 | a = 0; 73 | //while (1) { 74 | //st1[a] = fgetc(stdin); 75 | for (mmn = 2; mmn < sizeof(argv); mmn++) { 76 | strcpy(st1,argv[2]); 77 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 78 | st1[a] = 0; 79 | break; 80 | } 81 | a++; 82 | } 83 | //if (!strcmp(st1, "EXIT")) break; 84 | cn = 0; 85 | b = 0; 86 | c = 0; 87 | while (1) { 88 | st[cn][b] = st1[c]; 89 | b++; 90 | c++; 91 | st[cn][b] = 0; 92 | if (st1[c] == 0) break; 93 | if (st1[c] == ' ') { 94 | cn++; 95 | b = 0; 96 | c++; 97 | } 98 | } 99 | cn++; 100 | for (a = 0; a < cn; a++) { 101 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 102 | if (b == words) b = -1; 103 | bi[a] = b; 104 | //printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 105 | if (b == -1) { 106 | //printf("Out of dictionary word!\n"); 107 | break; 108 | } 109 | } 110 | //if (b == -1) continue; 111 | //printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); 112 | for (a = 0; a < size; a++) vec[a] = 0; 113 | for (b = 0; b < cn; b++) { 114 | if (bi[b] == -1) continue; 115 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 116 | } 117 | len = 0; 118 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 119 | len = sqrt(len); 120 | for (a = 0; a < size; a++) vec[a] /= len; 121 | for (a = 0; a < N; a++) bestd[a] = -1; 122 | for (a = 0; a < N; a++) bestw[a][0] = 0; 123 | for (c = 0; c < words; c++) { 124 | a = 0; 125 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 126 | if (a == 1) 
continue; 127 | dist = 0; 128 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 129 | for (a = 0; a < N; a++) { 130 | if (dist > bestd[a]) { 131 | for (d = N - 1; d > a; d--) { 132 | bestd[d] = bestd[d - 1]; 133 | strcpy(bestw[d], bestw[d - 1]); 134 | } 135 | bestd[a] = dist; 136 | strcpy(bestw[a], &vocab[c * max_w]); 137 | break; 138 | } 139 | } 140 | } 141 | if(sizeof(bestw)>0 && strlen(bestw[0])!=0){ 142 | for (a = 0; a < N; a++) printf("%s,%f\n", bestw[a], bestd[a]); 143 | }else{ 144 | printf("%s\n", bestw[a]); 145 | } 146 | 147 | //} 148 | return 0; 149 | } 150 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions 3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result 4 | 5 | all: distancecli 6 | 7 | distancecli : distancecli.c 8 | $(CC) distancecli.c -o distancecli $(CFLAGS) 9 | 10 | clean: 11 | rm -rf distancecli -------------------------------------------------------------------------------- /phpword2vec.php: -------------------------------------------------------------------------------- 1 | $v) { 17 | $tmpdata=explode(",",$v); 18 | if($tmpdata[0]==$keyword){ 19 | $resultdata=$tmpdata; 20 | break; 21 | } 22 | } 23 | return $resultdata; 24 | } else { 25 | return array(); 26 | } 27 | } 28 | print_r(distance("警察")); 29 | ?> 30 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/entries: -------------------------------------------------------------------------------- 1 | 12 2 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/format: -------------------------------------------------------------------------------- 1 | 12 2 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/13/13294f538c32fae2da1c85726695d77d70980247.svn-base: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./distance vectors.bin 8 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/21/210681519593463cd6742bbec2abc1a253932bb1.svn-base: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char *bestw[N]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | char ch; 32 | float *M; 33 | char *vocab; 34 | if (argc < 2) { 35 | printf("Usage: ./distance \nwhere FILE contains word projections in the BINARY FORMAT\n"); 36 | return 0; 37 | } 38 | strcpy(file_name, argv[1]); 39 | f = fopen(file_name, "rb"); 40 | if (f == NULL) { 41 | printf("Input file not found\n"); 42 | return -1; 43 | } 44 | fscanf(f, "%lld", &words); 45 | fscanf(f, "%lld", &size); 46 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 47 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 48 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 49 | if (M == NULL) { 50 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 51 | return -1; 52 | } 53 | for (b = 0; b < words; b++) { 54 | a = 0; 55 | while (1) { 56 | vocab[b * max_w + a] = fgetc(f); 57 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 58 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 59 | } 60 | vocab[b * max_w + a] = 0; 61 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 62 | len = 0; 63 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 64 | len = sqrt(len); 65 | for (a = 0; a < size; a++) M[a + b * size] /= len; 66 | } 67 | fclose(f); 68 | while (1) { 69 | for (a = 0; a < N; a++) bestd[a] = 0; 70 | for (a = 0; a < N; a++) bestw[a][0] = 0; 71 | printf("Enter word or sentence (EXIT to break): "); 72 | a = 0; 73 | while (1) { 74 | st1[a] = fgetc(stdin); 75 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 76 | st1[a] = 0; 77 | break; 78 | } 79 | a++; 80 | } 81 | if (!strcmp(st1, "EXIT")) break; 82 | cn = 0; 83 | b = 0; 84 | c = 0; 85 | while (1) { 86 | st[cn][b] = st1[c]; 87 | b++; 88 | c++; 89 | st[cn][b] = 0; 90 | if (st1[c] == 0) break; 91 | if (st1[c] == ' ') { 92 | cn++; 93 | b = 0; 94 | c++; 95 | } 96 | } 97 | cn++; 98 | for (a = 0; a < cn; a++) { 99 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 100 | if (b == words) b = -1; 101 | bi[a] = b; 102 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 103 | if (b == -1) { 104 | printf("Out of dictionary word!\n"); 105 | break; 106 | } 107 | } 108 | if (b == -1) continue; 109 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); 110 | for (a = 0; a < size; a++) vec[a] = 0; 111 | for (b = 0; b < cn; b++) { 112 | if (bi[b] == -1) continue; 113 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 114 | } 115 | len = 0; 116 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 117 | len = sqrt(len); 118 | for (a = 0; a < size; a++) vec[a] /= len; 119 | for (a = 0; a < N; a++) bestd[a] = -1; 120 | for (a = 0; a < N; a++) bestw[a][0] = 0; 121 | for (c = 0; c < words; c++) { 122 | a = 0; 123 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 124 | if (a == 1) continue; 125 | dist = 0; 126 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 127 | 
for (a = 0; a < N; a++) { 128 | if (dist > bestd[a]) { 129 | for (d = N - 1; d > a; d--) { 130 | bestd[d] = bestd[d - 1]; 131 | strcpy(bestw[d], bestw[d - 1]); 132 | } 133 | bestd[a] = dist; 134 | strcpy(bestw[a], &vocab[c * max_w]); 135 | break; 136 | } 137 | } 138 | } 139 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 140 | } 141 | return 0; 142 | } 143 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/23/2334b431b808544014e14d0ddbb66ccb03d13277.svn-base: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | const long long max_size = 2000; // max length of strings 23 | const long long N = 1; // number of closest words 24 | const long long max_w = 50; // max length of vocabulary entries 25 | 26 | int main(int argc, char **argv) 27 | { 28 | FILE *f; 29 | char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch; 30 | float dist, len, bestd[N], vec[max_size]; 31 | long long words, size, a, b, c, d, b1, b2, b3, threshold = 0; 32 | float *M; 33 | char *vocab; 34 | int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0; 35 | if (argc < 2) { 36 | printf("Usage: ./compute-accuracy \nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n"); 37 | return 0; 38 | } 39 | strcpy(file_name, argv[1]); 40 | if (argc > 2) threshold = atoi(argv[2]); 41 | f = fopen(file_name, "rb"); 42 | if (f == NULL) { 43 | printf("Input file not found\n"); 44 | return -1; 45 | } 46 | fscanf(f, "%lld", &words); 47 | if (threshold) if (words > threshold) words = threshold; 48 | fscanf(f, "%lld", &size); 49 | vocab = (char *)malloc(words * max_w * sizeof(char)); 50 | M = (float *)malloc(words * size * sizeof(float)); 51 | if (M == NULL) { 52 | printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576); 53 | return -1; 54 | } 55 | for (b = 0; b < words; b++) { 56 | a = 0; 57 | while (1) { 58 | vocab[b * max_w + a] = fgetc(f); 59 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 60 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 61 | } 62 | vocab[b * max_w + a] = 0; 63 | for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]); 64 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 65 | len = 0; 66 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 67 | len = sqrt(len); 68 | for (a = 0; a < size; a++) M[a + b * size] /= len; 69 | } 70 | fclose(f); 71 | TCN = 0; 72 | while (1) { 73 | for (a = 0; a < N; a++) bestd[a] = 0; 74 | for (a = 0; a < N; a++) 
bestw[a][0] = 0; 75 | scanf("%s", st1); 76 | for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]); 77 | if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) { 78 | if (TCN == 0) TCN = 1; 79 | if (QID != 0) { 80 | printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN); 81 | printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100); 82 | } 83 | QID++; 84 | scanf("%s", st1); 85 | if (feof(stdin)) break; 86 | printf("%s:\n", st1); 87 | TCN = 0; 88 | CCN = 0; 89 | continue; 90 | } 91 | if (!strcmp(st1, "EXIT")) break; 92 | scanf("%s", st2); 93 | for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]); 94 | scanf("%s", st3); 95 | for (a = 0; a bestd[a]) { 122 | for (d = N - 1; d > a; d--) { 123 | bestd[d] = bestd[d - 1]; 124 | strcpy(bestw[d], bestw[d - 1]); 125 | } 126 | bestd[a] = dist; 127 | strcpy(bestw[a], &vocab[c * max_w]); 128 | break; 129 | } 130 | } 131 | } 132 | if (!strcmp(st4, bestw[0])) { 133 | CCN++; 134 | CACN++; 135 | if (QID <= 5) SEAC++; else SYAC++; 136 | } 137 | if (QID <= 5) SECN++; else SYCN++; 138 | TCN++; 139 | TACN++; 140 | } 141 | printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, TQS/(float)TQ*100); 142 | return 0; 143 | } 144 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/2b/2b8b815229aa8a61e483fb4ba0588b8b6c491890.svn-base: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/4e/4ea10e60b208f31ae965718f905268ac42fbf1ac.svn-base: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define MAX_STRING 60 22 | 23 | const int vocab_hash_size = 500000000; // Maximum 500M entries in the vocabulary 24 | 25 | typedef float real; // Precision of float numbers 26 | 27 | struct vocab_word { 28 | long long cn; 29 | char *word; 30 | }; 31 | 32 | char train_file[MAX_STRING], output_file[MAX_STRING]; 33 | struct vocab_word *vocab; 34 | int debug_mode = 2, min_count = 5, *vocab_hash, min_reduce = 1; 35 | long long vocab_max_size = 10000, vocab_size = 0; 36 | long long train_words = 0; 37 | real threshold = 100; 38 | 39 | unsigned long long next_random = 1; 40 | 41 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 42 | void ReadWord(char *word, FILE *fin) { 43 | int a = 0, ch; 44 | while (!feof(fin)) { 45 | ch = fgetc(fin); 46 | if (ch == 13) continue; 47 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 48 | if (a > 0) { 49 | if (ch == '\n') ungetc(ch, fin); 50 | break; 51 | } 52 | if (ch == '\n') { 53 | strcpy(word, (char *)""); 54 | return; 55 | } else continue; 56 | } 57 | word[a] = ch; 58 | a++; 59 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 60 | } 61 | word[a] = 0; 62 | } 63 | 64 | // Returns hash value of a word 65 | int GetWordHash(char *word) { 66 | unsigned long long a, hash = 1; 67 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 68 | hash = hash % vocab_hash_size; 69 | return hash; 70 | } 71 | 72 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 73 | int SearchVocab(char *word) { 74 | unsigned int hash = GetWordHash(word); 75 | while (1) { 76 | if (vocab_hash[hash] == -1) return -1; 77 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 78 | hash = (hash + 1) % vocab_hash_size; 79 | } 80 | return -1; 81 | } 82 | 83 | // Reads a word and returns its index in the vocabulary 84 | int ReadWordIndex(FILE *fin) { 85 | char word[MAX_STRING]; 86 | ReadWord(word, fin); 87 | if (feof(fin)) return -1; 88 | return SearchVocab(word); 89 | } 90 | 91 | // Adds a word to the vocabulary 92 | int AddWordToVocab(char *word) { 93 | unsigned int hash, length = strlen(word) + 1; 94 | if (length > MAX_STRING) length = MAX_STRING; 95 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 96 | strcpy(vocab[vocab_size].word, word); 97 | vocab[vocab_size].cn = 0; 98 | vocab_size++; 99 | // Reallocate memory if needed 100 | if (vocab_size + 2 >= vocab_max_size) { 101 | vocab_max_size += 10000; 102 | vocab=(struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 103 | } 104 | hash = GetWordHash(word); 105 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 106 | vocab_hash[hash]=vocab_size - 1; 107 | return vocab_size - 1; 108 | } 109 | 110 | // Used later for sorting by word counts 111 | int VocabCompare(const void *a, const void *b) { 112 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 113 | } 114 | 115 | // Sorts the vocabulary by frequency using word counts 116 
| void SortVocab() { 117 | int a; 118 | unsigned int hash; 119 | // Sort the vocabulary and keep at the first position 120 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 121 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 122 | for (a = 0; a < vocab_size; a++) { 123 | // Words occuring less than min_count times will be discarded from the vocab 124 | if (vocab[a].cn < min_count) { 125 | vocab_size--; 126 | free(vocab[vocab_size].word); 127 | } else { 128 | // Hash will be re-computed, as after the sorting it is not actual 129 | hash = GetWordHash(vocab[a].word); 130 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 131 | vocab_hash[hash] = a; 132 | } 133 | } 134 | vocab = (struct vocab_word *)realloc(vocab, vocab_size * sizeof(struct vocab_word)); 135 | } 136 | 137 | // Reduces the vocabulary by removing infrequent tokens 138 | void ReduceVocab() { 139 | int a, b = 0; 140 | unsigned int hash; 141 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 142 | vocab[b].cn = vocab[a].cn; 143 | vocab[b].word = vocab[a].word; 144 | b++; 145 | } else free(vocab[a].word); 146 | vocab_size = b; 147 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 148 | for (a = 0; a < vocab_size; a++) { 149 | // Hash will be re-computed, as it is not actual 150 | hash = GetWordHash(vocab[a].word); 151 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 152 | vocab_hash[hash] = a; 153 | } 154 | fflush(stdout); 155 | min_reduce++; 156 | } 157 | 158 | void LearnVocabFromTrainFile() { 159 | char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2]; 160 | FILE *fin; 161 | long long a, i, start = 1; 162 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 163 | fin = fopen(train_file, "rb"); 164 | if (fin == NULL) { 165 | printf("ERROR: training data file not found!\n"); 166 | exit(1); 167 | } 168 | vocab_size = 0; 169 | AddWordToVocab((char *)""); 170 | while (1) { 171 | ReadWord(word, fin); 172 | if (feof(fin)) break; 173 | if (!strcmp(word, "")) { 174 | start = 1; 175 | continue; 176 | } else start = 0; 177 | train_words++; 178 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 179 | printf("Words processed: %lldK Vocab size: %lldK %c", train_words / 1000, vocab_size / 1000, 13); 180 | fflush(stdout); 181 | } 182 | i = SearchVocab(word); 183 | if (i == -1) { 184 | a = AddWordToVocab(word); 185 | vocab[a].cn = 1; 186 | } else vocab[i].cn++; 187 | if (start) continue; 188 | sprintf(bigram_word, "%s_%s", last_word, word); 189 | bigram_word[MAX_STRING - 1] = 0; 190 | strcpy(last_word, word); 191 | i = SearchVocab(bigram_word); 192 | if (i == -1) { 193 | a = AddWordToVocab(bigram_word); 194 | vocab[a].cn = 1; 195 | } else vocab[i].cn++; 196 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 197 | } 198 | SortVocab(); 199 | if (debug_mode > 0) { 200 | printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size); 201 | printf("Words in train file: %lld\n", train_words); 202 | } 203 | fclose(fin); 204 | } 205 | 206 | void TrainModel() { 207 | long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0; 208 | char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2]; 209 | real score; 210 | FILE *fo, *fin; 211 | printf("Starting training using file %s\n", train_file); 212 | LearnVocabFromTrainFile(); 213 | fin = fopen(train_file, "rb"); 214 | fo = fopen(output_file, "wb"); 215 | word[0] = 0; 216 | while (1) { 217 | strcpy(last_word, word); 218 | ReadWord(word, fin); 219 | if 
(feof(fin)) break; 220 | if (!strcmp(word, "")) { 221 | fprintf(fo, "\n"); 222 | continue; 223 | } 224 | cn++; 225 | if ((debug_mode > 1) && (cn % 100000 == 0)) { 226 | printf("Words written: %lldK%c", cn / 1000, 13); 227 | fflush(stdout); 228 | } 229 | oov = 0; 230 | i = SearchVocab(word); 231 | if (i == -1) oov = 1; else pb = vocab[i].cn; 232 | if (li == -1) oov = 1; 233 | li = i; 234 | sprintf(bigram_word, "%s_%s", last_word, word); 235 | bigram_word[MAX_STRING - 1] = 0; 236 | i = SearchVocab(bigram_word); 237 | if (i == -1) oov = 1; else pab = vocab[i].cn; 238 | if (pa < min_count) oov = 1; 239 | if (pb < min_count) oov = 1; 240 | if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words; 241 | if (score > threshold) { 242 | fprintf(fo, "_%s", word); 243 | pb = 0; 244 | } else fprintf(fo, " %s", word); 245 | pa = pb; 246 | } 247 | fclose(fo); 248 | fclose(fin); 249 | } 250 | 251 | int ArgPos(char *str, int argc, char **argv) { 252 | int a; 253 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 254 | if (a == argc - 1) { 255 | printf("Argument missing for %s\n", str); 256 | exit(1); 257 | } 258 | return a; 259 | } 260 | return -1; 261 | } 262 | 263 | int main(int argc, char **argv) { 264 | int i; 265 | if (argc == 1) { 266 | printf("WORD2PHRASE tool v0.1a\n\n"); 267 | printf("Options:\n"); 268 | printf("Parameters for training:\n"); 269 | printf("\t-train \n"); 270 | printf("\t\tUse text data from to train the model\n"); 271 | printf("\t-output \n"); 272 | printf("\t\tUse to save the resulting word vectors / word clusters / phrases\n"); 273 | printf("\t-min-count \n"); 274 | printf("\t\tThis will discard words that appear less than times; default is 5\n"); 275 | printf("\t-threshold \n"); 276 | printf("\t\t The value represents threshold for forming the phrases (higher means less phrases); default 100\n"); 277 | printf("\t-debug \n"); 278 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 279 | printf("\nExamples:\n"); 280 | printf("./word2phrase -train text.txt -output phrases.txt -threshold 100 -debug 2\n\n"); 281 | return 0; 282 | } 283 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 284 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 285 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 286 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 287 | if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) threshold = atof(argv[i + 1]); 288 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 289 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 290 | TrainModel(); 291 | return 0; 292 | } 293 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/51/518a7ad549627c6ef8cf05b49408fcf0f6157460.svn-base: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./compute-accuracy vectors-phrase.bin < questions-phrases.txt 12 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/72/724bf0b7fd08d78098c1ccc622ada62ad58093ba.svn-base: -------------------------------------------------------------------------------- 1 | ############################################################################################### 2 | # 3 | # Script for training good word and phrase vector model using public corpora, version 1.0. 4 | # The training time will be from several hours to about a day. 5 | # 6 | # Downloads about 8 billion words, makes phrases using two runs of word2phrase, trains 7 | # a 500-dimensional vector model and evaluates it on word and phrase analogy tasks. 8 | # 9 | ############################################################################################### 10 | 11 | # This function will convert text to lowercase and remove special characters 12 | normalize_text() { 13 | awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \ 14 | -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/
/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \ 15 | -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \ 16 | -e 's/«/ /g' | tr 0-9 " " 17 | } 18 | 19 | mkdir word2vec 20 | cd word2vec 21 | 22 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 23 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz 24 | gzip -d news.2012.en.shuffled.gz 25 | gzip -d news.2013.en.shuffled.gz 26 | normalize_text < news.2012.en.shuffled > data.txt 27 | normalize_text < news.2013.en.shuffled >> data.txt 28 | 29 | wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz 30 | tar -xvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz 31 | for i in `ls 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled`; do 32 | normalize_text < 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/$i >> data.txt 33 | done 34 | 35 | wget http://ebiquity.umbc.edu/redirect/to/resource/id/351/UMBC-webbase-corpus 36 | tar -zxvf umbc_webbase_corpus.tar.gz webbase_all/*.txt 37 | for i in `ls webbase_all`; do 38 | normalize_text < webbase_all/$i >> data.txt 39 | done 40 | 41 | wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 42 | bzip2 -c -d enwiki-latest-pages-articles.xml.bz2 | awk '{print tolower($0);}' | perl -e ' 43 | # Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase 44 | # letters (a-z, converted from A-Z), and spaces (never consecutive)... 45 | # All other characters are converted to spaces. Only text which normally appears. 46 | # in the web browser is displayed. Tables are removed. Image captions are. 47 | # preserved. Links are converted to normal text. Digits are spelled out. 48 | # *** Modified to not spell digits or throw away non-ASCII characters *** 49 | 50 | # Written by Matt Mahoney, June 10, 2006. This program is released to the public domain. 51 | 52 | $/=">"; # input record separator 53 | while (<>) { 54 | if (/ ... 55 | if (/#redirect/i) {$text=0;} # remove #REDIRECT 56 | if ($text) { 57 | 58 | # Remove any text not normally visible 59 | if (/<\/text>/) {$text=0;} 60 | s/<.*>//; # remove xml tags 61 | s/&/&/g; # decode URL encoded chars 62 | s/<//g; 64 | s///g; # remove references ... 
65 | s/<[^>]*>//g; # remove xhtml tags 66 | s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text 67 | s/\|thumb//ig; # remove images links, preserve caption 68 | s/\|left//ig; 69 | s/\|right//ig; 70 | s/\|\d+px//ig; 71 | s/\[\[image:[^\[\]]*\|//ig; 72 | s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup 73 | s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages 74 | s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text 75 | s/{{[^}]*}}//g; # remove {{icons}} and {tables} 76 | s/{[^}]*}//g; 77 | s/\[//g; # remove [ and ] 78 | s/\]//g; 79 | s/&[^;]*;/ /g; # remove URL encoded chars 80 | 81 | $_=" $_ "; 82 | chop; 83 | print $_; 84 | } 85 | } 86 | ' | normalize_text | awk '{if (NF>1) print;}' >> data.txt 87 | 88 | wget http://word2vec.googlecode.com/svn/trunk/word2vec.c 89 | wget http://word2vec.googlecode.com/svn/trunk/word2phrase.c 90 | wget http://word2vec.googlecode.com/svn/trunk/compute-accuracy.c 91 | wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt 92 | wget http://word2vec.googlecode.com/svn/trunk/questions-phrases.txt 93 | gcc word2vec.c -o word2vec -lm -pthread -O3 -march=native -funroll-loops 94 | gcc word2phrase.c -o word2phrase -lm -pthread -O3 -march=native -funroll-loops 95 | gcc compute-accuracy.c -o compute-accuracy -lm -pthread -O3 -march=native -funroll-loops 96 | ./word2phrase -train data.txt -output data-phrase.txt -threshold 200 -debug 2 97 | ./word2phrase -train data-phrase.txt -output data-phrase2.txt -threshold 100 -debug 2 98 | ./word2vec -train data-phrase2.txt -output vectors.bin -cbow 1 -size 500 -window 10 -negative 10 -hs 0 -sample 1e-5 -threads 40 -binary 1 -iter 3 -min-count 10 99 | ./compute-accuracy vectors.bin 400000 < questions-words.txt # should get to almost 78% accuracy on 99.7% of questions 100 | ./compute-accuracy vectors.bin 1000000 < questions-phrases.txt # about 78% accuracy with 77% coverage 101 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/80/80740eb5930e039b8002a6c2213cd152847a4169.svn-base: -------------------------------------------------------------------------------- 1 | Tools for computing distributed representtion of words 2 | ------------------------------------------------------ 3 | 4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram model (SG), as well as several demo scripts. 5 | 6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous 7 | Bag-of-Words or the Skip-Gram neural network architectures. The user should to specify the following: 8 | - desired vector dimensionality 9 | - the size of the context window for either the Skip-Gram or the Continuous Bag-of-Words model 10 | - training algorithm: hierarchical softmax and / or negative sampling 11 | - threshold for downsampling the frequent words 12 | - number of threads to use 13 | - the format of the output word vector file (text or binary) 14 | 15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets. 16 | 17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training 18 | is finished, the user can interactively explore the similarity of the words. 
19 | 20 | More information about the scripts is provided at https://code.google.com/p/word2vec/ 21 | 22 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/83/83a04fdb0a7cc66001a1abe29157acbe28321564.svn-base: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./distance vectors-phrase.bin 12 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/8c/8ccd7b8850b84c7d306aebd933c2f1a26d264320.svn-base: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions 3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result 4 | 5 | all: word2vec word2phrase distance word-analogy compute-accuracy 6 | 7 | word2vec : word2vec.c 8 | $(CC) word2vec.c -o word2vec $(CFLAGS) 9 | word2phrase : word2phrase.c 10 | $(CC) word2phrase.c -o word2phrase $(CFLAGS) 11 | distance : distance.c 12 | $(CC) distance.c -o distance $(CFLAGS) 13 | word-analogy : word-analogy.c 14 | $(CC) word-analogy.c -o word-analogy $(CFLAGS) 15 | compute-accuracy : compute-accuracy.c 16 | $(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS) 17 | chmod +x *.sh 18 | 19 | clean: 20 | rm -rf word2vec word2phrase distance word-analogy compute-accuracy -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/91/91063b176c2f3543afd684071bf4677203917a52.svn-base: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./compute-accuracy vectors.bin 30000 < questions-words.txt 8 | # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt 9 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/9a/9a7277255e393a35ce6a0738867c29304f43b55c.svn-base: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char bestw[N][max_size]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | char ch; 32 | float *M; 33 | char *vocab; 34 | if (argc < 2) { 35 | printf("Usage: ./word-analogy \nwhere FILE contains word projections in the BINARY FORMAT\n"); 36 | return 0; 37 | } 38 | strcpy(file_name, argv[1]); 39 | f = fopen(file_name, "rb"); 40 | if (f == NULL) { 41 | printf("Input file not found\n"); 42 | return -1; 43 | } 44 | fscanf(f, "%lld", &words); 45 | fscanf(f, "%lld", &size); 46 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 47 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 48 | if (M == NULL) { 49 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 50 | return -1; 51 | } 52 | for (b = 0; b < words; b++) { 53 | a = 0; 54 | while (1) { 55 | vocab[b * max_w + a] = fgetc(f); 56 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 57 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 58 | } 59 | vocab[b * max_w + a] = 0; 60 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 61 | len = 0; 62 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 63 | len = sqrt(len); 64 | for (a = 0; a < size; a++) M[a + b * size] /= len; 65 | } 66 | fclose(f); 67 | while (1) { 68 | for (a = 0; a < N; a++) bestd[a] = 0; 69 | for (a = 0; a < N; a++) bestw[a][0] = 0; 70 | printf("Enter three words (EXIT to break): "); 71 | a = 0; 72 | while (1) { 73 | st1[a] = fgetc(stdin); 74 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 75 | st1[a] = 0; 76 | break; 77 | } 78 | a++; 79 | } 80 | if (!strcmp(st1, "EXIT")) break; 81 | cn = 0; 82 | b = 0; 83 | c = 0; 84 | while (1) { 85 | st[cn][b] = st1[c]; 86 | b++; 87 | c++; 88 | st[cn][b] = 0; 89 | if (st1[c] == 0) break; 90 | if (st1[c] == ' ') { 91 | cn++; 92 | b = 0; 93 | c++; 94 | } 95 | } 96 | cn++; 97 | if (cn < 3) { 98 | printf("Only %lld words were entered.. 
three words are needed at the input to perform the calculation\n", cn); 99 | continue; 100 | } 101 | for (a = 0; a < cn; a++) { 102 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 103 | if (b == words) b = 0; 104 | bi[a] = b; 105 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 106 | if (b == 0) { 107 | printf("Out of dictionary word!\n"); 108 | break; 109 | } 110 | } 111 | if (b == 0) continue; 112 | printf("\n Word Distance\n------------------------------------------------------------------------\n"); 113 | for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size]; 114 | len = 0; 115 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 116 | len = sqrt(len); 117 | for (a = 0; a < size; a++) vec[a] /= len; 118 | for (a = 0; a < N; a++) bestd[a] = 0; 119 | for (a = 0; a < N; a++) bestw[a][0] = 0; 120 | for (c = 0; c < words; c++) { 121 | if (c == bi[0]) continue; 122 | if (c == bi[1]) continue; 123 | if (c == bi[2]) continue; 124 | a = 0; 125 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 126 | if (a == 1) continue; 127 | dist = 0; 128 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 129 | for (a = 0; a < N; a++) { 130 | if (dist > bestd[a]) { 131 | for (d = N - 1; d > a; d--) { 132 | bestd[d] = bestd[d - 1]; 133 | strcpy(bestw[d], bestw[d - 1]); 134 | } 135 | bestd[a] = dist; 136 | strcpy(bestw[a], &vocab[c * max_w]); 137 | break; 138 | } 139 | } 140 | } 141 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 142 | } 143 | return 0; 144 | } 145 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/c7/c7b37d6aa035fe7b53a54351b44ab577d2fd3337.svn-base: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include <stdio.h> 16 | #include <stdlib.h> 17 | #include <string.h> 18 | #include <math.h> 19 | #include <pthread.h> 20 | 21 | #define MAX_STRING 100 22 | #define EXP_TABLE_SIZE 1000 23 | #define MAX_EXP 6 24 | #define MAX_SENTENCE_LENGTH 1000 25 | #define MAX_CODE_LENGTH 40 26 | 27 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 28 | 29 | typedef float real; // Precision of float numbers 30 | 31 | struct vocab_word { 32 | long long cn; 33 | int *point; 34 | char *word, *code, codelen; 35 | }; 36 | 37 | char train_file[MAX_STRING], output_file[MAX_STRING]; 38 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 39 | struct vocab_word *vocab; 40 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1; 41 | int *vocab_hash; 42 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100; 43 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0; 44 | real alpha = 0.025, starting_alpha, sample = 1e-3; 45 | real *syn0, *syn1, *syn1neg, *expTable; 46 | clock_t start; 47 | 48 | int hs = 0, negative = 5; 49 | const int table_size = 1e8; 50 | int *table; 51 | 52 | void InitUnigramTable() { 53 | int a, i; 54 | long long train_words_pow = 0; 55 | real d1, power = 0.75; 56 | table = (int *)malloc(table_size * sizeof(int)); 57 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 58 | i = 0; 59 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow; 60 | for (a = 0; a < table_size; a++) { 61 | table[a] = i; 62 | if (a / (real)table_size > d1) { 63 | i++; 64 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow; 65 | } 66 | if (i >= vocab_size) i = vocab_size - 1; 67 | } 68 | } 69 | 70 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 71 | void ReadWord(char *word, FILE *fin) { 72 | int a = 0, ch; 73 | while (!feof(fin)) { 74 | ch = fgetc(fin); 75 | if (ch == 13) continue; 76 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 77 | if (a > 0) { 78 | if (ch == '\n') ungetc(ch, fin); 79 | break; 80 | } 81 | if (ch == '\n') { 82 | strcpy(word, (char *)"</s>"); // newline acts as the end-of-sentence token 83 | return; 84 | } else continue; 85 | } 86 | word[a] = ch; 87 | a++; 88 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 89 | } 90 | word[a] = 0; 91 | } 92 | 93 | // Returns hash value of a word 94 | int GetWordHash(char *word) { 95 | unsigned long long a, hash = 0; 96 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 97 | hash = hash % vocab_hash_size; 98 | return hash; 99 | } 100 | 101 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 102 | int SearchVocab(char *word) { 103 | unsigned int hash = GetWordHash(word); 104 | while (1) { 105 | if (vocab_hash[hash] == -1) return -1; 106 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 107 | hash = (hash + 1) % vocab_hash_size; 108 | } 109 | return -1; 110 | } 111 | 112 | // Reads a word and returns its index in the vocabulary 113 | int ReadWordIndex(FILE *fin) { 114 | char word[MAX_STRING]; 115 | ReadWord(word, fin); 116 | if (feof(fin)) return -1; 117 | return SearchVocab(word); 118 | } 119 | 120 | // Adds a word to the vocabulary 121 | int AddWordToVocab(char *word) { 122 | unsigned int hash, length = strlen(word) + 1; 123 | if (length > MAX_STRING) length = MAX_STRING; 124 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 125 | strcpy(vocab[vocab_size].word, word); 126 | vocab[vocab_size].cn = 0; 127 | vocab_size++; 128 | // Reallocate 
memory if needed 129 | if (vocab_size + 2 >= vocab_max_size) { 130 | vocab_max_size += 1000; 131 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 132 | } 133 | hash = GetWordHash(word); 134 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 135 | vocab_hash[hash] = vocab_size - 1; 136 | return vocab_size - 1; 137 | } 138 | 139 | // Used later for sorting by word counts 140 | int VocabCompare(const void *a, const void *b) { 141 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 142 | } 143 | 144 | // Sorts the vocabulary by frequency using word counts 145 | void SortVocab() { 146 | int a, size; 147 | unsigned int hash; 148 | // Sort the vocabulary and keep </s> at the first position 149 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 150 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 151 | size = vocab_size; 152 | train_words = 0; 153 | for (a = 0; a < size; a++) { 154 | // Words occurring less than min_count times will be discarded from the vocab 155 | if ((vocab[a].cn < min_count) && (a != 0)) { 156 | vocab_size--; 157 | free(vocab[a].word); 158 | } else { 159 | // Hash will be re-computed, as it is no longer valid after the sorting 160 | hash = GetWordHash(vocab[a].word); 161 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 162 | vocab_hash[hash] = a; 163 | train_words += vocab[a].cn; 164 | } 165 | } 166 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 167 | // Allocate memory for the binary tree construction 168 | for (a = 0; a < vocab_size; a++) { 169 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 170 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 171 | } 172 | } 173 | 174 | // Reduces the vocabulary by removing infrequent tokens 175 | void ReduceVocab() { 176 | int a, b = 0; 177 | unsigned int hash; 178 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 179 | vocab[b].cn = vocab[a].cn; 180 | vocab[b].word = vocab[a].word; 181 | b++; 182 | } else free(vocab[a].word); 183 | vocab_size = b; 184 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 185 | for (a = 0; a < vocab_size; a++) { 186 | // Hash will be re-computed, as it is no longer valid 187 | hash = GetWordHash(vocab[a].word); 188 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 189 | vocab_hash[hash] = a; 190 | } 191 | fflush(stdout); 192 | min_reduce++; 193 | } 194 | 195 | // Create binary Huffman tree using the word counts 196 | // Frequent words will have short unique binary codes 197 | void CreateBinaryTree() { 198 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 199 | char code[MAX_CODE_LENGTH]; 200 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 201 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 202 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 203 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 204 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 205 | pos1 = vocab_size - 1; 206 | pos2 = vocab_size; 207 | // Following algorithm constructs the Huffman tree by adding one node at a time 208 | for (a = 0; a < vocab_size - 1; a++) { 209 | // First, find two smallest nodes 'min1, min2' 210 | if (pos1 >= 0) { 211 | if (count[pos1] < count[pos2]) { 212 | min1i = pos1; 213 | pos1--; 214 | } else { 215 | min1i = pos2; 216 | pos2++; 
217 | } 218 | } else { 219 | min1i = pos2; 220 | pos2++; 221 | } 222 | if (pos1 >= 0) { 223 | if (count[pos1] < count[pos2]) { 224 | min2i = pos1; 225 | pos1--; 226 | } else { 227 | min2i = pos2; 228 | pos2++; 229 | } 230 | } else { 231 | min2i = pos2; 232 | pos2++; 233 | } 234 | count[vocab_size + a] = count[min1i] + count[min2i]; 235 | parent_node[min1i] = vocab_size + a; 236 | parent_node[min2i] = vocab_size + a; 237 | binary[min2i] = 1; 238 | } 239 | // Now assign binary code to each vocabulary word 240 | for (a = 0; a < vocab_size; a++) { 241 | b = a; 242 | i = 0; 243 | while (1) { 244 | code[i] = binary[b]; 245 | point[i] = b; 246 | i++; 247 | b = parent_node[b]; 248 | if (b == vocab_size * 2 - 2) break; 249 | } 250 | vocab[a].codelen = i; 251 | vocab[a].point[0] = vocab_size - 2; 252 | for (b = 0; b < i; b++) { 253 | vocab[a].code[i - b - 1] = code[b]; 254 | vocab[a].point[i - b] = point[b] - vocab_size; 255 | } 256 | } 257 | free(count); 258 | free(binary); 259 | free(parent_node); 260 | } 261 | 262 | void LearnVocabFromTrainFile() { 263 | char word[MAX_STRING]; 264 | FILE *fin; 265 | long long a, i; 266 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 267 | fin = fopen(train_file, "rb"); 268 | if (fin == NULL) { 269 | printf("ERROR: training data file not found!\n"); 270 | exit(1); 271 | } 272 | vocab_size = 0; 273 | AddWordToVocab((char *)"</s>"); 274 | while (1) { 275 | ReadWord(word, fin); 276 | if (feof(fin)) break; 277 | train_words++; 278 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 279 | printf("%lldK%c", train_words / 1000, 13); 280 | fflush(stdout); 281 | } 282 | i = SearchVocab(word); 283 | if (i == -1) { 284 | a = AddWordToVocab(word); 285 | vocab[a].cn = 1; 286 | } else vocab[i].cn++; 287 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 288 | } 289 | SortVocab(); 290 | if (debug_mode > 0) { 291 | printf("Vocab size: %lld\n", vocab_size); 292 | printf("Words in train file: %lld\n", train_words); 293 | } 294 | file_size = ftell(fin); 295 | fclose(fin); 296 | } 297 | 298 | void SaveVocab() { 299 | long long i; 300 | FILE *fo = fopen(save_vocab_file, "wb"); 301 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 302 | fclose(fo); 303 | } 304 | 305 | void ReadVocab() { 306 | long long a, i = 0; 307 | char c; 308 | char word[MAX_STRING]; 309 | FILE *fin = fopen(read_vocab_file, "rb"); 310 | if (fin == NULL) { 311 | printf("Vocabulary file not found\n"); 312 | exit(1); 313 | } 314 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 315 | vocab_size = 0; 316 | while (1) { 317 | ReadWord(word, fin); 318 | if (feof(fin)) break; 319 | a = AddWordToVocab(word); 320 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); 321 | i++; 322 | } 323 | SortVocab(); 324 | if (debug_mode > 0) { 325 | printf("Vocab size: %lld\n", vocab_size); 326 | printf("Words in train file: %lld\n", train_words); 327 | } 328 | fin = fopen(train_file, "rb"); 329 | if (fin == NULL) { 330 | printf("ERROR: training data file not found!\n"); 331 | exit(1); 332 | } 333 | fseek(fin, 0, SEEK_END); 334 | file_size = ftell(fin); 335 | fclose(fin); 336 | } 337 | 338 | void InitNet() { 339 | long long a, b; 340 | unsigned long long next_random = 1; 341 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)); 342 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} 343 | if (hs) { 344 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); 345 | if (syn1 == NULL) 
{printf("Memory allocation failed\n"); exit(1);} 346 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 347 | syn1[a * layer1_size + b] = 0; 348 | } 349 | if (negative>0) { 350 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)); 351 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 352 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 353 | syn1neg[a * layer1_size + b] = 0; 354 | } 355 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) { 356 | next_random = next_random * (unsigned long long)25214903917 + 11; 357 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 358 | } 359 | CreateBinaryTree(); 360 | } 361 | 362 | void *TrainModelThread(void *id) { 363 | long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0; 364 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 365 | long long l1, l2, c, target, label, local_iter = iter; 366 | unsigned long long next_random = (long long)id; 367 | real f, g; 368 | clock_t now; 369 | real *neu1 = (real *)calloc(layer1_size, sizeof(real)); 370 | real *neu1e = (real *)calloc(layer1_size, sizeof(real)); 371 | FILE *fi = fopen(train_file, "rb"); 372 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 373 | while (1) { 374 | if (word_count - last_word_count > 10000) { 375 | word_count_actual += word_count - last_word_count; 376 | last_word_count = word_count; 377 | if ((debug_mode > 1)) { 378 | now=clock(); 379 | printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 380 | word_count_actual / (real)(iter * train_words + 1) * 100, 381 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 382 | fflush(stdout); 383 | } 384 | alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 385 | if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 386 | } 387 | if (sentence_length == 0) { 388 | while (1) { 389 | word = ReadWordIndex(fi); 390 | if (feof(fi)) break; 391 | if (word == -1) continue; 392 | word_count++; 393 | if (word == 0) break; 394 | // The subsampling randomly discards frequent words while keeping the ranking same 395 | if (sample > 0) { 396 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 397 | next_random = next_random * (unsigned long long)25214903917 + 11; 398 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 399 | } 400 | sen[sentence_length] = word; 401 | sentence_length++; 402 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 403 | } 404 | sentence_position = 0; 405 | } 406 | if (feof(fi) || (word_count > train_words / num_threads)) { 407 | word_count_actual += word_count - last_word_count; 408 | local_iter--; 409 | if (local_iter == 0) break; 410 | word_count = 0; 411 | last_word_count = 0; 412 | sentence_length = 0; 413 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 414 | continue; 415 | } 416 | word = sen[sentence_position]; 417 | if (word == -1) continue; 418 | for (c = 0; c < layer1_size; c++) neu1[c] = 0; 419 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 420 | next_random = next_random * (unsigned long long)25214903917 + 11; 421 | b = next_random % window; 422 | if (cbow) { //train the cbow architecture 423 | // in -> hidden 424 | cw = 0; 425 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 426 | c = 
sentence_position - window + a; 427 | if (c < 0) continue; 428 | if (c >= sentence_length) continue; 429 | last_word = sen[c]; 430 | if (last_word == -1) continue; 431 | for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size]; 432 | cw++; 433 | } 434 | if (cw) { 435 | for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 436 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 437 | f = 0; 438 | l2 = vocab[word].point[d] * layer1_size; 439 | // Propagate hidden -> output 440 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 441 | if (f <= -MAX_EXP) continue; 442 | else if (f >= MAX_EXP) continue; 443 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 444 | // 'g' is the gradient multiplied by the learning rate 445 | g = (1 - vocab[word].code[d] - f) * alpha; 446 | // Propagate errors output -> hidden 447 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 448 | // Learn weights hidden -> output 449 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 450 | } 451 | // NEGATIVE SAMPLING 452 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 453 | if (d == 0) { 454 | target = word; 455 | label = 1; 456 | } else { 457 | next_random = next_random * (unsigned long long)25214903917 + 11; 458 | target = table[(next_random >> 16) % table_size]; 459 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 460 | if (target == word) continue; 461 | label = 0; 462 | } 463 | l2 = target * layer1_size; 464 | f = 0; 465 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; 466 | if (f > MAX_EXP) g = (label - 1) * alpha; 467 | else if (f < -MAX_EXP) g = (label - 0) * alpha; 468 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; 469 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 470 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 471 | } 472 | // hidden -> in 473 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 474 | c = sentence_position - window + a; 475 | if (c < 0) continue; 476 | if (c >= sentence_length) continue; 477 | last_word = sen[c]; 478 | if (last_word == -1) continue; 479 | for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 480 | } 481 | } 482 | } else { //train skip-gram 483 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 484 | c = sentence_position - window + a; 485 | if (c < 0) continue; 486 | if (c >= sentence_length) continue; 487 | last_word = sen[c]; 488 | if (last_word == -1) continue; 489 | l1 = last_word * layer1_size; 490 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 491 | // HIERARCHICAL SOFTMAX 492 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 493 | f = 0; 494 | l2 = vocab[word].point[d] * layer1_size; 495 | // Propagate hidden -> output 496 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; 497 | if (f <= -MAX_EXP) continue; 498 | else if (f >= MAX_EXP) continue; 499 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 500 | // 'g' is the gradient multiplied by the learning rate 501 | g = (1 - vocab[word].code[d] - f) * alpha; 502 | // Propagate errors output -> hidden 503 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 504 | // Learn weights hidden -> output 505 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; 506 | } 507 | // NEGATIVE SAMPLING 508 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 509 | if (d == 0) { 510 | target = 
word; 511 | label = 1; 512 | } else { 513 | next_random = next_random * (unsigned long long)25214903917 + 11; 514 | target = table[(next_random >> 16) % table_size]; 515 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 516 | if (target == word) continue; 517 | label = 0; 518 | } 519 | l2 = target * layer1_size; 520 | f = 0; 521 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 522 | if (f > MAX_EXP) g = (label - 1) * alpha; 523 | else if (f < -MAX_EXP) g = (label - 0) * alpha; 524 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; 525 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 526 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1]; 527 | } 528 | // Learn weights input -> hidden 529 | for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c]; 530 | } 531 | } 532 | sentence_position++; 533 | if (sentence_position >= sentence_length) { 534 | sentence_length = 0; 535 | continue; 536 | } 537 | } 538 | fclose(fi); 539 | free(neu1); 540 | free(neu1e); 541 | pthread_exit(NULL); 542 | } 543 | 544 | void TrainModel() { 545 | long a, b, c, d; 546 | FILE *fo; 547 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 548 | printf("Starting training using file %s\n", train_file); 549 | starting_alpha = alpha; 550 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 551 | if (save_vocab_file[0] != 0) SaveVocab(); 552 | if (output_file[0] == 0) return; 553 | InitNet(); 554 | if (negative > 0) InitUnigramTable(); 555 | start = clock(); 556 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 557 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 558 | fo = fopen(output_file, "wb"); 559 | if (classes == 0) { 560 | // Save the word vectors 561 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 562 | for (a = 0; a < vocab_size; a++) { 563 | fprintf(fo, "%s ", vocab[a].word); 564 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 565 | else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 566 | fprintf(fo, "\n"); 567 | } 568 | } else { 569 | // Run K-means on the word vectors 570 | int clcn = classes, iter = 10, closeid; 571 | int *centcn = (int *)malloc(classes * sizeof(int)); 572 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 573 | real closev, x; 574 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 575 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 576 | for (a = 0; a < iter; a++) { 577 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 578 | for (b = 0; b < clcn; b++) centcn[b] = 1; 579 | for (c = 0; c < vocab_size; c++) { 580 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 581 | centcn[cl[c]]++; 582 | } 583 | for (b = 0; b < clcn; b++) { 584 | closev = 0; 585 | for (c = 0; c < layer1_size; c++) { 586 | cent[layer1_size * b + c] /= centcn[b]; 587 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 588 | } 589 | closev = sqrt(closev); 590 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 591 | } 592 | for (c = 0; c < vocab_size; c++) { 593 | closev = -10; 594 | closeid = 0; 595 | for (d = 0; d < clcn; d++) { 596 | x = 0; 597 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 598 | if (x > closev) { 599 | closev = x; 600 | closeid = d; 
601 | } 602 | } 603 | cl[c] = closeid; 604 | } 605 | } 606 | // Save the K-means classes 607 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 608 | free(centcn); 609 | free(cent); 610 | free(cl); 611 | } 612 | fclose(fo); 613 | } 614 | 615 | int ArgPos(char *str, int argc, char **argv) { 616 | int a; 617 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 618 | if (a == argc - 1) { 619 | printf("Argument missing for %s\n", str); 620 | exit(1); 621 | } 622 | return a; 623 | } 624 | return -1; 625 | } 626 | 627 | int main(int argc, char **argv) { 628 | int i; 629 | if (argc == 1) { 630 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 631 | printf("Options:\n"); 632 | printf("Parameters for training:\n"); 633 | printf("\t-train <file>\n"); 634 | printf("\t\tUse text data from <file> to train the model\n"); 635 | printf("\t-output <file>\n"); 636 | printf("\t\tUse <file> to save the resulting word vectors / word clusters\n"); 637 | printf("\t-size <int>\n"); 638 | printf("\t\tSet size of word vectors; default is 100\n"); 639 | printf("\t-window <int>\n"); 640 | printf("\t\tSet max skip length between words; default is 5\n"); 641 | printf("\t-sample <float>\n"); 642 | printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n"); 643 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n"); 644 | printf("\t-hs <int>\n"); 645 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 646 | printf("\t-negative <int>\n"); 647 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n"); 648 | printf("\t-threads <int>\n"); 649 | printf("\t\tUse <int> threads (default 12)\n"); 650 | printf("\t-iter <int>\n"); 651 | printf("\t\tRun more training iterations (default 5)\n"); 652 | printf("\t-min-count <int>\n"); 653 | printf("\t\tThis will discard words that appear less than <int> times; default is 5\n"); 654 | printf("\t-alpha <float>\n"); 655 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n"); 656 | printf("\t-classes <int>\n"); 657 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 658 | printf("\t-debug <int>\n"); 659 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 660 | printf("\t-binary <int>\n"); 661 | printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n"); 662 | printf("\t-save-vocab <file>\n"); 663 | printf("\t\tThe vocabulary will be saved to <file>\n"); 664 | printf("\t-read-vocab <file>\n"); 665 | printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n"); 666 | printf("\t-cbow <int>\n"); 667 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n"); 668 | printf("\nExamples:\n"); 669 | printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 670 | return 0; 671 | } 672 | output_file[0] = 0; 673 | save_vocab_file[0] = 0; 674 | read_vocab_file[0] = 0; 675 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 676 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 677 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 678 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 679 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 680 | 
if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 681 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 682 | if (cbow) alpha = 0.05; 683 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 684 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 685 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 686 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 687 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 688 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 689 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 690 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 691 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 692 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 693 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 694 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 695 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 696 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 697 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 698 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 699 | } 700 | TrainModel(); 701 | return 0; 702 | } 703 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/ea/ea5f636000c445177e5f2f14af11f716b1e91bd0.svn-base: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | echo --------------------------------------------------------------------------------------------------- 7 | echo Note that for the word analogy to perform well, the model should be trained on much larger data set 8 | echo Example input: paris france berlin 9 | echo --------------------------------------------------------------------------------------------------- 10 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 11 | ./word-analogy vectors.bin 12 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/pristine/f4/f4f8420f4ff647df0f4196ceee895888fb7f63f7.svn-base: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500 7 | sort classes.txt -k 2 -n > classes.sorted.txt 8 | echo The word classes were saved to file classes.sorted.txt 9 | -------------------------------------------------------------------------------- /w2v/trunk/.svn/wc.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qieangel2013/phpword2vec/e12958b7bd6f5382ec1e14da3759f904dd5fe3e3/w2v/trunk/.svn/wc.db -------------------------------------------------------------------------------- /w2v/trunk/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /w2v/trunk/README.txt: -------------------------------------------------------------------------------- 1 | Tools for computing distributed representation of words 2 | ------------------------------------------------------ 3 | 4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram model (SG), as well as several demo scripts. 
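For example, a minimal end-to-end session looks like this (the text8 corpus and file names are illustrative; these are the same steps that the demo-word.sh script described below automates):

./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
./distance vectors.bin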
5 | 6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous 7 | Bag-of-Words or the Skip-Gram neural network architectures. The user should specify the following: 8 | - desired vector dimensionality 9 | - the size of the context window for either the Skip-Gram or the Continuous Bag-of-Words model 10 | - training algorithm: hierarchical softmax and / or negative sampling 11 | - threshold for downsampling the frequent words 12 | - number of threads to use 13 | - the format of the output word vector file (text or binary) 14 | 15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets. 16 | 17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training 18 | is finished, the user can interactively explore the similarity of the words. 19 | 20 | More information about the scripts is provided at https://code.google.com/p/word2vec/ 21 | 22 | -------------------------------------------------------------------------------- /w2v/trunk/compute-accuracy.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include <stdio.h> 16 | #include <stdlib.h> 17 | #include <string.h> 18 | #include <math.h> 19 | #include <malloc.h> 20 | #include <ctype.h> 21 | 22 | const long long max_size = 2000; // max length of strings 23 | const long long N = 1; // number of closest words 24 | const long long max_w = 50; // max length of vocabulary entries 25 | 26 | int main(int argc, char **argv) 27 | { 28 | FILE *f; 29 | char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch; 30 | float dist, len, bestd[N], vec[max_size]; 31 | long long words, size, a, b, c, d, b1, b2, b3, threshold = 0; 32 | float *M; 33 | char *vocab; 34 | int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0; 35 | if (argc < 2) { 36 | printf("Usage: ./compute-accuracy <FILE> <threshold>\nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n"); 37 | return 0; 38 | } 39 | strcpy(file_name, argv[1]); 40 | if (argc > 2) threshold = atoi(argv[2]); 41 | f = fopen(file_name, "rb"); 42 | if (f == NULL) { 43 | printf("Input file not found\n"); 44 | return -1; 45 | } 46 | fscanf(f, "%lld", &words); 47 | if (threshold) if (words > threshold) words = threshold; 48 | fscanf(f, "%lld", &size); 49 | vocab = (char *)malloc(words * max_w * sizeof(char)); 50 | M = (float *)malloc(words * size * sizeof(float)); 51 | if (M == NULL) { 52 | printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576); 53 | return -1; 54 | } 55 | for (b = 0; b < words; b++) { 56 | a = 0; 57 | while (1) { 58 | vocab[b * max_w + a] = fgetc(f); 59 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 60 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 61 | } 62 | vocab[b * max_w + a] = 0; 63 | for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]); 64 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 65 | len = 0; 66 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 67 | len = sqrt(len); 68 | for (a = 0; a < size; a++) M[a + b * size] /= len; 69 | } 70 | fclose(f); 71 | TCN = 0; 72 | while (1) { 73 | for (a = 0; a < N; a++) bestd[a] = 0; 74 | for (a = 0; a < N; a++) bestw[a][0] = 0; 75 | scanf("%s", st1); 76 | for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]); 77 | if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) { 78 | if (TCN == 0) TCN = 1; 79 | if (QID != 0) { 80 | printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN); 81 | printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100); 82 | } 83 | QID++; 84 | scanf("%s", st1); 85 | if (feof(stdin)) break; 86 | printf("%s:\n", st1); 87 | TCN = 0; 88 | CCN = 0; 89 | continue; 90 | } 91 | if (!strcmp(st1, "EXIT")) break; 92 | scanf("%s", st2); 93 | for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]); 94 | scanf("%s", st3); 95 | for (a = 0; a < strlen(st3); a++) st3[a] = toupper(st3[a]); 96 | scanf("%s", st4); 97 | for (a = 0; a < strlen(st4); a++) st4[a] = toupper(st4[a]); 98 | TQ++; 99 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st1)) break; 100 | b1 = b; 101 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st2)) break; 102 | b2 = b; 103 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st3)) break; 104 | b3 = b; 105 | for (a = 0; a < N; a++) bestd[a] = 0; 106 | for (a = 0; a < N; a++) bestw[a][0] = 0; 107 | if (b1 == words) continue; 108 | if (b2 == words) continue; 109 | if (b3 == words) continue; 110 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st4)) break; 111 | if (b == words) continue; 112 | for (a = 0; a < size; a++) vec[a] = (M[a + b2 * size] - M[a + b1 * size]) + M[a + b3 * size]; 113 | TQS++; 114 | for (c = 0; c < words; c++) { 115 | if (c == b1) continue; 116 | if (c == b2) continue; 117 | if (c == b3) continue; 118 | dist = 0; 119 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 120 | for (a = 0; a < N; a++) { 121 | if (dist > bestd[a]) { 122 | for (d = N - 1; d > a; d--) { 123 | bestd[d] = bestd[d - 1]; 124 | strcpy(bestw[d], bestw[d - 1]); 125 | } 126 | bestd[a] = dist; 127 | strcpy(bestw[a], &vocab[c * max_w]); 128 | break; 129 | } 130 | } 131 | } 132 | if (!strcmp(st4, bestw[0])) { 133 | CCN++; 134 | CACN++; 135 | if (QID <= 5) SEAC++; else SYAC++; 136 | } 137 | if (QID <= 5) SECN++; else SYCN++; 138 | TCN++; 139 | TACN++; 140 | } 141 | printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, 
TQS/(float)TQ*100); 142 | return 0; 143 | } 144 | -------------------------------------------------------------------------------- /w2v/trunk/demo-analogy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | echo --------------------------------------------------------------------------------------------------- 7 | echo Note that for the word analogy to perform well, the model should be trained on much larger data set 8 | echo Example input: paris france berlin 9 | echo --------------------------------------------------------------------------------------------------- 10 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 11 | ./word-analogy vectors.bin 12 | -------------------------------------------------------------------------------- /w2v/trunk/demo-classes.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500 7 | sort classes.txt -k 2 -n > classes.sorted.txt 8 | echo The word classes were saved to file classes.sorted.txt 9 | -------------------------------------------------------------------------------- /w2v/trunk/demo-phrase-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./compute-accuracy vectors-phrase.bin < questions-phrases.txt 12 | -------------------------------------------------------------------------------- /w2v/trunk/demo-phrases.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./distance vectors-phrase.bin 12 | -------------------------------------------------------------------------------- /w2v/trunk/demo-train-big-model-v1.sh: -------------------------------------------------------------------------------- 1 | ############################################################################################### 2 | # 3 | # Script for training good word and phrase vector model using public corpora, version 1.0. 4 | # The training time will be from several hours to about a day. 5 | # 6 | # Downloads about 8 billion words, makes phrases using two runs of word2phrase, trains 7 | # a 500-dimensional vector model and evaluates it on word and phrase analogy tasks. 8 | # 9 | ############################################################################################### 10 | 11 | # This function will convert text to lowercase and remove special characters 12 | normalize_text() { 13 | awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \ 14 | -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>
/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \ 15 | -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \ 16 | -e 's/«/ /g' | tr 0-9 " " 17 | } 18 | 19 | mkdir word2vec 20 | cd word2vec 21 | 22 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 23 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz 24 | gzip -d news.2012.en.shuffled.gz 25 | gzip -d news.2013.en.shuffled.gz 26 | normalize_text < news.2012.en.shuffled > data.txt 27 | normalize_text < news.2013.en.shuffled >> data.txt 28 | 29 | wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz 30 | tar -xvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz 31 | for i in `ls 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled`; do 32 | normalize_text < 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/$i >> data.txt 33 | done 34 | 35 | wget http://ebiquity.umbc.edu/redirect/to/resource/id/351/UMBC-webbase-corpus 36 | tar -zxvf umbc_webbase_corpus.tar.gz webbase_all/*.txt 37 | for i in `ls webbase_all`; do 38 | normalize_text < webbase_all/$i >> data.txt 39 | done 40 | 41 | wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 42 | bzip2 -c -d enwiki-latest-pages-articles.xml.bz2 | awk '{print tolower($0);}' | perl -e ' 43 | # Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase 44 | # letters (a-z, converted from A-Z), and spaces (never consecutive). 45 | # All other characters are converted to spaces. Only text which normally appears 46 | # in the web browser is displayed. Tables are removed. Image captions are 47 | # preserved. Links are converted to normal text. Digits are spelled out. 48 | # *** Modified to not spell digits or throw away non-ASCII characters *** 49 | 50 | # Written by Matt Mahoney, June 10, 2006. This program is released to the public domain. 51 | 52 | $/=">"; # input record separator 53 | while (<>) { 54 | if (/<text /) {$text=1;} # remove all but between <text> ... </text> 55 | if (/#redirect/i) {$text=0;} # remove #REDIRECT 56 | if ($text) { 57 | 58 | # Remove any text not normally visible 59 | if (/<\/text>/) {$text=0;} 60 | s/<.*>//; # remove xml tags 61 | s/&amp;/&/g; # decode URL encoded chars 62 | s/&lt;/</g; 63 | s/&gt;/>/g; 64 | s/<ref[^<]*<\/ref>//g; # remove references <ref...> ... </ref> 
65 | s/<[^>]*>//g; # remove xhtml tags 66 | s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text 67 | s/\|thumb//ig; # remove images links, preserve caption 68 | s/\|left//ig; 69 | s/\|right//ig; 70 | s/\|\d+px//ig; 71 | s/\[\[image:[^\[\]]*\|//ig; 72 | s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup 73 | s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages 74 | s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text 75 | s/{{[^}]*}}//g; # remove {{icons}} and {tables} 76 | s/{[^}]*}//g; 77 | s/\[//g; # remove [ and ] 78 | s/\]//g; 79 | s/&[^;]*;/ /g; # remove URL encoded chars 80 | 81 | $_=" $_ "; 82 | chop; 83 | print $_; 84 | } 85 | } 86 | ' | normalize_text | awk '{if (NF>1) print;}' >> data.txt 87 | 88 | wget http://word2vec.googlecode.com/svn/trunk/word2vec.c 89 | wget http://word2vec.googlecode.com/svn/trunk/word2phrase.c 90 | wget http://word2vec.googlecode.com/svn/trunk/compute-accuracy.c 91 | wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt 92 | wget http://word2vec.googlecode.com/svn/trunk/questions-phrases.txt 93 | gcc word2vec.c -o word2vec -lm -pthread -O3 -march=native -funroll-loops 94 | gcc word2phrase.c -o word2phrase -lm -pthread -O3 -march=native -funroll-loops 95 | gcc compute-accuracy.c -o compute-accuracy -lm -pthread -O3 -march=native -funroll-loops 96 | ./word2phrase -train data.txt -output data-phrase.txt -threshold 200 -debug 2 97 | ./word2phrase -train data-phrase.txt -output data-phrase2.txt -threshold 100 -debug 2 98 | ./word2vec -train data-phrase2.txt -output vectors.bin -cbow 1 -size 500 -window 10 -negative 10 -hs 0 -sample 1e-5 -threads 40 -binary 1 -iter 3 -min-count 10 99 | ./compute-accuracy vectors.bin 400000 < questions-words.txt # should get to almost 78% accuracy on 99.7% of questions 100 | ./compute-accuracy vectors.bin 1000000 < questions-phrases.txt # about 78% accuracy with 77% coverage 101 | -------------------------------------------------------------------------------- /w2v/trunk/demo-word-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./compute-accuracy vectors.bin 30000 < questions-words.txt 8 | # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt 9 | -------------------------------------------------------------------------------- /w2v/trunk/demo-word.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./distance vectors.bin 8 | -------------------------------------------------------------------------------- /w2v/trunk/distance.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include <stdio.h> 16 | #include <string.h> 17 | #include <math.h> 18 | #include <malloc.h> 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char *bestw[N]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | char ch; 32 | float *M; 33 | char *vocab; 34 | if (argc < 2) { 35 | printf("Usage: ./distance <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n"); 36 | return 0; 37 | } 38 | strcpy(file_name, argv[1]); 39 | f = fopen(file_name, "rb"); 40 | if (f == NULL) { 41 | printf("Input file not found\n"); 42 | return -1; 43 | } 44 | fscanf(f, "%lld", &words); 45 | fscanf(f, "%lld", &size); 46 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 47 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 48 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 49 | if (M == NULL) { 50 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 51 | return -1; 52 | } 53 | for (b = 0; b < words; b++) { 54 | a = 0; 55 | while (1) { 56 | vocab[b * max_w + a] = fgetc(f); 57 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 58 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 59 | } 60 | vocab[b * max_w + a] = 0; 61 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 62 | len = 0; 63 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 64 | len = sqrt(len); 65 | for (a = 0; a < size; a++) M[a + b * size] /= len; 66 | } 67 | fclose(f); 68 | while (1) { 69 | for (a = 0; a < N; a++) bestd[a] = 0; 70 | for (a = 0; a < N; a++) bestw[a][0] = 0; 71 | printf("Enter word or sentence (EXIT to break): "); 72 | a = 0; 73 | while (1) { 74 | st1[a] = fgetc(stdin); 75 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 76 | st1[a] = 0; 77 | break; 78 | } 79 | a++; 80 | } 81 | if (!strcmp(st1, "EXIT")) break; 82 | cn = 0; 83 | b = 0; 84 | c = 0; 85 | while (1) { 86 | st[cn][b] = st1[c]; 87 | b++; 88 | c++; 89 | st[cn][b] = 0; 90 | if (st1[c] == 0) break; 91 | if (st1[c] == ' ') { 92 | cn++; 93 | b = 0; 94 | c++; 95 | } 96 | } 97 | cn++; 98 | for (a = 0; a < cn; a++) { 99 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 100 | if (b == words) b = -1; 101 | bi[a] = b; 102 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 103 | if (b == -1) { 104 | printf("Out of dictionary word!\n"); 105 | break; 106 | } 107 | } 108 | if (b == -1) continue; 109 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); 110 | for (a = 0; a < size; a++) vec[a] = 0; 111 | for (b = 0; b < cn; b++) { 112 | if (bi[b] == -1) continue; 113 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 114 | } 
115 | len = 0;
116 | for (a = 0; a < size; a++) len += vec[a] * vec[a];
117 | len = sqrt(len);
118 | for (a = 0; a < size; a++) vec[a] /= len;
119 | for (a = 0; a < N; a++) bestd[a] = -1;
120 | for (a = 0; a < N; a++) bestw[a][0] = 0;
121 | for (c = 0; c < words; c++) {
122 | a = 0;
123 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
124 | if (a == 1) continue;
125 | dist = 0;
126 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
127 | for (a = 0; a < N; a++) {
128 | if (dist > bestd[a]) {
129 | for (d = N - 1; d > a; d--) {
130 | bestd[d] = bestd[d - 1];
131 | strcpy(bestw[d], bestw[d - 1]);
132 | }
133 | bestd[a] = dist;
134 | strcpy(bestw[a], &vocab[c * max_w]);
135 | break;
136 | }
137 | }
138 | }
139 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
140 | }
141 | return 0;
142 | }
143 | 
--------------------------------------------------------------------------------
/w2v/trunk/makefile:
--------------------------------------------------------------------------------
1 | CC = gcc
2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions
3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result
4 | 
5 | all: word2vec word2phrase distance word-analogy compute-accuracy
6 | 
7 | word2vec : word2vec.c
8 | 	$(CC) word2vec.c -o word2vec $(CFLAGS)
9 | word2phrase : word2phrase.c
10 | 	$(CC) word2phrase.c -o word2phrase $(CFLAGS)
11 | distance : distance.c
12 | 	$(CC) distance.c -o distance $(CFLAGS)
13 | word-analogy : word-analogy.c
14 | 	$(CC) word-analogy.c -o word-analogy $(CFLAGS)
15 | compute-accuracy : compute-accuracy.c
16 | 	$(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS)
17 | 	chmod +x *.sh
18 | 
19 | clean:
20 | 	rm -rf word2vec word2phrase distance word-analogy compute-accuracy
--------------------------------------------------------------------------------
/w2v/trunk/vectors.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qieangel2013/phpword2vec/e12958b7bd6f5382ec1e14da3759f904dd5fe3e3/w2v/trunk/vectors.bin
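A note on the vectors.bin layout shared by these tools: word2vec.c (run with -binary 1) writes an ASCII header "vocab_size vector_size\n", then for each vocabulary entry the word, one space, vector_size raw floats, and a newline; distance.c and word-analogy.c read exactly this. A minimal stand-alone reader, not part of the repository, sketching the same format:

#include <stdio.h>

/* Read the vectors.bin header and the first entry, to show the layout
   word2vec.c writes: "words size\n", then per word: token, ' ', floats. */
int main(void) {
  FILE *f = fopen("vectors.bin", "rb");
  long long words, size, i;
  char word[51];
  float v;
  if (f == NULL) return 1;
  fscanf(f, "%lld %lld", &words, &size);  /* ASCII header */
  fscanf(f, "%50s", word);                /* first vocabulary token */
  fgetc(f);                               /* consume the separating ' ' */
  printf("%lld words, %lld dims, first word: %s\n", words, size, word);
  for (i = 0; i < size; i++)              /* raw little-endian floats */
    if (fread(&v, sizeof(float), 1, f) != 1) break;
  fclose(f);
  return 0;
}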
--------------------------------------------------------------------------------
/w2v/trunk/word-analogy.c:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <stdio.h>
16 | #include <string.h>
17 | #include <math.h>
18 | #include <malloc.h>
19 | 
20 | const long long max_size = 2000; // max length of strings
21 | const long long N = 40; // number of closest words that will be shown
22 | const long long max_w = 50; // max length of vocabulary entries
23 | 
24 | int main(int argc, char **argv) {
25 | FILE *f;
26 | char st1[max_size];
27 | char bestw[N][max_size];
28 | char file_name[max_size], st[100][max_size];
29 | float dist, len, bestd[N], vec[max_size];
30 | long long words, size, a, b, c, d, cn, bi[100];
31 | char ch;
32 | float *M;
33 | char *vocab;
34 | if (argc < 2) {
35 | printf("Usage: ./word-analogy <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
36 | return 0;
37 | }
38 | strcpy(file_name, argv[1]);
39 | f = fopen(file_name, "rb");
40 | if (f == NULL) {
41 | printf("Input file not found\n");
42 | return -1;
43 | }
44 | fscanf(f, "%lld", &words);
45 | fscanf(f, "%lld", &size);
46 | vocab = (char *)malloc((long long)words * max_w * sizeof(char));
47 | M = (float *)malloc((long long)words * (long long)size * sizeof(float));
48 | if (M == NULL) {
49 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
50 | return -1;
51 | }
52 | for (b = 0; b < words; b++) {
53 | a = 0;
54 | while (1) {
55 | vocab[b * max_w + a] = fgetc(f);
56 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
57 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
58 | }
59 | vocab[b * max_w + a] = 0;
60 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
61 | len = 0;
62 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
63 | len = sqrt(len);
64 | for (a = 0; a < size; a++) M[a + b * size] /= len;
65 | }
66 | fclose(f);
67 | while (1) {
68 | for (a = 0; a < N; a++) bestd[a] = 0;
69 | for (a = 0; a < N; a++) bestw[a][0] = 0;
70 | printf("Enter three words (EXIT to break): ");
71 | a = 0;
72 | while (1) {
73 | st1[a] = fgetc(stdin);
74 | if ((st1[a] == '\n') || (a >= max_size - 1)) {
75 | st1[a] = 0;
76 | break;
77 | }
78 | a++;
79 | }
80 | if (!strcmp(st1, "EXIT")) break;
81 | cn = 0;
82 | b = 0;
83 | c = 0;
84 | while (1) {
85 | st[cn][b] = st1[c];
86 | b++;
87 | c++;
88 | st[cn][b] = 0;
89 | if (st1[c] == 0) break;
90 | if (st1[c] == ' ') {
91 | cn++;
92 | b = 0;
93 | c++;
94 | }
95 | }
96 | cn++;
97 | if (cn < 3) {
98 | printf("Only %lld words were entered.. three words are needed at the input to perform the calculation\n", cn);
99 | continue;
100 | }
101 | for (a = 0; a < cn; a++) {
102 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
103 | if (b == words) b = 0;
104 | bi[a] = b;
105 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]);
106 | if (b == 0) {
107 | printf("Out of dictionary word!\n");
108 | break;
109 | }
110 | }
111 | if (b == 0) continue;
112 | printf("\n Word Distance\n------------------------------------------------------------------------\n");
113 | for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size];
114 | len = 0;
115 | for (a = 0; a < size; a++) len += vec[a] * vec[a];
116 | len = sqrt(len);
117 | for (a = 0; a < size; a++) vec[a] /= len;
118 | for (a = 0; a < N; a++) bestd[a] = 0;
119 | for (a = 0; a < N; a++) bestw[a][0] = 0;
120 | for (c = 0; c < words; c++) {
121 | if (c == bi[0]) continue;
122 | if (c == bi[1]) continue;
123 | if (c == bi[2]) continue;
124 | a = 0;
125 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
126 | if (a == 1) continue;
127 | dist = 0;
128 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
129 | for (a = 0; a < N; a++) {
130 | if (dist > bestd[a]) {
131 | for (d = N - 1; d > a; d--) {
132 | bestd[d] = bestd[d - 1];
133 | strcpy(bestw[d], bestw[d - 1]);
134 | }
135 | bestd[a] = dist;
136 | strcpy(bestw[a], &vocab[c * max_w]);
137 | break;
138 | }
139 | }
140 | }
141 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
142 | }
143 | return 0;
144 | }
145 | 
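word-analogy.c answers "a is to b as c is to ?" by finding the vocabulary word whose unit vector has the largest dot product with vec(b) - vec(a) + vec(c); since every row of M is normalized at load time, and dividing the query by its (positive) norm does not change the ranking, the search is a plain dot-product scan. A condensed sketch of that core search, with a hypothetical helper that assumes M's rows are already unit length:

/* Return the best analogy answer for vocabulary indices a, b, c:
   argmax over w of dot(M[w], M[b] - M[a] + M[c]), excluding a, b, c. */
long long analogy(const float *M, long long words, long long size,
                  long long a, long long b, long long c) {
  long long w, i, besti = -1;
  float best = -1e30f;
  for (w = 0; w < words; w++) {
    float dot = 0;
    if (w == a || w == b || w == c) continue; /* skip the query words */
    for (i = 0; i < size; i++)
      dot += (M[b * size + i] - M[a * size + i] + M[c * size + i]) * M[w * size + i];
    if (dot > best) { best = dot; besti = w; }
  }
  return besti;
}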
--------------------------------------------------------------------------------
/w2v/trunk/word2phrase.c:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <stdio.h>
16 | #include <string.h>
17 | #include <math.h>
18 | #include <stdlib.h>
19 | #include <pthread.h>
20 | 
21 | #define MAX_STRING 60
22 | 
23 | const int vocab_hash_size = 500000000; // Maximum 500M entries in the vocabulary
24 | 
25 | typedef float real; // Precision of float numbers
26 | 
27 | struct vocab_word {
28 | long long cn;
29 | char *word;
30 | };
31 | 
32 | char train_file[MAX_STRING], output_file[MAX_STRING];
33 | struct vocab_word *vocab;
34 | int debug_mode = 2, min_count = 5, *vocab_hash, min_reduce = 1;
35 | long long vocab_max_size = 10000, vocab_size = 0;
36 | long long train_words = 0;
37 | real threshold = 100;
38 | 
39 | unsigned long long next_random = 1;
40 | 
41 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
42 | void ReadWord(char *word, FILE *fin) {
43 | int a = 0, ch;
44 | while (!feof(fin)) {
45 | ch = fgetc(fin);
46 | if (ch == 13) continue;
47 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
48 | if (a > 0) {
49 | if (ch == '\n') ungetc(ch, fin);
50 | break;
51 | }
52 | if (ch == '\n') {
53 | strcpy(word, (char *)"</s>");
54 | return;
55 | } else continue;
56 | }
57 | word[a] = ch;
58 | a++;
59 | if (a >= MAX_STRING - 1) a--; // Truncate too long words
60 | }
61 | word[a] = 0;
62 | }
63 | 
64 | // Returns hash value of a word
65 | int GetWordHash(char *word) {
66 | unsigned long long a, hash = 1;
67 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
68 | hash = hash % vocab_hash_size;
69 | return hash;
70 | }
71 | 
72 | // Returns position of a word in the vocabulary; if the word is not found, returns -1
73 | int SearchVocab(char *word) {
74 | unsigned int hash = GetWordHash(word);
75 | while (1) {
76 | if (vocab_hash[hash] == -1) return -1;
77 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
78 | hash = (hash + 1) % vocab_hash_size;
79 | }
80 | return -1;
81 | }
82 | 
83 | // Reads a word and returns its index in the vocabulary
84 | int ReadWordIndex(FILE *fin) {
85 | char word[MAX_STRING];
86 | ReadWord(word, fin);
87 | if (feof(fin)) return -1;
88 | return SearchVocab(word);
89 | }
90 | 
91 | // Adds a word to the vocabulary
92 | int AddWordToVocab(char *word) {
93 | unsigned int hash, length = strlen(word) + 1;
94 | if (length > MAX_STRING) length = MAX_STRING;
95 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
96 | strcpy(vocab[vocab_size].word, word);
97 | vocab[vocab_size].cn = 0;
98 | vocab_size++;
99 | // Reallocate memory if needed
100 | if (vocab_size + 2 >= vocab_max_size) {
101 | vocab_max_size += 10000;
102 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
103 | }
104 | hash = GetWordHash(word);
105 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
106 | vocab_hash[hash] = vocab_size - 1;
107 | return vocab_size - 1;
108 | }
109 | 
110 | // Used later for sorting by word counts
111 | int VocabCompare(const void *a, const void *b) {
112 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
113 | }
114 | 
115 | // Sorts the vocabulary by frequency using word counts
116 | void SortVocab() {
117 | int a;
118 | unsigned int hash;
119 | // Sort the vocabulary and keep </s> at the first position
120 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
121 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
122 | for (a = 0; a < vocab_size; a++) {
123 | // Words occurring less than min_count times will be discarded from the vocab
124 | if (vocab[a].cn < min_count) {
125 | vocab_size--;
126 | free(vocab[vocab_size].word);
127 | } else {
128 | // Hash will be recomputed, as it is no longer valid after sorting
129 | hash = GetWordHash(vocab[a].word);
130 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
131 | vocab_hash[hash] = a;
132 | }
133 | }
134 | vocab = (struct vocab_word *)realloc(vocab, vocab_size * sizeof(struct vocab_word));
135 | }
136 | 
137 | // Reduces the vocabulary by removing infrequent tokens
138 | void ReduceVocab() {
139 | int a, b = 0;
140 | unsigned int hash;
141 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
142 | vocab[b].cn = vocab[a].cn;
143 | vocab[b].word = vocab[a].word;
144 | b++;
145 | } else free(vocab[a].word);
146 | vocab_size = b;
147 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
148 | for (a = 0; a < vocab_size; a++) {
149 | // Hash will be recomputed, as it is no longer valid
150 | hash = GetWordHash(vocab[a].word);
151 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
152 | vocab_hash[hash] = a;
153 | }
154 | fflush(stdout);
155 | min_reduce++;
156 | }
157 | 
158 | void LearnVocabFromTrainFile() {
159 | char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
160 | FILE *fin;
161 | long long a, i, start = 1;
162 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
163 | fin = fopen(train_file, "rb");
164 | if (fin == NULL) {
165 | printf("ERROR: training data file not found!\n");
166 | exit(1);
167 | }
168 | vocab_size = 0;
169 | AddWordToVocab((char *)"</s>");
170 | while (1) {
171 | ReadWord(word, fin);
172 | if (feof(fin)) break;
173 | if (!strcmp(word, "</s>")) {
174 | start = 1;
175 | continue;
176 | } else start = 0;
177 | train_words++;
178 | if ((debug_mode > 1) && (train_words % 100000 == 0)) {
179 | printf("Words processed: %lldK Vocab size: %lldK %c", train_words / 1000, vocab_size / 1000, 13);
180 | fflush(stdout);
181 | }
182 | i = SearchVocab(word);
183 | if (i == -1) {
184 | a = AddWordToVocab(word);
185 | vocab[a].cn = 1;
186 | } else vocab[i].cn++;
187 | if (start) continue;
188 | sprintf(bigram_word, "%s_%s", last_word, word);
189 | bigram_word[MAX_STRING - 1] = 0;
190 | strcpy(last_word, word);
191 | i = SearchVocab(bigram_word);
192 | if (i == -1) {
193 | a = AddWordToVocab(bigram_word);
194 | vocab[a].cn = 1;
195 | } else vocab[i].cn++;
196 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
197 | }
198 | SortVocab();
199 | if (debug_mode > 0) {
200 | printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size);
201 | printf("Words in train file: %lld\n", train_words);
202 | }
203 | fclose(fin);
204 | }
205 | 
206 | void TrainModel() {
207 | long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0;
208 | char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
209 | real score;
210 | FILE *fo, *fin;
211 | printf("Starting training using file %s\n", train_file);
212 | LearnVocabFromTrainFile();
213 | fin = fopen(train_file, "rb");
214 | fo = fopen(output_file, "wb");
215 | word[0] = 0;
216 | while (1) {
217 | strcpy(last_word, word);
218 | ReadWord(word, fin);
219 | if (feof(fin)) break;
220 | if (!strcmp(word, "</s>")) {
221 | fprintf(fo, "\n");
222 | continue;
223 | }
224 | cn++;
225 | if ((debug_mode > 1) && (cn % 100000 == 0)) {
226 | printf("Words written: %lldK%c", cn / 1000, 13);
227 | fflush(stdout);
228 | }
229 | oov = 0;
230 | i = SearchVocab(word);
231 | if (i == -1) oov = 1; else pb = vocab[i].cn;
232 | if (li == -1) oov = 1;
233 | li = i;
234 | sprintf(bigram_word, "%s_%s", last_word, word);
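// The lines that follow score the bigram (last_word, word) against chance:
//   score = (count(bigram) - min_count) * train_words / (count(last_word) * count(word))
// Pairs whose score exceeds -threshold are glued into one token with '_';
// pb is then zeroed (and copied into pa) so the same word cannot
// immediately start another phrase.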
235 | bigram_word[MAX_STRING - 1] = 0;
236 | i = SearchVocab(bigram_word);
237 | if (i == -1) oov = 1; else pab = vocab[i].cn;
238 | if (pa < min_count) oov = 1;
239 | if (pb < min_count) oov = 1;
240 | if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words;
241 | if (score > threshold) {
242 | fprintf(fo, "_%s", word);
243 | pb = 0;
244 | } else fprintf(fo, " %s", word);
245 | pa = pb;
246 | }
247 | fclose(fo);
248 | fclose(fin);
249 | }
250 | 
251 | int ArgPos(char *str, int argc, char **argv) {
252 | int a;
253 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
254 | if (a == argc - 1) {
255 | printf("Argument missing for %s\n", str);
256 | exit(1);
257 | }
258 | return a;
259 | }
260 | return -1;
261 | }
262 | 
263 | int main(int argc, char **argv) {
264 | int i;
265 | if (argc == 1) {
266 | printf("WORD2PHRASE tool v0.1a\n\n");
267 | printf("Options:\n");
268 | printf("Parameters for training:\n");
269 | printf("\t-train <file>\n");
270 | printf("\t\tUse text data from <file> to train the model\n");
271 | printf("\t-output <file>\n");
272 | printf("\t\tUse <file> to save the resulting word vectors / word clusters / phrases\n");
273 | printf("\t-min-count <int>\n");
274 | printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
275 | printf("\t-threshold <float>\n");
276 | printf("\t\tThe <float> value represents the threshold for forming the phrases (higher means fewer phrases); default 100\n");
277 | printf("\t-debug <int>\n");
278 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
279 | printf("\nExamples:\n");
280 | printf("./word2phrase -train text.txt -output phrases.txt -threshold 100 -debug 2\n\n");
281 | return 0;
282 | }
283 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
284 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
285 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
286 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
287 | if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) threshold = atof(argv[i + 1]);
288 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
289 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
290 | TrainModel();
291 | return 0;
292 | }
293 | 
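To make the default threshold of 100 concrete, here is a small worked example of that score; the counts below are made up purely for illustration:

#include <stdio.h>

/* Worked example of the word2phrase score with the defaults
   (min_count = 5), using hypothetical corpus counts. */
int main(void) {
  double pab = 1000;          /* bigram seen 1000 times        */
  double pa = 100000;         /* first word seen 100000 times  */
  double pb = 10000;          /* second word seen 10000 times  */
  double train_words = 1e8;   /* corpus size                   */
  double score = (pab - 5) / pa / pb * train_words;
  printf("score = %.1f\n", score); /* 99.5: just below threshold 100 */
  return 0;
}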
--------------------------------------------------------------------------------
/w2v/trunk/word2vec.c:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <stdio.h>
16 | #include <string.h>
17 | #include <math.h>
18 | #include <stdlib.h>
19 | #include <pthread.h>
20 | 
21 | #define MAX_STRING 100
22 | #define EXP_TABLE_SIZE 1000
23 | #define MAX_EXP 6
24 | #define MAX_SENTENCE_LENGTH 1000
25 | #define MAX_CODE_LENGTH 40
26 | 
27 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
28 | 
29 | typedef float real; // Precision of float numbers
30 | 
31 | struct vocab_word {
32 | long long cn;
33 | int *point;
34 | char *word, *code, codelen;
35 | };
36 | 
37 | char train_file[MAX_STRING], output_file[MAX_STRING];
38 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
39 | struct vocab_word *vocab;
40 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
41 | int *vocab_hash;
42 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
43 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
44 | real alpha = 0.025, starting_alpha, sample = 1e-3;
45 | real *syn0, *syn1, *syn1neg, *expTable;
46 | clock_t start;
47 | 
48 | int hs = 0, negative = 5;
49 | const int table_size = 1e8;
50 | int *table;
51 | 
52 | void InitUnigramTable() {
53 | int a, i;
54 | long long train_words_pow = 0;
55 | real d1, power = 0.75;
56 | table = (int *)malloc(table_size * sizeof(int));
57 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
58 | i = 0;
59 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
60 | for (a = 0; a < table_size; a++) {
61 | table[a] = i;
62 | if (a / (real)table_size > d1) {
63 | i++;
64 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
65 | }
66 | if (i >= vocab_size) i = vocab_size - 1;
67 | }
68 | }
69 | 
70 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
71 | void ReadWord(char *word, FILE *fin) {
72 | int a = 0, ch;
73 | while (!feof(fin)) {
74 | ch = fgetc(fin);
75 | if (ch == 13) continue;
76 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
77 | if (a > 0) {
78 | if (ch == '\n') ungetc(ch, fin);
79 | break;
80 | }
81 | if (ch == '\n') {
82 | strcpy(word, (char *)"</s>");
83 | return;
84 | } else continue;
85 | }
86 | word[a] = ch;
87 | a++;
88 | if (a >= MAX_STRING - 1) a--; // Truncate too long words
89 | }
90 | word[a] = 0;
91 | }
92 | 
93 | // Returns hash value of a word
94 | int GetWordHash(char *word) {
95 | unsigned long long a, hash = 0;
96 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
97 | hash = hash % vocab_hash_size;
98 | return hash;
99 | }
100 | 
101 | // Returns position of a word in the vocabulary; if the word is not found, returns -1
102 | int SearchVocab(char *word) {
103 | unsigned int hash = GetWordHash(word);
104 | while (1) {
105 | if (vocab_hash[hash] == -1) return -1;
106 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
107 | hash = (hash + 1) % vocab_hash_size;
108 | }
109 | return -1;
110 | }
111 | 
112 | // Reads a word and returns its index in the vocabulary
113 | int ReadWordIndex(FILE *fin) {
114 | char word[MAX_STRING];
115 | ReadWord(word, fin);
116 | if (feof(fin)) return -1;
117 | return SearchVocab(word);
118 | }
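// Note: vocab_hash is an open-addressing table. GetWordHash picks a start
// slot and lookups probe linearly ((hash + 1) % vocab_hash_size) until they
// hit either an empty slot (-1) or the matching word; LearnVocabFromTrainFile
// keeps the load factor under 0.7 via ReduceVocab so probes stay short.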
119 | 
120 | // Adds a word to the vocabulary
121 | int AddWordToVocab(char *word) {
122 | unsigned int hash, length = strlen(word) + 1;
123 | if (length > MAX_STRING) length = MAX_STRING;
124 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
125 | strcpy(vocab[vocab_size].word, word);
126 | vocab[vocab_size].cn = 0;
127 | vocab_size++;
128 | // Reallocate memory if needed
129 | if (vocab_size + 2 >= vocab_max_size) {
130 | vocab_max_size += 1000;
131 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
132 | }
133 | hash = GetWordHash(word);
134 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
135 | vocab_hash[hash] = vocab_size - 1;
136 | return vocab_size - 1;
137 | }
138 | 
139 | // Used later for sorting by word counts
140 | int VocabCompare(const void *a, const void *b) {
141 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
142 | }
143 | 
144 | // Sorts the vocabulary by frequency using word counts
145 | void SortVocab() {
146 | int a, size;
147 | unsigned int hash;
148 | // Sort the vocabulary and keep </s> at the first position
149 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
150 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
151 | size = vocab_size;
152 | train_words = 0;
153 | for (a = 0; a < size; a++) {
154 | // Words occurring less than min_count times will be discarded from the vocab
155 | if ((vocab[a].cn < min_count) && (a != 0)) {
156 | vocab_size--;
157 | free(vocab[a].word);
158 | } else {
159 | // Hash will be recomputed, as it is no longer valid after sorting
160 | hash = GetWordHash(vocab[a].word);
161 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
162 | vocab_hash[hash] = a;
163 | train_words += vocab[a].cn;
164 | }
165 | }
166 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
167 | // Allocate memory for the binary tree construction
168 | for (a = 0; a < vocab_size; a++) {
169 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
170 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
171 | }
172 | }
173 | 
174 | // Reduces the vocabulary by removing infrequent tokens
175 | void ReduceVocab() {
176 | int a, b = 0;
177 | unsigned int hash;
178 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
179 | vocab[b].cn = vocab[a].cn;
180 | vocab[b].word = vocab[a].word;
181 | b++;
182 | } else free(vocab[a].word);
183 | vocab_size = b;
184 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
185 | for (a = 0; a < vocab_size; a++) {
186 | // Hash will be recomputed, as it is no longer valid
187 | hash = GetWordHash(vocab[a].word);
188 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
189 | vocab_hash[hash] = a;
190 | }
191 | fflush(stdout);
192 | min_reduce++;
193 | }
194 | 
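// Because SortVocab leaves the counts in descending order, the Huffman
// construction below runs in O(vocab_size) with two cursors instead of a
// priority queue: pos1 scans the leaf counts from the smallest upward,
// while pos2 scans the internal nodes, whose combined counts are produced
// in non-decreasing order.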
195 | // Create binary Huffman tree using the word counts
196 | // Frequent words will have short unique binary codes
197 | void CreateBinaryTree() {
198 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
199 | char code[MAX_CODE_LENGTH];
200 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
201 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
202 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
203 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
204 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
205 | pos1 = vocab_size - 1;
206 | pos2 = vocab_size;
207 | // Following algorithm constructs the Huffman tree by adding one node at a time
208 | for (a = 0; a < vocab_size - 1; a++) {
209 | // First, find two smallest nodes 'min1, min2'
210 | if (pos1 >= 0) {
211 | if (count[pos1] < count[pos2]) {
212 | min1i = pos1;
213 | pos1--;
214 | } else {
215 | min1i = pos2;
216 | pos2++;
217 | }
218 | } else {
219 | min1i = pos2;
220 | pos2++;
221 | }
222 | if (pos1 >= 0) {
223 | if (count[pos1] < count[pos2]) {
224 | min2i = pos1;
225 | pos1--;
226 | } else {
227 | min2i = pos2;
228 | pos2++;
229 | }
230 | } else {
231 | min2i = pos2;
232 | pos2++;
233 | }
234 | count[vocab_size + a] = count[min1i] + count[min2i];
235 | parent_node[min1i] = vocab_size + a;
236 | parent_node[min2i] = vocab_size + a;
237 | binary[min2i] = 1;
238 | }
239 | // Now assign binary code to each vocabulary word
240 | for (a = 0; a < vocab_size; a++) {
241 | b = a;
242 | i = 0;
243 | while (1) {
244 | code[i] = binary[b];
245 | point[i] = b;
246 | i++;
247 | b = parent_node[b];
248 | if (b == vocab_size * 2 - 2) break;
249 | }
250 | vocab[a].codelen = i;
251 | vocab[a].point[0] = vocab_size - 2;
252 | for (b = 0; b < i; b++) {
253 | vocab[a].code[i - b - 1] = code[b];
254 | vocab[a].point[i - b] = point[b] - vocab_size;
255 | }
256 | }
257 | free(count);
258 | free(binary);
259 | free(parent_node);
260 | }
261 | 
262 | void LearnVocabFromTrainFile() {
263 | char word[MAX_STRING];
264 | FILE *fin;
265 | long long a, i;
266 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
267 | fin = fopen(train_file, "rb");
268 | if (fin == NULL) {
269 | printf("ERROR: training data file not found!\n");
270 | exit(1);
271 | }
272 | vocab_size = 0;
273 | AddWordToVocab((char *)"</s>");
274 | while (1) {
275 | ReadWord(word, fin);
276 | if (feof(fin)) break;
277 | train_words++;
278 | if ((debug_mode > 1) && (train_words % 100000 == 0)) {
279 | printf("%lldK%c", train_words / 1000, 13);
280 | fflush(stdout);
281 | }
282 | i = SearchVocab(word);
283 | if (i == -1) {
284 | a = AddWordToVocab(word);
285 | vocab[a].cn = 1;
286 | } else vocab[i].cn++;
287 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
288 | }
289 | SortVocab();
290 | if (debug_mode > 0) {
291 | printf("Vocab size: %lld\n", vocab_size);
292 | printf("Words in train file: %lld\n", train_words);
293 | }
294 | file_size = ftell(fin);
295 | fclose(fin);
296 | }
297 | 
298 | void SaveVocab() {
299 | long long i;
300 | FILE *fo = fopen(save_vocab_file, "wb");
301 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
302 | fclose(fo);
303 | }
304 | 
305 | void ReadVocab() {
306 | long long a, i = 0;
307 | char c;
308 | char word[MAX_STRING];
309 | FILE *fin = fopen(read_vocab_file, "rb");
310 | if (fin == NULL) {
311 | printf("Vocabulary file not found\n");
312 | exit(1);
313 | }
314 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
315 | vocab_size = 0;
316 | while (1) {
317 | ReadWord(word, fin);
318 | if (feof(fin)) break;
319 | a = AddWordToVocab(word);
320 | fscanf(fin, "%lld%c", &vocab[a].cn, &c);
321 | i++;
322 | }
323 | SortVocab();
324 | if (debug_mode > 0) {
325 | printf("Vocab size: %lld\n", vocab_size);
326 | printf("Words in train file: %lld\n", train_words);
327 | }
328 | fin = fopen(train_file, "rb");
329 | if (fin == NULL) {
330 | printf("ERROR: training data file not found!\n");
331 | exit(1);
332 | }
333 | fseek(fin, 0, SEEK_END);
334 | file_size = ftell(fin);
335 | fclose(fin);
336 | }
337 | 
338 | void InitNet() {
339 | long long a, b;
340 | unsigned long long next_random = 1;
341 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
342 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
343 | if (hs) {
344 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
345 | if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
{printf("Memory allocation failed\n"); exit(1);} 346 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 347 | syn1[a * layer1_size + b] = 0; 348 | } 349 | if (negative>0) { 350 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)); 351 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 352 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 353 | syn1neg[a * layer1_size + b] = 0; 354 | } 355 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) { 356 | next_random = next_random * (unsigned long long)25214903917 + 11; 357 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 358 | } 359 | CreateBinaryTree(); 360 | } 361 | 362 | void *TrainModelThread(void *id) { 363 | long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0; 364 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 365 | long long l1, l2, c, target, label, local_iter = iter; 366 | unsigned long long next_random = (long long)id; 367 | real f, g; 368 | clock_t now; 369 | real *neu1 = (real *)calloc(layer1_size, sizeof(real)); 370 | real *neu1e = (real *)calloc(layer1_size, sizeof(real)); 371 | FILE *fi = fopen(train_file, "rb"); 372 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 373 | while (1) { 374 | if (word_count - last_word_count > 10000) { 375 | word_count_actual += word_count - last_word_count; 376 | last_word_count = word_count; 377 | if ((debug_mode > 1)) { 378 | now=clock(); 379 | printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 380 | word_count_actual / (real)(iter * train_words + 1) * 100, 381 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 382 | fflush(stdout); 383 | } 384 | alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 385 | if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 386 | } 387 | if (sentence_length == 0) { 388 | while (1) { 389 | word = ReadWordIndex(fi); 390 | if (feof(fi)) break; 391 | if (word == -1) continue; 392 | word_count++; 393 | if (word == 0) break; 394 | // The subsampling randomly discards frequent words while keeping the ranking same 395 | if (sample > 0) { 396 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 397 | next_random = next_random * (unsigned long long)25214903917 + 11; 398 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 399 | } 400 | sen[sentence_length] = word; 401 | sentence_length++; 402 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 403 | } 404 | sentence_position = 0; 405 | } 406 | if (feof(fi) || (word_count > train_words / num_threads)) { 407 | word_count_actual += word_count - last_word_count; 408 | local_iter--; 409 | if (local_iter == 0) break; 410 | word_count = 0; 411 | last_word_count = 0; 412 | sentence_length = 0; 413 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 414 | continue; 415 | } 416 | word = sen[sentence_position]; 417 | if (word == -1) continue; 418 | for (c = 0; c < layer1_size; c++) neu1[c] = 0; 419 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 420 | next_random = next_random * (unsigned long long)25214903917 + 11; 421 | b = next_random % window; 422 | if (cbow) { //train the cbow architecture 423 | // in -> hidden 424 | cw = 0; 425 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 426 | c = 
422 | if (cbow) { //train the cbow architecture
423 | // in -> hidden
424 | cw = 0;
425 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
426 | c = sentence_position - window + a;
427 | if (c < 0) continue;
428 | if (c >= sentence_length) continue;
429 | last_word = sen[c];
430 | if (last_word == -1) continue;
431 | for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
432 | cw++;
433 | }
434 | if (cw) {
435 | for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
436 | if (hs) for (d = 0; d < vocab[word].codelen; d++) {
437 | f = 0;
438 | l2 = vocab[word].point[d] * layer1_size;
439 | // Propagate hidden -> output
440 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
441 | if (f <= -MAX_EXP) continue;
442 | else if (f >= MAX_EXP) continue;
443 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
444 | // 'g' is the gradient multiplied by the learning rate
445 | g = (1 - vocab[word].code[d] - f) * alpha;
446 | // Propagate errors output -> hidden
447 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
448 | // Learn weights hidden -> output
449 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
450 | }
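// Negative sampling below trains one logistic regression per target: the
// true center word gets label 1 and `negative` draws from the unigram
// table get label 0; g = (label - sigmoid(f)) * alpha, with the sigmoid
// read from the precomputed expTable and saturated beyond +/-MAX_EXP.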
451 | // NEGATIVE SAMPLING
452 | if (negative > 0) for (d = 0; d < negative + 1; d++) {
453 | if (d == 0) {
454 | target = word;
455 | label = 1;
456 | } else {
457 | next_random = next_random * (unsigned long long)25214903917 + 11;
458 | target = table[(next_random >> 16) % table_size];
459 | if (target == 0) target = next_random % (vocab_size - 1) + 1;
460 | if (target == word) continue;
461 | label = 0;
462 | }
463 | l2 = target * layer1_size;
464 | f = 0;
465 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
466 | if (f > MAX_EXP) g = (label - 1) * alpha;
467 | else if (f < -MAX_EXP) g = (label - 0) * alpha;
468 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
469 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
470 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
471 | }
472 | // hidden -> in
473 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
474 | c = sentence_position - window + a;
475 | if (c < 0) continue;
476 | if (c >= sentence_length) continue;
477 | last_word = sen[c];
478 | if (last_word == -1) continue;
479 | for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
480 | }
481 | }
482 | } else { //train skip-gram
483 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
484 | c = sentence_position - window + a;
485 | if (c < 0) continue;
486 | if (c >= sentence_length) continue;
487 | last_word = sen[c];
488 | if (last_word == -1) continue;
489 | l1 = last_word * layer1_size;
490 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
491 | // HIERARCHICAL SOFTMAX
492 | if (hs) for (d = 0; d < vocab[word].codelen; d++) {
493 | f = 0;
494 | l2 = vocab[word].point[d] * layer1_size;
495 | // Propagate hidden -> output
496 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
497 | if (f <= -MAX_EXP) continue;
498 | else if (f >= MAX_EXP) continue;
499 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
500 | // 'g' is the gradient multiplied by the learning rate
501 | g = (1 - vocab[word].code[d] - f) * alpha;
502 | // Propagate errors output -> hidden
503 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
504 | // Learn weights hidden -> output
505 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
506 | }
507 | // NEGATIVE SAMPLING
508 | if (negative > 0) for (d = 0; d < negative + 1; d++) {
509 | if (d == 0) {
510 | target = word;
511 | label = 1;
512 | } else {
513 | next_random = next_random * (unsigned long long)25214903917 + 11;
514 | target = table[(next_random >> 16) % table_size];
515 | if (target == 0) target = next_random % (vocab_size - 1) + 1;
516 | if (target == word) continue;
517 | label = 0;
518 | }
519 | l2 = target * layer1_size;
520 | f = 0;
521 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
522 | if (f > MAX_EXP) g = (label - 1) * alpha;
523 | else if (f < -MAX_EXP) g = (label - 0) * alpha;
524 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
525 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
526 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
527 | }
528 | // Learn weights input -> hidden
529 | for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
530 | }
531 | }
532 | sentence_position++;
533 | if (sentence_position >= sentence_length) {
534 | sentence_length = 0;
535 | continue;
536 | }
537 | }
538 | fclose(fi);
539 | free(neu1);
540 | free(neu1e);
541 | pthread_exit(NULL);
542 | }
543 | 
544 | void TrainModel() {
545 | long a, b, c, d;
546 | FILE *fo;
547 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
548 | printf("Starting training using file %s\n", train_file);
549 | starting_alpha = alpha;
550 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
551 | if (save_vocab_file[0] != 0) SaveVocab();
552 | if (output_file[0] == 0) return;
553 | InitNet();
554 | if (negative > 0) InitUnigramTable();
555 | start = clock();
556 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
557 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
558 | fo = fopen(output_file, "wb");
559 | if (classes == 0) {
560 | // Save the word vectors
561 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
562 | for (a = 0; a < vocab_size; a++) {
563 | fprintf(fo, "%s ", vocab[a].word);
564 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
565 | else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
566 | fprintf(fo, "\n");
567 | }
568 | } else {
569 | // Run K-means on the word vectors
570 | int clcn = classes, iter = 10, closeid;
571 | int *centcn = (int *)malloc(classes * sizeof(int));
572 | int *cl = (int *)calloc(vocab_size, sizeof(int));
573 | real closev, x;
574 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
575 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
576 | for (a = 0; a < iter; a++) {
577 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
578 | for (b = 0; b < clcn; b++) centcn[b] = 1;
579 | for (c = 0; c < vocab_size; c++) {
580 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
581 | centcn[cl[c]]++;
582 | }
583 | for (b = 0; b < clcn; b++) {
584 | closev = 0;
585 | for (c = 0; c < layer1_size; c++) {
586 | cent[layer1_size * b + c] /= centcn[b];
587 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
588 | }
589 | closev = sqrt(closev);
590 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
591 | }
592 | for (c = 0; c < vocab_size; c++) {
593 | closev = -10;
594 | closeid = 0;
595 | for (d = 0; d < clcn; d++) {
596 | x = 0;
597 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
598 | if (x > closev) {
599 | closev = x;
600 | closeid = d;
601 | }
602 | }
603 | cl[c] = closeid;
604 | }
605 | }
606 | // Save the K-means classes
607 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
608 | free(centcn);
609 | free(cent);
610 | free(cl);
611 | }
612 | fclose(fo);
613 | }
614 | 
615 | int ArgPos(char *str, int argc, char **argv) {
616 | int a;
617 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
618 | if (a == argc - 1) {
619 | printf("Argument missing for %s\n", str);
620 | exit(1);
621 | }
622 | return a;
623 | }
624 | return -1;
625 | }
626 | 
627 | int main(int argc, char **argv) {
628 | int i;
629 | if (argc == 1) {
630 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
631 | printf("Options:\n");
632 | printf("Parameters for training:\n");
633 | printf("\t-train <file>\n");
634 | printf("\t\tUse text data from <file> to train the model\n");
635 | printf("\t-output <file>\n");
636 | printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
637 | printf("\t-size <int>\n");
638 | printf("\t\tSet size of word vectors; default is 100\n");
639 | printf("\t-window <int>\n");
640 | printf("\t\tSet max skip length between words; default is 5\n");
641 | printf("\t-sample <float>\n");
642 | printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
643 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
644 | printf("\t-hs <int>\n");
645 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
646 | printf("\t-negative <int>\n");
647 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
648 | printf("\t-threads <int>\n");
649 | printf("\t\tUse <int> threads (default 12)\n");
650 | printf("\t-iter <int>\n");
651 | printf("\t\tRun more training iterations (default 5)\n");
652 | printf("\t-min-count <int>\n");
653 | printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
654 | printf("\t-alpha <float>\n");
655 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
656 | printf("\t-classes <int>\n");
657 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
658 | printf("\t-debug <int>\n");
659 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
660 | printf("\t-binary <int>\n");
661 | printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
662 | printf("\t-save-vocab <file>\n");
663 | printf("\t\tThe vocabulary will be saved to <file>\n");
664 | printf("\t-read-vocab <file>\n");
665 | printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
666 | printf("\t-cbow <int>\n");
667 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n");
668 | printf("\nExamples:\n");
669 | printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n");
670 | return 0;
671 | }
672 | output_file[0] = 0;
673 | save_vocab_file[0] = 0;
674 | read_vocab_file[0] = 0;
675 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
676 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
677 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
678 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
679 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
680 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 681 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 682 | if (cbow) alpha = 0.05; 683 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 684 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 685 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 686 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 687 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 688 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 689 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 690 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 691 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 692 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 693 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 694 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 695 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 696 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 697 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 698 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 699 | } 700 | TrainModel(); 701 | return 0; 702 | } 703 | --------------------------------------------------------------------------------