├── LICENSE ├── README.md ├── compute-accuracy.c ├── distance.c ├── distance_fast.c ├── distance_txt.c ├── kmeans_txt.c ├── makefile ├── word-analogy.c ├── word2phrase.c └── word2vec.c /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # incremental-word2vec 2 | Modify word2vec such that it's possible to "condition" on existing embeddings for some words, and induce embeddings for new words. 3 | 4 | 5 | # usage: 6 | ``` 7 | ./word2vec -train testdemo.txt -output testdemo.oldmodel -size 200 -threads 12 8 | ./word2vec -train new_data.txt -output testdemo.newmodel -size 200 -threads 12 -fixed-embeddings testdemo.oldmodel 9 | ``` 10 | -------------------------------------------------------------------------------- /compute-accuracy.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
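//
// compute-accuracy: evaluates word analogy questions read from stdin (typically the
// questions-words.txt set distributed with word2vec). For each question "a b c d" it
// forms the query vector
//
//     vec = M[b] - M[a] + M[c]
//
// and counts the question as correct when d is the single nearest vocabulary word to
// vec by dot product -- cosine similarity, since every row of M is scaled to unit
// length at load time. Question groups with QID <= 5 are tallied as semantic and the
// rest as syntactic, which matches the section order of the standard question file.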
14 | 15 | #include <stdio.h> 16 | #include <stdlib.h> 17 | #include <string.h> 18 | #include <math.h> 19 | #include <malloc.h> 20 | #include <ctype.h> 21 | 22 | const long long max_size = 2000; // max length of strings 23 | const long long N = 1; // number of closest words 24 | const long long max_w = 50; // max length of vocabulary entries 25 | 26 | int main(int argc, char **argv) 27 | { 28 | FILE *f; 29 | char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size]; 30 | float dist, len, bestd[N], vec[max_size]; 31 | long long words, size, a, b, c, d, b1, b2, b3, threshold = 0; 32 | float *M; 33 | char *vocab; 34 | int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0; 35 | if (argc < 2) { 36 | printf("Usage: ./compute-accuracy <FILE> <threshold>\nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n"); 37 | return 0; 38 | } 39 | strcpy(file_name, argv[1]); 40 | if (argc > 2) threshold = atoi(argv[2]); 41 | f = fopen(file_name, "rb"); 42 | if (f == NULL) { 43 | printf("Input file not found\n"); 44 | return -1; 45 | } 46 | fscanf(f, "%lld", &words); 47 | if (threshold) if (words > threshold) words = threshold; 48 | fscanf(f, "%lld", &size); 49 | vocab = (char *)malloc(words * max_w * sizeof(char)); 50 | M = (float *)malloc(words * size * sizeof(float)); 51 | if (M == NULL) { 52 | printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576); 53 | return -1; 54 | } 55 | for (b = 0; b < words; b++) { 56 | a = 0; 57 | while (1) { 58 | vocab[b * max_w + a] = fgetc(f); 59 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 60 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 61 | } 62 | vocab[b * max_w + a] = 0; 63 | for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]); 64 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 65 | len = 0; 66 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 67 | len = sqrt(len); 68 | for (a = 0; a < size; a++) M[a + b * size] /= len; 69 | } 70 | fclose(f); 71 | TCN = 0; 72 | while (1) { 73 | for (a = 0; a < N; a++) bestd[a] = 0; 74 | for (a = 0; a < N; a++) bestw[a][0] = 0; 75 | scanf("%s", st1); 76 | for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]); 77 | if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) { 78 | if (TCN == 0) TCN = 1; 79 | if (QID != 0) { 80 | printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN); 81 | printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100); 82 | } 83 | QID++; 84 | scanf("%s", st1); 85 | if (feof(stdin)) break; 86 | printf("%s:\n", st1); 87 | TCN = 0; 88 | CCN = 0; 89 | continue; 90 | } 91 | if (!strcmp(st1, "EXIT")) break; 92 | scanf("%s", st2); 93 | for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]); 94 | scanf("%s", st3); 95 | for (a = 0; a < strlen(st3); a++) st3[a] = toupper(st3[a]); 96 | scanf("%s", st4); 97 | for (a = 0; a < strlen(st4); a++) st4[a] = toupper(st4[a]); 98 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st1)) break; 99 | b1 = b; 100 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st2)) break; 101 | b2 = b; 102 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st3)) break; 103 | b3 = b; 104 | for (a = 0; a < N; a++) bestd[a] = 0; 105 | for (a = 0; a < N; a++) bestw[a][0] = 0; 106 | TQ++; 107 | if (b1 == words) continue; 108 | if (b2 == words) continue; 109 | if (b3 == words) continue; 110 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st4)) break; 111 | if (b == words) continue; 112 | for (a = 0; a < size; a++) vec[a] = (M[a + b2 * size] - M[a + b1 * size]) + M[a + b3 * size]; 113 | TQS++; 114 | for (c = 0; c < words; c++) { 115 | if (c == b1) continue; 116 | if (c == b2) continue; 117 | if (c == b3) continue; 118 | dist = 0; 119 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 120 | for (a = 0; a < N; a++) { 121 | if (dist > bestd[a]) { 122 | for (d = N - 1; d > a; d--) { 123 | bestd[d] = bestd[d - 1]; 124 | strcpy(bestw[d], bestw[d - 1]); 125 | } 126 | bestd[a] = dist; 127 | strcpy(bestw[a], &vocab[c * max_w]); 128 | break; 129 | } 130 | } 131 | } 132 | if (!strcmp(st4, bestw[0])) { 133 | CCN++; 134 | CACN++; 135 | if (QID <= 5) SEAC++; else SYAC++; 136 | } 137 | if (QID <= 5) SECN++; else SYCN++; 138 | TCN++; 139 | TACN++; 140 | } 141 | printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ,
TQS/(float)TQ*100); 142 | return 0; 143 | } 144 | -------------------------------------------------------------------------------- /distance.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include <stdio.h> 16 | #include <string.h> 17 | #include <math.h> 18 | #include <malloc.h> 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | #define MAX_STRING 100 25 | void ReadWord(char *word, FILE *fin) { 26 | int a = 0, ch; 27 | while (!feof(fin)) { 28 | ch = fgetc(fin); 29 | if (ch == 13) continue; 30 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 31 | if (a > 0) { 32 | if (ch == '\n') ungetc(ch, fin); 33 | break; 34 | } 35 | if (ch == '\n') { 36 | strcpy(word, (char *)"</s>"); 37 | return; 38 | } else continue; 39 | } 40 | word[a] = ch; 41 | a++; 42 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 43 | } 44 | word[a] = 0; 45 | printf("%s\n", word); 46 | } 47 | 48 | int main(int argc, char **argv) { 49 | FILE *f; 50 | char st1[max_size]; 51 | char *bestw[N]; 52 | char file_name[max_size], st[100][max_size]; 53 | float dist, len, bestd[N], vec[max_size]; 54 | long long words, size, a, b, c, d, cn, bi[100]; 55 | float *M; 56 | char *vocab; 57 | if (argc < 2) { 58 | printf("Usage: ./distance <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n"); 59 | return 0; 60 | } 61 | strcpy(file_name, argv[1]); 62 | f = fopen(file_name, "rb"); 63 | if (f == NULL) { 64 | printf("Input file not found\n"); 65 | return -1; 66 | } 67 | fscanf(f, "%lld", &words); 68 | fscanf(f, "%lld", &size); 69 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 70 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 71 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 72 | if (M == NULL) { 73 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 74 | return -1; 75 | } 76 | for (b = 0; b < words; b++) { 77 | a = 0; 78 | while (1) { 79 | vocab[b * max_w + a] = fgetc(f); 80 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 81 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 82 | } 83 | vocab[b * max_w + a] = 0; 84 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 85 | len = 0; 86 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 87 | len = sqrt(len); 88 | for (a = 0; a < size; a++) M[a + b * size] /= len; 89 | } 90 | fclose(f); 91 | while (1) { 92 | for (a = 0; a < N; a++) bestd[a] = 0; 93 | for (a = 0; a < N; a++) bestw[a][0] = 0; 94 | printf("Enter word or sentence (EXIT to break): "); 95 | a = 0; 96 | while (1) { 97 | st1[a] = fgetc(stdin); 98 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 99 |
st1[a] = 0; 100 | break; 101 | } 102 | a++; 103 | } 104 | if (!strcmp(st1, "EXIT")) break; 105 | cn = 0; 106 | b = 0; 107 | c = 0; 108 | while (1) { 109 | st[cn][b] = st1[c]; 110 | b++; 111 | c++; 112 | st[cn][b] = 0; 113 | if (st1[c] == 0) break; 114 | if (st1[c] == ' ') { 115 | cn++; 116 | b = 0; 117 | c++; 118 | } 119 | } 120 | cn++; 121 | for (a = 0; a < cn; a++) { 122 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 123 | if (b == words) b = -1; 124 | bi[a] = b; 125 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 126 | if (b == -1) { 127 | printf("Out of dictionary word!\n"); 128 | break; 129 | } 130 | } 131 | if (b == -1) continue; 132 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); 133 | for (a = 0; a < size; a++) vec[a] = 0; 134 | for (b = 0; b < cn; b++) { 135 | if (bi[b] == -1) continue; 136 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 137 | } 138 | len = 0; 139 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 140 | len = sqrt(len); 141 | for (a = 0; a < size; a++) vec[a] /= len; 142 | for (a = 0; a < N; a++) bestd[a] = -1; 143 | for (a = 0; a < N; a++) bestw[a][0] = 0; 144 | for (c = 0; c < words; c++) { 145 | a = 0; 146 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 147 | if (a == 1) continue; 148 | dist = 0; 149 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 150 | for (a = 0; a < N; a++) { 151 | if (dist > bestd[a]) { 152 | for (d = N - 1; d > a; d--) { 153 | bestd[d] = bestd[d - 1]; 154 | strcpy(bestw[d], bestw[d - 1]); 155 | } 156 | bestd[a] = dist; 157 | strcpy(bestw[a], &vocab[c * max_w]); 158 | break; 159 | } 160 | } 161 | } 162 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 163 | } 164 | return 0; 165 | } 166 | -------------------------------------------------------------------------------- /distance_fast.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
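//
// distance_fast: an approximate version of the interactive distance tool. The vocabulary
// is first clustered with k-means (10 iterations, assignments by dot product against
// unit-length centroids, i.e. cosine similarity), and at query time only the words in
// the N closest clusters are scored instead of the full vocabulary. A true neighbour
// that lands in a pruned cluster is missed, which is the usual trade-off of this kind
// of cluster-pruning search.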
14 | 15 | #include <stdio.h> 16 | #include <string.h> 17 | #include <math.h> 18 | #include <malloc.h> 19 | #include <time.h> 20 | 21 | const long long max_size = 2000; // max length of strings 22 | const long long N = 10; // number of closest words that will be shown 23 | const long long max_w = 50; // max length of vocabulary entries 24 | 25 | #define MAX_STRING 100 26 | void ReadWord(char *word, FILE *fin) { 27 | int a = 0, ch; 28 | while (!feof(fin)) { 29 | ch = fgetc(fin); 30 | if (ch == 13) continue; 31 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 32 | if (a > 0) { 33 | if (ch == '\n') ungetc(ch, fin); 34 | break; 35 | } 36 | if (ch == '\n') { 37 | strcpy(word, (char *)"</s>"); 38 | return; 39 | } else continue; 40 | } 41 | word[a] = ch; 42 | a++; 43 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 44 | } 45 | word[a] = 0; 46 | } 47 | 48 | int main(int argc, char **argv) { 49 | FILE *f; 50 | char st1[max_size]; 51 | char *bestw[N]; 52 | char file_name[max_size], st[100][max_size]; 53 | float dist, len, bestd[N], bestclasses[N], vec[max_size]; 54 | int bestclasses_ids[N]; 55 | long long words, size, a, b, c, d, e, cn, bi[100]; 56 | float *M; 57 | char *vocab; 58 | char word[MAX_STRING]; 59 | clock_t begin; 60 | if (argc < 3) { 61 | printf("Usage: ./distance_fast <FILE> <CLASSES>\nwhere FILE contains features\n "); 62 | return 0; 63 | } 64 | strcpy(file_name, argv[1]); 65 | int classes = atoi(argv[2]); 66 | f = fopen(file_name, "rb"); 67 | if (f == NULL) { 68 | printf("Input file not found\n"); 69 | return -1; 70 | } 71 | 72 | printf("reading data\n"); 73 | ReadWord(word, f); 74 | words = atoi(word); 75 | ReadWord(word, f); 76 | size = atoi(word); 77 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 78 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 79 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 80 | if (M == NULL) { 81 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 82 | return -1; 83 | } 84 | for (b = 0; b < words; b++) { 85 | a = 0; 86 | while (1) { 87 | vocab[b * max_w + a] = fgetc(f); 88 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 89 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 90 | } 91 | vocab[b * max_w + a] = 0; 92 | for (a = 0; a < size; a++) { 93 | ReadWord(word,f); 94 | M[a + b * size] = atof(word); 95 | } 96 | len = 0; 97 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 98 | len = sqrt(len); 99 | for (a = 0; a < size; a++) M[a + b * size] /= len; 100 | } 101 | fclose(f); 102 | 103 | //run kmeans 104 | printf("running k-means with %i classes...\n",classes); 105 | int clcn = classes, iter = 10, closeid; 106 | int *centcn = (int *)malloc(classes * sizeof(int)); 107 | int *cl = (int *)calloc(words, sizeof(int)); 108 | float closev, x; 109 | float *cent = (float *)calloc(classes * size, sizeof(float)); 110 | for (a = 0; a < words; a++) cl[a] = a % clcn; 111 | for (a = 0; a < iter; a++) { 112 | for (b = 0; b < clcn * size; b++) cent[b] = 0; 113 | for (b = 0; b < clcn; b++) centcn[b] = 1; 114 | for (c = 0; c < words; c++) { 115 | for (d = 0; d < size; d++) cent[size * cl[c] + d] += M[c * size + d]; 116 | centcn[cl[c]]++; 117 | } 118 | for (b = 0; b < clcn; b++) { 119 | closev = 0; 120 | for (c = 0; c < size; c++) { 121 | cent[size * b + c] /= centcn[b]; 122 | closev += cent[size * b + c] * cent[size * b + c]; 123 | } 124 | closev = sqrt(closev); 125 | for (c = 0; c < size; c++) cent[size * b + c] /= closev; 126 | } 127 | for (c = 0; c <
words; c++) { 128 | closev = -10; 129 | closeid = 0; 130 | for (d = 0; d < clcn; d++) { 131 | x = 0; 132 | for (b = 0; b < size; b++) x += cent[size * d + b] * M[c * size + b]; 133 | if (x > closev) { 134 | closev = x; 135 | closeid = d; 136 | } 137 | } 138 | cl[c] = closeid; 139 | } 140 | } 141 | 142 | // build an array of words ordered by class and their offsets (index where each class starts) 143 | int class_words[words]; 144 | int class_offsets[classes]; 145 | for(a = 0; a < classes; a++) class_offsets[a]=0; 146 | for(a = 0; a < words; a++) class_offsets[cl[a]]++; 147 | for(a = 1; a < classes; a++) class_offsets[a] += class_offsets[a-1]; 148 | for(a = 0; a < words; a++) class_words[--class_offsets[cl[a]]] = a; 149 | 150 | //reading from input 151 | while (1) { 152 | for (a = 0; a < N; a++) bestd[a] = 0; 153 | for (a = 0; a < N; a++) bestclasses[a] = 0; 154 | for (a = 0; a < N; a++) bestw[a][0] = 0; 155 | printf("Enter word or sentence (EXIT to break): "); 156 | a = 0; 157 | while (1) { 158 | st1[a] = fgetc(stdin); 159 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 160 | st1[a] = 0; 161 | break; 162 | } 163 | a++; 164 | } 165 | if (!strcmp(st1, "EXIT")) break; 166 | cn = 0; 167 | b = 0; 168 | c = 0; 169 | while (1) { 170 | st[cn][b] = st1[c]; 171 | b++; 172 | c++; 173 | st[cn][b] = 0; 174 | if (st1[c] == 0) break; 175 | if (st1[c] == ' ') { 176 | cn++; 177 | b = 0; 178 | c++; 179 | } 180 | } 181 | cn++; 182 | for (a = 0; a < cn; a++) { 183 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 184 | if (b == words) b = -1; 185 | bi[a] = b; 186 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 187 | if (b == -1) { 188 | printf("Out of dictionary word!\n"); 189 | break; 190 | } 191 | } 192 | if (b == -1) continue; 193 | begin = clock(); 194 | 195 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); 196 | 197 | for (a = 0; a < size; a++) vec[a] = 0; 198 | for (b = 0; b < cn; b++) { 199 | if (bi[b] == -1) continue; 200 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 201 | } 202 | 203 | len = 0; 204 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 205 | len = sqrt(len); 206 | for (a = 0; a < size; a++) vec[a] /= len; 207 | 208 | // find top N centroids 209 | for (a = 0; a < N; a++) bestclasses[a] = -1; 210 | for (a = 0; a < N; a++) bestclasses_ids[a] = -1; 211 | for (c = 0; c < classes; c++){ 212 | dist = 0; 213 | for (a = 0; a < size; a++) dist += vec[a] * cent[a + size * c]; 214 | for (a = 0; a < N; a++) { 215 | if (dist > bestclasses[a]) { 216 | for(d = N - 1; d > a; d--){ 217 | bestclasses[d] = bestclasses[d-1]; 218 | bestclasses_ids[d] = bestclasses_ids[d-1]; 219 | } 220 | bestclasses[a] = dist; 221 | bestclasses_ids[a] = c; 222 | break; 223 | } 224 | } 225 | } 226 | 227 | // find top N words in the centroids 228 | for (a = 0; a < N; a++) bestd[a] = -1; 229 | for (a = 0; a < N; a++) bestw[a][0] = 0; 230 | for (a = 0; a < N; a++){ 231 | c = words; 232 | if(bestclasses_ids[a] < classes-1) c = class_offsets[bestclasses_ids[a]+1]; 233 | b = class_offsets[bestclasses_ids[a]]; 234 | for(; b < c; b++){ 235 | dist = 0; 236 | for (d = 0; d < size; d++) dist += vec[d] * M[d + class_words[b] * size]; 237 | for (d = 0; d < N; d++){ 238 | if(dist > bestd[d]){ 239 | for (e = N -1; e > d; e--){ 240 | bestd[e] = bestd[e-1]; 241 | strcpy(bestw[e], bestw[e-1]); 242 | } 243 | bestd[d] = dist; 244 | strcpy(bestw[d], &vocab[class_words[b] * max_w]); 245 | break; 246 | } 247 | } 248 | } 249 | 
} 250 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 251 | printf("time spent = %f seconds\n", (double)(clock() - begin) / CLOCKS_PER_SEC); 252 | } 253 | // Save the K-means classes 254 | 255 | free(centcn); 256 | free(cent); 257 | free(cl); 258 | 259 | //start running distance 260 | return 0; 261 | } 262 | -------------------------------------------------------------------------------- /distance_txt.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include <stdio.h> 16 | #include <string.h> 17 | #include <math.h> 18 | #include <malloc.h> 19 | #include <time.h> 20 | 21 | const long long max_size = 2000; // max length of strings 22 | const long long N = 400; // number of closest words that will be shown 23 | const long long max_w = 1000; // max length of vocabulary entries 24 | 25 | #define MAX_STRING 1000 26 | void ReadWord(char *word, FILE *fin) { 27 | int a = 0, ch; 28 | while (!feof(fin)) { 29 | ch = fgetc(fin); 30 | if (ch == 13) continue; 31 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 32 | if (a > 0) { 33 | if (ch == '\n') ungetc(ch, fin); 34 | break; 35 | } 36 | if (ch == '\n') { 37 | strcpy(word, (char *)"</s>"); 38 | return; 39 | } else continue; 40 | } 41 | word[a] = ch; 42 | a++; 43 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 44 | } 45 | word[a] = 0; 46 | } 47 | 48 | int main(int argc, char **argv) { 49 | FILE *f; 50 | char st1[max_size]; 51 | char *bestw[N]; 52 | char file_name[max_size], st[100][max_size]; 53 | float dist, len, bestd[N], vec[max_size]; 54 | long long words, size, a, b, c, d, cn, bi[100]; 55 | float *M; 56 | char *vocab; 57 | char word[MAX_STRING]; 58 | clock_t begin; 59 | if (argc < 2) { 60 | printf("Usage: ./distance_txt <FILE>\nwhere FILE contains word projections in the TEXT FORMAT\n"); 61 | return 0; 62 | } 63 | strcpy(file_name, argv[1]); 64 | f = fopen(file_name, "rb"); 65 | if (f == NULL) { 66 | printf("Input file not found\n"); 67 | return -1; 68 | } 69 | ReadWord(word, f); 70 | words = atoi(word); 71 | ReadWord(word, f); 72 | size = atoi(word); 73 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 74 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 75 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 76 | if (M == NULL) { 77 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 78 | return -1; 79 | } 80 | for (b = 0; b < words; b++) { 81 | a = 0; 82 | while (1) { 83 | vocab[b * max_w + a] = fgetc(f); 84 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 85 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 86 | } 87 | vocab[b * max_w + a] = 0; 88 | for (a = 0; a < size; a++) { 89 | ReadWord(word,f); 90 | M[a + b * size] = atof(word); 91 | } 92 | len = 0; 93 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 94
| len = sqrt(len); 95 | for (a = 0; a < size; a++) M[a + b * size] /= len; 96 | } 97 | fclose(f); 98 | while (1) { 99 | for (a = 0; a < N; a++) bestd[a] = 0; 100 | for (a = 0; a < N; a++) bestw[a][0] = 0; 101 | printf("Enter word or sentence (EXIT to break): "); 102 | a = 0; 103 | while (1) { 104 | st1[a] = fgetc(stdin); 105 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 106 | st1[a] = 0; 107 | break; 108 | } 109 | a++; 110 | } 111 | if (!strcmp(st1, "EXIT")) break; 112 | cn = 0; 113 | b = 0; 114 | c = 0; 115 | while (1) { 116 | st[cn][b] = st1[c]; 117 | b++; 118 | c++; 119 | st[cn][b] = 0; 120 | if (st1[c] == 0) break; 121 | if (st1[c] == ' ') { 122 | cn++; 123 | b = 0; 124 | c++; 125 | } 126 | } 127 | cn++; 128 | for (a = 0; a < cn; a++) { 129 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 130 | if (b == words) b = -1; 131 | bi[a] = b; 132 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 133 | if (b == -1) { 134 | printf("Out of dictionary word!\n"); 135 | break; 136 | } 137 | } 138 | if (b == -1) continue; 139 | begin = clock(); 140 | 141 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); 142 | for (a = 0; a < size; a++) vec[a] = 0; 143 | for (b = 0; b < cn; b++) { 144 | if (bi[b] == -1) continue; 145 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 146 | } 147 | len = 0; 148 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 149 | len = sqrt(len); 150 | for (a = 0; a < size; a++) vec[a] /= len; 151 | for (a = 0; a < N; a++) bestd[a] = -1; 152 | for (a = 0; a < N; a++) bestw[a][0] = 0; 153 | for (c = 0; c < words; c++) { 154 | a = 0; 155 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 156 | if (a == 1) continue; 157 | dist = 0; 158 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 159 | for (a = 0; a < N; a++) { 160 | if (dist > bestd[a]) { 161 | for (d = N - 1; d > a; d--) { 162 | bestd[d] = bestd[d - 1]; 163 | strcpy(bestw[d], bestw[d - 1]); 164 | } 165 | bestd[a] = dist; 166 | strcpy(bestw[a], &vocab[c * max_w]); 167 | break; 168 | } 169 | } 170 | } 171 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 172 | printf("time spent = %f seconds\n", (double)(clock() - begin) / CLOCKS_PER_SEC); 173 | } 174 | return 0; 175 | } 176 | -------------------------------------------------------------------------------- /kmeans_txt.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
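//
// kmeans_txt: clusters text-format embeddings and writes one "class word" pair per line.
// Assignment uses the dot product against unit-length centroids (cosine similarity), and
// the per-class word lists are built with a counting sort: count the members of each
// class, turn the counts into running offsets, then place each word at
// --class_offsets[cl[word]]. Only 2 k-means iterations are run here, so the clustering
// is rough; raising iter would tighten the classes at the cost of run time.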
14 | 15 | #include <stdio.h> 16 | #include <string.h> 17 | #include <math.h> 18 | #include <malloc.h> 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | #define MAX_STRING 100 25 | void ReadWord(char *word, FILE *fin) { 26 | int a = 0, ch; 27 | while (!feof(fin)) { 28 | ch = fgetc(fin); 29 | if (ch == 13) continue; 30 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 31 | if (a > 0) { 32 | if (ch == '\n') ungetc(ch, fin); 33 | break; 34 | } 35 | if (ch == '\n') { 36 | strcpy(word, (char *)"</s>"); 37 | return; 38 | } else continue; 39 | } 40 | word[a] = ch; 41 | a++; 42 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 43 | } 44 | word[a] = 0; 45 | } 46 | 47 | int main(int argc, char **argv) { 48 | FILE *f; 49 | char *bestw[N]; 50 | char file_name[max_size], output_file[max_size]; 51 | float len; 52 | long long words, size, a, b, c, d; 53 | float *M; 54 | char *vocab; 55 | char word[MAX_STRING]; 56 | if (argc < 4) { 57 | printf("Usage: ./kmeans_txt <FILE> <OUTPUT> <CLASSES>\nwhere FILE contains features\n "); 58 | return 0; 59 | } 60 | strcpy(file_name, argv[1]); 61 | strcpy(output_file, argv[2]); 62 | int classes = atoi(argv[3]); 63 | f = fopen(file_name, "rb"); 64 | if (f == NULL) { 65 | printf("Input file not found\n"); 66 | return -1; 67 | } 68 | 69 | FILE *fo = fopen(output_file, "wb"); 70 | 71 | ReadWord(word, f); 72 | words = atoi(word); 73 | ReadWord(word, f); 74 | size = atoi(word); 75 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 76 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 77 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 78 | if (M == NULL) { 79 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 80 | return -1; 81 | } 82 | for (b = 0; b < words; b++) { 83 | a = 0; 84 | while (1) { 85 | vocab[b * max_w + a] = fgetc(f); 86 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 87 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 88 | } 89 | vocab[b * max_w + a] = 0; 90 | for (a = 0; a < size; a++) { 91 | ReadWord(word,f); 92 | M[a + b * size] = atof(word); 93 | } 94 | len = 0; 95 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 96 | len = sqrt(len); 97 | for (a = 0; a < size; a++) M[a + b * size] /= len; 98 | } 99 | fclose(f); 100 | 101 | //run kmeans 102 | int clcn = classes, iter = 2, closeid; 103 | int *centcn = (int *)malloc(classes * sizeof(int)); 104 | int *cl = (int *)calloc(words, sizeof(int)); 105 | float closev, x; 106 | float *cent = (float *)calloc(classes * size, sizeof(float)); 107 | for (a = 0; a < words; a++) cl[a] = a % clcn; 108 | for (a = 0; a < iter; a++) { 109 | for (b = 0; b < clcn * size; b++) cent[b] = 0; 110 | for (b = 0; b < clcn; b++) centcn[b] = 1; 111 | for (c = 0; c < words; c++) { 112 | for (d = 0; d < size; d++) cent[size * cl[c] + d] += M[c * size + d]; 113 | centcn[cl[c]]++; 114 | } 115 | for (b = 0; b < clcn; b++) { 116 | closev = 0; 117 | for (c = 0; c < size; c++) { 118 | cent[size * b + c] /= centcn[b]; 119 | closev += cent[size * b + c] * cent[size * b + c]; 120 | } 121 | closev = sqrt(closev); 122 | for (c = 0; c < size; c++) cent[size * b + c] /= closev; 123 | } 124 | for (c = 0; c < words; c++) { 125 | closev = -10; 126 | closeid = 0; 127 | for (d = 0; d < clcn; d++) { 128 | x = 0; 129 | for (b = 0; b < size; b++) x += cent[size * d + b] * M[c *
size + b]; 130 | if (x > closev) { 131 | closev = x; 132 | closeid = d; 133 | } 134 | } 135 | cl[c] = closeid; 136 | } 137 | } 138 | 139 | // build an array of words ordered by class and their offsets (index where each class starts) 140 | int class_words[words]; 141 | int class_offsets[classes]; 142 | for(a = 0; a < classes; a++) class_offsets[a]=0; 143 | for(a = 0; a < words; a++) class_offsets[cl[a]]++; 144 | for(a = 1; a < classes; a++) class_offsets[a] += class_offsets[a-1]; 145 | for(a = 0; a < words; a++) class_words[--class_offsets[cl[a]]] = a; 146 | 147 | for (a = 0; a < classes; a++){ 148 | c = words; 149 | if(a < classes-1) c = class_offsets[a+1]; 150 | b = class_offsets[a]; 151 | for(; b < c; b++){ 152 | fprintf(fo, "%lld %s\n", a, &vocab[class_words[b] * max_w]); 153 | } 154 | } 155 | // Save the K-means classes 156 | //for (a = 0; a < words; a++) fprintf(fo, "%s %d\n", &vocab[a * max_w], cl[a]); 157 | free(centcn); 158 | free(cent); 159 | free(cl); 160 | free(M); 161 | free(vocab); 162 | return 0; 163 | } 164 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions 3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result -g 4 | 5 | all: word2vec word2phrase distance word-analogy compute-accuracy distance_txt distance_fast kmeans_txt 6 | 7 | word2vec : word2vec.c 8 | # touch word2vec 9 | $(CC) word2vec.c -o word2vec $(CFLAGS) 10 | word2phrase : word2phrase.c 11 | $(CC) word2phrase.c -o word2phrase $(CFLAGS) 12 | distance : distance.c 13 | $(CC) distance.c -o distance $(CFLAGS) 14 | distance_txt : distance_txt.c 15 | $(CC) distance_txt.c -o distance_txt $(CFLAGS) 16 | distance_fast : distance_fast.c 17 | $(CC) distance_fast.c -o distance_fast $(CFLAGS) 18 | kmeans_txt : kmeans_txt.c 19 | $(CC) kmeans_txt.c -o kmeans_txt $(CFLAGS) 20 | word-analogy : word-analogy.c 21 | $(CC) word-analogy.c -o word-analogy $(CFLAGS) 22 | compute-accuracy : compute-accuracy.c 23 | $(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS) 24 | # chmod +x *.sh 25 | clean: 26 | rm -rf word2vec word2phrase distance word-analogy compute-accuracy distance_txt distance_fast kmeans_txt 27 | -------------------------------------------------------------------------------- /word-analogy.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
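//
// word-analogy: answers "a is to b as c is to ?" queries. It reads three words, forms
//
//     vec = M[b] - M[a] + M[c]
//
// from the unit-normalised embedding rows, re-normalises vec, and prints the N
// vocabulary words with the largest dot product against it (the three query words
// themselves are excluded from the ranking).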
14 | 15 | #include <stdio.h> 16 | #include <string.h> 17 | #include <math.h> 18 | #include <malloc.h> 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char bestw[N][max_size]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | float *M; 32 | char *vocab; 33 | if (argc < 2) { 34 | printf("Usage: ./word-analogy <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n"); 35 | return 0; 36 | } 37 | strcpy(file_name, argv[1]); 38 | f = fopen(file_name, "rb"); 39 | if (f == NULL) { 40 | printf("Input file not found\n"); 41 | return -1; 42 | } 43 | fscanf(f, "%lld", &words); 44 | fscanf(f, "%lld", &size); 45 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 46 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 47 | if (M == NULL) { 48 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 49 | return -1; 50 | } 51 | for (b = 0; b < words; b++) { 52 | a = 0; 53 | while (1) { 54 | vocab[b * max_w + a] = fgetc(f); 55 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 56 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 57 | } 58 | vocab[b * max_w + a] = 0; 59 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 60 | len = 0; 61 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 62 | len = sqrt(len); 63 | for (a = 0; a < size; a++) M[a + b * size] /= len; 64 | } 65 | fclose(f); 66 | while (1) { 67 | for (a = 0; a < N; a++) bestd[a] = 0; 68 | for (a = 0; a < N; a++) bestw[a][0] = 0; 69 | printf("Enter three words (EXIT to break): "); 70 | a = 0; 71 | while (1) { 72 | st1[a] = fgetc(stdin); 73 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 74 | st1[a] = 0; 75 | break; 76 | } 77 | a++; 78 | } 79 | if (!strcmp(st1, "EXIT")) break; 80 | cn = 0; 81 | b = 0; 82 | c = 0; 83 | while (1) { 84 | st[cn][b] = st1[c]; 85 | b++; 86 | c++; 87 | st[cn][b] = 0; 88 | if (st1[c] == 0) break; 89 | if (st1[c] == ' ') { 90 | cn++; 91 | b = 0; 92 | c++; 93 | } 94 | } 95 | cn++; 96 | if (cn < 3) { 97 | printf("Only %lld words were entered..
three words are needed at the input to perform the calculation\n", cn); 98 | continue; 99 | } 100 | for (a = 0; a < cn; a++) { 101 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 102 | if (b == words) b = 0; 103 | bi[a] = b; 104 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 105 | if (b == 0) { 106 | printf("Out of dictionary word!\n"); 107 | break; 108 | } 109 | } 110 | if (b == 0) continue; 111 | printf("\n Word Distance\n------------------------------------------------------------------------\n"); 112 | for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size]; 113 | len = 0; 114 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 115 | len = sqrt(len); 116 | for (a = 0; a < size; a++) vec[a] /= len; 117 | for (a = 0; a < N; a++) bestd[a] = 0; 118 | for (a = 0; a < N; a++) bestw[a][0] = 0; 119 | for (c = 0; c < words; c++) { 120 | if (c == bi[0]) continue; 121 | if (c == bi[1]) continue; 122 | if (c == bi[2]) continue; 123 | a = 0; 124 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 125 | if (a == 1) continue; 126 | dist = 0; 127 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 128 | for (a = 0; a < N; a++) { 129 | if (dist > bestd[a]) { 130 | for (d = N - 1; d > a; d--) { 131 | bestd[d] = bestd[d - 1]; 132 | strcpy(bestw[d], bestw[d - 1]); 133 | } 134 | bestd[a] = dist; 135 | strcpy(bestw[a], &vocab[c * max_w]); 136 | break; 137 | } 138 | } 139 | } 140 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 141 | } 142 | return 0; 143 | } 144 | -------------------------------------------------------------------------------- /word2phrase.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
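//
// word2phrase: merges frequent bigrams into single tokens. For each adjacent pair a, b
// it computes
//
//     score = (count(a_b) - min_count) / (count(a) * count(b)) * train_words
//
// and joins the pair as "a_b" whenever score > threshold, so pairs that co-occur more
// often than chance (e.g. "new york" -> "new_york") are fused while incidental pairs
// stay separate. Running the tool a second time over its own output produces trigrams
// and longer phrases.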
14 | 15 | #include <stdio.h> 16 | #include <string.h> 17 | #include <math.h> 18 | #include <stdlib.h> 19 | #include <malloc.h> 20 | 21 | #define MAX_STRING 60 22 | 23 | const int vocab_hash_size = 500000000; // Maximum 500M entries in the vocabulary 24 | 25 | typedef float real; // Precision of float numbers 26 | 27 | struct vocab_word { 28 | long long cn; 29 | char *word; 30 | }; 31 | 32 | char train_file[MAX_STRING], output_file[MAX_STRING]; 33 | struct vocab_word *vocab; 34 | int debug_mode = 2, min_count = 5, *vocab_hash, min_reduce = 1; 35 | long long vocab_max_size = 10000, vocab_size = 0; 36 | long long train_words = 0; 37 | real threshold = 100; 38 | 39 | unsigned long long next_random = 1; 40 | 41 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 42 | void ReadWord(char *word, FILE *fin) { 43 | int a = 0, ch; 44 | while (!feof(fin)) { 45 | ch = fgetc(fin); 46 | if (ch == 13) continue; 47 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 48 | if (a > 0) { 49 | if (ch == '\n') ungetc(ch, fin); 50 | break; 51 | } 52 | if (ch == '\n') { 53 | strcpy(word, (char *)"</s>"); 54 | return; 55 | } else continue; 56 | } 57 | word[a] = ch; 58 | a++; 59 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 60 | } 61 | word[a] = 0; 62 | } 63 | 64 | // Returns hash value of a word 65 | int GetWordHash(char *word) { 66 | unsigned long long a, hash = 1; 67 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 68 | hash = hash % vocab_hash_size; 69 | return hash; 70 | } 71 | 72 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 73 | int SearchVocab(char *word) { 74 | unsigned int hash = GetWordHash(word); 75 | while (1) { 76 | if (vocab_hash[hash] == -1) return -1; 77 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 78 | hash = (hash + 1) % vocab_hash_size; 79 | } 80 | return -1; 81 | } 82 | 83 | // Reads a word and returns its index in the vocabulary 84 | int ReadWordIndex(FILE *fin) { 85 | char word[MAX_STRING]; 86 | ReadWord(word, fin); 87 | if (feof(fin)) return -1; 88 | return SearchVocab(word); 89 | } 90 | 91 | // Adds a word to the vocabulary 92 | int AddWordToVocab(char *word) { 93 | unsigned int hash, length = strlen(word) + 1; 94 | if (length > MAX_STRING) length = MAX_STRING; 95 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 96 | strcpy(vocab[vocab_size].word, word); 97 | vocab[vocab_size].cn = 0; 98 | vocab_size++; 99 | // Reallocate memory if needed 100 | if (vocab_size + 2 >= vocab_max_size) { 101 | vocab_max_size += 10000; 102 | vocab=(struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 103 | } 104 | hash = GetWordHash(word); 105 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 106 | vocab_hash[hash]=vocab_size - 1; 107 | return vocab_size - 1; 108 | } 109 | 110 | // Used later for sorting by word counts 111 | int VocabCompare(const void *a, const void *b) { 112 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 113 | } 114 | 115 | // Sorts the vocabulary by frequency using word counts 116 | void SortVocab() { 117 | int a; 118 | unsigned int hash; 119 | // Sort the vocabulary and keep </s> at the first position 120 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 121 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 122 | for (a = 0; a < vocab_size; a++) { 123 | // Words occurring less than min_count times will be discarded from the vocab 124 | if (vocab[a].cn < min_count) { 125 | vocab_size--; 126 |
free(vocab[vocab_size].word); 127 | } else { 128 | // Hash will be re-computed, as after the sorting it is no longer valid 129 | hash = GetWordHash(vocab[a].word); 130 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 131 | vocab_hash[hash] = a; 132 | } 133 | } 134 | vocab = (struct vocab_word *)realloc(vocab, vocab_size * sizeof(struct vocab_word)); 135 | } 136 | 137 | // Reduces the vocabulary by removing infrequent tokens 138 | void ReduceVocab() { 139 | int a, b = 0; 140 | unsigned int hash; 141 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 142 | vocab[b].cn = vocab[a].cn; 143 | vocab[b].word = vocab[a].word; 144 | b++; 145 | } else free(vocab[a].word); 146 | vocab_size = b; 147 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 148 | for (a = 0; a < vocab_size; a++) { 149 | // Hash will be re-computed, as it is no longer valid 150 | hash = GetWordHash(vocab[a].word); 151 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 152 | vocab_hash[hash] = a; 153 | } 154 | fflush(stdout); 155 | min_reduce++; 156 | } 157 | 158 | void LearnVocabFromTrainFile() { 159 | char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2]; 160 | FILE *fin; 161 | long long a, i, start = 1; 162 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 163 | fin = fopen(train_file, "rb"); 164 | if (fin == NULL) { 165 | printf("ERROR: training data file not found!\n"); 166 | exit(1); 167 | } 168 | vocab_size = 0; 169 | AddWordToVocab((char *)"</s>"); 170 | while (1) { 171 | ReadWord(word, fin); 172 | if (feof(fin)) break; 173 | if (!strcmp(word, "</s>")) { 174 | start = 1; 175 | continue; 176 | } else start = 0; 177 | train_words++; 178 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 179 | printf("Words processed: %lldK Vocab size: %lldK %c", train_words / 1000, vocab_size / 1000, 13); 180 | fflush(stdout); 181 | } 182 | i = SearchVocab(word); 183 | if (i == -1) { 184 | a = AddWordToVocab(word); 185 | vocab[a].cn = 1; 186 | } else vocab[i].cn++; 187 | if (start) continue; 188 | sprintf(bigram_word, "%s_%s", last_word, word); 189 | bigram_word[MAX_STRING - 1] = 0; 190 | strcpy(last_word, word); 191 | i = SearchVocab(bigram_word); 192 | if (i == -1) { 193 | a = AddWordToVocab(bigram_word); 194 | vocab[a].cn = 1; 195 | } else vocab[i].cn++; 196 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 197 | } 198 | SortVocab(); 199 | if (debug_mode > 0) { 200 | printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size); 201 | printf("Words in train file: %lld\n", train_words); 202 | } 203 | fclose(fin); 204 | } 205 | 206 | void TrainModel() { 207 | long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0; 208 | char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2]; 209 | real score; 210 | FILE *fo, *fin; 211 | printf("Starting training using file %s\n", train_file); 212 | LearnVocabFromTrainFile(); 213 | fin = fopen(train_file, "rb"); 214 | fo = fopen(output_file, "wb"); 215 | word[0] = 0; 216 | while (1) { 217 | strcpy(last_word, word); 218 | ReadWord(word, fin); 219 | if (feof(fin)) break; 220 | if (!strcmp(word, "</s>")) { 221 | fprintf(fo, "\n"); 222 | continue; 223 | } 224 | cn++; 225 | if ((debug_mode > 1) && (cn % 100000 == 0)) { 226 | printf("Words written: %lldK%c", cn / 1000, 13); 227 | fflush(stdout); 228 | } 229 | oov = 0; 230 | i = SearchVocab(word); 231 | if (i == -1) oov = 1; else pb = vocab[i].cn; 232 | if (li == -1) oov = 1; 233 | li = i; 234 | sprintf(bigram_word, "%s_%s", last_word, word); 235 |
bigram_word[MAX_STRING - 1] = 0; 236 | i = SearchVocab(bigram_word); 237 | if (i == -1) oov = 1; else pab = vocab[i].cn; 238 | if (pa < min_count) oov = 1; 239 | if (pb < min_count) oov = 1; 240 | if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words; 241 | if (score > threshold) { 242 | fprintf(fo, "_%s", word); 243 | pb = 0; 244 | } else fprintf(fo, " %s", word); 245 | pa = pb; 246 | } 247 | fclose(fo); 248 | fclose(fin); 249 | } 250 | 251 | int ArgPos(char *str, int argc, char **argv) { 252 | int a; 253 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 254 | if (a == argc - 1) { 255 | printf("Argument missing for %s\n", str); 256 | exit(1); 257 | } 258 | return a; 259 | } 260 | return -1; 261 | } 262 | 263 | int main(int argc, char **argv) { 264 | int i; 265 | if (argc == 1) { 266 | printf("WORD2PHRASE tool v0.1a\n\n"); 267 | printf("Options:\n"); 268 | printf("Parameters for training:\n"); 269 | printf("\t-train <file>\n"); 270 | printf("\t\tUse text data from <file> to train the model\n"); 271 | printf("\t-output <file>\n"); 272 | printf("\t\tUse <file> to save the resulting word vectors / word clusters / phrases\n"); 273 | printf("\t-min-count <int>\n"); 274 | printf("\t\tThis will discard words that appear less than <int> times; default is 5\n"); 275 | printf("\t-threshold <float>\n"); 276 | printf("\t\t The <float> value represents threshold for forming the phrases (higher means less phrases); default 100\n"); 277 | printf("\t-debug <int>\n"); 278 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 279 | printf("\nExamples:\n"); 280 | printf("./word2phrase -train text.txt -output phrases.txt -threshold 100 -debug 2\n\n"); 281 | return 0; 282 | } 283 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 284 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 285 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 286 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 287 | if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) threshold = atof(argv[i + 1]); 288 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 289 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 290 | TrainModel(); 291 | return 0; 292 | } 293 | -------------------------------------------------------------------------------- /word2vec.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
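//
// word2vec: the training tool, extended here for incremental training. struct vocab_word
// carries a "fixed" flag, and the -fixed-embeddings option (see the README) loads vectors
// from a previously trained model so that embeddings for new vocabulary can be induced
// against them. The training loop falls outside this excerpt, so the exact mechanics are
// not shown; presumably the flag gates the gradient update for the frozen rows of syn0.
// Negative samples are drawn from the standard unigram table built in InitUnigramTable,
// i.e. with probability proportional to count(w)^0.75.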

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <stdbool.h>
#include <assert.h>
#include <unistd.h>
#include <time.h>

#define MAX_STRING 10000
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CODE_LENGTH 40

const int vocab_hash_size = 30 * 1000 * 1000;  // Maximum 30M * 0.7 = 21M words in the vocabulary

typedef float real;  // Precision of float numbers

struct vocab_word {
  long long cn;
  int *point;
  bool fixed;
  char *word, *code, codelen;
};

char train_file[MAX_STRING], output_word_vectors_file[MAX_STRING], output_context_vectors_file[MAX_STRING];
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
char fixed_word_vectors_file[MAX_STRING], fixed_context_vectors_file[MAX_STRING];
struct vocab_word *vocab;
int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
int *vocab_hash;
long long vocab_max_size = 10000, vocab_size = 0, layer1_size = 100;
long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;
real *syn0, *syn1, *syn1neg, *expTable;
clock_t start;

int window_offset, window_layer_size;

int window_hidden_size = 500;

int hs = 0, negative = 5, no_header = 0;
const int table_size = 1e8;
int *table;

// contrastive negative sampling
char negative_classes_file[MAX_STRING];
int *word_to_group;
int *group_to_table;  // group_size * table_size
int class_number;

real hardTanh(real x) {
  if (x >= 1) {
    return 1;
  } else if (x <= -1) {
    return -1;
  } else {
    return x;
  }
}

real dHardTanh(real x, real g) {
  if (x > 1 && g > 0) {
    return 0;
  }
  if (x < -1 && g < 0) {
    return 0;
  }
  return 1;
}

void InitUnigramTable() {
  int a, i;
  long long train_words_pow = 0;
  real d1, power = 0.75;
  table = (int *)malloc(table_size * sizeof(int));
  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
  i = 0;
  d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    if (a / (real)table_size > d1) {
      i++;
      d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
    }
    if (i >= vocab_size) i = vocab_size - 1;
  }
}

// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
void ReadWord(char *word, FILE *fin) {
  int a = 0, ch;
  while (!feof(fin)) {
    ch = fgetc(fin);
    if (ch == 13) continue;
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) {
        if (ch == '\n') ungetc(ch, fin);
        break;
      }
      if (ch == '\n') {
        strcpy(word, (char *)"</s>");
        return;
      } else continue;
    }
    word[a] = ch;
    a++;
    if (a >= MAX_STRING - 1) a--;  // Truncate too long words
  }
  word[a] = 0;
}

// Returns hash value of a word
int GetWordHash(char *word) {
  unsigned long long a, hash = 0;
  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
  hash = hash % vocab_hash_size;
  return hash;
}

// Returns position of a word in the vocabulary; if the word is not found, returns -1
int SearchVocab(char *word) {
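  // The vocabulary hash table uses open addressing with linear probing: a
  // word's home slot is GetWordHash(word); on collision the probe moves to the
  // next slot (mod vocab_hash_size) until the word or an empty slot (-1) is
  // found. This is also why the table is kept at most ~70% full (see the
  // ReduceVocab() trigger): short probe chains keep lookups O(1) on average.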
  unsigned int hash = GetWordHash(word);
  while (1) {
    if (vocab_hash[hash] == -1) return -1;
    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
    hash = (hash + 1) % vocab_hash_size;
  }
  return -1;
}

// Reads a word and returns its index in the vocabulary
int ReadWordIndex(FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin);
  if (feof(fin)) return -1;
  return SearchVocab(word);
}

// Adds a word to the vocabulary
int AddWordToVocab(char *word) {
  unsigned int hash, length = strlen(word) + 1;
  if (length > MAX_STRING) length = MAX_STRING;
  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
  strcpy(vocab[vocab_size].word, word);
  vocab[vocab_size].cn = 0;
  vocab[vocab_size].fixed = false;
  vocab_size++;
  // Reallocate memory if needed
  if (vocab_size + 2 >= vocab_max_size) {
    vocab_max_size += 10000;
    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
  }
  hash = GetWordHash(word);
  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
  vocab_hash[hash] = vocab_size - 1;
  return vocab_size - 1;
}

// Used later for sorting by word counts
int VocabCompare(const void *a, const void *b) {
  return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}

// Sorts the vocabulary by frequency using word counts
void SortVocab() {
  int a, size;
  unsigned int hash;
  // Sort the vocabulary and keep "</s>" at the first position
  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  size = vocab_size;
  train_words = 0;
  for (a = 0; a < size; a++) {
    // Words occurring less than min_count times will be discarded from the vocab
    if ((vocab[a].cn < min_count) && (a != 0)) {
      vocab_size--;
      free(vocab[a].word);
    } else {
      // Hash will be re-computed, as it is no longer valid after the sorting
      hash = GetWordHash(vocab[a].word);
      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
      vocab_hash[hash] = a;
      train_words += vocab[a].cn;
    }
  }
  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
  // Allocate memory for the binary tree construction
  for (a = 0; a < vocab_size; a++) {
    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
  }
}

// Reduces the vocabulary by removing infrequent tokens
void ReduceVocab() {
  int a, b = 0;
  unsigned int hash;
  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
    vocab[b].cn = vocab[a].cn;
    vocab[b].word = vocab[a].word;
    vocab[b].fixed = vocab[a].fixed;
    b++;
  } else free(vocab[a].word);
  vocab_size = b;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  for (a = 0; a < vocab_size; a++) {
    // Hash will be re-computed, as it is no longer valid
    hash = GetWordHash(vocab[a].word);
    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    vocab_hash[hash] = a;
  }
  fflush(stdout);
  min_reduce++;
}

void ReadVectors(real *word_vectors, char *embeddings_filename) {
  FILE *fp = NULL;
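  // ReadVectors parses a text-format embeddings file (the same layout this
  // tool writes): a header line "<num_words> <vector_size>" followed by one
  // line per word, "<word> <v_0> <v_1> ... <v_{vector_size-1}>". For every
  // word already in the vocabulary, the parsed values overwrite the
  // corresponding row of word_vectors; unknown words are ignored. Note that
  // vector_size is expected to match layer1_size, since values are stored at
  // word_vectors[i * layer1_size + j].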
  char *line = NULL;
  char *buffer = NULL;
  size_t len = 0;
  int vector_size = 0;
  ssize_t read;
  bool first_line = true;
  int j;

  if (embeddings_filename[0] == 0) { return; }

  fp = fopen(embeddings_filename, "r");
  if (fp == NULL) exit(EXIT_FAILURE);

  while ((read = getline(&line, &len, fp)) != -1) {
    char *first_field = NULL;
    long long i;
    // assert read != 0
    // discard the first line in the embeddings file (metadata)
    if (first_line) {
      first_line = false;
      // ignore the number of words
      buffer = strtok(line, " ");
      assert(buffer != NULL);
      // read the vector size
      buffer = strtok(NULL, " ");
      assert(buffer != NULL);
      vector_size = atoi(buffer);
      assert(vector_size > 0);
      continue;
    }

    // each line consists of the word and its embeddings. only read the word.
    first_field = strtok(line, " ");
    assert(first_field != NULL);
    i = SearchVocab(first_field);

    // ignore words which are not already in the vocabulary
    if (i == -1) { continue; }

    // now read the vector one value at a time.
    for (j = 0; j < vector_size; ++j) {
      buffer = strtok(NULL, " ");
      assert(buffer != NULL);
      real value = atof(buffer);
      word_vectors[i * layer1_size + j] = value;
    }
  }
  fclose(fp);
}

// add the words in fixed_word_vectors_file to the vocabulary.
void ReadFixedEmbeddingsVocab() {
  FILE *fp = NULL;
  char *line = NULL;
  size_t len = 0;
  ssize_t read;

  printf("entering ReadFixedEmbeddingsVocab()\n");

  // do nothing if the file isn't specified.
  if (fixed_word_vectors_file[0] == 0) return;

  // assume it's a text mode embeddings file
  fp = fopen(fixed_word_vectors_file, "r");
  if (fp == NULL) exit(EXIT_FAILURE);

  bool first_line = true;
  while ((read = getline(&line, &len, fp)) != -1) {
    char *first_field = NULL;
    long long i;
    // discard the first line in the embeddings file (metadata)
    if (first_line) { first_line = false; continue; }
    // each line consists of the word and its embeddings. only read the word.
    first_field = strtok(line, " ");
    i = SearchVocab(first_field);
    // ignore new words and update the count of existing words so that they won't be pruned.
    if (i == -1) {
      // do nothing.
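      // A word listed in the fixed embeddings file but absent from the
      // training vocabulary is skipped entirely. For words that do exist, the
      // else branch below (a) raises cn to at least min_count so SortVocab()
      // will never prune a word we have a fixed vector for, and (b) sets
      // fixed = true, which TrainModelThread() later checks to skip gradient
      // updates for this word's vectors.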
      continue;
    } else {
      vocab[i].cn = fmax(min_count, vocab[i].cn);
      vocab[i].fixed = true;
    }
  }
  fclose(fp);
}

void LearnVocabFromTrainFile() {
  char word[MAX_STRING];
  FILE *fin;
  long long a, i;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;
  AddWordToVocab((char *)"</s>");
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("%lldK%c", train_words / 1000, 13);
      fflush(stdout);
    }
    i = SearchVocab(word);
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
      vocab[a].fixed = false;
    } else vocab[i].cn++;
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  ReadFixedEmbeddingsVocab();
  printf("will sort vocab...\n");
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  file_size = ftell(fin);
  fclose(fin);
}

void SaveVocab() {
  long long i;
  FILE *fo = fopen(save_vocab_file, "wb");
  for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
  fclose(fo);
}

void ReadVocab() {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(read_vocab_file, "rb");
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  vocab_size = 0;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    a = AddWordToVocab(word);
    fscanf(fin, "%lld%c", &vocab[a].cn, &c);
    i++;
  }
  ReadFixedEmbeddingsVocab();
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  fseek(fin, 0, SEEK_END);
  file_size = ftell(fin);
  fclose(fin);
}

void InitClassUnigramTable() {
  long long a, c;
  printf("loading class unigrams \n");
  FILE *fin = fopen(negative_classes_file, "rb");
  if (fin == NULL) {
    printf("ERROR: class file not found!\n");
    exit(1);
  }
  word_to_group = (int *)malloc(vocab_size * sizeof(int));
  for (a = 0; a < vocab_size; a++) word_to_group[a] = -1;
  char class[MAX_STRING];
  char prev_class[MAX_STRING];
  prev_class[0] = 0;
  char word[MAX_STRING];
  class_number = -1;
  while (1) {
    if (feof(fin)) break;
    ReadWord(class, fin);
    ReadWord(word, fin);
    int word_index = SearchVocab(word);
    if (word_index != -1) {
      if (strcmp(class, prev_class) != 0) {
        class_number++;
        strcpy(prev_class, class);
      }
      word_to_group[word_index] = class_number;
    }
    ReadWord(word, fin);
  }
  class_number++;
  fclose(fin);

  group_to_table = (int *)malloc(table_size * class_number * sizeof(int));
  long long train_words_pow = 0;
  real d1, power = 0.75;

  for (c = 0; c < class_number; c++) {
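    // Each class c owns its own table_size-long slice of group_to_table,
    // filled the same way as the global unigram table in InitUnigramTable():
    // a word w restricted to class c occupies a fraction of the slice
    // proportional to cn(w)^0.75. Sampling a negative for class c is then a
    // uniform draw from group_to_table[c * table_size .. (c + 1) * table_size - 1].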
    long long offset = c * table_size;
    train_words_pow = 0;
    for (a = 0; a < vocab_size; a++) if (word_to_group[a] == c) train_words_pow += pow(vocab[a].cn, power);
    int i = 0;
    // find the first word of class c (bounds check first, to avoid reading past
    // the end of word_to_group)
    while (i < vocab_size && word_to_group[i] != c) i++;
    d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
    for (a = 0; a < table_size; a++) {
      //printf("index %lld , word %d\n", a, i);
      group_to_table[offset + a] = i;
      if (a / (real)table_size > d1) {
        i++;
        while (i < vocab_size && word_to_group[i] != c) i++;
        if (i < vocab_size) d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
      }
      // if we ran off the end of the vocabulary, fall back to the last word of class c
      if (i >= vocab_size) {
        i = vocab_size - 1;
        while (i >= 0 && word_to_group[i] != c) i--;
      }
    }
  }
}

void InitNet() {
  long long a, b;
  unsigned long long next_random = 1;
  window_layer_size = layer1_size * window * 2;
  a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
  if (syn0 == NULL) { printf("Memory allocation failed\n"); exit(1); }

  if (negative > 0) {
    a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1neg == NULL) { printf("Memory allocation failed\n"); exit(1); }

    // Initialize syn1neg to zero
    for (a = 0; a < vocab_size; a++) {
      for (b = 0; b < layer1_size; b++) {
        syn1neg[a * layer1_size + b] = 0;
      }
    }
    // Overwrite syn1neg for words in the fixed vocabulary
    ReadVectors(syn1neg, fixed_context_vectors_file);
  }
  // for each word in the vocab
  for (a = 0; a < vocab_size; a++) {
    // for each element in its vector representation
    for (b = 0; b < layer1_size; b++) {
      next_random = next_random * (unsigned long long)25214903917 + 11;
      // initialize syn0 with small random values in (-0.5, 0.5) / layer1_size
      syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
    }
  }
  // Overwrite syn0 for words in the fixed vocabulary
  ReadVectors(syn0, fixed_word_vectors_file);
}

void *TrainModelThread(void *id) {
  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  int input_len_1 = (type == 2 || type == 4) ? window_layer_size : layer1_size;
  real *neu1e = (real *)calloc(input_len_1, sizeof(real));

  FILE *fi = fopen(train_file, "rb");

  // Each thread starts reading from a different part of the file.
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);

  // keep processing tokens from the training data until either the check on the number of
  // tokens processed, or the check on the number of passes over the training data, breaks the loop.
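  // The loop below, in outline:
  //   1. every ~10K tokens: fold this thread's token count into the global
  //      word_count_actual, print progress, and linearly decay alpha as
  //      alpha = starting_alpha * (1 - word_count_actual / (iter * train_words + 1)),
  //      floored at 0.0001 * starting_alpha;
  //   2. if the sentence buffer is empty, refill sen[] from the file, applying
  //      frequency-based subsampling;
  //   3. train on the word at sentence_position, then advance;
  //   4. once the thread has processed train_words / num_threads tokens, seek
  //      back to its starting offset and begin the next pass, until local_iter
  //      passes are done.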
  while (1) {

    // every ~10K processed tokens, do some housekeeping: update the global count of processed
    // tokens (shared by all threads), print a progress report, and update the learning rate
    // (also shared by all threads)
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now = clock();
        printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
               word_count_actual / (real)(iter * train_words + 1) * 100,
               word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      // linearly decrease the learning rate (re-computed once per ~10K-token block).
      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }

    // sentence_length is reset before the first pass over the training data, after processing
    // all tokens in a sentence, and again after the thread processes the number of tokens it's
    // supposed to process in each pass.
    if (sentence_length == 0) {
      // Read the next token sequence from the train file.
      while (1) {
        word = ReadWordIndex(fi);
        if (feof(fi)) break;
        // skip words which don't belong to the vocabulary.
        if (word == -1) continue;
        word_count++;
        // word == 0 indicates a line break (the "</s>" token)
        if (word == 0) break;
        // The subsampling randomly discards frequent words while keeping the ranking the same
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          // skip more frequent words more often
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        // ignore later words in sentences which are too long.
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0;
    }

    // (approx.) finished processing this thread's share of tokens in this pass over the training data
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      local_iter--;
      // if done with all iterations, quit the loop.
      if (local_iter == 0) break;
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
      continue;
    }

    // this is the "middle word", at sentence_position in the sentence.
    word = sen[sentence_position];

    if (word == -1) continue;

    // zero-initialize the neu* (real) vectors.
    for (c = 0; c < input_len_1; c++) neu1e[c] = 0;

    // let window = 5, then b = 0, 1, 2, 3, or 4.
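    // b implements the "dynamic window": the effective window for this center
    // word is window - b, with b drawn uniformly from [0, window), so nearer
    // context words are used more often than distant ones. This acts as an
    // implicit weighting of context by distance.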
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;

    if (type == 1) {  // train skip-gram
      // loop over context words
      // let window = 5, b = 3: then a loops over 3, 4, 6, 7 (a == window == 5 is the
      // middle word itself and is skipped), which correspond to context words at
      // relative positions -2, -1, +1, +2
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        // the absolute position in the sentence for the current context word
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        // the index of the current context word
        last_word = sen[c];
        if (last_word == -1) continue;
        // the embedding of last_word starts at index l1 in the syn0 array
        l1 = last_word * layer1_size;
        // now, reuse c to loop over (and reset) the neu1e array
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // NEGATIVE SAMPLING
        // loop over samples
        if (negative > 0) for (d = 0; d < negative + 1; d++) {

          // in the first iteration, target is the correct word in the middle, with label = 1
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            // in the remaining iterations, target is a random word from the vocab, with label = 0
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              // contrastive negative sampling: draw from the unigram table of word's own class
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            // fix bad samples
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
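          // For intuition: with sigma(f) = 1 / (1 + e^(-f)) and f the dot
          // product computed below, the per-pair loss is
          //   -[label * log(sigma(f)) + (1 - label) * log(1 - sigma(f))],
          // whose gradient w.r.t. f is sigma(f) - label. The code folds the
          // sign and the learning rate into g = (label - sigma(f)) * alpha, so
          // both vector updates below are plain "+= g * (the other vector)".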
          // the index of the embedding of the target word in the syn1neg array.
          l2 = target * layer1_size;

          // f is the dot product between the context word's embedding (in syn0)
          // and the target word's embedding (in syn1neg)
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];

          // compute the scalar multiplier of the gradient update, which includes the sign;
          // outside [-MAX_EXP, MAX_EXP] the sigmoid saturates to 1 or 0
          if (f > MAX_EXP)
            g = (label - 1) * alpha;
          else if (f < -MAX_EXP)
            g = (label - 0) * alpha;
          else
            g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;

          // for the current context word, neu1e accumulates the updates that will later be
          // added to the (word embedding of the) context word in syn0
          for (c = 0; c < layer1_size; c++) {
            neu1e[c] += g * syn1neg[c + l2];
          }

          // Update the (context embedding of the) current target word in syn1neg
          if (!vocab[target].fixed || !fixed_context_vectors_file[0]) {
            for (c = 0; c < layer1_size; c++) {
              syn1neg[c + l2] += g * syn0[c + l1];
            }
          }
        }

        // Learn weights input -> hidden:
        // update the (word embedding of the) current context word in syn0
        if (!vocab[last_word].fixed) {
          for (c = 0; c < layer1_size; c++) {
            syn0[c + l1] += neu1e[c];
          }
        }
      }
    } else {
      printf("unknown type %i\n", type);
      exit(0);
    }
    sentence_position++;
    // done reading this sentence.
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1e);
  pthread_exit(NULL);
}

void TrainModel() {
  long a, b, c, d;
  FILE *fo;
  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
  printf("Starting training using file %s\n", train_file);
  starting_alpha = alpha;
  if (read_vocab_file[0] != 0) { ReadVocab(); }
  else { LearnVocabFromTrainFile(); }
  if (save_vocab_file[0] != 0) SaveVocab();
  if (output_word_vectors_file[0] == 0) {
    printf("output file not specified.\n");
    return;
  }
  printf("entering InitNet()...\n");
  InitNet();
  printf("exited InitNet()\n");
  if (negative > 0) InitUnigramTable();
  if (negative_classes_file[0] != 0) InitClassUnigramTable();
  start = clock();
  for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
  for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);

  // Save the word vectors
  fo = fopen(output_word_vectors_file, "wb");
  if (classes == 0) {
    if (!no_header) {
      fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
    }
    for (a = 0; a < vocab_size; a++) {
      // skip words which we already have (fixed) embeddings for
      if (vocab[a].fixed) { continue; }
      fprintf(fo, "%s ", vocab[a].word);
      if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
      else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
      fprintf(fo, "\n");
    }
  }
  fclose(fo);

  // Save the context vectors
  fo = fopen(output_context_vectors_file, "wb");
  if (classes == 0) {
    if (!no_header) {
      fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
    }
    for (a = 0; a < vocab_size; a++) {
      // skip contexts which we already have (fixed) embeddings for
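      // a context row is skipped only when it was actually loaded from a fixed
      // context-vectors file; if only the word vectors were fixed, the trained
      // context vector is still worth writing out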
      if (vocab[a].fixed && fixed_context_vectors_file[0]) { continue; }
      fprintf(fo, "%s ", vocab[a].word);
      if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn1neg[a * layer1_size + b], sizeof(real), 1, fo);
      else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn1neg[a * layer1_size + b]);
      fprintf(fo, "\n");
    }
  }
  fclose(fo);
}

int ArgPos(char *str, int argc, char **argv) {
  int a;
  for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
    if (a == argc - 1) {
      printf("Argument missing for %s\n", str);
      exit(1);
    }
    return a;
  }
  return -1;
}

int main(int argc, char **argv) {
  int i;
  if (argc == 1) {
    printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
    printf("Options:\n");
    printf("Parameters for training:\n");
    printf("\t-train <file>\n");
    printf("\t\tUse text data from <file> to train the model\n");
    printf("\t-output <file>\n");
    printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
    printf("\t-size <int>\n");
    printf("\t\tSet size of word vectors; default is 100\n");
    printf("\t-window <int>\n");
    printf("\t\tSet max skip length between words; default is 5\n");
    printf("\t-sample <float>\n");
    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
    printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
    printf("\t-hs <int>\n");
    printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
    printf("\t-negative <int>\n");
    printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
    printf("\t-negative-classes <file>\n");
    printf("\t\tDraw negative samples from within each word's class, as listed in <file>\n");
    printf("\t-threads <int>\n");
    printf("\t\tUse <int> threads (default 12)\n");
    printf("\t-iter <int>\n");
    printf("\t\tRun more training iterations (default 5)\n");
    printf("\t-min-count <int>\n");
    printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
    printf("\t-alpha <float>\n");
    printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
    printf("\t-classes <int>\n");
    printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
    printf("\t-debug <int>\n");
    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
    printf("\t-binary <int>\n");
    printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
    printf("\t-save-vocab <file>\n");
    printf("\t\tThe vocabulary will be saved to <file>\n");
    printf("\t-read-vocab <file>\n");
    printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
    printf("\t-type <int>\n");
    printf("\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type)\n");
    printf("\t-fix-embeddings <file>\n");
    printf("\t\tUse the provided embeddings to induce embeddings for new words not specified in this file.\n");
    printf("\t-no-header <int>\n");
    printf("\t\tDon't write the header line in the output embeddings file(s)\n");
    printf("\nExamples:\n");
    printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
    return 0;
  }
  output_word_vectors_file[0] = 0;
  save_vocab_file[0] = 0;
  read_vocab_file[0] = 0;
  negative_classes_file[0] = 0;
  if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-fix-embeddings", argc, argv)) > 0) {
    strcpy(fixed_word_vectors_file, argv[i + 1]);
    if (access(fixed_word_vectors_file, F_OK) == -1) {
      // fixed word embeddings file does not exist
      fixed_word_vectors_file[0] = 0;
    }
    strcpy(fixed_context_vectors_file, fixed_word_vectors_file);
    strcat(fixed_context_vectors_file, ".context");
    if (access(fixed_context_vectors_file, F_OK) == -1) {
      // fixed context embeddings file does not exist
      fixed_context_vectors_file[0] = 0;
    }
  }
  if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-type", argc, argv)) > 0) type = atoi(argv[i + 1]);
  if (type == 0 || type == 2 || type == 4) alpha = 0.05;
  if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) {
    strcpy(output_word_vectors_file, argv[i + 1]);
    strcpy(output_context_vectors_file, output_word_vectors_file);
    strcat(output_context_vectors_file, ".context");
  }
  if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-no-header", argc, argv)) > 0) no_header = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-negative-classes", argc, argv)) > 0) strcpy(negative_classes_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
  // Precompute the sigmoid table: expTable[i] holds sigma(x) for
  // x = (i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP, i.e. EXP_TABLE_SIZE evenly
  // spaced points on [-MAX_EXP, MAX_EXP]; TrainModelThread() indexes it with
  // (f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2).
  expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
  for (i = 0; i < EXP_TABLE_SIZE; i++) {
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);  // e^x
    expTable[i] = expTable[i] / (expTable[i] + 1);                    // sigma(x) = e^x / (e^x + 1)
  }
  TrainModel();
  return 0;
}
--------------------------------------------------------------------------------