├── LICENSE
├── README.md
├── compute-accuracy.c
├── distance.c
├── distance_fast.c
├── distance_txt.c
├── kmeans_txt.c
├── makefile
├── word-analogy.c
├── word2phrase.c
└── word2vec.c


/LICENSE:
--------------------------------------------------------------------------------
  1 | Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
203 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # incremental-word2vec
 2 | Modify word2vec such that it's possible to "condition" on existing embeddings for some words, and induce embeddings for new words.
 3 | 
 4 | 
 5 | # usage:
 6 | ```
 7 | ./word2vec -train testdemo.txt -output testdemo.oldmodel -size 200 -threads 12
 8 | ./word2vec -train new_data.txt -output testdemo.newmodel -size 200 -threads 12 -fixed-embeddings testdemo.oldmodel
 9 | ```
10 | 


--------------------------------------------------------------------------------
/compute-accuracy.c:
--------------------------------------------------------------------------------
  1 | //  Copyright 2013 Google Inc. All Rights Reserved.
  2 | //
  3 | //  Licensed under the Apache License, Version 2.0 (the "License");
  4 | //  you may not use this file except in compliance with the License.
  5 | //  You may obtain a copy of the License at
  6 | //
  7 | //      http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | //  Unless required by applicable law or agreed to in writing, software
 10 | //  distributed under the License is distributed on an "AS IS" BASIS,
 11 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | //  See the License for the specific language governing permissions and
 13 | //  limitations under the License.
 14 | 
 15 | #include <stdio.h>
 16 | #include <stdlib.h>
 17 | #include <string.h>
 18 | #include <math.h>
 19 | #include <stdlib.h>
 20 | #include <ctype.h>
 21 | 
 22 | const long long max_size = 2000;         // max length of strings
 23 | const long long N = 1;                   // number of closest words
 24 | const long long max_w = 50;              // max length of vocabulary entries
 25 | 
 26 | int main(int argc, char **argv)
 27 | {
 28 |   FILE *f;
 29 |   char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size];
 30 |   float dist, len, bestd[N], vec[max_size];
 31 |   long long words, size, a, b, c, d, b1, b2, b3, threshold = 0;
 32 |   float *M;
 33 |   char *vocab;
 34 |   int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0;
 35 |   if (argc < 2) {
 36 |     printf("Usage: ./compute-accuracy <FILE> <threshold>\nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n");
 37 |     return 0;
 38 |   }
 39 |   strcpy(file_name, argv[1]);
 40 |   if (argc > 2) threshold = atoi(argv[2]);
 41 |   f = fopen(file_name, "rb");
 42 |   if (f == NULL) {
 43 |     printf("Input file not found\n");
 44 |     return -1;
 45 |   }
 46 |   fscanf(f, "%lld", &words);
 47 |   if (threshold) if (words > threshold) words = threshold;
 48 |   fscanf(f, "%lld", &size);
 49 |   vocab = (char *)malloc(words * max_w * sizeof(char));
 50 |   M = (float *)malloc(words * size * sizeof(float));
 51 |   if (M == NULL) {
 52 |     printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576);
 53 |     return -1;
 54 |   }
 55 |   for (b = 0; b < words; b++) {
 56 |     a = 0;
 57 |     while (1) {
 58 |       vocab[b * max_w + a] = fgetc(f);
 59 |       if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
 60 |       if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
 61 |     }
 62 |     vocab[b * max_w + a] = 0;
 63 |     for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]);
 64 |     for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
 65 |     len = 0;
 66 |     for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
 67 |     len = sqrt(len);
 68 |     for (a = 0; a < size; a++) M[a + b * size] /= len;
 69 |   }
 70 |   fclose(f);
 71 |   TCN = 0;
 72 |   while (1) {
 73 |     for (a = 0; a < N; a++) bestd[a] = 0;
 74 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
 75 |     scanf("%s", st1);
 76 |     for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]);
 77 |     if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) {
 78 |       if (TCN == 0) TCN = 1;
 79 |       if (QID != 0) {
 80 |         printf("ACCURACY TOP1: %.2f %%  (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN);
 81 |         printf("Total accuracy: %.2f %%   Semantic accuracy: %.2f %%   Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100);
 82 |       }
 83 |       QID++;
 84 |       scanf("%s", st1);
 85 |       if (feof(stdin)) break;
 86 |       printf("%s:\n", st1);
 87 |       TCN = 0;
 88 |       CCN = 0;
 89 |       continue;
 90 |     }
 91 |     if (!strcmp(st1, "EXIT")) break;
 92 |     scanf("%s", st2);
 93 |     for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]);
 94 |     scanf("%s", st3);
 95 |     for (a = 0; a<strlen(st3); a++) st3[a] = toupper(st3[a]);
 96 |     scanf("%s", st4);
 97 |     for (a = 0; a < strlen(st4); a++) st4[a] = toupper(st4[a]);
 98 |     for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st1)) break;
 99 |     b1 = b;
100 |     for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st2)) break;
101 |     b2 = b;
102 |     for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st3)) break;
103 |     b3 = b;
104 |     for (a = 0; a < N; a++) bestd[a] = 0;
105 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
106 |     TQ++;
107 |     if (b1 == words) continue;
108 |     if (b2 == words) continue;
109 |     if (b3 == words) continue;
110 |     for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st4)) break;
111 |     if (b == words) continue;
112 |     for (a = 0; a < size; a++) vec[a] = (M[a + b2 * size] - M[a + b1 * size]) + M[a + b3 * size];
113 |     TQS++;
114 |     for (c = 0; c < words; c++) {
115 |       if (c == b1) continue;
116 |       if (c == b2) continue;
117 |       if (c == b3) continue;
118 |       dist = 0;
119 |       for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
120 |       for (a = 0; a < N; a++) {
121 |         if (dist > bestd[a]) {
122 |           for (d = N - 1; d > a; d--) {
123 |             bestd[d] = bestd[d - 1];
124 |             strcpy(bestw[d], bestw[d - 1]);
125 |           }
126 |           bestd[a] = dist;
127 |           strcpy(bestw[a], &vocab[c * max_w]);
128 |           break;
129 |         }
130 |       }
131 |     }
132 |     if (!strcmp(st4, bestw[0])) {
133 |       CCN++;
134 |       CACN++;
135 |       if (QID <= 5) SEAC++; else SYAC++;
136 |     }
137 |     if (QID <= 5) SECN++; else SYCN++;
138 |     TCN++;
139 |     TACN++;
140 |   }
141 |   printf("Questions seen / total: %d %d   %.2f %% \n", TQS, TQ, TQS/(float)TQ*100);
142 |   return 0;
143 | }
144 | 


--------------------------------------------------------------------------------
/distance.c:
--------------------------------------------------------------------------------
  1 | //  Copyright 2013 Google Inc. All Rights Reserved.
  2 | //
  3 | //  Licensed under the Apache License, Version 2.0 (the "License");
  4 | //  you may not use this file except in compliance with the License.
  5 | //  You may obtain a copy of the License at
  6 | //
  7 | //      http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | //  Unless required by applicable law or agreed to in writing, software
 10 | //  distributed under the License is distributed on an "AS IS" BASIS,
 11 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | //  See the License for the specific language governing permissions and
 13 | //  limitations under the License.
 14 | 
 15 | #include <stdio.h>
 16 | #include <string.h>
 17 | #include <math.h>
 18 | #include <stdlib.h>
 19 | 
 20 | const long long max_size = 2000;         // max length of strings
 21 | const long long N = 40;                  // number of closest words that will be shown
 22 | const long long max_w = 50;              // max length of vocabulary entries
 23 | 
 24 | #define MAX_STRING 100
 25 | void ReadWord(char *word, FILE *fin) {
 26 |   int a = 0, ch;
 27 |   while (!feof(fin)) {
 28 |     ch = fgetc(fin);
 29 |     if (ch == 13) continue;
 30 |     if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
 31 |       if (a > 0) {
 32 |         if (ch == '\n') ungetc(ch, fin);
 33 |         break;
 34 |       }
 35 |       if (ch == '\n') {
 36 |         strcpy(word, (char *)"</s>");
 37 |         return;
 38 |       } else continue;
 39 |     }
 40 |     word[a] = ch;
 41 |     a++;
 42 |     if (a >= MAX_STRING - 1) a--;   // Truncate too long words
 43 |   }
 44 |   word[a] = 0;
 45 |   printf("%s\n", word);
 46 | }
 47 | 
 48 | int main(int argc, char **argv) {
 49 |   FILE *f;
 50 |   char st1[max_size];
 51 |   char *bestw[N];
 52 |   char file_name[max_size], st[100][max_size];
 53 |   float dist, len, bestd[N], vec[max_size];
 54 |   long long words, size, a, b, c, d, cn, bi[100];
 55 |   float *M;
 56 |   char *vocab;
 57 |   if (argc < 2) {
 58 |     printf("Usage: ./distance <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
 59 |     return 0;
 60 |   }
 61 |   strcpy(file_name, argv[1]);
 62 |   f = fopen(file_name, "rb");
 63 |   if (f == NULL) {
 64 |     printf("Input file not found\n");
 65 |     return -1;
 66 |   }
 67 |   fscanf(f, "%lld", &words);
 68 |   fscanf(f, "%lld", &size);
 69 |   vocab = (char *)malloc((long long)words * max_w * sizeof(char));
 70 |   for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
 71 |   M = (float *)malloc((long long)words * (long long)size * sizeof(float));
 72 |   if (M == NULL) {
 73 |     printf("Cannot allocate memory: %lld MB    %lld  %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
 74 |     return -1;
 75 |   }
 76 |   for (b = 0; b < words; b++) {
 77 |     a = 0;
 78 |     while (1) {
 79 |       vocab[b * max_w + a] = fgetc(f);
 80 |       if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
 81 |       if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
 82 |     }
 83 |     vocab[b * max_w + a] = 0;
 84 |     for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
 85 |     len = 0;
 86 |     for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
 87 |     len = sqrt(len);
 88 |     for (a = 0; a < size; a++) M[a + b * size] /= len;
 89 |   }
 90 |   fclose(f);
 91 |   while (1) {
 92 |     for (a = 0; a < N; a++) bestd[a] = 0;
 93 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
 94 |     printf("Enter word or sentence (EXIT to break): ");
 95 |     a = 0;
 96 |     while (1) {
 97 |       st1[a] = fgetc(stdin);
 98 |       if ((st1[a] == '\n') || (a >= max_size - 1)) {
 99 |         st1[a] = 0;
100 |         break;
101 |       }
102 |       a++;
103 |     }
104 |     if (!strcmp(st1, "EXIT")) break;
105 |     cn = 0;
106 |     b = 0;
107 |     c = 0;
108 |     while (1) {
109 |       st[cn][b] = st1[c];
110 |       b++;
111 |       c++;
112 |       st[cn][b] = 0;
113 |       if (st1[c] == 0) break;
114 |       if (st1[c] == ' ') {
115 |         cn++;
116 |         b = 0;
117 |         c++;
118 |       }
119 |     }
120 |     cn++;
121 |     for (a = 0; a < cn; a++) {
122 |       for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
123 |       if (b == words) b = -1;
124 |       bi[a] = b;
125 |       printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
126 |       if (b == -1) {
127 |         printf("Out of dictionary word!\n");
128 |         break;
129 |       }
130 |     }
131 |     if (b == -1) continue;
132 |     printf("\n                                              Word       Cosine distance\n------------------------------------------------------------------------\n");
133 |     for (a = 0; a < size; a++) vec[a] = 0;
134 |     for (b = 0; b < cn; b++) {
135 |       if (bi[b] == -1) continue;
136 |       for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size];
137 |     }
138 |     len = 0;
139 |     for (a = 0; a < size; a++) len += vec[a] * vec[a];
140 |     len = sqrt(len);
141 |     for (a = 0; a < size; a++) vec[a] /= len;
142 |     for (a = 0; a < N; a++) bestd[a] = -1;
143 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
144 |     for (c = 0; c < words; c++) {
145 |       a = 0;
146 |       for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
147 |       if (a == 1) continue;
148 |       dist = 0;
149 |       for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
150 |       for (a = 0; a < N; a++) {
151 |         if (dist > bestd[a]) {
152 |           for (d = N - 1; d > a; d--) {
153 |             bestd[d] = bestd[d - 1];
154 |             strcpy(bestw[d], bestw[d - 1]);
155 |           }
156 |           bestd[a] = dist;
157 |           strcpy(bestw[a], &vocab[c * max_w]);
158 |           break;
159 |         }
160 |       }
161 |     }
162 |     for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
163 |   }
164 |   return 0;
165 | }
166 | 


--------------------------------------------------------------------------------
/distance_fast.c:
--------------------------------------------------------------------------------
  1 | //  Copyright 2013 Google Inc. All Rights Reserved.
  2 | //
  3 | //  Licensed under the Apache License, Version 2.0 (the "License");
  4 | //  you may not use this file except in compliance with the License.
  5 | //  You may obtain a copy of the License at
  6 | //
  7 | //      http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | //  Unless required by applicable law or agreed to in writing, software
 10 | //  distributed under the License is distributed on an "AS IS" BASIS,
 11 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | //  See the License for the specific language governing permissions and
 13 | //  limitations under the License.
 14 | 
 15 | #include <stdio.h>
 16 | #include <string.h>
 17 | #include <math.h>
 18 | #include <stdlib.h>
 19 | #include <time.h>
 20 | 
 21 | const long long max_size = 2000;         // max length of strings
 22 | const long long N = 10;                  // number of closest words that will be shown
 23 | const long long max_w = 50;              // max length of vocabulary entries
 24 | 
 25 | #define MAX_STRING 100
 26 | void ReadWord(char *word, FILE *fin) {
 27 |   int a = 0, ch;
 28 |   while (!feof(fin)) {
 29 |     ch = fgetc(fin);
 30 |     if (ch == 13) continue;
 31 |     if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
 32 |       if (a > 0) {
 33 |         if (ch == '\n') ungetc(ch, fin);
 34 |         break;
 35 |       }
 36 |       if (ch == '\n') {
 37 |         strcpy(word, (char *)"</s>");
 38 |         return;
 39 |       } else continue;
 40 |     }
 41 |     word[a] = ch;
 42 |     a++;
 43 |     if (a >= MAX_STRING - 1) a--;   // Truncate too long words
 44 |   }
 45 |   word[a] = 0;
 46 | }
 47 | 
 48 | int main(int argc, char **argv) {
 49 |   FILE *f;
 50 |   char st1[max_size];
 51 |   char *bestw[N];
 52 |   char file_name[max_size], st[100][max_size];
 53 |   float dist, len, bestd[N], bestclasses[N], vec[max_size];
 54 |   int bestclasses_ids[N];
 55 |   long long words, size, a, b, c, d, e, cn, bi[100];
 56 |   float *M;
 57 |   char *vocab;
 58 |   char word[MAX_STRING];
 59 |   clock_t begin;
 60 |   if (argc < 2) {
 61 |     printf("Usage: ./kmeans_txt <FILE>\nwhere FILE contains features\n <number_of_classes>");
 62 |     return 0;
 63 |   }
 64 |   strcpy(file_name, argv[1]);
 65 |   int classes = atoi(argv[2]);
 66 |   f = fopen(file_name, "rb");
 67 |   if (f == NULL) {
 68 |     printf("Input file not found\n");
 69 |     return -1;
 70 |   }
 71 |   
 72 |   printf("reading data\n");
 73 |   ReadWord(word, f);
 74 |   words = atoi(word);
 75 |   ReadWord(word, f);
 76 |   size = atoi(word);
 77 |   vocab = (char *)malloc((long long)words * max_w * sizeof(char));
 78 |   for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
 79 |   M = (float *)malloc((long long)words * (long long)size * sizeof(float));
 80 |   if (M == NULL) {
 81 |     printf("Cannot allocate memory: %lld MB    %lld  %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
 82 |     return -1;
 83 |   }
 84 |   for (b = 0; b < words; b++) {
 85 |     a = 0;
 86 |     while (1) {
 87 |       vocab[b * max_w + a] = fgetc(f);
 88 |       if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
 89 |       if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
 90 |     }
 91 |     vocab[b * max_w + a] = 0;
 92 |     for (a = 0; a < size; a++) {
 93 |         ReadWord(word,f); 
 94 |         M[a + b * size] = atof(word); 
 95 |     }
 96 |     len = 0;
 97 |     for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
 98 |     len = sqrt(len);
 99 |     for (a = 0; a < size; a++) M[a + b * size] /= len;
100 |   }
101 |   fclose(f);
102 |   
103 |   //run kmeans
104 |   printf("running k-means with %i classes...\n",classes);
105 |   int clcn = classes, iter = 10, closeid;
106 |   int *centcn = (int *)malloc(classes * sizeof(int));
107 |   int *cl = (int *)calloc(words, sizeof(int));
108 |   float closev, x;
109 |   float *cent = (float *)calloc(classes * size, sizeof(float));
110 |   for (a = 0; a < words; a++) cl[a] = a % clcn;
111 |   for (a = 0; a < iter; a++) {
112 |     for (b = 0; b < clcn * size; b++) cent[b] = 0;
113 |     for (b = 0; b < clcn; b++) centcn[b] = 1;
114 |     for (c = 0; c < words; c++) {
115 |       for (d = 0; d < size; d++) cent[size * cl[c] + d] += M[c * size + d];
116 |       centcn[cl[c]]++;
117 |     }
118 |     for (b = 0; b < clcn; b++) {
119 |       closev = 0;
120 |       for (c = 0; c < size; c++) {
121 |         cent[size * b + c] /= centcn[b];
122 |         closev += cent[size * b + c] * cent[size * b + c];
123 |       }
124 |       closev = sqrt(closev);
125 |       for (c = 0; c < size; c++) cent[size * b + c] /= closev;
126 |     }
127 |     for (c = 0; c < words; c++) {
128 |       closev = -10;
129 |       closeid = 0;
130 |       for (d = 0; d < clcn; d++) {
131 |         x = 0;
132 |         for (b = 0; b < size; b++) x += cent[size * d + b] * M[c * size + b];
133 |         if (x > closev) {
134 |           closev = x;
135 |           closeid = d;
136 |         }
137 |       }
138 |       cl[c] = closeid;
139 |     }
140 |   }
141 |   
142 |   // build an array of words ordered by class and their offsets (index where each class starts)
143 |   int class_words[words];
144 |   int class_offsets[classes];
145 |   for(a = 0; a < classes; a++) class_offsets[a]=0;
146 |   for(a = 0; a < words; a++) class_offsets[cl[a]]++;
147 |   for(a = 1; a < classes; a++) class_offsets[a] += class_offsets[a-1];
148 |   for(a = 0; a < words; a++) class_words[--class_offsets[cl[a]]] = a;   
149 |   
150 |   //reading from input
151 |   while (1) {
152 |     for (a = 0; a < N; a++) bestd[a] = 0;
153 |     for (a = 0; a < N; a++) bestclasses[a] = 0;
154 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
155 |     printf("Enter word or sentence (EXIT to break): ");
156 |     a = 0;
157 |     while (1) {
158 |       st1[a] = fgetc(stdin);
159 |       if ((st1[a] == '\n') || (a >= max_size - 1)) {
160 |         st1[a] = 0;
161 |         break;
162 |       }
163 |       a++;
164 |     }
165 |     if (!strcmp(st1, "EXIT")) break;
166 |     cn = 0;
167 |     b = 0;
168 |     c = 0;
169 |     while (1) {
170 |       st[cn][b] = st1[c];
171 |       b++;
172 |       c++;
173 |       st[cn][b] = 0;
174 |       if (st1[c] == 0) break;
175 |       if (st1[c] == ' ') {
176 |         cn++;
177 |         b = 0;
178 |         c++;
179 |       }
180 |     }
181 |     cn++;
182 |     for (a = 0; a < cn; a++) {
183 |       for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
184 |       if (b == words) b = -1;
185 |       bi[a] = b;
186 |       printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
187 |       if (b == -1) {
188 |         printf("Out of dictionary word!\n");
189 |         break;
190 |       }
191 |     }
192 |     if (b == -1) continue;
193 |     begin = clock();
194 |     
195 |     printf("\n                                              Word       Cosine distance\n------------------------------------------------------------------------\n");
196 |     
197 |     for (a = 0; a < size; a++) vec[a] = 0;
198 |     for (b = 0; b < cn; b++) {
199 |       if (bi[b] == -1) continue;
200 |       for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size];
201 |     }
202 |     
203 |     len = 0;
204 |     for (a = 0; a < size; a++) len += vec[a] * vec[a];
205 |     len = sqrt(len);
206 |     for (a = 0; a < size; a++) vec[a] /= len;
207 |     
208 |     // find top N centroids
209 |     for (a = 0; a < N; a++) bestclasses[a] = -1;
210 |     for (a = 0; a < N; a++) bestclasses_ids[a] = -1;
211 |     for (c = 0; c < classes; c++){
212 |     	dist = 0;
213 |     	for (a = 0; a < size; a++) dist += vec[a] * cent[a + size * c];
214 |         for (a = 0; a < N; a++) {
215 |         if (dist > bestclasses[a]) {
216 |           	for(d = N - 1; d > a; d--){
217 |           		bestclasses[d] = bestclasses[d-1];
218 |           		bestclasses_ids[d] = bestclasses_ids[d-1];
219 |           	}
220 |           	bestclasses[a] = dist;
221 |           	bestclasses_ids[a] = c;
222 |           	break;
223 |         }
224 |     	}
225 |     }
226 |     
227 |     // find top N words in the centroids
228 |     for (a = 0; a < N; a++) bestd[a] = -1;
229 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
230 |     for (a = 0; a < N; a++){
231 |    		c = words;
232 |    		if(bestclasses_ids[a] < classes-1) c = class_offsets[bestclasses_ids[a]+1];
233 |    		b = class_offsets[bestclasses_ids[a]];
234 |    		for(; b < c; b++){
235 |    			dist = 0;
236 |             for (d = 0; d < size; d++) dist += vec[d] * M[d + class_words[b] * size];
237 |             for (d = 0; d < N; d++){
238 |             	if(dist > bestd[d]){
239 |             		for (e = N -1; e > d; e--){
240 |             			bestd[e] = bestd[e-1];
241 |             			strcpy(bestw[e], bestw[e-1]);
242 |             		}
243 |             		bestd[d] = dist;
244 |             		strcpy(bestw[d], &vocab[class_words[b] * max_w]);
245 |             		break;
246 |             	}
247 |             }
248 |    	    }
249 |    	}
250 | 	for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);   
251 | 	printf("time spent = %f seconds\n", (double)(clock() - begin) / CLOCKS_PER_SEC);
252 |   }
253 |   // Save the K-means classes
254 | 
255 |   free(centcn);
256 |   free(cent);
257 |   free(cl);
258 |   
259 |   //start running distance
260 |   return 0;
261 | }
262 | 


--------------------------------------------------------------------------------
/distance_txt.c:
--------------------------------------------------------------------------------
  1 | //  Copyright 2013 Google Inc. All Rights Reserved.
  2 | //
  3 | //  Licensed under the Apache License, Version 2.0 (the "License");
  4 | //  you may not use this file except in compliance with the License.
  5 | //  You may obtain a copy of the License at
  6 | //
  7 | //      http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | //  Unless required by applicable law or agreed to in writing, software
 10 | //  distributed under the License is distributed on an "AS IS" BASIS,
 11 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | //  See the License for the specific language governing permissions and
 13 | //  limitations under the License.
 14 | 
 15 | #include <stdio.h>
 16 | #include <string.h>
 17 | #include <math.h>
 18 | #include <stdlib.h>
 19 | #include <time.h>
 20 | 
 21 | const long long max_size = 2000;         // max length of strings
 22 | const long long N = 400;                  // number of closest words that will be shown
 23 | const long long max_w = 1000;              // max length of vocabulary entries
 24 | 
 25 | #define MAX_STRING 1000
 26 | void ReadWord(char *word, FILE *fin) {
 27 |   int a = 0, ch;
 28 |   while (!feof(fin)) {
 29 |     ch = fgetc(fin);
 30 |     if (ch == 13) continue;
 31 |     if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
 32 |       if (a > 0) {
 33 |         if (ch == '\n') ungetc(ch, fin);
 34 |         break;
 35 |       }
 36 |       if (ch == '\n') {
 37 |         strcpy(word, (char *)"</s>");
 38 |         return;
 39 |       } else continue;
 40 |     }
 41 |     word[a] = ch;
 42 |     a++;
 43 |     if (a >= MAX_STRING - 1) a--;   // Truncate too long words
 44 |   }
 45 |   word[a] = 0;
 46 | }
 47 | 
 48 | int main(int argc, char **argv) {
 49 |   FILE *f;
 50 |   char st1[max_size];
 51 |   char *bestw[N];
 52 |   char file_name[max_size], st[100][max_size];
 53 |   float dist, len, bestd[N], vec[max_size];
 54 |   long long words, size, a, b, c, d, cn, bi[100];
 55 |   float *M;
 56 |   char *vocab;
 57 |   char word[MAX_STRING];
 58 |   clock_t begin;
 59 |   if (argc < 2) {
 60 |     printf("Usage: ./distance <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
 61 |     return 0;
 62 |   }
 63 |   strcpy(file_name, argv[1]);
 64 |   f = fopen(file_name, "rb");
 65 |   if (f == NULL) {
 66 |     printf("Input file not found\n");
 67 |     return -1;
 68 |   }
 69 |   ReadWord(word, f);
 70 |   words = atoi(word);
 71 |   ReadWord(word, f);
 72 |   size = atoi(word);
 73 |   vocab = (char *)malloc((long long)words * max_w * sizeof(char));
 74 |   for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
 75 |   M = (float *)malloc((long long)words * (long long)size * sizeof(float));
 76 |   if (M == NULL) {
 77 |     printf("Cannot allocate memory: %lld MB    %lld  %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
 78 |     return -1;
 79 |   }
 80 |   for (b = 0; b < words; b++) {
 81 |     a = 0;
 82 |     while (1) {
 83 |       vocab[b * max_w + a] = fgetc(f);
 84 |       if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
 85 |       if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
 86 |     }
 87 |     vocab[b * max_w + a] = 0;
 88 |     for (a = 0; a < size; a++) {
 89 |         ReadWord(word,f); 
 90 |         M[a + b * size] = atof(word); 
 91 |     }
 92 |     len = 0;
 93 |     for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
 94 |     len = sqrt(len);
 95 |     for (a = 0; a < size; a++) M[a + b * size] /= len;
 96 |   }
 97 |   fclose(f);
 98 |   while (1) {
 99 |     for (a = 0; a < N; a++) bestd[a] = 0;
100 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
101 |     printf("Enter word or sentence (EXIT to break): ");
102 |     a = 0;
103 |     while (1) {
104 |       st1[a] = fgetc(stdin);
105 |       if ((st1[a] == '\n') || (a >= max_size - 1)) {
106 |         st1[a] = 0;
107 |         break;
108 |       }
109 |       a++;
110 |     }
111 |     if (!strcmp(st1, "EXIT")) break;
112 |     cn = 0;
113 |     b = 0;
114 |     c = 0;
115 |     while (1) {
116 |       st[cn][b] = st1[c];
117 |       b++;
118 |       c++;
119 |       st[cn][b] = 0;
120 |       if (st1[c] == 0) break;
121 |       if (st1[c] == ' ') {
122 |         cn++;
123 |         b = 0;
124 |         c++;
125 |       }
126 |     }
127 |     cn++;
128 |     for (a = 0; a < cn; a++) {
129 |       for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
130 |       if (b == words) b = -1;
131 |       bi[a] = b;
132 |       printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
133 |       if (b == -1) {
134 |         printf("Out of dictionary word!\n");
135 |         break;
136 |       }
137 |     }
138 |     if (b == -1) continue;
139 |     begin = clock();
140 | 
141 |     printf("\n                                              Word       Cosine distance\n------------------------------------------------------------------------\n");
142 |     for (a = 0; a < size; a++) vec[a] = 0;
143 |     for (b = 0; b < cn; b++) {
144 |       if (bi[b] == -1) continue;
145 |       for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size];
146 |     }
147 |     len = 0;
148 |     for (a = 0; a < size; a++) len += vec[a] * vec[a];
149 |     len = sqrt(len);
150 |     for (a = 0; a < size; a++) vec[a] /= len;
151 |     for (a = 0; a < N; a++) bestd[a] = -1;
152 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
153 |     for (c = 0; c < words; c++) {
154 |       a = 0;
155 |       for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
156 |       if (a == 1) continue;
157 |       dist = 0;
158 |       for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
159 |       for (a = 0; a < N; a++) {
160 |         if (dist > bestd[a]) {
161 |           for (d = N - 1; d > a; d--) {
162 |             bestd[d] = bestd[d - 1];
163 |             strcpy(bestw[d], bestw[d - 1]);
164 |           }
165 |           bestd[a] = dist;
166 |           strcpy(bestw[a], &vocab[c * max_w]);
167 |           break;
168 |         }
169 |       }
170 |     }
171 |     for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
172 |     printf("time spent = %f seconds\n", (double)(clock() - begin) / CLOCKS_PER_SEC);
173 |   }
174 |   return 0;
175 | }
176 | 


--------------------------------------------------------------------------------
/kmeans_txt.c:
--------------------------------------------------------------------------------
  1 | //  Copyright 2013 Google Inc. All Rights Reserved.
  2 | //
  3 | //  Licensed under the Apache License, Version 2.0 (the "License");
  4 | //  you may not use this file except in compliance with the License.
  5 | //  You may obtain a copy of the License at
  6 | //
  7 | //      http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | //  Unless required by applicable law or agreed to in writing, software
 10 | //  distributed under the License is distributed on an "AS IS" BASIS,
 11 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | //  See the License for the specific language governing permissions and
 13 | //  limitations under the License.
 14 | 
 15 | #include <stdio.h>
 16 | #include <string.h>
 17 | #include <math.h>
 18 | #include <stdlib.h>
 19 | 
 20 | const long long max_size = 2000;         // max length of strings
 21 | const long long N = 40;                  // number of closest words that will be shown
 22 | const long long max_w = 50;              // max length of vocabulary entries
 23 | 
 24 | #define MAX_STRING 100
 25 | void ReadWord(char *word, FILE *fin) {
 26 |   int a = 0, ch;
 27 |   while (!feof(fin)) {
 28 |     ch = fgetc(fin);
 29 |     if (ch == 13) continue;
 30 |     if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
 31 |       if (a > 0) {
 32 |         if (ch == '\n') ungetc(ch, fin);
 33 |         break;
 34 |       }
 35 |       if (ch == '\n') {
 36 |         strcpy(word, (char *)"</s>");
 37 |         return;
 38 |       } else continue;
 39 |     }
 40 |     word[a] = ch;
 41 |     a++;
 42 |     if (a >= MAX_STRING - 1) a--;   // Truncate too long words
 43 |   }
 44 |   word[a] = 0;
 45 | }
 46 | 
 47 | int main(int argc, char **argv) {
 48 |   FILE *f;
 49 |   char *bestw[N];
 50 |   char file_name[max_size], output_file[max_size];
 51 |   float len;
 52 |   long long words, size, a, b, c, d;
 53 |   float *M;
 54 |   char *vocab;
 55 |   char word[MAX_STRING];
 56 |   if (argc < 3) {
 57 |     printf("Usage: ./kmeans_txt <FILE>\nwhere FILE contains features\n <number_of_classes>");
 58 |     return 0;
 59 |   }
 60 |   strcpy(file_name, argv[1]);
 61 |   strcpy(output_file, argv[2]);
 62 |   int classes = atoi(argv[3]);
 63 |   f = fopen(file_name, "rb");
 64 |   if (f == NULL) {
 65 |     printf("Input file not found\n");
 66 |     return -1;
 67 |   }
 68 |   
 69 |   FILE *fo = fopen(output_file, "wb");
 70 |   
 71 |   ReadWord(word, f);
 72 |   words = atoi(word);
 73 |   ReadWord(word, f);
 74 |   size = atoi(word);
 75 |   vocab = (char *)malloc((long long)words * max_w * sizeof(char));
 76 |   for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
 77 |   M = (float *)malloc((long long)words * (long long)size * sizeof(float));
 78 |   if (M == NULL) {
 79 |     printf("Cannot allocate memory: %lld MB    %lld  %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
 80 |     return -1;
 81 |   }
 82 |   for (b = 0; b < words; b++) {
 83 |     a = 0;
 84 |     while (1) {
 85 |       vocab[b * max_w + a] = fgetc(f);
 86 |       if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
 87 |       if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
 88 |     }
 89 |     vocab[b * max_w + a] = 0;
 90 |     for (a = 0; a < size; a++) {
 91 |         ReadWord(word,f); 
 92 |         M[a + b * size] = atof(word); 
 93 |     }
 94 |     len = 0;
 95 |     for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
 96 |     len = sqrt(len);
 97 |     for (a = 0; a < size; a++) M[a + b * size] /= len;
 98 |   }
 99 |   fclose(f);
100 |   
101 |   //run kmeans
102 |   int clcn = classes, iter = 2, closeid;
103 |   int *centcn = (int *)malloc(classes * sizeof(int));
104 |   int *cl = (int *)calloc(words, sizeof(int));
105 |   float closev, x;
106 |   float *cent = (float *)calloc(classes * size, sizeof(float));
107 |   for (a = 0; a < words; a++) cl[a] = a % clcn;
108 |     for (a = 0; a < iter; a++) {
109 |       for (b = 0; b < clcn * size; b++) cent[b] = 0;
110 |       for (b = 0; b < clcn; b++) centcn[b] = 1;
111 |       for (c = 0; c < words; c++) {
112 |         for (d = 0; d < size; d++) cent[size * cl[c] + d] += M[c * size + d];
113 |         centcn[cl[c]]++;
114 |       }
115 |       for (b = 0; b < clcn; b++) {
116 |         closev = 0;
117 |         for (c = 0; c < size; c++) {
118 |           cent[size * b + c] /= centcn[b];
119 |           closev += cent[size * b + c] * cent[size * b + c];
120 |         }
121 |         closev = sqrt(closev);
122 |         for (c = 0; c < size; c++) cent[size * b + c] /= closev;
123 |       }
124 |       for (c = 0; c < words; c++) {
125 |         closev = -10;
126 |         closeid = 0;
127 |         for (d = 0; d < clcn; d++) {
128 |           x = 0;
129 |           for (b = 0; b < size; b++) x += cent[size * d + b] * M[c * size + b];
130 |           if (x > closev) {
131 |             closev = x;
132 |             closeid = d;
133 |           }
134 |         }
135 |         cl[c] = closeid;
136 |       }
137 |     }
138 |     
139 |     // build an array of words ordered by class and their offsets (index where each class starts)
140 |     int class_words[words];
141 |     int class_offsets[classes];
142 |     for(a = 0; a < classes; a++) class_offsets[a]=0;
143 |     for(a = 0; a < words; a++) class_offsets[cl[a]]++;
144 |     for(a = 1; a < classes; a++) class_offsets[a] += class_offsets[a-1];
145 |     for(a = 0; a < words; a++) class_words[--class_offsets[cl[a]]] = a;
146 |     
147 |     for (a = 0; a < classes; a++){
148 |     	c = words;
149 |     	if(a < classes-1) c = class_offsets[a+1];
150 |     	b = class_offsets[a];
151 |     	for(; b < c; b++){
152 |     	    fprintf(fo, "%lld %s\n", a ,&vocab[class_words[b] * max_w]);
153 |     	}
154 |     }
155 |      // Save the K-means classes
156 |     //for (a = 0; a < words; a++) fprintf(fo, "%s %d\n", &vocab[a * max_w], cl[a]);
157 |     free(centcn);
158 |     free(cent);
159 |     free(cl);
160 |     free(M);
161 |     free(vocab);
162 |   return 0;
163 | }
164 | 


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
 1 | CC = gcc
 2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions
 3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result -g
 4 | 
 5 | all: word2vec word2phrase distance word-analogy compute-accuracy distance_txt distance_fast kmeans_txt
 6 | 
 7 | word2vec : word2vec.c
 8 | #	touch word2vec
 9 | 	$(CC) word2vec.c -o word2vec $(CFLAGS)
10 | word2phrase : word2phrase.c
11 | 	$(CC) word2phrase.c -o word2phrase $(CFLAGS)
12 | distance : distance.c
13 | 	$(CC) distance.c -o distance $(CFLAGS)
14 | distance_txt : distance_txt.c
15 | 	$(CC) distance_txt.c -o distance_txt $(CFLAGS)
16 | distance_fast : distance_fast.c
17 | 	$(CC) distance_fast.c -o distance_fast $(CFLAGS)
18 | kmeans_txt : kmeans_txt.c
19 | 	$(CC) kmeans_txt.c -o kmeans_txt $(CFLAGS)
20 | word-analogy : word-analogy.c
21 | 	$(CC) word-analogy.c -o word-analogy $(CFLAGS)
22 | compute-accuracy : compute-accuracy.c
23 | 	$(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS)
24 | #	chmod +x *.sh
25 | clean:
26 | 	rm -rf word2vec word2phrase distance word-analogy compute-accuracy distance_txt kmeans_txt
27 | 


--------------------------------------------------------------------------------
/word-analogy.c:
--------------------------------------------------------------------------------
  1 | //  Copyright 2013 Google Inc. All Rights Reserved.
  2 | //
  3 | //  Licensed under the Apache License, Version 2.0 (the "License");
  4 | //  you may not use this file except in compliance with the License.
  5 | //  You may obtain a copy of the License at
  6 | //
  7 | //      http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | //  Unless required by applicable law or agreed to in writing, software
 10 | //  distributed under the License is distributed on an "AS IS" BASIS,
 11 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | //  See the License for the specific language governing permissions and
 13 | //  limitations under the License.
 14 | 
 15 | #include <stdio.h>
 16 | #include <string.h>
 17 | #include <math.h>
 18 | #include <stdlib.h>
 19 | 
 20 | const long long max_size = 2000;         // max length of strings
 21 | const long long N = 40;                  // number of closest words that will be shown
 22 | const long long max_w = 50;              // max length of vocabulary entries
 23 | 
 24 | int main(int argc, char **argv) {
 25 |   FILE *f;
 26 |   char st1[max_size];
 27 |   char bestw[N][max_size];
 28 |   char file_name[max_size], st[100][max_size];
 29 |   float dist, len, bestd[N], vec[max_size];
 30 |   long long words, size, a, b, c, d, cn, bi[100];
 31 |   float *M;
 32 |   char *vocab;
 33 |   if (argc < 2) {
 34 |     printf("Usage: ./word-analogy <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
 35 |     return 0;
 36 |   }
 37 |   strcpy(file_name, argv[1]);
 38 |   f = fopen(file_name, "rb");
 39 |   if (f == NULL) {
 40 |     printf("Input file not found\n");
 41 |     return -1;
 42 |   }
 43 |   fscanf(f, "%lld", &words);
 44 |   fscanf(f, "%lld", &size);
 45 |   vocab = (char *)malloc((long long)words * max_w * sizeof(char));
 46 |   M = (float *)malloc((long long)words * (long long)size * sizeof(float));
 47 |   if (M == NULL) {
 48 |     printf("Cannot allocate memory: %lld MB    %lld  %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
 49 |     return -1;
 50 |   }
 51 |   for (b = 0; b < words; b++) {
 52 |     a = 0;
 53 |     while (1) {
 54 |       vocab[b * max_w + a] = fgetc(f);
 55 |       if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
 56 |       if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
 57 |     }
 58 |     vocab[b * max_w + a] = 0;
 59 |     for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
 60 |     len = 0;
 61 |     for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
 62 |     len = sqrt(len);
 63 |     for (a = 0; a < size; a++) M[a + b * size] /= len;
 64 |   }
 65 |   fclose(f);
 66 |   while (1) {
 67 |     for (a = 0; a < N; a++) bestd[a] = 0;
 68 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
 69 |     printf("Enter three words (EXIT to break): ");
 70 |     a = 0;
 71 |     while (1) {
 72 |       st1[a] = fgetc(stdin);
 73 |       if ((st1[a] == '\n') || (a >= max_size - 1)) {
 74 |         st1[a] = 0;
 75 |         break;
 76 |       }
 77 |       a++;
 78 |     }
 79 |     if (!strcmp(st1, "EXIT")) break;
 80 |     cn = 0;
 81 |     b = 0;
 82 |     c = 0;
 83 |     while (1) {
 84 |       st[cn][b] = st1[c];
 85 |       b++;
 86 |       c++;
 87 |       st[cn][b] = 0;
 88 |       if (st1[c] == 0) break;
 89 |       if (st1[c] == ' ') {
 90 |         cn++;
 91 |         b = 0;
 92 |         c++;
 93 |       }
 94 |     }
 95 |     cn++;
 96 |     if (cn < 3) {
 97 |       printf("Only %lld words were entered.. three words are needed at the input to perform the calculation\n", cn);
 98 |       continue;
 99 |     }
100 |     for (a = 0; a < cn; a++) {
101 |       for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
102 |       if (b == words) b = 0;
103 |       bi[a] = b;
104 |       printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
105 |       if (b == 0) {
106 |         printf("Out of dictionary word!\n");
107 |         break;
108 |       }
109 |     }
110 |     if (b == 0) continue;
111 |     printf("\n                                              Word              Distance\n------------------------------------------------------------------------\n");
112 |     for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size];
113 |     len = 0;
114 |     for (a = 0; a < size; a++) len += vec[a] * vec[a];
115 |     len = sqrt(len);
116 |     for (a = 0; a < size; a++) vec[a] /= len;
117 |     for (a = 0; a < N; a++) bestd[a] = 0;
118 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
119 |     for (c = 0; c < words; c++) {
120 |       if (c == bi[0]) continue;
121 |       if (c == bi[1]) continue;
122 |       if (c == bi[2]) continue;
123 |       a = 0;
124 |       for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
125 |       if (a == 1) continue;
126 |       dist = 0;
127 |       for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
128 |       for (a = 0; a < N; a++) {
129 |         if (dist > bestd[a]) {
130 |           for (d = N - 1; d > a; d--) {
131 |             bestd[d] = bestd[d - 1];
132 |             strcpy(bestw[d], bestw[d - 1]);
133 |           }
134 |           bestd[a] = dist;
135 |           strcpy(bestw[a], &vocab[c * max_w]);
136 |           break;
137 |         }
138 |       }
139 |     }
140 |     for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
141 |   }
142 |   return 0;
143 | }
144 | 


--------------------------------------------------------------------------------
/word2phrase.c:
--------------------------------------------------------------------------------
  1 | //  Copyright 2013 Google Inc. All Rights Reserved.
  2 | //
  3 | //  Licensed under the Apache License, Version 2.0 (the "License");
  4 | //  you may not use this file except in compliance with the License.
  5 | //  You may obtain a copy of the License at
  6 | //
  7 | //      http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | //  Unless required by applicable law or agreed to in writing, software
 10 | //  distributed under the License is distributed on an "AS IS" BASIS,
 11 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | //  See the License for the specific language governing permissions and
 13 | //  limitations under the License.
 14 | 
 15 | #include <stdio.h>
 16 | #include <stdlib.h>
 17 | #include <string.h>
 18 | #include <math.h>
 19 | #include <pthread.h>
 20 | 
 21 | #define MAX_STRING 60
 22 | 
 23 | const int vocab_hash_size = 500000000; // Maximum 500M entries in the vocabulary
 24 | 
 25 | typedef float real;                    // Precision of float numbers
 26 | 
 27 | struct vocab_word {
 28 |   long long cn;
 29 |   char *word;
 30 | };
 31 | 
 32 | char train_file[MAX_STRING], output_file[MAX_STRING];
 33 | struct vocab_word *vocab;
 34 | int debug_mode = 2, min_count = 5, *vocab_hash, min_reduce = 1;
 35 | long long vocab_max_size = 10000, vocab_size = 0;
 36 | long long train_words = 0;
 37 | real threshold = 100;
 38 | 
 39 | unsigned long long next_random = 1;
 40 | 
 41 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
 42 | void ReadWord(char *word, FILE *fin) {
 43 |   int a = 0, ch;
 44 |   while (!feof(fin)) {
 45 |     ch = fgetc(fin);
 46 |     if (ch == 13) continue;
 47 |     if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
 48 |       if (a > 0) {
 49 |         if (ch == '\n') ungetc(ch, fin);
 50 |         break;
 51 |       }
 52 |       if (ch == '\n') {
 53 |         strcpy(word, (char *)"</s>");
 54 |         return;
 55 |       } else continue;
 56 |     }
 57 |     word[a] = ch;
 58 |     a++;
 59 |     if (a >= MAX_STRING - 1) a--;   // Truncate too long words
 60 |   }
 61 |   word[a] = 0;
 62 | }
 63 | 
 64 | // Returns hash value of a word
 65 | int GetWordHash(char *word) {
 66 |   unsigned long long a, hash = 1;
 67 |   for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
 68 |   hash = hash % vocab_hash_size;
 69 |   return hash;
 70 | }
 71 | 
 72 | // Returns position of a word in the vocabulary; if the word is not found, returns -1
 73 | int SearchVocab(char *word) {
 74 |   unsigned int hash = GetWordHash(word);
 75 |   while (1) {
 76 |     if (vocab_hash[hash] == -1) return -1;
 77 |     if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
 78 |     hash = (hash + 1) % vocab_hash_size;
 79 |   }
 80 |   return -1;
 81 | }
 82 | 
 83 | // Reads a word and returns its index in the vocabulary
 84 | int ReadWordIndex(FILE *fin) {
 85 |   char word[MAX_STRING];
 86 |   ReadWord(word, fin);
 87 |   if (feof(fin)) return -1;
 88 |   return SearchVocab(word);
 89 | }
 90 | 
 91 | // Adds a word to the vocabulary
 92 | int AddWordToVocab(char *word) {
 93 |   unsigned int hash, length = strlen(word) + 1;
 94 |   if (length > MAX_STRING) length = MAX_STRING;
 95 |   vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
 96 |   strcpy(vocab[vocab_size].word, word);
 97 |   vocab[vocab_size].cn = 0;
 98 |   vocab_size++;
 99 |   // Reallocate memory if needed
100 |   if (vocab_size + 2 >= vocab_max_size) {
101 |     vocab_max_size += 10000;
102 |     vocab=(struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
103 |   }
104 |   hash = GetWordHash(word);
105 |   while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
106 |   vocab_hash[hash]=vocab_size - 1;
107 |   return vocab_size - 1;
108 | }
109 | 
110 | // Used later for sorting by word counts
111 | int VocabCompare(const void *a, const void *b) {
112 |     return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
113 | }
114 | 
115 | // Sorts the vocabulary by frequency using word counts
116 | void SortVocab() {
117 |   int a;
118 |   unsigned int hash;
119 |   // Sort the vocabulary and keep </s> at the first position
120 |   qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
121 |   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
122 |   for (a = 0; a < vocab_size; a++) {
123 |     // Words occuring less than min_count times will be discarded from the vocab
124 |     if (vocab[a].cn < min_count) {
125 |       vocab_size--;
126 |       free(vocab[vocab_size].word);
127 |     } else {
128 |       // Hash will be re-computed, as after the sorting it is not actual
129 |       hash = GetWordHash(vocab[a].word);
130 |       while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
131 |       vocab_hash[hash] = a;
132 |     }
133 |   }
134 |   vocab = (struct vocab_word *)realloc(vocab, vocab_size * sizeof(struct vocab_word));
135 | }
136 | 
137 | // Reduces the vocabulary by removing infrequent tokens
138 | void ReduceVocab() {
139 |   int a, b = 0;
140 |   unsigned int hash;
141 |   for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
142 |     vocab[b].cn = vocab[a].cn;
143 |     vocab[b].word = vocab[a].word;
144 |     b++;
145 |   } else free(vocab[a].word);
146 |   vocab_size = b;
147 |   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
148 |   for (a = 0; a < vocab_size; a++) {
149 |     // Hash will be re-computed, as it is not actual
150 |     hash = GetWordHash(vocab[a].word);
151 |     while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
152 |     vocab_hash[hash] = a;
153 |   }
154 |   fflush(stdout);
155 |   min_reduce++;
156 | }
157 | 
158 | void LearnVocabFromTrainFile() {
159 |   char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
160 |   FILE *fin;
161 |   long long a, i, start = 1;
162 |   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
163 |   fin = fopen(train_file, "rb");
164 |   if (fin == NULL) {
165 |     printf("ERROR: training data file not found!\n");
166 |     exit(1);
167 |   }
168 |   vocab_size = 0;
169 |   AddWordToVocab((char *)"</s>");
170 |   while (1) {
171 |     ReadWord(word, fin);
172 |     if (feof(fin)) break;
173 |     if (!strcmp(word, "</s>")) {
174 |       start = 1;
175 |       continue;
176 |     } else start = 0;
177 |     train_words++;
178 |     if ((debug_mode > 1) && (train_words % 100000 == 0)) {
179 |       printf("Words processed: %lldK     Vocab size: %lldK  %c", train_words / 1000, vocab_size / 1000, 13);
180 |       fflush(stdout);
181 |     }
182 |     i = SearchVocab(word);
183 |     if (i == -1) {
184 |       a = AddWordToVocab(word);
185 |       vocab[a].cn = 1;
186 |     } else vocab[i].cn++;
187 |     if (start) continue;
188 |     sprintf(bigram_word, "%s_%s", last_word, word);
189 |     bigram_word[MAX_STRING - 1] = 0;
190 |     strcpy(last_word, word);
191 |     i = SearchVocab(bigram_word);
192 |     if (i == -1) {
193 |       a = AddWordToVocab(bigram_word);
194 |       vocab[a].cn = 1;
195 |     } else vocab[i].cn++;
196 |     if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
197 |   }
198 |   SortVocab();
199 |   if (debug_mode > 0) {
200 |     printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size);
201 |     printf("Words in train file: %lld\n", train_words);
202 |   }
203 |   fclose(fin);
204 | }
205 | 
206 | void TrainModel() {
207 |   long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0;
208 |   char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
209 |   real score;
210 |   FILE *fo, *fin;
211 |   printf("Starting training using file %s\n", train_file);
212 |   LearnVocabFromTrainFile();
213 |   fin = fopen(train_file, "rb");
214 |   fo = fopen(output_file, "wb");
215 |   word[0] = 0;
216 |   while (1) {
217 |     strcpy(last_word, word);
218 |     ReadWord(word, fin);
219 |     if (feof(fin)) break;
220 |     if (!strcmp(word, "</s>")) {
221 |       fprintf(fo, "\n");
222 |       continue;
223 |     }
224 |     cn++;
225 |     if ((debug_mode > 1) && (cn % 100000 == 0)) {
226 |       printf("Words written: %lldK%c", cn / 1000, 13);
227 |       fflush(stdout);
228 |     }
229 |     oov = 0;
230 |     i = SearchVocab(word);
231 |     if (i == -1) oov = 1; else pb = vocab[i].cn;
232 |     if (li == -1) oov = 1;
233 |     li = i;
234 |     sprintf(bigram_word, "%s_%s", last_word, word);
235 |     bigram_word[MAX_STRING - 1] = 0;
236 |     i = SearchVocab(bigram_word);
237 |     if (i == -1) oov = 1; else pab = vocab[i].cn;
238 |     if (pa < min_count) oov = 1;
239 |     if (pb < min_count) oov = 1;
240 |     if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words;
241 |     if (score > threshold) {
242 |       fprintf(fo, "_%s", word);
243 |       pb = 0;
244 |     } else fprintf(fo, " %s", word);
245 |     pa = pb;
246 |   }
247 |   fclose(fo);
248 |   fclose(fin);
249 | }
250 | 
251 | int ArgPos(char *str, int argc, char **argv) {
252 |   int a;
253 |   for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
254 |     if (a == argc - 1) {
255 |       printf("Argument missing for %s\n", str);
256 |       exit(1);
257 |     }
258 |     return a;
259 |   }
260 |   return -1;
261 | }
262 | 
263 | int main(int argc, char **argv) {
264 |   int i;
265 |   if (argc == 1) {
266 |     printf("WORD2PHRASE tool v0.1a\n\n");
267 |     printf("Options:\n");
268 |     printf("Parameters for training:\n");
269 |     printf("\t-train <file>\n");
270 |     printf("\t\tUse text data from <file> to train the model\n");
271 |     printf("\t-output <file>\n");
272 |     printf("\t\tUse <file> to save the resulting word vectors / word clusters / phrases\n");
273 |     printf("\t-min-count <int>\n");
274 |     printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
275 |     printf("\t-threshold <float>\n");
276 |     printf("\t\t The <float> value represents threshold for forming the phrases (higher means less phrases); default 100\n");
277 |     printf("\t-debug <int>\n");
278 |     printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
279 |     printf("\nExamples:\n");
280 |     printf("./word2phrase -train text.txt -output phrases.txt -threshold 100 -debug 2\n\n");
281 |     return 0;
282 |   }
283 |   if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
284 |   if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
285 |   if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
286 |   if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
287 |   if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) threshold = atof(argv[i + 1]);
288 |   vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
289 |   vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
290 |   TrainModel();
291 |   return 0;
292 | }
293 | 


--------------------------------------------------------------------------------
/word2vec.c:
--------------------------------------------------------------------------------
  1 | //  Copyright 2013 Google Inc. All Rights Reserved.
  2 | //
  3 | //  Licensed under the Apache License, Version 2.0 (the "License");
  4 | //  you may not use this file except in compliance with the License.
  5 | //  You may obtain a copy of the License at
  6 | //
  7 | //      http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | //  Unless required by applicable law or agreed to in writing, software
 10 | //  distributed under the License is distributed on an "AS IS" BASIS,
 11 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | //  See the License for the specific language governing permissions and
 13 | //  limitations under the License.
 14 | 
 15 | #include <stdio.h>
 16 | #include <stdlib.h>
 17 | #include <string.h>
 18 | #include <math.h>
 19 | #include <pthread.h>
 20 | #include <stdbool.h>
 21 | #include <assert.h>
 22 | #include <unistd.h>
 23 | 
 24 | #define MAX_STRING 10000
 25 | #define EXP_TABLE_SIZE 1000
 26 | #define MAX_EXP 6
 27 | #define MAX_SENTENCE_LENGTH 1000
 28 | #define MAX_CODE_LENGTH 40
 29 | 
 30 | const int vocab_hash_size = 30 * 1000 * 1000;  // Maximum 30 * 0.7 = 21M words in the vocabulary
 31 | 
 32 | typedef float real;                    // Precision of float numbers
 33 | 
 34 | struct vocab_word {
 35 |   long long cn;
 36 |   int *point;
 37 |   bool fixed;
 38 |   char *word, *code, codelen;
 39 | };
 40 | 
 41 | char train_file[MAX_STRING], output_word_vectors_file[MAX_STRING], output_context_vectors_file[MAX_STRING];
 42 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
 43 | char fixed_word_vectors_file[MAX_STRING], fixed_context_vectors_file[MAX_STRING];
 44 | struct vocab_word *vocab;
 45 | int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
 46 | int *vocab_hash;
 47 | long long vocab_max_size = 10000, vocab_size = 0, layer1_size = 100;
 48 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
 49 | real alpha = 0.025, starting_alpha, sample = 1e-3;
 50 | real *syn0, *syn1, *syn1neg, *expTable;
 51 | clock_t start;
 52 | 
 53 | int window_offset, window_layer_size;
 54 | 
 55 | int window_hidden_size = 500; 
 56 | 
 57 | int hs = 0, negative = 5, no_header = 0;
 58 | const int table_size = 1e8;
 59 | int *table;
 60 | 
 61 | //constrastive negative sampling
 62 | char negative_classes_file[MAX_STRING];
 63 | int *word_to_group;
 64 | int *group_to_table; //group_size*table_size
 65 | int class_number;
 66 | 
 67 | real hardTanh(real x){
 68 | 	if(x>=1){
 69 | 		return 1;
 70 | 	}
 71 | 	else if(x<=-1){
 72 | 		return -1;
 73 | 	}
 74 | 	else{
 75 | 		return x;
 76 | 	}
 77 | }
 78 | 
 79 | real dHardTanh(real x, real g){
 80 | 	if(x > 1 && g > 0){
 81 | 		return 0;
 82 | 	}
 83 | 	if(x < -1 && g < 0){
 84 | 		return 0;
 85 | 	}
 86 | 	return 1;
 87 | }
 88 | 
 89 | void InitUnigramTable() {
 90 |   int a, i;
 91 |   long long train_words_pow = 0;
 92 |   real d1, power = 0.75;
 93 |   table = (int *)malloc(table_size * sizeof(int));
 94 |   for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
 95 |   i = 0;
 96 |   d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
 97 |   for (a = 0; a < table_size; a++) {
 98 |     table[a] = i;
 99 |     if (a / (real)table_size > d1) {
100 |       i++;
101 |       d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
102 |     }
103 |     if (i >= vocab_size) i = vocab_size - 1;
104 |   }
105 | }
106 | 
107 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
108 | void ReadWord(char *word, FILE *fin) {
109 |   int a = 0, ch;
110 |   while (!feof(fin)) {
111 |     ch = fgetc(fin);
112 |     if (ch == 13) continue;
113 |     if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
114 |       if (a > 0) {
115 |         if (ch == '\n') ungetc(ch, fin);
116 |         break;
117 |       }
118 |       if (ch == '\n') {
119 |         strcpy(word, (char *)"</s>");
120 |         return;
121 |       } else continue;
122 |     }
123 |     word[a] = ch;
124 |     a++;
125 |     if (a >= MAX_STRING - 1) a--;   // Truncate too long words
126 |   }
127 |   word[a] = 0;
128 | }
129 | 
130 | // Returns hash value of a word
131 | int GetWordHash(char *word) {
132 |   unsigned long long a, hash = 0;
133 |   for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
134 |   hash = hash % vocab_hash_size;
135 |   return hash;
136 | }
137 | 
138 | // Returns position of a word in the vocabulary; if the word is not found, returns -1
139 | int SearchVocab(char *word) {
140 |   unsigned int hash = GetWordHash(word);
141 |   while (1) {
142 |     if (vocab_hash[hash] == -1) return -1;
143 |     if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
144 |     hash = (hash + 1) % vocab_hash_size;
145 |   }
146 |   return -1;
147 | }
148 | 
149 | // Reads a word and returns its index in the vocabulary
150 | int ReadWordIndex(FILE *fin) {
151 |   char word[MAX_STRING];
152 |   ReadWord(word, fin);
153 |   if (feof(fin)) return -1;
154 |   return SearchVocab(word);
155 | }
156 | 
157 | // Adds a word to the vocabulary
158 | int AddWordToVocab(char *word) {
159 |   unsigned int hash, length = strlen(word) + 1;
160 |   if (length > MAX_STRING) length = MAX_STRING;
161 |   vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
162 |   strcpy(vocab[vocab_size].word, word);
163 |   vocab[vocab_size].cn = 0;
164 |   vocab[vocab_size].fixed = false;
165 |   vocab_size++;
166 |   // Reallocate memory if needed
167 |   if (vocab_size + 2 >= vocab_max_size) {
168 |     vocab_max_size += 10000;
169 |     vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
170 |   }
171 |   hash = GetWordHash(word);
172 |   while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
173 |   vocab_hash[hash] = vocab_size - 1;
174 |   return vocab_size - 1;
175 | }
176 | 
177 | // Used later for sorting by word counts
178 | int VocabCompare(const void *a, const void *b) {
179 |   return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
180 | }
181 | 
182 | // Sorts the vocabulary by frequency using word counts
183 | void SortVocab() {
184 |   int a, size;
185 |   unsigned int hash;
186 |   // Sort the vocabulary and keep </s> at the first position
187 |   qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
188 |   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
189 |   size = vocab_size;
190 |   train_words = 0;
191 |   for (a = 0; a < size; a++) {
192 |     // Words occuring less than min_count times will be discarded from the vocab
193 |     if ((vocab[a].cn < min_count) && (a != 0)) {
194 |       vocab_size--;
195 |       free(vocab[a].word);
196 |     } else {
197 |       // Hash will be re-computed, as after the sorting it is not actual
198 |       hash=GetWordHash(vocab[a].word);
199 |       while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
200 |       vocab_hash[hash] = a;
201 |       train_words += vocab[a].cn;
202 |     }
203 |   }
204 |   vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
205 |   // Allocate memory for the binary tree construction
206 |   for (a = 0; a < vocab_size; a++) {
207 |     vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
208 |     vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
209 |   }
210 | }
211 | 
212 | // Reduces the vocabulary by removing infrequent tokens
213 | void ReduceVocab() {
214 |   int a, b = 0;
215 |   unsigned int hash;
216 |   for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
217 |       vocab[b].cn = vocab[a].cn;
218 |       vocab[b].word = vocab[a].word;
219 |       vocab[b].fixed = vocab[a].fixed;
220 |       b++;
221 |     } else free(vocab[a].word);
222 |   vocab_size = b;
223 |   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
224 |   for (a = 0; a < vocab_size; a++) {
225 |     // Hash will be re-computed, as it is not actual
226 |     hash = GetWordHash(vocab[a].word);
227 |     while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
228 |     vocab_hash[hash] = a;
229 |   }
230 |   fflush(stdout);
231 |   min_reduce++;
232 | }
233 | 
234 | void ReadVectors(real *word_vectors, char *embeddings_filename) {
235 |   FILE * fp = NULL;
236 |   char *line = NULL;
237 |   char *buffer = NULL;
238 |   size_t len = 0;
239 |   int vector_size = 0;
240 |   ssize_t read;
241 |   bool first_line = true;
242 |   int j;
243 | 
244 |   if (embeddings_filename[0] == 0) { return; }
245 | 
246 |   fp = fopen(embeddings_filename, "r");
247 |   if (fp == NULL) exit(EXIT_FAILURE);
248 | 
249 |   while ((read = getline(&line, &len, fp)) != -1) {
250 |     char *first_field = NULL;
251 |     long long i;
252 |     // assert read != 0
253 |     // discard the first line in the embeddings file (metadata)
254 |     if (first_line) { 
255 |       first_line = false; 
256 |       // ignore the number of words
257 |       buffer = strtok(line, " ");
258 |       assert(buffer != NULL);
259 |       // read the vector size
260 |       buffer = strtok(NULL, " ");
261 |       assert(buffer != NULL);
262 |       vector_size = atoi(buffer);
263 |       assert(vector_size > 0);
264 |       continue;
265 |     }
266 |     
267 |     // each line consists of the word and its embeddings. only read the word.
268 |     first_field = strtok(line, " ");
269 |     assert(first_field != NULL);
270 |     i = SearchVocab(first_field);
271 |     
272 |     // ignore words which are not already in the vocabulary
273 |     if (i == -1) { continue; }
274 | 
275 |     // now read the vector one value at a time.
276 |     for (j = 0; j < vector_size; ++j) {
277 |       buffer = strtok(NULL, " ");
278 |       assert(buffer != NULL);
279 |       real value = atof(buffer);
280 |       word_vectors[i * layer1_size + j] = value;
281 |     }
282 |   }
283 |   fclose(fp);
284 | }
285 | 
286 | // add the words in fixed_word_vectors_file to the vocabulary.
287 | void ReadFixedEmbeddingsVocab() {
288 |   FILE * fp = NULL;
289 |   char * line = NULL;
290 |   size_t len = 0;
291 |   ssize_t read;
292 | 
293 |   printf("entering ReadFixedEmbeddingsVocab()\n");
294 | 
295 |   // do nothing if the file isn't specified.
296 |   if (fixed_word_vectors_file[0] == 0) return;
297 | 
298 |   // assume it's a text mode embeddings file
299 |   fp = fopen(fixed_word_vectors_file, "r");
300 |   if (fp == NULL) exit(EXIT_FAILURE);
301 | 
302 |   bool first_line = true;
303 |   while ((read = getline(&line, &len, fp)) != -1) {
304 |     char * first_field = NULL;
305 |     long long i;
306 |     // discard the first line in the embeddings file (metadata)
307 |     if (first_line) { first_line = false; continue; }
308 |     // each line consists of the word and its embeddings. only read the word.
309 |     first_field = strtok(line, " ");
310 |     i = SearchVocab(first_field);
311 |     // ignore new words and update the count of existing words so that they won't be pruned.
312 |     if (i == -1) {
313 |       // do nothing.
314 |       continue;
315 |     } else {
316 |       vocab[i].cn = fmax(min_count, vocab[i].cn);
317 |       vocab[i].fixed = true;
318 |     }
319 |   }
320 |   fclose(fp);
321 | }
322 | 
323 | void LearnVocabFromTrainFile() {
324 |   char word[MAX_STRING];
325 |   FILE *fin;
326 |   long long a, i;
327 |   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
328 |   fin = fopen(train_file, "rb");
329 |   if (fin == NULL) {
330 |     printf("ERROR: training data file not found!\n");
331 |     exit(1);
332 |   }
333 |   vocab_size = 0;
334 |   AddWordToVocab((char *)"</s>");
335 |   while (1) {
336 |     ReadWord(word, fin);
337 |     if (feof(fin)) break;
338 |     train_words++;
339 |     if ((debug_mode > 1) && (train_words % 100000 == 0)) {
340 |       printf("%lldK%c", train_words / 1000, 13);
341 |       fflush(stdout);
342 |     }
343 |     i = SearchVocab(word);
344 |     if (i == -1) {
345 |       a = AddWordToVocab(word);
346 |       vocab[a].cn = 1;
347 |       vocab[a].fixed = false;
348 |     } else vocab[i].cn++;
349 |     if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
350 |   }
351 |   ReadFixedEmbeddingsVocab();
352 |   printf("will sort vocab...\n");
353 |   SortVocab();
354 |   if (debug_mode > 0) {
355 |     printf("Vocab size: %lld\n", vocab_size);
356 |     printf("Words in train file: %lld\n", train_words);
357 |   }
358 |   file_size = ftell(fin);
359 |   fclose(fin);
360 | }
361 | 
362 | void SaveVocab() {
363 |   long long i;
364 |   FILE *fo = fopen(save_vocab_file, "wb");
365 |   for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
366 |   fclose(fo);
367 | }
368 | 
369 | void ReadVocab() {
370 |   long long a, i = 0;
371 |   char c;
372 |   char word[MAX_STRING];
373 |   FILE *fin = fopen(read_vocab_file, "rb");
374 |   if (fin == NULL) {
375 |     printf("Vocabulary file not found\n");
376 |     exit(1);
377 |   }
378 |   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
379 |   vocab_size = 0;
380 |   while (1) {
381 |     ReadWord(word, fin);
382 |     if (feof(fin)) break;
383 |     a = AddWordToVocab(word);
384 |     fscanf(fin, "%lld%c", &vocab[a].cn, &c);
385 |     i++;
386 |   }
387 |   ReadFixedEmbeddingsVocab();
388 |   SortVocab();
389 |   if (debug_mode > 0) {
390 |     printf("Vocab size: %lld\n", vocab_size);
391 |     printf("Words in train file: %lld\n", train_words);
392 |   }
393 |   fin = fopen(train_file, "rb");
394 |   if (fin == NULL) {
395 |     printf("ERROR: training data file not found!\n");
396 |     exit(1);
397 |   }
398 |   fseek(fin, 0, SEEK_END);
399 |   file_size = ftell(fin);
400 |   fclose(fin);
401 | }
402 | 
403 | void InitClassUnigramTable() {
404 |   long long a,c;
405 |   printf("loading class unigrams \n");
406 |   FILE *fin = fopen(negative_classes_file, "rb");
407 |   if (fin == NULL) {
408 |     printf("ERROR: class file not found!\n");
409 |     exit(1);
410 |   }
411 |   word_to_group = (int *)malloc(vocab_size * sizeof(int));
412 |   for(a = 0; a < vocab_size; a++) word_to_group[a] = -1;
413 |   char class[MAX_STRING];
414 |   char prev_class[MAX_STRING];
415 |   prev_class[0] = 0;
416 |   char word[MAX_STRING];
417 |   class_number = -1;
418 |   while (1) {
419 |     if (feof(fin)) break;
420 |     ReadWord(class, fin);
421 |     ReadWord(word, fin);
422 |     int word_index = SearchVocab(word);
423 |     if (word_index != -1){
424 |       if(strcmp(class, prev_class) != 0){
425 |         class_number++;
426 |         strcpy(prev_class, class);
427 |       }
428 |       word_to_group[word_index] = class_number;
429 |     }
430 |     ReadWord(word, fin);
431 |   }
432 |   class_number++;
433 |   fclose(fin);
434 |   
435 |   group_to_table = (int *)malloc(table_size * class_number * sizeof(int)); 
436 |   long long train_words_pow = 0;
437 |   real d1, power = 0.75;
438 |   
439 |   for(c = 0; c < class_number; c++){
440 |     long long offset = c * table_size;
441 |     train_words_pow = 0;
442 |     for (a = 0; a < vocab_size; a++) if(word_to_group[a] == c) train_words_pow += pow(vocab[a].cn, power);
443 |     int i = 0;
444 |     while(word_to_group[i]!=c && i < vocab_size) i++;
445 |     d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
446 |     for (a = 0; a < table_size; a++) {
447 |       //printf("index %lld , word %d\n", a, i);
448 |       group_to_table[offset + a] = i;
449 |       if (a / (real)table_size > d1) {
450 |         i++;
451 |         while(word_to_group[i]!=c && i < vocab_size) i++;
452 |         d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
453 |       }
454 |       if (i >= vocab_size) while(word_to_group[i]!=c && i >= 0) i--;
455 |     }
456 |   }
457 | }
458 | 
459 | void InitNet() {
460 |   long long a, b;
461 |   unsigned long long next_random = 1;
462 |   window_layer_size = layer1_size * window * 2;
463 |   a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
464 |   if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
465 |   
466 |   if (negative>0) {
467 |     a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
468 |     if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
469 |     
470 |     // Initialize syn1neg
471 |     for (a = 0; a < vocab_size; a++) {
472 |       for (b = 0; b < layer1_size; b++) {
473 |         syn1neg[a * layer1_size + b] = 0;
474 |       }
475 |     }
476 |     // Overwrite syn1neg for words in the fixed vocabulary
477 |     ReadVectors(syn1neg, fixed_context_vectors_file);
478 |   }
479 |   // for each word in the vocab
480 |   for (a = 0; a < vocab_size; a++) {
481 |     // for each element in its vector representation
482 |     for (b = 0; b < layer1_size; b++) {
483 |       next_random = next_random * (unsigned long long)25214903917 + 11;
484 |       // initialize syn0
485 |       syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
486 |     }
487 |   }
488 |   // Overwrite syn0 for words in the fixed vocabulary
489 |   ReadVectors(syn0, fixed_word_vectors_file);
490 | }
491 | 
492 | void *TrainModelThread(void *id) {
493 |   long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
494 |   long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
495 |   long long l1, l2, c, target, label, local_iter = iter;
496 |   unsigned long long next_random = (long long)id;
497 |   real f, g;
498 |   clock_t now;
499 |   int input_len_1 = (type == 2 || type == 4)? window_layer_size : layer1_size;
500 |   real *neu1e = (real *)calloc(input_len_1, sizeof(real));
501 | 
502 |   FILE *fi = fopen(train_file, "rb");
503 | 
504 |   // Each thread starts reading from a different part of the file.
505 |   fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
506 | 
507 |   // keep processing tokens from the training data until a check on the number of tokens processed
508 |   // followed by another check on the number of passes over training data breaks the loop.
509 |   while (1) {
510 | 
511 |     // every 10K updates, do some housekeeping such as updating the global count of processed tokens
512 |     // (shard by all threads), print a progress report, update the learning rate (shared by all threads)
513 |     if (word_count - last_word_count > 10000) {
514 |       word_count_actual += word_count - last_word_count;
515 |       last_word_count = word_count;
516 |       if ((debug_mode > 1)) {
517 |         now=clock();
518 |         printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
519 |                word_count_actual / (real)(iter * train_words + 1) * 100,
520 |                word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
521 |         fflush(stdout);
522 |       }
523 |       // linearly decrease the learning rate (once after 1000 updates).
524 |       alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
525 |       if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
526 |     }
527 | 
528 |     // sentence_length is reset before the first pass over the training data, after processing
529 |     // all tokens in a sentence, and again after the thread processes the number of tokens it's
530 |     //  supposed to process in each pass.
531 |     if (sentence_length == 0) {
532 |       // Read next token sequence from the train file.
533 |       while (1) {
534 |         word = ReadWordIndex(fi);
535 |         if (feof(fi)) break;
536 |         // skip words which don't belong to the vocabulary. 
537 |         if (word == -1) continue;
538 |         word_count++;
539 |         // word == 0 indicates a line break
540 |         if (word == 0) break;
541 |         // The subsampling randomly discards frequent words while keeping the ranking same
542 |         if (sample > 0) {
543 |           real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
544 |           next_random = next_random * (unsigned long long)25214903917 + 11;
545 |           // skip more frequent words more often
546 |           if (ran < (next_random & 0xFFFF) / (real)65536) continue;
547 |         }
548 |         sen[sentence_length] = word;
549 |         sentence_length++;
550 |         // ignore later words in sentences which are too long.
551 |         if (sentence_length >= MAX_SENTENCE_LENGTH) break;
552 |       }
553 |       sentence_position = 0;
554 |     }
555 | 
556 |     // (approx) finished processing my share of tokens in this pass over the training data
557 |     if (feof(fi) || (word_count > train_words / num_threads)) {
558 |       word_count_actual += word_count - last_word_count;
559 |       local_iter--;
560 |       // if done with all my iterations, quit the loop.
561 |       if (local_iter == 0) break;
562 |       word_count = 0;
563 |       last_word_count = 0;
564 |       sentence_length = 0;
565 |       fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
566 |       continue;
567 |     }
568 | 
569 |     // this is the "middle word", at sentence_position in the sentence.
570 |     word = sen[sentence_position];
571 |     
572 |     if (word == -1) continue;
573 | 
574 |     // zero initialize the neu* (real) vectors.
575 |     for (c = 0; c < input_len_1; c++) neu1e[c] = 0;    
576 | 
577 |     // let window = 5, then b = 0, 1, 2, 3, or 4.
578 |     next_random = next_random * (unsigned long long)25214903917 + 11;
579 |     b = next_random % window;
580 | 
581 |     if(type==1) {  //train skip-gram
582 |       // loop over context words
583 |       // let window = 5, b = 3, then a loops over 3,4,6,7,8
584 |       // which correspond to context words at relative positions: -2, -1, 1, 2, 3
585 |       for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
586 |           // the absolute position in the sentence for the current context word
587 |           c = sentence_position - window + a;
588 |           if (c < 0) continue;
589 |           if (c >= sentence_length) continue;
590 |           // the index of the current context word
591 |           last_word = sen[c];
592 |           if (last_word == -1) continue;
593 |           // the embedding of last_word starts at index l1 in the syn0 array
594 |           l1 = last_word * layer1_size;
595 |           // now, reuse c to loop over (and reset) the neu1e array
596 |           for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
597 |           // NEGATIVE SAMPLING
598 |           // loop over samples
599 |           if (negative > 0) for (d = 0; d < negative + 1; d++) {
600 | 
601 |               // in the first iteration, target is the correct word in the middle, with label = 1
602 |               if (d == 0) {
603 |                 target = word;
604 |                 label = 1;
605 |               } else {
606 |                 // in the remaining iterations, target is a random word from vocab, with label = 0
607 |                 next_random = next_random * (unsigned long long)25214903917 + 11;
608 |                 if(word_to_group != NULL && word_to_group[word] != -1){
609 |                   target = word;
610 |                   while(target == word) {
611 |                     target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
612 |                     next_random = next_random * (unsigned long long)25214903917 + 11;
613 |                   }
614 |                   //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
615 |                 }
616 |                 else{
617 |                   target = table[(next_random >> 16) % table_size];
618 |                 }
619 |                 // fix bad samples
620 |                 if (target == 0) target = next_random % (vocab_size - 1) + 1;
621 |                 if (target == word) continue;
622 |                 label = 0;
623 |               }
624 |               
625 |               // the index of the embedding of target word in the syn0 array.
626 |               l2 = target * layer1_size;
627 |               
628 |               // f = syn0[context] .* syn1neg[target]
629 |               // f is the dot product between the context word's embedding (in syn0)
630 |               // and the target word's embedding in syn1neg
631 |               f = 0;
632 |               for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
633 |               
634 |               // compute the scalar multiplier of the gradient update, which includes the sign
635 |               if (f > MAX_EXP) 
636 |                 g = (label - 1) * alpha;
637 |               else if (f < -MAX_EXP) 
638 |                 g = (label - 0) * alpha;
639 |               else 
640 |                 g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
641 |           
642 |               // for the current context word, neu1e accumulates the updates that should be later added to 
643 |               // the (word embedding) of the context word in syn0
644 |               for (c = 0; c < layer1_size; c++) {
645 |                 neu1e[c] += g * syn1neg[c + l2];
646 |               }
647 | 
648 |               // Update the (context embedding) for the current target word in syn1neg
649 |               if (!vocab[target].fixed || !fixed_context_vectors_file[0]) {
650 |                 for (c = 0; c < layer1_size; c++) { 
651 |                   syn1neg[c + l2] += g * syn0[c + l1]; 
652 |                 }
653 |               }
654 |             }
655 | 
656 |           // Learn weights input -> hidden
657 |           // update the (word embedding) for the current context word in syn0
658 |           if (!vocab[last_word].fixed) {
659 |             for (c = 0; c < layer1_size; c++) { 
660 |               syn0[c + l1] += neu1e[c]; 
661 |             }
662 |           }
663 |         }
664 |     }
665 |     else{
666 |       printf("unknown type %i", type);
667 |       exit(0);
668 |     }
669 |     sentence_position++;
670 |     // done reading this sentence.
671 |     if (sentence_position >= sentence_length) {
672 |       sentence_length = 0;
673 |       continue;
674 |     }
675 |   }
676 |   fclose(fi);
677 |   free(neu1e);
678 |   pthread_exit(NULL);
679 | }
680 | 
681 | void TrainModel() {
682 |   long a, b, c, d;
683 |   FILE *fo;
684 |   pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
685 |   printf("Starting training using file %s\n", train_file);
686 |   starting_alpha = alpha;
687 |   if (read_vocab_file[0] != 0) { ReadVocab(); }
688 |   else { LearnVocabFromTrainFile(); }
689 |   if (save_vocab_file[0] != 0) SaveVocab();
690 |   if (output_word_vectors_file[0] == 0) {
691 |     printf("output file not specified.\n");
692 |     return;
693 |   }
694 |   printf("entering InitNet()...\n");
695 |   InitNet();
696 |   printf("exited InitNet()\n");
697 |   if (negative > 0) InitUnigramTable();
698 |   if (negative_classes_file[0] != 0) InitClassUnigramTable();
699 |   start = clock();
700 |   for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
701 |   for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
702 | 
703 |   // Save the word vectors
704 |   fo = fopen(output_word_vectors_file, "wb");
705 |   if (classes == 0) {
706 |     if (!no_header) {
707 |       fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
708 |     }
709 |     for (a = 0; a < vocab_size; a++) {
710 |       // skip words which we already have embeddings for
711 |       if (vocab[a].fixed) { continue; }
712 |       fprintf(fo, "%s ", vocab[a].word);
713 |       if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
714 |       else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
715 |       fprintf(fo, "\n");
716 |     }
717 |   }
718 |   fclose(fo);
719 | 
720 |   // Save the context vectors
721 |   fo = fopen(output_context_vectors_file, "wb");
722 |   if (classes == 0) {
723 |     if (!no_header) {
724 |       fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
725 |     }
726 |     for (a = 0; a < vocab_size; a++) {
727 |       // skip contexts which we already have embeddings for
728 |       if (vocab[a].fixed && fixed_context_vectors_file[0]) { continue; }
729 |       fprintf(fo, "%s ", vocab[a].word);
730 |       if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn1neg[a * layer1_size + b], sizeof(real), 1, fo);
731 |       else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn1neg[a * layer1_size + b]);
732 |       fprintf(fo, "\n");
733 |     }
734 |   }
735 |   fclose(fo);
736 | }
737 | 
738 | int ArgPos(char *str, int argc, char **argv) {
739 |   int a;
740 |   for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
741 |       if (a == argc - 1) {
742 |         printf("Argument missing for %s\n", str);
743 |         exit(1);
744 |       }
745 |       return a;
746 |     }
747 |   return -1;
748 | }
749 | 
750 | int main(int argc, char **argv) {
751 |   int i;
752 |   if (argc == 1) {
753 |     printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
754 |     printf("Options:\n");
755 |     printf("Parameters for training:\n");
756 |     printf("\t-train <file>\n");
757 |     printf("\t\tUse text data from <file> to train the model\n");
758 |     printf("\t-output <file>\n");
759 |     printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
760 |     printf("\t-size <int>\n");
761 |     printf("\t\tSet size of word vectors; default is 100\n");
762 |     printf("\t-window <int>\n");
763 |     printf("\t\tSet max skip length between words; default is 5\n");
764 |     printf("\t-sample <float>\n");
765 |     printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
766 |     printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
767 |     printf("\t-hs <int>\n");
768 |     printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
769 |     printf("\t-negative <int>\n");
770 |     printf("\t-negative-classes <file>\n");
771 |     printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
772 |     printf("\t-threads <int>\n");
773 |     printf("\t\tUse <int> threads (default 12)\n");
774 |     printf("\t-iter <int>\n");
775 |     printf("\t\tRun more training iterations (default 5)\n");
776 |     printf("\t-min-count <int>\n");
777 |     printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
778 |     printf("\t-alpha <float>\n");
779 |     printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
780 |     printf("\t-classes <int>\n");
781 |     printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
782 |     printf("\t-debug <int>\n");
783 |     printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
784 |     printf("\t-binary <int>\n");
785 |     printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
786 |     printf("\t-save-vocab <file>\n");
787 |     printf("\t\tThe vocabulary will be saved to <file>\n");
788 |     printf("\t-read-vocab <file>\n");
789 |     printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
790 |     printf("\t-type <int>\n");
791 |     printf("\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
792 |     printf("\t-fix-embeddings <file>\n");
793 |     printf("\t\tUse the provided embeddings to induce embeddings for new words not specified in this file.\n");
794 |     printf("\t-no-header <int>\n");
795 |     printf("\t\tDon't write the header line in output embeddings file(s)\n");
796 |     printf("\nExamples:\n");
797 |     printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
798 |     return 0;
799 |   }
800 |   output_word_vectors_file[0] = 0;
801 |   save_vocab_file[0] = 0;
802 |   read_vocab_file[0] = 0;
803 |   negative_classes_file[0] = 0;
804 |   if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
805 |   if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
806 |   if ((i = ArgPos((char *)"-fix-embeddings", argc, argv)) > 0) {
807 |     strcpy(fixed_word_vectors_file, argv[i + 1]);
808 |     if (access( fixed_word_vectors_file, F_OK ) == -1) {
809 |       // fixed word embeddings file does not exist
810 |       fixed_word_vectors_file[0] = 0;
811 |     }
812 |     strcpy(fixed_context_vectors_file, fixed_word_vectors_file);
813 |     strcat(fixed_context_vectors_file, ".context");
814 |     if (access( fixed_context_vectors_file, F_OK ) == -1) {
815 |       // fixed context embeddings file does not exist
816 |       fixed_context_vectors_file[0] = 0;
817 |     }
818 |   }
819 |   if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
820 |   if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
821 |   if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
822 |   if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
823 |   if ((i = ArgPos((char *)"-type", argc, argv)) > 0) type = atoi(argv[i + 1]);
824 |   if (type==0 || type==2 || type==4) alpha = 0.05;
825 |   if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
826 |   if ((i = ArgPos((char *)"-output", argc, argv)) > 0) {
827 |     strcpy(output_word_vectors_file, argv[i + 1]);
828 |     strcpy(output_context_vectors_file, output_word_vectors_file);
829 |     strcat(output_context_vectors_file, ".context");
830 |   }
831 |   if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
832 |   if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
833 |   if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
834 |   if ((i = ArgPos((char *)"-no-header", argc, argv)) > 0) no_header = atoi(argv[i + 1]);
835 |   if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
836 |   if ((i = ArgPos((char *)"-negative-classes", argc, argv)) > 0) strcpy(negative_classes_file, argv[i + 1]);
837 |   if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
838 |   if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
839 |   if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
840 |   if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
841 |   vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
842 |   vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
843 |   expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
844 |   for (i = 0; i < EXP_TABLE_SIZE; i++) {
845 |     expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
846 |     expTable[i] = expTable[i] / (expTable[i] + 1);                   // Precompute f(x) = x / (x + 1)
847 |   }
848 |   TrainModel();
849 |   return 0;
850 | }
851 | 


--------------------------------------------------------------------------------