├── .gitignore ├── README.md ├── TransE ├── Test_TransE.cpp ├── Train_TransE.cpp ├── makefile ├── rebuild_relation2vec.py └── test_similarity.py └── Word2vec ├── README.md ├── binary_translation.c ├── distance.c ├── makefile ├── stopwords.txt ├── train_model.sh ├── word2vec.c ├── word2vec.tex ├── word2vec_transE.c └── word2vec_transE.tex /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RepresentationLearning 2 | 3 | 1. TransE模型 4 | 5 | Bordes et al.2013《Translating Embeddings for Modeling Multi-relational Data》[pdf](http://papers.nips.cc/paper/5071-translating-embeddings-for-modeling-multi-relational-data.pdf). 6 | 7 | 2. 
Word2vec+TransE模型 8 | 9 | 在训练Word2vec时,加入三元组信息,使得词向量训练更加充分 10 | -------------------------------------------------------------------------------- /TransE/Test_TransE.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Code is modified from https://github.com/Mrlyk423/Relation_Extraction 3 | @chenbingjin 2016-05-05 4 | */ 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | using namespace std; 16 | 17 | /* 18 | Evaluation for link prediction used in (Bordes, 2013) 19 | Method: Mean Rank and Hits@10 (Bordes,2011, aaai) 20 | input: 21 | entity2vec,relation2vec,test datasets, train datasets 22 | output: 23 | mean rank: 平均rank越低越好 24 | Hits@10(%): 前十命中越高越好 25 | 备注:考虑测试效率,可以减少测试集的大小。4w 测试集要跑2个半小时,当然可以用多线程改进。 26 | */ 27 | 28 | bool debug=false; 29 | bool L1_flag=1; 30 | 31 | string version; 32 | string trainortest = "test"; 33 | 34 | map relation2id,entity2id; 35 | map id2entity,id2relation; 36 | 37 | map > entity2num; //某个关系下实体出现的次数 38 | map e2num; // 实体出现次数 39 | 40 | int relation_num,entity_num; 41 | int n= 50; 42 | 43 | double sigmod(double x) 44 | { 45 | return 1.0/(1+exp(-x)); 46 | } 47 | 48 | double vec_len(vector a) 49 | { 50 | double res=0; 51 | for (int i=0; i a) 57 | { 58 | for (int i=0; i a,pair b) 75 | { 76 | return a.first>b.first; 77 | } 78 | 79 | double cmp(pair a, pair b) 80 | { 81 | return a.second > relation_vec,entity_vec; 86 | 87 | vector h,l,r; 88 | vector fb_h,fb_l,fb_r; 89 | map, map > ok; 90 | double res ; 91 | public: 92 | // 添加测试集 93 | void add(int x,int y,int z, bool flag) 94 | { 95 | if (flag) 96 | { 97 | fb_h.push_back(x); 98 | fb_r.push_back(z); 99 | fb_l.push_back(y); 100 | } 101 | ok[make_pair(x,z)][y]=1; 102 | } 103 | 104 | int rand_max(int x) 105 | { 106 | int res = (rand()*rand())%x; 107 | if (res<0) 108 | res+=x; 109 | return res; 110 | } 111 | double len; 112 | double calc_sum(int e1,int e2,int rel) 113 | { 114 | double sum=0; 115 | if (L1_flag) 116 | for (int ii=0; ii1e-3) 145 | cout<<"wrong_entity"< lsum_r,lsum_filter_r; //记录关系对应实体的rank 155 | map rsum_r,rsum_filter_r; 156 | map lp_n_r,lp_n_filter_r; 157 | map rp_n_r,rp_n_filter_r; 158 | map rel_num; // 测试集关系出现次数 159 | 160 | cout << "Test triplets num: " << fb_l.size() << endl; 161 | for (int testid = 0; testid > a; 171 | for (int i=0; i=0; i--) 180 | { 181 | if (ok[make_pair(a[i].first,rel)].count(l)>0) 182 | ttt++; 183 | if (ok[make_pair(a[i].first,rel)].count(l)==0) 184 | filter+=1; 185 | if (a[i].first ==h) //记录正确实体的rank 186 | { 187 | //cout <<"hit: " << i << endl; 188 | lsum+=a.size()-i; 189 | lsum_filter+=filter+1; 190 | lsum_r[rel]+=a.size()-i; 191 | lsum_filter_r[rel]+=filter+1; 192 | if (a.size()-i<=10) 193 | { 194 | lp_n+=1; 195 | lp_n_r[rel]+=1; 196 | } 197 | if (filter<10) 198 | { 199 | lp_n_filter+=1; 200 | lp_n_filter_r[rel]+=1; 201 | } 202 | break; 203 | } 204 | } 205 | /* 注:为了方便,我只测试了替换h的效果 206 | // (h,rel,l)替换l后,计算每个实体的非相似性 (Bordes,2011) 207 | a.clear(); 208 | for (int i=0; i=0; i--) 217 | { 218 | if (ok[make_pair(h,rel)].count(a[i].first)>0) 219 | ttt++; 220 | if (ok[make_pair(h,rel)].count(a[i].first)==0) 221 | filter+=1; 222 | if (a[i].first==l) 223 | { 224 | rsum+=a.size()-i; 225 | rsum_filter+=filter+1; 226 | rsum_r[rel]+=a.size()-i; 227 | rsum_filter_r[rel]+=filter+1; 228 | if (a.size()-i<=10) 229 | { 230 | rp_n+=1; 231 | rp_n_r[rel]+=1; 232 | } 233 | if (filter<10) 234 | { 235 | rp_n_filter+=1; 236 | rp_n_filter_r[rel]+=1; 237 | } 238 | break; 239 | } 240 
| }*/ 241 | } 242 | cout<<"---------left----------"<(end_time-start_time)/CLOCKS_PER_SEC*1000*1000*60); 382 | } 383 | } 384 | 385 | -------------------------------------------------------------------------------- /TransE/Train_TransE.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Code is modified from https://github.com/Mrlyk423/Relation_Extraction 3 | @chenbingjin 2016-05-03 4 | */ 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | using namespace std; 16 | 17 | #define pi 3.1415926535897932384626433832795 18 | 19 | bool L1_flag = 0; //默认采用L1=1 20 | 21 | // 随机数 22 | double rand(double min, double max) 23 | { 24 | return min + (max-min)*rand()/(RAND_MAX + 1.0); 25 | } 26 | // 正态分布 27 | double normal(double x, double miu,double sigma) 28 | { 29 | return 1.0/sqrt(2*pi)/sigma*exp(-1*(x-miu)*(x-miu)/(2*sigma*sigma)); 30 | } 31 | // 在[min,max]区间内做正态分布采样? 32 | double randn(double miu,double sigma, double min ,double max) 33 | { 34 | double x,y,dScope; 35 | do{ 36 | x=rand(min,max); 37 | y=normal(x,miu,sigma); 38 | dScope=rand(0.0,normal(miu,miu,sigma)); 39 | }while(dScope>y); 40 | return x; 41 | } 42 | // 平方 43 | double sqr(double x) 44 | { 45 | return x*x; 46 | } 47 | // 向量的模(L2) 48 | double vec_len(vector &a) 49 | { 50 | double res=0; 51 | for (int i=0; i relation2id,entity2id; 67 | map id2entity,id2relation; 68 | 69 | /* 70 | 记录head实体和tail实体关系类型:1-1,1-N,N-1,N-N 71 | 对于三元组(h,r,t) 72 | left_entity[r][h] ++ 73 | right_entity[r][t] ++ 74 | */ 75 | map > left_entity,right_entity; 76 | map left_num,right_num; 77 | 78 | class Train{ 79 | 80 | public: 81 | map, map > ok; 82 | // 添加三元组训练集 83 | void add(int x,int y,int z) 84 | { 85 | fb_h.push_back(x); 86 | fb_r.push_back(z); 87 | fb_l.push_back(y); 88 | ok[make_pair(x,z)][y]=1; 89 | } 90 | // transE算法学习过程(Bordes,2013) 91 | void run(int n_in,double rate_in,double margin_in,int method_in) 92 | { 93 | cout << "Initing vector..." 
<< endl; 94 | n = n_in; 95 | rate = rate_in; 96 | margin = margin_in; 97 | method = method_in; 98 | // 申请对应的向量空间 99 | relation_vec.resize(relation_num); 100 | for (int i=0; i fb_h,fb_l,fb_r; //(h,r,l) 138 | vector > relation_vec,entity_vec; // 关系向量,实体向量 139 | vector > relation_tmp,entity_tmp; // 优化求解过程,临时向量 140 | // 归一化 141 | double norm(vector &a) 142 | { 143 | double x = vec_len(a); 144 | if (x>1) 145 | for (int ii=0; ii0) 186 | j=rand_max(entity_num); 187 | // 训练正负样本(h,r,l)(h,r,l') 188 | train_kb(fb_h[i],fb_l[i],fb_r[i],fb_h[i],j,fb_r[i]); 189 | } 190 | else 191 | { 192 | while (ok[make_pair(j,fb_r[i])].count(fb_l[i])>0) 193 | j=rand_max(entity_num); 194 | // 训练正负样本(h,r,l)(h',r,l) 195 | train_kb(fb_h[i],fb_l[i],fb_r[i],j,fb_l[i],fb_r[i]); 196 | } 197 | norm(relation_tmp[fb_r[i]]); 198 | norm(entity_tmp[fb_h[i]]); 199 | norm(entity_tmp[fb_l[i]]); 200 | norm(entity_tmp[j]); 201 | } 202 | relation_vec = relation_tmp; 203 | entity_vec = entity_tmp; 204 | } 205 | clock_t end_time = clock(); 206 | cout<<"epoch: "<(end_time-start_time)/CLOCKS_PER_SEC*1000 <<"ms, res:" <0) 248 | x=1; 249 | else 250 | x=-1; 251 | relation_tmp[rel_a][ii]-=-1*rate*x; 252 | entity_tmp[e1_a][ii]-=-1*rate*x; 253 | entity_tmp[e2_a][ii]+=-1*rate*x; 254 | x = 2*(entity_vec[e2_b][ii]-entity_vec[e1_b][ii]-relation_vec[rel_b][ii]); 255 | if (L1_flag) 256 | if (x>0) 257 | x=1; 258 | else 259 | x=-1; 260 | relation_tmp[rel_b][ii]-=rate*x; 261 | entity_tmp[e1_b][ii]-=rate*x; 262 | entity_tmp[e2_b][ii]+=rate*x; 263 | } 264 | } 265 | // 训练过程:计算损失(L1或L2),计算梯度,更新向量 266 | void train_kb(int e1_a,int e2_a,int rel_a,int e1_b,int e2_b,int rel_b) 267 | { 268 | double sum1 = calc_sum(e1_a,e2_a,rel_a); 269 | double sum2 = calc_sum(e1_b,e2_b,rel_b); 270 | if (sum1+margin>sum2) 271 | { 272 | res+=margin+sum1-sum2; 273 | gradient( e1_a, e2_a, rel_a, e1_b, e2_b, rel_b); 274 | } 275 | } 276 | }; 277 | 278 | Train train; 279 | // 训练数据准备 280 | void prepare() 281 | { 282 | /* 283 | 需考虑实体中可能含有空格(eg.'psp go'),故采用按行读取,再划分实体名/关系名和id 284 | */ 285 | FILE* f1 = fopen("./data/entity2id.txt","r"); 286 | FILE* f2 = fopen("./data/relation2id.txt","r"); 287 | int x; 288 | cout << "Reading entity2id ..." << endl; 289 | while (!feof(f1)) 290 | { 291 | fgets(buf1,4096,f1); 292 | sscanf(buf1,"%[^\t]\t%d\n",buf,&x); 293 | string st=buf; 294 | entity2id[st]=x; 295 | id2entity[x]=st; 296 | entity_num++; 297 | if (x % 500000 == 0) 298 | cout << st << " " << x << endl; 299 | } 300 | cout << entity2id.size() << endl; 301 | cout << "Reading relation2id ..." << endl; 302 | while (!feof(f2)) 303 | { 304 | fgets(buf1,4096,f2); 305 | sscanf(buf1,"%[^\t]\t%d\n",buf,&x); 306 | string st=buf; 307 | relation2id[st]=x; 308 | id2relation[x]=st; 309 | relation_num++; 310 | } 311 | //cout << "Press num to read training data" << endl; 312 | //int c; 313 | //c = getchar(); 314 | FILE* f_kb = fopen("./data/train.txt","r"); 315 | char buf3[40960]; 316 | cout << "Loading training data..." 
<< endl; 317 | while (!feof(f_kb)) 318 | { 319 | fgets(buf,20480,f_kb); 320 | sscanf(buf,"%[^\t]\t%[^\t]\t%[^\t\n]\n", buf1,buf2,buf3); 321 | string s1=buf1; 322 | string s2=buf3; 323 | string s3=buf2; //relation 324 | //cout << s1 << " " << s3 << " " << s2 << endl; 325 | if (entity2id.count(s1)==0) 326 | { 327 | cout<<"miss head entity:"<::iterator it = left_entity[i].begin(); it!=left_entity[i].end(); it++) 349 | { 350 | sum1++; 351 | sum2+=it->second; 352 | } 353 | left_num[i]=sum2/sum1; 354 | } 355 | // 计算每个关系tail实体出现的平均数 356 | for (int i=0; i::iterator it = right_entity[i].begin(); it!=right_entity[i].end(); it++) 360 | { 361 | sum1++; 362 | sum2+=it->second; 363 | } 364 | right_num[i]=sum2/sum1; 365 | } 366 | cout<<"relation_num="< 0) n = atoi(argv[i + 1]); 401 | //if ((i = ArgPos((char *)"-margin", argc, argv)) > 0) margin = atoi(argv[i + 1]); 402 | if ((i = ArgPos((char *)"-rate", argc, argv)) > 0) rate = atoi(argv[i + 1]); 403 | if ((i = ArgPos((char *)"-method", argc, argv)) > 0) method = atoi(argv[i + 1]); 404 | cout<<"size = "< 19 | #include 20 | #include 21 | #if __APPLE__ 22 | #include 23 | #else 24 | #include 25 | #endif 26 | 27 | const long long max_size = 2000; // max length of strings 28 | const long long max_w = 50; // max length of vocabulary entries 29 | 30 | int main(int argc, char **argv) { 31 | FILE *f; 32 | FILE *fo; 33 | char file_name[max_size]; 34 | char output_file[max_size]; 35 | float len; 36 | long long words, size, a, b; 37 | float *M; 38 | char *vocab; 39 | if (argc < 3) { 40 | printf("Usage: ./binary_translation \nwhere BINFILE contains word projections in the BINARY FORMAT\n"); 41 | return 0; 42 | } 43 | strcpy(file_name, argv[1]); 44 | strcpy(output_file, argv[2]); 45 | f = fopen(file_name, "rb"); 46 | if (f == NULL) { 47 | printf("Input file not found\n"); 48 | return -1; 49 | } 50 | fscanf(f, "%lld", &words); 51 | fscanf(f, "%lld", &size); 52 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 53 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 54 | if (M == NULL) { 55 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 56 | return -1; 57 | } 58 | fo = fopen(output_file, "wb"); 59 | for (b = 0; b < words; b++) { 60 | // read word name 61 | a = 0; 62 | while (1) { 63 | vocab[b * max_w + a] = fgetc(f); 64 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 65 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 66 | } 67 | vocab[b * max_w + a] = 0; 68 | // read word vector and normalized vector 69 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 70 | len = 0; 71 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 72 | len = sqrt(len); 73 | for (a = 0; a < size; a++) M[a + b * size] /= len; 74 | // save word vector in origin txt 75 | fprintf(fo, "%s ", &vocab[b * max_w]); 76 | for (a = 0; a < size; a++) fprintf(fo, "%lf ", M[a + b * size]); 77 | fprintf(fo, "\n"); 78 | } 79 | fclose(f); 80 | fclose(fo); 81 | return 0; 82 | } 83 | -------------------------------------------------------------------------------- /Word2vec/distance.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // modified by chenbingjin 16 | // @2017/12/27 15:00:01 17 | 18 | #include 19 | #include 20 | #include 21 | #if __APPLE__ 22 | #include 23 | #else 24 | #include 25 | #endif 26 | 27 | const long long max_size = 2000; // max length of strings 28 | const long long N = 40; // number of closest words that will be shown 29 | const long long max_w = 50; // max length of vocabulary entries 30 | 31 | int main(int argc, char **argv) { 32 | FILE *f; 33 | char st1[max_size]; 34 | char *bestw[N]; 35 | char file_name[max_size], st[100][max_size]; 36 | float dist, len, bestd[N], vec[max_size]; 37 | long long words, size, a, b, c, d, cn, bi[100]; 38 | float *M; 39 | char *vocab; 40 | if (argc < 2) { 41 | printf("Usage: ./distance \nwhere FILE contains word projections in the BINARY FORMAT\n"); 42 | return 0; 43 | } 44 | strcpy(file_name, argv[1]); 45 | f = fopen(file_name, "rb"); 46 | if (f == NULL) { 47 | printf("Input file not found\n"); 48 | return -1; 49 | } 50 | fscanf(f, "%lld", &words); 51 | fscanf(f, "%lld", &size); 52 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 53 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 54 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 55 | if (M == NULL) { 56 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 57 | return -1; 58 | } 59 | for (b = 0; b < words; b++) { 60 | a = 0; 61 | while (1) { 62 | vocab[b * max_w + a] = fgetc(f); 63 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 64 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 65 | } 66 | vocab[b * max_w + a] = 0; 67 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 68 | len = 0; 69 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 70 | len = sqrt(len); 71 | for (a = 0; a < size; a++) M[a + b * size] /= len; 72 | } 73 | fclose(f); 74 | while (1) { 75 | for (a = 0; a < N; a++) bestd[a] = 0; 76 | for (a = 0; a < N; a++) bestw[a][0] = 0; 77 | printf("Enter word or sentence (EXIT to break): "); 78 | a = 0; 79 | while (1) { 80 | st1[a] = fgetc(stdin); 81 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 82 | st1[a] = 0; 83 | break; 84 | } 85 | a++; 86 | } 87 | if (!strcmp(st1, "EXIT")) break; 88 | cn = 0; 89 | b = 0; 90 | c = 0; 91 | while (1) { 92 | st[cn][b] = st1[c]; 93 | b++; 94 | c++; 95 | st[cn][b] = 0; 96 | if (st1[c] == 0) break; 97 | if (st1[c] == ' ') { 98 | cn++; 99 | b = 0; 100 | c++; 101 | } 102 | } 103 | cn++; 104 | for (a = 0; a < cn; a++) { 105 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 106 | if (b == words) b = -1; 107 | bi[a] = b; 108 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 109 | if (b == -1) { 110 | printf("Out of dictionary word!\n"); 111 | break; 112 | } 113 | } 114 | if (b == -1) continue; 115 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); 116 | for (a = 0; a < size; a++) vec[a] = 0; 117 | for (b = 0; b < 
cn; b++) { 118 | if (bi[b] == -1) continue; 119 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 120 | } 121 | len = 0; 122 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 123 | len = sqrt(len); 124 | for (a = 0; a < size; a++) vec[a] /= len; 125 | for (a = 0; a < N; a++) bestd[a] = -1; 126 | for (a = 0; a < N; a++) bestw[a][0] = 0; 127 | for (c = 0; c < words; c++) { 128 | a = 0; 129 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 130 | if (a == 1) continue; 131 | dist = 0; 132 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 133 | for (a = 0; a < N; a++) { 134 | if (dist > bestd[a]) { 135 | for (d = N - 1; d > a; d--) { 136 | bestd[d] = bestd[d - 1]; 137 | strcpy(bestw[d], bestw[d - 1]); 138 | } 139 | bestd[a] = dist; 140 | strcpy(bestw[a], &vocab[c * max_w]); 141 | break; 142 | } 143 | } 144 | } 145 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 146 | } 147 | return 0; 148 | } 149 | -------------------------------------------------------------------------------- /Word2vec/makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | #CC = gcc 3 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions 4 | CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result 5 | all: Word2vec Word2vec_transE Distance BinaryTrans 6 | Word2vec: word2vec.c 7 | $(CC) word2vec.c -o word2vec $(CFLAGS) 8 | Word2vec_transE: word2vec_transE.c 9 | $(CC) word2vec_transE.c -o word2vec_transE $(CFLAGS) 10 | Distance: distance.c 11 | $(CC) distance.c -o distance $(CFLAGS) 12 | BinaryTrans: binary_translation.c 13 | $(CC) binary_translation.c -o binary_translation $(CFLAGS) 14 | clean: 15 | rm -rf word2vec word2vec_transE distance binary_translation 16 | -------------------------------------------------------------------------------- /Word2vec/stopwords.txt: -------------------------------------------------------------------------------- 1 | ! 2 | """" 3 | "," 4 | # 5 | $ 6 | % 7 | & 8 | ' 9 | ( 10 | ) 11 | * 12 | + 13 | - 14 | -- 15 | . 16 | .. 17 | ... 18 | / 19 | 0 20 | 1 21 | 2 22 | 3 23 | 4 24 | 5 25 | 6 26 | 7 27 | 8 28 | 9 29 | : 30 | ; 31 | < 32 | = 33 | > 34 | >> 35 | ? 
36 | @ 37 | A 38 | ZT 39 | ZZ 40 | [ 41 | \ 42 | ] 43 | ^ 44 | _ 45 | ` 46 | a 47 | a's 48 | able 49 | about 50 | above 51 | abroad 52 | according 53 | accordingly 54 | across 55 | actually 56 | adj 57 | after 58 | afterwards 59 | again 60 | against 61 | ago 62 | ahead 63 | ain't 64 | all 65 | allow 66 | allows 67 | almost 68 | alone 69 | along 70 | alongside 71 | already 72 | also 73 | although 74 | always 75 | am 76 | amid 77 | amidst 78 | among 79 | amongst 80 | an 81 | and 82 | another 83 | any 84 | anybody 85 | anyhow 86 | anyone 87 | anything 88 | anyway 89 | anyways 90 | anywhere 91 | apart 92 | appear 93 | appreciate 94 | appropriate 95 | are 96 | aren't 97 | around 98 | as 99 | aside 100 | ask 101 | asking 102 | associated 103 | at 104 | available 105 | away 106 | awfully 107 | b 108 | back 109 | backward 110 | backwards 111 | be 112 | became 113 | because 114 | become 115 | becomes 116 | becoming 117 | been 118 | before 119 | beforehand 120 | begin 121 | behind 122 | being 123 | believe 124 | below 125 | beside 126 | besides 127 | best 128 | better 129 | between 130 | beyond 131 | both 132 | brief 133 | but 134 | by 135 | c 136 | c'mon 137 | c's 138 | came 139 | can 140 | can't 141 | cannot 142 | cant 143 | caption 144 | cause 145 | causes 146 | certain 147 | certainly 148 | changes 149 | clearly 150 | co 151 | co. 152 | com 153 | come 154 | comes 155 | concerning 156 | consequently 157 | consider 158 | considering 159 | contain 160 | containing 161 | contains 162 | corresponding 163 | could 164 | couldn't 165 | course 166 | currently 167 | d 168 | dare 169 | daren't 170 | definitely 171 | described 172 | despite 173 | did 174 | didn't 175 | different 176 | directly 177 | do 178 | does 179 | doesn't 180 | doing 181 | don't 182 | done 183 | down 184 | downwards 185 | during 186 | e 187 | each 188 | edu 189 | eg 190 | eight 191 | eighty 192 | either 193 | else 194 | elsewhere 195 | end 196 | ending 197 | enough 198 | entirely 199 | especially 200 | et 201 | etc 202 | even 203 | ever 204 | evermore 205 | every 206 | everybody 207 | everyone 208 | everything 209 | everywhere 210 | ex 211 | exactly 212 | example 213 | except 214 | f 215 | fairly 216 | far 217 | farther 218 | few 219 | fewer 220 | fifth 221 | first 222 | five 223 | followed 224 | following 225 | follows 226 | for 227 | forever 228 | former 229 | formerly 230 | forth 231 | forward 232 | found 233 | four 234 | from 235 | further 236 | furthermore 237 | g 238 | get 239 | gets 240 | getting 241 | given 242 | gives 243 | go 244 | goes 245 | going 246 | gone 247 | got 248 | gotten 249 | greetings 250 | h 251 | had 252 | hadn't 253 | half 254 | happens 255 | hardly 256 | has 257 | hasn't 258 | have 259 | haven't 260 | having 261 | he 262 | he'd 263 | he'll 264 | he's 265 | hello 266 | help 267 | hence 268 | her 269 | here 270 | here's 271 | hereafter 272 | hereby 273 | herein 274 | hereupon 275 | hers 276 | herself 277 | hi 278 | him 279 | himself 280 | his 281 | hither 282 | hopefully 283 | how 284 | howbeit 285 | however 286 | hundred 287 | i 288 | i'd 289 | i'll 290 | i'm 291 | i've 292 | ie 293 | if 294 | ignored 295 | immediate 296 | in 297 | inasmuch 298 | inc 299 | inc. 
300 | indeed 301 | indicate 302 | indicated 303 | indicates 304 | inner 305 | inside 306 | insofar 307 | instead 308 | into 309 | inward 310 | is 311 | isn't 312 | it 313 | it'd 314 | it'll 315 | it's 316 | its 317 | itself 318 | j 319 | just 320 | k 321 | keep 322 | keeps 323 | kept 324 | know 325 | known 326 | knows 327 | l 328 | last 329 | lately 330 | later 331 | latter 332 | latterly 333 | least 334 | less 335 | lest 336 | let 337 | let's 338 | like 339 | liked 340 | likely 341 | likewise 342 | little 343 | look 344 | looking 345 | looks 346 | low 347 | lower 348 | ltd 349 | m 350 | made 351 | mainly 352 | make 353 | makes 354 | many 355 | may 356 | maybe 357 | mayn't 358 | me 359 | mean 360 | meantime 361 | meanwhile 362 | merely 363 | might 364 | mightn't 365 | mine 366 | minus 367 | miss 368 | more 369 | moreover 370 | most 371 | mostly 372 | mr 373 | mrs 374 | much 375 | must 376 | mustn't 377 | my 378 | myself 379 | n 380 | name 381 | namely 382 | nd 383 | near 384 | nearly 385 | necessary 386 | need 387 | needn't 388 | needs 389 | neither 390 | never 391 | neverf 392 | neverless 393 | nevertheless 394 | new 395 | next 396 | nine 397 | ninety 398 | no 399 | no-one 400 | nobody 401 | non 402 | none 403 | nonetheless 404 | noone 405 | nor 406 | normally 407 | not 408 | nothing 409 | notwithstanding 410 | novel 411 | now 412 | nowhere 413 | o 414 | obviously 415 | of 416 | off 417 | often 418 | oh 419 | ok 420 | okay 421 | old 422 | on 423 | once 424 | one 425 | one's 426 | ones 427 | only 428 | onto 429 | opposite 430 | or 431 | other 432 | others 433 | otherwise 434 | ought 435 | oughtn't 436 | our 437 | ours 438 | ourselves 439 | out 440 | outside 441 | over 442 | overall 443 | own 444 | p 445 | particular 446 | particularly 447 | past 448 | per 449 | perhaps 450 | placed 451 | please 452 | plus 453 | possible 454 | presumably 455 | probably 456 | provided 457 | provides 458 | q 459 | que 460 | quite 461 | qv 462 | r 463 | rather 464 | rd 465 | re 466 | really 467 | reasonably 468 | recent 469 | recently 470 | regarding 471 | regardless 472 | regards 473 | relatively 474 | respectively 475 | right 476 | round 477 | s 478 | said 479 | same 480 | saw 481 | say 482 | saying 483 | says 484 | second 485 | secondly 486 | see 487 | seeing 488 | seem 489 | seemed 490 | seeming 491 | seems 492 | seen 493 | self 494 | selves 495 | sensible 496 | sent 497 | serious 498 | seriously 499 | seven 500 | several 501 | shall 502 | shan't 503 | she 504 | she'd 505 | she'll 506 | she's 507 | should 508 | shouldn't 509 | since 510 | six 511 | so 512 | some 513 | somebody 514 | someday 515 | somehow 516 | someone 517 | something 518 | sometime 519 | sometimes 520 | somewhat 521 | somewhere 522 | soon 523 | sorry 524 | specified 525 | specify 526 | specifying 527 | still 528 | sub 529 | such 530 | sup 531 | sure 532 | t 533 | t's 534 | take 535 | taken 536 | taking 537 | tell 538 | tends 539 | th 540 | than 541 | thank 542 | thanks 543 | thanx 544 | that 545 | that'll 546 | that's 547 | that've 548 | thats 549 | the 550 | their 551 | theirs 552 | them 553 | themselves 554 | then 555 | thence 556 | there 557 | there'd 558 | there'll 559 | there're 560 | there's 561 | there've 562 | thereafter 563 | thereby 564 | therefore 565 | therein 566 | theres 567 | thereupon 568 | these 569 | they 570 | they'd 571 | they'll 572 | they're 573 | they've 574 | thing 575 | things 576 | think 577 | third 578 | thirty 579 | this 580 | thorough 581 | thoroughly 582 | those 583 | though 584 | three 585 | through 586 | 
throughout 587 | thru 588 | thus 589 | till 590 | to 591 | together 592 | too 593 | took 594 | toward 595 | towards 596 | tried 597 | tries 598 | truly 599 | try 600 | trying 601 | twice 602 | two 603 | u 604 | un 605 | under 606 | underneath 607 | undoing 608 | unfortunately 609 | unless 610 | unlike 611 | unlikely 612 | until 613 | unto 614 | up 615 | upon 616 | upwards 617 | us 618 | use 619 | used 620 | useful 621 | uses 622 | using 623 | usually 624 | v 625 | value 626 | various 627 | versus 628 | very 629 | via 630 | viz 631 | vs 632 | w 633 | want 634 | wants 635 | was 636 | wasn't 637 | way 638 | we 639 | we'd 640 | we'll 641 | we're 642 | we've 643 | welcome 644 | well 645 | went 646 | were 647 | weren't 648 | what 649 | what'll 650 | what's 651 | what've 652 | whatever 653 | when 654 | whence 655 | whenever 656 | where 657 | where's 658 | whereafter 659 | whereas 660 | whereby 661 | wherein 662 | whereupon 663 | wherever 664 | whether 665 | which 666 | whichever 667 | while 668 | whilst 669 | whither 670 | who 671 | who'd 672 | who'll 673 | who's 674 | whoever 675 | whole 676 | whom 677 | whomever 678 | whose 679 | why 680 | will 681 | willing 682 | wish 683 | with 684 | within 685 | without 686 | won't 687 | wonder 688 | would 689 | wouldn't 690 | x 691 | y 692 | yes 693 | yet 694 | you 695 | you'd 696 | you'll 697 | you're 698 | you've 699 | your 700 | yours 701 | yourself 702 | yourselves 703 | z 704 | zero 705 | zt 706 | zz 707 | | 708 | ~ 709 | 710 | ―― 711 | ‘ 712 | ’ 713 | “ 714 | ” 715 | … 716 | 、 717 | 。 718 | 〈 719 | 〉 720 | 《 721 | 》 722 | ・ 723 | 一 724 | 一下 725 | 一些 726 | 一切 727 | 一则 728 | 一天 729 | 一定 730 | 一方面 731 | 一旦 732 | 一时 733 | 一来 734 | 一样 735 | 一次 736 | 一片 737 | 一直 738 | 一致 739 | 一般 740 | 一起 741 | 一边 742 | 一面 743 | 七 744 | 万一 745 | 三 746 | 三天两头 747 | 三番两次 748 | 三番五次 749 | 上 750 | 上下 751 | 上升 752 | 上去 753 | 上来 754 | 上述 755 | 上面 756 | 下列 757 | 下去 758 | 下来 759 | 下面 760 | 不 761 | 不一 762 | 不下 763 | 不久 764 | 不了 765 | 不亦乐乎 766 | 不仅 767 | 不仅仅 768 | 不仅仅是 769 | 不会 770 | 不但 771 | 不光 772 | 不免 773 | 不再 774 | 不力 775 | 不单 776 | 不变 777 | 不只 778 | 不可 779 | 不可开交 780 | 不可抗拒 781 | 不同 782 | 不外 783 | 不外乎 784 | 不够 785 | 不大 786 | 不如 787 | 不妨 788 | 不定 789 | 不对 790 | 不少 791 | 不巧 792 | 不已 793 | 不常 794 | 不得 795 | 不得不 796 | 不得了 797 | 不得已 798 | 不必 799 | 不怎么 800 | 不怕 801 | 不惟 802 | 不成 803 | 不拘 804 | 不择手段 805 | 不敢 806 | 不料 807 | 不断 808 | 不日 809 | 不时 810 | 不是 811 | 不曾 812 | 不止 813 | 不止一次 814 | 不比 815 | 不消 816 | 不满 817 | 不然 818 | 不然的话 819 | 不特 820 | 不独 821 | 不由得 822 | 不知不觉 823 | 不管 824 | 不管怎样 825 | 不经意 826 | 不胜 827 | 不能 828 | 不能不 829 | 不至于 830 | 不要 831 | 不论 832 | 不起 833 | 不足 834 | 不过 835 | 不迭 836 | 不问 837 | 不限 838 | 与 839 | 与其 840 | 与否 841 | 与此同时 842 | 专门 843 | 且 844 | 两者 845 | 严格 846 | 严重 847 | 个 848 | 个人 849 | 个别 850 | 中小 851 | 中间 852 | 丰富 853 | 串行 854 | 临 855 | 临到 856 | 为 857 | 为主 858 | 为了 859 | 为什么 860 | 为什麽 861 | 为何 862 | 为着 863 | 主张 864 | 主要 865 | 举凡 866 | 举行 867 | 乃 868 | 乃至 869 | 么 870 | 之 871 | 之一 872 | 之前 873 | 之后 874 | 之後 875 | 之所以 876 | 之类 877 | 乌乎 878 | 乎 879 | 乒 880 | 乘 881 | 乘势 882 | 乘机 883 | 乘胜 884 | 乘虚 885 | 乘隙 886 | 九 887 | 也 888 | 也好 889 | 也是 890 | 也罢 891 | 了 892 | 了解 893 | 争取 894 | 二 895 | 二话不说 896 | 二话没说 897 | 于 898 | 于是 899 | 于是乎 900 | 云云 901 | 互 902 | 互相 903 | 五 904 | 交口 905 | 产生 906 | 亲口 907 | 亲手 908 | 亲眼 909 | 亲自 910 | 亲身 911 | 人人 912 | 人们 913 | 人家 914 | 人民 915 | 什么 916 | 什么样 917 | 什麽 918 | 仅 919 | 仅仅 920 | 今后 921 | 今天 922 | 今年 923 | 今後 924 | 仍 925 | 仍旧 926 | 仍然 927 | 从 928 | 从不 929 | 从严 930 | 从中 931 | 从事 932 | 从今以后 933 | 从优 934 | 从古到今 935 | 从古至今 936 | 从头 937 | 从宽 938 
| 从小 939 | 从新 940 | 从无到有 941 | 从早到晚 942 | 从未 943 | 从来 944 | 从此 945 | 从此以后 946 | 从而 947 | 从轻 948 | 从速 949 | 从重 950 | 他 951 | 他人 952 | 他们 953 | 他的 954 | 代替 955 | 以 956 | 以上 957 | 以下 958 | 以为 959 | 以便 960 | 以免 961 | 以前 962 | 以及 963 | 以后 964 | 以外 965 | 以後 966 | 以来 967 | 以至 968 | 以至于 969 | 以致 970 | 们 971 | 任 972 | 任何 973 | 任凭 974 | 任务 975 | 企图 976 | 伙同 977 | 会 978 | 伟大 979 | 传 980 | 传说 981 | 传闻 982 | 似乎 983 | 似的 984 | 但 985 | 但愿 986 | 但是 987 | 何 988 | 何乐而不为 989 | 何况 990 | 何处 991 | 何妨 992 | 何尝 993 | 何必 994 | 何时 995 | 何止 996 | 何苦 997 | 何须 998 | 作为 999 | 你 1000 | 你们 1001 | 你的 1002 | 使得 1003 | 使用 1004 | 例如 1005 | 依 1006 | 依照 1007 | 依靠 1008 | 便 1009 | 促进 1010 | 保持 1011 | 保管 1012 | 保险 1013 | 俺 1014 | 俺们 1015 | 倍加 1016 | 倍感 1017 | 倒不如 1018 | 倒不如说 1019 | 倒是 1020 | 倘 1021 | 倘使 1022 | 倘或 1023 | 倘然 1024 | 倘若 1025 | 借 1026 | 借以 1027 | 借此 1028 | 假使 1029 | 假如 1030 | 假若 1031 | 偏偏 1032 | 做到 1033 | 偶尔 1034 | 偶而 1035 | 像 1036 | 允许 1037 | 充其极 1038 | 充其量 1039 | 充分 1040 | 先后 1041 | 先後 1042 | 先生 1043 | 光 1044 | 光是 1045 | 全力 1046 | 全年 1047 | 全然 1048 | 全身心 1049 | 全部 1050 | 全都 1051 | 全面 1052 | 八 1053 | 八成 1054 | 公然 1055 | 六 1056 | 兮 1057 | 共 1058 | 共同 1059 | 共总 1060 | 关于 1061 | 其 1062 | 其一 1063 | 其中 1064 | 其二 1065 | 其他 1066 | 其余 1067 | 其后 1068 | 其它 1069 | 其实 1070 | 其次 1071 | 具体 1072 | 具体地说 1073 | 具体来说 1074 | 具体说来 1075 | 具有 1076 | 内 1077 | 再者 1078 | 再说 1079 | 冒 1080 | 冲 1081 | 决不 1082 | 决定 1083 | 决非 1084 | 况且 1085 | 准备 1086 | 凑巧 1087 | 凝神 1088 | 几 1089 | 几乎 1090 | 几度 1091 | 几时 1092 | 几番 1093 | 几经 1094 | 凭 1095 | 凭借 1096 | 出 1097 | 出去 1098 | 出来 1099 | 出现 1100 | 分别 1101 | 分头 1102 | 分期 1103 | 分期分批 1104 | 切 1105 | 切不可 1106 | 切切 1107 | 切勿 1108 | 切莫 1109 | 则 1110 | 刚 1111 | 刚好 1112 | 刚巧 1113 | 刚才 1114 | 初 1115 | 别 1116 | 别人 1117 | 别的 1118 | 别说 1119 | 到 1120 | 到了儿 1121 | 到处 1122 | 到头 1123 | 到头来 1124 | 到底 1125 | 到目前为止 1126 | 前后 1127 | 前者 1128 | 前进 1129 | 前面 1130 | 加上 1131 | 加之 1132 | 加以 1133 | 加入 1134 | 加强 1135 | 动不动 1136 | 动辄 1137 | 勃然 1138 | 匆匆 1139 | 十分 1140 | 千 1141 | 千万 1142 | 千万千万 1143 | 半 1144 | 单 1145 | 单单 1146 | 单纯 1147 | 即 1148 | 即令 1149 | 即使 1150 | 即便 1151 | 即刻 1152 | 即将 1153 | 即或 1154 | 即是说 1155 | 即若 1156 | 却不 1157 | 历 1158 | 原来 1159 | 去 1160 | 又 1161 | 及 1162 | 及其 1163 | 及时 1164 | 及至 1165 | 双方 1166 | 反之 1167 | 反之亦然 1168 | 反之则 1169 | 反倒 1170 | 反倒是 1171 | 反应 1172 | 反手 1173 | 反映 1174 | 反而 1175 | 反过来 1176 | 反过来说 1177 | 取得 1178 | 取道 1179 | 受到 1180 | 变成 1181 | 古来 1182 | 另 1183 | 另一个 1184 | 另一方面 1185 | 另外 1186 | 另方面 1187 | 另行 1188 | 只是 1189 | 只有 1190 | 只要 1191 | 只限 1192 | 叫 1193 | 叫做 1194 | 召开 1195 | 叮咚 1196 | 叮当 1197 | 可 1198 | 可以 1199 | 可好 1200 | 可是 1201 | 可能 1202 | 可见 1203 | 各 1204 | 各个 1205 | 各人 1206 | 各位 1207 | 各地 1208 | 各式 1209 | 各种 1210 | 各级 1211 | 各自 1212 | 合理 1213 | 同 1214 | 同一 1215 | 同时 1216 | 同样 1217 | 后来 1218 | 后面 1219 | 向 1220 | 向着 1221 | 吓 1222 | 吗 1223 | 否则 1224 | 吧 1225 | 吧哒 1226 | 吱 1227 | 呀 1228 | 呃 1229 | 呆呆地 1230 | 呐 1231 | 呕 1232 | 呗 1233 | 呜 1234 | 呜呼 1235 | 呢 1236 | 周围 1237 | 呵 1238 | 呸 1239 | 呼哧 1240 | 呼啦 1241 | 咋 1242 | 和 1243 | 咚 1244 | 咦 1245 | 咱 1246 | 咱们 1247 | 咳 1248 | 哇 1249 | 哈 1250 | 哈哈 1251 | 哉 1252 | 哎 1253 | 哎呀 1254 | 哎哟 1255 | 哗 1256 | 哗啦 1257 | 哟 1258 | 哦 1259 | 哩 1260 | 哪 1261 | 哪个 1262 | 哪些 1263 | 哪儿 1264 | 哪天 1265 | 哪年 1266 | 哪怕 1267 | 哪样 1268 | 哪边 1269 | 哪里 1270 | 哼 1271 | 哼唷 1272 | 唉 1273 | 啊 1274 | 啊呀 1275 | 啊哈 1276 | 啊哟 1277 | 啐 1278 | 啥 1279 | 啦 1280 | 啪达 1281 | 喀 1282 | 喂 1283 | 喏 1284 | 喔唷 1285 | 嗡嗡 1286 | 嗬 1287 | 嗯 1288 | 嗳 1289 | 嘎 1290 | 嘎嘎 1291 | 嘎登 1292 | 嘘 1293 | 嘛 1294 | 嘻 1295 | 嘿 1296 | 四 1297 | 因 1298 | 因为 1299 | 因此 1300 | 因而 1301 | 固 1302 | 固然 1303 | 在 1304 | 在下 
1305 | 地 1306 | 均 1307 | 坚决 1308 | 坚持 1309 | 基于 1310 | 基本 1311 | 基本上 1312 | 处处 1313 | 处理 1314 | 复杂 1315 | 多 1316 | 多亏 1317 | 多多 1318 | 多多少少 1319 | 多多益善 1320 | 多少 1321 | 多年前 1322 | 多年来 1323 | 多数 1324 | 多次 1325 | 够瞧的 1326 | 大 1327 | 大不了 1328 | 大举 1329 | 大事 1330 | 大体 1331 | 大体上 1332 | 大凡 1333 | 大力 1334 | 大多 1335 | 大多数 1336 | 大大 1337 | 大家 1338 | 大张旗鼓 1339 | 大批 1340 | 大抵 1341 | 大概 1342 | 大略 1343 | 大约 1344 | 大致 1345 | 大都 1346 | 大量 1347 | 大面儿上 1348 | 失去 1349 | 奇 1350 | 奈 1351 | 奋勇 1352 | 她 1353 | 她们 1354 | 她的 1355 | 好在 1356 | 好的 1357 | 好象 1358 | 如 1359 | 如上 1360 | 如上所述 1361 | 如下 1362 | 如今 1363 | 如何 1364 | 如其 1365 | 如前所述 1366 | 如常 1367 | 如期 1368 | 如果 1369 | 如次 1370 | 如此 1371 | 如此等等 1372 | 如若 1373 | 姑且 1374 | 存在 1375 | 存心 1376 | 宁 1377 | 宁可 1378 | 宁愿 1379 | 宁肯 1380 | 它 1381 | 它们 1382 | 它们的 1383 | 它的 1384 | 安全 1385 | 完全 1386 | 完成 1387 | 定 1388 | 实现 1389 | 实际 1390 | 宣布 1391 | 容易 1392 | 密切 1393 | 对 1394 | 对于 1395 | 对应 1396 | 将 1397 | 将才 1398 | 将要 1399 | 将近 1400 | 少数 1401 | 尔后 1402 | 尔等 1403 | 尚且 1404 | 尤其 1405 | 就 1406 | 就地 1407 | 就是 1408 | 就是说 1409 | 就此 1410 | 就算 1411 | 尽 1412 | 尽可能 1413 | 尽如人意 1414 | 尽心尽力 1415 | 尽心竭力 1416 | 尽快 1417 | 尽早 1418 | 尽然 1419 | 尽管 1420 | 尽管如此 1421 | 尽量 1422 | 局外 1423 | 居然 1424 | 届时 1425 | 属于 1426 | 屡 1427 | 屡屡 1428 | 屡次 1429 | 屡次三番 1430 | 岂 1431 | 岂但 1432 | 岂止 1433 | 岂非 1434 | 川流不息 1435 | 左右 1436 | 巨大 1437 | 巩固 1438 | 差一点 1439 | 差不多 1440 | 己 1441 | 已经 1442 | 带 1443 | 帮助 1444 | 常 1445 | 常常 1446 | 常言说 1447 | 常言说得好 1448 | 常言道 1449 | 平素 1450 | 年复一年 1451 | 并 1452 | 并不 1453 | 并不是 1454 | 并且 1455 | 并排 1456 | 并无 1457 | 并没 1458 | 并没有 1459 | 并肩 1460 | 并非 1461 | 广大 1462 | 广泛 1463 | 应当 1464 | 应用 1465 | 应该 1466 | 开外 1467 | 开始 1468 | 开展 1469 | 引起 1470 | 弗 1471 | 弹指之间 1472 | 强烈 1473 | 强调 1474 | 归 1475 | 归根到底 1476 | 归根结底 1477 | 当 1478 | 当下 1479 | 当中 1480 | 当儿 1481 | 当前 1482 | 当即 1483 | 当口儿 1484 | 当场 1485 | 当头 1486 | 当庭 1487 | 当时 1488 | 当然 1489 | 当真 1490 | 当着 1491 | 形成 1492 | 彻夜 1493 | 彻底 1494 | 彼 1495 | 彼此 1496 | 往 1497 | 往往 1498 | 待 1499 | 待到 1500 | 很 1501 | 很多 1502 | 很少 1503 | 後来 1504 | 後面 1505 | 得 1506 | 得出 1507 | 得到 1508 | 得天独厚 1509 | 得起 1510 | 心里 1511 | 必 1512 | 必定 1513 | 必将 1514 | 必然 1515 | 必要 1516 | 必须 1517 | 快 1518 | 快要 1519 | 忽地 1520 | 忽然 1521 | 怎 1522 | 怎么 1523 | 怎么办 1524 | 怎么样 1525 | 怎样 1526 | 怎麽 1527 | 怕 1528 | 急匆匆 1529 | 怪 1530 | 怪不得 1531 | 总之 1532 | 总是 1533 | 总的来看 1534 | 总的来说 1535 | 总的说来 1536 | 总结 1537 | 总而言之 1538 | 恍然 1539 | 恐怕 1540 | 恰似 1541 | 恰好 1542 | 恰如 1543 | 恰巧 1544 | 恰恰 1545 | 恰恰相反 1546 | 恰逢 1547 | 您 1548 | 惯常 1549 | 意思 1550 | 愤然 1551 | 愿意 1552 | 慢说 1553 | 成为 1554 | 成年 1555 | 成年累月 1556 | 成心 1557 | 我 1558 | 我们 1559 | 我的 1560 | 或 1561 | 或多或少 1562 | 或是 1563 | 或者 1564 | 或许 1565 | 战斗 1566 | 截然 1567 | 截至 1568 | 所 1569 | 所以 1570 | 所有 1571 | 所谓 1572 | 才 1573 | 才能 1574 | 扑通 1575 | 打 1576 | 打从 1577 | 打开天窗说亮话 1578 | 扩大 1579 | 把 1580 | 抑或 1581 | 抽冷子 1582 | 拦腰 1583 | 拿 1584 | 按 1585 | 按时 1586 | 按期 1587 | 按照 1588 | 按理 1589 | 按说 1590 | 挨个 1591 | 挨家挨户 1592 | 挨次 1593 | 挨着 1594 | 挨门挨户 1595 | 挨门逐户 1596 | 换句话说 1597 | 换言之 1598 | 据 1599 | 据实 1600 | 据悉 1601 | 据我所知 1602 | 据此 1603 | 据称 1604 | 据说 1605 | 掌握 1606 | 接下来 1607 | 接着 1608 | 接著 1609 | 接连不断 1610 | 放量 1611 | 故 1612 | 故意 1613 | 故此 1614 | 故而 1615 | 敞开儿 1616 | 敢 1617 | 敢于 1618 | 敢情 1619 | 整个 1620 | 断然 1621 | 方 1622 | 方便 1623 | 方才 1624 | 方能 1625 | 方面 1626 | 旁人 1627 | 无宁 1628 | 无法 1629 | 无论 1630 | 既 1631 | 既是 1632 | 既然 1633 | 日复一日 1634 | 日渐 1635 | 日益 1636 | 日臻 1637 | 日见 1638 | 时候 1639 | 昂然 1640 | 明显 1641 | 明确 1642 | 是 1643 | 是不是 1644 | 是否 1645 | 是的 1646 | 显然 1647 | 显著 1648 | 普通 1649 | 普遍 1650 | 暗中 1651 | 暗地里 1652 | 暗自 1653 | 更 1654 | 更为 1655 
| 更加 1656 | 更进一步 1657 | 曾经 1658 | 替 1659 | 最后 1660 | 最大 1661 | 最好 1662 | 最後 1663 | 最近 1664 | 最高 1665 | 有 1666 | 有些 1667 | 有关 1668 | 有利 1669 | 有力 1670 | 有所 1671 | 有效 1672 | 有时 1673 | 有点 1674 | 有的 1675 | 有着 1676 | 有著 1677 | 望 1678 | 朝 1679 | 朝着 1680 | 末##末 1681 | 本 1682 | 本人 1683 | 本着 1684 | 本身 1685 | 权时 1686 | 来 1687 | 来不及 1688 | 来得及 1689 | 来看 1690 | 来着 1691 | 来讲 1692 | 极 1693 | 极为 1694 | 极了 1695 | 极其 1696 | 极力 1697 | 极大 1698 | 极度 1699 | 极端 1700 | 构成 1701 | 果然 1702 | 果真 1703 | 某 1704 | 某个 1705 | 某些 1706 | 根据 1707 | 根本 1708 | 格外 1709 | 梆 1710 | 概 1711 | 次第 1712 | 欢迎 1713 | 正在 1714 | 正如 1715 | 正常 1716 | 此 1717 | 此中 1718 | 此后 1719 | 此外 1720 | 此时 1721 | 此间 1722 | 殆 1723 | 毋宁 1724 | 每 1725 | 每个 1726 | 每天 1727 | 每年 1728 | 每当 1729 | 每时每刻 1730 | 每每 1731 | 每逢 1732 | 比 1733 | 比如 1734 | 比如说 1735 | 比方 1736 | 比照 1737 | 比起 1738 | 比较 1739 | 毕竟 1740 | 毫不 1741 | 毫无 1742 | 毫无例外 1743 | 毫无保留地 1744 | 汝 1745 | 沙沙 1746 | 没 1747 | 没有 1748 | 沿 1749 | 沿着 1750 | 注意 1751 | 活 1752 | 深入 1753 | 清楚 1754 | 满 1755 | 满足 1756 | 漫说 1757 | 焉 1758 | 然 1759 | 然则 1760 | 然后 1761 | 然後 1762 | 然而 1763 | 照 1764 | 照着 1765 | 牢牢 1766 | 特别是 1767 | 特殊 1768 | 特点 1769 | 独 1770 | 独自 1771 | 猛然 1772 | 猛然间 1773 | 率尔 1774 | 率然 1775 | 现代 1776 | 现在 1777 | 理应 1778 | 理当 1779 | 理该 1780 | 瑟瑟 1781 | 甚么 1782 | 甚而 1783 | 甚至 1784 | 用 1785 | 甫 1786 | 甭 1787 | 由 1788 | 由于 1789 | 由此可见 1790 | 略 1791 | 略为 1792 | 略加 1793 | 略微 1794 | 白 1795 | 白白 1796 | 的 1797 | 的确 1798 | 的话 1799 | 皆可 1800 | 目前 1801 | 直到 1802 | 直接 1803 | 相似 1804 | 相信 1805 | 相反 1806 | 相同 1807 | 相对 1808 | 相对而言 1809 | 相应 1810 | 相当 1811 | 相等 1812 | 省得 1813 | 看 1814 | 看上去 1815 | 看出 1816 | 看到 1817 | 看来 1818 | 看样子 1819 | 看看 1820 | 看见 1821 | 看起来 1822 | 真是 1823 | 真正 1824 | 着 1825 | 着呢 1826 | 矣 1827 | 知道 1828 | 砰 1829 | 确定 1830 | 碰巧 1831 | 社会主义 1832 | 离 1833 | 积极 1834 | 移动 1835 | 究竟 1836 | 穷年累月 1837 | 突出 1838 | 突然 1839 | 窃 1840 | 立 1841 | 立刻 1842 | 立即 1843 | 立地 1844 | 立时 1845 | 立马 1846 | 竟 1847 | 竟然 1848 | 第 1849 | 等 1850 | 等到 1851 | 等等 1852 | 策略地 1853 | 简直 1854 | 简而言之 1855 | 简言之 1856 | 管 1857 | 粗 1858 | 精光 1859 | 紧接着 1860 | 累年 1861 | 累次 1862 | 纯 1863 | 纯粹 1864 | 纵 1865 | 纵令 1866 | 纵使 1867 | 纵然 1868 | 练习 1869 | 组成 1870 | 经 1871 | 经常 1872 | 经过 1873 | 结合 1874 | 结果 1875 | 给 1876 | 绝 1877 | 绝不 1878 | 绝对 1879 | 绝非 1880 | 绝顶 1881 | 继之 1882 | 继续 1883 | 继而 1884 | 维持 1885 | 综上所述 1886 | 缕缕 1887 | 罢了 1888 | 老 1889 | 老大 1890 | 老是 1891 | 老老实实 1892 | 考虑 1893 | 者 1894 | 而 1895 | 而且 1896 | 而况 1897 | 而又 1898 | 而后 1899 | 而外 1900 | 而已 1901 | 而是 1902 | 而言 1903 | 而论 1904 | 联系 1905 | 联袂 1906 | 背地里 1907 | 背靠背 1908 | 能 1909 | 能否 1910 | 能够 1911 | 腾 1912 | 自 1913 | 自个儿 1914 | 自从 1915 | 自各儿 1916 | 自家 1917 | 自己 1918 | 自身 1919 | 臭 1920 | 至 1921 | 至于 1922 | 良好 1923 | 若 1924 | 若是 1925 | 若非 1926 | 范围 1927 | 莫 1928 | 莫不 1929 | 莫如 1930 | 莫若 1931 | 莫非 1932 | 获得 1933 | 藉以 1934 | 虽 1935 | 虽则 1936 | 虽然 1937 | 虽说 1938 | 蛮 1939 | 行为 1940 | 行动 1941 | 表明 1942 | 表示 1943 | 被 1944 | 要 1945 | 要不 1946 | 要不是 1947 | 要不然 1948 | 要么 1949 | 要是 1950 | 要求 1951 | 见 1952 | 规定 1953 | 觉得 1954 | 譬如 1955 | 认为 1956 | 认真 1957 | 认识 1958 | 让 1959 | 许多 1960 | 论 1961 | 论说 1962 | 设使 1963 | 设若 1964 | 诚然 1965 | 话说 1966 | 该 1967 | 该当 1968 | 说明 1969 | 说说 1970 | 请勿 1971 | 诸位 1972 | 谁 1973 | 谁知 1974 | 谨 1975 | 豁然 1976 | 赶 1977 | 赶快 1978 | 赶早不赶晚 1979 | 起 1980 | 起先 1981 | 起初 1982 | 起头 1983 | 起来 1984 | 起见 1985 | 起首 1986 | 趁 1987 | 趁便 1988 | 趁势 1989 | 趁早 1990 | 趁机 1991 | 趁热 1992 | 趁着 1993 | 越是 1994 | 跟 1995 | 路经 1996 | 转动 1997 | 转变 1998 | 转贴 1999 | 轰然 2000 | 较 2001 | 较为 2002 | 较之 2003 | 较比 2004 | 边 2005 | 达到 2006 | 达旦 2007 | 迄 2008 | 迅速 2009 | 过 2010 | 过于 2011 | 过去 2012 | 过来 2013 | 运用 
2014 | 近 2015 | 近几年来 2016 | 近年来 2017 | 近来 2018 | 还 2019 | 还是 2020 | 还有 2021 | 这 2022 | 这个 2023 | 这么 2024 | 这么些 2025 | 这么样 2026 | 这么点儿 2027 | 这些 2028 | 这会儿 2029 | 这儿 2030 | 这就是说 2031 | 这时 2032 | 这样 2033 | 这点 2034 | 这种 2035 | 这边 2036 | 这里 2037 | 这麽 2038 | 进入 2039 | 进去 2040 | 进来 2041 | 进步 2042 | 进而 2043 | 进行 2044 | 连 2045 | 连同 2046 | 连声 2047 | 连日 2048 | 连日来 2049 | 连袂 2050 | 连连 2051 | 迟早 2052 | 迫于 2053 | 适应 2054 | 适当 2055 | 适用 2056 | 逐步 2057 | 逐渐 2058 | 通常 2059 | 通过 2060 | 造成 2061 | 逢 2062 | 遇到 2063 | 遭到 2064 | 遵照 2065 | 避免 2066 | 那 2067 | 那个 2068 | 那么 2069 | 那么些 2070 | 那么样 2071 | 那些 2072 | 那会儿 2073 | 那儿 2074 | 那时 2075 | 那末 2076 | 那样 2077 | 那边 2078 | 那里 2079 | 那麽 2080 | 部分 2081 | 都 2082 | 鄙人 2083 | 采取 2084 | 里面 2085 | 重大 2086 | 重新 2087 | 重要 2088 | 鉴于 2089 | 长期以来 2090 | 长此下去 2091 | 长线 2092 | 长话短说 2093 | 问题 2094 | 间或 2095 | 防止 2096 | 阿 2097 | 附近 2098 | 陈年 2099 | 限制 2100 | 陡然 2101 | 除 2102 | 除了 2103 | 除却 2104 | 除去 2105 | 除外 2106 | 除开 2107 | 除此 2108 | 除此之外 2109 | 除此以外 2110 | 除此而外 2111 | 除非 2112 | 随 2113 | 随着 2114 | 随著 2115 | 隔夜 2116 | 隔日 2117 | 难得 2118 | 难怪 2119 | 难说 2120 | 难道 2121 | 集中 2122 | 零 2123 | 需要 2124 | 非但 2125 | 非常 2126 | 非徒 2127 | 非得 2128 | 靠 2129 | 顶多 2130 | 顷 2131 | 顷刻 2132 | 顷刻之间 2133 | 顷刻间 2134 | 顺 2135 | 顺着 2136 | 顿时 2137 | 颇 2138 | 风雨无阻 2139 | 饱 2140 | 首先 2141 | 马上 2142 | 高低 2143 | 高兴 2144 | 默然 2145 | 默默地 2146 | 齐 2147 | ! 2148 | # 2149 | $ 2150 | % 2151 | & 2152 | ( 2153 | ) 2154 | * 2155 | + 2156 | , 2157 | 0 2158 | 1 2159 | 2 2160 | 3 2161 | 4 2162 | 5 2163 | 6 2164 | 7 2165 | 8 2166 | 9 2167 | : 2168 | ; 2169 | < 2170 | > 2171 | ? 2172 | @ 2173 | [ 2174 | ] 2175 | { 2176 | | 2177 | } 2178 | ~ 2179 | ¥ 2180 | -------------------------------------------------------------------------------- /Word2vec/train_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Author:chenbingjin 4 | # Date:2016-05-17 5 | # Train word2vec 6 | make 7 | if [ ! -e train.txt ]; then 8 | printf "Train file 'train.txt' is acquired." 9 | else 10 | time ./word2vec_transE -train train.txt -triplet triplet.txt -output vectors200_transE.bin -cbow 1 -size 200 -window 5 -negative 5 -hs 0 -sample 1e-4 -threads 20 -binary 1 -save-vocab vocab.txt -iter 15 11 | fi 12 | -------------------------------------------------------------------------------- /Word2vec/word2vec.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // Modified from https://code.google.com/p/word2vec/ 16 | // @chenbingjin 2016-05-16 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #define MAX_STRING 100 25 | #define EXP_TABLE_SIZE 1000 26 | #define MAX_EXP 6 27 | #define MAX_SENTENCE_LENGTH 1000 28 | #define MAX_CODE_LENGTH 40 29 | 30 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 31 | 32 | typedef float real; // Precision of float numbers 33 | 34 | struct vocab_word { 35 | long long cn; //词频 36 | int *point; //huffman编码对应内节点的路径 37 | char *word, *code, codelen; //(词,对应huffman编码,编码长度) 38 | }; 39 | 40 | char train_file[MAX_STRING], output_file[MAX_STRING]; 41 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 42 | struct vocab_word *vocab; //词汇表 43 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1; 44 | int *vocab_hash; //词汇哈希表,便于快速查找,存储每个词在词汇表的索引位置。 45 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100; //向量维度 46 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0; 47 | real alpha = 0.025, starting_alpha, sample = 1e-3; 48 | //分别对应:词的向量,内节点的向量,负采样词的向量,sigmoid函数的近似计算表 49 | real *syn0, *syn1, *syn1neg, *expTable; 50 | 51 | clock_t start; 52 | // hierarchical softmax 或者NEG 53 | int hs = 0, negative = 5; 54 | const int table_size = 1e8; 55 | int *table; 56 | 57 | //负采样算法:带权采样思想。每个词的权重为l(w) = [counter(w)]^(3/4) / sum([counter(u)]^(3/4)),u属于词典D 58 | // 每个词对应一个线段, 将[0,1]等距离划分成10^8,每次生成一个随机整数r,Table[r]就是一个样本。 59 | void InitUnigramTable() { 60 | int a, i; 61 | double train_words_pow = 0; 62 | double d1, power = 0.75; 63 | table = (int *)malloc(table_size * sizeof(int)); 64 | // 遍历词表,统计总权重 65 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 66 | i = 0; 67 | d1 = pow(vocab[i].cn, power) / train_words_pow; 68 | // 遍历词表,为每个词分配table空间 69 | for (a = 0; a < table_size; a++) { 70 | table[a] = i; 71 | if (a / (double)table_size > d1) { 72 | i++; 73 | d1 += pow(vocab[i].cn, power) / train_words_pow; 74 | } 75 | if (i >= vocab_size) i = vocab_size - 1; 76 | } 77 | } 78 | 79 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 80 | // 从文件中读取一个词 81 | void ReadWord(char *word, FILE *fin) { 82 | int a = 0, ch; 83 | while (!feof(fin)) { 84 | ch = fgetc(fin); //读取一个字符 85 | if (ch == 13) continue; //回车符 86 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 87 | if (a > 0) { 88 | if (ch == '\n') ungetc(ch, fin); //退回一个字符,文件指针左移一位 89 | break; 90 | } 91 | if (ch == '\n') { 92 | strcpy(word, (char *)""); 93 | return; 94 | } else continue; 95 | } 96 | word[a] = ch; 97 | a++; 98 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 99 | } 100 | word[a] = 0; 101 | } 102 | 103 | // Returns hash value of a word 104 | // 返回词的hash值 105 | int GetWordHash(char *word) { 106 | unsigned long long a, hash = 0; 107 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; //257进制,计算词的hash值 108 | hash = hash % vocab_hash_size; 109 | return hash; 110 | } 111 | 112 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 113 | // 返回词在词表中的索引位置,找不到返回-1. 
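// (lookup mirrors insertion: linear probing from GetWordHash(word), stepping (hash + 1) % vocab_hash_size
//  until an empty slot (-1) means "not found" or the stored word matches the query)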
114 | int SearchVocab(char *word) { 115 | unsigned int hash = GetWordHash(word); 116 | while (1) { 117 | if (vocab_hash[hash] == -1) return -1; 118 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 119 | hash = (hash + 1) % vocab_hash_size; 120 | } 121 | return -1; 122 | } 123 | 124 | // Reads a word and returns its index in the vocabulary 125 | // 从文件读一个词,返回词在词汇表的索引位置 126 | int ReadWordIndex(FILE *fin) { 127 | char word[MAX_STRING]; 128 | ReadWord(word, fin); 129 | if (feof(fin)) return -1; 130 | return SearchVocab(word); 131 | } 132 | 133 | // Adds a word to the vocabulary 134 | // 将词添加到词汇表 135 | int AddWordToVocab(char *word) { 136 | unsigned int hash, length = strlen(word) + 1; 137 | if (length > MAX_STRING) length = MAX_STRING; //词的长度不能超MAX_STRING 138 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 139 | strcpy(vocab[vocab_size].word, word); 140 | vocab[vocab_size].cn = 0; //初始词频为0 141 | vocab_size++; 142 | // Reallocate memory if needed 143 | if (vocab_size + 2 >= vocab_max_size) { 144 | vocab_max_size += 1000; 145 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 146 | } 147 | hash = GetWordHash(word); 148 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; //如果hash值冲突,采用线性探测的开放定址法,顺序向下查找 149 | vocab_hash[hash] = vocab_size - 1; 150 | return vocab_size - 1; 151 | } 152 | 153 | // Used later for sorting by word counts 154 | // 词表排序的比较算法cmp:根据词频排序,降序 155 | int VocabCompare(const void *a, const void *b) { 156 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 157 | } 158 | 159 | // Sorts the vocabulary by frequency using word counts 160 | // 根据词频排序 161 | void SortVocab() { 162 | int a, size; 163 | unsigned int hash; 164 | // Sort the vocabulary and keep at the first position 165 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 166 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 167 | size = vocab_size; 168 | train_words = 0; 169 | for (a = 0; a < size; a++) { 170 | // Words occuring less than min_count times will be discarded from the vocab 171 | // 出现次数太少的词直接丢弃,min_count 默认5 172 | if ((vocab[a].cn < min_count) && (a != 0)) { 173 | vocab_size--; 174 | free(vocab[a].word); 175 | } else { 176 | // Hash will be re-computed, as after the sorting it is not actual 177 | // 重新计算hash值 178 | hash=GetWordHash(vocab[a].word); 179 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 180 | vocab_hash[hash] = a; 181 | train_words += vocab[a].cn; //总词频 182 | } 183 | } 184 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 185 | // Allocate memory for the binary tree construction 186 | for (a = 0; a < vocab_size; a++) { 187 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 188 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 189 | } 190 | } 191 | 192 | // Reduces the vocabulary by removing infrequent tokens 193 | // 缩小词汇表,移除词频过小的词 194 | void ReduceVocab() { 195 | int a, b = 0; 196 | unsigned int hash; 197 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 198 | vocab[b].cn = vocab[a].cn; 199 | vocab[b].word = vocab[a].word; 200 | b++; 201 | } else free(vocab[a].word); 202 | vocab_size = b; 203 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 204 | for (a = 0; a < vocab_size; a++) { 205 | // Hash will be re-computed, as it is not actual 206 | hash = GetWordHash(vocab[a].word); 207 | while (vocab_hash[hash] != -1) hash = (hash + 1) 
% vocab_hash_size; 208 | vocab_hash[hash] = a; 209 | } 210 | fflush(stdout); 211 | min_reduce++; 212 | } 213 | 214 | // Create binary Huffman tree using the word counts 215 | // Frequent words will have short uniqe binary codes 216 | // 根据词频构建huffman树,词频越大编码越短 217 | void CreateBinaryTree() { 218 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 219 | char code[MAX_CODE_LENGTH]; 220 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 221 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 222 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 223 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 224 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 225 | pos1 = vocab_size - 1; 226 | pos2 = vocab_size; 227 | // Following algorithm constructs the Huffman tree by adding one node at a time 228 | for (a = 0; a < vocab_size - 1; a++) { 229 | // First, find two smallest nodes 'min1, min2' 230 | if (pos1 >= 0) { //找第一小 231 | if (count[pos1] < count[pos2]) { 232 | min1i = pos1; 233 | pos1--; 234 | } else { 235 | min1i = pos2; 236 | pos2++; 237 | } 238 | } else { 239 | min1i = pos2; 240 | pos2++; 241 | } 242 | if (pos1 >= 0) { //找第二小 243 | if (count[pos1] < count[pos2]) { 244 | min2i = pos1; 245 | pos1--; 246 | } else { 247 | min2i = pos2; 248 | pos2++; 249 | } 250 | } else { 251 | min2i = pos2; 252 | pos2++; 253 | } 254 | count[vocab_size + a] = count[min1i] + count[min2i]; 255 | parent_node[min1i] = vocab_size + a; 256 | parent_node[min2i] = vocab_size + a; 257 | binary[min2i] = 1; 258 | } 259 | // Now assign binary code to each vocabulary word 260 | for (a = 0; a < vocab_size; a++) { 261 | b = a; 262 | i = 0; 263 | while (1) { 264 | code[i] = binary[b]; 265 | point[i] = b; 266 | i++; 267 | b = parent_node[b]; 268 | if (b == vocab_size * 2 - 2) break; 269 | } 270 | vocab[a].codelen = i; // 编码长度 271 | vocab[a].point[0] = vocab_size - 2; //? 
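// (answering the "?" above: internal Huffman nodes occupy indices vocab_size..2*vocab_size-2 in count[],
//  and point[] stores index - vocab_size, so the root node 2*vocab_size-2 is recorded here as vocab_size-2)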
272 | for (b = 0; b < i; b++) { 273 | vocab[a].code[i - b - 1] = code[b]; 274 | vocab[a].point[i - b] = point[b] - vocab_size; 275 | } 276 | } 277 | free(count); 278 | free(binary); 279 | free(parent_node); 280 | } 281 | // 从训练文件中统计每个词的词频 282 | void LearnVocabFromTrainFile() { 283 | char word[MAX_STRING]; 284 | FILE *fin; 285 | long long a, i; 286 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 287 | fin = fopen(train_file, "rb"); 288 | if (fin == NULL) { 289 | printf("ERROR: training data file not found!\n"); 290 | exit(1); 291 | } 292 | vocab_size = 0; 293 | AddWordToVocab((char *)""); 294 | while (1) { 295 | ReadWord(word, fin); 296 | if (feof(fin)) break; 297 | train_words++; 298 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 299 | printf("%lldK%c", train_words / 1000, 13); 300 | fflush(stdout); 301 | } 302 | i = SearchVocab(word); 303 | if (i == -1) { 304 | a = AddWordToVocab(word); 305 | vocab[a].cn = 1; 306 | } else vocab[i].cn++; 307 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); //如果词汇过多,先删除低频词 308 | } 309 | SortVocab(); 310 | if (debug_mode > 0) { 311 | printf("Vocab size: %lld\n", vocab_size); 312 | printf("Words in train file: %lld\n", train_words); 313 | } 314 | file_size = ftell(fin); 315 | fclose(fin); 316 | } 317 | // 保存词汇表 318 | void SaveVocab() { 319 | long long i; 320 | FILE *fo = fopen(save_vocab_file, "wb"); 321 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 322 | fclose(fo); 323 | } 324 | //从文件读取词汇,该文件已经统计好每个词的词频 325 | void ReadVocab() { 326 | long long a, i = 0; 327 | char c; 328 | char word[MAX_STRING]; 329 | FILE *fin = fopen(read_vocab_file, "rb"); 330 | if (fin == NULL) { 331 | printf("Vocabulary file not found\n"); 332 | exit(1); 333 | } 334 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 335 | vocab_size = 0; 336 | while (1) { 337 | ReadWord(word, fin); 338 | if (feof(fin)) break; 339 | a = AddWordToVocab(word); 340 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); //读取词频,换行符 341 | i++; 342 | } 343 | SortVocab(); 344 | if (debug_mode > 0) { 345 | printf("Vocab size: %lld\n", vocab_size); 346 | printf("Words in train file: %lld\n", train_words); 347 | } 348 | fin = fopen(train_file, "rb"); 349 | if (fin == NULL) { 350 | printf("ERROR: training data file not found!\n"); 351 | exit(1); 352 | } 353 | fseek(fin, 0, SEEK_END); 354 | file_size = ftell(fin); 355 | fclose(fin); 356 | } 357 | // 初始化网络结构 358 | void InitNet() { 359 | long long a, b; 360 | unsigned long long next_random = 1; 361 | // 分配词的向量内存,地址是128的倍数 362 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)); 363 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} 364 | if (hs) { 365 | // 分配huffman内部节点内存 366 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); 367 | if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} 368 | // 初始化为0向量 369 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 370 | syn1[a * layer1_size + b] = 0; 371 | } 372 | if (negative>0) { 373 | // 分配负样本词的向量空间 374 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)); 375 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 376 | // 初始化为0向量 377 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 378 | syn1neg[a * layer1_size + b] = 0; 379 | } 380 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) { 381 | next_random = next_random * 
(unsigned long long)25214903917 + 11; 382 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 383 | } 384 | CreateBinaryTree(); 385 | } 386 | // 训练模型线程:训练过程 387 | void *TrainModelThread(void *id) { 388 | long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0; 389 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 390 | long long l1, l2, c, target, label, local_iter = iter; 391 | unsigned long long next_random = (long long)id; 392 | real f, g; 393 | clock_t now; 394 | real *neu1 = (real *)calloc(layer1_size, sizeof(real)); //对应Xw 395 | real *neu1e = (real *)calloc(layer1_size, sizeof(real)); //对应error累加量 396 | FILE *fi = fopen(train_file, "rb"); 397 | //每个线程对应一段文本 398 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 399 | while (1) { 400 | if (word_count - last_word_count > 10000) { 401 | word_count_actual += word_count - last_word_count; 402 | last_word_count = word_count; 403 | if ((debug_mode > 1)) { 404 | now=clock(); 405 | printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 406 | word_count_actual / (real)(iter * train_words + 1) * 100, 407 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 408 | fflush(stdout); 409 | } 410 | alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 411 | if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 412 | } 413 | if (sentence_length == 0) { 414 | while (1) { 415 | word = ReadWordIndex(fi); //读一个词,返回其在词汇表的索引位置 416 | if (feof(fi)) break; 417 | if (word == -1) continue; 418 | word_count++; 419 | if (word == 0) break; 420 | // The subsampling randomly discards frequent words while keeping the ranking same 421 | // 对高频词进行下采样,以概率p丢弃。p = 1-[sqrt(t/f(w))+t/f(w)].但仍保持排序不变 422 | // 先计算ran = sqrt(t/f(w))+t/f(w),产生(0,1)上的随机数r,如果r>ran,则丢弃。 423 | if (sample > 0) { 424 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 425 | next_random = next_random * (unsigned long long)25214903917 + 11; 426 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 427 | } 428 | sen[sentence_length] = word; 429 | sentence_length++; 430 | // 将1000个词当成一个句子 431 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 432 | } 433 | sentence_position = 0; 434 | } 435 | // 当前线程处理单词数超过阈值 436 | if (feof(fi) || (word_count > train_words / num_threads)) { 437 | word_count_actual += word_count - last_word_count; 438 | local_iter--; 439 | if (local_iter == 0) break; 440 | word_count = 0; 441 | last_word_count = 0; 442 | sentence_length = 0; 443 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 444 | continue; 445 | } 446 | word = sen[sentence_position]; 447 | if (word == -1) continue; 448 | for (c = 0; c < layer1_size; c++) neu1[c] = 0; 449 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 450 | next_random = next_random * (unsigned long long)25214903917 + 11; 451 | // 随机产生0-5的窗口大小 452 | b = next_random % window; 453 | if (cbow) { //train the cbow architecture 454 | // in -> hidden 455 | cw = 0; 456 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 457 | c = sentence_position - window + a; 458 | if (c < 0) continue; 459 | if (c >= sentence_length) continue; 460 | last_word = sen[c]; 461 | if (last_word == -1) continue; 462 | // 上下文词进行向量加和,得到Xw 463 | for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size]; 464 | cw++; 465 | } 466 | if (cw) { 467 | // average 
向量和取平均 468 | for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 469 | // hs,采用huffman 470 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 471 | f = 0; 472 | l2 = vocab[word].point[d] * layer1_size; //路径的内部节点 473 | // Propagate hidden -> output 474 | // 隐藏层到输出层,计算误差梯度 475 | // neu1 对应 Xw, syn1对应内部节点的向量0 476 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; //计算内积 477 | if (f <= -MAX_EXP) continue; 478 | else if (f >= MAX_EXP) continue; 479 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];//sigmoid 480 | // 'g' is the gradient multiplied by the learning rate 481 | // 内部节点0的梯度(1-d-sigmoid(Xw·0))Xw,g为前面部分 482 | g = (1 - vocab[word].code[d] - f) * alpha; 483 | 484 | // Propagate errors output -> hidden 485 | // 反向传播误差,从huffman树传到隐藏层 486 | // 累加的梯度更新量 487 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 488 | // Learn weights hidden -> output 489 | // 内部节点更新向量 490 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 491 | } 492 | // NEGATIVE SAMPLING 493 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 494 | if (d == 0) { 495 | target = word; //目标词 496 | label = 1; //正样本 497 | } else {//采样负样本 498 | next_random = next_random * (unsigned long long)25214903917 + 11; 499 | target = table[(next_random >> 16) % table_size]; 500 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 501 | if (target == word) continue; 502 | label = 0; 503 | } 504 | l2 = target * layer1_size; 505 | f = 0; 506 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; //内积 507 | if (f > MAX_EXP) g = (label - 1) * alpha; 508 | else if (f < -MAX_EXP) g = (label - 0) * alpha; 509 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; //sigmoid 510 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; //累积误差梯度 511 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; //参数向量更新 512 | } 513 | // hidden -> in 514 | // 更新上下文几个词语的向量。 515 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 516 | c = sentence_position - window + a; 517 | if (c < 0) continue; 518 | if (c >= sentence_length) continue; 519 | last_word = sen[c]; 520 | if (last_word == -1) continue; 521 | for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 522 | } 523 | } 524 | } else { //train skip-gram 525 | //这里很神奇,利用了目标函数的对称性,p(u|w) = p(w|u), u in Context(w). 
具体看 http://blog.csdn.net/mytestmy/article/details/26969149 526 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 527 | c = sentence_position - window + a; 528 | if (c < 0) continue; 529 | if (c >= sentence_length) continue; 530 | last_word = sen[c]; 531 | if (last_word == -1) continue; 532 | l1 = last_word * layer1_size; 533 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 534 | // HIERARCHICAL SOFTMAX 535 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { //遍历叶子节点 536 | f = 0; 537 | l2 = vocab[word].point[d] * layer1_size; //point是路径上的节点 538 | // Propagate hidden -> output 539 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; //内积 540 | if (f <= -MAX_EXP) continue; 541 | else if (f >= MAX_EXP) continue; 542 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; //sigmoid 543 | // 'g' is the gradient multiplied by the learning rate 544 | g = (1 - vocab[word].code[d] - f) * alpha; //梯度一部分 545 | // Propagate errors output -> hidden 546 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; //隐藏层的误差 547 | // Learn weights hidden -> output 548 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; //更新内部节点向量 549 | } 550 | // NEGATIVE SAMPLING 551 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 552 | if (d == 0) { 553 | target = word; 554 | label = 1; 555 | } else { 556 | next_random = next_random * (unsigned long long)25214903917 + 11; 557 | target = table[(next_random >> 16) % table_size]; 558 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 559 | if (target == word) continue; 560 | label = 0; 561 | } 562 | l2 = target * layer1_size; 563 | f = 0; 564 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 565 | if (f > MAX_EXP) g = (label - 1) * alpha; 566 | else if (f < -MAX_EXP) g = (label - 0) * alpha; 567 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; 568 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 569 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1]; 570 | } 571 | // Learn weights input -> hidden 572 | for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c]; //更新的是当前上下文的词向量 573 | } 574 | } 575 | sentence_position++; 576 | if (sentence_position >= sentence_length) { 577 | sentence_length = 0; 578 | continue; 579 | } 580 | } 581 | fclose(fi); 582 | free(neu1); 583 | free(neu1e); 584 | pthread_exit(NULL); 585 | } 586 | // 训练模型 587 | void TrainModel() { 588 | long a, b, c, d; 589 | FILE *fo; 590 | // 默认12个线程 591 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 592 | printf("Starting training using file %s\n", train_file); 593 | starting_alpha = alpha; 594 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 595 | if (save_vocab_file[0] != 0) SaveVocab(); 596 | if (output_file[0] == 0) return; 597 | InitNet(); 598 | if (negative > 0) InitUnigramTable(); 599 | start = clock(); 600 | // 启动线程 601 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 602 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 603 | // 保存结果 604 | fo = fopen(output_file, "wb"); 605 | if (classes == 0) { 606 | // Save the word vectors 607 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 608 | for (a = 0; a < vocab_size; a++) { 609 | fprintf(fo, "%s ", vocab[a].word); 610 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 611 | else for (b = 0; b < 
layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 612 | fprintf(fo, "\n"); 613 | } 614 | } else { 615 | // Run K-means on the word vectors 616 | // 对向量进行聚类 617 | int clcn = classes, iter = 10, closeid; 618 | // 该类别的数量 619 | int *centcn = (int *)malloc(classes * sizeof(int)); 620 | // 每个词对应类别 621 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 622 | real closev, x; 623 | // 每个类的中心点 624 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 625 | // 初始化,每个词分配到一个类 626 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 627 | for (a = 0; a < iter; a++) { 628 | // 中心点清零 629 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 630 | for (b = 0; b < clcn; b++) centcn[b] = 1; 631 | // 计算每个类别求和值 632 | for (c = 0; c < vocab_size; c++) { 633 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 634 | centcn[cl[c]]++; //对应类别的数量加1 635 | } 636 | // 遍历所有类别 637 | for (b = 0; b < clcn; b++) { 638 | closev = 0; 639 | for (c = 0; c < layer1_size; c++) { 640 | cent[layer1_size * b + c] /= centcn[b]; //均值 641 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 642 | } 643 | closev = sqrt(closev); 644 | // 中心点归一化 645 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 646 | } 647 | // 所有词重新分类 648 | for (c = 0; c < vocab_size; c++) { 649 | closev = -10; 650 | closeid = 0; 651 | for (d = 0; d < clcn; d++) { 652 | x = 0; 653 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 654 | if (x > closev) { 655 | closev = x; 656 | closeid = d; 657 | } 658 | } 659 | cl[c] = closeid; 660 | } 661 | } 662 | // Save the K-means classes 663 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 664 | free(centcn); 665 | free(cent); 666 | free(cl); 667 | } 668 | fclose(fo); 669 | } 670 | 671 | int ArgPos(char *str, int argc, char **argv) { 672 | int a; 673 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 674 | if (a == argc - 1) { 675 | printf("Argument missing for %s\n", str); 676 | exit(1); 677 | } 678 | return a; 679 | } 680 | return -1; 681 | } 682 | 683 | int main(int argc, char **argv) { 684 | int i; 685 | if (argc == 1) { 686 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 687 | printf("Options:\n"); 688 | printf("Parameters for training:\n"); 689 | // 输入文件:已分词的语料 690 | printf("\t-train \n"); 691 | printf("\t\tUse text data from to train the model\n"); 692 | // 输出文件:词向量或词聚类 693 | printf("\t-output \n"); 694 | printf("\t\tUse to save the resulting word vectors / word clusters\n"); 695 | // 词向量维度:默认100 696 | printf("\t-size \n"); 697 | printf("\t\tSet size of word vectors; default is 100\n"); 698 | // 窗口大小:默认5 699 | printf("\t-window \n"); 700 | printf("\t\tSet max skip length between words; default is 5\n"); 701 | // 词频阈值:默认0,对高频词随机下采样 702 | printf("\t-sample \n"); 703 | printf("\t\tSet threshold for occurrence of words. 
Those that appear with higher frequency in the training data\n"); 704 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n"); 705 | // 采用层次softmax:默认0,不采用 706 | printf("\t-hs \n"); 707 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 708 | // 采用NEG:默认5 709 | printf("\t-negative \n"); 710 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n"); 711 | // 线程数:默认12 712 | printf("\t-threads \n"); 713 | printf("\t\tUse threads (default 12)\n"); 714 | // 迭代数:默认5 715 | printf("\t-iter \n"); 716 | printf("\t\tRun more training iterations (default 5)\n"); 717 | // 词频最小阈值:默认5,小于阈值则丢弃 718 | printf("\t-min-count \n"); 719 | printf("\t\tThis will discard words that appear less than times; default is 5\n"); 720 | // 学习率:默认是0.025(skip-gram),0.05(cbow) 721 | printf("\t-alpha \n"); 722 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n"); 723 | // 聚类数:默认0 724 | printf("\t-classes \n"); 725 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 726 | // debug模式:默认2 727 | printf("\t-debug \n"); 728 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 729 | // 二进制存储:默认0,即保存文件时不采用二进制 730 | printf("\t-binary \n"); 731 | printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"); 732 | // 保存词汇表 733 | printf("\t-save-vocab \n"); 734 | printf("\t\tThe vocabulary will be saved to \n"); 735 | // 读取已统计好词频的词汇表 736 | printf("\t-read-vocab \n"); 737 | printf("\t\tThe vocabulary will be read from , not constructed from the training data\n"); 738 | // 采用模型:1 CBOW,0 skip-gram,默认1 739 | printf("\t-cbow \n"); 740 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n"); 741 | // 示例 742 | printf("\nExamples:\n"); 743 | printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 744 | return 0; 745 | } 746 | output_file[0] = 0; 747 | save_vocab_file[0] = 0; 748 | read_vocab_file[0] = 0; 749 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 750 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 751 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 752 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 753 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 754 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 755 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 756 | if (cbow) alpha = 0.05; 757 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 758 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 759 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 760 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 761 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 762 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 763 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 764 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 765 | if ((i = ArgPos((char 
*)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 766 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 767 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 768 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 769 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 770 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 771 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 772 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 773 | } 774 | TrainModel(); 775 | return 0; 776 | } 777 | -------------------------------------------------------------------------------- /Word2vec/word2vec.tex: -------------------------------------------------------------------------------- 1 | % word2vc.tex 2 | % @chenbingjin 2016-07-07 3 | 4 | \documentclass[10pt]{article} 5 | \usepackage{caption} 6 | \usepackage{algorithm} 7 | \usepackage{algpseudocode} 8 | \begin{document} 9 | \begin{algorithm} 10 | \caption{ word2vec training algorithm} 11 | \textbf{Input:} corpus \textbf{C}, vector dimension \textbf{k}\\ 12 | \textbf{Output:} word vector \textbf{V} 13 | \begin{algorithmic}[1] 14 | \Procedure {w2v\_Train}{$C$, $k$} 15 | \State $vocab\_size \leftarrow get\_vocab\_size(C)$ 16 | \State $V \leftarrow init\_vector(vocab\_size,k)$ 17 | \State $\theta \leftarrow init\_vector(vocab\_size,k)$ 18 | \ForAll {$w_{i} \in C$} 19 | \State $e \leftarrow 0$ 20 | \State $X_w \leftarrow \sum_{u \in Context(w_i)} V(u)$ 21 | \ForAll { $u = \{w_i\} \cup NEG(w_i)$} 22 | \State $q \leftarrow \sigma(X_{w_i}^T\theta^u)$ 23 | \State $g \leftarrow \eta(L^{w_i}(u)-q)$ 24 | \State $e \leftarrow e+g\theta^u $ 25 | \State $\theta^u \leftarrow \theta^u + gX_w$ 26 | \EndFor 27 | \ForAll { $ u \in Context(w_i) $ } 28 | \State $V(u) \leftarrow V(u) + e$ 29 | \EndFor 30 | \EndFor 31 | \EndProcedure 32 | \end{algorithmic} 33 | \end{algorithm} 34 | \end{document} 35 | -------------------------------------------------------------------------------- /Word2vec/word2vec_transE.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // Modified from https://code.google.com/p/word2vec/ 16 | // @chenbingjin 2016-05-16 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | using namespace std; 27 | 28 | #define MAX_STRING 100 29 | #define EXP_TABLE_SIZE 1000 30 | #define MAX_EXP 6 31 | #define MAX_SENTENCE_LENGTH 1000 32 | #define MAX_CODE_LENGTH 40 33 | #define pi 3.1415926535897932384626433832795 34 | 35 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 36 | 37 | typedef float real; // Precision of float numbers 38 | 39 | struct vocab_word { 40 | long long cn; //词频 41 | int *point; //huffman编码对应内节点的路径 42 | char *word, *code, codelen; //(词,对应huffman编码,编码长度) 43 | }; 44 | 45 | char train_file[MAX_STRING], triplet_file[MAX_STRING], output_file[MAX_STRING]; 46 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 47 | struct vocab_word *vocab; //词汇表 48 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1; 49 | int *vocab_hash; //词汇哈希表,便于快速查找,存储每个词在词汇表的索引位置。 50 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100; //向量维度 51 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0; 52 | real alpha = 0.025, starting_alpha, sample = 1e-3; 53 | //分别对应:词的向量,内节点的向量,负采样词的向量,sigmoid函数的近似计算表 54 | real *syn0, *syn1, *syn1neg, *expTable; 55 | 56 | clock_t start; 57 | // hierarchical softmax 或者NEG 58 | int hs = 0, negative = 5; 59 | const int table_size = 1e8; 60 | int *table; 61 | 62 | // 添加关系信息所需变量 63 | real belta = 0.0005, gama = 0.8; 64 | char buf[100000]; 65 | int relation_num; 66 | map relation2id; 67 | map > > triplets; 68 | vector > relation_vec; // 关系向量 69 | // 随机数 70 | double rand(double min, double max) 71 | { 72 | return min + (max-min)*rand()/(RAND_MAX + 1.0); 73 | } 74 | // 正态分布 75 | double normal(double x, double miu,double sigma) 76 | { 77 | return 1.0/sqrt(2*pi)/sigma*exp(-1*(x-miu)*(x-miu)/(2*sigma*sigma)); 78 | } 79 | // 在[min,max]区间内做正态分布采样? 
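// randn() below: rejection sampling — propose x uniformly on [min, max], draw dScope uniformly
// in [0, normal(miu,miu,sigma)] (the peak of the density), and accept x only if dScope <= normal(x,miu,sigma).
// The accepted values are samples from N(miu, sigma^2) truncated to [min, max].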
80 | double randn(double miu,double sigma, double min ,double max) 81 | { 82 | double x,y,dScope; 83 | do{ 84 | x=rand(min,max); 85 | y=normal(x,miu,sigma); 86 | dScope=rand(0.0,normal(miu,miu,sigma)); 87 | }while(dScope>y); 88 | return x; 89 | } 90 | //负采样算法:带权采样思想。每个词的权重为l(w) = [counter(w)]^(3/4) / sum([counter(u)]^(3/4)),u属于词典D 91 | // 每个词对应一个线段, 将[0,1]等距离划分成10^8,每次生成一个随机整数r,Table[r]就是一个样本。 92 | void InitUnigramTable() { 93 | int a, i; 94 | double train_words_pow = 0; 95 | double d1, power = 0.75; 96 | table = (int *)malloc(table_size * sizeof(int)); 97 | // 遍历词表,统计总权重 98 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 99 | i = 0; 100 | d1 = pow(vocab[i].cn, power) / train_words_pow; 101 | // 遍历词表,为每个词分配table空间 102 | for (a = 0; a < table_size; a++) { 103 | table[a] = i; 104 | if (a / (double)table_size > d1) { 105 | i++; 106 | d1 += pow(vocab[i].cn, power) / train_words_pow; 107 | } 108 | if (i >= vocab_size) i = vocab_size - 1; 109 | } 110 | } 111 | 112 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 113 | // 从文件中读取一个词 114 | void ReadWord(char *word, FILE *fin) { 115 | int a = 0, ch; 116 | while (!feof(fin)) { 117 | ch = fgetc(fin); //读取一个字符 118 | if (ch == 13) continue; //回车符 119 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 120 | if (a > 0) { 121 | if (ch == '\n') ungetc(ch, fin); //退回一个字符,文件指针左移一位 122 | break; 123 | } 124 | if (ch == '\n') { 125 | strcpy(word, (char *)""); 126 | return; 127 | } else continue; 128 | } 129 | word[a] = ch; 130 | a++; 131 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 132 | } 133 | word[a] = 0; 134 | } 135 | 136 | // Returns hash value of a word 137 | // 返回词的hash值 138 | int GetWordHash(char *word) { 139 | unsigned long long a, hash = 0; 140 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; //257进制,计算词的hash值 141 | hash = hash % vocab_hash_size; 142 | return hash; 143 | } 144 | 145 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 146 | // 返回词在词表中的索引位置,找不到返回-1. 
147 | int SearchVocab(char *word) { 148 | unsigned int hash = GetWordHash(word); 149 | while (1) { 150 | if (vocab_hash[hash] == -1) return -1; 151 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 152 | hash = (hash + 1) % vocab_hash_size; 153 | } 154 | return -1; 155 | } 156 | 157 | // Reads a word and returns its index in the vocabulary 158 | // 从文件读一个词,返回词在词汇表的索引位置 159 | int ReadWordIndex(FILE *fin) { 160 | char word[MAX_STRING]; 161 | ReadWord(word, fin); 162 | if (feof(fin)) return -1; 163 | return SearchVocab(word); 164 | } 165 | 166 | // Adds a word to the vocabulary 167 | // 将词添加到词汇表 168 | int AddWordToVocab(char *word) { 169 | unsigned int hash, length = strlen(word) + 1; 170 | if (length > MAX_STRING) length = MAX_STRING; //词的长度不能超MAX_STRING 171 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 172 | strcpy(vocab[vocab_size].word, word); 173 | vocab[vocab_size].cn = 0; //初始词频为0 174 | vocab_size++; 175 | // Reallocate memory if needed 176 | if (vocab_size + 2 >= vocab_max_size) { 177 | vocab_max_size += 1000; 178 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 179 | } 180 | hash = GetWordHash(word); 181 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; //如果hash值冲突,采用线性探测的开放定址法,顺序向下查找 182 | vocab_hash[hash] = vocab_size - 1; 183 | return vocab_size - 1; 184 | } 185 | 186 | // Used later for sorting by word counts 187 | // 词表排序的比较算法cmp:根据词频排序,降序 188 | int VocabCompare(const void *a, const void *b) { 189 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 190 | } 191 | 192 | // Sorts the vocabulary by frequency using word counts 193 | // 根据词频排序 194 | void SortVocab() { 195 | int a, size; 196 | unsigned int hash; 197 | // Sort the vocabulary and keep at the first position 198 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 199 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 200 | size = vocab_size; 201 | train_words = 0; 202 | for (a = 0; a < size; a++) { 203 | // Words occuring less than min_count times will be discarded from the vocab 204 | // 出现次数太少的词直接丢弃,min_count 默认5 205 | if ((vocab[a].cn < min_count) && (a != 0)) { 206 | vocab_size--; 207 | free(vocab[a].word); 208 | } else { 209 | // Hash will be re-computed, as after the sorting it is not actual 210 | // 重新计算hash值 211 | hash=GetWordHash(vocab[a].word); 212 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 213 | vocab_hash[hash] = a; 214 | train_words += vocab[a].cn; //总词频 215 | } 216 | } 217 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 218 | // Allocate memory for the binary tree construction 219 | for (a = 0; a < vocab_size; a++) { 220 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 221 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 222 | } 223 | } 224 | 225 | // Reduces the vocabulary by removing infrequent tokens 226 | // 缩小词汇表,移除词频过小的词 227 | void ReduceVocab() { 228 | int a, b = 0; 229 | unsigned int hash; 230 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 231 | vocab[b].cn = vocab[a].cn; 232 | vocab[b].word = vocab[a].word; 233 | b++; 234 | } else free(vocab[a].word); 235 | vocab_size = b; 236 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 237 | for (a = 0; a < vocab_size; a++) { 238 | // Hash will be re-computed, as it is not actual 239 | hash = GetWordHash(vocab[a].word); 240 | while (vocab_hash[hash] != -1) hash = (hash + 1) 
% vocab_hash_size; 241 | vocab_hash[hash] = a; 242 | } 243 | fflush(stdout); 244 | min_reduce++; 245 | } 246 | 247 | // Create binary Huffman tree using the word counts 248 | // Frequent words will have short uniqe binary codes 249 | // 根据词频构建huffman树,词频越大编码越短 250 | void CreateBinaryTree() { 251 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 252 | char code[MAX_CODE_LENGTH]; 253 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 254 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 255 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 256 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 257 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 258 | pos1 = vocab_size - 1; 259 | pos2 = vocab_size; 260 | // Following algorithm constructs the Huffman tree by adding one node at a time 261 | for (a = 0; a < vocab_size - 1; a++) { 262 | // First, find two smallest nodes 'min1, min2' 263 | if (pos1 >= 0) { //找第一小 264 | if (count[pos1] < count[pos2]) { 265 | min1i = pos1; 266 | pos1--; 267 | } else { 268 | min1i = pos2; 269 | pos2++; 270 | } 271 | } else { 272 | min1i = pos2; 273 | pos2++; 274 | } 275 | if (pos1 >= 0) { //找第二小 276 | if (count[pos1] < count[pos2]) { 277 | min2i = pos1; 278 | pos1--; 279 | } else { 280 | min2i = pos2; 281 | pos2++; 282 | } 283 | } else { 284 | min2i = pos2; 285 | pos2++; 286 | } 287 | count[vocab_size + a] = count[min1i] + count[min2i]; 288 | parent_node[min1i] = vocab_size + a; 289 | parent_node[min2i] = vocab_size + a; 290 | binary[min2i] = 1; 291 | } 292 | // Now assign binary code to each vocabulary word 293 | for (a = 0; a < vocab_size; a++) { 294 | b = a; 295 | i = 0; 296 | while (1) { 297 | code[i] = binary[b]; 298 | point[i] = b; 299 | i++; 300 | b = parent_node[b]; 301 | if (b == vocab_size * 2 - 2) break; 302 | } 303 | vocab[a].codelen = i; // 编码长度 304 | vocab[a].point[0] = vocab_size - 2; //? 
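// Note on the assignment above: internal Huffman nodes get ids vocab_size..2*vocab_size-2 and
// point[] stores them offset by vocab_size (see the loop below). The root node has id
// 2*vocab_size-2, so its offset index is vocab_size-2, which is why every path starts with
// point[0] = vocab_size - 2.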
305 | for (b = 0; b < i; b++) { 306 | vocab[a].code[i - b - 1] = code[b]; 307 | vocab[a].point[i - b] = point[b] - vocab_size; 308 | } 309 | } 310 | free(count); 311 | free(binary); 312 | free(parent_node); 313 | } 314 | // 从训练文件中统计每个词的词频 315 | void LearnVocabFromTrainFile() { 316 | char word[MAX_STRING]; 317 | FILE *fin; 318 | long long a, i; 319 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 320 | fin = fopen(train_file, "rb"); 321 | if (fin == NULL) { 322 | printf("ERROR: training data file not found!\n"); 323 | exit(1); 324 | } 325 | vocab_size = 0; 326 | AddWordToVocab((char *)""); 327 | while (1) { 328 | ReadWord(word, fin); 329 | if (feof(fin)) break; 330 | train_words++; 331 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 332 | printf("%lldK%c", train_words / 1000, 13); 333 | fflush(stdout); 334 | } 335 | i = SearchVocab(word); 336 | if (i == -1) { 337 | a = AddWordToVocab(word); 338 | vocab[a].cn = 1; 339 | } else vocab[i].cn++; 340 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); //如果词汇过多,先删除低频词 341 | } 342 | SortVocab(); 343 | if (debug_mode > 0) { 344 | printf("Vocab size: %lld\n", vocab_size); 345 | printf("Words in train file: %lld\n", train_words); 346 | } 347 | file_size = ftell(fin); 348 | fclose(fin); 349 | } 350 | // 保存词汇表 351 | void SaveVocab() { 352 | long long i; 353 | FILE *fo = fopen(save_vocab_file, "wb"); 354 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 355 | fclose(fo); 356 | } 357 | //从文件读取词汇,该文件已经统计好每个词的词频 358 | void ReadVocab() { 359 | long long a, i = 0; 360 | char c; 361 | char word[MAX_STRING]; 362 | FILE *fin = fopen(read_vocab_file, "rb"); 363 | if (fin == NULL) { 364 | printf("Vocabulary file not found\n"); 365 | exit(1); 366 | } 367 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 368 | vocab_size = 0; 369 | while (1) { 370 | ReadWord(word, fin); 371 | if (feof(fin)) break; 372 | a = AddWordToVocab(word); 373 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); //读取词频,换行符 374 | i++; 375 | } 376 | SortVocab(); 377 | if (debug_mode > 0) { 378 | printf("Vocab size: %lld\n", vocab_size); 379 | printf("Words in train file: %lld\n", train_words); 380 | } 381 | fin = fopen(train_file, "rb"); 382 | if (fin == NULL) { 383 | printf("ERROR: training data file not found!\n"); 384 | exit(1); 385 | } 386 | fseek(fin, 0, SEEK_END); 387 | file_size = ftell(fin); 388 | fclose(fin); 389 | } 390 | 391 | void ReadTriplets() { 392 | cout <<"reading triplets" << endl; 393 | FILE* f_kb = fopen(triplet_file,"r"); 394 | char buf3[40960]; 395 | char buf2[40960]; 396 | char buf1[40960]; 397 | char *word; 398 | int len = 0; 399 | int i = 0,j = 0; 400 | relation_num = 0; 401 | int bingo_num = 0; 402 | while (!feof(f_kb)) { 403 | fgets(buf,20480,f_kb); 404 | sscanf(buf,"%[^\t]\t%[^\t]\t%[^\t\n]\n", buf1,buf2,buf3); 405 | string s1=buf1; 406 | string s2=buf3; 407 | string s3=buf2; //relation 408 | len = s1.length(); 409 | word = (char *)malloc((len+1)*sizeof(char)); 410 | //cout << s1 << " " << s3 << " " << s2 << endl; 411 | s1.copy(word,len,0); 412 | i = SearchVocab(word); 413 | if (i == -1) { 414 | //cout<<"miss head entity:"<0) { 454 | // 分配负样本词的向量空间 455 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)); 456 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 457 | // 初始化为0向量 458 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 459 | syn1neg[a * layer1_size + b] = 0; 460 | } 461 | for (a = 0; a < vocab_size; a++) for (b = 
0; b < layer1_size; b++) { 462 | next_random = next_random * (unsigned long long)25214903917 + 11; 463 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 464 | } 465 | // 关系向量初始化 466 | relation_vec.resize(relation_num); 467 | for (int i = 0; i < relation_vec.size(); i++) 468 | relation_vec[i].resize(layer1_size); 469 | for (int i = 0; i < relation_num; i++) 470 | { 471 | for (int ii = 0 ; ii < layer1_size; ii++) 472 | relation_vec[i][ii] = randn(0,1.0/layer1_size,-6/sqrt(layer1_size),6/sqrt(layer1_size)); 473 | } 474 | CreateBinaryTree(); 475 | } 476 | // 训练模型线程:训练过程 477 | void *TrainModelThread(void *id) { 478 | long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0; 479 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 480 | long long l1, l2, c, target, label, local_iter = iter; 481 | unsigned long long next_random = (long long)id; 482 | real f, g; 483 | clock_t now; 484 | real *waddr = (real *)calloc(layer1_size, sizeof(real)); //对应wi+r 485 | real *neu1 = (real *)calloc(layer1_size, sizeof(real)); //对应Xw 486 | real *neu1e = (real *)calloc(layer1_size, sizeof(real)); //对应error累加量 487 | FILE *fi = fopen(train_file, "rb"); 488 | //每个线程对应一段文本 489 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 490 | while (1) { 491 | if (word_count - last_word_count > 10000) { 492 | word_count_actual += word_count - last_word_count; 493 | last_word_count = word_count; 494 | if ((debug_mode > 1)) { 495 | now=clock(); 496 | printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 497 | word_count_actual / (real)(iter * train_words + 1) * 100, 498 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 499 | fflush(stdout); 500 | } 501 | alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 502 | if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 503 | } 504 | if (sentence_length == 0) { 505 | while (1) { 506 | word = ReadWordIndex(fi); //读一个词,返回其在词汇表的索引位置 507 | if (feof(fi)) break; 508 | if (word == -1) continue; 509 | word_count++; 510 | if (word == 0) break; 511 | // The subsampling randomly discards frequent words while keeping the ranking same 512 | // 对高频词进行下采样,以概率p丢弃。p = 1-[sqrt(t/f(w))+t/f(w)].但仍保持排序不变 513 | // 先计算ran = sqrt(t/f(w))+t/f(w),产生(0,1)上的随机数r,如果r>ran,则丢弃。 514 | if (sample > 0) { 515 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 516 | next_random = next_random * (unsigned long long)25214903917 + 11; 517 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 518 | } 519 | sen[sentence_length] = word; 520 | sentence_length++; 521 | // 将1000个词当成一个句子 522 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 523 | } 524 | sentence_position = 0; 525 | } 526 | // 当前线程处理单词数超过阈值 527 | if (feof(fi) || (word_count > train_words / num_threads)) { 528 | word_count_actual += word_count - last_word_count; 529 | local_iter--; 530 | if (local_iter == 0) break; 531 | word_count = 0; 532 | last_word_count = 0; 533 | sentence_length = 0; 534 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 535 | continue; 536 | } 537 | word = sen[sentence_position]; 538 | if (word == -1) continue; 539 | for (c = 0; c < layer1_size; c++) neu1[c] = 0; 540 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 541 | next_random = next_random * (unsigned long long)25214903917 + 11; 542 | // 随机产生0-5的窗口大小 543 | b = next_random % window; 
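// b is drawn uniformly from [0, window); the loops below scan only window-b positions on each
// side of the current word, which implements word2vec's randomly shrunk (dynamic) context window.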
544 | if (cbow) { //train the cbow architecture 545 | // in -> hidden 546 | cw = 0; 547 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 548 | c = sentence_position - window + a; 549 | if (c < 0) continue; 550 | if (c >= sentence_length) continue; 551 | last_word = sen[c]; 552 | if (last_word == -1) continue; 553 | // 上下文词进行向量加和,得到Xw 554 | for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size]; 555 | cw++; 556 | } 557 | if (cw) { 558 | // 归一化? 559 | for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 560 | // hs,采用huffman 561 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 562 | f = 0; 563 | l2 = vocab[word].point[d] * layer1_size; //路径的内部节点 564 | // Propagate hidden -> output 565 | // 隐藏层到输出层,计算误差梯度 566 | // neu1 对应 Xw, syn1对应内部节点的向量0 567 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; //计算内积 568 | if (f <= -MAX_EXP) continue; 569 | else if (f >= MAX_EXP) continue; 570 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];//sigmoid 571 | // 'g' is the gradient multiplied by the learning rate 572 | // 内部节点0的梯度(1-d-sigmoid(Xw·0))Xw,g为前面部分 573 | g = (1 - vocab[word].code[d] - f) * alpha; 574 | 575 | // Propagate errors output -> hidden 576 | // 反向传播误差,从huffman树传到隐藏层 577 | // 累加的梯度更新量 578 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 579 | // Learn weights hidden -> output 580 | // 内部节点更新向量 581 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 582 | } 583 | // NEGATIVE SAMPLING 584 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 585 | if (d == 0) { 586 | target = word; //目标词 587 | label = 1; //正样本 588 | } else {//采样负样本 589 | next_random = next_random * (unsigned long long)25214903917 + 11; 590 | target = table[(next_random >> 16) % table_size]; 591 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 592 | if (target == word) continue; 593 | label = 0; 594 | } 595 | l2 = target * layer1_size; 596 | f = 0; 597 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; //内积 598 | if (f > MAX_EXP) g = (label - 1) * alpha; 599 | else if (f < -MAX_EXP) g = (label - 0) * alpha; 600 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; //sigmoid 601 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; //累积误差梯度 602 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; //参数向量更新 603 | } 604 | // hidden -> in 605 | // 更新上下文几个词语的向量。 606 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 607 | c = sentence_position - window + a; 608 | if (c < 0) continue; 609 | if (c >= sentence_length) continue; 610 | last_word = sen[c]; 611 | if (last_word == -1) continue; 612 | for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 613 | } 614 | // 关系三元组训练 615 | // 对word对应的所有三元组(wi,r,t),负采样5个三元组, 616 | // wi向量syn0[word*layer1_size] , r向量 617 | // @chenbingjin 2016-05-17 618 | 619 | // printf("word: %d \tsize: %d\n", word, triplets[word].size()); 620 | if(triplets[word].size() > 0) for (int i = 0; i < triplets[word].size(); ++i) 621 | { 622 | int rid = triplets[word][i].first; 623 | int t = triplets[word][i].second; 624 | //printf("rid: %d\t t: %d\n", rid, t); 625 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; //梯度向量清零 626 | for (c = 0; c < layer1_size; c++) waddr[c] = syn0[c + word * layer1_size] + relation_vec[rid][c]; // Vwi + Vr 627 | //printf("waddr done...\n"); 628 | for (d = 0; d < negative + 1; ++d) 629 | { 630 | // printf("d: %d\n", d); 631 | if (d == 0) { 632 | 
target = t; //目标词 633 | label = 1; //正样本 634 | } else { 635 | //采样负样本(wi,r,t') 636 | next_random = next_random * (unsigned long long)25214903917 + 11; 637 | target = table[(next_random >> 16) % table_size]; 638 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 639 | if (target == t) continue; //与t不同 640 | label = 0; 641 | } 642 | l2 = target * layer1_size; 643 | 644 | f = 0; 645 | for (c = 0; c < layer1_size; c++) f += waddr[c] * syn1neg[c + l2]; //内积 646 | if (f > MAX_EXP) g = (label - 1) * belta; 647 | else if (f < -MAX_EXP) g = (label - 0) * belta; 648 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * belta; //sigmoid 649 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; //累积误差梯度 650 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += gama* g * waddr[c]; //参数向量更新 651 | } 652 | for (c = 0; c < layer1_size; c++) syn0[c + word * layer1_size] += gama * g * neu1e[c]; //更新当前词wi 653 | for (c = 0; c < layer1_size; c++) relation_vec[rid][c] += gama * g * neu1e[c]; //更新r向量 654 | } 655 | } 656 | } else { //train skip-gram 657 | //这里很神奇,利用了目标函数的对称性,p(u|w) = p(w|u), u in Context(w). 具体看 http://blog.csdn.net/mytestmy/article/details/26969149 658 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 659 | c = sentence_position - window + a; 660 | if (c < 0) continue; 661 | if (c >= sentence_length) continue; 662 | last_word = sen[c]; 663 | if (last_word == -1) continue; 664 | l1 = last_word * layer1_size; 665 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 666 | // HIERARCHICAL SOFTMAX 667 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { //遍历叶子节点 668 | f = 0; 669 | l2 = vocab[word].point[d] * layer1_size; //point是路径上的节点 670 | // Propagate hidden -> output 671 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; //内积 672 | if (f <= -MAX_EXP) continue; 673 | else if (f >= MAX_EXP) continue; 674 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; //sigmoid 675 | // 'g' is the gradient multiplied by the learning rate 676 | g = (1 - vocab[word].code[d] - f) * alpha; //梯度一部分 677 | // Propagate errors output -> hidden 678 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; //隐藏层的误差 679 | // Learn weights hidden -> output 680 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; //更新内部节点向量 681 | } 682 | // NEGATIVE SAMPLING 683 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 684 | if (d == 0) { 685 | target = word; 686 | label = 1; 687 | } else { 688 | next_random = next_random * (unsigned long long)25214903917 + 11; 689 | target = table[(next_random >> 16) % table_size]; 690 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 691 | if (target == word) continue; 692 | label = 0; 693 | } 694 | l2 = target * layer1_size; 695 | f = 0; 696 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 697 | if (f > MAX_EXP) g = (label - 1) * alpha; 698 | else if (f < -MAX_EXP) g = (label - 0) * alpha; 699 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; 700 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 701 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1]; 702 | } 703 | // Learn weights input -> hidden 704 | for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c]; //更新的是当前上下文的词向量 705 | } 706 | } 707 | sentence_position++; 708 | if (sentence_position >= sentence_length) { 709 | sentence_length = 0; 710 | continue; 711 | } 712 | } 713 | 
fclose(fi); 714 | free(neu1); 715 | free(neu1e); 716 | pthread_exit(NULL); 717 | } 718 | // 训练模型 719 | void TrainModel() { 720 | long a, b, c, d; 721 | FILE *fo; 722 | FILE *fv; 723 | FILE *fvv; 724 | // 默认12个线程 725 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 726 | printf("Starting training using file %s\n", train_file); 727 | starting_alpha = alpha; 728 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 729 | if (save_vocab_file[0] != 0) SaveVocab(); 730 | if (output_file[0] == 0) return; 731 | ReadTriplets(); 732 | InitNet(); 733 | if (negative > 0) InitUnigramTable(); 734 | start = clock(); 735 | // 启动线程 736 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 737 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 738 | // 保存结果 739 | fv = fopen("vector-orgin.txt", "w"); 740 | fvv = fopen("word2id.txt", "w"); 741 | fo = fopen(output_file, "wb"); 742 | if (classes == 0) { 743 | // Save the word vectors 744 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 745 | for (a = 0; a < vocab_size; a++) { 746 | fprintf(fo, "%s ", vocab[a].word); 747 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 748 | else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 749 | fprintf(fo, "\n"); 750 | fprintf(fvv, "%s %ld\n",vocab[a].word,a); 751 | for (b = 0; b < layer1_size; b++) fprintf(fv, "%lf ", syn0[a * layer1_size + b]); 752 | fprintf(fv, "\n"); 753 | } 754 | } else { 755 | // Run K-means on the word vectors 756 | // 对向量进行聚类 757 | int clcn = classes, iter = 10, closeid; 758 | // 该类别的数量 759 | int *centcn = (int *)malloc(classes * sizeof(int)); 760 | // 每个词对应类别 761 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 762 | real closev, x; 763 | // 每个类的中心点 764 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 765 | // 初始化,每个词分配到一个类 766 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 767 | for (a = 0; a < iter; a++) { 768 | // 中心点清零 769 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 770 | for (b = 0; b < clcn; b++) centcn[b] = 1; 771 | // 计算每个类别求和值 772 | for (c = 0; c < vocab_size; c++) { 773 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 774 | centcn[cl[c]]++; //对应类别的数量加1 775 | } 776 | // 遍历所有类别 777 | for (b = 0; b < clcn; b++) { 778 | closev = 0; 779 | for (c = 0; c < layer1_size; c++) { 780 | cent[layer1_size * b + c] /= centcn[b]; //均值 781 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 782 | } 783 | closev = sqrt(closev); 784 | // 中心点归一化 785 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 786 | } 787 | // 所有词重新分类 788 | for (c = 0; c < vocab_size; c++) { 789 | closev = -10; 790 | closeid = 0; 791 | for (d = 0; d < clcn; d++) { 792 | x = 0; 793 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 794 | if (x > closev) { 795 | closev = x; 796 | closeid = d; 797 | } 798 | } 799 | cl[c] = closeid; 800 | } 801 | } 802 | // Save the K-means classes 803 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 804 | free(centcn); 805 | free(cent); 806 | free(cl); 807 | } 808 | fclose(fo); 809 | fclose(fv); 810 | fclose(fvv); 811 | } 812 | 813 | int ArgPos(char *str, int argc, char **argv) { 814 | int a; 815 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 816 | if (a == argc - 1) { 817 | printf("Argument missing for 
%s\n", str); 818 | exit(1); 819 | } 820 | return a; 821 | } 822 | return -1; 823 | } 824 | 825 | int main(int argc, char **argv) { 826 | int i; 827 | if (argc == 1) { 828 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 829 | printf("Options:\n"); 830 | printf("Parameters for training:\n"); 831 | // 输入文件:已分词的语料 832 | printf("\t-train \n"); 833 | printf("\t\tUse text data from to train the model\n"); 834 | // 输入文件:三元组语料 835 | printf("\t-triplet \n"); 836 | printf("\t\tUse triplets data from to train the model\n"); 837 | // 输出文件:词向量或词聚类 838 | printf("\t-output \n"); 839 | printf("\t\tUse to save the resulting word vectors / word clusters\n"); 840 | // 词向量维度:默认100 841 | printf("\t-size \n"); 842 | printf("\t\tSet size of word vectors; default is 100\n"); 843 | // 窗口大小:默认5 844 | printf("\t-window \n"); 845 | printf("\t\tSet max skip length between words; default is 5\n"); 846 | // 词频阈值:默认0,对高频词随机下采样 847 | printf("\t-sample \n"); 848 | printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n"); 849 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n"); 850 | // 采用层次softmax:默认0,不采用 851 | printf("\t-hs \n"); 852 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 853 | // 采用NEG:默认5 854 | printf("\t-negative \n"); 855 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n"); 856 | // 线程数:默认12 857 | printf("\t-threads \n"); 858 | printf("\t\tUse threads (default 12)\n"); 859 | // 迭代数:默认5 860 | printf("\t-iter \n"); 861 | printf("\t\tRun more training iterations (default 5)\n"); 862 | // 词频最小阈值:默认5,小于阈值则丢弃 863 | printf("\t-min-count \n"); 864 | printf("\t\tThis will discard words that appear less than times; default is 5\n"); 865 | // 学习率:默认是0.025(skip-gram),0.05(cbow) 866 | printf("\t-alpha \n"); 867 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n"); 868 | // 聚类数:默认0 869 | printf("\t-classes \n"); 870 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 871 | // debug模式:默认2 872 | printf("\t-debug \n"); 873 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 874 | // 二进制存储:默认0,即保存文件时不采用二进制 875 | printf("\t-binary \n"); 876 | printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"); 877 | // 保存词汇表 878 | printf("\t-save-vocab \n"); 879 | printf("\t\tThe vocabulary will be saved to \n"); 880 | // 读取已统计好词频的词汇表 881 | printf("\t-read-vocab \n"); 882 | printf("\t\tThe vocabulary will be read from , not constructed from the training data\n"); 883 | // 采用模型:1 CBOW,0 skip-gram,默认1 884 | printf("\t-cbow \n"); 885 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n"); 886 | // 示例 887 | printf("\nExamples:\n"); 888 | printf("./word2vec -train data.txt -triplet triplets.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 889 | return 0; 890 | } 891 | output_file[0] = 0; 892 | save_vocab_file[0] = 0; 893 | read_vocab_file[0] = 0; 894 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 895 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 896 | if ((i = ArgPos((char *)"-triplet", argc, argv)) > 0) strcpy(triplet_file, argv[i + 1]); 897 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 
898 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 899 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 900 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 901 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 902 | if (cbow) alpha = 0.05; 903 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 904 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 905 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 906 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 907 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 908 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 909 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 910 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 911 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 912 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 913 | //cout << triplet_file << endl; 914 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 915 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 916 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 917 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 918 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 919 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 920 | } 921 | TrainModel(); 922 | return 0; 923 | } 924 | -------------------------------------------------------------------------------- /Word2vec/word2vec_transE.tex: -------------------------------------------------------------------------------- 1 | % word2vc_transE.tex 2 | % @chenbingjin 2016-07-07 3 | 4 | \documentclass[10pt]{article} 5 | \usepackage{caption} 6 | \usepackage{algorithm} 7 | \usepackage{algpseudocode} 8 | \begin{document} 9 | \begin{algorithm} 10 | \caption{ word2vec\_transE training algorithm} 11 | \textbf{Input:} corpus \textbf{C}, vector dimension \textbf{k}, triplets \textbf{R}\\ 12 | \textbf{Output:} word vector \textbf{V}, relation vector \textbf{P} 13 | \begin{algorithmic}[1] 14 | \Procedure {w2v\_transE\_Train}{$C$, $k$} 15 | \State $vocab\_size \leftarrow get\_vocab\_size(C)$ 16 | \State $relation\_size \leftarrow get\_relation\_size(R)$ //read triplets 17 | \State $V \leftarrow init\_vector(vocab\_size,k)$ 18 | \State $\theta \leftarrow init\_vector(vocab\_size,k)$ 19 | \State $P \leftarrow init\_vector(relation\_size,k)$ //init relation vector 20 | \ForAll {$w_{i} \in C$} 21 | \State // word2vec model training 22 | \State $e \leftarrow 0$ 23 | \State $X_w \leftarrow \sum_{u \in Context(w_i)} V(u)$ 24 | \ForAll { $u = \{w_i\} \cup NEG(w_i)$} 25 | \State $q \leftarrow \sigma(X_{w_i}^T\theta^u)$ 26 | \State $g \leftarrow \eta(L^{w_i}(u)-q)$ 27 | \State $e \leftarrow e+g\theta^u $ 28 | \State $\theta^u \leftarrow \theta^u + gX_w$ 29 | \EndFor 30 | \ForAll { $ u \in Context(w_i) $ } 31 | \State $V(u) \leftarrow V(u) + e$ 32 | \EndFor 33 | \State // transE model training 34 | \ForAll {$r \in R_{w_i} $} 35 | \State $e \leftarrow 0$ 36 | \State $(w_i,r,t) \leftarrow get\_triplet(R,w_i,r)$ $//get\ triplet(w_i,r,t)$ 37 | \ForAll {$u = \{t\} 
\cup NEG(w_i,r) $} 38 | \State $ X_{w_i+r} \leftarrow V(w_i)+ P(r)$ 39 | \State $q \leftarrow \sigma(X_{w_i+r}^T\theta^u)$ 40 | \State $g \leftarrow \mu (L^t(u)-q)$ 41 | \State $e \leftarrow e+g\theta^u $ 42 | \State $\theta^u \leftarrow \theta^u + gX_{w_i+r}$ 43 | \EndFor 44 | \State $V(w_i) \leftarrow V(w_i) + e$ 45 | \State $P(r) \leftarrow P(r) + e$ 46 | \EndFor 47 | \EndFor 48 | \EndProcedure 49 | \end{algorithmic} 50 | \end{algorithm} 51 | \end{document} --------------------------------------------------------------------------------
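The algorithm above augments the CBOW negative-sampling step with a TransE-style constraint V(w_i) + P(r) ≈ V(t): for every triplet (w_i, r, t) attached to the current word, the true tail t and a few sampled negative tails are scored against the translated vector, and the accumulated error is pushed back into both the word vector and the relation vector. Below is a minimal, self-contained C sketch of that single triplet update, included only as a reading aid: the identifiers (triplet_update, dim, lr, ctx, neg, n_neg) are hypothetical and not part of the repository, which folds this logic into TrainModelThread in word2vec_transE.c using syn1neg as the output vectors; the sketch also uses one plain learning rate, whereas the repository additionally scales parts of the update by gama and uses belta for the gradient step.

/* Illustrative sketch only (not part of the repository): one TransE-flavoured
 * negative-sampling update for a triplet (h, r, t). */
#include <stdlib.h>
#include <math.h>

static float sigmoidf(float x) { return 1.0f / (1.0f + expf(-x)); }

/* vec_h: head word vector V(w_i); vec_r: relation vector P(r); ctx: table of
 * output vectors (one row of length dim per word, analogous to syn1neg);
 * t: id of the true tail word; neg[0..n_neg-1]: ids of sampled negative tails. */
void triplet_update(float *vec_h, float *vec_r, float *ctx, long dim,
                    long t, const long *neg, int n_neg, float lr)
{
    float *sum = malloc(dim * sizeof(float));   /* X = V(w_i) + P(r) */
    float *err = calloc(dim, sizeof(float));    /* accumulated gradient e */
    for (long c = 0; c < dim; c++) sum[c] = vec_h[c] + vec_r[c];

    for (int d = 0; d < n_neg + 1; d++) {
        long target = (d == 0) ? t : neg[d - 1];    /* d == 0: positive sample */
        int  label  = (d == 0) ? 1 : 0;
        float f = 0.0f;
        for (long c = 0; c < dim; c++) f += sum[c] * ctx[target * dim + c];
        float g = (label - sigmoidf(f)) * lr;       /* g = lr * (L^t(u) - sigma) */
        for (long c = 0; c < dim; c++) err[c] += g * ctx[target * dim + c];
        for (long c = 0; c < dim; c++) ctx[target * dim + c] += g * sum[c];
    }
    /* the same accumulated error moves both the word and the relation vector */
    for (long c = 0; c < dim; c++) { vec_h[c] += err[c]; vec_r[c] += err[c]; }
    free(sum); free(err);
}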