├── README.md
├── libfm-1.42
│   ├── src-fm_core
│   │   ├── fm_data.h
│   │   └── fm_model.h
│   ├── src-linfm-src
│   │   └── fm_learn_sgd_element.h
│   └── src-util
│       └── rlog.h
└── word2vec
    ├── .gitignore
    ├── huffman_test.cc
    └── word2vec.c

/README.md:
--------------------------------------------------------------------------------
# OpenSourceReading

Notes and summaries from studying open-source projects: learning from the source code.

## 1. word2vec

- [Annotated code](https://github.com/zhaozhiyong19890102/OpenSourceReading/tree/master/word2vec)
- [Machine learning implementations explained: the word2vec source code](https://blog.csdn.net/google19890102/article/details/51887344)

## 2. libfm-1.42

- [Annotated code](https://github.com/zhaozhiyong19890102/OpenSourceReading/tree/master/libfm-1.42)
- [Machine learning implementations explained: the libFM model code](https://blog.csdn.net/google19890102/article/details/72866290)
- [Machine learning implementations explained: an overview of libFM training](https://blog.csdn.net/google19890102/article/details/72866320)
- [Machine learning implementations explained: libFM training with SGD](https://blog.csdn.net/google19890102/article/details/72866334)
- [Machine learning implementations explained: libFM training with Adaptive Regularization](https://blog.csdn.net/google19890102/article/details/73301949)

## 3. liblbfgs-1.10

- [Machine learning implementations explained: the L-BFGS algorithm in liblbfgs](https://blog.csdn.net/google19890102/article/details/77187890)
--------------------------------------------------------------------------------
/libfm-1.42/src-fm_core/fm_data.h:
--------------------------------------------------------------------------------
// Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle
// Contact: srendle@libfm.org, http://www.libfm.org/
//
// This file is part of libFM.
//
// libFM is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// libFM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with libFM. If not, see <http://www.gnu.org/licenses/>.
//
//
// fm_data.h: Base data type of libFM

#ifndef FM_DATA_H_
#define FM_DATA_H_

typedef float FM_FLOAT; // the base floating-point type used throughout libFM

#endif /*FM_DATA_H_*/
--------------------------------------------------------------------------------
/libfm-1.42/src-fm_core/fm_model.h:
--------------------------------------------------------------------------------
// Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle
// Contact: srendle@libfm.org, http://www.libfm.org/
//
// This file is part of libFM.
//
// libFM is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// libFM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with libFM. If not, see <http://www.gnu.org/licenses/>.
//
//
// fm_model.h: Model for Factorization Machines
//
// Based on the publication(s):
// - Steffen Rendle (2010): Factorization Machines, in Proceedings of the 10th
//   IEEE International Conference on Data Mining (ICDM 2010), Sydney,
//   Australia.

#ifndef FM_MODEL_H_
#define FM_MODEL_H_

#include "../util/matrix.h"
#include "../util/fmatrix.h"

#include "fm_data.h"

// the FM model
class fm_model {
  private:
    DVector<double> m_sum, m_sum_sqr; // caches for the two inner sums of the interaction term
  public: // the model parameters
    double w0;        // bias (constant) term
    DVectorDouble w;  // weights of the linear terms
    DMatrixDouble v;  // factor matrix of the pairwise interaction terms

  public:
    // attributes
    // the following values should be set:
    uint num_attribute; // number of features

    bool k0, k1;    // whether the bias term and the linear terms are used
    int num_factor; // number of factors per interaction

    double reg0;       // regularization of the bias term
    double regw, regv; // regularization of the linear and interaction terms

    double init_stdev; // stdev used when initializing the parameters
    double init_mean;  // mean used when initializing the parameters

    // methods
    fm_model();   // constructor: sets the parameter defaults
    void debug(); // prints the current settings
    void init();  // initialization: draws the initial values of all coefficients
    // predict one sample
    double predict(sparse_row<FM_FLOAT>& x);
    double predict(sparse_row<FM_FLOAT>& x, DVector<double> &sum, DVector<double> &sum_sqr);
};

// constructor
fm_model::fm_model() {
  num_factor = 0;    // number of factors per interaction
  init_mean = 0;     // mean for the initialization
  init_stdev = 0.01; // stdev for the initialization
  reg0 = 0.0;        // regularization of the bias term
  regw = 0.0;        // regularization of the linear terms
  regv = 0.0;        // regularization of the interaction terms
  k0 = true;         // use the bias term
  k1 = true;         // use the linear terms
}

// prints the current settings (for debugging)
void fm_model::debug() {
  std::cout << "num_attributes=" << num_attribute << std::endl;
  std::cout << "use w0=" << k0 << std::endl;
  std::cout << "use w1=" << k1 << std::endl;
  std::cout << "dim v =" << num_factor << std::endl;
  std::cout << "reg_w0=" << reg0 << std::endl;
  std::cout << "reg_w=" << regw << std::endl;
  std::cout << "reg_v=" << regv << std::endl;
  std::cout << "init ~ N(" << init_mean << "," << init_stdev << ")" << std::endl;
}

// initializes the model parameters
void fm_model::init() {
  w0 = 0;                               // bias term
  w.setSize(num_attribute);             // one linear weight per feature
  v.setSize(num_factor, num_attribute); // factor matrix: num_factor x num_attribute
  w.init(0);                            // linear weights start at 0
  v.init(init_mean, init_stdev);        // factors are drawn from N(init_mean, init_stdev)
  // the two per-factor caches of the interaction term
  m_sum.setSize(num_factor);
  m_sum_sqr.setSize(num_factor);
}

// predicts one sample; x is one (sparse) data row
double fm_model::predict(sparse_row<FM_FLOAT>& x) {
  return predict(x, m_sum, m_sum_sqr);
}

double fm_model::predict(sparse_row<FM_FLOAT>& x, DVector<double> &sum, DVector<double> &sum_sqr) {
  double result = 0; // the prediction
  // part 1: the bias term
  if (k0) {
    result += w0;
  }

  // part 2: the linear terms
  if (k1) {
    for (uint i = 0; i < x.size; i++) { // for every non-zero feature of the sample
      assert(x.data[i].id < num_attribute); // sanity check on the feature index
      // w * x
      result += w(x.data[i].id) * x.data[i].value;
    }
  }

  // part 3: the pairwise interactions, computed factor by factor
  // (see the note after this file)
  for (int f = 0; f < num_factor; f++) { // outer loop over the factors
    sum(f) = 0;
    sum_sqr(f) = 0;
    for (uint i = 0; i < x.size; i++) {
      double d = v(f,x.data[i].id) * x.data[i].value;
      sum(f) += d;
      sum_sqr(f) += d*d;
    }
    result += 0.5 * (sum(f)*sum(f) - sum_sqr(f)); // contribution of factor f
  }
  return result;
}

#endif /*FM_MODEL_H_*/
--------------------------------------------------------------------------------
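A note on the interaction term: the factor loop in predict() implements Rendle's
O(k*n) reformulation of the pairwise interactions (from the paper cited in the
header). With k = num_factor and the sums running over the non-zero features of x:

    \hat{y}(x) = w_0 + \sum_i w_i x_i + \sum_{i<j} \langle v_i, v_j \rangle x_i x_j

    \sum_{i<j} \langle v_i, v_j \rangle x_i x_j
      = \frac{1}{2} \sum_{f=1}^{k} \left[ \left( \sum_i v_{i,f} x_i \right)^2 - \sum_i v_{i,f}^2 x_i^2 \right]

sum(f) accumulates the first inner sum and sum_sqr(f) the second, so one
prediction costs O(k * nnz(x)) instead of the O(k * nnz(x)^2) of the naive
double loop over feature pairs.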
/libfm-1.42/src-linfm-src/fm_learn_sgd_element.h:
--------------------------------------------------------------------------------
// Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle
// Contact: srendle@libfm.org, http://www.libfm.org/
//
// This file is part of libFM.
//
// libFM is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// libFM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with libFM. If not, see <http://www.gnu.org/licenses/>.
//
//
// fm_learn_sgd_element.h: Stochastic Gradient Descent based learning for
// classification and regression
//
// Based on the publication(s):
// - Steffen Rendle (2010): Factorization Machines, in Proceedings of the 10th
//   IEEE International Conference on Data Mining (ICDM 2010), Sydney,
//   Australia.

#ifndef FM_LEARN_SGD_ELEMENT_H_
#define FM_LEARN_SGD_ELEMENT_H_

#include "fm_learn_sgd.h"

// inherits from fm_learn_sgd
class fm_learn_sgd_element: public fm_learn_sgd {
  public:
    // initialization
    virtual void init() {
      fm_learn_sgd::init();
      // logging
      if (log != NULL) {
        log->addField("rmse_train", std::numeric_limits<double>::quiet_NaN());
      }
    }
    // train the FM model with SGD
    virtual void learn(Data& train, Data& test) {
      fm_learn_sgd::learn(train, test); // prints the learner's settings

      std::cout << "SGD: DON'T FORGET TO SHUFFLE THE ROWS IN TRAINING DATA TO GET THE BEST RESULTS." << std::endl;
      // SGD
      for (int i = 0; i < num_iter; i++) { // one pass over the data per iteration
        double iteration_time = getusertime(); // start time
        for (train.data->begin(); !train.data->end(); train.data->next()) { // for every sample
          double p = fm->predict(train.data->getRow(), sum, sum_sqr); // prediction for this sample
          double mult = 0; // derivative of the loss w.r.t. the prediction (see the note after this file)
          if (task == 0) { // regression
            p = std::min(max_target, p);
            p = std::max(min_target, p);
            // loss = (y - p)^2
            mult = -(train.target(train.data->getRowIndex())-p); // d loss / d p
          } else if (task == 1) { // classification
            // logistic loss
            mult = -train.target(train.data->getRowIndex())*(1.0-1.0/(1.0+exp(-train.target(train.data->getRowIndex())*p)));
          }
          // gradient-descent update of all parameters
          SGD(train.data->getRow(), mult, sum);
        }
        iteration_time = (getusertime() - iteration_time); // elapsed time
        // evaluate() is inherited from fm_learn
        double rmse_train = evaluate(train); // evaluate on the training data
        double rmse_test = evaluate(test);   // evaluate on the test data
        std::cout << "#Iter=" << std::setw(3) << i << "\tTrain=" << rmse_train << "\tTest=" << rmse_test << std::endl;
        // logging
        if (log != NULL) {
          log->log("rmse_train", rmse_train);
          log->log("time_learn", iteration_time);
          log->newLine();
        }
      }
    }

};

#endif /*FM_LEARN_SGD_ELEMENT_H_*/
--------------------------------------------------------------------------------
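A note on mult: SGD() (defined in fm_learn_sgd.h, which is not part of this
selection) uses mult as the derivative of the per-sample loss with respect to
the prediction, i.e. every parameter receives a step proportional to
-alpha * mult * d\hat{y}/d\theta. For the two tasks above, with target y and
sigmoid \sigma:

    regression, l = \frac{1}{2}(y - \hat{y})^2:
        \frac{\partial l}{\partial \hat{y}} = -(y - \hat{y})

    classification, l = -\ln \sigma(y \hat{y}):
        \frac{\partial l}{\partial \hat{y}} = -y \, (1 - \sigma(y \hat{y}))

(The code comment writes the regression loss as (y - p)^2; its factor of 2 is
absorbed into the learning rate.)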
/libfm-1.42/src-util/rlog.h:
--------------------------------------------------------------------------------
// Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle
// Contact: srendle@libfm.org, http://www.libfm.org/
//
// This file is part of libFM.
//
// libFM is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// libFM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with libFM. If not, see <http://www.gnu.org/licenses/>.
//
//
// rlog.h: Logging into R compatible files

#ifndef RLOG_H_
#define RLOG_H_
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <algorithm>

// logging class (a usage sketch follows this file)
class RLog {
  private:
    std::ostream* out; // the output stream
    std::vector<std::string> header; // names of the fields to log
    std::map<std::string, double> default_value; // field name -> default value
    std::map<std::string, double> value; // field name -> value of the current row
  public:
    // constructor
    RLog(std::ostream* stream) {
      out = stream; // attach the output stream
      // clear all three containers
      header.clear();
      default_value.clear();
      value.clear();
    };

    // record a value for the given field in the current row
    void log(const std::string& field, double d) {
      value[field] = d;
    }

    // initialization: writes the header row
    void init() {
      if (out != NULL) { // the stream is open
        // write all field names, tab-separated
        for (uint i = 0; i < header.size(); i++) {
          *out << header[i];
          if (i < (header.size()-1)) {
            *out << "\t";
          } else {
            *out << "\n";
          }
        }
        out->flush();
      }
      // copy the defaults into the current row
      for (uint i = 0; i < header.size(); i++) {
        value[header[i]] = default_value[header[i]];
      }
    }

    // add a new field
    void addField(const std::string& field_name, double def) {
      //std::cout << field_name << std::endl; std::cout.flush();
      // check whether the field already exists
      std::vector<std::string>::iterator i = std::find(header.begin(), header.end(), field_name);
      if (i != header.end()) { // field_name is already registered
        throw "the field " + field_name + " already exists";
      }
      // not present yet
      header.push_back(field_name);    // register the field name
      default_value[field_name] = def; // remember its default value
    }

    // write out the current row, then start a new one
    void newLine() {
      if (out != NULL) { // the stream is open
        // write all values of the current row
        for (uint i = 0; i < header.size(); i++) {
          *out << value[header[i]];
          // separator
          if (i < (header.size()-1)) {
            *out << "\t";
          } else {
            *out << "\n";
          }
        }
        out->flush();
        value.clear(); // reset the row
        // re-fill the row with the defaults
        for (uint i = 0; i < header.size(); i++) {
          value[header[i]] = default_value[header[i]];
        }
      }
    }
};


#endif /*RLOG_H_*/
--------------------------------------------------------------------------------
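For reference, a minimal sketch of how a trainer drives RLog; the field names
mirror fm_learn_sgd_element.h, and the snippet assumes the 'uint' typedef that
libFM's util headers provide:

    #include <fstream>
    #include "rlog.h"

    int main() {
        std::ofstream stream("train.log");
        RLog log(&stream);               // any std::ostream works, e.g. &std::cout
        log.addField("rmse_train", 0.0); // register all fields before init()
        log.addField("time_learn", 0.0);
        log.init();                      // writes the tab-separated header row
        log.log("rmse_train", 0.93);     // fill the current row
        log.log("time_learn", 1.7);
        log.newLine();                   // writes the row, then resets to defaults
        return 0;
    }

The resulting tab-separated file with a header row is what "R compatible" in
the file comment refers to: it loads directly with R's read.table().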
/word2vec/.gitignore:
--------------------------------------------------------------------------------
.vscode/
--------------------------------------------------------------------------------
/word2vec/huffman_test.cc:
--------------------------------------------------------------------------------
#include <iostream>
#include <vector>
#include <cstdlib>
#include <cstring>
using namespace std;

#define MAX_CODE_LENGTH 10
#define vocab_size 6
#define vocab_max_size 10

struct vocab_word {
  long long cn;               // word count
  int *point;                 // path from the root to the leaf
  char *word, *code, codelen; // the word, its Huffman code, and the code length
};

struct vocab_word *vocab;

void CreateBinaryTree() {
  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
  char code[MAX_CODE_LENGTH];

  // space for 2x the vocabulary (a bit more than the 2*vocab_size-1 nodes actually needed)
  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));

  // initialize the two halves
  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; // first half: the word counts
  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; // second half: a sentinel larger than any count

  // two cursors:
  // pos1 walks the first half (the leaves, sorted by count) backwards
  // pos2 walks the second half (the internal nodes, created in order of increasing count) forwards
  pos1 = vocab_size - 1;
  pos2 = vocab_size;

  // Following algorithm constructs the Huffman tree by adding one node at a time
  for (a = 0; a < vocab_size - 1; a++) {
    // First, find two smallest nodes 'min1, min2'
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min1i = pos1;
        pos1--;
      } else {
        min1i = pos2;
        pos2++;
      }
    } else {
      min1i = pos2;
      pos2++;
    }
    // pick the second smallest node min2
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min2i = pos1;
        pos1--;
      } else {
        min2i = pos2;
        pos2++;
      }
    } else {
      min2i = pos2;
      pos2++;
    }

    count[vocab_size + a] = count[min1i] + count[min2i];
    // both children point at the new internal node
    parent_node[min1i] = vocab_size + a;
    parent_node[min2i] = vocab_size + a;
    binary[min2i] = 1; // the second (heavier) child gets code bit 1
  }
  // Now assign binary code to each vocabulary word
  for (a = 0; a < vocab_size*2; a++) { // dump every node (index 2*vocab_size-1 is unused and prints as zeros)
    // test output
    cout << a << ": count: " << count[a] << ", binary: " << binary[a] << ", parent_node: " << parent_node[a] << endl;
    /*b = a;
    i = 0;
    while (1) {
      code[i] = binary[b]; // code bit of the current node
      point[i] = b; // the leaf-to-root path
      i++;
      b = parent_node[b]; // climb to the parent
      if (b == vocab_size * 2 - 2) break; // reached the root, which carries no code bit
    }
    vocab[a].codelen = i; // code length of the word
    vocab[a].point[0] = vocab_size - 2; // the root
    for (b = 0; b < i; b++) {
      vocab[a].code[i - b - 1] = code[b]; // reverse the code into root-to-leaf order
      vocab[a].point[i - b] = point[b] - vocab_size; // the root-to-leaf path
    }*/
  }
  free(count);
  free(binary);
  free(parent_node);
}

int main() {
  vector<long long> cnt;
  cnt.push_back(8);
  cnt.push_back(5);
  cnt.push_back(4);
  cnt.push_back(3);
  cnt.push_back(1);
  cnt.push_back(1);
  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
  for (int i = 0; i < vocab_size; i++) {
    vocab[i].cn = cnt[i];
  }
  CreateBinaryTree(); // a hand trace of the merges for these counts follows the file
  return 0;
}
--------------------------------------------------------------------------------
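Working the merge loop through by hand for the counts {8, 5, 4, 3, 1, 1} used
in main() (indices 0-5 are the leaves; internal nodes 6-10 are created in order):

    merge 1: leaf 4 + leaf 5 (1+1)  -> node 6,  count 2
    merge 2: node 6 + leaf 3 (2+3)  -> node 7,  count 5
    merge 3: leaf 2 + node 7 (4+5)  -> node 8,  count 9
    merge 4: leaf 1 + leaf 0 (5+8)  -> node 9,  count 13
    merge 5: node 8 + node 9 (9+13) -> node 10, count 22 (the root, index 2*vocab_size-2)

The resulting code lengths are 2, 2, 2, 3, 4, 4 for the counts 8, 5, 4, 3, 1, 1:
the more frequent the word, the shorter its Huffman code, which is exactly the
property the hierarchical softmax in word2vec.c exploits.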
/word2vec/word2vec.c:
--------------------------------------------------------------------------------
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h> // clock(), clock_t

#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CODE_LENGTH 40

// global state

const int vocab_hash_size = 30000000;  // Maximum 30 * 0.7 = 21M words in the vocabulary

// floating-point type
typedef float real;                    // Precision of float numbers

// per-word record
struct vocab_word {
  long long cn;               // word count
  int *point;                 // path from the root to the leaf of the Huffman tree
  char *word, *code, codelen; // the word, its Huffman code, and the code length
};

char train_file[MAX_STRING], output_file[MAX_STRING]; // training file, output file
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
struct vocab_word *vocab; // the vocabulary with its counts

// defaults for the training parameters
int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
int *vocab_hash; // hash table: word hash -> vocabulary index
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;
real *syn0, *syn1, *syn1neg, *expTable;
clock_t start;

int hs = 0, negative = 5;
const int table_size = 1e8;
int *table;

// build the sampling table for negative sampling
void InitUnigramTable() {
  int a, i;
  double train_words_pow = 0;
  double d1, power = 0.75;
  table = (int *)malloc(table_size * sizeof(int));
  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
  // fill the table roulette-wheel style: word i occupies a share of slots
  // proportional to its damped frequency
  i = 0;
  d1 = pow(vocab[i].cn, power) / train_words_pow;
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    if (a / (double)table_size > d1) {
      i++;
      d1 += pow(vocab[i].cn, power) / train_words_pow;
    }
    if (i >= vocab_size) i = vocab_size - 1;
  }
}
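// The table realizes the smoothed unigram distribution that negative
// sampling draws from:
//
//     P(w_i) = cn_i^{3/4} / \sum_j cn_j^{3/4}
//
// The 3/4 exponent damps the head of the distribution. For the toy counts
// {8, 5, 4, 3, 1, 1} from huffman_test.cc, the raw frequencies of the most
// and least frequent words are 0.36 and 0.05; after damping they become
// roughly 0.31 and 0.07, so rare words are drawn as negatives a bit more
// often than their raw frequency would suggest.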

// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
void ReadWord(char *word, FILE *fin) {
  int a = 0, ch;
  while (!feof(fin)) {
    ch = fgetc(fin);
    if (ch == 13) continue; // carriage return, \r
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) { // a word is in progress: terminate it
        if (ch == '\n') ungetc(ch, fin);
        break;
      }
      if (ch == '\n') {
        strcpy(word, (char *)"</s>"); // a newline is returned as the sentence marker </s>
        return;
      } else continue;
    }
    word[a] = ch;
    a++;
    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
  }
  word[a] = 0;
}

// Returns hash value of a word
int GetWordHash(char *word) {
  unsigned long long a, hash = 0;
  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
  hash = hash % vocab_hash_size;
  return hash;
}

// Returns position of a word in the vocabulary; if the word is not found, returns -1
int SearchVocab(char *word) {
  unsigned int hash = GetWordHash(word);
  while (1) {
    if (vocab_hash[hash] == -1) return -1; // empty slot: the word is not present
    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; // found: return its index
    hash = (hash + 1) % vocab_hash_size; // linear probing
  }
  return -1; // not reached
}

// Reads a word and returns its index in the vocabulary
int ReadWordIndex(FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin);
  if (feof(fin)) return -1;
  return SearchVocab(word);
}
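// GetWordHash is a polynomial rolling hash with base 257, reduced modulo
// vocab_hash_size; e.g. for "ab" it computes ('a' * 257 + 'b') mod 30000000
// = (97 * 257 + 98) mod 30000000 = 25027. SearchVocab and AddWordToVocab
// resolve collisions with linear probing (open addressing). Lookups always
// terminate because the vocabulary is shrunk (ReduceVocab) whenever it
// exceeds 70% of the table, so an empty (-1) slot is always reachable.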

// Adds a word to the vocabulary
int AddWordToVocab(char *word) {
  unsigned int hash, length = strlen(word) + 1; // word length plus the terminating NUL
  if (length > MAX_STRING) length = MAX_STRING;
  vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); // append at the current end
  strcpy(vocab[vocab_size].word, word);
  vocab[vocab_size].cn = 0;
  vocab_size++;
  // Reallocate memory if needed
  if (vocab_size + 2 >= vocab_max_size) {
    vocab_max_size += 1000;
    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
  }
  hash = GetWordHash(word); // hash the new word
  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; // linear probing on collision
  vocab_hash[hash] = vocab_size - 1; // word hash -> index in the vocabulary
  return vocab_size - 1;
}

// Used later for sorting by word counts
int VocabCompare(const void *a, const void *b) {
  return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}

// Sorts the vocabulary by frequency using word counts
void SortVocab() {
  int a, size;
  unsigned int hash;
  // Sort the vocabulary and keep </s> at the first position
  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
  // the hash table has to be rebuilt after sorting
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  size = vocab_size;
  train_words = 0;
  for (a = 0; a < size; a++) {
    // Words occurring less than min_count times will be discarded from the vocab
    if ((vocab[a].cn < min_count) && (a != 0)) {
      vocab_size--;
      free(vocab[a].word);
    } else {
      // Hash will be re-computed, as after the sorting it is not actual
      hash=GetWordHash(vocab[a].word);
      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
      vocab_hash[hash] = a;
      train_words += vocab[a].cn;
    }
  }
  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
  // Allocate memory for the binary tree construction
  for (a = 0; a < vocab_size; a++) {
    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
  }
}

// Reduces the vocabulary by removing infrequent tokens
void ReduceVocab() {
  int a, b = 0;
  unsigned int hash;
  // controlled by min_reduce, which grows with every call
  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
    vocab[b].cn = vocab[a].cn;
    vocab[b].word = vocab[a].word;
    b++;
  } else free(vocab[a].word);
  vocab_size = b; // number of words kept
  // rebuild the hash table
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  for (a = 0; a < vocab_size; a++) {
    // Hash will be re-computed, as it is not actual
    hash = GetWordHash(vocab[a].word);
    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    vocab_hash[hash] = a;
  }
  fflush(stdout);
  min_reduce++;
}
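// Note that VocabCompare narrows the difference of two long long counts to
// int, which can overflow once counts differ by more than ~2^31. A small
// overflow-safe variant (not part of the original source) would be:
//
//     int VocabCompareSafe(const void *a, const void *b) {
//       long long ca = ((struct vocab_word *)a)->cn;
//       long long cb = ((struct vocab_word *)b)->cn;
//       return (cb > ca) - (cb < ca); // descending by count
//     }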

// Create binary Huffman tree using the word counts
// Frequent words will have short unique binary codes
void CreateBinaryTree() {
  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
  char code[MAX_CODE_LENGTH];

  // space for 2x the vocabulary (a bit more than the 2*vocab_size-1 nodes actually needed)
  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));

  // initialize the two halves
  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; // first half: the word counts
  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; // second half: a sentinel larger than any count

  // two cursors:
  // pos1 walks the first half (the leaves, sorted by count) backwards
  // pos2 walks the second half (the internal nodes, created in order of increasing count) forwards
  pos1 = vocab_size - 1;
  pos2 = vocab_size;

  // Following algorithm constructs the Huffman tree by adding one node at a time
  for (a = 0; a < vocab_size - 1; a++) {
    // First, find two smallest nodes 'min1, min2'
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min1i = pos1;
        pos1--;
      } else {
        min1i = pos2;
        pos2++;
      }
    } else {
      min1i = pos2;
      pos2++;
    }
    // pick the second smallest node min2
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min2i = pos1;
        pos1--;
      } else {
        min2i = pos2;
        pos2++;
      }
    } else {
      min2i = pos2;
      pos2++;
    }

    count[vocab_size + a] = count[min1i] + count[min2i];
    // both children point at the new internal node
    parent_node[min1i] = vocab_size + a;
    parent_node[min2i] = vocab_size + a;
    binary[min2i] = 1; // the second (heavier) child gets code bit 1
  }
  // Now assign binary code to each vocabulary word
  for (a = 0; a < vocab_size; a++) {
    b = a;
    i = 0;
    while (1) {
      code[i] = binary[b]; // code bit of the current node
      point[i] = b; // the leaf-to-root path
      i++;
      b = parent_node[b]; // climb to the parent
      if (b == vocab_size * 2 - 2) break; // reached the root, which carries no code bit
    }
    vocab[a].codelen = i; // code length of the word
    vocab[a].point[0] = vocab_size - 2; // the root (index 2*vocab_size-2, shifted down by vocab_size)
    for (b = 0; b < i; b++) {
      vocab[a].code[i - b - 1] = code[b]; // reverse the code into root-to-leaf order
      vocab[a].point[i - b] = point[b] - vocab_size; // internal-node ids shifted by vocab_size, so they index rows of syn1
    }
  }
  free(count);
  free(binary);
  free(parent_node);
}

// read the training file and build the vocabulary from it
void LearnVocabFromTrainFile() {
  char word[MAX_STRING]; // buffer for one word
  FILE *fin;
  long long a, i;

  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; // initialization

  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0; // number of distinct words seen so far

  AddWordToVocab((char *)"</s>"); // make sure </s> occupies index 0

  // read the file word by word
  while (1) {
    ReadWord(word, fin); // read one word
    if (feof(fin)) break;
    train_words++; // total number of tokens
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("%lldK%c", train_words / 1000, 13);
      fflush(stdout);
    }
    i = SearchVocab(word); // position of the word in the vocabulary
    if (i == -1) { // unseen word
      a = AddWordToVocab(word);
      vocab[a].cn = 1; // first occurrence
    } else vocab[i].cn++; // count += 1

    // if the vocabulary outgrows the hash table, drop the rarest words
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  SortVocab(); // sort the vocabulary by frequency
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  file_size = ftell(fin);
  fclose(fin);
}

// save the vocabulary: one "word count" pair per line
void SaveVocab() {
  long long i;
  FILE *fo = fopen(save_vocab_file, "wb");
  for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
  fclose(fo);
}

void ReadVocab() {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(read_vocab_file, "rb");
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; // reset the hash table
  vocab_size = 0;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    a = AddWordToVocab(word);
    fscanf(fin, "%lld%c", &vocab[a].cn, &c); // read the count; c consumes the newline
    i++;
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  // the training file is still opened once to determine its size
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  fseek(fin, 0, SEEK_END);
  file_size = ftell(fin);
  fclose(fin);
}

// initialize the network, in two parts:
// 1) the word vectors; 2) the weights from the projection layer to the output layer
void InitNet() {
  long long a, b;
  unsigned long long next_random = 1;

  // allocate the word-vector matrix
  // posix_memalign is an aligned malloc; the last argument is the size in bytes
  a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
  if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}

  // hierarchical softmax
  if (hs) {
    // weights between the projection layer and the output layer
    a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
     syn1[a * layer1_size + b] = 0; // these weights start at 0
  }

  // negative sampling
  if (negative>0) {
    a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
     syn1neg[a * layer1_size + b] = 0;
  }

  // random initialization of the word vectors
  for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
    next_random = next_random * (unsigned long long)25214903917 + 11;
    // 1. & 0xFFFF keeps the low 16 bits
    // 2. dividing by 65536 maps them into [0, 1)
    // 3. -0.5 centers the value; /layer1_size scales it down
    syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
  }

  // build the Huffman tree
  CreateBinaryTree();
}
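// The recurrence next_random = next_random * 25214903917 + 11 reappears all
// over this file: it is a linear congruential generator with the same
// multiplier (0x5DEECE66D) and increment as java.util.Random, here running
// modulo 2^64 through unsigned overflow. Taking the low 16 bits and dividing
// by 65536 yields an approximately uniform value in [0, 1), so each syn0
// component starts uniformly in [-0.5/layer1_size, 0.5/layer1_size], while
// syn1/syn1neg start at exactly zero.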

void *TrainModelThread(void *id) {
  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;

  // layer1_size is the dimensionality of the word vectors
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));  // the projection-layer activations (CBOW)
  real *neu1e = (real *)calloc(layer1_size, sizeof(real)); // gradient accumulator for the input vectors

  FILE *fi = fopen(train_file, "rb");
  // the training file is split between the threads; each thread trains on its own slice
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);

  // core training loop
  while (1) {
    // every 10000 words, report progress and recompute the learning rate
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now=clock();
        printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
         word_count_actual / (real)(iter * train_words + 1) * 100,
         word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      // decay alpha linearly with training progress
      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
      // floor the learning rate
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }

    // sentence_length == 0 means no text is buffered yet
    // read up to MAX_SENTENCE_LENGTH words from the file
    if (sentence_length == 0) {
      while (1) {
        word = ReadWordIndex(fi); // index of the word in the vocabulary

        if (feof(fi)) break;
        if (word == -1) continue; // unknown word
        word_count++;

        if (word == 0) break; // </s>: end of sentence

        // The subsampling randomly discards frequent words while keeping the ranking same
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }

        sen[sentence_length] = word; // buffer the vocabulary index
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break; // the buffer is full
      }
      sentence_position = 0; // rewind to the start of the buffered sentence
    }
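    // The subsampling above keeps word w with probability
    //
    //     p(w) = (sqrt(f(w)/t) + 1) * t / f(w)
    //
    // where f(w) = cn / train_words is the corpus frequency and t = sample.
    // Words with f(w) below roughly 2.6*t have p(w) >= 1 and are always kept;
    // very frequent words are aggressively thinned, which speeds up training
    // and improves the vectors of the rarer words.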

    // this thread has consumed its slice of the file
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      // one full pass of this thread is done
      local_iter--;
      if (local_iter == 0) break; // all iterations finished
      // reset the counters for the next pass
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      // rewind the file pointer to the start of this thread's slice
      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
      continue;
    }

    // sen holds the vocabulary indices of the buffered sentence
    word = sen[sentence_position]; // the current center word
    if (word == -1) continue;

    // reset the projection layer
    for (c = 0; c < layer1_size; c++) neu1[c] = 0; // projection-layer activations
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;

    // draw a random number in 0..window-1 (shrinks the effective window)
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;

    // model training
    if (cbow) {  // train the CBOW model
      // in -> hidden
      // input layer to projection layer: average the context vectors
      cw = 0;
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a; // sentence_position is the center position
        // skip out-of-sentence positions
        if (c < 0) continue;
        if (c >= sentence_length) continue;

        last_word = sen[c]; // vocabulary index of the context word
        if (last_word == -1) continue;

        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size]; // accumulate
        cw++;
      }

      if (cw) {
        for (c = 0; c < layer1_size; c++) neu1[c] /= cw; // take the average

        // the center word to predict is 'word'
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) { // walk the Huffman path of the center word
          // compute the output of this inner node
          f = 0;
          l2 = vocab[word].point[d] * layer1_size; // weight row of the d-th inner node
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];

          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; // sigmoid via lookup table

          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; // accumulate the gradient for the projection
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; // update the inner-node weights
        }
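        // At each inner node of the Huffman tree the code bit acts as the
        // label: with label = 1 - code[d] and f = sigma(neu1 . syn1_node),
        // g = (label - f) * alpha is exactly a logistic-regression gradient
        // step for that node. It is applied symmetrically: to the node
        // weights syn1 directly, and to the context word vectors via the
        // accumulator neu1e.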
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          // choose target and label
          if (d == 0) { // the positive example
            target = word;
            label = 1;
          } else { // draw a negative example
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size]; // sample from the unigram table
            // never use </s> as a negative; re-map it to a random word
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }

          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; // hidden -> output

          // g = (label - sigma(f)) * alpha, with the sigmoid clipped at +-MAX_EXP
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;

          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
        }
        // hidden -> in
        // the output-side weights are updated above; now push the error back into the word vectors
        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          // every context vector receives the accumulated gradient
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
      }
    } else {  // train the skip-gram model
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Propagate hidden -> output
          // for skip-gram the projection layer is the input word vector itself
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];

          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];

          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
        }

        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
      }
    }
    // advance within the buffered sentence; refill once it is exhausted
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}

// train the model
void TrainModel() {
  long a, b, c, d;
  FILE *fo;

  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); // worker threads

  printf("Starting training using file %s\n", train_file);
  starting_alpha = alpha;

  // two ways to obtain the vocabulary:
  // if a vocabulary file is given, read the words from it;
  // otherwise build the vocabulary from the training file
  if (read_vocab_file[0] != 0)
    ReadVocab();
  else
    LearnVocabFromTrainFile();

  if (save_vocab_file[0] != 0) SaveVocab(); // optionally save the vocabulary

  // without an output file there is nothing to do
  if (output_file[0] == 0) return;

  InitNet(); // initialize the network

  if (negative > 0) InitUnigramTable(); // negative sampling needs the unigram table

  // start training
  start = clock();
  for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
  for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);

  // write out the final result
  fo = fopen(output_file, "wb");
  if (classes == 0) {
    // Save the word vectors
    fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
    for (a = 0; a < vocab_size; a++) {
      fprintf(fo, "%s ", vocab[a].word);
      if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
      else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
      fprintf(fo, "\n");
    }
  } else {
    // Run K-means on the word vectors
    int clcn = classes, iter = 10, closeid;
    int *centcn = (int *)malloc(classes * sizeof(int));
    int *cl = (int *)calloc(vocab_size, sizeof(int));
    real closev, x;
    real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
    for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
    for (a = 0; a < iter; a++) {
      for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
      for (b = 0; b < clcn; b++) centcn[b] = 1;
      for (c = 0; c < vocab_size; c++) {
        for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
        centcn[cl[c]]++;
      }
      for (b = 0; b < clcn; b++) {
        closev = 0;
        for (c = 0; c < layer1_size; c++) {
          cent[layer1_size * b + c] /= centcn[b];
          closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
        }
        closev = sqrt(closev);
        for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
      }
      for (c = 0; c < vocab_size; c++) {
        closev = -10;
        closeid = 0;
        for (d = 0; d < clcn; d++) {
          x = 0;
          for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
          if (x > closev) {
            closev = x;
            closeid = d;
          }
        }
        cl[c] = closeid;
      }
    }
    // Save the K-means classes
    for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
    free(centcn);
    free(cent);
    free(cl);
  }
  fclose(fo);
}

// parse the command line: find an option and return its position in argv
int ArgPos(char *str, int argc, char **argv) {
  int a;
  for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { // look for the flag
    if (a == argc - 1) {
      printf("Argument missing for %s\n", str);
      exit(1);
    }
    return a; // found: its value is at the next position
  }
  return -1;
}

int main(int argc, char **argv) {
  int i;
  // with no arguments, print the usage text
  if (argc == 1) {
    printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
    printf("Options:\n");
    printf("Parameters for training:\n");
    printf("\t-train <file>\n");
    printf("\t\tUse text data from <file> to train the model\n");
    printf("\t-output <file>\n");
    printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
    printf("\t-size <int>\n");
    printf("\t\tSet size of word vectors; default is 100\n");
    printf("\t-window <int>\n");
    printf("\t\tSet max skip length between words; default is 5\n");
    printf("\t-sample <float>\n");
    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
    printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
    printf("\t-hs <int>\n");
    printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
    printf("\t-negative <int>\n");
    printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
    printf("\t-threads <int>\n");
    printf("\t\tUse <int> threads (default 12)\n");
    printf("\t-iter <int>\n");
    printf("\t\tRun more training iterations (default 5)\n");
    printf("\t-min-count <int>\n");
    printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
    printf("\t-alpha <float>\n");
    printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
    printf("\t-classes <int>\n");
    printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
    printf("\t-debug <int>\n");
    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
    printf("\t-binary <int>\n");
    printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
    printf("\t-save-vocab <file>\n");
    printf("\t\tThe vocabulary will be saved to <file>\n");
    printf("\t-read-vocab <file>\n");
    printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
    printf("\t-cbow <int>\n");
    printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n");
    printf("\nExamples:\n");
    printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n");
    return 0;
  }

  output_file[0] = 0;     // output file
  save_vocab_file[0] = 0; // file to save the vocabulary to
  read_vocab_file[0] = 0; // file to read a prebuilt vocabulary from

  // parse the options word2vec understands
  if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]);
  if (cbow) alpha = 0.05;
  if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);

  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); // the per-word records
  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); // the word hash table
  expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); // EXP_TABLE_SIZE + 1 entries
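  // expTable[i] holds the sigmoid sampled on a uniform grid over
  // [-MAX_EXP, MAX_EXP]: with x = (i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP,
  // expTable[i] = exp(x) / (exp(x) + 1) = sigma(x). During training, a dot
  // product f in (-MAX_EXP, MAX_EXP) is mapped back to an index via
  // (int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2)), trading a little
  // precision for not calling exp() on every update.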
  // precompute the sigmoid lookup table
  for (i = 0; i < EXP_TABLE_SIZE; i++) {
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
    expTable[i] = expTable[i] / (expTable[i] + 1);                   // Precompute f(x) = x / (x + 1)
  }

  // train the model
  TrainModel();
  return 0;
}
--------------------------------------------------------------------------------
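To build and try the trainer, something along these lines works (the file needs
libm and pthreads; optimization flags such as -O3 -funroll-loops mirror the
upstream word2vec makefile and are optional):

    gcc word2vec.c -o word2vec -lm -pthread -O3 -funroll-loops
    ./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3

The example invocation is the one printed by the program's own usage text.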