├── README.md
├── libfm-1.42
│   ├── src-fm_core
│   │   ├── fm_data.h
│   │   └── fm_model.h
│   ├── src-linfm-src
│   │   └── fm_learn_sgd_element.h
│   └── src-util
│       └── rlog.h
└── word2vec
    ├── .gitignore
    ├── huffman_test.cc
    └── word2vec.c

/README.md:
--------------------------------------------------------------------------------
# OpenSourceReading

Notes and summaries from studying open-source projects: learning from the source code.

## 1. word2vec

- [Annotated code](https://github.com/zhaozhiyong19890102/OpenSourceReading/tree/master/word2vec)
- [Machine learning implementations explained: the word2vec source code](https://blog.csdn.net/google19890102/article/details/51887344)

## 2. libfm-1.42

- [Annotated code](https://github.com/zhaozhiyong19890102/OpenSourceReading/tree/master/libfm-1.42)
- [Machine learning implementations explained: the libFM model code](https://blog.csdn.net/google19890102/article/details/72866290)
- [Machine learning implementations explained: an overview of libFM training](https://blog.csdn.net/google19890102/article/details/72866320)
- [Machine learning implementations explained: libFM training with SGD](https://blog.csdn.net/google19890102/article/details/72866334)
- [Machine learning implementations explained: libFM training with Adaptive Regularization](https://blog.csdn.net/google19890102/article/details/73301949)

## 3. liblbfgs-1.10

- [Machine learning implementations explained: the L-BFGS algorithm in liblbfgs](https://blog.csdn.net/google19890102/article/details/77187890)
--------------------------------------------------------------------------------
/libfm-1.42/src-fm_core/fm_data.h:
--------------------------------------------------------------------------------
// Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle
// Contact: srendle@libfm.org, http://www.libfm.org/
//
// This file is part of libFM.
//
// libFM is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// libFM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with libFM. If not, see <http://www.gnu.org/licenses/>.
//
//
// fm_data.h: Base data type of libFM

#ifndef FM_DATA_H_
#define FM_DATA_H_

typedef float FM_FLOAT; // the base floating-point type used throughout libFM

#endif /*FM_DATA_H_*/
--------------------------------------------------------------------------------
/libfm-1.42/src-fm_core/fm_model.h:
--------------------------------------------------------------------------------
// Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle
// Contact: srendle@libfm.org, http://www.libfm.org/
//
// This file is part of libFM.
//
// libFM is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// libFM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with libFM. If not, see <http://www.gnu.org/licenses/>.
//
//
// fm_model.h: Model for Factorization Machines
//
// Based on the publication(s):
// - Steffen Rendle (2010): Factorization Machines, in Proceedings of the 10th
//   IEEE International Conference on Data Mining (ICDM 2010), Sydney,
//   Australia.

#ifndef FM_MODEL_H_
#define FM_MODEL_H_

#include "../util/matrix.h"
#include "../util/fmatrix.h"

#include "fm_data.h"

// the FM model
class fm_model {
  private:
    DVector<double> m_sum, m_sum_sqr; // caches for the two inner sums of the interaction term
  public: // the model parameters
    double w0;        // bias (constant) term
    DVectorDouble w;  // weights of the linear terms
    DMatrixDouble v;  // factor matrix of the pairwise interaction terms

  public:
    // attributes
    // the following values should be set:
    uint num_attribute; // number of features

    bool k0, k1;    // whether the bias term and the linear terms are used
    int num_factor; // number of factors per interaction

    double reg0;       // regularization of the bias term
    double regw, regv; // regularization of the linear and interaction terms

    double init_stdev; // stdev used when initializing the parameters
    double init_mean;  // mean used when initializing the parameters

    // methods
    fm_model();   // constructor: sets the parameter defaults
    void debug(); // prints the current settings
    void init();  // initialization: draws the initial values of all coefficients
    // predict one sample
    double predict(sparse_row<FM_FLOAT>& x);
    double predict(sparse_row<FM_FLOAT>& x, DVector<double> &sum, DVector<double> &sum_sqr);
};

// constructor
fm_model::fm_model() {
  num_factor = 0;    // number of factors per interaction
  init_mean = 0;     // mean for the initialization
  init_stdev = 0.01; // stdev for the initialization
  reg0 = 0.0;        // regularization of the bias term
  regw = 0.0;        // regularization of the linear terms
  regv = 0.0;        // regularization of the interaction terms
  k0 = true;         // use the bias term
  k1 = true;         // use the linear terms
}

// prints the current settings (for debugging)
void fm_model::debug() {
  std::cout << "num_attributes=" << num_attribute << std::endl;
  std::cout << "use w0=" << k0 << std::endl;
  std::cout << "use w1=" << k1 << std::endl;
  std::cout << "dim v =" << num_factor << std::endl;
  std::cout << "reg_w0=" << reg0 << std::endl;
  std::cout << "reg_w=" << regw << std::endl;
  std::cout << "reg_v=" << regv << std::endl;
  std::cout << "init ~ N(" << init_mean << "," << init_stdev << ")" << std::endl;
}

// initializes the model parameters
void fm_model::init() {
  w0 = 0;                               // bias term
  w.setSize(num_attribute);             // one linear weight per feature
  v.setSize(num_factor, num_attribute); // factor matrix: num_factor x num_attribute
  w.init(0);                            // linear weights start at 0
  v.init(init_mean, init_stdev);        // factors are drawn from N(init_mean, init_stdev)
  // the two per-factor caches of the interaction term
  m_sum.setSize(num_factor);
  m_sum_sqr.setSize(num_factor);
}

// predicts one sample; x is one (sparse) data row
double fm_model::predict(sparse_row<FM_FLOAT>& x) {
  return predict(x, m_sum, m_sum_sqr);
}

double fm_model::predict(sparse_row<FM_FLOAT>& x, DVector<double> &sum, DVector<double> &sum_sqr) {
  double result = 0; // the prediction
  // part 1: the bias term
  if (k0) {
    result += w0;
  }

  // part 2: the linear terms
  if (k1) {
    for (uint i = 0; i < x.size; i++) { // for every non-zero feature of the sample
      assert(x.data[i].id < num_attribute); // sanity check on the feature index
      // w * x
      result += w(x.data[i].id) * x.data[i].value;
    }
  }

  // part 3: the pairwise interactions, computed factor by factor
  // (see the note after this file)
  for (int f = 0; f < num_factor; f++) { // outer loop over the factors
    sum(f) = 0;
    sum_sqr(f) = 0;
    for (uint i = 0; i < x.size; i++) {
      double d = v(f,x.data[i].id) * x.data[i].value;
      sum(f) += d;
      sum_sqr(f) += d*d;
    }
    result += 0.5 * (sum(f)*sum(f) - sum_sqr(f)); // contribution of factor f
  }
  return result;
}

#endif /*FM_MODEL_H_*/
--------------------------------------------------------------------------------
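A note on the interaction term: the factor loop in predict() implements Rendle's
O(k*n) reformulation of the pairwise interactions (from the paper cited in the
header). With k = num_factor and the sums running over the non-zero features of x:

    \hat{y}(x) = w_0 + \sum_i w_i x_i + \sum_{i<j} \langle v_i, v_j \rangle x_i x_j

    \sum_{i<j} \langle v_i, v_j \rangle x_i x_j
      = \frac{1}{2} \sum_{f=1}^{k} \left[ \left( \sum_i v_{i,f} x_i \right)^2 - \sum_i v_{i,f}^2 x_i^2 \right]

sum(f) accumulates the first inner sum and sum_sqr(f) the second, so one
prediction costs O(k * nnz(x)) instead of the O(k * nnz(x)^2) of the naive
double loop over feature pairs.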
/libfm-1.42/src-linfm-src/fm_learn_sgd_element.h:
--------------------------------------------------------------------------------
// Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle
// Contact: srendle@libfm.org, http://www.libfm.org/
//
// This file is part of libFM.
//
// libFM is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// libFM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with libFM. If not, see <http://www.gnu.org/licenses/>.
//
//
// fm_learn_sgd_element.h: Stochastic Gradient Descent based learning for
// classification and regression
//
// Based on the publication(s):
// - Steffen Rendle (2010): Factorization Machines, in Proceedings of the 10th
//   IEEE International Conference on Data Mining (ICDM 2010), Sydney,
//   Australia.

#ifndef FM_LEARN_SGD_ELEMENT_H_
#define FM_LEARN_SGD_ELEMENT_H_

#include "fm_learn_sgd.h"

// inherits from fm_learn_sgd
class fm_learn_sgd_element: public fm_learn_sgd {
  public:
    // initialization
    virtual void init() {
      fm_learn_sgd::init();
      // logging
      if (log != NULL) {
        log->addField("rmse_train", std::numeric_limits<double>::quiet_NaN());
      }
    }
    // train the FM model with SGD
    virtual void learn(Data& train, Data& test) {
      fm_learn_sgd::learn(train, test); // prints the learner's settings

      std::cout << "SGD: DON'T FORGET TO SHUFFLE THE ROWS IN TRAINING DATA TO GET THE BEST RESULTS." << std::endl;
      // SGD
      for (int i = 0; i < num_iter; i++) { // one pass over the data per iteration
        double iteration_time = getusertime(); // start time
        for (train.data->begin(); !train.data->end(); train.data->next()) { // for every sample
          double p = fm->predict(train.data->getRow(), sum, sum_sqr); // prediction for this sample
          double mult = 0; // derivative of the loss w.r.t. the prediction (see the note after this file)
          if (task == 0) { // regression
            p = std::min(max_target, p);
            p = std::max(min_target, p);
            // loss = (y - p)^2
            mult = -(train.target(train.data->getRowIndex())-p); // d loss / d p
          } else if (task == 1) { // classification
            // logistic loss
            mult = -train.target(train.data->getRowIndex())*(1.0-1.0/(1.0+exp(-train.target(train.data->getRowIndex())*p)));
          }
          // gradient-descent update of all parameters
          SGD(train.data->getRow(), mult, sum);
        }
        iteration_time = (getusertime() - iteration_time); // elapsed time
        // evaluate() is inherited from fm_learn
        double rmse_train = evaluate(train); // evaluate on the training data
        double rmse_test = evaluate(test);   // evaluate on the test data
        std::cout << "#Iter=" << std::setw(3) << i << "\tTrain=" << rmse_train << "\tTest=" << rmse_test << std::endl;
        // logging
        if (log != NULL) {
          log->log("rmse_train", rmse_train);
          log->log("time_learn", iteration_time);
          log->newLine();
        }
      }
    }

};

#endif /*FM_LEARN_SGD_ELEMENT_H_*/
--------------------------------------------------------------------------------
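A note on mult: SGD() (defined in fm_learn_sgd.h, which is not part of this
selection) uses mult as the derivative of the per-sample loss with respect to
the prediction, i.e. every parameter receives a step proportional to
-alpha * mult * d\hat{y}/d\theta. For the two tasks above, with target y and
sigmoid \sigma:

    regression, l = \frac{1}{2}(y - \hat{y})^2:
        \frac{\partial l}{\partial \hat{y}} = -(y - \hat{y})

    classification, l = -\ln \sigma(y \hat{y}):
        \frac{\partial l}{\partial \hat{y}} = -y \, (1 - \sigma(y \hat{y}))

(The code comment writes the regression loss as (y - p)^2; its factor of 2 is
absorbed into the learning rate.)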
/libfm-1.42/src-util/rlog.h:
--------------------------------------------------------------------------------
// Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle
// Contact: srendle@libfm.org, http://www.libfm.org/
//
// This file is part of libFM.
//
// libFM is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// libFM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with libFM. If not, see <http://www.gnu.org/licenses/>.
//
//
// rlog.h: Logging into R compatible files

#ifndef RLOG_H_
#define RLOG_H_
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <algorithm>

// logging class (a usage sketch follows this file)
class RLog {
  private:
    std::ostream* out; // the output stream
    std::vector<std::string> header; // names of the fields to log
    std::map<std::string, double> default_value; // field name -> default value
    std::map<std::string, double> value; // field name -> value of the current row
  public:
    // constructor
    RLog(std::ostream* stream) {
      out = stream; // attach the output stream
      // clear all three containers
      header.clear();
      default_value.clear();
      value.clear();
    };

    // record a value for the given field in the current row
    void log(const std::string& field, double d) {
      value[field] = d;
    }

    // initialization: writes the header row
    void init() {
      if (out != NULL) { // the stream is open
        // write all field names, tab-separated
        for (uint i = 0; i < header.size(); i++) {
          *out << header[i];
          if (i < (header.size()-1)) {
            *out << "\t";
          } else {
            *out << "\n";
          }
        }
        out->flush();
      }
      // copy the defaults into the current row
      for (uint i = 0; i < header.size(); i++) {
        value[header[i]] = default_value[header[i]];
      }
    }

    // add a new field
    void addField(const std::string& field_name, double def) {
      //std::cout << field_name << std::endl; std::cout.flush();
      // check whether the field already exists
      std::vector<std::string>::iterator i = std::find(header.begin(), header.end(), field_name);
      if (i != header.end()) { // field_name is already registered
        throw "the field " + field_name + " already exists";
      }
      // not present yet
      header.push_back(field_name);    // register the field name
      default_value[field_name] = def; // remember its default value
    }

    // write out the current row, then start a new one
    void newLine() {
      if (out != NULL) { // the stream is open
        // write all values of the current row
        for (uint i = 0; i < header.size(); i++) {
          *out << value[header[i]];
          // separator
          if (i < (header.size()-1)) {
            *out << "\t";
          } else {
            *out << "\n";
          }
        }
        out->flush();
        value.clear(); // reset the row
        // re-fill the row with the defaults
        for (uint i = 0; i < header.size(); i++) {
          value[header[i]] = default_value[header[i]];
        }
      }
    }
};


#endif /*RLOG_H_*/
--------------------------------------------------------------------------------
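For reference, a minimal sketch of how a trainer drives RLog; the field names
mirror fm_learn_sgd_element.h, and the snippet assumes the 'uint' typedef that
libFM's util headers provide:

    #include <fstream>
    #include "rlog.h"

    int main() {
        std::ofstream stream("train.log");
        RLog log(&stream);               // any std::ostream works, e.g. &std::cout
        log.addField("rmse_train", 0.0); // register all fields before init()
        log.addField("time_learn", 0.0);
        log.init();                      // writes the tab-separated header row
        log.log("rmse_train", 0.93);     // fill the current row
        log.log("time_learn", 1.7);
        log.newLine();                   // writes the row, then resets to defaults
        return 0;
    }

The resulting tab-separated file with a header row is what "R compatible" in
the file comment refers to: it loads directly with R's read.table().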
/word2vec/.gitignore:
--------------------------------------------------------------------------------
.vscode/
--------------------------------------------------------------------------------
/word2vec/huffman_test.cc:
--------------------------------------------------------------------------------
#include <iostream>
#include <vector>
#include <cstdlib>
#include <cstring>
using namespace std;

#define MAX_CODE_LENGTH 10
#define vocab_size 6
#define vocab_max_size 10

struct vocab_word {
  long long cn;               // word count
  int *point;                 // path from the root to the leaf
  char *word, *code, codelen; // the word, its Huffman code, and the code length
};

struct vocab_word *vocab;

void CreateBinaryTree() {
  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
  char code[MAX_CODE_LENGTH];

  // space for 2x the vocabulary (a bit more than the 2*vocab_size-1 nodes actually needed)
  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));

  // initialize the two halves
  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; // first half: the word counts
  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; // second half: a sentinel larger than any count

  // two cursors:
  // pos1 walks the first half (the leaves, sorted by count) backwards
  // pos2 walks the second half (the internal nodes, created in order of increasing count) forwards
  pos1 = vocab_size - 1;
  pos2 = vocab_size;

  // Following algorithm constructs the Huffman tree by adding one node at a time
  for (a = 0; a < vocab_size - 1; a++) {
    // First, find two smallest nodes 'min1, min2'
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min1i = pos1;
        pos1--;
      } else {
        min1i = pos2;
        pos2++;
      }
    } else {
      min1i = pos2;
      pos2++;
    }
    // pick the second smallest node min2
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min2i = pos1;
        pos1--;
      } else {
        min2i = pos2;
        pos2++;
      }
    } else {
      min2i = pos2;
      pos2++;
    }

    count[vocab_size + a] = count[min1i] + count[min2i];
    // both children point at the new internal node
    parent_node[min1i] = vocab_size + a;
    parent_node[min2i] = vocab_size + a;
    binary[min2i] = 1; // the second (heavier) child gets code bit 1
  }
  // Now assign binary code to each vocabulary word
  for (a = 0; a < vocab_size*2; a++) { // dump every node (index 2*vocab_size-1 is unused and prints as zeros)
    // test output
    cout << a << ": count: " << count[a] << ", binary: " << binary[a] << ", parent_node: " << parent_node[a] << endl;
    /*b = a;
    i = 0;
    while (1) {
      code[i] = binary[b]; // code bit of the current node
      point[i] = b; // the leaf-to-root path
      i++;
      b = parent_node[b]; // climb to the parent
      if (b == vocab_size * 2 - 2) break; // reached the root, which carries no code bit
    }
    vocab[a].codelen = i; // code length of the word
    vocab[a].point[0] = vocab_size - 2; // the root
    for (b = 0; b < i; b++) {
      vocab[a].code[i - b - 1] = code[b]; // reverse the code into root-to-leaf order
      vocab[a].point[i - b] = point[b] - vocab_size; // the root-to-leaf path
    }*/
  }
  free(count);
  free(binary);
  free(parent_node);
}

int main() {
  vector<long long> cnt;
  cnt.push_back(8);
  cnt.push_back(5);
  cnt.push_back(4);
  cnt.push_back(3);
  cnt.push_back(1);
  cnt.push_back(1);
  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
  for (int i = 0; i < vocab_size; i++) {
    vocab[i].cn = cnt[i];
  }
  CreateBinaryTree(); // a hand trace of the merges for these counts follows the file
  return 0;
}
--------------------------------------------------------------------------------
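Working the merge loop through by hand for the counts {8, 5, 4, 3, 1, 1} used
in main() (indices 0-5 are the leaves; internal nodes 6-10 are created in order):

    merge 1: leaf 4 + leaf 5 (1+1)  -> node 6,  count 2
    merge 2: node 6 + leaf 3 (2+3)  -> node 7,  count 5
    merge 3: leaf 2 + node 7 (4+5)  -> node 8,  count 9
    merge 4: leaf 1 + leaf 0 (5+8)  -> node 9,  count 13
    merge 5: node 8 + node 9 (9+13) -> node 10, count 22 (the root, index 2*vocab_size-2)

The resulting code lengths are 2, 2, 2, 3, 4, 4 for the counts 8, 5, 4, 3, 1, 1:
the more frequent the word, the shorter its Huffman code, which is exactly the
property the hierarchical softmax in word2vec.c exploits.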
/word2vec/word2vec.c:
--------------------------------------------------------------------------------
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h> // clock(), clock_t

#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CODE_LENGTH 40

// global state

const int vocab_hash_size = 30000000;  // Maximum 30 * 0.7 = 21M words in the vocabulary

// floating-point type
typedef float real;                    // Precision of float numbers

// per-word record
struct vocab_word {
  long long cn;               // word count
  int *point;                 // path from the root to the leaf of the Huffman tree
  char *word, *code, codelen; // the word, its Huffman code, and the code length
};

char train_file[MAX_STRING], output_file[MAX_STRING]; // training file, output file
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
struct vocab_word *vocab; // the vocabulary with its counts

// defaults for the training parameters
int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
int *vocab_hash; // hash table: word hash -> vocabulary index
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;
real *syn0, *syn1, *syn1neg, *expTable;
clock_t start;

int hs = 0, negative = 5;
const int table_size = 1e8;
int *table;

// build the sampling table for negative sampling
void InitUnigramTable() {
  int a, i;
  double train_words_pow = 0;
  double d1, power = 0.75;
  table = (int *)malloc(table_size * sizeof(int));
  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
  // fill the table roulette-wheel style: word i occupies a share of slots
  // proportional to its damped frequency
  i = 0;
  d1 = pow(vocab[i].cn, power) / train_words_pow;
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    if (a / (double)table_size > d1) {
      i++;
      d1 += pow(vocab[i].cn, power) / train_words_pow;
    }
    if (i >= vocab_size) i = vocab_size - 1;
  }
}
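// The table realizes the smoothed unigram distribution that negative
// sampling draws from:
//
//     P(w_i) = cn_i^{3/4} / \sum_j cn_j^{3/4}
//
// The 3/4 exponent damps the head of the distribution. For the toy counts
// {8, 5, 4, 3, 1, 1} from huffman_test.cc, the raw frequencies of the most
// and least frequent words are 0.36 and 0.05; after damping they become
// roughly 0.31 and 0.07, so rare words are drawn as negatives a bit more
// often than their raw frequency would suggest.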

// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
void ReadWord(char *word, FILE *fin) {
  int a = 0, ch;
  while (!feof(fin)) {
    ch = fgetc(fin);
    if (ch == 13) continue; // carriage return, \r
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) { // a word is in progress: terminate it
        if (ch == '\n') ungetc(ch, fin);
        break;
      }
      if (ch == '\n') {
        strcpy(word, (char *)"</s>"); // a newline is returned as the sentence marker </s>
        return;
      } else continue;
    }
    word[a] = ch;
    a++;
    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
  }
  word[a] = 0;
}

// Returns hash value of a word
int GetWordHash(char *word) {
  unsigned long long a, hash = 0;
  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
  hash = hash % vocab_hash_size;
  return hash;
}

// Returns position of a word in the vocabulary; if the word is not found, returns -1
int SearchVocab(char *word) {
  unsigned int hash = GetWordHash(word);
  while (1) {
    if (vocab_hash[hash] == -1) return -1; // empty slot: the word is not present
    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; // found: return its index
    hash = (hash + 1) % vocab_hash_size; // linear probing
  }
  return -1; // not reached
}

// Reads a word and returns its index in the vocabulary
int ReadWordIndex(FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin);
  if (feof(fin)) return -1;
  return SearchVocab(word);
}
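// GetWordHash is a polynomial rolling hash with base 257, reduced modulo
// vocab_hash_size; e.g. for "ab" it computes ('a' * 257 + 'b') mod 30000000
// = (97 * 257 + 98) mod 30000000 = 25027. SearchVocab and AddWordToVocab
// resolve collisions with linear probing (open addressing). Lookups always
// terminate because the vocabulary is shrunk (ReduceVocab) whenever it
// exceeds 70% of the table, so an empty (-1) slot is always reachable.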

// Adds a word to the vocabulary
int AddWordToVocab(char *word) {
  unsigned int hash, length = strlen(word) + 1; // word length plus the terminating NUL
  if (length > MAX_STRING) length = MAX_STRING;
  vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); // append at the current end
  strcpy(vocab[vocab_size].word, word);
  vocab[vocab_size].cn = 0;
  vocab_size++;
  // Reallocate memory if needed
  if (vocab_size + 2 >= vocab_max_size) {
    vocab_max_size += 1000;
    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
  }
  hash = GetWordHash(word); // hash the new word
  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; // linear probing on collision
  vocab_hash[hash] = vocab_size - 1; // word hash -> index in the vocabulary
  return vocab_size - 1;
}

// Used later for sorting by word counts
int VocabCompare(const void *a, const void *b) {
  return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}

// Sorts the vocabulary by frequency using word counts
void SortVocab() {
  int a, size;
  unsigned int hash;
  // Sort the vocabulary and keep </s> at the first position
  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
  // the hash table has to be rebuilt after sorting
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  size = vocab_size;
  train_words = 0;
  for (a = 0; a < size; a++) {
    // Words occurring less than min_count times will be discarded from the vocab
    if ((vocab[a].cn < min_count) && (a != 0)) {
      vocab_size--;
      free(vocab[a].word);
    } else {
      // Hash will be re-computed, as after the sorting it is not actual
      hash=GetWordHash(vocab[a].word);
      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
      vocab_hash[hash] = a;
      train_words += vocab[a].cn;
    }
  }
  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
  // Allocate memory for the binary tree construction
  for (a = 0; a < vocab_size; a++) {
    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
  }
}

// Reduces the vocabulary by removing infrequent tokens
void ReduceVocab() {
  int a, b = 0;
  unsigned int hash;
  // controlled by min_reduce, which grows with every call
  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
    vocab[b].cn = vocab[a].cn;
    vocab[b].word = vocab[a].word;
    b++;
  } else free(vocab[a].word);
  vocab_size = b; // number of words kept
  // rebuild the hash table
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  for (a = 0; a < vocab_size; a++) {
    // Hash will be re-computed, as it is not actual
    hash = GetWordHash(vocab[a].word);
    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    vocab_hash[hash] = a;
  }
  fflush(stdout);
  min_reduce++;
}
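// Note that VocabCompare narrows the difference of two long long counts to
// int, which can overflow once counts differ by more than ~2^31. A small
// overflow-safe variant (not part of the original source) would be:
//
//     int VocabCompareSafe(const void *a, const void *b) {
//       long long ca = ((struct vocab_word *)a)->cn;
//       long long cb = ((struct vocab_word *)b)->cn;
//       return (cb > ca) - (cb < ca); // descending by count
//     }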

// Create binary Huffman tree using the word counts
// Frequent words will have short unique binary codes
void CreateBinaryTree() {
  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
  char code[MAX_CODE_LENGTH];

  // space for 2x the vocabulary (a bit more than the 2*vocab_size-1 nodes actually needed)
  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));

  // initialize the two halves
  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; // first half: the word counts
  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; // second half: a sentinel larger than any count

  // two cursors:
  // pos1 walks the first half (the leaves, sorted by count) backwards
  // pos2 walks the second half (the internal nodes, created in order of increasing count) forwards
  pos1 = vocab_size - 1;
  pos2 = vocab_size;

  // Following algorithm constructs the Huffman tree by adding one node at a time
  for (a = 0; a < vocab_size - 1; a++) {
    // First, find two smallest nodes 'min1, min2'
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min1i = pos1;
        pos1--;
      } else {
        min1i = pos2;
        pos2++;
      }
    } else {
      min1i = pos2;
      pos2++;
    }
    // pick the second smallest node min2
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min2i = pos1;
        pos1--;
      } else {
        min2i = pos2;
        pos2++;
      }
    } else {
      min2i = pos2;
      pos2++;
    }

    count[vocab_size + a] = count[min1i] + count[min2i];
    // both children point at the new internal node
    parent_node[min1i] = vocab_size + a;
    parent_node[min2i] = vocab_size + a;
    binary[min2i] = 1; // the second (heavier) child gets code bit 1
  }
  // Now assign binary code to each vocabulary word
  for (a = 0; a < vocab_size; a++) {
    b = a;
    i = 0;
    while (1) {
      code[i] = binary[b]; // code bit of the current node
      point[i] = b; // the leaf-to-root path
      i++;
      b = parent_node[b]; // climb to the parent
      if (b == vocab_size * 2 - 2) break; // reached the root, which carries no code bit
    }
    vocab[a].codelen = i; // code length of the word
    vocab[a].point[0] = vocab_size - 2; // the root (index 2*vocab_size-2, shifted down by vocab_size)
    for (b = 0; b < i; b++) {
      vocab[a].code[i - b - 1] = code[b]; // reverse the code into root-to-leaf order
      vocab[a].point[i - b] = point[b] - vocab_size; // internal-node ids shifted by vocab_size, so they index rows of syn1
    }
  }
  free(count);
  free(binary);
  free(parent_node);
}

// read the training file and build the vocabulary from it
void LearnVocabFromTrainFile() {
  char word[MAX_STRING]; // buffer for one word
  FILE *fin;
  long long a, i;

  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; // initialization

  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0; // number of distinct words seen so far

  AddWordToVocab((char *)"</s>"); // make sure </s> occupies index 0

  // read the file word by word
  while (1) {
    ReadWord(word, fin); // read one word
    if (feof(fin)) break;
    train_words++; // total number of tokens
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("%lldK%c", train_words / 1000, 13);
      fflush(stdout);
    }
    i = SearchVocab(word); // position of the word in the vocabulary
    if (i == -1) { // unseen word
      a = AddWordToVocab(word);
      vocab[a].cn = 1; // first occurrence
    } else vocab[i].cn++; // count += 1

    // if the vocabulary outgrows the hash table, drop the rarest words
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  SortVocab(); // sort the vocabulary by frequency
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  file_size = ftell(fin);
  fclose(fin);
}

// save the vocabulary: one "word count" pair per line
void SaveVocab() {
  long long i;
  FILE *fo = fopen(save_vocab_file, "wb");
  for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
  fclose(fo);
}

void ReadVocab() {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(read_vocab_file, "rb");
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; // reset the hash table
  vocab_size = 0;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    a = AddWordToVocab(word);
    fscanf(fin, "%lld%c", &vocab[a].cn, &c); // read the count; c consumes the newline
    i++;
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  // the training file is still opened once to determine its size
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  fseek(fin, 0, SEEK_END);
  file_size = ftell(fin);
  fclose(fin);
}

// initialize the network, in two parts:
// 1) the word vectors; 2) the weights from the projection layer to the output layer
void InitNet() {
  long long a, b;
  unsigned long long next_random = 1;

  // allocate the word-vector matrix
  // posix_memalign is an aligned malloc; the last argument is the size in bytes
  a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
  if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}

  // hierarchical softmax
  if (hs) {
    // weights between the projection layer and the output layer
    a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
     syn1[a * layer1_size + b] = 0; // these weights start at 0
  }

  // negative sampling
  if (negative>0) {
    a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
     syn1neg[a * layer1_size + b] = 0;
  }

  // random initialization of the word vectors
  for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
    next_random = next_random * (unsigned long long)25214903917 + 11;
    // 1. & 0xFFFF keeps the low 16 bits
    // 2. dividing by 65536 maps them into [0, 1)
    // 3. -0.5 centers the value; /layer1_size scales it down
    syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
  }

  // build the Huffman tree
  CreateBinaryTree();
}
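// The recurrence next_random = next_random * 25214903917 + 11 reappears all
// over this file: it is a linear congruential generator with the same
// multiplier (0x5DEECE66D) and increment as java.util.Random, here running
// modulo 2^64 through unsigned overflow. Taking the low 16 bits and dividing
// by 65536 yields an approximately uniform value in [0, 1), so each syn0
// component starts uniformly in [-0.5/layer1_size, 0.5/layer1_size], while
// syn1/syn1neg start at exactly zero.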

void *TrainModelThread(void *id) {
  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;

  // layer1_size is the dimensionality of the word vectors
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));  // the projection-layer activations (CBOW)
  real *neu1e = (real *)calloc(layer1_size, sizeof(real)); // gradient accumulator for the input vectors

  FILE *fi = fopen(train_file, "rb");
  // the training file is split between the threads; each thread trains on its own slice
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);

  // core training loop
  while (1) {
    // every 10000 words, report progress and recompute the learning rate
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now=clock();
        printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
         word_count_actual / (real)(iter * train_words + 1) * 100,
         word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      // decay alpha linearly with training progress
      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
      // floor the learning rate
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }

    // sentence_length == 0 means no text is buffered yet
    // read up to MAX_SENTENCE_LENGTH words from the file
    if (sentence_length == 0) {
      while (1) {
        word = ReadWordIndex(fi); // index of the word in the vocabulary

        if (feof(fi)) break;
        if (word == -1) continue; // unknown word
        word_count++;

        if (word == 0) break; // </s>: end of sentence

        // The subsampling randomly discards frequent words while keeping the ranking same
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }

        sen[sentence_length] = word; // buffer the vocabulary index
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break; // the buffer is full
      }
      sentence_position = 0; // rewind to the start of the buffered sentence
    }
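    // The subsampling above keeps word w with probability
    //
    //     p(w) = (sqrt(f(w)/t) + 1) * t / f(w)
    //
    // where f(w) = cn / train_words is the corpus frequency and t = sample.
    // Words with f(w) below roughly 2.6*t have p(w) >= 1 and are always kept;
    // very frequent words are aggressively thinned, which speeds up training
    // and improves the vectors of the rarer words.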

    // this thread has consumed its slice of the file
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      // one full pass of this thread is done
      local_iter--;
      if (local_iter == 0) break; // all iterations finished
      // reset the counters for the next pass
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      // rewind the file pointer to the start of this thread's slice
      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
      continue;
    }

    // sen holds the vocabulary indices of the buffered sentence
    word = sen[sentence_position]; // the current center word
    if (word == -1) continue;

    // reset the projection layer
    for (c = 0; c < layer1_size; c++) neu1[c] = 0; // projection-layer activations
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;

    // draw a random number in 0..window-1 (shrinks the effective window)
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;

    // model training
    if (cbow) {  // train the CBOW model
      // in -> hidden
      // input layer to projection layer: average the context vectors
      cw = 0;
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a; // sentence_position is the center position
        // skip out-of-sentence positions
        if (c < 0) continue;
        if (c >= sentence_length) continue;

        last_word = sen[c]; // vocabulary index of the context word
        if (last_word == -1) continue;

        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size]; // accumulate
        cw++;
      }

      if (cw) {
        for (c = 0; c < layer1_size; c++) neu1[c] /= cw; // take the average

        // the center word to predict is 'word'
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) { // walk the Huffman path of the center word
          // compute the output of this inner node
          f = 0;
          l2 = vocab[word].point[d] * layer1_size; // weight row of the d-th inner node
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];

          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; // sigmoid via lookup table

          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; // accumulate the gradient for the projection
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; // update the inner-node weights
        }
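        // At each inner node of the Huffman tree the code bit acts as the
        // label: with label = 1 - code[d] and f = sigma(neu1 . syn1_node),
        // g = (label - f) * alpha is exactly a logistic-regression gradient
        // step for that node. It is applied symmetrically: to the node
        // weights syn1 directly, and to the context word vectors via the
        // accumulator neu1e.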
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          // choose target and label
          if (d == 0) { // the positive example
            target = word;
            label = 1;
          } else { // draw a negative example
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size]; // sample from the unigram table
            // never use </s> as a negative; re-map it to a random word
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }

          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; // hidden -> output

          // g = (label - sigma(f)) * alpha, with the sigmoid clipped at +-MAX_EXP
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;

          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
        }
        // hidden -> in
        // the output-side weights are updated above; now push the error back into the word vectors
        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          // every context vector receives the accumulated gradient
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
      }
    } else {  // train the skip-gram model
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Propagate hidden -> output
          // for skip-gram the projection layer is the input word vector itself
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];

          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];

          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
        }

        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
      }
    }
    // advance within the buffered sentence; refill once it is exhausted
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}

// train the model
void TrainModel() {
  long a, b, c, d;
  FILE *fo;

  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); // worker threads

  printf("Starting training using file %s\n", train_file);
  starting_alpha = alpha;

  // two ways to obtain the vocabulary:
  // if a vocabulary file is given, read the words from it;
  // otherwise build the vocabulary from the training file
  if (read_vocab_file[0] != 0)
    ReadVocab();
  else
    LearnVocabFromTrainFile();

  if (save_vocab_file[0] != 0) SaveVocab(); // optionally save the vocabulary

  // without an output file there is nothing to do
  if (output_file[0] == 0) return;

  InitNet(); // initialize the network

  if (negative > 0) InitUnigramTable(); // negative sampling needs the unigram table

  // start training
  start = clock();
  for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
  for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);

  // write out the final result
  fo = fopen(output_file, "wb");
  if (classes == 0) {
    // Save the word vectors
    fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
    for (a = 0; a < vocab_size; a++) {
      fprintf(fo, "%s ", vocab[a].word);
      if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
      else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
      fprintf(fo, "\n");
    }
  } else {
    // Run K-means on the word vectors
    int clcn = classes, iter = 10, closeid;
    int *centcn = (int *)malloc(classes * sizeof(int));
    int *cl = (int *)calloc(vocab_size, sizeof(int));
    real closev, x;
    real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
    for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
    for (a = 0; a < iter; a++) {
      for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
      for (b = 0; b < clcn; b++) centcn[b] = 1;
      for (c = 0; c < vocab_size; c++) {
        for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
        centcn[cl[c]]++;
      }
      for (b = 0; b < clcn; b++) {
        closev = 0;
        for (c = 0; c < layer1_size; c++) {
          cent[layer1_size * b + c] /= centcn[b];
          closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
        }
        closev = sqrt(closev);
        for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
      }
      for (c = 0; c < vocab_size; c++) {
        closev = -10;
        closeid = 0;
        for (d = 0; d < clcn; d++) {
          x = 0;
          for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
          if (x > closev) {
            closev = x;
            closeid = d;
          }
        }
        cl[c] = closeid;
      }
    }
    // Save the K-means classes
    for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
    free(centcn);
    free(cent);
    free(cl);
  }
  fclose(fo);
}

// parse the command line: find an option and return its position in argv
int ArgPos(char *str, int argc, char **argv) {
  int a;
  for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { // look for the flag
    if (a == argc - 1) {
      printf("Argument missing for %s\n", str);
      exit(1);
    }
    return a; // found: its value is at the next position
  }
  return -1;
}

int main(int argc, char **argv) {
  int i;
  // with no arguments, print the usage text
  if (argc == 1) {
    printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
    printf("Options:\n");
    printf("Parameters for training:\n");
    printf("\t-train <file>\n");
    printf("\t\tUse text data from <file> to train the model\n");
    printf("\t-output <file>\n");
    printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
    printf("\t-size <int>\n");
    printf("\t\tSet size of word vectors; default is 100\n");
    printf("\t-window <int>\n");
    printf("\t\tSet max skip length between words; default is 5\n");
    printf("\t-sample <float>\n");
    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
    printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
    printf("\t-hs <int>\n");
    printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
    printf("\t-negative <int>\n");
    printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
    printf("\t-threads <int>\n");
    printf("\t\tUse <int> threads (default 12)\n");
    printf("\t-iter <int>\n");
    printf("\t\tRun more training iterations (default 5)\n");
    printf("\t-min-count <int>\n");
    printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
    printf("\t-alpha <float>\n");
    printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
    printf("\t-classes <int>\n");
    printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
    printf("\t-debug <int>\n");
    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
    printf("\t-binary <int>\n");
    printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
    printf("\t-save-vocab <file>\n");
    printf("\t\tThe vocabulary will be saved to <file>\n");
    printf("\t-read-vocab <file>\n");
    printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
    printf("\t-cbow <int>\n");
    printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n");
    printf("\nExamples:\n");
    printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n");
    return 0;
  }

  output_file[0] = 0;     // output file
  save_vocab_file[0] = 0; // file to save the vocabulary to
  read_vocab_file[0] = 0; // file to read a prebuilt vocabulary from

  // parse the options word2vec understands
  if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]);
  if (cbow) alpha = 0.05;
  if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);

  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); // the per-word records
  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); // the word hash table
  expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); // EXP_TABLE_SIZE + 1 entries
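  // expTable[i] holds the sigmoid sampled on a uniform grid over
  // [-MAX_EXP, MAX_EXP]: with x = (i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP,
  // expTable[i] = exp(x) / (exp(x) + 1) = sigma(x). During training, a dot
  // product f in (-MAX_EXP, MAX_EXP) is mapped back to an index via
  // (int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2)), trading a little
  // precision for not calling exp() on every update.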
  // precompute the sigmoid lookup table
  for (i = 0; i < EXP_TABLE_SIZE; i++) {
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
    expTable[i] = expTable[i] / (expTable[i] + 1);                   // Precompute f(x) = x / (x + 1)
  }

  // train the model
  TrainModel();
  return 0;
}
--------------------------------------------------------------------------------
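To build and try the trainer, something along these lines works (the file needs
libm and pthreads; optimization flags such as -O3 -funroll-loops mirror the
upstream word2vec makefile and are optional):

    gcc word2vec.c -o word2vec -lm -pthread -O3 -funroll-loops
    ./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3

The example invocation is the one printed by the program's own usage text.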