├── .DS_Store
├── README.md
├── libfm-1.42
│   ├── src-fm_core
│   │   ├── fm_data.h
│   │   └── fm_model.h
│   ├── src-linfm-src
│   │   └── fm_learn_sgd_element.h
│   └── src-util
│       └── rlog.h
└── word2vec
    ├── .gitignore
    ├── huffman_test.cc
    └── word2vec.c
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaozhiyong19890102/OpenSourceReading/63ed4ee07c6f2627717920f28b9e51557e0be1b1/.DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OpenSourceReading
2 |
3 | 对一些开源代码的学习与总结,从源码中学习。
4 |
5 | ## 1、word2vec
6 |
7 | - [代码注释](https://github.com/zhaozhiyong19890102/OpenSourceReading/tree/master/word2vec)
8 | - [机器学习算法实现解析——word2vec源码解析](https://blog.csdn.net/google19890102/article/details/51887344 "机器学习算法实现解析——word2vec源码解析")
9 |
10 | ## 2、libfm-1.42
11 |
12 | - [代码注释](https://github.com/zhaozhiyong19890102/OpenSourceReading/tree/master/libfm-1.42)
13 | - [机器学习算法实现解析——libFM之libFM的模型处理部分](https://blog.csdn.net/google19890102/article/details/72866290 "机器学习算法实现解析——libFM之libFM的模型处理部分")
14 | - [机器学习算法实现解析——libFM之libFM的训练过程概述](https://blog.csdn.net/google19890102/article/details/72866320 "机器学习算法实现解析——libFM之libFM的训练过程概述")
15 | - [机器学习算法实现解析——libFM之libFM的训练过程之SGD的方法](https://blog.csdn.net/google19890102/article/details/72866334 "机器学习算法实现解析——libFM之libFM的训练过程之SGD的方法")
16 | - [机器学习算法实现解析——libFM之libFM的训练过程之Adaptive Regularization](https://blog.csdn.net/google19890102/article/details/73301949 "机器学习算法实现解析——libFM之libFM的训练过程之Adaptive Regularization")
17 |
18 | ## 3、liblbfgs-1.10
19 |
20 | - [机器学习算法实现解析——liblbfgs之L-BFGS算法](https://blog.csdn.net/google19890102/article/details/77187890 "机器学习算法实现解析——liblbfgs之L-BFGS算法")
--------------------------------------------------------------------------------
/libfm-1.42/src-fm_core/fm_data.h:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle
2 | // Contact: srendle@libfm.org, http://www.libfm.org/
3 | //
4 | // This file is part of libFM.
5 | //
6 | // libFM is free software: you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation, either version 3 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // libFM is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License
16 | // along with libFM. If not, see <http://www.gnu.org/licenses/>.
18 | //
19 | //
20 | // fm_data.h: Base data type of libFM
21 |
22 | #ifndef FM_DATA_H_
23 | #define FM_DATA_H_
24 |
25 | typedef float FM_FLOAT;// base floating-point type of libFM (alias for float)
26 |
27 | #endif /*FM_DATA_H_*/
28 |
--------------------------------------------------------------------------------
/libfm-1.42/src-fm_core/fm_model.h:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle
2 | // Contact: srendle@libfm.org, http://www.libfm.org/
3 | //
4 | // This file is part of libFM.
5 | //
6 | // libFM is free software: you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation, either version 3 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // libFM is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License
16 | // along with libFM. If not, see <http://www.gnu.org/licenses/>.
18 | //
19 | //
20 | // fm_model.h: Model for Factorization Machines
21 | //
22 | // Based on the publication(s):
23 | // - Steffen Rendle (2010): Factorization Machines, in Proceedings of the 10th
24 | // IEEE International Conference on Data Mining (ICDM 2010), Sydney,
25 | // Australia.
26 |
27 | #ifndef FM_MODEL_H_
28 | #define FM_MODEL_H_
29 |
30 | #include "../util/matrix.h"
31 | #include "../util/fmatrix.h"
32 |
33 | #include "fm_data.h"
34 |
35 | // fm_model: parameters, configuration and prediction routine of a factorization machine.
36 | class fm_model {
37 | private:
38 |     DVector<double> m_sum, m_sum_sqr;// per-factor scratch buffers used by predict(x): sum of v*x and sum of (v*x)^2
39 | public: // model parameters
40 |     double w0;// bias (constant term)
41 |     DVectorDouble w;// weights of the linear (first-order) terms
42 |     DMatrixDouble v;// factor matrix of the pairwise interaction terms, indexed as v(factor, attribute)
43 |
44 | public:
45 |     // configuration
46 |     // the following values should be set:
47 |     uint num_attribute;// number of features; not set by the constructor
48 |
49 |     bool k0, k1;// whether the bias term (k0) and the linear terms (k1) are used
50 |     int num_factor;// number of factors of the interaction terms
51 |
52 |     double reg0;// regularization parameter of the bias term
53 |     double regw, regv;// regularization parameters of the linear weights and of the interaction factors
54 |
55 |     double init_stdev;// standard deviation used when initializing v
56 |     double init_mean;// mean used when initializing v
57 |
58 |     // member functions
59 |     fm_model();// constructor: assigns default values to the configuration fields
60 |     void debug();// prints the configuration to std::cout
61 |     void init();// sizes and initializes w0, w, v and the scratch buffers
62 |     // predict the target of a single sample
63 |     double predict(sparse_row<FM_FLOAT>& x);
64 |     double predict(sparse_row<FM_FLOAT>& x, DVector<double> &sum, DVector<double> &sum_sqr);
65 | };
66 |
67 | // Constructor: default configuration. num_attribute and w0 are left for the caller / init().
68 | fm_model::fm_model() {
69 |     num_factor = 0;// no interaction factors by default
70 |     init_mean = 0;// mean of the initial factor values
71 |     init_stdev = 0.01;// standard deviation of the initial factor values
72 |     reg0 = 0.0;// regularization of the bias term
73 |     regw = 0.0;// regularization of the linear weights
74 |     regv = 0.0;// regularization of the interaction factors
75 |     k0 = true;// use the bias term
76 |     k1 = true;// use the linear terms
77 | }
78 |
79 | // debug: prints the current configuration, one field per line, to std::cout.
80 | void fm_model::debug() {
81 |     std::cout << "num_attributes=" << num_attribute << std::endl;
82 |     std::cout << "use w0=" << k0 << std::endl;
83 |     std::cout << "use w1=" << k1 << std::endl;
84 |     std::cout << "dim v =" << num_factor << std::endl;
85 |     std::cout << "reg_w0=" << reg0 << std::endl;
86 |     std::cout << "reg_w=" << regw << std::endl;
87 |     std::cout << "reg_v=" << regv << std::endl;
88 |     std::cout << "init ~ N(" << init_mean << "," << init_stdev << ")" << std::endl;
89 | }
90 |
91 | // init: allocates and initializes the parameters; num_attribute and num_factor must be set beforehand.
92 | void fm_model::init() {
93 |     w0 = 0;// bias starts at zero
94 |     w.setSize(num_attribute);// one linear weight per feature
95 |     v.setSize(num_factor, num_attribute);// num_factor rows, num_attribute columns
96 |     w.init(0);// linear weights start at zero
97 |     v.init(init_mean, init_stdev);// factors initialized from init_mean / init_stdev (debug() reports this as N(mean, stdev))
98 |     // the two scratch buffers hold one entry per factor
99 |     m_sum.setSize(num_factor);
100 |     m_sum_sqr.setSize(num_factor);
101 | }
102 |
103 | // Predicts one sample x (a single sparse row) using the model's own scratch buffers.
104 | double fm_model::predict(sparse_row<FM_FLOAT>& x) {
105 |     return predict(x, m_sum, m_sum_sqr);
106 | }
107 | // Predicts one sample x; on return sum(f) and sum_sqr(f) hold the per-factor sums of v*x and (v*x)^2.
108 | double fm_model::predict(sparse_row<FM_FLOAT>& x, DVector<double> &sum, DVector<double> &sum_sqr) {
109 |     double result = 0;// accumulated prediction
110 |     // part 1: bias
111 |     if (k0) {
112 |         result += w0;
113 |     }
114 |
115 |     // part 2: linear terms
116 |     if (k1) {
117 |         for (uint i = 0; i < x.size; i++) {// every non-zero entry of the sample
118 |             assert(x.data[i].id < num_attribute);// feature id must be in range
119 |             // w_id * x_id
120 |             result += w(x.data[i].id) * x.data[i].value;
121 |         }
122 |     }
123 |
124 |     // part 3: pairwise interactions
125 |     // per factor f: 0.5 * ((sum_i v(f,i)*x_i)^2 - sum_i (v(f,i)*x_i)^2)
126 |     for (int f = 0; f < num_factor; f++) {// outer loop over the factors
127 |         sum(f) = 0;
128 |         sum_sqr(f) = 0;
129 |         for (uint i = 0; i < x.size; i++) {// inner loop over the entries of the sample
130 |             double d = v(f,x.data[i].id) * x.data[i].value;
131 |             sum(f) += d;
132 |             sum_sqr(f) += d*d;
133 |         }
134 |         result += 0.5 * (sum(f)*sum(f) - sum_sqr(f));// contribution of factor f to the interaction term
135 |     }
136 |     return result;
137 | }
138 |
139 | #endif /*FM_MODEL_H_*/
140 |
--------------------------------------------------------------------------------
/libfm-1.42/src-linfm-src/fm_learn_sgd_element.h:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle
2 | // Contact: srendle@libfm.org, http://www.libfm.org/
3 | //
4 | // This file is part of libFM.
5 | //
6 | // libFM is free software: you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation, either version 3 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // libFM is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License
16 | // along with libFM. If not, see <http://www.gnu.org/licenses/>.
18 | //
19 | //
20 | // fm_learn_sgd_element.h: Stochastic Gradient Descent based learning for
21 | // classification and regression
22 | //
23 | // Based on the publication(s):
24 | // - Steffen Rendle (2010): Factorization Machines, in Proceedings of the 10th
25 | // IEEE International Conference on Data Mining (ICDM 2010), Sydney,
26 | // Australia.
27 |
28 | #ifndef FM_LEARN_SGD_ELEMENT_H_
29 | #define FM_LEARN_SGD_ELEMENT_H_
30 |
31 | #include "fm_learn_sgd.h"
32 |
33 | // fm_learn_sgd_element: SGD learner derived from fm_learn_sgd that updates the model after every training row.
34 | class fm_learn_sgd_element: public fm_learn_sgd {
35 | public:
36 |     // init: runs the base-class initialization and registers the "rmse_train" log field.
37 |     virtual void init() {
38 |         fm_learn_sgd::init();
39 |         // logging is optional; the field defaults to NaN until a value is logged
40 |         if (log != NULL) {
41 |             log->addField("rmse_train", std::numeric_limits<double>::quiet_NaN());
42 |         }
43 |     }
44 |     // learn: trains the FM model with SGD for num_iter passes over train, evaluating on train and test after each pass.
45 |     virtual void learn(Data& train, Data& test) {
46 |         fm_learn_sgd::learn(train, test);// base-class part (presumably prints the learner's parameters -- confirm in fm_learn_sgd)
47 |
48 |         std::cout << "SGD: DON'T FORGET TO SHUFFLE THE ROWS IN TRAINING DATA TO GET THE BEST RESULTS." << std::endl;
49 |         // SGD
50 |         for (int i = 0; i < num_iter; i++) {// one pass over the training data per iteration
51 |             double iteration_time = getusertime();// start time of this pass
52 |             for (train.data->begin(); !train.data->end(); train.data->next()) {// every training row
53 |                 double p = fm->predict(train.data->getRow(), sum, sum_sqr);// model prediction; also fills sum and sum_sqr
54 |                 double mult = 0;// derivative of the loss with respect to the prediction
55 |                 if (task == 0) {// regression
56 |                     p = std::min(max_target, p);
57 |                     p = std::max(min_target, p);
58 |                     // prediction clamped to [min_target, max_target]; squared loss (y - p)^2
59 |                     mult = -(train.target(train.data->getRowIndex())-p);// -(y - p): squared-loss gradient up to the constant factor 2
60 |                 } else if (task == 1) {// classification
61 |                     // -y * (1 - sigmoid(y*p)): gradient of the logistic loss -ln(sigmoid(y*p)); presumably targets are -1/+1
62 |                     mult = -train.target(train.data->getRowIndex())*(1.0-1.0/(1.0+exp(-train.target(train.data->getRowIndex())*p)));
63 |                 }
64 |                 // gradient step on the model parameters for this row
65 |                 SGD(train.data->getRow(), mult, sum);
66 |             }
67 |             iteration_time = (getusertime() - iteration_time);// time spent on this pass
68 |             // evaluate() is not defined in this class (presumably inherited from fm_learn -- confirm)
69 |             double rmse_train = evaluate(train);// score on the training data
70 |             double rmse_test = evaluate(test);// score on the test data
71 |             std::cout << "#Iter=" << std::setw(3) << i << "\tTrain=" << rmse_train << "\tTest=" << rmse_test << std::endl;
72 |             // optional per-iteration log record
73 |             if (log != NULL) {
74 |                 log->log("rmse_train", rmse_train);
75 |                 log->log("time_learn", iteration_time);
76 |                 log->newLine();
77 |             }
78 |         }
79 |     }
80 |
81 | };
82 |
83 | #endif /*FM_LEARN_SGD_ELEMENT_H_*/
84 |
--------------------------------------------------------------------------------
/libfm-1.42/src-util/rlog.h:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle
2 | // Contact: srendle@libfm.org, http://www.libfm.org/
3 | //
4 | // This file is part of libFM.
5 | //
6 | // libFM is free software: you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation, either version 3 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // libFM is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License
16 | // along with libFM. If not, see <http://www.gnu.org/licenses/>.
18 | //
19 | //
20 | // rlog.h: Logging into R compatible files
21 |
22 | #ifndef RLOG_H_
23 | #define RLOG_H_
24 | #include
25 | #include
26 | #include
27 | #include