├── .gitignore
├── Makefile
├── README.md
├── liblr.so
├── python-package
    ├── example.py
    ├── features.dat
    ├── labels.dat
    └── lr
    │   ├── __init__.py
    │   ├── liblr.so
    │   └── model.py
└── src
    ├── lr.cc
    ├── lr.h
    ├── python_wrapper.cc
    ├── utils.cc
    └── utils.h


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.dat
3 | test.cc
4 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Sources and headers that make up the shared library.
src=src/python_wrapper.cc src/lr.cc src/utils.cc
hdr=src/lr.h src/utils.h

# `make` / `make main` builds liblr.so; `main` is only an alias, not a file.
.PHONY: main
main: liblr.so

# Rebuild only when a source or header changed.
liblr.so: $(src) $(hdr)
	g++ -fPIC -shared -fopenmp -o liblr.so $(src)
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## RTFSC
 2 | 
 3 | 两年前甚至是三年前开始，你会发现越来越多的人转行做算法，业界也很给力，源源不断地发布各种数据挖掘类型的比赛，成为很多人从入门到实践的第一课．如果说学术的作用是推动算法创新，那么此类比赛的作用便是创新落地，以及检验那些在实践中真正work的东西．然而，实际上我发现这样的比赛很少，大多数赛题其实只是官方宣传自己的一种手段，题目类型非常陈旧，以致于参赛选手只需要`import xgboost as xgb`就行了．从我过去一两年的参赛经验来看，`import xgboost as xgb`的确是很有效的，而从头改算法造轮子最终都是劳而无功或者收效甚微．如果你赞同这个观点的话，右上角`star`一下．
 4 | 
 5 | 唠叨这些跟这个repo有什么关系？　关注我的人里面，有不少是因为看到我以前的一些参赛代码，可能大部分都是在校生，可能现在正在参加某个比赛．我想给一些小小的个人建议，不要日复一日地重复`import xgboost as xgb`或者`import lightgbm as lgb`，做一些门槛更高的东西，比如学术里的前沿算法，比如工程上机器学习系统的高效实现．机器学习涉及到的领域很多很多，你我还需要不断学习，就不要重复地去写`import xxx`了．
 6 | 
 7 | 这个小项目最初是为了在毕业离校前做一个简单的组内分享，科普一下机器学习算法包的实现流程．现在打算开源，对很多入门的朋友或许有帮助，但因为懒没有写出完整文档，感兴趣的朋友只能将就读代码了，相信Linus，代码是最好的文档.
 8 | 
 9 | ## 机器学习算法的底层实现与高层调用
10 | 
11 | 以最简单的机器学习算法逻辑回归为例，介绍底层C++实现，以及高层Python调用，掌握ctypes基本用法．
12 | 
13 | ## 源码说明
14 | 
15 | - `src/`, c++实现逻辑回归，主要源码是`lr.cc`与`utils.cc`．`python_wrapper.cc`实现了一些辅助函数，暴露C风格接口给python
16 | - `python-package`，通过`ctypes`实现python调用C函数，`lr/model.py`封装了相关函数，`example.py`是具体的实例
17 | 
18 | ## 依赖
19 | 
 20 | - Eigen，Boost（`src/lr.cc` 使用了 `boost/format.hpp`），以及支持 OpenMP 的 g++
21 | 
22 | ### 使用方法
23 | 
24 | - 编译得到动态链接库`liblr.so`
25 | 
26 | ```
 27 | g++ -fPIC -shared -fopenmp -o liblr.so src/python_wrapper.cc src/lr.cc src/utils.cc
28 | ```
29 | 
30 | - 复制到相应文件夹下，`cp liblr.so python-package/lr/`
31 | 
32 | - 运行　`python example.py`
33 | 
34 | ```python
35 | 
36 | from lr import model
37 | import numpy as np
38 | 
39 | # custom metric function, mean accuracy
40 | def mean_accuracy(label,pred,size):
41 |     num_pos,hit_pos = 0.0,0.0
42 |     num_neg,hit_neg = 0.0,0.0
43 |     for i in range(size):
44 |         if label[i]==1.0:
45 |             num_pos += 1.0
46 |             if pred[i]>0.5:
47 |                 hit_pos += 1.0
48 | 
49 |         if label[i]==0.0:
50 |             num_neg += 1.0
51 |             if pred[i]<=0.5:
52 |                 hit_neg += 1.0
53 |     print "pos-accracy:{0:.5f},neg-accuracy:{1:.5f}".format(hit_pos/num_pos,hit_neg/num_neg)
54 |     return 0.5*hit_pos/num_pos + 0.5*hit_neg/num_neg
55 | 
56 | 
57 | features = np.load('features.dat')
58 | labels = np.load('labels.dat')
59 | print features.shape,labels.shape
60 | 
61 | clf = model(max_iter=1000,alpha=0.01,l2_lambda=0.5,tolerance=0.01)
62 | clf.fit(features,labels,batch_size=1024,early_stopping_round=100,metric=mean_accuracy)
63 | print clf.predict(features[:30])
64 | 
65 | clf.save("/home/wepon/lr.model")
66 | clf1 = model()
67 | clf1.load("/home/wepon/lr.model")
68 | print clf1.predict(features[:30])
69 | 
70 | ```
71 | 
72 | 


--------------------------------------------------------------------------------
/liblr.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wepe/dive-into-ml-system/3aa2a00205c2dfccecbeac502d8272531fe86740/liblr.so


--------------------------------------------------------------------------------
/python-package/example.py:
--------------------------------------------------------------------------------
 1 | from lr import model
 2 | import numpy as np
 3 | 
 4 | # custom metric function, mean accuracy
 5 | def mean_accuracy(label,pred,size):
 6 |     num_pos,hit_pos = 0.0,0.0
 7 |     num_neg,hit_neg = 0.0,0.0
 8 |     for i in range(size):
 9 |         if label[i]==1.0:
10 |             num_pos += 1.0
11 |             if pred[i]>0.5:
12 |                 hit_pos += 1.0
13 | 
14 |         if label[i]==0.0:
15 |             num_neg += 1.0
16 |             if pred[i]<=0.5:
17 |                 hit_neg += 1.0
18 |     print "pos-accracy:{0:.5f},neg-accuracy:{1:.5f}".format(hit_pos/num_pos,hit_neg/num_neg)
19 |     return 0.5*hit_pos/num_pos + 0.5*hit_neg/num_neg
20 | 
21 | 
import os
import tempfile

# Load the demo data set shipped next to this script.
features = np.load('features.dat')
labels = np.load('labels.dat')
print("{0} {1} {2}".format(features.shape, labels.shape, labels.sum()))

# Train with the custom metric and predict the first 30 samples.
clf = model(max_iter=1000,alpha=0.01,l2_lambda=0.5,tolerance=0.01)
clf.fit(features,labels,batch_size=1024,early_stopping_round=100,metric=mean_accuracy)
print(clf.predict(features[:30]))

# Round-trip the model through a file in the system temp directory instead of
# a path inside one particular user's home directory.
model_path = os.path.join(tempfile.gettempdir(), "lr.model")
clf.save(model_path)
clf1 = model()
clf1.load(model_path)
print(clf1.predict(features[:30]))
34 | 
35 | 


--------------------------------------------------------------------------------
/python-package/features.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wepe/dive-into-ml-system/3aa2a00205c2dfccecbeac502d8272531fe86740/python-package/features.dat


--------------------------------------------------------------------------------
/python-package/labels.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wepe/dive-into-ml-system/3aa2a00205c2dfccecbeac502d8272531fe86740/python-package/labels.dat


--------------------------------------------------------------------------------
/python-package/lr/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import model
2 | 


--------------------------------------------------------------------------------
/python-package/lr/liblr.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wepe/dive-into-ml-system/3aa2a00205c2dfccecbeac502d8272531fe86740/python-package/lr/liblr.so


--------------------------------------------------------------------------------
/python-package/lr/model.py:
--------------------------------------------------------------------------------
  1 | from ctypes import *
  2 | import numpy as np
  3 | import os
  4 | import shutil
  5 | from threading import Thread
  6 | 
  7 | liblr = cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__))+'/liblr.so')
  8 | 
  9 | def accuracy(y,pred,size):
 10 |     hit = 0.0
 11 |     for i in range(size):
 12 |         if y[i]==1.0 and pred[i]>0.5:
 13 |             hit += 1.0
 14 |         if y[i]==0.0 and pred[i]<=0.5:
 15 |             hit += 1.0
 16 |     return hit/size
 17 | 
 18 | 
 19 | class model(object):
 20 |     def __init__(self,max_iter=200,alpha=0.01,l2_lambda=0.01,tolerance=0.001):
 21 |         self.max_iter = max_iter
 22 |         self.alpha = alpha
 23 |         self.l2_lambda = l2_lambda
 24 |         self.tolerance = tolerance
 25 |         self.fmodel = None
 26 |         self.auto_clear = True
 27 | 
 28 |     # support python list, numpy array
 29 |     def fit(self,features,labels,batch_size=128,early_stopping_round=20,metric=accuracy):
 30 |         # convert to numpy array
 31 |         #if not isinstance(features,np.ndarray):
 32 |         features = np.asarray(features,dtype=np.double)
 33 |         #if not isinstance(labels,np.ndarray):
 34 |         labels = np.ascontiguousarray(np.asarray(labels,dtype=np.int32),dtype=np.int32)
 35 | 
 36 |         # convert to ctypes's type
 37 |         row,col = features.shape
 38 |         int_p = cast(labels.ctypes.data, POINTER(c_int))
 39 |         double_p_p = (features.ctypes.data + np.arange(features.shape[0]) * features.strides[0]).astype(np.uintp)
 40 |         char_p = c_char_p("0"*25)
 41 | 
 42 |         # call the C function
 43 |         DOUBLEPP = np.ctypeslib.ndpointer(dtype=np.uintp,ndim=1,flags='C')
 44 |         INTP = POINTER(c_int)
 45 |         METRIC = CFUNCTYPE(c_double,POINTER(c_double),POINTER(c_double),c_int)
 46 |         liblr.fit.argtypes = [DOUBLEPP,INTP,c_int,c_int,c_int,c_double,c_double,c_double,c_int,c_int,c_char_p,METRIC]
 47 |         liblr.fit.restype = None
 48 | 
 49 |         # enable interrupt
 50 |         t = Thread(target=liblr.fit,args=(double_p_p,int_p,c_int(row),c_int(col),c_int(self.max_iter),c_double(self.alpha),c_double(self.l2_lambda),c_double(self.tolerance),c_int(early_stopping_round),c_int(batch_size),char_p,METRIC(metric)))
 51 |         t.daemon = True
 52 |         t.start()
 53 |         while t.is_alive():
 54 |             t.join(0.1)
 55 | 
 56 |         # get the result
 57 |         self.fmodel = char_p.value
 58 | 
 59 |     def predict_prob(self,features):
 60 |         assert self.fmodel is not None
 61 |         # convert to numpy array
 62 |         #if not isinstance(features,np.ndarray):
 63 |         features = np.asarray(features,dtype=np.double)
 64 | 
 65 |         # convert to ctypes's type
 66 |         row,col = features.shape
 67 |         double_p_p = (features.ctypes.data + np.arange(features.shape[0]) * features.strides[0]).astype(np.uintp)
 68 |         ret = (c_double*row)(*([-1.0 for _ in range(row)]))
 69 |         ret_double_p = cast(ret,POINTER(c_double))
 70 |         # call C function
 71 |         DOUBLEPP = np.ctypeslib.ndpointer(dtype=np.uintp,ndim=1,flags='C')
 72 |         liblr.predict_prob.argtypes = [DOUBLEPP,c_int,c_int,c_char_p,POINTER(c_double)]
 73 |         liblr.predict_prob.restype = None
 74 |         # enable interrupt
 75 |         t = Thread(target=liblr.predict_prob,args=(double_p_p,c_int(row),c_int(col),c_char_p(self.fmodel),ret_double_p))
 76 |         t.daemon = True
 77 |         t.start()
 78 |         while t.is_alive():
 79 |             t.join(0.1)
 80 | 
 81 |         return [ret_double_p[i] for i in range(row)]
 82 | 
 83 |     def predict(self,features):
 84 |         assert self.fmodel is not None
 85 |         prob = self.predict_prob(features)
 86 |         return [1 if p>0.5 else 0 for p in prob]
 87 | 
 88 |     def save(self,path):
 89 |         shutil.copy(self.fmodel, path)
 90 | 
 91 |     def load(self,path):
 92 |         self.fmodel = path
 93 |         self.auto_clear = False
 94 | 
 95 |     def __del__(self):
 96 |         if self.auto_clear:
 97 |             os.remove(self.fmodel)
 98 | 
 99 |     # old version code
100 |     # only support python list
101 |     def _fit(self,features,labels):
102 |         features = [tuple(f) for f in features]
103 |         row = len(features)
104 |         col = len(features[0])
105 |         # initialize ctypes array
106 |         double_2d_array = ((c_double*col)*row)(*features)
107 |         int_array = (c_int*row)(*labels)
108 |         del features,labels
109 |         # cast to C function argument's type
110 |         int_p = cast(int_array,POINTER(c_int))
111 |         double_p_list = []
112 |         for i in range(row):
113 |             double_p_list.append(cast(double_2d_array[i],POINTER(c_double)))
114 |         double_p_p = (POINTER(c_double)*row)(*double_p_list)
115 |         # call the C function
116 |         liblr.fit.argtypes = [POINTER(POINTER(c_double)),POINTER(c_int),c_int,c_int,c_int,c_double,c_double,c_double]
117 |         liblr.fit.restype = POINTER(c_char)
118 |         res = liblr.fit(double_p_p,int_p,c_int(row),c_int(col),c_int(self.max_iter),c_double(self.alpha),c_double(self.l2_lambda),c_double(self.tolerance))
119 |         self.fmodel = ''.join([res[i] for i in range(25)])
120 | 
121 |     # only support python list
122 |     def _predict_prob(self,features):
123 |         assert self.fmodel is not None
124 |         features = [tuple(f) for f in features]
125 |         row = len(features)
126 |         col = len(features[0])
127 |         # initialize ctypes array
128 |         double_2d_array = ((c_double*col)*row)(*features)
129 |         del features
130 |         # cast to C function argument's type
131 |         double_p_list = []
132 |         for i in range(row):
133 |             double_p_list.append(cast(double_2d_array[i],POINTER(c_double)))
134 |         double_p_p = (POINTER(c_double)*row)(*double_p_list)
135 |         # call the C function
136 |         liblr.predict_prob.argtypes = [POINTER(POINTER(c_double)),c_int,c_int,POINTER(c_char)]
137 |         liblr.predict_prob.restype = POINTER(c_double)
138 | 
139 |         #res = liblr.predict_prob(double_p_p,c_int(row),c_int(col),c_char_p(self.fmodel))
140 |         res = pool.apply_async(liblr.predict_prob,(double_p_p,c_int(row),c_int(col),c_char_p(self.fmodel)))
141 |         res = res.get()
142 |         return [res[i] for i in range(row)]
143 | 


--------------------------------------------------------------------------------
/src/lr.cc:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <fstream>
  3 | #include <sstream>
  4 | #include <string>
  5 | #include <vector>
  6 | #include <boost/format.hpp>
  7 | #include "lr.h"
  8 | #include "utils.h"
  9 | #include <omp.h>
 10 | #include <ctime>
 11 | 
 12 | using namespace std;
 13 | using namespace Eigen;
 14 | 
 15 | LR::LR(){
 16 |     LR(100,0.01,0.01,0.001);
 17 | }
 18 | 
 19 | LR::LR(int max_iter,double alpha,double lambda,double tolerance){
 20 |     this->lambda = lambda; 
 21 |     this->max_iter = max_iter;
 22 |     this->tolerance = tolerance;
 23 |     this->alpha = alpha;
 24 | }
 25 | 
 26 | LR::~LR(){}
 27 | 
 28 | void LR::fit(MatrixXd X,VectorXd y,int batch_size,int early_stopping_round,double (*metric)(double* y,double* pred,int size)){
 29 |     //learn VectorXd W, consider reg,max_iter,tol.
 30 |     srand(time(NULL));
 31 |     W = VectorXd::Random(X.cols()+1);  //the last column of weight represent bias
 32 |     MatrixXd X_new(X.rows(),X.cols()+1);
 33 |     X_new<<X,MatrixXd::Ones(X.rows(),1);  //last column is 1.0
 34 | 
 35 |     MatrixXd X_batch;
 36 |     VectorXd y_batch;
 37 |     MatrixXd X_new_batch;
 38 |     //perform early stopping
 39 |     double best_acc = -1.0;
 40 |     int become_worse_round = 0;
 41 |     for(int iter=0;iter<max_iter;iter++){
 42 |         //index of this batch samples
 43 |         int start_idx = (batch_size*iter)%(static_cast<int>(X.rows()));
 44 |         int end_idx = min(start_idx+batch_size,static_cast<int>(X.rows()));
 45 |         
 46 |         X_batch = Utils::slice(X,start_idx,end_idx-1);
 47 |         y_batch = Utils::slice(y,start_idx,end_idx-1);
 48 |         X_new_batch = Utils::slice(X_new,start_idx,end_idx-1);
 49 |         
 50 |         //
 51 |         VectorXd y_pred = predict_prob(X_batch);
 52 |         VectorXd E = y_pred - y_batch;
 53 | 
 54 |         //W:= (1-lambda/n_samples)W-alpha*X^T*E
 55 |         //reference : http://blog.csdn.net/pakko/article/details/37878837
 56 |         W = (1.0-lambda/batch_size)*W - alpha*X_new_batch.transpose()*E;
 57 | 	    
 58 |         //calculate the logloss and accuracy after this step
 59 |         y_pred = predict_prob(X_batch);
 60 |         
 61 |         double loss = Utils::crossEntropyLoss(y_batch,y_pred);
 62 |         double acc = metric(Utils::VectorXd_to_double_array(y_batch),Utils::VectorXd_to_double_array(y_pred),end_idx-start_idx);
 63 |         cout<<boost::format("Iteration: %d, logloss:%.5f, accuracy:%.5f") %iter %loss %acc<< endl;
 64 | 		
 65 |         //when loss<tolerance, break
 66 |         if(loss<=tolerance) break;
 67 | 
 68 |         //perform early stopping
 69 |         if(acc<best_acc){
 70 |             become_worse_round += 1;
 71 |         }else{
 72 |             become_worse_round = 0;
 73 |             best_acc = acc;
 74 |         }
 75 |         if(become_worse_round>=early_stopping_round){
 76 |             cout<<"Early stopping."<<endl;
 77 |             break;
 78 |         }
 79 |     }
 80 | }
 81 | 
 82 | 
 83 | VectorXd LR::predict_prob(MatrixXd X){
 84 |     //predict the probability (of label 1) for given data X
 85 |     MatrixXd X_new(X.rows(),X.cols()+1);
 86 |     X_new<<X,MatrixXd::Ones(X.rows(),1);
 87 |     int num_samples = X_new.rows();
 88 |     VectorXd y_pred_prob = VectorXd::Zero(num_samples);
 89 |     #pragma omp parallel for
 90 |     for(int num=0;num<num_samples;num++){
 91 |         y_pred_prob(num) = Utils::sigmod(X_new.row(num).dot(W));
 92 |     }
 93 |     return y_pred_prob;
 94 | }
 95 | 
 96 | 
 97 | VectorXi LR::predict(MatrixXd X){
 98 |     //predict the label for given data X
 99 |     VectorXd y_pred_prob = predict_prob(X);
100 |     VectorXi y_pred(y_pred_prob.size());
101 |     #pragma omp parallel for
102 |     for(int num=0;num<y_pred_prob.size();num++){
103 |         y_pred(num) = y_pred_prob(num)>0.5?1:0;
104 |     }
105 |     return y_pred;
106 | }
107 | 
108 | 
109 | Eigen::VectorXd LR::getW(){
110 |     return W;
111 | }
112 | 
113 | void LR::saveWeights(std::string fpath){
114 |     //save the model (save the weight ). 
115 |     std::ofstream ofile;
116 |     ofile.open(fpath.c_str());
117 |     if (!ofile.is_open()){
118 |         std::cerr<<"Can not open the file when call LR::saveWeights"<<std::endl;
119 |         return;
120 |     }
121 |     //W write into the file
122 |     for(int i=0;i<W.size()-1;i++){
123 |         ofile<<W(i)<<" ";
124 |     }
125 |     ofile<<W(W.size()-1);
126 |     ofile.close();
127 | }
128 | 
129 | 
130 | void LR::loadWeights(std::string fpath){
131 |     //load the model (load the weight ) from filename.
132 |     std::ifstream ifile;
133 |     ifile.open(fpath.c_str());
134 |     if (!ifile.is_open()){
135 |         std::cerr<<"Can not open the file when call LR::loadWeights"<<std::endl;
136 |         return;
137 |     }
138 |     //read the weights into vector<double>
139 |     std::string line;
140 |     std::vector<double> weights;
141 |     getline(ifile,line);    //only one line
142 |     std::stringstream ss(line); 
143 |     double tmp;
144 |     while(!ss.eof()){
145 |         ss>>tmp;
146 |         weights.push_back(tmp);
147 |     }
148 |     //initialize VectorXd with std::vector
149 |     W = VectorXd::Map(weights.data(),weights.size());
150 |     ifile.close();
151 | }
152 | 


--------------------------------------------------------------------------------
/src/lr.h:
--------------------------------------------------------------------------------
 1 | #ifndef __LR_H__
 2 | #define __LR_H__
 3 | 
 4 | #include <eigen3/Eigen/Dense>
 5 | #include <string>
 6 | #include "utils.h"
 7 | 
 8 | 
 9 | class LR{
10 | public:
11 |     LR();
12 |     LR(int max_iter,double alpha,double lambda,double tolerance);
13 |     ~LR();
14 |     void fit(Eigen::MatrixXd X,Eigen::VectorXd y,int batch_size,int early_stopping_round,double (*metric)(double* y,double* y_pred,int size)=Utils::accuracy);
15 |     Eigen::VectorXd getW();
16 |     Eigen::VectorXd predict_prob(Eigen::MatrixXd X);
17 |     Eigen::VectorXi predict(Eigen::MatrixXd X);
18 |     void saveWeights(std::string filename);
19 |     void loadWeights(std::string filename);
20 | private:
21 |     Eigen::VectorXd W;
22 |     int max_iter;
23 |     double lambda;  //l2 regulization
24 |     double tolerance;  // error tolence
25 |     double alpha; //learning rate
26 | };
27 | 
28 | 
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/src/python_wrapper.cc:
--------------------------------------------------------------------------------
#include "lr.h"

#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>
#include <random>
#include <string>
#include <vector>
 4 | 
 5 | using namespace Eigen;
 6 | using namespace std;
 7 | 
 8 | void gen_random(char *s, int len) {
 9 |     srand (time(NULL));
10 |     static const char alphanum[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
11 |     for (int i = 0; i < len; ++i) {
12 |         s[i] = alphanum[rand() % (sizeof(alphanum) - 1)];
13 |     }
14 |     s[len] = 0;
15 | }
16 | 
17 | extern "C" void fit(double** features,int* labels,int row,int col,int max_iter,double alpha,double lambda,double tolerance,int early_stopping_round,int batch_size,char* ret,double (*metric)(double* y,double* pred,int size)=Utils::accuracy){
18 |     //initialize data of Eigen type
19 |     MatrixXd X(row,col);
20 |     VectorXd y(row);
21 |     for(int i=0;i<row;i++){
22 |         for(int j=0;j<col;j++){
23 |             X(i,j) = features[i][j];
24 |         }
25 |         y(i) = labels[i];
26 |     }
27 | 
28 |     //train the logistic regression model
29 |     LR clf = LR(max_iter,alpha,lambda,tolerance);
30 |     clf.fit(X,y,batch_size,early_stopping_round,metric);
31 | 
32 |     //save the model weights
33 |     char* fmodel = new char[21];
34 |     gen_random(fmodel,20);
35 |     string model_path = "/tmp/"+string(fmodel);
36 |     clf.saveWeights(model_path);
37 |     strcpy(ret,model_path.c_str());
38 | }
39 | 
40 | 
41 | extern "C" void predict_prob(double** features,int row,int col,char* fmodel,double* ret){
42 |     LR clf = LR();
43 |     clf.loadWeights(fmodel);
44 |     MatrixXd X(row,col);
45 |     for(int i=0;i<row;i++){
46 |         for(int j=0;j<col;j++){
47 |             X(i,j) = features[i][j];
48 |         }
49 |     }
50 |     VectorXd pred = clf.predict_prob(X);
51 | 
52 |     for(int i=0;i<row;i++){
53 |         ret[i] = pred(i);
54 |     }
55 | }
56 | 
57 | 
58 | extern "C" void predict(double** features,int row,int col,char* fmodel,int* ret){
59 |     double* prob = new double[row];
60 |     predict_prob(features,row,col,fmodel,prob);
61 |     for(int i=0;i<row;i++){
62 |         ret[i] = prob[i]>0.5?1:0;
63 |     }
64 | }
65 | 
66 | int main(){
67 |     int row=10,col=2;
68 |     double** features = new double *[row];
69 |     for(int i=0;i<row;i++){
70 |         features[i] = new double[col];
71 |     }
72 |     int* labels = new int[row];
73 |     
74 |     double features_value[row*col] = {1.0,0.8,2.0,1.7,3.0,2.5,4.0,3.6,5.0,4.9,1.0,1.2,2.0,2.5,3.0,3.4,4.0,4.5,5.0,6.0};
75 |     int labels_value[row] = {0,0,0,0,0,1,1,1,1,1};
76 |     for(int i=0;i<row;i++){
77 |         for(int j=0;j<col;j++){
78 |             features[i][j] = features_value[i*col+j];
79 |         }
80 |         labels[i] = labels_value[i];
81 |     }
82 | 
83 |     char* ret = new char[26];
84 |     fit(features,labels,row,col,200,0.01,0.0,0.01,10,64,ret);
85 |     cout<<ret<<endl;
86 | 
87 |     int* pred = new int[row];
88 |     predict(features,row,col,ret,pred);
89 |     for(int i=0;i<row;i++){
90 |         cout<<pred[i]<<",";
91 |     }
92 | }
93 | 
94 | 


--------------------------------------------------------------------------------
/src/utils.cc:
--------------------------------------------------------------------------------
 1 | #include <cmath>
 2 | #include "utils.h"
 3 | #include <iostream>
 4 | 
 5 | double Utils::sigmod(double x){
 6 |     return 1.0/(1.0+exp(-x));
 7 | }
 8 | 
 9 | 
10 | double Utils::crossEntropyLoss(Eigen::VectorXd y,Eigen::VectorXd y_pred){
11 |     int n = y.size();
12 |     double loss = 0.0;
13 |     #pragma omp parallel for reduction(+: loss)
14 |     for(int i=0;i<n;i++){
15 |         double yi_prob = y_pred(i);
16 |         yi_prob = std::min(std::max(yi_prob,0.0001),0.9999);
17 |         loss -= (y(i)*log2(yi_prob)+(1-y(i))*log2(1-yi_prob));
18 |     }
19 |     return loss/n;
20 | }
21 | 
22 | 
23 | double Utils::accuracy(Eigen::VectorXd y, Eigen::VectorXd pred){
24 |     Eigen::VectorXi y_ = y.cast<int>();
25 |     int n = y_.size();
26 |     double hit = 0.0;
27 |     #pragma omp parallel for reduction(+: hit)
28 |     for(int i=0;i<n;i++){
29 |         if(y_(i)==(pred(i)>0.5?1:0)){
30 |             hit += 1.0;
31 |         }
32 |     }
33 |     return hit/n;
34 | }
35 | 
36 | double Utils::accuracy(double* y,double* pred,int size){
37 |     double hit = 0.0;
38 |     #pragma omp parallel for reduction(+: hit)
39 |     for(int i=0;i<size;i++){
40 |         if(y[i]==(pred[i]>0.5?1.0:0.0)){
41 |             hit += 1.0;
42 |         }
43 |     }
44 |     return hit/size;
45 | }
46 | 
47 | Eigen::MatrixXd Utils::slice(Eigen::MatrixXd X,int start_idx,int end_idx){
48 |     Eigen::MatrixXd ret(end_idx-start_idx+1,X.cols());
49 |     #pragma omp parallel for
50 |     for(int i=start_idx;i<=end_idx;i++){
51 |         ret.row(i-start_idx) = X.row(i);
52 |     }
53 |     return ret;
54 | }
55 | 
56 | Eigen::VectorXd Utils::slice(Eigen::VectorXd y,int start_idx,int end_idx){
57 |     Eigen::VectorXd ret(end_idx-start_idx+1);
58 |     #pragma omp parallel for
59 |     for(int i=start_idx;i<=end_idx;i++){
60 |         ret(i-start_idx) = y(i);
61 |     }
62 |     return ret;
63 | }
64 | 
65 | 
66 | int* Utils::VectorXi_to_int_array(Eigen::VectorXi y){
67 |     int size = y.size();
68 |     int* ret = new int[size];
69 |     #pragma omp parallel for
70 |     for(int i=0;i<size;i++){
71 |         ret[i] = y(i);
72 |     }
73 |     return ret;
74 | }  
75 | 
76 | double* Utils::VectorXd_to_double_array(Eigen::VectorXd pred){
77 |     int size = pred.size();
78 |     double* ret = new double[size];
79 |     #pragma omp parallel for
80 |     for(int i=0;i<size;i++){
81 |         ret[i] = pred(i);
82 |     }
83 |     return ret;
84 | }
85 | 


--------------------------------------------------------------------------------
/src/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef __UTILS_H__
 2 | #define __UTILS_H__
 3 | 
 4 | #include <eigen3/Eigen/Dense>
 5 | 
 6 | 
 7 | class Utils{
 8 | public:
 9 |     // sigmod function, depend on <cmath> library
10 |     static double sigmod(double x);
11 |     static double crossEntropyLoss(Eigen::VectorXd y,Eigen::VectorXd h);
12 |     static double accuracy(Eigen::VectorXd y,Eigen::VectorXd pred);
13 |     static double accuracy(double* y,double* pred,int size);
14 |     static double* VectorXd_to_double_array(Eigen::VectorXd pred);
15 |     static int* VectorXi_to_int_array(Eigen::VectorXi y);
16 |     static Eigen::MatrixXd slice(Eigen::MatrixXd X,int start_idx,int end_idx);
17 |     static Eigen::VectorXd slice(Eigen::VectorXd X,int start_idx,int end_idx);
18 | };
19 | 
20 | 
21 | #endif
22 | 


--------------------------------------------------------------------------------