├── LICENSE
├── MLP.py
├── README.md
├── gbdt.py
└── gbdt_numba.py

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 drop-out

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/MLP.py:
--------------------------------------------------------------------------------
import numpy as np
from abc import ABCMeta, abstractmethod


class loss(metaclass=ABCMeta):
    '''
    The abstract base class for loss functions.
    For each loss, the gradient should be specified.
    '''
    def obj(self,pred,true):
        pass

    def gradient(self,pred,true):
        pass

class mse(loss):
    '''Loss function for mse.'''
    def obj(self,pred,true):
        return np.square(pred-true).mean()/2

    def gradient(self,pred,true):
        return pred-true

class log_loss(loss):
    '''Loss function for log loss.'''
    def obj(self,pred,true):
        return (-np.multiply(true,np.log(pred))-np.multiply(1-true,np.log(1-pred))).mean()

    def gradient(self,pred,true):
        return -np.multiply(true,1/pred)+np.multiply(1-true,1/(1-pred))


class act(metaclass=ABCMeta):
    '''
    The abstract base class for activation functions.
    For each activation function,
    the functions used for forward and backward propagation are specified respectively.
    The two functions take the same inputs.
    The forward function returns the values after the transformation.
    The backward function returns the derivative mask at this layer.
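    For example, for relu the backward mask is 1 where the pre-activation is positive and 0 elsewhere.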
    '''
    def forward(self,matrix):
        pass

    def backward(self,matrix):
        pass

class linear(act):
    '''Linear activation function.'''
    def forward(self,matrix):
        return matrix

    def backward(self,matrix):
        return np.ones_like(matrix)

class relu(act):
    '''Rectified linear units.'''
    def forward(self,matrix):
        return np.multiply(matrix>0,matrix)

    def backward(self,matrix):
        return 1*(matrix>0)

class logistic(act):
    '''Logistic transformation.'''
    def forward(self,matrix):
        return 1/(1+np.exp(-matrix)+0.000001)

    def backward(self,matrix):
        return np.multiply(self.forward(matrix),1-self.forward(matrix))


class MLP(object):
    '''
    Parameters:
    ----------
    n_hidden_units: Number of units in the hidden layer.
    batch_size: Number of data points used in each gradient step.
    n_epochs: Number of epochs.
        Note that this determines the number of epochs (how many times each data point will be used),
        not the number of gradient steps.
    learning_rate: The learning rate of gradient descent.
    momentum: Momentum for gradient descent update. (Between 0 and 1.)
    weight_decay: Coefficient for L2 regularization. (Also known as weight decay.)
    activation: Activation function for the hidden layer.
        'relu' for rectified linear units.
        'logistic' for sigmoid activation.
        'linear' for linear activation.
    loss: Loss function.
        'mse' for regression task.
        'log_loss' for classification task.
    '''

    def __init__(self,
                 n_hidden_units=100,
                 batch_size=200,
                 n_epochs=200,
                 learning_rate=0.01,
                 momentum=0.9,
                 weight_decay=0.0001,
                 activation='relu',
                 loss='mse'):

        self.n_hidden_units=n_hidden_units
        self.batch_size=batch_size
        self.n_epochs=n_epochs
        self.learning_rate=learning_rate
        self.momentum=momentum
        self.weight_decay=weight_decay

        #activation (This is the activation function for the hidden layer.)
        if activation=='relu':
            self.act1=relu()
        elif activation=='logistic':
            self.act1=logistic()
        elif activation=='linear':
            self.act1=linear()
        else:
            self.act1=activation

        #loss (Note that the activation function for the output layer is determined by the loss.)
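        #A customized loss can also be supplied: pass a (loss_instance, output_activation_instance)
        #pair instead of a string, and it is unpacked by the else branch below.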
        if loss=='mse':
            self.loss=mse()
            self.act2=linear()
        elif loss=='log_loss':
            self.loss=log_loss()
            self.act2=logistic()
        else:
            self.loss=loss[0]
            self.act2=loss[1]

    def forward(self):
        self.layer1=self.W1*self.X+self.b1
        self.layer1act=self.act1.forward(self.layer1)
        self.score=self.W2*self.layer1act+self.b2
        self.pred=self.act2.forward(self.score)

    def backward(self):
        self.dpred=self.loss.gradient(self.pred,self.true)
        self.dscore=np.multiply(self.dpred,self.act2.backward(self.score))
        self.dlayer1act=self.W2.T*self.dscore
        self.dlayer1=np.multiply(self.dlayer1act,self.act1.backward(self.layer1))

        #L2 regularization adds weight_decay*W to the gradient of the weights.
        self.dW1=(self.dlayer1*self.X.T+self.weight_decay*self.W1)/self.batch_size
        self.db1=np.sum(self.dlayer1,axis=1)/self.batch_size
        self.dW2=(self.dscore*self.layer1act.T+self.weight_decay*self.W2)/self.batch_size
        self.db2=np.sum(self.dscore,axis=1)/self.batch_size

    def update_weights(self):
        #calculate moving average gradients (momentum)
        self.tW1 = self.momentum*self.tW1+(1-self.momentum)*self.dW1
        self.tb1 = self.momentum*self.tb1+(1-self.momentum)*self.db1
        self.tW2 = self.momentum*self.tW2+(1-self.momentum)*self.dW2
        self.tb2 = self.momentum*self.tb2+(1-self.momentum)*self.db2

        #update weights
        self.W1-=self.tW1*self.learning_rate
        self.b1-=self.tb1*self.learning_rate
        self.W2-=self.tW2*self.learning_rate
        self.b2-=self.tb2*self.learning_rate

    def fit(self,train,target):
        #turn the inputs into matrices
        train=np.matrix(train).T
        target=np.matrix(target.reshape(-1,1)).T

        #parameters
        n_features=train.shape[0]
        n_obs=train.shape[1]

        #weights initialization
        s1=np.sqrt(6/(n_features+self.n_hidden_units))
        s2=np.sqrt(6/(1+self.n_hidden_units))
        self.W1=np.matrix(np.random.uniform(-s1,s1,[self.n_hidden_units,n_features]))
        self.b1=np.matrix(np.random.uniform(-s1,s1,[self.n_hidden_units,1]))
        self.W2=np.matrix(np.random.uniform(-s2,s2,[1,self.n_hidden_units]))
        self.b2=np.matrix(np.random.uniform(-s2,s2,[1,1]))

        #momentum initialization
        self.tW1=self.W1*0
        self.tb1=self.b1*0
        self.tW2=self.W2*0
        self.tb2=self.b2*0

        #the training process
        for i in range(self.n_epochs):
            for j in range(n_obs//self.batch_size):
                self.X=train[:,j*self.batch_size:(j+1)*self.batch_size]
                self.true=target[:,j*self.batch_size:(j+1)*self.batch_size]
                self.forward()
                self.backward()
                self.update_weights()

    def predict(self,test):
        self.X=np.matrix(test).T
        self.forward()
        return np.squeeze(np.asarray(self.pred))

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Machine Learning From Scratch
Gradient Boosting Decision Tree, Support Vector Machine and Neural Network are arguably the three best machine learning algorithms that have stood the test of time.

This project implements the three algorithms with simple and neat Python code. These toy implementations may not compare to mature packages such as `xgboost` and `sklearn` in terms of speed and memory consumption, but they help illustrate how the algorithms work.

## Dependencies

All three algorithms are implemented in `Python 3.6` and built from scratch, which means that the implementation is purely based on `numpy`, with no dependency on any other machine learning package.

- [NumPy](https://github.com/numpy/numpy)

## Construction in Progress

The implementations of GBDT and NN are finished, while the SVM is still under construction.

Tutorials for GBDT and NN are provided below.

## GBDT

This implementation of GBDT supports most of the core features of `xgboost`. Briefly, it supports:

- **Built-in loss**: Mean squared loss for regression tasks and log loss for classification tasks.
- **Customized loss**: Other losses are also supported. The user should provide the link function, the gradient, and the hessian.
- **Hessian information**: It uses the Newton method for boosting, and thus makes full use of second-order derivative information.
- **Regularization**: lambda and gamma, as in `xgboost`.
- **Multi-processing**: It uses Python's `multiprocessing.Pool` for multi-processing.

To keep the code neat, some features of `xgboost` are not implemented. For example, it does not handle missing values, and randomization is not supported.

A quick start is provided below.

**Import the module**

```python
from gbdt import GBDT
```

**Initialize model**
```python
model = GBDT(n_threads=None,loss='mse',max_depth=3,min_sample_split=10,reg_lambda=1,gamma=0,
             learning_rate=0.1,n_estimators=100)
```
* `n_threads`: Number of threads for multiprocessing. `None` to use all.
* `loss`: Loss function for gradient boosting. `'mse'` is mean squared error for regression tasks and `'log'` is log loss for classification tasks. Pass a child class of the `loss` class to use a customized loss. See [source code](https://github.com/drop-out/Machine-Learning-From-Scratch/blob/master/gbdt.py#L7) for details.
* `max_depth`: The maximum depth of a tree.
* `min_sample_split`: The minimum number of samples required to further split a node.
* `reg_lambda`: The regularization coefficient for leaf scores, also known as lambda.
* `gamma`: The regularization coefficient for the number of tree nodes, also known as gamma.
* `learning_rate`: The learning rate of gradient boosting.
* `n_estimators`: Number of trees.

**Train**
```python
model.fit(train,target)
```
All inputs should be numpy arrays. `train` should be a 2D array and `target` a 1D array.

**Predict**
```python
model.predict(test)
```
Returns predictions as a numpy array.

**Customized loss**

Define a class that inherits from the `loss` class (see [source code](https://github.com/drop-out/Machine-Learning-From-Scratch/blob/master/gbdt.py#L7)), which specifies the link function, the gradient and the hessian.

```python
class customized_loss(loss):
    def link(self,score):
        return 1/(1+np.exp(-score))
    def g(self,true,score):
        pred=self.link(score)
        return pred-true
    def h(self,true,score):
        pred=self.link(score)
        return pred*(1-pred)
```

The class can then be passed when initializing the model.

```python
model = GBDT(loss=customized_loss,learning_rate=0.1,n_estimators=100)
```

## MLP

Implementing a full-featured deep learning framework is rather complicated. Instead of doing that, I wrote a simple multi-layer perceptron (MLP) with one hidden layer. This implementation of MLP supports:

- **Loss**: Mean squared loss for regression tasks and log loss for classification tasks.
- **Activation**: `relu`, `logistic` (sigmoid) and `linear` activations are supported natively.
- **Momentum**: Mini-batch gradient descent with a momentum optimizer.
- **Regularization**: L2 regularization, also known as weight decay.

A quick start is provided below.

**Import the module**

```python
from MLP import MLP
```

**Initialize model**

```python
model=MLP(n_hidden_units=100,batch_size=200,n_epochs=200,learning_rate=0.01,momentum=0.9,weight_decay=0.0001,loss='mse')
```

- `n_hidden_units`: Number of units in the hidden layer.
- `batch_size`: Number of data points used in each gradient step.
- `n_epochs`: Number of epochs. Note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps.
- `learning_rate`: The learning rate of gradient descent.
- `momentum`: Momentum for gradient descent update. (Between 0 and 1.)
- `weight_decay`: Coefficient for L2 regularization. (Also known as weight decay.)
- `activation`: Activation function for the hidden layer. `'relu'` for rectified linear units, `'logistic'` for sigmoid activation and `'linear'` for linear activation.
- `loss`: Loss function. `'mse'` for regression tasks and `'log_loss'` for classification tasks.

**Train**

```python
model.fit(train,target)
```

All inputs should be numpy arrays. `train` should be a 2D array and `target` a 1D array.

**Predict**

```python
model.predict(test)
```

Returns predictions as a numpy array.

--------------------------------------------------------------------------------
/gbdt.py:
--------------------------------------------------------------------------------
from abc import ABCMeta, abstractmethod
from multiprocessing import Pool
from functools import partial
import numpy as np


class loss(metaclass=ABCMeta):
    '''
    The abstract base class for loss functions.
    Three things should be specified for a loss,
    namely the link function, the gradient and the hessian.
    link() is the link function, which takes scores as input and returns predictions.
    g() is the gradient, which takes true values and scores as input and returns the gradient.
    h() is the hessian, which takes true values and scores as input and returns the hessian.
    All inputs and outputs are numpy arrays.
    '''
    @abstractmethod
    def link(self,score):
        pass

    @abstractmethod
    def g(self,true,score):
        pass

    @abstractmethod
    def h(self,true,score):
        pass

class mse(loss):
    '''Loss class for mse. For mse, the link function is pred=score.'''
    def link(self,score):
        return score

    def g(self,true,score):
        return score-true

    def h(self,true,score):
        return np.ones_like(score)

class log(loss):
    '''Loss class for log loss.
    For log loss, the link function is the logistic transformation.'''
    def link(self,score):
        return 1/(1+np.exp(-score))

    def g(self,true,score):
        pred=self.link(score)
        return pred-true

    def h(self,true,score):
        pred=self.link(score)
        return pred*(1-pred)


class GBDT(object):
    '''
    Parameters:
    ----------
    n_threads: The number of threads used for fitting and predicting.
    loss: Loss function for gradient boosting.
        'mse' for regression task and 'log' for classification task.
        A child class of the loss class can be passed to implement a customized loss.
    max_depth: The maximum depth of a tree.
    min_sample_split: The minimum number of samples required to further split a node.
    reg_lambda: The regularization coefficient for leaf score, also known as lambda.
    gamma: The regularization coefficient for the number of tree nodes, also known as gamma.
    learning_rate: The learning rate of gradient boosting.
    n_estimators: Number of trees.
    '''
    def __init__(self,
                 n_threads=None,
                 loss='mse',
                 max_depth=3,min_sample_split=10,reg_lambda=1,gamma=0,
                 learning_rate=0.1,n_estimators=100):
        self.n_threads=n_threads
        self.loss=loss
        self.max_depth=max_depth
        self.min_sample_split=min_sample_split
        self.reg_lambda=reg_lambda
        self.gamma=gamma
        self.learning_rate=learning_rate
        self.n_estimators=n_estimators

    def fit(self,train,target):
        self.estimators=[]
        if self.loss=='mse':
            self.loss=mse()
        if self.loss=='log':
            self.loss=log()
        self.score_start=target.mean()
        score=np.ones(len(train))*self.score_start
        for i in range(self.n_estimators):
            estimator=Tree(n_threads=self.n_threads,
                           max_depth=self.max_depth,min_sample_split=self.min_sample_split,reg_lambda=self.reg_lambda,gamma=self.gamma)
            estimator.fit(train,g=self.loss.g(target,score),h=self.loss.h(target,score))
            self.estimators.append(estimator)
            score+=self.learning_rate*estimator.predict(train)
        return self

    def predict(self,test):
        score=np.ones(len(test))*self.score_start
        for i in range(self.n_estimators):
            score+=self.learning_rate*self.estimators[i].predict(test)
        return self.loss.link(score)


class TreeNode(object):
    '''
    The data structure used for storing trees.
    A tree is represented by a set of nested TreeNodes,
    with each internal TreeNode pointing to two child TreeNodes,
    until a tree leaf is reached.

    Parameters:
    ----------
    is_leaf: Whether this TreeNode is a leaf.
    score: The prediction (score) of a tree leaf.
    split_feature: The split feature of a tree node.
    split_threshold: The split threshold of a tree node.
    left_child: Points to a child TreeNode,
        where the value of the split feature is less than the split threshold.
    right_child: Points to a child TreeNode,
        where the value of the split feature is greater than or equal to the split threshold.
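
    For example, a depth-1 tree (a single split on feature 0 at threshold 0.5) could be represented as
    TreeNode(split_feature=0,split_threshold=0.5,
             left_child=TreeNode(is_leaf=True,score=-1.0),
             right_child=TreeNode(is_leaf=True,score=1.0))
    where the leaf scores are illustrative values only.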
    '''
    def __init__(self,
                 is_leaf=False,score=None,
                 split_feature=None,split_threshold=None,left_child=None,right_child=None):
        self.is_leaf=is_leaf
        self.score=score
        self.split_feature=split_feature
        self.split_threshold=split_threshold
        self.left_child=left_child
        self.right_child=right_child

class Tree(object):
    '''
    This is the building block for GBDT,
    which is a single decision tree,
    also known as an estimator.

    Parameters:
    ----------
    n_threads: The number of threads used for fitting and predicting.
    max_depth: The maximum depth of the tree.
    min_sample_split: The minimum number of samples required to further split a node.
    reg_lambda: The regularization coefficient for leaf predictions, also known as lambda.
    gamma: The regularization coefficient for the number of TreeNodes, also known as gamma.
    '''
    def __init__(self,n_threads=None,max_depth=3,min_sample_split=10,reg_lambda=1,gamma=0):
        self.n_threads=n_threads
        self.max_depth=max_depth
        self.min_sample_split=min_sample_split
        self.reg_lambda=reg_lambda
        self.gamma=gamma

    def fit(self,train,g,h):
        '''
        All inputs must be numpy arrays.
        g and h are the gradient and hessian respectively.
        '''
        self.estimator=self.construct_tree(train,g,h,self.max_depth)
        return self

    def predict(self,test):
        '''
        test must be a numpy array.
        Return predictions (scores) as an array.
        Multiprocessing is supported for prediction.
        '''
        pool=Pool(self.n_threads)
        f=partial(self.predict_single,self.estimator)
        result=np.array(pool.map(f,test))
        pool.close()
        pool.join()
        return result

    def predict_single(self,treenode,test):
        '''
        The predict method for a single sample point.
        test must be a numpy array.
        Return the prediction (score) as a number.
        '''
        if treenode.is_leaf:
            return treenode.score
        else:
            if test[treenode.split_feature]<treenode.split_threshold:
                return self.predict_single(treenode.left_child,test)
            else:
                return self.predict_single(treenode.right_child,test)

    # ... (construct_tree and the split-search methods are missing from this excerpt;
    # only the tail of find_threshold remains below) ...
            if this_gain>best_gain:
                threshold=this_threshold
                best_gain=this_gain
        return [threshold,best_gain]

--------------------------------------------------------------------------------
/gbdt_numba.py:
--------------------------------------------------------------------------------
from abc import ABCMeta, abstractmethod
import numpy as np
from numba import jit

@jit
def leaf_score(g,h,reg_lambda):
    '''
    Given the gradient and hessian of a tree leaf,
    return the prediction (score) at this leaf.
    The score is -G/(H+λ).
    '''
    return -np.sum(g)/(np.sum(h)+reg_lambda)

@jit
def leaf_loss(g,h,reg_lambda):
    '''
    Given the gradient and hessian of a tree leaf,
    return the minimized loss at this leaf.
    The minimized loss is -0.5*G^2/(H+λ).
    '''
    return -0.5*np.square(np.sum(g))/(np.sum(h)+reg_lambda)

@jit
def calculate_gain(original_loss,feature,g,h,threshold,reg_lambda):
    '''
    Given the original loss,
    and the threshold to split on,
    calculate the gain of the split.
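    The gain is the reduction in loss achieved by the split:
    original_loss minus the sum of the leaf_loss() values of the two resulting children.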
    '''
    left_g=0
    left_h=0
    right_g=0
    right_h=0
    for i in range(len(feature)):
        if feature[i]<threshold:
            left_g+=g[i]
            left_h+=h[i]
        else:
            right_g+=g[i]
            right_h+=h[i]
    # ... (the remainder of calculate_gain and the beginning of find_threshold are missing
    # from this excerpt; only the tail of find_threshold remains below) ...
        if this_gain>best_gain:
            threshold=this_threshold
            best_gain=this_gain
    return threshold,best_gain

@jit
def find_best_split(train,g,h,reg_lambda):
    '''
    Return the best feature to split on, together with the corresponding threshold.
    Each feature is scanned by find_threshold(),
    and a (threshold,gain) tuple is returned for each feature.
    We then select the feature with the largest best_gain,
    and return the index of that feature, the threshold, and the gain that is achieved.
    '''
    train=train.T
    feature=0
    threshold=None
    best_gain=0
    for i in range(len(train)):
        this_threshold,this_gain=find_threshold(g,h,train[i],reg_lambda)
        if this_gain>best_gain:
            feature=i
            threshold=this_threshold
            best_gain=this_gain
    return feature,threshold,best_gain

class loss(metaclass=ABCMeta):
    '''
    The abstract base class for loss functions.
    Three things should be specified for a loss,
    namely the link function, the gradient and the hessian.
    link() is the link function, which takes scores as input and returns predictions.
    g() is the gradient, which takes true values and scores as input and returns the gradient.
    h() is the hessian, which takes true values and scores as input and returns the hessian.
    All inputs and outputs are numpy arrays.
    '''
    @abstractmethod
    def link(self,score):
        pass

    @abstractmethod
    def g(self,true,score):
        pass

    @abstractmethod
    def h(self,true,score):
        pass

class mse(loss):
    '''Loss class for mse. For mse, the link function is pred=score.'''
    def link(self,score):
        return score

    def g(self,true,score):
        return score-true

    def h(self,true,score):
        return np.ones_like(score)

class log(loss):
    '''Loss class for log loss. For log loss, the link function is the logistic transformation.'''
    def link(self,score):
        return 1/(1+np.exp(-score))

    def g(self,true,score):
        pred=self.link(score)
        return pred-true

    def h(self,true,score):
        pred=self.link(score)
        return pred*(1-pred)




class GBDT(object):
    '''
    Parameters:
    ----------
    loss: Loss function for gradient boosting.
        'mse' for regression task and 'log' for classification task.
        A child class of the loss class can be passed to implement a customized loss.
    max_depth: The maximum depth of a tree.
    min_sample_split: The minimum number of samples required to further split a node.
    reg_lambda: The regularization coefficient for leaf score, also known as lambda.
    gamma: The regularization coefficient for the number of tree nodes, also known as gamma.
    learning_rate: The learning rate of gradient boosting.
    n_estimators: Number of trees.
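
    Example (train is a 2D numpy array, target a 1D numpy array):
        model=GBDT(loss='mse',max_depth=3,learning_rate=0.1,n_estimators=100)
        model.fit(train,target)
        predictions=model.predict(test)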
    '''
    def __init__(self,
                 loss='mse',
                 max_depth=3,min_sample_split=10,reg_lambda=1,gamma=0,
                 learning_rate=0.1,n_estimators=100):
        self.loss=loss
        self.max_depth=max_depth
        self.min_sample_split=min_sample_split
        self.reg_lambda=reg_lambda
        self.gamma=gamma
        self.learning_rate=learning_rate
        self.n_estimators=n_estimators

    def fit(self,train,target):
        self.estimators=[]
        if self.loss=='mse':
            self.loss=mse()
        if self.loss=='log':
            self.loss=log()
        self.score_start=target.mean()
        score=np.ones(len(train))*self.score_start
        for i in range(self.n_estimators):
            estimator=Tree(
                max_depth=self.max_depth,min_sample_split=self.min_sample_split,reg_lambda=self.reg_lambda,gamma=self.gamma)
            estimator.fit(train,g=self.loss.g(target,score),h=self.loss.h(target,score))
            self.estimators.append(estimator)
            score+=self.learning_rate*estimator.predict(train)
        return self

    def predict(self,test):
        score=np.ones(len(test))*self.score_start
        for i in range(self.n_estimators):
            score+=self.learning_rate*self.estimators[i].predict(test)
        return self.loss.link(score)


class TreeNode(object):
    '''
    The data structure used for storing trees.
    A tree is represented by a set of nested TreeNodes,
    with each internal TreeNode pointing to two child TreeNodes,
    until a tree leaf is reached.

    Parameters:
    ----------
    is_leaf: Whether this TreeNode is a leaf.
    score: The prediction (score) of a tree leaf.
    split_feature: The split feature of a tree node.
    split_threshold: The split threshold of a tree node.
    left_child: Points to a child TreeNode,
        where the value of the split feature is less than the split threshold.
    right_child: Points to a child TreeNode,
        where the value of the split feature is greater than or equal to the split threshold.
    '''
    def __init__(self,
                 is_leaf=False,score=None,
                 split_feature=None,split_threshold=None,left_child=None,right_child=None):
        self.is_leaf=is_leaf
        self.score=score
        self.split_feature=split_feature
        self.split_threshold=split_threshold
        self.left_child=left_child
        self.right_child=right_child

class Tree(object):
    '''
    This is the building block for GBDT,
    which is a single decision tree,
    also known as an estimator.

    Parameters:
    ----------
    max_depth: The maximum depth of the tree.
    min_sample_split: The minimum number of samples required to further split a node.
    reg_lambda: The regularization coefficient for leaf predictions, also known as lambda.
    gamma: The regularization coefficient for the number of TreeNodes, also known as gamma.
    '''
    def __init__(self,max_depth=3,min_sample_split=10,reg_lambda=1,gamma=0):
        self.max_depth=max_depth
        self.min_sample_split=min_sample_split
        self.reg_lambda=reg_lambda
        self.gamma=gamma

    def fit(self,train,g,h):
        '''
        All inputs must be numpy arrays.
        g and h are the gradient and hessian respectively.
        '''
        self.estimator=self.construct_tree(train,g,h,self.max_depth)
        return self

    def predict(self,test):
        '''
        test must be a numpy array.
        Return predictions (scores) as an array.
        Unlike gbdt.py, prediction here is a plain loop over samples (no multiprocessing).
        '''
        result=np.zeros(len(test))
        for i in range(len(test)):
            result[i]=self.predict_single(self.estimator,test[i])
        return result

    def predict_single(self,treenode,test):
        '''
        The predict method for a single sample point.
        test must be a numpy array.
        Return the prediction (score) as a number.
        '''
        if treenode.is_leaf:
            return treenode.score
        else:
            if test[treenode.split_feature]<treenode.split_threshold:
                return self.predict_single(treenode.left_child,test)
            else:
                return self.predict_single(treenode.right_child,test)