├── LICENSE
├── MLP.py
├── README.md
├── gbdt.py
└── gbdt_numba.py

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 drop-out

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/MLP.py:
--------------------------------------------------------------------------------
import numpy as np
from abc import ABCMeta, abstractmethod


class loss(metaclass=ABCMeta):
    '''
    The abstract base class for loss functions.
    For each loss, the gradient should be specified.
    '''
    def obj(self,pred,true):
        pass

    def gradient(self,pred,true):
        pass

class mse(loss):
    '''Loss function for mse.'''
    def obj(self,pred,true):
        return np.square(pred-true).mean()/2

    def gradient(self,pred,true):
        return pred-true

class log_loss(loss):
    '''Loss function for log loss.'''
    def obj(self,pred,true):
        return (-np.multiply(true,np.log(pred))-np.multiply(1-true,np.log(1-pred))).mean()

    def gradient(self,pred,true):
        return -np.multiply(true,1/pred)+np.multiply(1-true,1/(1-pred))


class act(metaclass=ABCMeta):
    '''
    The abstract base class for activation functions.
    For each activation function,
    the functions used for forward and backward propagation are specified respectively.
    The two functions take the same inputs.
    The forward function returns the values after the transformation.
    The backward function returns the derivative mask at this layer.
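    For example, for relu the backward mask is 1 where the pre-activation is positive and 0 elsewhere.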
    '''
    def forward(self,matrix):
        pass

    def backward(self,matrix):
        pass

class linear(act):
    '''Linear activation function.'''
    def forward(self,matrix):
        return matrix

    def backward(self,matrix):
        return np.ones_like(matrix)

class relu(act):
    '''Rectified linear units.'''
    def forward(self,matrix):
        return np.multiply(matrix>0,matrix)

    def backward(self,matrix):
        return 1*(matrix>0)

class logistic(act):
    '''Logistic transformation.'''
    def forward(self,matrix):
        return 1/(1+np.exp(-matrix)+0.000001)

    def backward(self,matrix):
        return np.multiply(self.forward(matrix),1-self.forward(matrix))


class MLP(object):
    '''
    Parameters:
    ----------
    n_hidden_units: Number of units in the hidden layer.
    batch_size: Number of data points used in each gradient step.
    n_epochs: Number of epochs.
        Note that this determines the number of epochs (how many times each data point will be used),
        not the number of gradient steps.
    learning_rate: The learning rate of gradient descent.
    momentum: Momentum for gradient descent update. (Between 0 and 1.)
    weight_decay: Coefficient for L2 regularization. (Also known as weight decay.)
    activation: Activation function for the hidden layer.
        'relu' for rectified linear units.
        'logistic' for sigmoid activation.
        'linear' for linear activation.
    loss: Loss function.
        'mse' for regression task.
        'log_loss' for classification task.
    '''

    def __init__(self,
                 n_hidden_units=100,
                 batch_size=200,
                 n_epochs=200,
                 learning_rate=0.01,
                 momentum=0.9,
                 weight_decay=0.0001,
                 activation='relu',
                 loss='mse'):

        self.n_hidden_units=n_hidden_units
        self.batch_size=batch_size
        self.n_epochs=n_epochs
        self.learning_rate=learning_rate
        self.momentum=momentum
        self.weight_decay=weight_decay

        #activation (This is the activation function for the hidden layer.)
        if activation=='relu':
            self.act1=relu()
        elif activation=='logistic':
            self.act1=logistic()
        elif activation=='linear':
            self.act1=linear()
        else:
            self.act1=activation

        #loss (Note that the activation function for the output layer is determined by the loss.)
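        #A customized loss can also be supplied: pass a (loss_instance, output_activation_instance)
        #pair instead of a string, and it is unpacked by the else branch below.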
        if loss=='mse':
            self.loss=mse()
            self.act2=linear()
        elif loss=='log_loss':
            self.loss=log_loss()
            self.act2=logistic()
        else:
            self.loss=loss[0]
            self.act2=loss[1]

    def forward(self):
        self.layer1=self.W1*self.X+self.b1
        self.layer1act=self.act1.forward(self.layer1)
        self.score=self.W2*self.layer1act+self.b2
        self.pred=self.act2.forward(self.score)

    def backward(self):
        self.dpred=self.loss.gradient(self.pred,self.true)
        self.dscore=np.multiply(self.dpred,self.act2.backward(self.score))
        self.dlayer1act=self.W2.T*self.dscore
        self.dlayer1=np.multiply(self.dlayer1act,self.act1.backward(self.layer1))

        #L2 regularization adds weight_decay*W to the gradient of the weights.
        self.dW1=(self.dlayer1*self.X.T+self.weight_decay*self.W1)/self.batch_size
        self.db1=np.sum(self.dlayer1,axis=1)/self.batch_size
        self.dW2=(self.dscore*self.layer1act.T+self.weight_decay*self.W2)/self.batch_size
        self.db2=np.sum(self.dscore,axis=1)/self.batch_size

    def update_weights(self):
        #calculate moving average gradients (momentum)
        self.tW1 = self.momentum*self.tW1+(1-self.momentum)*self.dW1
        self.tb1 = self.momentum*self.tb1+(1-self.momentum)*self.db1
        self.tW2 = self.momentum*self.tW2+(1-self.momentum)*self.dW2
        self.tb2 = self.momentum*self.tb2+(1-self.momentum)*self.db2

        #update weights
        self.W1-=self.tW1*self.learning_rate
        self.b1-=self.tb1*self.learning_rate
        self.W2-=self.tW2*self.learning_rate
        self.b2-=self.tb2*self.learning_rate

    def fit(self,train,target):
        #turn the inputs into matrices
        train=np.matrix(train).T
        target=np.matrix(target.reshape(-1,1)).T

        #parameters
        n_features=train.shape[0]
        n_obs=train.shape[1]

        #weights initialization
        s1=np.sqrt(6/(n_features+self.n_hidden_units))
        s2=np.sqrt(6/(1+self.n_hidden_units))
        self.W1=np.matrix(np.random.uniform(-s1,s1,[self.n_hidden_units,n_features]))
        self.b1=np.matrix(np.random.uniform(-s1,s1,[self.n_hidden_units,1]))
        self.W2=np.matrix(np.random.uniform(-s2,s2,[1,self.n_hidden_units]))
        self.b2=np.matrix(np.random.uniform(-s2,s2,[1,1]))

        #momentum initialization
        self.tW1=self.W1*0
        self.tb1=self.b1*0
        self.tW2=self.W2*0
        self.tb2=self.b2*0

        #the training process
        for i in range(self.n_epochs):
            for j in range(n_obs//self.batch_size):
                self.X=train[:,j*self.batch_size:(j+1)*self.batch_size]
                self.true=target[:,j*self.batch_size:(j+1)*self.batch_size]
                self.forward()
                self.backward()
                self.update_weights()

    def predict(self,test):
        self.X=np.matrix(test).T
        self.forward()
        return np.squeeze(np.asarray(self.pred))

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Machine Learning From Scratch
Gradient Boosting Decision Tree, Support Vector Machine and Neural Network are arguably the three best machine learning algorithms that have stood the test of time.

This project implements the three algorithms with simple and neat Python code. These toy implementations may not compare to mature packages such as `xgboost` and `sklearn` in terms of speed and memory consumption, but they help illustrate how the algorithms work.

## Dependencies

All three algorithms are implemented in `Python 3.6` and built from scratch, which means that the implementation is purely based on `numpy`, with no dependency on any other machine learning package.

- [NumPy](https://github.com/numpy/numpy)

## Construction in Progress

The implementations of GBDT and NN are finished, while the SVM is still under construction.

Tutorials for GBDT and NN are provided below.

## GBDT

This implementation of GBDT supports most of the core features of `xgboost`. Briefly, it supports:

- **Built-in loss**: Mean squared loss for regression tasks and log loss for classification tasks.
- **Customized loss**: Other losses are also supported. The user should provide the link function, the gradient, and the hessian.
- **Hessian information**: It uses the Newton method for boosting, and thus makes full use of second-order derivative information.
- **Regularization**: lambda and gamma, as in `xgboost`.
- **Multi-processing**: It uses Python's `multiprocessing.Pool` for multi-processing.

To keep the code neat, some features of `xgboost` are not implemented. For example, it does not handle missing values, and randomization is not supported.

A quick start is provided below.

**Import the module**

```python
from gbdt import GBDT
```

**Initialize model**
```python
model = GBDT(n_threads=None,loss='mse',max_depth=3,min_sample_split=10,reg_lambda=1,gamma=0,
             learning_rate=0.1,n_estimators=100)
```
* `n_threads`: Number of threads for multiprocessing. `None` to use all.
* `loss`: Loss function for gradient boosting. `'mse'` is mean squared error for regression tasks and `'log'` is log loss for classification tasks. Pass a child class of the `loss` class to use a customized loss. See [source code](https://github.com/drop-out/Machine-Learning-From-Scratch/blob/master/gbdt.py#L7) for details.
* `max_depth`: The maximum depth of a tree.
* `min_sample_split`: The minimum number of samples required to further split a node.
* `reg_lambda`: The regularization coefficient for leaf scores, also known as lambda.
* `gamma`: The regularization coefficient for the number of tree nodes, also known as gamma.
* `learning_rate`: The learning rate of gradient boosting.
* `n_estimators`: Number of trees.

**Train**
```python
model.fit(train,target)
```
All inputs should be numpy arrays. `train` should be a 2D array and `target` a 1D array.

**Predict**
```python
model.predict(test)
```
Returns predictions as a numpy array.

**Customized loss**

Define a class that inherits from the `loss` class (see [source code](https://github.com/drop-out/Machine-Learning-From-Scratch/blob/master/gbdt.py#L7)), which specifies the link function, the gradient and the hessian.

```python
class customized_loss(loss):
    def link(self,score):
        return 1/(1+np.exp(-score))
    def g(self,true,score):
        pred=self.link(score)
        return pred-true
    def h(self,true,score):
        pred=self.link(score)
        return pred*(1-pred)
```

The class can then be passed when initializing the model.

```python
model = GBDT(loss=customized_loss,learning_rate=0.1,n_estimators=100)
```

## MLP

Implementing a full-featured deep learning framework is rather complicated. Instead of doing that, I wrote a simple multi-layer perceptron (MLP) with one hidden layer. This implementation of MLP supports:

- **Loss**: Mean squared loss for regression tasks and log loss for classification tasks.
- **Activation**: `relu`, `logistic` (sigmoid) and `linear` activations are supported natively.
- **Momentum**: Mini-batch gradient descent with a momentum optimizer.
- **Regularization**: L2 regularization, also known as weight decay.

A quick start is provided below.

**Import the module**

```python
from MLP import MLP
```

**Initialize model**

```python
model=MLP(n_hidden_units=100,batch_size=200,n_epochs=200,learning_rate=0.01,momentum=0.9,weight_decay=0.0001,loss='mse')
```

- `n_hidden_units`: Number of units in the hidden layer.
- `batch_size`: Number of data points used in each gradient step.
- `n_epochs`: Number of epochs. Note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps.
- `learning_rate`: The learning rate of gradient descent.
- `momentum`: Momentum for gradient descent update. (Between 0 and 1.)
- `weight_decay`: Coefficient for L2 regularization. (Also known as weight decay.)
- `activation`: Activation function for the hidden layer. `'relu'` for rectified linear units, `'logistic'` for sigmoid activation and `'linear'` for linear activation.
- `loss`: Loss function. `'mse'` for regression tasks and `'log_loss'` for classification tasks.

**Train**

```python
model.fit(train,target)
```

All inputs should be numpy arrays. `train` should be a 2D array and `target` a 1D array.

**Predict**

```python
model.predict(test)
```

Returns predictions as a numpy array.

--------------------------------------------------------------------------------
/gbdt.py:
--------------------------------------------------------------------------------
from abc import ABCMeta, abstractmethod
from multiprocessing import Pool
from functools import partial
import numpy as np


class loss(metaclass=ABCMeta):
    '''
    The abstract base class for loss functions.
    Three things should be specified for a loss,
    namely the link function, the gradient and the hessian.
    link() is the link function, which takes scores as input and returns predictions.
    g() is the gradient, which takes true values and scores as input and returns the gradient.
    h() is the hessian, which takes true values and scores as input and returns the hessian.
    All inputs and outputs are numpy arrays.
    '''
    @abstractmethod
    def link(self,score):
        pass

    @abstractmethod
    def g(self,true,score):
        pass

    @abstractmethod
    def h(self,true,score):
        pass

class mse(loss):
    '''Loss class for mse. For mse, the link function is pred=score.'''
    def link(self,score):
        return score

    def g(self,true,score):
        return score-true

    def h(self,true,score):
        return np.ones_like(score)

class log(loss):
    '''Loss class for log loss.
    For log loss, the link function is the logistic transformation.'''
    def link(self,score):
        return 1/(1+np.exp(-score))

    def g(self,true,score):
        pred=self.link(score)
        return pred-true

    def h(self,true,score):
        pred=self.link(score)
        return pred*(1-pred)


class GBDT(object):
    '''
    Parameters:
    ----------
    n_threads: The number of threads used for fitting and predicting.
    loss: Loss function for gradient boosting.
        'mse' for regression task and 'log' for classification task.
        A child class of the loss class can be passed to implement a customized loss.
    max_depth: The maximum depth of a tree.
    min_sample_split: The minimum number of samples required to further split a node.
    reg_lambda: The regularization coefficient for leaf score, also known as lambda.
    gamma: The regularization coefficient for the number of tree nodes, also known as gamma.
    learning_rate: The learning rate of gradient boosting.
    n_estimators: Number of trees.
    '''
    def __init__(self,
                 n_threads=None,
                 loss='mse',
                 max_depth=3,min_sample_split=10,reg_lambda=1,gamma=0,
                 learning_rate=0.1,n_estimators=100):
        self.n_threads=n_threads
        self.loss=loss
        self.max_depth=max_depth
        self.min_sample_split=min_sample_split
        self.reg_lambda=reg_lambda
        self.gamma=gamma
        self.learning_rate=learning_rate
        self.n_estimators=n_estimators

    def fit(self,train,target):
        self.estimators=[]
        if self.loss=='mse':
            self.loss=mse()
        if self.loss=='log':
            self.loss=log()
        self.score_start=target.mean()
        score=np.ones(len(train))*self.score_start
        for i in range(self.n_estimators):
            estimator=Tree(n_threads=self.n_threads,
                           max_depth=self.max_depth,min_sample_split=self.min_sample_split,reg_lambda=self.reg_lambda,gamma=self.gamma)
            estimator.fit(train,g=self.loss.g(target,score),h=self.loss.h(target,score))
            self.estimators.append(estimator)
            score+=self.learning_rate*estimator.predict(train)
        return self

    def predict(self,test):
        score=np.ones(len(test))*self.score_start
        for i in range(self.n_estimators):
            score+=self.learning_rate*self.estimators[i].predict(test)
        return self.loss.link(score)


class TreeNode(object):
    '''
    The data structure used for storing trees.
    A tree is represented by a set of nested TreeNodes,
    with each internal TreeNode pointing to two child TreeNodes,
    until a tree leaf is reached.

    Parameters:
    ----------
    is_leaf: Whether this TreeNode is a leaf.
    score: The prediction (score) of a tree leaf.
    split_feature: The split feature of a tree node.
    split_threshold: The split threshold of a tree node.
    left_child: Points to a child TreeNode,
        where the value of the split feature is less than the split threshold.
    right_child: Points to a child TreeNode,
        where the value of the split feature is greater than or equal to the split threshold.
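
    For example, a depth-1 tree (a single split on feature 0 at threshold 0.5) could be represented as
    TreeNode(split_feature=0,split_threshold=0.5,
             left_child=TreeNode(is_leaf=True,score=-1.0),
             right_child=TreeNode(is_leaf=True,score=1.0))
    where the leaf scores are illustrative values only.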
    '''
    def __init__(self,
                 is_leaf=False,score=None,
                 split_feature=None,split_threshold=None,left_child=None,right_child=None):
        self.is_leaf=is_leaf
        self.score=score
        self.split_feature=split_feature
        self.split_threshold=split_threshold
        self.left_child=left_child
        self.right_child=right_child

class Tree(object):
    '''
    This is the building block for GBDT,
    which is a single decision tree,
    also known as an estimator.

    Parameters:
    ----------
    n_threads: The number of threads used for fitting and predicting.
    max_depth: The maximum depth of the tree.
    min_sample_split: The minimum number of samples required to further split a node.
    reg_lambda: The regularization coefficient for leaf predictions, also known as lambda.
    gamma: The regularization coefficient for the number of TreeNodes, also known as gamma.
    '''
    def __init__(self,n_threads=None,max_depth=3,min_sample_split=10,reg_lambda=1,gamma=0):
        self.n_threads=n_threads
        self.max_depth=max_depth
        self.min_sample_split=min_sample_split
        self.reg_lambda=reg_lambda
        self.gamma=gamma

    def fit(self,train,g,h):
        '''
        All inputs must be numpy arrays.
        g and h are the gradient and hessian respectively.
        '''
        self.estimator=self.construct_tree(train,g,h,self.max_depth)
        return self

    def predict(self,test):
        '''
        test must be a numpy array.
        Return predictions (scores) as an array.
        Multiprocessing is supported for prediction.
        '''
        pool=Pool(self.n_threads)
        f=partial(self.predict_single,self.estimator)
        result=np.array(pool.map(f,test))
        pool.close()
        pool.join()
        return result

    def predict_single(self,treenode,test):
        '''
        The predict method for a single sample point.
        test must be a numpy array.
        Return the prediction (score) as a number.
        '''
        if treenode.is_leaf:
            return treenode.score
        else:
            if test[treenode.split_feature]<treenode.split_threshold:
                return self.predict_single(treenode.left_child,test)
            else:
                return self.predict_single(treenode.right_child,test)

    # ... (construct_tree and the split-search methods are missing from this excerpt;
    # only the tail of find_threshold remains below) ...
            if this_gain>best_gain:
                threshold=this_threshold
                best_gain=this_gain
        return [threshold,best_gain]

--------------------------------------------------------------------------------
/gbdt_numba.py:
--------------------------------------------------------------------------------
from abc import ABCMeta, abstractmethod
import numpy as np
from numba import jit

@jit
def leaf_score(g,h,reg_lambda):
    '''
    Given the gradient and hessian of a tree leaf,
    return the prediction (score) at this leaf.
    The score is -G/(H+λ).
    '''
    return -np.sum(g)/(np.sum(h)+reg_lambda)

@jit
def leaf_loss(g,h,reg_lambda):
    '''
    Given the gradient and hessian of a tree leaf,
    return the minimized loss at this leaf.
    The minimized loss is -0.5*G^2/(H+λ).
    '''
    return -0.5*np.square(np.sum(g))/(np.sum(h)+reg_lambda)

@jit
def calculate_gain(original_loss,feature,g,h,threshold,reg_lambda):
    '''
    Given the original loss,
    and the threshold to split on,
    calculate the gain of the split.
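    The gain is the reduction in loss achieved by the split:
    original_loss minus the sum of the leaf_loss() values of the two resulting children.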
    '''
    left_g=0
    left_h=0
    right_g=0
    right_h=0
    for i in range(len(feature)):
        if feature[i]<threshold:
            left_g+=g[i]
            left_h+=h[i]
        else:
            right_g+=g[i]
            right_h+=h[i]
    # ... (the remainder of calculate_gain and the beginning of find_threshold are missing
    # from this excerpt; only the tail of find_threshold remains below) ...
        if this_gain>best_gain:
            threshold=this_threshold
            best_gain=this_gain
    return threshold,best_gain

@jit
def find_best_split(train,g,h,reg_lambda):
    '''
    Return the best feature to split on, together with the corresponding threshold.
    Each feature is scanned by find_threshold(),
    and a (threshold,gain) tuple is returned for each feature.
    We then select the feature with the largest best_gain,
    and return the index of that feature, the threshold, and the gain that is achieved.
    '''
    train=train.T
    feature=0
    threshold=None
    best_gain=0
    for i in range(len(train)):
        this_threshold,this_gain=find_threshold(g,h,train[i],reg_lambda)
        if this_gain>best_gain:
            feature=i
            threshold=this_threshold
            best_gain=this_gain
    return feature,threshold,best_gain

class loss(metaclass=ABCMeta):
    '''
    The abstract base class for loss functions.
    Three things should be specified for a loss,
    namely the link function, the gradient and the hessian.
    link() is the link function, which takes scores as input and returns predictions.
    g() is the gradient, which takes true values and scores as input and returns the gradient.
    h() is the hessian, which takes true values and scores as input and returns the hessian.
    All inputs and outputs are numpy arrays.
    '''
    @abstractmethod
    def link(self,score):
        pass

    @abstractmethod
    def g(self,true,score):
        pass

    @abstractmethod
    def h(self,true,score):
        pass

class mse(loss):
    '''Loss class for mse. For mse, the link function is pred=score.'''
    def link(self,score):
        return score

    def g(self,true,score):
        return score-true

    def h(self,true,score):
        return np.ones_like(score)

class log(loss):
    '''Loss class for log loss. For log loss, the link function is the logistic transformation.'''
    def link(self,score):
        return 1/(1+np.exp(-score))

    def g(self,true,score):
        pred=self.link(score)
        return pred-true

    def h(self,true,score):
        pred=self.link(score)
        return pred*(1-pred)




class GBDT(object):
    '''
    Parameters:
    ----------
    loss: Loss function for gradient boosting.
        'mse' for regression task and 'log' for classification task.
        A child class of the loss class can be passed to implement a customized loss.
    max_depth: The maximum depth of a tree.
    min_sample_split: The minimum number of samples required to further split a node.
    reg_lambda: The regularization coefficient for leaf score, also known as lambda.
    gamma: The regularization coefficient for the number of tree nodes, also known as gamma.
    learning_rate: The learning rate of gradient boosting.
    n_estimators: Number of trees.
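
    Example (train is a 2D numpy array, target a 1D numpy array):
        model=GBDT(loss='mse',max_depth=3,learning_rate=0.1,n_estimators=100)
        model.fit(train,target)
        predictions=model.predict(test)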
    '''
    def __init__(self,
                 loss='mse',
                 max_depth=3,min_sample_split=10,reg_lambda=1,gamma=0,
                 learning_rate=0.1,n_estimators=100):
        self.loss=loss
        self.max_depth=max_depth
        self.min_sample_split=min_sample_split
        self.reg_lambda=reg_lambda
        self.gamma=gamma
        self.learning_rate=learning_rate
        self.n_estimators=n_estimators

    def fit(self,train,target):
        self.estimators=[]
        if self.loss=='mse':
            self.loss=mse()
        if self.loss=='log':
            self.loss=log()
        self.score_start=target.mean()
        score=np.ones(len(train))*self.score_start
        for i in range(self.n_estimators):
            estimator=Tree(
                max_depth=self.max_depth,min_sample_split=self.min_sample_split,reg_lambda=self.reg_lambda,gamma=self.gamma)
            estimator.fit(train,g=self.loss.g(target,score),h=self.loss.h(target,score))
            self.estimators.append(estimator)
            score+=self.learning_rate*estimator.predict(train)
        return self

    def predict(self,test):
        score=np.ones(len(test))*self.score_start
        for i in range(self.n_estimators):
            score+=self.learning_rate*self.estimators[i].predict(test)
        return self.loss.link(score)


class TreeNode(object):
    '''
    The data structure used for storing trees.
    A tree is represented by a set of nested TreeNodes,
    with each internal TreeNode pointing to two child TreeNodes,
    until a tree leaf is reached.

    Parameters:
    ----------
    is_leaf: Whether this TreeNode is a leaf.
    score: The prediction (score) of a tree leaf.
    split_feature: The split feature of a tree node.
    split_threshold: The split threshold of a tree node.
    left_child: Points to a child TreeNode,
        where the value of the split feature is less than the split threshold.
    right_child: Points to a child TreeNode,
        where the value of the split feature is greater than or equal to the split threshold.
    '''
    def __init__(self,
                 is_leaf=False,score=None,
                 split_feature=None,split_threshold=None,left_child=None,right_child=None):
        self.is_leaf=is_leaf
        self.score=score
        self.split_feature=split_feature
        self.split_threshold=split_threshold
        self.left_child=left_child
        self.right_child=right_child

class Tree(object):
    '''
    This is the building block for GBDT,
    which is a single decision tree,
    also known as an estimator.

    Parameters:
    ----------
    max_depth: The maximum depth of the tree.
    min_sample_split: The minimum number of samples required to further split a node.
    reg_lambda: The regularization coefficient for leaf predictions, also known as lambda.
    gamma: The regularization coefficient for the number of TreeNodes, also known as gamma.
    '''
    def __init__(self,max_depth=3,min_sample_split=10,reg_lambda=1,gamma=0):
        self.max_depth=max_depth
        self.min_sample_split=min_sample_split
        self.reg_lambda=reg_lambda
        self.gamma=gamma

    def fit(self,train,g,h):
        '''
        All inputs must be numpy arrays.
        g and h are the gradient and hessian respectively.
        '''
        self.estimator=self.construct_tree(train,g,h,self.max_depth)
        return self

    def predict(self,test):
        '''
        test must be a numpy array.
        Return predictions (scores) as an array.
        Unlike gbdt.py, prediction here is a plain loop over samples (no multiprocessing).
        '''
        result=np.zeros(len(test))
        for i in range(len(test)):
            result[i]=self.predict_single(self.estimator,test[i])
        return result

    def predict_single(self,treenode,test):
        '''
        The predict method for a single sample point.
        test must be a numpy array.
        Return the prediction (score) as a number.
        '''
        if treenode.is_leaf:
            return treenode.score
        else:
            if test[treenode.split_feature]<treenode.split_threshold:
                return self.predict_single(treenode.left_child,test)
            else:
                return self.predict_single(treenode.right_child,test)