├── .gitignore
├── EnsembleClassifiers.py
├── FunctionTransformer.py
├── LasagneUtils.py
├── Readme.md
├── XGBoostClassifier.py
└── tests
    ├── test_functiontransformer.py
    └── test_xgboostclassifier.py

/.gitignore:
--------------------------------------------------------------------------------
*.pyc

--------------------------------------------------------------------------------
/EnsembleClassifiers.py:
--------------------------------------------------------------------------------
"""
A wrapper for different ways of combining models

Authors: Henning Sperr

License: BSD-3 clause
"""
from __future__ import print_function
from itertools import combinations, izip
import random

from sklearn.base import ClassifierMixin
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

import numpy as np
from scipy.optimize import minimize


class LinearModelCombination(ClassifierMixin):
    """
    Class that combines two models linearly.

    model1/2 : fitted models to be combined
    metric : metric to minimize
    """

    def __init__(self, model1, model2, metric=log_loss):
        self.model1 = model1
        self.model2 = model2
        self.weight = None
        self.metric = metric

    def fit(self, X, y):
        scores = []
        pred1 = self.model1.predict_proba(X)
        pred2 = self.model2.predict_proba(X)

        # evaluate the metric for mixing weights 0.00, 0.01, ..., 1.00
        for i in xrange(0, 101):
            weight = i / 100.
            scores.append(
                self.metric(y, weight * pred1 + (1 - weight) * pred2))
            # linear surface, so if the score gets worse we can stop
            if len(scores) > 1 and scores[-1] > scores[-2]:
                break

        best_weight = np.argmin(scores)

        self.best_score = scores[best_weight]
        self.weight = best_weight / 100.

        return self

    def predict(self, X):
        if self.weight is None:
            raise Exception("Classifier seems to be not yet fitted")

        pred1 = self.model1.predict_proba(X) * self.weight
        pred2 = self.model2.predict_proba(X) * (1 - self.weight)
        # class with the highest combined probability for each sample
        return np.argmax(pred1 + pred2, axis=1)

    def predict_proba(self, X):
        if self.weight is None:
            raise Exception("Classifier seems to be not yet fitted")

        pred1 = self.model1.predict_proba(X) * self.weight
        pred2 = self.model2.predict_proba(X) * (1 - self.weight)
        return pred1 + pred2

    def __str__(self):
        return ' '.join(["LM: ", str(self.model1), ' - ', str(self.model2), ' W: ', str(self.weight)])


class BestEnsembleWeights(ClassifierMixin):

    """
    Use scipy's optimize package to find the best weights for a classifier combination.

    classifiers : list of classifiers
    metric : metric to optimize (log_loss by default)
    higher_is_better : set to True if larger values of the metric are better
    prefit : if True, the classifiers are assumed to be fitted already and the data
             passed to fit will be fully used for finding the best weights
    num_iter : number of random restarts for the weight optimization
    random_state : random seed
    verbose : print verbose output

    """

    def __init__(self, classifiers, metric=log_loss, voting='soft', higher_is_better=False, prefit=False, num_iter=50, random_state=None, verbose=0):
        self.classifiers = classifiers
        self.prefit = prefit
        self.metric = metric
        self.higher_is_better = higher_is_better
        self.num_iter = num_iter
        self.voting = voting
        if random_state is None:
            self.random_state = random.randint(0, 10000)
        else:
            self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y):
        if self.prefit:
            test_x, test_y = X, y
        else:
            sss = StratifiedShuffleSplit(
                y, n_iter=1, random_state=self.random_state)
            # take only the first split
            for train_index, test_index in sss:
                break

            train_x = X[train_index]
            train_y = y[train_index]

            test_x = X[test_index]
            test_y = y[test_index]

            for clf in self.classifiers:
                clf.fit(train_x, train_y)

        self._find_best_weights(test_x, test_y)
        return self

    def _find_best_weights(self, X, y):
        predictions = self._predict_probas(X)

        if self.verbose:
            print('Individual Scores:')
            for mn, pred in enumerate(predictions):
                print("Model {model_number}: {score}".format(model_number=mn,
                                                             score=self.metric(y, pred)))

        def loss_func(weights):
            ''' scipy minimize will pass the weights as a numpy array '''
            weighted_predictions = np.average(predictions, axis=0, weights=weights)
            sign = (1, -1)[self.higher_is_better]
            return sign * self.metric(y, weighted_predictions)

        # the optimizer needs a starting value; right now we start with equal
        # weights for all models. It's better to choose many random starting
        # points and run minimize a few times.
        starting_values = np.ones(len(predictions)) / (len(predictions))
        # This sets the bounds on the weights, between 0 and 1
        bounds = tuple((0, 1) for w in starting_values)

        # adding constraints and a different solver as suggested by user 16universes
        # https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
        cons = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})

        res = minimize(loss_func, starting_values,
                       method='SLSQP', bounds=bounds, constraints=cons)

        self.best_score = res['fun']
        self.best_weights = res['x']
        if self.verbose:
            print('First Iteration:')
            print('Update Ensemble Score: {best_score}'.format(best_score=res['fun']))
            print('Update Best Weights: {weights}'.format(weights=self.best_weights))

        for i in xrange(self.num_iter):
            starting_values = np.random.uniform(0, 1, size=len(predictions))

            res = minimize(loss_func, starting_values,
                           method='SLSQP', bounds=bounds, constraints=cons)

            if res['fun'] < self.best_score:
                self.best_score = res['fun']
                self.best_weights = res['x']
--------------------------------------------------------------------------------
/LasagneUtils.py:
--------------------------------------------------------------------------------
        if self.best_iteration_score > train_history[-1]['valid_loss']:
            self.best_iteration_score = train_history[-1]['valid_loss']
            self.best_iteration = len(train_history)

        if len(train_history) - self.best_iteration >= self.max_iterations:
            nn.max_epochs = train_history[-1]['epoch']


class CustomValidationSet(object):

    """
    Pass a custom validation set and a metric; the default metric is log_loss

    validation_items: pass items in the form [('name1', [test_x, test_y]),
                      ('name2', ...)]
    """

    def __init__(self, validation_items=None, metric=log_loss):
        self.validation_items = validation_items
        self.metric = metric

    def __call__(self, nn, train_history):
        if self.validation_items is None:
            return

        for name, data in self.validation_items:
            data_x, data_y = data
            print('Validating {name}: {score}'.format(
                name=name, score=self.metric(data_y, nn.predict_proba(data_x))))


class TrainRatioStopper(object):

    """
    Stops learning if train_loss/validation_loss falls below a certain ratio

    stop_ratio : the train_loss/validation_loss ratio at which to stop training
    """

    def __init__(self, stop_ratio=0.8):
        self.stop_ratio = stop_ratio

    def __call__(self, nn, train_history):
        ratio = train_history[-1]['train_loss'] / \
            train_history[-1]['valid_loss']
        if ratio < self.stop_ratio:
            nn.max_epochs = train_history[-1]['epoch']


class BestIterationSaver(object):

    """
    Saves the weights for the best iteration

    name : name of the best iteration weights file
    delayed_start : number of iterations to wait before starting to save
    verbose : print a log message when saving

    """

    def __init__(self, name='best_iteration.weights', delayed_start=10, verbose=0):
        self.best_score = None
        self.best_weights = None
        self.delayed_start = delayed_start
        self.filename = name
        self.verbose = verbose

    def __call__(self, nn, train_history):
        if len(train_history) < self.delayed_start:
            return

        if self.best_score is None or train_history[-1]['valid_loss'] < self.best_score:
            if self.verbose:
                print('Saving to {filename}'.format(filename=self.filename))

            self.best_score = train_history[-1]['valid_loss']
            nn.save_weights_to(self.filename)

--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
# Machine Learning Helpers

This repository contains several helper classes for machine learning. It tries to maintain rudimentary scikit-learn compatibility.

## Helpers

### FunctionTransformer / LasagneUtils

- applies a function element-wise to all inputs (e.g. `LogTransformer`, `PowerTransformer`); a standalone sketch follows the network example below

The example below creates a simple network with a linearly decreasing learning rate and a
linearly increasing momentum. It saves the best iteration weights after the first 10 epochs and
stops training early if there was no improvement during the last 10 iterations or if the train/validation loss ratio falls below 0.8:


```Python
import numpy as np

from lasagne.layers import DenseLayer, InputLayer, DropoutLayer
from lasagne.nonlinearities import rectify, softmax, tanh, linear
from lasagne.updates import nesterov_momentum, rmsprop, momentum
from nolearn.lasagne import NeuralNet
import theano

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from FunctionTransformer import LogTransformer
from LasagneUtils import EarlyStopper, LinearAdjustVariable, TrainRatioStopper, BestIterationSaver


layers = [('input', InputLayer),
          ('dense0', DenseLayer),
          ('output', DenseLayer),
          ]

net = NeuralNet(layers=layers,
                input_shape=(None, train_x.shape[1]),
                dense0_num_units=512,
                dense0_nonlinearity=rectify,
                output_num_units=9,
                output_nonlinearity=softmax,
                update=momentum,
                update_learning_rate=theano.shared(np.float32(0.05)),
                update_momentum=theano.shared(np.float32(0.9)),
                on_epoch_finished=[LinearAdjustVariable('update_learning_rate', start=0.05, stop=0.0001),
                                   LinearAdjustVariable('update_momentum', start=0.9, stop=0.999),
                                   TrainRatioStopper(0.8),
                                   EarlyStopper(),
                                   BestIterationSaver(verbose=1)
                                   ],
                eval_size=0.1,
                verbose=1,
                max_epochs=501)

net_ppl2 = Pipeline([('LogTrans', LogTransformer()), ('StandardScale', StandardScaler()), ('nn', net)])
net_ppl2.fit(train_x.astype(np.float32), train_y.astype(np.int32))
```
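
The transformers can also be used on their own. Below is a minimal sketch; the `LogTransformer` and `PowerTransformer` behaviour it shows mirrors what `tests/test_functiontransformer.py` asserts:

```Python
import numpy as np

from FunctionTransformer import LogTransformer, PowerTransformer

x = np.array([0., 1., 2., 3., 4.])

# LogTransformer applies log(1 + x) element-wise
print(LogTransformer().transform(x))     # ~ [0.     0.693  1.099  1.386  1.609]

# PowerTransformer(p) raises every element to the power p
print(PowerTransformer(2).transform(x))  # [ 0.  1.  4.  9. 16.]
```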

### XGBoostClassifier

- wrapper for xgboost; requires xgboost to be installed and its `wrapper/` directory on `PYTHONPATH`

Example:

```Python
from XGBoostClassifier import XGBoostClassifier

xgb = XGBoostClassifier(watchlist=[(test_x, test_y)],
                        max_samples=0.9,
                        n_iter=105,
                        random_state=1335)

xgb.fit(train_x, train_y)
xgb.predict_proba(train_x)
```

### EnsembleClassifiers

- pass a list of classifiers and find the best weights for combining them; a simple two-model blend (`LinearModelCombination`) is sketched after the example below

Example:

```Python
from EnsembleClassifiers import BestEnsembleWeights

rfc = RandomForestClassifier(...)
xgb = XGBoostClassifier(...)
logreg = LogisticRegression(...)

bew = BestEnsembleWeights([rfc, xgb, logreg], prefit=False, random_state=1337, verbose=1)
bew.fit(train_x, train_y)
bew.predict_proba(test_x)
```
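
`EnsembleClassifiers.py` also provides `LinearModelCombination`, which blends exactly two models with a single mixing weight. A minimal sketch, reusing `rfc` and `xgb` from the example above (both must already be fitted; `valid_x`/`valid_y` are placeholders for your own hold-out split):

```Python
from EnsembleClassifiers import LinearModelCombination

lmc = LinearModelCombination(rfc, xgb)
lmc.fit(valid_x, valid_y)           # scans mixing weights from 0.00 to 1.00, stopping once the score worsens
print(lmc.weight, lmc.best_score)   # chosen weight for the first model and its score
lmc.predict_proba(test_x)
```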

--------------------------------------------------------------------------------
/XGBoostClassifier.py:
--------------------------------------------------------------------------------
"""
A wrapper around XGBoost trying to maintain the Scikit Learn API

Authors: Henning Sperr

License: BSD-3 clause
"""

import random
import xgboost as xgb

from sklearn.base import BaseEstimator, ClassifierMixin

import numpy as np

class XGBoostClassifier(BaseEstimator, ClassifierMixin):
    """
    A simple wrapper around XGBoost

    more details:
    https://github.com/dmlc/xgboost/wiki/Parameters

    Parameters
    ----------

    base_estimator : can be 'gbtree' or 'gblinear'
    gamma : minimum loss reduction required to make a partition, higher values
            mean more conservative boosting
    max_depth : maximum depth of a tree
    min_child_weight : larger values mean more conservative partitioning

    objective : 'reg:linear' - linear regression
                'reg:logistic' - logistic regression
                'binary:logistic' - binary logistic regression
                'binary:logitraw' - binary logistic regression before logistic transformation
                'multi:softmax' - multiclass classification
                'multi:softprob' - multiclass classification with class probability output
                'rank:pairwise' - minimize pairwise loss for ranking

    metric : 'rmse' - root mean square error
             'logloss' - negative log likelihood
             'error' - binary classification error rate
             'merror' - multiclass error rate
             'mlogloss' - multiclass logloss
             'auc' - area under the curve for ranking evaluation
             'ndcg' - normalized discounted cumulative gain ndcg@n for top n eval
             'map' - mean average precision map@n for top n eval
    """
    def __init__(self,
                 base_estimator='gbtree',
                 objective='multi:softprob',
                 metric='mlogloss',
                 num_classes=9,
                 learning_rate=0.25,
                 max_depth=10,
                 max_samples=1.0,
                 max_features=1.0,
                 max_delta_step=0,
                 min_child_weight=4,
                 min_loss_reduction=1,
                 l1_weight=0.0,
                 l2_weight=0.0,
                 l2_on_bias=False,
                 gamma=0.02,
                 initial_bias=0.5,
                 random_state=None,
                 watchlist=None,
                 n_jobs=4,
                 n_iter=150):

        if random_state is None:
            random_state = random.randint(0, 1000000)

        param = {
            'silent': 1,
            'verbose': 0,
            'use_buffer': True,
            'base_score': initial_bias,
            'nthread': n_jobs,
            'booster': base_estimator,
            'eta': learning_rate,
            'gamma': gamma,
            'max_depth': max_depth,
            'max_delta_step': max_delta_step,
            'min_child_weight': min_child_weight,
            'min_loss_reduction': min_loss_reduction,
            'subsample': max_samples,
            'colsample_bytree': max_features,
            'alpha': l1_weight,
            'lambda': l2_weight,
            'lambda_bias': l2_on_bias,
            'objective': objective,
            'eval_metric': metric,
            'seed': random_state,
            'num_class': num_classes
        }
        self.param = param
        if not watchlist:
            self.wl = []
        else:
            self.wl = watchlist
        self.n_iter = n_iter

    def fit(self, X, y=None):
        self.booster_ = None
        X = self.convert(X, y)
        if self.wl:
            wl = [(X, 'train')]
            for i, ent in enumerate(self.wl):
                ent, lbl = ent
                wl.append((self.convert(ent, lbl), 'test-'+str(i)))
            self.booster_ = xgb.train(self.param, X, self.n_iter, wl)
        else:
            self.booster_ = xgb.train(self.param, X, self.n_iter, [(X, 'train')])

        return self

    def predict_proba(self, X):
        X = self.convert(X)
        return self.booster_.predict(X)

    def convert(self, X, y=None):
        if y is None:
            if isinstance(X, xgb.DMatrix):
                return X
            if hasattr(X, 'values'):
                X = xgb.DMatrix(X.values)
                return X
            return xgb.DMatrix(X)
        else:
            if hasattr(X, 'values'):
                X = xgb.DMatrix(X.values, y.values, missing=np.nan)
                return X
            return xgb.DMatrix(X, y, missing=np.nan)

    def predict(self, X):
        X = self.convert(X)
        probs = self.booster_.predict(X)
        return np.argmax(probs, axis=1)

    def get_params(self, deep=False):
        params = {
            'base_estimator': self.param['booster'],
            'objective': self.param['objective'],
            'metric': self.param['eval_metric'],
            'num_classes': self.param['num_class'],
            'learning_rate': self.param['eta'],
            'max_depth': self.param['max_depth'],
            'max_samples': self.param['subsample'],
            'max_features': self.param['colsample_bytree'],
            'max_delta_step': self.param['max_delta_step'],
            'min_child_weight': self.param['min_child_weight'],
            'min_loss_reduction': self.param['min_loss_reduction'],
            'l1_weight': self.param['alpha'],
            'l2_weight': self.param['lambda'],
            'l2_on_bias': self.param['lambda_bias'],
            'gamma': self.param['gamma'],
            'initial_bias': self.param['base_score'],
            'random_state': self.param['seed'],
            'watchlist': self.wl,
            'n_jobs': self.param['nthread'],
            'n_iter': self.n_iter}
        return params

    def set_params(self, **parameters):
        for parameter, value in parameters.iteritems():
            setattr(self, parameter, value)
        return self

--------------------------------------------------------------------------------
/tests/test_functiontransformer.py:
--------------------------------------------------------------------------------
import FunctionTransformer as ft
import numpy as np
import numpy.testing as npt

def test_power_transformer():
    transformer = ft.PowerTransformer(2)
    arr = np.array([-2, 0, 1, 2, 3, 4])
    res = np.array([4, 0, 1, 4, 9, 16])

    npt.assert_array_equal(res, transformer.transform(arr))

def test_power_transformer_zero_power():
    transformer = ft.PowerTransformer(0)
    arr = np.array([-2, 0, 1, 2, 3, 4])
    res = np.array([1, 1, 1, 1, 1, 1])

    npt.assert_array_equal(res, transformer.transform(arr))

def test_log_transformer():
    transformer = ft.LogTransformer()
    arr = np.array([0, 1, 2, 3, 4])
    res = np.log1p(np.array([0., 1., 2., 3., 4.]))

    npt.assert_array_almost_equal(res, transformer.transform(arr))

if __name__ == '__main__':
    import nose
    nose.runmodule(argv=[__file__, '-vvs', '-x'],
                   exit=False)
--------------------------------------------------------------------------------
/tests/test_xgboostclassifier.py:
--------------------------------------------------------------------------------
import numpy as np
import XGBoostClassifier
from sklearn.metrics import log_loss
from sklearn.datasets import make_classification

def test_xgboost_classifier():
    X, y = make_classification(random_state=1337)

    xgb = XGBoostClassifier.XGBoostClassifier(num_classes=2, n_iter=10)
    xgb.fit(X, y)
    np.testing.assert_almost_equal(log_loss(y, xgb.predict_proba(X)), 0.12696089, decimal=6)
--------------------------------------------------------------------------------