├── .gitignore
├── EnsembleClassifiers.py
├── FunctionTransformer.py
├── LasagneUtils.py
├── Readme.md
├── XGBoostClassifier.py
└── tests
    ├── test_functiontransformer.py
    └── test_xgboostclassifier.py

/.gitignore:
--------------------------------------------------------------------------------
*.pyc

--------------------------------------------------------------------------------
/EnsembleClassifiers.py:
--------------------------------------------------------------------------------
"""
A wrapper for different ways of combining models

Authors: Henning Sperr

License: BSD-3 clause
"""
from __future__ import print_function
from itertools import combinations, izip
import random

from sklearn.base import ClassifierMixin
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

import numpy as np
from scipy.optimize import minimize


class LinearModelCombination(ClassifierMixin):
    """
    Class that combines two models linearly.

    model1/2 : fitted models to be combined
    metric : metric to minimize
    """

    def __init__(self, model1, model2, metric=log_loss):
        self.model1 = model1
        self.model2 = model2
        self.weight = None
        self.metric = metric

    def fit(self, X, y):
        scores = []
        pred1 = self.model1.predict_proba(X)
        pred2 = self.model2.predict_proba(X)

        # evaluate the metric for mixing weights 0.00, 0.01, ..., 1.00
        for i in xrange(0, 101):
            weight = i / 100.
            scores.append(
                self.metric(y, weight * pred1 + (1 - weight) * pred2))
            # linear surface, so if the score gets worse we can stop
            if len(scores) > 1 and scores[-1] > scores[-2]:
                break

        best_weight = np.argmin(scores)

        self.best_score = scores[best_weight]
        self.weight = best_weight / 100.

        return self

    def predict(self, X):
        if self.weight is None:
            raise Exception("Classifier seems to be not yet fitted")

        pred1 = self.model1.predict_proba(X) * self.weight
        pred2 = self.model2.predict_proba(X) * (1 - self.weight)
        # class with the highest combined probability for each sample
        return np.argmax(pred1 + pred2, axis=1)

    def predict_proba(self, X):
        if self.weight is None:
            raise Exception("Classifier seems to be not yet fitted")

        pred1 = self.model1.predict_proba(X) * self.weight
        pred2 = self.model2.predict_proba(X) * (1 - self.weight)
        return pred1 + pred2

    def __str__(self):
        return ' '.join(["LM: ", str(self.model1), ' - ', str(self.model2), ' W: ', str(self.weight)])


class BestEnsembleWeights(ClassifierMixin):

    """
    Use scipy's optimize package to find the best weights for a classifier combination.

    classifiers : list of classifiers
    metric : metric to optimize (log_loss by default)
    higher_is_better : set to True if larger values of the metric are better
    prefit : if True, the classifiers are assumed to be fitted already and the data
             passed to fit will be fully used for finding the best weights
    num_iter : number of random restarts for the weight optimization
    random_state : random seed
    verbose : print verbose output

    """

    def __init__(self, classifiers, metric=log_loss, voting='soft', higher_is_better=False, prefit=False, num_iter=50, random_state=None, verbose=0):
        self.classifiers = classifiers
        self.prefit = prefit
        self.metric = metric
        self.higher_is_better = higher_is_better
        self.num_iter = num_iter
        self.voting = voting
        if random_state is None:
            self.random_state = random.randint(0, 10000)
        else:
            self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y):
        if self.prefit:
            test_x, test_y = X, y
        else:
            sss = StratifiedShuffleSplit(
                y, n_iter=1, random_state=self.random_state)
            # take only the first split
            for train_index, test_index in sss:
                break

            train_x = X[train_index]
            train_y = y[train_index]

            test_x = X[test_index]
            test_y = y[test_index]

            for clf in self.classifiers:
                clf.fit(train_x, train_y)

        self._find_best_weights(test_x, test_y)
        return self

    def _find_best_weights(self, X, y):
        predictions = self._predict_probas(X)

        if self.verbose:
            print('Individual Scores:')
            for mn, pred in enumerate(predictions):
                print("Model {model_number}: {score}".format(model_number=mn,
                                                             score=self.metric(y, pred)))

        def loss_func(weights):
            ''' scipy minimize will pass the weights as a numpy array '''
            weighted_predictions = np.average(predictions, axis=0, weights=weights)
            sign = (1, -1)[self.higher_is_better]
            return sign * self.metric(y, weighted_predictions)

        # the optimizer needs a starting value; right now we start with equal
        # weights for all models. It's better to choose many random starting
        # points and run minimize a few times.
        starting_values = np.ones(len(predictions)) / (len(predictions))
        # This sets the bounds on the weights, between 0 and 1
        bounds = tuple((0, 1) for w in starting_values)

        # adding constraints and a different solver as suggested by user 16universes
        # https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
        cons = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})

        res = minimize(loss_func, starting_values,
                       method='SLSQP', bounds=bounds, constraints=cons)

        self.best_score = res['fun']
        self.best_weights = res['x']
        if self.verbose:
            print('First Iteration:')
            print('Update Ensemble Score: {best_score}'.format(best_score=res['fun']))
            print('Update Best Weights: {weights}'.format(weights=self.best_weights))

        for i in xrange(self.num_iter):
            starting_values = np.random.uniform(0, 1, size=len(predictions))

            res = minimize(loss_func, starting_values,
                           method='SLSQP', bounds=bounds, constraints=cons)

            if res['fun'] < self.best_score:
                self.best_score = res['fun']
                self.best_weights = res['x']
--------------------------------------------------------------------------------
/LasagneUtils.py:
--------------------------------------------------------------------------------
        if self.best_iteration_score > train_history[-1]['valid_loss']:
            self.best_iteration_score = train_history[-1]['valid_loss']
            self.best_iteration = len(train_history)

        if len(train_history) - self.best_iteration >= self.max_iterations:
            nn.max_epochs = train_history[-1]['epoch']


class CustomValidationSet(object):

    """
    Pass a custom validation set and a metric; the default metric is log_loss

    validation_items: pass items in the form [('name1', [test_x, test_y]),
                      ('name2', ...)]
    """

    def __init__(self, validation_items=None, metric=log_loss):
        self.validation_items = validation_items
        self.metric = metric

    def __call__(self, nn, train_history):
        if self.validation_items is None:
            return

        for name, data in self.validation_items:
            data_x, data_y = data
            print('Validating {name}: {score}'.format(
                name=name, score=self.metric(data_y, nn.predict_proba(data_x))))


class TrainRatioStopper(object):

    """
    Stops learning if train_loss/validation_loss falls below a certain ratio

    stop_ratio : the train_loss/validation_loss ratio at which to stop training
    """

    def __init__(self, stop_ratio=0.8):
        self.stop_ratio = stop_ratio

    def __call__(self, nn, train_history):
        ratio = train_history[-1]['train_loss'] / \
            train_history[-1]['valid_loss']
        if ratio < self.stop_ratio:
            nn.max_epochs = train_history[-1]['epoch']


class BestIterationSaver(object):

    """
    Saves the weights for the best iteration

    name : name of the best iteration weights file
    delayed_start : number of iterations to wait before starting to save
    verbose : print a log message when saving

    """

    def __init__(self, name='best_iteration.weights', delayed_start=10, verbose=0):
        self.best_score = None
        self.best_weights = None
        self.delayed_start = delayed_start
        self.filename = name
        self.verbose = verbose

    def __call__(self, nn, train_history):
        if len(train_history) < self.delayed_start:
            return

        if self.best_score is None or train_history[-1]['valid_loss'] < self.best_score:
            if self.verbose:
                print('Saving to {filename}'.format(filename=self.filename))

            self.best_score = train_history[-1]['valid_loss']
            nn.save_weights_to(self.filename)

--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
# Machine Learning Helpers

This repository contains several helper classes for machine learning. It tries to maintain rudimentary scikit-learn compatibility.

## Helpers

### FunctionTransformer / LasagneUtils

- applies a function element-wise to all inputs (e.g. `LogTransformer`, `PowerTransformer`); a standalone sketch follows the network example below

The example below creates a simple network with a linearly decreasing learning rate and a
linearly increasing momentum. It saves the best iteration weights after the first 10 epochs and
stops training early if there was no improvement during the last 10 iterations or if the train/validation loss ratio falls below 0.8:


```Python
import numpy as np

from lasagne.layers import DenseLayer, InputLayer, DropoutLayer
from lasagne.nonlinearities import rectify, softmax, tanh, linear
from lasagne.updates import nesterov_momentum, rmsprop, momentum
from nolearn.lasagne import NeuralNet
import theano

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from FunctionTransformer import LogTransformer
from LasagneUtils import EarlyStopper, LinearAdjustVariable, TrainRatioStopper, BestIterationSaver


layers = [('input', InputLayer),
          ('dense0', DenseLayer),
          ('output', DenseLayer),
          ]

net = NeuralNet(layers=layers,
                input_shape=(None, train_x.shape[1]),
                dense0_num_units=512,
                dense0_nonlinearity=rectify,
                output_num_units=9,
                output_nonlinearity=softmax,
                update=momentum,
                update_learning_rate=theano.shared(np.float32(0.05)),
                update_momentum=theano.shared(np.float32(0.9)),
                on_epoch_finished=[LinearAdjustVariable('update_learning_rate', start=0.05, stop=0.0001),
                                   LinearAdjustVariable('update_momentum', start=0.9, stop=0.999),
                                   TrainRatioStopper(0.8),
                                   EarlyStopper(),
                                   BestIterationSaver(verbose=1)
                                   ],
                eval_size=0.1,
                verbose=1,
                max_epochs=501)

net_ppl2 = Pipeline([('LogTrans', LogTransformer()), ('StandardScale', StandardScaler()), ('nn', net)])
net_ppl2.fit(train_x.astype(np.float32), train_y.astype(np.int32))
```
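
The transformers can also be used on their own. Below is a minimal sketch; the `LogTransformer` and `PowerTransformer` behaviour it shows mirrors what `tests/test_functiontransformer.py` asserts:

```Python
import numpy as np

from FunctionTransformer import LogTransformer, PowerTransformer

x = np.array([0., 1., 2., 3., 4.])

# LogTransformer applies log(1 + x) element-wise
print(LogTransformer().transform(x))     # ~ [0.     0.693  1.099  1.386  1.609]

# PowerTransformer(p) raises every element to the power p
print(PowerTransformer(2).transform(x))  # [ 0.  1.  4.  9. 16.]
```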

### XGBoostClassifier

- wrapper for xgboost; requires xgboost to be installed and its `wrapper/` directory on `PYTHONPATH`

Example:

```Python
from XGBoostClassifier import XGBoostClassifier

xgb = XGBoostClassifier(watchlist=[(test_x, test_y)],
                        max_samples=0.9,
                        n_iter=105,
                        random_state=1335)

xgb.fit(train_x, train_y)
xgb.predict_proba(train_x)
```

### EnsembleClassifiers

- pass a list of classifiers and find the best weights for combining them; a simple two-model blend (`LinearModelCombination`) is sketched after the example below

Example:

```Python
from EnsembleClassifiers import BestEnsembleWeights

rfc = RandomForestClassifier(...)
xgb = XGBoostClassifier(...)
logreg = LogisticRegression(...)

bew = BestEnsembleWeights([rfc, xgb, logreg], prefit=False, random_state=1337, verbose=1)
bew.fit(train_x, train_y)
bew.predict_proba(test_x)
```
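
`EnsembleClassifiers.py` also provides `LinearModelCombination`, which blends exactly two models with a single mixing weight. A minimal sketch, reusing `rfc` and `xgb` from the example above (both must already be fitted; `valid_x`/`valid_y` are placeholders for your own hold-out split):

```Python
from EnsembleClassifiers import LinearModelCombination

lmc = LinearModelCombination(rfc, xgb)
lmc.fit(valid_x, valid_y)           # scans mixing weights from 0.00 to 1.00, stopping once the score worsens
print(lmc.weight, lmc.best_score)   # chosen weight for the first model and its score
lmc.predict_proba(test_x)
```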

--------------------------------------------------------------------------------
/XGBoostClassifier.py:
--------------------------------------------------------------------------------
"""
A wrapper around XGBoost trying to maintain the Scikit Learn API

Authors: Henning Sperr

License: BSD-3 clause
"""

import random
import xgboost as xgb

from sklearn.base import BaseEstimator, ClassifierMixin

import numpy as np

class XGBoostClassifier(BaseEstimator, ClassifierMixin):
    """
    A simple wrapper around XGBoost

    more details:
    https://github.com/dmlc/xgboost/wiki/Parameters

    Parameters
    ----------

    base_estimator : can be 'gbtree' or 'gblinear'
    gamma : minimum loss reduction required to make a partition, higher values
            mean more conservative boosting
    max_depth : maximum depth of a tree
    min_child_weight : larger values mean more conservative partitioning

    objective : 'reg:linear' - linear regression
                'reg:logistic' - logistic regression
                'binary:logistic' - binary logistic regression
                'binary:logitraw' - binary logistic regression before logistic transformation
                'multi:softmax' - multiclass classification
                'multi:softprob' - multiclass classification with class probability output
                'rank:pairwise' - minimize pairwise loss for ranking

    metric : 'rmse' - root mean square error
             'logloss' - negative log likelihood
             'error' - binary classification error rate
             'merror' - multiclass error rate
             'mlogloss' - multiclass logloss
             'auc' - area under the curve for ranking evaluation
             'ndcg' - normalized discounted cumulative gain ndcg@n for top n eval
             'map' - mean average precision map@n for top n eval
    """
    def __init__(self,
                 base_estimator='gbtree',
                 objective='multi:softprob',
                 metric='mlogloss',
                 num_classes=9,
                 learning_rate=0.25,
                 max_depth=10,
                 max_samples=1.0,
                 max_features=1.0,
                 max_delta_step=0,
                 min_child_weight=4,
                 min_loss_reduction=1,
                 l1_weight=0.0,
                 l2_weight=0.0,
                 l2_on_bias=False,
                 gamma=0.02,
                 initial_bias=0.5,
                 random_state=None,
                 watchlist=None,
                 n_jobs=4,
                 n_iter=150):

        if random_state is None:
            random_state = random.randint(0, 1000000)

        param = {
            'silent': 1,
            'verbose': 0,
            'use_buffer': True,
            'base_score': initial_bias,
            'nthread': n_jobs,
            'booster': base_estimator,
            'eta': learning_rate,
            'gamma': gamma,
            'max_depth': max_depth,
            'max_delta_step': max_delta_step,
            'min_child_weight': min_child_weight,
            'min_loss_reduction': min_loss_reduction,
            'subsample': max_samples,
            'colsample_bytree': max_features,
            'alpha': l1_weight,
            'lambda': l2_weight,
            'lambda_bias': l2_on_bias,
            'objective': objective,
            'eval_metric': metric,
            'seed': random_state,
            'num_class': num_classes
        }
        self.param = param
        if not watchlist:
            self.wl = []
        else:
            self.wl = watchlist
        self.n_iter = n_iter

    def fit(self, X, y=None):
        self.booster_ = None
        X = self.convert(X, y)
        if self.wl:
            wl = [(X, 'train')]
            for i, ent in enumerate(self.wl):
                ent, lbl = ent
                wl.append((self.convert(ent, lbl), 'test-'+str(i)))
            self.booster_ = xgb.train(self.param, X, self.n_iter, wl)
        else:
            self.booster_ = xgb.train(self.param, X, self.n_iter, [(X, 'train')])

        return self

    def predict_proba(self, X):
        X = self.convert(X)
        return self.booster_.predict(X)

    def convert(self, X, y=None):
        if y is None:
            if isinstance(X, xgb.DMatrix):
                return X
            if hasattr(X, 'values'):
                X = xgb.DMatrix(X.values)
                return X
            return xgb.DMatrix(X)
        else:
            if hasattr(X, 'values'):
                X = xgb.DMatrix(X.values, y.values, missing=np.nan)
                return X
            return xgb.DMatrix(X, y, missing=np.nan)

    def predict(self, X):
        X = self.convert(X)
        probs = self.booster_.predict(X)
        return np.argmax(probs, axis=1)

    def get_params(self, deep=False):
        params = {
            'base_estimator': self.param['booster'],
            'objective': self.param['objective'],
            'metric': self.param['eval_metric'],
            'num_classes': self.param['num_class'],
            'learning_rate': self.param['eta'],
            'max_depth': self.param['max_depth'],
            'max_samples': self.param['subsample'],
            'max_features': self.param['colsample_bytree'],
            'max_delta_step': self.param['max_delta_step'],
            'min_child_weight': self.param['min_child_weight'],
            'min_loss_reduction': self.param['min_loss_reduction'],
            'l1_weight': self.param['alpha'],
            'l2_weight': self.param['lambda'],
            'l2_on_bias': self.param['lambda_bias'],
            'gamma': self.param['gamma'],
            'initial_bias': self.param['base_score'],
            'random_state': self.param['seed'],
            'watchlist': self.wl,
            'n_jobs': self.param['nthread'],
            'n_iter': self.n_iter}
        return params

    def set_params(self, **parameters):
        for parameter, value in parameters.iteritems():
            setattr(self, parameter, value)
        return self

--------------------------------------------------------------------------------
/tests/test_functiontransformer.py:
--------------------------------------------------------------------------------
import FunctionTransformer as ft
import numpy as np
import numpy.testing as npt

def test_power_transformer():
    transformer = ft.PowerTransformer(2)
    arr = np.array([-2, 0, 1, 2, 3, 4])
    res = np.array([4, 0, 1, 4, 9, 16])

    npt.assert_array_equal(res, transformer.transform(arr))

def test_power_transformer_zero_power():
    transformer = ft.PowerTransformer(0)
    arr = np.array([-2, 0, 1, 2, 3, 4])
    res = np.array([1, 1, 1, 1, 1, 1])

    npt.assert_array_equal(res, transformer.transform(arr))

def test_log_transformer():
    transformer = ft.LogTransformer()
    arr = np.array([0, 1, 2, 3, 4])
    res = np.log1p(np.array([0., 1., 2., 3., 4.]))

    npt.assert_array_almost_equal(res, transformer.transform(arr))

if __name__ == '__main__':
    import nose
    nose.runmodule(argv=[__file__, '-vvs', '-x'],
                   exit=False)
--------------------------------------------------------------------------------
/tests/test_xgboostclassifier.py:
--------------------------------------------------------------------------------
import numpy as np
import XGBoostClassifier
from sklearn.metrics import log_loss
from sklearn.datasets import make_classification

def test_xgboost_classifier():
    X, y = make_classification(random_state=1337)

    xgb = XGBoostClassifier.XGBoostClassifier(num_classes=2, n_iter=10)
    xgb.fit(X, y)
    np.testing.assert_almost_equal(log_loss(y, xgb.predict_proba(X)), 0.12696089, decimal=6)
--------------------------------------------------------------------------------