├── treelearn ├── clustered_classifier.py ├── test_helpers.py ├── typecheck.py ├── test_iris.py ├── __init__.py ├── breadth_first.py ├── test_regression.py ├── regression_ensemble.py ├── constant_leaf.py ├── clustered_regression.py ├── test_randomized_tree.py ├── test_ensembles.py ├── test_tree_helpers.py ├── clustered.py ├── tree_node.py ├── viterbi_tree.py ├── oblique_tree.py ├── oblique_tree_node.py ├── base_ensemble.py ├── classifier_ensemble.py ├── randomized_tree.py ├── recipes.py └── tree_helpers.py ├── setup.py ├── README.md ├── LICENSE └── distribute_setup.py /treelearn/clustered_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from clustered import ClusteredEstimator 3 | from sklearn.svm import LinearSVC 4 | from copy import deepcopy 5 | 6 | class ClusteredClassifier(ClusteredEstimator): 7 | def __init__(self, k=10, base_model = LinearSVC(), verbose=False): 8 | ClusteredEstimator.__init__(self, k, base_model, verbose) 9 | 10 | -------------------------------------------------------------------------------- /treelearn/test_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def split_dataset(x, y, prct_train=0.5): 4 | nrows, ncols = x.shape 5 | indices = np.arange(nrows) 6 | np.random.shuffle(indices) 7 | ntrain = int(nrows * prct_train) 8 | train_indices = indices[:ntrain] 9 | test_indices = indices[ntrain:] 10 | x_train = x[train_indices, :] 11 | x_test = x[test_indices, :] 12 | y_train = y[train_indices] 13 | y_test = y[test_indices] 14 | return x_train, y_train, x_test, y_test 15 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distribute_setup import use_setuptools 2 | use_setuptools() 3 | 4 | from setuptools import setup 5 | 6 | setup( 7 | name = "treelearn", 8 | version = "0.0.10", 9 | #package_dir = { '' : 'treelearn' }, 10 | packages = ['treelearn'], 11 | install_requires = [ 'scikit-learn' ], 12 | license = "LGPL", 13 | keywords = "machine learning tree forest random", 14 | url = "https://github.com/capitalk/treelearn", 15 | classifiers=[ 16 | "Development Status :: 3 - Alpha", 17 | "Topic :: Utilities", 18 | "License :: OSI Approved :: LGPL License", 19 | ], 20 | ) 21 | -------------------------------------------------------------------------------- /treelearn/typecheck.py: -------------------------------------------------------------------------------- 1 | 2 | def check_type(x, t): 3 | if not isinstance(x,t): 4 | msg = "Expected %s : %s to be %s" % (x, type(x), t) 5 | raise RuntimeError(msg) 6 | 7 | def check_field(x,f): 8 | if not hasattr(x,f): 9 | msg = "Expected %s : %s to have field %s" % (x, type(x), f) 10 | raise RuntimeError(msg) 11 | 12 | def check_fields(x,fs): 13 | for f in fs: 14 | check_field(x,f) 15 | 16 | def check_estimator(x): 17 | check_fields(x, ['fit', 'predict']) 18 | 19 | def check_int(x): 20 | check_type(x, int) 21 | 22 | def check_bool(x): 23 | check_type(x, bool) 24 | 25 | def check_dict(x): 26 | check_type(x, dict) 27 | -------------------------------------------------------------------------------- /treelearn/test_iris.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import recipes 4 | import numpy as np 5 | import sklearn.datasets 6 | from test_helpers import split_dataset 7 | 8 | iris = sklearn.datasets.load_iris() 9 | 
x_train, y_train, x_test, y_test = split_dataset(iris.data, iris.target)
10 | 
11 | 
12 | classifiers = [
13 |     recipes.train_svm_tree,
14 |     recipes.train_sgd_tree,
15 |     #recipes.train_svm_forest,
16 |     #recipes.train_sgd_forest,
17 |     recipes.train_random_forest,
18 |     recipes.train_clustered_svm,
19 |     recipes.train_clustered_svm_ensemble
20 | ]
21 | 
22 | def test_all_classifiers():
23 |     for model_constructor in classifiers:
24 | 
25 |         print model_constructor
26 |         model = model_constructor(x_train, y_train)
27 |         print model
28 |         pred = model.predict(x_test)
29 |         num_incorrect = np.sum(pred != y_test)
30 |         print "Expected num_incorrect <= 15, got:", num_incorrect
31 |         assert num_incorrect <= 15
32 | 
33 | 
--------------------------------------------------------------------------------
/treelearn/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | from constant_leaf import ConstantLeaf
3 | from tree_node import TreeNode
4 | from randomized_tree import RandomizedTree
5 | from oblique_tree import ObliqueTree
6 | from classifier_ensemble import ClassifierEnsemble
7 | from regression_ensemble import RegressionEnsemble
8 | from clustered_regression import ClusteredRegression
9 | from clustered_classifier import ClusteredClassifier
10 | from recipes import *
11 | 
12 | __all__ = [
13 |     'ClassifierEnsemble', 'RegressionEnsemble',
14 |     'ClusteredRegression', 'ClusteredClassifier',
15 |     'RandomizedTree', 'TreeNode', 'ConstantLeaf',
16 |     'train_random_forest',
17 |     'ObliqueTree',
18 |     'mk_svm_tree', 'train_svm_tree',
19 |     'mk_sgd_tree','train_sgd_tree',
20 |     'train_svm_forest', 'train_sgd_forest',
21 |     'mk_clustered_regression_ensemble', 'train_clustered_regression_ensemble',
22 |     'mk_clustered_classifier_ensemble', 'train_clustered_classifier_ensemble',
23 |     'train_clustered_ols',
24 |     'mk_additive_regression_forest', 'train_additive_regression_forest',
25 | ]
26 | 
--------------------------------------------------------------------------------
/treelearn/breadth_first.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | class BaseTree(object):
4 | 
5 |     def process_work_items(self, work_items):
6 |         max_pos = max(item.pos for item in work_items)
7 |         if max_pos > len(self.values):
8 |             self.grow_tree()
9 | 
10 |         # work items that need to create leaf nodes on the GPU
11 |         leaves = []
12 | 
13 |         # work items that need to be fully grown into small subtrees per thread block
14 |         subtrees = []
15 | 
16 |         # work items that need a single split but are small enough to be loaded into shared memory
17 |         small_splits = []
18 | 
19 |         # work items that have enough features to justify
20 |         # each one getting its own thread block
21 |         block_per_feature = []
22 | 
23 |         # otherwise, launch a kernel for each feature
24 |         kernel_per_feature = []
25 | 
26 |         for item in work_items:
27 |             if item.nelts == 1 or item.purity == 1.0:
28 |                 leaves.append(item)
29 |             elif item.nelts <= 32:
30 |                 subtrees.append(item)
31 |             elif item.nelts * item.n_features * self.values.itemsize <= 4096:
32 |                 small_splits.append(item)
33 |             elif item.n_features > 30:
34 |                 block_per_feature.append(item)
35 |             else:
36 |                 kernel_per_feature.append(item)
37 | 
38 | 
39 | 
40 | 
41 | 
--------------------------------------------------------------------------------
/treelearn/test_regression.py:
--------------------------------------------------------------------------------
1 | 
2 | import recipes
3 | import test_helpers
4 | from sklearn.datasets import make_friedman1, make_friedman2, make_friedman3
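# the make_friedman* helpers build synthetic nonlinear regression benchmarks;
# only make_friedman2 is actually used in the test below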
5 | from sklearn.metrics import mean_square_error 6 | from sklearn.linear_model import LinearRegression 7 | 8 | 9 | 10 | regressors = [ 11 | recipes.train_clustered_ols, 12 | lambda x, y: recipes.train_clustered_regression_ensemble(x, y, feature_subset_percent=1), 13 | #recipes.train_additive_regression_forest, 14 | recipes.train_random_forest 15 | ] 16 | def test_all_regressors(): 17 | x, y = make_friedman2(10000) 18 | x_train, y_train, x_test, y_test = test_helpers.split_dataset(x,y) 19 | #print y_test[:100] 20 | ols = LinearRegression() 21 | ols.fit(x_train, y_train) 22 | ols_pred = ols.predict(x_test) 23 | #print ols_pred[:100] 24 | ols_mse = mean_square_error(y_test, ols_pred) 25 | 26 | for fn in regressors: 27 | 28 | print fn 29 | model = fn(x_train,y_train) 30 | print model 31 | pred = model.predict(x_test) 32 | #print pred[:100] 33 | mse = mean_square_error(y_test, pred) 34 | 35 | print "OLS MSE:", ols_mse, " Current MSE:", mse 36 | print "Ratio:", mse / ols_mse 37 | assert ols_mse > 1.1*mse 38 | 39 | 40 | -------------------------------------------------------------------------------- /treelearn/regression_ensemble.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | from base_ensemble import BaseEnsemble 4 | 5 | class RegressionEnsemble(BaseEnsemble): 6 | def __init__(self, 7 | base_model=LinearRegression(), 8 | num_models = 50, 9 | bagging_percent=0.5, 10 | bagging_replacement=True, 11 | feature_subset_percent = 1.0, 12 | stacking_model=None, 13 | randomize_params = {}, 14 | additive = False, 15 | verbose=False): 16 | 17 | BaseEnsemble.__init__(self, 18 | base_model, 19 | num_models, 20 | bagging_percent, 21 | bagging_replacement, 22 | feature_subset_percent, 23 | stacking_model, 24 | randomize_params, 25 | additive, 26 | verbose) 27 | 28 | 29 | 30 | def predict(self, X): 31 | pred = self.transform(X) 32 | if self.stacking_model: 33 | return self.stacking_model.predict(pred) 34 | else: 35 | return np.dot(pred, self.weights) 36 | 37 | def _init_fit(self, X, Y): 38 | pass 39 | 40 | def _created_model(self, X, Y, indices, i, model): 41 | pass 42 | -------------------------------------------------------------------------------- /treelearn/constant_leaf.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 
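#
# Quick illustration of what this module provides: a ConstantLeaf ignores its
# inputs and predicts the same value for every row, e.g.
#
#     leaf = ConstantLeaf(1)
#     leaf.predict(np.zeros((3, 4)))   # -> array([1, 1, 1])
#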
16 | 17 | 18 | import numpy as np 19 | 20 | class ConstantLeaf: 21 | """Decision tree node which always predicts the same value.""" 22 | def __init__(self, v): 23 | self.v = v 24 | 25 | def to_str(self, indent="", feature_names=None): 26 | return indent + "Constant(" + str(self.v) + ")" 27 | 28 | def __str__(self): 29 | return self.to_str() 30 | 31 | def predict(self, X): 32 | X = np.atleast_2d(X) 33 | if isinstance(self.v, int): 34 | dtype = 'int32' 35 | else: 36 | dtype = 'float' 37 | outputs = np.zeros(X.shape[0], dtype=dtype) 38 | outputs[:] = self.v 39 | return outputs 40 | 41 | def fill_predict(self, X, outputs, mask): 42 | outputs[mask] = self.v 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /treelearn/clustered_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from clustered import ClusteredEstimator 3 | from sklearn.linear_model import LinearRegression 4 | from copy import deepcopy 5 | 6 | class ClusteredRegression(ClusteredEstimator): 7 | def __init__( 8 | self, 9 | k=10, 10 | base_model = LinearRegression(), 11 | cluster_prediction_weights = 'hard', # or 'soft' 12 | verbose=False): 13 | ClusteredEstimator.__init__(self, k, base_model, verbose) 14 | self.cluster_prediction_weights = cluster_prediction_weights 15 | 16 | def predict(self, X): 17 | nrows = X.shape[0] 18 | Y = np.zeros(nrows) 19 | if self.cluster_prediction_weights == 'hard': 20 | labels = self.clusters.predict(X) 21 | 22 | for label in self.models.keys(): 23 | mask = (labels == label) 24 | X_slice = X[mask, :] 25 | Y_slice = self.models[label].predict(X_slice) 26 | Y[mask] = Y_slice 27 | else: 28 | distances = self.clusters.transform(X) 29 | inv_dist_squared = 1.0 / (distances ** 2) 30 | Z = np.sum(inv_dist_squared, axis=1) 31 | # normalize weights so they add to 1 32 | weights = inv_dist_squared / np.array([Z], dtype='float').T 33 | if self.verbose: 34 | "First row of weights:", weights[0, :] 35 | for label in self.models.keys(): 36 | Y_curr = self.models[label].predict(X) 37 | Y += Y_curr * weights[:, label] 38 | return Y 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /treelearn/test_randomized_tree.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 
16 | 17 | 18 | import numpy as np 19 | import randomized_tree as tree 20 | 21 | def test_simple_tree(): 22 | data = np.array([[0,0], [0.1, 0.1], [1.0, 1.0], [.99,.99]]) 23 | labels = np.array([0,0,1,1]) 24 | t = tree.RandomizedTree(min_leaf_size=1) 25 | t.fit(data,labels) 26 | print t 27 | pred0 = t.predict(np.array([0.05, 0.05])) 28 | print "Expected: 0, Received:", pred0 29 | assert pred0 == 0 30 | 31 | pred1 = t.predict(np.array([0.995, 0.995])) 32 | print "Expected: 1, Received:", pred1 33 | assert pred1 == 1 34 | 35 | def test_big_tree(n=1000, d = 50, max_thresholds=10): 36 | t = tree.RandomizedTree(max_thresholds=max_thresholds) 37 | x = np.random.randn(n,d) 38 | y = np.random.randint(0,2,n) 39 | t.fit(x,y) 40 | return t 41 | 42 | def test_binary_data(n = 1000, d = 50): 43 | t = tree.RandomizedTree() 44 | x = np.random.randint(0,2, [n,d]) 45 | y = np.random.randint(0,2,n) 46 | t.fit(x,y) 47 | return t 48 | -------------------------------------------------------------------------------- /treelearn/test_ensembles.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 16 | 17 | 18 | import numpy as np 19 | from recipes import train_random_forest, train_svm_forest 20 | from classifier_ensemble import ClassifierEnsemble 21 | from randomized_tree import RandomizedTree 22 | from sklearn.linear_model import LogisticRegression 23 | 24 | n = 200 25 | left_data = np.random.randn(n, 10) 26 | left_labels = np.zeros(n, dtype='int') 27 | 28 | right_data = 10*(np.random.randn(n,10)-2) 29 | right_labels = np.ones(n, dtype='int') 30 | 31 | data = np.concatenate([left_data, right_data]) 32 | labels = np.concatenate([left_labels, right_labels]) 33 | 34 | 35 | def try_predictor(model): 36 | print "Trying predictor:", model 37 | 38 | pred0 = model.predict(left_data) 39 | fp = np.sum(pred0 != 0) 40 | print "False positives:", fp 41 | assert fp < (n / 10) 42 | 43 | pred1 = model.predict(right_data) 44 | fn = np.sum(pred1 != 1) 45 | print "False negatives:", fn 46 | assert fn < (n/ 10) 47 | 48 | 49 | def test_simple_forest(): 50 | try_predictor(train_random_forest(data, labels)) 51 | 52 | def test_svm_forest(): 53 | try_predictor(train_svm_forest(data, labels, tree_args={'verbose':True})) 54 | 55 | def test_stacked_random_forest(): 56 | t = RandomizedTree(min_leaf_size=1) 57 | lr = LogisticRegression() 58 | ensemble = ClassifierEnsemble(base_model=t, stacking_model=lr) 59 | ensemble.fit(data, labels) 60 | try_predictor(ensemble) 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | TreeLearn started as a Python implementation of Breiman's Random Forest 2 | but is being slowly generalized into a tree ensemble library. 
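
For a quick end-to-end example (a sketch using scikit-learn's bundled iris data):

    import sklearn.datasets
    import treelearn

    iris = sklearn.datasets.load_iris()
    forest = treelearn.train_random_forest(iris.data, iris.target)
    print forest.predict(iris.data[:5])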
3 | 
4 | 
5 | ## Creating a Random Forest
6 | 
7 | A random forest is simply a bagging ensemble of randomized trees. To construct
8 | one with default parameters:
9 | 
10 |     forest = treelearn.ClassifierEnsemble(base_model = treelearn.RandomizedTree())
11 | 
12 | 
13 | ## Training
14 | 
15 | Place your training data in an n-by-d numpy array, where n is the number of
16 | training examples and d is the dimensionality of your data.
17 | Place labels in an n-length numpy array. Then call:
18 | 
19 |     forest.fit(Xtrain,Y)
20 | 
21 | If you're lazy, there's a helper for simultaneously creating and training a random forest:
22 | 
23 |     forest = treelearn.train_random_forest(X, Y)
24 | 
25 | 
26 | ## Classification
27 | 
28 |     forest.predict(Xtest)
29 | 
30 | 
31 | ## ClassifierEnsemble options
32 | 
33 | * base_model = any classifier which obeys the fit/predict protocol
34 | 
35 | * num_models = size of the forest
36 | 
37 | * bagging_percent = what percentage of your data each classifier is trained on
38 | 
39 | * bagging_replacement = sample with or without replacement
40 | 
41 | * stacking_model = treat outputs of base classifiers as inputs to the given model
42 | 
43 | 
44 | ## RandomizedTree options
45 | 
46 | * num_features_per_node = number of features each node of a tree should
47 | consider (default = log2 of total features)
48 | 
49 | * min_leaf_size = stop splitting if we get down to this number of data points
50 | 
51 | * max_height = stop splitting if we exceed this number of tree levels
52 | 
53 | * max_thresholds = how many feature value thresholds to consider (use None for all values)
54 | 
55 | ## ObliqueTree options
56 | * num_features_per_node = size of random feature subset at each node,
57 | default = sqrt(total number of features)
58 | 
59 | * C = tradeoff between error and L2 regularizer of linear SVM
60 | 
61 | * max_depth = When you get to this depth, train an SVM on all features
62 | and stop splitting the data.
63 | 
64 | * min_leaf_size = stop splitting when any subset of the data gets smaller
65 | than this.
66 | 
--------------------------------------------------------------------------------
/treelearn/test_tree_helpers.py:
--------------------------------------------------------------------------------
1 | # TreeLearn
2 | #
3 | # Copyright (C) Capital K Partners
4 | # Author: Alex Rubinsteyn
5 | # Contact: alex [at] capitalkpartners [dot] com
6 | #
7 | # This library is free software; you can redistribute it and/or
8 | # modify it under the terms of the GNU Lesser General Public
9 | # License as published by the Free Software Foundation; either
10 | # version 2.1 of the License, or (at your option) any later version.
11 | #
12 | # This library is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 | # Lesser General Public License for more details.
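#
# Reference for the expected values below: Gini impurity is 1 - sum_c p_c^2
# over the class proportions p_c.  For the all-zero label set this is
# 1 - 1.0**2 = 0.0, and for the evenly mixed labels [0,1,0,1] it is
# 1 - (0.5**2 + 0.5**2) = 0.5, which is what test_gini asserts.
#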
16 | 17 | 18 | import numpy as np 19 | 20 | from tree_helpers import * 21 | 22 | def test_majority(): 23 | labels = np.array([1,1,1,2,3,3,3,2,3,3,3,1,3,3,3,3]) 24 | result = majority(labels) 25 | print "Expected 3:, Received:", result 26 | assert result == 3 27 | classes = [1,2] 28 | result = majority(labels, classes) 29 | print "Expected 1:, Received:", result 30 | assert result == 1 31 | 32 | 33 | classes = np.array([0,1]) 34 | all_zero = np.array([0,0,0,0]) 35 | mixed = np.array([0,1,0,1]) 36 | def test_gini(): 37 | result1 = gini(classes, all_zero) 38 | print "Expected 0.0, Received:", result1 39 | assert result1 == 0.0 40 | result2 = gini(classes, mixed) 41 | print "Expected 0.5, Received:", result2 42 | assert result2 == 0.5 43 | 44 | feature_vec = np.array([0.1, 0.5, 0.9, 1.1]) 45 | def test_eval_split(): 46 | slow = slow_eval_split(classes, feature_vec, 0.5, mixed) 47 | print "Slow GINI", slow 48 | fast = eval_gini_split(classes, feature_vec, 0.5, mixed) 49 | print "Fast GINI", fast 50 | assert abs(slow - fast) < 0.01 51 | 52 | 53 | labels = np.array([0, 0, 1, 1]) 54 | thresholds = midpoints(np.unique(feature_vec)) 55 | def test_eval_all_splits(): 56 | thresh_slow, score_slow = slow_find_best_gini_split(classes, feature_vec, thresholds, labels) 57 | print "Slow Thresh", thresh_slow, "Score", score_slow 58 | 59 | thresh_fast, score_fast = find_best_gini_split(classes, feature_vec, thresholds, labels) 60 | print "Fast Thresh", thresh_fast, "Score", score_fast 61 | assert thresh_slow == 0.7 62 | assert thresh_fast == 0.7 63 | 64 | -------------------------------------------------------------------------------- /treelearn/clustered.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import deepcopy 3 | 4 | 5 | from sklearn.base import BaseEstimator 6 | from sklearn.cluster import MiniBatchKMeans 7 | from sklearn.linear_model import LinearRegression 8 | 9 | from typecheck import check_estimator, check_dict, check_int, check_bool 10 | from tree_helpers import clear_sklearn_fields 11 | 12 | class ClusteredEstimator(BaseEstimator): 13 | """Base class for ClusteredRegression and ClusteredClassifier""" 14 | def __init__(self, k, base_model, verbose=False): 15 | check_int(k) 16 | check_estimator(base_model) 17 | check_bool(verbose) 18 | 19 | self.k = k 20 | self.base_model = base_model 21 | self.verbose = verbose 22 | self.clusters = MiniBatchKMeans(k) 23 | self.models = None 24 | 25 | def fit(self, X, Y, **fit_keywords): 26 | self.models = {} 27 | if self.verbose: 28 | print "Clustering X" 29 | # also get back the labels so we can use them to create regressors 30 | self.clusters.fit(X) 31 | labels = self.clusters.labels_ 32 | # clear this field so that it doesn't get serialized later 33 | self.clusters.labels_ = None 34 | for label in np.unique(labels): 35 | if self.verbose: 36 | print "Fitting model for cluster", label 37 | model = deepcopy(self.base_model) 38 | mask = (labels == label) 39 | X_slice = X[mask, :] 40 | Y_slice = Y[mask] 41 | model.fit(X_slice, Y_slice, **fit_keywords) 42 | 43 | # clear sklearn's left over junk to make pickled strings smaller 44 | clear_sklearn_fields(model) 45 | self.models[label] = model 46 | 47 | def predict(self, X): 48 | if self.verbose: 49 | print "Prediction inputs of shape", X.shape 50 | nrows = X.shape[0] 51 | Y = np.zeros(nrows) 52 | if self.verbose: 53 | print "Assigning cluster labels to input data" 54 | labels = self.clusters.predict(X) 55 | for label in self.models.keys(): 56 | 
mask = (labels == label) 57 | if self.verbose: 58 | print "Predicting cluster", label, "nvectors = ", np.sum(mask) 59 | 60 | X_slice = X[mask, :] 61 | model = self.models[label] 62 | Y[mask] = model.predict(X_slice) 63 | return Y 64 | 65 | -------------------------------------------------------------------------------- /treelearn/tree_node.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 16 | 17 | from sklearn.base import BaseEstimator 18 | 19 | class TreeNode(BaseEstimator): 20 | """Basic decision tree interior node.""" 21 | 22 | def __init__(self, feature_idx, split_val, left, right): 23 | self.feature_idx = feature_idx 24 | self.split_val = split_val 25 | self.left = left 26 | self.right = right 27 | 28 | 29 | def predict(self, X): 30 | """Inefficient since calling this method recursively copy outputs""" 31 | outputs = np.zeros(X.shape[0]) 32 | col = X[:, self.feature_idx] 33 | split = col < self.split_val 34 | left_mask = mask & split 35 | outputs[left_mask] = self.left.predict(X[left_mask, :]) 36 | right_mask = mask & ~split 37 | outputs[right_mask] = self.right.predict(X[right_mask, :]) 38 | return outputs 39 | 40 | 41 | def fill_predict(self, X, outputs, mask): 42 | """instead of returning output values, let the leaves fill an 43 | output matrix 44 | """ 45 | col = X[:, self.feature_idx] 46 | split = col < self.split_val 47 | left_mask = mask & split 48 | right_mask = mask & ~split 49 | self.left.fill_predict(X, outputs, left_mask) 50 | self.right.fill_predict(X, outputs, right_mask) 51 | 52 | 53 | def to_str(self, indent="", feature_names=None): 54 | if feature_names: 55 | featureStr = feature_names[feature_idx] 56 | else: 57 | featureStr = "x[" + str(self.feature_idx) + "]" 58 | longer_indent = indent + " " 59 | left = self.left.to_str(indent = longer_indent) 60 | right = self.right.to_str(indent = longer_indent) 61 | cond = "if %s < %f:" % (featureStr, self.split_val) 62 | return indent + cond + "\n" + left + "\n" + indent + "else:\n" + right 63 | 64 | 65 | -------------------------------------------------------------------------------- /treelearn/viterbi_tree.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 
11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 16 | 17 | 18 | from sklearn.base import BaseEstimator 19 | from sklearn.linear_model import LogisticRegression 20 | from sklearn.svm import LinearSVC 21 | 22 | class ViterbiTreeNode(BaseEstimator): 23 | def __init__(self, depth, max_depth, num_retries, leaf_model): 24 | self.depth = depth 25 | self.max_depth = max_depth 26 | self.is_leaf = (depth == max_depth) 27 | self.num_retries = num_retries 28 | self.C = C 29 | if depth == max_depth: 30 | self.leaf_model = leaf_model 31 | else: 32 | self.left = ViterbiTreeNode(depth+1, max_depth, num_retries, leaf_model) 33 | self.right = ViterbiTreeNode(depth+1, max_depth, num_retries, leaf_model) 34 | 35 | def gen_random_cs(self): 36 | return 10 ** (np.random.randn(self.num_retries) - 1) 37 | 38 | def init_fit(self, X,Y): 39 | """Initialize partitions and leaf models to minimize training error""" 40 | best_model = None 41 | best_error = np.inf 42 | 43 | for c in self.gen_random_cs(): 44 | if self.is_leaf: 45 | model = self.leaf_model(C=c) 46 | else: 47 | model = LinearSVC(C=c) 48 | 49 | model.fit(X,Y) 50 | error = model.score(X,Y) 51 | if err < best_error: 52 | best_model = model 53 | best_error = error 54 | self.model = best_model 55 | if not self.is_leaf: 56 | pred = model.predict(X) 57 | mask = (pred != 1) 58 | self.left.init_fit(X[mask, :], Y[mask]) 59 | self.right.init_fit(X[~mask, :], Y[~mask]) 60 | 61 | def refit_partition(X,partition,Y): 62 | """Assumes that 'init_fit' has already been run.""" 63 | if self.is_leaf: 64 | self.model.fit(X,Y) 65 | else: 66 | nrows = X.shape[0] 67 | # get probabilities of y=1 68 | left_prob = self.left.predict_proba(X)[:, 1] 69 | right_prob = self.right.predict_proba(X)[:, 1] 70 | assignments = np.zeros(nrows) 71 | right_mask = (left_prob < right_prob) & Y == 1 72 | 73 | # TODO 74 | # assignments[] 75 | def refit_leaves(X,Y): 76 | # TODO 77 | pass 78 | 79 | def predict(X): 80 | # TODO 81 | pass 82 | 83 | class ViterbiTree(BaseEstimator): 84 | def __init__(self, max_depth=3, num_retries = 3, leaf_model=LogisticRegression): 85 | self.root = ViterbiTreeNode(1, max_depth, num_retries, leaf_model) 86 | 87 | def fit(self, X, Y): 88 | self.root.init_fit(X,Y) 89 | 90 | def predict(self, X) 91 | return self.root.predict(X) 92 | -------------------------------------------------------------------------------- /treelearn/oblique_tree.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 
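#
# Usage sketch (illustrative only; the parameter values here are arbitrary and
# x_train / y_train stand in for your own arrays):
#
#     from treelearn import ObliqueTree
#     tree = ObliqueTree(max_depth=4, min_leaf_size=20)
#     tree.fit(x_train, y_train)
#     pred = tree.predict(x_test)
#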
16 | 
17 | 
18 | from copy import deepcopy
19 | import math
20 | import numpy as np
21 | from sklearn.base import BaseEstimator
22 | from sklearn.svm import LinearSVC
23 | 
24 | from tree_helpers import majority, clear_sklearn_fields
25 | from typecheck import check_estimator, check_dict, check_int, check_bool
26 | from oblique_tree_node import _ObliqueTreeNode
27 | 
28 | 
29 | class ObliqueTree(BaseEstimator):
30 |     """A decision tree whose splits are hyperplanes.
31 |     Used as base learner for oblique random forests.
32 |     For more information, see 'On oblique random forests'.
33 |     http://people.csail.mit.edu/menze/papers/menze_11_oblique.pdf
34 | 
35 |     Parameters
36 |     ----------------
37 |     leaf_model : classifier or regressor.
38 | 
39 |     split_classifier : classifier, optional (default = LinearSVC())
40 |         Learning machine used to assign data points to either side of a tree
41 |         split.
42 | 
43 |     num_features_per_node : int, optional (default = sqrt of total feature count)
44 | 
45 |     max_depth : int, optional (default=3).
46 |         The number of SVMs will be at most 2^max_depth
47 | 
48 |     min_leaf_size : int, optional (default=50).
49 |         Don't split data if it gets smaller than this number.
50 | 
51 |     randomize_split_params : dict, optional (default={})
52 |         Maps names of split classifier parameters to functions which generate
53 |         random values.
54 | 
55 |     randomize_leaf_params : dict, optional (default={})
56 |         Maps names of leaf model params to functions which randomly generate
57 |         their values.
58 |     """
59 | 
60 |     def __init__(self,
61 |             leaf_model = LinearSVC(),
62 |             split_classifier = LinearSVC(),
63 |             num_features_per_node = None,
64 |             max_depth=3,
65 |             min_leaf_size=50,
66 |             randomize_split_params={},
67 |             randomize_leaf_params={},
68 |             verbose = False):
69 | 
70 |         # check everyone's types -- I can't give up the OCaml instincts
71 |         # also, if running this code remotely it's nice to know when something
72 |         # goes wrong before we send an object over to AWS
73 |         check_estimator(leaf_model)
74 |         check_estimator(split_classifier)
75 |         check_int(max_depth)
76 |         check_int(min_leaf_size)
77 |         check_dict(randomize_split_params)
78 |         check_dict(randomize_leaf_params)
79 |         check_bool(verbose)
80 | 
81 |         self.leaf_model = leaf_model
82 |         self.split_classifier = split_classifier
83 |         self.max_depth = max_depth
84 |         self.min_leaf_size = min_leaf_size
85 |         self.num_features_per_node = num_features_per_node
86 | 
87 |         self.randomize_split_params = randomize_split_params
88 |         self.randomize_leaf_params = randomize_leaf_params
89 |         self.verbose = verbose
90 | 
91 |         self.root = None
92 |         self.classes = None
93 | 
94 |     def fit(self, X, Y, **fit_keywords):
95 |         X = np.atleast_2d(X)
96 |         Y = np.atleast_1d(Y)
97 | 
98 |         n_features = X.shape[1]
99 |         num_features_per_node = self.num_features_per_node
100 | 
101 |         if num_features_per_node is None:
102 |             num_features_per_node = int(math.ceil(math.sqrt(n_features)))
103 | 
104 |         elif num_features_per_node > n_features:
105 |             num_features_per_node = n_features
106 | 
107 |         self.classes = list(np.unique(Y))
108 | 
109 |         self.root = _ObliqueTreeNode(
110 |             split_classifier = self.split_classifier,
111 |             leaf_model = self.leaf_model,
112 |             num_features_per_node = num_features_per_node,
113 |             classes = self.classes,
114 |             depth = 1,
115 |             max_depth = self.max_depth,
116 |             min_leaf_size = self.min_leaf_size,
117 |             randomize_split_params = self.randomize_split_params,
118 |             randomize_leaf_params = self.randomize_leaf_params,
119 |             verbose = self.verbose
120 |         )
121 |         self.root.fit(X, Y,
**fit_keywords) 122 | 123 | 124 | def predict(self, X): 125 | return self.root.predict(X) 126 | 127 | -------------------------------------------------------------------------------- /treelearn/oblique_tree_node.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | from sklearn.base import BaseEstimator 4 | from tree_helpers import majority, clear_sklearn_fields 5 | from constant_leaf import ConstantLeaf 6 | 7 | class _ObliqueTreeNode(BaseEstimator): 8 | """ 9 | Do not use this directly, instead train an ObliqueTree""" 10 | def __init__(self, 11 | split_classifier, 12 | leaf_model, 13 | num_features_per_node, 14 | classes, 15 | depth, 16 | max_depth, 17 | min_leaf_size, 18 | randomize_split_params, 19 | randomize_leaf_params, 20 | verbose): 21 | 22 | self.split_classifier = split_classifier 23 | self.leaf_model = leaf_model 24 | self.num_features_per_node = num_features_per_node 25 | self.classes = classes 26 | self.depth = depth 27 | self.max_depth = max_depth 28 | self.min_leaf_size = min_leaf_size 29 | self.randomize_split_params = randomize_split_params 30 | self.randomize_leaf_params = randomize_leaf_params 31 | self.verbose = verbose 32 | 33 | self.children = {} 34 | self.model = None 35 | self.subspace = None 36 | 37 | def _fit_leaf(self, X, Y, fit_keywords): 38 | if self.verbose: 39 | print "Fitting leaf" 40 | model = deepcopy(self.leaf_model) 41 | for field, gen in self.randomize_leaf_params.items(): 42 | setattr(model, field, gen()) 43 | model.fit(X, Y, **fit_keywords) 44 | clear_sklearn_fields(model) 45 | return model 46 | 47 | def _fit_child(self, X_slice, Y_slice, fit_keywords): 48 | count = X_slice.shape[0] 49 | unique_ys = np.unique(Y_slice) 50 | if len(unique_ys) == 1: 51 | const = int(unique_ys[0]) 52 | if self.verbose: 53 | print "ConstantLeaf", const 54 | child = ConstantLeaf(const) 55 | elif count < self.min_leaf_size: 56 | child = self._fit_leaf(X_slice, Y_slice, fit_keywords) 57 | else: 58 | child = _ObliqueTreeNode( 59 | split_classifier = self.split_classifier, 60 | leaf_model = self.leaf_model, 61 | num_features_per_node = self.num_features_per_node, 62 | classes = self.classes, 63 | depth = self.depth +1, 64 | max_depth = self.max_depth, 65 | min_leaf_size = self.min_leaf_size, 66 | randomize_split_params = self.randomize_split_params, 67 | randomize_leaf_params = self.randomize_leaf_params, 68 | verbose = self.verbose 69 | ) 70 | child.fit(X_slice, Y_slice, **fit_keywords) 71 | return child 72 | 73 | 74 | 75 | def fit(self, X, Y, **fit_keywords): 76 | n_samples, n_features = X.shape 77 | 78 | if self.verbose: 79 | print "Depth", self.depth, ": Fitting model for", n_samples, "vectors" 80 | 81 | if self.depth >= self.max_depth or n_samples <= self.min_leaf_size: 82 | self.model = self._fit_leaf(X, Y, fit_keywords) 83 | else: 84 | 85 | # if we've been passed a limit to the number of features 86 | # then train the current model on a random subspace of that size 87 | if self.num_features_per_node: 88 | feature_indices = np.random.permutation(n_features) 89 | self.subspace = feature_indices[:self.num_features_per_node] 90 | X_reduced = X[:, self.subspace] 91 | else: 92 | X_reduced = X 93 | 94 | 95 | self.model = deepcopy(self.split_classifier) 96 | for field, gen in self.randomize_split_params.items(): 97 | setattr(self.model, field, gen()) 98 | self.model.fit(X_reduced, Y, **fit_keywords) 99 | clear_sklearn_fields(self.model) 100 | pred = self.model.predict(X_reduced) 101 | 102 | 
for c in self.classes: 103 | mask = (pred == c) 104 | count = np.sum(mask) 105 | if count == 0: 106 | self.children[c] = ConstantLeaf(int(c)) 107 | else: 108 | X_slice = X[mask, :] 109 | Y_slice = Y[mask] 110 | self.children[c] = self._fit_child(X_slice, Y_slice, fit_keywords) 111 | 112 | def predict(self, X): 113 | nrows = X.shape[0] 114 | if self.subspace is not None: 115 | X_reduced = X[:, self.subspace] 116 | pred = self.model.predict(X_reduced) 117 | else: 118 | pred = self.model.predict(X) 119 | 120 | if len(self.children) == 0: 121 | return pred 122 | else: 123 | # fill this array with sub-arrays received from the predictions of children 124 | outputs = pred.copy() 125 | for c in self.classes: 126 | mask = (pred == c) 127 | X_slice = X[mask, :] 128 | count = X_slice.shape[0] 129 | 130 | if count > 0: 131 | pred = self.children[c].predict(X_slice) 132 | outputs[mask] = pred 133 | return outputs 134 | -------------------------------------------------------------------------------- /treelearn/base_ensemble.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 
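#
# BaseEnsemble handles the shared machinery (bagging, optional feature
# subsetting, optional additive/residual fitting, and stacking).  Concrete
# subclasses such as ClassifierEnsemble and RegressionEnsemble implement the
# two hooks that fit() calls:
#
#     _init_fit(X, Y)                           # once, before any model is trained
#     _created_model(X, Y, indices, i, model)   # after each base model is fit
#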
16 | 17 | 18 | from copy import deepcopy 19 | import numpy as np 20 | import random 21 | import math 22 | 23 | from sklearn.base import BaseEstimator 24 | 25 | from tree_helpers import clear_sklearn_fields 26 | from typecheck import check_estimator, check_dict, check_int, check_bool 27 | 28 | class BaseEnsemble(BaseEstimator): 29 | def __init__(self, 30 | base_model, 31 | num_models, 32 | bagging_percent, 33 | bagging_replacement, 34 | feature_subset_percent, 35 | stacking_model, 36 | randomize_params, 37 | additive, 38 | verbose): 39 | check_estimator(base_model) 40 | check_int(num_models) 41 | 42 | self.base_model = base_model 43 | self.num_models = num_models 44 | self.bagging_percent = bagging_percent 45 | self.bagging_replacement = bagging_replacement 46 | self.feature_subset_percent = feature_subset_percent 47 | self.stacking_model = stacking_model 48 | self.randomize_params = randomize_params 49 | self.additive = additive 50 | self.verbose = verbose 51 | self.need_to_fit = True 52 | self.models = None 53 | self.weights = None 54 | 55 | 56 | def fit(self, X, Y, **fit_keywords): 57 | assert self.base_model is not None 58 | assert self.bagging_percent is not None 59 | assert self.bagging_replacement is not None 60 | assert self.num_models is not None 61 | assert self.verbose is not None 62 | 63 | self.need_to_fit = False 64 | self.models = [] 65 | 66 | X = np.atleast_2d(X) 67 | Y = np.atleast_1d(Y) 68 | 69 | n_rows, total_features = X.shape 70 | bagsize = int(math.ceil(self.bagging_percent * n_rows)) 71 | 72 | 73 | if self.additive: 74 | self.weights = np.ones(self.num_models, dtype='float') 75 | else: 76 | self.weights = np.ones(self.num_models, dtype='float') / self.num_models 77 | 78 | 79 | # each derived class needs to implement this 80 | self._init_fit(X,Y) 81 | if self.feature_subset_percent < 1: 82 | n_features = int(math.ceil(self.feature_subset_percent * total_features)) 83 | self.feature_subsets = [] 84 | else: 85 | n_features = total_features 86 | self.feature_subsets = None 87 | 88 | for i in xrange(self.num_models): 89 | if self.verbose: 90 | print "Training iteration", i 91 | 92 | if self.bagging_replacement: 93 | indices = np.random.random_integers(0,n_rows-1,bagsize) 94 | else: 95 | p = np.random.permutation(n_rows) 96 | indices = p[:bagsize] 97 | 98 | data_subset = X[indices, :] 99 | if n_features < total_features: 100 | feature_indices = np.random.permutation(total_features)[:n_features] 101 | self.feature_subsets.append(feature_indices) 102 | data_subset = data_subset[:, feature_indices] 103 | 104 | label_subset = Y[indices] 105 | model = deepcopy(self.base_model) 106 | # randomize parameters using given functions 107 | for param_name, fn in self.randomize_params.items(): 108 | setattr(model, param_name, fn()) 109 | model.fit(data_subset, label_subset, **fit_keywords) 110 | 111 | self.models.append(model) 112 | self._created_model(X, Y, indices, i, model) 113 | 114 | if self.additive: 115 | if n_features < total_features: 116 | Y -= model.predict(X[:, feature_indices]) 117 | else: 118 | Y -= model.predict(X) 119 | 120 | clear_sklearn_fields(model) 121 | # stacking works by treating the outputs of each base classifier as the 122 | # inputs to an additional meta-classifier 123 | if self.stacking_model: 124 | transformed_data = self.transform(X) 125 | self.stacking_model.fit(transformed_data, Y) 126 | 127 | 128 | def transform(self, X): 129 | """Convert each feature vector into a row of predictions.""" 130 | assert self.models is not None 131 | 132 | X = 
np.atleast_2d(X) 133 | n_samples, n_features = X.shape 134 | n_models = len(self.models) 135 | pred = np.zeros([n_samples, n_models]) 136 | if self.feature_subsets: 137 | for i, model in enumerate(self.models): 138 | feature_indices = self.feature_subsets[i] 139 | X_subset = X[:, feature_indices] 140 | pred[:, i] = model.predict(X_subset) 141 | else: 142 | for i, model in enumerate(self.models): 143 | pred[:, i] = model.predict(X) 144 | return pred 145 | 146 | -------------------------------------------------------------------------------- /treelearn/classifier_ensemble.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.svm import LinearSVC 4 | from sklearn.metrics import fbeta_score 5 | 6 | from base_ensemble import BaseEnsemble 7 | 8 | class ClassifierEnsemble(BaseEnsemble): 9 | """ 10 | Train an ensemble of classifiers using a 11 | subset of the data for each base classifier. 12 | 13 | Parameters 14 | ---------- 15 | base_model : Any classifier which obeys the fit/predict protocol. 16 | Defaults to a Linear SVM with C = 1. 17 | 18 | num_models : int, optional (default = 50) 19 | How many base classifiers to train. 20 | 21 | bagging_percent : float, optional (default=0.5). 22 | How much of the data set goes into each bootstrap sample. 23 | 24 | bagging_replacement : bool, optional (default = True). 25 | Sample with our without replacement. 26 | 27 | weighting : None or float, optional (default=None). 28 | Weight individual classifiers in the ensemble by 29 | None : all classifiers given equal weight 30 | : compute F_beta score for each classifier. 31 | Only works for binary classification. 32 | 33 | stacking : classifier, optional (default=None). 34 | Feed output of weighted individual predictions into another classifier. 35 | Suggested model: LogisticRegression. 36 | 37 | 38 | verbose : bool, optional (default = False). 39 | Print diagnostic output. 
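    Example
    -------
    A minimal sketch (x_train / y_train here stand in for any training arrays,
    e.g. the output of test_helpers.split_dataset):

        ensemble = ClassifierEnsemble(num_models=20, bagging_percent=0.7)
        ensemble.fit(x_train, y_train)
        pred = ensemble.predict(x_test)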
40 | """ 41 | 42 | def __init__(self, 43 | base_model = LinearSVC(), 44 | num_models = 50, 45 | bagging_percent=0.5, 46 | bagging_replacement=True, 47 | feature_subset_percent = 1.0, 48 | weighting = None, 49 | stacking_model = None, 50 | randomize_params = {}, 51 | verbose=False): 52 | 53 | BaseEnsemble.__init__( 54 | self, 55 | base_model, 56 | num_models, 57 | bagging_percent, 58 | bagging_replacement, 59 | feature_subset_percent, 60 | stacking_model, 61 | randomize_params, 62 | False, # for now additive only works for regression 63 | verbose) 64 | self.weighting = weighting 65 | self.classes = None 66 | self.class_list = None 67 | 68 | def _init_fit(self, X, Y): 69 | self.classes = np.unique(Y) 70 | self.class_list = list(self.classes) 71 | 72 | def _created_model(self, X, Y, indices, i, model): 73 | # to assign an F-score weight to each classifier, 74 | # sample another subset of the data and use the model 75 | # we just train to generate predictions 76 | beta = self.weighting 77 | n = X.shape[0] 78 | bagsize = len(indices) 79 | if beta or self.verbose: 80 | error_sample_indices = np.random.random_integers(0,n-1,bagsize) 81 | error_subset = X[error_sample_indices, :] 82 | if self.feature_subsets: 83 | error_subset = error_subset[:, self.feature_subsets[i]] 84 | error_labels = Y[error_sample_indices] 85 | y_pred = model.predict(error_subset) 86 | 87 | if self.weighting: 88 | f_score = fbeta_score(error_labels, y_pred, beta) 89 | self.weights[i] = f_score 90 | if self.verbose: 91 | print "Actual non-zero:", np.sum(error_labels != 0) 92 | num_pred_nz = np.sum(y_pred != 0) 93 | print "Predicted non-zero:", num_pred_nz 94 | pred_correct = (y_pred == error_labels) 95 | pred_nz = (y_pred != 0) 96 | num_true_nz = np.sum(pred_correct & pred_nz) 97 | print "True non-zero:", num_true_nz 98 | print "False non-zero:", num_pred_nz - num_true_nz 99 | print "---" 100 | # normalize weights to add up to 1 101 | 102 | 103 | def _predict_votes(self, X): 104 | X = np.atleast_2d(X) 105 | n_samples, n_features = X.shape 106 | n_classes = len(self.classes) 107 | votes = np.zeros( [n_samples, n_classes] ) 108 | 109 | for i, model in enumerate(self.models): 110 | weight = self.weights[i] 111 | if self.feature_subsets: 112 | feature_indices = self.feature_subsets[i] 113 | X_subset = X[:, feature_indices] 114 | ys = model.predict(X_subset) 115 | else: 116 | ys = model.predict(X) 117 | for c in self.classes: 118 | class_index = self.class_list.index(c) 119 | votes[ys == c, class_index] += weight 120 | return votes 121 | 122 | def _predict_normalized_votes(self, X): 123 | votes = self._predict_votes(X) 124 | sums = np.sum(votes, axis=1) 125 | return votes / np.array([sums], dtype='float').T 126 | 127 | def _weighted_transform(self, X): 128 | pred = self.transform(X) 129 | for i, w in enumerate(self.weights): 130 | pred[:, i] *= w 131 | return pred 132 | 133 | def _predict_stacked_probs(self, X): 134 | transformed = self.transform(X) 135 | return self.stacking_model.predict_proba(transformed) 136 | 137 | def predict_proba(self, X): 138 | if self.need_to_fit: 139 | raise RuntimeError("Trying to call 'predict_proba' before 'fit'") 140 | if self.stacking_model: 141 | return self._predict_stacked_probs(X) 142 | else: 143 | return self._predict_normalized_votes(X) 144 | 145 | def predict(self, X, return_probs=False): 146 | """Every classifier in the ensemble votes for a class. 
147 | If we're doing stacking, then pass the votes as features into 148 | the stacking classifier, otherwise return the majority vote.""" 149 | if self.need_to_fit: 150 | raise RuntimeError("Trying to call 'predict' before 'fit'") 151 | 152 | if self.stacking_model: 153 | majority_indices = np.argmax(self._predict_stacked_probs(X), axis=1) 154 | else: 155 | majority_indices = np.argmax(self._predict_votes(X), axis=1) 156 | return np.array([self.class_list[i] for i in majority_indices]) 157 | 158 | -------------------------------------------------------------------------------- /treelearn/randomized_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import scipy.weave 4 | import scipy.stats 5 | import random 6 | import math 7 | 8 | from sklearn.base import BaseEstimator 9 | from constant_leaf import ConstantLeaf 10 | from tree_node import TreeNode 11 | import random 12 | from tree_helpers import majority, midpoints 13 | from tree_helpers import find_best_gini_split, find_min_variance_split 14 | 15 | class RandomizedTree(BaseEstimator): 16 | """Decision tree which only inspects a random subset of the features 17 | at each split. Uses Gini impurity to compare possible data splits. 18 | 19 | Parameters 20 | ---------- 21 | num_features_per_node : int, optional (default = None). 22 | At each split, how many features should we consider splitting. 23 | If None, then use log(total number of features). 24 | 25 | min_leaf_size : int, optional (default=15). 26 | Stop splitting when the data gets this small. 27 | 28 | max_height : int, optional (default = 100). 29 | Stop growing tree at this height. 30 | 31 | max_thresholds : int, optional (default = None). 32 | At each split, generate at most this number of evenly spaced thresholds 33 | between the min and max feature values. The default behavior is 34 | to consider all midpoints between unique feature values. 35 | 36 | classes : sequence of int labels, optional (default = None). 37 | If None, then use the unique values of the classes given to 'fit'. 38 | 39 | feature_names : string list (default = None). 40 | Names to use for pretty printing. 41 | 42 | verbose : bool (default = False). 43 | Print debugging output. 
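    Example
    -------
    A minimal sketch, mirroring test_randomized_tree.py:

        t = RandomizedTree(min_leaf_size=1)
        t.fit(data, labels)
        pred = t.predict(data)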
44 | """ 45 | 46 | def __init__(self, 47 | num_features_per_node=None, 48 | min_leaf_size=10, 49 | max_height = 20, 50 | max_thresholds=None, 51 | regression = False, 52 | feature_names = None, 53 | verbose = False): 54 | self.root = None 55 | self.num_features_per_node = num_features_per_node 56 | self.min_leaf_size = min_leaf_size 57 | self.max_height = max_height 58 | self.classes = None 59 | self.feature_names = feature_names 60 | self.max_thresholds = max_thresholds 61 | if max_thresholds is None: 62 | self.get_thresholds = self.all_thresholds 63 | else: 64 | self.get_thresholds = self.random_threshold_subset 65 | self.regression = regression 66 | if regression: 67 | self.leaf_dtype = 'float' 68 | else: 69 | self.leaf_dtype = 'int' 70 | 71 | self.verbose = verbose 72 | 73 | 74 | def all_thresholds(self, x): 75 | """get midpoints between all unique values""" 76 | if len(x) > 1: 77 | return midpoints(np.unique(x)) 78 | else: 79 | return x 80 | 81 | def random_threshold_subset(self, x): 82 | total = len(x) 83 | k = self.max_thresholds 84 | nsamples = min(total, k) 85 | rand_subset = random.sample(x, nsamples) 86 | return self.all_thresholds(rand_subset) 87 | 88 | def _split(self, data, labels, m, height): 89 | n_samples = data.shape[0] 90 | if n_samples <= self.min_leaf_size or height > self.max_height: 91 | self.nleaves += 1 92 | if self.regression: 93 | return ConstantLeaf(np.mean(labels)) 94 | else: 95 | return ConstantLeaf(majority(labels, self.classes)) 96 | elif np.all(labels == labels[0]): 97 | self.nleaves += 1 98 | return ConstantLeaf(labels[0]) 99 | else: 100 | nfeatures = data.shape[1] 101 | # randomly draw m feature indices. 102 | # should be more efficient than explicitly constructing a permutation 103 | # vector and then keeping only the head elements 104 | random_feature_indices = random.sample(xrange(nfeatures), m) 105 | best_split_score = np.inf 106 | best_feature_idx = None 107 | best_thresh = None 108 | 109 | for feature_idx in random_feature_indices: 110 | feature_vec = data[:, feature_idx] 111 | thresholds = self.get_thresholds(feature_vec) 112 | 113 | 114 | if self.regression: 115 | thresh, combined_score = \ 116 | find_min_variance_split(feature_vec, thresholds, labels) 117 | else: 118 | thresh, combined_score = \ 119 | find_best_gini_split(self.classes, feature_vec, thresholds, labels) 120 | 121 | if combined_score < best_split_score: 122 | best_split_score = combined_score 123 | best_feature_idx = feature_idx 124 | best_thresh = thresh 125 | 126 | left_mask = data[:, best_feature_idx] < best_thresh 127 | right_mask = ~left_mask 128 | 129 | left_data = data[left_mask, :] 130 | right_data = data[right_mask, :] 131 | 132 | left_labels = labels[left_mask] 133 | right_labels = labels[right_mask] 134 | 135 | # get rid of references before recursion so data can be deleted 136 | del labels 137 | del data 138 | del random_feature_indices 139 | del left_mask 140 | del right_mask 141 | 142 | left_node = self._split(left_data, left_labels, m, height+1) 143 | right_node = self._split(right_data, right_labels, m, height+1) 144 | node = TreeNode(best_feature_idx, best_thresh, left_node, right_node) 145 | return node 146 | 147 | 148 | def fit(self, data, labels, feature_names = None): 149 | data = np.atleast_2d(data) 150 | labels = np.atleast_1d(labels) 151 | 152 | if not self.regression: 153 | self.classes = np.unique(labels) 154 | self.nclasses = len(self.classes) 155 | self.feature_names = feature_names 156 | self.nleaves = 0 157 | nrows = data.shape[0] 158 | nfeatures = 
data.shape[1] 159 | if self.num_features_per_node is None: 160 | m = int(round(math.log(nfeatures, 2))) 161 | else: 162 | m = self.num_features_per_node 163 | self.root = self._split(data, labels, m, 1) 164 | 165 | def predict(self, X): 166 | X = np.atleast_2d(X) 167 | n_samples = X.shape[0] 168 | # create an output array and let the tree nodes recursively fill it 169 | outputs = np.zeros(n_samples, dtype=self.leaf_dtype) 170 | mask = np.ones(n_samples, dtype='bool') 171 | self.root.fill_predict(X, outputs, mask) 172 | return outputs 173 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /treelearn/recipes.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 
11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 15 | # Lesser General Public License for more details. 16 | 17 | import numpy as np 18 | 19 | from sklearn.svm import LinearSVC 20 | from sklearn.linear_model import LogisticRegression, SGDClassifier 21 | from sklearn.linear_model import LinearRegression, Ridge 22 | 23 | from regression_ensemble import RegressionEnsemble 24 | from classifier_ensemble import ClassifierEnsemble 25 | from clustered_regression import ClusteredRegression 26 | from clustered_classifier import ClusteredClassifier 27 | from randomized_tree import RandomizedTree 28 | from oblique_tree import ObliqueTree 29 | 30 | 31 | def train_random_forest( 32 | X, 33 | Y, 34 | num_trees = 20, 35 | max_thresholds = 10, 36 | max_height = None, 37 | min_leaf_size = None, 38 | bagging_percent=0.65): 39 | """A random forest is a bagging ensemble of randomized trees, so it can 40 | be implemented by combining the ClassifierEnsemble (or RegressionEnsemble) and RandomizedTree objects. 41 | This function is just a helper to make your life easier. 42 | 43 | Parameters 44 | ---------- 45 | X : numpy array containing input data. 46 | Should have samples for rows and features for columns. 47 | 48 | Y : numpy array containing class labels for each sample 49 | 50 | num_trees : how big is the forest? 51 | 52 | max_thresholds : rather than evaluating all possible thresholds at each 53 | split, randomly sample this number of thresholds 54 | 55 | max_height : don't let tree grow past given height, inferred if omitted. 56 | 57 | min_leaf_size : don't split nodes smaller than this, inferred if omitted. 58 | 59 | bagging_percent : what subset of the data is each tree trained on? 60 | 61 | **tree_args : parameters for individual decision tree.
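Example
-------
Illustrative sketch only; ``x_train``, ``y_train`` and ``x_test`` are assumed to be
numpy arrays such as those returned by test_helpers.split_dataset:

>>> forest = train_random_forest(x_train, y_train, num_trees = 20)
>>> pred = forest.predict(x_test)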
62 | """ 63 | if isinstance(Y[0], float): 64 | regression = True 65 | else: 66 | regression = False 67 | 68 | if max_height is None: 69 | max_height = int(np.log2(X.shape[0])) + 1 70 | if min_leaf_size is None: 71 | min_leaf_size = int(np.log2(X.shape[0])) 72 | 73 | tree = RandomizedTree( 74 | regression = regression, 75 | max_thresholds = max_thresholds, 76 | max_height = max_height, 77 | min_leaf_size = min_leaf_size, 78 | ) 79 | 80 | if regression: 81 | forest = RegressionEnsemble( 82 | base_model = tree, 83 | num_models= num_trees, 84 | bagging_percent = bagging_percent 85 | ) 86 | else: 87 | forest = ClassifierEnsemble( 88 | base_model = tree, 89 | num_models = num_trees, 90 | bagging_percent = bagging_percent 91 | ) 92 | forest.fit(X,Y) 93 | return forest 94 | 95 | def gen_random_C(): 96 | return 10 ** (np.random.randn()) 97 | 98 | def mk_svm_tree(max_depth = 3, randomize_C = False, model_args = {}, tree_args = {}): 99 | randomize_split_params = {} 100 | randomize_leaf_params = {} 101 | if randomize_C: 102 | randomize_split_params['C'] = gen_random_C 103 | randomize_leaf_params['C'] = gen_random_C 104 | 105 | split_classifier = LinearSVC(**model_args) 106 | leaf_classifier = LinearSVC(**model_args) 107 | 108 | tree = ObliqueTree( 109 | max_depth = max_depth, 110 | split_classifier=split_classifier, 111 | leaf_model=leaf_classifier, 112 | randomize_split_params = randomize_split_params, 113 | randomize_leaf_params = randomize_leaf_params, 114 | **tree_args) 115 | return tree 116 | 117 | def train_svm_tree(X, Y, max_depth = 3, randomize_C = False, model_args = {}, tree_args={}): 118 | tree = mk_svm_tree(max_depth, randomize_C, model_args, tree_args) 119 | tree.fit(X, Y) 120 | return tree 121 | 122 | def train_svm_forest(X, Y, num_trees = 10, max_depth = 3, bagging_percent=0.65, randomize_C = False, model_args ={}, tree_args={}): 123 | """A random forest whose base classifier is an SVM-Tree (rather 124 | than splitting on individual features, we project each point onto a hyperplane) 125 | 126 | Parameters 127 | ---------- 128 | X : numpy array containing input data. 129 | Should have samples for rows and features for columns. 130 | 131 | Y : numpy array containing class labels for each sample 132 | 133 | num_trees : how big is the forest? 134 | 135 | bagging_percent : what subset of the data is each tree trained on?
136 | 137 | randomize_C : bool 138 | 139 | model_args : parameters for each SVM classifier 140 | 141 | tree_args : parameters for each tree of classifiers 142 | """ 143 | tree = mk_svm_tree(max_depth, randomize_C, model_args, tree_args) 144 | forest = ClassifierEnsemble( 145 | base_model = tree, 146 | num_models = num_trees, 147 | bagging_percent = bagging_percent) 148 | forest.fit(X,Y) 149 | return forest 150 | 151 | def gen_random_alpha(): 152 | return 10**(-np.random.random()*7) 153 | 154 | def mk_sgd_tree(n_examples=200000, max_depth=3, randomize_alpha=False, model_args={}, tree_args={}): 155 | randomize_split_params = {} 156 | randomize_leaf_params = {} 157 | if randomize_alpha: 158 | randomize_split_params['alpha'] = gen_random_alpha 159 | randomize_leaf_params['alpha'] = gen_random_alpha 160 | 161 | n_iter = np.ceil(10**6 / n_examples) 162 | split_classifier = SGDClassifier(n_iter = n_iter, shuffle=True, **model_args) 163 | leaf_classifier = SGDClassifier(n_iter = n_iter, shuffle=True, **model_args) 164 | 165 | tree = ObliqueTree( 166 | max_depth = max_depth, 167 | split_classifier=split_classifier, 168 | leaf_model=leaf_classifier, 169 | randomize_split_params = randomize_split_params, 170 | randomize_leaf_params = randomize_leaf_params, 171 | **tree_args 172 | ) 173 | return tree 174 | 175 | def train_sgd_tree(X, Y, max_depth=3, randomize_alpha=False, model_args = {}, tree_args={}): 176 | tree = mk_sgd_tree(X.shape[0], max_depth, randomize_alpha, model_args, tree_args) 177 | tree.fit(X, Y) 178 | return tree 179 | 180 | def train_sgd_forest(X, Y, 181 | num_trees = 20, 182 | max_depth = 3, 183 | bagging_percent=0.65, 184 | randomize_alpha=False, 185 | model_args = {}, 186 | tree_args= {}): 187 | """A random forest whose base classifier is a tree of SGD classifiers 188 | 189 | Parameters 190 | ---------- 191 | X : numpy array containing input data. 192 | Should have samples for rows and features for columns. 193 | 194 | Y : numpy array containing class labels for each sample 195 | 196 | num_trees : how big is the forest? 197 | 198 | bagging_percent : what subset of the data is each tree trained on? 
199 | 200 | randomize_alpha : bool 201 | 202 | model_args : parameters for each SGD classifier 203 | 204 | tree_args : parameters for each tree 205 | """ 206 | bagsize = bagging_percent * X.shape[0] 207 | tree = mk_sgd_tree(bagsize, max_depth, randomize_alpha, model_args, tree_args) 208 | forest = ClassifierEnsemble( 209 | base_model = tree, 210 | num_models = num_trees, 211 | bagging_percent = bagging_percent) 212 | forest.fit(X,Y) 213 | return forest 214 | 215 | def train_clustered_ols(X, Y, k = 20): 216 | """Cluster data and then train a linear regressor per cluster""" 217 | cr = ClusteredRegression(k) 218 | cr.fit(X, Y) 219 | return cr 220 | 221 | def train_clustered_svm(X, Y, k = 20, C = 1, verbose = True): 222 | base_model = LinearSVC(C = C) 223 | cc = ClusteredClassifier(k = k, base_model = base_model, verbose = verbose) 224 | cc.fit(X, Y) 225 | return cc 226 | 227 | def mk_clustered_svm_ensemble( 228 | num_models = 20, 229 | C = 1, 230 | k = 20, 231 | stacking= False, 232 | bagging_percent = 0.65, 233 | feature_subset_percent=0.5, 234 | verbose = True): 235 | 236 | base_model = LinearSVC(C = C) 237 | clustered_model = ClusteredClassifier(k, base_model = base_model, verbose=verbose) 238 | 239 | if stacking: 240 | stacking_model = LogisticRegression(fit_intercept=False) 241 | else: 242 | stacking_model = None 243 | 244 | return ClassifierEnsemble( 245 | base_model = clustered_model, 246 | num_models = num_models, 247 | bagging_percent = bagging_percent, 248 | feature_subset_percent = feature_subset_percent, 249 | stacking_model = stacking_model) 250 | 251 | def train_clustered_svm_ensemble( 252 | X, 253 | Y, 254 | num_models = 10, 255 | C = 1, 256 | k = 20, 257 | stacking= False, 258 | bagging_percent = 0.65, 259 | feature_subset_percent=0.5, 260 | verbose = True): 261 | ensemble = mk_clustered_svm_ensemble( 262 | num_models, 263 | C, 264 | k, 265 | stacking, 266 | bagging_percent, 267 | feature_subset_percent, 268 | verbose) 269 | ensemble.fit(X, Y) 270 | return ensemble 271 | 272 | def mk_clustered_regression_ensemble( 273 | num_models = 20, 274 | k = 20, 275 | stacking= False, 276 | additive=False, 277 | bagging_percent = 0.65, 278 | feature_subset_percent=0.5): 279 | 280 | 281 | clustered_model = ClusteredRegression(k=k, base_model = LinearRegression()) 282 | 283 | if stacking: 284 | stacking_model = LinearRegression(fit_intercept=False) 285 | else: 286 | stacking_model = None 287 | 288 | return RegressionEnsemble( 289 | base_model = clustered_model, 290 | num_models = num_models, 291 | bagging_percent = bagging_percent, 292 | feature_subset_percent = feature_subset_percent, 293 | stacking_model = stacking_model, 294 | additive = additive 295 | ) 296 | 297 | def train_clustered_regression_ensemble( 298 | X, 299 | Y, 300 | num_models=10, 301 | k = 20, 302 | stacking=False, 303 | additive=False, 304 | bagging_percent = 0.65, 305 | feature_subset_percent=0.5): 306 | ensemble = mk_clustered_regression_ensemble ( 307 | num_models = num_models, 308 | k = k, 309 | stacking = stacking, 310 | additive = additive, 311 | bagging_percent = bagging_percent, 312 | feature_subset_percent = feature_subset_percent 313 | ) 314 | ensemble.fit(X, Y) 315 | return ensemble 316 | 317 | def mk_additive_regression_forest( 318 | num_trees=50, 319 | bagging_percent = 0.65, 320 | feature_subset_percent = 0.5, 321 | max_height=3, 322 | min_leaf_size=10, 323 | max_thresholds=100): 324 | tree = RandomizedTree( 325 | max_height= max_height, 326 | min_leaf_size=min_leaf_size, 327 | max_thresholds=max_thresholds, 
328 | regression=True) 329 | forest = RegressionEnsemble( 330 | base_model = tree, 331 | num_models=num_trees, 332 | bagging_percent = bagging_percent, 333 | feature_subset_percent = feature_subset_percent, 334 | additive=True) 335 | return forest 336 | 337 | def train_additive_regression_forest(X, Y, 338 | num_trees=50, 339 | bagging_percent = 0.65, 340 | feature_subset_percent = 0.5, 341 | max_height=3, 342 | min_leaf_size=10, 343 | max_thresholds=50): 344 | forest = mk_additive_regression_forest( 345 | num_trees, 346 | bagging_percent, 347 | feature_subset_percent, 348 | max_height, 349 | min_leaf_size, 350 | max_thresholds) 351 | forest.fit(X,Y) 352 | return forest 353 | 354 | -------------------------------------------------------------------------------- /treelearn/tree_helpers.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 16 | 17 | 18 | import numpy as np 19 | import scipy 20 | import scipy.weave 21 | 22 | 23 | # some sklearn classifiers leave behind large data members after fitting 24 | # which make serialization a pain--- clear those fields 25 | def clear_sklearn_fields(clf): 26 | # stupid reuse of the same field--- clearing this on classifiers 27 | # causes them to crash 28 | #if hasattr(clf, 'label_'): 29 | # clf.label_ = None 30 | if hasattr(clf, 'sample_weight'): 31 | clf.sample_weight = None 32 | 33 | def midpoints(x): 34 | return (x[1:] + x[:-1])/2.0 35 | 36 | def majority(labels, classes=None): 37 | if classes is None: 38 | classes = np.unique(labels) 39 | votes = np.zeros(len(classes)) 40 | for i, c in enumerate(classes): 41 | votes[i] = np.sum(labels == c) 42 | majority_idx = np.argmax(votes) 43 | return classes[majority_idx] 44 | 45 | 46 | def slow_gini(classes, labels): 47 | sum_squares = 0.0 48 | n = len(labels) 49 | if n == 0: 50 | return 0.0 51 | else: 52 | n_squared = float(n * n) 53 | for c in classes: 54 | count = np.sum(labels == c) 55 | p_squared = count*count / n_squared 56 | sum_squares += p_squared 57 | return 1 - sum_squares 58 | 59 | inline = scipy.weave.inline 60 | def gini(classes, labels): 61 | code = """ 62 | int num_classes = Nclasses[0]; 63 | int n = Nlabels[0]; 64 | float sum_squares = 0.0f; 65 | for (int class_index = 0; class_index < num_classes; ++class_index) { 66 | int c = classes[class_index]; 67 | int count = 0; 68 | for (int i = 0; i < n; ++i) { 69 | if (labels[i] == c) { ++count; } 70 | } 71 | float p = ((float) count) / n; 72 | sum_squares += p * p; 73 | } 74 | return_val = 1.0f - sum_squares; 75 | """ 76 | return inline(code, ['classes', 'labels'], local_dict=None, verbose=2) 77 | 78 | 79 | def slow_eval_split(classes, feature_vec, thresh, labels): 80 | mask = feature_vec < thresh 81 | left_labels = labels[mask] 82 | right_labels = labels[~mask] 83 | left_score = slow_gini(classes, left_labels) 84 | right_score = 
slow_gini(classes, right_labels) 85 | n_left = len(left_labels) 86 | n_right = len(right_labels) 87 | n = float(n_left+n_right) 88 | 89 | combined_score = (n_left/n)*left_score + (n_right/n)*right_score 90 | return combined_score 91 | 92 | 93 | dtype2ctype = { 94 | np.dtype(np.float64): 'double', 95 | np.dtype(np.float32): 'float', 96 | np.dtype(np.int32): 'int', 97 | np.dtype(np.int16): 'short', 98 | np.dtype(np.bool): 'bool', 99 | } 100 | 101 | def eval_gini_split(classes, feature_vec, thresh, labels): 102 | left_mask = feature_vec < thresh 103 | code = """ 104 | 105 | int num_classes = Nclasses[0]; 106 | int nlabels = Nlabels[0]; 107 | 108 | float left_sum_squares = 0.0f; 109 | float right_sum_squares = 0.0f; 110 | 111 | 112 | /* total number of elements in the left and right of the split */ 113 | int total_left = 0; 114 | int total_right = 0; 115 | 116 | /* first pass for C = 0 to get total counts along with class-specific 117 | scores 118 | */ 119 | int left_class_count = 0; 120 | int right_class_count = 0; 121 | 122 | for (int i = 0; i < nlabels; ++i) { 123 | if (left_mask[i]) { 124 | total_left += 1; 125 | if (labels[i] == 0) left_class_count += 1; 126 | } else { 127 | total_right += 1; 128 | if (labels[i] == 0) right_class_count += 1; 129 | } 130 | } 131 | if (total_left > 0) { 132 | float left_p = ((float) left_class_count) / total_left; 133 | left_sum_squares += left_p * left_p; 134 | } 135 | if (total_right > 0) { 136 | float right_p = ((float) right_class_count) / total_right; 137 | right_sum_squares += right_p* right_p; 138 | } 139 | 140 | /* how many elements of each side have we counted in the score so far? */ 141 | int cumulative_left_count = left_class_count; 142 | int cumulative_right_count = right_class_count; 143 | 144 | /* if we have a multi-class problem iterate over rest of classes, 145 | except for the last class, whose size can be inferred from the 146 | difference between left_count and total_left 147 | */ 148 | for (int class_index = 1; class_index < num_classes - 1; ++class_index) { 149 | int c = classes[class_index]; 150 | left_class_count = 0; 151 | right_class_count = 0; 152 | 153 | for (int i = 0; i < nlabels; ++i) { 154 | if (labels[i] == c) { 155 | if (left_mask[i]) left_class_count += 1; 156 | else right_class_count += 1; 157 | } 158 | } 159 | cumulative_left_count += left_class_count; 160 | cumulative_right_count += right_class_count; 161 | 162 | if (total_left > 0) { 163 | float left_p = ((float) left_class_count) / total_left; 164 | left_sum_squares += left_p * left_p; 165 | } 166 | if (total_right > 0) { 167 | float right_p = ((float) right_class_count) / total_right; 168 | right_sum_squares += right_p* right_p; 169 | } 170 | } 171 | 172 | /* handle last class */ 173 | left_class_count = total_left - cumulative_left_count; 174 | right_class_count = total_right - cumulative_right_count; 175 | if (total_left > 0) { 176 | float left_p = ((float) left_class_count) / total_left; 177 | left_sum_squares += left_p * left_p; 178 | } 179 | if (total_right > 0) { 180 | float right_p = ((float) right_class_count) / total_right; 181 | right_sum_squares += right_p* right_p; 182 | } 183 | float left_gini = 1.0f - left_sum_squares; 184 | float right_gini = 1.0f - right_sum_squares; 185 | float total = (float) nlabels; 186 | float left_weight = total_left / total; 187 | float right_weight = total_right / total; 188 | 189 | return_val = left_weight * left_gini + right_weight * right_gini; 190 | """ 191 | return inline(code, ['classes', 'left_mask', 'labels'], \ 192 | 
local_dict=None, verbose=2) 193 | 194 | 195 | def slow_find_best_gini_split(classes, feature_vec, thresholds, labels): 196 | best_t = None 197 | best_score = np.inf 198 | 199 | n = len(labels) 200 | for t in thresholds: 201 | mask = feature_vec < t 202 | left_labels = labels[mask] 203 | right_labels = labels[~mask] 204 | left_score = slow_gini(classes, left_labels) 205 | right_score = slow_gini(classes, right_labels) 206 | n_left = len(left_labels) 207 | n_right = len(right_labels) 208 | nf = float(n) 209 | combined_score = (n_left/nf)*left_score + (n_right/nf)*right_score 210 | if combined_score < best_score: 211 | best_t = t 212 | best_score = combined_score 213 | return best_t, best_score 214 | 215 | def slow_find_min_variance_split(feature_vec, thresholds, ys): 216 | best_score = np.inf 217 | best_t = None 218 | for t in thresholds: 219 | mask = feature_vec < t 220 | left = ys[mask] 221 | right = ys[~mask] 222 | left_size = left.shape[0] 223 | right_size = right.shape[0] 224 | 225 | if left_size > 0 and right_size > 0: 226 | total = float(left_size + right_size) 227 | score = (left_size / total) * np.var(left) + (right_size / total) * np.var(right) 228 | if score < best_score: 229 | best_score = score 230 | best_t = t 231 | return best_t, best_score 232 | 233 | def find_min_variance_split(feature_vec, thresholds, ys): 234 | code = """ 235 | float best_score = 100000000000.0; 236 | double best_thresh = 0.0; 237 | int n_thresholds = Nthresholds[0]; 238 | int n_rows = Nys[0]; 239 | 240 | for (int t_index = 0; t_index < n_thresholds; t_index++) { 241 | double thresh = thresholds[t_index]; 242 | int counts[2] = {0,0}; 243 | float means[2] = {0.0f, 0.0f}; 244 | float sum_squares[2] = {0.0f, 0.0f}; 245 | float x; 246 | float delta; 247 | bool flag; 248 | for (int i = 0; i < n_rows; ++i) { 249 | x = ys[i]; 250 | flag = feature_vec[i] < thresh; 251 | counts[flag] += 1; 252 | delta = x - means[flag]; 253 | means[flag] += delta / counts[flag]; 254 | sum_squares[flag] += delta * (x - means[flag]); 255 | } 256 | if (counts[0] > 1 && counts[1] > 1) { 257 | float score = (sum_squares[0] + sum_squares[1]) / (counts[0] + counts[1]); 258 | if (score < best_score) { 259 | best_score = score; 260 | best_thresh = thresh; 261 | } 262 | } 263 | } 264 | py::tuple results(2); 265 | results[0] = best_thresh; 266 | results[1] = best_score; 267 | return_val = results; 268 | """ 269 | return inline(code, ['feature_vec', 'thresholds', 'ys'], local_dict=None, verbose=2) 270 | 271 | def find_best_gini_split(classes, feature_vec, thresholds, labels): 272 | code = """ 273 | int n_labels = Nlabels[0]; 274 | int n_classes = Nclasses[0]; 275 | int n_thresholds = Nthresholds[0]; 276 | 277 | float best_score = 10000000.0; 278 | double best_thresh = 0.0; 279 | 280 | /* loop over each possible threshold, compute GINI impurity for each, 281 | return the threshold with lowest score 282 | */ 283 | for (int t_index = 0; t_index < n_thresholds; t_index++) { 284 | double thresh = thresholds[t_index]; 285 | 286 | float left_sum_squares = 0.0f; 287 | float right_sum_squares = 0.0f; 288 | 289 | /* total number of elements in the left and right of the split */ 290 | int totals[2] = {0, 0}; 291 | int class_counts[2] = {0,0}; 292 | 293 | 294 | /* first pass for C = 0 to get total counts along with class-specific 295 | scores 296 | */ 297 | 298 | bool choice; 299 | bool correct_class; 300 | for (int i = 0; i < n_labels; ++i) { 301 | choice = feature_vec[i] < thresh; 302 | totals[choice] += 1; 303 | correct_class = (labels[i] == 0); 304 | 
class_counts[choice] += correct_class; 305 | } 306 | 307 | int total_left = totals[0]; 308 | int total_right = totals[1]; 309 | if (total_left > 0) { 310 | float left_p = ((float) class_counts[0]) / total_left; 311 | left_sum_squares += left_p * left_p; 312 | } 313 | if (total_right > 0) { 314 | float right_p = ((float) class_counts[1]) / total_right; 315 | right_sum_squares += right_p* right_p; 316 | } 317 | 318 | /* how many elements of each side have we counted in the score so far? */ 319 | int cumulative_left_count = class_counts[0]; 320 | int cumulative_right_count = class_counts[1]; 321 | 322 | /* if we have a multi-class problem iterate over rest of classes, 323 | except for the last class, whose size can be inferred from the 324 | difference between left_count and total_left 325 | */ 326 | for (int class_index = 1; class_index < n_classes - 1; ++class_index) { 327 | int c = classes[class_index]; 328 | class_counts[0] = 0; 329 | class_counts[1] = 0; 330 | 331 | for (int i = 0; i < n_labels; ++i) { 332 | choice = (feature_vec[i] <= thresh); 333 | correct_class = (labels[i] == c); 334 | class_counts[choice] += correct_class; 335 | } 336 | cumulative_left_count += class_counts[0]; 337 | cumulative_right_count += class_counts[1]; 338 | 339 | if (total_left > 0) { 340 | float left_p = ((float) class_counts[0]) / total_left; 341 | left_sum_squares += left_p * left_p; 342 | } 343 | if (total_right > 0) { 344 | float right_p = ((float) class_counts[1]) / total_right; 345 | right_sum_squares += right_p* right_p; 346 | } 347 | } 348 | 349 | /* handle last class */ 350 | float left_count = total_left - cumulative_left_count; 351 | float right_count = total_right - cumulative_right_count; 352 | if (total_left > 0) { 353 | float left_p = left_count / total_left; 354 | left_sum_squares += left_p * left_p; 355 | } 356 | if (total_right > 0) { 357 | float right_p = right_count / total_right; 358 | right_sum_squares += right_p* right_p; 359 | } 360 | float left_gini = 1.0f - left_sum_squares; 361 | float right_gini = 1.0f - right_sum_squares; 362 | float total = (float) n_labels; 363 | float left_weight = total_left / total; 364 | float right_weight = total_right / total; 365 | float score = left_weight * left_gini + right_weight * right_gini; 366 | if (score < best_score) { 367 | best_score = score; 368 | best_thresh = thresh; 369 | } 370 | } 371 | 372 | py::tuple results(2); 373 | results[0] = best_thresh; 374 | results[1] = best_score; 375 | return_val = results; 376 | """ 377 | return inline(code, ['classes', 'feature_vec', 'thresholds', 'labels'], verbose=2) 378 | -------------------------------------------------------------------------------- /distribute_setup.py: -------------------------------------------------------------------------------- 1 | #!python 2 | """Bootstrap distribute installation 3 | 4 | If you want to use setuptools in your package's setup.py, just include this 5 | file in the same directory with it, and add this to the top of your setup.py:: 6 | 7 | from distribute_setup import use_setuptools 8 | use_setuptools() 9 | 10 | If you want to require a specific version of setuptools, set a download 11 | mirror, or use an alternate download directory, you can do so by supplying 12 | the appropriate options to ``use_setuptools()``. 13 | 14 | This file can also be run as a script to install or upgrade setuptools. 
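For example (illustrative; ``version``, ``download_base``, ``to_dir`` and
``download_delay`` are keyword arguments accepted by ``use_setuptools``)::

    from distribute_setup import use_setuptools
    use_setuptools(version="0.6.19", download_delay=15)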
15 | """ 16 | import os 17 | import sys 18 | import time 19 | import fnmatch 20 | import tempfile 21 | import tarfile 22 | from distutils import log 23 | 24 | try: 25 | from site import USER_SITE 26 | except ImportError: 27 | USER_SITE = None 28 | 29 | try: 30 | import subprocess 31 | 32 | def _python_cmd(*args): 33 | args = (sys.executable,) + args 34 | return subprocess.call(args) == 0 35 | 36 | except ImportError: 37 | # will be used for python 2.3 38 | def _python_cmd(*args): 39 | args = (sys.executable,) + args 40 | # quoting arguments if windows 41 | if sys.platform == 'win32': 42 | def quote(arg): 43 | if ' ' in arg: 44 | return '"%s"' % arg 45 | return arg 46 | args = [quote(arg) for arg in args] 47 | return os.spawnl(os.P_WAIT, sys.executable, *args) == 0 48 | 49 | DEFAULT_VERSION = "0.6.19" 50 | DEFAULT_URL = "http://pypi.python.org/packages/source/d/distribute/" 51 | SETUPTOOLS_FAKED_VERSION = "0.6c11" 52 | 53 | SETUPTOOLS_PKG_INFO = """\ 54 | Metadata-Version: 1.0 55 | Name: setuptools 56 | Version: %s 57 | Summary: xxxx 58 | Home-page: xxx 59 | Author: xxx 60 | Author-email: xxx 61 | License: xxx 62 | Description: xxx 63 | """ % SETUPTOOLS_FAKED_VERSION 64 | 65 | 66 | def _install(tarball): 67 | # extracting the tarball 68 | tmpdir = tempfile.mkdtemp() 69 | log.warn('Extracting in %s', tmpdir) 70 | old_wd = os.getcwd() 71 | try: 72 | os.chdir(tmpdir) 73 | tar = tarfile.open(tarball) 74 | _extractall(tar) 75 | tar.close() 76 | 77 | # going in the directory 78 | subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) 79 | os.chdir(subdir) 80 | log.warn('Now working in %s', subdir) 81 | 82 | # installing 83 | log.warn('Installing Distribute') 84 | if not _python_cmd('setup.py', 'install'): 85 | log.warn('Something went wrong during the installation.') 86 | log.warn('See the error message above.') 87 | finally: 88 | os.chdir(old_wd) 89 | 90 | 91 | def _build_egg(egg, tarball, to_dir): 92 | # extracting the tarball 93 | tmpdir = tempfile.mkdtemp() 94 | log.warn('Extracting in %s', tmpdir) 95 | old_wd = os.getcwd() 96 | try: 97 | os.chdir(tmpdir) 98 | tar = tarfile.open(tarball) 99 | _extractall(tar) 100 | tar.close() 101 | 102 | # going in the directory 103 | subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) 104 | os.chdir(subdir) 105 | log.warn('Now working in %s', subdir) 106 | 107 | # building an egg 108 | log.warn('Building a Distribute egg in %s', to_dir) 109 | _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir) 110 | 111 | finally: 112 | os.chdir(old_wd) 113 | # returning the result 114 | log.warn(egg) 115 | if not os.path.exists(egg): 116 | raise IOError('Could not build the egg.') 117 | 118 | 119 | def _do_download(version, download_base, to_dir, download_delay): 120 | egg = os.path.join(to_dir, 'distribute-%s-py%d.%d.egg' 121 | % (version, sys.version_info[0], sys.version_info[1])) 122 | if not os.path.exists(egg): 123 | tarball = download_setuptools(version, download_base, 124 | to_dir, download_delay) 125 | _build_egg(egg, tarball, to_dir) 126 | sys.path.insert(0, egg) 127 | import setuptools 128 | setuptools.bootstrap_install_from = egg 129 | 130 | 131 | def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, 132 | to_dir=os.curdir, download_delay=15, no_fake=True): 133 | # making sure we use the absolute path 134 | to_dir = os.path.abspath(to_dir) 135 | was_imported = 'pkg_resources' in sys.modules or \ 136 | 'setuptools' in sys.modules 137 | try: 138 | try: 139 | import pkg_resources 140 | if not hasattr(pkg_resources, '_distribute'): 141 | 
if not no_fake: 142 | _fake_setuptools() 143 | raise ImportError 144 | except ImportError: 145 | return _do_download(version, download_base, to_dir, download_delay) 146 | try: 147 | pkg_resources.require("distribute>="+version) 148 | return 149 | except pkg_resources.VersionConflict: 150 | e = sys.exc_info()[1] 151 | if was_imported: 152 | sys.stderr.write( 153 | "The required version of distribute (>=%s) is not available,\n" 154 | "and can't be installed while this script is running. Please\n" 155 | "install a more recent version first, using\n" 156 | "'easy_install -U distribute'." 157 | "\n\n(Currently using %r)\n" % (version, e.args[0])) 158 | sys.exit(2) 159 | else: 160 | del pkg_resources, sys.modules['pkg_resources'] # reload ok 161 | return _do_download(version, download_base, to_dir, 162 | download_delay) 163 | except pkg_resources.DistributionNotFound: 164 | return _do_download(version, download_base, to_dir, 165 | download_delay) 166 | finally: 167 | if not no_fake: 168 | _create_fake_setuptools_pkg_info(to_dir) 169 | 170 | def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, 171 | to_dir=os.curdir, delay=15): 172 | """Download distribute from a specified location and return its filename 173 | 174 | `version` should be a valid distribute version number that is available 175 | as an egg for download under the `download_base` URL (which should end 176 | with a '/'). `to_dir` is the directory where the egg will be downloaded. 177 | `delay` is the number of seconds to pause before an actual download 178 | attempt. 179 | """ 180 | # making sure we use the absolute path 181 | to_dir = os.path.abspath(to_dir) 182 | try: 183 | from urllib.request import urlopen 184 | except ImportError: 185 | from urllib2 import urlopen 186 | tgz_name = "distribute-%s.tar.gz" % version 187 | url = download_base + tgz_name 188 | saveto = os.path.join(to_dir, tgz_name) 189 | src = dst = None 190 | if not os.path.exists(saveto): # Avoid repeated downloads 191 | try: 192 | log.warn("Downloading %s", url) 193 | src = urlopen(url) 194 | # Read/write all in one block, so we don't create a corrupt file 195 | # if the download is interrupted. 
196 | data = src.read() 197 | dst = open(saveto, "wb") 198 | dst.write(data) 199 | finally: 200 | if src: 201 | src.close() 202 | if dst: 203 | dst.close() 204 | return os.path.realpath(saveto) 205 | 206 | def _no_sandbox(function): 207 | def __no_sandbox(*args, **kw): 208 | try: 209 | from setuptools.sandbox import DirectorySandbox 210 | if not hasattr(DirectorySandbox, '_old'): 211 | def violation(*args): 212 | pass 213 | DirectorySandbox._old = DirectorySandbox._violation 214 | DirectorySandbox._violation = violation 215 | patched = True 216 | else: 217 | patched = False 218 | except ImportError: 219 | patched = False 220 | 221 | try: 222 | return function(*args, **kw) 223 | finally: 224 | if patched: 225 | DirectorySandbox._violation = DirectorySandbox._old 226 | del DirectorySandbox._old 227 | 228 | return __no_sandbox 229 | 230 | def _patch_file(path, content): 231 | """Will backup the file then patch it""" 232 | existing_content = open(path).read() 233 | if existing_content == content: 234 | # already patched 235 | log.warn('Already patched.') 236 | return False 237 | log.warn('Patching...') 238 | _rename_path(path) 239 | f = open(path, 'w') 240 | try: 241 | f.write(content) 242 | finally: 243 | f.close() 244 | return True 245 | 246 | _patch_file = _no_sandbox(_patch_file) 247 | 248 | def _same_content(path, content): 249 | return open(path).read() == content 250 | 251 | def _rename_path(path): 252 | new_name = path + '.OLD.%s' % time.time() 253 | log.warn('Renaming %s into %s', path, new_name) 254 | os.rename(path, new_name) 255 | return new_name 256 | 257 | def _remove_flat_installation(placeholder): 258 | if not os.path.isdir(placeholder): 259 | log.warn('Unkown installation at %s', placeholder) 260 | return False 261 | found = False 262 | for file in os.listdir(placeholder): 263 | if fnmatch.fnmatch(file, 'setuptools*.egg-info'): 264 | found = True 265 | break 266 | if not found: 267 | log.warn('Could not locate setuptools*.egg-info') 268 | return 269 | 270 | log.warn('Removing elements out of the way...') 271 | pkg_info = os.path.join(placeholder, file) 272 | if os.path.isdir(pkg_info): 273 | patched = _patch_egg_dir(pkg_info) 274 | else: 275 | patched = _patch_file(pkg_info, SETUPTOOLS_PKG_INFO) 276 | 277 | if not patched: 278 | log.warn('%s already patched.', pkg_info) 279 | return False 280 | # now let's move the files out of the way 281 | for element in ('setuptools', 'pkg_resources.py', 'site.py'): 282 | element = os.path.join(placeholder, element) 283 | if os.path.exists(element): 284 | _rename_path(element) 285 | else: 286 | log.warn('Could not find the %s element of the ' 287 | 'Setuptools distribution', element) 288 | return True 289 | 290 | _remove_flat_installation = _no_sandbox(_remove_flat_installation) 291 | 292 | def _after_install(dist): 293 | log.warn('After install bootstrap.') 294 | placeholder = dist.get_command_obj('install').install_purelib 295 | _create_fake_setuptools_pkg_info(placeholder) 296 | 297 | def _create_fake_setuptools_pkg_info(placeholder): 298 | if not placeholder or not os.path.exists(placeholder): 299 | log.warn('Could not find the install location') 300 | return 301 | pyver = '%s.%s' % (sys.version_info[0], sys.version_info[1]) 302 | setuptools_file = 'setuptools-%s-py%s.egg-info' % \ 303 | (SETUPTOOLS_FAKED_VERSION, pyver) 304 | pkg_info = os.path.join(placeholder, setuptools_file) 305 | if os.path.exists(pkg_info): 306 | log.warn('%s already exists', pkg_info) 307 | return 308 | 309 | log.warn('Creating %s', pkg_info) 310 | f = 
open(pkg_info, 'w') 311 | try: 312 | f.write(SETUPTOOLS_PKG_INFO) 313 | finally: 314 | f.close() 315 | 316 | pth_file = os.path.join(placeholder, 'setuptools.pth') 317 | log.warn('Creating %s', pth_file) 318 | f = open(pth_file, 'w') 319 | try: 320 | f.write(os.path.join(os.curdir, setuptools_file)) 321 | finally: 322 | f.close() 323 | 324 | _create_fake_setuptools_pkg_info = _no_sandbox(_create_fake_setuptools_pkg_info) 325 | 326 | def _patch_egg_dir(path): 327 | # let's check if it's already patched 328 | pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO') 329 | if os.path.exists(pkg_info): 330 | if _same_content(pkg_info, SETUPTOOLS_PKG_INFO): 331 | log.warn('%s already patched.', pkg_info) 332 | return False 333 | _rename_path(path) 334 | os.mkdir(path) 335 | os.mkdir(os.path.join(path, 'EGG-INFO')) 336 | pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO') 337 | f = open(pkg_info, 'w') 338 | try: 339 | f.write(SETUPTOOLS_PKG_INFO) 340 | finally: 341 | f.close() 342 | return True 343 | 344 | _patch_egg_dir = _no_sandbox(_patch_egg_dir) 345 | 346 | def _before_install(): 347 | log.warn('Before install bootstrap.') 348 | _fake_setuptools() 349 | 350 | 351 | def _under_prefix(location): 352 | if 'install' not in sys.argv: 353 | return True 354 | args = sys.argv[sys.argv.index('install')+1:] 355 | for index, arg in enumerate(args): 356 | for option in ('--root', '--prefix'): 357 | if arg.startswith('%s=' % option): 358 | top_dir = arg.split('root=')[-1] 359 | return location.startswith(top_dir) 360 | elif arg == option: 361 | if len(args) > index: 362 | top_dir = args[index+1] 363 | return location.startswith(top_dir) 364 | if arg == '--user' and USER_SITE is not None: 365 | return location.startswith(USER_SITE) 366 | return True 367 | 368 | 369 | def _fake_setuptools(): 370 | log.warn('Scanning installed packages') 371 | try: 372 | import pkg_resources 373 | except ImportError: 374 | # we're cool 375 | log.warn('Setuptools or Distribute does not seem to be installed.') 376 | return 377 | ws = pkg_resources.working_set 378 | try: 379 | setuptools_dist = ws.find(pkg_resources.Requirement.parse('setuptools', 380 | replacement=False)) 381 | except TypeError: 382 | # old distribute API 383 | setuptools_dist = ws.find(pkg_resources.Requirement.parse('setuptools')) 384 | 385 | if setuptools_dist is None: 386 | log.warn('No setuptools distribution found') 387 | return 388 | # detecting if it was already faked 389 | setuptools_location = setuptools_dist.location 390 | log.warn('Setuptools installation detected at %s', setuptools_location) 391 | 392 | # if --root or --preix was provided, and if 393 | # setuptools is not located in them, we don't patch it 394 | if not _under_prefix(setuptools_location): 395 | log.warn('Not patching, --root or --prefix is installing Distribute' 396 | ' in another location') 397 | return 398 | 399 | # let's see if its an egg 400 | if not setuptools_location.endswith('.egg'): 401 | log.warn('Non-egg installation') 402 | res = _remove_flat_installation(setuptools_location) 403 | if not res: 404 | return 405 | else: 406 | log.warn('Egg installation') 407 | pkg_info = os.path.join(setuptools_location, 'EGG-INFO', 'PKG-INFO') 408 | if (os.path.exists(pkg_info) and 409 | _same_content(pkg_info, SETUPTOOLS_PKG_INFO)): 410 | log.warn('Already patched.') 411 | return 412 | log.warn('Patching...') 413 | # let's create a fake egg replacing setuptools one 414 | res = _patch_egg_dir(setuptools_location) 415 | if not res: 416 | return 417 | log.warn('Patched done.') 418 | 
_relaunch() 419 | 420 | 421 | def _relaunch(): 422 | log.warn('Relaunching...') 423 | # we have to relaunch the process 424 | # pip marker to avoid a relaunch bug 425 | if sys.argv[:3] == ['-c', 'install', '--single-version-externally-managed']: 426 | sys.argv[0] = 'setup.py' 427 | args = [sys.executable] + sys.argv 428 | sys.exit(subprocess.call(args)) 429 | 430 | 431 | def _extractall(self, path=".", members=None): 432 | """Extract all members from the archive to the current working 433 | directory and set owner, modification time and permissions on 434 | directories afterwards. `path' specifies a different directory 435 | to extract to. `members' is optional and must be a subset of the 436 | list returned by getmembers(). 437 | """ 438 | import copy 439 | import operator 440 | from tarfile import ExtractError 441 | directories = [] 442 | 443 | if members is None: 444 | members = self 445 | 446 | for tarinfo in members: 447 | if tarinfo.isdir(): 448 | # Extract directories with a safe mode. 449 | directories.append(tarinfo) 450 | tarinfo = copy.copy(tarinfo) 451 | tarinfo.mode = 448 # decimal for oct 0700 452 | self.extract(tarinfo, path) 453 | 454 | # Reverse sort directories. 455 | if sys.version_info < (2, 4): 456 | def sorter(dir1, dir2): 457 | return cmp(dir1.name, dir2.name) 458 | directories.sort(sorter) 459 | directories.reverse() 460 | else: 461 | directories.sort(key=operator.attrgetter('name'), reverse=True) 462 | 463 | # Set correct owner, mtime and filemode on directories. 464 | for tarinfo in directories: 465 | dirpath = os.path.join(path, tarinfo.name) 466 | try: 467 | self.chown(tarinfo, dirpath) 468 | self.utime(tarinfo, dirpath) 469 | self.chmod(tarinfo, dirpath) 470 | except ExtractError: 471 | e = sys.exc_info()[1] 472 | if self.errorlevel > 1: 473 | raise 474 | else: 475 | self._dbg(1, "tarfile: %s" % e) 476 | 477 | 478 | def main(argv, version=DEFAULT_VERSION): 479 | """Install or upgrade setuptools and EasyInstall""" 480 | tarball = download_setuptools() 481 | _install(tarball) 482 | 483 | 484 | if __name__ == '__main__': 485 | main(sys.argv[1:]) 486 | --------------------------------------------------------------------------------
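A minimal end-to-end sketch of how the recipes above fit together (illustrative only, not part of the repository; it assumes the treelearn package is importable and uses scikit-learn's bundled iris data together with the split_dataset helper from treelearn/test_helpers.py):

    import numpy as np
    import sklearn.datasets
    from treelearn import recipes
    from treelearn.test_helpers import split_dataset

    # hold out half of the iris data for testing
    iris = sklearn.datasets.load_iris()
    x_train, y_train, x_test, y_test = split_dataset(iris.data, iris.target, prct_train=0.5)

    # train_random_forest infers classification vs. regression from the label type of Y
    forest = recipes.train_random_forest(x_train, y_train, num_trees=20)

    # the returned ensemble exposes the usual fit/predict interface
    pred = forest.predict(x_test)
    accuracy = np.mean(pred == y_test)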