├── treelearn ├── clustered_classifier.py ├── test_helpers.py ├── typecheck.py ├── test_iris.py ├── __init__.py ├── breadth_first.py ├── test_regression.py ├── regression_ensemble.py ├── constant_leaf.py ├── clustered_regression.py ├── test_randomized_tree.py ├── test_ensembles.py ├── test_tree_helpers.py ├── clustered.py ├── tree_node.py ├── viterbi_tree.py ├── oblique_tree.py ├── oblique_tree_node.py ├── base_ensemble.py ├── classifier_ensemble.py ├── randomized_tree.py ├── recipes.py └── tree_helpers.py ├── setup.py ├── README.md ├── LICENSE └── distribute_setup.py /treelearn/clustered_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from clustered import ClusteredEstimator 3 | from sklearn.svm import LinearSVC 4 | from copy import deepcopy 5 | 6 | class ClusteredClassifier(ClusteredEstimator): 7 | def __init__(self, k=10, base_model = LinearSVC(), verbose=False): 8 | ClusteredEstimator.__init__(self, k, base_model, verbose) 9 | 10 | -------------------------------------------------------------------------------- /treelearn/test_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def split_dataset(x, y, prct_train=0.5): 4 | nrows, ncols = x.shape 5 | indices = np.arange(nrows) 6 | np.random.shuffle(indices) 7 | ntrain = int(nrows * prct_train) 8 | train_indices = indices[:ntrain] 9 | test_indices = indices[ntrain:] 10 | x_train = x[train_indices, :] 11 | x_test = x[test_indices, :] 12 | y_train = y[train_indices] 13 | y_test = y[test_indices] 14 | return x_train, y_train, x_test, y_test 15 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distribute_setup import use_setuptools 2 | use_setuptools() 3 | 4 | from setuptools import setup 5 | 6 | setup( 7 | name = "treelearn", 8 | version = "0.0.10", 9 | #package_dir = { '' : 'treelearn' }, 10 | packages = ['treelearn'], 11 | install_requires = [ 'scikit-learn' ], 12 | license = "LGPL", 13 | keywords = "machine learning tree forest random", 14 | url = "https://github.com/capitalk/treelearn", 15 | classifiers=[ 16 | "Development Status :: 3 - Alpha", 17 | "Topic :: Utilities", 18 | "License :: OSI Approved :: LGPL License", 19 | ], 20 | ) 21 | -------------------------------------------------------------------------------- /treelearn/typecheck.py: -------------------------------------------------------------------------------- 1 | 2 | def check_type(x, t): 3 | if not isinstance(x,t): 4 | msg = "Expected %s : %s to be %s" % (x, type(x), t) 5 | raise RuntimeError(msg) 6 | 7 | def check_field(x,f): 8 | if not hasattr(x,f): 9 | msg = "Expected %s : %s to have field %s" % (x, type(x), f) 10 | raise RuntimeError(msg) 11 | 12 | def check_fields(x,fs): 13 | for f in fs: 14 | check_field(x,f) 15 | 16 | def check_estimator(x): 17 | check_fields(x, ['fit', 'predict']) 18 | 19 | def check_int(x): 20 | check_type(x, int) 21 | 22 | def check_bool(x): 23 | check_type(x, bool) 24 | 25 | def check_dict(x): 26 | check_type(x, dict) 27 | -------------------------------------------------------------------------------- /treelearn/test_iris.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import recipes 4 | import numpy as np 5 | import sklearn.datasets 6 | from test_helpers import split_dataset 7 | 8 | iris = sklearn.datasets.load_iris() 9 | 
x_train, y_train, x_test, y_test = split_dataset(iris.data, iris.target)
10 | 
11 | 
12 | classifiers = [
13 |     recipes.train_svm_tree,
14 |     recipes.train_sgd_tree,
15 |     #recipes.train_svm_forest,
16 |     #recipes.train_sgd_forest,
17 |     recipes.train_random_forest,
18 |     recipes.train_clustered_svm,
19 |     recipes.train_clustered_svm_ensemble
20 | ]
21 | 
22 | def test_all_classifiers():
23 |     for model_constructor in classifiers:
24 | 
25 |         print model_constructor
26 |         model = model_constructor(x_train, y_train)
27 |         print model
28 |         pred = model.predict(x_test)
29 |         num_incorrect = np.sum(pred != y_test)
30 |         print "Expected num_incorrect <= 15, got:", num_incorrect
31 |         assert num_incorrect <= 15
32 | 
33 | 
--------------------------------------------------------------------------------
/treelearn/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | from constant_leaf import ConstantLeaf
3 | from tree_node import TreeNode
4 | from randomized_tree import RandomizedTree
5 | from oblique_tree import ObliqueTree
6 | from classifier_ensemble import ClassifierEnsemble
7 | from regression_ensemble import RegressionEnsemble
8 | from clustered_regression import ClusteredRegression
9 | from clustered_classifier import ClusteredClassifier
10 | from recipes import *
11 | 
12 | __all__ = [
13 |     'ClassifierEnsemble', 'RegressionEnsemble',
14 |     'ClusteredRegression', 'ClusteredClassifier',
15 |     'RandomizedTree', 'TreeNode', 'ConstantLeaf',
16 |     'train_random_forest',
17 |     'ObliqueTree',
18 |     'mk_svm_tree', 'train_svm_tree',
19 |     'mk_sgd_tree','train_sgd_tree',
20 |     'train_svm_forest', 'train_sgd_forest',
21 |     'mk_clustered_regression_ensemble', 'train_clustered_regression_ensemble',
22 |     'mk_clustered_classifier_ensemble', 'train_clustered_classifier_ensemble',
23 |     'train_clustered_ols',
24 |     'mk_additive_regression_forest', 'train_additive_regression_forest',
25 | ]
26 | 
--------------------------------------------------------------------------------
/treelearn/breadth_first.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | class BaseTree(object):
4 | 
5 |     def process_work_items(self, work_items):
6 |         max_pos = max(item.pos for item in work_items)
7 |         if max_pos > len(self.values):
8 |             self.grow_tree()
9 | 
10 |         # work items that need to create leaf nodes on the GPU
11 |         leaves = []
12 | 
13 |         # work items that need to be fully grown into small subtrees per thread block
14 |         subtrees = []
15 | 
16 |         # work items that need a single split but are small enough to be loaded into shared memory
17 |         small_splits = []
18 | 
19 |         # work items that have enough features to justify
20 |         # each one getting its own thread block
21 |         block_per_feature = []
22 | 
23 |         # otherwise, launch a kernel for each feature
24 |         kernel_per_feature = []
25 | 
26 |         for item in work_items:
27 |             if item.nelts == 1 or item.purity == 1.0:
28 |                 leaves.append(item)
29 |             elif item.nelts <= 32:
30 |                 subtrees.append(item)
31 |             elif item.nelts * item.n_features * self.values.itemsize <= 4096:
32 |                 small_splits.append(item)
33 |             elif item.n_features > 30:
34 |                 block_per_feature.append(item)
35 |             else:
36 |                 kernel_per_feature.append(item)
37 | 
38 | 
39 | 
40 | 
41 | 
--------------------------------------------------------------------------------
/treelearn/test_regression.py:
--------------------------------------------------------------------------------
1 | 
2 | import recipes
3 | import test_helpers
4 | from sklearn.datasets import make_friedman1, make_friedman2, make_friedman3
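# the make_friedman* helpers build synthetic nonlinear regression benchmarks;
# only make_friedman2 is actually used in the test below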
5 | from sklearn.metrics import mean_square_error 6 | from sklearn.linear_model import LinearRegression 7 | 8 | 9 | 10 | regressors = [ 11 | recipes.train_clustered_ols, 12 | lambda x, y: recipes.train_clustered_regression_ensemble(x, y, feature_subset_percent=1), 13 | #recipes.train_additive_regression_forest, 14 | recipes.train_random_forest 15 | ] 16 | def test_all_regressors(): 17 | x, y = make_friedman2(10000) 18 | x_train, y_train, x_test, y_test = test_helpers.split_dataset(x,y) 19 | #print y_test[:100] 20 | ols = LinearRegression() 21 | ols.fit(x_train, y_train) 22 | ols_pred = ols.predict(x_test) 23 | #print ols_pred[:100] 24 | ols_mse = mean_square_error(y_test, ols_pred) 25 | 26 | for fn in regressors: 27 | 28 | print fn 29 | model = fn(x_train,y_train) 30 | print model 31 | pred = model.predict(x_test) 32 | #print pred[:100] 33 | mse = mean_square_error(y_test, pred) 34 | 35 | print "OLS MSE:", ols_mse, " Current MSE:", mse 36 | print "Ratio:", mse / ols_mse 37 | assert ols_mse > 1.1*mse 38 | 39 | 40 | -------------------------------------------------------------------------------- /treelearn/regression_ensemble.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | from base_ensemble import BaseEnsemble 4 | 5 | class RegressionEnsemble(BaseEnsemble): 6 | def __init__(self, 7 | base_model=LinearRegression(), 8 | num_models = 50, 9 | bagging_percent=0.5, 10 | bagging_replacement=True, 11 | feature_subset_percent = 1.0, 12 | stacking_model=None, 13 | randomize_params = {}, 14 | additive = False, 15 | verbose=False): 16 | 17 | BaseEnsemble.__init__(self, 18 | base_model, 19 | num_models, 20 | bagging_percent, 21 | bagging_replacement, 22 | feature_subset_percent, 23 | stacking_model, 24 | randomize_params, 25 | additive, 26 | verbose) 27 | 28 | 29 | 30 | def predict(self, X): 31 | pred = self.transform(X) 32 | if self.stacking_model: 33 | return self.stacking_model.predict(pred) 34 | else: 35 | return np.dot(pred, self.weights) 36 | 37 | def _init_fit(self, X, Y): 38 | pass 39 | 40 | def _created_model(self, X, Y, indices, i, model): 41 | pass 42 | -------------------------------------------------------------------------------- /treelearn/constant_leaf.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 
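#
# Quick illustration of what this module provides: a ConstantLeaf ignores its
# inputs and predicts the same value for every row, e.g.
#
#     leaf = ConstantLeaf(1)
#     leaf.predict(np.zeros((3, 4)))   # -> array([1, 1, 1])
#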
16 | 17 | 18 | import numpy as np 19 | 20 | class ConstantLeaf: 21 | """Decision tree node which always predicts the same value.""" 22 | def __init__(self, v): 23 | self.v = v 24 | 25 | def to_str(self, indent="", feature_names=None): 26 | return indent + "Constant(" + str(self.v) + ")" 27 | 28 | def __str__(self): 29 | return self.to_str() 30 | 31 | def predict(self, X): 32 | X = np.atleast_2d(X) 33 | if isinstance(self.v, int): 34 | dtype = 'int32' 35 | else: 36 | dtype = 'float' 37 | outputs = np.zeros(X.shape[0], dtype=dtype) 38 | outputs[:] = self.v 39 | return outputs 40 | 41 | def fill_predict(self, X, outputs, mask): 42 | outputs[mask] = self.v 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /treelearn/clustered_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from clustered import ClusteredEstimator 3 | from sklearn.linear_model import LinearRegression 4 | from copy import deepcopy 5 | 6 | class ClusteredRegression(ClusteredEstimator): 7 | def __init__( 8 | self, 9 | k=10, 10 | base_model = LinearRegression(), 11 | cluster_prediction_weights = 'hard', # or 'soft' 12 | verbose=False): 13 | ClusteredEstimator.__init__(self, k, base_model, verbose) 14 | self.cluster_prediction_weights = cluster_prediction_weights 15 | 16 | def predict(self, X): 17 | nrows = X.shape[0] 18 | Y = np.zeros(nrows) 19 | if self.cluster_prediction_weights == 'hard': 20 | labels = self.clusters.predict(X) 21 | 22 | for label in self.models.keys(): 23 | mask = (labels == label) 24 | X_slice = X[mask, :] 25 | Y_slice = self.models[label].predict(X_slice) 26 | Y[mask] = Y_slice 27 | else: 28 | distances = self.clusters.transform(X) 29 | inv_dist_squared = 1.0 / (distances ** 2) 30 | Z = np.sum(inv_dist_squared, axis=1) 31 | # normalize weights so they add to 1 32 | weights = inv_dist_squared / np.array([Z], dtype='float').T 33 | if self.verbose: 34 | "First row of weights:", weights[0, :] 35 | for label in self.models.keys(): 36 | Y_curr = self.models[label].predict(X) 37 | Y += Y_curr * weights[:, label] 38 | return Y 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /treelearn/test_randomized_tree.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 
16 | 17 | 18 | import numpy as np 19 | import randomized_tree as tree 20 | 21 | def test_simple_tree(): 22 | data = np.array([[0,0], [0.1, 0.1], [1.0, 1.0], [.99,.99]]) 23 | labels = np.array([0,0,1,1]) 24 | t = tree.RandomizedTree(min_leaf_size=1) 25 | t.fit(data,labels) 26 | print t 27 | pred0 = t.predict(np.array([0.05, 0.05])) 28 | print "Expected: 0, Received:", pred0 29 | assert pred0 == 0 30 | 31 | pred1 = t.predict(np.array([0.995, 0.995])) 32 | print "Expected: 1, Received:", pred1 33 | assert pred1 == 1 34 | 35 | def test_big_tree(n=1000, d = 50, max_thresholds=10): 36 | t = tree.RandomizedTree(max_thresholds=max_thresholds) 37 | x = np.random.randn(n,d) 38 | y = np.random.randint(0,2,n) 39 | t.fit(x,y) 40 | return t 41 | 42 | def test_binary_data(n = 1000, d = 50): 43 | t = tree.RandomizedTree() 44 | x = np.random.randint(0,2, [n,d]) 45 | y = np.random.randint(0,2,n) 46 | t.fit(x,y) 47 | return t 48 | -------------------------------------------------------------------------------- /treelearn/test_ensembles.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 16 | 17 | 18 | import numpy as np 19 | from recipes import train_random_forest, train_svm_forest 20 | from classifier_ensemble import ClassifierEnsemble 21 | from randomized_tree import RandomizedTree 22 | from sklearn.linear_model import LogisticRegression 23 | 24 | n = 200 25 | left_data = np.random.randn(n, 10) 26 | left_labels = np.zeros(n, dtype='int') 27 | 28 | right_data = 10*(np.random.randn(n,10)-2) 29 | right_labels = np.ones(n, dtype='int') 30 | 31 | data = np.concatenate([left_data, right_data]) 32 | labels = np.concatenate([left_labels, right_labels]) 33 | 34 | 35 | def try_predictor(model): 36 | print "Trying predictor:", model 37 | 38 | pred0 = model.predict(left_data) 39 | fp = np.sum(pred0 != 0) 40 | print "False positives:", fp 41 | assert fp < (n / 10) 42 | 43 | pred1 = model.predict(right_data) 44 | fn = np.sum(pred1 != 1) 45 | print "False negatives:", fn 46 | assert fn < (n/ 10) 47 | 48 | 49 | def test_simple_forest(): 50 | try_predictor(train_random_forest(data, labels)) 51 | 52 | def test_svm_forest(): 53 | try_predictor(train_svm_forest(data, labels, tree_args={'verbose':True})) 54 | 55 | def test_stacked_random_forest(): 56 | t = RandomizedTree(min_leaf_size=1) 57 | lr = LogisticRegression() 58 | ensemble = ClassifierEnsemble(base_model=t, stacking_model=lr) 59 | ensemble.fit(data, labels) 60 | try_predictor(ensemble) 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | TreeLearn started as a Python implementation of Breiman's Random Forest 2 | but is being slowly generalized into a tree ensemble library. 
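
For a quick end-to-end example (a sketch using scikit-learn's bundled iris data):

    import sklearn.datasets
    import treelearn

    iris = sklearn.datasets.load_iris()
    forest = treelearn.train_random_forest(iris.data, iris.target)
    print forest.predict(iris.data[:5])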
3 | 
4 | 
5 | ## Creating a Random Forest
6 | 
7 | A random forest is simply a bagging ensemble of randomized trees. To construct
8 | one with default parameters:
9 | 
10 |     forest = treelearn.ClassifierEnsemble(base_model = treelearn.RandomizedTree())
11 | 
12 | 
13 | ## Training
14 | 
15 | Place your training data in an n-by-d numpy array, where n is the number of
16 | training examples and d is the dimensionality of your data.
17 | Place labels in an n-length numpy array. Then call:
18 | 
19 |     forest.fit(Xtrain,Y)
20 | 
21 | If you're lazy, there's a helper for simultaneously creating and training a random forest:
22 | 
23 |     forest = treelearn.train_random_forest(X, Y)
24 | 
25 | 
26 | ## Classification
27 | 
28 |     forest.predict(Xtest)
29 | 
30 | 
31 | ## ClassifierEnsemble options
32 | 
33 | * base_model = any classifier which obeys the fit/predict protocol
34 | 
35 | * num_models = size of the forest
36 | 
37 | * bagging_percent = what percentage of your data each classifier is trained on
38 | 
39 | * bagging_replacement = sample with or without replacement
40 | 
41 | * stacking_model = treat outputs of base classifiers as inputs to the given model
42 | 
43 | 
44 | ## RandomizedTree options
45 | 
46 | * num_features_per_node = number of features each node of a tree should
47 | consider (default = log2 of total features)
48 | 
49 | * min_leaf_size = stop splitting if we get down to this number of data points
50 | 
51 | * max_height = stop splitting if we exceed this number of tree levels
52 | 
53 | * max_thresholds = how many feature value thresholds to consider (use None for all values)
54 | 
55 | ## ObliqueTree options
56 | * num_features_per_node = size of random feature subset at each node,
57 | default = sqrt(total number of features)
58 | 
59 | * C = tradeoff between error and L2 regularizer of linear SVM
60 | 
61 | * max_depth = When you get to this depth, train an SVM on all features
62 | and stop splitting the data.
63 | 
64 | * min_leaf_size = stop splitting when any subset of the data gets smaller
65 | than this.
66 | 
--------------------------------------------------------------------------------
/treelearn/test_tree_helpers.py:
--------------------------------------------------------------------------------
1 | # TreeLearn
2 | #
3 | # Copyright (C) Capital K Partners
4 | # Author: Alex Rubinsteyn
5 | # Contact: alex [at] capitalkpartners [dot] com
6 | #
7 | # This library is free software; you can redistribute it and/or
8 | # modify it under the terms of the GNU Lesser General Public
9 | # License as published by the Free Software Foundation; either
10 | # version 2.1 of the License, or (at your option) any later version.
11 | #
12 | # This library is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 | # Lesser General Public License for more details.
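#
# Reference for the expected values below: Gini impurity is 1 - sum_c p_c^2
# over the class proportions p_c.  For the all-zero label set this is
# 1 - 1.0**2 = 0.0, and for the evenly mixed labels [0,1,0,1] it is
# 1 - (0.5**2 + 0.5**2) = 0.5, which is what test_gini asserts.
#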
16 | 17 | 18 | import numpy as np 19 | 20 | from tree_helpers import * 21 | 22 | def test_majority(): 23 | labels = np.array([1,1,1,2,3,3,3,2,3,3,3,1,3,3,3,3]) 24 | result = majority(labels) 25 | print "Expected 3:, Received:", result 26 | assert result == 3 27 | classes = [1,2] 28 | result = majority(labels, classes) 29 | print "Expected 1:, Received:", result 30 | assert result == 1 31 | 32 | 33 | classes = np.array([0,1]) 34 | all_zero = np.array([0,0,0,0]) 35 | mixed = np.array([0,1,0,1]) 36 | def test_gini(): 37 | result1 = gini(classes, all_zero) 38 | print "Expected 0.0, Received:", result1 39 | assert result1 == 0.0 40 | result2 = gini(classes, mixed) 41 | print "Expected 0.5, Received:", result2 42 | assert result2 == 0.5 43 | 44 | feature_vec = np.array([0.1, 0.5, 0.9, 1.1]) 45 | def test_eval_split(): 46 | slow = slow_eval_split(classes, feature_vec, 0.5, mixed) 47 | print "Slow GINI", slow 48 | fast = eval_gini_split(classes, feature_vec, 0.5, mixed) 49 | print "Fast GINI", fast 50 | assert abs(slow - fast) < 0.01 51 | 52 | 53 | labels = np.array([0, 0, 1, 1]) 54 | thresholds = midpoints(np.unique(feature_vec)) 55 | def test_eval_all_splits(): 56 | thresh_slow, score_slow = slow_find_best_gini_split(classes, feature_vec, thresholds, labels) 57 | print "Slow Thresh", thresh_slow, "Score", score_slow 58 | 59 | thresh_fast, score_fast = find_best_gini_split(classes, feature_vec, thresholds, labels) 60 | print "Fast Thresh", thresh_fast, "Score", score_fast 61 | assert thresh_slow == 0.7 62 | assert thresh_fast == 0.7 63 | 64 | -------------------------------------------------------------------------------- /treelearn/clustered.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import deepcopy 3 | 4 | 5 | from sklearn.base import BaseEstimator 6 | from sklearn.cluster import MiniBatchKMeans 7 | from sklearn.linear_model import LinearRegression 8 | 9 | from typecheck import check_estimator, check_dict, check_int, check_bool 10 | from tree_helpers import clear_sklearn_fields 11 | 12 | class ClusteredEstimator(BaseEstimator): 13 | """Base class for ClusteredRegression and ClusteredClassifier""" 14 | def __init__(self, k, base_model, verbose=False): 15 | check_int(k) 16 | check_estimator(base_model) 17 | check_bool(verbose) 18 | 19 | self.k = k 20 | self.base_model = base_model 21 | self.verbose = verbose 22 | self.clusters = MiniBatchKMeans(k) 23 | self.models = None 24 | 25 | def fit(self, X, Y, **fit_keywords): 26 | self.models = {} 27 | if self.verbose: 28 | print "Clustering X" 29 | # also get back the labels so we can use them to create regressors 30 | self.clusters.fit(X) 31 | labels = self.clusters.labels_ 32 | # clear this field so that it doesn't get serialized later 33 | self.clusters.labels_ = None 34 | for label in np.unique(labels): 35 | if self.verbose: 36 | print "Fitting model for cluster", label 37 | model = deepcopy(self.base_model) 38 | mask = (labels == label) 39 | X_slice = X[mask, :] 40 | Y_slice = Y[mask] 41 | model.fit(X_slice, Y_slice, **fit_keywords) 42 | 43 | # clear sklearn's left over junk to make pickled strings smaller 44 | clear_sklearn_fields(model) 45 | self.models[label] = model 46 | 47 | def predict(self, X): 48 | if self.verbose: 49 | print "Prediction inputs of shape", X.shape 50 | nrows = X.shape[0] 51 | Y = np.zeros(nrows) 52 | if self.verbose: 53 | print "Assigning cluster labels to input data" 54 | labels = self.clusters.predict(X) 55 | for label in self.models.keys(): 56 | 
mask = (labels == label) 57 | if self.verbose: 58 | print "Predicting cluster", label, "nvectors = ", np.sum(mask) 59 | 60 | X_slice = X[mask, :] 61 | model = self.models[label] 62 | Y[mask] = model.predict(X_slice) 63 | return Y 64 | 65 | -------------------------------------------------------------------------------- /treelearn/tree_node.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 16 | 17 | from sklearn.base import BaseEstimator 18 | 19 | class TreeNode(BaseEstimator): 20 | """Basic decision tree interior node.""" 21 | 22 | def __init__(self, feature_idx, split_val, left, right): 23 | self.feature_idx = feature_idx 24 | self.split_val = split_val 25 | self.left = left 26 | self.right = right 27 | 28 | 29 | def predict(self, X): 30 | """Inefficient since calling this method recursively copy outputs""" 31 | outputs = np.zeros(X.shape[0]) 32 | col = X[:, self.feature_idx] 33 | split = col < self.split_val 34 | left_mask = mask & split 35 | outputs[left_mask] = self.left.predict(X[left_mask, :]) 36 | right_mask = mask & ~split 37 | outputs[right_mask] = self.right.predict(X[right_mask, :]) 38 | return outputs 39 | 40 | 41 | def fill_predict(self, X, outputs, mask): 42 | """instead of returning output values, let the leaves fill an 43 | output matrix 44 | """ 45 | col = X[:, self.feature_idx] 46 | split = col < self.split_val 47 | left_mask = mask & split 48 | right_mask = mask & ~split 49 | self.left.fill_predict(X, outputs, left_mask) 50 | self.right.fill_predict(X, outputs, right_mask) 51 | 52 | 53 | def to_str(self, indent="", feature_names=None): 54 | if feature_names: 55 | featureStr = feature_names[feature_idx] 56 | else: 57 | featureStr = "x[" + str(self.feature_idx) + "]" 58 | longer_indent = indent + " " 59 | left = self.left.to_str(indent = longer_indent) 60 | right = self.right.to_str(indent = longer_indent) 61 | cond = "if %s < %f:" % (featureStr, self.split_val) 62 | return indent + cond + "\n" + left + "\n" + indent + "else:\n" + right 63 | 64 | 65 | -------------------------------------------------------------------------------- /treelearn/viterbi_tree.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 
11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 16 | 17 | 18 | from sklearn.base import BaseEstimator 19 | from sklearn.linear_model import LogisticRegression 20 | from sklearn.svm import LinearSVC 21 | 22 | class ViterbiTreeNode(BaseEstimator): 23 | def __init__(self, depth, max_depth, num_retries, leaf_model): 24 | self.depth = depth 25 | self.max_depth = max_depth 26 | self.is_leaf = (depth == max_depth) 27 | self.num_retries = num_retries 28 | self.C = C 29 | if depth == max_depth: 30 | self.leaf_model = leaf_model 31 | else: 32 | self.left = ViterbiTreeNode(depth+1, max_depth, num_retries, leaf_model) 33 | self.right = ViterbiTreeNode(depth+1, max_depth, num_retries, leaf_model) 34 | 35 | def gen_random_cs(self): 36 | return 10 ** (np.random.randn(self.num_retries) - 1) 37 | 38 | def init_fit(self, X,Y): 39 | """Initialize partitions and leaf models to minimize training error""" 40 | best_model = None 41 | best_error = np.inf 42 | 43 | for c in self.gen_random_cs(): 44 | if self.is_leaf: 45 | model = self.leaf_model(C=c) 46 | else: 47 | model = LinearSVC(C=c) 48 | 49 | model.fit(X,Y) 50 | error = model.score(X,Y) 51 | if err < best_error: 52 | best_model = model 53 | best_error = error 54 | self.model = best_model 55 | if not self.is_leaf: 56 | pred = model.predict(X) 57 | mask = (pred != 1) 58 | self.left.init_fit(X[mask, :], Y[mask]) 59 | self.right.init_fit(X[~mask, :], Y[~mask]) 60 | 61 | def refit_partition(X,partition,Y): 62 | """Assumes that 'init_fit' has already been run.""" 63 | if self.is_leaf: 64 | self.model.fit(X,Y) 65 | else: 66 | nrows = X.shape[0] 67 | # get probabilities of y=1 68 | left_prob = self.left.predict_proba(X)[:, 1] 69 | right_prob = self.right.predict_proba(X)[:, 1] 70 | assignments = np.zeros(nrows) 71 | right_mask = (left_prob < right_prob) & Y == 1 72 | 73 | # TODO 74 | # assignments[] 75 | def refit_leaves(X,Y): 76 | # TODO 77 | pass 78 | 79 | def predict(X): 80 | # TODO 81 | pass 82 | 83 | class ViterbiTree(BaseEstimator): 84 | def __init__(self, max_depth=3, num_retries = 3, leaf_model=LogisticRegression): 85 | self.root = ViterbiTreeNode(1, max_depth, num_retries, leaf_model) 86 | 87 | def fit(self, X, Y): 88 | self.root.init_fit(X,Y) 89 | 90 | def predict(self, X) 91 | return self.root.predict(X) 92 | -------------------------------------------------------------------------------- /treelearn/oblique_tree.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 
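#
# Usage sketch (illustrative only; the parameter values here are arbitrary and
# x_train / y_train stand in for your own arrays):
#
#     from treelearn import ObliqueTree
#     tree = ObliqueTree(max_depth=4, min_leaf_size=20)
#     tree.fit(x_train, y_train)
#     pred = tree.predict(x_test)
#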
16 | 
17 | 
18 | from copy import deepcopy
19 | import math
20 | import numpy as np
21 | from sklearn.base import BaseEstimator
22 | from sklearn.svm import LinearSVC
23 | 
24 | from tree_helpers import majority, clear_sklearn_fields
25 | from typecheck import check_estimator, check_dict, check_int, check_bool
26 | from oblique_tree_node import _ObliqueTreeNode
27 | 
28 | 
29 | class ObliqueTree(BaseEstimator):
30 |     """A decision tree whose splits are hyperplanes.
31 |     Used as base learner for oblique random forests.
32 |     For more information, see 'On oblique random forests'.
33 |     http://people.csail.mit.edu/menze/papers/menze_11_oblique.pdf
34 | 
35 |     Parameters
36 |     ----------------
37 |     leaf_model : classifier or regressor.
38 | 
39 |     split_classifier : classifier, optional (default = LinearSVC())
40 |         Learning machine used to assign data points to either side of a tree
41 |         split.
42 | 
43 |     num_features_per_node : int, optional (default = sqrt of total feature count)
44 | 
45 |     max_depth : int, optional (default=3).
46 |         The number of SVMs will be at most 2^max_depth
47 | 
48 |     min_leaf_size : int, optional (default=50).
49 |         Don't split data if it gets smaller than this number.
50 | 
51 |     randomize_split_params : dict, optional (default={})
52 |         Maps names of split classifier parameters to functions which generate
53 |         random values.
54 | 
55 |     randomize_leaf_params : dict, optional (default={})
56 |         Maps names of leaf model params to functions which randomly generate
57 |         their values.
58 |     """
59 | 
60 |     def __init__(self,
61 |             leaf_model = LinearSVC(),
62 |             split_classifier = LinearSVC(),
63 |             num_features_per_node = None,
64 |             max_depth=3,
65 |             min_leaf_size=50,
66 |             randomize_split_params={},
67 |             randomize_leaf_params={},
68 |             verbose = False):
69 | 
70 |         # check everyone's types -- I can't give up the OCaml instincts
71 |         # also, if running this code remotely it's nice to know when something
72 |         # goes wrong before we send an object over to AWS
73 |         check_estimator(leaf_model)
74 |         check_estimator(split_classifier)
75 |         check_int(max_depth)
76 |         check_int(min_leaf_size)
77 |         check_dict(randomize_split_params)
78 |         check_dict(randomize_leaf_params)
79 |         check_bool(verbose)
80 | 
81 |         self.leaf_model = leaf_model
82 |         self.split_classifier = split_classifier
83 |         self.max_depth = max_depth
84 |         self.min_leaf_size = min_leaf_size
85 |         self.num_features_per_node = num_features_per_node
86 | 
87 |         self.randomize_split_params = randomize_split_params
88 |         self.randomize_leaf_params = randomize_leaf_params
89 |         self.verbose = verbose
90 | 
91 |         self.root = None
92 |         self.classes = None
93 | 
94 |     def fit(self, X, Y, **fit_keywords):
95 |         X = np.atleast_2d(X)
96 |         Y = np.atleast_1d(Y)
97 | 
98 |         n_features = X.shape[1]
99 |         num_features_per_node = self.num_features_per_node
100 | 
101 |         if num_features_per_node is None:
102 |             num_features_per_node = int(math.ceil(math.sqrt(n_features)))
103 | 
104 |         elif num_features_per_node > n_features:
105 |             num_features_per_node = n_features
106 | 
107 |         self.classes = list(np.unique(Y))
108 | 
109 |         self.root = _ObliqueTreeNode(
110 |             split_classifier = self.split_classifier,
111 |             leaf_model = self.leaf_model,
112 |             num_features_per_node = num_features_per_node,
113 |             classes = self.classes,
114 |             depth = 1,
115 |             max_depth = self.max_depth,
116 |             min_leaf_size = self.min_leaf_size,
117 |             randomize_split_params = self.randomize_split_params,
118 |             randomize_leaf_params = self.randomize_leaf_params,
119 |             verbose = self.verbose
120 |         )
121 |         self.root.fit(X, Y,
**fit_keywords) 122 | 123 | 124 | def predict(self, X): 125 | return self.root.predict(X) 126 | 127 | -------------------------------------------------------------------------------- /treelearn/oblique_tree_node.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | from sklearn.base import BaseEstimator 4 | from tree_helpers import majority, clear_sklearn_fields 5 | from constant_leaf import ConstantLeaf 6 | 7 | class _ObliqueTreeNode(BaseEstimator): 8 | """ 9 | Do not use this directly, instead train an ObliqueTree""" 10 | def __init__(self, 11 | split_classifier, 12 | leaf_model, 13 | num_features_per_node, 14 | classes, 15 | depth, 16 | max_depth, 17 | min_leaf_size, 18 | randomize_split_params, 19 | randomize_leaf_params, 20 | verbose): 21 | 22 | self.split_classifier = split_classifier 23 | self.leaf_model = leaf_model 24 | self.num_features_per_node = num_features_per_node 25 | self.classes = classes 26 | self.depth = depth 27 | self.max_depth = max_depth 28 | self.min_leaf_size = min_leaf_size 29 | self.randomize_split_params = randomize_split_params 30 | self.randomize_leaf_params = randomize_leaf_params 31 | self.verbose = verbose 32 | 33 | self.children = {} 34 | self.model = None 35 | self.subspace = None 36 | 37 | def _fit_leaf(self, X, Y, fit_keywords): 38 | if self.verbose: 39 | print "Fitting leaf" 40 | model = deepcopy(self.leaf_model) 41 | for field, gen in self.randomize_leaf_params.items(): 42 | setattr(model, field, gen()) 43 | model.fit(X, Y, **fit_keywords) 44 | clear_sklearn_fields(model) 45 | return model 46 | 47 | def _fit_child(self, X_slice, Y_slice, fit_keywords): 48 | count = X_slice.shape[0] 49 | unique_ys = np.unique(Y_slice) 50 | if len(unique_ys) == 1: 51 | const = int(unique_ys[0]) 52 | if self.verbose: 53 | print "ConstantLeaf", const 54 | child = ConstantLeaf(const) 55 | elif count < self.min_leaf_size: 56 | child = self._fit_leaf(X_slice, Y_slice, fit_keywords) 57 | else: 58 | child = _ObliqueTreeNode( 59 | split_classifier = self.split_classifier, 60 | leaf_model = self.leaf_model, 61 | num_features_per_node = self.num_features_per_node, 62 | classes = self.classes, 63 | depth = self.depth +1, 64 | max_depth = self.max_depth, 65 | min_leaf_size = self.min_leaf_size, 66 | randomize_split_params = self.randomize_split_params, 67 | randomize_leaf_params = self.randomize_leaf_params, 68 | verbose = self.verbose 69 | ) 70 | child.fit(X_slice, Y_slice, **fit_keywords) 71 | return child 72 | 73 | 74 | 75 | def fit(self, X, Y, **fit_keywords): 76 | n_samples, n_features = X.shape 77 | 78 | if self.verbose: 79 | print "Depth", self.depth, ": Fitting model for", n_samples, "vectors" 80 | 81 | if self.depth >= self.max_depth or n_samples <= self.min_leaf_size: 82 | self.model = self._fit_leaf(X, Y, fit_keywords) 83 | else: 84 | 85 | # if we've been passed a limit to the number of features 86 | # then train the current model on a random subspace of that size 87 | if self.num_features_per_node: 88 | feature_indices = np.random.permutation(n_features) 89 | self.subspace = feature_indices[:self.num_features_per_node] 90 | X_reduced = X[:, self.subspace] 91 | else: 92 | X_reduced = X 93 | 94 | 95 | self.model = deepcopy(self.split_classifier) 96 | for field, gen in self.randomize_split_params.items(): 97 | setattr(self.model, field, gen()) 98 | self.model.fit(X_reduced, Y, **fit_keywords) 99 | clear_sklearn_fields(self.model) 100 | pred = self.model.predict(X_reduced) 101 | 102 | 
for c in self.classes: 103 | mask = (pred == c) 104 | count = np.sum(mask) 105 | if count == 0: 106 | self.children[c] = ConstantLeaf(int(c)) 107 | else: 108 | X_slice = X[mask, :] 109 | Y_slice = Y[mask] 110 | self.children[c] = self._fit_child(X_slice, Y_slice, fit_keywords) 111 | 112 | def predict(self, X): 113 | nrows = X.shape[0] 114 | if self.subspace is not None: 115 | X_reduced = X[:, self.subspace] 116 | pred = self.model.predict(X_reduced) 117 | else: 118 | pred = self.model.predict(X) 119 | 120 | if len(self.children) == 0: 121 | return pred 122 | else: 123 | # fill this array with sub-arrays received from the predictions of children 124 | outputs = pred.copy() 125 | for c in self.classes: 126 | mask = (pred == c) 127 | X_slice = X[mask, :] 128 | count = X_slice.shape[0] 129 | 130 | if count > 0: 131 | pred = self.children[c].predict(X_slice) 132 | outputs[mask] = pred 133 | return outputs 134 | -------------------------------------------------------------------------------- /treelearn/base_ensemble.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 
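#
# BaseEnsemble handles the shared machinery (bagging, optional feature
# subsetting, optional additive/residual fitting, and stacking).  Concrete
# subclasses such as ClassifierEnsemble and RegressionEnsemble implement the
# two hooks that fit() calls:
#
#     _init_fit(X, Y)                           # once, before any model is trained
#     _created_model(X, Y, indices, i, model)   # after each base model is fit
#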
16 | 17 | 18 | from copy import deepcopy 19 | import numpy as np 20 | import random 21 | import math 22 | 23 | from sklearn.base import BaseEstimator 24 | 25 | from tree_helpers import clear_sklearn_fields 26 | from typecheck import check_estimator, check_dict, check_int, check_bool 27 | 28 | class BaseEnsemble(BaseEstimator): 29 | def __init__(self, 30 | base_model, 31 | num_models, 32 | bagging_percent, 33 | bagging_replacement, 34 | feature_subset_percent, 35 | stacking_model, 36 | randomize_params, 37 | additive, 38 | verbose): 39 | check_estimator(base_model) 40 | check_int(num_models) 41 | 42 | self.base_model = base_model 43 | self.num_models = num_models 44 | self.bagging_percent = bagging_percent 45 | self.bagging_replacement = bagging_replacement 46 | self.feature_subset_percent = feature_subset_percent 47 | self.stacking_model = stacking_model 48 | self.randomize_params = randomize_params 49 | self.additive = additive 50 | self.verbose = verbose 51 | self.need_to_fit = True 52 | self.models = None 53 | self.weights = None 54 | 55 | 56 | def fit(self, X, Y, **fit_keywords): 57 | assert self.base_model is not None 58 | assert self.bagging_percent is not None 59 | assert self.bagging_replacement is not None 60 | assert self.num_models is not None 61 | assert self.verbose is not None 62 | 63 | self.need_to_fit = False 64 | self.models = [] 65 | 66 | X = np.atleast_2d(X) 67 | Y = np.atleast_1d(Y) 68 | 69 | n_rows, total_features = X.shape 70 | bagsize = int(math.ceil(self.bagging_percent * n_rows)) 71 | 72 | 73 | if self.additive: 74 | self.weights = np.ones(self.num_models, dtype='float') 75 | else: 76 | self.weights = np.ones(self.num_models, dtype='float') / self.num_models 77 | 78 | 79 | # each derived class needs to implement this 80 | self._init_fit(X,Y) 81 | if self.feature_subset_percent < 1: 82 | n_features = int(math.ceil(self.feature_subset_percent * total_features)) 83 | self.feature_subsets = [] 84 | else: 85 | n_features = total_features 86 | self.feature_subsets = None 87 | 88 | for i in xrange(self.num_models): 89 | if self.verbose: 90 | print "Training iteration", i 91 | 92 | if self.bagging_replacement: 93 | indices = np.random.random_integers(0,n_rows-1,bagsize) 94 | else: 95 | p = np.random.permutation(n_rows) 96 | indices = p[:bagsize] 97 | 98 | data_subset = X[indices, :] 99 | if n_features < total_features: 100 | feature_indices = np.random.permutation(total_features)[:n_features] 101 | self.feature_subsets.append(feature_indices) 102 | data_subset = data_subset[:, feature_indices] 103 | 104 | label_subset = Y[indices] 105 | model = deepcopy(self.base_model) 106 | # randomize parameters using given functions 107 | for param_name, fn in self.randomize_params.items(): 108 | setattr(model, param_name, fn()) 109 | model.fit(data_subset, label_subset, **fit_keywords) 110 | 111 | self.models.append(model) 112 | self._created_model(X, Y, indices, i, model) 113 | 114 | if self.additive: 115 | if n_features < total_features: 116 | Y -= model.predict(X[:, feature_indices]) 117 | else: 118 | Y -= model.predict(X) 119 | 120 | clear_sklearn_fields(model) 121 | # stacking works by treating the outputs of each base classifier as the 122 | # inputs to an additional meta-classifier 123 | if self.stacking_model: 124 | transformed_data = self.transform(X) 125 | self.stacking_model.fit(transformed_data, Y) 126 | 127 | 128 | def transform(self, X): 129 | """Convert each feature vector into a row of predictions.""" 130 | assert self.models is not None 131 | 132 | X = 
np.atleast_2d(X) 133 | n_samples, n_features = X.shape 134 | n_models = len(self.models) 135 | pred = np.zeros([n_samples, n_models]) 136 | if self.feature_subsets: 137 | for i, model in enumerate(self.models): 138 | feature_indices = self.feature_subsets[i] 139 | X_subset = X[:, feature_indices] 140 | pred[:, i] = model.predict(X_subset) 141 | else: 142 | for i, model in enumerate(self.models): 143 | pred[:, i] = model.predict(X) 144 | return pred 145 | 146 | -------------------------------------------------------------------------------- /treelearn/classifier_ensemble.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.svm import LinearSVC 4 | from sklearn.metrics import fbeta_score 5 | 6 | from base_ensemble import BaseEnsemble 7 | 8 | class ClassifierEnsemble(BaseEnsemble): 9 | """ 10 | Train an ensemble of classifiers using a 11 | subset of the data for each base classifier. 12 | 13 | Parameters 14 | ---------- 15 | base_model : Any classifier which obeys the fit/predict protocol. 16 | Defaults to a Linear SVM with C = 1. 17 | 18 | num_models : int, optional (default = 50) 19 | How many base classifiers to train. 20 | 21 | bagging_percent : float, optional (default=0.5). 22 | How much of the data set goes into each bootstrap sample. 23 | 24 | bagging_replacement : bool, optional (default = True). 25 | Sample with our without replacement. 26 | 27 | weighting : None or float, optional (default=None). 28 | Weight individual classifiers in the ensemble by 29 | None : all classifiers given equal weight 30 | : compute F_beta score for each classifier. 31 | Only works for binary classification. 32 | 33 | stacking : classifier, optional (default=None). 34 | Feed output of weighted individual predictions into another classifier. 35 | Suggested model: LogisticRegression. 36 | 37 | 38 | verbose : bool, optional (default = False). 39 | Print diagnostic output. 
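    Example
    -------
    A minimal sketch (x_train / y_train here stand in for any training arrays,
    e.g. the output of test_helpers.split_dataset):

        ensemble = ClassifierEnsemble(num_models=20, bagging_percent=0.7)
        ensemble.fit(x_train, y_train)
        pred = ensemble.predict(x_test)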
40 | """ 41 | 42 | def __init__(self, 43 | base_model = LinearSVC(), 44 | num_models = 50, 45 | bagging_percent=0.5, 46 | bagging_replacement=True, 47 | feature_subset_percent = 1.0, 48 | weighting = None, 49 | stacking_model = None, 50 | randomize_params = {}, 51 | verbose=False): 52 | 53 | BaseEnsemble.__init__( 54 | self, 55 | base_model, 56 | num_models, 57 | bagging_percent, 58 | bagging_replacement, 59 | feature_subset_percent, 60 | stacking_model, 61 | randomize_params, 62 | False, # for now additive only works for regression 63 | verbose) 64 | self.weighting = weighting 65 | self.classes = None 66 | self.class_list = None 67 | 68 | def _init_fit(self, X, Y): 69 | self.classes = np.unique(Y) 70 | self.class_list = list(self.classes) 71 | 72 | def _created_model(self, X, Y, indices, i, model): 73 | # to assign an F-score weight to each classifier, 74 | # sample another subset of the data and use the model 75 | # we just train to generate predictions 76 | beta = self.weighting 77 | n = X.shape[0] 78 | bagsize = len(indices) 79 | if beta or self.verbose: 80 | error_sample_indices = np.random.random_integers(0,n-1,bagsize) 81 | error_subset = X[error_sample_indices, :] 82 | if self.feature_subsets: 83 | error_subset = error_subset[:, self.feature_subsets[i]] 84 | error_labels = Y[error_sample_indices] 85 | y_pred = model.predict(error_subset) 86 | 87 | if self.weighting: 88 | f_score = fbeta_score(error_labels, y_pred, beta) 89 | self.weights[i] = f_score 90 | if self.verbose: 91 | print "Actual non-zero:", np.sum(error_labels != 0) 92 | num_pred_nz = np.sum(y_pred != 0) 93 | print "Predicted non-zero:", num_pred_nz 94 | pred_correct = (y_pred == error_labels) 95 | pred_nz = (y_pred != 0) 96 | num_true_nz = np.sum(pred_correct & pred_nz) 97 | print "True non-zero:", num_true_nz 98 | print "False non-zero:", num_pred_nz - num_true_nz 99 | print "---" 100 | # normalize weights to add up to 1 101 | 102 | 103 | def _predict_votes(self, X): 104 | X = np.atleast_2d(X) 105 | n_samples, n_features = X.shape 106 | n_classes = len(self.classes) 107 | votes = np.zeros( [n_samples, n_classes] ) 108 | 109 | for i, model in enumerate(self.models): 110 | weight = self.weights[i] 111 | if self.feature_subsets: 112 | feature_indices = self.feature_subsets[i] 113 | X_subset = X[:, feature_indices] 114 | ys = model.predict(X_subset) 115 | else: 116 | ys = model.predict(X) 117 | for c in self.classes: 118 | class_index = self.class_list.index(c) 119 | votes[ys == c, class_index] += weight 120 | return votes 121 | 122 | def _predict_normalized_votes(self, X): 123 | votes = self._predict_votes(X) 124 | sums = np.sum(votes, axis=1) 125 | return votes / np.array([sums], dtype='float').T 126 | 127 | def _weighted_transform(self, X): 128 | pred = self.transform(X) 129 | for i, w in enumerate(self.weights): 130 | pred[:, i] *= w 131 | return pred 132 | 133 | def _predict_stacked_probs(self, X): 134 | transformed = self.transform(X) 135 | return self.stacking_model.predict_proba(transformed) 136 | 137 | def predict_proba(self, X): 138 | if self.need_to_fit: 139 | raise RuntimeError("Trying to call 'predict_proba' before 'fit'") 140 | if self.stacking_model: 141 | return self._predict_stacked_probs(X) 142 | else: 143 | return self._predict_normalized_votes(X) 144 | 145 | def predict(self, X, return_probs=False): 146 | """Every classifier in the ensemble votes for a class. 
147 | If we're doing stacking, then pass the votes as features into 148 | the stacking classifier, otherwise return the majority vote.""" 149 | if self.need_to_fit: 150 | raise RuntimeError("Trying to call 'predict' before 'fit'") 151 | 152 | if self.stacking_model: 153 | majority_indices = np.argmax(self._predict_stacked_probs(X), axis=1) 154 | else: 155 | majority_indices = np.argmax(self._predict_votes(X), axis=1) 156 | return np.array([self.class_list[i] for i in majority_indices]) 157 | 158 | -------------------------------------------------------------------------------- /treelearn/randomized_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import scipy.weave 4 | import scipy.stats 5 | import random 6 | import math 7 | 8 | from sklearn.base import BaseEstimator 9 | from constant_leaf import ConstantLeaf 10 | from tree_node import TreeNode 11 | import random 12 | from tree_helpers import majority, midpoints 13 | from tree_helpers import find_best_gini_split, find_min_variance_split 14 | 15 | class RandomizedTree(BaseEstimator): 16 | """Decision tree which only inspects a random subset of the features 17 | at each split. Uses Gini impurity to compare possible data splits. 18 | 19 | Parameters 20 | ---------- 21 | num_features_per_node : int, optional (default = None). 22 | At each split, how many features should we consider splitting. 23 | If None, then use log(total number of features). 24 | 25 | min_leaf_size : int, optional (default=15). 26 | Stop splitting when the data gets this small. 27 | 28 | max_height : int, optional (default = 100). 29 | Stop growing tree at this height. 30 | 31 | max_thresholds : int, optional (default = None). 32 | At each split, generate at most this number of evenly spaced thresholds 33 | between the min and max feature values. The default behavior is 34 | to consider all midpoints between unique feature values. 35 | 36 | classes : sequence of int labels, optional (default = None). 37 | If None, then use the unique values of the classes given to 'fit'. 38 | 39 | feature_names : string list (default = None). 40 | Names to use for pretty printing. 41 | 42 | verbose : bool (default = False). 43 | Print debugging output. 
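    Example
    -------
    A minimal sketch, mirroring test_randomized_tree.py:

        t = RandomizedTree(min_leaf_size=1)
        t.fit(data, labels)
        pred = t.predict(data)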
44 | """ 45 | 46 | def __init__(self, 47 | num_features_per_node=None, 48 | min_leaf_size=10, 49 | max_height = 20, 50 | max_thresholds=None, 51 | regression = False, 52 | feature_names = None, 53 | verbose = False): 54 | self.root = None 55 | self.num_features_per_node = num_features_per_node 56 | self.min_leaf_size = min_leaf_size 57 | self.max_height = max_height 58 | self.classes = None 59 | self.feature_names = feature_names 60 | self.max_thresholds = max_thresholds 61 | if max_thresholds is None: 62 | self.get_thresholds = self.all_thresholds 63 | else: 64 | self.get_thresholds = self.random_threshold_subset 65 | self.regression = regression 66 | if regression: 67 | self.leaf_dtype = 'float' 68 | else: 69 | self.leaf_dtype = 'int' 70 | 71 | self.verbose = verbose 72 | 73 | 74 | def all_thresholds(self, x): 75 | """get midpoints between all unique values""" 76 | if len(x) > 1: 77 | return midpoints(np.unique(x)) 78 | else: 79 | return x 80 | 81 | def random_threshold_subset(self, x): 82 | total = len(x) 83 | k = self.max_thresholds 84 | nsamples = min(total, k) 85 | rand_subset = random.sample(x, nsamples) 86 | return self.all_thresholds(rand_subset) 87 | 88 | def _split(self, data, labels, m, height): 89 | n_samples = data.shape[0] 90 | if n_samples <= self.min_leaf_size or height > self.max_height: 91 | self.nleaves += 1 92 | if self.regression: 93 | return ConstantLeaf(np.mean(labels)) 94 | else: 95 | return ConstantLeaf(majority(labels, self.classes)) 96 | elif np.all(labels == labels[0]): 97 | self.nleaves += 1 98 | return ConstantLeaf(labels[0]) 99 | else: 100 | nfeatures = data.shape[1] 101 | # randomly draw m feature indices. 102 | # should be more efficient than explicitly constructing a permutation 103 | # vector and then keeping only the head elements 104 | random_feature_indices = random.sample(xrange(nfeatures), m) 105 | best_split_score = np.inf 106 | best_feature_idx = None 107 | best_thresh = None 108 | 109 | for feature_idx in random_feature_indices: 110 | feature_vec = data[:, feature_idx] 111 | thresholds = self.get_thresholds(feature_vec) 112 | 113 | 114 | if self.regression: 115 | thresh, combined_score = \ 116 | find_min_variance_split(feature_vec, thresholds, labels) 117 | else: 118 | thresh, combined_score = \ 119 | find_best_gini_split(self.classes, feature_vec, thresholds, labels) 120 | 121 | if combined_score < best_split_score: 122 | best_split_score = combined_score 123 | best_feature_idx = feature_idx 124 | best_thresh = thresh 125 | 126 | left_mask = data[:, best_feature_idx] < best_thresh 127 | right_mask = ~left_mask 128 | 129 | left_data = data[left_mask, :] 130 | right_data = data[right_mask, :] 131 | 132 | left_labels = labels[left_mask] 133 | right_labels = labels[right_mask] 134 | 135 | # get rid of references before recursion so data can be deleted 136 | del labels 137 | del data 138 | del random_feature_indices 139 | del left_mask 140 | del right_mask 141 | 142 | left_node = self._split(left_data, left_labels, m, height+1) 143 | right_node = self._split(right_data, right_labels, m, height+1) 144 | node = TreeNode(best_feature_idx, best_thresh, left_node, right_node) 145 | return node 146 | 147 | 148 | def fit(self, data, labels, feature_names = None): 149 | data = np.atleast_2d(data) 150 | labels = np.atleast_1d(labels) 151 | 152 | if not self.regression: 153 | self.classes = np.unique(labels) 154 | self.nclasses = len(self.classes) 155 | self.feature_names = feature_names 156 | self.nleaves = 0 157 | nrows = data.shape[0] 158 | nfeatures = 
data.shape[1] 159 | if self.num_features_per_node is None: 160 | m = int(round(math.log(nfeatures, 2))) 161 | else: 162 | m = self.num_features_per_node 163 | self.root = self._split(data, labels, m, 1) 164 | 165 | def predict(self, X): 166 | X = np.atleast_2d(X) 167 | n_samples = X.shape[0] 168 | # create an output array and let the tree nodes recursively fill it 169 | outputs = np.zeros(n_samples, dtype=self.leaf_dtype) 170 | mask = np.ones(n_samples, dtype='bool') 171 | self.root.fill_predict(X, outputs, mask) 172 | return outputs 173 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /treelearn/recipes.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 
11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 15 | # Lesser General Public License for more details. 16 | 17 | import numpy as np 18 | 19 | from sklearn.svm import LinearSVC 20 | from sklearn.linear_model import LogisticRegression, SGDClassifier 21 | from sklearn.linear_model import LinearRegression, Ridge 22 | 23 | from regression_ensemble import RegressionEnsemble 24 | from classifier_ensemble import ClassifierEnsemble 25 | from clustered_regression import ClusteredRegression 26 | from clustered_classifier import ClusteredClassifier 27 | from randomized_tree import RandomizedTree 28 | from oblique_tree import ObliqueTree 29 | 30 | 31 | def train_random_forest( 32 | X, 33 | Y, 34 | num_trees = 20, 35 | max_thresholds = 10, 36 | max_height = None, 37 | min_leaf_size = None, 38 | bagging_percent=0.65): 39 | """A random forest is a bagging ensemble of randomized trees, so it can 40 | be implemented by combining the ClassifierEnsemble (or RegressionEnsemble) and RandomizedTree objects. 41 | This function is just a helper to make your life easier. 42 | 43 | Parameters 44 | ---------- 45 | X : numpy array containing input data. 46 | Should have samples for rows and features for columns. 47 | 48 | Y : numpy array containing class labels for each sample 49 | 50 | num_trees : how big is the forest? 51 | 52 | max_thresholds : rather than evaluating all possible thresholds at each 53 | split, randomly sample this number of thresholds 54 | 55 | max_height : don't let tree grow past given height, inferred if omitted. 56 | 57 | min_leaf_size : don't split nodes smaller than this, inferred if omitted. 58 | 59 | bagging_percent : what subset of the data is each tree trained on? 60 | 61 | **tree_args : parameters for individual decision tree.
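Example
-------
Illustrative sketch only; ``x_train``, ``y_train`` and ``x_test`` are assumed to be
numpy arrays such as those returned by test_helpers.split_dataset:

>>> forest = train_random_forest(x_train, y_train, num_trees = 20)
>>> pred = forest.predict(x_test)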
62 | """ 63 | if isinstance(Y[0], float): 64 | regression = True 65 | else: 66 | regression = False 67 | 68 | if max_height is None: 69 | max_height = int(np.log2(X.shape[0])) + 1 70 | if min_leaf_size is None: 71 | min_leaf_size = int(np.log2(X.shape[0])) 72 | 73 | tree = RandomizedTree( 74 | regression = regression, 75 | max_thresholds = max_thresholds, 76 | max_height = max_height, 77 | min_leaf_size = min_leaf_size, 78 | ) 79 | 80 | if regression: 81 | forest = RegressionEnsemble( 82 | base_model = tree, 83 | num_models= num_trees, 84 | bagging_percent = bagging_percent 85 | ) 86 | else: 87 | forest = ClassifierEnsemble( 88 | base_model = tree, 89 | num_models = num_trees, 90 | bagging_percent = bagging_percent 91 | ) 92 | forest.fit(X,Y) 93 | return forest 94 | 95 | def gen_random_C(): 96 | return 10 ** (np.random.randn()) 97 | 98 | def mk_svm_tree(max_depth = 3, randomize_C = False, model_args = {}, tree_args = {}): 99 | randomize_split_params = {} 100 | randomize_leaf_params = {} 101 | if randomize_C: 102 | randomize_split_params['C'] = gen_random_C 103 | randomize_leaf_params['C'] = gen_random_C 104 | 105 | split_classifier = LinearSVC(**model_args) 106 | leaf_classifier = LinearSVC(**model_args) 107 | 108 | tree = ObliqueTree( 109 | max_depth = max_depth, 110 | split_classifier=split_classifier, 111 | leaf_model=leaf_classifier, 112 | randomize_split_params = randomize_split_params, 113 | randomize_leaf_params = randomize_leaf_params, 114 | **tree_args) 115 | return tree 116 | 117 | def train_svm_tree(X, Y, max_depth = 3, randomize_C = False, model_args = {}, tree_args={}): 118 | tree = mk_svm_tree(max_depth, randomize_C, model_args, tree_args) 119 | tree.fit(X, Y) 120 | return tree 121 | 122 | def train_svm_forest(X, Y, num_trees = 10, max_depth = 3, bagging_percent=0.65, randomize_C = False, model_args ={}, tree_args={}): 123 | """A random forest whose base classifier is an SVM-Tree (rather 124 | than splitting on individual features, we project each point onto a hyperplane) 125 | 126 | Parameters 127 | ---------- 128 | X : numpy array containing input data. 129 | Should have samples for rows and features for columns. 130 | 131 | Y : numpy array containing class labels for each sample 132 | 133 | num_trees : how big is the forest? 134 | 135 | bagging_percent : what subset of the data is each tree trained on?
136 | 137 | randomize_C : bool 138 | 139 | model_args : parameters for each SVM classifier 140 | 141 | tree_args : parameters for each tree of classifiers 142 | """ 143 | tree = mk_svm_tree(max_depth, randomize_C, model_args, tree_args) 144 | forest = ClassifierEnsemble( 145 | base_model = tree, 146 | num_models = num_trees, 147 | bagging_percent = bagging_percent) 148 | forest.fit(X,Y) 149 | return forest 150 | 151 | def gen_random_alpha(): 152 | return 10**(-np.random.random()*7) 153 | 154 | def mk_sgd_tree(n_examples=200000, max_depth=3, randomize_alpha=False, model_args={}, tree_args={}): 155 | randomize_split_params = {} 156 | randomize_leaf_params = {} 157 | if randomize_alpha: 158 | randomize_split_params['alpha'] = gen_random_alpha 159 | randomize_leaf_params['alpha'] = gen_random_alpha 160 | 161 | n_iter = np.ceil(10**6 / n_examples) 162 | split_classifier = SGDClassifier(n_iter = n_iter, shuffle=True, **model_args) 163 | leaf_classifier = SGDClassifier(n_iter = n_iter, shuffle=True, **model_args) 164 | 165 | tree = ObliqueTree( 166 | max_depth = max_depth, 167 | split_classifier=split_classifier, 168 | leaf_model=leaf_classifier, 169 | randomize_split_params = randomize_split_params, 170 | randomize_leaf_params = randomize_leaf_params, 171 | **tree_args 172 | ) 173 | return tree 174 | 175 | def train_sgd_tree(X, Y, max_depth=3, randomize_alpha=False, model_args = {}, tree_args={}): 176 | tree = mk_sgd_tree(X.shape[0], max_depth, randomize_alpha, model_args, tree_args) 177 | tree.fit(X, Y) 178 | return tree 179 | 180 | def train_sgd_forest(X, Y, 181 | num_trees = 20, 182 | max_depth = 3, 183 | bagging_percent=0.65, 184 | randomize_alpha=False, 185 | model_args = {}, 186 | tree_args= {}): 187 | """A random forest whose base classifier is a tree of SGD classifiers 188 | 189 | Parameters 190 | ---------- 191 | X : numpy array containing input data. 192 | Should have samples for rows and features for columns. 193 | 194 | Y : numpy array containing class labels for each sample 195 | 196 | num_trees : how big is the forest? 197 | 198 | bagging_percent : what subset of the data is each tree trained on? 
199 | 200 | randomize_alpha : bool 201 | 202 | model_args : parameters for each SGD classifier 203 | 204 | tree_args : parameters for each tree 205 | """ 206 | bagsize = bagging_percent * X.shape[0] 207 | tree = mk_sgd_tree(bagsize, max_depth, randomize_alpha, model_args, tree_args) 208 | forest = ClassifierEnsemble( 209 | base_model = tree, 210 | num_models = num_trees, 211 | bagging_percent = bagging_percent) 212 | forest.fit(X,Y) 213 | return forest 214 | 215 | def train_clustered_ols(X, Y, k = 20): 216 | """Cluster data and then train a linear regressor per cluster""" 217 | cr = ClusteredRegression(k) 218 | cr.fit(X, Y) 219 | return cr 220 | 221 | def train_clustered_svm(X, Y, k = 20, C = 1, verbose = True): 222 | base_model = LinearSVC(C = C) 223 | cc = ClusteredClassifier(k = k, base_model = base_model, verbose = verbose) 224 | cc.fit(X, Y) 225 | return cc 226 | 227 | def mk_clustered_svm_ensemble( 228 | num_models = 20, 229 | C = 1, 230 | k = 20, 231 | stacking= False, 232 | bagging_percent = 0.65, 233 | feature_subset_percent=0.5, 234 | verbose = True): 235 | 236 | base_model = LinearSVC(C = C) 237 | clustered_model = ClusteredClassifier(k, base_model = base_model, verbose=verbose) 238 | 239 | if stacking: 240 | stacking_model = LogisticRegression(fit_intercept=False) 241 | else: 242 | stacking_model = None 243 | 244 | return ClassifierEnsemble( 245 | base_model = clustered_model, 246 | num_models = num_models, 247 | bagging_percent = bagging_percent, 248 | feature_subset_percent = feature_subset_percent, 249 | stacking_model = stacking_model) 250 | 251 | def train_clustered_svm_ensemble( 252 | X, 253 | Y, 254 | num_models = 10, 255 | C = 1, 256 | k = 20, 257 | stacking= False, 258 | bagging_percent = 0.65, 259 | feature_subset_percent=0.5, 260 | verbose = True): 261 | ensemble = mk_clustered_svm_ensemble( 262 | num_models, 263 | C, 264 | k, 265 | stacking, 266 | bagging_percent, 267 | feature_subset_percent, 268 | verbose) 269 | ensemble.fit(X, Y) 270 | return ensemble 271 | 272 | def mk_clustered_regression_ensemble( 273 | num_models = 20, 274 | k = 20, 275 | stacking= False, 276 | additive=False, 277 | bagging_percent = 0.65, 278 | feature_subset_percent=0.5): 279 | 280 | 281 | clustered_model = ClusteredRegression(k=k, base_model = LinearRegression()) 282 | 283 | if stacking: 284 | stacking_model = LinearRegression(fit_intercept=False) 285 | else: 286 | stacking_model = None 287 | 288 | return RegressionEnsemble( 289 | base_model = clustered_model, 290 | num_models = num_models, 291 | bagging_percent = bagging_percent, 292 | feature_subset_percent = feature_subset_percent, 293 | stacking_model = stacking_model, 294 | additive = additive 295 | ) 296 | 297 | def train_clustered_regression_ensemble( 298 | X, 299 | Y, 300 | num_models=10, 301 | k = 20, 302 | stacking=False, 303 | additive=False, 304 | bagging_percent = 0.65, 305 | feature_subset_percent=0.5): 306 | ensemble = mk_clustered_regression_ensemble ( 307 | num_models = num_models, 308 | k = k, 309 | stacking = stacking, 310 | additive = additive, 311 | bagging_percent = bagging_percent, 312 | feature_subset_percent = feature_subset_percent 313 | ) 314 | ensemble.fit(X, Y) 315 | return ensemble 316 | 317 | def mk_additive_regression_forest( 318 | num_trees=50, 319 | bagging_percent = 0.65, 320 | feature_subset_percent = 0.5, 321 | max_height=3, 322 | min_leaf_size=10, 323 | max_thresholds=100): 324 | tree = RandomizedTree( 325 | max_height= max_height, 326 | min_leaf_size=min_leaf_size, 327 | max_thresholds=max_thresholds, 
328 | regression=True) 329 | forest = RegressionEnsemble( 330 | base_model = tree, 331 | num_models=num_trees, 332 | bagging_percent = bagging_percent, 333 | feature_subset_percent = feature_subset_percent, 334 | additive=True) 335 | return forest 336 | 337 | def train_additive_regression_forest(X, Y, 338 | num_trees=50, 339 | bagging_percent = 0.65, 340 | feature_subset_percent = 0.5, 341 | max_height=3, 342 | min_leaf_size=10, 343 | max_thresholds=50): 344 | forest = mk_additive_regression_forest( 345 | num_trees, 346 | bagging_percent, 347 | feature_subset_percent, 348 | max_height, 349 | min_leaf_size, 350 | max_thresholds) 351 | forest.fit(X,Y) 352 | return forest 353 | 354 | -------------------------------------------------------------------------------- /treelearn/tree_helpers.py: -------------------------------------------------------------------------------- 1 | # TreeLearn 2 | # 3 | # Copyright (C) Capital K Partners 4 | # Author: Alex Rubinsteyn 5 | # Contact: alex [at] capitalkpartners [dot] com 6 | # 7 | # This library is free software; you can redistribute it and/or 8 | # modify it under the terms of the GNU Lesser General Public 9 | # License as published by the Free Software Foundation; either 10 | # version 2.1 of the License, or (at your option) any later version. 11 | # 12 | # This library is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # Lesser General Public License for more details. 16 | 17 | 18 | import numpy as np 19 | import scipy 20 | import scipy.weave 21 | 22 | 23 | # some sklearn classifiers leave behind large data members after fitting 24 | # which make serialization a pain--- clear those fields 25 | def clear_sklearn_fields(clf): 26 | # stupid reuse of the same field--- clearing this on classifiers 27 | # causes them to crash 28 | #if hasattr(clf, 'label_'): 29 | # clf.label_ = None 30 | if hasattr(clf, 'sample_weight'): 31 | clf.sample_weight = None 32 | 33 | def midpoints(x): 34 | return (x[1:] + x[:-1])/2.0 35 | 36 | def majority(labels, classes=None): 37 | if classes is None: 38 | classes = np.unique(labels) 39 | votes = np.zeros(len(classes)) 40 | for i, c in enumerate(classes): 41 | votes[i] = np.sum(labels == c) 42 | majority_idx = np.argmax(votes) 43 | return classes[majority_idx] 44 | 45 | 46 | def slow_gini(classes, labels): 47 | sum_squares = 0.0 48 | n = len(labels) 49 | if n == 0: 50 | return 0.0 51 | else: 52 | n_squared = float(n * n) 53 | for c in classes: 54 | count = np.sum(labels == c) 55 | p_squared = count*count / n_squared 56 | sum_squares += p_squared 57 | return 1 - sum_squares 58 | 59 | inline = scipy.weave.inline 60 | def gini(classes, labels): 61 | code = """ 62 | int num_classes = Nclasses[0]; 63 | int n = Nlabels[0]; 64 | float sum_squares = 0.0f; 65 | for (int class_index = 0; class_index < num_classes; ++class_index) { 66 | int c = classes[class_index]; 67 | int count = 0; 68 | for (int i = 0; i < n; ++i) { 69 | if (labels[i] == c) { ++count; } 70 | } 71 | float p = ((float) count) / n; 72 | sum_squares += p * p; 73 | } 74 | return_val = 1.0f - sum_squares; 75 | """ 76 | return inline(code, ['classes', 'labels'], local_dict=None, verbose=2) 77 | 78 | 79 | def slow_eval_split(classes, feature_vec, thresh, labels): 80 | mask = feature_vec < thresh 81 | left_labels = labels[mask] 82 | right_labels = labels[~mask] 83 | left_score = slow_gini(classes, left_labels) 84 | right_score = 
slow_gini(classes, right_labels) 85 | n_left = len(left_labels) 86 | n_right = len(right_labels) 87 | n = float(n_left+n_right) 88 | 89 | combined_score = (n_left/n)*left_score + (n_right/n)*right_score 90 | return combined_score 91 | 92 | 93 | dtype2ctype = { 94 | np.dtype(np.float64): 'double', 95 | np.dtype(np.float32): 'float', 96 | np.dtype(np.int32): 'int', 97 | np.dtype(np.int16): 'short', 98 | np.dtype(np.bool): 'bool', 99 | } 100 | 101 | def eval_gini_split(classes, feature_vec, thresh, labels): 102 | left_mask = feature_vec < thresh 103 | code = """ 104 | 105 | int num_classes = Nclasses[0]; 106 | int nlabels = Nlabels[0]; 107 | 108 | float left_sum_squares = 0.0f; 109 | float right_sum_squares = 0.0f; 110 | 111 | 112 | /* total number of elements in the left and right of the split */ 113 | int total_left = 0; 114 | int total_right = 0; 115 | 116 | /* first pass for C = 0 to get total counts along with class-specific 117 | scores 118 | */ 119 | int left_class_count = 0; 120 | int right_class_count = 0; 121 | 122 | for (int i = 0; i < nlabels; ++i) { 123 | if (left_mask[i]) { 124 | total_left += 1; 125 | if (labels[i] == 0) left_class_count += 1; 126 | } else { 127 | total_right += 1; 128 | if (labels[i] == 0) right_class_count += 1; 129 | } 130 | } 131 | if (total_left > 0) { 132 | float left_p = ((float) left_class_count) / total_left; 133 | left_sum_squares += left_p * left_p; 134 | } 135 | if (total_right > 0) { 136 | float right_p = ((float) right_class_count) / total_right; 137 | right_sum_squares += right_p* right_p; 138 | } 139 | 140 | /* how many elements of each side have we counted in the score so far? */ 141 | int cumulative_left_count = left_class_count; 142 | int cumulative_right_count = right_class_count; 143 | 144 | /* if we have a multi-class problem iterate over rest of classes, 145 | except for the last class, whose size can be inferred from the 146 | difference between left_count and total_left 147 | */ 148 | for (int class_index = 1; class_index < num_classes - 1; ++class_index) { 149 | int c = classes[class_index]; 150 | left_class_count = 0; 151 | right_class_count = 0; 152 | 153 | for (int i = 0; i < nlabels; ++i) { 154 | if (labels[i] == c) { 155 | if (left_mask[i]) left_class_count += 1; 156 | else right_class_count += 1; 157 | } 158 | } 159 | cumulative_left_count += left_class_count; 160 | cumulative_right_count += right_class_count; 161 | 162 | if (total_left > 0) { 163 | float left_p = ((float) left_class_count) / total_left; 164 | left_sum_squares += left_p * left_p; 165 | } 166 | if (total_right > 0) { 167 | float right_p = ((float) right_class_count) / total_right; 168 | right_sum_squares += right_p* right_p; 169 | } 170 | } 171 | 172 | /* handle last class */ 173 | left_class_count = total_left - cumulative_left_count; 174 | right_class_count = total_right - cumulative_right_count; 175 | if (total_left > 0) { 176 | float left_p = ((float) left_class_count) / total_left; 177 | left_sum_squares += left_p * left_p; 178 | } 179 | if (total_right > 0) { 180 | float right_p = ((float) right_class_count) / total_right; 181 | right_sum_squares += right_p* right_p; 182 | } 183 | float left_gini = 1.0f - left_sum_squares; 184 | float right_gini = 1.0f - right_sum_squares; 185 | float total = (float) nlabels; 186 | float left_weight = total_left / total; 187 | float right_weight = total_right / total; 188 | 189 | return_val = left_weight * left_gini + right_weight * right_gini; 190 | """ 191 | return inline(code, ['classes', 'left_mask', 'labels'], \ 192 | 
local_dict=None, verbose=2) 193 | 194 | 195 | def slow_find_best_gini_split(classes, feature_vec, thresholds, labels): 196 | best_t = None 197 | best_score = np.inf 198 | 199 | n = len(labels) 200 | for t in thresholds: 201 | mask = feature_vec < t 202 | left_labels = labels[mask] 203 | right_labels = labels[~mask] 204 | left_score = slow_gini(classes, left_labels) 205 | right_score = slow_gini(classes, right_labels) 206 | n_left = len(left_labels) 207 | n_right = len(right_labels) 208 | nf = float(n) 209 | combined_score = (n_left/nf)*left_score + (n_right/nf)*right_score 210 | if combined_score < best_score: 211 | best_t = t 212 | best_score = combined_score 213 | return best_t, best_score 214 | 215 | def slow_find_min_variance_split(feature_vec, thresholds, ys): 216 | best_score = np.inf 217 | best_t = None 218 | for t in thresholds: 219 | mask = feature_vec < t 220 | left = ys[mask] 221 | right = ys[~mask] 222 | left_size = left.shape[0] 223 | right_size = right.shape[0] 224 | 225 | if left_size > 0 and right_size > 0: 226 | total = float(left_size + right_size) 227 | score = (left_size / total) * np.var(left) + (right_size / total) * np.var(right) 228 | if score < best_score: 229 | best_score = score 230 | best_t = t 231 | return best_t, best_score 232 | 233 | def find_min_variance_split(feature_vec, thresholds, ys): 234 | code = """ 235 | float best_score = 100000000000.0; 236 | double best_thresh = 0.0; 237 | int n_thresholds = Nthresholds[0]; 238 | int n_rows = Nys[0]; 239 | 240 | for (int t_index = 0; t_index < n_thresholds; t_index++) { 241 | double thresh = thresholds[t_index]; 242 | int counts[2] = {0,0}; 243 | float means[2] = {0.0f, 0.0f}; 244 | float sum_squares[2] = {0.0f, 0.0f}; 245 | float x; 246 | float delta; 247 | bool flag; 248 | for (int i = 0; i < n_rows; ++i) { 249 | x = ys[i]; 250 | flag = feature_vec[i] < thresh; 251 | counts[flag] += 1; 252 | delta = x - means[flag]; 253 | means[flag] += delta / counts[flag]; 254 | sum_squares[flag] += delta * (x - means[flag]); 255 | } 256 | if (counts[0] > 1 && counts[1] > 1) { 257 | float score = (sum_squares[0] + sum_squares[1]) / (counts[0] + counts[1]); 258 | if (score < best_score) { 259 | best_score = score; 260 | best_thresh = thresh; 261 | } 262 | } 263 | } 264 | py::tuple results(2); 265 | results[0] = best_thresh; 266 | results[1] = best_score; 267 | return_val = results; 268 | """ 269 | return inline(code, ['feature_vec', 'thresholds', 'ys'], local_dict=None, verbose=2) 270 | 271 | def find_best_gini_split(classes, feature_vec, thresholds, labels): 272 | code = """ 273 | int n_labels = Nlabels[0]; 274 | int n_classes = Nclasses[0]; 275 | int n_thresholds = Nthresholds[0]; 276 | 277 | float best_score = 10000000.0; 278 | double best_thresh = 0.0; 279 | 280 | /* loop over each possible threshold, compute GINI impurity for each, 281 | return the threshold with lowest score 282 | */ 283 | for (int t_index = 0; t_index < n_thresholds; t_index++) { 284 | double thresh = thresholds[t_index]; 285 | 286 | float left_sum_squares = 0.0f; 287 | float right_sum_squares = 0.0f; 288 | 289 | /* total number of elements in the left and right of the split */ 290 | int totals[2] = {0, 0}; 291 | int class_counts[2] = {0,0}; 292 | 293 | 294 | /* first pass for C = 0 to get total counts along with class-specific 295 | scores 296 | */ 297 | 298 | bool choice; 299 | bool correct_class; 300 | for (int i = 0; i < n_labels; ++i) { 301 | choice = feature_vec[i] < thresh; 302 | totals[choice] += 1; 303 | correct_class = (labels[i] == 0); 304 | 
class_counts[choice] += correct_class; 305 | } 306 | 307 | int total_left = totals[0]; 308 | int total_right = totals[1]; 309 | if (total_left > 0) { 310 | float left_p = ((float) class_counts[0]) / total_left; 311 | left_sum_squares += left_p * left_p; 312 | } 313 | if (total_right > 0) { 314 | float right_p = ((float) class_counts[1]) / total_right; 315 | right_sum_squares += right_p* right_p; 316 | } 317 | 318 | /* how many elements of each side have we counted in the score so far? */ 319 | int cumulative_left_count = class_counts[0]; 320 | int cumulative_right_count = class_counts[1]; 321 | 322 | /* if we have a multi-class problem iterate over rest of classes, 323 | except for the last class, whose size can be inferred from the 324 | difference between left_count and total_left 325 | */ 326 | for (int class_index = 1; class_index < n_classes - 1; ++class_index) { 327 | int c = classes[class_index]; 328 | class_counts[0] = 0; 329 | class_counts[1] = 0; 330 | 331 | for (int i = 0; i < n_labels; ++i) { 332 | choice = (feature_vec[i] <= thresh); 333 | correct_class = (labels[i] == c); 334 | class_counts[choice] += correct_class; 335 | } 336 | cumulative_left_count += class_counts[0]; 337 | cumulative_right_count += class_counts[1]; 338 | 339 | if (total_left > 0) { 340 | float left_p = ((float) class_counts[0]) / total_left; 341 | left_sum_squares += left_p * left_p; 342 | } 343 | if (total_right > 0) { 344 | float right_p = ((float) class_counts[1]) / total_right; 345 | right_sum_squares += right_p* right_p; 346 | } 347 | } 348 | 349 | /* handle last class */ 350 | float left_count = total_left - cumulative_left_count; 351 | float right_count = total_right - cumulative_right_count; 352 | if (total_left > 0) { 353 | float left_p = left_count / total_left; 354 | left_sum_squares += left_p * left_p; 355 | } 356 | if (total_right > 0) { 357 | float right_p = right_count / total_right; 358 | right_sum_squares += right_p* right_p; 359 | } 360 | float left_gini = 1.0f - left_sum_squares; 361 | float right_gini = 1.0f - right_sum_squares; 362 | float total = (float) n_labels; 363 | float left_weight = total_left / total; 364 | float right_weight = total_right / total; 365 | float score = left_weight * left_gini + right_weight * right_gini; 366 | if (score < best_score) { 367 | best_score = score; 368 | best_thresh = thresh; 369 | } 370 | } 371 | 372 | py::tuple results(2); 373 | results[0] = best_thresh; 374 | results[1] = best_score; 375 | return_val = results; 376 | """ 377 | return inline(code, ['classes', 'feature_vec', 'thresholds', 'labels'], verbose=2) 378 | -------------------------------------------------------------------------------- /distribute_setup.py: -------------------------------------------------------------------------------- 1 | #!python 2 | """Bootstrap distribute installation 3 | 4 | If you want to use setuptools in your package's setup.py, just include this 5 | file in the same directory with it, and add this to the top of your setup.py:: 6 | 7 | from distribute_setup import use_setuptools 8 | use_setuptools() 9 | 10 | If you want to require a specific version of setuptools, set a download 11 | mirror, or use an alternate download directory, you can do so by supplying 12 | the appropriate options to ``use_setuptools()``. 13 | 14 | This file can also be run as a script to install or upgrade setuptools. 
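For example (illustrative; ``version``, ``download_base``, ``to_dir`` and
``download_delay`` are keyword arguments accepted by ``use_setuptools``)::

    from distribute_setup import use_setuptools
    use_setuptools(version="0.6.19", download_delay=15)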
15 | """ 16 | import os 17 | import sys 18 | import time 19 | import fnmatch 20 | import tempfile 21 | import tarfile 22 | from distutils import log 23 | 24 | try: 25 | from site import USER_SITE 26 | except ImportError: 27 | USER_SITE = None 28 | 29 | try: 30 | import subprocess 31 | 32 | def _python_cmd(*args): 33 | args = (sys.executable,) + args 34 | return subprocess.call(args) == 0 35 | 36 | except ImportError: 37 | # will be used for python 2.3 38 | def _python_cmd(*args): 39 | args = (sys.executable,) + args 40 | # quoting arguments if windows 41 | if sys.platform == 'win32': 42 | def quote(arg): 43 | if ' ' in arg: 44 | return '"%s"' % arg 45 | return arg 46 | args = [quote(arg) for arg in args] 47 | return os.spawnl(os.P_WAIT, sys.executable, *args) == 0 48 | 49 | DEFAULT_VERSION = "0.6.19" 50 | DEFAULT_URL = "http://pypi.python.org/packages/source/d/distribute/" 51 | SETUPTOOLS_FAKED_VERSION = "0.6c11" 52 | 53 | SETUPTOOLS_PKG_INFO = """\ 54 | Metadata-Version: 1.0 55 | Name: setuptools 56 | Version: %s 57 | Summary: xxxx 58 | Home-page: xxx 59 | Author: xxx 60 | Author-email: xxx 61 | License: xxx 62 | Description: xxx 63 | """ % SETUPTOOLS_FAKED_VERSION 64 | 65 | 66 | def _install(tarball): 67 | # extracting the tarball 68 | tmpdir = tempfile.mkdtemp() 69 | log.warn('Extracting in %s', tmpdir) 70 | old_wd = os.getcwd() 71 | try: 72 | os.chdir(tmpdir) 73 | tar = tarfile.open(tarball) 74 | _extractall(tar) 75 | tar.close() 76 | 77 | # going in the directory 78 | subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) 79 | os.chdir(subdir) 80 | log.warn('Now working in %s', subdir) 81 | 82 | # installing 83 | log.warn('Installing Distribute') 84 | if not _python_cmd('setup.py', 'install'): 85 | log.warn('Something went wrong during the installation.') 86 | log.warn('See the error message above.') 87 | finally: 88 | os.chdir(old_wd) 89 | 90 | 91 | def _build_egg(egg, tarball, to_dir): 92 | # extracting the tarball 93 | tmpdir = tempfile.mkdtemp() 94 | log.warn('Extracting in %s', tmpdir) 95 | old_wd = os.getcwd() 96 | try: 97 | os.chdir(tmpdir) 98 | tar = tarfile.open(tarball) 99 | _extractall(tar) 100 | tar.close() 101 | 102 | # going in the directory 103 | subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) 104 | os.chdir(subdir) 105 | log.warn('Now working in %s', subdir) 106 | 107 | # building an egg 108 | log.warn('Building a Distribute egg in %s', to_dir) 109 | _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir) 110 | 111 | finally: 112 | os.chdir(old_wd) 113 | # returning the result 114 | log.warn(egg) 115 | if not os.path.exists(egg): 116 | raise IOError('Could not build the egg.') 117 | 118 | 119 | def _do_download(version, download_base, to_dir, download_delay): 120 | egg = os.path.join(to_dir, 'distribute-%s-py%d.%d.egg' 121 | % (version, sys.version_info[0], sys.version_info[1])) 122 | if not os.path.exists(egg): 123 | tarball = download_setuptools(version, download_base, 124 | to_dir, download_delay) 125 | _build_egg(egg, tarball, to_dir) 126 | sys.path.insert(0, egg) 127 | import setuptools 128 | setuptools.bootstrap_install_from = egg 129 | 130 | 131 | def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, 132 | to_dir=os.curdir, download_delay=15, no_fake=True): 133 | # making sure we use the absolute path 134 | to_dir = os.path.abspath(to_dir) 135 | was_imported = 'pkg_resources' in sys.modules or \ 136 | 'setuptools' in sys.modules 137 | try: 138 | try: 139 | import pkg_resources 140 | if not hasattr(pkg_resources, '_distribute'): 141 | 
if not no_fake: 142 | _fake_setuptools() 143 | raise ImportError 144 | except ImportError: 145 | return _do_download(version, download_base, to_dir, download_delay) 146 | try: 147 | pkg_resources.require("distribute>="+version) 148 | return 149 | except pkg_resources.VersionConflict: 150 | e = sys.exc_info()[1] 151 | if was_imported: 152 | sys.stderr.write( 153 | "The required version of distribute (>=%s) is not available,\n" 154 | "and can't be installed while this script is running. Please\n" 155 | "install a more recent version first, using\n" 156 | "'easy_install -U distribute'." 157 | "\n\n(Currently using %r)\n" % (version, e.args[0])) 158 | sys.exit(2) 159 | else: 160 | del pkg_resources, sys.modules['pkg_resources'] # reload ok 161 | return _do_download(version, download_base, to_dir, 162 | download_delay) 163 | except pkg_resources.DistributionNotFound: 164 | return _do_download(version, download_base, to_dir, 165 | download_delay) 166 | finally: 167 | if not no_fake: 168 | _create_fake_setuptools_pkg_info(to_dir) 169 | 170 | def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, 171 | to_dir=os.curdir, delay=15): 172 | """Download distribute from a specified location and return its filename 173 | 174 | `version` should be a valid distribute version number that is available 175 | as an egg for download under the `download_base` URL (which should end 176 | with a '/'). `to_dir` is the directory where the egg will be downloaded. 177 | `delay` is the number of seconds to pause before an actual download 178 | attempt. 179 | """ 180 | # making sure we use the absolute path 181 | to_dir = os.path.abspath(to_dir) 182 | try: 183 | from urllib.request import urlopen 184 | except ImportError: 185 | from urllib2 import urlopen 186 | tgz_name = "distribute-%s.tar.gz" % version 187 | url = download_base + tgz_name 188 | saveto = os.path.join(to_dir, tgz_name) 189 | src = dst = None 190 | if not os.path.exists(saveto): # Avoid repeated downloads 191 | try: 192 | log.warn("Downloading %s", url) 193 | src = urlopen(url) 194 | # Read/write all in one block, so we don't create a corrupt file 195 | # if the download is interrupted. 
196 | data = src.read() 197 | dst = open(saveto, "wb") 198 | dst.write(data) 199 | finally: 200 | if src: 201 | src.close() 202 | if dst: 203 | dst.close() 204 | return os.path.realpath(saveto) 205 | 206 | def _no_sandbox(function): 207 | def __no_sandbox(*args, **kw): 208 | try: 209 | from setuptools.sandbox import DirectorySandbox 210 | if not hasattr(DirectorySandbox, '_old'): 211 | def violation(*args): 212 | pass 213 | DirectorySandbox._old = DirectorySandbox._violation 214 | DirectorySandbox._violation = violation 215 | patched = True 216 | else: 217 | patched = False 218 | except ImportError: 219 | patched = False 220 | 221 | try: 222 | return function(*args, **kw) 223 | finally: 224 | if patched: 225 | DirectorySandbox._violation = DirectorySandbox._old 226 | del DirectorySandbox._old 227 | 228 | return __no_sandbox 229 | 230 | def _patch_file(path, content): 231 | """Will backup the file then patch it""" 232 | existing_content = open(path).read() 233 | if existing_content == content: 234 | # already patched 235 | log.warn('Already patched.') 236 | return False 237 | log.warn('Patching...') 238 | _rename_path(path) 239 | f = open(path, 'w') 240 | try: 241 | f.write(content) 242 | finally: 243 | f.close() 244 | return True 245 | 246 | _patch_file = _no_sandbox(_patch_file) 247 | 248 | def _same_content(path, content): 249 | return open(path).read() == content 250 | 251 | def _rename_path(path): 252 | new_name = path + '.OLD.%s' % time.time() 253 | log.warn('Renaming %s into %s', path, new_name) 254 | os.rename(path, new_name) 255 | return new_name 256 | 257 | def _remove_flat_installation(placeholder): 258 | if not os.path.isdir(placeholder): 259 | log.warn('Unkown installation at %s', placeholder) 260 | return False 261 | found = False 262 | for file in os.listdir(placeholder): 263 | if fnmatch.fnmatch(file, 'setuptools*.egg-info'): 264 | found = True 265 | break 266 | if not found: 267 | log.warn('Could not locate setuptools*.egg-info') 268 | return 269 | 270 | log.warn('Removing elements out of the way...') 271 | pkg_info = os.path.join(placeholder, file) 272 | if os.path.isdir(pkg_info): 273 | patched = _patch_egg_dir(pkg_info) 274 | else: 275 | patched = _patch_file(pkg_info, SETUPTOOLS_PKG_INFO) 276 | 277 | if not patched: 278 | log.warn('%s already patched.', pkg_info) 279 | return False 280 | # now let's move the files out of the way 281 | for element in ('setuptools', 'pkg_resources.py', 'site.py'): 282 | element = os.path.join(placeholder, element) 283 | if os.path.exists(element): 284 | _rename_path(element) 285 | else: 286 | log.warn('Could not find the %s element of the ' 287 | 'Setuptools distribution', element) 288 | return True 289 | 290 | _remove_flat_installation = _no_sandbox(_remove_flat_installation) 291 | 292 | def _after_install(dist): 293 | log.warn('After install bootstrap.') 294 | placeholder = dist.get_command_obj('install').install_purelib 295 | _create_fake_setuptools_pkg_info(placeholder) 296 | 297 | def _create_fake_setuptools_pkg_info(placeholder): 298 | if not placeholder or not os.path.exists(placeholder): 299 | log.warn('Could not find the install location') 300 | return 301 | pyver = '%s.%s' % (sys.version_info[0], sys.version_info[1]) 302 | setuptools_file = 'setuptools-%s-py%s.egg-info' % \ 303 | (SETUPTOOLS_FAKED_VERSION, pyver) 304 | pkg_info = os.path.join(placeholder, setuptools_file) 305 | if os.path.exists(pkg_info): 306 | log.warn('%s already exists', pkg_info) 307 | return 308 | 309 | log.warn('Creating %s', pkg_info) 310 | f = 
open(pkg_info, 'w') 311 | try: 312 | f.write(SETUPTOOLS_PKG_INFO) 313 | finally: 314 | f.close() 315 | 316 | pth_file = os.path.join(placeholder, 'setuptools.pth') 317 | log.warn('Creating %s', pth_file) 318 | f = open(pth_file, 'w') 319 | try: 320 | f.write(os.path.join(os.curdir, setuptools_file)) 321 | finally: 322 | f.close() 323 | 324 | _create_fake_setuptools_pkg_info = _no_sandbox(_create_fake_setuptools_pkg_info) 325 | 326 | def _patch_egg_dir(path): 327 | # let's check if it's already patched 328 | pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO') 329 | if os.path.exists(pkg_info): 330 | if _same_content(pkg_info, SETUPTOOLS_PKG_INFO): 331 | log.warn('%s already patched.', pkg_info) 332 | return False 333 | _rename_path(path) 334 | os.mkdir(path) 335 | os.mkdir(os.path.join(path, 'EGG-INFO')) 336 | pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO') 337 | f = open(pkg_info, 'w') 338 | try: 339 | f.write(SETUPTOOLS_PKG_INFO) 340 | finally: 341 | f.close() 342 | return True 343 | 344 | _patch_egg_dir = _no_sandbox(_patch_egg_dir) 345 | 346 | def _before_install(): 347 | log.warn('Before install bootstrap.') 348 | _fake_setuptools() 349 | 350 | 351 | def _under_prefix(location): 352 | if 'install' not in sys.argv: 353 | return True 354 | args = sys.argv[sys.argv.index('install')+1:] 355 | for index, arg in enumerate(args): 356 | for option in ('--root', '--prefix'): 357 | if arg.startswith('%s=' % option): 358 | top_dir = arg.split('root=')[-1] 359 | return location.startswith(top_dir) 360 | elif arg == option: 361 | if len(args) > index: 362 | top_dir = args[index+1] 363 | return location.startswith(top_dir) 364 | if arg == '--user' and USER_SITE is not None: 365 | return location.startswith(USER_SITE) 366 | return True 367 | 368 | 369 | def _fake_setuptools(): 370 | log.warn('Scanning installed packages') 371 | try: 372 | import pkg_resources 373 | except ImportError: 374 | # we're cool 375 | log.warn('Setuptools or Distribute does not seem to be installed.') 376 | return 377 | ws = pkg_resources.working_set 378 | try: 379 | setuptools_dist = ws.find(pkg_resources.Requirement.parse('setuptools', 380 | replacement=False)) 381 | except TypeError: 382 | # old distribute API 383 | setuptools_dist = ws.find(pkg_resources.Requirement.parse('setuptools')) 384 | 385 | if setuptools_dist is None: 386 | log.warn('No setuptools distribution found') 387 | return 388 | # detecting if it was already faked 389 | setuptools_location = setuptools_dist.location 390 | log.warn('Setuptools installation detected at %s', setuptools_location) 391 | 392 | # if --root or --preix was provided, and if 393 | # setuptools is not located in them, we don't patch it 394 | if not _under_prefix(setuptools_location): 395 | log.warn('Not patching, --root or --prefix is installing Distribute' 396 | ' in another location') 397 | return 398 | 399 | # let's see if its an egg 400 | if not setuptools_location.endswith('.egg'): 401 | log.warn('Non-egg installation') 402 | res = _remove_flat_installation(setuptools_location) 403 | if not res: 404 | return 405 | else: 406 | log.warn('Egg installation') 407 | pkg_info = os.path.join(setuptools_location, 'EGG-INFO', 'PKG-INFO') 408 | if (os.path.exists(pkg_info) and 409 | _same_content(pkg_info, SETUPTOOLS_PKG_INFO)): 410 | log.warn('Already patched.') 411 | return 412 | log.warn('Patching...') 413 | # let's create a fake egg replacing setuptools one 414 | res = _patch_egg_dir(setuptools_location) 415 | if not res: 416 | return 417 | log.warn('Patched done.') 418 | 
_relaunch() 419 | 420 | 421 | def _relaunch(): 422 | log.warn('Relaunching...') 423 | # we have to relaunch the process 424 | # pip marker to avoid a relaunch bug 425 | if sys.argv[:3] == ['-c', 'install', '--single-version-externally-managed']: 426 | sys.argv[0] = 'setup.py' 427 | args = [sys.executable] + sys.argv 428 | sys.exit(subprocess.call(args)) 429 | 430 | 431 | def _extractall(self, path=".", members=None): 432 | """Extract all members from the archive to the current working 433 | directory and set owner, modification time and permissions on 434 | directories afterwards. `path' specifies a different directory 435 | to extract to. `members' is optional and must be a subset of the 436 | list returned by getmembers(). 437 | """ 438 | import copy 439 | import operator 440 | from tarfile import ExtractError 441 | directories = [] 442 | 443 | if members is None: 444 | members = self 445 | 446 | for tarinfo in members: 447 | if tarinfo.isdir(): 448 | # Extract directories with a safe mode. 449 | directories.append(tarinfo) 450 | tarinfo = copy.copy(tarinfo) 451 | tarinfo.mode = 448 # decimal for oct 0700 452 | self.extract(tarinfo, path) 453 | 454 | # Reverse sort directories. 455 | if sys.version_info < (2, 4): 456 | def sorter(dir1, dir2): 457 | return cmp(dir1.name, dir2.name) 458 | directories.sort(sorter) 459 | directories.reverse() 460 | else: 461 | directories.sort(key=operator.attrgetter('name'), reverse=True) 462 | 463 | # Set correct owner, mtime and filemode on directories. 464 | for tarinfo in directories: 465 | dirpath = os.path.join(path, tarinfo.name) 466 | try: 467 | self.chown(tarinfo, dirpath) 468 | self.utime(tarinfo, dirpath) 469 | self.chmod(tarinfo, dirpath) 470 | except ExtractError: 471 | e = sys.exc_info()[1] 472 | if self.errorlevel > 1: 473 | raise 474 | else: 475 | self._dbg(1, "tarfile: %s" % e) 476 | 477 | 478 | def main(argv, version=DEFAULT_VERSION): 479 | """Install or upgrade setuptools and EasyInstall""" 480 | tarball = download_setuptools() 481 | _install(tarball) 482 | 483 | 484 | if __name__ == '__main__': 485 | main(sys.argv[1:]) 486 | --------------------------------------------------------------------------------
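A minimal end-to-end sketch of how the recipes above fit together (illustrative only, not part of the repository; it assumes the treelearn package is importable and uses scikit-learn's bundled iris data together with the split_dataset helper from treelearn/test_helpers.py):

    import numpy as np
    import sklearn.datasets
    from treelearn import recipes
    from treelearn.test_helpers import split_dataset

    # hold out half of the iris data for testing
    iris = sklearn.datasets.load_iris()
    x_train, y_train, x_test, y_test = split_dataset(iris.data, iris.target, prct_train=0.5)

    # train_random_forest infers classification vs. regression from the label type of Y
    forest = recipes.train_random_forest(x_train, y_train, num_trees=20)

    # the returned ensemble exposes the usual fit/predict interface
    pred = forest.predict(x_test)
    accuracy = np.mean(pred == y_test)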