├── ivalice ├── __init__.py ├── impl │ ├── __init__.py │ ├── setup.py │ ├── tests │ │ ├── test_adaboost.py │ │ ├── test_lambda_mart.py │ │ ├── test_forest.py │ │ ├── test_sort.py │ │ ├── test_mcrank.py │ │ ├── test_gradient_boosting.py │ │ └── test_tree.py │ ├── adaboost.py │ ├── forest.py │ ├── lambda_mart.py │ ├── sort.py │ ├── mcrank.py │ ├── gradient_boosting.py │ └── tree.py ├── ranking.py ├── regression.py ├── classification.py └── setup.py ├── .gitignore ├── README.rst ├── benchmarks └── bench_rf.py ├── setup.py └── examples └── plot_gradient_boosting_classification.py /ivalice/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ivalice/impl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ivalice/ranking.py: -------------------------------------------------------------------------------- 1 | from .impl.lambda_mart import LambdaMART 2 | from .impl.mcrank import McRank 3 | from .impl.mcrank import OrdinalMcRank 4 | -------------------------------------------------------------------------------- /ivalice/regression.py: -------------------------------------------------------------------------------- 1 | from .impl.forest import RFRegressor 2 | from .impl.gradient_boosting import GBRegressor 3 | #from .impl.tree import TreeRegressor 4 | -------------------------------------------------------------------------------- /ivalice/classification.py: -------------------------------------------------------------------------------- 1 | from .impl.adaboost import AdaBoostClassifier 2 | from .impl.gradient_boosting import GBClassifier 3 | #from .impl.tree import TreeClassifier 4 | -------------------------------------------------------------------------------- /ivalice/setup.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import numpy 4 | 5 | 6 | def configuration(parent_package='', top_path=None): 7 | from numpy.distutils.misc_util import Configuration 8 | 9 | config = Configuration('ivalice', parent_package, top_path) 10 | 11 | config.add_subpackage('impl') 12 | 13 | return config 14 | 15 | if __name__ == '__main__': 16 | from numpy.distutils.core import setup 17 | setup(**configuration(top_path='').todict()) 18 | -------------------------------------------------------------------------------- /ivalice/impl/setup.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import numpy 4 | 5 | 6 | def configuration(parent_package='', top_path=None): 7 | from numpy.distutils.misc_util import Configuration 8 | 9 | config = Configuration('impl', parent_package, top_path) 10 | 11 | config.add_subpackage('tests') 12 | 13 | return config 14 | 15 | if __name__ == '__main__': 16 | from numpy.distutils.core import setup 17 | setup(**configuration(top_path='').todict()) 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *~ 4 | .#* 5 | *.swp 6 | .DS_Store 7 | build 8 | ivalice/**/*.html 9 | 10 | dist/ 11 | doc/_build/ 12 | doc/generated/ 13 | doc/auto_examples/ 14 | doc/modules/generated/ 15 | doc/datasets/generated/ 16 | pip-log.txt 17 | ivalice.egg-info/ 18 | .coverage 19 | coverage 20 | 
tags 21 | coverages.zip 22 | samples.zip 23 | doc/coverages.zip 24 | doc/samples.zip 25 | coverages 26 | samples 27 | doc/coverages 28 | doc/samples 29 | 30 | 31 | *.nt.bz2 32 | *.tar.gz 33 | *.tgz 34 | joblib 35 | -------------------------------------------------------------------------------- /ivalice/impl/tests/test_adaboost.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.datasets import make_classification 4 | from sklearn.tree import DecisionTreeClassifier 5 | from sklearn.utils.testing import assert_equal 6 | 7 | from ivalice.classification import AdaBoostClassifier 8 | 9 | X_bin, y_bin = make_classification(n_samples=200, n_classes=2, random_state=0) 10 | 11 | 12 | def test_adaboost_binary(): 13 | tree = DecisionTreeClassifier(max_depth=1, random_state=0) 14 | clf = AdaBoostClassifier(tree, n_estimators=10) 15 | clf.fit(X_bin, y_bin) 16 | assert_equal(clf.score(X_bin, y_bin), 0.96) 17 | -------------------------------------------------------------------------------- /ivalice/impl/tests/test_lambda_mart.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.datasets import load_diabetes 4 | from sklearn.tree import DecisionTreeRegressor 5 | from sklearn.utils.testing import assert_almost_equal 6 | 7 | from ivalice.ranking import LambdaMART 8 | 9 | data = load_diabetes() 10 | X, y = data.data, data.target 11 | y /= (y.max() - y.min()) 12 | 13 | 14 | def test_lambda_mart_ndcg(): 15 | for gains in ("linear", "exponential"): 16 | reg = DecisionTreeRegressor() 17 | lm = LambdaMART(reg, n_estimators=10, max_rank=10, gains=gains) 18 | lm.fit(X, y) 19 | ndcg = lm.score(X, y) 20 | assert_almost_equal(ndcg, 1.0) 21 | 22 | 23 | def test_lambda_mart_ndcg_all(): 24 | for gains in ("linear", "exponential"): 25 | reg = DecisionTreeRegressor() 26 | lm = LambdaMART(reg, n_estimators=10, max_rank=None, gains=gains) 27 | lm.fit(X, y) 28 | ndcg = lm.score(X, y) 29 | assert_almost_equal(ndcg, 1.0) 30 | -------------------------------------------------------------------------------- /ivalice/impl/tests/test_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.datasets import load_diabetes 4 | from sklearn.datasets import make_regression 5 | from sklearn.ensemble import RandomForestRegressor as skRF 6 | from sklearn.utils.testing import assert_almost_equal 7 | 8 | from ivalice.regression import RFRegressor 9 | 10 | diabetes = load_diabetes() 11 | X_d, y_d = diabetes.data, diabetes.target 12 | 13 | 14 | def test_regression(): 15 | rf = skRF(n_estimators=100, 16 | max_features=0.6, 17 | max_depth=3, 18 | bootstrap=False, 19 | random_state=0) 20 | rf.fit(X_d, y_d) 21 | y_pred = rf.predict(X_d) 22 | sk = np.mean((y_d - y_pred) ** 2) 23 | 24 | 25 | rf = RFRegressor(n_estimators=100, 26 | max_features=0.6, 27 | max_depth=3, 28 | bootstrap=False, 29 | random_state=0) 30 | 31 | rf.fit(X_d, y_d) 32 | y_pred = rf.predict(X_d) 33 | iv = np.mean((y_d - y_pred) ** 2) 34 | 35 | assert_almost_equal(sk, 2692.3, 1) 36 | assert_almost_equal(iv, 2689.9, 1) 37 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | ivalice 4 | ======= 5 | 6 | Boosting and ensemble learning library in Python. 
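
A quick example (a minimal sketch based on the usage in the test suite; any
scikit-learn regressor can serve as the base estimator)::

    from sklearn.datasets import load_diabetes
    from sklearn.tree import DecisionTreeRegressor

    from ivalice.regression import GBRegressor

    data = load_diabetes()
    X, y = data.data, data.target

    base = DecisionTreeRegressor(max_depth=3)
    reg = GBRegressor(base, n_estimators=100, learning_rate=0.1)
    reg.fit(X, y)
    y_pred = reg.predict(X)
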
7 | 8 | Algorithms supported: 9 | 10 | - Classification and regression trees (work in progress) 11 | - Random forests (work in progress) 12 | - Gradient Boosting 13 | - McRank 14 | - LambdaMART 15 | 16 | ivalice follows the `scikit-learn `_ API conventions. 17 | Computationally demanding parts are implemented using `Numba 18 | `_. 19 | 20 | Dependencies 21 | ------------ 22 | 23 | ivalice needs Python >= 2.7, setuptools, Numpy >= 1.3, SciPy >= 0.7, 24 | scikit-learn >= 0.15.1 and Numba >= 0.13.4. 25 | 26 | To run the tests you will also need nose >= 0.10. 27 | 28 | Installation 29 | ------------ 30 | 31 | To install ivalice from pip, type:: 32 | 33 | pip install https://github.com/mblondel/ivalice/archive/master.zip 34 | 35 | To install ivalice from source, type:: 36 | 37 | git clone https://github.com/mblondel/ivalice.git 38 | cd ivalice 39 | sudo python setup.py install 40 | 41 | On Github 42 | --------- 43 | 44 | https://github.com/mblondel/ivalice 45 | 46 | Author 47 | ------ 48 | 49 | Mathieu Blondel, 2014-present 50 | -------------------------------------------------------------------------------- /benchmarks/bench_rf.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import fetch_covtype 6 | from sklearn.cross_validation import train_test_split 7 | from sklearn.ensemble import RandomForestRegressor 8 | from sklearn.metrics import mean_squared_error 9 | 10 | from ivalice.regression import RFRegressor 11 | 12 | data = fetch_covtype(download_if_missing=True, shuffle=True, random_state=0) 13 | X, y = data.data, data.target 14 | 15 | n_samples = 10000 16 | mask = y <= 2 17 | Xb = X[mask][:n_samples] 18 | yb = y[mask][:n_samples] 19 | 20 | Xb_tr, Xb_te, yb_tr, yb_te = train_test_split(Xb, yb, train_size=0.75, 21 | test_size=0.2, random_state=0) 22 | 23 | rf = RandomForestRegressor(n_estimators=100, 24 | max_depth=3, 25 | max_features=0.6) 26 | start = time.time() 27 | rf.fit(Xb_tr, yb_tr) 28 | print "RandomForestRegressor" 29 | print time.time() - start, "seconds" 30 | y_pred = rf.predict(Xb_te) 31 | print mean_squared_error(yb_te, y_pred) 32 | print 33 | 34 | rf = RFRegressor(n_estimators=100, 35 | max_depth=3, 36 | max_features=0.6) 37 | start = time.time() 38 | rf.fit(Xb_tr, yb_tr) 39 | print "RandomForestRegressor" 40 | print time.time() - start, "seconds" 41 | y_pred = rf.predict(Xb_te) 42 | print mean_squared_error(yb_te, y_pred) 43 | -------------------------------------------------------------------------------- /ivalice/impl/tests/test_sort.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.utils.testing import assert_array_equal 4 | 5 | from ivalice.impl.sort import quicksort 6 | from ivalice.impl.sort import heapsort 7 | 8 | 9 | def test_quicksort(): 10 | rng = np.random.RandomState(0) 11 | values = rng.rand(500) 12 | indices = np.arange(len(values)).astype(np.int32) 13 | 14 | sorted_idx = np.argsort(values) 15 | sorted_values = values[sorted_idx] 16 | sorted_indices = indices[sorted_idx] 17 | 18 | quicksort(values, indices, 0, len(values) - 1) 19 | 20 | assert_array_equal(sorted_values, values) 21 | assert_array_equal(sorted_indices, indices) 22 | 23 | 24 | def test_quicksort_one(): 25 | values = np.arange(1).astype(np.float64) 26 | indices = np.arange(1).astype(np.int32) 27 | quicksort(values, indices, 0, len(values) - 1) 28 | 29 | 30 | def test_heapsort(): 31 | rng = np.random.RandomState(0) 
32 | values = rng.rand(500) 33 | indices = np.arange(len(values)).astype(np.int32) 34 | 35 | sorted_idx = np.argsort(values) 36 | sorted_values = values[sorted_idx] 37 | sorted_indices = indices[sorted_idx] 38 | 39 | heapsort(values, indices, len(values)) 40 | 41 | assert_array_equal(sorted_values, values) 42 | assert_array_equal(sorted_indices, indices) 43 | 44 | 45 | def test_heapsort_one(): 46 | values = np.arange(1).astype(np.float64) 47 | indices = np.arange(1).astype(np.int32) 48 | heapsort(values, indices, len(values)) 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # 3 | # Copyright (C) 2014 Mathieu Blondel 4 | 5 | import sys 6 | import os 7 | 8 | DISTNAME = 'ivalice' 9 | DESCRIPTION = "Boosting and ensemble learning library in Python." 10 | LONG_DESCRIPTION = open('README.rst').read() 11 | MAINTAINER = 'Mathieu Blondel' 12 | MAINTAINER_EMAIL = 'mathieu@mblondel.org' 13 | URL = 'https://github.com/mblondel/ivalice' 14 | LICENSE = 'new BSD' 15 | DOWNLOAD_URL = 'https://github.com/mblondel/ivalice' 16 | VERSION = '0.1-git' 17 | 18 | import setuptools # we are using a setuptools namespace 19 | from numpy.distutils.core import setup 20 | 21 | 22 | def configuration(parent_package='', top_path=None): 23 | if os.path.exists('MANIFEST'): 24 | os.remove('MANIFEST') 25 | 26 | from numpy.distutils.misc_util import Configuration 27 | config = Configuration(None, parent_package, top_path) 28 | 29 | config.add_subpackage('ivalice') 30 | 31 | return config 32 | 33 | if __name__ == "__main__": 34 | 35 | old_path = os.getcwd() 36 | local_path = os.path.dirname(os.path.abspath(sys.argv[0])) 37 | 38 | os.chdir(local_path) 39 | sys.path.insert(0, local_path) 40 | 41 | setup(configuration=configuration, 42 | name=DISTNAME, 43 | maintainer=MAINTAINER, 44 | include_package_data=True, 45 | maintainer_email=MAINTAINER_EMAIL, 46 | description=DESCRIPTION, 47 | license=LICENSE, 48 | url=URL, 49 | version=VERSION, 50 | download_url=DOWNLOAD_URL, 51 | long_description=LONG_DESCRIPTION, 52 | zip_safe=False, # the package can run out of an .egg file 53 | classifiers=[ 54 | 'Intended Audience :: Science/Research', 55 | 'Intended Audience :: Developers', 56 | 'License :: OSI Approved', 57 | 'Programming Language :: C', 58 | 'Programming Language :: Python', 59 | 'Topic :: Software Development', 60 | 'Topic :: Scientific/Engineering', 61 | 'Operating System :: Microsoft :: Windows', 62 | 'Operating System :: POSIX', 63 | 'Operating System :: Unix', 64 | 'Operating System :: MacOS' 65 | ] 66 | ) 67 | -------------------------------------------------------------------------------- /ivalice/impl/adaboost.py: -------------------------------------------------------------------------------- 1 | """AdaBoost""" 2 | 3 | # Author: Mathieu Blondel 4 | # License: BSD 3 clause 5 | 6 | import numpy as np 7 | 8 | from sklearn.base import BaseEstimator, ClassifierMixin, clone 9 | from sklearn.preprocessing import LabelBinarizer 10 | from sklearn.metrics import accuracy_score 11 | 12 | 13 | class AdaBoostClassifier(BaseEstimator, ClassifierMixin): 14 | 15 | def __init__(self, estimator, n_estimators=10): 16 | self.estimator = estimator 17 | self.n_estimators = n_estimators 18 | 19 | def fit(self, X, y): 20 | n_samples = X.shape[0] 21 | 22 | weights = np.ones(n_samples, dtype=np.float64) / n_samples 23 | 24 | self._lb = LabelBinarizer(neg_label=-1) 25 | y = 
self._lb.fit_transform(y).ravel() 26 | 27 | self.estimators_ = np.zeros(self.n_estimators, dtype=np.object) 28 | self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64) 29 | 30 | y_pred_ = np.zeros(n_samples, dtype=np.float64) 31 | 32 | for it in xrange(self.n_estimators): 33 | est = clone(self.estimator) 34 | est = est.fit(X, y, sample_weight=weights) 35 | 36 | y_pred = est.predict(X) 37 | err = 1 - accuracy_score(y, y_pred, sample_weight=weights) 38 | 39 | if err == 0: 40 | self.estimator_weights_[it] = 1 41 | self.estimators_[it] = est 42 | break 43 | 44 | alpha = 0.5 * np.log((1 - err) / err) 45 | 46 | #weights *= np.exp(- alpha * y * y_pred) 47 | #weights /= weights.sum() 48 | 49 | y_pred_ += alpha * y_pred 50 | weights = np.exp(-y * y_pred_) 51 | #weights = 1.0 / (1 + np.exp(y * y_pred_)) # logit boost 52 | weights /= weights.sum() 53 | 54 | self.estimator_weights_[it] = alpha 55 | self.estimators_[it] = est 56 | 57 | 58 | return self 59 | 60 | def predict(self, X): 61 | y_pred = np.zeros(X.shape[0], dtype=np.float64) 62 | for it in xrange(self.n_estimators): 63 | if self.estimator_weights_[it] != 0: 64 | pred = self.estimators_[it].predict(X) 65 | y_pred += self.estimator_weights_[it] * pred 66 | y_pred = np.sign(y_pred) 67 | return self._lb.inverse_transform(y_pred.reshape(-1, 1)) 68 | -------------------------------------------------------------------------------- /examples/plot_gradient_boosting_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | ================================ 3 | Gradient boosting classification 4 | ================================ 5 | 6 | This example compares the squared hinge and log losses in gradient boosting. 7 | """ 8 | 9 | print __doc__ 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | 14 | from sklearn.datasets import load_iris 15 | from sklearn.cross_validation import train_test_split 16 | from sklearn.tree import DecisionTreeClassifier 17 | 18 | from ivalice.classification import GBClassifier 19 | 20 | n_estimators = 10 21 | 22 | class Callback(object): 23 | 24 | def __init__(self, X_tr, y_tr, X_te, y_te): 25 | self.X_tr = X_tr 26 | self.y_tr = y_tr 27 | self.X_te = X_te 28 | self.y_te = y_te 29 | self.accuracy_tr = [] 30 | self.accuracy_te = [] 31 | 32 | def __call__(self, est): 33 | y_pred_tr = est.predict(X_tr) 34 | y_pred_te = est.predict(X_te) 35 | self.accuracy_tr.append(np.mean(self.y_tr == y_pred_tr)) 36 | self.accuracy_te.append(np.mean(self.y_te == y_pred_te)) 37 | 38 | data = load_iris() 39 | 40 | X_tr, X_te, y_tr, y_te = train_test_split(data.data, data.target, 41 | train_size=0.5, test_size=0.5, 42 | random_state=0) 43 | 44 | tree = DecisionTreeClassifier(max_depth=1) # decision stumps 45 | 46 | estimators = ( 47 | ("squared hinge", "b", GBClassifier(tree, n_estimators=n_estimators, 48 | step_size="constant", learning_rate=0.1, 49 | loss="squared_hinge")), 50 | 51 | ("log", "g", GBClassifier(tree, n_estimators=n_estimators, 52 | step_size="constant", learning_rate=0.1, loss="log")), 53 | 54 | ) 55 | 56 | it = np.arange(n_estimators) + 1 57 | 58 | for name, color, clf in estimators: 59 | clf.callback = Callback(X_tr, y_tr, X_te, y_te) 60 | clf.fit(X_tr, y_tr) 61 | 62 | plt.plot(it, clf.callback.accuracy_tr, label=name + " (train)", color=color, 63 | linestyle="-", linewidth=2) 64 | plt.plot(it, clf.callback.accuracy_te, label=name + " (test)", color=color, 65 | linestyle="--", linewidth=2) 66 | 67 | plt.xlabel("Boosting iteration") 68 | 
plt.ylabel("Accuracy") 69 | plt.legend(loc="lower right") 70 | 71 | plt.show() 72 | -------------------------------------------------------------------------------- /ivalice/impl/forest.py: -------------------------------------------------------------------------------- 1 | """Random Forests""" 2 | 3 | # Author: Mathieu Blondel 4 | # License: BSD 3 clause 5 | 6 | import numpy as np 7 | 8 | from sklearn.base import BaseEstimator, clone 9 | from sklearn.base import RegressorMixin 10 | from sklearn.utils import check_random_state 11 | 12 | MAX_INT = np.iinfo(np.int32).max 13 | 14 | 15 | def _fit_random_tree(tree, X, y, sample_weight, bootstrap, rng): 16 | if bootstrap: 17 | n_samples = X.shape[0] 18 | if sample_weight is None: 19 | sample_weight = np.ones((n_samples,), dtype=np.float64) 20 | else: 21 | sample_weight = sample_weight.copy() 22 | 23 | indices = rng.randint(0, n_samples, n_samples) 24 | sample_counts = np.bincount(indices, minlength=n_samples) 25 | sample_weight *= sample_counts 26 | 27 | tree.fit(X, y, sample_weight=sample_weight) 28 | tree.indices_ = sample_counts > 0. 29 | 30 | else: 31 | tree.fit(X, y, sample_weight=sample_weight) 32 | 33 | 34 | class _BaseRF(BaseEstimator): 35 | 36 | def _fit(self, X, y, sample_weight, tree): 37 | rng = check_random_state(self.random_state) 38 | self.estimators_ = [] 39 | 40 | 41 | for k in xrange(self.n_estimators): 42 | tree = clone(tree) 43 | tree.set_params(random_state=rng.randint(MAX_INT)) 44 | _fit_random_tree(tree, X, y, sample_weight, self.bootstrap, rng) 45 | self.estimators_.append(tree) 46 | 47 | return self 48 | 49 | 50 | class RFRegressor(_BaseRF, RegressorMixin): 51 | 52 | def __init__(self, n_estimators=10, max_features=None, max_depth=None, 53 | min_samples_split=2, min_samples_leaf=1, bootstrap=True, 54 | random_state=None): 55 | self.n_estimators = n_estimators 56 | self.max_features = max_features 57 | self.max_depth = max_depth 58 | self.min_samples_split = min_samples_split 59 | self.min_samples_leaf = min_samples_leaf 60 | self.bootstrap = bootstrap 61 | self.random_state = random_state 62 | 63 | def fit(self, X, y, sample_weight=None): 64 | X = np.array(X, dtype=np.float64) 65 | y = np.array(y, dtype=np.float64) 66 | 67 | from .tree import TreeRegressor 68 | 69 | tree = TreeRegressor(max_features=self.max_features, 70 | max_depth=self.max_depth, 71 | min_samples_split=self.min_samples_split, 72 | min_samples_leaf=self.min_samples_leaf) 73 | return self._fit(X, y, sample_weight, tree) 74 | 75 | def predict(self, X): 76 | pred = np.array([tree.predict(X) for tree in self.estimators_]) 77 | return np.mean(pred, axis=0) 78 | -------------------------------------------------------------------------------- /ivalice/impl/tests/test_mcrank.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.datasets import load_diabetes 4 | from sklearn.ensemble import GradientBoostingClassifier 5 | from sklearn.utils.testing import assert_almost_equal 6 | from sklearn.utils.testing import assert_equal 7 | 8 | from ivalice.ranking import McRank 9 | from ivalice.ranking import OrdinalMcRank 10 | 11 | 12 | bunch = load_diabetes() 13 | X, y = bunch.data, bunch.target 14 | y = np.round(y, decimals=-2) 15 | 16 | 17 | def test_mcrank(): 18 | gb = GradientBoostingClassifier(n_estimators=10, 19 | loss="deviance", 20 | random_state=0) 21 | mc = McRank(gb) 22 | mc.fit(X, y) 23 | assert_almost_equal(mc.score(X, y), 48.08, 2) 24 | 25 | 26 | def 
test_mcrank_set_estimator_params(): 27 | gb = GradientBoostingClassifier(n_estimators=5, 28 | loss="deviance", 29 | random_state=0) 30 | mc = McRank(gb) 31 | mc.set_params(estimator__n_estimators=10) 32 | assert_equal(gb.n_estimators, 10) 33 | 34 | 35 | def test_mcrank_warm_start(): 36 | gb = GradientBoostingClassifier(n_estimators=5, 37 | loss="deviance", 38 | warm_start=True, 39 | random_state=0) 40 | mc = McRank(gb) 41 | mc.fit(X, y) 42 | assert_almost_equal(mc.score(X, y), 56.06, 1) 43 | 44 | mc.set_params(estimator__n_estimators=10) 45 | mc.fit(X, y) 46 | assert_almost_equal(mc.score(X, y), 48.08, 2) 47 | 48 | 49 | def test_ordinal_mcrank(): 50 | gb = GradientBoostingClassifier(n_estimators=10, 51 | loss="deviance", 52 | random_state=0) 53 | mc = OrdinalMcRank(gb) 54 | mc.fit(X, y) 55 | assert_almost_equal(mc.score(X, y), 48.62, 2) 56 | 57 | 58 | def test_ordinal_mcrank_set_estimator_params(): 59 | gb = GradientBoostingClassifier(n_estimators=5, 60 | loss="deviance", 61 | random_state=0) 62 | mc = OrdinalMcRank(gb) 63 | mc.set_params(estimator__n_estimators=10) 64 | assert_equal(gb.n_estimators, 10) 65 | 66 | 67 | def test_ordinal_mcrank_warm_start(): 68 | gb = GradientBoostingClassifier(n_estimators=5, 69 | loss="deviance", 70 | warm_start=True, 71 | random_state=0) 72 | 73 | mc = OrdinalMcRank(gb) 74 | mc.fit(X, y) 75 | assert_almost_equal(mc.score(X, y), 56.35, 2) 76 | 77 | mc.set_params(estimator__n_estimators=10) 78 | mc.fit(X, y) 79 | assert_almost_equal(mc.score(X, y), 48.62, 2) 80 | -------------------------------------------------------------------------------- /ivalice/impl/lambda_mart.py: -------------------------------------------------------------------------------- 1 | """LambdaMART""" 2 | 3 | # Author: Mathieu Blondel 4 | # License: BSD 3 clause 5 | 6 | import numpy as np 7 | import numba 8 | 9 | from .gradient_boosting import _BaseGB, _MeanEstimator 10 | 11 | 12 | @numba.njit("void(f8[:], f8[:], f8[:], f8, i4, f8[:])") 13 | def _negative_gradient(y, y_pred, c, idcg, max_rank, g): 14 | n_samples = y.shape[0] 15 | 16 | for i in xrange(max_rank): 17 | for j in xrange(i + 1, n_samples): 18 | S = np.sign(y[i] - y[j]) 19 | 20 | if S == 0: 21 | continue 22 | 23 | score_diff = y_pred[i] - y_pred[j] 24 | 25 | diff = y[j] * (c[i] - c[j]) + y[i] * (c[j] - c[i]) 26 | ndcg_diff = abs(diff / idcg) 27 | 28 | if ndcg_diff == 0: 29 | continue 30 | 31 | rho = 1.0 / (1.0 + np.exp(S * score_diff)) 32 | #rho = expit(-S * score_diff) 33 | g[i] += S * ndcg_diff * rho 34 | g[j] -= S * ndcg_diff * rho 35 | 36 | 37 | def _dcg_score(y_true, y_score, max_rank=10, gains="exponential"): 38 | order = np.lexsort((y_true, -y_score)) 39 | 40 | if max_rank is not None: 41 | order = order[:max_rank] 42 | 43 | y_true = np.take(y_true, order) 44 | 45 | if gains == "exponential": 46 | gains = 2 ** y_true - 1 47 | elif gains == "linear": 48 | gains = y_true 49 | else: 50 | raise ValueError("Invalid gains option.") 51 | 52 | # highest rank is 1 so +2 instead of +1 53 | discounts = np.log2(np.arange(len(y_true)) + 2) 54 | return np.sum(gains / discounts) 55 | 56 | 57 | def _ndcg_score(y_true, y_score, max_rank=10, gains="exponential"): 58 | best = _dcg_score(y_true, y_true, max_rank, gains) 59 | actual = _dcg_score(y_true, y_score, max_rank, gains) 60 | return actual / best 61 | 62 | 63 | class _NDCGLoss(object): 64 | 65 | def __init__(self, max_rank=10): 66 | self.max_rank = max_rank 67 | 68 | def init_estimator(self): 69 | return _MeanEstimator() 70 | 71 | def negative_gradient(self, y, y_pred): 72 | 
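        # Each pair (i, j) with y[i] != y[j] contributes
        #     S * |delta NDCG_ij| * sigmoid(-S * (s_i - s_j))
        # to g[i] (and the opposite sign to g[j]), where S = sign(y[i] - y[j])
        # and s_i, s_j are the current scores; these are LambdaMART's
        # "lambdas", accumulated by _negative_gradient above on arrays sorted
        # by decreasing score and mapped back to the original order at the end.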
n_samples = y.shape[0] 73 | 74 | max_rank = self.max_rank if self.max_rank is not None else n_samples 75 | 76 | #order = np.argsort(y_pred)[::-1] 77 | order = np.lexsort((y, -y_pred)) 78 | y = np.take(y, order) 79 | y_pred = np.take(y_pred, order) 80 | 81 | ind = np.arange(n_samples) 82 | c = 1. / np.log2(ind + 2) # discount factors 83 | c[max_rank:] = 0 84 | 85 | g = np.zeros(n_samples, dtype=np.float64) 86 | 87 | y_sorted = np.sort(y)[::-1] 88 | idcg = np.sum(y_sorted * c) 89 | 90 | _negative_gradient(y, y_pred, c, idcg, max_rank, g) 91 | 92 | if np.any(np.isnan(g)): 93 | print "g contains NaNs" 94 | 95 | inv_ix = np.empty_like(order) 96 | inv_ix[order] = np.arange(len(order)) 97 | g = g[inv_ix] 98 | 99 | return g 100 | 101 | 102 | class LambdaMART(_BaseGB): 103 | 104 | def __init__(self, estimator, n_estimators=100, learning_rate=1.0, 105 | loss="ndcg", max_rank=10, gains="exponential", 106 | subsample=1.0, callback=None, random_state=None): 107 | self.estimator = estimator 108 | self.n_estimators = n_estimators 109 | self.learning_rate = learning_rate 110 | self.loss = loss 111 | self.max_rank = max_rank 112 | self.gains = gains 113 | self.subsample = subsample 114 | self.callback = callback 115 | self.random_state = random_state 116 | 117 | def _get_loss(self): 118 | losses = dict(ndcg=_NDCGLoss(max_rank=self.max_rank)) 119 | return losses[self.loss] 120 | 121 | def fit(self, X, y): 122 | if self.gains == "exponential": 123 | y = 2 ** y - 1 124 | 125 | return super(LambdaMART, self).fit(X, y) 126 | 127 | def score(self, X, y): 128 | y_pred = self.predict(X) 129 | return _ndcg_score(y, y_pred, max_rank=self.max_rank, gains=self.gains) 130 | -------------------------------------------------------------------------------- /ivalice/impl/sort.py: -------------------------------------------------------------------------------- 1 | """Efficient sorting routines""" 2 | 3 | # Authors: Jake Vanderplas (quicksort) 4 | # Lars Buitinck (heapsort) 5 | # Mathieu Blondel (Numba port) 6 | # License: BSD 3 clause 7 | 8 | import numba 9 | 10 | 11 | @numba.njit("void(f8[:], i4[:], i4, i4)") 12 | def _dual_swap(values, indices, i1, i2): 13 | dtmp = values[i1] 14 | values[i1] = values[i2] 15 | values[i2] = dtmp 16 | 17 | itmp = indices[i1] 18 | indices[i1] = indices[i2] 19 | indices[i2] = itmp 20 | 21 | 22 | @numba.njit("i4(f8[:], i4, i4)") 23 | def _median3(values, start, end): 24 | # Median of three pivot selection, after Bentley and McIlroy (1993). 25 | # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. 
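    # Worked example: for values = [3., 1., 2.] with start=0 and end=2,
    # mid = 1, so (a, b, c) = (3., 1., 2.); a < b fails, b < c holds and
    # a < c fails, so the function returns end -- index 2, whose value 2.0
    # is indeed the median of the three candidates.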
26 | size = end - start + 1 27 | mid = start + size / 2 28 | 29 | a = values[start] 30 | b = values[mid] 31 | c = values[end] 32 | 33 | if a < b: 34 | if b < c: 35 | return mid 36 | elif a < c: 37 | return end 38 | else: 39 | return start 40 | elif b < c: 41 | if a < c: 42 | return start 43 | else: 44 | return end 45 | else: 46 | return mid 47 | 48 | 49 | @numba.njit("i4(f8[:], i4[:], i4, i4)") 50 | def _partition(values, indices, start, end): 51 | #pivot_idx = start + (end - start + 1) / 2 52 | pivot_idx = _median3(values, start, end) 53 | _dual_swap(values, indices, start, pivot_idx) 54 | pivot = values[start] 55 | i = start + 1 56 | j = start + 1 57 | 58 | while j <= end: 59 | if values[j] <= pivot: 60 | _dual_swap(values, indices, i, j) 61 | i += 1 62 | j += 1 63 | 64 | _dual_swap(values, indices, start, i - 1) 65 | 66 | return i - 1 67 | 68 | 69 | @numba.njit("void(f8[:], i4[:], i4)") 70 | def _sort2(values, indices, start): 71 | end = start + 1 72 | if values[start] > values[end]: 73 | _dual_swap(values, indices, start, end) 74 | 75 | 76 | @numba.njit("void(f8[:], i4[:], i4)") 77 | def _sort3(values, indices, start): 78 | mid = start + 1 79 | end = start + 2 80 | if values[start] > values[mid]: 81 | _dual_swap(values, indices, start, mid) 82 | if values[mid] > values[end]: 83 | _dual_swap(values, indices, mid, end) 84 | if values[start] > values[mid]: 85 | _dual_swap(values, indices, start, mid) 86 | 87 | 88 | # As of Numba v0.13.2, recursion is not supported in Numba yet. 89 | @numba.jit("void(f8[:], i4[:], i4, i4)") 90 | def quicksort(values, indices, start, end): 91 | size = end - start + 1 92 | 93 | if size == 2: 94 | _sort2(values, indices, start) 95 | elif size == 3: 96 | _sort3(values, indices, start) 97 | if size > 1: 98 | i = _partition(values, indices, start, end) 99 | quicksort(values, indices, start, i - 1) 100 | quicksort(values, indices, i + 1, end) 101 | 102 | 103 | @numba.njit("void(f8[:], i4[:], i4, i4)") 104 | def _sift_down(values, indices, start, end): 105 | # Restore heap order in Xf[start:end] by moving the max element to start. 106 | 107 | root = start 108 | while True: 109 | child = root * 2 + 1 110 | 111 | # find max of root, left child, right child 112 | maxind = root 113 | if child < end and values[maxind] < values[child]: 114 | maxind = child 115 | if child + 1 < end and values[maxind] < values[child + 1]: 116 | maxind = child + 1 117 | 118 | if maxind == root: 119 | break 120 | else: 121 | _dual_swap(values, indices, root, maxind) 122 | root = maxind 123 | 124 | 125 | @numba.njit("void(f8[:], i4[:], i4)") 126 | def heapsort(values, indices, size): 127 | if size > 1: 128 | # Heapify. 129 | start = (size - 2) / 2 130 | end = size 131 | while True: 132 | _sift_down(values, indices, start, end) 133 | if start == 0: 134 | break 135 | start -= 1 136 | 137 | # Sort by shrinking the heap, putting the max element 138 | # immediately after it. 
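        # Worked example: starting from the max-heap [5., 2., 3.], the first
        # pass swaps the root with the last slot -> [3., 2., 5.] and sifts the
        # new root down (already in place); the second pass swaps positions 0
        # and 1 -> [2., 3., 5.], which is fully sorted.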
139 | end = size - 1 140 | while end > 0: 141 | _dual_swap(values, indices, 0, end) 142 | _sift_down(values, indices, 0, end) 143 | end -= 1 144 | -------------------------------------------------------------------------------- /ivalice/impl/mcrank.py: -------------------------------------------------------------------------------- 1 | """McRank""" 2 | 3 | # Author: Mathieu Blondel 4 | # License: BSD 3 clause 5 | 6 | import numpy as np 7 | 8 | from sklearn.base import BaseEstimator, clone 9 | from sklearn.ensemble import GradientBoostingClassifier 10 | from sklearn.preprocessing import LabelEncoder 11 | 12 | 13 | DEFAULT_CLF = GradientBoostingClassifier(loss="deviance") 14 | 15 | 16 | class _BaseMcRank(BaseEstimator): 17 | 18 | def __init__(self, estimator=DEFAULT_CLF): 19 | self.estimator = estimator 20 | 21 | @property 22 | def classes_(self): 23 | return self._label_encoder.classes_ 24 | 25 | def score(self, X, y): 26 | y_pred = self.predict(X) 27 | return np.mean(np.abs(y - y_pred)) 28 | 29 | def predict(self, X): 30 | """Predict expected target value for X. 31 | 32 | Parameters 33 | ---------- 34 | X : array-like of shape = [n_samples, n_features] 35 | The input samples. 36 | 37 | Returns 38 | ------- 39 | p : array of shape = [n_samples] 40 | """ 41 | n_samples = X.shape[0] 42 | n_classes = len(self.classes_) 43 | proba = self.predict_proba(X) 44 | classes = np.repeat(self.classes_, n_samples) 45 | classes = classes.reshape(n_classes, n_samples).T 46 | # pred[i] = \sum_m P(y_i = m) * m 47 | return np.average(classes, axis=1, weights=proba) 48 | 49 | def _get_estimator_params(self, **params): 50 | est_params = {} 51 | for key, value in params.items(): 52 | if key.startswith("estimator__"): 53 | key = key.replace("estimator__", "") 54 | est_params[key] = value 55 | return est_params 56 | 57 | 58 | class McRank(_BaseMcRank): 59 | 60 | def set_params(self, **params): 61 | super(McRank, self).set_params(**params) 62 | 63 | est_params = self._get_estimator_params(**params) 64 | 65 | if hasattr(self, "estimator_") and len(est_params) > 0: 66 | self.estimator_.set_params(**est_params) 67 | 68 | def fit(self, X, y): 69 | self._label_encoder = LabelEncoder() 70 | y = self._label_encoder.fit_transform(y) 71 | 72 | if not hasattr(self, "estimator_") or \ 73 | not getattr(self.estimator, "warm_start", False): 74 | self.estimator_ = clone(self.estimator) 75 | 76 | self.estimator_.fit(X, y) 77 | 78 | return self 79 | 80 | def predict_proba(self, X): 81 | """Predict class probabilities for X. 82 | 83 | Parameters 84 | ---------- 85 | X : array-like of shape = [n_samples, n_features] 86 | The input samples. 87 | 88 | Returns 89 | ------- 90 | p : array of shape = [n_samples, n_classes] 91 | The class probabilities of the input samples. The order of the 92 | classes corresponds to that in the attribute `classes_`. 
93 | """ 94 | return self.estimator_.predict_proba(X) 95 | 96 | 97 | class OrdinalMcRank(_BaseMcRank): 98 | 99 | def _fit(self, X, y, m, est): 100 | cond = y <= m 101 | y_bin = y.copy() 102 | y_bin[cond] = 0 103 | y_bin[~cond] = 1 104 | est.fit(X, y_bin) 105 | 106 | def set_params(self, **params): 107 | super(OrdinalMcRank, self).set_params(**params) 108 | 109 | est_params = self._get_estimator_params(**params) 110 | 111 | if hasattr(self, "estimators_") and len(est_params) > 0: 112 | for est in self.estimators_: 113 | est.set_params(**est_params) 114 | 115 | def fit(self, X, y): 116 | self._label_encoder = LabelEncoder() 117 | y = self._label_encoder.fit_transform(y) 118 | 119 | n_classifiers = len(self.classes_) - 1 120 | 121 | if not hasattr(self, "estimators_") or \ 122 | not getattr(self.estimator, "warm_start", False): 123 | self.estimators_ = [clone(self.estimator) 124 | for m in xrange(n_classifiers)] 125 | 126 | for m in xrange(n_classifiers): 127 | self._fit(X, y, m, self.estimators_[m]) 128 | 129 | return self 130 | 131 | def predict_proba(self, X): 132 | """Predict class probabilities for X. 133 | 134 | Parameters 135 | ---------- 136 | X : array-like of shape = [n_samples, n_features] 137 | The input samples. 138 | 139 | Returns 140 | ------- 141 | p : array of shape = [n_samples, n_classes] 142 | The class probabilities of the input samples. The order of the 143 | classes corresponds to that in the attribute `classes_`. 144 | """ 145 | n_samples = X.shape[0] 146 | n_classes = len(self.classes_) 147 | 148 | # 2d array of shape (n_classes-1) x n_samples containing 149 | # cumulative probabilities P(y_i <= k) 150 | P = np.array([e.predict_proba(X)[:, 0] for e in self.estimators_]) 151 | 152 | # 2d array of shape n_classes x n_samples containing 153 | # cumulative probabilities P(y_i <= k) 154 | P = np.vstack((P, np.ones(n_samples))) 155 | 156 | proba = np.zeros((n_samples, n_classes), dtype=np.float64) 157 | 158 | proba[:, 0] = P[0] # P(y = 0) = P(y <= 0) 159 | 160 | for m in xrange(1, n_classes): 161 | proba[:, m] = P[m] - P[m - 1] # P(y = m) = P(y <= m) - P(y <= m - 1) 162 | 163 | return proba 164 | -------------------------------------------------------------------------------- /ivalice/impl/tests/test_gradient_boosting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.datasets import load_diabetes 4 | from sklearn.datasets import load_iris 5 | from sklearn.datasets import load_linnerud 6 | from sklearn.tree import DecisionTreeClassifier 7 | from sklearn.tree import DecisionTreeRegressor 8 | from sklearn.ensemble import GradientBoostingRegressor 9 | from sklearn.cross_validation import train_test_split 10 | from sklearn.metrics import r2_score 11 | from sklearn.svm import SVR 12 | 13 | from sklearn.utils.testing import assert_almost_equal 14 | from sklearn.utils.testing import assert_array_almost_equal 15 | 16 | from ivalice.classification import GBClassifier 17 | from ivalice.regression import GBRegressor 18 | 19 | bunch = load_diabetes() 20 | X, y = bunch.data, bunch.target 21 | 22 | X_tr, X_te, y_tr, y_te = train_test_split(X, y, 23 | train_size=0.75, 24 | test_size=0.25, 25 | random_state=0) 26 | 27 | iris = load_iris() 28 | cond = iris.target <= 1 29 | X_bin, y_bin = iris.data[cond], iris.target[cond] 30 | 31 | X_bin_tr, X_bin_te, y_bin_tr, y_bin_te = train_test_split(X_bin, y_bin, 32 | train_size=0.75, 33 | test_size=0.25, 34 | random_state=0) 35 | 36 | iris = load_iris() 37 | X_mult, y_mult 
= iris.data, iris.target 38 | 39 | X_mult_tr, X_mult_te, y_mult_tr, y_mult_te = train_test_split(X_mult, y_mult, 40 | train_size=0.75, 41 | test_size=0.25, 42 | random_state=0) 43 | 44 | 45 | def test_squared_loss(): 46 | reg = GradientBoostingRegressor(learning_rate=0.1, max_depth=3, 47 | random_state=0) 48 | reg.fit(X_tr, y_tr) 49 | y_pred = reg.predict(X_te) 50 | sk = np.sqrt(np.mean((y_pred - y_te) ** 2)) 51 | 52 | reg = DecisionTreeRegressor(max_features=1.0, max_depth=3, random_state=0) 53 | reg = GBRegressor(reg, n_estimators=100, learning_rate=0.1) 54 | reg.fit(X_tr, y_tr) 55 | y_pred = reg.predict(X_te) 56 | iv = np.sqrt(np.mean((y_pred - y_te) ** 2)) 57 | 58 | assert_almost_equal(sk, iv, 0) 59 | 60 | 61 | def test_squared_loss_svr(): 62 | reg = SVR(kernel="rbf", gamma=10) 63 | reg = GBRegressor(reg, n_estimators=10) 64 | reg.fit(X_tr, y_tr) 65 | y_pred = reg.predict(X_tr) 66 | assert_almost_equal(np.mean((y_tr - y_pred) ** 2), 3778.3, 1) 67 | 68 | 69 | def test_absolute_loss(): 70 | # Check absolute loss with scikit-learn implementation. 71 | reg = GradientBoostingRegressor(learning_rate=0.1, loss="lad", 72 | random_state=0) 73 | reg.fit(X_tr, y_tr) 74 | y_pred = reg.predict(X_te) 75 | sk = np.mean(np.abs(y_pred - y_te)) 76 | 77 | reg = DecisionTreeRegressor(max_features=1.0, max_depth=3, random_state=0) 78 | reg = GBRegressor(reg, n_estimators=100, learning_rate=0.1, loss="absolute") 79 | reg.fit(X_tr, y_tr) 80 | y_pred = reg.predict(X_te) 81 | iv = np.mean(np.abs(y_pred - y_te)) 82 | 83 | assert_almost_equal(sk, iv, 0) 84 | 85 | 86 | def test_absolute_loss_constant(): 87 | # Check absolute loss with scikit-learn implementation. 88 | reg = DecisionTreeRegressor(max_features=1.0, max_depth=3, random_state=0) 89 | reg = GBRegressor(reg, n_estimators=100, learning_rate=0.1, loss="absolute", 90 | step_size="constant") 91 | reg.fit(X_tr, y_tr) 92 | y_pred = reg.predict(X_te) 93 | iv = np.mean(np.abs(y_pred - y_te)) 94 | 95 | assert_almost_equal(iv, 55.6, 1) 96 | 97 | 98 | def test_subsample(): 99 | reg = DecisionTreeRegressor(max_features=1.0, max_depth=3, 100 | random_state=0) 101 | reg = GBRegressor(reg, n_estimators=100, learning_rate=0.1, subsample=0.6, 102 | random_state=0) 103 | reg.fit(X_tr, y_tr) 104 | y_pred = reg.predict(X_te) 105 | mse = np.sqrt(np.mean((y_pred - y_te) ** 2)) 106 | assert_almost_equal(mse, 62.9, 1) 107 | 108 | 109 | def test_squared_hinge_loss(): 110 | # With line search. 111 | clf = DecisionTreeClassifier(max_features=1.0, max_depth=3) 112 | clf = GBClassifier(clf, n_estimators=10, step_size="line_search") 113 | clf.fit(X_bin_tr, y_bin_tr) 114 | assert_almost_equal(clf.score(X_bin_te, y_bin_te), 1.0) 115 | 116 | # With constant step size. 117 | clf = DecisionTreeClassifier(max_features=1.0, max_depth=3) 118 | clf = GBClassifier(clf, n_estimators=10, step_size="constant", 119 | learning_rate=0.1) 120 | clf.fit(X_bin_te, y_bin_te) 121 | assert_almost_equal(clf.score(X_bin_te, y_bin_te), 1.0) 122 | 123 | 124 | def test_squared_hinge_loss_ovr(): 125 | # With line search. 126 | clf = DecisionTreeClassifier(max_features=1.0, max_depth=3) 127 | clf = GBClassifier(clf, n_estimators=10, step_size="line_search") 128 | clf.fit(X_mult_tr, y_mult_tr) 129 | assert_almost_equal(clf.score(X_mult_te, y_mult_te), 0.974, 3) 130 | 131 | # With constant step size. 
132 | clf = DecisionTreeClassifier(max_features=1.0, max_depth=3) 133 | clf = GBClassifier(clf, n_estimators=10, step_size="constant", 134 | learning_rate=0.1) 135 | clf.fit(X_mult_te, y_mult_te) 136 | assert_almost_equal(clf.score(X_mult_te, y_mult_te), 1.0) 137 | 138 | 139 | def test_log_loss(): 140 | # With line search. 141 | clf = DecisionTreeClassifier(max_features=1.0, max_depth=3) 142 | clf = GBClassifier(clf, n_estimators=10, step_size="line_search", 143 | loss="log") 144 | clf.fit(X_bin_tr, y_bin_tr) 145 | assert_almost_equal(clf.score(X_bin_te, y_bin_te), 1.0) 146 | 147 | # With constant step size. 148 | clf = DecisionTreeClassifier(max_features=1.0, max_depth=3) 149 | clf = GBClassifier(clf, n_estimators=10, step_size="constant", 150 | loss="log", learning_rate=0.1) 151 | clf.fit(X_bin_te, y_bin_te) 152 | assert_almost_equal(clf.score(X_bin_te, y_bin_te), 1.0) 153 | 154 | 155 | def test_multioutput_regression(): 156 | data = load_linnerud() 157 | X, Y = data.data, data.target 158 | 159 | reg = DecisionTreeRegressor(max_features=1.0, max_depth=3) 160 | reg = GBRegressor(reg, n_estimators=10, step_size="line_search") 161 | Y_pred = reg.fit(X, Y).predict(X) 162 | 163 | acc = [0.697, 0.744, 0.631] 164 | acc2 = [r2_score(Y[:, k], Y_pred[:, k]) for k in xrange(Y.shape[1])] 165 | 166 | assert_array_almost_equal(acc, acc2, 3) 167 | -------------------------------------------------------------------------------- /ivalice/impl/tests/test_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.tree import DecisionTreeRegressor as skRegTree 4 | from sklearn.tree import DecisionTreeClassifier as skClassifTree 5 | 6 | from sklearn.datasets import make_regression 7 | from sklearn.datasets import make_classification 8 | from sklearn.datasets import load_diabetes 9 | 10 | from sklearn.metrics import mean_squared_error 11 | from sklearn.metrics import accuracy_score 12 | 13 | from sklearn.utils.testing import assert_almost_equal 14 | 15 | from ivalice.impl.tree import TreeRegressor 16 | from ivalice.impl.tree import TreeClassifier 17 | 18 | 19 | def _make_regression_datasets(n_times, sw=False): 20 | for n in xrange(n_times): 21 | X, y = make_regression(n_samples=100, n_features=10, random_state=n) 22 | if sw: 23 | rng = np.random.RandomState(n) 24 | w = rng.rand(X.shape[0]) 25 | w[w <= 0.5] = 0.0 26 | yield X, y, w 27 | else: 28 | yield X, y 29 | 30 | 31 | def _make_classification_datasets(n_times, sw=False): 32 | for n in xrange(n_times): 33 | X, y = make_classification(n_samples=20, 34 | n_features=10, 35 | n_informative=10, 36 | n_redundant=0, 37 | n_classes=3, 38 | random_state=n) 39 | if sw: 40 | rng = np.random.RandomState(n) 41 | w = np.round(rng.rand(X.shape[0])) 42 | w[w <= 0.5] = 0.0 43 | yield X, y, w 44 | else: 45 | yield X, y 46 | 47 | 48 | def test_mse_fully_developed(): 49 | sk = 0 50 | iv = 0 51 | 52 | for X, y in _make_regression_datasets(10): 53 | reg = skRegTree(max_depth=None) 54 | reg.fit(X, y) 55 | y_pred = reg.predict(X) 56 | sk += np.mean((y - y_pred) ** 2) 57 | 58 | reg = TreeRegressor(max_depth=None) 59 | reg.fit(X, y) 60 | y_pred = reg.predict(X) 61 | iv += np.mean((y - y_pred) ** 2) 62 | 63 | assert_almost_equal(sk, iv) 64 | 65 | 66 | def test_mse_max_depth(): 67 | for max_depth in (5, 1): 68 | sk = 0 69 | iv = 0 70 | 71 | for X, y in _make_regression_datasets(10): 72 | reg = skRegTree(max_depth=max_depth) 73 | reg.fit(X, y) 74 | y_pred = reg.predict(X) 75 | sk += np.mean((y - y_pred) ** 2) 
76 | 77 | reg = TreeRegressor(max_depth=max_depth) 78 | reg.fit(X, y) 79 | y_pred = reg.predict(X) 80 | iv += np.mean((y - y_pred) ** 2) 81 | 82 | assert_almost_equal(sk, iv) 83 | 84 | 85 | def test_mse_min_samples(): 86 | sk = 0 87 | iv = 0 88 | 89 | for X, y in _make_regression_datasets(10): 90 | reg = skRegTree(max_depth=5, 91 | min_samples_split=4, 92 | min_samples_leaf=2) 93 | reg.fit(X, y) 94 | y_pred = reg.predict(X) 95 | sk += np.mean((y - y_pred) ** 2) 96 | 97 | reg = TreeRegressor(max_depth=5, 98 | min_samples_split=4, 99 | min_samples_leaf=2) 100 | reg.fit(X, y) 101 | y_pred = reg.predict(X) 102 | iv += np.mean((y - y_pred) ** 2) 103 | 104 | assert_almost_equal(sk, iv) 105 | 106 | 107 | def test_mse_max_features(): 108 | sk = 0 109 | iv = 0 110 | 111 | n_times = 30 112 | for X, y in _make_regression_datasets(n_times): 113 | reg = skRegTree(max_depth=5, 114 | max_features=4, 115 | random_state=0) 116 | reg.fit(X, y) 117 | y_pred = reg.predict(X) 118 | sk += np.mean((y - y_pred) ** 2) 119 | 120 | reg = TreeRegressor(max_depth=5, 121 | max_features=4, 122 | random_state=0) 123 | reg.fit(X, y) 124 | y_pred = reg.predict(X) 125 | iv += np.mean((y - y_pred) ** 2) 126 | 127 | sk /= n_times 128 | iv /= n_times 129 | 130 | assert_almost_equal(sk, 4588.4, 1) 131 | assert_almost_equal(iv, 4921.1, 1) 132 | 133 | 134 | def test_mse_sample_weight(): 135 | sk = 0 136 | iv = 0 137 | 138 | n_times = 10 139 | for X, y, w in _make_regression_datasets(n_times, sw=True): 140 | reg = skRegTree(max_depth=5) 141 | reg.fit(X, y, w) 142 | y_pred = reg.predict(X) 143 | sk += mean_squared_error(y, y_pred, sample_weight=w) 144 | 145 | reg = TreeRegressor(max_depth=5) 146 | reg.fit(X, y, w) 147 | y_pred = reg.predict(X) 148 | iv += mean_squared_error(y, y_pred, sample_weight=w) 149 | 150 | sk /= n_times 151 | iv /= n_times 152 | 153 | assert_almost_equal(sk, iv) 154 | 155 | 156 | def test_mse_duplicate_features(): 157 | diabetes = load_diabetes() 158 | X, y = diabetes.data, diabetes.target 159 | 160 | reg = skRegTree(max_depth=5) 161 | reg.fit(X, y) 162 | y_pred = reg.predict(X) 163 | sk = np.mean((y - y_pred) ** 2) 164 | 165 | reg = TreeRegressor(max_depth=5) 166 | reg.fit(X, y) 167 | y_pred = reg.predict(X) 168 | iv = np.mean((y - y_pred) ** 2) 169 | 170 | assert_almost_equal(sk, iv) 171 | 172 | 173 | def test_classif_max_depth(): 174 | for criterion in ("gini", "entropy"): 175 | sk = 0 176 | iv = 0 177 | 178 | for X, y in _make_classification_datasets(10): 179 | clf = skClassifTree(criterion=criterion, 180 | max_depth=5, 181 | random_state=1) 182 | clf.fit(X, y) 183 | y_pred = clf.predict(X) 184 | sk += np.mean(y == y_pred) 185 | 186 | clf = TreeClassifier(criterion=criterion, 187 | max_depth=5, 188 | random_state=1) 189 | clf.fit(X, y) 190 | y_pred = clf.predict(X) 191 | iv += np.mean(y == y_pred) 192 | 193 | sk /= 10 194 | iv /= 10 195 | 196 | assert_almost_equal(sk, iv) 197 | 198 | 199 | def test_classif_sample_weight(): 200 | for criterion in ("gini", "entropy"): 201 | sk = 0 202 | iv = 0 203 | 204 | for X, y, w in _make_classification_datasets(10, sw=True): 205 | clf = skClassifTree(criterion=criterion, max_depth=5) 206 | clf.fit(X, y, w) 207 | y_pred = clf.predict(X) 208 | sk += accuracy_score(y, y_pred, sample_weight=w) 209 | 210 | clf = TreeClassifier(criterion=criterion, max_depth=5) 211 | clf.fit(X, y, w) 212 | y_pred = clf.predict(X) 213 | iv += accuracy_score(y, y_pred, sample_weight=w) 214 | 215 | sk /= 10 216 | iv /= 10 217 | 218 | assert_almost_equal(sk, iv) 219 | 
-------------------------------------------------------------------------------- /ivalice/impl/gradient_boosting.py: -------------------------------------------------------------------------------- 1 | """Functional gradient boosting""" 2 | 3 | # Author: Mathieu Blondel 4 | # License: BSD 3 clause 5 | 6 | # Differences with scikit-learn: 7 | # - accepts any base estimator (not only trees) 8 | # - line search finds base estimator weights (not leaf weights) 9 | 10 | import numpy as np 11 | from scipy import stats 12 | 13 | from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, clone 14 | from sklearn.preprocessing import LabelBinarizer 15 | from sklearn.utils import check_random_state 16 | 17 | 18 | # Taken from https://github.com/nudomarinero/wquantiles (MIT license) 19 | def _weighted_quantile(data, weights, quantile): 20 | # Sort the data 21 | ind_sorted = np.argsort(data) 22 | sorted_data = data[ind_sorted] 23 | sorted_weights = weights[ind_sorted] 24 | # Compute the auxiliary arrays 25 | Sn = np.cumsum(sorted_weights) 26 | # TODO: Check that the weights do not sum zero 27 | Pn = (Sn-0.5*sorted_weights)/np.sum(sorted_weights) 28 | # Get the value of the weighted median 29 | return np.interp(quantile, Pn, sorted_data) 30 | 31 | 32 | def _weighted_median(data, weights): 33 | return _weighted_quantile(data, weights, 0.5) 34 | 35 | 36 | class _QuantileEstimator(BaseEstimator): 37 | """An estimator predicting the alpha-quantile of the training targets.""" 38 | def __init__(self, alpha=0.9): 39 | if not 0 < alpha < 1.0: 40 | raise ValueError("`alpha` must be in (0, 1.0) but was %r" % alpha) 41 | self.alpha = alpha 42 | 43 | def fit(self, X, y): 44 | self.quantile = stats.scoreatpercentile(y, self.alpha * 100.0) 45 | return self 46 | 47 | def predict(self, X): 48 | y = np.empty(X.shape[0], dtype=np.float64) 49 | y.fill(self.quantile) 50 | return y 51 | 52 | 53 | class _MeanEstimator(BaseEstimator): 54 | """An estimator predicting the mean of the training targets.""" 55 | def fit(self, X, y): 56 | self.mean = np.mean(y) 57 | return self 58 | 59 | def predict(self, X): 60 | y = np.empty(X.shape[0], dtype=np.float64) 61 | y.fill(self.mean) 62 | return y 63 | 64 | 65 | class _SquareLoss(object): 66 | 67 | def init_estimator(self): 68 | return _MeanEstimator() 69 | 70 | def negative_gradient(self, y, y_pred): 71 | return y - y_pred 72 | 73 | def line_search(self, y, y_pred, h_pred): 74 | Lp = np.sum((y - y_pred) * h_pred) 75 | Lpp = np.sum(h_pred * h_pred) 76 | 77 | if Lpp == 0: 78 | return 1.0 79 | 80 | # Should be 1.0 assuming that the base learner perfectly fits the 81 | # residuals. 
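        # e.g. with residuals y - y_pred = [1, 2] and h_pred = [0.5, 1.0]:
        # Lp = 0.5 * 1 + 1.0 * 2 = 2.5, Lpp = 0.25 + 1.0 = 1.25, so the step
        # is 2.0 -- exactly the factor by which the base learner undershot.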
82 | return Lp/Lpp 83 | 84 | 85 | class _AbsoluteLoss(object): 86 | 87 | def init_estimator(self): 88 | return _QuantileEstimator(alpha=0.5) 89 | 90 | def negative_gradient(self, y, y_pred): 91 | return np.sign(y - y_pred) 92 | 93 | def line_search(self, y, y_pred, h_pred): 94 | cond = h_pred != 0 95 | diff = y - y_pred 96 | diff[cond] /= h_pred[cond] 97 | diff[~cond] = 0 98 | return _weighted_median(diff, np.abs(h_pred)) 99 | 100 | 101 | class _SquaredHingeLoss(object): 102 | 103 | def __init__(self, max_steps=1): 104 | self.max_steps = max_steps 105 | 106 | def init_estimator(self): 107 | return _MeanEstimator() 108 | 109 | def negative_gradient(self, y, y_pred): 110 | return 2 * np.maximum(1 - y * y_pred, 0) * y 111 | 112 | def line_search(self, y, y_pred, h_pred): 113 | rho = 0 114 | 115 | y_h_pred = y * h_pred 116 | h_pred_sq = h_pred ** 2 117 | 118 | for it in xrange(self.max_steps): 119 | error = 1 - y * (y_pred + rho * h_pred) 120 | Lp = -np.sum(np.maximum(error, 0) * y_h_pred) 121 | Lpp = np.sum((error > 0) * h_pred_sq) 122 | 123 | if Lpp == 0: 124 | break 125 | 126 | rho -= Lp / Lpp 127 | 128 | return rho 129 | 130 | 131 | class _LogLoss(object): 132 | 133 | def __init__(self, max_steps=1): 134 | self.max_steps = max_steps 135 | 136 | def init_estimator(self): 137 | return _MeanEstimator() 138 | 139 | def negative_gradient(self, y, y_pred): 140 | q = 1.0 / (1 + np.exp(-y * y_pred)) 141 | return -y * (q - 1) 142 | 143 | def line_search(self, y, y_pred, h_pred): 144 | rho = 0 145 | 146 | y_h_pred = y * h_pred 147 | h_pred_sq = h_pred ** 2 148 | 149 | for it in xrange(self.max_steps): 150 | q = 1.0 / (1 + np.exp(-y * (y_pred + rho * h_pred))) 151 | Lp = np.sum((q - 1) * y_h_pred) 152 | Lpp = np.sum(q * (1 - q) * h_pred_sq) 153 | 154 | if Lpp == 0: 155 | break 156 | 157 | rho -= Lp / Lpp 158 | 159 | return rho 160 | 161 | 162 | class _BaseGB(BaseEstimator): 163 | 164 | def _fit(self, X, y, y_pred, loss, rng): 165 | if self.subsample != 1.0: 166 | n = int(X.shape[0] * self.subsample) 167 | ind = rng.permutation(X.shape[0])[:n] 168 | X = X[ind] 169 | y = y[ind] 170 | y_pred = y_pred[ind] 171 | 172 | negative_gradient = loss.negative_gradient(y, y_pred) 173 | 174 | est = clone(self.estimator) 175 | est.fit(X, negative_gradient) 176 | 177 | step_size = getattr(self, "step_size", "constant") 178 | 179 | if step_size == "line_search": 180 | h_pred = est.predict(X) 181 | step_size = loss.line_search(y, y_pred, h_pred) 182 | elif step_size == "constant": 183 | step_size = 1.0 184 | else: 185 | raise ValueError("Unknown step size.") 186 | 187 | return est, step_size 188 | 189 | def fit(self, X, y): 190 | rng = check_random_state(self.random_state) 191 | loss = self._get_loss() 192 | 193 | ravel = len(y.shape) == 1 194 | Y = y.reshape(-1, 1) if ravel else y 195 | n_samples = X.shape[0] 196 | n_vectors = Y.shape[1] 197 | 198 | self.estimator_weights_ = np.ones((self.n_estimators, n_vectors), 199 | dtype=np.float64) 200 | self.estimator_weights_[1:] *= self.learning_rate 201 | 202 | self.estimators_ = np.empty((self.n_estimators, n_vectors), 203 | dtype=np.object) 204 | 205 | Y_pred = np.zeros((n_samples, n_vectors), dtype=np.float64) 206 | 207 | # Initial estimator. 208 | for k in xrange(n_vectors): 209 | est = loss.init_estimator().fit(X, Y[:, k]) 210 | self.estimators_[0, k] = est 211 | Y_pred[:, k] += est.predict(X) 212 | 213 | if self.callback is not None: 214 | self.callback(self) 215 | 216 | # Incremental fitting. 
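        # Each stage fits a clone of the base estimator to the negative
        # gradient of the loss at the current prediction (see _fit above) and
        # adds it in, so the model is F(x) = f_0(x) + sum_i w_i * h_i(x) with
        # w_i = learning_rate * step_size_i.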
217 | for i in xrange(1, self.n_estimators): 218 | for k in xrange(n_vectors): 219 | 220 | est, step_size = self._fit(X, Y[:, k], Y_pred[:, k], loss, rng) 221 | self.estimators_[i, k] = est 222 | self.estimator_weights_[i, k] *= step_size 223 | Y_pred[:, k] += self.estimator_weights_[i, k] * est.predict(X) 224 | 225 | if self.callback is not None: 226 | self.callback(self) 227 | 228 | if ravel: 229 | self.estimators_ = self.estimators_.ravel() 230 | self.estimator_weights_ = self.estimator_weights_.ravel() 231 | 232 | return self 233 | 234 | def _df_multi(self, X): 235 | n_samples = X.shape[0] 236 | n_estimators, n_vectors = self.estimators_.shape 237 | pred = np.zeros((n_samples, n_vectors), dtype=np.float64) 238 | 239 | for i in xrange(n_estimators): 240 | for k in xrange(n_vectors): 241 | est = self.estimators_[i, k] 242 | if est is None: continue 243 | pred[:, k] += self.estimator_weights_[i, k] * est.predict(X) 244 | 245 | return pred 246 | 247 | def _df(self, X): 248 | n_samples = X.shape[0] 249 | n_estimators = self.estimators_.shape[0] 250 | pred = np.zeros(n_samples, dtype=np.float64) 251 | 252 | for i in xrange(n_estimators): 253 | est = self.estimators_[i] 254 | if est is None: continue 255 | pred += self.estimator_weights_[i] * est.predict(X) 256 | 257 | return pred 258 | 259 | def decision_function(self, X): 260 | if len(self.estimators_.shape) == 1: 261 | return self._df(X) 262 | else: 263 | return self._df_multi(X) 264 | 265 | def predict(self, X): 266 | return self.decision_function(X) 267 | 268 | 269 | class GBClassifier(_BaseGB, ClassifierMixin): 270 | 271 | def __init__(self, estimator, n_estimators=100, 272 | step_size="line_search", learning_rate=0.1, 273 | loss="squared_hinge", subsample=1.0, 274 | callback=None, random_state=None): 275 | self.estimator = estimator 276 | self.n_estimators = n_estimators 277 | self.step_size = step_size 278 | self.learning_rate = learning_rate 279 | self.loss = loss 280 | self.subsample = subsample 281 | self.callback = callback 282 | self.random_state = random_state 283 | 284 | def _get_loss(self): 285 | losses = dict(squared_hinge=_SquaredHingeLoss(), 286 | log=_LogLoss()) 287 | return losses[self.loss] 288 | 289 | def fit(self, X, y): 290 | self._lb = LabelBinarizer(neg_label=-1) 291 | Y = self._lb.fit_transform(y) 292 | return super(GBClassifier, self).fit(X, Y) 293 | 294 | def predict(self, X): 295 | pred = self.decision_function(X) 296 | return self._lb.inverse_transform(pred) 297 | 298 | 299 | class GBRegressor(_BaseGB, RegressorMixin): 300 | 301 | def __init__(self, estimator, n_estimators=100, 302 | step_size="line_search", learning_rate=0.1, 303 | loss="squared", subsample=1.0, 304 | callback=None, random_state=None): 305 | self.estimator = estimator 306 | self.n_estimators = n_estimators 307 | self.step_size = step_size 308 | self.learning_rate = learning_rate 309 | self.loss = loss 310 | self.subsample = subsample 311 | self.callback = callback 312 | self.random_state = random_state 313 | 314 | def _get_loss(self): 315 | losses = dict(squared=_SquareLoss(), 316 | absolute=_AbsoluteLoss()) 317 | return losses[self.loss] 318 | -------------------------------------------------------------------------------- /ivalice/impl/tree.py: -------------------------------------------------------------------------------- 1 | """Classification and regression trees""" 2 | 3 | # Author: Mathieu Blondel 4 | # License: BSD 3 clause 5 | 6 | import numbers 7 | 8 | import numpy as np 9 | import numba 10 | 11 | from sklearn.base import 
BaseEstimator 12 | from sklearn.base import RegressorMixin 13 | from sklearn.base import ClassifierMixin 14 | from sklearn.preprocessing import LabelEncoder 15 | from sklearn.utils import check_random_state 16 | 17 | from .sort import heapsort 18 | 19 | TREE_LEAF = -1 20 | UNDEFINED = -2 21 | 22 | DOUBLE_MAX = np.finfo(np.float64).max 23 | 24 | MSE_CRITERION = 0 25 | GINI_CRITERION = 1 26 | ENTROPY_CRITERION = 2 27 | 28 | 29 | class _Tree(object): 30 | 31 | def __init__(self, capacity=2 ** 10): 32 | self.capacity = capacity 33 | self.threshold = np.zeros(capacity, dtype=np.float64) + UNDEFINED 34 | self.feature = np.zeros(capacity, dtype=np.int32) + UNDEFINED 35 | self.children_left = np.zeros(capacity, dtype=np.int32) + TREE_LEAF 36 | self.children_right = np.zeros(capacity, dtype=np.int32) + TREE_LEAF 37 | self.value = np.zeros(capacity, dtype=np.float64) 38 | self.ptr = 0 39 | 40 | def add_node(self, threshold, feature, value): 41 | self.threshold[self.ptr] = threshold 42 | self.feature[self.ptr] = feature 43 | self.value[self.ptr] = value 44 | self.ptr += 1 45 | 46 | def add_terminal_node(self, value): 47 | self.value[self.ptr] = value 48 | self.ptr += 1 49 | 50 | def finalize(self): 51 | for attr in ("threshold", "feature", "value", 52 | "children_left", "children_right"): 53 | attr_value = getattr(self, attr)[:self.ptr + 1] 54 | setattr(self, attr, attr_value) 55 | return self 56 | 57 | 58 | class _Stack(object): 59 | 60 | def __init__(self, capacity=2 ** 10): 61 | self.capacity = capacity 62 | self.start = np.zeros(capacity, dtype=np.int32) 63 | self.end = np.zeros(capacity, dtype=np.int32) 64 | self.left = np.zeros(capacity, dtype=bool) 65 | self.depth = np.zeros(capacity, dtype=np.int32) 66 | self.n_samples = np.zeros(capacity, dtype=np.float64) 67 | self.parent = np.zeros(capacity, dtype=np.int32) 68 | self.value = np.zeros(capacity, dtype=np.float64) 69 | self.ptr = -1 70 | 71 | def push(self, start, end, left, depth, n_samples, parent, value): 72 | if self.ptr >= self.capacity: 73 | raise ValueError("Stack overflow!") 74 | 75 | self.ptr += 1 76 | self.start[self.ptr] = start 77 | self.end[self.ptr] = end 78 | self.left[self.ptr] = left 79 | self.depth[self.ptr] = depth 80 | self.n_samples[self.ptr] = n_samples 81 | self.parent[self.ptr] = parent 82 | self.value[self.ptr] = value 83 | 84 | def pop(self): 85 | self.ptr -= 1 86 | p = self.ptr + 1 87 | return self.start[p], self.end[p], self.left[p], self.depth[p], \ 88 | self.n_samples[p], self.parent[p], self.value[p] 89 | 90 | def __len__(self): 91 | return self.ptr + 1 92 | 93 | 94 | @numba.njit("void(f8[:,:], i4[:], f8[:], i4[:], i4[:], i4[:])") 95 | def _apply(X, feature, threshold, children_left, children_right, out): 96 | for i in range(X.shape[0]): 97 | node = 0 98 | # While node not a leaf 99 | while children_left[node] != TREE_LEAF: 100 | if X[i, feature[node]] <= threshold[node]: 101 | node = children_left[node] 102 | else: 103 | node = children_right[node] 104 | out[i] = node 105 | 106 | 107 | @numba.njit("f8(f8[:], f8[:], f8[:], i4[:], i4, i4, i4, f8[:])") 108 | def _impurity_mse(Xj, y, sample_weight, samples, start_t, pos_t, end_t, out): 109 | N_L = 0 110 | N_R = 0 111 | 112 | y_sq = 0 113 | y_sum = 0 114 | 115 | for ii in xrange(start_t, pos_t): 116 | i = samples[ii] 117 | N_L += sample_weight[i] 118 | y_sq += sample_weight[i] * y[i] * y[i] 119 | y_sum += sample_weight[i] * y[i] 120 | 121 | if N_L == 0: 122 | return DOUBLE_MAX 123 | 124 | value_L = y_sum / N_L 125 | imp_L = y_sq - 1 * y_sum * y_sum / N_L 126 | 127 | 
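    # imp_L is N_L times the weighted variance of y in the left child:
    # sum(w * y**2) - (sum(w * y))**2 / sum(w); the analogous quantity is
    # computed for the right child below, and their sum divided by N_t is
    # the impurity returned by this splitting criterion.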
y_sq = 0 128 | y_sum = 0 129 | 130 | for ii in xrange(pos_t, end_t): 131 | i = samples[ii] 132 | N_R += sample_weight[i] 133 | y_sq += sample_weight[i] * y[i] * y[i] 134 | y_sum += sample_weight[i] * y[i] 135 | 136 | if N_R == 0: 137 | return DOUBLE_MAX 138 | 139 | value_R = y_sum / N_R 140 | imp_R = y_sq - 1 * y_sum * y_sum / N_R 141 | 142 | N_t = N_L + N_R 143 | 144 | out[0] = N_L 145 | out[1] = N_R 146 | out[2] = N_t 147 | out[3] = value_L 148 | out[4] = value_R 149 | 150 | return (imp_L + imp_R) / N_t 151 | 152 | 153 | @numba.njit("void(f8[:], f8[:], f8[:], i4[:], i4, i4, i4, f8[:], f8[:], f8[:])") 154 | def _compute_counts(Xj, y, sample_weight, samples, start_t, pos_t, end_t, 155 | count_L, count_R, out): 156 | n_classes = count_L.shape[0] 157 | N_L = 0 158 | N_R = 0 159 | 160 | for k in xrange(n_classes): 161 | count_L[k] = 0 162 | count_R[k] = 0 163 | 164 | for ii in xrange(start_t, pos_t): 165 | i = samples[ii] 166 | N_L += sample_weight[i] 167 | idx = int(y[i]) 168 | count_L[idx] += sample_weight[i] 169 | 170 | for ii in xrange(pos_t, end_t): 171 | i = samples[ii] 172 | N_R += sample_weight[i] 173 | idx = int(y[i]) 174 | count_R[idx] += sample_weight[i] 175 | 176 | best_L = -DOUBLE_MAX 177 | best_R = -DOUBLE_MAX 178 | value_L = 0 179 | value_R = 0 180 | 181 | for k in xrange(n_classes): 182 | if count_L[k] > best_L: 183 | best_L = count_L[k] 184 | value_L = k 185 | 186 | if count_R[k] > best_R: 187 | best_R = count_R[k] 188 | value_R = k 189 | 190 | out[0] = N_L 191 | out[1] = N_R 192 | out[2] = N_L + N_R 193 | out[3] = value_L 194 | out[4] = value_R 195 | 196 | 197 | @numba.njit("f8(f8[:], f8[:], f8[:], i4[:], i4, i4, i4, f8[:], f8[:], f8[:])") 198 | def _impurity_gini(Xj, y, sample_weight, samples, start_t, pos_t, end_t, 199 | count_L, count_R, out): 200 | n_classes = count_L.shape[0] 201 | 202 | _compute_counts(Xj, y, sample_weight, samples, start_t, pos_t, end_t, 203 | count_L, count_R, out) 204 | N_L = out[0] 205 | N_R = out[1] 206 | N_t = out[2] 207 | 208 | if N_L == 0 or N_R == 0: 209 | return DOUBLE_MAX 210 | 211 | gini_L = 0 212 | gini_R = 0 213 | for k in xrange(n_classes): 214 | proba_L = count_L[k] / N_t 215 | proba_R = count_R[k] / N_t 216 | 217 | gini_L += proba_L * (1 - proba_L) 218 | gini_R += proba_R * (1 - proba_R) 219 | 220 | #return float(N_L) / N_t * gini_L + float(N_R) / N_t * gini_R 221 | return N_L * gini_L + N_R * gini_R 222 | 223 | 224 | @numba.njit("f8(f8[:], f8[:], f8[:], i4[:], i4, i4, i4, f8[:], f8[:], f8[:])") 225 | def _impurity_entropy(Xj, y, sample_weight, samples, start_t, pos_t, end_t, 226 | count_L, count_R, out): 227 | n_classes = count_L.shape[0] 228 | 229 | _compute_counts(Xj, y, sample_weight, samples, start_t, pos_t, end_t, 230 | count_L, count_R, out) 231 | N_L = out[0] 232 | N_R = out[1] 233 | N_t = out[2] 234 | 235 | if N_L == 0 or N_R == 0: 236 | return DOUBLE_MAX 237 | 238 | ent_L = 0 239 | ent_R = 0 240 | for k in xrange(n_classes): 241 | proba_L = count_L[k] / N_t 242 | proba_R = count_R[k] / N_t 243 | 244 | if proba_L > 0: 245 | ent_L -= proba_L * np.log2(proba_L) 246 | 247 | if proba_R > 0: 248 | ent_R -= proba_R * np.log2(proba_R) 249 | 250 | #return float(N_L) / N_t * ent_L + float(N_R) / N_t * ent_R 251 | return N_L * ent_L + N_R * ent_R 252 | 253 | 254 | @numba.njit("void(f8[:,:], f8[:], f8[:], i4[:], i4[:], f8[:], i4, i4, i4, " 255 | "i4, f8[:], f8[:], f8[:])") 256 | def _best_split(X, y, sample_weight, samples, features, Xj, start_t, end_t, 257 | criterion, min_samples_leaf, count_L, count_R, out): 258 | best_imp = 
DOUBLE_MAX 259 | best_thresh = 0.0 260 | best_j = -1 261 | best_pos_t = -1 262 | N_L = 0.0 263 | N_R = 0.0 264 | N_t = 0.0 265 | value_L = 0.0 266 | value_R = 0.0 267 | 268 | size_t = end_t - start_t 269 | 270 | for j in features: 271 | 272 | for p in xrange(start_t, end_t): 273 | Xj[p] = X[samples[p], j] 274 | 275 | # Sort samples in nodes_t by their value for feature j. 276 | heapsort(Xj[start_t:end_t], samples[start_t:end_t], size_t) 277 | # FIXME: use introsort. 278 | 279 | for k in xrange(start_t, end_t - 1): 280 | pos_t = k + 1 281 | N_L = pos_t - start_t 282 | N_R = size_t - N_L 283 | 284 | if N_R < min_samples_leaf or N_L < min_samples_leaf: 285 | continue 286 | 287 | # Choose splitting threshold. 288 | # Any value between Xj[k+1] and Xj[k] is fine. 289 | Xj_diff = Xj[k + 1] - Xj[k] 290 | 291 | if Xj_diff == 0: 292 | continue 293 | 294 | thresh = Xj_diff / 2.0 + Xj[k] 295 | 296 | # FIXME: impurity can be computed efficiently from last 297 | # iteration. 298 | if criterion == MSE_CRITERION: 299 | imp = _impurity_mse(Xj, y, sample_weight, samples, start_t, 300 | pos_t, end_t, out) 301 | elif criterion == GINI_CRITERION: 302 | imp = _impurity_gini(Xj, y, sample_weight, samples, start_t, 303 | pos_t, end_t, count_L, count_R, out) 304 | else: 305 | imp = _impurity_entropy(Xj, y, sample_weight, samples, start_t, 306 | pos_t, end_t, count_L, count_R, out) 307 | 308 | if imp < best_imp: 309 | best_imp = imp 310 | best_thresh = thresh 311 | best_j = j 312 | best_pos_t = pos_t 313 | N_L = out[0] 314 | N_R = out[1] 315 | N_t = out[2] 316 | value_L = out[3] 317 | value_R = out[4] 318 | 319 | out[0] = N_L 320 | out[1] = N_R 321 | out[2] = N_t 322 | out[3] = value_L 323 | out[4] = value_R 324 | out[5] = best_thresh 325 | out[6] = best_j 326 | out[7] = best_pos_t 327 | 328 | if best_j != -1: 329 | # Reorder samples for the best split. 330 | for p in xrange(start_t, end_t): 331 | Xj[p] = X[samples[p], best_j] 332 | 333 | heapsort(Xj[start_t:end_t], samples[start_t:end_t], size_t) 334 | 335 | 336 | def _build_tree(X, y, sample_weight, criterion, max_features=None, 337 | max_depth=None, min_samples_split=2, min_samples_leaf=1, 338 | random_state=None): 339 | n_samples, n_features = X.shape 340 | 341 | tree = _Tree() 342 | node_t = 0 343 | samples = np.arange(n_samples).astype(np.int32) 344 | samples = samples[sample_weight > 0] 345 | features = np.arange(n_features).astype(np.int32) 346 | 347 | stack = _Stack() 348 | stack.push(start=0, end=len(samples), left=False, 349 | depth=0, n_samples=np.sum(sample_weight), 350 | parent=0, value=0) 351 | 352 | # Buffers 353 | Xj = np.zeros(n_samples, dtype=np.float64) 354 | out = np.zeros(8, dtype=np.float64) 355 | 356 | if criterion >= GINI_CRITERION: # Classification case 357 | enc = LabelEncoder() 358 | y = enc.fit_transform(y).astype(np.float64) 359 | # Arrays which will contain the number of samples in each class. 360 | count_L = np.zeros(len(enc.classes_), dtype=np.float64) 361 | count_R = np.zeros(len(enc.classes_), dtype=np.float64) 362 | else: 363 | count_L = np.zeros(0, dtype=np.float64) 364 | count_R = np.zeros(0, dtype=np.float64) 365 | 366 | while len(stack) > 0: 367 | # Pick node from the stack. 368 | start_t, end_t, left_t, depth_t, N_t, parent_t, value_t = stack.pop() 369 | 370 | if node_t > 0: 371 | # Adjust children node id of parent. 
372 | if left_t: 373 | tree.children_left[parent_t] = node_t 374 | else: 375 | tree.children_right[parent_t] = node_t 376 | 377 | size_t = end_t - start_t 378 | 379 | # Terminal node if max_depth or min_samples_split conditions are met. 380 | if depth_t == max_depth or size_t < min_samples_split: 381 | tree.add_terminal_node(value_t) 382 | node_t += 1 383 | continue 384 | 385 | # Find best split across all features. 386 | if max_features != n_features: 387 | random_state.shuffle(features) 388 | 389 | _best_split(X, y, sample_weight, samples, features[:max_features], Xj, 390 | start_t, end_t, criterion, min_samples_leaf, 391 | count_L, count_R, out) 392 | N_L, N_R, _, value_L, value_R, best_thresh, best_j, pos_t = out 393 | best_j = int(best_j) 394 | pos_t = int(pos_t) 395 | 396 | # No best split found: terminal node. 397 | if best_j == -1: 398 | tree.add_terminal_node(value_t) 399 | node_t += 1 400 | continue 401 | 402 | # Add node to the tree. 403 | tree.add_node(threshold=best_thresh, feature=best_j, value=value_t) 404 | 405 | # Add left and right children to the stack. 406 | stack.push(start=start_t, end=pos_t, left=True, depth=depth_t + 1, 407 | n_samples=N_L, parent=node_t, value=value_L) 408 | stack.push(start=pos_t, end=end_t, left=False, depth=depth_t + 1, 409 | n_samples=N_R, parent=node_t, value=value_R) 410 | 411 | node_t += 1 412 | 413 | if criterion >= GINI_CRITERION: 414 | values = np.array(tree.value, dtype=np.int32) 415 | tree.value = enc.inverse_transform(values) 416 | 417 | return tree.finalize() 418 | 419 | 420 | class _BaseTree(BaseEstimator): 421 | 422 | def _get_max_features(self, X): 423 | n_features = X.shape[1] 424 | 425 | if self.max_features is None: 426 | max_features = n_features 427 | elif isinstance(self.max_features, (numbers.Integral, np.integer)): 428 | max_features = self.max_features 429 | else: # float 430 | if self.max_features > 0.0: 431 | max_features = max(1, int(self.max_features * n_features)) 432 | else: 433 | raise ValueError("max_features should be positive!") 434 | 435 | return max_features 436 | 437 | 438 | class TreeClassifier(_BaseTree, ClassifierMixin): 439 | 440 | def __init__(self, criterion="gini", max_features=None, max_depth=None, 441 | min_samples_split=2, min_samples_leaf=1, random_state=None): 442 | self.criterion = criterion 443 | self.max_features = max_features 444 | self.max_depth = max_depth 445 | self.min_samples_split = min_samples_split 446 | self.min_samples_leaf = min_samples_leaf 447 | self.random_state = random_state 448 | 449 | def _get_criterion(self): 450 | return {"gini": GINI_CRITERION, 451 | "entropy": ENTROPY_CRITERION}[self.criterion] 452 | 453 | def fit(self, X, y, sample_weight=None): 454 | rng = check_random_state(self.random_state) 455 | 456 | if sample_weight is None: 457 | sample_weight = np.ones(X.shape[0], dtype=np.float64) 458 | 459 | self.tree_ = _build_tree(X, y, sample_weight, 460 | criterion=self._get_criterion(), 461 | max_features=self._get_max_features(X), 462 | max_depth=self.max_depth, 463 | min_samples_split=self.min_samples_split, 464 | min_samples_leaf=self.min_samples_leaf, 465 | random_state=rng) 466 | self.tree_.value = self.tree_.value.astype(np.int32) 467 | return self 468 | 469 | def predict(self, X): 470 | nodes = np.empty(X.shape[0], dtype=np.int32) 471 | _apply(X, self.tree_.feature, self.tree_.threshold, 472 | self.tree_.children_left, self.tree_.children_right, nodes) 473 | return self.tree_.value.take(nodes) 474 | 475 | 476 | class TreeRegressor(_BaseTree, RegressorMixin): 477 | 
478 | def __init__(self, max_features=None, max_depth=None, min_samples_split=2, 479 | min_samples_leaf=1, random_state=None): 480 | self.max_features = max_features 481 | self.max_depth = max_depth 482 | self.min_samples_split = min_samples_split 483 | self.min_samples_leaf = min_samples_leaf 484 | self.random_state = random_state 485 | 486 | def fit(self, X, y, sample_weight=None): 487 | rng = check_random_state(self.random_state) 488 | 489 | if sample_weight is None: 490 | sample_weight = np.ones(X.shape[0], dtype=np.float64) 491 | 492 | self.tree_ = _build_tree(X, y, sample_weight, 493 | criterion=MSE_CRITERION, 494 | max_features=self._get_max_features(X), 495 | max_depth=self.max_depth, 496 | min_samples_split=self.min_samples_split, 497 | min_samples_leaf=self.min_samples_leaf, 498 | random_state=rng) 499 | return self 500 | 501 | def predict(self, X): 502 | nodes = np.empty(X.shape[0], dtype=np.int32) 503 | _apply(X, self.tree_.feature, self.tree_.threshold, 504 | self.tree_.children_left, self.tree_.children_right, nodes) 505 | return self.tree_.value.take(nodes) 506 | --------------------------------------------------------------------------------
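A note on the data layout in the source above: _Tree stores the fitted tree as parallel flat arrays indexed by node id (threshold, feature, children_left, children_right, value), and _apply walks every sample from node 0 until it reaches a node whose children_left entry is TREE_LEAF. Below is a minimal pure-NumPy sketch of that routing on a hand-built one-split tree; the helper name apply_tree and the toy arrays are illustrative only, not part of the library.

import numpy as np

TREE_LEAF = -1
UNDEFINED = -2

def apply_tree(X, feature, threshold, children_left, children_right):
    # Pure-Python equivalent of _apply: return the leaf id reached by
    # every row of X.
    out = np.empty(X.shape[0], dtype=np.int32)
    for i in range(X.shape[0]):
        node = 0
        while children_left[node] != TREE_LEAF:  # internal node
            if X[i, feature[node]] <= threshold[node]:
                node = children_left[node]
            else:
                node = children_right[node]
        out[i] = node
    return out

# Hand-built stump: node 0 splits on feature 0 at threshold 0.5,
# node 1 is the left leaf (value 10.0), node 2 the right leaf (value 20.0).
feature        = np.array([0, UNDEFINED, UNDEFINED], dtype=np.int32)
threshold      = np.array([0.5, UNDEFINED, UNDEFINED], dtype=np.float64)
children_left  = np.array([1, TREE_LEAF, TREE_LEAF], dtype=np.int32)
children_right = np.array([2, TREE_LEAF, TREE_LEAF], dtype=np.int32)
value          = np.array([0.0, 10.0, 20.0])

X = np.array([[0.2], [0.9]])
leaves = apply_tree(X, feature, threshold, children_left, children_right)
print(value.take(leaves))  # -> [10. 20.]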
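_impurity_mse scores a candidate split by the weighted sum of squared errors of each child, using the identity sum w*(y - mean)**2 = sum w*y**2 - (sum w*y)**2 / sum w, and then normalizes by the total weight of the node. A vectorized sketch of the same quantity (the function name mse_split_impurity is made up for illustration):

import numpy as np

def mse_split_impurity(y, w, left_mask):
    # Mirrors _impurity_mse: within-child weighted squared error,
    # summed over the two children and divided by the node's total weight.
    def sse(y_c, w_c):
        n = w_c.sum()
        return (w_c * y_c ** 2).sum() - (w_c * y_c).sum() ** 2 / n
    return (sse(y[left_mask], w[left_mask]) +
            sse(y[~left_mask], w[~left_mask])) / w.sum()

y = np.array([1.0, 1.2, 3.0, 3.1])
w = np.ones_like(y)
# Splitting between the two clusters scores much better (lower) than
# splitting inside a cluster.
print(mse_split_impurity(y, w, np.array([True, True, False, False])))   # ~0.006
print(mse_split_impurity(y, w, np.array([True, False, False, False])))  # ~0.57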
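_best_split copies the current node's values for one feature into the Xj buffer, sorts the node's samples by that column with heapsort, and then scans every split position, skipping positions that would leave fewer than min_samples_leaf samples on either side or that fall between two equal feature values; the threshold is placed at the midpoint of the two neighbouring values. A compact sketch of that scan for a single feature under the MSE criterion with unit weights (the helper scan_feature is illustrative, and like the original it recomputes the impurity from scratch at every position):

import numpy as np

def scan_feature(xj, y, min_samples_leaf=1):
    # Scan all candidate thresholds on one feature column and return the
    # best (impurity, threshold) pair, mirroring the inner loop of _best_split.
    def sse(v):
        return ((v - v.mean()) ** 2).sum()
    order = np.argsort(xj)                    # _best_split uses heapsort here
    xs, ys = xj[order], y[order]
    best_imp, best_thresh = np.inf, None
    for pos in range(1, len(xs)):             # pos = size of the left child
        if pos < min_samples_leaf or len(xs) - pos < min_samples_leaf:
            continue
        if xs[pos] == xs[pos - 1]:            # no usable threshold here
            continue
        thresh = (xs[pos - 1] + xs[pos]) / 2.0
        imp = (sse(ys[:pos]) + sse(ys[pos:])) / len(xs)
        if imp < best_imp:
            best_imp, best_thresh = imp, thresh
    return best_imp, best_thresh

xj = np.array([0.1, 0.4, 0.35, 0.9, 0.8])
y = np.array([1.0, 1.1, 0.9, 3.0, 3.2])
print(scan_feature(xj, y))  # threshold lands between 0.4 and 0.8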
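Finally, TreeRegressor and TreeClassifier follow the scikit-learn fit/predict convention, so they drop into the usual workflow. A minimal usage sketch, assuming the package builds in your environment and that the estimators are importable from ivalice.impl.tree (adjust the import to match your checkout); the numba kernels are compiled for float64 inputs, hence the casts:

import numpy as np
from sklearn.datasets import make_classification, make_regression
from sklearn.metrics import accuracy_score, r2_score

# Assumed import path; adjust to wherever this module lives in your install.
from ivalice.impl.tree import TreeClassifier, TreeRegressor

# Regression: single tree grown with the MSE criterion.
X, y = make_regression(n_samples=200, n_features=10, random_state=0)
X, y = X.astype(np.float64), y.astype(np.float64)
reg = TreeRegressor(max_depth=5, min_samples_leaf=2, random_state=0)
reg.fit(X, y)
print("R2:", r2_score(y, reg.predict(X)))

# Classification: gini (default) or entropy criterion, integer class labels.
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X = X.astype(np.float64)
clf = TreeClassifier(criterion="entropy", max_depth=5, random_state=0)
clf.fit(X, y)
print("accuracy:", accuracy_score(y, clf.predict(X)))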