├── ivalice ├── __init__.py ├── impl │ ├── __init__.py │ ├── setup.py │ ├── tests │ │ ├── test_adaboost.py │ │ ├── test_lambda_mart.py │ │ ├── test_forest.py │ │ ├── test_sort.py │ │ ├── test_mcrank.py │ │ ├── test_gradient_boosting.py │ │ └── test_tree.py │ ├── adaboost.py │ ├── forest.py │ ├── lambda_mart.py │ ├── sort.py │ ├── mcrank.py │ ├── gradient_boosting.py │ └── tree.py ├── ranking.py ├── regression.py ├── classification.py └── setup.py ├── .gitignore ├── README.rst ├── benchmarks └── bench_rf.py ├── setup.py └── examples └── plot_gradient_boosting_classification.py /ivalice/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ivalice/impl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ivalice/ranking.py: -------------------------------------------------------------------------------- 1 | from .impl.lambda_mart import LambdaMART 2 | from .impl.mcrank import McRank 3 | from .impl.mcrank import OrdinalMcRank 4 | -------------------------------------------------------------------------------- /ivalice/regression.py: -------------------------------------------------------------------------------- 1 | from .impl.forest import RFRegressor 2 | from .impl.gradient_boosting import GBRegressor 3 | #from .impl.tree import TreeRegressor 4 | -------------------------------------------------------------------------------- /ivalice/classification.py: -------------------------------------------------------------------------------- 1 | from .impl.adaboost import AdaBoostClassifier 2 | from .impl.gradient_boosting import GBClassifier 3 | #from .impl.tree import TreeClassifier 4 | -------------------------------------------------------------------------------- /ivalice/setup.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import numpy 4 | 5 | 6 | def configuration(parent_package='', top_path=None): 7 | from numpy.distutils.misc_util import Configuration 8 | 9 | config = Configuration('ivalice', parent_package, top_path) 10 | 11 | config.add_subpackage('impl') 12 | 13 | return config 14 | 15 | if __name__ == '__main__': 16 | from numpy.distutils.core import setup 17 | setup(**configuration(top_path='').todict()) 18 | -------------------------------------------------------------------------------- /ivalice/impl/setup.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import numpy 4 | 5 | 6 | def configuration(parent_package='', top_path=None): 7 | from numpy.distutils.misc_util import Configuration 8 | 9 | config = Configuration('impl', parent_package, top_path) 10 | 11 | config.add_subpackage('tests') 12 | 13 | return config 14 | 15 | if __name__ == '__main__': 16 | from numpy.distutils.core import setup 17 | setup(**configuration(top_path='').todict()) 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *~ 4 | .#* 5 | *.swp 6 | .DS_Store 7 | build 8 | ivalice/**/*.html 9 | 10 | dist/ 11 | doc/_build/ 12 | doc/generated/ 13 | doc/auto_examples/ 14 | doc/modules/generated/ 15 | doc/datasets/generated/ 16 | pip-log.txt 17 | ivalice.egg-info/ 18 | .coverage 19 | coverage 20 | 
tags 21 | coverages.zip 22 | samples.zip 23 | doc/coverages.zip 24 | doc/samples.zip 25 | coverages 26 | samples 27 | doc/coverages 28 | doc/samples 29 | 30 | 31 | *.nt.bz2 32 | *.tar.gz 33 | *.tgz 34 | joblib 35 | -------------------------------------------------------------------------------- /ivalice/impl/tests/test_adaboost.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.datasets import make_classification 4 | from sklearn.tree import DecisionTreeClassifier 5 | from sklearn.utils.testing import assert_equal 6 | 7 | from ivalice.classification import AdaBoostClassifier 8 | 9 | X_bin, y_bin = make_classification(n_samples=200, n_classes=2, random_state=0) 10 | 11 | 12 | def test_adaboost_binary(): 13 | tree = DecisionTreeClassifier(max_depth=1, random_state=0) 14 | clf = AdaBoostClassifier(tree, n_estimators=10) 15 | clf.fit(X_bin, y_bin) 16 | assert_equal(clf.score(X_bin, y_bin), 0.96) 17 | -------------------------------------------------------------------------------- /ivalice/impl/tests/test_lambda_mart.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.datasets import load_diabetes 4 | from sklearn.tree import DecisionTreeRegressor 5 | from sklearn.utils.testing import assert_almost_equal 6 | 7 | from ivalice.ranking import LambdaMART 8 | 9 | data = load_diabetes() 10 | X, y = data.data, data.target 11 | y /= (y.max() - y.min()) 12 | 13 | 14 | def test_lambda_mart_ndcg(): 15 | for gains in ("linear", "exponential"): 16 | reg = DecisionTreeRegressor() 17 | lm = LambdaMART(reg, n_estimators=10, max_rank=10, gains=gains) 18 | lm.fit(X, y) 19 | ndcg = lm.score(X, y) 20 | assert_almost_equal(ndcg, 1.0) 21 | 22 | 23 | def test_lambda_mart_ndcg_all(): 24 | for gains in ("linear", "exponential"): 25 | reg = DecisionTreeRegressor() 26 | lm = LambdaMART(reg, n_estimators=10, max_rank=None, gains=gains) 27 | lm.fit(X, y) 28 | ndcg = lm.score(X, y) 29 | assert_almost_equal(ndcg, 1.0) 30 | -------------------------------------------------------------------------------- /ivalice/impl/tests/test_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.datasets import load_diabetes 4 | from sklearn.datasets import make_regression 5 | from sklearn.ensemble import RandomForestRegressor as skRF 6 | from sklearn.utils.testing import assert_almost_equal 7 | 8 | from ivalice.regression import RFRegressor 9 | 10 | diabetes = load_diabetes() 11 | X_d, y_d = diabetes.data, diabetes.target 12 | 13 | 14 | def test_regression(): 15 | rf = skRF(n_estimators=100, 16 | max_features=0.6, 17 | max_depth=3, 18 | bootstrap=False, 19 | random_state=0) 20 | rf.fit(X_d, y_d) 21 | y_pred = rf.predict(X_d) 22 | sk = np.mean((y_d - y_pred) ** 2) 23 | 24 | 25 | rf = RFRegressor(n_estimators=100, 26 | max_features=0.6, 27 | max_depth=3, 28 | bootstrap=False, 29 | random_state=0) 30 | 31 | rf.fit(X_d, y_d) 32 | y_pred = rf.predict(X_d) 33 | iv = np.mean((y_d - y_pred) ** 2) 34 | 35 | assert_almost_equal(sk, 2692.3, 1) 36 | assert_almost_equal(iv, 2689.9, 1) 37 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | ivalice 4 | ======= 5 | 6 | Boosting and ensemble learning library in Python. 
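
A quick example (a minimal sketch based on the usage in the test suite; any
scikit-learn regressor can serve as the base estimator)::

    from sklearn.datasets import load_diabetes
    from sklearn.tree import DecisionTreeRegressor

    from ivalice.regression import GBRegressor

    data = load_diabetes()
    X, y = data.data, data.target

    base = DecisionTreeRegressor(max_depth=3)
    reg = GBRegressor(base, n_estimators=100, learning_rate=0.1)
    reg.fit(X, y)
    y_pred = reg.predict(X)
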
7 | 8 | Algorithms supported: 9 | 10 | - Classification and regression trees (work in progress) 11 | - Random forests (work in progress) 12 | - Gradient Boosting 13 | - McRank 14 | - LambdaMART 15 | 16 | ivalice follows the `scikit-learn `_ API conventions. 17 | Computationally demanding parts are implemented using `Numba 18 | `_. 19 | 20 | Dependencies 21 | ------------ 22 | 23 | ivalice needs Python >= 2.7, setuptools, Numpy >= 1.3, SciPy >= 0.7, 24 | scikit-learn >= 0.15.1 and Numba >= 0.13.4. 25 | 26 | To run the tests you will also need nose >= 0.10. 27 | 28 | Installation 29 | ------------ 30 | 31 | To install ivalice from pip, type:: 32 | 33 | pip install https://github.com/mblondel/ivalice/archive/master.zip 34 | 35 | To install ivalice from source, type:: 36 | 37 | git clone https://github.com/mblondel/ivalice.git 38 | cd ivalice 39 | sudo python setup.py install 40 | 41 | On Github 42 | --------- 43 | 44 | https://github.com/mblondel/ivalice 45 | 46 | Author 47 | ------ 48 | 49 | Mathieu Blondel, 2014-present 50 | -------------------------------------------------------------------------------- /benchmarks/bench_rf.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import fetch_covtype 6 | from sklearn.cross_validation import train_test_split 7 | from sklearn.ensemble import RandomForestRegressor 8 | from sklearn.metrics import mean_squared_error 9 | 10 | from ivalice.regression import RFRegressor 11 | 12 | data = fetch_covtype(download_if_missing=True, shuffle=True, random_state=0) 13 | X, y = data.data, data.target 14 | 15 | n_samples = 10000 16 | mask = y <= 2 17 | Xb = X[mask][:n_samples] 18 | yb = y[mask][:n_samples] 19 | 20 | Xb_tr, Xb_te, yb_tr, yb_te = train_test_split(Xb, yb, train_size=0.75, 21 | test_size=0.2, random_state=0) 22 | 23 | rf = RandomForestRegressor(n_estimators=100, 24 | max_depth=3, 25 | max_features=0.6) 26 | start = time.time() 27 | rf.fit(Xb_tr, yb_tr) 28 | print "RandomForestRegressor" 29 | print time.time() - start, "seconds" 30 | y_pred = rf.predict(Xb_te) 31 | print mean_squared_error(yb_te, y_pred) 32 | print 33 | 34 | rf = RFRegressor(n_estimators=100, 35 | max_depth=3, 36 | max_features=0.6) 37 | start = time.time() 38 | rf.fit(Xb_tr, yb_tr) 39 | print "RandomForestRegressor" 40 | print time.time() - start, "seconds" 41 | y_pred = rf.predict(Xb_te) 42 | print mean_squared_error(yb_te, y_pred) 43 | -------------------------------------------------------------------------------- /ivalice/impl/tests/test_sort.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.utils.testing import assert_array_equal 4 | 5 | from ivalice.impl.sort import quicksort 6 | from ivalice.impl.sort import heapsort 7 | 8 | 9 | def test_quicksort(): 10 | rng = np.random.RandomState(0) 11 | values = rng.rand(500) 12 | indices = np.arange(len(values)).astype(np.int32) 13 | 14 | sorted_idx = np.argsort(values) 15 | sorted_values = values[sorted_idx] 16 | sorted_indices = indices[sorted_idx] 17 | 18 | quicksort(values, indices, 0, len(values) - 1) 19 | 20 | assert_array_equal(sorted_values, values) 21 | assert_array_equal(sorted_indices, indices) 22 | 23 | 24 | def test_quicksort_one(): 25 | values = np.arange(1).astype(np.float64) 26 | indices = np.arange(1).astype(np.int32) 27 | quicksort(values, indices, 0, len(values) - 1) 28 | 29 | 30 | def test_heapsort(): 31 | rng = np.random.RandomState(0) 
32 | values = rng.rand(500) 33 | indices = np.arange(len(values)).astype(np.int32) 34 | 35 | sorted_idx = np.argsort(values) 36 | sorted_values = values[sorted_idx] 37 | sorted_indices = indices[sorted_idx] 38 | 39 | heapsort(values, indices, len(values)) 40 | 41 | assert_array_equal(sorted_values, values) 42 | assert_array_equal(sorted_indices, indices) 43 | 44 | 45 | def test_heapsort_one(): 46 | values = np.arange(1).astype(np.float64) 47 | indices = np.arange(1).astype(np.int32) 48 | heapsort(values, indices, len(values)) 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # 3 | # Copyright (C) 2014 Mathieu Blondel 4 | 5 | import sys 6 | import os 7 | 8 | DISTNAME = 'ivalice' 9 | DESCRIPTION = "Boosting and ensemble learning library in Python." 10 | LONG_DESCRIPTION = open('README.rst').read() 11 | MAINTAINER = 'Mathieu Blondel' 12 | MAINTAINER_EMAIL = 'mathieu@mblondel.org' 13 | URL = 'https://github.com/mblondel/ivalice' 14 | LICENSE = 'new BSD' 15 | DOWNLOAD_URL = 'https://github.com/mblondel/ivalice' 16 | VERSION = '0.1-git' 17 | 18 | import setuptools # we are using a setuptools namespace 19 | from numpy.distutils.core import setup 20 | 21 | 22 | def configuration(parent_package='', top_path=None): 23 | if os.path.exists('MANIFEST'): 24 | os.remove('MANIFEST') 25 | 26 | from numpy.distutils.misc_util import Configuration 27 | config = Configuration(None, parent_package, top_path) 28 | 29 | config.add_subpackage('ivalice') 30 | 31 | return config 32 | 33 | if __name__ == "__main__": 34 | 35 | old_path = os.getcwd() 36 | local_path = os.path.dirname(os.path.abspath(sys.argv[0])) 37 | 38 | os.chdir(local_path) 39 | sys.path.insert(0, local_path) 40 | 41 | setup(configuration=configuration, 42 | name=DISTNAME, 43 | maintainer=MAINTAINER, 44 | include_package_data=True, 45 | maintainer_email=MAINTAINER_EMAIL, 46 | description=DESCRIPTION, 47 | license=LICENSE, 48 | url=URL, 49 | version=VERSION, 50 | download_url=DOWNLOAD_URL, 51 | long_description=LONG_DESCRIPTION, 52 | zip_safe=False, # the package can run out of an .egg file 53 | classifiers=[ 54 | 'Intended Audience :: Science/Research', 55 | 'Intended Audience :: Developers', 56 | 'License :: OSI Approved', 57 | 'Programming Language :: C', 58 | 'Programming Language :: Python', 59 | 'Topic :: Software Development', 60 | 'Topic :: Scientific/Engineering', 61 | 'Operating System :: Microsoft :: Windows', 62 | 'Operating System :: POSIX', 63 | 'Operating System :: Unix', 64 | 'Operating System :: MacOS' 65 | ] 66 | ) 67 | -------------------------------------------------------------------------------- /ivalice/impl/adaboost.py: -------------------------------------------------------------------------------- 1 | """AdaBoost""" 2 | 3 | # Author: Mathieu Blondel 4 | # License: BSD 3 clause 5 | 6 | import numpy as np 7 | 8 | from sklearn.base import BaseEstimator, ClassifierMixin, clone 9 | from sklearn.preprocessing import LabelBinarizer 10 | from sklearn.metrics import accuracy_score 11 | 12 | 13 | class AdaBoostClassifier(BaseEstimator, ClassifierMixin): 14 | 15 | def __init__(self, estimator, n_estimators=10): 16 | self.estimator = estimator 17 | self.n_estimators = n_estimators 18 | 19 | def fit(self, X, y): 20 | n_samples = X.shape[0] 21 | 22 | weights = np.ones(n_samples, dtype=np.float64) / n_samples 23 | 24 | self._lb = LabelBinarizer(neg_label=-1) 25 | y = 
self._lb.fit_transform(y).ravel() 26 | 27 | self.estimators_ = np.zeros(self.n_estimators, dtype=np.object) 28 | self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64) 29 | 30 | y_pred_ = np.zeros(n_samples, dtype=np.float64) 31 | 32 | for it in xrange(self.n_estimators): 33 | est = clone(self.estimator) 34 | est = est.fit(X, y, sample_weight=weights) 35 | 36 | y_pred = est.predict(X) 37 | err = 1 - accuracy_score(y, y_pred, sample_weight=weights) 38 | 39 | if err == 0: 40 | self.estimator_weights_[it] = 1 41 | self.estimators_[it] = est 42 | break 43 | 44 | alpha = 0.5 * np.log((1 - err) / err) 45 | 46 | #weights *= np.exp(- alpha * y * y_pred) 47 | #weights /= weights.sum() 48 | 49 | y_pred_ += alpha * y_pred 50 | weights = np.exp(-y * y_pred_) 51 | #weights = 1.0 / (1 + np.exp(y * y_pred_)) # logit boost 52 | weights /= weights.sum() 53 | 54 | self.estimator_weights_[it] = alpha 55 | self.estimators_[it] = est 56 | 57 | 58 | return self 59 | 60 | def predict(self, X): 61 | y_pred = np.zeros(X.shape[0], dtype=np.float64) 62 | for it in xrange(self.n_estimators): 63 | if self.estimator_weights_[it] != 0: 64 | pred = self.estimators_[it].predict(X) 65 | y_pred += self.estimator_weights_[it] * pred 66 | y_pred = np.sign(y_pred) 67 | return self._lb.inverse_transform(y_pred.reshape(-1, 1)) 68 | -------------------------------------------------------------------------------- /examples/plot_gradient_boosting_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | ================================ 3 | Gradient boosting classification 4 | ================================ 5 | 6 | This example compares the squared hinge and log losses in gradient boosting. 7 | """ 8 | 9 | print __doc__ 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | 14 | from sklearn.datasets import load_iris 15 | from sklearn.cross_validation import train_test_split 16 | from sklearn.tree import DecisionTreeClassifier 17 | 18 | from ivalice.classification import GBClassifier 19 | 20 | n_estimators = 10 21 | 22 | class Callback(object): 23 | 24 | def __init__(self, X_tr, y_tr, X_te, y_te): 25 | self.X_tr = X_tr 26 | self.y_tr = y_tr 27 | self.X_te = X_te 28 | self.y_te = y_te 29 | self.accuracy_tr = [] 30 | self.accuracy_te = [] 31 | 32 | def __call__(self, est): 33 | y_pred_tr = est.predict(X_tr) 34 | y_pred_te = est.predict(X_te) 35 | self.accuracy_tr.append(np.mean(self.y_tr == y_pred_tr)) 36 | self.accuracy_te.append(np.mean(self.y_te == y_pred_te)) 37 | 38 | data = load_iris() 39 | 40 | X_tr, X_te, y_tr, y_te = train_test_split(data.data, data.target, 41 | train_size=0.5, test_size=0.5, 42 | random_state=0) 43 | 44 | tree = DecisionTreeClassifier(max_depth=1) # decision stumps 45 | 46 | estimators = ( 47 | ("squared hinge", "b", GBClassifier(tree, n_estimators=n_estimators, 48 | step_size="constant", learning_rate=0.1, 49 | loss="squared_hinge")), 50 | 51 | ("log", "g", GBClassifier(tree, n_estimators=n_estimators, 52 | step_size="constant", learning_rate=0.1, loss="log")), 53 | 54 | ) 55 | 56 | it = np.arange(n_estimators) + 1 57 | 58 | for name, color, clf in estimators: 59 | clf.callback = Callback(X_tr, y_tr, X_te, y_te) 60 | clf.fit(X_tr, y_tr) 61 | 62 | plt.plot(it, clf.callback.accuracy_tr, label=name + " (train)", color=color, 63 | linestyle="-", linewidth=2) 64 | plt.plot(it, clf.callback.accuracy_te, label=name + " (test)", color=color, 65 | linestyle="--", linewidth=2) 66 | 67 | plt.xlabel("Boosting iteration") 68 | 
plt.ylabel("Accuracy") 69 | plt.legend(loc="lower right") 70 | 71 | plt.show() 72 | -------------------------------------------------------------------------------- /ivalice/impl/forest.py: -------------------------------------------------------------------------------- 1 | """Random Forests""" 2 | 3 | # Author: Mathieu Blondel 4 | # License: BSD 3 clause 5 | 6 | import numpy as np 7 | 8 | from sklearn.base import BaseEstimator, clone 9 | from sklearn.base import RegressorMixin 10 | from sklearn.utils import check_random_state 11 | 12 | MAX_INT = np.iinfo(np.int32).max 13 | 14 | 15 | def _fit_random_tree(tree, X, y, sample_weight, bootstrap, rng): 16 | if bootstrap: 17 | n_samples = X.shape[0] 18 | if sample_weight is None: 19 | sample_weight = np.ones((n_samples,), dtype=np.float64) 20 | else: 21 | sample_weight = sample_weight.copy() 22 | 23 | indices = rng.randint(0, n_samples, n_samples) 24 | sample_counts = np.bincount(indices, minlength=n_samples) 25 | sample_weight *= sample_counts 26 | 27 | tree.fit(X, y, sample_weight=sample_weight) 28 | tree.indices_ = sample_counts > 0. 29 | 30 | else: 31 | tree.fit(X, y, sample_weight=sample_weight) 32 | 33 | 34 | class _BaseRF(BaseEstimator): 35 | 36 | def _fit(self, X, y, sample_weight, tree): 37 | rng = check_random_state(self.random_state) 38 | self.estimators_ = [] 39 | 40 | 41 | for k in xrange(self.n_estimators): 42 | tree = clone(tree) 43 | tree.set_params(random_state=rng.randint(MAX_INT)) 44 | _fit_random_tree(tree, X, y, sample_weight, self.bootstrap, rng) 45 | self.estimators_.append(tree) 46 | 47 | return self 48 | 49 | 50 | class RFRegressor(_BaseRF, RegressorMixin): 51 | 52 | def __init__(self, n_estimators=10, max_features=None, max_depth=None, 53 | min_samples_split=2, min_samples_leaf=1, bootstrap=True, 54 | random_state=None): 55 | self.n_estimators = n_estimators 56 | self.max_features = max_features 57 | self.max_depth = max_depth 58 | self.min_samples_split = min_samples_split 59 | self.min_samples_leaf = min_samples_leaf 60 | self.bootstrap = bootstrap 61 | self.random_state = random_state 62 | 63 | def fit(self, X, y, sample_weight=None): 64 | X = np.array(X, dtype=np.float64) 65 | y = np.array(y, dtype=np.float64) 66 | 67 | from .tree import TreeRegressor 68 | 69 | tree = TreeRegressor(max_features=self.max_features, 70 | max_depth=self.max_depth, 71 | min_samples_split=self.min_samples_split, 72 | min_samples_leaf=self.min_samples_leaf) 73 | return self._fit(X, y, sample_weight, tree) 74 | 75 | def predict(self, X): 76 | pred = np.array([tree.predict(X) for tree in self.estimators_]) 77 | return np.mean(pred, axis=0) 78 | -------------------------------------------------------------------------------- /ivalice/impl/tests/test_mcrank.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.datasets import load_diabetes 4 | from sklearn.ensemble import GradientBoostingClassifier 5 | from sklearn.utils.testing import assert_almost_equal 6 | from sklearn.utils.testing import assert_equal 7 | 8 | from ivalice.ranking import McRank 9 | from ivalice.ranking import OrdinalMcRank 10 | 11 | 12 | bunch = load_diabetes() 13 | X, y = bunch.data, bunch.target 14 | y = np.round(y, decimals=-2) 15 | 16 | 17 | def test_mcrank(): 18 | gb = GradientBoostingClassifier(n_estimators=10, 19 | loss="deviance", 20 | random_state=0) 21 | mc = McRank(gb) 22 | mc.fit(X, y) 23 | assert_almost_equal(mc.score(X, y), 48.08, 2) 24 | 25 | 26 | def 
test_mcrank_set_estimator_params(): 27 | gb = GradientBoostingClassifier(n_estimators=5, 28 | loss="deviance", 29 | random_state=0) 30 | mc = McRank(gb) 31 | mc.set_params(estimator__n_estimators=10) 32 | assert_equal(gb.n_estimators, 10) 33 | 34 | 35 | def test_mcrank_warm_start(): 36 | gb = GradientBoostingClassifier(n_estimators=5, 37 | loss="deviance", 38 | warm_start=True, 39 | random_state=0) 40 | mc = McRank(gb) 41 | mc.fit(X, y) 42 | assert_almost_equal(mc.score(X, y), 56.06, 1) 43 | 44 | mc.set_params(estimator__n_estimators=10) 45 | mc.fit(X, y) 46 | assert_almost_equal(mc.score(X, y), 48.08, 2) 47 | 48 | 49 | def test_ordinal_mcrank(): 50 | gb = GradientBoostingClassifier(n_estimators=10, 51 | loss="deviance", 52 | random_state=0) 53 | mc = OrdinalMcRank(gb) 54 | mc.fit(X, y) 55 | assert_almost_equal(mc.score(X, y), 48.62, 2) 56 | 57 | 58 | def test_ordinal_mcrank_set_estimator_params(): 59 | gb = GradientBoostingClassifier(n_estimators=5, 60 | loss="deviance", 61 | random_state=0) 62 | mc = OrdinalMcRank(gb) 63 | mc.set_params(estimator__n_estimators=10) 64 | assert_equal(gb.n_estimators, 10) 65 | 66 | 67 | def test_ordinal_mcrank_warm_start(): 68 | gb = GradientBoostingClassifier(n_estimators=5, 69 | loss="deviance", 70 | warm_start=True, 71 | random_state=0) 72 | 73 | mc = OrdinalMcRank(gb) 74 | mc.fit(X, y) 75 | assert_almost_equal(mc.score(X, y), 56.35, 2) 76 | 77 | mc.set_params(estimator__n_estimators=10) 78 | mc.fit(X, y) 79 | assert_almost_equal(mc.score(X, y), 48.62, 2) 80 | -------------------------------------------------------------------------------- /ivalice/impl/lambda_mart.py: -------------------------------------------------------------------------------- 1 | """LambdaMART""" 2 | 3 | # Author: Mathieu Blondel 4 | # License: BSD 3 clause 5 | 6 | import numpy as np 7 | import numba 8 | 9 | from .gradient_boosting import _BaseGB, _MeanEstimator 10 | 11 | 12 | @numba.njit("void(f8[:], f8[:], f8[:], f8, i4, f8[:])") 13 | def _negative_gradient(y, y_pred, c, idcg, max_rank, g): 14 | n_samples = y.shape[0] 15 | 16 | for i in xrange(max_rank): 17 | for j in xrange(i + 1, n_samples): 18 | S = np.sign(y[i] - y[j]) 19 | 20 | if S == 0: 21 | continue 22 | 23 | score_diff = y_pred[i] - y_pred[j] 24 | 25 | diff = y[j] * (c[i] - c[j]) + y[i] * (c[j] - c[i]) 26 | ndcg_diff = abs(diff / idcg) 27 | 28 | if ndcg_diff == 0: 29 | continue 30 | 31 | rho = 1.0 / (1.0 + np.exp(S * score_diff)) 32 | #rho = expit(-S * score_diff) 33 | g[i] += S * ndcg_diff * rho 34 | g[j] -= S * ndcg_diff * rho 35 | 36 | 37 | def _dcg_score(y_true, y_score, max_rank=10, gains="exponential"): 38 | order = np.lexsort((y_true, -y_score)) 39 | 40 | if max_rank is not None: 41 | order = order[:max_rank] 42 | 43 | y_true = np.take(y_true, order) 44 | 45 | if gains == "exponential": 46 | gains = 2 ** y_true - 1 47 | elif gains == "linear": 48 | gains = y_true 49 | else: 50 | raise ValueError("Invalid gains option.") 51 | 52 | # highest rank is 1 so +2 instead of +1 53 | discounts = np.log2(np.arange(len(y_true)) + 2) 54 | return np.sum(gains / discounts) 55 | 56 | 57 | def _ndcg_score(y_true, y_score, max_rank=10, gains="exponential"): 58 | best = _dcg_score(y_true, y_true, max_rank, gains) 59 | actual = _dcg_score(y_true, y_score, max_rank, gains) 60 | return actual / best 61 | 62 | 63 | class _NDCGLoss(object): 64 | 65 | def __init__(self, max_rank=10): 66 | self.max_rank = max_rank 67 | 68 | def init_estimator(self): 69 | return _MeanEstimator() 70 | 71 | def negative_gradient(self, y, y_pred): 72 | 
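        # Each pair (i, j) with y[i] != y[j] contributes
        #     S * |delta NDCG_ij| * sigmoid(-S * (s_i - s_j))
        # to g[i] (and the opposite sign to g[j]), where S = sign(y[i] - y[j])
        # and s_i, s_j are the current scores; these are LambdaMART's
        # "lambdas", accumulated by _negative_gradient above on arrays sorted
        # by decreasing score and mapped back to the original order at the end.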
n_samples = y.shape[0] 73 | 74 | max_rank = self.max_rank if self.max_rank is not None else n_samples 75 | 76 | #order = np.argsort(y_pred)[::-1] 77 | order = np.lexsort((y, -y_pred)) 78 | y = np.take(y, order) 79 | y_pred = np.take(y_pred, order) 80 | 81 | ind = np.arange(n_samples) 82 | c = 1. / np.log2(ind + 2) # discount factors 83 | c[max_rank:] = 0 84 | 85 | g = np.zeros(n_samples, dtype=np.float64) 86 | 87 | y_sorted = np.sort(y)[::-1] 88 | idcg = np.sum(y_sorted * c) 89 | 90 | _negative_gradient(y, y_pred, c, idcg, max_rank, g) 91 | 92 | if np.any(np.isnan(g)): 93 | print "g contains NaNs" 94 | 95 | inv_ix = np.empty_like(order) 96 | inv_ix[order] = np.arange(len(order)) 97 | g = g[inv_ix] 98 | 99 | return g 100 | 101 | 102 | class LambdaMART(_BaseGB): 103 | 104 | def __init__(self, estimator, n_estimators=100, learning_rate=1.0, 105 | loss="ndcg", max_rank=10, gains="exponential", 106 | subsample=1.0, callback=None, random_state=None): 107 | self.estimator = estimator 108 | self.n_estimators = n_estimators 109 | self.learning_rate = learning_rate 110 | self.loss = loss 111 | self.max_rank = max_rank 112 | self.gains = gains 113 | self.subsample = subsample 114 | self.callback = callback 115 | self.random_state = random_state 116 | 117 | def _get_loss(self): 118 | losses = dict(ndcg=_NDCGLoss(max_rank=self.max_rank)) 119 | return losses[self.loss] 120 | 121 | def fit(self, X, y): 122 | if self.gains == "exponential": 123 | y = 2 ** y - 1 124 | 125 | return super(LambdaMART, self).fit(X, y) 126 | 127 | def score(self, X, y): 128 | y_pred = self.predict(X) 129 | return _ndcg_score(y, y_pred, max_rank=self.max_rank, gains=self.gains) 130 | -------------------------------------------------------------------------------- /ivalice/impl/sort.py: -------------------------------------------------------------------------------- 1 | """Efficient sorting routines""" 2 | 3 | # Authors: Jake Vanderplas (quicksort) 4 | # Lars Buitinck (heapsort) 5 | # Mathieu Blondel (Numba port) 6 | # License: BSD 3 clause 7 | 8 | import numba 9 | 10 | 11 | @numba.njit("void(f8[:], i4[:], i4, i4)") 12 | def _dual_swap(values, indices, i1, i2): 13 | dtmp = values[i1] 14 | values[i1] = values[i2] 15 | values[i2] = dtmp 16 | 17 | itmp = indices[i1] 18 | indices[i1] = indices[i2] 19 | indices[i2] = itmp 20 | 21 | 22 | @numba.njit("i4(f8[:], i4, i4)") 23 | def _median3(values, start, end): 24 | # Median of three pivot selection, after Bentley and McIlroy (1993). 25 | # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. 
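    # Worked example: for values = [3., 1., 2.] with start=0 and end=2,
    # mid = 1, so (a, b, c) = (3., 1., 2.); a < b fails, b < c holds and
    # a < c fails, so the function returns end -- index 2, whose value 2.0
    # is indeed the median of the three candidates.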
26 | size = end - start + 1 27 | mid = start + size / 2 28 | 29 | a = values[start] 30 | b = values[mid] 31 | c = values[end] 32 | 33 | if a < b: 34 | if b < c: 35 | return mid 36 | elif a < c: 37 | return end 38 | else: 39 | return start 40 | elif b < c: 41 | if a < c: 42 | return start 43 | else: 44 | return end 45 | else: 46 | return mid 47 | 48 | 49 | @numba.njit("i4(f8[:], i4[:], i4, i4)") 50 | def _partition(values, indices, start, end): 51 | #pivot_idx = start + (end - start + 1) / 2 52 | pivot_idx = _median3(values, start, end) 53 | _dual_swap(values, indices, start, pivot_idx) 54 | pivot = values[start] 55 | i = start + 1 56 | j = start + 1 57 | 58 | while j <= end: 59 | if values[j] <= pivot: 60 | _dual_swap(values, indices, i, j) 61 | i += 1 62 | j += 1 63 | 64 | _dual_swap(values, indices, start, i - 1) 65 | 66 | return i - 1 67 | 68 | 69 | @numba.njit("void(f8[:], i4[:], i4)") 70 | def _sort2(values, indices, start): 71 | end = start + 1 72 | if values[start] > values[end]: 73 | _dual_swap(values, indices, start, end) 74 | 75 | 76 | @numba.njit("void(f8[:], i4[:], i4)") 77 | def _sort3(values, indices, start): 78 | mid = start + 1 79 | end = start + 2 80 | if values[start] > values[mid]: 81 | _dual_swap(values, indices, start, mid) 82 | if values[mid] > values[end]: 83 | _dual_swap(values, indices, mid, end) 84 | if values[start] > values[mid]: 85 | _dual_swap(values, indices, start, mid) 86 | 87 | 88 | # As of Numba v0.13.2, recursion is not supported in Numba yet. 89 | @numba.jit("void(f8[:], i4[:], i4, i4)") 90 | def quicksort(values, indices, start, end): 91 | size = end - start + 1 92 | 93 | if size == 2: 94 | _sort2(values, indices, start) 95 | elif size == 3: 96 | _sort3(values, indices, start) 97 | if size > 1: 98 | i = _partition(values, indices, start, end) 99 | quicksort(values, indices, start, i - 1) 100 | quicksort(values, indices, i + 1, end) 101 | 102 | 103 | @numba.njit("void(f8[:], i4[:], i4, i4)") 104 | def _sift_down(values, indices, start, end): 105 | # Restore heap order in Xf[start:end] by moving the max element to start. 106 | 107 | root = start 108 | while True: 109 | child = root * 2 + 1 110 | 111 | # find max of root, left child, right child 112 | maxind = root 113 | if child < end and values[maxind] < values[child]: 114 | maxind = child 115 | if child + 1 < end and values[maxind] < values[child + 1]: 116 | maxind = child + 1 117 | 118 | if maxind == root: 119 | break 120 | else: 121 | _dual_swap(values, indices, root, maxind) 122 | root = maxind 123 | 124 | 125 | @numba.njit("void(f8[:], i4[:], i4)") 126 | def heapsort(values, indices, size): 127 | if size > 1: 128 | # Heapify. 129 | start = (size - 2) / 2 130 | end = size 131 | while True: 132 | _sift_down(values, indices, start, end) 133 | if start == 0: 134 | break 135 | start -= 1 136 | 137 | # Sort by shrinking the heap, putting the max element 138 | # immediately after it. 
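        # Worked example: starting from the max-heap [5., 2., 3.], the first
        # pass swaps the root with the last slot -> [3., 2., 5.] and sifts the
        # new root down (already in place); the second pass swaps positions 0
        # and 1 -> [2., 3., 5.], which is fully sorted.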
139 | end = size - 1 140 | while end > 0: 141 | _dual_swap(values, indices, 0, end) 142 | _sift_down(values, indices, 0, end) 143 | end -= 1 144 | -------------------------------------------------------------------------------- /ivalice/impl/mcrank.py: -------------------------------------------------------------------------------- 1 | """McRank""" 2 | 3 | # Author: Mathieu Blondel 4 | # License: BSD 3 clause 5 | 6 | import numpy as np 7 | 8 | from sklearn.base import BaseEstimator, clone 9 | from sklearn.ensemble import GradientBoostingClassifier 10 | from sklearn.preprocessing import LabelEncoder 11 | 12 | 13 | DEFAULT_CLF = GradientBoostingClassifier(loss="deviance") 14 | 15 | 16 | class _BaseMcRank(BaseEstimator): 17 | 18 | def __init__(self, estimator=DEFAULT_CLF): 19 | self.estimator = estimator 20 | 21 | @property 22 | def classes_(self): 23 | return self._label_encoder.classes_ 24 | 25 | def score(self, X, y): 26 | y_pred = self.predict(X) 27 | return np.mean(np.abs(y - y_pred)) 28 | 29 | def predict(self, X): 30 | """Predict expected target value for X. 31 | 32 | Parameters 33 | ---------- 34 | X : array-like of shape = [n_samples, n_features] 35 | The input samples. 36 | 37 | Returns 38 | ------- 39 | p : array of shape = [n_samples] 40 | """ 41 | n_samples = X.shape[0] 42 | n_classes = len(self.classes_) 43 | proba = self.predict_proba(X) 44 | classes = np.repeat(self.classes_, n_samples) 45 | classes = classes.reshape(n_classes, n_samples).T 46 | # pred[i] = \sum_m P(y_i = m) * m 47 | return np.average(classes, axis=1, weights=proba) 48 | 49 | def _get_estimator_params(self, **params): 50 | est_params = {} 51 | for key, value in params.items(): 52 | if key.startswith("estimator__"): 53 | key = key.replace("estimator__", "") 54 | est_params[key] = value 55 | return est_params 56 | 57 | 58 | class McRank(_BaseMcRank): 59 | 60 | def set_params(self, **params): 61 | super(McRank, self).set_params(**params) 62 | 63 | est_params = self._get_estimator_params(**params) 64 | 65 | if hasattr(self, "estimator_") and len(est_params) > 0: 66 | self.estimator_.set_params(**est_params) 67 | 68 | def fit(self, X, y): 69 | self._label_encoder = LabelEncoder() 70 | y = self._label_encoder.fit_transform(y) 71 | 72 | if not hasattr(self, "estimator_") or \ 73 | not getattr(self.estimator, "warm_start", False): 74 | self.estimator_ = clone(self.estimator) 75 | 76 | self.estimator_.fit(X, y) 77 | 78 | return self 79 | 80 | def predict_proba(self, X): 81 | """Predict class probabilities for X. 82 | 83 | Parameters 84 | ---------- 85 | X : array-like of shape = [n_samples, n_features] 86 | The input samples. 87 | 88 | Returns 89 | ------- 90 | p : array of shape = [n_samples, n_classes] 91 | The class probabilities of the input samples. The order of the 92 | classes corresponds to that in the attribute `classes_`. 
93 | """ 94 | return self.estimator_.predict_proba(X) 95 | 96 | 97 | class OrdinalMcRank(_BaseMcRank): 98 | 99 | def _fit(self, X, y, m, est): 100 | cond = y <= m 101 | y_bin = y.copy() 102 | y_bin[cond] = 0 103 | y_bin[~cond] = 1 104 | est.fit(X, y_bin) 105 | 106 | def set_params(self, **params): 107 | super(OrdinalMcRank, self).set_params(**params) 108 | 109 | est_params = self._get_estimator_params(**params) 110 | 111 | if hasattr(self, "estimators_") and len(est_params) > 0: 112 | for est in self.estimators_: 113 | est.set_params(**est_params) 114 | 115 | def fit(self, X, y): 116 | self._label_encoder = LabelEncoder() 117 | y = self._label_encoder.fit_transform(y) 118 | 119 | n_classifiers = len(self.classes_) - 1 120 | 121 | if not hasattr(self, "estimators_") or \ 122 | not getattr(self.estimator, "warm_start", False): 123 | self.estimators_ = [clone(self.estimator) 124 | for m in xrange(n_classifiers)] 125 | 126 | for m in xrange(n_classifiers): 127 | self._fit(X, y, m, self.estimators_[m]) 128 | 129 | return self 130 | 131 | def predict_proba(self, X): 132 | """Predict class probabilities for X. 133 | 134 | Parameters 135 | ---------- 136 | X : array-like of shape = [n_samples, n_features] 137 | The input samples. 138 | 139 | Returns 140 | ------- 141 | p : array of shape = [n_samples, n_classes] 142 | The class probabilities of the input samples. The order of the 143 | classes corresponds to that in the attribute `classes_`. 144 | """ 145 | n_samples = X.shape[0] 146 | n_classes = len(self.classes_) 147 | 148 | # 2d array of shape (n_classes-1) x n_samples containing 149 | # cumulative probabilities P(y_i <= k) 150 | P = np.array([e.predict_proba(X)[:, 0] for e in self.estimators_]) 151 | 152 | # 2d array of shape n_classes x n_samples containing 153 | # cumulative probabilities P(y_i <= k) 154 | P = np.vstack((P, np.ones(n_samples))) 155 | 156 | proba = np.zeros((n_samples, n_classes), dtype=np.float64) 157 | 158 | proba[:, 0] = P[0] # P(y = 0) = P(y <= 0) 159 | 160 | for m in xrange(1, n_classes): 161 | proba[:, m] = P[m] - P[m - 1] # P(y = m) = P(y <= m) - P(y <= m - 1) 162 | 163 | return proba 164 | -------------------------------------------------------------------------------- /ivalice/impl/tests/test_gradient_boosting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.datasets import load_diabetes 4 | from sklearn.datasets import load_iris 5 | from sklearn.datasets import load_linnerud 6 | from sklearn.tree import DecisionTreeClassifier 7 | from sklearn.tree import DecisionTreeRegressor 8 | from sklearn.ensemble import GradientBoostingRegressor 9 | from sklearn.cross_validation import train_test_split 10 | from sklearn.metrics import r2_score 11 | from sklearn.svm import SVR 12 | 13 | from sklearn.utils.testing import assert_almost_equal 14 | from sklearn.utils.testing import assert_array_almost_equal 15 | 16 | from ivalice.classification import GBClassifier 17 | from ivalice.regression import GBRegressor 18 | 19 | bunch = load_diabetes() 20 | X, y = bunch.data, bunch.target 21 | 22 | X_tr, X_te, y_tr, y_te = train_test_split(X, y, 23 | train_size=0.75, 24 | test_size=0.25, 25 | random_state=0) 26 | 27 | iris = load_iris() 28 | cond = iris.target <= 1 29 | X_bin, y_bin = iris.data[cond], iris.target[cond] 30 | 31 | X_bin_tr, X_bin_te, y_bin_tr, y_bin_te = train_test_split(X_bin, y_bin, 32 | train_size=0.75, 33 | test_size=0.25, 34 | random_state=0) 35 | 36 | iris = load_iris() 37 | X_mult, y_mult 
= iris.data, iris.target 38 | 39 | X_mult_tr, X_mult_te, y_mult_tr, y_mult_te = train_test_split(X_mult, y_mult, 40 | train_size=0.75, 41 | test_size=0.25, 42 | random_state=0) 43 | 44 | 45 | def test_squared_loss(): 46 | reg = GradientBoostingRegressor(learning_rate=0.1, max_depth=3, 47 | random_state=0) 48 | reg.fit(X_tr, y_tr) 49 | y_pred = reg.predict(X_te) 50 | sk = np.sqrt(np.mean((y_pred - y_te) ** 2)) 51 | 52 | reg = DecisionTreeRegressor(max_features=1.0, max_depth=3, random_state=0) 53 | reg = GBRegressor(reg, n_estimators=100, learning_rate=0.1) 54 | reg.fit(X_tr, y_tr) 55 | y_pred = reg.predict(X_te) 56 | iv = np.sqrt(np.mean((y_pred - y_te) ** 2)) 57 | 58 | assert_almost_equal(sk, iv, 0) 59 | 60 | 61 | def test_squared_loss_svr(): 62 | reg = SVR(kernel="rbf", gamma=10) 63 | reg = GBRegressor(reg, n_estimators=10) 64 | reg.fit(X_tr, y_tr) 65 | y_pred = reg.predict(X_tr) 66 | assert_almost_equal(np.mean((y_tr - y_pred) ** 2), 3778.3, 1) 67 | 68 | 69 | def test_absolute_loss(): 70 | # Check absolute loss with scikit-learn implementation. 71 | reg = GradientBoostingRegressor(learning_rate=0.1, loss="lad", 72 | random_state=0) 73 | reg.fit(X_tr, y_tr) 74 | y_pred = reg.predict(X_te) 75 | sk = np.mean(np.abs(y_pred - y_te)) 76 | 77 | reg = DecisionTreeRegressor(max_features=1.0, max_depth=3, random_state=0) 78 | reg = GBRegressor(reg, n_estimators=100, learning_rate=0.1, loss="absolute") 79 | reg.fit(X_tr, y_tr) 80 | y_pred = reg.predict(X_te) 81 | iv = np.mean(np.abs(y_pred - y_te)) 82 | 83 | assert_almost_equal(sk, iv, 0) 84 | 85 | 86 | def test_absolute_loss_constant(): 87 | # Check absolute loss with scikit-learn implementation. 88 | reg = DecisionTreeRegressor(max_features=1.0, max_depth=3, random_state=0) 89 | reg = GBRegressor(reg, n_estimators=100, learning_rate=0.1, loss="absolute", 90 | step_size="constant") 91 | reg.fit(X_tr, y_tr) 92 | y_pred = reg.predict(X_te) 93 | iv = np.mean(np.abs(y_pred - y_te)) 94 | 95 | assert_almost_equal(iv, 55.6, 1) 96 | 97 | 98 | def test_subsample(): 99 | reg = DecisionTreeRegressor(max_features=1.0, max_depth=3, 100 | random_state=0) 101 | reg = GBRegressor(reg, n_estimators=100, learning_rate=0.1, subsample=0.6, 102 | random_state=0) 103 | reg.fit(X_tr, y_tr) 104 | y_pred = reg.predict(X_te) 105 | mse = np.sqrt(np.mean((y_pred - y_te) ** 2)) 106 | assert_almost_equal(mse, 62.9, 1) 107 | 108 | 109 | def test_squared_hinge_loss(): 110 | # With line search. 111 | clf = DecisionTreeClassifier(max_features=1.0, max_depth=3) 112 | clf = GBClassifier(clf, n_estimators=10, step_size="line_search") 113 | clf.fit(X_bin_tr, y_bin_tr) 114 | assert_almost_equal(clf.score(X_bin_te, y_bin_te), 1.0) 115 | 116 | # With constant step size. 117 | clf = DecisionTreeClassifier(max_features=1.0, max_depth=3) 118 | clf = GBClassifier(clf, n_estimators=10, step_size="constant", 119 | learning_rate=0.1) 120 | clf.fit(X_bin_te, y_bin_te) 121 | assert_almost_equal(clf.score(X_bin_te, y_bin_te), 1.0) 122 | 123 | 124 | def test_squared_hinge_loss_ovr(): 125 | # With line search. 126 | clf = DecisionTreeClassifier(max_features=1.0, max_depth=3) 127 | clf = GBClassifier(clf, n_estimators=10, step_size="line_search") 128 | clf.fit(X_mult_tr, y_mult_tr) 129 | assert_almost_equal(clf.score(X_mult_te, y_mult_te), 0.974, 3) 130 | 131 | # With constant step size. 
132 | clf = DecisionTreeClassifier(max_features=1.0, max_depth=3) 133 | clf = GBClassifier(clf, n_estimators=10, step_size="constant", 134 | learning_rate=0.1) 135 | clf.fit(X_mult_te, y_mult_te) 136 | assert_almost_equal(clf.score(X_mult_te, y_mult_te), 1.0) 137 | 138 | 139 | def test_log_loss(): 140 | # With line search. 141 | clf = DecisionTreeClassifier(max_features=1.0, max_depth=3) 142 | clf = GBClassifier(clf, n_estimators=10, step_size="line_search", 143 | loss="log") 144 | clf.fit(X_bin_tr, y_bin_tr) 145 | assert_almost_equal(clf.score(X_bin_te, y_bin_te), 1.0) 146 | 147 | # With constant step size. 148 | clf = DecisionTreeClassifier(max_features=1.0, max_depth=3) 149 | clf = GBClassifier(clf, n_estimators=10, step_size="constant", 150 | loss="log", learning_rate=0.1) 151 | clf.fit(X_bin_te, y_bin_te) 152 | assert_almost_equal(clf.score(X_bin_te, y_bin_te), 1.0) 153 | 154 | 155 | def test_multioutput_regression(): 156 | data = load_linnerud() 157 | X, Y = data.data, data.target 158 | 159 | reg = DecisionTreeRegressor(max_features=1.0, max_depth=3) 160 | reg = GBRegressor(reg, n_estimators=10, step_size="line_search") 161 | Y_pred = reg.fit(X, Y).predict(X) 162 | 163 | acc = [0.697, 0.744, 0.631] 164 | acc2 = [r2_score(Y[:, k], Y_pred[:, k]) for k in xrange(Y.shape[1])] 165 | 166 | assert_array_almost_equal(acc, acc2, 3) 167 | -------------------------------------------------------------------------------- /ivalice/impl/tests/test_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.tree import DecisionTreeRegressor as skRegTree 4 | from sklearn.tree import DecisionTreeClassifier as skClassifTree 5 | 6 | from sklearn.datasets import make_regression 7 | from sklearn.datasets import make_classification 8 | from sklearn.datasets import load_diabetes 9 | 10 | from sklearn.metrics import mean_squared_error 11 | from sklearn.metrics import accuracy_score 12 | 13 | from sklearn.utils.testing import assert_almost_equal 14 | 15 | from ivalice.impl.tree import TreeRegressor 16 | from ivalice.impl.tree import TreeClassifier 17 | 18 | 19 | def _make_regression_datasets(n_times, sw=False): 20 | for n in xrange(n_times): 21 | X, y = make_regression(n_samples=100, n_features=10, random_state=n) 22 | if sw: 23 | rng = np.random.RandomState(n) 24 | w = rng.rand(X.shape[0]) 25 | w[w <= 0.5] = 0.0 26 | yield X, y, w 27 | else: 28 | yield X, y 29 | 30 | 31 | def _make_classification_datasets(n_times, sw=False): 32 | for n in xrange(n_times): 33 | X, y = make_classification(n_samples=20, 34 | n_features=10, 35 | n_informative=10, 36 | n_redundant=0, 37 | n_classes=3, 38 | random_state=n) 39 | if sw: 40 | rng = np.random.RandomState(n) 41 | w = np.round(rng.rand(X.shape[0])) 42 | w[w <= 0.5] = 0.0 43 | yield X, y, w 44 | else: 45 | yield X, y 46 | 47 | 48 | def test_mse_fully_developed(): 49 | sk = 0 50 | iv = 0 51 | 52 | for X, y in _make_regression_datasets(10): 53 | reg = skRegTree(max_depth=None) 54 | reg.fit(X, y) 55 | y_pred = reg.predict(X) 56 | sk += np.mean((y - y_pred) ** 2) 57 | 58 | reg = TreeRegressor(max_depth=None) 59 | reg.fit(X, y) 60 | y_pred = reg.predict(X) 61 | iv += np.mean((y - y_pred) ** 2) 62 | 63 | assert_almost_equal(sk, iv) 64 | 65 | 66 | def test_mse_max_depth(): 67 | for max_depth in (5, 1): 68 | sk = 0 69 | iv = 0 70 | 71 | for X, y in _make_regression_datasets(10): 72 | reg = skRegTree(max_depth=max_depth) 73 | reg.fit(X, y) 74 | y_pred = reg.predict(X) 75 | sk += np.mean((y - y_pred) ** 2) 
76 | 77 | reg = TreeRegressor(max_depth=max_depth) 78 | reg.fit(X, y) 79 | y_pred = reg.predict(X) 80 | iv += np.mean((y - y_pred) ** 2) 81 | 82 | assert_almost_equal(sk, iv) 83 | 84 | 85 | def test_mse_min_samples(): 86 | sk = 0 87 | iv = 0 88 | 89 | for X, y in _make_regression_datasets(10): 90 | reg = skRegTree(max_depth=5, 91 | min_samples_split=4, 92 | min_samples_leaf=2) 93 | reg.fit(X, y) 94 | y_pred = reg.predict(X) 95 | sk += np.mean((y - y_pred) ** 2) 96 | 97 | reg = TreeRegressor(max_depth=5, 98 | min_samples_split=4, 99 | min_samples_leaf=2) 100 | reg.fit(X, y) 101 | y_pred = reg.predict(X) 102 | iv += np.mean((y - y_pred) ** 2) 103 | 104 | assert_almost_equal(sk, iv) 105 | 106 | 107 | def test_mse_max_features(): 108 | sk = 0 109 | iv = 0 110 | 111 | n_times = 30 112 | for X, y in _make_regression_datasets(n_times): 113 | reg = skRegTree(max_depth=5, 114 | max_features=4, 115 | random_state=0) 116 | reg.fit(X, y) 117 | y_pred = reg.predict(X) 118 | sk += np.mean((y - y_pred) ** 2) 119 | 120 | reg = TreeRegressor(max_depth=5, 121 | max_features=4, 122 | random_state=0) 123 | reg.fit(X, y) 124 | y_pred = reg.predict(X) 125 | iv += np.mean((y - y_pred) ** 2) 126 | 127 | sk /= n_times 128 | iv /= n_times 129 | 130 | assert_almost_equal(sk, 4588.4, 1) 131 | assert_almost_equal(iv, 4921.1, 1) 132 | 133 | 134 | def test_mse_sample_weight(): 135 | sk = 0 136 | iv = 0 137 | 138 | n_times = 10 139 | for X, y, w in _make_regression_datasets(n_times, sw=True): 140 | reg = skRegTree(max_depth=5) 141 | reg.fit(X, y, w) 142 | y_pred = reg.predict(X) 143 | sk += mean_squared_error(y, y_pred, sample_weight=w) 144 | 145 | reg = TreeRegressor(max_depth=5) 146 | reg.fit(X, y, w) 147 | y_pred = reg.predict(X) 148 | iv += mean_squared_error(y, y_pred, sample_weight=w) 149 | 150 | sk /= n_times 151 | iv /= n_times 152 | 153 | assert_almost_equal(sk, iv) 154 | 155 | 156 | def test_mse_duplicate_features(): 157 | diabetes = load_diabetes() 158 | X, y = diabetes.data, diabetes.target 159 | 160 | reg = skRegTree(max_depth=5) 161 | reg.fit(X, y) 162 | y_pred = reg.predict(X) 163 | sk = np.mean((y - y_pred) ** 2) 164 | 165 | reg = TreeRegressor(max_depth=5) 166 | reg.fit(X, y) 167 | y_pred = reg.predict(X) 168 | iv = np.mean((y - y_pred) ** 2) 169 | 170 | assert_almost_equal(sk, iv) 171 | 172 | 173 | def test_classif_max_depth(): 174 | for criterion in ("gini", "entropy"): 175 | sk = 0 176 | iv = 0 177 | 178 | for X, y in _make_classification_datasets(10): 179 | clf = skClassifTree(criterion=criterion, 180 | max_depth=5, 181 | random_state=1) 182 | clf.fit(X, y) 183 | y_pred = clf.predict(X) 184 | sk += np.mean(y == y_pred) 185 | 186 | clf = TreeClassifier(criterion=criterion, 187 | max_depth=5, 188 | random_state=1) 189 | clf.fit(X, y) 190 | y_pred = clf.predict(X) 191 | iv += np.mean(y == y_pred) 192 | 193 | sk /= 10 194 | iv /= 10 195 | 196 | assert_almost_equal(sk, iv) 197 | 198 | 199 | def test_classif_sample_weight(): 200 | for criterion in ("gini", "entropy"): 201 | sk = 0 202 | iv = 0 203 | 204 | for X, y, w in _make_classification_datasets(10, sw=True): 205 | clf = skClassifTree(criterion=criterion, max_depth=5) 206 | clf.fit(X, y, w) 207 | y_pred = clf.predict(X) 208 | sk += accuracy_score(y, y_pred, sample_weight=w) 209 | 210 | clf = TreeClassifier(criterion=criterion, max_depth=5) 211 | clf.fit(X, y, w) 212 | y_pred = clf.predict(X) 213 | iv += accuracy_score(y, y_pred, sample_weight=w) 214 | 215 | sk /= 10 216 | iv /= 10 217 | 218 | assert_almost_equal(sk, iv) 219 | 
-------------------------------------------------------------------------------- /ivalice/impl/gradient_boosting.py: -------------------------------------------------------------------------------- 1 | """Functional gradient boosting""" 2 | 3 | # Author: Mathieu Blondel 4 | # License: BSD 3 clause 5 | 6 | # Differences with scikit-learn: 7 | # - accepts any base estimator (not only trees) 8 | # - line search finds base estimator weights (not leaf weights) 9 | 10 | import numpy as np 11 | from scipy import stats 12 | 13 | from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, clone 14 | from sklearn.preprocessing import LabelBinarizer 15 | from sklearn.utils import check_random_state 16 | 17 | 18 | # Taken from https://github.com/nudomarinero/wquantiles (MIT license) 19 | def _weighted_quantile(data, weights, quantile): 20 | # Sort the data 21 | ind_sorted = np.argsort(data) 22 | sorted_data = data[ind_sorted] 23 | sorted_weights = weights[ind_sorted] 24 | # Compute the auxiliary arrays 25 | Sn = np.cumsum(sorted_weights) 26 | # TODO: Check that the weights do not sum zero 27 | Pn = (Sn-0.5*sorted_weights)/np.sum(sorted_weights) 28 | # Get the value of the weighted median 29 | return np.interp(quantile, Pn, sorted_data) 30 | 31 | 32 | def _weighted_median(data, weights): 33 | return _weighted_quantile(data, weights, 0.5) 34 | 35 | 36 | class _QuantileEstimator(BaseEstimator): 37 | """An estimator predicting the alpha-quantile of the training targets.""" 38 | def __init__(self, alpha=0.9): 39 | if not 0 < alpha < 1.0: 40 | raise ValueError("`alpha` must be in (0, 1.0) but was %r" % alpha) 41 | self.alpha = alpha 42 | 43 | def fit(self, X, y): 44 | self.quantile = stats.scoreatpercentile(y, self.alpha * 100.0) 45 | return self 46 | 47 | def predict(self, X): 48 | y = np.empty(X.shape[0], dtype=np.float64) 49 | y.fill(self.quantile) 50 | return y 51 | 52 | 53 | class _MeanEstimator(BaseEstimator): 54 | """An estimator predicting the mean of the training targets.""" 55 | def fit(self, X, y): 56 | self.mean = np.mean(y) 57 | return self 58 | 59 | def predict(self, X): 60 | y = np.empty(X.shape[0], dtype=np.float64) 61 | y.fill(self.mean) 62 | return y 63 | 64 | 65 | class _SquareLoss(object): 66 | 67 | def init_estimator(self): 68 | return _MeanEstimator() 69 | 70 | def negative_gradient(self, y, y_pred): 71 | return y - y_pred 72 | 73 | def line_search(self, y, y_pred, h_pred): 74 | Lp = np.sum((y - y_pred) * h_pred) 75 | Lpp = np.sum(h_pred * h_pred) 76 | 77 | if Lpp == 0: 78 | return 1.0 79 | 80 | # Should be 1.0 assuming that the base learner perfectly fits the 81 | # residuals. 
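        # e.g. with residuals y - y_pred = [1, 2] and h_pred = [0.5, 1.0]:
        # Lp = 0.5 * 1 + 1.0 * 2 = 2.5, Lpp = 0.25 + 1.0 = 1.25, so the step
        # is 2.0 -- exactly the factor by which the base learner undershot.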
82 | return Lp/Lpp 83 | 84 | 85 | class _AbsoluteLoss(object): 86 | 87 | def init_estimator(self): 88 | return _QuantileEstimator(alpha=0.5) 89 | 90 | def negative_gradient(self, y, y_pred): 91 | return np.sign(y - y_pred) 92 | 93 | def line_search(self, y, y_pred, h_pred): 94 | cond = h_pred != 0 95 | diff = y - y_pred 96 | diff[cond] /= h_pred[cond] 97 | diff[~cond] = 0 98 | return _weighted_median(diff, np.abs(h_pred)) 99 | 100 | 101 | class _SquaredHingeLoss(object): 102 | 103 | def __init__(self, max_steps=1): 104 | self.max_steps = max_steps 105 | 106 | def init_estimator(self): 107 | return _MeanEstimator() 108 | 109 | def negative_gradient(self, y, y_pred): 110 | return 2 * np.maximum(1 - y * y_pred, 0) * y 111 | 112 | def line_search(self, y, y_pred, h_pred): 113 | rho = 0 114 | 115 | y_h_pred = y * h_pred 116 | h_pred_sq = h_pred ** 2 117 | 118 | for it in xrange(self.max_steps): 119 | error = 1 - y * (y_pred + rho * h_pred) 120 | Lp = -np.sum(np.maximum(error, 0) * y_h_pred) 121 | Lpp = np.sum((error > 0) * h_pred_sq) 122 | 123 | if Lpp == 0: 124 | break 125 | 126 | rho -= Lp / Lpp 127 | 128 | return rho 129 | 130 | 131 | class _LogLoss(object): 132 | 133 | def __init__(self, max_steps=1): 134 | self.max_steps = max_steps 135 | 136 | def init_estimator(self): 137 | return _MeanEstimator() 138 | 139 | def negative_gradient(self, y, y_pred): 140 | q = 1.0 / (1 + np.exp(-y * y_pred)) 141 | return -y * (q - 1) 142 | 143 | def line_search(self, y, y_pred, h_pred): 144 | rho = 0 145 | 146 | y_h_pred = y * h_pred 147 | h_pred_sq = h_pred ** 2 148 | 149 | for it in xrange(self.max_steps): 150 | q = 1.0 / (1 + np.exp(-y * (y_pred + rho * h_pred))) 151 | Lp = np.sum((q - 1) * y_h_pred) 152 | Lpp = np.sum(q * (1 - q) * h_pred_sq) 153 | 154 | if Lpp == 0: 155 | break 156 | 157 | rho -= Lp / Lpp 158 | 159 | return rho 160 | 161 | 162 | class _BaseGB(BaseEstimator): 163 | 164 | def _fit(self, X, y, y_pred, loss, rng): 165 | if self.subsample != 1.0: 166 | n = int(X.shape[0] * self.subsample) 167 | ind = rng.permutation(X.shape[0])[:n] 168 | X = X[ind] 169 | y = y[ind] 170 | y_pred = y_pred[ind] 171 | 172 | negative_gradient = loss.negative_gradient(y, y_pred) 173 | 174 | est = clone(self.estimator) 175 | est.fit(X, negative_gradient) 176 | 177 | step_size = getattr(self, "step_size", "constant") 178 | 179 | if step_size == "line_search": 180 | h_pred = est.predict(X) 181 | step_size = loss.line_search(y, y_pred, h_pred) 182 | elif step_size == "constant": 183 | step_size = 1.0 184 | else: 185 | raise ValueError("Unknown step size.") 186 | 187 | return est, step_size 188 | 189 | def fit(self, X, y): 190 | rng = check_random_state(self.random_state) 191 | loss = self._get_loss() 192 | 193 | ravel = len(y.shape) == 1 194 | Y = y.reshape(-1, 1) if ravel else y 195 | n_samples = X.shape[0] 196 | n_vectors = Y.shape[1] 197 | 198 | self.estimator_weights_ = np.ones((self.n_estimators, n_vectors), 199 | dtype=np.float64) 200 | self.estimator_weights_[1:] *= self.learning_rate 201 | 202 | self.estimators_ = np.empty((self.n_estimators, n_vectors), 203 | dtype=np.object) 204 | 205 | Y_pred = np.zeros((n_samples, n_vectors), dtype=np.float64) 206 | 207 | # Initial estimator. 208 | for k in xrange(n_vectors): 209 | est = loss.init_estimator().fit(X, Y[:, k]) 210 | self.estimators_[0, k] = est 211 | Y_pred[:, k] += est.predict(X) 212 | 213 | if self.callback is not None: 214 | self.callback(self) 215 | 216 | # Incremental fitting. 
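        # Each stage fits a clone of the base estimator to the negative
        # gradient of the loss at the current prediction (see _fit above) and
        # adds it in, so the model is F(x) = f_0(x) + sum_i w_i * h_i(x) with
        # w_i = learning_rate * step_size_i.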
217 | for i in xrange(1, self.n_estimators): 218 | for k in xrange(n_vectors): 219 | 220 | est, step_size = self._fit(X, Y[:, k], Y_pred[:, k], loss, rng) 221 | self.estimators_[i, k] = est 222 | self.estimator_weights_[i, k] *= step_size 223 | Y_pred[:, k] += self.estimator_weights_[i, k] * est.predict(X) 224 | 225 | if self.callback is not None: 226 | self.callback(self) 227 | 228 | if ravel: 229 | self.estimators_ = self.estimators_.ravel() 230 | self.estimator_weights_ = self.estimator_weights_.ravel() 231 | 232 | return self 233 | 234 | def _df_multi(self, X): 235 | n_samples = X.shape[0] 236 | n_estimators, n_vectors = self.estimators_.shape 237 | pred = np.zeros((n_samples, n_vectors), dtype=np.float64) 238 | 239 | for i in xrange(n_estimators): 240 | for k in xrange(n_vectors): 241 | est = self.estimators_[i, k] 242 | if est is None: continue 243 | pred[:, k] += self.estimator_weights_[i, k] * est.predict(X) 244 | 245 | return pred 246 | 247 | def _df(self, X): 248 | n_samples = X.shape[0] 249 | n_estimators = self.estimators_.shape[0] 250 | pred = np.zeros(n_samples, dtype=np.float64) 251 | 252 | for i in xrange(n_estimators): 253 | est = self.estimators_[i] 254 | if est is None: continue 255 | pred += self.estimator_weights_[i] * est.predict(X) 256 | 257 | return pred 258 | 259 | def decision_function(self, X): 260 | if len(self.estimators_.shape) == 1: 261 | return self._df(X) 262 | else: 263 | return self._df_multi(X) 264 | 265 | def predict(self, X): 266 | return self.decision_function(X) 267 | 268 | 269 | class GBClassifier(_BaseGB, ClassifierMixin): 270 | 271 | def __init__(self, estimator, n_estimators=100, 272 | step_size="line_search", learning_rate=0.1, 273 | loss="squared_hinge", subsample=1.0, 274 | callback=None, random_state=None): 275 | self.estimator = estimator 276 | self.n_estimators = n_estimators 277 | self.step_size = step_size 278 | self.learning_rate = learning_rate 279 | self.loss = loss 280 | self.subsample = subsample 281 | self.callback = callback 282 | self.random_state = random_state 283 | 284 | def _get_loss(self): 285 | losses = dict(squared_hinge=_SquaredHingeLoss(), 286 | log=_LogLoss()) 287 | return losses[self.loss] 288 | 289 | def fit(self, X, y): 290 | self._lb = LabelBinarizer(neg_label=-1) 291 | Y = self._lb.fit_transform(y) 292 | return super(GBClassifier, self).fit(X, Y) 293 | 294 | def predict(self, X): 295 | pred = self.decision_function(X) 296 | return self._lb.inverse_transform(pred) 297 | 298 | 299 | class GBRegressor(_BaseGB, RegressorMixin): 300 | 301 | def __init__(self, estimator, n_estimators=100, 302 | step_size="line_search", learning_rate=0.1, 303 | loss="squared", subsample=1.0, 304 | callback=None, random_state=None): 305 | self.estimator = estimator 306 | self.n_estimators = n_estimators 307 | self.step_size = step_size 308 | self.learning_rate = learning_rate 309 | self.loss = loss 310 | self.subsample = subsample 311 | self.callback = callback 312 | self.random_state = random_state 313 | 314 | def _get_loss(self): 315 | losses = dict(squared=_SquareLoss(), 316 | absolute=_AbsoluteLoss()) 317 | return losses[self.loss] 318 | -------------------------------------------------------------------------------- /ivalice/impl/tree.py: -------------------------------------------------------------------------------- 1 | """Classification and regression trees""" 2 | 3 | # Author: Mathieu Blondel 4 | # License: BSD 3 clause 5 | 6 | import numbers 7 | 8 | import numpy as np 9 | import numba 10 | 11 | from sklearn.base import 
BaseEstimator 12 | from sklearn.base import RegressorMixin 13 | from sklearn.base import ClassifierMixin 14 | from sklearn.preprocessing import LabelEncoder 15 | from sklearn.utils import check_random_state 16 | 17 | from .sort import heapsort 18 | 19 | TREE_LEAF = -1 20 | UNDEFINED = -2 21 | 22 | DOUBLE_MAX = np.finfo(np.float64).max 23 | 24 | MSE_CRITERION = 0 25 | GINI_CRITERION = 1 26 | ENTROPY_CRITERION = 2 27 | 28 | 29 | class _Tree(object): 30 | 31 | def __init__(self, capacity=2 ** 10): 32 | self.capacity = capacity 33 | self.threshold = np.zeros(capacity, dtype=np.float64) + UNDEFINED 34 | self.feature = np.zeros(capacity, dtype=np.int32) + UNDEFINED 35 | self.children_left = np.zeros(capacity, dtype=np.int32) + TREE_LEAF 36 | self.children_right = np.zeros(capacity, dtype=np.int32) + TREE_LEAF 37 | self.value = np.zeros(capacity, dtype=np.float64) 38 | self.ptr = 0 39 | 40 | def add_node(self, threshold, feature, value): 41 | self.threshold[self.ptr] = threshold 42 | self.feature[self.ptr] = feature 43 | self.value[self.ptr] = value 44 | self.ptr += 1 45 | 46 | def add_terminal_node(self, value): 47 | self.value[self.ptr] = value 48 | self.ptr += 1 49 | 50 | def finalize(self): 51 | for attr in ("threshold", "feature", "value", 52 | "children_left", "children_right"): 53 | attr_value = getattr(self, attr)[:self.ptr + 1] 54 | setattr(self, attr, attr_value) 55 | return self 56 | 57 | 58 | class _Stack(object): 59 | 60 | def __init__(self, capacity=2 ** 10): 61 | self.capacity = capacity 62 | self.start = np.zeros(capacity, dtype=np.int32) 63 | self.end = np.zeros(capacity, dtype=np.int32) 64 | self.left = np.zeros(capacity, dtype=bool) 65 | self.depth = np.zeros(capacity, dtype=np.int32) 66 | self.n_samples = np.zeros(capacity, dtype=np.float64) 67 | self.parent = np.zeros(capacity, dtype=np.int32) 68 | self.value = np.zeros(capacity, dtype=np.float64) 69 | self.ptr = -1 70 | 71 | def push(self, start, end, left, depth, n_samples, parent, value): 72 | if self.ptr >= self.capacity: 73 | raise ValueError("Stack overflow!") 74 | 75 | self.ptr += 1 76 | self.start[self.ptr] = start 77 | self.end[self.ptr] = end 78 | self.left[self.ptr] = left 79 | self.depth[self.ptr] = depth 80 | self.n_samples[self.ptr] = n_samples 81 | self.parent[self.ptr] = parent 82 | self.value[self.ptr] = value 83 | 84 | def pop(self): 85 | self.ptr -= 1 86 | p = self.ptr + 1 87 | return self.start[p], self.end[p], self.left[p], self.depth[p], \ 88 | self.n_samples[p], self.parent[p], self.value[p] 89 | 90 | def __len__(self): 91 | return self.ptr + 1 92 | 93 | 94 | @numba.njit("void(f8[:,:], i4[:], f8[:], i4[:], i4[:], i4[:])") 95 | def _apply(X, feature, threshold, children_left, children_right, out): 96 | for i in range(X.shape[0]): 97 | node = 0 98 | # While node not a leaf 99 | while children_left[node] != TREE_LEAF: 100 | if X[i, feature[node]] <= threshold[node]: 101 | node = children_left[node] 102 | else: 103 | node = children_right[node] 104 | out[i] = node 105 | 106 | 107 | @numba.njit("f8(f8[:], f8[:], f8[:], i4[:], i4, i4, i4, f8[:])") 108 | def _impurity_mse(Xj, y, sample_weight, samples, start_t, pos_t, end_t, out): 109 | N_L = 0 110 | N_R = 0 111 | 112 | y_sq = 0 113 | y_sum = 0 114 | 115 | for ii in xrange(start_t, pos_t): 116 | i = samples[ii] 117 | N_L += sample_weight[i] 118 | y_sq += sample_weight[i] * y[i] * y[i] 119 | y_sum += sample_weight[i] * y[i] 120 | 121 | if N_L == 0: 122 | return DOUBLE_MAX 123 | 124 | value_L = y_sum / N_L 125 | imp_L = y_sq - 1 * y_sum * y_sum / N_L 126 | 127 | 
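    # imp_L is N_L times the weighted variance of y in the left child:
    # sum(w * y**2) - (sum(w * y))**2 / sum(w); the analogous quantity is
    # computed for the right child below, and their sum divided by N_t is
    # the impurity returned by this splitting criterion.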
y_sq = 0 128 | y_sum = 0 129 | 130 | for ii in xrange(pos_t, end_t): 131 | i = samples[ii] 132 | N_R += sample_weight[i] 133 | y_sq += sample_weight[i] * y[i] * y[i] 134 | y_sum += sample_weight[i] * y[i] 135 | 136 | if N_R == 0: 137 | return DOUBLE_MAX 138 | 139 | value_R = y_sum / N_R 140 | imp_R = y_sq - 1 * y_sum * y_sum / N_R 141 | 142 | N_t = N_L + N_R 143 | 144 | out[0] = N_L 145 | out[1] = N_R 146 | out[2] = N_t 147 | out[3] = value_L 148 | out[4] = value_R 149 | 150 | return (imp_L + imp_R) / N_t 151 | 152 | 153 | @numba.njit("void(f8[:], f8[:], f8[:], i4[:], i4, i4, i4, f8[:], f8[:], f8[:])") 154 | def _compute_counts(Xj, y, sample_weight, samples, start_t, pos_t, end_t, 155 | count_L, count_R, out): 156 | n_classes = count_L.shape[0] 157 | N_L = 0 158 | N_R = 0 159 | 160 | for k in xrange(n_classes): 161 | count_L[k] = 0 162 | count_R[k] = 0 163 | 164 | for ii in xrange(start_t, pos_t): 165 | i = samples[ii] 166 | N_L += sample_weight[i] 167 | idx = int(y[i]) 168 | count_L[idx] += sample_weight[i] 169 | 170 | for ii in xrange(pos_t, end_t): 171 | i = samples[ii] 172 | N_R += sample_weight[i] 173 | idx = int(y[i]) 174 | count_R[idx] += sample_weight[i] 175 | 176 | best_L = -DOUBLE_MAX 177 | best_R = -DOUBLE_MAX 178 | value_L = 0 179 | value_R = 0 180 | 181 | for k in xrange(n_classes): 182 | if count_L[k] > best_L: 183 | best_L = count_L[k] 184 | value_L = k 185 | 186 | if count_R[k] > best_R: 187 | best_R = count_R[k] 188 | value_R = k 189 | 190 | out[0] = N_L 191 | out[1] = N_R 192 | out[2] = N_L + N_R 193 | out[3] = value_L 194 | out[4] = value_R 195 | 196 | 197 | @numba.njit("f8(f8[:], f8[:], f8[:], i4[:], i4, i4, i4, f8[:], f8[:], f8[:])") 198 | def _impurity_gini(Xj, y, sample_weight, samples, start_t, pos_t, end_t, 199 | count_L, count_R, out): 200 | n_classes = count_L.shape[0] 201 | 202 | _compute_counts(Xj, y, sample_weight, samples, start_t, pos_t, end_t, 203 | count_L, count_R, out) 204 | N_L = out[0] 205 | N_R = out[1] 206 | N_t = out[2] 207 | 208 | if N_L == 0 or N_R == 0: 209 | return DOUBLE_MAX 210 | 211 | gini_L = 0 212 | gini_R = 0 213 | for k in xrange(n_classes): 214 | proba_L = count_L[k] / N_t 215 | proba_R = count_R[k] / N_t 216 | 217 | gini_L += proba_L * (1 - proba_L) 218 | gini_R += proba_R * (1 - proba_R) 219 | 220 | #return float(N_L) / N_t * gini_L + float(N_R) / N_t * gini_R 221 | return N_L * gini_L + N_R * gini_R 222 | 223 | 224 | @numba.njit("f8(f8[:], f8[:], f8[:], i4[:], i4, i4, i4, f8[:], f8[:], f8[:])") 225 | def _impurity_entropy(Xj, y, sample_weight, samples, start_t, pos_t, end_t, 226 | count_L, count_R, out): 227 | n_classes = count_L.shape[0] 228 | 229 | _compute_counts(Xj, y, sample_weight, samples, start_t, pos_t, end_t, 230 | count_L, count_R, out) 231 | N_L = out[0] 232 | N_R = out[1] 233 | N_t = out[2] 234 | 235 | if N_L == 0 or N_R == 0: 236 | return DOUBLE_MAX 237 | 238 | ent_L = 0 239 | ent_R = 0 240 | for k in xrange(n_classes): 241 | proba_L = count_L[k] / N_t 242 | proba_R = count_R[k] / N_t 243 | 244 | if proba_L > 0: 245 | ent_L -= proba_L * np.log2(proba_L) 246 | 247 | if proba_R > 0: 248 | ent_R -= proba_R * np.log2(proba_R) 249 | 250 | #return float(N_L) / N_t * ent_L + float(N_R) / N_t * ent_R 251 | return N_L * ent_L + N_R * ent_R 252 | 253 | 254 | @numba.njit("void(f8[:,:], f8[:], f8[:], i4[:], i4[:], f8[:], i4, i4, i4, " 255 | "i4, f8[:], f8[:], f8[:])") 256 | def _best_split(X, y, sample_weight, samples, features, Xj, start_t, end_t, 257 | criterion, min_samples_leaf, count_L, count_R, out): 258 | best_imp = 
DOUBLE_MAX 259 | best_thresh = 0.0 260 | best_j = -1 261 | best_pos_t = -1 262 | N_L = 0.0 263 | N_R = 0.0 264 | N_t = 0.0 265 | value_L = 0.0 266 | value_R = 0.0 267 | 268 | size_t = end_t - start_t 269 | 270 | for j in features: 271 | 272 | for p in xrange(start_t, end_t): 273 | Xj[p] = X[samples[p], j] 274 | 275 | # Sort samples in nodes_t by their value for feature j. 276 | heapsort(Xj[start_t:end_t], samples[start_t:end_t], size_t) 277 | # FIXME: use introsort. 278 | 279 | for k in xrange(start_t, end_t - 1): 280 | pos_t = k + 1 281 | N_L = pos_t - start_t 282 | N_R = size_t - N_L 283 | 284 | if N_R < min_samples_leaf or N_L < min_samples_leaf: 285 | continue 286 | 287 | # Choose splitting threshold. 288 | # Any value between Xj[k+1] and Xj[k] is fine. 289 | Xj_diff = Xj[k + 1] - Xj[k] 290 | 291 | if Xj_diff == 0: 292 | continue 293 | 294 | thresh = Xj_diff / 2.0 + Xj[k] 295 | 296 | # FIXME: impurity can be computed efficiently from last 297 | # iteration. 298 | if criterion == MSE_CRITERION: 299 | imp = _impurity_mse(Xj, y, sample_weight, samples, start_t, 300 | pos_t, end_t, out) 301 | elif criterion == GINI_CRITERION: 302 | imp = _impurity_gini(Xj, y, sample_weight, samples, start_t, 303 | pos_t, end_t, count_L, count_R, out) 304 | else: 305 | imp = _impurity_entropy(Xj, y, sample_weight, samples, start_t, 306 | pos_t, end_t, count_L, count_R, out) 307 | 308 | if imp < best_imp: 309 | best_imp = imp 310 | best_thresh = thresh 311 | best_j = j 312 | best_pos_t = pos_t 313 | N_L = out[0] 314 | N_R = out[1] 315 | N_t = out[2] 316 | value_L = out[3] 317 | value_R = out[4] 318 | 319 | out[0] = N_L 320 | out[1] = N_R 321 | out[2] = N_t 322 | out[3] = value_L 323 | out[4] = value_R 324 | out[5] = best_thresh 325 | out[6] = best_j 326 | out[7] = best_pos_t 327 | 328 | if best_j != -1: 329 | # Reorder samples for the best split. 330 | for p in xrange(start_t, end_t): 331 | Xj[p] = X[samples[p], best_j] 332 | 333 | heapsort(Xj[start_t:end_t], samples[start_t:end_t], size_t) 334 | 335 | 336 | def _build_tree(X, y, sample_weight, criterion, max_features=None, 337 | max_depth=None, min_samples_split=2, min_samples_leaf=1, 338 | random_state=None): 339 | n_samples, n_features = X.shape 340 | 341 | tree = _Tree() 342 | node_t = 0 343 | samples = np.arange(n_samples).astype(np.int32) 344 | samples = samples[sample_weight > 0] 345 | features = np.arange(n_features).astype(np.int32) 346 | 347 | stack = _Stack() 348 | stack.push(start=0, end=len(samples), left=False, 349 | depth=0, n_samples=np.sum(sample_weight), 350 | parent=0, value=0) 351 | 352 | # Buffers 353 | Xj = np.zeros(n_samples, dtype=np.float64) 354 | out = np.zeros(8, dtype=np.float64) 355 | 356 | if criterion >= GINI_CRITERION: # Classification case 357 | enc = LabelEncoder() 358 | y = enc.fit_transform(y).astype(np.float64) 359 | # Arrays which will contain the number of samples in each class. 360 | count_L = np.zeros(len(enc.classes_), dtype=np.float64) 361 | count_R = np.zeros(len(enc.classes_), dtype=np.float64) 362 | else: 363 | count_L = np.zeros(0, dtype=np.float64) 364 | count_R = np.zeros(0, dtype=np.float64) 365 | 366 | while len(stack) > 0: 367 | # Pick node from the stack. 368 | start_t, end_t, left_t, depth_t, N_t, parent_t, value_t = stack.pop() 369 | 370 | if node_t > 0: 371 | # Adjust children node id of parent. 
372 | if left_t: 373 | tree.children_left[parent_t] = node_t 374 | else: 375 | tree.children_right[parent_t] = node_t 376 | 377 | size_t = end_t - start_t 378 | 379 | # Terminal node if max_depth or min_samples_split conditions are met. 380 | if depth_t == max_depth or size_t < min_samples_split: 381 | tree.add_terminal_node(value_t) 382 | node_t += 1 383 | continue 384 | 385 | # Find best split across all features. 386 | if max_features != n_features: 387 | random_state.shuffle(features) 388 | 389 | _best_split(X, y, sample_weight, samples, features[:max_features], Xj, 390 | start_t, end_t, criterion, min_samples_leaf, 391 | count_L, count_R, out) 392 | N_L, N_R, _, value_L, value_R, best_thresh, best_j, pos_t = out 393 | best_j = int(best_j) 394 | pos_t = int(pos_t) 395 | 396 | # No best split found: terminal node. 397 | if best_j == -1: 398 | tree.add_terminal_node(value_t) 399 | node_t += 1 400 | continue 401 | 402 | # Add node to the tree. 403 | tree.add_node(threshold=best_thresh, feature=best_j, value=value_t) 404 | 405 | # Add left and right children to the stack. 406 | stack.push(start=start_t, end=pos_t, left=True, depth=depth_t + 1, 407 | n_samples=N_L, parent=node_t, value=value_L) 408 | stack.push(start=pos_t, end=end_t, left=False, depth=depth_t + 1, 409 | n_samples=N_R, parent=node_t, value=value_R) 410 | 411 | node_t += 1 412 | 413 | if criterion >= GINI_CRITERION: 414 | values = np.array(tree.value, dtype=np.int32) 415 | tree.value = enc.inverse_transform(values) 416 | 417 | return tree.finalize() 418 | 419 | 420 | class _BaseTree(BaseEstimator): 421 | 422 | def _get_max_features(self, X): 423 | n_features = X.shape[1] 424 | 425 | if self.max_features is None: 426 | max_features = n_features 427 | elif isinstance(self.max_features, (numbers.Integral, np.integer)): 428 | max_features = self.max_features 429 | else: # float 430 | if self.max_features > 0.0: 431 | max_features = max(1, int(self.max_features * n_features)) 432 | else: 433 | raise ValueError("max_features should be positive!") 434 | 435 | return max_features 436 | 437 | 438 | class TreeClassifier(_BaseTree, ClassifierMixin): 439 | 440 | def __init__(self, criterion="gini", max_features=None, max_depth=None, 441 | min_samples_split=2, min_samples_leaf=1, random_state=None): 442 | self.criterion = criterion 443 | self.max_features = max_features 444 | self.max_depth = max_depth 445 | self.min_samples_split = min_samples_split 446 | self.min_samples_leaf = min_samples_leaf 447 | self.random_state = random_state 448 | 449 | def _get_criterion(self): 450 | return {"gini": GINI_CRITERION, 451 | "entropy": ENTROPY_CRITERION}[self.criterion] 452 | 453 | def fit(self, X, y, sample_weight=None): 454 | rng = check_random_state(self.random_state) 455 | 456 | if sample_weight is None: 457 | sample_weight = np.ones(X.shape[0], dtype=np.float64) 458 | 459 | self.tree_ = _build_tree(X, y, sample_weight, 460 | criterion=self._get_criterion(), 461 | max_features=self._get_max_features(X), 462 | max_depth=self.max_depth, 463 | min_samples_split=self.min_samples_split, 464 | min_samples_leaf=self.min_samples_leaf, 465 | random_state=rng) 466 | self.tree_.value = self.tree_.value.astype(np.int32) 467 | return self 468 | 469 | def predict(self, X): 470 | nodes = np.empty(X.shape[0], dtype=np.int32) 471 | _apply(X, self.tree_.feature, self.tree_.threshold, 472 | self.tree_.children_left, self.tree_.children_right, nodes) 473 | return self.tree_.value.take(nodes) 474 | 475 | 476 | class TreeRegressor(_BaseTree, RegressorMixin): 477 | 
478 | def __init__(self, max_features=None, max_depth=None, min_samples_split=2, 479 | min_samples_leaf=1, random_state=None): 480 | self.max_features = max_features 481 | self.max_depth = max_depth 482 | self.min_samples_split = min_samples_split 483 | self.min_samples_leaf = min_samples_leaf 484 | self.random_state = random_state 485 | 486 | def fit(self, X, y, sample_weight=None): 487 | rng = check_random_state(self.random_state) 488 | 489 | if sample_weight is None: 490 | sample_weight = np.ones(X.shape[0], dtype=np.float64) 491 | 492 | self.tree_ = _build_tree(X, y, sample_weight, 493 | criterion=MSE_CRITERION, 494 | max_features=self._get_max_features(X), 495 | max_depth=self.max_depth, 496 | min_samples_split=self.min_samples_split, 497 | min_samples_leaf=self.min_samples_leaf, 498 | random_state=rng) 499 | return self 500 | 501 | def predict(self, X): 502 | nodes = np.empty(X.shape[0], dtype=np.int32) 503 | _apply(X, self.tree_.feature, self.tree_.threshold, 504 | self.tree_.children_left, self.tree_.children_right, nodes) 505 | return self.tree_.value.take(nodes) 506 | --------------------------------------------------------------------------------
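A note on the data layout in the source above: _Tree stores the fitted tree as parallel flat arrays indexed by node id (threshold, feature, children_left, children_right, value), and _apply walks every sample from node 0 until it reaches a node whose children_left entry is TREE_LEAF. Below is a minimal pure-NumPy sketch of that routing on a hand-built one-split tree; the helper name apply_tree and the toy arrays are illustrative only, not part of the library.

import numpy as np

TREE_LEAF = -1
UNDEFINED = -2

def apply_tree(X, feature, threshold, children_left, children_right):
    # Pure-Python equivalent of _apply: return the leaf id reached by
    # every row of X.
    out = np.empty(X.shape[0], dtype=np.int32)
    for i in range(X.shape[0]):
        node = 0
        while children_left[node] != TREE_LEAF:  # internal node
            if X[i, feature[node]] <= threshold[node]:
                node = children_left[node]
            else:
                node = children_right[node]
        out[i] = node
    return out

# Hand-built stump: node 0 splits on feature 0 at threshold 0.5,
# node 1 is the left leaf (value 10.0), node 2 the right leaf (value 20.0).
feature        = np.array([0, UNDEFINED, UNDEFINED], dtype=np.int32)
threshold      = np.array([0.5, UNDEFINED, UNDEFINED], dtype=np.float64)
children_left  = np.array([1, TREE_LEAF, TREE_LEAF], dtype=np.int32)
children_right = np.array([2, TREE_LEAF, TREE_LEAF], dtype=np.int32)
value          = np.array([0.0, 10.0, 20.0])

X = np.array([[0.2], [0.9]])
leaves = apply_tree(X, feature, threshold, children_left, children_right)
print(value.take(leaves))  # -> [10. 20.]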
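_impurity_mse scores a candidate split by the weighted sum of squared errors of each child, using the identity sum w*(y - mean)**2 = sum w*y**2 - (sum w*y)**2 / sum w, and then normalizes by the total weight of the node. A vectorized sketch of the same quantity (the function name mse_split_impurity is made up for illustration):

import numpy as np

def mse_split_impurity(y, w, left_mask):
    # Mirrors _impurity_mse: within-child weighted squared error,
    # summed over the two children and divided by the node's total weight.
    def sse(y_c, w_c):
        n = w_c.sum()
        return (w_c * y_c ** 2).sum() - (w_c * y_c).sum() ** 2 / n
    return (sse(y[left_mask], w[left_mask]) +
            sse(y[~left_mask], w[~left_mask])) / w.sum()

y = np.array([1.0, 1.2, 3.0, 3.1])
w = np.ones_like(y)
# Splitting between the two clusters scores much better (lower) than
# splitting inside a cluster.
print(mse_split_impurity(y, w, np.array([True, True, False, False])))   # ~0.006
print(mse_split_impurity(y, w, np.array([True, False, False, False])))  # ~0.57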
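_best_split copies the current node's values for one feature into the Xj buffer, sorts the node's samples by that column with heapsort, and then scans every split position, skipping positions that would leave fewer than min_samples_leaf samples on either side or that fall between two equal feature values; the threshold is placed at the midpoint of the two neighbouring values. A compact sketch of that scan for a single feature under the MSE criterion with unit weights (the helper scan_feature is illustrative, and like the original it recomputes the impurity from scratch at every position):

import numpy as np

def scan_feature(xj, y, min_samples_leaf=1):
    # Scan all candidate thresholds on one feature column and return the
    # best (impurity, threshold) pair, mirroring the inner loop of _best_split.
    def sse(v):
        return ((v - v.mean()) ** 2).sum()
    order = np.argsort(xj)                    # _best_split uses heapsort here
    xs, ys = xj[order], y[order]
    best_imp, best_thresh = np.inf, None
    for pos in range(1, len(xs)):             # pos = size of the left child
        if pos < min_samples_leaf or len(xs) - pos < min_samples_leaf:
            continue
        if xs[pos] == xs[pos - 1]:            # no usable threshold here
            continue
        thresh = (xs[pos - 1] + xs[pos]) / 2.0
        imp = (sse(ys[:pos]) + sse(ys[pos:])) / len(xs)
        if imp < best_imp:
            best_imp, best_thresh = imp, thresh
    return best_imp, best_thresh

xj = np.array([0.1, 0.4, 0.35, 0.9, 0.8])
y = np.array([1.0, 1.1, 0.9, 3.0, 3.2])
print(scan_feature(xj, y))  # threshold lands between 0.4 and 0.8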
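Finally, TreeRegressor and TreeClassifier follow the scikit-learn fit/predict convention, so they drop into the usual workflow. A minimal usage sketch, assuming the package builds in your environment and that the estimators are importable from ivalice.impl.tree (adjust the import to match your checkout); the numba kernels are compiled for float64 inputs, hence the casts:

import numpy as np
from sklearn.datasets import make_classification, make_regression
from sklearn.metrics import accuracy_score, r2_score

# Assumed import path; adjust to wherever this module lives in your install.
from ivalice.impl.tree import TreeClassifier, TreeRegressor

# Regression: single tree grown with the MSE criterion.
X, y = make_regression(n_samples=200, n_features=10, random_state=0)
X, y = X.astype(np.float64), y.astype(np.float64)
reg = TreeRegressor(max_depth=5, min_samples_leaf=2, random_state=0)
reg.fit(X, y)
print("R2:", r2_score(y, reg.predict(X)))

# Classification: gini (default) or entropy criterion, integer class labels.
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X = X.astype(np.float64)
clf = TreeClassifier(criterion="entropy", max_depth=5, random_state=0)
clf.fit(X, y)
print("accuracy:", accuracy_score(y, clf.predict(X)))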