├── packtml ├── VERSION ├── utils │ ├── tests │ │ ├── __init__.py │ │ ├── test_linalg.py │ │ └── test_validation.py │ ├── __init__.py │ ├── linalg.py │ ├── extmath.py │ ├── plotting.py │ └── validation.py ├── clustering │ ├── tests │ │ ├── __init__.py │ │ └── test_knn.py │ ├── __init__.py │ └── knn.py ├── metrics │ ├── tests │ │ ├── __init__.py │ │ └── test_ranking.py │ ├── __init__.py │ └── ranking.py ├── neural_net │ ├── tests │ │ ├── __init__.py │ │ ├── test_mlp.py │ │ └── test_transfer.py │ ├── __init__.py │ ├── base.py │ ├── transfer.py │ └── mlp.py ├── regression │ ├── tests │ │ ├── __init__.py │ │ ├── test_simple_regression.py │ │ └── test_simple_logistic.py │ ├── __init__.py │ ├── simple_regression.py │ └── simple_logistic.py ├── decision_tree │ ├── tests │ │ ├── __init__.py │ │ ├── test_metrics.py │ │ └── test_cart.py │ ├── __init__.py │ ├── metrics.py │ └── cart.py ├── recommendation │ ├── tests │ │ ├── __init__.py │ │ ├── test_als.py │ │ └── test_itemitem.py │ ├── __init__.py │ ├── base.py │ ├── data.py │ ├── itemitem.py │ └── als.py ├── __init__.py └── base.py ├── MANIFEST.in ├── examples ├── data │ └── README.md ├── decision_tree │ ├── example_information_gain.py │ ├── example_classification_split.py │ ├── example_regression_decision_tree.py │ └── example_classification_decision_tree.py ├── recommendation │ ├── example_item_item_recommender.py │ └── example_als_recommender.py ├── run_all_examples.py ├── regression │ ├── example_linear_regression.py │ └── example_logistic_regression.py ├── clustering │ └── example_knn_classifier.py └── neural_net │ ├── example_mlp_classifier.py │ └── example_transfer_learning.py ├── requirements.txt ├── curriculum.docx ├── environment.yml ├── .coveragerc ├── img ├── clustering │ └── example_knn_classifier.png ├── neural_net │ ├── example_mlp_classifier.png │ └── example_transfer_learning.png ├── regression │ ├── example_linear_regression.png │ └── example_logistic_regression.png ├── recommendation │ └── example_als_recommender.png ├── decision_tree │ ├── example_regression_decision_tree.png │ └── example_classification_decision_tree.png └── README.md ├── .travis.yml ├── LICENSE ├── .gitignore ├── setup.py ├── appveyor.yml └── README.md /packtml/VERSION: -------------------------------------------------------------------------------- 1 | 1.0.5 -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive include packtml/* -------------------------------------------------------------------------------- /examples/data/README.md: -------------------------------------------------------------------------------- 1 | # Demo data 2 | 3 | Cached data for the ML demo goes here. 
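The tree above shows the package layout; the `packtml/VERSION` file it lists is read by `packtml/__init__.py` (shown further down) to populate the package's version string at import time. A minimal sketch of how that surfaces to a user, assuming the package has been installed (e.g. with `python setup.py install`):

import packtml

# __version__ is just the stripped contents of packtml/VERSION
print(packtml.__version__)  # -> "1.0.5"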
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=0.15 2 | scipy>=0.19 3 | scikit-learn>=0.19 4 | pandas>=0.23 5 | matplotlib -------------------------------------------------------------------------------- /packtml/utils/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import -------------------------------------------------------------------------------- /packtml/clustering/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import -------------------------------------------------------------------------------- /packtml/metrics/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import -------------------------------------------------------------------------------- /packtml/neural_net/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import -------------------------------------------------------------------------------- /packtml/regression/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import -------------------------------------------------------------------------------- /packtml/decision_tree/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import -------------------------------------------------------------------------------- /packtml/recommendation/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import -------------------------------------------------------------------------------- /curriculum.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/curriculum.docx -------------------------------------------------------------------------------- /packtml/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .ranking import * 4 | 5 | __all__ = [s for s in dir() if not s.startswith("_")] 6 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: packt-sml 2 | 3 | dependencies: 4 | - python=3.6 5 | - numpy 6 | - scipy 7 | - scikit-learn 8 | - pandas 9 | - matplotlib -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = packtml 3 | include = */packtml/* 4 | omit = 5 | */packtml/setup.py 6 | */packtml/utils/plotting.py 7 | */setup.py 8 | -------------------------------------------------------------------------------- 
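The `environment.yml` and `.coveragerc` files above configure the development environment and coverage measurement for the per-subpackage `tests/` directories listed in the tree. A minimal sketch of invoking the suite programmatically, assuming `pytest` is installed (setup.py further down recommends `pytest packtml` after installation); the coverage flag additionally assumes the `pytest-cov` plugin, which picks up `.coveragerc`:

import pytest

# equivalent to running `pytest packtml --cov packtml` from the repository root
pytest.main(["packtml", "--cov", "packtml"])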
/packtml/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from packtml.clustering.knn import * 4 | 5 | __all__ = [s for s in dir() if not s.startswith("_")] 6 | -------------------------------------------------------------------------------- /img/clustering/example_knn_classifier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/clustering/example_knn_classifier.png -------------------------------------------------------------------------------- /img/neural_net/example_mlp_classifier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/neural_net/example_mlp_classifier.png -------------------------------------------------------------------------------- /img/neural_net/example_transfer_learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/neural_net/example_transfer_learning.png -------------------------------------------------------------------------------- /img/regression/example_linear_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/regression/example_linear_regression.png -------------------------------------------------------------------------------- /img/recommendation/example_als_recommender.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/recommendation/example_als_recommender.png -------------------------------------------------------------------------------- /img/regression/example_logistic_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/regression/example_logistic_regression.png -------------------------------------------------------------------------------- /img/decision_tree/example_regression_decision_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/decision_tree/example_regression_decision_tree.png -------------------------------------------------------------------------------- /packtml/neural_net/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from packtml.neural_net.mlp import * 4 | from packtml.neural_net.transfer import * 5 | 6 | __all__ = [s for s in dir() if not s.startswith("_")] 7 | -------------------------------------------------------------------------------- /img/decision_tree/example_classification_decision_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/decision_tree/example_classification_decision_tree.png 
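Each subpackage `__init__.py` (such as the clustering and neural_net ones above) star-imports its modules and then derives `__all__` from the resulting public names, so every estimator can be imported from either the subpackage or its defining module. A brief sketch of what that enables, assuming the package is installed:

# both of these resolve to the same class, thanks to the re-export above
from packtml.clustering import KNNClassifier
from packtml.clustering.knn import KNNClassifier

# likewise for the neural net estimators re-exported from mlp.py and transfer.py
from packtml.neural_net import NeuralNetClassifier, TransferLearningClassifier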
-------------------------------------------------------------------------------- /packtml/decision_tree/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from packtml.decision_tree.cart import * 4 | from packtml.decision_tree.metrics import * 5 | 6 | __all__ = [s for s in dir() if not s.startswith("_")] 7 | -------------------------------------------------------------------------------- /packtml/regression/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from packtml.regression.simple_regression import * 4 | from packtml.regression.simple_logistic import * 5 | 6 | __all__ = [s for s in dir() if not s.startswith("_")] 7 | 8 | -------------------------------------------------------------------------------- /packtml/recommendation/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from packtml.recommendation.als import * 4 | from packtml.recommendation.data import * 5 | from packtml.recommendation.itemitem import * 6 | 7 | __all__ = [s for s in dir() if not s.startswith("_")] 8 | -------------------------------------------------------------------------------- /packtml/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from packtml.utils.extmath import * 4 | from packtml.utils.linalg import * 5 | from packtml.utils.plotting import * 6 | from packtml.utils.validation import * 7 | 8 | __all__ = [s for s in dir() if not s.startswith("_")] 9 | -------------------------------------------------------------------------------- /img/README.md: -------------------------------------------------------------------------------- 1 | # img 2 | 3 | Within this directory, you'll find the output of the various example scripts. 4 | The rendering of these images is automated by the 5 | [examples/run_all_examples.py](../examples/run_all_examples.py) script. 6 | 7 | ### Do not directly edit anything in this directory! Its contents are 100% automated!! 
8 | -------------------------------------------------------------------------------- /packtml/neural_net/tests/test_mlp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.neural_net import NeuralNetClassifier 6 | from sklearn.datasets import load_iris 7 | 8 | iris = load_iris() 9 | X, y = iris.data, iris.target 10 | 11 | 12 | def test_mlp(): 13 | # show we can fit and predict 14 | clf = NeuralNetClassifier(X, y, random_state=42) 15 | clf.predict(X) 16 | -------------------------------------------------------------------------------- /packtml/utils/tests/test_linalg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from sklearn.datasets import load_iris 6 | from packtml.utils import linalg 7 | 8 | import numpy as np 9 | 10 | iris = load_iris() 11 | X, y = iris.data, iris.target 12 | 13 | 14 | def test_row_norms(): 15 | means = np.average(X, axis=0) 16 | X_centered = X - means 17 | 18 | norms = linalg.l2_norm(X_centered, axis=0) 19 | assert np.allclose( 20 | norms, 21 | np.array([10.10783524, 5.29269308, 21.53749599, 9.31556404]), 22 | rtol=0.01) 23 | -------------------------------------------------------------------------------- /packtml/regression/tests/test_simple_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.regression import SimpleLinearRegression 6 | 7 | import numpy as np 8 | from numpy.testing import assert_almost_equal 9 | 10 | 11 | def test_simple_linear_regression(): 12 | # y = 2a + 1.5b + 0 13 | random_state = np.random.RandomState(42) 14 | X = random_state.rand(100, 2) 15 | y = 2. * X[:, 0] + 1.5 * X[:, 1] 16 | 17 | lm = SimpleLinearRegression(X, y) 18 | predictions = lm.predict(X) 19 | residuals = y - predictions 20 | assert_almost_equal(residuals.sum(), 0.) 21 | assert np.allclose(lm.theta, [2., 1.5]) 22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: required 3 | 4 | cache: 5 | apt: true 6 | directories: 7 | - $HOME/.cache/pip 8 | - $HOME/.ccache 9 | 10 | before_install: 11 | - source build_tools/travis/before_install.sh 12 | env: 13 | global: 14 | - TEST_DIR=/tmp/packtml 15 | 16 | matrix: 17 | include: 18 | - os: linux 19 | dist: trusty 20 | env: PYTHON_VERSION="3.5" 21 | 22 | - os: linux 23 | dist: trusty 24 | env: PYTHON_VERSION="3.6" 25 | 26 | install: source build_tools/travis/install.sh 27 | before_script: bash build_tools/travis/before_script.sh 28 | script: bash build_tools/travis/test_script.sh 29 | after_success: bash build_tools/travis/after_success.sh 30 | -------------------------------------------------------------------------------- /packtml/utils/linalg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from numpy import linalg as la 6 | 7 | __all__ = [ 8 | 'l2_norm' 9 | ] 10 | 11 | 12 | def l2_norm(X, axis=0): 13 | """Compute the L2 (Euclidean) norm of a matrix. 14 | 15 | Computes the L2 norm along the specified axis. If axis is 0, 16 | computes the norms along the columns. 
If 1, computes along the 17 | rows. 18 | 19 | Parameters 20 | ---------- 21 | X : array-like, shape=(n_samples, n_features) 22 | The matrix on which to compute the norm. 23 | 24 | axis : int, optional (default=0) 25 | The axis along which to compute the norm. 0 is for columns, 26 | 1 is for rows. 27 | """ 28 | return la.norm(X, ord=None, axis=axis) 29 | -------------------------------------------------------------------------------- /packtml/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | 5 | # global namespace: 6 | from packtml import clustering 7 | from packtml import decision_tree 8 | from packtml import metrics 9 | from packtml import neural_net 10 | from packtml import recommendation 11 | from packtml import regression 12 | from packtml import utils 13 | 14 | # set the version 15 | packtml_location = os.path.abspath(os.path.dirname(__file__)) 16 | with open(os.path.join(packtml_location, "VERSION")) as vsn: 17 | __version__ = vsn.read().strip() 18 | 19 | # remove from global namespace 20 | del os 21 | del packtml_location 22 | del vsn 23 | 24 | __all__ = [ 25 | 'clustering', 26 | 'decision_tree', 27 | 'metrics', 28 | 'neural_net', 29 | 'recommendation', 30 | 'regression', 31 | 'utils' 32 | ] 33 | -------------------------------------------------------------------------------- /examples/decision_tree/example_information_gain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.decision_tree.metrics import gini_impurity, InformationGain 6 | import numpy as np 7 | 8 | # ############################################################################# 9 | # Build the example from the slides 10 | y = np.array([0, 0, 0, 1, 1, 1, 1]) 11 | uncertainty = gini_impurity(y) 12 | print("Initial gini impurity: %.4f" % uncertainty) 13 | 14 | # now get the information gain of the split from the slides 15 | directions = np.array(["right", "left", "left", "left", 16 | "right", "right", "right"]) 17 | mask = directions == "left" 18 | print("Information gain from the split we created: %.4f" 19 | % InformationGain("gini")(target=y, mask=mask, uncertainty=uncertainty)) 20 | -------------------------------------------------------------------------------- /packtml/clustering/tests/test_knn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.clustering import KNNClassifier 6 | 7 | from sklearn.datasets import load_iris 8 | from numpy.testing import assert_array_equal 9 | import numpy as np 10 | 11 | iris = load_iris() 12 | X = iris.data[:, :2] 13 | y = iris.target 14 | 15 | 16 | def test_knn(): 17 | # show we can fit 18 | knn = KNNClassifier(X, y) 19 | # show we can predict 20 | knn.predict(X) 21 | 22 | 23 | def test_knn2(): 24 | X2 = np.array([[0., 0., 0.5], 25 | [0., 0.5, 0.], 26 | [0.5, 0., 0.], 27 | [5., 5., 6.], 28 | [6., 5., 5.]]) 29 | 30 | y2 = [0, 0, 0, 1, 1] 31 | knn = KNNClassifier(X2, y2, k=3) 32 | preds = knn.predict(X2) 33 | assert_array_equal(preds, y2) 34 | -------------------------------------------------------------------------------- /packtml/neural_net/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | import six 6 | from abc 
import ABCMeta, abstractmethod 7 | 8 | import numpy as np 9 | 10 | __all__ = [ 11 | 'tanh', 12 | 'NeuralMixin' 13 | ] 14 | 15 | 16 | def tanh(X): 17 | """Hyperbolic tangent. 18 | 19 | Compute the tan-h (Hyperbolic tangent) activation function. 20 | This is a very easily-differentiable activation function. 21 | 22 | Parameters 23 | ---------- 24 | X : np.ndarray, shape=(n_samples, n_features) 25 | The transformed X array (X * W + b). 26 | """ 27 | return np.tanh(X) 28 | 29 | 30 | class NeuralMixin(six.with_metaclass(ABCMeta)): 31 | """Abstract interface for neural network classes.""" 32 | @abstractmethod 33 | def export_weights_and_biases(self, output_layer=True): 34 | """Return the weights and biases of the network""" 35 | -------------------------------------------------------------------------------- /examples/decision_tree/example_classification_split.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.decision_tree.cart import RandomSplitter 6 | from packtml.decision_tree.metrics import InformationGain 7 | import numpy as np 8 | 9 | # ############################################################################# 10 | # Build the example from the slides (3.3) 11 | X = np.array([[21, 3], [ 4, 2], [37, 2]]) 12 | y = np.array([1, 0, 1]) 13 | 14 | # this is the splitting class; we'll use gini as the criteria 15 | random_state = np.random.RandomState(42) 16 | splitter = RandomSplitter(random_state=random_state, 17 | criterion=InformationGain('gini'), 18 | n_val_sample=3) 19 | 20 | # find the best: 21 | best_feature, best_value, best_gain = splitter.find_best(X, y) 22 | print("Best feature=%i, best value=%r, information gain: %.3f" 23 | % (best_feature, best_value, best_gain)) 24 | -------------------------------------------------------------------------------- /packtml/regression/tests/test_simple_logistic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.regression import SimpleLogisticRegression 6 | from sklearn.datasets import make_classification 7 | from sklearn.metrics import accuracy_score 8 | 9 | import numpy as np 10 | 11 | X, y = make_classification(n_samples=100, n_features=2, random_state=42, 12 | n_redundant=0, n_repeated=0, n_classes=2, 13 | class_sep=1.0) 14 | 15 | 16 | def test_simple_logistic(): 17 | lm = SimpleLogisticRegression(X, y, n_steps=50, loglik_interval=10) 18 | assert np.allclose(lm.theta, np.array([ 1.32320936, -0.03926072])) 19 | 20 | # test that we can predict 21 | preds = lm.predict(X) 22 | 23 | # show we're better than chance 24 | assert accuracy_score(y, preds) > 0.5 25 | 26 | # show that we only computed the log likelihood 5 times 27 | assert len(lm.log_likelihood) == 5, lm.log_likelihood 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 
10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /packtml/utils/tests/test_validation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.utils import validation as val 6 | from packtml.regression import SimpleLogisticRegression 7 | 8 | from sklearn.metrics import accuracy_score 9 | from sklearn.datasets import load_breast_cancer 10 | 11 | bc = load_breast_cancer() 12 | X, y = bc.data, bc.target 13 | 14 | 15 | def test_is_iterable(): 16 | assert val.is_iterable([1, 2, 3]) 17 | assert val.is_iterable((1, 2, 3)) 18 | assert val.is_iterable({1, 2, 3}) 19 | assert val.is_iterable({1: 'a', 2: 'b'}) 20 | assert not val.is_iterable(123) 21 | assert not val.is_iterable(None) 22 | assert not val.is_iterable("a string") 23 | 24 | 25 | def test_learning_curves(): 26 | train_scores, val_scores = \ 27 | val.learning_curve( 28 | SimpleLogisticRegression, X, y, 29 | metric=accuracy_score, 30 | train_sizes=(100, 250, 400), 31 | n_folds=3, seed=42, trace=True, 32 | 33 | # kwargs: 34 | n_steps=20, loglik_interval=20) 35 | 36 | assert train_scores.shape == (3, 3) 37 | assert val_scores.shape == (3, 3) 38 | -------------------------------------------------------------------------------- /packtml/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from abc import ABCMeta, abstractmethod 6 | import six 7 | 8 | __all__ = [ 9 | 'BaseSimpleEstimator' 10 | ] 11 | 12 | 13 | class BaseSimpleEstimator(six.with_metaclass(ABCMeta)): 14 | """Base class for packt estimators. 15 | 16 | The estimators in the Packt package do not behave exactly like scikit-learn 17 | estimators (by design). They are made to perform the model fit immediately 18 | upon class instantiation. Moreover, many of the hyper-parameter options 19 | are limited to promote readability and avoid confusion. 20 | 21 | The constructor of every Packt estimator should resemble the following:: 22 | 23 | def __init__(self, X, y, *args, **kwargs): 24 | ... 25 | 26 | where ``X`` is the training matrix, ``y`` is the training target variable, 27 | and ``*args`` and ``**kwargs`` are varargs that will differ for each 28 | estimator. 29 | """ 30 | @abstractmethod 31 | def predict(self, X): 32 | """Form predictions based on new data. 33 | 34 | This function must be implemented by subclasses to generate 35 | predictions given the model fit. 36 | 37 | Parameters 38 | ---------- 39 | X : array-like, shape=(n_samples, n_features) 40 | The test array. Should be only finite values. 
41 | """ 42 | -------------------------------------------------------------------------------- /packtml/recommendation/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | import six 6 | from abc import ABCMeta, abstractmethod 7 | 8 | __all__ = [ 9 | 'RecommenderMixin' 10 | ] 11 | 12 | try: 13 | xrange 14 | except NameError: # py3 15 | xrange = range 16 | 17 | 18 | class RecommenderMixin(six.with_metaclass(ABCMeta)): 19 | """Mixin interface for recommenders. 20 | 21 | This class should be inherited by recommender algorithms. It provides an 22 | abstract interface for generating recommendations for a user, and a 23 | function for creating recommendations for all users. 24 | """ 25 | @abstractmethod 26 | def recommend_for_user(self, R, user, n=10, filter_previously_seen=False, 27 | return_scores=True, **kwargs): 28 | """Generate recommendations for a user. 29 | 30 | A method that should be overridden by subclasses to create 31 | recommendations via their own prediction strategy. 32 | """ 33 | 34 | def recommend_for_all_users(self, R, n=10, 35 | filter_previously_seen=False, 36 | return_scores=True, **kwargs): 37 | """Create recommendations for all users.""" 38 | return ( 39 | self.recommend_for_user( 40 | R, user, n=n, filter_previously_seen=filter_previously_seen, 41 | return_scores=return_scores, **kwargs) 42 | for user in xrange(R.shape[0])) 43 | -------------------------------------------------------------------------------- /packtml/metrics/tests/test_ranking.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.metrics.ranking import (mean_average_precision, ndcg_at, 6 | precision_at) 7 | 8 | from numpy.testing import assert_almost_equal 9 | import warnings 10 | 11 | preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5], 12 | [4, 1, 5, 6, 2, 7, 3, 8, 9, 10], 13 | [1, 2, 3, 4, 5]] 14 | 15 | labels = [[1, 2, 3, 4, 5], [1, 2, 3], []] 16 | 17 | 18 | def assert_warning_caught(func): 19 | def test_wrapper(*args, **kwargs): 20 | with warnings.catch_warnings(record=True) as w: 21 | warnings.simplefilter("always") 22 | 23 | # execute the fxn 24 | func(*args, **kwargs) 25 | assert len(w) # assert there's something there... 
26 | return test_wrapper 27 | 28 | 29 | @assert_warning_caught 30 | def test_map(): 31 | assert_almost_equal( 32 | mean_average_precision(preds, labels), 0.35502645502645497) 33 | 34 | 35 | @assert_warning_caught 36 | def test_pak(): 37 | assert_almost_equal(precision_at(preds, labels, 1), 0.33333333333333331) 38 | assert_almost_equal(precision_at(preds, labels, 5), 0.26666666666666666) 39 | assert_almost_equal(precision_at(preds, labels, 15), 0.17777777777777778) 40 | 41 | 42 | @assert_warning_caught 43 | def test_ndcg(): 44 | assert_almost_equal(ndcg_at(preds, labels, 3), 0.3333333432674408) 45 | assert_almost_equal(ndcg_at(preds, labels, 10), 0.48791273434956867) 46 | -------------------------------------------------------------------------------- /packtml/decision_tree/tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.decision_tree.metrics import (entropy, gini_impurity, 6 | InformationGain) 7 | 8 | import numpy as np 9 | from numpy.testing import assert_almost_equal 10 | 11 | 12 | def test_entropy(): 13 | events = np.asarray(9 * [0] + 5 * [1]) # 9/14, 5/14 14 | ent = entropy(events) 15 | assert round(ent, 2) == 0.94, round(ent, 2) 16 | 17 | 18 | def test_gini_impurity(): 19 | x = np.asarray([0] * 10 + [1] * 10) 20 | assert gini_impurity(x) == 0.5 21 | assert gini_impurity(x[:10]) == 0. 22 | 23 | # show that no mixing of gini yields 0.0 24 | assert gini_impurity(np.array([0, 0])) == 0. 25 | 26 | # with SOME mixing we get 0.5 27 | assert gini_impurity(np.array([0, 1])) == 0.5 28 | 29 | # with a lot of mixing we get a number close to 0.8 30 | gi = gini_impurity([0, 1, 2, 3, 4]) 31 | assert_almost_equal(gi, 0.8) 32 | 33 | 34 | def test_information_gain(): 35 | X = np.array([ 36 | [0, 3], 37 | [1, 3], 38 | [2, 1], 39 | [2, 1], 40 | [1, 3] 41 | ]) 42 | 43 | y = np.array([0, 0, 1, 1, 2]) 44 | 45 | uncertainty = gini_impurity(y) 46 | assert_almost_equal(uncertainty, 0.63999999) 47 | mask = X[:, 0] == 0 48 | 49 | # compute the info gain for this mask 50 | infog = InformationGain("gini") 51 | ig = infog(y, mask, uncertainty) 52 | assert_almost_equal(ig, 0.1399999) 53 | -------------------------------------------------------------------------------- /packtml/recommendation/tests/test_als.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.recommendation import ALS 6 | 7 | # make up a ratings matrix... 8 | R = [[1., 0., 3.5, 2., 0., 0., 0., 1.5], 9 | [0., 2., 3., 0., 0., 2.5, 0., 0. ], 10 | [3.5, 4., 2., 0., 4.5, 3.5, 0., 2. ], 11 | [3., 3.5, 0., 2.5, 3., 0., 0., 0. 
]] 12 | 13 | 14 | def test_als_simple_fit(): 15 | als = ALS(R, factors=3, n_iter=5, random_state=42) 16 | assert len(als.train_err) == 5, als.train_err 17 | assert als.n_factors == 3, als.n_factors 18 | 19 | # assert all errors are decreasing over time 20 | errs = list(zip(als.train_err[:-1], als.train_err[1:])) 21 | assert all(new_err < last_err for last_err, new_err in errs), errs 22 | 23 | 24 | def test_als_predict(): 25 | als = ALS(R, factors=4, n_iter=8, random_state=42) 26 | user0, scr = als.recommend_for_user(R, 0, filter_previously_seen=True, 27 | return_scores=True) 28 | 29 | # assert previously-rated items not present 30 | rated = (0, 2, 3, 7) 31 | for r in rated: # previously-rated 32 | assert r not in user0 33 | 34 | # show the score lengths are the same 35 | assert scr.shape[0] == user0.shape[0] 36 | 37 | # now if we do NOT filter, assert those are present again (also, recompute) 38 | user0, scr = als.recommend_for_user(R, 0, filter_previously_seen=False, 39 | return_scores=True, 40 | recompute_user=True) 41 | for r in rated: 42 | assert r in user0 43 | 44 | assert user0.shape[0] == scr.shape[0] 45 | -------------------------------------------------------------------------------- /examples/recommendation/example_item_item_recommender.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.recommendation import ItemItemRecommender 6 | from packtml.recommendation.data import get_completely_fabricated_ratings_data 7 | from packtml.metrics.ranking import mean_average_precision 8 | import numpy as np 9 | 10 | # ############################################################################# 11 | # Use our fabricated data set 12 | R, titles = get_completely_fabricated_ratings_data() 13 | 14 | # ############################################################################# 15 | # Fit an item-item recommender, predict for user 0 16 | rec = ItemItemRecommender(R, k=3) 17 | user0_rec, user_0_preds = rec.recommend_for_user( 18 | R, user=0, filter_previously_seen=True, 19 | return_scores=True) 20 | 21 | # print some info about user 0 22 | top_rated = np.argsort(-R[0, :])[:3] 23 | print("User 0's top 3 rated movies are: %r" % titles[top_rated].tolist()) 24 | print("User 0's top 3 recommended movies are: %r" 25 | % titles[user0_rec[:3]].tolist()) 26 | 27 | # ############################################################################# 28 | # We can score our recommender as well, to determine how well it actually did 29 | 30 | # first, get all user recommendations (top 10, not filtered) 31 | recommendations = list(rec.recommend_for_all_users( 32 | R, n=10, filter_previously_seen=False, 33 | return_scores=False)) 34 | 35 | # get the TRUE items they've rated (in order) 36 | ground_truth = np.argsort(-R, axis=1) 37 | mean_avg_prec = mean_average_precision( 38 | predictions=recommendations, labels=ground_truth) 39 | print("Mean average precision: %.3f" % mean_avg_prec) 40 | -------------------------------------------------------------------------------- /packtml/utils/extmath.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | 7 | __all__ = [ 8 | 'log_likelihood', 9 | 'logistic_sigmoid' 10 | ] 11 | 12 | 13 | def log_likelihood(X, y, w): 14 | """Compute the log-likelihood function. 
15 | 16 | Computes the log-likelihood function over the training data. 17 | The key to the log-likelihood is that the log of the product of 18 | likelihoods becomes the sum of logs. That is (in pseudo-code), 19 | 20 | np.log(np.product([f(i) for i in range(N)])) 21 | 22 | is equivalent to: 23 | 24 | np.sum([np.log(f(i)) for i in range(N)]) 25 | 26 | The log-likelihood function is used in computing the gradient for 27 | our loss function since the derivative of the sum (of logs) is equivalent 28 | to the sum of derivatives, which simplifies all of our math. 29 | 30 | Parameters 31 | ---------- 32 | X : np.ndarray, shape=(n_samples, n_features) 33 | The training data. 34 | 35 | y : np.ndarray, shape=(n_samples,) 36 | The target vector of 1s or 0s. 37 | 38 | w : np.ndarray, shape=(n_features,) 39 | The vector of feature weights (coefficients) 40 | 41 | References 42 | ---------- 43 | .. [1] For a very thorough explanation of the log-likelihood function, see 44 | https://www.coursera.org/learn/ml-classification/lecture/1ZeTC/very-optional-expressing-the-log-likelihood 45 | """ 46 | weighted = X.dot(w) 47 | return (y * weighted - np.log(1. + np.exp(weighted))).sum() 48 | 49 | 50 | def logistic_sigmoid(x): 51 | """The logistic function. 52 | 53 | Compute the logistic (sigmoid) function over a vector, ``x``. 54 | 55 | Parameters 56 | ---------- 57 | x : np.ndarray, shape=(n_samples,) 58 | A vector to transform. 59 | """ 60 | return 1. / (1. + np.exp(-x)) 61 | -------------------------------------------------------------------------------- /examples/run_all_examples.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This function is not intended to be run by students (or anyone, for that 4 | # matter). It is intended to be run by me (Taylor) just to automate the 5 | # population of the img/ directory with the output of the example plots. 6 | # Hence its poor documentation and sheer hackiness. 7 | 8 | from __future__ import absolute_import 9 | 10 | import os 11 | import sys 12 | import subprocess 13 | 14 | # determine where the user is calling this from... 15 | here = os.listdir(".") 16 | if "examples" in here: 17 | cwd = "examples" 18 | img_dir = "img" 19 | elif "clustering" in here: 20 | cwd = "." 21 | img_dir = "../img" 22 | else: 23 | raise ValueError("Call this from top-level or from within " 24 | "the examples dir") 25 | 26 | # iterate all py files 27 | for root, dirs, files in os.walk(cwd, topdown=False): 28 | for fil in files: 29 | # Only run the ones with the appropriate prefix 30 | if not fil.startswith("example_"): 31 | continue 32 | 33 | # Get the module root 34 | module_root = root.split(os.sep)[1] 35 | 36 | # If it's "data" we don't want that! 
That's where we cache the data 37 | # for the demo 38 | if module_root in ("data", ".ipynb_checkpoints"): 39 | print("Skipping dir: %s" % module_root) 40 | continue 41 | 42 | # Otherwise create its corresponding path in ../img 43 | image_root = os.path.join(img_dir, module_root) # ../img/clustering 44 | 45 | # create the directory in the image dir if it's not there 46 | if not os.path.exists(image_root): 47 | os.mkdir(image_root) 48 | 49 | # run it 50 | dest = os.path.join(image_root, fil[:-3] + ".png") 51 | filexec = os.path.join(root, fil) 52 | 53 | print("Running %s" % filexec) 54 | subprocess.Popen([sys.executable, filexec, dest]) 55 | 56 | sys.exit(0) 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # scratch code 2 | scratch/ 3 | 4 | # Any data unpackaged by tensorflow 5 | MNIST_data/ 6 | 7 | # In-progress word docs 8 | ~$*.doc* 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # Ignore PyCharm stuff... 16 | .idea/ 17 | 18 | # Mac stuff 19 | .DS_Store 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Testing 25 | .pytest_cache/ 26 | 27 | # Distribution / packaging 28 | .Python 29 | env/ 30 | build/ 31 | develop-eggs/ 32 | dist/ 33 | downloads/ 34 | eggs/ 35 | .eggs/ 36 | lib/ 37 | lib64/ 38 | parts/ 39 | sdist/ 40 | var/ 41 | wheels/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | .hypothesis/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | local_settings.py 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # celery beat schedule file 95 | celerybeat-schedule 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # dotenv 101 | .env 102 | 103 | # virtualenv 104 | .venv 105 | venv/ 106 | ENV/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | -------------------------------------------------------------------------------- /examples/decision_tree/example_regression_decision_tree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.decision_tree import CARTRegressor 6 | from sklearn.metrics import mean_squared_error 7 | from sklearn.model_selection import train_test_split 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import sys 11 | 12 | # ############################################################################# 13 | # Create a classification dataset 14 | rs = np.random.RandomState(42) 15 | X = np.sort(5 * rs.rand(80, 1), axis=0) 16 | y = np.sin(X).ravel() 17 | 
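# note: the target above is a noiseless sine curve over sorted X, so the deeper
# tree fit below should track it much more closely than the shallow one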
18 | # split the data 19 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 20 | 21 | # ############################################################################# 22 | # Fit a simple decision tree regressor and get predictions 23 | clf = CARTRegressor(X_train, y_train, max_depth=3, random_state=42) 24 | pred = clf.predict(X_test) 25 | clf_mse = mean_squared_error(y_test, pred) 26 | print("Test MSE (depth=3): %.3f" % clf_mse) 27 | 28 | # Fit a deeper tree and show accuracy increases 29 | clf2 = CARTRegressor(X_train, y_train, max_depth=10, random_state=42) 30 | pred2 = clf2.predict(X_test) 31 | clf2_mse = mean_squared_error(y_test, pred2) 32 | print("Test MSE (depth=10): %.3f" % clf2_mse) 33 | 34 | # ############################################################################# 35 | # Visualize difference in learning ability 36 | 37 | x = X_train.ravel() 38 | xte = X_test.ravel() 39 | 40 | fig, axes = plt.subplots(1, 2, figsize=(12, 8)) 41 | axes[0].scatter(x, y_train, alpha=0.25, c='r') 42 | axes[0].scatter(xte, pred, alpha=1.) 43 | axes[0].set_title("Shallow tree (depth=3) test MSE: %.3f" % clf_mse) 44 | 45 | axes[1].scatter(x, y_train, alpha=0.4, c='r') 46 | axes[1].scatter(xte, pred2, alpha=1.) 47 | axes[1].set_title("Deeper tree (depth=10) test MSE: %.3f" % clf2_mse) 48 | 49 | # if we're supposed to save it, do so INSTEAD OF showing it 50 | if len(sys.argv) > 1: 51 | plt.savefig(sys.argv[1]) 52 | else: 53 | plt.show() 54 | -------------------------------------------------------------------------------- /examples/regression/example_linear_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.regression import SimpleLinearRegression 6 | from sklearn.linear_model import LinearRegression 7 | from sklearn.model_selection import train_test_split 8 | from matplotlib import pyplot as plt 9 | import numpy as np 10 | import sys 11 | 12 | # ############################################################################# 13 | # Create a data-set that perfectly models the linear relationship: 14 | # y = 2a + 1.5b + 0 15 | random_state = np.random.RandomState(42) 16 | X = random_state.rand(500, 2) 17 | y = 2. 
* X[:, 0] + 1.5 * X[:, 1] 18 | 19 | # split the data 20 | X_train, X_test, y_train, y_test = train_test_split(X, y, 21 | random_state=random_state) 22 | 23 | # ############################################################################# 24 | # Fit a simple linear regression, produce predictions 25 | lm = SimpleLinearRegression(X_train, y_train) 26 | predictions = lm.predict(X_test) 27 | print("Test sum of residuals: %.3f" % (y_test - predictions).sum()) 28 | assert np.allclose(lm.theta, [2., 1.5]) 29 | 30 | # ############################################################################# 31 | # Show that our solution is similar to scikit-learn's 32 | 33 | lr = LinearRegression(fit_intercept=True) 34 | lr.fit(X_train, y_train) 35 | assert np.allclose(lm.theta, lr.coef_) 36 | assert np.allclose(predictions, lr.predict(X_test)) 37 | 38 | # ############################################################################# 39 | # Fit another on ONE feature so we can show the plot 40 | X_train = X_train[:, np.newaxis, 0] 41 | X_test = X_test[:, np.newaxis, 0] 42 | lm = SimpleLinearRegression(X_train, y_train) 43 | 44 | # create the predictions & plot them as the line 45 | preds = lm.predict(X_test) 46 | plt.scatter(X_test[:, 0], y_test, color='black') 47 | plt.plot(X_test[:, 0], preds, linewidth=3) 48 | 49 | # if we're supposed to save it, do so INSTEAD OF showing it 50 | if len(sys.argv) > 1: 51 | plt.savefig(sys.argv[1]) 52 | else: 53 | plt.show() 54 | -------------------------------------------------------------------------------- /examples/recommendation/example_als_recommender.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.recommendation import ALS 6 | from packtml.recommendation.data import get_completely_fabricated_ratings_data 7 | from packtml.metrics.ranking import mean_average_precision 8 | from matplotlib import pyplot as plt 9 | import numpy as np 10 | import sys 11 | 12 | # ############################################################################# 13 | # Use our fabricated data set 14 | R, titles = get_completely_fabricated_ratings_data() 15 | 16 | # ############################################################################# 17 | # Fit an item-item recommender, predict for user 0 18 | n_iter = 25 19 | rec = ALS(R, factors=5, n_iter=n_iter, random_state=42, lam=0.01) 20 | user0_rec, user_0_preds = rec.recommend_for_user( 21 | R, user=0, filter_previously_seen=True, 22 | return_scores=True) 23 | 24 | # print some info about user 0 25 | top_rated = np.argsort(-R[0, :])[:3] 26 | print("User 0's top 3 rated movies are: %r" % titles[top_rated].tolist()) 27 | print("User 0's top 3 recommended movies are: %r" 28 | % titles[user0_rec[:3]].tolist()) 29 | 30 | # ############################################################################# 31 | # We can score our recommender as well, to determine how well it actually did 32 | 33 | # first, get all user recommendations (top 10, not filtered) 34 | recommendations = list(rec.recommend_for_all_users( 35 | R, n=10, filter_previously_seen=False, 36 | return_scores=False)) 37 | 38 | # get the TRUE items they've rated (in order) 39 | ground_truth = np.argsort(-R, axis=1) 40 | mean_avg_prec = mean_average_precision( 41 | predictions=recommendations, labels=ground_truth) 42 | print("Mean average precision: %.3f" % mean_avg_prec) 43 | 44 | # plot the error 45 | plt.plot(np.arange(n_iter), rec.train_err) 46 | 
plt.xlabel("Iteration") 47 | plt.ylabel("MSE") 48 | plt.title("Train error by iteration") 49 | 50 | # if we're supposed to save it, do so INSTEAD OF showing it 51 | if len(sys.argv) > 1: 52 | plt.savefig(sys.argv[1]) 53 | else: 54 | plt.show() 55 | -------------------------------------------------------------------------------- /packtml/neural_net/tests/test_transfer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.neural_net import NeuralNetClassifier, TransferLearningClassifier 6 | 7 | import numpy as np 8 | 9 | 10 | def test_transfer_learner(): 11 | rs = np.random.RandomState(42) 12 | covariance = [[1, .75], [.75, 1]] 13 | 14 | # these are the majority classes 15 | n_obs = 500 16 | x1 = rs.multivariate_normal(mean=[0, 0], cov=covariance, size=n_obs) 17 | x2 = rs.multivariate_normal(mean=[1, 5], cov=covariance, size=n_obs) 18 | 19 | # this is the minority class 20 | x3 = rs.multivariate_normal(mean=[0.85, 3.25], 21 | cov=[[1., .5], [1.25, 0.85]], 22 | size=150) 23 | 24 | # this is what the FIRST network will be trained on 25 | n_first = 400 26 | X = np.vstack((x1[:n_first], x2[:n_first])).astype(np.float32) 27 | y = np.hstack((np.zeros(n_first), np.ones(n_first))).astype(int) 28 | 29 | # this is what the SECOND network will be trained on 30 | X2 = np.vstack((x1[n_first:], x2[n_first:], x3)).astype(np.float32) 31 | y2 = np.hstack((np.zeros(n_obs - n_first), 32 | np.ones(n_obs - n_first), 33 | np.ones(x3.shape[0]) * 2)).astype(int) 34 | 35 | # Fit the first neural network 36 | clf = NeuralNetClassifier(X, y, hidden=(25, 25), n_iter=50, 37 | learning_rate=0.001, random_state=42) 38 | 39 | # Fit the transfer network - train one more layer with a new class 40 | transfer = TransferLearningClassifier(X2, y2, pretrained=clf, hidden=(15,), 41 | n_iter=10, random_state=42) 42 | 43 | # show we can predict 44 | transfer.predict(X2) 45 | 46 | # show we can use a transfer learner on an existing transfer learner 47 | transfer2 = TransferLearningClassifier(X2, y2, pretrained=transfer, 48 | hidden=(25,), 49 | random_state=15) 50 | 51 | # and show we can still predict 52 | transfer2.predict(X2) 53 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | import sys 6 | import setuptools 7 | 8 | with open("packtml/VERSION", 'r') as vsn: 9 | VERSION = vsn.read().strip() 10 | 11 | # Permitted args: "install" only, basically. 12 | UNSUPPORTED_COMMANDS = { # this is a set literal, not a dict 13 | 'develop', 'release', 'bdist_egg', 'bdist_rpm', 14 | 'bdist_wininst', 'install_egg_info', 'build_sphinx', 15 | 'egg_info', 'easy_install', 'upload', 'bdist_wheel', 16 | '--single-version-externally-managed', 'test', 'build_ext' 17 | } 18 | 19 | intersect = UNSUPPORTED_COMMANDS.intersection(set(sys.argv)) 20 | if intersect: 21 | msg = "The following arguments are unsupported: %s. " \ 22 | "To install, please use `python setup.py install`." \ 23 | % str(list(intersect)) 24 | 25 | # if "test" is in the arguments, make sure the user knows how to test. 
26 | if "test" in intersect: 27 | msg += " To test, make sure pytest is installed, and after " \ 28 | "installation run `pytest packtml`" 29 | 30 | raise ValueError(msg) 31 | 32 | # get requirements 33 | with open("requirements.txt") as req: 34 | REQUIREMENTS = req.read().strip().split("\n") 35 | 36 | py_version_tag = '-%s.%s'.format(sys.version_info[:2]) 37 | setuptools.setup(name="packtml", 38 | description="Hands-on Supervised Learning - teach a machine " 39 | "to think for itself!", 40 | author="Taylor G Smith", 41 | author_email="taylor.smith@alkaline-ml.com", 42 | packages=['packtml', 43 | 'packtml/clustering', 44 | 'packtml/decision_tree', 45 | 'packtml/metrics', 46 | 'packtml/neural_net', 47 | 'packtml/recommendation', 48 | 'packtml/regression', 49 | 'packtml/utils'], 50 | zip_safe=False, 51 | include_package_data=True, 52 | install_requires=REQUIREMENTS, 53 | package_data={"packtml": ["*"]}, 54 | python_requires='>=3.5, <4', 55 | version=VERSION) 56 | -------------------------------------------------------------------------------- /examples/clustering/example_knn_classifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.clustering import KNNClassifier 6 | from packtml.utils.plotting import add_decision_boundary_to_axis 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.metrics import accuracy_score 10 | from sklearn.datasets import load_iris 11 | from matplotlib import pyplot as plt 12 | from matplotlib.colors import ListedColormap 13 | import sys 14 | 15 | # ############################################################################# 16 | # Create a classification sub-dataset using iris 17 | iris = load_iris() 18 | X = iris.data[:, :2] # just use the first two dimensions 19 | y = iris.target 20 | 21 | # split data 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 23 | 24 | # scale the data 25 | scaler = StandardScaler() 26 | X_train = scaler.fit_transform(X_train) 27 | X_test = scaler.transform(X_test) 28 | 29 | # ############################################################################# 30 | # Fit a k-nearest neighbor model and get predictions 31 | k=10 32 | clf = KNNClassifier(X_train, y_train, k=k) 33 | pred = clf.predict(X_test) 34 | clf_accuracy = accuracy_score(y_test, pred) 35 | print("Test accuracy: %.3f" % clf_accuracy) 36 | 37 | # ############################################################################# 38 | # Visualize difference in classes (this is from the scikit-learn KNN 39 | # plotting example: 40 | # http://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html#sphx-glr-auto-examples-neighbors-plot-classification-py) 41 | 42 | xx, yy, _ = add_decision_boundary_to_axis(estimator=clf, axis=plt, 43 | nclasses=3, X_data=X_test) 44 | 45 | # Plot also the training points 46 | plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, 47 | cmap=ListedColormap(['#FF0000', '#00FF00', '#0000FF']), 48 | edgecolor='k', s=20) 49 | 50 | plt.xlim(xx.min(), xx.max()) 51 | plt.ylim(yy.min(), yy.max()) 52 | plt.title("3-Class classification (k=%i)" % k) 53 | 54 | # if we're supposed to save it, do so INSTEAD OF showing it 55 | if len(sys.argv) > 1: 56 | plt.savefig(sys.argv[1]) 57 | else: 58 | plt.show() 59 | -------------------------------------------------------------------------------- 
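A stripped-down sketch of the KNN estimator outside the plotting example above, following the fit-at-construction pattern described in packtml/base.py and reusing the toy data from packtml/clustering/tests/test_knn.py (both shown earlier):

import numpy as np
from packtml.clustering import KNNClassifier

X2 = np.array([[0., 0., 0.5], [0., 0.5, 0.], [0.5, 0., 0.],
               [5., 5., 6.], [6., 5., 5.]])
y2 = [0, 0, 0, 1, 1]

knn = KNNClassifier(X2, y2, k=3)  # the model is fit immediately at construction
print(knn.predict(X2))            # -> [0 0 0 1 1] on this toy data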
/examples/regression/example_logistic_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.regression import SimpleLogisticRegression 6 | from packtml.utils.plotting import add_decision_boundary_to_axis 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.datasets import make_classification 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.metrics import accuracy_score 11 | from matplotlib import pyplot as plt 12 | import sys 13 | 14 | # ############################################################################# 15 | # Create an almost perfectly linearly-separable classification set 16 | X, y = make_classification(n_samples=100, n_features=2, random_state=42, 17 | n_redundant=0, n_repeated=0, n_classes=2, 18 | class_sep=1.0) 19 | 20 | # split data 21 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 22 | 23 | # ############################################################################# 24 | # Fit a simple logistic regression, produce predictions 25 | lm = SimpleLogisticRegression(X_train, y_train, n_steps=50) 26 | 27 | predictions = lm.predict(X_test) 28 | acc = accuracy_score(y_test, predictions) 29 | print("Test accuracy: %.3f" % acc) 30 | 31 | # Show that our solution is similar to scikit-learn's 32 | lr = LogisticRegression(fit_intercept=True, C=1e16) # almost no regularization 33 | lr.fit(X_train, y_train) 34 | print("Sklearn test accuracy: %.3f" % accuracy_score(y_test, 35 | lr.predict(X_test))) 36 | 37 | # ############################################################################# 38 | # Plot the data and the boundary we learned. 39 | 40 | add_decision_boundary_to_axis(estimator=lm, axis=plt, 41 | nclasses=2, X_data=X_test) 42 | 43 | # We have to break this into two plot calls, one for each class to 44 | # have different markers... 
45 | c0_mask = y_test == 0 46 | plt.scatter(X_test[c0_mask, 0], X_test[c0_mask, 1], 47 | c=~predictions[c0_mask], marker='o') 48 | plt.scatter(X_test[~c0_mask, 0], X_test[~c0_mask, 1], 49 | c=~predictions[~c0_mask], marker='x') 50 | 51 | plt.title("Logistic test performance: %.4f (o=true 0, x=true 1)" % acc) 52 | 53 | # if we're supposed to save it, do so INSTEAD OF showing it 54 | if len(sys.argv) > 1: 55 | plt.savefig(sys.argv[1]) 56 | else: 57 | plt.show() 58 | -------------------------------------------------------------------------------- /examples/decision_tree/example_classification_decision_tree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.decision_tree import CARTClassifier 6 | from packtml.utils.plotting import add_decision_boundary_to_axis 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.model_selection import train_test_split 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import sys 12 | 13 | # ############################################################################# 14 | # Create a classification dataset 15 | rs = np.random.RandomState(42) 16 | covariance = [[1, .75], [.75, 1]] 17 | n_obs = 500 18 | x1 = rs.multivariate_normal(mean=[0, 0], cov=covariance, size=n_obs) 19 | x2 = rs.multivariate_normal(mean=[1, 3], cov=covariance, size=n_obs) 20 | 21 | X = np.vstack((x1, x2)).astype(np.float32) 22 | y = np.hstack((np.zeros(n_obs), np.ones(n_obs))) 23 | 24 | # split the data 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 26 | 27 | # ############################################################################# 28 | # Fit a simple decision tree classifier and get predictions 29 | shallow_depth = 2 30 | clf = CARTClassifier(X_train, y_train, max_depth=shallow_depth, criterion='gini', 31 | random_state=42) 32 | pred = clf.predict(X_test) 33 | clf_accuracy = accuracy_score(y_test, pred) 34 | print("Test accuracy (depth=%i): %.3f" % (shallow_depth, clf_accuracy)) 35 | 36 | # Fit a deeper tree and show accuracy increases 37 | clf2 = CARTClassifier(X_train, y_train, max_depth=25, criterion='gini', 38 | random_state=42) 39 | pred2 = clf2.predict(X_test) 40 | clf2_accuracy = accuracy_score(y_test, pred2) 41 | print("Test accuracy (depth=25): %.3f" % clf2_accuracy) 42 | 43 | # ############################################################################# 44 | # Visualize difference in classification ability 45 | 46 | fig, axes = plt.subplots(1, 2, figsize=(12, 8)) 47 | 48 | add_decision_boundary_to_axis(estimator=clf, axis=axes[0], 49 | nclasses=2, X_data=X_test) 50 | axes[0].scatter(X_test[:, 0], X_test[:, 1], c=pred, alpha=0.4) 51 | axes[0].set_title("Shallow tree (depth=%i) performance: %.3f" 52 | % (shallow_depth, clf_accuracy)) 53 | 54 | add_decision_boundary_to_axis(estimator=clf2, axis=axes[1], 55 | nclasses=2, X_data=X_test) 56 | axes[1].scatter(X_test[:, 0], X_test[:, 1], c=pred2, alpha=0.4) 57 | axes[1].set_title("Deep tree (depth=25) performance: %.3f" % clf2_accuracy) 58 | 59 | # if we're supposed to save it, do so INSTEAD OF showing it 60 | if len(sys.argv) > 1: 61 | plt.savefig(sys.argv[1]) 62 | else: 63 | plt.show() 64 | -------------------------------------------------------------------------------- /packtml/recommendation/tests/test_itemitem.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from 
__future__ import absolute_import 4 | 5 | from packtml.recommendation import ItemItemRecommender 6 | 7 | import numpy as np 8 | from numpy.testing import assert_array_almost_equal 9 | 10 | from types import GeneratorType 11 | 12 | # make up a ratings matrix... 13 | R = np.array([[1., 0., 3.5, 2., 0., 0., 0., 1.5], 14 | [0., 2., 3., 0., 0., 2.5, 0., 0. ], 15 | [3.5, 4., 2., 0., 4.5, 3.5, 0., 2. ], 16 | [3., 3.5, 0., 2.5, 3., 0., 0., 0. ]]) 17 | 18 | 19 | def test_itemitem_simple(): 20 | rec = ItemItemRecommender(R, k=3) 21 | 22 | # assert on the similarity 23 | expected = np.array([ 24 | [ 1. , 0.91461057, 0. , 0. , 0.9701687 , 25 | 0. , 0. , 0. ], 26 | [ 0.91461057, 1. , 0. , 0. , 0.92793395, 27 | 0. , 0. , 0. ], 28 | [ 0. , 0. , 1. , 0. , 0. , 29 | 0.6708902 , 0. , 0.73632752], 30 | [ 0.62906665, 0.48126166, 0. , 1. , 0. , 31 | 0. , 0. , 0. ], 32 | [ 0.9701687 , 0.92793395, 0. , 0. , 1. , 33 | 0. , 0. , 0. ], 34 | [ 0. , 0.77786258, 0. , 0. , 0.67706717, 35 | 1. , 0. , 0. ], 36 | [ 0. , 0. , 0. , 0. , 0. , 37 | 0. , 0. , 0. ], 38 | [ 0.72079856, 0. , 0.73632752, 0. , 0. , 39 | 0. , 0. , 1. ]]) 40 | 41 | assert_array_almost_equal(expected, rec.similarity) 42 | 43 | # show we can generate recommendations 44 | rec0, scores0 = rec.recommend_for_user(R, 0) 45 | 46 | # we didn't filter, so the rated items should still be present 47 | assert np.in1d([0, 2, 3, 7], rec0).all() 48 | 49 | # re-compute and show the previously-rated are not present 50 | rec0_filtered, scores0_filtered = rec.recommend_for_user( 51 | R, 0, filter_previously_seen=True) 52 | 53 | assert len(rec0_filtered) == 4, rec0_filtered 54 | assert rec0_filtered.tolist() == [5, 1, 4, 6] 55 | 56 | # test the prediction, which is just a big product... 57 | pred = rec.predict(R) 58 | assert pred.shape == R.shape 59 | 60 | # get recommendations for ALL users 61 | recommendations = rec.recommend_for_all_users(R, return_scores=False, 62 | filter_previously_seen=False) 63 | 64 | assert isinstance(recommendations, GeneratorType) 65 | recs = list(recommendations) 66 | assert len(recs) == 4 67 | assert all(len(x) == 8 for x in recs) 68 | -------------------------------------------------------------------------------- /packtml/recommendation/data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | 7 | __all__ = [ 8 | 'get_completely_fabricated_ratings_data' 9 | ] 10 | 11 | 12 | def get_completely_fabricated_ratings_data(): 13 | """Disclaimer: this is a made-up data set. 14 | 15 | Get a ratings data set for use with one of the packtml recommenders. 16 | This data set is a completely made-up ratings matrix consisting of 17 | cult classics, all of which are awesome (seriously, if there are any 18 | you haven't seen, you should). 19 | 20 | (Please 21 | don't 22 | sue 23 | 24 | me......) 25 | 26 | The data contains 5 users and 15 items (movies). Movies: 27 | 28 | 0) Ghost Busters 29 | 1) Ghost Busters 2 30 | 2) The Goonies 31 | 3) Big Trouble in Little China 32 | 4) The Rocky Horror Picture Show 33 | 5) A Clockwork Orange 34 | 6) Pulp Fiction 35 | 7) Bill & Ted's Excellent Adventure 36 | 8) Weekend at Bernie's 37 | 9) Dumb and Dumber 38 | 10) Clerks 39 | 11) Jay & Silent Bob Strike Back 40 | 12) Tron 41 | 13) Total Recall 42 | 14) The Princess Bride 43 | 44 | Notes 45 | ----- 46 | Seriously, I fabricated all of these ratings semi-haphazardly. Don't 47 | take this as me bashing any movies. 
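    Examples
    --------
    A quick shape check (illustrative):

    >>> R, titles = get_completely_fabricated_ratings_data()
    >>> R.shape
    (5, 15)
    >>> titles.shape
    (15,)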
48 | """ 49 | return (np.array([ 50 | # user 0 is a classic 30-yo millennial who is nostalgic for the 90s 51 | [5.0, 3.5, 5.0, 0.0, 0.0, 0.0, 4.5, 3.0, 52 | 0.0, 2.5, 4.0, 4.0, 0.0, 1.5, 3.0], 53 | 54 | # user 1 is a 40-yo who only likes action 55 | [1.5, 0.0, 0.0, 1.0, 0.0, 4.0, 5.0, 0.0, 56 | 2.0, 0.0, 3.0, 3.5, 0.0, 4.0, 0.0], 57 | 58 | # user 2 is a 12-yo whose parents are strict about what she watches. 59 | [4.5, 4.0, 5.0, 0.0, 0.0, 0.0, 0.0, 4.0, 60 | 3.5, 5.0, 0.0, 0.0, 0.0, 0.0, 5.0], 61 | 62 | # user 3 has just about seen it all, and doesn't really care for 63 | # the goofy stuff. (but seriously, who rates the Goonies 2/5???) 64 | [2.0, 1.0, 2.0, 1.0, 2.5, 4.5, 4.5, 0.5, 65 | 1.5, 1.0, 2.0, 2.5, 3.5, 3.5, 2.0], 66 | 67 | # user 4 has just opened a netflix account and hasn't had a chance 68 | # to watch too much 69 | [0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 70 | 0.0, 0.0, 0.0, 1.5, 4.0, 0.0, 0.0], 71 | ]), np.array(["Ghost Busters", "Ghost Busters 2", 72 | "The Goonies", "Big Trouble in Little China", 73 | "The Rocky Horror Picture Show", "A Clockwork Orange", 74 | "Pulp Fiction", "Bill & Ted's Excellent Adventure", 75 | "Weekend at Bernie's", "Dumb and Dumber", "Clerks", 76 | "Jay & Silent Bob Strike Back", "Tron", "Total Recall", 77 | "The Princess Bride" ])) 78 | -------------------------------------------------------------------------------- /examples/neural_net/example_mlp_classifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.neural_net import NeuralNetClassifier 6 | from packtml.utils.plotting import add_decision_boundary_to_axis 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.model_selection import train_test_split 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import sys 12 | 13 | # ############################################################################# 14 | # Create a classification dataset 15 | rs = np.random.RandomState(42) 16 | covariance = [[1, .75], [.75, 1]] 17 | n_obs = 1000 18 | x1 = rs.multivariate_normal(mean=[0, 0], cov=covariance, size=n_obs) 19 | x2 = rs.multivariate_normal(mean=[1, 5], cov=covariance, size=n_obs) 20 | 21 | X = np.vstack((x1, x2)).astype(np.float32) 22 | y = np.hstack((np.zeros(n_obs), np.ones(n_obs))).astype(int) 23 | 24 | # split the data 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rs) 26 | 27 | # ############################################################################# 28 | # Fit a simple neural network 29 | n_iter = 4 30 | hidden = (10,) 31 | clf = NeuralNetClassifier(X_train, y_train, hidden=hidden, n_iter=n_iter, 32 | learning_rate=0.001, random_state=42) 33 | print("Loss per training iteration: %r" % clf.train_loss) 34 | 35 | pred = clf.predict(X_test) 36 | clf_accuracy = accuracy_score(y_test, pred) 37 | print("Test accuracy (hidden=%s): %.3f" % (str(hidden), clf_accuracy)) 38 | 39 | # ############################################################################# 40 | # Fit a more complex neural network 41 | n_iter2 = 150 42 | hidden2 = (25, 25) 43 | clf2 = NeuralNetClassifier(X_train, y_train, hidden=hidden2, n_iter=n_iter2, 44 | learning_rate=0.001, random_state=42) 45 | 46 | pred2 = clf2.predict(X_test) 47 | clf_accuracy2 = accuracy_score(y_test, pred2) 48 | print("Test accuracy (hidden=%s): %.3f" % (str(hidden2), clf_accuracy2)) 49 | 50 | # ############################################################################# 51 | 
# Visualize difference in classification ability 52 | 53 | fig, axes = plt.subplots(2, 2, figsize=(12, 8)) 54 | 55 | add_decision_boundary_to_axis(estimator=clf, axis=axes[0, 0], 56 | nclasses=2, X_data=X_test) 57 | axes[0, 0].scatter(X_test[:, 0], X_test[:, 1], c=pred, alpha=0.4) 58 | axes[0, 0].set_title("Shallow (hidden=%s @ %i iter) test accuracy: %.3f" 59 | % (str(hidden), n_iter, clf_accuracy)) 60 | 61 | add_decision_boundary_to_axis(estimator=clf2, axis=axes[0, 1], 62 | nclasses=2, X_data=X_test) 63 | axes[0, 1].scatter(X_test[:, 0], X_test[:, 1], c=pred2, alpha=0.4) 64 | axes[0, 1].set_title("Deeper (hidden=%s @ %i iter): test accuracy: %.3f" 65 | % (str(hidden2), n_iter2, clf_accuracy2)) 66 | 67 | # show the learning rates for each 68 | axes[1, 0].plot(np.arange(len(clf.train_loss)), clf.train_loss) 69 | axes[1, 0].set_title("Training loss by iteration") 70 | 71 | axes[1, 1].plot(np.arange(len(clf2.train_loss)), clf2.train_loss) 72 | axes[1, 1].set_title("Training loss by iteration") 73 | 74 | # if we're supposed to save it, do so INSTEAD OF showing it 75 | if len(sys.argv) > 1: 76 | plt.savefig(sys.argv[1]) 77 | else: 78 | plt.show() 79 | -------------------------------------------------------------------------------- /packtml/regression/simple_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from sklearn.utils.validation import check_X_y, check_array 6 | 7 | import numpy as np 8 | from numpy.linalg import lstsq 9 | 10 | from packtml.base import BaseSimpleEstimator 11 | 12 | 13 | __all__ = [ 14 | 'SimpleLinearRegression' 15 | ] 16 | 17 | 18 | class SimpleLinearRegression(BaseSimpleEstimator): 19 | """Simple linear regression. 20 | 21 | This class provides a very simple example of straight forward OLS 22 | regression with an intercept. There are no tunable parameters, and 23 | the model fit happens directly on class instantiation. 24 | 25 | Parameters 26 | ---------- 27 | X : array-like, shape=(n_samples, n_features) 28 | The array of predictor variables. This is the array we will use 29 | to regress on ``y``. 30 | 31 | y : array-like, shape=(n_samples,) 32 | This is the target array on which we will regress to build 33 | our model. 34 | 35 | Attributes 36 | ---------- 37 | theta : array-like, shape=(n_features,) 38 | The least-squares solution (the coefficients) 39 | 40 | rank : int 41 | The rank of the predictor matrix, ``X`` 42 | 43 | singular_values : array-like, shape=(n_features,) 44 | The singular values of ``X`` 45 | 46 | X_means : array-like, shape=(n_features,) 47 | The column means of the predictor matrix, ``X`` 48 | 49 | y_mean : float 50 | The mean of the target variable, ``y`` 51 | 52 | intercept : float 53 | The intercept term 54 | """ 55 | def __init__(self, X, y): 56 | # First check X, y and make sure they are of equal length, no NaNs 57 | # and that they are numeric 58 | X, y = check_X_y(X, y, y_numeric=True, 59 | accept_sparse=False) # keep it simple 60 | 61 | # Next, we want to scale all of our features so X is centered 62 | # We will do the same with our target variable, y 63 | X_means = np.average(X, axis=0) 64 | y_mean = y.mean(axis=0) 65 | 66 | # don't do in place, so we get a copy 67 | X = X - X_means 68 | y = y - y_mean 69 | 70 | # Let's compute the least squares on X wrt y 71 | # Least squares solves the equation `a x = b` by computing a 72 | # vector `x` that minimizes the Euclidean 2-norm `|| b - a x ||^2`. 
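        # An added aside for intuition (not part of the original comment): because
        # X and y have been centered above, lstsq only has to estimate the slope
        # coefficients; the intercept is recovered further down as
        # y_mean - X_means.dot(theta). In the single-feature case the solution
        # reduces to theta = sum(x * y) / sum(x ** 2).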
73 | theta, _, rank, singular_values = lstsq(X, y, rcond=None) 74 | 75 | # finally, we compute the intercept values as the mean of the target 76 | # variable MINUS the inner product of the X_means and the coefficients 77 | intercept = y_mean - np.dot(X_means, theta.T) 78 | 79 | # ... and set everything as an instance attribute 80 | self.theta = theta 81 | self.rank = rank 82 | self.singular_values = singular_values 83 | 84 | # we have to retain some of the statistics around the data too 85 | self.X_means = X_means 86 | self.y_mean = y_mean 87 | self.intercept = intercept 88 | 89 | def predict(self, X): 90 | """Compute new predictions for X""" 91 | # copy, make sure numeric, etc... 92 | X = check_array(X, accept_sparse=False, copy=False) # type: np.ndarray 93 | 94 | # make sure dims match 95 | theta = self.theta 96 | if theta.shape[0] != X.shape[1]: 97 | raise ValueError("Dim mismatch in predictors!") 98 | 99 | # creates a copy 100 | return np.dot(X, theta.T) + self.intercept 101 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # AppVeyor.com is a Continuous Integration service to build and run tests under 2 | # Windows. This .yml file is based on scikit-learn and statsmodels' Appveyor CI 3 | # setups, adapted for use with the Hands-on Supervised Learning repo 4 | 5 | # This image contains the most pre-installed software (including supposedly 6 | # MinGW and Miniconda?...) 7 | image: 8 | - Visual Studio 2015 9 | 10 | cache: 11 | - '%LOCALAPPDATA%\pip\Cache' 12 | 13 | environment: 14 | global: 15 | APPVEYOR_SAVE_CACHE_ON_ERROR: false 16 | TEST_TIMEOUT: 1000 17 | # SDK v7.0 MSVC Express 2008's SetEnv.cmd script will fail if the 18 | # /E:ON and /V:ON options are not enabled in the batch script interpreter 19 | # See: http://stackoverflow.com/a/13751649/163740 20 | # CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\build_tools\\appveyor\\run_with_env.cmd" 21 | 22 | matrix: 23 | - PYTHON: C:\Miniconda35 24 | PYTHON_VERSION: 3.5 25 | PYTHON_ARCH: 32 26 | 27 | - PYTHON: C:\Miniconda35-x64 28 | PYTHON_VERSION: 3.5 29 | PYTHON_ARCH: 64 30 | 31 | # Currently failing due to Appveyor bugs? 32 | # - PYTHON: C:\Miniconda36 33 | # PYTHON_VERSION: 3.6 34 | # PYTHON_ARCH: 32 35 | # 36 | # - PYTHON: C:\Miniconda36-x64 37 | # PYTHON_VERSION: 3.6 38 | # PYTHON_ARCH: 64 39 | 40 | init: 41 | - "ECHO \"%APPVEYOR_SCHEDULED_BUILD%\"" 42 | 43 | # If there is a newer build queued for the same PR, cancel this one. 44 | # The AppVeyor 'rollout builds' option is supposed to serve the same 45 | # purpose but it is problematic because it tends to cancel builds pushed 46 | # directly to master instead of just PR builds (or the converse). 47 | # credits: JuliaLang developers. 48 | - ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod ` 49 | https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | ` 50 | Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { ` 51 | Write-Host "There are newer queued builds for this pull request, skipping build." 52 | Exit-AppveyorBuild 53 | } 54 | - ps: | 55 | If (($env:SKIP_NOTAG -eq "true") -and ($env:APPVEYOR_REPO_TAG -ne "true")) { 56 | Write-Host "Skipping build, not at a tag." 
57 | Exit-AppveyorBuild 58 | } 59 | 60 | install: 61 | - C:\cygwin\bin\du -hs "%LOCALAPPDATA%\pip\Cache" 62 | 63 | # Prepend Miniconda to the PATH of this build (this cannot be 64 | # done from inside the powershell script as it would require to restart 65 | # the parent CMD process). 66 | - SET PATH=%PYTHON%;%PYTHON%\Scripts;%PYTHON%\Library\bin;%PATH% 67 | 68 | # Setup the conda config 69 | - conda config --set always_yes yes 70 | - ps: conda create -n testenv --yes python=$env:PYTHON_VERSION 71 | - activate testenv 72 | - pip install -r requirements.txt 73 | - pip install pytest 74 | 75 | build_script: 76 | # set up the package 77 | - python setup.py install 78 | 79 | after_build: 80 | # Remove old or huge cache files to hopefully not exceed the 1GB cache limit. 81 | # 82 | # If the cache limit is reached, the cache will not be updated (of not even 83 | # created in the first run). So this is a trade of between keeping the cache 84 | # current and having a cache at all. 85 | # NB: This is done only `on_success` since the cache in uploaded only on 86 | # success anyway. 87 | - C:\cygwin\bin\find "%LOCALAPPDATA%\pip" -type f -mtime +360 -delete 88 | - C:\cygwin\bin\find "%LOCALAPPDATA%\pip" -type f -size +10M -delete 89 | - C:\cygwin\bin\find "%LOCALAPPDATA%\pip" -empty -delete 90 | 91 | # Show size of cache 92 | - C:\cygwin\bin\du -hs "%LOCALAPPDATA%\pip\Cache" 93 | 94 | test_script: 95 | - python -m pytest --showlocals --durations=20 --pyargs packtml 96 | -------------------------------------------------------------------------------- /packtml/decision_tree/tests/test_cart.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from numpy.testing import assert_array_equal, assert_almost_equal 6 | import numpy as np 7 | 8 | from packtml.decision_tree.metrics import InformationGain 9 | from packtml.decision_tree.cart import (CARTClassifier, CARTRegressor, 10 | RandomSplitter, LeafNode, _most_common) 11 | 12 | X = np.array([[0, 1, 2], 13 | [1, 2, 3], 14 | [2, 3, 4]]) 15 | 16 | y = np.array([0, 1, 1]) 17 | 18 | X2 = np.array([[0, 1, 2], 19 | [1, 2, 3], 20 | [2, 3, 4], 21 | [3, 4, 5], 22 | [4, 5, 6], 23 | [5, 6, 7]]) 24 | 25 | y2 = np.array([0, 0, 1, 1, 1, 1]) 26 | 27 | # a regression dataset 28 | rs = np.random.RandomState(42) 29 | Xreg = np.sort(5 * rs.rand(100, 1), axis=0) 30 | yreg = np.sin(Xreg).ravel() 31 | 32 | 33 | def test_most_common(): 34 | assert _most_common(y) == 1 35 | assert _most_common([1]) == 1 36 | 37 | 38 | def test_terminal_leaf_node(): 39 | node = LeafNode(split_col=0, split_val=1., 40 | class_statistic=_most_common(y), 41 | split_gain=np.inf) 42 | 43 | # show that there are no children 44 | assert node.is_terminal() 45 | 46 | # show that the splitting works as expected 47 | X_left, X_right, y_left, y_right = node.create_split(X, y) 48 | assert_array_equal(X_left, X[1:, :]) 49 | assert_array_equal(X_right, X[:1, :]) 50 | assert_array_equal(y_left, [1, 1]) 51 | assert_array_equal(y_right, [0]) 52 | 53 | # show that predictions work as expected 54 | assert [node.predict_record(r) for r in X] == [1, 1, 1] 55 | 56 | 57 | def test_complex_leaf_node(): 58 | node = LeafNode(split_col=0, split_val=3., 59 | class_statistic=_most_common(y2), 60 | split_gain=np.inf) 61 | 62 | # create the split 63 | X_left, X_right, y_left, y_right = node.create_split(X2, y2) 64 | 65 | # show it worked as expected 66 | assert_array_equal(X_left, X2[3:, :]) 67 | 
assert_array_equal(X_right, X2[:3, :]) 68 | assert_array_equal(y_left, [1, 1, 1]) 69 | assert_array_equal(y_right, [0, 0, 1]) 70 | 71 | # show that if we CURRENTLY predicted on the bases of node being the 72 | # terminal leaf, we'd get all 1s. 73 | get_preds = (lambda: [node.predict_record(r) for r in X2]) 74 | assert get_preds() == [1, 1, 1, 1, 1, 1] 75 | 76 | # add a sub node to the right side 77 | right_node = LeafNode(split_col=0, split_val=2., 78 | class_statistic=_most_common(y_right), 79 | split_gain=np.inf) 80 | 81 | assert right_node.class_statistic == 0. 82 | 83 | # attach to the original node and assert it's not terminal anymore 84 | node.right = right_node 85 | assert not node.is_terminal() 86 | 87 | # now our predictions should differ! 88 | assert get_preds() == [0, 0, 0, 1, 1, 1] 89 | 90 | 91 | def test_fit_classifier(): 92 | # show we can fit a classifier 93 | clf = CARTClassifier(X, y) 94 | # show we can predict 95 | clf.predict(X) 96 | 97 | 98 | def test_fit_regressor(): 99 | # show we can fit a regressor 100 | reg = CARTRegressor(Xreg, yreg) 101 | # show we can predict 102 | reg.predict(Xreg) 103 | 104 | 105 | def test_random_splitter(): 106 | pre_X = np.array([[21, 3], [4, 2], [37, 2]]) 107 | pre_y = np.array([1, 0, 1]) 108 | 109 | # this is the splitting class; we'll use gini as the criteria 110 | random_state = np.random.RandomState(42) 111 | splitter = RandomSplitter(random_state=random_state, 112 | criterion=InformationGain('gini'), 113 | n_val_sample=3) 114 | 115 | # find the best: 116 | best_feature, best_value, best_gain = splitter.find_best(pre_X, pre_y) 117 | assert best_feature == 0 118 | assert best_value == 21 119 | assert_almost_equal(best_gain, 0.4444444444, decimal=8) 120 | -------------------------------------------------------------------------------- /packtml/clustering/knn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Author: Taylor Smith 4 | # 5 | # An implementation of kNN clustering. Note that this was written to 6 | # maximize readability. To use kNN in a true project setting, you may 7 | # wish to use a more highly optimized library, such as scikit-learn. 8 | 9 | from __future__ import absolute_import 10 | 11 | from sklearn.metrics.pairwise import euclidean_distances 12 | from sklearn.utils.validation import check_X_y 13 | from sklearn.utils.multiclass import check_classification_targets 14 | 15 | from scipy.stats import mode 16 | import numpy as np 17 | 18 | from packtml.base import BaseSimpleEstimator 19 | 20 | __all__ = [ 21 | 'KNNClassifier' 22 | ] 23 | 24 | 25 | class KNNClassifier(BaseSimpleEstimator): 26 | """Classify points using k-Nearest Neighbors. 27 | 28 | The kNN algorithm computes the distances between points in a matrix and 29 | identifies the nearest "neighboring" points to each observation. The idea 30 | is that neighboring points share similar attributes. Therefore, if a 31 | neighbor is of some class, an unknown observation may likely belong to 32 | the same class. 33 | 34 | There are several caveats to kNN: 35 | 36 | * We have to retain all of the training data, which is expensive. 37 | * Computing the pairwise distance matrix is also expensive. 38 | * You should make sure you've standardized your data (mean 0, stddev 1) 39 | prior to fitting a kNN model 40 | 41 | Parameters 42 | ---------- 43 | X : array-like, shape=(n_samples, n_features) 44 | The training array. Should be a numpy array or array-like structure 45 | with only finite values. 
46 | 47 | y : array-like, shape=(n_samples,) 48 | The target vector. 49 | 50 | k : int, optional (default=10) 51 | The number of neighbors to identify. The higher the ``k`` parameter, 52 | the more likely you are to *under*-fit your data. The lower the ``k`` 53 | parameter, the more likely you are to *over*-fit your model. 54 | 55 | Notes 56 | ----- 57 | This is a very rudimentary implementation of KNN. It does not permit tuning 58 | of distance metrics, optimization of the search algorithm or any other 59 | parameters. It is written to be as simple as possible to maximize 60 | readability. For a more optimal solution, see 61 | ``sklearn.neighbors.KNeighborsClassifier``. 62 | """ 63 | def __init__(self, X, y, k=10): 64 | # check the input array 65 | X, y = check_X_y(X, y, accept_sparse=False, dtype=np.float32, 66 | copy=True) 67 | 68 | # make sure we're performing classification here 69 | check_classification_targets(y) 70 | 71 | # Save the K hyper-parameter so we can use it later 72 | self.k = k 73 | 74 | # kNN is a special case where we have to save the training data in 75 | # order to make predictions in the future 76 | self.X = X 77 | self.y = y 78 | 79 | def predict(self, X): 80 | # Compute the pairwise distances between each observation in 81 | # the dataset and the training data. This can be relatively expensive 82 | # for very large datasets!! 83 | train = self.X 84 | dists = euclidean_distances(X, train) 85 | 86 | # Arg sort to find the shortest distance for each row. This sorts 87 | # elements in each row (independent of other rows) to determine the 88 | # order required to sort the rows. 89 | # I.e: 90 | # >>> P = np.array([[4, 5, 1], [3, 1, 6]]) 91 | # >>> np.argsort(P, axis=1) 92 | # array([[2, 0, 1], 93 | # [1, 0, 2]]) 94 | nearest = np.argsort(dists, axis=1) 95 | 96 | # We only care about the top K, really, so get sorted and then truncate 97 | # I.e: 98 | # array([[1, 2, 1], 99 | # ... 100 | # [0, 0, 0]]) 101 | predicted_labels = self.y[nearest][:, :self.k] 102 | 103 | # We want the most common along the rows as the predictions 104 | # I.e: 105 | # array([1, ..., 0]) 106 | return mode(predicted_labels, axis=1)[0].ravel() 107 | -------------------------------------------------------------------------------- /examples/neural_net/example_transfer_learning.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.neural_net import NeuralNetClassifier, TransferLearningClassifier 6 | from packtml.utils.plotting import add_decision_boundary_to_axis 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.metrics import accuracy_score 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import sys 12 | 13 | # ############################################################################# 14 | # Create a classification dataset. 
This dataset differs from other datsets 15 | # we've created in that there are two majority classes, and one third (tiny) 16 | # class that we'll train the transfer learner over 17 | rs = np.random.RandomState(42) 18 | covariance = [[1, .75], [.75, 1]] 19 | 20 | # these are the majority classes 21 | n_obs = 1250 22 | x1 = rs.multivariate_normal(mean=[0, 0], cov=covariance, size=n_obs) 23 | x2 = rs.multivariate_normal(mean=[1, 5], cov=covariance, size=n_obs) 24 | 25 | # this is the minority class 26 | x3 = rs.multivariate_normal(mean=[0.85, 3.25], cov=[[1., .5], [1.25, 0.85]], 27 | size=n_obs // 3) 28 | 29 | # this is what the FIRST network will be trained on 30 | n_first = int(0.8 * n_obs) 31 | X = np.vstack((x1[:n_first], x2[:n_first])).astype(np.float32) 32 | y = np.hstack((np.zeros(n_first), np.ones(n_first))).astype(int) 33 | 34 | # this is what the SECOND network will be trained on 35 | X2 = np.vstack((x1[n_first:], x2[n_first:], x3)).astype(np.float32) 36 | y2 = np.hstack((np.zeros(n_obs - n_first), 37 | np.ones(n_obs - n_first), 38 | np.ones(x3.shape[0]) * 2)).astype(int) 39 | 40 | # split the data up 41 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rs) 42 | X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, 43 | random_state=rs) 44 | 45 | # ############################################################################# 46 | # Fit the first neural network 47 | hidden = (25, 25) 48 | n_iter = 75 49 | clf = NeuralNetClassifier(X_train, y_train, hidden=hidden, n_iter=n_iter, 50 | learning_rate=0.001, random_state=42) 51 | 52 | pred = clf.predict(X_test) 53 | clf_accuracy = accuracy_score(y_test, pred) 54 | print("Test accuracy (hidden=%s): %.3f" % (str(hidden), clf_accuracy)) 55 | 56 | # ############################################################################# 57 | # Fit the transfer network - train one more layer with a new class 58 | t_hidden = (15,) 59 | t_iter = 25 60 | transfer = TransferLearningClassifier(X2_train, y2_train, pretrained=clf, 61 | hidden=t_hidden, n_iter=t_iter, 62 | random_state=42) 63 | 64 | t_pred = transfer.predict(X2_test) 65 | trans_accuracy = accuracy_score(y2_test, t_pred) 66 | print("Test accuracy (hidden=%s): %.3f" % (str(hidden + t_hidden), 67 | trans_accuracy)) 68 | 69 | # ############################################################################# 70 | # Visualize how the models learned the classes 71 | 72 | fig, axes = plt.subplots(2, 2, figsize=(12, 8)) 73 | 74 | 75 | add_decision_boundary_to_axis(estimator=clf, axis=axes[0, 0], 76 | nclasses=2, X_data=X_test) 77 | axes[0, 0].scatter(X_test[:, 0], X_test[:, 1], c=pred, alpha=0.4) 78 | axes[0, 0].set_title("MLP network (hidden=%s @ %i iter): %.3f" 79 | % (str(hidden), n_iter, clf_accuracy)) 80 | 81 | add_decision_boundary_to_axis(estimator=transfer, axis=axes[0, 1], 82 | nclasses=3, X_data=X2_test) 83 | axes[0, 1].scatter(X2_test[:, 0], X2_test[:, 1], c=t_pred, alpha=0.4) 84 | axes[0, 1].set_title("Transfer network (hidden=%s @ %i iter): " 85 | "%.3f" % (str(hidden + t_hidden), t_iter, 86 | trans_accuracy)) 87 | 88 | # show the learning rates for each 89 | axes[1, 0].plot(np.arange(len(clf.train_loss)), clf.train_loss) 90 | axes[1, 0].set_title("Training loss by iteration") 91 | 92 | # concat the two training losses together for this plot 93 | trans_train_loss = clf.train_loss + transfer.train_loss 94 | axes[1, 1].plot(np.arange(len(trans_train_loss)), trans_train_loss) 95 | axes[1, 1].set_title("Training loss by iteration") 96 | 97 | # Add a verticle line 
for where the transfer learning begins 98 | axes[1, 1].axvline(x=n_iter, ls="--") 99 | 100 | # if we're supposed to save it, do so INSTEAD OF showing it 101 | if len(sys.argv) > 1: 102 | plt.savefig(sys.argv[1]) 103 | else: 104 | plt.show() 105 | -------------------------------------------------------------------------------- /packtml/regression/simple_logistic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from sklearn.utils.validation import check_X_y, check_array 6 | 7 | import numpy as np 8 | 9 | from packtml.utils.extmath import log_likelihood, logistic_sigmoid 10 | from packtml.utils.validation import assert_is_binary 11 | from packtml.base import BaseSimpleEstimator 12 | 13 | __all__ = [ 14 | 'SimpleLogisticRegression' 15 | ] 16 | 17 | try: 18 | xrange 19 | except NameError: # py 3 doesn't have an xrange 20 | xrange = range 21 | 22 | 23 | class SimpleLogisticRegression(BaseSimpleEstimator): 24 | """Simple logistic regression. 25 | 26 | This class provides a very simple example of straight forward logistic 27 | regression with an intercept. There are few tunable parameters aside from 28 | the number of iterations, & learning rate, and the model is fit upon 29 | class initialization. 30 | 31 | Parameters 32 | ---------- 33 | X : array-like, shape=(n_samples, n_features) 34 | The array of predictor variables. This is the array we will use 35 | to regress on ``y``. 36 | 37 | y : array-like, shape=(n_samples,) 38 | This is the target array on which we will regress to build 39 | our model. It should be binary (0, 1). 40 | 41 | n_steps : int, optional (default=100) 42 | The number of iterations to perform. 43 | 44 | learning_rate : float, optional (default=0.001) 45 | The learning rate. 46 | 47 | loglik_interval : int, optional (default=5) 48 | How frequently to compute the log likelihood. This is an expensive 49 | operation--computing too frequently will be very expensive. 
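(For instance, with the defaults ``n_steps=100`` and ``loglik_interval=5``, the log likelihood is evaluated 20 times over the course of the fit.)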
50 | 51 | Attributes 52 | ---------- 53 | theta : array-like, shape=(n_features,) 54 | The coefficients 55 | 56 | intercept : float 57 | The intercept term 58 | 59 | log_likelihood : list 60 | A list of the iterations' log-likelihoods 61 | """ 62 | def __init__(self, X, y, n_steps=100, learning_rate=0.001, 63 | loglik_interval=5): 64 | X, y = check_X_y(X, y, accept_sparse=False, # keep dense for example 65 | y_numeric=True) 66 | 67 | # we want to make sure y is binary since that's all our example covers 68 | assert_is_binary(y) 69 | 70 | # X should be centered/scaled for logistic regression, much like 71 | # with linear regression 72 | means, stds = X.mean(axis=0), X.std(axis=0) 73 | X = (X - means) / stds 74 | 75 | # since we're going to learn an intercept, we can cheat and set the 76 | # intercept to be a new feature that we'll learn with everything else 77 | X_w_intercept = np.hstack((np.ones((X.shape[0], 1)), X)) 78 | 79 | # initialize the coefficients as zeros 80 | theta = np.zeros(X_w_intercept.shape[1]) 81 | 82 | # now for each step, we compute the inner product of X and the 83 | # coefficients, transform the predictions with the sigmoid function, 84 | # and adjust the weights by the gradient 85 | ll = [] 86 | for iteration in xrange(n_steps): 87 | preds = logistic_sigmoid(X_w_intercept.dot(theta)) 88 | residuals = y - preds # The error term 89 | gradient = X_w_intercept.T.dot(residuals) 90 | 91 | # update the coefficients 92 | theta += learning_rate * gradient 93 | 94 | # you may not always want to do this, since it's expensive. Tune 95 | # the loglik_interval to compute this more or less frequently 96 | if (iteration + 1) % loglik_interval == 0: 97 | ll.append(log_likelihood(X_w_intercept, y, theta)) 98 | 99 | # recall that our theta includes the intercept, so we need to pop 100 | # that off and store it 101 | self.intercept = theta[0] 102 | self.theta = theta[1:] 103 | self.log_likelihood = ll 104 | self.column_means = means 105 | self.column_std = stds 106 | 107 | def predict_proba(self, X): 108 | """Generate the probabilities that a sample belongs to class 1""" 109 | X = check_array(X, accept_sparse=False, copy=False) # type: np.ndarray 110 | 111 | # make sure dims match 112 | theta = self.theta 113 | if theta.shape[0] != X.shape[1]: 114 | raise ValueError("Dim mismatch in predictors!") 115 | 116 | # scale the data appropriately 117 | X = (X - self.column_means) / self.column_std 118 | 119 | # creates a copy 120 | return logistic_sigmoid(np.dot(X, theta.T) + self.intercept) 121 | 122 | def predict(self, X): 123 | return np.round(self.predict_proba(X)).astype(int) 124 | -------------------------------------------------------------------------------- /packtml/decision_tree/metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Author: Taylor Smith 4 | # 5 | # Metrics used for determining how to split a feature in a decision tree. 6 | 7 | from __future__ import absolute_import 8 | 9 | import numpy as np 10 | 11 | __all__ = [ 12 | 'entropy', 13 | 'gini_impurity', 14 | 'InformationGain', 15 | 'VarianceReduction' 16 | ] 17 | 18 | 19 | def _clf_metric(y, metric): 20 | """Internal helper. Since this is internal, no validation is performed""" 21 | # get unique classes in y 22 | y = np.asarray(y) 23 | C, cts = np.unique(y, return_counts=True) 24 | 25 | # a base case is that there is only one class label 26 | if C.shape[0] == 1: 27 | return 0.
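    # (Illustrative worked example: for y = [0, 0, 1, 1] the class
    # probabilities are [0.5, 0.5], so gini = 1 - (0.25 + 0.25) = 0.5 and
    # entropy = -(0.5 * log2(0.5) + 0.5 * log2(0.5)) = 1.0.)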
28 | 29 | pr_C = cts.astype(float) / y.shape[0] # P(Ci) 30 | 31 | # 1 - sum(P(Ci)^2) 32 | if metric == 'gini': 33 | return 1. - pr_C.dot(pr_C) # np.sum(pr_C ** 2) 34 | elif metric == 'entropy': 35 | return np.sum(-pr_C * np.log2(pr_C)) 36 | 37 | # shouldn't ever get to this point since it is internal 38 | else: 39 | raise ValueError("metric should be one of ('gini', 'entropy'), " 40 | "but encountered %s" % metric) 41 | 42 | 43 | def entropy(y): 44 | """Compute the entropy of class labels. 45 | 46 | This computes the entropy of training samples. A high entropy means 47 | a relatively uniform distribution, while low entropy indicates a 48 | varying distribution (many peaks and valleys). 49 | 50 | References 51 | ---------- 52 | .. [1] http://www.cs.csi.cuny.edu/~imberman/ai/Entropy%20and%20Information%20Gain.htm 53 | """ 54 | return _clf_metric(y, 'entropy') 55 | 56 | 57 | def gini_impurity(y): 58 | """Compute the Gini index on a target variable. 59 | 60 | The Gini index gives an idea of how mixed two classes are within a leaf 61 | node. A perfect class separation will result in a Gini impurity of 0 (i.e., 62 | "perfectly pure"). 63 | """ 64 | return _clf_metric(y, 'gini') 65 | 66 | 67 | class BaseCriterion(object): 68 | """Splitting criterion. 69 | 70 | Base class for InformationGain and VarianceReduction. WARNING - do 71 | not invoke this class directly. Use derived classes only! This is a 72 | loosely-defined abstract class used to prescribe a common interface 73 | for sub-classes. 74 | """ 75 | def compute_uncertainty(self, y): 76 | """Compute the uncertainty for a vector. 77 | 78 | A subclass should override this function to compute the uncertainty 79 | (i.e., entropy or gini) of a vector. 80 | """ 81 | 82 | 83 | class InformationGain(BaseCriterion): 84 | """Compute the information gain after a split. 85 | 86 | The information gain metric is used by CART trees in a classification 87 | context. It measures the difference in the gini or entropy before and 88 | after a split to determine whether the split "taught" us anything. 89 | 90 | Parameters 91 | ---------- 92 | metric : str or unicode 93 | The name of the metric to use. Either "gini" (Gini impurity) 94 | or "entropy". 95 | """ 96 | def __init__(self, metric): 97 | # let fail out with a KeyError if an improper metric 98 | self.crit = {'gini': gini_impurity, 99 | 'entropy': entropy}[metric] 100 | 101 | def compute_uncertainty(self, y): 102 | """Compute the uncertainty for a vector. 103 | 104 | This method computes either the Gini impurity or entropy of a target 105 | vector using the prescribed method. 106 | """ 107 | return self.crit(y) 108 | 109 | def __call__(self, target, mask, uncertainty): 110 | """Compute the information gain of a split. 111 | 112 | Parameters 113 | ---------- 114 | target : np.ndarray 115 | The target feature 116 | 117 | mask : np.ndarray 118 | The value mask 119 | 120 | uncertainty : float 121 | The gini or entropy of rows pre-split 122 | """ 123 | left, right = target[mask], target[~mask] 124 | p = float(left.shape[0]) / float(target.shape[0]) 125 | 126 | crit = self.crit # type: callable 127 | return uncertainty - p * crit(left) - (1 - p) * crit(right) 128 | 129 | 130 | class VarianceReduction(BaseCriterion): 131 | """Compute the variance reduction after a split. 132 | 133 | Variance reduction is a splitting criterion used by CART trees in the 134 | context of regression. 
It examines the variance in a target before and 135 | after a split to determine whether we've reduced the variability in the 136 | target. 137 | """ 138 | def compute_uncertainty(self, y): 139 | """Compute the variance of a target.""" 140 | return np.var(y) 141 | 142 | def __call__(self, target, mask, uncertainty): 143 | left, right = target[mask], target[~mask] 144 | return uncertainty - (self.compute_uncertainty(left) + 145 | self.compute_uncertainty(right)) 146 | -------------------------------------------------------------------------------- /packtml/recommendation/itemitem.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from sklearn.utils.validation import check_array 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | 8 | import numpy as np 9 | 10 | from packtml.recommendation.base import RecommenderMixin 11 | from packtml.base import BaseSimpleEstimator 12 | 13 | __all__ = [ 14 | 'ItemItemRecommender' 15 | ] 16 | 17 | try: 18 | xrange 19 | except NameError: # py3 20 | xrange = range 21 | 22 | 23 | class ItemItemRecommender(BaseSimpleEstimator, RecommenderMixin): 24 | """Item-to-item recommendation system using cosine similarity. 25 | 26 | A collaborative filtering recommender algorithm that computes the cosine 27 | similarity between each item and generates recommendations for users' 28 | highly rated items by returning similar items. 29 | 30 | Parameters 31 | ---------- 32 | R : array-like, shape=(n_users, n_items) 33 | The ratings matrix. This must be an explicit ratings matrix where 34 | 0 indicates an item that a user has not yet rated. 35 | 36 | Attributes 37 | ---------- 38 | similarity : np.ndarray, shape=(n_items, n_items) 39 | The similarity matrix. 40 | 41 | Notes 42 | ----- 43 | This implementation is very rudimentary and does not allow tuning of 44 | hyper-parameters apart from ``k``. No similarity metrics apart from cosine 45 | similarity may be used. It is largely written to optimize readability. For 46 | a very highly optimized version, try the "implicit" library. 47 | """ 48 | def __init__(self, R, k=10): 49 | # check the array, but don't copy if not needed 50 | R = check_array(R, dtype=np.float32, copy=False) # type: np.ndarray 51 | 52 | # save the hyper param for later use later 53 | self.k = k 54 | self.similarity = self._compute_sim(R, k) 55 | 56 | def _compute_sim(self, R, k): 57 | # compute the similarity between all the items. This calculates the 58 | # similarity between each ITEM 59 | sim = cosine_similarity(R.T) 60 | 61 | # Only keep the similarities of the top K, setting all others to zero 62 | # (negative since we want descending) 63 | not_top_k = np.argsort(-sim, axis=1)[:, k:] # shape=(n_items, k) 64 | 65 | if not_top_k.shape[1]: # only if there are cols (k < n_items) 66 | # now we have to set these to zero in the similarity matrix 67 | row_indices = np.repeat(range(not_top_k.shape[0]), 68 | not_top_k.shape[1]) 69 | sim[row_indices, not_top_k.ravel()] = 0. 70 | 71 | return sim 72 | 73 | def recommend_for_user(self, R, user, n=10, 74 | filter_previously_seen=False, 75 | return_scores=True, **kwargs): 76 | """Generate predictions for a single user. 77 | 78 | Parameters 79 | ---------- 80 | R : array-like, shape=(n_users, n_items) 81 | The test ratings matrix. This must be an explicit ratings matrix 82 | where 0 indicates an item that a user has not yet rated. 
83 | 84 | user : int 85 | The user index for whom to generate predictions. 86 | 87 | n : int or None, optional (default=10) 88 | The number of recommendations to return. Default is 10. For all, 89 | set to None. 90 | 91 | filter_previously_seen : bool, optional (default=False) 92 | Whether to filter out previously-rated items. 93 | 94 | return_scores : bool, optional (default=True) 95 | Whether to return the computed scores for the recommended items. 96 | 97 | **kwargs : keyword args 98 | Ignored. Present to match super signature. 99 | 100 | Returns 101 | ------- 102 | items : np.ndarray 103 | The top ``n`` items recommended for the user. 104 | 105 | recommendations (optional) : np.ndarray 106 | The corresponding scores for the top ``n`` items for the 107 | user. Only returned if ``return_scores`` is True. 108 | """ 109 | 110 | # check the array and get the user vector 111 | R = check_array(R, dtype=np.float32, copy=False) 112 | user_vector = R[user, :] 113 | 114 | # compute the dot product between the user vector and the similarity 115 | # matrix 116 | recommendations = user_vector.dot(self.similarity) # shape=(n_items,) 117 | 118 | # if we're filtering previously-seen items, now is the time to do that 119 | item_indices = np.arange(recommendations.shape[0]) 120 | if filter_previously_seen: 121 | rated_mask = user_vector != 0. 122 | recommendations = recommendations[~rated_mask] 123 | item_indices = item_indices[~rated_mask] 124 | 125 | # now arg sort descending (most similar items first) 126 | order = np.argsort(-recommendations)[:n] 127 | items = item_indices[order] 128 | 129 | if return_scores: 130 | return items, recommendations[order] 131 | return items 132 | 133 | def predict(self, R): 134 | """Generate predictions for the test set. 135 | 136 | Computes the predicted product of users' rated vectors on the 137 | pre-computed similarity matrix. 138 | """ 139 | R = check_array(R, dtype=np.float32, copy=False) # type: np.ndarray 140 | 141 | # compute the product R*sim 142 | return R.dot(self.similarity) 143 | -------------------------------------------------------------------------------- /packtml/utils/plotting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from matplotlib.colors import ListedColormap 6 | from matplotlib import pyplot as plt 7 | 8 | from packtml.utils.validation import learning_curve 9 | 10 | import numpy as np 11 | 12 | __all__ = [ 13 | 'add_decision_boundary_to_axis', 14 | 'plot_learning_curve' 15 | ] 16 | 17 | 18 | def add_decision_boundary_to_axis(estimator, axis, nclasses, 19 | X_data, stepsize=0.02, 20 | colors=('#FFAAAA', '#AAFFFA', '#AAAAFF')): 21 | """Plot a classification decision boundary on an axis. 22 | 23 | Estimates lots of values from a classifier and adds the color map 24 | mesh to an axis. WARNING - use PRIOR to applying scatter values on the 25 | axis! 26 | 27 | Parameters 28 | ---------- 29 | estimator : BaseSimpleEstimator 30 | An estimator that implements ``predict``. 31 | 32 | axis : matplotlib.Axis 33 | The axis we're plotting on. 34 | 35 | nclasses : int 36 | The number of classes present in the data 37 | 38 | X_data : np.ndarray, shape=(n_samples, n_features) 39 | The X data used to fit the data, and along which to plot. Preferably 40 | 2 features for plotting. The first two will be used to plot. 41 | 42 | stepsize : float, optional (default=0.02) 43 | The size of the steps in the values on which to predict. 
44 | 45 | colors : tuple or iterable, optional 46 | The color map 47 | 48 | Returns 49 | ------- 50 | xx : np.ndarray 51 | The x array 52 | 53 | yy : np.ndarray 54 | The y array 55 | 56 | axis : matplotlib.Axis 57 | The axis 58 | """ 59 | x_min, x_max = X_data[:, 0].min() - 1, X_data[:, 0].max() + 1 60 | y_min, y_max = X_data[:, 1].min() - 1, X_data[:, 1].max() + 1 61 | xx, yy = np.meshgrid(np.arange(x_min, x_max, stepsize), 62 | np.arange(y_min, y_max, stepsize)) 63 | 64 | Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()]) 65 | Z = Z.reshape(xx.shape) 66 | 67 | axis.pcolormesh(xx, yy, Z, cmap=ListedColormap(list(colors[:nclasses]))) 68 | return xx, yy, axis 69 | 70 | 71 | def plot_learning_curve(model, X, y, n_folds, metric, train_sizes, 72 | seed=None, trace=False, y_lim=None, **kwargs): 73 | """Fit and plot a CV learning curve. 74 | 75 | Fits the model with ``n_folds`` of cross-validation over various 76 | training sizes and computes arrays of scores for the train samples 77 | and the validation fold samples, then plots them. 78 | 79 | Parameters 80 | ---------- 81 | model : BaseSimpleEstimator 82 | The model class that should be fit. 83 | 84 | X : array-like, shape=(n_samples, n_features) 85 | The training matrix. 86 | 87 | y : array-like, shape=(n_samples,) 88 | The training labels/ground-truth. 89 | 90 | metric : callable 91 | The scoring metric 92 | 93 | train_sizes : iterable 94 | The size of the training set for each fold. 95 | 96 | n_folds : int, optional (default=3) 97 | The number of CV folds 98 | 99 | seed : int or None, optional (default=None) 100 | The random seed for cross validation. 101 | 102 | trace : bool, optional (default=False) 103 | Whether to print to stdout after each set of folds is fit 104 | for a given train size. 105 | 106 | y_lim : iterable or None, optional (default=None) 107 | The y-axis limits 108 | 109 | **kwargs : keyword args or dict 110 | The keyword args to pass to the estimator. 111 | 112 | Returns 113 | ------- 114 | plt : Figure 115 | The matplotlib figure for plotting 116 | 117 | References 118 | ---------- 119 | .. 
[1] Based on the scikit-learn example: 120 | http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html 121 | """ 122 | # delegate the model fits to the function in .validation 123 | train_scores, val_scores = learning_curve( 124 | model, X, y, train_sizes=train_sizes, 125 | metric=metric, seed=seed, trace=trace, 126 | n_folds=n_folds, **kwargs) 127 | 128 | # compute the means/stds of each scores list 129 | train_scores_mean = np.mean(train_scores, axis=1) 130 | val_scores_mean = np.mean(val_scores, axis=1) 131 | train_scores_std = np.std(train_scores, axis=1) 132 | val_scores_std = np.std(val_scores, axis=1) 133 | 134 | # plot the learning curves 135 | plt.figure() 136 | plt.title("Learning curve (model=%s, train sizes=%s)" 137 | % (model.__name__, str(train_sizes))) 138 | 139 | plt.xlabel("Training sizes") 140 | plt.ylabel("Score (%s)" % metric.__name__) 141 | plt.grid() 142 | 143 | # define the y-axis limit if necessary 144 | if y_lim is not None: 145 | plt.ylim(y_lim) 146 | 147 | plt.fill_between(train_sizes, train_scores_mean - train_scores_std, 148 | train_scores_mean + train_scores_std, alpha=0.1, 149 | color="r") 150 | plt.fill_between(train_sizes, val_scores_mean - val_scores_std, 151 | val_scores_mean + val_scores_std, alpha=0.1, 152 | color="g") 153 | 154 | plt.plot(train_sizes, train_scores_mean, 'o-', color="r", 155 | label="Training score") 156 | plt.plot(train_sizes, val_scores_mean, 'o-', color="g", 157 | label="Validation score") 158 | plt.legend(loc="best") 159 | 160 | return plt 161 | -------------------------------------------------------------------------------- /packtml/utils/validation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from sklearn.externals import six 6 | from sklearn.model_selection import ShuffleSplit 7 | 8 | import numpy as np 9 | 10 | __all__ = [ 11 | 'assert_is_binary', 12 | 'is_iterable', 13 | 'learning_curve' 14 | ] 15 | 16 | 17 | def assert_is_binary(y): 18 | """Validate that a vector is binary. 19 | 20 | Checks that a vector is binary. This utility is used by all of 21 | the simple classifier estimators to validate the input target. 22 | 23 | Parameters 24 | ---------- 25 | y : np.ndarray, shape=(n_samples,) 26 | The target vector 27 | """ 28 | # validate that y is in (0, 1) 29 | unique_y = np.unique(y) # type: np.ndarray 30 | if unique_y.shape[0] != 2 or [0, 1] != unique_y.tolist(): 31 | raise ValueError("y must be binary, but got unique values of %s" 32 | % str(unique_y)) 33 | 34 | 35 | def is_iterable(x): 36 | """Determine whether an item is iterable. 37 | 38 | Python 3 introduced the ``__iter__`` functionality to 39 | strings, making them falsely behave like iterables. This 40 | function determines whether an object is an iterable given 41 | the presence of the ``__iter__`` method and that the object 42 | is *not* a string. 43 | 44 | Parameters 45 | ---------- 46 | x : int, object, str, iterable, None 47 | The object in question. Could feasibly be any type. 48 | """ 49 | if isinstance(x, six.string_types): 50 | return False 51 | return hasattr(x, "__iter__") 52 | 53 | 54 | def learning_curve(model, X, y, metric, train_sizes, n_folds=3, 55 | seed=None, trace=False, **kwargs): 56 | """Fit a CV learning curve. 57 | 58 | Fits the model with ``n_folds`` of cross-validation over various 59 | training sizes and returns arrays of scores for the train samples 60 | and the validation fold samples. 
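Note that each "fold" is drawn with scikit-learn's ``ShuffleSplit``, so the folds are independent random resamples of the requested train size rather than a disjoint partition.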
61 | 62 | Parameters 63 | ---------- 64 | model : BaseSimpleEstimator 65 | The model class that should be fit. 66 | 67 | X : array-like, shape=(n_samples, n_features) 68 | The training matrix. 69 | 70 | y : array-like, shape=(n_samples,) 71 | The training labels/ground-truth. 72 | 73 | metric : callable 74 | The scoring metric 75 | 76 | train_sizes : iterable 77 | The size of the training set for each fold. 78 | 79 | n_folds : int, optional (default=3) 80 | The number of CV folds 81 | 82 | seed : int or None, optional (default=None) 83 | The random seed for cross validation. 84 | 85 | trace : bool, optional (default=False) 86 | Whether to print to stdout after each set of folds is fit 87 | for a given train size. 88 | 89 | **kwargs : keyword args or dict 90 | The keyword args to pass to the estimator. 91 | 92 | Returns 93 | ------- 94 | train_scores : np.ndarray, shape=(n_trials, n_folds) 95 | The scores for the train samples. Each row represents a 96 | trial (new train size), and each column corresponds to the 97 | fold of the trial, i.e., for ``n_folds=3``, there will be 98 | 3 columns. 99 | 100 | val_scores : np.ndarray, shape=(n_trials, n_folds) 101 | The scores for the validation folds. Each row represents a 102 | trial (new train size), and each column corresponds to the 103 | fold of the trial, i.e., for ``n_folds=3``, there will be 104 | 3 columns. 105 | """ 106 | # Each of these lists will become a 2d array. A row will represent a 107 | # trial for a particular train size, and each column will 108 | # correspond with a fold. 109 | train_scores = [] 110 | val_scores = [] 111 | 112 | # The number of samples in the dataset 113 | n_samples = X.shape[0] 114 | 115 | # If the input is a pandas frame, make it a numpy array for indexing 116 | if hasattr(X, "iloc"): 117 | X = X.values 118 | 119 | # We need to validate that all of the sizes within the train_sizes 120 | # are less than the number of samples in the dataset! 121 | assert all(s < n_samples for s in train_sizes), \ 122 | "All train sizes (%s) must be less than n_samples (%i)" \ 123 | % (str(train_sizes), n_samples) 124 | 125 | # For each training size, we're going to initialize a new ShuffleSplit 126 | # cross validation instance and fit each of its folds...
127 | for train_size in train_sizes: 128 | cv = ShuffleSplit(n_splits=n_folds, 129 | train_size=train_size, 130 | test_size=n_samples - train_size, 131 | random_state=seed) 132 | 133 | # This is the inner list (row) that will represent the 134 | # scores for this train size 135 | inner_train_scores = [] 136 | inner_val_scores = [] 137 | 138 | # get our splits 139 | for train_indices, test_indices in cv.split(X, y): 140 | # get the training samples 141 | train_X = X[train_indices, :] 142 | train_y = y.take(train_indices) 143 | 144 | # fit the model 145 | m = model(train_X, train_y, **kwargs) 146 | 147 | # score the model on the train set 148 | inner_train_scores.append( 149 | metric(train_y, m.predict(train_X))) 150 | 151 | # score the model on the validation set 152 | inner_val_scores.append( 153 | metric(y.take(test_indices), 154 | m.predict(X[test_indices, :]))) 155 | 156 | # Now attach the inner lists to the outer lists 157 | train_scores.append(inner_train_scores) 158 | val_scores.append(inner_val_scores) 159 | 160 | if trace: 161 | print("Completed fitting %i folds for train size=%i" 162 | % (n_folds, train_size)) 163 | 164 | # Make our train/val arrays into numpy arrays 165 | train_scores = np.asarray(train_scores) 166 | val_scores = np.asarray(val_scores) 167 | 168 | return train_scores, val_scores 169 | -------------------------------------------------------------------------------- /packtml/neural_net/transfer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Author: Taylor G Smith 4 | # 5 | # A simple transfer learning classifier. If you find yourself struggling 6 | # to follow the derivation of the back-propagation, check out this great 7 | # refresher on scalar & matrix calculas + differential equations. 8 | # http://parrt.cs.usfca.edu/doc/matrix-calculus/index.html 9 | 10 | from __future__ import absolute_import 11 | 12 | import numpy as np 13 | 14 | from packtml.neural_net.base import NeuralMixin, tanh 15 | from packtml.base import BaseSimpleEstimator 16 | from packtml.neural_net.mlp import NeuralNetClassifier, _calculate_loss 17 | 18 | __all__ = [ 19 | 'TransferLearningClassifier' 20 | ] 21 | 22 | try: 23 | xrange 24 | except NameError: 25 | xrange = range 26 | 27 | 28 | def _pretrained_forward_step(X, pt_weights, pt_biases): 29 | """Complete a forward step from the pre-trained model""" 30 | # progress through all the layers (the output was already trimmed off) 31 | for w, b in zip(pt_weights, pt_biases): 32 | X = tanh(X.dot(w) + b) 33 | return X 34 | 35 | 36 | class TransferLearningClassifier(BaseSimpleEstimator, NeuralMixin): 37 | """A transfer learning classifier. 38 | 39 | Create a multi-layer perceptron classifier that learned from a 40 | previously-trained network. No fine-tuning is performed, and no 41 | prior-trained layers can be retrained (i.e., they remain frozen). 42 | 43 | Parameters 44 | ---------- 45 | X : array-like, shape=(n_samples, n_features) 46 | The training array. Should be a numpy array or array-like structure 47 | with only finite values. 48 | 49 | y : array-like, shape=(n_samples,) 50 | The target vector. 51 | 52 | pretrained : NeuralNetClassifier, TransferLearningClassifier 53 | The pre-trained MLP. The transfer learner leverages the features 54 | extracted from the pre-trained network (the trained weights without 55 | the output layer) and uses them to transform the input data before 56 | training the new layers. 
57 | 58 | hidden : iterable, optional (default=(25,)) 59 | An iterable indicating the number of units per hidden layer. 60 | 61 | n_iter : int, optional (default=10) 62 | The default number of iterations to perform. 63 | 64 | learning_rate : float, optional (default=0.001) 65 | The rate at which we descend the gradient. 66 | 67 | random_state : int, None or RandomState, optional (default=42) 68 | The random state for initializing the weights matrices. 69 | """ 70 | def __init__(self, X, y, pretrained, hidden=(25,), n_iter=10, 71 | regularization=0.01, learning_rate=0.001, random_state=42): 72 | 73 | # initialize via the NN static method 74 | self.hidden = hidden 75 | self.random_state = random_state 76 | self.n_iter = n_iter 77 | self.learning_rate = learning_rate 78 | self.regularization = regularization 79 | 80 | # this is the previous model 81 | self.model = pretrained 82 | 83 | # assert that it's a neural net or we'll break down later 84 | assert isinstance(pretrained, NeuralMixin), \ 85 | "Pre-trained model must be a neural network!" 86 | 87 | # initialize weights, biases, etc. for THE TRAINABLE LAYERS ONLY! 88 | pt_w, pt_b = pretrained.export_weights_and_biases(output_layer=False) 89 | X, y, weights, biases = NeuralNetClassifier._init_weights_biases( 90 | X, y, hidden, random_state, 91 | 92 | # use as the last dim the column dimension of the last weights 93 | # (the ones BEFORE the output layer, that is) 94 | last_dim=pt_w[-1].shape[1]) 95 | 96 | # we can train this in a similar fashion to the plain MLP we designed: 97 | # for each iteration, feed X through the network, compute the loss, 98 | # and back-propagate the error to correct the weights. 99 | train_loss = [] 100 | for _ in xrange(n_iter): 101 | # first, pass the input data through the pre-trained model's 102 | # hidden layers. Do not pass it through the last layer, however, 103 | # since we don't want its output from the softmax layer. 104 | X_transform = _pretrained_forward_step(X, pt_w, pt_b) 105 | 106 | # NOW we complete a forward step on THIS model's 107 | # untrained weights/biases 108 | out, layer_results = NeuralNetClassifier._forward_step( 109 | X_transform, weights, biases) 110 | 111 | # compute the loss on the output 112 | loss = _calculate_loss(truth=y, preds=out, weights=pt_w + weights, 113 | l2=self.regularization) 114 | train_loss.append(loss) 115 | 116 | # now back-propagate to correct THIS MODEL's weights and biases via 117 | # gradient descent. NOTE we do NOT adjust the pre-trained model's 118 | # weights!!! 
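            # (Only the freshly initialized `weights` and `biases` are passed to
            # the back-propagation call below; `pt_w` and `pt_b` never enter it,
            # which is what keeps the pre-trained layers frozen.)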
119 | NeuralNetClassifier._back_propagate( 120 | truth=y, probas=out, layer_results=layer_results, 121 | weights=weights, biases=biases, 122 | learning_rate=learning_rate, 123 | l2=self.regularization) 124 | 125 | # save the weights, biases 126 | self.weights = weights 127 | self.biases = biases 128 | self.train_loss = train_loss 129 | 130 | def predict(self, X): 131 | # compute the probabilities and then get the argmax for each class 132 | probas = self.predict_proba(X) 133 | 134 | # we want the argmaxes of each row 135 | return np.argmax(probas, axis=1) 136 | 137 | def predict_proba(self, X): 138 | # Compute a forward step with the pre-trained model first: 139 | pt_w, pt_b = self.model.export_weights_and_biases(output_layer=False) 140 | X_transform = _pretrained_forward_step(X, pt_w, pt_b) 141 | 142 | # and then complete a forward step with the trained weights and biases 143 | return NeuralNetClassifier._forward_step( 144 | X_transform, self.weights, self.biases)[0] 145 | 146 | def export_weights_and_biases(self, output_layer=True): 147 | pt_weights, pt_biases = \ 148 | self.model.export_weights_and_biases(output_layer=False) 149 | w = pt_weights + self.weights 150 | b = pt_biases + self.biases 151 | 152 | if output_layer: 153 | return w, b 154 | return w[:-1], b[:-1] 155 | -------------------------------------------------------------------------------- /packtml/recommendation/als.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from sklearn.utils.validation import check_random_state, check_array 6 | 7 | from numpy.linalg import solve 8 | import numpy as np 9 | 10 | from packtml.recommendation.base import RecommenderMixin 11 | from packtml.base import BaseSimpleEstimator 12 | 13 | __all__ = [ 14 | 'ALS' 15 | ] 16 | 17 | try: 18 | xrange 19 | except NameError: # py3 does not have xrange 20 | xrange = range 21 | 22 | 23 | def mse(R, X, Y, W): 24 | """Compute the reconstruction MSE. This is our loss function""" 25 | return ((W * (R - X.dot(Y))) ** 2).sum() 26 | 27 | 28 | class ALS(BaseSimpleEstimator, RecommenderMixin): 29 | r"""Alternating Least Squares for explicit ratings matrices. 30 | 31 | Computes the ALS user factors and item factors for explicit ratings 32 | systems. This solves: 33 | 34 | R' = XY 35 | 36 | where ``X`` is an (m x f) matrix of user factors, and ``Y`` is an 37 | (f x n) matrix of item factors. Note that for very large ratings matrices, 38 | this can quickly grow outside the scope of what will fit into memory! 39 | 40 | Parameters 41 | ---------- 42 | R : array-like, shape=(n_users, n_items) 43 | The ratings matrix. This must be an explicit ratings matrix where 44 | 0 indicates an item that a user has not yet rated. 45 | 46 | factors : int or float, optional (default=0.25) 47 | The number of factors to learn. Default is ``0.25 * n_items``. 48 | 49 | n_iter : int, optional (default=10) 50 | The number of iterations to perform. The larger the number, the 51 | smaller the train error, but the more likely to overfit. 52 | 53 | lam : float, optional (default=0.001) 54 | The L2 regularization parameter. The higher ``lam``, the more 55 | regularization is performed, and the more robust the solution. However, 56 | extra iterations are typically required. 57 | 58 | random_state : int, None or RandomState, optional (default=None) 59 | The random state for seeding the initial item factors matrix, ``Y``. 
60 | 61 | Attributes 62 | ---------- 63 | X : np.ndarray, shape=(n_users, factors) 64 | The user factors 65 | 66 | Y : np.ndarray, shape=(factors, n_items) 67 | The item factors 68 | 69 | train_err : list 70 | The list of training MSE for each iteration performed 71 | 72 | lam : float 73 | The lambda (regularization) value. 74 | 75 | Notes 76 | ----- 77 | If you plan to use a very large matrix, consider using a sparse CSR matrix 78 | to preserve memory, but you'll have to amend the ``recommend_for_user`` 79 | function, which expects dense output. 80 | """ 81 | def __init__(self, R, factors=0.25, n_iter=10, lam=0.001, 82 | random_state=None): 83 | # check the array 84 | R = check_array(R, dtype=np.float32) # type: np.ndarray 85 | n_users, n_items = R.shape 86 | 87 | # get the random state 88 | random_state = check_random_state(random_state) 89 | 90 | # get the number of factors. If it's a float, compute it 91 | if isinstance(factors, float): 92 | factors = min(np.ceil(factors * n_items).astype(int), n_items) 93 | 94 | # the weight matrix is used as a masking matrix when computing the MSE. 95 | # it allows us to only compute the reconstruction MSE on the rated 96 | # items, and not the unrated ones. 97 | W = (R > 0.).astype(np.float32) 98 | 99 | # initialize the first array, Y, and X to None 100 | Y = random_state.rand(factors, n_items) 101 | X = None 102 | 103 | # the identity matrix (time lambda) is added to the XX or YY product 104 | # at each iteration. 105 | I = np.eye(factors) * lam 106 | 107 | # this list will store all of the training errors 108 | train_err = [] 109 | 110 | # for each iteration, iteratively solve for X, Y, and compute the 111 | # updated MSE 112 | for i in xrange(n_iter): 113 | X = solve(Y.dot(Y.T) + I, Y.dot(R.T)).T 114 | Y = solve(X.T.dot(X) + I, X.T.dot(R)) 115 | 116 | # update the training error 117 | train_err.append(mse(R, X, Y, W)) 118 | 119 | # now we have X, Y, which are our user factors and item factors 120 | self.X = X 121 | self.Y = Y 122 | self.train_err = train_err 123 | self.n_factors = factors 124 | self.lam = lam 125 | 126 | def predict(self, R, recompute_users=False): 127 | """Generate predictions for the test set. 128 | 129 | Computes the predicted product of ``XY`` given the fit factors. 130 | If recomputing users, will learn the new user factors given the 131 | existing item factors. 132 | """ 133 | R = check_array(R, dtype=np.float32, copy=False) # type: np.ndarray 134 | Y = self.Y # item factors 135 | n_factors, _ = Y.shape 136 | 137 | # we can re-compute user factors on their updated ratings, if we want. 138 | # (not always advisable, but can be useful for offline recommenders) 139 | if recompute_users: 140 | I = np.eye(n_factors) * self.lam 141 | X = solve(Y.dot(Y.T) + I, Y.dot(R.T)).T 142 | else: 143 | X = self.X 144 | 145 | return X.dot(Y) 146 | 147 | def recommend_for_user(self, R, user, n=10, recompute_user=False, 148 | filter_previously_seen=False, 149 | return_scores=True): 150 | """Generate predictions for a single user. 151 | 152 | Parameters 153 | ---------- 154 | R : array-like, shape=(n_users, n_items) 155 | The test ratings matrix. This must be an explicit ratings matrix 156 | where 0 indicates an item that a user has not yet rated. 157 | 158 | user : int 159 | The user index for whom to generate predictions. 160 | 161 | n : int or None, optional (default=10) 162 | The number of recommendations to return. Default is 10. For all, 163 | set to None. 
164 | 
165 |         recompute_user : bool, optional (default=False)
166 |             Whether to recompute the user factors given the test set.
167 |             Not always advisable, as it can be considered leakage, but can
168 |             be useful in an offline recommender system where refits are
169 |             infrequent.
170 | 
171 |         filter_previously_seen : bool, optional (default=False)
172 |             Whether to filter out previously-rated items.
173 | 
174 |         return_scores : bool, optional (default=True)
175 |             Whether to return the computed scores for the recommended items.
176 | 
177 |         Returns
178 |         -------
179 |         items : np.ndarray
180 |             The top ``n`` items recommended for the user.
181 | 
182 |         scores (optional) : np.ndarray
183 |             The corresponding scores for the top ``n`` items for the user.
184 |             Only returned if ``return_scores`` is True.
185 |         """
186 |         R = check_array(R, dtype=np.float32, copy=False)
187 | 
188 |         # compute the new user vector. Squeeze to make sure it's a vector
189 |         user_vec = self.predict(R, recompute_users=recompute_user)[user, :]
190 |         item_indices = np.arange(user_vec.shape[0])
191 | 
192 |         # if we are filtering previously seen, remove the prior-rated items
193 |         if filter_previously_seen:
194 |             rated_mask = R[user, :] != 0.
195 |             user_vec = user_vec[~rated_mask]
196 |             item_indices = item_indices[~rated_mask]
197 | 
198 |         order = np.argsort(-user_vec)[:n]  # descending order of computed scores
199 |         items = item_indices[order]
200 |         if return_scores:
201 |             return items, user_vec[order]
202 |         return items
203 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://travis-ci.org/tgsmith61591/Hands-on-Supervised-Machine-Learning-with-Python.svg?branch=master)](https://travis-ci.org/tgsmith61591/Hands-on-Supervised-Machine-Learning-with-Python)
2 | [![Build status](https://ci.appveyor.com/api/projects/status/181d16js2ao3vn5v/branch/master?svg=true)](https://ci.appveyor.com/project/tgsmith61591/hands-on-supervised-machine-learning-with-python/branch/master)
3 | [![codecov](https://codecov.io/gh/tgsmith61591/Hands-on-Supervised-Machine-Learning-with-Python/branch/master/graph/badge.svg)](https://codecov.io/gh/tgsmith61591/Hands-on-Supervised-Machine-Learning-with-Python)
4 | ![Supported versions](https://img.shields.io/badge/python-3.5+-blue.svg)
5 | 
6 | # Hands-on-Supervised-Machine-Learning-with-Python
7 | 
8 | Published by Packt, Hands-on Supervised Machine Learning with Python
9 | 
10 | ### Learn the underpinnings of many supervised learning algorithms, and develop rich Python coding practices in the process.
11 | 
12 | *Supervised learning—help teach a machine to think for itself!*
13 | 
14 | ## Overview
15 | 
16 | These days machine learning is everywhere, and it’s here to stay. Understanding the core principles that drive how a machine “learns” is a critical skill for would-be practitioners and consumers alike. This course will introduce you to supervised machine learning, guiding you through the implementation and nuances of many popular machine learning algorithms while facilitating a deep understanding along the way.
17 | 
18 | In this course, we’ll cover parametric models such as linear and logistic regression, non-parametric methods such as decision trees & various clustering techniques, and we’ll wrap up with a brief foray into neural networks.
19 | 
20 | This video course highlights clean coding techniques, object-oriented class design, and general best practices in machine learning.
21 | 
22 | ## Target audience
23 | 
24 | This course is designed for those who would like to understand supervised machine learning algorithms at a deeper level. If you’re interested in understanding how and why an algorithm works rather than simply how to call its API, this course might be for you. Intermediate Python knowledge and at least an intermediate understanding of mathematical concepts are assumed. While notions in this course will be broken down into bits as granular as absolutely possible, terms and ideas such as “matrix transposition,” “gradient,” “dot product,” and “time complexity” are assumed to be understood without further explanation.
25 | 
26 | ## What you will learn
27 | 
28 | * Understand the fundamental and theoretical differences between parametric and non-parametric models, and why you might opt for one over the other.
29 | * Discover how a machine can learn a concept and generalize its understanding to new data.
30 | * Implement and grok several well-known supervised learning algorithms from scratch; build out your GitHub portfolio and show off what you’re capable of!
31 | * Learn about model families like recommender systems, which are immediately applicable in domains such as e-commerce and marketing.
32 | * Become a much stronger Python developer.
33 | 
34 | ### Project layout
35 | 
36 | All **[source code](packtml/)** is within the `packtml` folder, which serves as the Python
37 | package for this course. Within the [examples](examples/) directory, you'll find a
38 | number of short Python scripts that serve to demonstrate how various classes in the `packtml`
39 | submodules work. Each respective folder inside the `examples/` directory corresponds to a
40 | submodule inside of the `packtml` Python package.
41 | 
42 | ### Getting started
43 | 
44 | To get your environment set up, make sure you have Anaconda installed and on your path.
45 | Then simply run the following:
46 | 
47 | ```bash
48 | $ conda env create -f environment.yml
49 | ```
50 | 
51 | To activate your environment in a Unix environment:
52 | 
53 | ```bash
54 | $ source activate packt-sml
55 | ```
56 | 
57 | In a Windows environment:
58 | 
59 | ```
60 | activate packt-sml
61 | ```
62 | 
63 | ### Set up the Python package (in your activated environment):
64 | 
65 | ```bash
66 | (packt-sml) $ python setup.py install
67 | ```
68 | 
69 | ## What you'll learn
70 | 
71 | In this course and within this package, you'll learn to implement a number of
72 | commonly-used supervised learning algorithms, and when best to use one type of
73 | model over another. Below you'll find in-action examples of the various algorithms
74 | we implement within this package.
75 | 
76 | ### Regression
77 | 
78 | The classic introduction to machine learning: not only will we learn about linear regression,
79 | but we'll also code one from scratch so you really understand what's happening
80 | [under the hood](packtml/regression/simple_regression.py). Then we'll
81 | [apply one in practice](examples/regression/example_linear_regression.py) so you can see
82 | how you might use it.
83 | 
84 | ![Linear regression](img/regression/example_linear_regression.png)
85 | 
86 | Next, we'll dive into logistic regression, which is linear regression's classification cousin. See
87 | the full logistic regression example [here](examples/regression/example_logistic_regression.py)
88 | or the algorithm's [source code](packtml/regression/simple_logistic.py) if you're interested.
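
If you want to try the regression classes directly, the pattern is the same one used by every estimator in `packtml`: the model is fit inside the constructor (it takes `X` and `y` up front, like the other `BaseSimpleEstimator` subclasses in this repo), and `predict` scores new data. The snippet below is only a rough sketch; the class name `SimpleLogisticRegression` and its import path are assumptions here, so check `packtml/regression/simple_logistic.py` for the exact names before running it.

```python
import numpy as np
from sklearn.datasets import make_classification

# NOTE: the class name below is assumed for illustration; see
# packtml/regression/simple_logistic.py for the actual name/signature.
from packtml.regression import SimpleLogisticRegression

# a small synthetic binary classification problem
X, y = make_classification(n_samples=100, n_features=5, random_state=42)

# like the other packtml estimators, the model is fit in the constructor
logistic = SimpleLogisticRegression(X, y)

# predictions are class labels we can compare against the truth
preds = logistic.predict(X)
print("Training accuracy: %.3f" % np.average(preds == y))
```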
89 | 
90 | ![Logistic regression](img/regression/example_logistic_regression.png)
91 | 
92 | ### KNN clustering
93 | 
94 | During our exploration of non-parametric models, we'll also take a look at clustering.
95 | The `packtml` package implements a simple but effective k-Nearest Neighbor classifier.
96 | Here is its output on the iris dataset. For the full code example, head to the
97 | [examples directory](examples/clustering/example_knn_classifier.py) and then to the
98 | [source code](packtml/clustering/knn.py) to see how it's implemented.
99 | 
100 | ![KNN classifier](img/clustering/example_knn_classifier.png)
101 | 
102 | ### Decision trees
103 | 
104 | In this course, we'll also implement a CART decision tree from scratch (for both
105 | regression and classification). Our classification tree's performance and potential
106 | are shown at varying tree depths in the images below. The classification tree example
107 | is located [here](examples/decision_tree/example_classification_decision_tree.py), and
108 | the source code can be found [here](packtml/decision_tree/cart.py).
109 | 
110 | ![Classification decision tree](img/decision_tree/example_classification_decision_tree.png)
111 | 
112 | In addition to classification, we can build a tree as a non-linear regression
113 | model, as shown below. The regression tree example is located
114 | [here](examples/decision_tree/example_regression_decision_tree.py). Check out the
115 | [source code](packtml/decision_tree/cart.py) to understand how it works.
116 | 
117 | ![Regression decision tree](img/decision_tree/example_regression_decision_tree.png)
118 | 
119 | ### Deep learning
120 | 
121 | One of the hottest topics in machine learning right now is deep learning and neural
122 | networks. In this course, we'll learn how to code a multi-layer perceptron classifier
123 | from scratch. The full example code is located [here](examples/neural_net/example_mlp_classifier.py)
124 | and this is the [source code](packtml/neural_net/mlp.py).
125 | 
126 | ![MLP classifier](img/neural_net/example_mlp_classifier.png)
127 | 
128 | Next, we'll show how we can use the weights the MLP has learned on previous data to
129 | learn new classification labels via transfer learning. For further implementation
130 | details, check out the [example code](examples/neural_net/example_transfer_learning.py)
131 | or the [source code](packtml/neural_net/transfer.py).
132 | 
133 | ![Transfer learning](img/neural_net/example_transfer_learning.png)
134 | 
135 | ### Recommendation algorithms
136 | 
137 | These days, everything is available for purchase online. E-commerce sites have devoted
138 | lots of research to algorithms that can learn your preferences. In this course, we'll
139 | learn two such algorithms:
140 | 
141 | * [Item-to-item](packtml/recommendation/itemitem.py) collaborative filtering
142 | * [Alternating least squares](packtml/recommendation/als.py) (matrix factorization)
143 | 
144 | The [example ALS code](examples/recommendation/example_als_recommender.py) shows how
145 | the training error decreases with each iteration:
146 | 
147 | ![ALS training error](img/recommendation/example_als_recommender.png)
148 | 
--------------------------------------------------------------------------------
/packtml/metrics/ranking.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Author: Taylor G Smith
4 | #
5 | # Recommender system ranking metrics derived from Spark source for use with
6 | # Python-based recommender systems.
See the full gist here: 7 | # https://gist.github.com/tgsmith61591/d8aa96ac7c74c24b33e4b0cb967ca519 8 | 9 | from __future__ import absolute_import, division 10 | 11 | import numpy as np 12 | 13 | import warnings 14 | 15 | __all__ = [ 16 | 'mean_average_precision', 17 | 'ndcg_at', 18 | 'precision_at', 19 | ] 20 | 21 | try: 22 | xrange 23 | except NameError: # python 3 does not have an 'xrange' 24 | xrange = range 25 | 26 | 27 | def _require_positive_k(k): 28 | """Helper function to avoid copy/pasted code for validating K""" 29 | if k <= 0: 30 | raise ValueError("ranking position k should be positive") 31 | 32 | 33 | def _mean_ranking_metric(predictions, labels, metric): 34 | """Helper function for precision_at_k and mean_average_precision""" 35 | # do not zip, as this will require an extra pass of O(N). Just assert 36 | # equal length and index (compute in ONE pass of O(N)). 37 | # if len(predictions) != len(labels): 38 | # raise ValueError("dim mismatch in predictions and labels!") 39 | # return np.mean([ 40 | # metric(np.asarray(predictions[i]), np.asarray(labels[i])) 41 | # for i in xrange(len(predictions)) 42 | # ]) 43 | 44 | # Actually probably want lazy evaluation in case preds is a 45 | # generator, since preds can be very dense and could blow up 46 | # memory... but how to assert lengths equal? FIXME 47 | return np.mean([ 48 | metric(np.asarray(prd), np.asarray(labels[i])) 49 | for i, prd in enumerate(predictions) # lazy eval if generator 50 | ]) 51 | 52 | 53 | def _warn_for_empty_labels(): 54 | """Helper for missing ground truth sets""" 55 | warnings.warn("Empty ground truth set! Check input data") 56 | return 0. 57 | 58 | 59 | def precision_at(predictions, labels, k=10, assume_unique=True): 60 | """Compute the precision at K. 61 | 62 | Compute the average precision of all the queries, truncated at 63 | ranking position k. If for a query, the ranking algorithm returns 64 | n (n is less than k) results, the precision value will be computed 65 | as #(relevant items retrieved) / k. This formula also applies when 66 | the size of the ground truth set is less than k. 67 | If a query has an empty ground truth set, zero will be used as 68 | precision together with a warning. 69 | 70 | Parameters 71 | ---------- 72 | predictions : array-like, shape=(n_predictions,) 73 | The prediction array. The items that were predicted, in descending 74 | order of relevance. 75 | 76 | labels : array-like, shape=(n_ratings,) 77 | The labels (positively-rated items). 78 | 79 | k : int, optional (default=10) 80 | The rank at which to measure the precision. 81 | 82 | assume_unique : bool, optional (default=True) 83 | Whether to assume the items in the labels and predictions are each 84 | unique. That is, the same item is not predicted multiple times or 85 | rated multiple times. 86 | 87 | Examples 88 | -------- 89 | >>> # predictions for 3 users 90 | >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5], 91 | ... [4, 1, 5, 6, 2, 7, 3, 8, 9, 10], 92 | ... [1, 2, 3, 4, 5]] 93 | >>> # labels for the 3 users 94 | >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []] 95 | >>> precision_at(preds, labels, 1) 96 | 0.33333333333333331 97 | >>> precision_at(preds, labels, 5) 98 | 0.26666666666666666 99 | >>> precision_at(preds, labels, 15) 100 | 0.17777777777777778 101 | """ 102 | # validate K 103 | _require_positive_k(k) 104 | 105 | def _inner_pk(pred, lab): 106 | # need to compute the count of the number of values in the predictions 107 | # that are present in the labels. 
We'll use numpy in1d for this (set 108 | # intersection in O(1)) 109 | if lab.shape[0] > 0: 110 | n = min(pred.shape[0], k) 111 | cnt = np.in1d(pred[:n], lab, assume_unique=assume_unique).sum() 112 | return float(cnt) / k 113 | else: 114 | return _warn_for_empty_labels() 115 | 116 | return _mean_ranking_metric(predictions, labels, _inner_pk) 117 | 118 | 119 | def mean_average_precision(predictions, labels, assume_unique=True): 120 | """Compute the mean average precision on predictions and labels. 121 | 122 | Returns the mean average precision (MAP) of all the queries. If a query 123 | has an empty ground truth set, the average precision will be zero and a 124 | warning is generated. 125 | 126 | Parameters 127 | ---------- 128 | predictions : array-like, shape=(n_predictions,) 129 | The prediction array. The items that were predicted, in descending 130 | order of relevance. 131 | 132 | labels : array-like, shape=(n_ratings,) 133 | The labels (positively-rated items). 134 | 135 | assume_unique : bool, optional (default=True) 136 | Whether to assume the items in the labels and predictions are each 137 | unique. That is, the same item is not predicted multiple times or 138 | rated multiple times. 139 | 140 | Examples 141 | -------- 142 | >>> # predictions for 3 users 143 | >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5], 144 | ... [4, 1, 5, 6, 2, 7, 3, 8, 9, 10], 145 | ... [1, 2, 3, 4, 5]] 146 | >>> # labels for the 3 users 147 | >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []] 148 | >>> mean_average_precision(preds, labels) 149 | 0.35502645502645497 150 | """ 151 | 152 | def _inner_map(pred, lab): 153 | if lab.shape[0]: 154 | # compute the number of elements within the predictions that are 155 | # present in the actual labels, and get the cumulative sum weighted 156 | # by the index of the ranking 157 | n = pred.shape[0] 158 | 159 | # Scala code from Spark source: 160 | # var i = 0 161 | # var cnt = 0 162 | # var precSum = 0.0 163 | # val n = pred.length 164 | # while (i < n) { 165 | # if (labSet.contains(pred(i))) { 166 | # cnt += 1 167 | # precSum += cnt.toDouble / (i + 1) 168 | # } 169 | # i += 1 170 | # } 171 | # precSum / labSet.size 172 | 173 | arange = np.arange(n, dtype=np.float32) + 1. # this is the denom 174 | present = np.in1d(pred[:n], lab, assume_unique=assume_unique) 175 | prec_sum = np.ones(present.sum()).cumsum() 176 | denom = arange[present] 177 | return (prec_sum / denom).sum() / lab.shape[0] 178 | 179 | else: 180 | return _warn_for_empty_labels() 181 | 182 | return _mean_ranking_metric(predictions, labels, _inner_map) 183 | 184 | 185 | def ndcg_at(predictions, labels, k=10, assume_unique=True): 186 | """Compute the normalized discounted cumulative gain at K. 187 | 188 | Compute the average NDCG value of all the queries, truncated at ranking 189 | position k. The discounted cumulative gain at position k is computed as: 190 | sum,,i=1,,^k^ (2^{relevance of ''i''th item}^ - 1) / log(i + 1) 191 | and the NDCG is obtained by dividing the DCG value on the ground truth set. 192 | In the current implementation, the relevance value is binary. 193 | If a query has an empty ground truth set, zero will be used as 194 | NDCG together with a warning. 195 | 196 | Parameters 197 | ---------- 198 | predictions : array-like, shape=(n_predictions,) 199 | The prediction array. The items that were predicted, in descending 200 | order of relevance. 201 | 202 | labels : array-like, shape=(n_ratings,) 203 | The labels (positively-rated items). 
204 | 205 | k : int, optional (default=10) 206 | The rank at which to measure the NDCG. 207 | 208 | assume_unique : bool, optional (default=True) 209 | Whether to assume the items in the labels and predictions are each 210 | unique. That is, the same item is not predicted multiple times or 211 | rated multiple times. 212 | 213 | Examples 214 | -------- 215 | >>> # predictions for 3 users 216 | >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5], 217 | ... [4, 1, 5, 6, 2, 7, 3, 8, 9, 10], 218 | ... [1, 2, 3, 4, 5]] 219 | >>> # labels for the 3 users 220 | >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []] 221 | >>> ndcg_at(preds, labels, 3) 222 | 0.3333333432674408 223 | >>> ndcg_at(preds, labels, 10) 224 | 0.48791273434956867 225 | 226 | References 227 | ---------- 228 | .. [1] K. Jarvelin and J. Kekalainen, "IR evaluation methods for 229 | retrieving highly relevant documents." 230 | """ 231 | # validate K 232 | _require_positive_k(k) 233 | 234 | def _inner_ndcg(pred, lab): 235 | if lab.shape[0]: 236 | # if we do NOT assume uniqueness, the set is a bit different here 237 | if not assume_unique: 238 | lab = np.unique(lab) 239 | 240 | n_lab = lab.shape[0] 241 | n_pred = pred.shape[0] 242 | n = min(max(n_pred, n_lab), k) # min(min(p, l), k)? 243 | 244 | # similar to mean_avg_prcsn, we need an arange, but this time +2 245 | # since python is zero-indexed, and the denom typically needs +1. 246 | # Also need the log base2... 247 | arange = np.arange(n, dtype=np.float32) # length n 248 | 249 | # since we are only interested in the arange up to n_pred, truncate 250 | # if necessary 251 | arange = arange[:n_pred] 252 | denom = np.log2(arange + 2.) # length n 253 | gains = 1. / denom # length n 254 | 255 | # compute the gains where the prediction is present in the labels 256 | dcg_mask = np.in1d(pred[:n], lab, assume_unique=assume_unique) 257 | dcg = gains[dcg_mask].sum() 258 | 259 | # the max DCG is sum of gains where the index < the label set size 260 | max_dcg = gains[arange < n_lab].sum() 261 | return dcg / max_dcg 262 | 263 | else: 264 | return _warn_for_empty_labels() 265 | 266 | return _mean_ranking_metric(predictions, labels, _inner_ndcg) -------------------------------------------------------------------------------- /packtml/neural_net/mlp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Author: Taylor G Smith 4 | # 5 | # A simple multilayer perceptron classifier. If you find yourself struggling 6 | # to follow the derivation of the back-propagation, check out this great 7 | # refresher on scalar & matrix calculas + differential equations. 8 | # http://parrt.cs.usfca.edu/doc/matrix-calculus/index.html 9 | 10 | from __future__ import absolute_import, division 11 | 12 | from sklearn.utils.validation import check_X_y, check_random_state 13 | from sklearn.utils.multiclass import check_classification_targets 14 | 15 | import numpy as np 16 | 17 | from packtml.base import BaseSimpleEstimator 18 | from packtml.neural_net.base import NeuralMixin, tanh 19 | 20 | __all__ = [ 21 | 'NeuralNetClassifier' 22 | ] 23 | 24 | try: 25 | xrange 26 | except NameError: # py3 27 | xrange = range 28 | 29 | 30 | def _calculate_loss(truth, preds, weights, l2): 31 | """Compute the log loss. 32 | 33 | Calculate the log loss between the true class labels and the predictions 34 | generated by the softmax layer in our neural network. 
35 | 36 | Parameters 37 | ---------- 38 | truth : np.ndarray, shape=(n_samples,) 39 | The true labels 40 | 41 | preds : np.ndarray, shape=(n_samples, n_classes) 42 | The predicted class probabilities 43 | 44 | weights : list 45 | The list of weights matrices. Used for computing the loss 46 | with the L2 regularization. 47 | 48 | l2 : float 49 | The regularization parameter 50 | """ 51 | # get the log probs of the prediction for the true class labels 52 | n_samples = truth.shape[0] 53 | logprobs = -np.log(preds[range(n_samples), truth]) 54 | 55 | # compute the sum of log probs 56 | sum_logprobs = logprobs.sum() 57 | 58 | # add the L2 regularization term 59 | sum_logprobs += l2 / 2. * sum(np.square(W).sum() for W in weights) 60 | return 1. / n_samples * sum_logprobs 61 | 62 | 63 | def softmax(X): 64 | """Apply the softmax function. 65 | 66 | The softmax function squashes an N-dimensional vector into a K-dimensional 67 | vector whose elements add up to 1, and whose elements are bound in (0, 1). 68 | 69 | Parameters 70 | ---------- 71 | X : np.ndarray, shape=(n_samples, n_features) 72 | The matrix over which to apply softmax along the rows. 73 | """ 74 | # first compute the exponential. This is a step that would take place 75 | # in the sigmoid (logistic) function as well. We can already begin to see 76 | # where this is going to resemble logistic regression... 77 | X_exp = np.exp(X) 78 | return X_exp / np.sum(X_exp, axis=1, keepdims=True) 79 | 80 | 81 | class NeuralNetClassifier(BaseSimpleEstimator, NeuralMixin): 82 | """A neural network classifier. 83 | 84 | Create a multi-layer perceptron classifier. Note that this is a very 85 | simple implementation of an MLP with only fully-connected layers and 86 | very few tunable parameters. It is designed for readability. For more 87 | optimized neural network code, look into TensorFlow, Keras or other 88 | libraries. 89 | 90 | This implementation of a neural net uses the TanH activation function 91 | *only*, and does not allow early convergence. It will continue for 92 | ``n_iter``. There are many other parameters that would typically be 93 | tunable in a network, for instance dropout, regularization, learning 94 | rate, etc. The majority of these parameters are left out of this 95 | implementation to keep it simple. 96 | 97 | Parameters 98 | ---------- 99 | X : array-like, shape=(n_samples, n_features) 100 | The training array. Should be a numpy array or array-like structure 101 | with only finite values. 102 | 103 | y : array-like, shape=(n_samples,) 104 | The target vector. 105 | 106 | hidden : iterable, optional (default=(25,)) 107 | An iterable indicating the number of units per hidden layer. 108 | 109 | n_iter : int, optional (default=10) 110 | The default number of iterations to perform. 111 | 112 | learning_rate : float, optional (default=0.001) 113 | The rate at which we descend the gradient. 114 | 115 | random_state : int, None or RandomState, optional (default=42) 116 | The random state for initializing the weights matrices. 117 | """ 118 | def __init__(self, X, y, hidden=(25,), n_iter=10, learning_rate=0.001, 119 | regularization=0.01, random_state=42): 120 | 121 | self.hidden = hidden 122 | self.random_state = random_state 123 | self.n_iter = n_iter 124 | self.learning_rate = learning_rate 125 | self.regularization = regularization 126 | 127 | # initialize weights, biases, etc. 
128 | X, y, weights, biases = self._init_weights_biases( 129 | X, y, hidden, random_state, last_dim=None) 130 | 131 | # we can keep track of the loss for each iter 132 | train_loss = [] 133 | 134 | # for each iteration, feed X through the network, compute the loss, 135 | # and back-propagate the error to correct the weights. 136 | for _ in xrange(n_iter): 137 | # compute the product of X on the hidden layers (the output of 138 | # the network) 139 | out, layer_results = self._forward_step(X, weights, biases) 140 | 141 | # compute the loss on the output 142 | loss = _calculate_loss(truth=y, preds=out, weights=weights, 143 | l2=self.regularization) 144 | train_loss.append(loss) 145 | 146 | # now back-propagate to correct the weights and biases via 147 | # gradient descent 148 | self._back_propagate(y, out, layer_results, weights, 149 | biases, learning_rate, 150 | self.regularization) 151 | 152 | # save the weights, biases and loss as instance attributes 153 | self.weights = weights 154 | self.biases = biases 155 | self.train_loss = train_loss 156 | 157 | @staticmethod 158 | def _init_weights_biases(X, y, hidden, random_state, last_dim=None): 159 | # make sure dims all match in X, y and that we have appropriate 160 | # classification targets 161 | X, y = check_X_y(X, y, copy=False) 162 | check_classification_targets(y) 163 | 164 | random_state = check_random_state(random_state) 165 | 166 | # initialize the weights and biases. For each layer, we create a new 167 | # matrix of dimensions [last_layer_col_dim, new_col_dim]. This ensures 168 | # we can compute matrix products across the layers and that the 169 | # dimensions all match up. The biases will each be a vector of ones 170 | # in this example, though in other networks that can be initialized 171 | # differently 172 | weights = [] 173 | biases = [] 174 | 175 | # if last dim is undefined, use the column shape of the input data. 176 | # this argument is used to simplify the initialization of weights/ 177 | # biases in the transfer learning class... 178 | if last_dim is None: 179 | last_dim = X.shape[1] 180 | 181 | for layer_size in hidden: 182 | # initialize to extremely small values 183 | w = random_state.rand(last_dim, layer_size) * 0.01 184 | b = np.ones(layer_size) 185 | last_dim = layer_size 186 | 187 | weights.append(w) 188 | biases.append(b) 189 | 190 | # we need to add one more layer (the output layer) that is the size of 191 | # the expected output probabilities. We'll apply the softmax function 192 | # to the output of this layer. 193 | n_outputs = np.unique(y).shape[0] 194 | weights.append(random_state.rand(last_dim, n_outputs)) 195 | biases.append(np.ones(n_outputs)) 196 | 197 | return X, y, weights, biases 198 | 199 | @staticmethod 200 | def _forward_step(X, weights, biases): 201 | # track the intermediate products 202 | intermediate_results = [X] 203 | 204 | # progress through all the layers EXCEPT the very last one. 205 | for w, b in zip(weights[:-1], biases[:-1]): 206 | 207 | # apply the activation function to the product of X and the weights 208 | # (after adding the bias vector) 209 | X = tanh(X.dot(w) + b) 210 | 211 | # append this layer result 212 | intermediate_results.append(X) 213 | 214 | # we handle the very last layer a bit differently, since it's out 215 | # output layer. First compute the product... 216 | X = X.dot(weights[-1]) + biases[-1] 217 | 218 | # then rather than apply the activation function (tanh), we apply 219 | # the softmax, which is essentially generalized logistic regression. 
220 | return softmax(X), intermediate_results 221 | 222 | @staticmethod 223 | def _back_propagate(truth, probas, layer_results, weights, 224 | biases, learning_rate, l2): 225 | # Compute the gradient (derivative) of our loss function WRT our 226 | # last layer of weights/biases, and back propagate the error back 227 | # up the layers, adjusting the weights as we go. 228 | # 229 | # Or, expressed in the chain rule: 230 | # dL/dW = (dL/dZ)(dZ/dW) ... 231 | 232 | # the probabilities are our first delta. Subtract 1 from the 233 | # TRUE labels' probabilities in the predictions 234 | n_samples = truth.shape[0] 235 | 236 | # subtract 1 from true idcs. initial deltas are: (y_hat - y) 237 | # This computes d2 = Y - T 238 | probas[range(n_samples), truth] -= 1. 239 | 240 | # iterate back through the layers computing the deltas (derivatives) 241 | last_delta = probas 242 | for next_weights, next_biases, layer_res in \ 243 | zip(weights[::-1], biases[::-1], layer_results[::-1]): 244 | 245 | # the gradient for this layer is equivalent to the previous delta 246 | # multiplied by the intermittent layer result 247 | d_W = layer_res.T.dot(last_delta) 248 | 249 | # column sums of the (just-computed) delta is the derivative 250 | # of the biases 251 | d_b = np.sum(last_delta, axis=0) 252 | 253 | # set the next delta for the next iter 254 | last_delta = last_delta.dot(next_weights.T) * \ 255 | (1. - np.power(layer_res, 2.)) 256 | 257 | # update the weights gradient with the L2 regularization term 258 | d_W += l2 * next_weights 259 | 260 | # update the weights in this layer. The learning rate governs how 261 | # quickly we descend the gradient 262 | next_weights += -learning_rate * d_W 263 | next_biases += -learning_rate * d_b 264 | 265 | def predict(self, X): 266 | # compute the probabilities and then get the argmax for each class 267 | probas = self.predict_proba(X) 268 | 269 | # we want the argmaxes of each row 270 | return np.argmax(probas, axis=1) 271 | 272 | def predict_proba(self, X): 273 | # simply compute a forward step (we don't care about idx 1 of the 274 | # tuple, which is just the intermediate products) 275 | return self._forward_step(X, self.weights, self.biases)[0] 276 | 277 | def export_weights_and_biases(self, output_layer=True): 278 | w, b = self.weights, self.biases 279 | if output_layer: 280 | return w, b 281 | return w[:-1], b[:-1] 282 | -------------------------------------------------------------------------------- /packtml/decision_tree/cart.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Author: Taylor G Smith 4 | # 5 | # A simplified version of Classification and Regression Trees. This file 6 | # is intended to maximize readability and understanding of how CART trees work. 7 | # For very fast or customizable decision tree solutions, use scikit-learn. 8 | # 9 | # The best order in which to read & understand the contents to best 10 | # grok the entire concept: 11 | # 12 | # 1. metrics.InformationGain & metrics.VarianceReduction 13 | # 2. RandomSplitter 14 | # 3. LeafNode 15 | # 4. 
BaseCART 16 | 17 | from __future__ import absolute_import, division 18 | 19 | from sklearn.utils.validation import check_X_y, check_random_state, check_array 20 | from sklearn.utils.multiclass import check_classification_targets 21 | from sklearn.base import ClassifierMixin, RegressorMixin, is_classifier 22 | 23 | import numpy as np 24 | 25 | from packtml.base import BaseSimpleEstimator 26 | from packtml.decision_tree.metrics import InformationGain, VarianceReduction 27 | 28 | __all__ = [ 29 | 'CARTRegressor', 30 | 'CARTClassifier' 31 | ] 32 | 33 | try: 34 | xrange 35 | except NameError: # py3 36 | xrange = range 37 | 38 | 39 | class RandomSplitter(object): 40 | """Evaluate a split via random values in a feature. 41 | 42 | Every feature in the dataset needs to be evaluated in a CART tree. Since 43 | that in itself can be expensive, the random splitter allows us to look at 44 | only a random amount of row splits per feature in order to make the best 45 | splitting decision. 46 | 47 | Parameters 48 | ---------- 49 | random_state : np.random.RandomState 50 | The random state for seeding the choices 51 | 52 | criterion : callable 53 | The metric used for evaluating the "goodness" of a split. Either 54 | ``InformationGain`` (with entropy or Gini) for classification, or 55 | ``VarianceReduction`` for regression. 56 | 57 | n_val_sample : float, optional (default=25) 58 | The number of values per feature to sample as a splitting point. 59 | """ 60 | def __init__(self, random_state, criterion, n_val_sample=25): 61 | self.random_state = random_state 62 | self.criterion = criterion # BaseCriterion from metrics 63 | self.n_val_sample = n_val_sample 64 | 65 | def find_best(self, X, y): 66 | criterion = self.criterion 67 | rs = self.random_state 68 | 69 | # keep track of the best info gain 70 | best_gain = 0. 71 | 72 | # keep track of best feature and best value on which to split 73 | best_feature = None 74 | best_value = None 75 | 76 | # get the current state of the uncertainty (gini or entropy) 77 | uncertainty = criterion.compute_uncertainty(y) 78 | 79 | # iterate over each feature 80 | for col in xrange(X.shape[1]): 81 | feature = X[:, col] 82 | 83 | # get all values in the feature 84 | # values = np.unique(feature) 85 | seen_values = set() 86 | 87 | # the number of values to sample. Should be defined as the min 88 | # between the prescribed n_val_sample value and the number of 89 | # unique values in the feature. 90 | n_vals = min(self.n_val_sample, np.unique(feature).shape[0]) 91 | 92 | # For each of n_val_sample iterations, select a random value 93 | # from the feature and create a split. We store whether we've seen 94 | # the value before; if we have, continue. Continue until we've seen 95 | # n_vals unique values. This allows us to more likely select values 96 | # that are high frequency (retains distributional data implicitly) 97 | for v in rs.permutation(feature): 98 | 99 | # if we've hit the limit of the number of values we wanted to 100 | # examine, break out 101 | if len(seen_values) == n_vals: 102 | break 103 | 104 | # if we've already tried this value, continue 105 | elif v in seen_values: # O(1) lookup 106 | continue 107 | 108 | # otherwise, it's a new value we've never tried splitting on. 109 | # add it to the set. 
110 | seen_values.add(v) 111 | 112 | # create the mask (these values "go left") 113 | mask = feature >= v # type: np.ndarray 114 | 115 | # skip this step if this doesn't divide the dataset 116 | if np.unique(mask).shape[0] == 1: # all True or all False 117 | continue 118 | 119 | # compute how good this split was 120 | gain = criterion(y, mask, uncertainty=uncertainty) 121 | 122 | # if the gain is better, we keep this feature & value & 123 | # update the best gain we've seen so far 124 | if gain > best_gain: 125 | best_feature = col 126 | best_value = v 127 | best_gain = gain 128 | 129 | # if best feature is None, it means we never found a viable split... 130 | # this is likely because all of our labels were perfect. In this case, 131 | # we could select any feature and the first value and define that as 132 | # our left split and nothing will go right. 133 | if best_feature is None: 134 | best_feature = 0 135 | best_value = np.squeeze(X[:, best_feature])[0] 136 | best_gain = 0. 137 | 138 | # we need to know the best feature, the best value, and the best gain 139 | return best_feature, best_value, best_gain 140 | 141 | 142 | class LeafNode(object): 143 | """A tree node class. 144 | 145 | Tree node that store the column on which to split and the value above 146 | which to go left vs. right. Additionally, it stores the target statistic 147 | related to this node. For instance, in a classification scenario: 148 | 149 | >>> X = np.array([[ 1, 1.5 ], 150 | ... [ 2, 0.5 ], 151 | ... [ 3, 0.75]]) 152 | >>> y = np.array([0, 1, 1]) 153 | >>> node = LeafNode(split_col=0, split_val=2, 154 | ... class_statistic=_most_common(y)) 155 | 156 | This means if ``node`` were a terminal node, it would generate predictions 157 | of 1, since that was the most common value in the pre-split ``y``. The 158 | class statistic will differ for splits in the tree, where the most common 159 | value in ``y`` for records in ``X`` that go left is 1, and 0 for that which 160 | goes to the right. 161 | 162 | The class statistic is computed for each split as the tree recurses. 163 | 164 | Parameters 165 | ---------- 166 | split_col : int 167 | The column on which to split. 168 | 169 | split_val : float or int 170 | The value above which to go left. 171 | """ 172 | def __init__(self, split_col, split_val, split_gain, class_statistic): 173 | 174 | self.split_col = split_col 175 | self.split_val = split_val 176 | self.split_gain = split_gain 177 | 178 | # the class statistic is the mode or the mean of the targets for 179 | # this split 180 | self.class_statistic = class_statistic 181 | 182 | # if these remain None, it's a terminal node 183 | self.left = None 184 | self.right = None 185 | 186 | def create_split(self, X, y): 187 | """Split the next X, y. 188 | 189 | Returns 190 | ------- 191 | X_left : np.ndarray, shape=(n_samples, n_features) 192 | Rows where ``split_col >= split_val``. 193 | 194 | X_right : np.ndarray, shape=(n_samples, n_features) 195 | Rows where ``split_col < split_val``. 196 | 197 | y_left : np.ndarray, shape=(n_samples,) 198 | Target where ``split_col >= split_val``. 199 | 200 | y_right : np.ndarray, shape=(n_samples,) 201 | Target where ``split_col < split_val``. 202 | """ 203 | # If values in the split column are greater than or equal to the 204 | # split value, we go left. 
205 | left_mask = X[:, self.split_col] >= self.split_val 206 | 207 | # Otherwise we go to the right 208 | right_mask = ~left_mask # type: np.ndarray 209 | 210 | # If the left mask is all False or all True, it means we've achieved 211 | # a perfect split. 212 | all_left = left_mask.all() 213 | all_right = right_mask.all() 214 | 215 | # create the left split. If it's all right side, we'll return None 216 | X_left = X[left_mask, :] if not all_right else None 217 | y_left = y[left_mask] if not all_right else None 218 | 219 | # create the right split. If it's all left side, we'll return None. 220 | X_right = X[right_mask, :] if not all_left else None 221 | y_right = y[right_mask] if not all_left else None 222 | 223 | return X_left, X_right, y_left, y_right 224 | 225 | def is_terminal(self): 226 | """Determine whether the node is terminal. 227 | 228 | If there is no left node and no right node, it's a terminal node. 229 | If either is non-None, it is a parent to something. 230 | """ 231 | return self.left is None and self.right is None 232 | 233 | def __repr__(self): 234 | """Get the string representation of the node.""" 235 | return "Rule: Go left if x%i >= %r else go right (gain=%.3f)" \ 236 | % (self.split_col, self.split_val, self.split_gain) 237 | 238 | def predict_record(self, record): 239 | """Find the terminal node in the tree and return the class statistic""" 240 | # First base case, this is a terminal node: 241 | has_left = self.left is not None 242 | has_right = self.right is not None 243 | if not has_left and not has_right: 244 | return self.class_statistic 245 | 246 | # Otherwise, determine whether the record goes right or left 247 | go_left = record[self.split_col] >= self.split_val 248 | 249 | # if we go left and there is a left node, delegate the recursion to the 250 | # left side 251 | if go_left and has_left: 252 | return self.left.predict_record(record) 253 | 254 | # if we go right, delegate to the right 255 | if not go_left and has_right: 256 | return self.right.predict_record(record) 257 | 258 | # if we get here, it means one of two things: 259 | # 1. we were supposed to go left and didn't have a left 260 | # 2. we were supposed to go right and didn't have a right 261 | # for both of these, we return THIS class statistic 262 | return self.class_statistic 263 | 264 | 265 | def _most_common(y): 266 | # This is essentially just a "mode" function to compute the most 267 | # common value in a vector. 
268 | cls, cts = np.unique(y, return_counts=True) 269 | order = np.argsort(-cts) 270 | return cls[order][0] 271 | 272 | 273 | class _BaseCART(BaseSimpleEstimator): 274 | def __init__(self, X, y, criterion, min_samples_split, max_depth, 275 | n_val_sample, random_state): 276 | # make sure max_depth > 1 277 | if max_depth < 2: 278 | raise ValueError("max depth must be > 1") 279 | 280 | # check the input arrays, and if it's classification validate the 281 | # target values in y 282 | X, y = check_X_y(X, y, accept_sparse=False, dtype=None, copy=True) 283 | if is_classifier(self): 284 | check_classification_targets(y) 285 | 286 | # hyper parameters so we can later inspect attributes of the model 287 | self.min_samples_split = min_samples_split 288 | self.max_depth = max_depth 289 | self.n_val_sample = n_val_sample 290 | self.random_state = random_state 291 | 292 | # create the splitting class 293 | random_state = check_random_state(random_state) 294 | self.splitter = RandomSplitter(random_state, criterion, n_val_sample) 295 | 296 | # grow the tree depth first 297 | self.tree = self._find_next_split(X, y, 0) 298 | 299 | def _target_stat(self, y): 300 | """Given a vector, ``y``, decide what value to return as the leaf 301 | node statistic (mean for regression, mode for classification) 302 | """ 303 | 304 | def _find_next_split(self, X, y, current_depth): 305 | # base case 1: current depth is the limit, the parent node should 306 | # be a terminal node (child = None) 307 | # base case 2: n samples in X <= min_samples_split 308 | if current_depth == self.max_depth or \ 309 | X.shape[0] <= self.min_samples_split: 310 | return None 311 | 312 | # create the next split 313 | split_feature, split_value, gain = \ 314 | self.splitter.find_best(X, y) 315 | 316 | # create the next node based on the best split feature and value 317 | # that we just found. Also compute the "target stat" (mode of y for 318 | # classification problems or mean of y for regression problems) and 319 | # pass that to the node in case it is the terminal node (i.e., the 320 | # decision maker) 321 | node = LeafNode(split_feature, split_value, gain, self._target_stat(y)) 322 | 323 | # Create the splits based on the criteria we just determined, and then 324 | # recurse down left, right sides 325 | X_left, X_right, y_left, y_right = node.create_split(X, y) 326 | 327 | # if either the left or right is None, it means we've achieved a 328 | # perfect split. It is then a terminal node and will remain None. 329 | if X_left is not None: 330 | node.left = self._find_next_split(X_left, y_left, 331 | current_depth + 1) 332 | 333 | if X_right is not None: 334 | node.right = self._find_next_split(X_right, y_right, 335 | current_depth + 1) 336 | 337 | return node 338 | 339 | def predict(self, X): 340 | # Check the array 341 | X = check_array(X, dtype=np.float32) # type: np.ndarray 342 | 343 | # For each record in X, find its leaf node in the tree (O(log N)) 344 | # to get the predictions. This makes the prediction operation 345 | # O(N log N) runtime complexity 346 | predictions = [self.tree.predict_record(row) for row in X] 347 | return np.asarray(predictions) 348 | 349 | 350 | class CARTRegressor(_BaseCART, RegressorMixin): 351 | """Decision tree regression. 352 | 353 | Builds a decision tree to solve a regression problem using the CART 354 | algorithm. 
The estimator builds a binary tree structure, evaluating each
355 |     feature at each iteration to recursively split along the best value and
356 |     progress down the tree until each leaf node reaches parsimony.
357 | 
358 |     The regression tree uses "variance reduction" to assess the "goodness"
359 |     of a split, selecting the split and feature that maximizes the value.
360 | 
361 |     To make predictions, each record is evaluated at each node of the tree
362 |     until it reaches a leaf node. For regression, predictions are made by
363 |     returning the training target's mean for the leaf node.
364 | 
365 |     Parameters
366 |     ----------
367 |     X : array-like, shape=(n_samples, n_features)
368 |         The training array. Should be a numpy array or array-like structure
369 |         with only finite values.
370 | 
371 |     y : array-like, shape=(n_samples,)
372 |         The target vector.
373 | 
374 |     max_depth : int, optional (default=5)
375 |         The maximum depth to which the tree will grow. Note that the tree is
376 |         not guaranteed to reach this depth and may stop growing early if the
377 |         ``min_samples_split`` terminal criterion is met first.
378 | 
379 |     min_samples_split : int, optional (default=1)
380 |         A terminal criterion used to halt the growth of a tree. If a leaf
381 |         node's split contains <= ``min_samples_split``, it will not grow
382 |         any further.
383 | 
384 |     n_val_sample : int, optional (default=25)
385 |         The method by which we evaluate splits differs a bit from highly
386 |         optimized libraries like scikit-learn, which may evaluate for the
387 |         globally optimal split for each feature. We use random splitting
388 |         which evaluates a number of unique values for each feature at each
389 |         split. The ``n_val_sample`` is the maximum number of values per
390 |         feature that will be evaluated as a potential splitting point at
391 |         each iteration.
392 | 
393 |     random_state : int, None or RandomState, optional (default=None)
394 |         The random state used to seed the RandomSplitter.
395 | 
396 |     Attributes
397 |     ----------
398 |     splitter : RandomSplitter
399 |         The feature splitting class. Used for determining optimal splits at
400 |         each node.
401 | 
402 |     tree : LeafNode
403 |         The actual tree. Each node contains data on the class statistic (i.e.,
404 |         mode or mean of the training target at that split), best feature and
405 |         best value.
406 |     """
407 |     def __init__(self, X, y, max_depth=5, min_samples_split=1,
408 |                  n_val_sample=25, random_state=None):
409 | 
410 |         super(CARTRegressor, self).__init__(
411 |             X, y, criterion=VarianceReduction(),
412 |             min_samples_split=min_samples_split, max_depth=max_depth,
413 |             n_val_sample=n_val_sample, random_state=random_state)
414 | 
415 |     def _target_stat(self, y):
416 |         """Given a vector, ``y``, get the mean"""
417 |         return y.mean()
418 | 
419 | 
420 | class CARTClassifier(_BaseCART, ClassifierMixin):
421 |     """Decision tree classification.
422 | 
423 |     Builds a decision tree to solve a classification problem using the CART
424 |     algorithm. The estimator builds a binary tree structure, evaluating each
425 |     feature at each iteration to recursively split along the best value and
426 |     progress down the tree until each leaf node reaches parsimony.
427 | 
428 |     The classification tree uses "information gain" to assess the "goodness"
429 |     of a split, selecting the split and feature that maximizes the value.
430 | 
431 |     To make predictions, each record is evaluated at each node of the tree
432 |     until it reaches a leaf node. For classification, predictions are made by
For classification, predictions are made by 433 | returning the training target's mode for the leaf node. 434 | 435 | Parameters 436 | ---------- 437 | X : array-like, shape=(n_samples, n_features) 438 | The training array. Should be a numpy array or array-like structure 439 | with only finite values. 440 | 441 | y : array-like, shape=(n_samples,) 442 | The target vector. 443 | 444 | criterion : str or unicode, optional (default='gini') 445 | The splitting criterion used for classification problems. CART trees 446 | typically use "gini" but their cousins, C4.5 trees, use "entropy". Both 447 | metrics are extremely similar and will likely not change your tree 448 | structure by much. 449 | 450 | max_depth : int, optional (default=5) 451 | The maximum depth to which the tree will grow. Note that the tree is 452 | not guaranteed to reach this depth and may stop growing early if the 453 | ``min_samples_split`` terminal criterion is met first. 454 | 455 | min_samples_split : int, optional (default=1) 456 | A terminal criterion used to halt the growth of a tree. If a leaf 457 | node's split contains <= ``min_samples_split``, it will not grow 458 | any further. 459 | 460 | n_val_sample : int, optional (default=25) 461 | The method by which we evaluate splits differs a bit from highly 462 | optimized libraries like scikit-learn, which may evaluate for the 463 | globally optimal split for each feature. We use random splitting 464 | which evaluates a number of unique values for each feature at each 465 | split. The ``n_val_sample`` is the maximum number of values per 466 | feature that will be evaluated as a potential splitting point at 467 | each iteration. 468 | 469 | random_state : int, None or RandomState, optional (default=None) 470 | The random state used to seed the RandomSplitter. 471 | 472 | Attributes 473 | ---------- 474 | splitter : RandomSplitter 475 | The feature splitting class. Used for determining optimal splits at 476 | each node. 477 | 478 | tree : LeafNode 479 | The actual tree. Each node contains data on the class statistic (i.e., 480 | mode or mean of the training target at that split), best feature and 481 | best value. 482 | """ 483 | def __init__(self, X, y, criterion='gini', max_depth=5, 484 | min_samples_split=1, n_val_sample=25, random_state=None): 485 | 486 | super(CARTClassifier, self).__init__( 487 | X, y, criterion=InformationGain(criterion), max_depth=max_depth, 488 | min_samples_split=min_samples_split, 489 | n_val_sample=n_val_sample, random_state=random_state) 490 | 491 | def _target_stat(self, y): 492 | """Given a vector, ``y``, get the mode""" 493 | return _most_common(y) 494 | --------------------------------------------------------------------------------