├── packtml ├── VERSION ├── utils │ ├── tests │ │ ├── __init__.py │ │ ├── test_linalg.py │ │ └── test_validation.py │ ├── __init__.py │ ├── linalg.py │ ├── extmath.py │ ├── plotting.py │ └── validation.py ├── clustering │ ├── tests │ │ ├── __init__.py │ │ └── test_knn.py │ ├── __init__.py │ └── knn.py ├── metrics │ ├── tests │ │ ├── __init__.py │ │ └── test_ranking.py │ ├── __init__.py │ └── ranking.py ├── neural_net │ ├── tests │ │ ├── __init__.py │ │ ├── test_mlp.py │ │ └── test_transfer.py │ ├── __init__.py │ ├── base.py │ ├── transfer.py │ └── mlp.py ├── regression │ ├── tests │ │ ├── __init__.py │ │ ├── test_simple_regression.py │ │ └── test_simple_logistic.py │ ├── __init__.py │ ├── simple_regression.py │ └── simple_logistic.py ├── decision_tree │ ├── tests │ │ ├── __init__.py │ │ ├── test_metrics.py │ │ └── test_cart.py │ ├── __init__.py │ ├── metrics.py │ └── cart.py ├── recommendation │ ├── tests │ │ ├── __init__.py │ │ ├── test_als.py │ │ └── test_itemitem.py │ ├── __init__.py │ ├── base.py │ ├── data.py │ ├── itemitem.py │ └── als.py ├── __init__.py └── base.py ├── MANIFEST.in ├── examples ├── data │ └── README.md ├── decision_tree │ ├── example_information_gain.py │ ├── example_classification_split.py │ ├── example_regression_decision_tree.py │ └── example_classification_decision_tree.py ├── recommendation │ ├── example_item_item_recommender.py │ └── example_als_recommender.py ├── run_all_examples.py ├── regression │ ├── example_linear_regression.py │ └── example_logistic_regression.py ├── clustering │ └── example_knn_classifier.py └── neural_net │ ├── example_mlp_classifier.py │ └── example_transfer_learning.py ├── requirements.txt ├── curriculum.docx ├── environment.yml ├── .coveragerc ├── img ├── clustering │ └── example_knn_classifier.png ├── neural_net │ ├── example_mlp_classifier.png │ └── example_transfer_learning.png ├── regression │ ├── example_linear_regression.png │ └── example_logistic_regression.png ├── recommendation │ └── example_als_recommender.png ├── decision_tree │ ├── example_regression_decision_tree.png │ └── example_classification_decision_tree.png └── README.md ├── .travis.yml ├── LICENSE ├── .gitignore ├── setup.py ├── appveyor.yml └── README.md /packtml/VERSION: -------------------------------------------------------------------------------- 1 | 1.0.5 -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive include packtml/* -------------------------------------------------------------------------------- /examples/data/README.md: -------------------------------------------------------------------------------- 1 | # Demo data 2 | 3 | Cached data for the ML demo goes here. 
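The tree above shows the package layout; the `packtml/VERSION` file it lists is read by `packtml/__init__.py` (shown further down) to populate the package's version string at import time. A minimal sketch of how that surfaces to a user, assuming the package has been installed (e.g. with `python setup.py install`):

import packtml

# __version__ is just the stripped contents of packtml/VERSION
print(packtml.__version__)  # -> "1.0.5"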
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=0.15 2 | scipy>=0.19 3 | scikit-learn>=0.19 4 | pandas>=0.23 5 | matplotlib -------------------------------------------------------------------------------- /packtml/utils/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import -------------------------------------------------------------------------------- /packtml/clustering/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import -------------------------------------------------------------------------------- /packtml/metrics/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import -------------------------------------------------------------------------------- /packtml/neural_net/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import -------------------------------------------------------------------------------- /packtml/regression/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import -------------------------------------------------------------------------------- /packtml/decision_tree/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import -------------------------------------------------------------------------------- /packtml/recommendation/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import -------------------------------------------------------------------------------- /curriculum.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/curriculum.docx -------------------------------------------------------------------------------- /packtml/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .ranking import * 4 | 5 | __all__ = [s for s in dir() if not s.startswith("_")] 6 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: packt-sml 2 | 3 | dependencies: 4 | - python=3.6 5 | - numpy 6 | - scipy 7 | - scikit-learn 8 | - pandas 9 | - matplotlib -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = packtml 3 | include = */packtml/* 4 | omit = 5 | */packtml/setup.py 6 | */packtml/utils/plotting.py 7 | */setup.py 8 | -------------------------------------------------------------------------------- 
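The `environment.yml` and `.coveragerc` files above configure the development environment and coverage measurement for the per-subpackage `tests/` directories listed in the tree. A minimal sketch of invoking the suite programmatically, assuming `pytest` is installed (setup.py further down recommends `pytest packtml` after installation); the coverage flag additionally assumes the `pytest-cov` plugin, which picks up `.coveragerc`:

import pytest

# equivalent to running `pytest packtml --cov packtml` from the repository root
pytest.main(["packtml", "--cov", "packtml"])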
/packtml/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from packtml.clustering.knn import * 4 | 5 | __all__ = [s for s in dir() if not s.startswith("_")] 6 | -------------------------------------------------------------------------------- /img/clustering/example_knn_classifier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/clustering/example_knn_classifier.png -------------------------------------------------------------------------------- /img/neural_net/example_mlp_classifier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/neural_net/example_mlp_classifier.png -------------------------------------------------------------------------------- /img/neural_net/example_transfer_learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/neural_net/example_transfer_learning.png -------------------------------------------------------------------------------- /img/regression/example_linear_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/regression/example_linear_regression.png -------------------------------------------------------------------------------- /img/recommendation/example_als_recommender.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/recommendation/example_als_recommender.png -------------------------------------------------------------------------------- /img/regression/example_logistic_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/regression/example_logistic_regression.png -------------------------------------------------------------------------------- /img/decision_tree/example_regression_decision_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/decision_tree/example_regression_decision_tree.png -------------------------------------------------------------------------------- /packtml/neural_net/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from packtml.neural_net.mlp import * 4 | from packtml.neural_net.transfer import * 5 | 6 | __all__ = [s for s in dir() if not s.startswith("_")] 7 | -------------------------------------------------------------------------------- /img/decision_tree/example_classification_decision_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-Supervised-Machine-Learning-with-Python/HEAD/img/decision_tree/example_classification_decision_tree.png 
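Each subpackage `__init__.py` (such as the clustering and neural_net ones above) star-imports its modules and then derives `__all__` from the resulting public names, so every estimator can be imported from either the subpackage or its defining module. A brief sketch of what that enables, assuming the package is installed:

# both of these resolve to the same class, thanks to the re-export above
from packtml.clustering import KNNClassifier
from packtml.clustering.knn import KNNClassifier

# likewise for the neural net estimators re-exported from mlp.py and transfer.py
from packtml.neural_net import NeuralNetClassifier, TransferLearningClassifier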
-------------------------------------------------------------------------------- /packtml/decision_tree/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from packtml.decision_tree.cart import * 4 | from packtml.decision_tree.metrics import * 5 | 6 | __all__ = [s for s in dir() if not s.startswith("_")] 7 | -------------------------------------------------------------------------------- /packtml/regression/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from packtml.regression.simple_regression import * 4 | from packtml.regression.simple_logistic import * 5 | 6 | __all__ = [s for s in dir() if not s.startswith("_")] 7 | 8 | -------------------------------------------------------------------------------- /packtml/recommendation/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from packtml.recommendation.als import * 4 | from packtml.recommendation.data import * 5 | from packtml.recommendation.itemitem import * 6 | 7 | __all__ = [s for s in dir() if not s.startswith("_")] 8 | -------------------------------------------------------------------------------- /packtml/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from packtml.utils.extmath import * 4 | from packtml.utils.linalg import * 5 | from packtml.utils.plotting import * 6 | from packtml.utils.validation import * 7 | 8 | __all__ = [s for s in dir() if not s.startswith("_")] 9 | -------------------------------------------------------------------------------- /img/README.md: -------------------------------------------------------------------------------- 1 | # img 2 | 3 | Within this directory, you'll find the output of the various example scripts. 4 | The rendering of these images is automated by the 5 | [examples/run_all_examples.py](../examples/run_all_examples.py) script. 6 | 7 | ### Do not directly edit anything in this directory! Its contents are 100% automated!! 
8 | -------------------------------------------------------------------------------- /packtml/neural_net/tests/test_mlp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.neural_net import NeuralNetClassifier 6 | from sklearn.datasets import load_iris 7 | 8 | iris = load_iris() 9 | X, y = iris.data, iris.target 10 | 11 | 12 | def test_mlp(): 13 | # show we can fit and predict 14 | clf = NeuralNetClassifier(X, y, random_state=42) 15 | clf.predict(X) 16 | -------------------------------------------------------------------------------- /packtml/utils/tests/test_linalg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from sklearn.datasets import load_iris 6 | from packtml.utils import linalg 7 | 8 | import numpy as np 9 | 10 | iris = load_iris() 11 | X, y = iris.data, iris.target 12 | 13 | 14 | def test_row_norms(): 15 | means = np.average(X, axis=0) 16 | X_centered = X - means 17 | 18 | norms = linalg.l2_norm(X_centered, axis=0) 19 | assert np.allclose( 20 | norms, 21 | np.array([10.10783524, 5.29269308, 21.53749599, 9.31556404]), 22 | rtol=0.01) 23 | -------------------------------------------------------------------------------- /packtml/regression/tests/test_simple_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.regression import SimpleLinearRegression 6 | 7 | import numpy as np 8 | from numpy.testing import assert_almost_equal 9 | 10 | 11 | def test_simple_linear_regression(): 12 | # y = 2a + 1.5b + 0 13 | random_state = np.random.RandomState(42) 14 | X = random_state.rand(100, 2) 15 | y = 2. * X[:, 0] + 1.5 * X[:, 1] 16 | 17 | lm = SimpleLinearRegression(X, y) 18 | predictions = lm.predict(X) 19 | residuals = y - predictions 20 | assert_almost_equal(residuals.sum(), 0.) 21 | assert np.allclose(lm.theta, [2., 1.5]) 22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: required 3 | 4 | cache: 5 | apt: true 6 | directories: 7 | - $HOME/.cache/pip 8 | - $HOME/.ccache 9 | 10 | before_install: 11 | - source build_tools/travis/before_install.sh 12 | env: 13 | global: 14 | - TEST_DIR=/tmp/packtml 15 | 16 | matrix: 17 | include: 18 | - os: linux 19 | dist: trusty 20 | env: PYTHON_VERSION="3.5" 21 | 22 | - os: linux 23 | dist: trusty 24 | env: PYTHON_VERSION="3.6" 25 | 26 | install: source build_tools/travis/install.sh 27 | before_script: bash build_tools/travis/before_script.sh 28 | script: bash build_tools/travis/test_script.sh 29 | after_success: bash build_tools/travis/after_success.sh 30 | -------------------------------------------------------------------------------- /packtml/utils/linalg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from numpy import linalg as la 6 | 7 | __all__ = [ 8 | 'l2_norm' 9 | ] 10 | 11 | 12 | def l2_norm(X, axis=0): 13 | """Compute the L2 (Euclidean) norm of a matrix. 14 | 15 | Computes the L2 norm along the specified axis. If axis is 0, 16 | computes the norms along the columns. 
If 1, computes along the 17 | rows. 18 | 19 | Parameters 20 | ---------- 21 | X : array-like, shape=(n_samples, n_features) 22 | The matrix on which to compute the norm. 23 | 24 | axis : int, optional (default=0) 25 | The axis along which to compute the norm. 0 is for columns, 26 | 1 is for rows. 27 | """ 28 | return la.norm(X, ord=None, axis=axis) 29 | -------------------------------------------------------------------------------- /packtml/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | 5 | # global namespace: 6 | from packtml import clustering 7 | from packtml import decision_tree 8 | from packtml import metrics 9 | from packtml import neural_net 10 | from packtml import recommendation 11 | from packtml import regression 12 | from packtml import utils 13 | 14 | # set the version 15 | packtml_location = os.path.abspath(os.path.dirname(__file__)) 16 | with open(os.path.join(packtml_location, "VERSION")) as vsn: 17 | __version__ = vsn.read().strip() 18 | 19 | # remove from global namespace 20 | del os 21 | del packtml_location 22 | del vsn 23 | 24 | __all__ = [ 25 | 'clustering', 26 | 'decision_tree', 27 | 'metrics', 28 | 'neural_net', 29 | 'recommendation', 30 | 'regression', 31 | 'utils' 32 | ] 33 | -------------------------------------------------------------------------------- /examples/decision_tree/example_information_gain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.decision_tree.metrics import gini_impurity, InformationGain 6 | import numpy as np 7 | 8 | # ############################################################################# 9 | # Build the example from the slides 10 | y = np.array([0, 0, 0, 1, 1, 1, 1]) 11 | uncertainty = gini_impurity(y) 12 | print("Initial gini impurity: %.4f" % uncertainty) 13 | 14 | # now get the information gain of the split from the slides 15 | directions = np.array(["right", "left", "left", "left", 16 | "right", "right", "right"]) 17 | mask = directions == "left" 18 | print("Information gain from the split we created: %.4f" 19 | % InformationGain("gini")(target=y, mask=mask, uncertainty=uncertainty)) 20 | -------------------------------------------------------------------------------- /packtml/clustering/tests/test_knn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.clustering import KNNClassifier 6 | 7 | from sklearn.datasets import load_iris 8 | from numpy.testing import assert_array_equal 9 | import numpy as np 10 | 11 | iris = load_iris() 12 | X = iris.data[:, :2] 13 | y = iris.target 14 | 15 | 16 | def test_knn(): 17 | # show we can fit 18 | knn = KNNClassifier(X, y) 19 | # show we can predict 20 | knn.predict(X) 21 | 22 | 23 | def test_knn2(): 24 | X2 = np.array([[0., 0., 0.5], 25 | [0., 0.5, 0.], 26 | [0.5, 0., 0.], 27 | [5., 5., 6.], 28 | [6., 5., 5.]]) 29 | 30 | y2 = [0, 0, 0, 1, 1] 31 | knn = KNNClassifier(X2, y2, k=3) 32 | preds = knn.predict(X2) 33 | assert_array_equal(preds, y2) 34 | -------------------------------------------------------------------------------- /packtml/neural_net/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | import six 6 | from abc 
import ABCMeta, abstractmethod 7 | 8 | import numpy as np 9 | 10 | __all__ = [ 11 | 'tanh', 12 | 'NeuralMixin' 13 | ] 14 | 15 | 16 | def tanh(X): 17 | """Hyperbolic tangent. 18 | 19 | Compute the tan-h (Hyperbolic tangent) activation function. 20 | This is a very easily-differentiable activation function. 21 | 22 | Parameters 23 | ---------- 24 | X : np.ndarray, shape=(n_samples, n_features) 25 | The transformed X array (X * W + b). 26 | """ 27 | return np.tanh(X) 28 | 29 | 30 | class NeuralMixin(six.with_metaclass(ABCMeta)): 31 | """Abstract interface for neural network classes.""" 32 | @abstractmethod 33 | def export_weights_and_biases(self, output_layer=True): 34 | """Return the weights and biases of the network""" 35 | -------------------------------------------------------------------------------- /examples/decision_tree/example_classification_split.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.decision_tree.cart import RandomSplitter 6 | from packtml.decision_tree.metrics import InformationGain 7 | import numpy as np 8 | 9 | # ############################################################################# 10 | # Build the example from the slides (3.3) 11 | X = np.array([[21, 3], [ 4, 2], [37, 2]]) 12 | y = np.array([1, 0, 1]) 13 | 14 | # this is the splitting class; we'll use gini as the criteria 15 | random_state = np.random.RandomState(42) 16 | splitter = RandomSplitter(random_state=random_state, 17 | criterion=InformationGain('gini'), 18 | n_val_sample=3) 19 | 20 | # find the best: 21 | best_feature, best_value, best_gain = splitter.find_best(X, y) 22 | print("Best feature=%i, best value=%r, information gain: %.3f" 23 | % (best_feature, best_value, best_gain)) 24 | -------------------------------------------------------------------------------- /packtml/regression/tests/test_simple_logistic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.regression import SimpleLogisticRegression 6 | from sklearn.datasets import make_classification 7 | from sklearn.metrics import accuracy_score 8 | 9 | import numpy as np 10 | 11 | X, y = make_classification(n_samples=100, n_features=2, random_state=42, 12 | n_redundant=0, n_repeated=0, n_classes=2, 13 | class_sep=1.0) 14 | 15 | 16 | def test_simple_logistic(): 17 | lm = SimpleLogisticRegression(X, y, n_steps=50, loglik_interval=10) 18 | assert np.allclose(lm.theta, np.array([ 1.32320936, -0.03926072])) 19 | 20 | # test that we can predict 21 | preds = lm.predict(X) 22 | 23 | # show we're better than chance 24 | assert accuracy_score(y, preds) > 0.5 25 | 26 | # show that we only computed the log likelihood 5 times 27 | assert len(lm.log_likelihood) == 5, lm.log_likelihood 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 
10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /packtml/utils/tests/test_validation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.utils import validation as val 6 | from packtml.regression import SimpleLogisticRegression 7 | 8 | from sklearn.metrics import accuracy_score 9 | from sklearn.datasets import load_breast_cancer 10 | 11 | bc = load_breast_cancer() 12 | X, y = bc.data, bc.target 13 | 14 | 15 | def test_is_iterable(): 16 | assert val.is_iterable([1, 2, 3]) 17 | assert val.is_iterable((1, 2, 3)) 18 | assert val.is_iterable({1, 2, 3}) 19 | assert val.is_iterable({1: 'a', 2: 'b'}) 20 | assert not val.is_iterable(123) 21 | assert not val.is_iterable(None) 22 | assert not val.is_iterable("a string") 23 | 24 | 25 | def test_learning_curves(): 26 | train_scores, val_scores = \ 27 | val.learning_curve( 28 | SimpleLogisticRegression, X, y, 29 | metric=accuracy_score, 30 | train_sizes=(100, 250, 400), 31 | n_folds=3, seed=42, trace=True, 32 | 33 | # kwargs: 34 | n_steps=20, loglik_interval=20) 35 | 36 | assert train_scores.shape == (3, 3) 37 | assert val_scores.shape == (3, 3) 38 | -------------------------------------------------------------------------------- /packtml/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from abc import ABCMeta, abstractmethod 6 | import six 7 | 8 | __all__ = [ 9 | 'BaseSimpleEstimator' 10 | ] 11 | 12 | 13 | class BaseSimpleEstimator(six.with_metaclass(ABCMeta)): 14 | """Base class for packt estimators. 15 | 16 | The estimators in the Packt package do not behave exactly like scikit-learn 17 | estimators (by design). They are made to perform the model fit immediately 18 | upon class instantiation. Moreover, many of the hyper-parameter options 19 | are limited to promote readability and avoid confusion. 20 | 21 | The constructor of every Packt estimator should resemble the following:: 22 | 23 | def __init__(self, X, y, *args, **kwargs): 24 | ... 25 | 26 | where ``X`` is the training matrix, ``y`` is the training target variable, 27 | and ``*args`` and ``**kwargs`` are varargs that will differ for each 28 | estimator. 29 | """ 30 | @abstractmethod 31 | def predict(self, X): 32 | """Form predictions based on new data. 33 | 34 | This function must be implemented by subclasses to generate 35 | predictions given the model fit. 36 | 37 | Parameters 38 | ---------- 39 | X : array-like, shape=(n_samples, n_features) 40 | The test array. Should be only finite values. 
41 | """ 42 | -------------------------------------------------------------------------------- /packtml/recommendation/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | import six 6 | from abc import ABCMeta, abstractmethod 7 | 8 | __all__ = [ 9 | 'RecommenderMixin' 10 | ] 11 | 12 | try: 13 | xrange 14 | except NameError: # py3 15 | xrange = range 16 | 17 | 18 | class RecommenderMixin(six.with_metaclass(ABCMeta)): 19 | """Mixin interface for recommenders. 20 | 21 | This class should be inherited by recommender algorithms. It provides an 22 | abstract interface for generating recommendations for a user, and a 23 | function for creating recommendations for all users. 24 | """ 25 | @abstractmethod 26 | def recommend_for_user(self, R, user, n=10, filter_previously_seen=False, 27 | return_scores=True, **kwargs): 28 | """Generate recommendations for a user. 29 | 30 | A method that should be overridden by subclasses to create 31 | recommendations via their own prediction strategy. 32 | """ 33 | 34 | def recommend_for_all_users(self, R, n=10, 35 | filter_previously_seen=False, 36 | return_scores=True, **kwargs): 37 | """Create recommendations for all users.""" 38 | return ( 39 | self.recommend_for_user( 40 | R, user, n=n, filter_previously_seen=filter_previously_seen, 41 | return_scores=return_scores, **kwargs) 42 | for user in xrange(R.shape[0])) 43 | -------------------------------------------------------------------------------- /packtml/metrics/tests/test_ranking.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.metrics.ranking import (mean_average_precision, ndcg_at, 6 | precision_at) 7 | 8 | from numpy.testing import assert_almost_equal 9 | import warnings 10 | 11 | preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5], 12 | [4, 1, 5, 6, 2, 7, 3, 8, 9, 10], 13 | [1, 2, 3, 4, 5]] 14 | 15 | labels = [[1, 2, 3, 4, 5], [1, 2, 3], []] 16 | 17 | 18 | def assert_warning_caught(func): 19 | def test_wrapper(*args, **kwargs): 20 | with warnings.catch_warnings(record=True) as w: 21 | warnings.simplefilter("always") 22 | 23 | # execute the fxn 24 | func(*args, **kwargs) 25 | assert len(w) # assert there's something there... 
26 | return test_wrapper 27 | 28 | 29 | @assert_warning_caught 30 | def test_map(): 31 | assert_almost_equal( 32 | mean_average_precision(preds, labels), 0.35502645502645497) 33 | 34 | 35 | @assert_warning_caught 36 | def test_pak(): 37 | assert_almost_equal(precision_at(preds, labels, 1), 0.33333333333333331) 38 | assert_almost_equal(precision_at(preds, labels, 5), 0.26666666666666666) 39 | assert_almost_equal(precision_at(preds, labels, 15), 0.17777777777777778) 40 | 41 | 42 | @assert_warning_caught 43 | def test_ndcg(): 44 | assert_almost_equal(ndcg_at(preds, labels, 3), 0.3333333432674408) 45 | assert_almost_equal(ndcg_at(preds, labels, 10), 0.48791273434956867) 46 | -------------------------------------------------------------------------------- /packtml/decision_tree/tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.decision_tree.metrics import (entropy, gini_impurity, 6 | InformationGain) 7 | 8 | import numpy as np 9 | from numpy.testing import assert_almost_equal 10 | 11 | 12 | def test_entropy(): 13 | events = np.asarray(9 * [0] + 5 * [1]) # 9/14, 5/14 14 | ent = entropy(events) 15 | assert round(ent, 2) == 0.94, round(ent, 2) 16 | 17 | 18 | def test_gini_impurity(): 19 | x = np.asarray([0] * 10 + [1] * 10) 20 | assert gini_impurity(x) == 0.5 21 | assert gini_impurity(x[:10]) == 0. 22 | 23 | # show that no mixing of gini yields 0.0 24 | assert gini_impurity(np.array([0, 0])) == 0. 25 | 26 | # with SOME mixing we get 0.5 27 | assert gini_impurity(np.array([0, 1])) == 0.5 28 | 29 | # with a lot of mixing we get a number close to 0.8 30 | gi = gini_impurity([0, 1, 2, 3, 4]) 31 | assert_almost_equal(gi, 0.8) 32 | 33 | 34 | def test_information_gain(): 35 | X = np.array([ 36 | [0, 3], 37 | [1, 3], 38 | [2, 1], 39 | [2, 1], 40 | [1, 3] 41 | ]) 42 | 43 | y = np.array([0, 0, 1, 1, 2]) 44 | 45 | uncertainty = gini_impurity(y) 46 | assert_almost_equal(uncertainty, 0.63999999) 47 | mask = X[:, 0] == 0 48 | 49 | # compute the info gain for this mask 50 | infog = InformationGain("gini") 51 | ig = infog(y, mask, uncertainty) 52 | assert_almost_equal(ig, 0.1399999) 53 | -------------------------------------------------------------------------------- /packtml/recommendation/tests/test_als.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.recommendation import ALS 6 | 7 | # make up a ratings matrix... 8 | R = [[1., 0., 3.5, 2., 0., 0., 0., 1.5], 9 | [0., 2., 3., 0., 0., 2.5, 0., 0. ], 10 | [3.5, 4., 2., 0., 4.5, 3.5, 0., 2. ], 11 | [3., 3.5, 0., 2.5, 3., 0., 0., 0. 
]] 12 | 13 | 14 | def test_als_simple_fit(): 15 | als = ALS(R, factors=3, n_iter=5, random_state=42) 16 | assert len(als.train_err) == 5, als.train_err 17 | assert als.n_factors == 3, als.n_factors 18 | 19 | # assert all errors are decreasing over time 20 | errs = list(zip(als.train_err[:-1], als.train_err[1:])) 21 | assert all(new_err < last_err for last_err, new_err in errs), errs 22 | 23 | 24 | def test_als_predict(): 25 | als = ALS(R, factors=4, n_iter=8, random_state=42) 26 | user0, scr = als.recommend_for_user(R, 0, filter_previously_seen=True, 27 | return_scores=True) 28 | 29 | # assert previously-rated items not present 30 | rated = (0, 2, 3, 7) 31 | for r in rated: # previously-rated 32 | assert r not in user0 33 | 34 | # show the score lengths are the same 35 | assert scr.shape[0] == user0.shape[0] 36 | 37 | # now if we do NOT filter, assert those are present again (also, recompute) 38 | user0, scr = als.recommend_for_user(R, 0, filter_previously_seen=False, 39 | return_scores=True, 40 | recompute_user=True) 41 | for r in rated: 42 | assert r in user0 43 | 44 | assert user0.shape[0] == scr.shape[0] 45 | -------------------------------------------------------------------------------- /examples/recommendation/example_item_item_recommender.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.recommendation import ItemItemRecommender 6 | from packtml.recommendation.data import get_completely_fabricated_ratings_data 7 | from packtml.metrics.ranking import mean_average_precision 8 | import numpy as np 9 | 10 | # ############################################################################# 11 | # Use our fabricated data set 12 | R, titles = get_completely_fabricated_ratings_data() 13 | 14 | # ############################################################################# 15 | # Fit an item-item recommender, predict for user 0 16 | rec = ItemItemRecommender(R, k=3) 17 | user0_rec, user_0_preds = rec.recommend_for_user( 18 | R, user=0, filter_previously_seen=True, 19 | return_scores=True) 20 | 21 | # print some info about user 0 22 | top_rated = np.argsort(-R[0, :])[:3] 23 | print("User 0's top 3 rated movies are: %r" % titles[top_rated].tolist()) 24 | print("User 0's top 3 recommended movies are: %r" 25 | % titles[user0_rec[:3]].tolist()) 26 | 27 | # ############################################################################# 28 | # We can score our recommender as well, to determine how well it actually did 29 | 30 | # first, get all user recommendations (top 10, not filtered) 31 | recommendations = list(rec.recommend_for_all_users( 32 | R, n=10, filter_previously_seen=False, 33 | return_scores=False)) 34 | 35 | # get the TRUE items they've rated (in order) 36 | ground_truth = np.argsort(-R, axis=1) 37 | mean_avg_prec = mean_average_precision( 38 | predictions=recommendations, labels=ground_truth) 39 | print("Mean average precision: %.3f" % mean_avg_prec) 40 | -------------------------------------------------------------------------------- /packtml/utils/extmath.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | 7 | __all__ = [ 8 | 'log_likelihood', 9 | 'logistic_sigmoid' 10 | ] 11 | 12 | 13 | def log_likelihood(X, y, w): 14 | """Compute the log-likelihood function. 
15 | 16 | Computes the log-likelihood function over the training data. 17 | The key to the log-likelihood is that the log of the product of 18 | likelihoods becomes the sum of logs. That is (in pseudo-code), 19 | 20 | np.log(np.product([f(i) for i in range(N)])) 21 | 22 | is equivalent to: 23 | 24 | np.sum([np.log(f(i)) for i in range(N)]) 25 | 26 | The log-likelihood function is used in computing the gradient for 27 | our loss function since the derivative of the sum (of logs) is equivalent 28 | to the sum of derivatives, which simplifies all of our math. 29 | 30 | Parameters 31 | ---------- 32 | X : np.ndarray, shape=(n_samples, n_features) 33 | The training data. 34 | 35 | y : np.ndarray, shape=(n_samples,) 36 | The target vector of 1s or 0s. 37 | 38 | w : np.ndarray, shape=(n_features,) 39 | The vector of feature weights (coefficients) 40 | 41 | References 42 | ---------- 43 | .. [1] For a very thorough explanation of the log-likelihood function, see 44 | https://www.coursera.org/learn/ml-classification/lecture/1ZeTC/very-optional-expressing-the-log-likelihood 45 | """ 46 | weighted = X.dot(w) 47 | return (y * weighted - np.log(1. + np.exp(weighted))).sum() 48 | 49 | 50 | def logistic_sigmoid(x): 51 | """The logistic function. 52 | 53 | Compute the logistic (sigmoid) function over a vector, ``x``. 54 | 55 | Parameters 56 | ---------- 57 | x : np.ndarray, shape=(n_samples,) 58 | A vector to transform. 59 | """ 60 | return 1. / (1. + np.exp(-x)) 61 | -------------------------------------------------------------------------------- /examples/run_all_examples.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This function is not intended to be run by students (or anyone, for that 4 | # matter). It is intended to be run by me (Taylor) just to automate the 5 | # population of the img/ directory with the output of the example plots. 6 | # Hence its poor documentation and sheer hackiness. 7 | 8 | from __future__ import absolute_import 9 | 10 | import os 11 | import sys 12 | import subprocess 13 | 14 | # determine where the user is calling this from... 15 | here = os.listdir(".") 16 | if "examples" in here: 17 | cwd = "examples" 18 | img_dir = "img" 19 | elif "clustering" in here: 20 | cwd = "." 21 | img_dir = "../img" 22 | else: 23 | raise ValueError("Call this from top-level or from within " 24 | "the examples dir") 25 | 26 | # iterate all py files 27 | for root, dirs, files in os.walk(cwd, topdown=False): 28 | for fil in files: 29 | # Only run the ones with the appropriate prefix 30 | if not fil.startswith("example_"): 31 | continue 32 | 33 | # Get the module root 34 | module_root = root.split(os.sep)[1] 35 | 36 | # If it's "data" we don't want that! 
That's where we cache the data 37 | # for the demo 38 | if module_root in ("data", ".ipynb_checkpoints"): 39 | print("Skipping dir: %s" % module_root) 40 | continue 41 | 42 | # Otherwise create its corresponding path in ../img 43 | image_root = os.path.join(img_dir, module_root) # ../img/clustering 44 | 45 | # create the directory in the image dir if it's not there 46 | if not os.path.exists(image_root): 47 | os.mkdir(image_root) 48 | 49 | # run it 50 | dest = os.path.join(image_root, fil[:-3] + ".png") 51 | filexec = os.path.join(root, fil) 52 | 53 | print("Running %s" % filexec) 54 | subprocess.Popen([sys.executable, filexec, dest]) 55 | 56 | sys.exit(0) 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # scratch code 2 | scratch/ 3 | 4 | # Any data unpackaged by tensorflow 5 | MNIST_data/ 6 | 7 | # In-progress word docs 8 | ~$*.doc* 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # Ignore PyCharm stuff... 16 | .idea/ 17 | 18 | # Mac stuff 19 | .DS_Store 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Testing 25 | .pytest_cache/ 26 | 27 | # Distribution / packaging 28 | .Python 29 | env/ 30 | build/ 31 | develop-eggs/ 32 | dist/ 33 | downloads/ 34 | eggs/ 35 | .eggs/ 36 | lib/ 37 | lib64/ 38 | parts/ 39 | sdist/ 40 | var/ 41 | wheels/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | .hypothesis/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | local_settings.py 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # celery beat schedule file 95 | celerybeat-schedule 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # dotenv 101 | .env 102 | 103 | # virtualenv 104 | .venv 105 | venv/ 106 | ENV/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | -------------------------------------------------------------------------------- /examples/decision_tree/example_regression_decision_tree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.decision_tree import CARTRegressor 6 | from sklearn.metrics import mean_squared_error 7 | from sklearn.model_selection import train_test_split 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import sys 11 | 12 | # ############################################################################# 13 | # Create a classification dataset 14 | rs = np.random.RandomState(42) 15 | X = np.sort(5 * rs.rand(80, 1), axis=0) 16 | y = np.sin(X).ravel() 17 | 
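# note: the target above is a noiseless sine curve over sorted X, so the deeper
# tree fit below should track it much more closely than the shallow one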
18 | # split the data 19 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 20 | 21 | # ############################################################################# 22 | # Fit a simple decision tree regressor and get predictions 23 | clf = CARTRegressor(X_train, y_train, max_depth=3, random_state=42) 24 | pred = clf.predict(X_test) 25 | clf_mse = mean_squared_error(y_test, pred) 26 | print("Test MSE (depth=3): %.3f" % clf_mse) 27 | 28 | # Fit a deeper tree and show accuracy increases 29 | clf2 = CARTRegressor(X_train, y_train, max_depth=10, random_state=42) 30 | pred2 = clf2.predict(X_test) 31 | clf2_mse = mean_squared_error(y_test, pred2) 32 | print("Test MSE (depth=10): %.3f" % clf2_mse) 33 | 34 | # ############################################################################# 35 | # Visualize difference in learning ability 36 | 37 | x = X_train.ravel() 38 | xte = X_test.ravel() 39 | 40 | fig, axes = plt.subplots(1, 2, figsize=(12, 8)) 41 | axes[0].scatter(x, y_train, alpha=0.25, c='r') 42 | axes[0].scatter(xte, pred, alpha=1.) 43 | axes[0].set_title("Shallow tree (depth=3) test MSE: %.3f" % clf_mse) 44 | 45 | axes[1].scatter(x, y_train, alpha=0.4, c='r') 46 | axes[1].scatter(xte, pred2, alpha=1.) 47 | axes[1].set_title("Deeper tree (depth=10) test MSE: %.3f" % clf2_mse) 48 | 49 | # if we're supposed to save it, do so INSTEAD OF showing it 50 | if len(sys.argv) > 1: 51 | plt.savefig(sys.argv[1]) 52 | else: 53 | plt.show() 54 | -------------------------------------------------------------------------------- /examples/regression/example_linear_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.regression import SimpleLinearRegression 6 | from sklearn.linear_model import LinearRegression 7 | from sklearn.model_selection import train_test_split 8 | from matplotlib import pyplot as plt 9 | import numpy as np 10 | import sys 11 | 12 | # ############################################################################# 13 | # Create a data-set that perfectly models the linear relationship: 14 | # y = 2a + 1.5b + 0 15 | random_state = np.random.RandomState(42) 16 | X = random_state.rand(500, 2) 17 | y = 2. 
* X[:, 0] + 1.5 * X[:, 1] 18 | 19 | # split the data 20 | X_train, X_test, y_train, y_test = train_test_split(X, y, 21 | random_state=random_state) 22 | 23 | # ############################################################################# 24 | # Fit a simple linear regression, produce predictions 25 | lm = SimpleLinearRegression(X_train, y_train) 26 | predictions = lm.predict(X_test) 27 | print("Test sum of residuals: %.3f" % (y_test - predictions).sum()) 28 | assert np.allclose(lm.theta, [2., 1.5]) 29 | 30 | # ############################################################################# 31 | # Show that our solution is similar to scikit-learn's 32 | 33 | lr = LinearRegression(fit_intercept=True) 34 | lr.fit(X_train, y_train) 35 | assert np.allclose(lm.theta, lr.coef_) 36 | assert np.allclose(predictions, lr.predict(X_test)) 37 | 38 | # ############################################################################# 39 | # Fit another on ONE feature so we can show the plot 40 | X_train = X_train[:, np.newaxis, 0] 41 | X_test = X_test[:, np.newaxis, 0] 42 | lm = SimpleLinearRegression(X_train, y_train) 43 | 44 | # create the predictions & plot them as the line 45 | preds = lm.predict(X_test) 46 | plt.scatter(X_test[:, 0], y_test, color='black') 47 | plt.plot(X_test[:, 0], preds, linewidth=3) 48 | 49 | # if we're supposed to save it, do so INSTEAD OF showing it 50 | if len(sys.argv) > 1: 51 | plt.savefig(sys.argv[1]) 52 | else: 53 | plt.show() 54 | -------------------------------------------------------------------------------- /examples/recommendation/example_als_recommender.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.recommendation import ALS 6 | from packtml.recommendation.data import get_completely_fabricated_ratings_data 7 | from packtml.metrics.ranking import mean_average_precision 8 | from matplotlib import pyplot as plt 9 | import numpy as np 10 | import sys 11 | 12 | # ############################################################################# 13 | # Use our fabricated data set 14 | R, titles = get_completely_fabricated_ratings_data() 15 | 16 | # ############################################################################# 17 | # Fit an item-item recommender, predict for user 0 18 | n_iter = 25 19 | rec = ALS(R, factors=5, n_iter=n_iter, random_state=42, lam=0.01) 20 | user0_rec, user_0_preds = rec.recommend_for_user( 21 | R, user=0, filter_previously_seen=True, 22 | return_scores=True) 23 | 24 | # print some info about user 0 25 | top_rated = np.argsort(-R[0, :])[:3] 26 | print("User 0's top 3 rated movies are: %r" % titles[top_rated].tolist()) 27 | print("User 0's top 3 recommended movies are: %r" 28 | % titles[user0_rec[:3]].tolist()) 29 | 30 | # ############################################################################# 31 | # We can score our recommender as well, to determine how well it actually did 32 | 33 | # first, get all user recommendations (top 10, not filtered) 34 | recommendations = list(rec.recommend_for_all_users( 35 | R, n=10, filter_previously_seen=False, 36 | return_scores=False)) 37 | 38 | # get the TRUE items they've rated (in order) 39 | ground_truth = np.argsort(-R, axis=1) 40 | mean_avg_prec = mean_average_precision( 41 | predictions=recommendations, labels=ground_truth) 42 | print("Mean average precision: %.3f" % mean_avg_prec) 43 | 44 | # plot the error 45 | plt.plot(np.arange(n_iter), rec.train_err) 46 | 
plt.xlabel("Iteration") 47 | plt.ylabel("MSE") 48 | plt.title("Train error by iteration") 49 | 50 | # if we're supposed to save it, do so INSTEAD OF showing it 51 | if len(sys.argv) > 1: 52 | plt.savefig(sys.argv[1]) 53 | else: 54 | plt.show() 55 | -------------------------------------------------------------------------------- /packtml/neural_net/tests/test_transfer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.neural_net import NeuralNetClassifier, TransferLearningClassifier 6 | 7 | import numpy as np 8 | 9 | 10 | def test_transfer_learner(): 11 | rs = np.random.RandomState(42) 12 | covariance = [[1, .75], [.75, 1]] 13 | 14 | # these are the majority classes 15 | n_obs = 500 16 | x1 = rs.multivariate_normal(mean=[0, 0], cov=covariance, size=n_obs) 17 | x2 = rs.multivariate_normal(mean=[1, 5], cov=covariance, size=n_obs) 18 | 19 | # this is the minority class 20 | x3 = rs.multivariate_normal(mean=[0.85, 3.25], 21 | cov=[[1., .5], [1.25, 0.85]], 22 | size=150) 23 | 24 | # this is what the FIRST network will be trained on 25 | n_first = 400 26 | X = np.vstack((x1[:n_first], x2[:n_first])).astype(np.float32) 27 | y = np.hstack((np.zeros(n_first), np.ones(n_first))).astype(int) 28 | 29 | # this is what the SECOND network will be trained on 30 | X2 = np.vstack((x1[n_first:], x2[n_first:], x3)).astype(np.float32) 31 | y2 = np.hstack((np.zeros(n_obs - n_first), 32 | np.ones(n_obs - n_first), 33 | np.ones(x3.shape[0]) * 2)).astype(int) 34 | 35 | # Fit the first neural network 36 | clf = NeuralNetClassifier(X, y, hidden=(25, 25), n_iter=50, 37 | learning_rate=0.001, random_state=42) 38 | 39 | # Fit the transfer network - train one more layer with a new class 40 | transfer = TransferLearningClassifier(X2, y2, pretrained=clf, hidden=(15,), 41 | n_iter=10, random_state=42) 42 | 43 | # show we can predict 44 | transfer.predict(X2) 45 | 46 | # show we can use a transfer learner on an existing transfer learner 47 | transfer2 = TransferLearningClassifier(X2, y2, pretrained=transfer, 48 | hidden=(25,), 49 | random_state=15) 50 | 51 | # and show we can still predict 52 | transfer2.predict(X2) 53 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | import sys 6 | import setuptools 7 | 8 | with open("packtml/VERSION", 'r') as vsn: 9 | VERSION = vsn.read().strip() 10 | 11 | # Permitted args: "install" only, basically. 12 | UNSUPPORTED_COMMANDS = { # this is a set literal, not a dict 13 | 'develop', 'release', 'bdist_egg', 'bdist_rpm', 14 | 'bdist_wininst', 'install_egg_info', 'build_sphinx', 15 | 'egg_info', 'easy_install', 'upload', 'bdist_wheel', 16 | '--single-version-externally-managed', 'test', 'build_ext' 17 | } 18 | 19 | intersect = UNSUPPORTED_COMMANDS.intersection(set(sys.argv)) 20 | if intersect: 21 | msg = "The following arguments are unsupported: %s. " \ 22 | "To install, please use `python setup.py install`." \ 23 | % str(list(intersect)) 24 | 25 | # if "test" is in the arguments, make sure the user knows how to test. 
26 | if "test" in intersect: 27 | msg += " To test, make sure pytest is installed, and after " \ 28 | "installation run `pytest packtml`" 29 | 30 | raise ValueError(msg) 31 | 32 | # get requirements 33 | with open("requirements.txt") as req: 34 | REQUIREMENTS = req.read().strip().split("\n") 35 | 36 | py_version_tag = '-%s.%s'.format(sys.version_info[:2]) 37 | setuptools.setup(name="packtml", 38 | description="Hands-on Supervised Learning - teach a machine " 39 | "to think for itself!", 40 | author="Taylor G Smith", 41 | author_email="taylor.smith@alkaline-ml.com", 42 | packages=['packtml', 43 | 'packtml/clustering', 44 | 'packtml/decision_tree', 45 | 'packtml/metrics', 46 | 'packtml/neural_net', 47 | 'packtml/recommendation', 48 | 'packtml/regression', 49 | 'packtml/utils'], 50 | zip_safe=False, 51 | include_package_data=True, 52 | install_requires=REQUIREMENTS, 53 | package_data={"packtml": ["*"]}, 54 | python_requires='>=3.5, <4', 55 | version=VERSION) 56 | -------------------------------------------------------------------------------- /examples/clustering/example_knn_classifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.clustering import KNNClassifier 6 | from packtml.utils.plotting import add_decision_boundary_to_axis 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.metrics import accuracy_score 10 | from sklearn.datasets import load_iris 11 | from matplotlib import pyplot as plt 12 | from matplotlib.colors import ListedColormap 13 | import sys 14 | 15 | # ############################################################################# 16 | # Create a classification sub-dataset using iris 17 | iris = load_iris() 18 | X = iris.data[:, :2] # just use the first two dimensions 19 | y = iris.target 20 | 21 | # split data 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 23 | 24 | # scale the data 25 | scaler = StandardScaler() 26 | X_train = scaler.fit_transform(X_train) 27 | X_test = scaler.transform(X_test) 28 | 29 | # ############################################################################# 30 | # Fit a k-nearest neighbor model and get predictions 31 | k=10 32 | clf = KNNClassifier(X_train, y_train, k=k) 33 | pred = clf.predict(X_test) 34 | clf_accuracy = accuracy_score(y_test, pred) 35 | print("Test accuracy: %.3f" % clf_accuracy) 36 | 37 | # ############################################################################# 38 | # Visualize difference in classes (this is from the scikit-learn KNN 39 | # plotting example: 40 | # http://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html#sphx-glr-auto-examples-neighbors-plot-classification-py) 41 | 42 | xx, yy, _ = add_decision_boundary_to_axis(estimator=clf, axis=plt, 43 | nclasses=3, X_data=X_test) 44 | 45 | # Plot also the training points 46 | plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, 47 | cmap=ListedColormap(['#FF0000', '#00FF00', '#0000FF']), 48 | edgecolor='k', s=20) 49 | 50 | plt.xlim(xx.min(), xx.max()) 51 | plt.ylim(yy.min(), yy.max()) 52 | plt.title("3-Class classification (k=%i)" % k) 53 | 54 | # if we're supposed to save it, do so INSTEAD OF showing it 55 | if len(sys.argv) > 1: 56 | plt.savefig(sys.argv[1]) 57 | else: 58 | plt.show() 59 | -------------------------------------------------------------------------------- 
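A stripped-down sketch of the KNN estimator outside the plotting example above, following the fit-at-construction pattern described in packtml/base.py and reusing the toy data from packtml/clustering/tests/test_knn.py (both shown earlier):

import numpy as np
from packtml.clustering import KNNClassifier

X2 = np.array([[0., 0., 0.5], [0., 0.5, 0.], [0.5, 0., 0.],
               [5., 5., 6.], [6., 5., 5.]])
y2 = [0, 0, 0, 1, 1]

knn = KNNClassifier(X2, y2, k=3)  # the model is fit immediately at construction
print(knn.predict(X2))            # -> [0 0 0 1 1] on this toy data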
/examples/regression/example_logistic_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.regression import SimpleLogisticRegression 6 | from packtml.utils.plotting import add_decision_boundary_to_axis 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.datasets import make_classification 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.metrics import accuracy_score 11 | from matplotlib import pyplot as plt 12 | import sys 13 | 14 | # ############################################################################# 15 | # Create an almost perfectly linearly-separable classification set 16 | X, y = make_classification(n_samples=100, n_features=2, random_state=42, 17 | n_redundant=0, n_repeated=0, n_classes=2, 18 | class_sep=1.0) 19 | 20 | # split data 21 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 22 | 23 | # ############################################################################# 24 | # Fit a simple logistic regression, produce predictions 25 | lm = SimpleLogisticRegression(X_train, y_train, n_steps=50) 26 | 27 | predictions = lm.predict(X_test) 28 | acc = accuracy_score(y_test, predictions) 29 | print("Test accuracy: %.3f" % acc) 30 | 31 | # Show that our solution is similar to scikit-learn's 32 | lr = LogisticRegression(fit_intercept=True, C=1e16) # almost no regularization 33 | lr.fit(X_train, y_train) 34 | print("Sklearn test accuracy: %.3f" % accuracy_score(y_test, 35 | lr.predict(X_test))) 36 | 37 | # ############################################################################# 38 | # Plot the data and the boundary we learned. 39 | 40 | add_decision_boundary_to_axis(estimator=lm, axis=plt, 41 | nclasses=2, X_data=X_test) 42 | 43 | # We have to break this into two plot calls, one for each class to 44 | # have different markers... 
45 | c0_mask = y_test == 0 46 | plt.scatter(X_test[c0_mask, 0], X_test[c0_mask, 1], 47 | c=~predictions[c0_mask], marker='o') 48 | plt.scatter(X_test[~c0_mask, 0], X_test[~c0_mask, 1], 49 | c=~predictions[~c0_mask], marker='x') 50 | 51 | plt.title("Logistic test performance: %.4f (o=true 0, x=true 1)" % acc) 52 | 53 | # if we're supposed to save it, do so INSTEAD OF showing it 54 | if len(sys.argv) > 1: 55 | plt.savefig(sys.argv[1]) 56 | else: 57 | plt.show() 58 | -------------------------------------------------------------------------------- /examples/decision_tree/example_classification_decision_tree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.decision_tree import CARTClassifier 6 | from packtml.utils.plotting import add_decision_boundary_to_axis 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.model_selection import train_test_split 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import sys 12 | 13 | # ############################################################################# 14 | # Create a classification dataset 15 | rs = np.random.RandomState(42) 16 | covariance = [[1, .75], [.75, 1]] 17 | n_obs = 500 18 | x1 = rs.multivariate_normal(mean=[0, 0], cov=covariance, size=n_obs) 19 | x2 = rs.multivariate_normal(mean=[1, 3], cov=covariance, size=n_obs) 20 | 21 | X = np.vstack((x1, x2)).astype(np.float32) 22 | y = np.hstack((np.zeros(n_obs), np.ones(n_obs))) 23 | 24 | # split the data 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 26 | 27 | # ############################################################################# 28 | # Fit a simple decision tree classifier and get predictions 29 | shallow_depth = 2 30 | clf = CARTClassifier(X_train, y_train, max_depth=shallow_depth, criterion='gini', 31 | random_state=42) 32 | pred = clf.predict(X_test) 33 | clf_accuracy = accuracy_score(y_test, pred) 34 | print("Test accuracy (depth=%i): %.3f" % (shallow_depth, clf_accuracy)) 35 | 36 | # Fit a deeper tree and show accuracy increases 37 | clf2 = CARTClassifier(X_train, y_train, max_depth=25, criterion='gini', 38 | random_state=42) 39 | pred2 = clf2.predict(X_test) 40 | clf2_accuracy = accuracy_score(y_test, pred2) 41 | print("Test accuracy (depth=25): %.3f" % clf2_accuracy) 42 | 43 | # ############################################################################# 44 | # Visualize difference in classification ability 45 | 46 | fig, axes = plt.subplots(1, 2, figsize=(12, 8)) 47 | 48 | add_decision_boundary_to_axis(estimator=clf, axis=axes[0], 49 | nclasses=2, X_data=X_test) 50 | axes[0].scatter(X_test[:, 0], X_test[:, 1], c=pred, alpha=0.4) 51 | axes[0].set_title("Shallow tree (depth=%i) performance: %.3f" 52 | % (shallow_depth, clf_accuracy)) 53 | 54 | add_decision_boundary_to_axis(estimator=clf2, axis=axes[1], 55 | nclasses=2, X_data=X_test) 56 | axes[1].scatter(X_test[:, 0], X_test[:, 1], c=pred2, alpha=0.4) 57 | axes[1].set_title("Deep tree (depth=25) performance: %.3f" % clf2_accuracy) 58 | 59 | # if we're supposed to save it, do so INSTEAD OF showing it 60 | if len(sys.argv) > 1: 61 | plt.savefig(sys.argv[1]) 62 | else: 63 | plt.show() 64 | -------------------------------------------------------------------------------- /packtml/recommendation/tests/test_itemitem.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from 
__future__ import absolute_import 4 | 5 | from packtml.recommendation import ItemItemRecommender 6 | 7 | import numpy as np 8 | from numpy.testing import assert_array_almost_equal 9 | 10 | from types import GeneratorType 11 | 12 | # make up a ratings matrix... 13 | R = np.array([[1., 0., 3.5, 2., 0., 0., 0., 1.5], 14 | [0., 2., 3., 0., 0., 2.5, 0., 0. ], 15 | [3.5, 4., 2., 0., 4.5, 3.5, 0., 2. ], 16 | [3., 3.5, 0., 2.5, 3., 0., 0., 0. ]]) 17 | 18 | 19 | def test_itemitem_simple(): 20 | rec = ItemItemRecommender(R, k=3) 21 | 22 | # assert on the similarity 23 | expected = np.array([ 24 | [ 1. , 0.91461057, 0. , 0. , 0.9701687 , 25 | 0. , 0. , 0. ], 26 | [ 0.91461057, 1. , 0. , 0. , 0.92793395, 27 | 0. , 0. , 0. ], 28 | [ 0. , 0. , 1. , 0. , 0. , 29 | 0.6708902 , 0. , 0.73632752], 30 | [ 0.62906665, 0.48126166, 0. , 1. , 0. , 31 | 0. , 0. , 0. ], 32 | [ 0.9701687 , 0.92793395, 0. , 0. , 1. , 33 | 0. , 0. , 0. ], 34 | [ 0. , 0.77786258, 0. , 0. , 0.67706717, 35 | 1. , 0. , 0. ], 36 | [ 0. , 0. , 0. , 0. , 0. , 37 | 0. , 0. , 0. ], 38 | [ 0.72079856, 0. , 0.73632752, 0. , 0. , 39 | 0. , 0. , 1. ]]) 40 | 41 | assert_array_almost_equal(expected, rec.similarity) 42 | 43 | # show we can generate recommendations 44 | rec0, scores0 = rec.recommend_for_user(R, 0) 45 | 46 | # we didn't filter, so the rated items should still be present 47 | assert np.in1d([0, 2, 3, 7], rec0).all() 48 | 49 | # re-compute and show the previously-rated are not present 50 | rec0_filtered, scores0_filtered = rec.recommend_for_user( 51 | R, 0, filter_previously_seen=True) 52 | 53 | assert len(rec0_filtered) == 4, rec0_filtered 54 | assert rec0_filtered.tolist() == [5, 1, 4, 6] 55 | 56 | # test the prediction, which is just a big product... 57 | pred = rec.predict(R) 58 | assert pred.shape == R.shape 59 | 60 | # get recommendations for ALL users 61 | recommendations = rec.recommend_for_all_users(R, return_scores=False, 62 | filter_previously_seen=False) 63 | 64 | assert isinstance(recommendations, GeneratorType) 65 | recs = list(recommendations) 66 | assert len(recs) == 4 67 | assert all(len(x) == 8 for x in recs) 68 | -------------------------------------------------------------------------------- /packtml/recommendation/data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | 7 | __all__ = [ 8 | 'get_completely_fabricated_ratings_data' 9 | ] 10 | 11 | 12 | def get_completely_fabricated_ratings_data(): 13 | """Disclaimer: this is a made-up data set. 14 | 15 | Get a ratings data set for use with one of the packtml recommenders. 16 | This data set is a completely made-up ratings matrix consisting of 17 | cult classics, all of which are awesome (seriously, if there are any 18 | you haven't seen, you should). 19 | 20 | (Please 21 | don't 22 | sue 23 | 24 | me......) 25 | 26 | The data contains 5 users and 15 items (movies). Movies: 27 | 28 | 0) Ghost Busters 29 | 1) Ghost Busters 2 30 | 2) The Goonies 31 | 3) Big Trouble in Little China 32 | 4) The Rocky Horror Picture Show 33 | 5) A Clockwork Orange 34 | 6) Pulp Fiction 35 | 7) Bill & Ted's Excellent Adventure 36 | 8) Weekend at Bernie's 37 | 9) Dumb and Dumber 38 | 10) Clerks 39 | 11) Jay & Silent Bob Strike Back 40 | 12) Tron 41 | 13) Total Recall 42 | 14) The Princess Bride 43 | 44 | Notes 45 | ----- 46 | Seriously, I fabricated all of these ratings semi-haphazardly. Don't 47 | take this as me bashing any movies. 
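    Examples
    --------
    A quick shape check (illustrative):

    >>> R, titles = get_completely_fabricated_ratings_data()
    >>> R.shape
    (5, 15)
    >>> titles.shape
    (15,)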
48 | """ 49 | return (np.array([ 50 | # user 0 is a classic 30-yo millennial who is nostalgic for the 90s 51 | [5.0, 3.5, 5.0, 0.0, 0.0, 0.0, 4.5, 3.0, 52 | 0.0, 2.5, 4.0, 4.0, 0.0, 1.5, 3.0], 53 | 54 | # user 1 is a 40-yo who only likes action 55 | [1.5, 0.0, 0.0, 1.0, 0.0, 4.0, 5.0, 0.0, 56 | 2.0, 0.0, 3.0, 3.5, 0.0, 4.0, 0.0], 57 | 58 | # user 2 is a 12-yo whose parents are strict about what she watches. 59 | [4.5, 4.0, 5.0, 0.0, 0.0, 0.0, 0.0, 4.0, 60 | 3.5, 5.0, 0.0, 0.0, 0.0, 0.0, 5.0], 61 | 62 | # user 3 has just about seen it all, and doesn't really care for 63 | # the goofy stuff. (but seriously, who rates the Goonies 2/5???) 64 | [2.0, 1.0, 2.0, 1.0, 2.5, 4.5, 4.5, 0.5, 65 | 1.5, 1.0, 2.0, 2.5, 3.5, 3.5, 2.0], 66 | 67 | # user 4 has just opened a netflix account and hasn't had a chance 68 | # to watch too much 69 | [0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 70 | 0.0, 0.0, 0.0, 1.5, 4.0, 0.0, 0.0], 71 | ]), np.array(["Ghost Busters", "Ghost Busters 2", 72 | "The Goonies", "Big Trouble in Little China", 73 | "The Rocky Horror Picture Show", "A Clockwork Orange", 74 | "Pulp Fiction", "Bill & Ted's Excellent Adventure", 75 | "Weekend at Bernie's", "Dumb and Dumber", "Clerks", 76 | "Jay & Silent Bob Strike Back", "Tron", "Total Recall", 77 | "The Princess Bride" ])) 78 | -------------------------------------------------------------------------------- /examples/neural_net/example_mlp_classifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.neural_net import NeuralNetClassifier 6 | from packtml.utils.plotting import add_decision_boundary_to_axis 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.model_selection import train_test_split 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import sys 12 | 13 | # ############################################################################# 14 | # Create a classification dataset 15 | rs = np.random.RandomState(42) 16 | covariance = [[1, .75], [.75, 1]] 17 | n_obs = 1000 18 | x1 = rs.multivariate_normal(mean=[0, 0], cov=covariance, size=n_obs) 19 | x2 = rs.multivariate_normal(mean=[1, 5], cov=covariance, size=n_obs) 20 | 21 | X = np.vstack((x1, x2)).astype(np.float32) 22 | y = np.hstack((np.zeros(n_obs), np.ones(n_obs))).astype(int) 23 | 24 | # split the data 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rs) 26 | 27 | # ############################################################################# 28 | # Fit a simple neural network 29 | n_iter = 4 30 | hidden = (10,) 31 | clf = NeuralNetClassifier(X_train, y_train, hidden=hidden, n_iter=n_iter, 32 | learning_rate=0.001, random_state=42) 33 | print("Loss per training iteration: %r" % clf.train_loss) 34 | 35 | pred = clf.predict(X_test) 36 | clf_accuracy = accuracy_score(y_test, pred) 37 | print("Test accuracy (hidden=%s): %.3f" % (str(hidden), clf_accuracy)) 38 | 39 | # ############################################################################# 40 | # Fit a more complex neural network 41 | n_iter2 = 150 42 | hidden2 = (25, 25) 43 | clf2 = NeuralNetClassifier(X_train, y_train, hidden=hidden2, n_iter=n_iter2, 44 | learning_rate=0.001, random_state=42) 45 | 46 | pred2 = clf2.predict(X_test) 47 | clf_accuracy2 = accuracy_score(y_test, pred2) 48 | print("Test accuracy (hidden=%s): %.3f" % (str(hidden2), clf_accuracy2)) 49 | 50 | # ############################################################################# 51 | 
# Visualize difference in classification ability 52 | 53 | fig, axes = plt.subplots(2, 2, figsize=(12, 8)) 54 | 55 | add_decision_boundary_to_axis(estimator=clf, axis=axes[0, 0], 56 | nclasses=2, X_data=X_test) 57 | axes[0, 0].scatter(X_test[:, 0], X_test[:, 1], c=pred, alpha=0.4) 58 | axes[0, 0].set_title("Shallow (hidden=%s @ %i iter) test accuracy: %.3f" 59 | % (str(hidden), n_iter, clf_accuracy)) 60 | 61 | add_decision_boundary_to_axis(estimator=clf2, axis=axes[0, 1], 62 | nclasses=2, X_data=X_test) 63 | axes[0, 1].scatter(X_test[:, 0], X_test[:, 1], c=pred2, alpha=0.4) 64 | axes[0, 1].set_title("Deeper (hidden=%s @ %i iter): test accuracy: %.3f" 65 | % (str(hidden2), n_iter2, clf_accuracy2)) 66 | 67 | # show the learning rates for each 68 | axes[1, 0].plot(np.arange(len(clf.train_loss)), clf.train_loss) 69 | axes[1, 0].set_title("Training loss by iteration") 70 | 71 | axes[1, 1].plot(np.arange(len(clf2.train_loss)), clf2.train_loss) 72 | axes[1, 1].set_title("Training loss by iteration") 73 | 74 | # if we're supposed to save it, do so INSTEAD OF showing it 75 | if len(sys.argv) > 1: 76 | plt.savefig(sys.argv[1]) 77 | else: 78 | plt.show() 79 | -------------------------------------------------------------------------------- /packtml/regression/simple_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from sklearn.utils.validation import check_X_y, check_array 6 | 7 | import numpy as np 8 | from numpy.linalg import lstsq 9 | 10 | from packtml.base import BaseSimpleEstimator 11 | 12 | 13 | __all__ = [ 14 | 'SimpleLinearRegression' 15 | ] 16 | 17 | 18 | class SimpleLinearRegression(BaseSimpleEstimator): 19 | """Simple linear regression. 20 | 21 | This class provides a very simple example of straight forward OLS 22 | regression with an intercept. There are no tunable parameters, and 23 | the model fit happens directly on class instantiation. 24 | 25 | Parameters 26 | ---------- 27 | X : array-like, shape=(n_samples, n_features) 28 | The array of predictor variables. This is the array we will use 29 | to regress on ``y``. 30 | 31 | y : array-like, shape=(n_samples,) 32 | This is the target array on which we will regress to build 33 | our model. 34 | 35 | Attributes 36 | ---------- 37 | theta : array-like, shape=(n_features,) 38 | The least-squares solution (the coefficients) 39 | 40 | rank : int 41 | The rank of the predictor matrix, ``X`` 42 | 43 | singular_values : array-like, shape=(n_features,) 44 | The singular values of ``X`` 45 | 46 | X_means : array-like, shape=(n_features,) 47 | The column means of the predictor matrix, ``X`` 48 | 49 | y_mean : float 50 | The mean of the target variable, ``y`` 51 | 52 | intercept : float 53 | The intercept term 54 | """ 55 | def __init__(self, X, y): 56 | # First check X, y and make sure they are of equal length, no NaNs 57 | # and that they are numeric 58 | X, y = check_X_y(X, y, y_numeric=True, 59 | accept_sparse=False) # keep it simple 60 | 61 | # Next, we want to scale all of our features so X is centered 62 | # We will do the same with our target variable, y 63 | X_means = np.average(X, axis=0) 64 | y_mean = y.mean(axis=0) 65 | 66 | # don't do in place, so we get a copy 67 | X = X - X_means 68 | y = y - y_mean 69 | 70 | # Let's compute the least squares on X wrt y 71 | # Least squares solves the equation `a x = b` by computing a 72 | # vector `x` that minimizes the Euclidean 2-norm `|| b - a x ||^2`. 
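        # An added aside for intuition (not part of the original comment): because
        # X and y have been centered above, lstsq only has to estimate the slope
        # coefficients; the intercept is recovered further down as
        # y_mean - X_means.dot(theta). In the single-feature case the solution
        # reduces to theta = sum(x * y) / sum(x ** 2).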
73 | theta, _, rank, singular_values = lstsq(X, y, rcond=None) 74 | 75 | # finally, we compute the intercept values as the mean of the target 76 | # variable MINUS the inner product of the X_means and the coefficients 77 | intercept = y_mean - np.dot(X_means, theta.T) 78 | 79 | # ... and set everything as an instance attribute 80 | self.theta = theta 81 | self.rank = rank 82 | self.singular_values = singular_values 83 | 84 | # we have to retain some of the statistics around the data too 85 | self.X_means = X_means 86 | self.y_mean = y_mean 87 | self.intercept = intercept 88 | 89 | def predict(self, X): 90 | """Compute new predictions for X""" 91 | # copy, make sure numeric, etc... 92 | X = check_array(X, accept_sparse=False, copy=False) # type: np.ndarray 93 | 94 | # make sure dims match 95 | theta = self.theta 96 | if theta.shape[0] != X.shape[1]: 97 | raise ValueError("Dim mismatch in predictors!") 98 | 99 | # creates a copy 100 | return np.dot(X, theta.T) + self.intercept 101 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # AppVeyor.com is a Continuous Integration service to build and run tests under 2 | # Windows. This .yml file is based on scikit-learn and statsmodels' Appveyor CI 3 | # setups, adapted for use with the Hands-on Supervised Learning repo 4 | 5 | # This image contains the most pre-installed software (including supposedly 6 | # MinGW and Miniconda?...) 7 | image: 8 | - Visual Studio 2015 9 | 10 | cache: 11 | - '%LOCALAPPDATA%\pip\Cache' 12 | 13 | environment: 14 | global: 15 | APPVEYOR_SAVE_CACHE_ON_ERROR: false 16 | TEST_TIMEOUT: 1000 17 | # SDK v7.0 MSVC Express 2008's SetEnv.cmd script will fail if the 18 | # /E:ON and /V:ON options are not enabled in the batch script interpreter 19 | # See: http://stackoverflow.com/a/13751649/163740 20 | # CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\build_tools\\appveyor\\run_with_env.cmd" 21 | 22 | matrix: 23 | - PYTHON: C:\Miniconda35 24 | PYTHON_VERSION: 3.5 25 | PYTHON_ARCH: 32 26 | 27 | - PYTHON: C:\Miniconda35-x64 28 | PYTHON_VERSION: 3.5 29 | PYTHON_ARCH: 64 30 | 31 | # Currently failing due to Appveyor bugs? 32 | # - PYTHON: C:\Miniconda36 33 | # PYTHON_VERSION: 3.6 34 | # PYTHON_ARCH: 32 35 | # 36 | # - PYTHON: C:\Miniconda36-x64 37 | # PYTHON_VERSION: 3.6 38 | # PYTHON_ARCH: 64 39 | 40 | init: 41 | - "ECHO \"%APPVEYOR_SCHEDULED_BUILD%\"" 42 | 43 | # If there is a newer build queued for the same PR, cancel this one. 44 | # The AppVeyor 'rollout builds' option is supposed to serve the same 45 | # purpose but it is problematic because it tends to cancel builds pushed 46 | # directly to master instead of just PR builds (or the converse). 47 | # credits: JuliaLang developers. 48 | - ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod ` 49 | https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | ` 50 | Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { ` 51 | Write-Host "There are newer queued builds for this pull request, skipping build." 52 | Exit-AppveyorBuild 53 | } 54 | - ps: | 55 | If (($env:SKIP_NOTAG -eq "true") -and ($env:APPVEYOR_REPO_TAG -ne "true")) { 56 | Write-Host "Skipping build, not at a tag." 
57 | Exit-AppveyorBuild 58 | } 59 | 60 | install: 61 | - C:\cygwin\bin\du -hs "%LOCALAPPDATA%\pip\Cache" 62 | 63 | # Prepend Miniconda to the PATH of this build (this cannot be 64 | # done from inside the powershell script as it would require to restart 65 | # the parent CMD process). 66 | - SET PATH=%PYTHON%;%PYTHON%\Scripts;%PYTHON%\Library\bin;%PATH% 67 | 68 | # Setup the conda config 69 | - conda config --set always_yes yes 70 | - ps: conda create -n testenv --yes python=$env:PYTHON_VERSION 71 | - activate testenv 72 | - pip install -r requirements.txt 73 | - pip install pytest 74 | 75 | build_script: 76 | # set up the package 77 | - python setup.py install 78 | 79 | after_build: 80 | # Remove old or huge cache files to hopefully not exceed the 1GB cache limit. 81 | # 82 | # If the cache limit is reached, the cache will not be updated (of not even 83 | # created in the first run). So this is a trade of between keeping the cache 84 | # current and having a cache at all. 85 | # NB: This is done only `on_success` since the cache in uploaded only on 86 | # success anyway. 87 | - C:\cygwin\bin\find "%LOCALAPPDATA%\pip" -type f -mtime +360 -delete 88 | - C:\cygwin\bin\find "%LOCALAPPDATA%\pip" -type f -size +10M -delete 89 | - C:\cygwin\bin\find "%LOCALAPPDATA%\pip" -empty -delete 90 | 91 | # Show size of cache 92 | - C:\cygwin\bin\du -hs "%LOCALAPPDATA%\pip\Cache" 93 | 94 | test_script: 95 | - python -m pytest --showlocals --durations=20 --pyargs packtml 96 | -------------------------------------------------------------------------------- /packtml/decision_tree/tests/test_cart.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from numpy.testing import assert_array_equal, assert_almost_equal 6 | import numpy as np 7 | 8 | from packtml.decision_tree.metrics import InformationGain 9 | from packtml.decision_tree.cart import (CARTClassifier, CARTRegressor, 10 | RandomSplitter, LeafNode, _most_common) 11 | 12 | X = np.array([[0, 1, 2], 13 | [1, 2, 3], 14 | [2, 3, 4]]) 15 | 16 | y = np.array([0, 1, 1]) 17 | 18 | X2 = np.array([[0, 1, 2], 19 | [1, 2, 3], 20 | [2, 3, 4], 21 | [3, 4, 5], 22 | [4, 5, 6], 23 | [5, 6, 7]]) 24 | 25 | y2 = np.array([0, 0, 1, 1, 1, 1]) 26 | 27 | # a regression dataset 28 | rs = np.random.RandomState(42) 29 | Xreg = np.sort(5 * rs.rand(100, 1), axis=0) 30 | yreg = np.sin(Xreg).ravel() 31 | 32 | 33 | def test_most_common(): 34 | assert _most_common(y) == 1 35 | assert _most_common([1]) == 1 36 | 37 | 38 | def test_terminal_leaf_node(): 39 | node = LeafNode(split_col=0, split_val=1., 40 | class_statistic=_most_common(y), 41 | split_gain=np.inf) 42 | 43 | # show that there are no children 44 | assert node.is_terminal() 45 | 46 | # show that the splitting works as expected 47 | X_left, X_right, y_left, y_right = node.create_split(X, y) 48 | assert_array_equal(X_left, X[1:, :]) 49 | assert_array_equal(X_right, X[:1, :]) 50 | assert_array_equal(y_left, [1, 1]) 51 | assert_array_equal(y_right, [0]) 52 | 53 | # show that predictions work as expected 54 | assert [node.predict_record(r) for r in X] == [1, 1, 1] 55 | 56 | 57 | def test_complex_leaf_node(): 58 | node = LeafNode(split_col=0, split_val=3., 59 | class_statistic=_most_common(y2), 60 | split_gain=np.inf) 61 | 62 | # create the split 63 | X_left, X_right, y_left, y_right = node.create_split(X2, y2) 64 | 65 | # show it worked as expected 66 | assert_array_equal(X_left, X2[3:, :]) 67 | 
assert_array_equal(X_right, X2[:3, :]) 68 | assert_array_equal(y_left, [1, 1, 1]) 69 | assert_array_equal(y_right, [0, 0, 1]) 70 | 71 | # show that if we CURRENTLY predicted on the bases of node being the 72 | # terminal leaf, we'd get all 1s. 73 | get_preds = (lambda: [node.predict_record(r) for r in X2]) 74 | assert get_preds() == [1, 1, 1, 1, 1, 1] 75 | 76 | # add a sub node to the right side 77 | right_node = LeafNode(split_col=0, split_val=2., 78 | class_statistic=_most_common(y_right), 79 | split_gain=np.inf) 80 | 81 | assert right_node.class_statistic == 0. 82 | 83 | # attach to the original node and assert it's not terminal anymore 84 | node.right = right_node 85 | assert not node.is_terminal() 86 | 87 | # now our predictions should differ! 88 | assert get_preds() == [0, 0, 0, 1, 1, 1] 89 | 90 | 91 | def test_fit_classifier(): 92 | # show we can fit a classifier 93 | clf = CARTClassifier(X, y) 94 | # show we can predict 95 | clf.predict(X) 96 | 97 | 98 | def test_fit_regressor(): 99 | # show we can fit a regressor 100 | reg = CARTRegressor(Xreg, yreg) 101 | # show we can predict 102 | reg.predict(Xreg) 103 | 104 | 105 | def test_random_splitter(): 106 | pre_X = np.array([[21, 3], [4, 2], [37, 2]]) 107 | pre_y = np.array([1, 0, 1]) 108 | 109 | # this is the splitting class; we'll use gini as the criteria 110 | random_state = np.random.RandomState(42) 111 | splitter = RandomSplitter(random_state=random_state, 112 | criterion=InformationGain('gini'), 113 | n_val_sample=3) 114 | 115 | # find the best: 116 | best_feature, best_value, best_gain = splitter.find_best(pre_X, pre_y) 117 | assert best_feature == 0 118 | assert best_value == 21 119 | assert_almost_equal(best_gain, 0.4444444444, decimal=8) 120 | -------------------------------------------------------------------------------- /packtml/clustering/knn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Author: Taylor Smith 4 | # 5 | # An implementation of kNN clustering. Note that this was written to 6 | # maximize readability. To use kNN in a true project setting, you may 7 | # wish to use a more highly optimized library, such as scikit-learn. 8 | 9 | from __future__ import absolute_import 10 | 11 | from sklearn.metrics.pairwise import euclidean_distances 12 | from sklearn.utils.validation import check_X_y 13 | from sklearn.utils.multiclass import check_classification_targets 14 | 15 | from scipy.stats import mode 16 | import numpy as np 17 | 18 | from packtml.base import BaseSimpleEstimator 19 | 20 | __all__ = [ 21 | 'KNNClassifier' 22 | ] 23 | 24 | 25 | class KNNClassifier(BaseSimpleEstimator): 26 | """Classify points using k-Nearest Neighbors. 27 | 28 | The kNN algorithm computes the distances between points in a matrix and 29 | identifies the nearest "neighboring" points to each observation. The idea 30 | is that neighboring points share similar attributes. Therefore, if a 31 | neighbor is of some class, an unknown observation may likely belong to 32 | the same class. 33 | 34 | There are several caveats to kNN: 35 | 36 | * We have to retain all of the training data, which is expensive. 37 | * Computing the pairwise distance matrix is also expensive. 38 | * You should make sure you've standardized your data (mean 0, stddev 1) 39 | prior to fitting a kNN model 40 | 41 | Parameters 42 | ---------- 43 | X : array-like, shape=(n_samples, n_features) 44 | The training array. Should be a numpy array or array-like structure 45 | with only finite values. 
46 | 47 | y : array-like, shape=(n_samples,) 48 | The target vector. 49 | 50 | k : int, optional (default=10) 51 | The number of neighbors to identify. The higher the ``k`` parameter, 52 | the more likely you are to *under*-fit your data. The lower the ``k`` 53 | parameter, the more likely you are to *over*-fit your model. 54 | 55 | Notes 56 | ----- 57 | This is a very rudimentary implementation of KNN. It does not permit tuning 58 | of distance metrics, optimization of the search algorithm or any other 59 | parameters. It is written to be as simple as possible to maximize 60 | readability. For a more optimal solution, see 61 | ``sklearn.neighbors.KNeighborsClassifier``. 62 | """ 63 | def __init__(self, X, y, k=10): 64 | # check the input array 65 | X, y = check_X_y(X, y, accept_sparse=False, dtype=np.float32, 66 | copy=True) 67 | 68 | # make sure we're performing classification here 69 | check_classification_targets(y) 70 | 71 | # Save the K hyper-parameter so we can use it later 72 | self.k = k 73 | 74 | # kNN is a special case where we have to save the training data in 75 | # order to make predictions in the future 76 | self.X = X 77 | self.y = y 78 | 79 | def predict(self, X): 80 | # Compute the pairwise distances between each observation in 81 | # the dataset and the training data. This can be relatively expensive 82 | # for very large datasets!! 83 | train = self.X 84 | dists = euclidean_distances(X, train) 85 | 86 | # Arg sort to find the shortest distance for each row. This sorts 87 | # elements in each row (independent of other rows) to determine the 88 | # order required to sort the rows. 89 | # I.e: 90 | # >>> P = np.array([[4, 5, 1], [3, 1, 6]]) 91 | # >>> np.argsort(P, axis=1) 92 | # array([[2, 0, 1], 93 | # [1, 0, 2]]) 94 | nearest = np.argsort(dists, axis=1) 95 | 96 | # We only care about the top K, really, so get sorted and then truncate 97 | # I.e: 98 | # array([[1, 2, 1], 99 | # ... 100 | # [0, 0, 0]]) 101 | predicted_labels = self.y[nearest][:, :self.k] 102 | 103 | # We want the most common along the rows as the predictions 104 | # I.e: 105 | # array([1, ..., 0]) 106 | return mode(predicted_labels, axis=1)[0].ravel() 107 | -------------------------------------------------------------------------------- /examples/neural_net/example_transfer_learning.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from packtml.neural_net import NeuralNetClassifier, TransferLearningClassifier 6 | from packtml.utils.plotting import add_decision_boundary_to_axis 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.metrics import accuracy_score 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import sys 12 | 13 | # ############################################################################# 14 | # Create a classification dataset. 
This dataset differs from other datsets 15 | # we've created in that there are two majority classes, and one third (tiny) 16 | # class that we'll train the transfer learner over 17 | rs = np.random.RandomState(42) 18 | covariance = [[1, .75], [.75, 1]] 19 | 20 | # these are the majority classes 21 | n_obs = 1250 22 | x1 = rs.multivariate_normal(mean=[0, 0], cov=covariance, size=n_obs) 23 | x2 = rs.multivariate_normal(mean=[1, 5], cov=covariance, size=n_obs) 24 | 25 | # this is the minority class 26 | x3 = rs.multivariate_normal(mean=[0.85, 3.25], cov=[[1., .5], [1.25, 0.85]], 27 | size=n_obs // 3) 28 | 29 | # this is what the FIRST network will be trained on 30 | n_first = int(0.8 * n_obs) 31 | X = np.vstack((x1[:n_first], x2[:n_first])).astype(np.float32) 32 | y = np.hstack((np.zeros(n_first), np.ones(n_first))).astype(int) 33 | 34 | # this is what the SECOND network will be trained on 35 | X2 = np.vstack((x1[n_first:], x2[n_first:], x3)).astype(np.float32) 36 | y2 = np.hstack((np.zeros(n_obs - n_first), 37 | np.ones(n_obs - n_first), 38 | np.ones(x3.shape[0]) * 2)).astype(int) 39 | 40 | # split the data up 41 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rs) 42 | X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, 43 | random_state=rs) 44 | 45 | # ############################################################################# 46 | # Fit the first neural network 47 | hidden = (25, 25) 48 | n_iter = 75 49 | clf = NeuralNetClassifier(X_train, y_train, hidden=hidden, n_iter=n_iter, 50 | learning_rate=0.001, random_state=42) 51 | 52 | pred = clf.predict(X_test) 53 | clf_accuracy = accuracy_score(y_test, pred) 54 | print("Test accuracy (hidden=%s): %.3f" % (str(hidden), clf_accuracy)) 55 | 56 | # ############################################################################# 57 | # Fit the transfer network - train one more layer with a new class 58 | t_hidden = (15,) 59 | t_iter = 25 60 | transfer = TransferLearningClassifier(X2_train, y2_train, pretrained=clf, 61 | hidden=t_hidden, n_iter=t_iter, 62 | random_state=42) 63 | 64 | t_pred = transfer.predict(X2_test) 65 | trans_accuracy = accuracy_score(y2_test, t_pred) 66 | print("Test accuracy (hidden=%s): %.3f" % (str(hidden + t_hidden), 67 | trans_accuracy)) 68 | 69 | # ############################################################################# 70 | # Visualize how the models learned the classes 71 | 72 | fig, axes = plt.subplots(2, 2, figsize=(12, 8)) 73 | 74 | 75 | add_decision_boundary_to_axis(estimator=clf, axis=axes[0, 0], 76 | nclasses=2, X_data=X_test) 77 | axes[0, 0].scatter(X_test[:, 0], X_test[:, 1], c=pred, alpha=0.4) 78 | axes[0, 0].set_title("MLP network (hidden=%s @ %i iter): %.3f" 79 | % (str(hidden), n_iter, clf_accuracy)) 80 | 81 | add_decision_boundary_to_axis(estimator=transfer, axis=axes[0, 1], 82 | nclasses=3, X_data=X2_test) 83 | axes[0, 1].scatter(X2_test[:, 0], X2_test[:, 1], c=t_pred, alpha=0.4) 84 | axes[0, 1].set_title("Transfer network (hidden=%s @ %i iter): " 85 | "%.3f" % (str(hidden + t_hidden), t_iter, 86 | trans_accuracy)) 87 | 88 | # show the learning rates for each 89 | axes[1, 0].plot(np.arange(len(clf.train_loss)), clf.train_loss) 90 | axes[1, 0].set_title("Training loss by iteration") 91 | 92 | # concat the two training losses together for this plot 93 | trans_train_loss = clf.train_loss + transfer.train_loss 94 | axes[1, 1].plot(np.arange(len(trans_train_loss)), trans_train_loss) 95 | axes[1, 1].set_title("Training loss by iteration") 96 | 97 | # Add a verticle line 
for where the transfer learning begins 98 | axes[1, 1].axvline(x=n_iter, ls="--") 99 | 100 | # if we're supposed to save it, do so INSTEAD OF showing it 101 | if len(sys.argv) > 1: 102 | plt.savefig(sys.argv[1]) 103 | else: 104 | plt.show() 105 | -------------------------------------------------------------------------------- /packtml/regression/simple_logistic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from sklearn.utils.validation import check_X_y, check_array 6 | 7 | import numpy as np 8 | 9 | from packtml.utils.extmath import log_likelihood, logistic_sigmoid 10 | from packtml.utils.validation import assert_is_binary 11 | from packtml.base import BaseSimpleEstimator 12 | 13 | __all__ = [ 14 | 'SimpleLogisticRegression' 15 | ] 16 | 17 | try: 18 | xrange 19 | except NameError: # py 3 doesn't have an xrange 20 | xrange = range 21 | 22 | 23 | class SimpleLogisticRegression(BaseSimpleEstimator): 24 | """Simple logistic regression. 25 | 26 | This class provides a very simple example of straight forward logistic 27 | regression with an intercept. There are few tunable parameters aside from 28 | the number of iterations, & learning rate, and the model is fit upon 29 | class initialization. 30 | 31 | Parameters 32 | ---------- 33 | X : array-like, shape=(n_samples, n_features) 34 | The array of predictor variables. This is the array we will use 35 | to regress on ``y``. 36 | 37 | y : array-like, shape=(n_samples,) 38 | This is the target array on which we will regress to build 39 | our model. It should be binary (0, 1). 40 | 41 | n_steps : int, optional (default=100) 42 | The number of iterations to perform. 43 | 44 | learning_rate : float, optional (default=0.001) 45 | The learning rate. 46 | 47 | loglik_interval : int, optional (default=5) 48 | How frequently to compute the log likelihood. This is an expensive 49 | operation--computing too frequently will be very expensive. 
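(For instance, with the defaults ``n_steps=100`` and ``loglik_interval=5``, the log likelihood is evaluated 20 times over the course of the fit.)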
50 | 51 | Attributes 52 | ---------- 53 | theta : array-like, shape=(n_features,) 54 | The coefficients 55 | 56 | intercept : float 57 | The intercept term 58 | 59 | log_likelihood : list 60 | A list of the iterations' log-likelihoods 61 | """ 62 | def __init__(self, X, y, n_steps=100, learning_rate=0.001, 63 | loglik_interval=5): 64 | X, y = check_X_y(X, y, accept_sparse=False, # keep dense for example 65 | y_numeric=True) 66 | 67 | # we want to make sure y is binary since that's all our example covers 68 | assert_is_binary(y) 69 | 70 | # X should be centered/scaled for logistic regression, much like 71 | # with linear regression 72 | means, stds = X.mean(axis=0), X.std(axis=0) 73 | X = (X - means) / stds 74 | 75 | # since we're going to learn an intercept, we can cheat and set the 76 | # intercept to be a new feature that we'll learn with everything else 77 | X_w_intercept = np.hstack((np.ones((X.shape[0], 1)), X)) 78 | 79 | # initialize the coefficients as zeros 80 | theta = np.zeros(X_w_intercept.shape[1]) 81 | 82 | # now for each step, we compute the inner product of X and the 83 | # coefficients, transform the predictions with the sigmoid function, 84 | # and adjust the weights by the gradient 85 | ll = [] 86 | for iteration in xrange(n_steps): 87 | preds = logistic_sigmoid(X_w_intercept.dot(theta)) 88 | residuals = y - preds # The error term 89 | gradient = X_w_intercept.T.dot(residuals) 90 | 91 | # update the coefficients 92 | theta += learning_rate * gradient 93 | 94 | # you may not always want to do this, since it's expensive. Tune 95 | # the loglik_interval to compute this more or less frequently 96 | if (iteration + 1) % loglik_interval == 0: 97 | ll.append(log_likelihood(X_w_intercept, y, theta)) 98 | 99 | # recall that our theta includes the intercept, so we need to pop 100 | # that off and store it 101 | self.intercept = theta[0] 102 | self.theta = theta[1:] 103 | self.log_likelihood = ll 104 | self.column_means = means 105 | self.column_std = stds 106 | 107 | def predict_proba(self, X): 108 | """Generate the probabilities that a sample belongs to class 1""" 109 | X = check_array(X, accept_sparse=False, copy=False) # type: np.ndarray 110 | 111 | # make sure dims match 112 | theta = self.theta 113 | if theta.shape[0] != X.shape[1]: 114 | raise ValueError("Dim mismatch in predictors!") 115 | 116 | # scale the data appropriately 117 | X = (X - self.column_means) / self.column_std 118 | 119 | # creates a copy 120 | return logistic_sigmoid(np.dot(X, theta.T) + self.intercept) 121 | 122 | def predict(self, X): 123 | return np.round(self.predict_proba(X)).astype(int) 124 | -------------------------------------------------------------------------------- /packtml/decision_tree/metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Author: Taylor Smith 4 | # 5 | # Metrics used for determining how to split a feature in a decision tree. 6 | 7 | from __future__ import absolute_import 8 | 9 | import numpy as np 10 | 11 | __all__ = [ 12 | 'entropy', 13 | 'gini_impurity', 14 | 'InformationGain', 15 | 'VarianceReduction' 16 | ] 17 | 18 | 19 | def _clf_metric(y, metric): 20 | """Internal helper. Since this is internal, no validation is performed""" 21 | # get unique classes in y 22 | y = np.asarray(y) 23 | C, cts = np.unique(y, return_counts=True) 24 | 25 | # a base case is that there is only one class label 26 | if C.shape[0] == 1: 27 | return 0.
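    # (Illustrative worked example: for y = [0, 0, 1, 1] the class
    # probabilities are [0.5, 0.5], so gini = 1 - (0.25 + 0.25) = 0.5 and
    # entropy = -(0.5 * log2(0.5) + 0.5 * log2(0.5)) = 1.0.)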
28 | 29 | pr_C = cts.astype(float) / y.shape[0] # P(Ci) 30 | 31 | # 1 - sum(P(Ci)^2) 32 | if metric == 'gini': 33 | return 1. - pr_C.dot(pr_C) # np.sum(pr_C ** 2) 34 | elif metric == 'entropy': 35 | return np.sum(-pr_C * np.log2(pr_C)) 36 | 37 | # shouldn't ever get to this point since it is internal 38 | else: 39 | raise ValueError("metric should be one of ('gini', 'entropy'), " 40 | "but encountered %s" % metric) 41 | 42 | 43 | def entropy(y): 44 | """Compute the entropy of class labels. 45 | 46 | This computes the entropy of training samples. A high entropy means 47 | a relatively uniform distribution, while low entropy indicates a 48 | varying distribution (many peaks and valleys). 49 | 50 | References 51 | ---------- 52 | .. [1] http://www.cs.csi.cuny.edu/~imberman/ai/Entropy%20and%20Information%20Gain.htm 53 | """ 54 | return _clf_metric(y, 'entropy') 55 | 56 | 57 | def gini_impurity(y): 58 | """Compute the Gini index on a target variable. 59 | 60 | The Gini index gives an idea of how mixed two classes are within a leaf 61 | node. A perfect class separation will result in a Gini impurity of 0 (i.e., 62 | "perfectly pure"). 63 | """ 64 | return _clf_metric(y, 'gini') 65 | 66 | 67 | class BaseCriterion(object): 68 | """Splitting criterion. 69 | 70 | Base class for InformationGain and VarianceReduction. WARNING - do 71 | not invoke this class directly. Use derived classes only! This is a 72 | loosely-defined abstract class used to prescribe a common interface 73 | for sub-classes. 74 | """ 75 | def compute_uncertainty(self, y): 76 | """Compute the uncertainty for a vector. 77 | 78 | A subclass should override this function to compute the uncertainty 79 | (i.e., entropy or gini) of a vector. 80 | """ 81 | 82 | 83 | class InformationGain(BaseCriterion): 84 | """Compute the information gain after a split. 85 | 86 | The information gain metric is used by CART trees in a classification 87 | context. It measures the difference in the gini or entropy before and 88 | after a split to determine whether the split "taught" us anything. 89 | 90 | Parameters 91 | ---------- 92 | metric : str or unicode 93 | The name of the metric to use. Either "gini" (Gini impurity) 94 | or "entropy". 95 | """ 96 | def __init__(self, metric): 97 | # let fail out with a KeyError if an improper metric 98 | self.crit = {'gini': gini_impurity, 99 | 'entropy': entropy}[metric] 100 | 101 | def compute_uncertainty(self, y): 102 | """Compute the uncertainty for a vector. 103 | 104 | This method computes either the Gini impurity or entropy of a target 105 | vector using the prescribed method. 106 | """ 107 | return self.crit(y) 108 | 109 | def __call__(self, target, mask, uncertainty): 110 | """Compute the information gain of a split. 111 | 112 | Parameters 113 | ---------- 114 | target : np.ndarray 115 | The target feature 116 | 117 | mask : np.ndarray 118 | The value mask 119 | 120 | uncertainty : float 121 | The gini or entropy of rows pre-split 122 | """ 123 | left, right = target[mask], target[~mask] 124 | p = float(left.shape[0]) / float(target.shape[0]) 125 | 126 | crit = self.crit # type: callable 127 | return uncertainty - p * crit(left) - (1 - p) * crit(right) 128 | 129 | 130 | class VarianceReduction(BaseCriterion): 131 | """Compute the variance reduction after a split. 132 | 133 | Variance reduction is a splitting criterion used by CART trees in the 134 | context of regression. 
It examines the variance in a target before and 135 | after a split to determine whether we've reduced the variability in the 136 | target. 137 | """ 138 | def compute_uncertainty(self, y): 139 | """Compute the variance of a target.""" 140 | return np.var(y) 141 | 142 | def __call__(self, target, mask, uncertainty): 143 | left, right = target[mask], target[~mask] 144 | return uncertainty - (self.compute_uncertainty(left) + 145 | self.compute_uncertainty(right)) 146 | -------------------------------------------------------------------------------- /packtml/recommendation/itemitem.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from sklearn.utils.validation import check_array 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | 8 | import numpy as np 9 | 10 | from packtml.recommendation.base import RecommenderMixin 11 | from packtml.base import BaseSimpleEstimator 12 | 13 | __all__ = [ 14 | 'ItemItemRecommender' 15 | ] 16 | 17 | try: 18 | xrange 19 | except NameError: # py3 20 | xrange = range 21 | 22 | 23 | class ItemItemRecommender(BaseSimpleEstimator, RecommenderMixin): 24 | """Item-to-item recommendation system using cosine similarity. 25 | 26 | A collaborative filtering recommender algorithm that computes the cosine 27 | similarity between each item and generates recommendations for users' 28 | highly rated items by returning similar items. 29 | 30 | Parameters 31 | ---------- 32 | R : array-like, shape=(n_users, n_items) 33 | The ratings matrix. This must be an explicit ratings matrix where 34 | 0 indicates an item that a user has not yet rated. 35 | 36 | Attributes 37 | ---------- 38 | similarity : np.ndarray, shape=(n_items, n_items) 39 | The similarity matrix. 40 | 41 | Notes 42 | ----- 43 | This implementation is very rudimentary and does not allow tuning of 44 | hyper-parameters apart from ``k``. No similarity metrics apart from cosine 45 | similarity may be used. It is largely written to optimize readability. For 46 | a very highly optimized version, try the "implicit" library. 47 | """ 48 | def __init__(self, R, k=10): 49 | # check the array, but don't copy if not needed 50 | R = check_array(R, dtype=np.float32, copy=False) # type: np.ndarray 51 | 52 | # save the hyper param for later use later 53 | self.k = k 54 | self.similarity = self._compute_sim(R, k) 55 | 56 | def _compute_sim(self, R, k): 57 | # compute the similarity between all the items. This calculates the 58 | # similarity between each ITEM 59 | sim = cosine_similarity(R.T) 60 | 61 | # Only keep the similarities of the top K, setting all others to zero 62 | # (negative since we want descending) 63 | not_top_k = np.argsort(-sim, axis=1)[:, k:] # shape=(n_items, k) 64 | 65 | if not_top_k.shape[1]: # only if there are cols (k < n_items) 66 | # now we have to set these to zero in the similarity matrix 67 | row_indices = np.repeat(range(not_top_k.shape[0]), 68 | not_top_k.shape[1]) 69 | sim[row_indices, not_top_k.ravel()] = 0. 70 | 71 | return sim 72 | 73 | def recommend_for_user(self, R, user, n=10, 74 | filter_previously_seen=False, 75 | return_scores=True, **kwargs): 76 | """Generate predictions for a single user. 77 | 78 | Parameters 79 | ---------- 80 | R : array-like, shape=(n_users, n_items) 81 | The test ratings matrix. This must be an explicit ratings matrix 82 | where 0 indicates an item that a user has not yet rated. 
83 | 84 | user : int 85 | The user index for whom to generate predictions. 86 | 87 | n : int or None, optional (default=10) 88 | The number of recommendations to return. Default is 10. For all, 89 | set to None. 90 | 91 | filter_previously_seen : bool, optional (default=False) 92 | Whether to filter out previously-rated items. 93 | 94 | return_scores : bool, optional (default=True) 95 | Whether to return the computed scores for the recommended items. 96 | 97 | **kwargs : keyword args 98 | Ignored. Present to match super signature. 99 | 100 | Returns 101 | ------- 102 | items : np.ndarray 103 | The top ``n`` items recommended for the user. 104 | 105 | recommendations (optional) : np.ndarray 106 | The corresponding scores for the top ``n`` items for the 107 | user. Only returned if ``return_scores`` is True. 108 | """ 109 | 110 | # check the array and get the user vector 111 | R = check_array(R, dtype=np.float32, copy=False) 112 | user_vector = R[user, :] 113 | 114 | # compute the dot product between the user vector and the similarity 115 | # matrix 116 | recommendations = user_vector.dot(self.similarity) # shape=(n_items,) 117 | 118 | # if we're filtering previously-seen items, now is the time to do that 119 | item_indices = np.arange(recommendations.shape[0]) 120 | if filter_previously_seen: 121 | rated_mask = user_vector != 0. 122 | recommendations = recommendations[~rated_mask] 123 | item_indices = item_indices[~rated_mask] 124 | 125 | # now arg sort descending (most similar items first) 126 | order = np.argsort(-recommendations)[:n] 127 | items = item_indices[order] 128 | 129 | if return_scores: 130 | return items, recommendations[order] 131 | return items 132 | 133 | def predict(self, R): 134 | """Generate predictions for the test set. 135 | 136 | Computes the predicted product of users' rated vectors on the 137 | pre-computed similarity matrix. 138 | """ 139 | R = check_array(R, dtype=np.float32, copy=False) # type: np.ndarray 140 | 141 | # compute the product R*sim 142 | return R.dot(self.similarity) 143 | -------------------------------------------------------------------------------- /packtml/utils/plotting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from matplotlib.colors import ListedColormap 6 | from matplotlib import pyplot as plt 7 | 8 | from packtml.utils.validation import learning_curve 9 | 10 | import numpy as np 11 | 12 | __all__ = [ 13 | 'add_decision_boundary_to_axis', 14 | 'plot_learning_curve' 15 | ] 16 | 17 | 18 | def add_decision_boundary_to_axis(estimator, axis, nclasses, 19 | X_data, stepsize=0.02, 20 | colors=('#FFAAAA', '#AAFFFA', '#AAAAFF')): 21 | """Plot a classification decision boundary on an axis. 22 | 23 | Estimates lots of values from a classifier and adds the color map 24 | mesh to an axis. WARNING - use PRIOR to applying scatter values on the 25 | axis! 26 | 27 | Parameters 28 | ---------- 29 | estimator : BaseSimpleEstimator 30 | An estimator that implements ``predict``. 31 | 32 | axis : matplotlib.Axis 33 | The axis we're plotting on. 34 | 35 | nclasses : int 36 | The number of classes present in the data 37 | 38 | X_data : np.ndarray, shape=(n_samples, n_features) 39 | The X data used to fit the data, and along which to plot. Preferably 40 | 2 features for plotting. The first two will be used to plot. 41 | 42 | stepsize : float, optional (default=0.02) 43 | The size of the steps in the values on which to predict. 
44 | 45 | colors : tuple or iterable, optional 46 | The color map 47 | 48 | Returns 49 | ------- 50 | xx : np.ndarray 51 | The x array 52 | 53 | yy : np.ndarray 54 | The y array 55 | 56 | axis : matplotlib.Axis 57 | The axis 58 | """ 59 | x_min, x_max = X_data[:, 0].min() - 1, X_data[:, 0].max() + 1 60 | y_min, y_max = X_data[:, 1].min() - 1, X_data[:, 1].max() + 1 61 | xx, yy = np.meshgrid(np.arange(x_min, x_max, stepsize), 62 | np.arange(y_min, y_max, stepsize)) 63 | 64 | Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()]) 65 | Z = Z.reshape(xx.shape) 66 | 67 | axis.pcolormesh(xx, yy, Z, cmap=ListedColormap(list(colors[:nclasses]))) 68 | return xx, yy, axis 69 | 70 | 71 | def plot_learning_curve(model, X, y, n_folds, metric, train_sizes, 72 | seed=None, trace=False, y_lim=None, **kwargs): 73 | """Fit and plot a CV learning curve. 74 | 75 | Fits the model with ``n_folds`` of cross-validation over various 76 | training sizes and computes arrays of scores for the train samples 77 | and the validation fold samples, then plots them. 78 | 79 | Parameters 80 | ---------- 81 | model : BaseSimpleEstimator 82 | The model class that should be fit. 83 | 84 | X : array-like, shape=(n_samples, n_features) 85 | The training matrix. 86 | 87 | y : array-like, shape=(n_samples,) 88 | The training labels/ground-truth. 89 | 90 | metric : callable 91 | The scoring metric 92 | 93 | train_sizes : iterable 94 | The size of the training set for each fold. 95 | 96 | n_folds : int, optional (default=3) 97 | The number of CV folds 98 | 99 | seed : int or None, optional (default=None) 100 | The random seed for cross validation. 101 | 102 | trace : bool, optional (default=False) 103 | Whether to print to stdout after each set of folds is fit 104 | for a given train size. 105 | 106 | y_lim : iterable or None, optional (default=None) 107 | The y-axis limits 108 | 109 | **kwargs : keyword args or dict 110 | The keyword args to pass to the estimator. 111 | 112 | Returns 113 | ------- 114 | plt : Figure 115 | The matplotlib figure for plotting 116 | 117 | References 118 | ---------- 119 | .. 
[1] Based on the scikit-learn example: 120 | http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html 121 | """ 122 | # delegate the model fits to the function in .validation 123 | train_scores, val_scores = learning_curve( 124 | model, X, y, train_sizes=train_sizes, 125 | metric=metric, seed=seed, trace=trace, 126 | n_folds=n_folds, **kwargs) 127 | 128 | # compute the means/stds of each scores list 129 | train_scores_mean = np.mean(train_scores, axis=1) 130 | val_scores_mean = np.mean(val_scores, axis=1) 131 | train_scores_std = np.std(train_scores, axis=1) 132 | val_scores_std = np.std(val_scores, axis=1) 133 | 134 | # plot the learning curves 135 | plt.figure() 136 | plt.title("Learning curve (model=%s, train sizes=%s)" 137 | % (model.__name__, str(train_sizes))) 138 | 139 | plt.xlabel("Training sizes") 140 | plt.ylabel("Score (%s)" % metric.__name__) 141 | plt.grid() 142 | 143 | # define the y-axis limit if necessary 144 | if y_lim is not None: 145 | plt.ylim(y_lim) 146 | 147 | plt.fill_between(train_sizes, train_scores_mean - train_scores_std, 148 | train_scores_mean + train_scores_std, alpha=0.1, 149 | color="r") 150 | plt.fill_between(train_sizes, val_scores_mean - val_scores_std, 151 | val_scores_mean + val_scores_std, alpha=0.1, 152 | color="g") 153 | 154 | plt.plot(train_sizes, train_scores_mean, 'o-', color="r", 155 | label="Training score") 156 | plt.plot(train_sizes, val_scores_mean, 'o-', color="g", 157 | label="Validation score") 158 | plt.legend(loc="best") 159 | 160 | return plt 161 | -------------------------------------------------------------------------------- /packtml/utils/validation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from sklearn.externals import six 6 | from sklearn.model_selection import ShuffleSplit 7 | 8 | import numpy as np 9 | 10 | __all__ = [ 11 | 'assert_is_binary', 12 | 'is_iterable', 13 | 'learning_curve' 14 | ] 15 | 16 | 17 | def assert_is_binary(y): 18 | """Validate that a vector is binary. 19 | 20 | Checks that a vector is binary. This utility is used by all of 21 | the simple classifier estimators to validate the input target. 22 | 23 | Parameters 24 | ---------- 25 | y : np.ndarray, shape=(n_samples,) 26 | The target vector 27 | """ 28 | # validate that y is in (0, 1) 29 | unique_y = np.unique(y) # type: np.ndarray 30 | if unique_y.shape[0] != 2 or [0, 1] != unique_y.tolist(): 31 | raise ValueError("y must be binary, but got unique values of %s" 32 | % str(unique_y)) 33 | 34 | 35 | def is_iterable(x): 36 | """Determine whether an item is iterable. 37 | 38 | Python 3 introduced the ``__iter__`` functionality to 39 | strings, making them falsely behave like iterables. This 40 | function determines whether an object is an iterable given 41 | the presence of the ``__iter__`` method and that the object 42 | is *not* a string. 43 | 44 | Parameters 45 | ---------- 46 | x : int, object, str, iterable, None 47 | The object in question. Could feasibly be any type. 48 | """ 49 | if isinstance(x, six.string_types): 50 | return False 51 | return hasattr(x, "__iter__") 52 | 53 | 54 | def learning_curve(model, X, y, metric, train_sizes, n_folds=3, 55 | seed=None, trace=False, **kwargs): 56 | """Fit a CV learning curve. 57 | 58 | Fits the model with ``n_folds`` of cross-validation over various 59 | training sizes and returns arrays of scores for the train samples 60 | and the validation fold samples. 
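Note that each "fold" is drawn with scikit-learn's ``ShuffleSplit``, so the folds are independent random resamples of the requested train size rather than a disjoint partition.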
61 | 62 | Parameters 63 | ---------- 64 | model : BaseSimpleEstimator 65 | The model class that should be fit. 66 | 67 | X : array-like, shape=(n_samples, n_features) 68 | The training matrix. 69 | 70 | y : array-like, shape=(n_samples,) 71 | The training labels/ground-truth. 72 | 73 | metric : callable 74 | The scoring metric 75 | 76 | train_sizes : iterable 77 | The size of the training set for each fold. 78 | 79 | n_folds : int, optional (default=3) 80 | The number of CV folds 81 | 82 | seed : int or None, optional (default=None) 83 | The random seed for cross validation. 84 | 85 | trace : bool, optional (default=False) 86 | Whether to print to stdout after each set of folds is fit 87 | for a given train size. 88 | 89 | **kwargs : keyword args or dict 90 | The keyword args to pass to the estimator. 91 | 92 | Returns 93 | ------- 94 | train_scores : np.ndarray, shape=(n_trials, n_folds) 95 | The scores for the train samples. Each row represents a 96 | trial (new train size), and each column corresponds to the 97 | fold of the trial, i.e., for ``n_folds=3``, there will be 98 | 3 columns. 99 | 100 | val_scores : np.ndarray, shape=(n_trials, n_folds) 101 | The scores for the validation folds. Each row represents a 102 | trial (new train size), and each column corresponds to the 103 | fold of the trial, i.e., for ``n_folds=3``, there will be 104 | 3 columns. 105 | """ 106 | # Each of these lists will become a 2d array. A row will represent a 107 | # trial for a particular train size, and each column will 108 | # correspond with a fold. 109 | train_scores = [] 110 | val_scores = [] 111 | 112 | # The number of samples in the dataset 113 | n_samples = X.shape[0] 114 | 115 | # If the input is a pandas frame, make it a numpy array for indexing 116 | if hasattr(X, "iloc"): 117 | X = X.values 118 | 119 | # We need to validate that all of the sizes within the train_sizes 120 | # are less than the number of samples in the dataset! 121 | assert all(s < n_samples for s in train_sizes), \ 122 | "All train sizes (%s) must be less than n_samples (%i)" \ 123 | % (str(train_sizes), n_samples) 124 | 125 | # For each training size, we're going to initialize a new ShuffleSplit 126 | # cross validation instance and fit each of its folds...
127 | for train_size in train_sizes: 128 | cv = ShuffleSplit(n_splits=n_folds, 129 | train_size=train_size, 130 | test_size=n_samples - train_size, 131 | random_state=seed) 132 | 133 | # This is the inner list (row) that will represent the 134 | # scores for this train size 135 | inner_train_scores = [] 136 | inner_val_scores = [] 137 | 138 | # get our splits 139 | for train_indices, test_indices in cv.split(X, y): 140 | # get the training samples 141 | train_X = X[train_indices, :] 142 | train_y = y.take(train_indices) 143 | 144 | # fit the model 145 | m = model(train_X, train_y, **kwargs) 146 | 147 | # score the model on the train set 148 | inner_train_scores.append( 149 | metric(train_y, m.predict(train_X))) 150 | 151 | # score the model on the validation set 152 | inner_val_scores.append( 153 | metric(y.take(test_indices), 154 | m.predict(X[test_indices, :]))) 155 | 156 | # Now attach the inner lists to the outer lists 157 | train_scores.append(inner_train_scores) 158 | val_scores.append(inner_val_scores) 159 | 160 | if trace: 161 | print("Completed fitting %i folds for train size=%i" 162 | % (n_folds, train_size)) 163 | 164 | # Make our train/val arrays into numpy arrays 165 | train_scores = np.asarray(train_scores) 166 | val_scores = np.asarray(val_scores) 167 | 168 | return train_scores, val_scores 169 | -------------------------------------------------------------------------------- /packtml/neural_net/transfer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Author: Taylor G Smith 4 | # 5 | # A simple transfer learning classifier. If you find yourself struggling 6 | # to follow the derivation of the back-propagation, check out this great 7 | # refresher on scalar & matrix calculas + differential equations. 8 | # http://parrt.cs.usfca.edu/doc/matrix-calculus/index.html 9 | 10 | from __future__ import absolute_import 11 | 12 | import numpy as np 13 | 14 | from packtml.neural_net.base import NeuralMixin, tanh 15 | from packtml.base import BaseSimpleEstimator 16 | from packtml.neural_net.mlp import NeuralNetClassifier, _calculate_loss 17 | 18 | __all__ = [ 19 | 'TransferLearningClassifier' 20 | ] 21 | 22 | try: 23 | xrange 24 | except NameError: 25 | xrange = range 26 | 27 | 28 | def _pretrained_forward_step(X, pt_weights, pt_biases): 29 | """Complete a forward step from the pre-trained model""" 30 | # progress through all the layers (the output was already trimmed off) 31 | for w, b in zip(pt_weights, pt_biases): 32 | X = tanh(X.dot(w) + b) 33 | return X 34 | 35 | 36 | class TransferLearningClassifier(BaseSimpleEstimator, NeuralMixin): 37 | """A transfer learning classifier. 38 | 39 | Create a multi-layer perceptron classifier that learned from a 40 | previously-trained network. No fine-tuning is performed, and no 41 | prior-trained layers can be retrained (i.e., they remain frozen). 42 | 43 | Parameters 44 | ---------- 45 | X : array-like, shape=(n_samples, n_features) 46 | The training array. Should be a numpy array or array-like structure 47 | with only finite values. 48 | 49 | y : array-like, shape=(n_samples,) 50 | The target vector. 51 | 52 | pretrained : NeuralNetClassifier, TransferLearningClassifier 53 | The pre-trained MLP. The transfer learner leverages the features 54 | extracted from the pre-trained network (the trained weights without 55 | the output layer) and uses them to transform the input data before 56 | training the new layers. 
57 | 58 | hidden : iterable, optional (default=(25,)) 59 | An iterable indicating the number of units per hidden layer. 60 | 61 | n_iter : int, optional (default=10) 62 | The default number of iterations to perform. 63 | 64 | learning_rate : float, optional (default=0.001) 65 | The rate at which we descend the gradient. 66 | 67 | random_state : int, None or RandomState, optional (default=42) 68 | The random state for initializing the weights matrices. 69 | """ 70 | def __init__(self, X, y, pretrained, hidden=(25,), n_iter=10, 71 | regularization=0.01, learning_rate=0.001, random_state=42): 72 | 73 | # initialize via the NN static method 74 | self.hidden = hidden 75 | self.random_state = random_state 76 | self.n_iter = n_iter 77 | self.learning_rate = learning_rate 78 | self.regularization = regularization 79 | 80 | # this is the previous model 81 | self.model = pretrained 82 | 83 | # assert that it's a neural net or we'll break down later 84 | assert isinstance(pretrained, NeuralMixin), \ 85 | "Pre-trained model must be a neural network!" 86 | 87 | # initialize weights, biases, etc. for THE TRAINABLE LAYERS ONLY! 88 | pt_w, pt_b = pretrained.export_weights_and_biases(output_layer=False) 89 | X, y, weights, biases = NeuralNetClassifier._init_weights_biases( 90 | X, y, hidden, random_state, 91 | 92 | # use as the last dim the column dimension of the last weights 93 | # (the ones BEFORE the output layer, that is) 94 | last_dim=pt_w[-1].shape[1]) 95 | 96 | # we can train this in a similar fashion to the plain MLP we designed: 97 | # for each iteration, feed X through the network, compute the loss, 98 | # and back-propagate the error to correct the weights. 99 | train_loss = [] 100 | for _ in xrange(n_iter): 101 | # first, pass the input data through the pre-trained model's 102 | # hidden layers. Do not pass it through the last layer, however, 103 | # since we don't want its output from the softmax layer. 104 | X_transform = _pretrained_forward_step(X, pt_w, pt_b) 105 | 106 | # NOW we complete a forward step on THIS model's 107 | # untrained weights/biases 108 | out, layer_results = NeuralNetClassifier._forward_step( 109 | X_transform, weights, biases) 110 | 111 | # compute the loss on the output 112 | loss = _calculate_loss(truth=y, preds=out, weights=pt_w + weights, 113 | l2=self.regularization) 114 | train_loss.append(loss) 115 | 116 | # now back-propagate to correct THIS MODEL's weights and biases via 117 | # gradient descent. NOTE we do NOT adjust the pre-trained model's 118 | # weights!!! 
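            # (Only the freshly initialized `weights` and `biases` are passed to
            # the back-propagation call below; `pt_w` and `pt_b` never enter it,
            # which is what keeps the pre-trained layers frozen.)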
119 | NeuralNetClassifier._back_propagate( 120 | truth=y, probas=out, layer_results=layer_results, 121 | weights=weights, biases=biases, 122 | learning_rate=learning_rate, 123 | l2=self.regularization) 124 | 125 | # save the weights, biases 126 | self.weights = weights 127 | self.biases = biases 128 | self.train_loss = train_loss 129 | 130 | def predict(self, X): 131 | # compute the probabilities and then get the argmax for each class 132 | probas = self.predict_proba(X) 133 | 134 | # we want the argmaxes of each row 135 | return np.argmax(probas, axis=1) 136 | 137 | def predict_proba(self, X): 138 | # Compute a forward step with the pre-trained model first: 139 | pt_w, pt_b = self.model.export_weights_and_biases(output_layer=False) 140 | X_transform = _pretrained_forward_step(X, pt_w, pt_b) 141 | 142 | # and then complete a forward step with the trained weights and biases 143 | return NeuralNetClassifier._forward_step( 144 | X_transform, self.weights, self.biases)[0] 145 | 146 | def export_weights_and_biases(self, output_layer=True): 147 | pt_weights, pt_biases = \ 148 | self.model.export_weights_and_biases(output_layer=False) 149 | w = pt_weights + self.weights 150 | b = pt_biases + self.biases 151 | 152 | if output_layer: 153 | return w, b 154 | return w[:-1], b[:-1] 155 | -------------------------------------------------------------------------------- /packtml/recommendation/als.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from sklearn.utils.validation import check_random_state, check_array 6 | 7 | from numpy.linalg import solve 8 | import numpy as np 9 | 10 | from packtml.recommendation.base import RecommenderMixin 11 | from packtml.base import BaseSimpleEstimator 12 | 13 | __all__ = [ 14 | 'ALS' 15 | ] 16 | 17 | try: 18 | xrange 19 | except NameError: # py3 does not have xrange 20 | xrange = range 21 | 22 | 23 | def mse(R, X, Y, W): 24 | """Compute the reconstruction MSE. This is our loss function""" 25 | return ((W * (R - X.dot(Y))) ** 2).sum() 26 | 27 | 28 | class ALS(BaseSimpleEstimator, RecommenderMixin): 29 | r"""Alternating Least Squares for explicit ratings matrices. 30 | 31 | Computes the ALS user factors and item factors for explicit ratings 32 | systems. This solves: 33 | 34 | R' = XY 35 | 36 | where ``X`` is an (m x f) matrix of user factors, and ``Y`` is an 37 | (f x n) matrix of item factors. Note that for very large ratings matrices, 38 | this can quickly grow outside the scope of what will fit into memory! 39 | 40 | Parameters 41 | ---------- 42 | R : array-like, shape=(n_users, n_items) 43 | The ratings matrix. This must be an explicit ratings matrix where 44 | 0 indicates an item that a user has not yet rated. 45 | 46 | factors : int or float, optional (default=0.25) 47 | The number of factors to learn. Default is ``0.25 * n_items``. 48 | 49 | n_iter : int, optional (default=10) 50 | The number of iterations to perform. The larger the number, the 51 | smaller the train error, but the more likely to overfit. 52 | 53 | lam : float, optional (default=0.001) 54 | The L2 regularization parameter. The higher ``lam``, the more 55 | regularization is performed, and the more robust the solution. However, 56 | extra iterations are typically required. 57 | 58 | random_state : int, None or RandomState, optional (default=None) 59 | The random state for seeding the initial item factors matrix, ``Y``. 
60 | 61 | Attributes 62 | ---------- 63 | X : np.ndarray, shape=(n_users, factors) 64 | The user factors 65 | 66 | Y : np.ndarray, shape=(factors, n_items) 67 | The item factors 68 | 69 | train_err : list 70 | The list of training MSE for each iteration performed 71 | 72 | lam : float 73 | The lambda (regularization) value. 74 | 75 | Notes 76 | ----- 77 | If you plan to use a very large matrix, consider using a sparse CSR matrix 78 | to preserve memory, but you'll have to amend the ``recommend_for_user`` 79 | function, which expects dense output. 80 | """ 81 | def __init__(self, R, factors=0.25, n_iter=10, lam=0.001, 82 | random_state=None): 83 | # check the array 84 | R = check_array(R, dtype=np.float32) # type: np.ndarray 85 | n_users, n_items = R.shape 86 | 87 | # get the random state 88 | random_state = check_random_state(random_state) 89 | 90 | # get the number of factors. If it's a float, compute it 91 | if isinstance(factors, float): 92 | factors = min(np.ceil(factors * n_items).astype(int), n_items) 93 | 94 | # the weight matrix is used as a masking matrix when computing the MSE. 95 | # it allows us to only compute the reconstruction MSE on the rated 96 | # items, and not the unrated ones. 97 | W = (R > 0.).astype(np.float32) 98 | 99 | # initialize the first array, Y, and X to None 100 | Y = random_state.rand(factors, n_items) 101 | X = None 102 | 103 | # the identity matrix (time lambda) is added to the XX or YY product 104 | # at each iteration. 105 | I = np.eye(factors) * lam 106 | 107 | # this list will store all of the training errors 108 | train_err = [] 109 | 110 | # for each iteration, iteratively solve for X, Y, and compute the 111 | # updated MSE 112 | for i in xrange(n_iter): 113 | X = solve(Y.dot(Y.T) + I, Y.dot(R.T)).T 114 | Y = solve(X.T.dot(X) + I, X.T.dot(R)) 115 | 116 | # update the training error 117 | train_err.append(mse(R, X, Y, W)) 118 | 119 | # now we have X, Y, which are our user factors and item factors 120 | self.X = X 121 | self.Y = Y 122 | self.train_err = train_err 123 | self.n_factors = factors 124 | self.lam = lam 125 | 126 | def predict(self, R, recompute_users=False): 127 | """Generate predictions for the test set. 128 | 129 | Computes the predicted product of ``XY`` given the fit factors. 130 | If recomputing users, will learn the new user factors given the 131 | existing item factors. 132 | """ 133 | R = check_array(R, dtype=np.float32, copy=False) # type: np.ndarray 134 | Y = self.Y # item factors 135 | n_factors, _ = Y.shape 136 | 137 | # we can re-compute user factors on their updated ratings, if we want. 138 | # (not always advisable, but can be useful for offline recommenders) 139 | if recompute_users: 140 | I = np.eye(n_factors) * self.lam 141 | X = solve(Y.dot(Y.T) + I, Y.dot(R.T)).T 142 | else: 143 | X = self.X 144 | 145 | return X.dot(Y) 146 | 147 | def recommend_for_user(self, R, user, n=10, recompute_user=False, 148 | filter_previously_seen=False, 149 | return_scores=True): 150 | """Generate predictions for a single user. 151 | 152 | Parameters 153 | ---------- 154 | R : array-like, shape=(n_users, n_items) 155 | The test ratings matrix. This must be an explicit ratings matrix 156 | where 0 indicates an item that a user has not yet rated. 157 | 158 | user : int 159 | The user index for whom to generate predictions. 160 | 161 | n : int or None, optional (default=10) 162 | The number of recommendations to return. Default is 10. For all, 163 | set to None. 
164 | 
165 |         recompute_user : bool, optional (default=False)
166 |             Whether to recompute the user factors given the test set.
167 |             Not always advisable, as it can be considered leakage, but can
168 |             be useful in an offline recommender system where refits are
169 |             infrequent.
170 | 
171 |         filter_previously_seen : bool, optional (default=False)
172 |             Whether to filter out previously-rated items.
173 | 
174 |         return_scores : bool, optional (default=True)
175 |             Whether to return the computed scores for the recommended items.
176 | 
177 |         Returns
178 |         -------
179 |         items : np.ndarray
180 |             The top ``n`` items recommended for the user.
181 | 
182 |         scores (optional) : np.ndarray
183 |             The corresponding scores for the top ``n`` items for the user.
184 |             Only returned if ``return_scores`` is True.
185 |         """
186 |         R = check_array(R, dtype=np.float32, copy=False)
187 | 
188 |         # compute the new user vector. Squeeze to make sure it's a vector
189 |         user_vec = self.predict(R, recompute_users=recompute_user)[user, :]
190 |         item_indices = np.arange(user_vec.shape[0])
191 | 
192 |         # if we are filtering previously seen, remove the prior-rated items
193 |         if filter_previously_seen:
194 |             rated_mask = R[user, :] != 0.
195 |             user_vec = user_vec[~rated_mask]
196 |             item_indices = item_indices[~rated_mask]
197 | 
198 |         order = np.argsort(-user_vec)[:n]  # descending order of computed scores
199 |         items = item_indices[order]
200 |         if return_scores:
201 |             return items, user_vec[order]
202 |         return items
203 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://travis-ci.org/tgsmith61591/Hands-on-Supervised-Machine-Learning-with-Python.svg?branch=master)](https://travis-ci.org/tgsmith61591/Hands-on-Supervised-Machine-Learning-with-Python)
2 | [![Build status](https://ci.appveyor.com/api/projects/status/181d16js2ao3vn5v/branch/master?svg=true)](https://ci.appveyor.com/project/tgsmith61591/hands-on-supervised-machine-learning-with-python/branch/master)
3 | [![codecov](https://codecov.io/gh/tgsmith61591/Hands-on-Supervised-Machine-Learning-with-Python/branch/master/graph/badge.svg)](https://codecov.io/gh/tgsmith61591/Hands-on-Supervised-Machine-Learning-with-Python)
4 | ![Supported versions](https://img.shields.io/badge/python-3.5+-blue.svg)
5 | 
6 | # Hands-on-Supervised-Machine-Learning-with-Python
7 | 
8 | Published by Packt, Hands-on Supervised Machine Learning with Python
9 | 
10 | ### Learn the underpinnings of many supervised learning algorithms, and develop rich Python coding practices in the process.
11 | 
12 | *Supervised learning—help teach a machine to think for itself!*
13 | 
14 | ## Overview
15 | 
16 | These days machine learning is everywhere, and it’s here to stay. Understanding the core principles that drive how a machine “learns” is a critical skill for would-be practitioners and consumers alike. This course will introduce you to supervised machine learning, guiding you through the implementation and nuances of many popular machine learning algorithms while facilitating a deep understanding along the way.
17 | 
18 | In this course, we’ll cover parametric models such as linear and logistic regression, non-parametric methods such as decision trees & various clustering techniques, and we’ll wrap up with a brief foray into neural networks.
19 | 
20 | This video course highlights clean coding techniques, object-oriented class design, and general best practices in machine learning.
21 | 
22 | ## Target audience
23 | 
24 | This course is designed for those who would like to understand supervised machine learning algorithms at a deeper level. If you’re interested in understanding how and why an algorithm works rather than simply how to call its API, this course might be for you. Intermediate Python knowledge and at least an intermediate understanding of mathematical concepts are assumed. While notions in this course will be broken down into bits as granular as absolutely possible, terms and ideas such as “matrix transposition,” “gradient,” “dot product,” and “time complexity” are assumed to be understood without further explanation.
25 | 
26 | ## What you will learn
27 | 
28 | * Understand the fundamental and theoretical differences between parametric and non-parametric models, and why you might opt for one over the other.
29 | * Discover how a machine can learn a concept and generalize its understanding to new data.
30 | * Implement and grok several well-known supervised learning algorithms from scratch; build out your GitHub portfolio and show off what you’re capable of!
31 | * Learn about model families like recommender systems, which are immediately applicable in domains such as e-commerce and marketing.
32 | * Become a much stronger Python developer.
33 | 
34 | ### Project layout
35 | 
36 | All **[source code](packtml/)** is within the `packtml` folder, which serves as the Python
37 | package for this course. Within the [examples](examples/) directory, you'll find a
38 | number of short Python scripts that serve to demonstrate how various classes in the `packtml`
39 | submodules work. Each respective folder inside the `examples/` directory corresponds to a
40 | submodule inside of the `packtml` Python package.
41 | 
42 | ### Getting started
43 | 
44 | To get your environment set up, make sure you have Anaconda installed and on your path.
45 | Then simply run the following:
46 | 
47 | ```bash
48 | $ conda env create -f environment.yml
49 | ```
50 | 
51 | To activate your environment in a Unix environment:
52 | 
53 | ```bash
54 | $ source activate packt-sml
55 | ```
56 | 
57 | In a Windows environment:
58 | 
59 | ```
60 | activate packt-sml
61 | ```
62 | 
63 | ### Set up the Python package (in your activated environment):
64 | 
65 | ```bash
66 | (packt-sml) $ python setup.py install
67 | ```
68 | 
69 | ## What you'll learn
70 | 
71 | In this course and within this package, you'll learn to implement a number of
72 | commonly-used supervised learning algorithms, and when best to use one type of
73 | model over another. Below you'll find in-action examples of the various algorithms
74 | we implement within this package.
75 | 
76 | ### Regression
77 | 
78 | The classic introduction to machine learning: not only will we learn about linear regression,
79 | but we'll also code one from scratch so you really understand what's happening
80 | [under the hood](packtml/regression/simple_regression.py). Then we'll
81 | [apply one in practice](examples/regression/example_linear_regression.py) so you can see
82 | how you might use it.
83 | 
84 | ![Linear regression](img/regression/example_linear_regression.png)
85 | 
86 | Next, we'll dive into logistic regression, which is linear regression's classification cousin. See
87 | the full logistic regression example [here](examples/regression/example_logistic_regression.py)
88 | or the algorithm's [source code](packtml/regression/simple_logistic.py) if you're interested.
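
If you want to try the regression classes directly, the pattern is the same one used by every estimator in `packtml`: the model is fit inside the constructor (it takes `X` and `y` up front, like the other `BaseSimpleEstimator` subclasses in this repo), and `predict` scores new data. The snippet below is only a rough sketch; the class name `SimpleLogisticRegression` and its import path are assumptions here, so check `packtml/regression/simple_logistic.py` for the exact names before running it.

```python
import numpy as np
from sklearn.datasets import make_classification

# NOTE: the class name below is assumed for illustration; see
# packtml/regression/simple_logistic.py for the actual name/signature.
from packtml.regression import SimpleLogisticRegression

# a small synthetic binary classification problem
X, y = make_classification(n_samples=100, n_features=5, random_state=42)

# like the other packtml estimators, the model is fit in the constructor
logistic = SimpleLogisticRegression(X, y)

# predictions are class labels we can compare against the truth
preds = logistic.predict(X)
print("Training accuracy: %.3f" % np.average(preds == y))
```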
89 | 
90 | ![Logistic regression](img/regression/example_logistic_regression.png)
91 | 
92 | ### KNN clustering
93 | 
94 | During our exploration of non-parametric models, we'll also take a look at clustering.
95 | The `packtml` package implements a simple but effective k-Nearest Neighbor classifier.
96 | Here is its output on the iris dataset. For the full code example, head to the
97 | [examples directory](examples/clustering/example_knn_classifier.py) and then to the
98 | [source code](packtml/clustering/knn.py) to see how it's implemented.
99 | 
100 | ![KNN classifier](img/clustering/example_knn_classifier.png)
101 | 
102 | ### Decision trees
103 | 
104 | In this course, we'll also implement a CART decision tree from scratch (for both
105 | regression and classification). Our classification tree's performance and potential
106 | are shown at varying tree depths in the images below. The classification tree example
107 | is located [here](examples/decision_tree/example_classification_decision_tree.py), and
108 | the source code can be found [here](packtml/decision_tree/cart.py).
109 | 
110 | ![Classification decision tree](img/decision_tree/example_classification_decision_tree.png)
111 | 
112 | In addition to classification, we can build a tree as a non-linear regression
113 | model, as shown below. The regression tree example is located
114 | [here](examples/decision_tree/example_regression_decision_tree.py). Check out the
115 | [source code](packtml/decision_tree/cart.py) to understand how it works.
116 | 
117 | ![Regression decision tree](img/decision_tree/example_regression_decision_tree.png)
118 | 
119 | ### Deep learning
120 | 
121 | One of the hottest topics in machine learning right now is deep learning and neural
122 | networks. In this course, we'll learn how to code a multi-layer perceptron classifier
123 | from scratch. The full example code is located [here](examples/neural_net/example_mlp_classifier.py)
124 | and this is the [source code](packtml/neural_net/mlp.py).
125 | 
126 | ![MLP classifier](img/neural_net/example_mlp_classifier.png)
127 | 
128 | Next, we'll show how we can use the weights the MLP has learned on previous data to
129 | learn new classification labels via transfer learning. For further implementation
130 | details, check out the [example code](examples/neural_net/example_transfer_learning.py)
131 | or the [source code](packtml/neural_net/transfer.py).
132 | 
133 | ![Transfer learning](img/neural_net/example_transfer_learning.png)
134 | 
135 | ### Recommendation algorithms
136 | 
137 | These days, everything is available for purchase online. E-commerce sites have devoted
138 | lots of research to algorithms that can learn your preferences. In this course, we'll
139 | learn two such algorithms:
140 | 
141 | * [Item-to-item](packtml/recommendation/itemitem.py) collaborative filtering
142 | * [Alternating least squares](packtml/recommendation/als.py) (matrix factorization)
143 | 
144 | The [example ALS code](examples/recommendation/example_als_recommender.py) shows how
145 | the training error decreases with each iteration:
146 | 
147 | ![ALS training error](img/recommendation/example_als_recommender.png)
148 | 
--------------------------------------------------------------------------------
/packtml/metrics/ranking.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Author: Taylor G Smith
4 | #
5 | # Recommender system ranking metrics derived from Spark source for use with
6 | # Python-based recommender systems.
See the full gist here: 7 | # https://gist.github.com/tgsmith61591/d8aa96ac7c74c24b33e4b0cb967ca519 8 | 9 | from __future__ import absolute_import, division 10 | 11 | import numpy as np 12 | 13 | import warnings 14 | 15 | __all__ = [ 16 | 'mean_average_precision', 17 | 'ndcg_at', 18 | 'precision_at', 19 | ] 20 | 21 | try: 22 | xrange 23 | except NameError: # python 3 does not have an 'xrange' 24 | xrange = range 25 | 26 | 27 | def _require_positive_k(k): 28 | """Helper function to avoid copy/pasted code for validating K""" 29 | if k <= 0: 30 | raise ValueError("ranking position k should be positive") 31 | 32 | 33 | def _mean_ranking_metric(predictions, labels, metric): 34 | """Helper function for precision_at_k and mean_average_precision""" 35 | # do not zip, as this will require an extra pass of O(N). Just assert 36 | # equal length and index (compute in ONE pass of O(N)). 37 | # if len(predictions) != len(labels): 38 | # raise ValueError("dim mismatch in predictions and labels!") 39 | # return np.mean([ 40 | # metric(np.asarray(predictions[i]), np.asarray(labels[i])) 41 | # for i in xrange(len(predictions)) 42 | # ]) 43 | 44 | # Actually probably want lazy evaluation in case preds is a 45 | # generator, since preds can be very dense and could blow up 46 | # memory... but how to assert lengths equal? FIXME 47 | return np.mean([ 48 | metric(np.asarray(prd), np.asarray(labels[i])) 49 | for i, prd in enumerate(predictions) # lazy eval if generator 50 | ]) 51 | 52 | 53 | def _warn_for_empty_labels(): 54 | """Helper for missing ground truth sets""" 55 | warnings.warn("Empty ground truth set! Check input data") 56 | return 0. 57 | 58 | 59 | def precision_at(predictions, labels, k=10, assume_unique=True): 60 | """Compute the precision at K. 61 | 62 | Compute the average precision of all the queries, truncated at 63 | ranking position k. If for a query, the ranking algorithm returns 64 | n (n is less than k) results, the precision value will be computed 65 | as #(relevant items retrieved) / k. This formula also applies when 66 | the size of the ground truth set is less than k. 67 | If a query has an empty ground truth set, zero will be used as 68 | precision together with a warning. 69 | 70 | Parameters 71 | ---------- 72 | predictions : array-like, shape=(n_predictions,) 73 | The prediction array. The items that were predicted, in descending 74 | order of relevance. 75 | 76 | labels : array-like, shape=(n_ratings,) 77 | The labels (positively-rated items). 78 | 79 | k : int, optional (default=10) 80 | The rank at which to measure the precision. 81 | 82 | assume_unique : bool, optional (default=True) 83 | Whether to assume the items in the labels and predictions are each 84 | unique. That is, the same item is not predicted multiple times or 85 | rated multiple times. 86 | 87 | Examples 88 | -------- 89 | >>> # predictions for 3 users 90 | >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5], 91 | ... [4, 1, 5, 6, 2, 7, 3, 8, 9, 10], 92 | ... [1, 2, 3, 4, 5]] 93 | >>> # labels for the 3 users 94 | >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []] 95 | >>> precision_at(preds, labels, 1) 96 | 0.33333333333333331 97 | >>> precision_at(preds, labels, 5) 98 | 0.26666666666666666 99 | >>> precision_at(preds, labels, 15) 100 | 0.17777777777777778 101 | """ 102 | # validate K 103 | _require_positive_k(k) 104 | 105 | def _inner_pk(pred, lab): 106 | # need to compute the count of the number of values in the predictions 107 | # that are present in the labels. 
We'll use numpy in1d for this (set 108 | # intersection in O(1)) 109 | if lab.shape[0] > 0: 110 | n = min(pred.shape[0], k) 111 | cnt = np.in1d(pred[:n], lab, assume_unique=assume_unique).sum() 112 | return float(cnt) / k 113 | else: 114 | return _warn_for_empty_labels() 115 | 116 | return _mean_ranking_metric(predictions, labels, _inner_pk) 117 | 118 | 119 | def mean_average_precision(predictions, labels, assume_unique=True): 120 | """Compute the mean average precision on predictions and labels. 121 | 122 | Returns the mean average precision (MAP) of all the queries. If a query 123 | has an empty ground truth set, the average precision will be zero and a 124 | warning is generated. 125 | 126 | Parameters 127 | ---------- 128 | predictions : array-like, shape=(n_predictions,) 129 | The prediction array. The items that were predicted, in descending 130 | order of relevance. 131 | 132 | labels : array-like, shape=(n_ratings,) 133 | The labels (positively-rated items). 134 | 135 | assume_unique : bool, optional (default=True) 136 | Whether to assume the items in the labels and predictions are each 137 | unique. That is, the same item is not predicted multiple times or 138 | rated multiple times. 139 | 140 | Examples 141 | -------- 142 | >>> # predictions for 3 users 143 | >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5], 144 | ... [4, 1, 5, 6, 2, 7, 3, 8, 9, 10], 145 | ... [1, 2, 3, 4, 5]] 146 | >>> # labels for the 3 users 147 | >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []] 148 | >>> mean_average_precision(preds, labels) 149 | 0.35502645502645497 150 | """ 151 | 152 | def _inner_map(pred, lab): 153 | if lab.shape[0]: 154 | # compute the number of elements within the predictions that are 155 | # present in the actual labels, and get the cumulative sum weighted 156 | # by the index of the ranking 157 | n = pred.shape[0] 158 | 159 | # Scala code from Spark source: 160 | # var i = 0 161 | # var cnt = 0 162 | # var precSum = 0.0 163 | # val n = pred.length 164 | # while (i < n) { 165 | # if (labSet.contains(pred(i))) { 166 | # cnt += 1 167 | # precSum += cnt.toDouble / (i + 1) 168 | # } 169 | # i += 1 170 | # } 171 | # precSum / labSet.size 172 | 173 | arange = np.arange(n, dtype=np.float32) + 1. # this is the denom 174 | present = np.in1d(pred[:n], lab, assume_unique=assume_unique) 175 | prec_sum = np.ones(present.sum()).cumsum() 176 | denom = arange[present] 177 | return (prec_sum / denom).sum() / lab.shape[0] 178 | 179 | else: 180 | return _warn_for_empty_labels() 181 | 182 | return _mean_ranking_metric(predictions, labels, _inner_map) 183 | 184 | 185 | def ndcg_at(predictions, labels, k=10, assume_unique=True): 186 | """Compute the normalized discounted cumulative gain at K. 187 | 188 | Compute the average NDCG value of all the queries, truncated at ranking 189 | position k. The discounted cumulative gain at position k is computed as: 190 | sum,,i=1,,^k^ (2^{relevance of ''i''th item}^ - 1) / log(i + 1) 191 | and the NDCG is obtained by dividing the DCG value on the ground truth set. 192 | In the current implementation, the relevance value is binary. 193 | If a query has an empty ground truth set, zero will be used as 194 | NDCG together with a warning. 195 | 196 | Parameters 197 | ---------- 198 | predictions : array-like, shape=(n_predictions,) 199 | The prediction array. The items that were predicted, in descending 200 | order of relevance. 201 | 202 | labels : array-like, shape=(n_ratings,) 203 | The labels (positively-rated items). 
204 | 205 | k : int, optional (default=10) 206 | The rank at which to measure the NDCG. 207 | 208 | assume_unique : bool, optional (default=True) 209 | Whether to assume the items in the labels and predictions are each 210 | unique. That is, the same item is not predicted multiple times or 211 | rated multiple times. 212 | 213 | Examples 214 | -------- 215 | >>> # predictions for 3 users 216 | >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5], 217 | ... [4, 1, 5, 6, 2, 7, 3, 8, 9, 10], 218 | ... [1, 2, 3, 4, 5]] 219 | >>> # labels for the 3 users 220 | >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []] 221 | >>> ndcg_at(preds, labels, 3) 222 | 0.3333333432674408 223 | >>> ndcg_at(preds, labels, 10) 224 | 0.48791273434956867 225 | 226 | References 227 | ---------- 228 | .. [1] K. Jarvelin and J. Kekalainen, "IR evaluation methods for 229 | retrieving highly relevant documents." 230 | """ 231 | # validate K 232 | _require_positive_k(k) 233 | 234 | def _inner_ndcg(pred, lab): 235 | if lab.shape[0]: 236 | # if we do NOT assume uniqueness, the set is a bit different here 237 | if not assume_unique: 238 | lab = np.unique(lab) 239 | 240 | n_lab = lab.shape[0] 241 | n_pred = pred.shape[0] 242 | n = min(max(n_pred, n_lab), k) # min(min(p, l), k)? 243 | 244 | # similar to mean_avg_prcsn, we need an arange, but this time +2 245 | # since python is zero-indexed, and the denom typically needs +1. 246 | # Also need the log base2... 247 | arange = np.arange(n, dtype=np.float32) # length n 248 | 249 | # since we are only interested in the arange up to n_pred, truncate 250 | # if necessary 251 | arange = arange[:n_pred] 252 | denom = np.log2(arange + 2.) # length n 253 | gains = 1. / denom # length n 254 | 255 | # compute the gains where the prediction is present in the labels 256 | dcg_mask = np.in1d(pred[:n], lab, assume_unique=assume_unique) 257 | dcg = gains[dcg_mask].sum() 258 | 259 | # the max DCG is sum of gains where the index < the label set size 260 | max_dcg = gains[arange < n_lab].sum() 261 | return dcg / max_dcg 262 | 263 | else: 264 | return _warn_for_empty_labels() 265 | 266 | return _mean_ranking_metric(predictions, labels, _inner_ndcg) -------------------------------------------------------------------------------- /packtml/neural_net/mlp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Author: Taylor G Smith 4 | # 5 | # A simple multilayer perceptron classifier. If you find yourself struggling 6 | # to follow the derivation of the back-propagation, check out this great 7 | # refresher on scalar & matrix calculas + differential equations. 8 | # http://parrt.cs.usfca.edu/doc/matrix-calculus/index.html 9 | 10 | from __future__ import absolute_import, division 11 | 12 | from sklearn.utils.validation import check_X_y, check_random_state 13 | from sklearn.utils.multiclass import check_classification_targets 14 | 15 | import numpy as np 16 | 17 | from packtml.base import BaseSimpleEstimator 18 | from packtml.neural_net.base import NeuralMixin, tanh 19 | 20 | __all__ = [ 21 | 'NeuralNetClassifier' 22 | ] 23 | 24 | try: 25 | xrange 26 | except NameError: # py3 27 | xrange = range 28 | 29 | 30 | def _calculate_loss(truth, preds, weights, l2): 31 | """Compute the log loss. 32 | 33 | Calculate the log loss between the true class labels and the predictions 34 | generated by the softmax layer in our neural network. 
35 | 36 | Parameters 37 | ---------- 38 | truth : np.ndarray, shape=(n_samples,) 39 | The true labels 40 | 41 | preds : np.ndarray, shape=(n_samples, n_classes) 42 | The predicted class probabilities 43 | 44 | weights : list 45 | The list of weights matrices. Used for computing the loss 46 | with the L2 regularization. 47 | 48 | l2 : float 49 | The regularization parameter 50 | """ 51 | # get the log probs of the prediction for the true class labels 52 | n_samples = truth.shape[0] 53 | logprobs = -np.log(preds[range(n_samples), truth]) 54 | 55 | # compute the sum of log probs 56 | sum_logprobs = logprobs.sum() 57 | 58 | # add the L2 regularization term 59 | sum_logprobs += l2 / 2. * sum(np.square(W).sum() for W in weights) 60 | return 1. / n_samples * sum_logprobs 61 | 62 | 63 | def softmax(X): 64 | """Apply the softmax function. 65 | 66 | The softmax function squashes an N-dimensional vector into a K-dimensional 67 | vector whose elements add up to 1, and whose elements are bound in (0, 1). 68 | 69 | Parameters 70 | ---------- 71 | X : np.ndarray, shape=(n_samples, n_features) 72 | The matrix over which to apply softmax along the rows. 73 | """ 74 | # first compute the exponential. This is a step that would take place 75 | # in the sigmoid (logistic) function as well. We can already begin to see 76 | # where this is going to resemble logistic regression... 77 | X_exp = np.exp(X) 78 | return X_exp / np.sum(X_exp, axis=1, keepdims=True) 79 | 80 | 81 | class NeuralNetClassifier(BaseSimpleEstimator, NeuralMixin): 82 | """A neural network classifier. 83 | 84 | Create a multi-layer perceptron classifier. Note that this is a very 85 | simple implementation of an MLP with only fully-connected layers and 86 | very few tunable parameters. It is designed for readability. For more 87 | optimized neural network code, look into TensorFlow, Keras or other 88 | libraries. 89 | 90 | This implementation of a neural net uses the TanH activation function 91 | *only*, and does not allow early convergence. It will continue for 92 | ``n_iter``. There are many other parameters that would typically be 93 | tunable in a network, for instance dropout, regularization, learning 94 | rate, etc. The majority of these parameters are left out of this 95 | implementation to keep it simple. 96 | 97 | Parameters 98 | ---------- 99 | X : array-like, shape=(n_samples, n_features) 100 | The training array. Should be a numpy array or array-like structure 101 | with only finite values. 102 | 103 | y : array-like, shape=(n_samples,) 104 | The target vector. 105 | 106 | hidden : iterable, optional (default=(25,)) 107 | An iterable indicating the number of units per hidden layer. 108 | 109 | n_iter : int, optional (default=10) 110 | The default number of iterations to perform. 111 | 112 | learning_rate : float, optional (default=0.001) 113 | The rate at which we descend the gradient. 114 | 115 | random_state : int, None or RandomState, optional (default=42) 116 | The random state for initializing the weights matrices. 117 | """ 118 | def __init__(self, X, y, hidden=(25,), n_iter=10, learning_rate=0.001, 119 | regularization=0.01, random_state=42): 120 | 121 | self.hidden = hidden 122 | self.random_state = random_state 123 | self.n_iter = n_iter 124 | self.learning_rate = learning_rate 125 | self.regularization = regularization 126 | 127 | # initialize weights, biases, etc. 
128 | X, y, weights, biases = self._init_weights_biases( 129 | X, y, hidden, random_state, last_dim=None) 130 | 131 | # we can keep track of the loss for each iter 132 | train_loss = [] 133 | 134 | # for each iteration, feed X through the network, compute the loss, 135 | # and back-propagate the error to correct the weights. 136 | for _ in xrange(n_iter): 137 | # compute the product of X on the hidden layers (the output of 138 | # the network) 139 | out, layer_results = self._forward_step(X, weights, biases) 140 | 141 | # compute the loss on the output 142 | loss = _calculate_loss(truth=y, preds=out, weights=weights, 143 | l2=self.regularization) 144 | train_loss.append(loss) 145 | 146 | # now back-propagate to correct the weights and biases via 147 | # gradient descent 148 | self._back_propagate(y, out, layer_results, weights, 149 | biases, learning_rate, 150 | self.regularization) 151 | 152 | # save the weights, biases and loss as instance attributes 153 | self.weights = weights 154 | self.biases = biases 155 | self.train_loss = train_loss 156 | 157 | @staticmethod 158 | def _init_weights_biases(X, y, hidden, random_state, last_dim=None): 159 | # make sure dims all match in X, y and that we have appropriate 160 | # classification targets 161 | X, y = check_X_y(X, y, copy=False) 162 | check_classification_targets(y) 163 | 164 | random_state = check_random_state(random_state) 165 | 166 | # initialize the weights and biases. For each layer, we create a new 167 | # matrix of dimensions [last_layer_col_dim, new_col_dim]. This ensures 168 | # we can compute matrix products across the layers and that the 169 | # dimensions all match up. The biases will each be a vector of ones 170 | # in this example, though in other networks that can be initialized 171 | # differently 172 | weights = [] 173 | biases = [] 174 | 175 | # if last dim is undefined, use the column shape of the input data. 176 | # this argument is used to simplify the initialization of weights/ 177 | # biases in the transfer learning class... 178 | if last_dim is None: 179 | last_dim = X.shape[1] 180 | 181 | for layer_size in hidden: 182 | # initialize to extremely small values 183 | w = random_state.rand(last_dim, layer_size) * 0.01 184 | b = np.ones(layer_size) 185 | last_dim = layer_size 186 | 187 | weights.append(w) 188 | biases.append(b) 189 | 190 | # we need to add one more layer (the output layer) that is the size of 191 | # the expected output probabilities. We'll apply the softmax function 192 | # to the output of this layer. 193 | n_outputs = np.unique(y).shape[0] 194 | weights.append(random_state.rand(last_dim, n_outputs)) 195 | biases.append(np.ones(n_outputs)) 196 | 197 | return X, y, weights, biases 198 | 199 | @staticmethod 200 | def _forward_step(X, weights, biases): 201 | # track the intermediate products 202 | intermediate_results = [X] 203 | 204 | # progress through all the layers EXCEPT the very last one. 205 | for w, b in zip(weights[:-1], biases[:-1]): 206 | 207 | # apply the activation function to the product of X and the weights 208 | # (after adding the bias vector) 209 | X = tanh(X.dot(w) + b) 210 | 211 | # append this layer result 212 | intermediate_results.append(X) 213 | 214 | # we handle the very last layer a bit differently, since it's out 215 | # output layer. First compute the product... 216 | X = X.dot(weights[-1]) + biases[-1] 217 | 218 | # then rather than apply the activation function (tanh), we apply 219 | # the softmax, which is essentially generalized logistic regression. 
220 | return softmax(X), intermediate_results 221 | 222 | @staticmethod 223 | def _back_propagate(truth, probas, layer_results, weights, 224 | biases, learning_rate, l2): 225 | # Compute the gradient (derivative) of our loss function WRT our 226 | # last layer of weights/biases, and back propagate the error back 227 | # up the layers, adjusting the weights as we go. 228 | # 229 | # Or, expressed in the chain rule: 230 | # dL/dW = (dL/dZ)(dZ/dW) ... 231 | 232 | # the probabilities are our first delta. Subtract 1 from the 233 | # TRUE labels' probabilities in the predictions 234 | n_samples = truth.shape[0] 235 | 236 | # subtract 1 from true idcs. initial deltas are: (y_hat - y) 237 | # This computes d2 = Y - T 238 | probas[range(n_samples), truth] -= 1. 239 | 240 | # iterate back through the layers computing the deltas (derivatives) 241 | last_delta = probas 242 | for next_weights, next_biases, layer_res in \ 243 | zip(weights[::-1], biases[::-1], layer_results[::-1]): 244 | 245 | # the gradient for this layer is equivalent to the previous delta 246 | # multiplied by the intermittent layer result 247 | d_W = layer_res.T.dot(last_delta) 248 | 249 | # column sums of the (just-computed) delta is the derivative 250 | # of the biases 251 | d_b = np.sum(last_delta, axis=0) 252 | 253 | # set the next delta for the next iter 254 | last_delta = last_delta.dot(next_weights.T) * \ 255 | (1. - np.power(layer_res, 2.)) 256 | 257 | # update the weights gradient with the L2 regularization term 258 | d_W += l2 * next_weights 259 | 260 | # update the weights in this layer. The learning rate governs how 261 | # quickly we descend the gradient 262 | next_weights += -learning_rate * d_W 263 | next_biases += -learning_rate * d_b 264 | 265 | def predict(self, X): 266 | # compute the probabilities and then get the argmax for each class 267 | probas = self.predict_proba(X) 268 | 269 | # we want the argmaxes of each row 270 | return np.argmax(probas, axis=1) 271 | 272 | def predict_proba(self, X): 273 | # simply compute a forward step (we don't care about idx 1 of the 274 | # tuple, which is just the intermediate products) 275 | return self._forward_step(X, self.weights, self.biases)[0] 276 | 277 | def export_weights_and_biases(self, output_layer=True): 278 | w, b = self.weights, self.biases 279 | if output_layer: 280 | return w, b 281 | return w[:-1], b[:-1] 282 | -------------------------------------------------------------------------------- /packtml/decision_tree/cart.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Author: Taylor G Smith 4 | # 5 | # A simplified version of Classification and Regression Trees. This file 6 | # is intended to maximize readability and understanding of how CART trees work. 7 | # For very fast or customizable decision tree solutions, use scikit-learn. 8 | # 9 | # The best order in which to read & understand the contents to best 10 | # grok the entire concept: 11 | # 12 | # 1. metrics.InformationGain & metrics.VarianceReduction 13 | # 2. RandomSplitter 14 | # 3. LeafNode 15 | # 4. 
BaseCART 16 | 17 | from __future__ import absolute_import, division 18 | 19 | from sklearn.utils.validation import check_X_y, check_random_state, check_array 20 | from sklearn.utils.multiclass import check_classification_targets 21 | from sklearn.base import ClassifierMixin, RegressorMixin, is_classifier 22 | 23 | import numpy as np 24 | 25 | from packtml.base import BaseSimpleEstimator 26 | from packtml.decision_tree.metrics import InformationGain, VarianceReduction 27 | 28 | __all__ = [ 29 | 'CARTRegressor', 30 | 'CARTClassifier' 31 | ] 32 | 33 | try: 34 | xrange 35 | except NameError: # py3 36 | xrange = range 37 | 38 | 39 | class RandomSplitter(object): 40 | """Evaluate a split via random values in a feature. 41 | 42 | Every feature in the dataset needs to be evaluated in a CART tree. Since 43 | that in itself can be expensive, the random splitter allows us to look at 44 | only a random amount of row splits per feature in order to make the best 45 | splitting decision. 46 | 47 | Parameters 48 | ---------- 49 | random_state : np.random.RandomState 50 | The random state for seeding the choices 51 | 52 | criterion : callable 53 | The metric used for evaluating the "goodness" of a split. Either 54 | ``InformationGain`` (with entropy or Gini) for classification, or 55 | ``VarianceReduction`` for regression. 56 | 57 | n_val_sample : float, optional (default=25) 58 | The number of values per feature to sample as a splitting point. 59 | """ 60 | def __init__(self, random_state, criterion, n_val_sample=25): 61 | self.random_state = random_state 62 | self.criterion = criterion # BaseCriterion from metrics 63 | self.n_val_sample = n_val_sample 64 | 65 | def find_best(self, X, y): 66 | criterion = self.criterion 67 | rs = self.random_state 68 | 69 | # keep track of the best info gain 70 | best_gain = 0. 71 | 72 | # keep track of best feature and best value on which to split 73 | best_feature = None 74 | best_value = None 75 | 76 | # get the current state of the uncertainty (gini or entropy) 77 | uncertainty = criterion.compute_uncertainty(y) 78 | 79 | # iterate over each feature 80 | for col in xrange(X.shape[1]): 81 | feature = X[:, col] 82 | 83 | # get all values in the feature 84 | # values = np.unique(feature) 85 | seen_values = set() 86 | 87 | # the number of values to sample. Should be defined as the min 88 | # between the prescribed n_val_sample value and the number of 89 | # unique values in the feature. 90 | n_vals = min(self.n_val_sample, np.unique(feature).shape[0]) 91 | 92 | # For each of n_val_sample iterations, select a random value 93 | # from the feature and create a split. We store whether we've seen 94 | # the value before; if we have, continue. Continue until we've seen 95 | # n_vals unique values. This allows us to more likely select values 96 | # that are high frequency (retains distributional data implicitly) 97 | for v in rs.permutation(feature): 98 | 99 | # if we've hit the limit of the number of values we wanted to 100 | # examine, break out 101 | if len(seen_values) == n_vals: 102 | break 103 | 104 | # if we've already tried this value, continue 105 | elif v in seen_values: # O(1) lookup 106 | continue 107 | 108 | # otherwise, it's a new value we've never tried splitting on. 109 | # add it to the set. 
110 | seen_values.add(v) 111 | 112 | # create the mask (these values "go left") 113 | mask = feature >= v # type: np.ndarray 114 | 115 | # skip this step if this doesn't divide the dataset 116 | if np.unique(mask).shape[0] == 1: # all True or all False 117 | continue 118 | 119 | # compute how good this split was 120 | gain = criterion(y, mask, uncertainty=uncertainty) 121 | 122 | # if the gain is better, we keep this feature & value & 123 | # update the best gain we've seen so far 124 | if gain > best_gain: 125 | best_feature = col 126 | best_value = v 127 | best_gain = gain 128 | 129 | # if best feature is None, it means we never found a viable split... 130 | # this is likely because all of our labels were perfect. In this case, 131 | # we could select any feature and the first value and define that as 132 | # our left split and nothing will go right. 133 | if best_feature is None: 134 | best_feature = 0 135 | best_value = np.squeeze(X[:, best_feature])[0] 136 | best_gain = 0. 137 | 138 | # we need to know the best feature, the best value, and the best gain 139 | return best_feature, best_value, best_gain 140 | 141 | 142 | class LeafNode(object): 143 | """A tree node class. 144 | 145 | Tree node that store the column on which to split and the value above 146 | which to go left vs. right. Additionally, it stores the target statistic 147 | related to this node. For instance, in a classification scenario: 148 | 149 | >>> X = np.array([[ 1, 1.5 ], 150 | ... [ 2, 0.5 ], 151 | ... [ 3, 0.75]]) 152 | >>> y = np.array([0, 1, 1]) 153 | >>> node = LeafNode(split_col=0, split_val=2, 154 | ... class_statistic=_most_common(y)) 155 | 156 | This means if ``node`` were a terminal node, it would generate predictions 157 | of 1, since that was the most common value in the pre-split ``y``. The 158 | class statistic will differ for splits in the tree, where the most common 159 | value in ``y`` for records in ``X`` that go left is 1, and 0 for that which 160 | goes to the right. 161 | 162 | The class statistic is computed for each split as the tree recurses. 163 | 164 | Parameters 165 | ---------- 166 | split_col : int 167 | The column on which to split. 168 | 169 | split_val : float or int 170 | The value above which to go left. 171 | """ 172 | def __init__(self, split_col, split_val, split_gain, class_statistic): 173 | 174 | self.split_col = split_col 175 | self.split_val = split_val 176 | self.split_gain = split_gain 177 | 178 | # the class statistic is the mode or the mean of the targets for 179 | # this split 180 | self.class_statistic = class_statistic 181 | 182 | # if these remain None, it's a terminal node 183 | self.left = None 184 | self.right = None 185 | 186 | def create_split(self, X, y): 187 | """Split the next X, y. 188 | 189 | Returns 190 | ------- 191 | X_left : np.ndarray, shape=(n_samples, n_features) 192 | Rows where ``split_col >= split_val``. 193 | 194 | X_right : np.ndarray, shape=(n_samples, n_features) 195 | Rows where ``split_col < split_val``. 196 | 197 | y_left : np.ndarray, shape=(n_samples,) 198 | Target where ``split_col >= split_val``. 199 | 200 | y_right : np.ndarray, shape=(n_samples,) 201 | Target where ``split_col < split_val``. 202 | """ 203 | # If values in the split column are greater than or equal to the 204 | # split value, we go left. 
205 | left_mask = X[:, self.split_col] >= self.split_val 206 | 207 | # Otherwise we go to the right 208 | right_mask = ~left_mask # type: np.ndarray 209 | 210 | # If the left mask is all False or all True, it means we've achieved 211 | # a perfect split. 212 | all_left = left_mask.all() 213 | all_right = right_mask.all() 214 | 215 | # create the left split. If it's all right side, we'll return None 216 | X_left = X[left_mask, :] if not all_right else None 217 | y_left = y[left_mask] if not all_right else None 218 | 219 | # create the right split. If it's all left side, we'll return None. 220 | X_right = X[right_mask, :] if not all_left else None 221 | y_right = y[right_mask] if not all_left else None 222 | 223 | return X_left, X_right, y_left, y_right 224 | 225 | def is_terminal(self): 226 | """Determine whether the node is terminal. 227 | 228 | If there is no left node and no right node, it's a terminal node. 229 | If either is non-None, it is a parent to something. 230 | """ 231 | return self.left is None and self.right is None 232 | 233 | def __repr__(self): 234 | """Get the string representation of the node.""" 235 | return "Rule: Go left if x%i >= %r else go right (gain=%.3f)" \ 236 | % (self.split_col, self.split_val, self.split_gain) 237 | 238 | def predict_record(self, record): 239 | """Find the terminal node in the tree and return the class statistic""" 240 | # First base case, this is a terminal node: 241 | has_left = self.left is not None 242 | has_right = self.right is not None 243 | if not has_left and not has_right: 244 | return self.class_statistic 245 | 246 | # Otherwise, determine whether the record goes right or left 247 | go_left = record[self.split_col] >= self.split_val 248 | 249 | # if we go left and there is a left node, delegate the recursion to the 250 | # left side 251 | if go_left and has_left: 252 | return self.left.predict_record(record) 253 | 254 | # if we go right, delegate to the right 255 | if not go_left and has_right: 256 | return self.right.predict_record(record) 257 | 258 | # if we get here, it means one of two things: 259 | # 1. we were supposed to go left and didn't have a left 260 | # 2. we were supposed to go right and didn't have a right 261 | # for both of these, we return THIS class statistic 262 | return self.class_statistic 263 | 264 | 265 | def _most_common(y): 266 | # This is essentially just a "mode" function to compute the most 267 | # common value in a vector. 
268 | cls, cts = np.unique(y, return_counts=True) 269 | order = np.argsort(-cts) 270 | return cls[order][0] 271 | 272 | 273 | class _BaseCART(BaseSimpleEstimator): 274 | def __init__(self, X, y, criterion, min_samples_split, max_depth, 275 | n_val_sample, random_state): 276 | # make sure max_depth > 1 277 | if max_depth < 2: 278 | raise ValueError("max depth must be > 1") 279 | 280 | # check the input arrays, and if it's classification validate the 281 | # target values in y 282 | X, y = check_X_y(X, y, accept_sparse=False, dtype=None, copy=True) 283 | if is_classifier(self): 284 | check_classification_targets(y) 285 | 286 | # hyper parameters so we can later inspect attributes of the model 287 | self.min_samples_split = min_samples_split 288 | self.max_depth = max_depth 289 | self.n_val_sample = n_val_sample 290 | self.random_state = random_state 291 | 292 | # create the splitting class 293 | random_state = check_random_state(random_state) 294 | self.splitter = RandomSplitter(random_state, criterion, n_val_sample) 295 | 296 | # grow the tree depth first 297 | self.tree = self._find_next_split(X, y, 0) 298 | 299 | def _target_stat(self, y): 300 | """Given a vector, ``y``, decide what value to return as the leaf 301 | node statistic (mean for regression, mode for classification) 302 | """ 303 | 304 | def _find_next_split(self, X, y, current_depth): 305 | # base case 1: current depth is the limit, the parent node should 306 | # be a terminal node (child = None) 307 | # base case 2: n samples in X <= min_samples_split 308 | if current_depth == self.max_depth or \ 309 | X.shape[0] <= self.min_samples_split: 310 | return None 311 | 312 | # create the next split 313 | split_feature, split_value, gain = \ 314 | self.splitter.find_best(X, y) 315 | 316 | # create the next node based on the best split feature and value 317 | # that we just found. Also compute the "target stat" (mode of y for 318 | # classification problems or mean of y for regression problems) and 319 | # pass that to the node in case it is the terminal node (i.e., the 320 | # decision maker) 321 | node = LeafNode(split_feature, split_value, gain, self._target_stat(y)) 322 | 323 | # Create the splits based on the criteria we just determined, and then 324 | # recurse down left, right sides 325 | X_left, X_right, y_left, y_right = node.create_split(X, y) 326 | 327 | # if either the left or right is None, it means we've achieved a 328 | # perfect split. It is then a terminal node and will remain None. 329 | if X_left is not None: 330 | node.left = self._find_next_split(X_left, y_left, 331 | current_depth + 1) 332 | 333 | if X_right is not None: 334 | node.right = self._find_next_split(X_right, y_right, 335 | current_depth + 1) 336 | 337 | return node 338 | 339 | def predict(self, X): 340 | # Check the array 341 | X = check_array(X, dtype=np.float32) # type: np.ndarray 342 | 343 | # For each record in X, find its leaf node in the tree (O(log N)) 344 | # to get the predictions. This makes the prediction operation 345 | # O(N log N) runtime complexity 346 | predictions = [self.tree.predict_record(row) for row in X] 347 | return np.asarray(predictions) 348 | 349 | 350 | class CARTRegressor(_BaseCART, RegressorMixin): 351 | """Decision tree regression. 352 | 353 | Builds a decision tree to solve a regression problem using the CART 354 | algorithm. 
The estimator builds a binary tree structure, evaluating each
355 |     feature at each iteration to recursively split along the best value and
356 |     progress down the tree until each leaf node reaches parsimony.
357 | 
358 |     The regression tree uses "variance reduction" to assess the "goodness"
359 |     of a split, selecting the split and feature that maximizes the value.
360 | 
361 |     To make predictions, each record is evaluated at each node of the tree
362 |     until it reaches a leaf node. For regression, predictions are made by
363 |     returning the training target's mean for the leaf node.
364 | 
365 |     Parameters
366 |     ----------
367 |     X : array-like, shape=(n_samples, n_features)
368 |         The training array. Should be a numpy array or array-like structure
369 |         with only finite values.
370 | 
371 |     y : array-like, shape=(n_samples,)
372 |         The target vector.
373 | 
374 |     max_depth : int, optional (default=5)
375 |         The maximum depth to which the tree will grow. Note that the tree is
376 |         not guaranteed to reach this depth and may stop growing early if the
377 |         ``min_samples_split`` terminal criterion is met first.
378 | 
379 |     min_samples_split : int, optional (default=1)
380 |         A terminal criterion used to halt the growth of a tree. If a leaf
381 |         node's split contains <= ``min_samples_split``, it will not grow
382 |         any further.
383 | 
384 |     n_val_sample : int, optional (default=25)
385 |         The method by which we evaluate splits differs a bit from highly
386 |         optimized libraries like scikit-learn, which may evaluate for the
387 |         globally optimal split for each feature. We use random splitting
388 |         which evaluates a number of unique values for each feature at each
389 |         split. The ``n_val_sample`` is the maximum number of values per
390 |         feature that will be evaluated as a potential splitting point at
391 |         each iteration.
392 | 
393 |     random_state : int, None or RandomState, optional (default=None)
394 |         The random state used to seed the RandomSplitter.
395 | 
396 |     Attributes
397 |     ----------
398 |     splitter : RandomSplitter
399 |         The feature splitting class. Used for determining optimal splits at
400 |         each node.
401 | 
402 |     tree : LeafNode
403 |         The actual tree. Each node contains data on the class statistic (i.e.,
404 |         mode or mean of the training target at that split), best feature and
405 |         best value.
406 |     """
407 |     def __init__(self, X, y, max_depth=5, min_samples_split=1,
408 |                  n_val_sample=25, random_state=None):
409 | 
410 |         super(CARTRegressor, self).__init__(
411 |             X, y, criterion=VarianceReduction(),
412 |             min_samples_split=min_samples_split, max_depth=max_depth,
413 |             n_val_sample=n_val_sample, random_state=random_state)
414 | 
415 |     def _target_stat(self, y):
416 |         """Given a vector, ``y``, get the mean"""
417 |         return y.mean()
418 | 
419 | 
420 | class CARTClassifier(_BaseCART, ClassifierMixin):
421 |     """Decision tree classification.
422 | 
423 |     Builds a decision tree to solve a classification problem using the CART
424 |     algorithm. The estimator builds a binary tree structure, evaluating each
425 |     feature at each iteration to recursively split along the best value and
426 |     progress down the tree until each leaf node reaches parsimony.
427 | 
428 |     The classification tree uses "information gain" to assess the "goodness"
429 |     of a split, selecting the split and feature that maximizes the value.
430 | 
431 |     To make predictions, each record is evaluated at each node of the tree
432 |     until it reaches a leaf node. For classification, predictions are made by
For classification, predictions are made by 433 | returning the training target's mode for the leaf node. 434 | 435 | Parameters 436 | ---------- 437 | X : array-like, shape=(n_samples, n_features) 438 | The training array. Should be a numpy array or array-like structure 439 | with only finite values. 440 | 441 | y : array-like, shape=(n_samples,) 442 | The target vector. 443 | 444 | criterion : str or unicode, optional (default='gini') 445 | The splitting criterion used for classification problems. CART trees 446 | typically use "gini" but their cousins, C4.5 trees, use "entropy". Both 447 | metrics are extremely similar and will likely not change your tree 448 | structure by much. 449 | 450 | max_depth : int, optional (default=5) 451 | The maximum depth to which the tree will grow. Note that the tree is 452 | not guaranteed to reach this depth and may stop growing early if the 453 | ``min_samples_split`` terminal criterion is met first. 454 | 455 | min_samples_split : int, optional (default=1) 456 | A terminal criterion used to halt the growth of a tree. If a leaf 457 | node's split contains <= ``min_samples_split``, it will not grow 458 | any further. 459 | 460 | n_val_sample : int, optional (default=25) 461 | The method by which we evaluate splits differs a bit from highly 462 | optimized libraries like scikit-learn, which may evaluate for the 463 | globally optimal split for each feature. We use random splitting 464 | which evaluates a number of unique values for each feature at each 465 | split. The ``n_val_sample`` is the maximum number of values per 466 | feature that will be evaluated as a potential splitting point at 467 | each iteration. 468 | 469 | random_state : int, None or RandomState, optional (default=None) 470 | The random state used to seed the RandomSplitter. 471 | 472 | Attributes 473 | ---------- 474 | splitter : RandomSplitter 475 | The feature splitting class. Used for determining optimal splits at 476 | each node. 477 | 478 | tree : LeafNode 479 | The actual tree. Each node contains data on the class statistic (i.e., 480 | mode or mean of the training target at that split), best feature and 481 | best value. 482 | """ 483 | def __init__(self, X, y, criterion='gini', max_depth=5, 484 | min_samples_split=1, n_val_sample=25, random_state=None): 485 | 486 | super(CARTClassifier, self).__init__( 487 | X, y, criterion=InformationGain(criterion), max_depth=max_depth, 488 | min_samples_split=min_samples_split, 489 | n_val_sample=n_val_sample, random_state=random_state) 490 | 491 | def _target_stat(self, y): 492 | """Given a vector, ``y``, get the mode""" 493 | return _most_common(y) 494 | --------------------------------------------------------------------------------