├── README.md
├── notebooks
│   ├── model.pkl
│   ├── .ipynb_checkpoints
│   │   ├── DensityEstimation-checkpoint.ipynb
│   │   ├── ModelPersistence-checkpoint.ipynb
│   │   ├── Feature Transformation-checkpoint.ipynb
│   │   ├── NovelyDetection-checkpoint.ipynb
│   │   ├── PipelinesAndFeatureUnions-checkpoint.ipynb
│   │   ├── FeatureSelection-checkpoint.ipynb
│   │   ├── FeatureExtraction-checkpoint.ipynb
│   │   ├── CrossValidation-checkpoint.ipynb
│   │   ├── Multiclass-checkpoint.ipynb
│   │   └── EnsembleMethods-checkpoint.ipynb
│   ├── ModelPersistence.ipynb
│   ├── DensityEstimation.ipynb
│   ├── NovelyDetection.ipynb
│   ├── FeatureTransformation.ipynb
│   ├── PipelinesAndFeatureUnions.ipynb
│   ├── FeatureExtraction.ipynb
│   ├── FeatureSelection.ipynb
│   ├── CrossValidation.ipynb
│   └── Multiclass.ipynb
├── LICENSE
└── requirements.txt

/README.md:
--------------------------------------------------------------------------------
1 | # bit-of-data-science-and-scikit-learn
2 |
--------------------------------------------------------------------------------
/notebooks/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/knathanieltucker/bit-of-data-science-and-scikit-learn/HEAD/notebooks/model.pkl
--------------------------------------------------------------------------------
/notebooks/.ipynb_checkpoints/DensityEstimation-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 K. Nathaniel Tucker
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | alabaster==0.7.10
2 | appdirs==1.4.3
3 | appnope==0.1.0
4 | Babel==2.4.0
5 | backports-abc==0.5
6 | backports.shutil-get-terminal-size==1.0.0
7 | bleach==2.0.0
8 | certifi==2017.1.23
9 | configparser==3.5.0
10 | cycler==0.10.0
11 | decorator==4.0.11
12 | docutils==0.13.1
13 | entrypoints==0.2.2
14 | enum34==1.1.6
15 | functools32==3.2.3.post2
16 | futures==3.0.5
17 | html5lib==0.999999999
18 | imagesize==0.7.1
19 | ipykernel==4.5.2
20 | ipyparallel==6.0.2
21 | ipython==5.3.0
22 | ipython-genutils==0.2.0
23 | ipywidgets==6.0.0
24 | Jinja2==2.9.5
25 | jsonschema==2.6.0
26 | jupyter-client==5.0.0
27 | jupyter-core==4.3.0
28 | MarkupSafe==1.0
29 | matplotlib==2.0.0
30 | mistune==0.7.4
31 | nbconvert==5.1.1
32 | nbformat==4.3.0
33 | nose==1.3.7
34 | notebook==4.4.1
35 | numpy==1.12.1
36 | olefile==0.44
37 | packaging==16.8
38 | pandocfilters==1.4.1
39 | pathlib2==2.2.1
40 | pexpect==4.2.1
41 | pickleshare==0.7.4
42 | Pillow==4.0.0
43 | prompt-toolkit==1.0.14
44 | ptyprocess==0.5.1
45 | Pygments==2.2.0
46 | pyparsing==2.2.0
47 | python-dateutil==2.6.0
48 | pytz==2016.10
49 | pyzmq==16.0.2
50 | qtconsole==4.2.1
51 | requests==2.13.0
52 | scandir==1.5
53 | scikit-learn==0.18.1
54 | scipy==0.19.0
55 | simplegeneric==0.8.1
56 | singledispatch==3.4.0.3
57 | six==1.10.0
58 | snowballstemmer==1.2.1
59 | Sphinx==1.5.3
60 | subprocess32==3.2.7
61 | terminado==0.6
62 | testpath==0.3
63 | tornado==4.4.2
64 | traitlets==4.3.2
65 | wcwidth==0.1.7
66 | webencodings==0.5
67 | widgetsnbextension==2.0.0
68 |
--------------------------------------------------------------------------------
/notebooks/.ipynb_checkpoints/ModelPersistence-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "## Model Persistence\n",
8 |     "\n",
9 |     "After training a scikit-learn model, it is desirable to have a way to persist the model for future use without having to retrain. The following section gives you an example of how to persist a model with pickle.
We’ll also review a few security and maintainability issues when working with pickle serialization.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": false 17 | }, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 23 | " decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n", 24 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 25 | " tol=0.001, verbose=False)" 26 | ] 27 | }, 28 | "execution_count": 1, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "from sklearn import svm\n", 35 | "from sklearn import datasets\n", 36 | "clf = svm.SVC()\n", 37 | "iris = datasets.load_iris()\n", 38 | "X, y = iris.data, iris.target\n", 39 | "clf.fit(X, y) " 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "array([0])" 53 | ] 54 | }, 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "output_type": "execute_result" 58 | } 59 | ], 60 | "source": [ 61 | "import pickle\n", 62 | "s = pickle.dumps(clf)\n", 63 | "clf2 = pickle.loads(s)\n", 64 | "clf2.predict(X[0:1])\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "0" 78 | ] 79 | }, 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "y[0]" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "In the specific case of the scikit, it may be more interesting to use joblib’s replacement of pickle (joblib.dump & joblib.load), which is more efficient on objects that carry large numpy arrays internally as is often the case for fitted scikit-learn estimators, but can only pickle to the disk and not to a string:" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 4, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "['model.pkl']" 107 | ] 108 | }, 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "from sklearn.externals import joblib\n", 116 | "\n", 117 | "joblib.dump(clf, 'model.pkl') " 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "clf = joblib.load('model.pkl') " 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [] 139 | } 140 | ], 141 | "metadata": { 142 | "kernelspec": { 143 | "display_name": "Python 2", 144 | "language": "python", 145 | "name": "python2" 146 | }, 147 | "language_info": { 148 | "codemirror_mode": { 149 | "name": "ipython", 150 | "version": 2 151 | }, 152 | "file_extension": ".py", 153 | "mimetype": "text/x-python", 154 | "name": "python", 155 | "nbconvert_exporter": "python", 156 | "pygments_lexer": "ipython2", 157 | "version": "2.7.10" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 2 162 | } 163 | -------------------------------------------------------------------------------- 
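A note on the persistence example above: `sklearn.externals.joblib` has since been deprecated and removed from scikit-learn in favor of the standalone `joblib` package, pickled models are not guaranteed to load across scikit-learn versions, and neither `pickle.loads` nor `joblib.load` should ever be run on data from an untrusted source, because unpickling can execute arbitrary code. Below is a minimal sketch of the same workflow using the standalone `joblib` package (assumed to be installed alongside scikit-learn; this snippet is not part of the original notebooks).

import joblib  # standalone package; replaces the removed sklearn.externals.joblib
from sklearn import datasets, svm

# Fit the same classifier the notebook uses
X, y = datasets.load_iris(return_X_y=True)
clf = svm.SVC().fit(X, y)

# Persist to disk and reload; only load files you trust, since unpickling
# can execute arbitrary code, and reload under the same scikit-learn version.
joblib.dump(clf, 'model.pkl')
clf2 = joblib.load('model.pkl')
print(clf2.predict(X[:1]))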
/notebooks/ModelPersistence.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "## Model Persistence\n", 11 | "\n", 12 | "After training a scikit-learn model, it is desirable to have a way to persist the model for future use without having to retrain. The following section gives you an example of how to persist a model with pickle. We’ll also review a few security and maintainability issues when working with pickle serialization.\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": false, 20 | "deletable": true, 21 | "editable": true 22 | }, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 28 | " decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n", 29 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 30 | " tol=0.001, verbose=False)" 31 | ] 32 | }, 33 | "execution_count": 1, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "from sklearn import svm\n", 40 | "from sklearn import datasets\n", 41 | "clf = svm.SVC()\n", 42 | "iris = datasets.load_iris()\n", 43 | "X, y = iris.data, iris.target\n", 44 | "clf.fit(X, y) " 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": { 51 | "collapsed": false, 52 | "deletable": true, 53 | "editable": true 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "array([0])" 60 | ] 61 | }, 62 | "execution_count": 2, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "import pickle\n", 69 | "s = pickle.dumps(clf)\n", 70 | "clf2 = pickle.loads(s)\n", 71 | "clf2.predict(X[0:1])\n" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": { 78 | "collapsed": false, 79 | "deletable": true, 80 | "editable": true 81 | }, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "0" 87 | ] 88 | }, 89 | "execution_count": 3, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "y[0]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": { 101 | "deletable": true, 102 | "editable": true 103 | }, 104 | "source": [ 105 | "In the specific case of the scikit, it may be more interesting to use joblib’s replacement of pickle (joblib.dump & joblib.load), which is more efficient on objects that carry large numpy arrays internally as is often the case for fitted scikit-learn estimators, but can only pickle to the disk and not to a string:" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 4, 111 | "metadata": { 112 | "collapsed": false, 113 | "deletable": true, 114 | "editable": true 115 | }, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "['model.pkl']" 121 | ] 122 | }, 123 | "execution_count": 4, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "from sklearn.externals import joblib\n", 130 | "\n", 131 | "joblib.dump(clf, 'model.pkl') " 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 5, 137 | "metadata": { 138 | "collapsed": true, 139 | "deletable": true, 140 | "editable": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "clf = 
joblib.load('model.pkl') " 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": true, 152 | "deletable": true, 153 | "editable": true 154 | }, 155 | "outputs": [], 156 | "source": [] 157 | } 158 | ], 159 | "metadata": { 160 | "kernelspec": { 161 | "display_name": "Python 2", 162 | "language": "python", 163 | "name": "python2" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": { 167 | "name": "ipython", 168 | "version": 2 169 | }, 170 | "file_extension": ".py", 171 | "mimetype": "text/x-python", 172 | "name": "python", 173 | "nbconvert_exporter": "python", 174 | "pygments_lexer": "ipython2", 175 | "version": "2.7.10" 176 | } 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 2 180 | } 181 | -------------------------------------------------------------------------------- /notebooks/DensityEstimation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Density Estimation\n", 11 | "\n", 12 | "Density estimation walks the line between unsupervised learning, feature engineering, and data modeling. Some of the most popular and useful density estimation techniques are mixture models such as Gaussian Mixtures (sklearn.mixture.GaussianMixture), and neighbor-based approaches such as the kernel density estimate (sklearn.neighbors.KernelDensity). Gaussian Mixtures are discussed more fully in the context of clustering, because the technique is also useful as an unsupervised clustering scheme.\n", 13 | "\n", 14 | "Density estimation is a very simple concept, and most people are already familiar with one common density estimation technique: the histogram." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "deletable": true, 21 | "editable": true 22 | }, 23 | "source": [ 24 | "## Kernel Density Estimation\n", 25 | "\n", 26 | "Kernel density estimation in scikit-learn is implemented in the sklearn.neighbors.KernelDensity estimator, which uses the Ball Tree or KD Tree for efficient queries (see Nearest Neighbors for a discussion of these). Though the above example uses a 1D data set for simplicity, kernel density estimation can be performed in any number of dimensions, though in practice the curse of dimensionality causes its performance to degrade in high dimensions.\n", 27 | "\n", 28 | "The kernel density estimator can be used with any of the valid distance metrics (see sklearn.neighbors.DistanceMetric for a list of available metrics), though the results are properly normalized only for the Euclidean metric. One particularly useful metric is the Haversine distance which measures the angular distance between points on a sphere." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "from sklearn.neighbors.kde import KernelDensity\n", 40 | "\n", 41 | "KernelDensity?" 
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": { 48 | "collapsed": false, 49 | "deletable": true, 50 | "editable": true 51 | }, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "array([-10562.91076071])" 57 | ] 58 | }, 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "from sklearn.neighbors.kde import KernelDensity\n", 66 | "import numpy as np\n", 67 | "\n", 68 | "X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n", 69 | "kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(X)\n", 70 | "\n", 71 | "kde.score_samples([[32,4]])" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": { 78 | "collapsed": false, 79 | "deletable": true, 80 | "editable": true 81 | }, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "array([[ 2.21052437, 1.09216422]])" 87 | ] 88 | }, 89 | "execution_count": 4, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "kde.sample(1)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": { 102 | "collapsed": true, 103 | "deletable": true, 104 | "editable": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "from sklearn.datasets import load_iris\n", 109 | "\n", 110 | "X, y = load_iris(return_X_y=True)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 6, 116 | "metadata": { 117 | "collapsed": false, 118 | "deletable": true, 119 | "editable": true 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "[-3.8262878]\n", 127 | "[-8.13952384]\n", 128 | "[-12.91720053]\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "estimators = []\n", 134 | "for c in [0, 1, 2]:\n", 135 | " m = KernelDensity().fit(X[y == c])\n", 136 | " estimators.append(m)\n", 137 | " \n", 138 | "for estimator in estimators:\n", 139 | " print estimator.score_samples([X[0]])" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": true, 147 | "deletable": true, 148 | "editable": true 149 | }, 150 | "outputs": [], 151 | "source": [] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "collapsed": true, 158 | "deletable": true, 159 | "editable": true 160 | }, 161 | "outputs": [], 162 | "source": [] 163 | } 164 | ], 165 | "metadata": { 166 | "kernelspec": { 167 | "display_name": "Python 2", 168 | "language": "python", 169 | "name": "python2" 170 | }, 171 | "language_info": { 172 | "codemirror_mode": { 173 | "name": "ipython", 174 | "version": 2 175 | }, 176 | "file_extension": ".py", 177 | "mimetype": "text/x-python", 178 | "name": "python", 179 | "nbconvert_exporter": "python", 180 | "pygments_lexer": "ipython2", 181 | "version": "2.7.10" 182 | } 183 | }, 184 | "nbformat": 4, 185 | "nbformat_minor": 2 186 | } 187 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/Feature Transformation-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Transformation\n", 8 | "\n", 9 | "I am going to show off only two parts of the massive quantity of code in the unsupervised learning section of sklearn. 
And they can be put into this single bucket:\n", 10 | "\n", 11 | "* Feature Transformation\n", 12 | "* Exploratory Data Analysis\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "## Clustering\n", 20 | "\n", 21 | "Clustering of unlabeled data can be performed with the module sklearn.cluster.\n", 22 | "Each clustering algorithm comes in two variants: a class, that implements the fit method to learn the clusters on train data, and a function, that, given train data, returns an array of integer labels corresponding to the different clusters. For the class, the labels over the training data can be found in the labels_ attribute." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "#### Kmeans\n", 30 | "\n", 31 | "The KMeans algorithm clusters data by trying to separate samples in n groups of equal variance, minimizing a criterion known as the inertia or within-cluster sum-of-squares. This algorithm requires the number of clusters to be specified. It scales well to large number of samples and has been used across a large range of application areas in many different fields.\n", 32 | "\n", 33 | "Let's check out how it is used" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 24, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "from sklearn.cluster import KMeans\n", 45 | "\n", 46 | "KMeans?" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 12, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "from sklearn.datasets import load_iris\n", 58 | "\n", 59 | "X, y = load_iris(return_X_y=True)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 13, 65 | "metadata": { 66 | "collapsed": true 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "cluster = KMeans(n_clusters=3)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 14, 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,\n", 84 | " n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',\n", 85 | " random_state=None, tol=0.0001, verbose=0)" 86 | ] 87 | }, 88 | "execution_count": 14, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "cluster.fit(X)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 16, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 108 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 109 | " 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 110 | " 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 111 | " 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1,\n", 112 | " 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,\n", 113 | " 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1], dtype=int32)" 114 | ] 115 | }, 116 | "execution_count": 16, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "cluster.predict(X)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 21, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | 
"source": [ 133 | "from sklearn.tree import DecisionTreeClassifier\n", 134 | "\n", 135 | "m = DecisionTreeClassifier(max_depth=2)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 22, 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,\n", 149 | " max_features=None, max_leaf_nodes=None,\n", 150 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 151 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 152 | " presort=False, random_state=None, splitter='best')" 153 | ] 154 | }, 155 | "execution_count": 22, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "m.fit(cluster.predict(X)[:, None], y)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 23, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "0.89333333333333331" 175 | ] 176 | }, 177 | "execution_count": 23, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "m.score(cluster.predict(X)[:, None], y)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Principal component analysis (PCA)\n", 191 | "\n", 192 | "PCA is used to decompose a multivariate dataset in a set of successive orthogonal components that explain a maximum amount of the variance. In scikit-learn, PCA is implemented as a transformer object that learns n components in its fit method, and can be used on new data to project it on these components.\n", 193 | "\n", 194 | "The optional parameter whiten=True makes it possible to project the data onto the singular space while scaling each component to unit variance. This is often useful if the models down-stream make strong assumptions on the isotropy of the signal: this is for example the case for Support Vector Machines with the RBF kernel and the K-Means clustering algorithm." 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 25, 200 | "metadata": { 201 | "collapsed": true 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "from sklearn.decomposition import PCA\n", 206 | "\n", 207 | "PCA?" 
208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 29, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "(150, 2)" 221 | ] 222 | }, 223 | "execution_count": 29, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "from sklearn.svm import SVC\n", 230 | "\n", 231 | "pca = PCA(n_components=2)\n", 232 | "\n", 233 | "X_pca = pca.fit_transform(X)\n", 234 | "\n", 235 | "X_pca.shape" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 28, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [ 245 | { 246 | "data": { 247 | "text/plain": [ 248 | "0.95333333333333337" 249 | ] 250 | }, 251 | "execution_count": 28, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "SVC().fit(X_pca, y).score(X_pca, y)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [] 268 | } 269 | ], 270 | "metadata": { 271 | "kernelspec": { 272 | "display_name": "Python 2", 273 | "language": "python", 274 | "name": "python2" 275 | }, 276 | "language_info": { 277 | "codemirror_mode": { 278 | "name": "ipython", 279 | "version": 2 280 | }, 281 | "file_extension": ".py", 282 | "mimetype": "text/x-python", 283 | "name": "python", 284 | "nbconvert_exporter": "python", 285 | "pygments_lexer": "ipython2", 286 | "version": "2.7.10" 287 | } 288 | }, 289 | "nbformat": 4, 290 | "nbformat_minor": 2 291 | } 292 | -------------------------------------------------------------------------------- /notebooks/NovelyDetection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Novely and Outlier Detection\n", 8 | "\n", 9 | "Many applications require being able to decide whether a new observation belongs to the same distribution as existing observations (it is an inlier), or should be considered as different (it is an outlier). Often, this ability is used to clean real data sets. Two important distinction must be made:\n", 10 | "\n", 11 | "* novelty detection:\n", 12 | " \tThe training data is not polluted by outliers, and we are interested in detecting anomalies in new observations.\n", 13 | "* outlier detection:\n", 14 | " \tThe training data contains outliers, and we need to fit the central mode of the training data, ignoring the deviant observations.\n", 15 | "\n", 16 | "The scikit-learn project provides a set of machine learning tools that can be used both for novelty or outliers detection. This strategy is implemented with objects learning in an unsupervised way from the data:\n", 17 | "\n", 18 | "`estimator.fit(X_train)`\n", 19 | "\n", 20 | "new observations can then be sorted as inliers or outliers with a predict method:\n", 21 | "\n", 22 | "`estimator.predict(X_test)`\n", 23 | "\n", 24 | "Inliers are labeled 1, while outliers are labeled -1." 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Novelty Detection\n", 32 | "\n", 33 | "Consider a data set of n observations from the same distribution described by p features. Consider now that we add one more observation to that data set. Is the new observation so different from the others that we can doubt it is regular? (i.e. 
does it come from the same distribution?) Or on the contrary, is it so similar to the other that we cannot distinguish it from the original observations? This is the question addressed by the novelty detection tools and methods.\n", 34 | "\n", 35 | "In general, it is about to learn a rough, close frontier delimiting the contour of the initial observations distribution, plotted in embedding p-dimensional space. Then, if further observations lay within the frontier-delimited subspace, they are considered as coming from the same population than the initial observations. Otherwise, if they lay outside the frontier, we can say that they are abnormal with a given confidence in our assessment.\n", 36 | "\n", 37 | "The One-Class SVM has been introduced by Schölkopf et al. for that purpose and implemented in the Support Vector Machines module in the svm.OneClassSVM object. It requires the choice of a kernel and a scalar parameter to define a frontier. The RBF kernel is usually chosen although there exists no exact formula or algorithm to set its bandwidth parameter. This is the default in the scikit-learn implementation. The \\nu parameter, also known as the margin of the One-Class SVM, corresponds to the probability of finding a new, but regular, observation outside the frontier." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 1, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "from sklearn.svm import OneClassSVM\n", 49 | "\n", 50 | "OneClassSVM?" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 17, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "import numpy as np\n", 62 | "\n", 63 | "X = 0.3 * np.random.randn(100, 2)\n", 64 | "X_train = np.r_[X + 2, X - 2]\n", 65 | "# Generate some regular novel observations\n", 66 | "X = 0.3 * np.random.randn(20, 2)\n", 67 | "X_test = np.r_[X + 2, X - 2]\n", 68 | "# Generate some abnormal novel observations\n", 69 | "X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 27, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "clf = OneClassSVM(nu=0.1, kernel=\"rbf\", gamma=0.1)\n", 81 | "clf.fit(X_train)\n", 82 | "y_pred_train = clf.predict(X_train)\n", 83 | "y_pred_test = clf.predict(X_test)\n", 84 | "y_pred_outliers = clf.predict(X_outliers)\n", 85 | "n_error_train = y_pred_train[y_pred_train == -1].size\n", 86 | "n_error_test = y_pred_test[y_pred_test == -1].size\n", 87 | "n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 29, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "(20, 6, 2)" 101 | ] 102 | }, 103 | "execution_count": 29, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "n_error_train, n_error_test, n_error_outliers" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "## Outlier Detection\n", 117 | "\n", 118 | "Outlier detection is similar to novelty detection in the sense that the goal is to separate a core of regular observations from some polluting ones, called “outliers”. Yet, in the case of outlier detection, we don’t have a clean data set representing the population of regular observations that can be used to train any tool." 
119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "#### Isolation Forest\n", 126 | "\n", 127 | "One efficient way of performing outlier detection in high-dimensional datasets is to use random forests. The ensemble.IsolationForest ‘isolates’ observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature.\n", 128 | "\n", 129 | "Since recursive partitioning can be represented by a tree structure, the number of splittings required to isolate a sample is equivalent to the path length from the root node to the terminating node.\n", 130 | "This path length, averaged over a forest of such random trees, is a measure of normality and our decision function.\n", 131 | "\n", 132 | "Random partitioning produces noticeably shorter paths for anomalies. Hence, when a forest of random trees collectively produce shorter path lengths for particular samples, they are highly likely to be anomalies.\n" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 30, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "from sklearn.ensemble import IsolationForest\n", 144 | "\n", 145 | "rng = np.random.RandomState(42)\n", 146 | "\n", 147 | "# Generate train data\n", 148 | "X = 0.3 * rng.randn(100, 2)\n", 149 | "X_train = np.r_[X + 2, X - 2]\n", 150 | "# Generate some regular novel observations\n", 151 | "X = 0.3 * rng.randn(20, 2)\n", 152 | "X_test = np.r_[X + 2, X - 2]\n", 153 | "# Generate some abnormal novel observations\n", 154 | "X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))\n", 155 | "\n", 156 | "# fit the model\n", 157 | "clf = IsolationForest(max_samples=100, random_state=rng)\n", 158 | "clf.fit(X_train)\n", 159 | "y_pred_train = clf.predict(X_train)\n", 160 | "y_pred_test = clf.predict(X_test)\n", 161 | "y_pred_outliers = clf.predict(X_outliers)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 33, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "0" 175 | ] 176 | }, 177 | "execution_count": 33, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "y_pred_outliers[y_pred_outliers == 1].size" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": true 191 | }, 192 | "outputs": [], 193 | "source": [] 194 | } 195 | ], 196 | "metadata": { 197 | "kernelspec": { 198 | "display_name": "Python 2", 199 | "language": "python", 200 | "name": "python2" 201 | }, 202 | "language_info": { 203 | "codemirror_mode": { 204 | "name": "ipython", 205 | "version": 2 206 | }, 207 | "file_extension": ".py", 208 | "mimetype": "text/x-python", 209 | "name": "python", 210 | "nbconvert_exporter": "python", 211 | "pygments_lexer": "ipython2", 212 | "version": "2.7.10" 213 | } 214 | }, 215 | "nbformat": 4, 216 | "nbformat_minor": 2 217 | } 218 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/NovelyDetection-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Novely and Outlier Detection\n", 8 | "\n", 9 | "Many applications require being able to decide whether a new observation belongs to the same distribution 
as existing observations (it is an inlier), or should be considered as different (it is an outlier). Often, this ability is used to clean real data sets. Two important distinction must be made:\n", 10 | "\n", 11 | "* novelty detection:\n", 12 | " \tThe training data is not polluted by outliers, and we are interested in detecting anomalies in new observations.\n", 13 | "* outlier detection:\n", 14 | " \tThe training data contains outliers, and we need to fit the central mode of the training data, ignoring the deviant observations.\n", 15 | "\n", 16 | "The scikit-learn project provides a set of machine learning tools that can be used both for novelty or outliers detection. This strategy is implemented with objects learning in an unsupervised way from the data:\n", 17 | "\n", 18 | "`estimator.fit(X_train)`\n", 19 | "\n", 20 | "new observations can then be sorted as inliers or outliers with a predict method:\n", 21 | "\n", 22 | "`estimator.predict(X_test)`\n", 23 | "\n", 24 | "Inliers are labeled 1, while outliers are labeled -1." 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Novelty Detection\n", 32 | "\n", 33 | "Consider a data set of n observations from the same distribution described by p features. Consider now that we add one more observation to that data set. Is the new observation so different from the others that we can doubt it is regular? (i.e. does it come from the same distribution?) Or on the contrary, is it so similar to the other that we cannot distinguish it from the original observations? This is the question addressed by the novelty detection tools and methods.\n", 34 | "\n", 35 | "In general, it is about to learn a rough, close frontier delimiting the contour of the initial observations distribution, plotted in embedding p-dimensional space. Then, if further observations lay within the frontier-delimited subspace, they are considered as coming from the same population than the initial observations. Otherwise, if they lay outside the frontier, we can say that they are abnormal with a given confidence in our assessment.\n", 36 | "\n", 37 | "The One-Class SVM has been introduced by Schölkopf et al. for that purpose and implemented in the Support Vector Machines module in the svm.OneClassSVM object. It requires the choice of a kernel and a scalar parameter to define a frontier. The RBF kernel is usually chosen although there exists no exact formula or algorithm to set its bandwidth parameter. This is the default in the scikit-learn implementation. The \\nu parameter, also known as the margin of the One-Class SVM, corresponds to the probability of finding a new, but regular, observation outside the frontier." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 1, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "from sklearn.svm import OneClassSVM\n", 49 | "\n", 50 | "OneClassSVM?" 
51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 17, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "import numpy as np\n", 62 | "\n", 63 | "X = 0.3 * np.random.randn(100, 2)\n", 64 | "X_train = np.r_[X + 2, X - 2]\n", 65 | "# Generate some regular novel observations\n", 66 | "X = 0.3 * np.random.randn(20, 2)\n", 67 | "X_test = np.r_[X + 2, X - 2]\n", 68 | "# Generate some abnormal novel observations\n", 69 | "X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 27, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "clf = OneClassSVM(nu=0.1, kernel=\"rbf\", gamma=0.1)\n", 81 | "clf.fit(X_train)\n", 82 | "y_pred_train = clf.predict(X_train)\n", 83 | "y_pred_test = clf.predict(X_test)\n", 84 | "y_pred_outliers = clf.predict(X_outliers)\n", 85 | "n_error_train = y_pred_train[y_pred_train == -1].size\n", 86 | "n_error_test = y_pred_test[y_pred_test == -1].size\n", 87 | "n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 29, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "(20, 6, 2)" 101 | ] 102 | }, 103 | "execution_count": 29, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "n_error_train, n_error_test, n_error_outliers" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "## Outlier Detection\n", 117 | "\n", 118 | "Outlier detection is similar to novelty detection in the sense that the goal is to separate a core of regular observations from some polluting ones, called “outliers”. Yet, in the case of outlier detection, we don’t have a clean data set representing the population of regular observations that can be used to train any tool." 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "#### Isolation Forest\n", 126 | "\n", 127 | "One efficient way of performing outlier detection in high-dimensional datasets is to use random forests. The ensemble.IsolationForest ‘isolates’ observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature.\n", 128 | "\n", 129 | "Since recursive partitioning can be represented by a tree structure, the number of splittings required to isolate a sample is equivalent to the path length from the root node to the terminating node.\n", 130 | "This path length, averaged over a forest of such random trees, is a measure of normality and our decision function.\n", 131 | "\n", 132 | "Random partitioning produces noticeably shorter paths for anomalies. 
Hence, when a forest of random trees collectively produce shorter path lengths for particular samples, they are highly likely to be anomalies.\n" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 30, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "from sklearn.ensemble import IsolationForest\n", 144 | "\n", 145 | "rng = np.random.RandomState(42)\n", 146 | "\n", 147 | "# Generate train data\n", 148 | "X = 0.3 * rng.randn(100, 2)\n", 149 | "X_train = np.r_[X + 2, X - 2]\n", 150 | "# Generate some regular novel observations\n", 151 | "X = 0.3 * rng.randn(20, 2)\n", 152 | "X_test = np.r_[X + 2, X - 2]\n", 153 | "# Generate some abnormal novel observations\n", 154 | "X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))\n", 155 | "\n", 156 | "# fit the model\n", 157 | "clf = IsolationForest(max_samples=100, random_state=rng)\n", 158 | "clf.fit(X_train)\n", 159 | "y_pred_train = clf.predict(X_train)\n", 160 | "y_pred_test = clf.predict(X_test)\n", 161 | "y_pred_outliers = clf.predict(X_outliers)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 33, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "0" 175 | ] 176 | }, 177 | "execution_count": 33, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "y_pred_outliers[y_pred_outliers == 1].size" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": true 191 | }, 192 | "outputs": [], 193 | "source": [] 194 | } 195 | ], 196 | "metadata": { 197 | "kernelspec": { 198 | "display_name": "Python 2", 199 | "language": "python", 200 | "name": "python2" 201 | }, 202 | "language_info": { 203 | "codemirror_mode": { 204 | "name": "ipython", 205 | "version": 2 206 | }, 207 | "file_extension": ".py", 208 | "mimetype": "text/x-python", 209 | "name": "python", 210 | "nbconvert_exporter": "python", 211 | "pygments_lexer": "ipython2", 212 | "version": "2.7.10" 213 | } 214 | }, 215 | "nbformat": 4, 216 | "nbformat_minor": 2 217 | } 218 | -------------------------------------------------------------------------------- /notebooks/FeatureTransformation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Feature Transformation\n", 11 | "\n", 12 | "I am going to show off only two parts of the massive quantity of code in the unsupervised learning section of sklearn. And they can be put into this single bucket:\n", 13 | "\n", 14 | "* Feature Transformation\n", 15 | "* Exploratory Data Analysis\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "deletable": true, 22 | "editable": true 23 | }, 24 | "source": [ 25 | "## Clustering\n", 26 | "\n", 27 | "Clustering of unlabeled data can be performed with the module sklearn.cluster.\n", 28 | "Each clustering algorithm comes in two variants: a class, that implements the fit method to learn the clusters on train data, and a function, that, given train data, returns an array of integer labels corresponding to the different clusters. For the class, the labels over the training data can be found in the labels_ attribute." 
29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "deletable": true, 35 | "editable": true 36 | }, 37 | "source": [ 38 | "#### Kmeans\n", 39 | "\n", 40 | "The KMeans algorithm clusters data by trying to separate samples in n groups of equal variance, minimizing a criterion known as the inertia or within-cluster sum-of-squares. This algorithm requires the number of clusters to be specified. It scales well to large number of samples and has been used across a large range of application areas in many different fields.\n", 41 | "\n", 42 | "Let's check out how it is used" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 1, 48 | "metadata": { 49 | "collapsed": true, 50 | "deletable": true, 51 | "editable": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "from sklearn.cluster import KMeans\n", 56 | "\n", 57 | "KMeans?" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": { 64 | "collapsed": true, 65 | "deletable": true, 66 | "editable": true 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "from sklearn.datasets import load_iris\n", 71 | "\n", 72 | "X, y = load_iris(return_X_y=True)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 3, 78 | "metadata": { 79 | "collapsed": true, 80 | "deletable": true, 81 | "editable": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "cluster = KMeans(n_clusters=3)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": { 92 | "collapsed": false, 93 | "deletable": true, 94 | "editable": true 95 | }, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,\n", 101 | " n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',\n", 102 | " random_state=None, tol=0.0001, verbose=0)" 103 | ] 104 | }, 105 | "execution_count": 4, 106 | "metadata": {}, 107 | "output_type": "execute_result" 108 | } 109 | ], 110 | "source": [ 111 | "cluster.fit(X)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 5, 117 | "metadata": { 118 | "collapsed": false, 119 | "deletable": true, 120 | "editable": true 121 | }, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 127 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 128 | " 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 129 | " 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 130 | " 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1,\n", 131 | " 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,\n", 132 | " 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1], dtype=int32)" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "cluster.predict(X)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 6, 147 | "metadata": { 148 | "collapsed": false, 149 | "deletable": true, 150 | "editable": true 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "from sklearn.tree import DecisionTreeClassifier\n", 155 | "\n", 156 | "m = DecisionTreeClassifier(max_depth=2)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 7, 162 | "metadata": { 163 | "collapsed": false, 164 | "deletable": true, 165 | "editable": true 166 | }, 167 | "outputs": [ 168 
| { 169 | "data": { 170 | "text/plain": [ 171 | "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,\n", 172 | " max_features=None, max_leaf_nodes=None,\n", 173 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 174 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 175 | " presort=False, random_state=None, splitter='best')" 176 | ] 177 | }, 178 | "execution_count": 7, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "m.fit(cluster.predict(X)[:, None], y)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 8, 190 | "metadata": { 191 | "collapsed": false, 192 | "deletable": true, 193 | "editable": true 194 | }, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "0.89333333333333331" 200 | ] 201 | }, 202 | "execution_count": 8, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "m.score(cluster.predict(X)[:, None], y)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": { 214 | "deletable": true, 215 | "editable": true 216 | }, 217 | "source": [ 218 | "## Principal component analysis (PCA)\n", 219 | "\n", 220 | "PCA is used to decompose a multivariate dataset in a set of successive orthogonal components that explain a maximum amount of the variance. In scikit-learn, PCA is implemented as a transformer object that learns n components in its fit method, and can be used on new data to project it on these components.\n", 221 | "\n", 222 | "The optional parameter whiten=True makes it possible to project the data onto the singular space while scaling each component to unit variance. This is often useful if the models down-stream make strong assumptions on the isotropy of the signal: this is for example the case for Support Vector Machines with the RBF kernel and the K-Means clustering algorithm." 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 9, 228 | "metadata": { 229 | "collapsed": true, 230 | "deletable": true, 231 | "editable": true 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "from sklearn.decomposition import PCA\n", 236 | "\n", 237 | "PCA?" 
238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 10, 243 | "metadata": { 244 | "collapsed": false, 245 | "deletable": true, 246 | "editable": true 247 | }, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "(150, 2)" 253 | ] 254 | }, 255 | "execution_count": 10, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "from sklearn.svm import SVC\n", 262 | "\n", 263 | "pca = PCA(n_components=2)\n", 264 | "\n", 265 | "X_pca = pca.fit_transform(X)\n", 266 | "\n", 267 | "X_pca.shape" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 11, 273 | "metadata": { 274 | "collapsed": false, 275 | "deletable": true, 276 | "editable": true 277 | }, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "0.95333333333333337" 283 | ] 284 | }, 285 | "execution_count": 11, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "SVC().fit(X_pca, y).score(X_pca, y)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "collapsed": true, 299 | "deletable": true, 300 | "editable": true 301 | }, 302 | "outputs": [], 303 | "source": [] 304 | } 305 | ], 306 | "metadata": { 307 | "kernelspec": { 308 | "display_name": "Python 2", 309 | "language": "python", 310 | "name": "python2" 311 | }, 312 | "language_info": { 313 | "codemirror_mode": { 314 | "name": "ipython", 315 | "version": 2 316 | }, 317 | "file_extension": ".py", 318 | "mimetype": "text/x-python", 319 | "name": "python", 320 | "nbconvert_exporter": "python", 321 | "pygments_lexer": "ipython2", 322 | "version": "2.7.10" 323 | } 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 2 327 | } 328 | -------------------------------------------------------------------------------- /notebooks/PipelinesAndFeatureUnions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "## Pipelines\n", 11 | "\n", 12 | "Pipeline can be used to chain multiple estimators into one. This is useful as there is often a fixed sequence of steps in processing the data, for example feature selection, normalization and classification. Pipeline serves two purposes here:\n", 13 | "\n", 14 | "* Convenience: You only have to call fit and predict once on your data to fit a whole sequence of estimators.\n", 15 | "* Joint parameter selection: You can grid search over parameters of all estimators in the pipeline at once.\n", 16 | "\n", 17 | "All estimators in a pipeline, except the last one, must be transformers (i.e. must have a transform method). The last estimator may be any type (transformer, classifier, etc.).\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": { 24 | "collapsed": true, 25 | "deletable": true, 26 | "editable": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "from sklearn.pipeline import Pipeline\n", 31 | "\n", 32 | "Pipeline?" 
33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "collapsed": false, 40 | "deletable": true, 41 | "editable": true 42 | }, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 48 | " svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 49 | " decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n", 50 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 51 | " tol=0.001, verbose=False))])" 52 | ] 53 | }, 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "from sklearn.svm import SVC\n", 61 | "from sklearn.decomposition import PCA\n", 62 | "estimators = [('reduce_dim', PCA(n_components=2)), ('clf', SVC())]\n", 63 | "pipe = Pipeline(estimators)\n", 64 | "pipe \n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": true, 72 | "deletable": true, 73 | "editable": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "from sklearn.datasets import load_iris\n", 78 | "\n", 79 | "X, y = load_iris(return_X_y=True)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": { 86 | "collapsed": false, 87 | "deletable": true, 88 | "editable": true 89 | }, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/plain": [ 94 | "0.95333333333333337" 95 | ] 96 | }, 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "# Notice no need to PCA the Xs in the score!\n", 104 | "pipe.fit(X, y).score(X, y)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": { 110 | "deletable": true, 111 | "editable": true 112 | }, 113 | "source": [ 114 | "The utility function make_pipeline is a shorthand for constructing pipelines; it takes a variable number of estimators and returns a pipeline, filling in the names automatically:" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 5, 120 | "metadata": { 121 | "collapsed": false, 122 | "deletable": true, 123 | "editable": true 124 | }, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "Pipeline(steps=[('binarizer', Binarizer(copy=True, threshold=0.0)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])" 130 | ] 131 | }, 132 | "execution_count": 5, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "from sklearn.pipeline import make_pipeline\n", 139 | "from sklearn.naive_bayes import MultinomialNB\n", 140 | "from sklearn.preprocessing import Binarizer\n", 141 | "make_pipeline(Binarizer(), MultinomialNB()) \n" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 6, 147 | "metadata": { 148 | "collapsed": false, 149 | "deletable": true, 150 | "editable": true 151 | }, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "('reduce_dim',\n", 157 | " PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 158 | " svd_solver='auto', tol=0.0, whiten=False))" 159 | ] 160 | }, 161 | "execution_count": 6, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "pipe.steps[0]" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | 
"execution_count": 7, 173 | "metadata": { 174 | "collapsed": false, 175 | "deletable": true, 176 | "editable": true 177 | }, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 183 | " svd_solver='auto', tol=0.0, whiten=False)" 184 | ] 185 | }, 186 | "execution_count": 7, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "pipe.named_steps['reduce_dim']" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 8, 198 | "metadata": { 199 | "collapsed": false, 200 | "deletable": true, 201 | "editable": true 202 | }, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 208 | " svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,\n", 209 | " decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n", 210 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 211 | " tol=0.001, verbose=False))])" 212 | ] 213 | }, 214 | "execution_count": 8, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "pipe.set_params(clf__C=10) " 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 9, 226 | "metadata": { 227 | "collapsed": true, 228 | "deletable": true, 229 | "editable": true 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "from sklearn.model_selection import GridSearchCV\n", 234 | "params = dict(reduce_dim__n_components=[2, 5, 10],\n", 235 | " clf__C=[0.1, 10, 100])\n", 236 | "grid_search = GridSearchCV(pipe, param_grid=params)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 10, 242 | "metadata": { 243 | "collapsed": true, 244 | "deletable": true, 245 | "editable": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "from sklearn.linear_model import LogisticRegression\n", 250 | "params = dict(reduce_dim=[None, PCA(5), PCA(10)],\n", 251 | " clf=[SVC(), LogisticRegression()],\n", 252 | " clf__C=[0.1, 10, 100])\n", 253 | "grid_search = GridSearchCV(pipe, param_grid=params)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "deletable": true, 260 | "editable": true 261 | }, 262 | "source": [ 263 | "## Feature Union\n", 264 | "\n", 265 | "FeatureUnion combines several transformer objects into a new transformer that combines their output. A FeatureUnion takes a list of transformer objects. During fitting, each of these is fit to the data independently. For transforming data, the transformers are applied in parallel, and the sample vectors they output are concatenated end-to-end into larger vectors.\n", 266 | "\n", 267 | "FeatureUnion serves the same purposes as Pipeline - convenience and joint parameter estimation and validation.\n", 268 | "\n", 269 | "FeatureUnion and Pipeline can be combined to create complex models.\n", 270 | "\n", 271 | "(A FeatureUnion has no way of checking whether two transformers might produce identical features. 
It only produces a union when the feature sets are disjoint, and making sure they are the caller’s responsibility.)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": { 278 | "collapsed": false, 279 | "deletable": true, 280 | "editable": true 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "from sklearn.pipeline import FeatureUnion\n", 285 | "from sklearn.decomposition import PCA\n", 286 | "from sklearn.decomposition import KernelPCA\n", 287 | "estimators = [('linear_pca', PCA()), ('kernel_pca', KernelPCA())]\n", 288 | "combined = FeatureUnion(estimators)\n", 289 | "combined \n" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": { 296 | "collapsed": false, 297 | "deletable": true, 298 | "editable": true 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "combined.fit_transform(X).shape" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": { 309 | "collapsed": false, 310 | "deletable": true, 311 | "editable": true 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "combined.set_params(kernel_pca=None) " 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": false, 323 | "deletable": true, 324 | "editable": true 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "combined.fit_transform(X).shape" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": { 335 | "collapsed": true, 336 | "deletable": true, 337 | "editable": true 338 | }, 339 | "outputs": [], 340 | "source": [] 341 | } 342 | ], 343 | "metadata": { 344 | "kernelspec": { 345 | "display_name": "Python 2", 346 | "language": "python", 347 | "name": "python2" 348 | }, 349 | "language_info": { 350 | "codemirror_mode": { 351 | "name": "ipython", 352 | "version": 2 353 | }, 354 | "file_extension": ".py", 355 | "mimetype": "text/x-python", 356 | "name": "python", 357 | "nbconvert_exporter": "python", 358 | "pygments_lexer": "ipython2", 359 | "version": "2.7.10" 360 | } 361 | }, 362 | "nbformat": 4, 363 | "nbformat_minor": 2 364 | } 365 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/PipelinesAndFeatureUnions-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Pipelines\n", 8 | "\n", 9 | "Pipeline can be used to chain multiple estimators into one. This is useful as there is often a fixed sequence of steps in processing the data, for example feature selection, normalization and classification. Pipeline serves two purposes here:\n", 10 | "\n", 11 | "* Convenience: You only have to call fit and predict once on your data to fit a whole sequence of estimators.\n", 12 | "* Joint parameter selection: You can grid search over parameters of all estimators in the pipeline at once.\n", 13 | "\n", 14 | "All estimators in a pipeline, except the last one, must be transformers (i.e. must have a transform method). The last estimator may be any type (transformer, classifier, etc.).\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from sklearn.pipeline import Pipeline\n", 26 | "\n", 27 | "Pipeline?" 
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 41 | " svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 42 | " decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n", 43 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 44 | " tol=0.001, verbose=False))])" 45 | ] 46 | }, 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "from sklearn.svm import SVC\n", 54 | "from sklearn.decomposition import PCA\n", 55 | "estimators = [('reduce_dim', PCA(n_components=2)), ('clf', SVC())]\n", 56 | "pipe = Pipeline(estimators)\n", 57 | "pipe \n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 11, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "from sklearn.datasets import load_iris\n", 69 | "\n", 70 | "X, y = load_iris(return_X_y=True)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 12, 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "0.97333333333333338" 84 | ] 85 | }, 86 | "execution_count": 12, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "# Notice no need to PCA the Xs in the score!\n", 93 | "pipe.fit(X, y).score(X, y)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "The utility function make_pipeline is a shorthand for constructing pipelines; it takes a variable number of estimators and returns a pipeline, filling in the names automatically:" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 4, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "Pipeline(steps=[('binarizer', Binarizer(copy=True, threshold=0.0)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])" 114 | ] 115 | }, 116 | "execution_count": 4, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "from sklearn.pipeline import make_pipeline\n", 123 | "from sklearn.naive_bayes import MultinomialNB\n", 124 | "from sklearn.preprocessing import Binarizer\n", 125 | "make_pipeline(Binarizer(), MultinomialNB()) \n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 5, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "('reduce_dim',\n", 139 | " PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 140 | " svd_solver='auto', tol=0.0, whiten=False))" 141 | ] 142 | }, 143 | "execution_count": 5, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "pipe.steps[0]" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 163 | " svd_solver='auto', tol=0.0, whiten=False)" 164 | ] 165 | }, 166 | 
"execution_count": 6, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "pipe.named_steps['reduce_dim']" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 7, 178 | "metadata": { 179 | "collapsed": false 180 | }, 181 | "outputs": [ 182 | { 183 | "data": { 184 | "text/plain": [ 185 | "Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 186 | " svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,\n", 187 | " decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n", 188 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 189 | " tol=0.001, verbose=False))])" 190 | ] 191 | }, 192 | "execution_count": 7, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "pipe.set_params(clf__C=10) " 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 8, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "from sklearn.model_selection import GridSearchCV\n", 210 | "params = dict(reduce_dim__n_components=[2, 5, 10],\n", 211 | " clf__C=[0.1, 10, 100])\n", 212 | "grid_search = GridSearchCV(pipe, param_grid=params)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 9, 218 | "metadata": { 219 | "collapsed": true 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "from sklearn.linear_model import LogisticRegression\n", 224 | "params = dict(reduce_dim=[None, PCA(5), PCA(10)],\n", 225 | " clf=[SVC(), LogisticRegression()],\n", 226 | " clf__C=[0.1, 10, 100])\n", 227 | "grid_search = GridSearchCV(pipe, param_grid=params)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "## Feature Union\n", 235 | "\n", 236 | "FeatureUnion combines several transformer objects into a new transformer that combines their output. A FeatureUnion takes a list of transformer objects. During fitting, each of these is fit to the data independently. For transforming data, the transformers are applied in parallel, and the sample vectors they output are concatenated end-to-end into larger vectors.\n", 237 | "\n", 238 | "FeatureUnion serves the same purposes as Pipeline - convenience and joint parameter estimation and validation.\n", 239 | "\n", 240 | "FeatureUnion and Pipeline can be combined to create complex models.\n", 241 | "\n", 242 | "(A FeatureUnion has no way of checking whether two transformers might produce identical features. 
It only produces a union when the feature sets are disjoint, and making sure they are the caller’s responsibility.)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 17, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "FeatureUnion(n_jobs=1,\n", 256 | " transformer_list=[('linear_pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,\n", 257 | " svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',\n", 258 | " fit_inverse_transform=False, gamma=None, kernel='linear',\n", 259 | " kernel_params=None, max_iter=None, n_components=None, n_jobs=1,\n", 260 | " random_state=None, remove_zero_eig=False, tol=0))],\n", 261 | " transformer_weights=None)" 262 | ] 263 | }, 264 | "execution_count": 17, 265 | "metadata": {}, 266 | "output_type": "execute_result" 267 | } 268 | ], 269 | "source": [ 270 | "from sklearn.pipeline import FeatureUnion\n", 271 | "from sklearn.decomposition import PCA\n", 272 | "from sklearn.decomposition import KernelPCA\n", 273 | "estimators = [('linear_pca', PCA()), ('kernel_pca', KernelPCA())]\n", 274 | "combined = FeatureUnion(estimators)\n", 275 | "combined \n" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 18, 281 | "metadata": { 282 | "collapsed": false 283 | }, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "(150, 78)" 289 | ] 290 | }, 291 | "execution_count": 18, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "combined.fit_transform(X).shape" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 19, 303 | "metadata": { 304 | "collapsed": false 305 | }, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "FeatureUnion(n_jobs=1,\n", 311 | " transformer_list=[('linear_pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,\n", 312 | " svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', None)],\n", 313 | " transformer_weights=None)" 314 | ] 315 | }, 316 | "execution_count": 19, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "combined.set_params(kernel_pca=None) " 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 20, 328 | "metadata": { 329 | "collapsed": false 330 | }, 331 | "outputs": [ 332 | { 333 | "data": { 334 | "text/plain": [ 335 | "(150, 4)" 336 | ] 337 | }, 338 | "execution_count": 20, 339 | "metadata": {}, 340 | "output_type": "execute_result" 341 | } 342 | ], 343 | "source": [ 344 | "combined.fit_transform(X).shape" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": { 351 | "collapsed": true 352 | }, 353 | "outputs": [], 354 | "source": [] 355 | } 356 | ], 357 | "metadata": { 358 | "kernelspec": { 359 | "display_name": "Python 2", 360 | "language": "python", 361 | "name": "python2" 362 | }, 363 | "language_info": { 364 | "codemirror_mode": { 365 | "name": "ipython", 366 | "version": 2 367 | }, 368 | "file_extension": ".py", 369 | "mimetype": "text/x-python", 370 | "name": "python", 371 | "nbconvert_exporter": "python", 372 | "pygments_lexer": "ipython2", 373 | "version": "2.7.10" 374 | } 375 | }, 376 | "nbformat": 4, 377 | "nbformat_minor": 2 378 | } 379 | -------------------------------------------------------------------------------- 
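[Editor's note: the FeatureUnion cells above only union two PCA variants. A minimal sketch, assuming the same iris X and y, of the common pattern of concatenating heterogeneous transformers and then nesting the union inside a Pipeline.]

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)

# Concatenate 2 PCA components with the single best univariate feature -> 3 columns.
union = FeatureUnion([('pca', PCA(n_components=2)),
                      ('kbest', SelectKBest(k=1))])
print(union.fit_transform(X, y).shape)   # (150, 3)

# The union is itself a transformer, so it can serve as a pipeline step.
model = Pipeline([('features', union), ('clf', SVC())])
print(model.fit(X, y).score(X, y))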
/notebooks/.ipynb_checkpoints/FeatureSelection-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Selection\n", 8 | "\n", 9 | "The classes in the sklearn.feature_selection module can be used for feature selection/dimensionality reduction on sample sets, either to improve estimators’ accuracy scores or to boost their performance on very high-dimensional datasets." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Remove Low Var Features\n", 17 | "\n", 18 | "VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn’t meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in all samples.\n", 19 | "\n", 20 | "Again we are starting to see fit and fit_transform pop up again. Sklearn provides a ton of functionality that's not just prediction. Some of the functionality is preprocessing the data. Again these are like models (they can only rely on the training data) but don't really predict anything. Thus they do have a fit method, but don't have a predict method. We will see two examples of this type of paradigm below." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "array([[0, 1],\n", 34 | " [1, 0],\n", 35 | " [0, 0],\n", 36 | " [1, 1],\n", 37 | " [1, 0],\n", 38 | " [1, 1]])" 39 | ] 40 | }, 41 | "execution_count": 1, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "from sklearn.feature_selection import VarianceThreshold\n", 48 | "\n", 49 | "X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]\n", 50 | "\n", 51 | "sel = VarianceThreshold(threshold=(.8 * (1 - .8)))\n", 52 | "\n", 53 | "sel.fit(X)\n", 54 | "\n", 55 | "sel.transform(X)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Univariate Feature Selection\n", 63 | "\n", 64 | "Univariate feature selection works by selecting the best features based on univariate statistical tests. It can be seen as a preprocessing step to an estimator. Scikit-learn exposes feature selection routines as objects that implement the transform method:\n", 65 | "* SelectKBest removes all but the k highest scoring features\n", 66 | "* SelectPercentile removes all but a user-specified highest scoring percentage of features\n", 67 | "* using common univariate statistical tests for each feature: false positive rate SelectFpr, false discovery rate SelectFdr, or family wise error SelectFwe.\n", 68 | "* GenericUnivariateSelect allows to perform univariate feature selection with a configurable strategy. This allows to select the best univariate selection strategy with hyper-parameter search estimator.\n", 69 | "\n", 70 | "These objects take as input a scoring function that returns univariate scores and p-values (or only scores for SelectKBest and SelectPercentile):\n", 71 | "\n", 72 | "* For regression: f_regression, mutual_info_regression\n", 73 | "* For classification: chi2, f_classif, mutual_info_classif\n", 74 | "\n", 75 | "The methods based on F-test estimate the degree of linear dependency between two random variables. 
On the other hand, mutual information methods can capture any kind of statistical dependency, but being nonparametric, they require more samples for accurate estimation." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 2, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "from sklearn.datasets import load_iris\n", 87 | "from sklearn.feature_selection import SelectKBest\n", 88 | "from sklearn.feature_selection import chi2\n", 89 | "\n", 90 | "SelectKBest?" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 5, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "SelectKBest(k=2, score_func=)" 104 | ] 105 | }, 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "X, y = load_iris(return_X_y=True)\n", 113 | "\n", 114 | "sel = SelectKBest(chi2, k=2)\n", 115 | "\n", 116 | "sel.fit(X, y)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 7, 122 | "metadata": { 123 | "collapsed": false 124 | }, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "(150, 2)" 130 | ] 131 | }, 132 | "execution_count": 7, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "sel.transform(X).shape" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "array([ 10.81782088, 3.59449902, 116.16984746, 67.24482759])" 152 | ] 153 | }, 154 | "execution_count": 8, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "sel.scores_" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "## Recursive feature elimination\n", 168 | "\n", 169 | "Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and weights are assigned to each one of them. Then, features whose absolute weights are the smallest are pruned from the current set features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.\n", 170 | "\n", 171 | "So it is very important to normalize these features in linear models!" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 14, 177 | "metadata": { 178 | "collapsed": true 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "from sklearn.ensemble import RandomForestClassifier\n", 183 | "from sklearn.feature_selection import RFECV\n", 184 | "\n", 185 | "RFECV?" 
186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 15, 191 | "metadata": { 192 | "collapsed": true 193 | }, 194 | "outputs": [], 195 | "source": [ 196 | "m = RFECV(RandomForestClassifier(), scoring='accuracy')" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 16, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "RFECV(cv=None,\n", 210 | " estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 211 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 212 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 213 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 214 | " n_estimators=10, n_jobs=1, oob_score=False, random_state=None,\n", 215 | " verbose=0, warm_start=False),\n", 216 | " n_jobs=1, scoring='accuracy', step=1, verbose=0)" 217 | ] 218 | }, 219 | "execution_count": 16, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "m.fit(X, y)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## Feature selection using SelectFromModel\n", 233 | "\n", 234 | "SelectFromModel is a meta-transformer that can be used along with any estimator that has a coef_ or feature_importances_ attribute after fitting. The features are considered unimportant and removed, if the corresponding coef_ or feature_importances_ values are below the provided threshold parameter. Apart from specifying the threshold numerically, there are built-in heuristics for finding a threshold using a string argument. Available heuristics are “mean”, “median” and float multiples of these like “0.1*mean”.\n", 235 | "\n", 236 | "For examples on how it is to be used refer to the sections below." 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 18, 242 | "metadata": { 243 | "collapsed": true 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "from sklearn.svm import LinearSVC\n", 248 | "from sklearn.feature_selection import SelectFromModel\n", 249 | "\n", 250 | "SelectFromModel?" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 19, 256 | "metadata": { 257 | "collapsed": false 258 | }, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/plain": [ 263 | "SelectFromModel(estimator=LinearSVC(C=0.01, class_weight=None, dual=False, fit_intercept=True,\n", 264 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 265 | " multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,\n", 266 | " verbose=0),\n", 267 | " prefit=False, threshold=None)" 268 | ] 269 | }, 270 | "execution_count": 19, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "m = SelectFromModel(LinearSVC(C=0.01, penalty='l1', dual=False))\n", 277 | "\n", 278 | "m.fit(X, y)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 22, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "(150, 3)" 292 | ] 293 | }, 294 | "execution_count": 22, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "m.transform(X).shape" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "A little bit more complex!" 
308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 28, 313 | "metadata": { 314 | "collapsed": false 315 | }, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "(506, 13)\n" 322 | ] 323 | }, 324 | { 325 | "data": { 326 | "text/plain": [ 327 | "(506, 10)" 328 | ] 329 | }, 330 | "execution_count": 28, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "from sklearn.linear_model import LassoCV\n", 337 | "from sklearn.datasets import load_boston\n", 338 | "\n", 339 | "X, y = load_boston(return_X_y=True)\n", 340 | "\n", 341 | "print X.shape\n", 342 | "\n", 343 | "m = SelectFromModel(LassoCV())\n", 344 | "\n", 345 | "m.fit(X, y)\n", 346 | "\n", 347 | "m.transform(X).shape" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": { 354 | "collapsed": false 355 | }, 356 | "outputs": [], 357 | "source": [] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": { 363 | "collapsed": true 364 | }, 365 | "outputs": [], 366 | "source": [] 367 | } 368 | ], 369 | "metadata": { 370 | "kernelspec": { 371 | "display_name": "Python 2", 372 | "language": "python", 373 | "name": "python2" 374 | }, 375 | "language_info": { 376 | "codemirror_mode": { 377 | "name": "ipython", 378 | "version": 2 379 | }, 380 | "file_extension": ".py", 381 | "mimetype": "text/x-python", 382 | "name": "python", 383 | "nbconvert_exporter": "python", 384 | "pygments_lexer": "ipython2", 385 | "version": "2.7.10" 386 | } 387 | }, 388 | "nbformat": 4, 389 | "nbformat_minor": 2 390 | } 391 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/FeatureExtraction-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Extraction\n", 8 | "\n", 9 | "The sklearn.feature_extraction module can be used to extract features in a format supported by machine learning algorithms from datasets consisting of formats such as text and image.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Loading features from dicts\n", 17 | "\n", 18 | "The class DictVectorizer can be used to convert feature arrays represented as lists of standard Python dict objects to the NumPy/SciPy representation used by scikit-learn estimators.\n", 19 | "\n", 20 | "While not particularly fast to process, Python’s dict has the advantages of being convenient to use, being sparse (absent features need not be stored) and storing feature names in addition to values.\n", 21 | "\n", 22 | "DictVectorizer implements what is called one-of-K or “one-hot” coding for categorical (aka nominal, discrete) features. Categorical features are “attribute-value” pairs where the value is restricted to a list of discrete of possibilities without ordering (e.g. 
topic identifiers, types of objects, tags, names...).\n", 23 | "\n", 24 | "In the following, “city” is a categorical attribute while “temperature” is a traditional numerical feature:" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "array([[ 1., 0., 0., 33.],\n", 38 | " [ 0., 1., 0., 12.],\n", 39 | " [ 0., 0., 1., 18.]])" 40 | ] 41 | }, 42 | "execution_count": 1, 43 | "metadata": {}, 44 | "output_type": "execute_result" 45 | } 46 | ], 47 | "source": [ 48 | "measurements = [\n", 49 | " {'city': 'Dubai', 'temperature': 33.},\n", 50 | " {'city': 'London', 'temperature': 12.},\n", 51 | " {'city': 'San Fransisco', 'temperature': 18.},\n", 52 | "]\n", 53 | "\n", 54 | "from sklearn.feature_extraction import DictVectorizer\n", 55 | "vec = DictVectorizer()\n", 56 | "\n", 57 | "vec.fit_transform(measurements).toarray()\n", 58 | "\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 2, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature']" 72 | ] 73 | }, 74 | "execution_count": 2, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "\n", 81 | "vec.get_feature_names()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Text feature extraction\n", 89 | "\n", 90 | "Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect numerical feature vectors with a fixed size rather than the raw text documents with variable length.\n", 91 | "\n", 92 | "In order to address this, scikit-learn provides utilities for the most common ways to extract numerical features from text content, namely:\n", 93 | "\n", 94 | "* tokenizing strings and giving an integer id for each possible token, for instance by using white-spaces and punctuation as token separators.\n", 95 | "* counting the occurrences of tokens in each document.\n", 96 | "* normalizing and weighting with diminishing importance tokens that occur in the majority of samples / documents.\n", 97 | "\n", 98 | "In this scheme, features and samples are defined as follows:\n", 99 | "\n", 100 | "* each individual token occurrence frequency (normalized or not) is treated as a feature.\n", 101 | "* the vector of all the token frequencies for a given document is considered a multivariate sample.\n", 102 | "\n", 103 | "A corpus of documents can thus be represented by a matrix with one row per document and one column per token (e.g. word) occurring in the corpus.\n", 104 | "\n", 105 | "We call vectorization the general process of turning a collection of text documents into numerical feature vectors. This specific strategy (tokenization, counting and normalization) is called the Bag of Words or “Bag of n-grams” representation. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document." 
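[Editor's note: one property of the DictVectorizer section above worth spelling out — feature names never seen during fit are silently dropped at transform time. A small sketch with hypothetical toy dicts; 'Paris' is an example value invented for illustration.]

from sklearn.feature_extraction import DictVectorizer

train = [{'city': 'Dubai', 'temperature': 33.},
         {'city': 'London', 'temperature': 12.}]
vec = DictVectorizer()
vec.fit(train)

# 'Paris' was never seen while fitting, so no one-hot column exists for it
# and only the temperature survives in the transformed row.
print(vec.transform([{'city': 'Paris', 'temperature': 20.}]).toarray())
print(vec.get_feature_names())   # ['city=Dubai', 'city=London', 'temperature']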
106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "CountVectorizer implements both tokenization and occurrence counting in a single class:" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 5, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "from sklearn.feature_extraction.text import CountVectorizer\n", 124 | "\n", 125 | "CountVectorizer?" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 4, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',\n", 139 | " dtype=, encoding=u'utf-8', input=u'content',\n", 140 | " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", 141 | " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", 142 | " strip_accents=None, token_pattern=u'(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 143 | " tokenizer=None, vocabulary=None)" 144 | ] 145 | }, 146 | "execution_count": 4, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "vectorizer = CountVectorizer(min_df=1)\n", 153 | "vectorizer " 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 6, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "<4x9 sparse matrix of type ''\n", 167 | "\twith 19 stored elements in Compressed Sparse Row format>" 168 | ] 169 | }, 170 | "execution_count": 6, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "corpus = [\n", 177 | " 'This is the first document.',\n", 178 | " 'This is the second second document.',\n", 179 | " 'And the third one.',\n", 180 | " 'Is this the first document?',\n", 181 | "]\n", 182 | "X = vectorizer.fit_transform(corpus)\n", 183 | "X \n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 7, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "array([[0, 1, 1, 1, 0, 0, 1, 0, 1],\n", 197 | " [0, 1, 0, 1, 0, 2, 1, 0, 1],\n", 198 | " [1, 0, 0, 0, 1, 0, 1, 1, 0],\n", 199 | " [0, 1, 1, 1, 0, 0, 1, 0, 1]])" 200 | ] 201 | }, 202 | "execution_count": 7, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "X.toarray()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 9, 214 | "metadata": { 215 | "collapsed": false 216 | }, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "text/plain": [ 221 | "[u'this', u'is', u'text', u'document', u'to', u'analyze']" 222 | ] 223 | }, 224 | "execution_count": 9, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | } 228 | ], 229 | "source": [ 230 | "analyze = vectorizer.build_analyzer()\n", 231 | "analyze(\"This is a text document to analyze.\")" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 10, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/plain": [ 244 | "[u'and',\n", 245 | " u'document',\n", 246 | " u'first',\n", 247 | " u'is',\n", 248 | " u'one',\n", 249 | " u'second',\n", 250 | " u'the',\n", 251 | " u'third',\n", 252 | " u'this']" 253 | ] 254 | }, 255 | "execution_count": 10, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | 
"source": [ 261 | "vectorizer.get_feature_names()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 11, 267 | "metadata": { 268 | "collapsed": false 269 | }, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "1" 275 | ] 276 | }, 277 | "execution_count": 11, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "vectorizer.vocabulary_.get('document')" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 12, 289 | "metadata": { 290 | "collapsed": false 291 | }, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])" 297 | ] 298 | }, 299 | "execution_count": 12, 300 | "metadata": {}, 301 | "output_type": "execute_result" 302 | } 303 | ], 304 | "source": [ 305 | "vectorizer.transform(['Something completely new.']).toarray()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 13, 311 | "metadata": { 312 | "collapsed": false 313 | }, 314 | "outputs": [ 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "TfidfTransformer(norm=u'l2', smooth_idf=False, sublinear_tf=False,\n", 319 | " use_idf=True)" 320 | ] 321 | }, 322 | "execution_count": 13, 323 | "metadata": {}, 324 | "output_type": "execute_result" 325 | } 326 | ], 327 | "source": [ 328 | "from sklearn.feature_extraction.text import TfidfTransformer\n", 329 | "transformer = TfidfTransformer(smooth_idf=False)\n", 330 | "transformer \n" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 14, 336 | "metadata": { 337 | "collapsed": false 338 | }, 339 | "outputs": [ 340 | { 341 | "data": { 342 | "text/plain": [ 343 | "<6x3 sparse matrix of type ''\n", 344 | "\twith 9 stored elements in Compressed Sparse Row format>" 345 | ] 346 | }, 347 | "execution_count": 14, 348 | "metadata": {}, 349 | "output_type": "execute_result" 350 | } 351 | ], 352 | "source": [ 353 | "counts = [[3, 0, 1],\n", 354 | " [2, 0, 0],\n", 355 | " [3, 0, 0],\n", 356 | " [4, 0, 0],\n", 357 | " [3, 2, 0],\n", 358 | " [3, 0, 2]]\n", 359 | "\n", 360 | "tfidf = transformer.fit_transform(counts)\n", 361 | "tfidf \n" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 15, 367 | "metadata": { 368 | "collapsed": false 369 | }, 370 | "outputs": [ 371 | { 372 | "data": { 373 | "text/plain": [ 374 | "array([[ 0.81940995, 0. , 0.57320793],\n", 375 | " [ 1. , 0. , 0. ],\n", 376 | " [ 1. , 0. , 0. ],\n", 377 | " [ 1. , 0. , 0. ],\n", 378 | " [ 0.47330339, 0.88089948, 0. ],\n", 379 | " [ 0.58149261, 0. , 0.81355169]])" 380 | ] 381 | }, 382 | "execution_count": 15, 383 | "metadata": {}, 384 | "output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "tfidf.toarray() " 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 17, 394 | "metadata": { 395 | "collapsed": true 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 400 | "\n", 401 | "TfidfVectorizer?" 
402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": { 408 | "collapsed": true 409 | }, 410 | "outputs": [], 411 | "source": [] 412 | } 413 | ], 414 | "metadata": { 415 | "kernelspec": { 416 | "display_name": "Python 2", 417 | "language": "python", 418 | "name": "python2" 419 | }, 420 | "language_info": { 421 | "codemirror_mode": { 422 | "name": "ipython", 423 | "version": 2 424 | }, 425 | "file_extension": ".py", 426 | "mimetype": "text/x-python", 427 | "name": "python", 428 | "nbconvert_exporter": "python", 429 | "pygments_lexer": "ipython2", 430 | "version": "2.7.10" 431 | } 432 | }, 433 | "nbformat": 4, 434 | "nbformat_minor": 2 435 | } 436 | -------------------------------------------------------------------------------- /notebooks/FeatureExtraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Feature Extraction\n", 11 | "\n", 12 | "The sklearn.feature_extraction module can be used to extract features in a format supported by machine learning algorithms from datasets consisting of formats such as text and image.\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": { 18 | "deletable": true, 19 | "editable": true 20 | }, 21 | "source": [ 22 | "## Loading features from dicts\n", 23 | "\n", 24 | "The class DictVectorizer can be used to convert feature arrays represented as lists of standard Python dict objects to the NumPy/SciPy representation used by scikit-learn estimators.\n", 25 | "\n", 26 | "While not particularly fast to process, Python’s dict has the advantages of being convenient to use, being sparse (absent features need not be stored) and storing feature names in addition to values.\n", 27 | "\n", 28 | "DictVectorizer implements what is called one-of-K or “one-hot” coding for categorical (aka nominal, discrete) features. Categorical features are “attribute-value” pairs where the value is restricted to a list of discrete of possibilities without ordering (e.g. 
topic identifiers, types of objects, tags, names...).\n", 29 | "\n", 30 | "In the following, “city” is a categorical attribute while “temperature” is a traditional numerical feature:" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 1, 36 | "metadata": { 37 | "collapsed": false, 38 | "deletable": true, 39 | "editable": true 40 | }, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "array([[ 1., 0., 0., 33.],\n", 46 | " [ 0., 1., 0., 12.],\n", 47 | " [ 0., 0., 1., 18.]])" 48 | ] 49 | }, 50 | "execution_count": 1, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "measurements = [\n", 57 | " {'city': 'Dubai', 'temperature': 33.},\n", 58 | " {'city': 'London', 'temperature': 12.},\n", 59 | " {'city': 'San Fransisco', 'temperature': 18.},\n", 60 | "]\n", 61 | "\n", 62 | "from sklearn.feature_extraction import DictVectorizer\n", 63 | "vec = DictVectorizer()\n", 64 | "\n", 65 | "vec.fit_transform(measurements).toarray()\n", 66 | "\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 2, 72 | "metadata": { 73 | "collapsed": false, 74 | "deletable": true, 75 | "editable": true 76 | }, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature']" 82 | ] 83 | }, 84 | "execution_count": 2, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "\n", 91 | "vec.get_feature_names()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "deletable": true, 98 | "editable": true 99 | }, 100 | "source": [ 101 | "## Text feature extraction\n", 102 | "\n", 103 | "Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect numerical feature vectors with a fixed size rather than the raw text documents with variable length.\n", 104 | "\n", 105 | "In order to address this, scikit-learn provides utilities for the most common ways to extract numerical features from text content, namely:\n", 106 | "\n", 107 | "* tokenizing strings and giving an integer id for each possible token, for instance by using white-spaces and punctuation as token separators.\n", 108 | "* counting the occurrences of tokens in each document.\n", 109 | "* normalizing and weighting with diminishing importance tokens that occur in the majority of samples / documents.\n", 110 | "\n", 111 | "In this scheme, features and samples are defined as follows:\n", 112 | "\n", 113 | "* each individual token occurrence frequency (normalized or not) is treated as a feature.\n", 114 | "* the vector of all the token frequencies for a given document is considered a multivariate sample.\n", 115 | "\n", 116 | "A corpus of documents can thus be represented by a matrix with one row per document and one column per token (e.g. word) occurring in the corpus.\n", 117 | "\n", 118 | "We call vectorization the general process of turning a collection of text documents into numerical feature vectors. This specific strategy (tokenization, counting and normalization) is called the Bag of Words or “Bag of n-grams” representation. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document." 
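[Editor's note: the "Bag of n-grams" wording above can be made concrete. A small sketch on a two-document toy corpus (a trimmed version of the corpus used in the next cells) that counts word bigrams as well as single words, so some local ordering survives.]

from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'This is the first document.',
    'Is this the first document?',
]

# ngram_range=(1, 2) keeps the unigrams and adds adjacent word pairs as extra
# features, so 'is this' and 'this is' become distinct columns.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
X2 = bigram_vectorizer.fit_transform(corpus)
print(bigram_vectorizer.get_feature_names())
print(X2.toarray())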
119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": { 124 | "deletable": true, 125 | "editable": true 126 | }, 127 | "source": [ 128 | "CountVectorizer implements both tokenization and occurrence counting in a single class:" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 3, 134 | "metadata": { 135 | "collapsed": true, 136 | "deletable": true, 137 | "editable": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "from sklearn.feature_extraction.text import CountVectorizer\n", 142 | "\n", 143 | "CountVectorizer?" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 4, 149 | "metadata": { 150 | "collapsed": false, 151 | "deletable": true, 152 | "editable": true 153 | }, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',\n", 159 | " dtype=, encoding=u'utf-8', input=u'content',\n", 160 | " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", 161 | " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", 162 | " strip_accents=None, token_pattern=u'(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 163 | " tokenizer=None, vocabulary=None)" 164 | ] 165 | }, 166 | "execution_count": 4, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "vectorizer = CountVectorizer(min_df=1)\n", 173 | "vectorizer " 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 5, 179 | "metadata": { 180 | "collapsed": false, 181 | "deletable": true, 182 | "editable": true 183 | }, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "<4x9 sparse matrix of type ''\n", 189 | "\twith 19 stored elements in Compressed Sparse Row format>" 190 | ] 191 | }, 192 | "execution_count": 5, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "corpus = [\n", 199 | " 'This is the first document.',\n", 200 | " 'This is the second second document.',\n", 201 | " 'And the third one.',\n", 202 | " 'Is this the first document?',\n", 203 | "]\n", 204 | "X = vectorizer.fit_transform(corpus)\n", 205 | "X \n" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 6, 211 | "metadata": { 212 | "collapsed": false, 213 | "deletable": true, 214 | "editable": true 215 | }, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "array([[0, 1, 1, 1, 0, 0, 1, 0, 1],\n", 221 | " [0, 1, 0, 1, 0, 2, 1, 0, 1],\n", 222 | " [1, 0, 0, 0, 1, 0, 1, 1, 0],\n", 223 | " [0, 1, 1, 1, 0, 0, 1, 0, 1]])" 224 | ] 225 | }, 226 | "execution_count": 6, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "X.toarray()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 7, 238 | "metadata": { 239 | "collapsed": false, 240 | "deletable": true, 241 | "editable": true 242 | }, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "[u'this', u'is', u'text', u'document', u'to', u'analyze']" 248 | ] 249 | }, 250 | "execution_count": 7, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "analyze = vectorizer.build_analyzer()\n", 257 | "analyze(\"This is a text document to analyze.\")" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 8, 263 | "metadata": { 264 | "collapsed": false, 265 | "deletable": true, 266 | "editable": true 267 | }, 268 | "outputs": [ 269 | { 270 | "data": { 
271 | "text/plain": [ 272 | "[u'and',\n", 273 | " u'document',\n", 274 | " u'first',\n", 275 | " u'is',\n", 276 | " u'one',\n", 277 | " u'second',\n", 278 | " u'the',\n", 279 | " u'third',\n", 280 | " u'this']" 281 | ] 282 | }, 283 | "execution_count": 8, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "vectorizer.get_feature_names()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 9, 295 | "metadata": { 296 | "collapsed": false, 297 | "deletable": true, 298 | "editable": true 299 | }, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "1" 305 | ] 306 | }, 307 | "execution_count": 9, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "vectorizer.vocabulary_.get('document')" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 10, 319 | "metadata": { 320 | "collapsed": false, 321 | "deletable": true, 322 | "editable": true 323 | }, 324 | "outputs": [ 325 | { 326 | "data": { 327 | "text/plain": [ 328 | "array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])" 329 | ] 330 | }, 331 | "execution_count": 10, 332 | "metadata": {}, 333 | "output_type": "execute_result" 334 | } 335 | ], 336 | "source": [ 337 | "vectorizer.transform(['Something completely new.']).toarray()" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 11, 343 | "metadata": { 344 | "collapsed": false, 345 | "deletable": true, 346 | "editable": true 347 | }, 348 | "outputs": [ 349 | { 350 | "data": { 351 | "text/plain": [ 352 | "TfidfTransformer(norm=u'l2', smooth_idf=False, sublinear_tf=False,\n", 353 | " use_idf=True)" 354 | ] 355 | }, 356 | "execution_count": 11, 357 | "metadata": {}, 358 | "output_type": "execute_result" 359 | } 360 | ], 361 | "source": [ 362 | "from sklearn.feature_extraction.text import TfidfTransformer\n", 363 | "transformer = TfidfTransformer(smooth_idf=False)\n", 364 | "transformer \n" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 12, 370 | "metadata": { 371 | "collapsed": false, 372 | "deletable": true, 373 | "editable": true 374 | }, 375 | "outputs": [ 376 | { 377 | "data": { 378 | "text/plain": [ 379 | "<6x3 sparse matrix of type ''\n", 380 | "\twith 9 stored elements in Compressed Sparse Row format>" 381 | ] 382 | }, 383 | "execution_count": 12, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "counts = [[3, 0, 1],\n", 390 | " [2, 0, 0],\n", 391 | " [3, 0, 0],\n", 392 | " [4, 0, 0],\n", 393 | " [3, 2, 0],\n", 394 | " [3, 0, 2]]\n", 395 | "\n", 396 | "tfidf = transformer.fit_transform(counts)\n", 397 | "tfidf \n" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 13, 403 | "metadata": { 404 | "collapsed": false, 405 | "deletable": true, 406 | "editable": true 407 | }, 408 | "outputs": [ 409 | { 410 | "data": { 411 | "text/plain": [ 412 | "array([[ 0.81940995, 0. , 0.57320793],\n", 413 | " [ 1. , 0. , 0. ],\n", 414 | " [ 1. , 0. , 0. ],\n", 415 | " [ 1. , 0. , 0. ],\n", 416 | " [ 0.47330339, 0.88089948, 0. ],\n", 417 | " [ 0.58149261, 0. 
, 0.81355169]])" 418 | ] 419 | }, 420 | "execution_count": 13, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "tfidf.toarray() " 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 14, 432 | "metadata": { 433 | "collapsed": true, 434 | "deletable": true, 435 | "editable": true 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 440 | "\n", 441 | "TfidfVectorizer?" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": { 448 | "collapsed": true, 449 | "deletable": true, 450 | "editable": true 451 | }, 452 | "outputs": [], 453 | "source": [] 454 | } 455 | ], 456 | "metadata": { 457 | "kernelspec": { 458 | "display_name": "Python 2", 459 | "language": "python", 460 | "name": "python2" 461 | }, 462 | "language_info": { 463 | "codemirror_mode": { 464 | "name": "ipython", 465 | "version": 2 466 | }, 467 | "file_extension": ".py", 468 | "mimetype": "text/x-python", 469 | "name": "python", 470 | "nbconvert_exporter": "python", 471 | "pygments_lexer": "ipython2", 472 | "version": "2.7.10" 473 | } 474 | }, 475 | "nbformat": 4, 476 | "nbformat_minor": 2 477 | } 478 | -------------------------------------------------------------------------------- /notebooks/FeatureSelection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Feature Selection\n", 11 | "\n", 12 | "The classes in the sklearn.feature_selection module can be used for feature selection/dimensionality reduction on sample sets, either to improve estimators’ accuracy scores or to boost their performance on very high-dimensional datasets." 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": { 18 | "deletable": true, 19 | "editable": true 20 | }, 21 | "source": [ 22 | "## Remove Low Var Features\n", 23 | "\n", 24 | "VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn’t meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in all samples.\n", 25 | "\n", 26 | "Again we are starting to see fit and fit_transform pop up again. Sklearn provides a ton of functionality that's not just prediction. Some of the functionality is preprocessing the data. Again these are like models (they can only rely on the training data) but don't really predict anything. Thus they do have a fit method, but don't have a predict method. We will see two examples of this type of paradigm below." 
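[Editor's note: before the cells below, a quick check of where their 0.8 * (1 - 0.8) threshold comes from. For a boolean (Bernoulli) feature the variance is p(1 - p), so this cutoff removes columns that take the same value in more than 80% of samples. A small sketch on the same toy X used in the next cell.]

import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]

print(np.var(X, axis=0))           # per-column variances: ~[0.139, 0.222, 0.25]
threshold = .8 * (1 - .8)          # 0.16, the Bernoulli variance at p = 0.8
sel = VarianceThreshold(threshold=threshold)
print(sel.fit_transform(X).shape)  # (6, 2): only the first column falls below 0.16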
27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "metadata": { 33 | "collapsed": false, 34 | "deletable": true, 35 | "editable": true 36 | }, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "array([[0, 1],\n", 42 | " [1, 0],\n", 43 | " [0, 0],\n", 44 | " [1, 1],\n", 45 | " [1, 0],\n", 46 | " [1, 1]])" 47 | ] 48 | }, 49 | "execution_count": 1, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "from sklearn.feature_selection import VarianceThreshold\n", 56 | "\n", 57 | "X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]\n", 58 | "\n", 59 | "sel = VarianceThreshold(threshold=(.8 * (1 - .8)))\n", 60 | "\n", 61 | "sel.fit(X)\n", 62 | "\n", 63 | "sel.transform(X)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 2, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "array([[0, 1],\n", 77 | " [1, 0],\n", 78 | " [0, 0],\n", 79 | " [1, 1],\n", 80 | " [1, 0],\n", 81 | " [1, 1]])" 82 | ] 83 | }, 84 | "execution_count": 2, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "sel.fit_transform(X)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "deletable": true, 97 | "editable": true 98 | }, 99 | "source": [ 100 | "## Univariate Feature Selection\n", 101 | "\n", 102 | "Univariate feature selection works by selecting the best features based on univariate statistical tests. It can be seen as a preprocessing step to an estimator. Scikit-learn exposes feature selection routines as objects that implement the transform method:\n", 103 | "* SelectKBest removes all but the k highest scoring features\n", 104 | "* SelectPercentile removes all but a user-specified highest scoring percentage of features\n", 105 | "* using common univariate statistical tests for each feature: false positive rate SelectFpr, false discovery rate SelectFdr, or family wise error SelectFwe.\n", 106 | "* GenericUnivariateSelect allows to perform univariate feature selection with a configurable strategy. This allows to select the best univariate selection strategy with hyper-parameter search estimator.\n", 107 | "\n", 108 | "These objects take as input a scoring function that returns univariate scores and p-values (or only scores for SelectKBest and SelectPercentile):\n", 109 | "\n", 110 | "* For regression: f_regression, mutual_info_regression\n", 111 | "* For classification: chi2, f_classif, mutual_info_classif\n", 112 | "\n", 113 | "The methods based on F-test estimate the degree of linear dependency between two random variables. On the other hand, mutual information methods can capture any kind of statistical dependency, but being nonparametric, they require more samples for accurate estimation." 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 3, 119 | "metadata": { 120 | "collapsed": true, 121 | "deletable": true, 122 | "editable": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "from sklearn.datasets import load_iris\n", 127 | "from sklearn.feature_selection import SelectKBest\n", 128 | "from sklearn.feature_selection import chi2\n", 129 | "\n", 130 | "SelectKBest?" 
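[Editor's note: a companion sketch to the SelectKBest cells that follow, using SelectPercentile with f_classif to show the score / p-value pair the explanation above mentions. It loads iris itself so it stands alone.]

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, f_classif

X, y = load_iris(return_X_y=True)

# Keep the top half of the features ranked by the ANOVA F-test.
sel = SelectPercentile(f_classif, percentile=50)
X_new = sel.fit_transform(X, y)
print(X_new.shape)   # (150, 2)
print(sel.scores_)   # one F statistic per original feature
print(sel.pvalues_)  # and the matching p-values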
131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 4, 136 | "metadata": { 137 | "collapsed": false, 138 | "deletable": true, 139 | "editable": true 140 | }, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "SelectKBest(k=2, score_func=)" 146 | ] 147 | }, 148 | "execution_count": 4, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "X, y = load_iris(return_X_y=True)\n", 155 | "\n", 156 | "sel = SelectKBest(chi2, k=2)\n", 157 | "\n", 158 | "sel.fit(X, y)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 5, 164 | "metadata": { 165 | "collapsed": false, 166 | "deletable": true, 167 | "editable": true 168 | }, 169 | "outputs": [ 170 | { 171 | "data": { 172 | "text/plain": [ 173 | "(150, 2)" 174 | ] 175 | }, 176 | "execution_count": 5, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "sel.transform(X).shape" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 6, 188 | "metadata": { 189 | "collapsed": false, 190 | "deletable": true, 191 | "editable": true 192 | }, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "array([ 10.81782088, 3.59449902, 116.16984746, 67.24482759])" 198 | ] 199 | }, 200 | "execution_count": 6, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "sel.scores_" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "deletable": true, 213 | "editable": true 214 | }, 215 | "source": [ 216 | "## Recursive feature elimination\n", 217 | "\n", 218 | "Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and weights are assigned to each one of them. Then, features whose absolute weights are the smallest are pruned from the current set features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.\n", 219 | "\n", 220 | "So it is very important to normalize these features in linear models!" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 7, 226 | "metadata": { 227 | "collapsed": true, 228 | "deletable": true, 229 | "editable": true 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "from sklearn.ensemble import RandomForestClassifier\n", 234 | "from sklearn.feature_selection import RFECV\n", 235 | "\n", 236 | "RFECV?" 
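RFECV in the next cells folds cross-validation into the elimination loop so the number of features is chosen for you. Plain RFE, where you pick the number of surviving features yourself, is a useful reference point; a hedged sketch (the estimator and n_features_to_select are illustrative assumptions):

```python
from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)

# keep 2 features, pruning the weakest coefficient at each step
rfe = RFE(LogisticRegression(), n_features_to_select=2, step=1)
rfe.fit(X, y)
print(rfe.support_)   # boolean mask of the selected features
print(rfe.ranking_)   # ranking of 1 marks a selected feature
```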
237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 8, 242 | "metadata": { 243 | "collapsed": true, 244 | "deletable": true, 245 | "editable": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "m = RFECV(RandomForestClassifier(), scoring='accuracy')" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 9, 255 | "metadata": { 256 | "collapsed": false, 257 | "deletable": true, 258 | "editable": true 259 | }, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "RFECV(cv=None,\n", 265 | " estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 266 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 267 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 268 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 269 | " n_estimators=10, n_jobs=1, oob_score=False, random_state=None,\n", 270 | " verbose=0, warm_start=False),\n", 271 | " n_jobs=1, scoring='accuracy', step=1, verbose=0)" 272 | ] 273 | }, 274 | "execution_count": 9, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "m.fit(X, y)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 11, 286 | "metadata": { 287 | "collapsed": false 288 | }, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/plain": [ 293 | "0.99333333333333329" 294 | ] 295 | }, 296 | "execution_count": 11, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "m.score(X, y)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": { 308 | "deletable": true, 309 | "editable": true 310 | }, 311 | "source": [ 312 | "## Feature selection using SelectFromModel\n", 313 | "\n", 314 | "SelectFromModel is a meta-transformer that can be used along with any estimator that has a coef_ or feature_importances_ attribute after fitting. The features are considered unimportant and removed, if the corresponding coef_ or feature_importances_ values are below the provided threshold parameter. Apart from specifying the threshold numerically, there are built-in heuristics for finding a threshold using a string argument. Available heuristics are “mean”, “median” and float multiples of these like “0.1*mean”.\n", 315 | "\n", 316 | "For examples on how it is to be used refer to the sections below." 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 13, 322 | "metadata": { 323 | "collapsed": true, 324 | "deletable": true, 325 | "editable": true 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "from sklearn.svm import LinearSVC\n", 330 | "from sklearn.feature_selection import SelectFromModel\n", 331 | "\n", 332 | "SelectFromModel?" 
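The next cells pass a LinearSVC into SelectFromModel and rely on the default threshold. As a hedged sketch of the string heuristics mentioned above, the same idea with a tree-based importance and threshold="median" might look like this (the estimator and threshold choice are illustrative):

```python
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X, y = load_iris(return_X_y=True)

# drop every feature whose importance falls below the median importance,
# so roughly half of the features survive
sfm = SelectFromModel(RandomForestClassifier(), threshold="median")
print(sfm.fit_transform(X, y).shape)
```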
333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 14, 338 | "metadata": { 339 | "collapsed": false, 340 | "deletable": true, 341 | "editable": true 342 | }, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "SelectFromModel(estimator=LinearSVC(C=0.01, class_weight=None, dual=False, fit_intercept=True,\n", 348 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 349 | " multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,\n", 350 | " verbose=0),\n", 351 | " prefit=False, threshold=None)" 352 | ] 353 | }, 354 | "execution_count": 14, 355 | "metadata": {}, 356 | "output_type": "execute_result" 357 | } 358 | ], 359 | "source": [ 360 | "m = SelectFromModel(LinearSVC(C=0.01, penalty='l1', dual=False))\n", 361 | "\n", 362 | "m.fit(X, y)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 15, 368 | "metadata": { 369 | "collapsed": false, 370 | "deletable": true, 371 | "editable": true 372 | }, 373 | "outputs": [ 374 | { 375 | "data": { 376 | "text/plain": [ 377 | "(150, 3)" 378 | ] 379 | }, 380 | "execution_count": 15, 381 | "metadata": {}, 382 | "output_type": "execute_result" 383 | } 384 | ], 385 | "source": [ 386 | "m.transform(X).shape" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": { 392 | "deletable": true, 393 | "editable": true 394 | }, 395 | "source": [ 396 | "A little bit more complex!" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 16, 402 | "metadata": { 403 | "collapsed": false, 404 | "deletable": true, 405 | "editable": true 406 | }, 407 | "outputs": [ 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "(506, 13)\n" 413 | ] 414 | }, 415 | { 416 | "data": { 417 | "text/plain": [ 418 | "(506, 10)" 419 | ] 420 | }, 421 | "execution_count": 16, 422 | "metadata": {}, 423 | "output_type": "execute_result" 424 | } 425 | ], 426 | "source": [ 427 | "from sklearn.linear_model import LassoCV\n", 428 | "from sklearn.datasets import load_boston\n", 429 | "\n", 430 | "X, y = load_boston(return_X_y=True)\n", 431 | "\n", 432 | "print X.shape\n", 433 | "\n", 434 | "m = SelectFromModel(LassoCV())\n", 435 | "\n", 436 | "m.fit(X, y)\n", 437 | "\n", 438 | "m.transform(X).shape" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": { 445 | "collapsed": false, 446 | "deletable": true, 447 | "editable": true 448 | }, 449 | "outputs": [], 450 | "source": [] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": { 456 | "collapsed": true, 457 | "deletable": true, 458 | "editable": true 459 | }, 460 | "outputs": [], 461 | "source": [] 462 | } 463 | ], 464 | "metadata": { 465 | "kernelspec": { 466 | "display_name": "Python 2", 467 | "language": "python", 468 | "name": "python2" 469 | }, 470 | "language_info": { 471 | "codemirror_mode": { 472 | "name": "ipython", 473 | "version": 2 474 | }, 475 | "file_extension": ".py", 476 | "mimetype": "text/x-python", 477 | "name": "python", 478 | "nbconvert_exporter": "python", 479 | "pygments_lexer": "ipython2", 480 | "version": "2.7.10" 481 | } 482 | }, 483 | "nbformat": 4, 484 | "nbformat_minor": 2 485 | } 486 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/CrossValidation-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | 
"metadata": {}, 6 | "source": [ 7 | "# Cross-validation: evaluating estimator performance\n", 8 | "\n", 9 | "Learning the parameters of a prediction function and testing it on the same data is a methodological mistake: a model that would just repeat the labels of the samples that it has just seen would have a perfect score but would fail to predict anything useful on yet-unseen data. This situation is called overfitting. To avoid it, it is common practice when performing a (supervised) machine learning experiment to hold out part of the available data as a test set X_test, y_test. Note that the word “experiment” is not intended to denote academic use only, because even in commercial settings machine learning usually starts out experimentally.\n", 10 | "\n", 11 | "In scikit-learn a random split into training and test sets can be quickly computed with the train_test_split helper function. Let’s load the iris data set to fit a linear support vector machine on it:" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "((150, 4), (150,))" 25 | ] 26 | }, 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "import numpy as np\n", 34 | "from sklearn.model_selection import train_test_split\n", 35 | "from sklearn import datasets\n", 36 | "from sklearn import svm\n", 37 | "\n", 38 | "iris = datasets.load_iris()\n", 39 | "iris.data.shape, iris.target.shape" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "train_test_split?" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/plain": [ 63 | "((90, 4), (90,))" 64 | ] 65 | }, 66 | "execution_count": 3, 67 | "metadata": {}, 68 | "output_type": "execute_result" 69 | } 70 | ], 71 | "source": [ 72 | "X_train, X_test, y_train, y_test = train_test_split(\n", 73 | " iris.data, iris.target, test_size=0.4, random_state=0)\n", 74 | "\n", 75 | "X_train.shape, y_train.shape" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "((60, 4), (60,))" 89 | ] 90 | }, 91 | "execution_count": 4, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "X_test.shape, y_test.shape" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 5, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "0.96666666666666667" 111 | ] 112 | }, 113 | "execution_count": 5, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)\n", 120 | "clf.score(X_test, y_test) " 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "When evaluating different settings (“hyperparameters”) for estimators, such as the C setting that must be manually set for an SVM, there is still a risk of overfitting on the test set because the parameters can be tweaked until the estimator performs optimally. 
This way, knowledge about the test set can “leak” into the model and evaluation metrics no longer report on generalization performance. To solve this problem, yet another part of the dataset can be held out as a so-called “validation set”: training proceeds on the training set, after which evaluation is done on the validation set, and when the experiment seems to be successful, final evaluation can be done on the test set.\n", 128 | "\n", 129 | "However, by partitioning the available data into three sets, we drastically reduce the number of samples which can be used for learning the model, and the results can depend on a particular random choice for the pair of (train, validation) sets.\n", 130 | "\n", 131 | "A solution to this problem is a procedure called cross-validation (CV for short). A test set should still be held out for final evaluation, but the validation set is no longer needed when doing CV. In the basic approach, called k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:\n", 132 | "\n", 133 | "* A model is trained using k-1 of the folds as training data;\n", 134 | "* the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy).\n", 135 | "\n", 136 | "The performance measure reported by k-fold cross-validation is then the average of the values computed in the loop. This approach can be computationally expensive, but does not waste too much data (as it is the case when fixing an arbitrary test set), which is a major advantage in problem such as inverse inference where the number of samples is very small." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 6, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "from sklearn.model_selection import cross_val_score\n", 148 | "\n", 149 | "cross_val_score?" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 7, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "array([ 0.96666667, 1. , 0.96666667, 0.96666667, 1. ])" 163 | ] 164 | }, 165 | "execution_count": 7, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "clf = svm.SVC(kernel='linear', C=1)\n", 172 | "\n", 173 | "scores = cross_val_score(clf, iris.data, iris.target, cv=5)\n", 174 | "\n", 175 | "scores" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 8, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "Accuracy: 0.98 (+/- 0.03)\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 12, 200 | "metadata": { 201 | "collapsed": false 202 | }, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "array([ 0.96658312, 1. , 0.96658312, 0.96658312, 1. 
])" 208 | ] 209 | }, 210 | "execution_count": 12, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "from sklearn import metrics\n", 217 | "\n", 218 | "scores = cross_val_score(\n", 219 | " clf, iris.data, iris.target, cv=5, scoring='f1_macro')\n", 220 | "\n", 221 | "scores" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 13, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "from sklearn.model_selection import ShuffleSplit\n", 233 | "\n", 234 | "ShuffleSplit?" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 14, 240 | "metadata": { 241 | "collapsed": false 242 | }, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "array([ 0.97777778, 0.97777778, 1. ])" 248 | ] 249 | }, 250 | "execution_count": 14, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "n_samples = iris.data.shape[0]\n", 257 | "\n", 258 | "cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)\n", 259 | "\n", 260 | "cross_val_score(clf, iris.data, iris.target, cv=cv)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 18, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "from sklearn.model_selection import cross_val_predict\n", 272 | "\n", 273 | "cross_val_predict?" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 19, 279 | "metadata": { 280 | "collapsed": false 281 | }, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "(150,)" 287 | ] 288 | }, 289 | "execution_count": 19, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "predicted = cross_val_predict(clf, iris.data, iris.target, cv=10)\n", 296 | "\n", 297 | "predicted.shape" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 20, 303 | "metadata": { 304 | "collapsed": false 305 | }, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "0.97333333333333338" 311 | ] 312 | }, 313 | "execution_count": 20, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "metrics.accuracy_score(iris.target, predicted) " 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "## Cross validation iterators\n", 327 | "\n", 328 | "The following sections list utilities to generate indices that can be used to generate dataset splits according to different cross validation strategies.\n", 329 | "\n", 330 | "Assuming that some data is Independent Identically Distributed (i.i.d.) is making the assumption that all samples stem from the same generative process and that the generative process is assumed to have no memory of past generated samples.\n", 331 | "\n", 332 | "The following cross-validators can be used in such cases." 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 23, 338 | "metadata": { 339 | "collapsed": false 340 | }, 341 | "outputs": [], 342 | "source": [ 343 | "from sklearn.model_selection import KFold\n", 344 | "\n", 345 | "KFold?" 
346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 28, 351 | "metadata": { 352 | "collapsed": false 353 | }, 354 | "outputs": [ 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "[2 3] [0 1]\n", 360 | "[0 1] [2 3]\n" 361 | ] 362 | } 363 | ], 364 | "source": [ 365 | "kf = KFold(n_splits=2, shuffle=True)\n", 366 | "\n", 367 | "X = [\"a\", \"b\", \"c\", \"d\"]\n", 368 | "for train, test in kf.split(X):\n", 369 | " print(\"%s %s\" % (train, test))" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "#### Stratification\n", 377 | "\n", 378 | "Some classification problems can exhibit a large imbalance in the distribution of the target classes: for instance there could be several times more negative samples than positive samples. In such cases it is recommended to use stratified sampling as implemented in StratifiedKFold and StratifiedShuffleSplit to ensure that relative class frequencies is approximately preserved in each train and validation fold." 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 24, 384 | "metadata": { 385 | "collapsed": true 386 | }, 387 | "outputs": [], 388 | "source": [ 389 | "from sklearn.model_selection import StratifiedKFold\n", 390 | "\n", 391 | "StratifiedKFold?" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 25, 397 | "metadata": { 398 | "collapsed": false 399 | }, 400 | "outputs": [ 401 | { 402 | "name": "stdout", 403 | "output_type": "stream", 404 | "text": [ 405 | "[2 3 6 7 8 9] [0 1 4 5]\n", 406 | "[0 1 3 4 5 8 9] [2 6 7]\n", 407 | "[0 1 2 4 5 6 7] [3 8 9]\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "X = np.ones(10)\n", 413 | "y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]\n", 414 | "skf = StratifiedKFold(n_splits=3)\n", 415 | "for train, test in skf.split(X, y):\n", 416 | " print(\"%s %s\" % (train, test))\n" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "#### Grouped Data\n", 424 | "\n", 425 | "The i.i.d. assumption is broken if the underlying generative process yield groups of dependent samples.\n", 426 | "\n", 427 | "Such a grouping of data is domain specific. An example would be when there is medical data collected from multiple patients, with multiple samples taken from each patient. And such data is likely to be dependent on the individual group. In our example, the patient id for each sample will be its group identifier.\n", 428 | "\n", 429 | "In this case we would like to know if a model trained on a particular set of groups generalizes well to the unseen groups. To measure this, we need to ensure that all the samples in the validation fold come from groups that are not represented at all in the paired training fold.\n", 430 | "\n", 431 | "The following cross-validation splitters can be used to do that. The grouping identifier for the samples is specified via the groups parameter." 
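GroupKFold is demonstrated next; LeaveOneGroupOut is another group-aware splitter, holding out exactly one group per split. A hedged sketch reusing the same kind of toy groups (the X and y values are illustrative):

```python
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut

X = np.arange(10).reshape(10, 1)
y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]

# every sample from one group goes to the test side of its split
logo = LeaveOneGroupOut()
for train, test in logo.split(X, y, groups=groups):
    print("%s %s" % (train, test))
```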
432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 26, 437 | "metadata": { 438 | "collapsed": false 439 | }, 440 | "outputs": [ 441 | { 442 | "name": "stdout", 443 | "output_type": "stream", 444 | "text": [ 445 | "[0 1 2 3 4 5] [6 7 8 9]\n", 446 | "[0 1 2 6 7 8 9] [3 4 5]\n", 447 | "[3 4 5 6 7 8 9] [0 1 2]\n" 448 | ] 449 | } 450 | ], 451 | "source": [ 452 | "from sklearn.model_selection import GroupKFold\n", 453 | "\n", 454 | "X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]\n", 455 | "y = [\"a\", \"b\", \"b\", \"b\", \"c\", \"c\", \"c\", \"d\", \"d\", \"d\"]\n", 456 | "groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]\n", 457 | "\n", 458 | "gkf = GroupKFold(n_splits=3)\n", 459 | "for train, test in gkf.split(X, y, groups=groups):\n", 460 | " print(\"%s %s\" % (train, test))\n" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "#### Time Series Split\n", 468 | "\n", 469 | "TimeSeriesSplit is a variation of k-fold which returns first k folds as train set and the (k+1) th fold as test set. Note that unlike standard cross-validation methods, successive training sets are supersets of those that come before them. Also, it adds all surplus data to the first training partition, which is always used to train the model.\n", 470 | "\n", 471 | "This class can be used to cross-validate time series data samples that are observed at fixed time intervals." 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 27, 477 | "metadata": { 478 | "collapsed": false 479 | }, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "TimeSeriesSplit(n_splits=3)\n", 486 | "[0 1 2] [3]\n", 487 | "[0 1 2 3] [4]\n", 488 | "[0 1 2 3 4] [5]\n" 489 | ] 490 | } 491 | ], 492 | "source": [ 493 | "from sklearn.model_selection import TimeSeriesSplit\n", 494 | "\n", 495 | "X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])\n", 496 | "y = np.array([1, 2, 3, 4, 5, 6])\n", 497 | "tscv = TimeSeriesSplit(n_splits=3)\n", 498 | "print(tscv) \n", 499 | "\n", 500 | "for train, test in tscv.split(X):\n", 501 | " print(\"%s %s\" % (train, test))" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": { 508 | "collapsed": true 509 | }, 510 | "outputs": [], 511 | "source": [] 512 | } 513 | ], 514 | "metadata": { 515 | "kernelspec": { 516 | "display_name": "Python 2", 517 | "language": "python", 518 | "name": "python2" 519 | }, 520 | "language_info": { 521 | "codemirror_mode": { 522 | "name": "ipython", 523 | "version": 2 524 | }, 525 | "file_extension": ".py", 526 | "mimetype": "text/x-python", 527 | "name": "python", 528 | "nbconvert_exporter": "python", 529 | "pygments_lexer": "ipython2", 530 | "version": "2.7.10" 531 | } 532 | }, 533 | "nbformat": 4, 534 | "nbformat_minor": 2 535 | } 536 | -------------------------------------------------------------------------------- /notebooks/CrossValidation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Cross-validation: evaluating estimator performance\n", 11 | "\n", 12 | "Learning the parameters of a prediction function and testing it on the same data is a methodological mistake: a model that would just repeat the labels of the samples that it has just seen would have a perfect score but would fail to predict 
anything useful on yet-unseen data. This situation is called overfitting. To avoid it, it is common practice when performing a (supervised) machine learning experiment to hold out part of the available data as a test set X_test, y_test. Note that the word “experiment” is not intended to denote academic use only, because even in commercial settings machine learning usually starts out experimentally.\n", 13 | "\n", 14 | "In scikit-learn a random split into training and test sets can be quickly computed with the train_test_split helper function. Let’s load the iris data set to fit a linear support vector machine on it:" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false, 22 | "deletable": true, 23 | "editable": true 24 | }, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/plain": [ 29 | "((150, 4), (150,))" 30 | ] 31 | }, 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "output_type": "execute_result" 35 | } 36 | ], 37 | "source": [ 38 | "import numpy as np\n", 39 | "from sklearn.model_selection import train_test_split\n", 40 | "from sklearn import datasets\n", 41 | "from sklearn import svm\n", 42 | "\n", 43 | "iris = datasets.load_iris()\n", 44 | "iris.data.shape, iris.target.shape" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": { 51 | "collapsed": true, 52 | "deletable": true, 53 | "editable": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "train_test_split?" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": { 64 | "collapsed": false, 65 | "deletable": true, 66 | "editable": true 67 | }, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "((90, 4), (90,))" 73 | ] 74 | }, 75 | "execution_count": 3, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "X_train, X_test, y_train, y_test = train_test_split(\n", 82 | " iris.data, iris.target, test_size=0.4, random_state=0)\n", 83 | "\n", 84 | "X_train.shape, y_train.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 4, 90 | "metadata": { 91 | "collapsed": false, 92 | "deletable": true, 93 | "editable": true 94 | }, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "((60, 4), (60,))" 100 | ] 101 | }, 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "X_test.shape, y_test.shape" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "metadata": { 115 | "collapsed": false, 116 | "deletable": true, 117 | "editable": true 118 | }, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "0.96666666666666667" 124 | ] 125 | }, 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)\n", 133 | "clf.score(X_test, y_test) " 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 6, 139 | "metadata": { 140 | "collapsed": false 141 | }, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "0.98888888888888893" 147 | ] 148 | }, 149 | "execution_count": 6, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "clf.score(X_train, y_train)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "deletable": true, 162 | "editable": true 
163 | }, 164 | "source": [ 165 | "When evaluating different settings (“hyperparameters”) for estimators, such as the C setting that must be manually set for an SVM, there is still a risk of overfitting on the test set because the parameters can be tweaked until the estimator performs optimally. This way, knowledge about the test set can “leak” into the model and evaluation metrics no longer report on generalization performance. To solve this problem, yet another part of the dataset can be held out as a so-called “validation set”: training proceeds on the training set, after which evaluation is done on the validation set, and when the experiment seems to be successful, final evaluation can be done on the test set.\n", 166 | "\n", 167 | "However, by partitioning the available data into three sets, we drastically reduce the number of samples which can be used for learning the model, and the results can depend on a particular random choice for the pair of (train, validation) sets.\n", 168 | "\n", 169 | "A solution to this problem is a procedure called cross-validation (CV for short). A test set should still be held out for final evaluation, but the validation set is no longer needed when doing CV. In the basic approach, called k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:\n", 170 | "\n", 171 | "* A model is trained using k-1 of the folds as training data;\n", 172 | "* the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy).\n", 173 | "\n", 174 | "The performance measure reported by k-fold cross-validation is then the average of the values computed in the loop. This approach can be computationally expensive, but does not waste too much data (as it is the case when fixing an arbitrary test set), which is a major advantage in problem such as inverse inference where the number of samples is very small." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 7, 180 | "metadata": { 181 | "collapsed": true, 182 | "deletable": true, 183 | "editable": true 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "from sklearn.model_selection import cross_val_score\n", 188 | "\n", 189 | "cross_val_score?" 
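One standard way to package the tuning loop described above is GridSearchCV, which cross-validates each candidate value of C and keeps the best one; a hedged sketch (the C grid and cv value are illustrative):

```python
from sklearn import datasets, svm
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()

# tune C by cross-validation; a separate test set should still be
# held out for the final evaluation
search = GridSearchCV(svm.SVC(kernel='linear'),
                      param_grid={'C': [0.1, 1, 10]}, cv=5)
search.fit(iris.data, iris.target)
print(search.best_params_)
print(search.best_score_)
```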
190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 10, 195 | "metadata": { 196 | "collapsed": false, 197 | "deletable": true, 198 | "editable": true 199 | }, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "array([ 0.98666667, 0.94666667])" 205 | ] 206 | }, 207 | "execution_count": 10, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "clf = svm.SVC(kernel='linear', C=1)\n", 214 | "\n", 215 | "scores = cross_val_score(clf, iris.data, iris.target, cv=2)\n", 216 | "\n", 217 | "scores" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 11, 223 | "metadata": { 224 | "collapsed": false, 225 | "deletable": true, 226 | "editable": true 227 | }, 228 | "outputs": [ 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | "Accuracy: 0.97 (+/- 0.04)\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 12, 244 | "metadata": { 245 | "collapsed": false, 246 | "deletable": true, 247 | "editable": true 248 | }, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/plain": [ 253 | "array([ 0.96658312, 1. , 0.96658312, 0.96658312, 1. ])" 254 | ] 255 | }, 256 | "execution_count": 12, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "from sklearn import metrics\n", 263 | "\n", 264 | "scores = cross_val_score(\n", 265 | " clf, iris.data, iris.target, cv=5, scoring='f1_macro')\n", 266 | "\n", 267 | "scores" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 13, 273 | "metadata": { 274 | "collapsed": true, 275 | "deletable": true, 276 | "editable": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "from sklearn.model_selection import ShuffleSplit\n", 281 | "\n", 282 | "ShuffleSplit?" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 14, 288 | "metadata": { 289 | "collapsed": false, 290 | "deletable": true, 291 | "editable": true 292 | }, 293 | "outputs": [ 294 | { 295 | "data": { 296 | "text/plain": [ 297 | "array([ 0.97777778, 0.97777778, 1. ])" 298 | ] 299 | }, 300 | "execution_count": 14, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "n_samples = iris.data.shape[0]\n", 307 | "\n", 308 | "cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)\n", 309 | "\n", 310 | "cross_val_score(clf, iris.data, iris.target, cv=cv)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 15, 316 | "metadata": { 317 | "collapsed": false, 318 | "deletable": true, 319 | "editable": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "from sklearn.model_selection import cross_val_predict\n", 324 | "\n", 325 | "cross_val_predict?" 
326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 16, 331 | "metadata": { 332 | "collapsed": false, 333 | "deletable": true, 334 | "editable": true 335 | }, 336 | "outputs": [ 337 | { 338 | "data": { 339 | "text/plain": [ 340 | "(150,)" 341 | ] 342 | }, 343 | "execution_count": 16, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "predicted = cross_val_predict(clf, iris.data, iris.target, cv=10)\n", 350 | "\n", 351 | "predicted.shape" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 17, 357 | "metadata": { 358 | "collapsed": false, 359 | "deletable": true, 360 | "editable": true 361 | }, 362 | "outputs": [ 363 | { 364 | "data": { 365 | "text/plain": [ 366 | "0.97333333333333338" 367 | ] 368 | }, 369 | "execution_count": 17, 370 | "metadata": {}, 371 | "output_type": "execute_result" 372 | } 373 | ], 374 | "source": [ 375 | "metrics.accuracy_score(iris.target, predicted) " 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "from sklearn.linear_model import LassoCV" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": { 392 | "deletable": true, 393 | "editable": true 394 | }, 395 | "source": [ 396 | "## Cross validation iterators\n", 397 | "\n", 398 | "The following sections list utilities to generate indices that can be used to generate dataset splits according to different cross validation strategies.\n", 399 | "\n", 400 | "Assuming that some data is Independent Identically Distributed (i.i.d.) is making the assumption that all samples stem from the same generative process and that the generative process is assumed to have no memory of past generated samples.\n", 401 | "\n", 402 | "The following cross-validators can be used in such cases." 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 18, 408 | "metadata": { 409 | "collapsed": false, 410 | "deletable": true, 411 | "editable": true 412 | }, 413 | "outputs": [], 414 | "source": [ 415 | "from sklearn.model_selection import KFold\n", 416 | "\n", 417 | "KFold?" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 20, 423 | "metadata": { 424 | "collapsed": false, 425 | "deletable": true, 426 | "editable": true 427 | }, 428 | "outputs": [ 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "[0 1 3] [2]\n", 434 | "[0 2 3] [1]\n", 435 | "[0 1 2] [3]\n", 436 | "[1 2 3] [0]\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "kf = KFold(n_splits=4, shuffle=True)\n", 442 | "\n", 443 | "X = [\"a\", \"b\", \"c\", \"d\"]\n", 444 | "for train, test in kf.split(X):\n", 445 | " print(\"%s %s\" % (train, test))" 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": { 451 | "deletable": true, 452 | "editable": true 453 | }, 454 | "source": [ 455 | "#### Stratification\n", 456 | "\n", 457 | "Some classification problems can exhibit a large imbalance in the distribution of the target classes: for instance there could be several times more negative samples than positive samples. In such cases it is recommended to use stratified sampling as implemented in StratifiedKFold and StratifiedShuffleSplit to ensure that relative class frequencies is approximately preserved in each train and validation fold." 
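StratifiedKFold is demonstrated next; StratifiedShuffleSplit, the other class mentioned above, draws randomized splits while approximately preserving the class ratio. A hedged sketch (the split count and test_size are illustrative):

```python
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

X = np.ones(10)
y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

# class proportions are approximately preserved in every test fold
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
for train, test in sss.split(X, y):
    print("%s %s" % (train, test))
```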
458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 21, 463 | "metadata": { 464 | "collapsed": true, 465 | "deletable": true, 466 | "editable": true 467 | }, 468 | "outputs": [], 469 | "source": [ 470 | "from sklearn.model_selection import StratifiedKFold\n", 471 | "\n", 472 | "StratifiedKFold?" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 22, 478 | "metadata": { 479 | "collapsed": false, 480 | "deletable": true, 481 | "editable": true 482 | }, 483 | "outputs": [ 484 | { 485 | "name": "stdout", 486 | "output_type": "stream", 487 | "text": [ 488 | "[2 3 6 7 8 9] [0 1 4 5]\n", 489 | "[0 1 3 4 5 8 9] [2 6 7]\n", 490 | "[0 1 2 4 5 6 7] [3 8 9]\n" 491 | ] 492 | } 493 | ], 494 | "source": [ 495 | "X = np.ones(10)\n", 496 | "y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]\n", 497 | "skf = StratifiedKFold(n_splits=3)\n", 498 | "for train, test in skf.split(X, y):\n", 499 | " print(\"%s %s\" % (train, test))\n" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": { 505 | "deletable": true, 506 | "editable": true 507 | }, 508 | "source": [ 509 | "#### Grouped Data\n", 510 | "\n", 511 | "The i.i.d. assumption is broken if the underlying generative process yield groups of dependent samples.\n", 512 | "\n", 513 | "Such a grouping of data is domain specific. An example would be when there is medical data collected from multiple patients, with multiple samples taken from each patient. And such data is likely to be dependent on the individual group. In our example, the patient id for each sample will be its group identifier.\n", 514 | "\n", 515 | "In this case we would like to know if a model trained on a particular set of groups generalizes well to the unseen groups. To measure this, we need to ensure that all the samples in the validation fold come from groups that are not represented at all in the paired training fold.\n", 516 | "\n", 517 | "The following cross-validation splitters can be used to do that. The grouping identifier for the samples is specified via the groups parameter." 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 23, 523 | "metadata": { 524 | "collapsed": false, 525 | "deletable": true, 526 | "editable": true 527 | }, 528 | "outputs": [ 529 | { 530 | "name": "stdout", 531 | "output_type": "stream", 532 | "text": [ 533 | "[0 1 2 3 4 5] [6 7 8 9]\n", 534 | "[0 1 2 6 7 8 9] [3 4 5]\n", 535 | "[3 4 5 6 7 8 9] [0 1 2]\n" 536 | ] 537 | } 538 | ], 539 | "source": [ 540 | "from sklearn.model_selection import GroupKFold\n", 541 | "\n", 542 | "X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]\n", 543 | "y = [\"a\", \"b\", \"b\", \"b\", \"c\", \"c\", \"c\", \"d\", \"d\", \"d\"]\n", 544 | "groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]\n", 545 | "\n", 546 | "gkf = GroupKFold(n_splits=3)\n", 547 | "for train, test in gkf.split(X, y, groups=groups):\n", 548 | " print(\"%s %s\" % (train, test))\n" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": { 554 | "deletable": true, 555 | "editable": true 556 | }, 557 | "source": [ 558 | "#### Time Series Split\n", 559 | "\n", 560 | "TimeSeriesSplit is a variation of k-fold which returns first k folds as train set and the (k+1) th fold as test set. Note that unlike standard cross-validation methods, successive training sets are supersets of those that come before them. 
Also, it adds all surplus data to the first training partition, which is always used to train the model.\n", 561 | "\n", 562 | "This class can be used to cross-validate time series data samples that are observed at fixed time intervals." 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 24, 568 | "metadata": { 569 | "collapsed": false, 570 | "deletable": true, 571 | "editable": true 572 | }, 573 | "outputs": [ 574 | { 575 | "name": "stdout", 576 | "output_type": "stream", 577 | "text": [ 578 | "TimeSeriesSplit(n_splits=3)\n", 579 | "[0 1 2] [3]\n", 580 | "[0 1 2 3] [4]\n", 581 | "[0 1 2 3 4] [5]\n" 582 | ] 583 | } 584 | ], 585 | "source": [ 586 | "from sklearn.model_selection import TimeSeriesSplit\n", 587 | "\n", 588 | "X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])\n", 589 | "y = np.array([1, 2, 3, 4, 5, 6])\n", 590 | "tscv = TimeSeriesSplit(n_splits=3)\n", 591 | "print(tscv) \n", 592 | "\n", 593 | "for train, test in tscv.split(X):\n", 594 | " print(\"%s %s\" % (train, test))" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": null, 600 | "metadata": { 601 | "collapsed": true, 602 | "deletable": true, 603 | "editable": true 604 | }, 605 | "outputs": [], 606 | "source": [] 607 | } 608 | ], 609 | "metadata": { 610 | "kernelspec": { 611 | "display_name": "Python 2", 612 | "language": "python", 613 | "name": "python2" 614 | }, 615 | "language_info": { 616 | "codemirror_mode": { 617 | "name": "ipython", 618 | "version": 2 619 | }, 620 | "file_extension": ".py", 621 | "mimetype": "text/x-python", 622 | "name": "python", 623 | "nbconvert_exporter": "python", 624 | "pygments_lexer": "ipython2", 625 | "version": "2.7.10" 626 | } 627 | }, 628 | "nbformat": 4, 629 | "nbformat_minor": 2 630 | } 631 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/Multiclass-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Multiclass and Multi Label Algorithms\n", 8 | "\n", 9 | "The sklearn.multiclass module implements meta-estimators to solve multiclass and multilabel classification problems by decomposing such problems into binary classification problems. Multitarget regression is also supported.\n", 10 | "\n", 11 | "* Multiclass classification means a classification task with more than two classes; e.g., classify a set of images of fruits which may be oranges, apples, or pears. Multiclass classification makes the assumption that each sample is assigned to one and only one label: a fruit can be either an apple or a pear but not both at the same time.\n", 12 | "* Multilabel classification assigns to each sample a set of target labels. This can be thought as predicting properties of a data-point that are not mutually exclusive, such as topics that are relevant for a document. A text might be about any of religion, politics, finance or education at the same time or none of these.\n", 13 | "* Multioutput regression assigns each sample a set of target values. This can be thought of as predicting several properties for each data-point, such as wind direction and magnitude at a certain location.\n", 14 | "* Multioutput-multiclass classification and multi-task classification means that a single estimator has to handle several joint classification tasks. 
This is both a generalization of the multi-label classification task, which only considers binary classification, as well as a generalization of the multi-class classification task. The output format is a 2d numpy array or sparse matrix.\n", 15 | "\n", 16 | " The set of labels can be different for each output variable. For instance, a sample could be assigned “pear” for an output variable that takes possible values in a finite set of species such as “pear”, “apple”; and “blue” or “green” for a second output variable that takes possible values in a finite set of colors such as “green”, “red”, “blue”, “yellow”...\n", 17 | "\n", 18 | " This means that any classifiers handling multi-output multiclass or multi-task classification tasks, support the multi-label classification task as a special case. Multi-task classification is similar to the multi-output classification task with different model formulations. For more information, see the relevant estimator documentation.\n", 19 | "\n", 20 | "All scikit-learn classifiers are capable of multiclass classification, but the meta-estimators offered by sklearn.multiclass permit changing the way they handle more than two classes because this may have an effect on classifier performance (either in terms of generalization error or required computational resources).\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Multilabel classification format\n", 28 | "\n", 29 | "In multilabel learning, the joint set of binary classification tasks is expressed with label binary indicator array: each sample is one row of a 2d array of shape (n_samples, n_classes) with binary values: the one, i.e. the non zero elements, corresponds to the subset of labels. An array such as np.array([[1, 0, 0], [0, 1, 1], [0, 0, 0]]) represents label 0 in the first sample, labels 1 and 2 in the second sample, and no labels in the third sample.\n", 30 | "\n", 31 | "Producing multilabel data as a list of sets of labels may be more intuitive. The MultiLabelBinarizer transformer can be used to convert between a collection of collections of labels and the indicator format.\n", 32 | "\n", 33 | "This is skipping ahead by a couple of lessons (we have not seen transform before!) But keep this in the back of your mind for when we get there and just memorize this for now" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 1, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "array([[0, 0, 1, 1, 1],\n", 47 | " [0, 0, 1, 0, 0],\n", 48 | " [1, 1, 0, 1, 0],\n", 49 | " [1, 1, 1, 1, 1],\n", 50 | " [1, 1, 1, 0, 0]])" 51 | ] 52 | }, 53 | "execution_count": 1, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 60 | "\n", 61 | "y = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]]\n", 62 | "\n", 63 | "MultiLabelBinarizer().fit_transform(y)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## One vs Rest\n", 71 | "\n", 72 | "This strategy, also known as one-vs-all, is implemented in OneVsRestClassifier. The strategy consists in fitting one classifier per class. For each classifier, the class is fitted against all the other classes. In addition to its computational efficiency (only n_classes classifiers are needed), one advantage of this approach is its interpretability. 
Since each class is represented by one and only one classifier, it is possible to gain knowledge about the class by inspecting its corresponding classifier. This is the most commonly used strategy and is a fair default choice." 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 7, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "from sklearn import datasets\n", 84 | "\n", 85 | "from sklearn.multiclass import OneVsRestClassifier\n", 86 | "\n", 87 | "OneVsRestClassifier?" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 6, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "from sklearn.svm import LinearSVC\n", 99 | "\n", 100 | "# Note that this also can OneVsRest\n", 101 | "LinearSVC?" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 8, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 115 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 116 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 117 | " verbose=0),\n", 118 | " n_jobs=1)" 119 | ] 120 | }, 121 | "execution_count": 8, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "X, y = datasets.load_iris(return_X_y=True)\n", 128 | "\n", 129 | "m = OneVsRestClassifier(LinearSVC())\n", 130 | "\n", 131 | "m.fit(X, y)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 9, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "(array([0, 1, 2]), False)" 145 | ] 146 | }, 147 | "execution_count": 9, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "m.classes_, m.multilabel_" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 11, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "0.96666666666666667" 167 | ] 168 | }, 169 | "execution_count": 11, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "m.score(X, y)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 12, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "0.96666666666666667" 189 | ] 190 | }, 191 | "execution_count": 12, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "LinearSVC().fit(X, y).score(X, y)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## One Vs One\n", 205 | "\n", 206 | "OneVsOneClassifier constructs one classifier per pair of classes. At prediction time, the class which received the most votes is selected. In the event of a tie (among two classes with an equal number of votes), it selects the class with the highest aggregate classification confidence by summing over the pair-wise classification confidence levels computed by the underlying binary classifiers.\n", 207 | "\n", 208 | "Since it requires to fit n_classes * (n_classes - 1) / 2 classifiers, this method is usually slower than one-vs-the-rest, due to its O(n_classes^2) complexity. 
However, this method may be advantageous for algorithms such as kernel algorithms which don’t scale well with n_samples. This is because each individual learning problem only involves a small subset of the data whereas, with one-vs-the-rest, the complete dataset is used n_classes times." 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 13, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "from sklearn.multiclass import OneVsOneClassifier\n", 220 | "\n", 221 | "OneVsOneClassifier?" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 14, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "OneVsOneClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 235 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 236 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 237 | " verbose=0),\n", 238 | " n_jobs=1)" 239 | ] 240 | }, 241 | "execution_count": 14, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "m = OneVsOneClassifier(\n", 248 | " LinearSVC())\n", 249 | "\n", 250 | "m.fit(X, y)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 15, 256 | "metadata": { 257 | "collapsed": false 258 | }, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/plain": [ 263 | "(LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 264 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 265 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 266 | " verbose=0),\n", 267 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 268 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 269 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 270 | " verbose=0),\n", 271 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 272 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 273 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 274 | " verbose=0))" 275 | ] 276 | }, 277 | "execution_count": 15, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "m.estimators_" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 16, 289 | "metadata": { 290 | "collapsed": false 291 | }, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "0.97999999999999998" 297 | ] 298 | }, 299 | "execution_count": 16, 300 | "metadata": {}, 301 | "output_type": "execute_result" 302 | } 303 | ], 304 | "source": [ 305 | "m.score(X, y)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "## Error Correcting Output Codes\n", 313 | "\n", 314 | "Output-code based strategies are fairly different from one-vs-the-rest and one-vs-one. With these strategies, each class is represented in a Euclidean space, where each dimension can only be 0 or 1. Another way to put it is that each class is represented by a binary code (an array of 0 and 1). The matrix which keeps track of the location/code of each class is called the code book. The code size is the dimensionality of the aforementioned space. 
Intuitively, each class should be represented by a code as unique as possible and a good code book should be designed to optimize classification accuracy.\n", 315 | "\n", 316 | "At fitting time, one binary classifier per bit in the code book is fitted. At prediction time, the classifiers are used to project new points in the class space and the class closest to the points is chosen.\n", 317 | "\n", 318 | "In OutputCodeClassifier, the code_size attribute allows the user to control the number of classifiers which will be used. It is a percentage of the total number of classes.\n", 319 | "\n", 320 | "A number between 0 and 1 will require fewer classifiers than one-vs-the-rest. In theory, log2(n_classes) / n_classes is sufficient to represent each class unambiguously. However, in practice, it may not lead to good accuracy since log2(n_classes) is much smaller than n_classes.\n", 321 | "\n", 322 | "A number greater than 1 will require more classifiers than one-vs-the-rest. In this case, some classifiers will in theory correct for the mistakes made by other classifiers, hence the name “error-correcting”. In practice, however, this may not happen as classifier mistakes will typically be correlated. The error-correcting output codes have a similar effect to bagging." 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 17, 328 | "metadata": { 329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "from sklearn.multiclass import OutputCodeClassifier\n", 334 | "\n", 335 | "OutputCodeClassifier?" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 21, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/plain": [ 348 | "OutputCodeClassifier(code_size=2,\n", 349 | " estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 350 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 351 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 352 | " verbose=0),\n", 353 | " n_jobs=1, random_state=None)" 354 | ] 355 | }, 356 | "execution_count": 21, 357 | "metadata": {}, 358 | "output_type": "execute_result" 359 | } 360 | ], 361 | "source": [ 362 | "m = OutputCodeClassifier(LinearSVC(), code_size=2)\n", 363 | "\n", 364 | "m.fit(X, y)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 22, 370 | "metadata": { 371 | "collapsed": false 372 | }, 373 | "outputs": [ 374 | { 375 | "data": { 376 | "text/plain": [ 377 | "[LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 378 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 379 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 380 | " verbose=0),\n", 381 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 382 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 383 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 384 | " verbose=0),\n", 385 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 386 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 387 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 388 | " verbose=0),\n", 389 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 390 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 391 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 392 | " verbose=0),\n", 393 | " 
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 394 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 395 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 396 | " verbose=0),\n", 397 | " _ConstantPredictor()]" 398 | ] 399 | }, 400 | "execution_count": 22, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "m.estimators_" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 23, 412 | "metadata": { 413 | "collapsed": false 414 | }, 415 | "outputs": [ 416 | { 417 | "data": { 418 | "text/plain": [ 419 | "0.96666666666666667" 420 | ] 421 | }, 422 | "execution_count": 23, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "m.score(X, y)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "# Multiple Output Regression and Classification\n", 436 | "\n", 437 | "Multioutput regression support can be added to any regressor with MultiOutputRegressor. This strategy consists of fitting one regressor per target. Since each target is represented by exactly one regressor it is possible to gain knowledge about the target by inspecting its corresponding regressor. As MultiOutputRegressor fits one regressor per target it cannot take advantage of correlations between targets.\n", 438 | "\n", 439 | "Multioutput classification support can be added to any classifier with MultiOutputClassifier. This strategy consists of fitting one classifier per target. This allows multiple target variable classifications. The purpose of this class is to extend estimators to be able to estimate a series of target functions (f1,f2,f3...,fn) that are trained on a single X predictor matrix to predict a series of responses (y1,y2,y3...,yn)."
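The cell that follows demonstrates MultiOutputRegressor only, so here is a minimal companion sketch of MultiOutputClassifier. The make_multilabel_classification dataset and the RandomForestClassifier base estimator are illustrative assumptions, not anything used in this notebook.

```python
# Hedged sketch (not from the notebook): MultiOutputClassifier fits one
# classifier per output column of y. Dataset and base estimator are
# illustrative choices.
from sklearn.datasets import make_multilabel_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# y has shape (n_samples, n_outputs); each column is a separate target.
X_mo, y_mo = make_multilabel_classification(n_samples=100, n_classes=3,
                                            random_state=0)

clf = MultiOutputClassifier(RandomForestClassifier(random_state=0))
clf.fit(X_mo, y_mo)

print(clf.predict(X_mo[:2]))   # one label vector per sample
print(len(clf.estimators_))    # one fitted classifier per output column
```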
440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 25, 445 | "metadata": { 446 | "collapsed": false 447 | }, 448 | "outputs": [ 449 | { 450 | "data": { 451 | "text/plain": [ 452 | "0.99999999911789184" 453 | ] 454 | }, 455 | "execution_count": 25, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | } 459 | ], 460 | "source": [ 461 | "from sklearn.datasets import make_regression\n", 462 | "\n", 463 | "from sklearn.multioutput import MultiOutputRegressor\n", 464 | "\n", 465 | "from sklearn.ensemble import GradientBoostingRegressor\n", 466 | "\n", 467 | "X, y = make_regression(n_samples=10, n_targets=3, random_state=1)\n", 468 | "\n", 469 | "MultiOutputRegressor(\n", 470 | " GradientBoostingRegressor(random_state=0)).fit(X, y).score(X, y)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": { 477 | "collapsed": true 478 | }, 479 | "outputs": [], 480 | "source": [] 481 | } 482 | ], 483 | "metadata": { 484 | "kernelspec": { 485 | "display_name": "Python 2", 486 | "language": "python", 487 | "name": "python2" 488 | }, 489 | "language_info": { 490 | "codemirror_mode": { 491 | "name": "ipython", 492 | "version": 2 493 | }, 494 | "file_extension": ".py", 495 | "mimetype": "text/x-python", 496 | "name": "python", 497 | "nbconvert_exporter": "python", 498 | "pygments_lexer": "ipython2", 499 | "version": "2.7.10" 500 | } 501 | }, 502 | "nbformat": 4, 503 | "nbformat_minor": 2 504 | } 505 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/EnsembleMethods-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Ensemble Methods\n", 8 | "\n", 9 | "The goal of ensemble methods is to combine the predictions of several base estimators built with a given learning algorithm in order to improve generalizability / robustness over a single estimator.\n", 10 | "\n", 11 | "Two families of ensemble methods are usually distinguished:\n", 12 | "* In averaging methods, the driving principle is to build several estimators independently and then to average their predictions. On average, the combined estimator is usually better than any of the single base estimator because its variance is reduced.\n", 13 | "\n", 14 | "Examples: Bagging methods, Forests of randomized trees, ...\n", 15 | "\n", 16 | "* By contrast, in boosting methods, base estimators are built sequentially and one tries to reduce the bias of the combined estimator. The motivation is to combine several weak models to produce a powerful ensemble.\n", 17 | "\n", 18 | "Examples: AdaBoost, Gradient Tree Boosting, ..." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Bagging Meta Estimator\n", 26 | "\n", 27 | "In ensemble algorithms, bagging methods form a class of algorithms which build several instances of a black-box estimator on random subsets of the original training set and then aggregate their individual predictions to form a final prediction. These methods are used as a way to reduce the variance of a base estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it. In many cases, bagging methods constitute a very simple way to improve with respect to a single model, without making it necessary to adapt the underlying base algorithm. 
As they provide a way to reduce overfitting, bagging methods work best with strong and complex models (e.g., fully developed decision trees), in contrast with boosting methods which usually work best with weak models (e.g., shallow decision trees).\n", 28 | "\n", 29 | "Bagging methods come in many flavours but mostly differ from each other by the way they draw random subsets of the training set:\n", 30 | "\n", 31 | "* When random subsets of the dataset are drawn as random subsets of the samples, then this algorithm is known as Pasting\n", 32 | "* When samples are drawn with replacement, then the method is known as Bagging\n", 33 | "* When random subsets of the dataset are drawn as random subsets of the features, then the method is known as Random Subspaces\n", 34 | "* Finally, when base estimators are built on subsets of both samples and features, then the method is known as Random Patches\n", 35 | "\n", 36 | "In scikit-learn, bagging methods are offered as a unified BaggingClassifier meta-estimator (resp. BaggingRegressor), taking as input a user-specified base estimator along with parameters specifying the strategy to draw random subsets. In particular, max_samples and max_features control the size of the subsets (in terms of samples and features), while bootstrap and bootstrap_features control whether samples and features are drawn with or without replacement. When using a subset of the available samples the generalization accuracy can be estimated with the out-of-bag samples by setting oob_score=True. " 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "To get started, let's import some data" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "import sklearn.datasets as datasets\n", 55 | "\n", 56 | "X, y = datasets.load_iris(return_X_y=True)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Notice that we again see parallelization!\n", 64 | "\n", 65 | "Next let's check out the features of the BaggingClassifier (the BaggingRegressor is very similar)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "from sklearn.ensemble import BaggingClassifier\n", 77 | "\n", 78 | "BaggingClassifier?" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "Now that we have a feel for it, let's pair it with a classifier. And for this we will use KNN." 
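Before the KNN pairing below, here is a sketch of how the four flavours listed above map onto BaggingClassifier's sampling arguments, under the parameter semantics just described; the DecisionTreeClassifier base estimator and the 0.5 fractions are illustrative assumptions, not settings taken from this notebook.

```python
# Hedged sketch: the same BaggingClassifier meta-estimator expresses all four
# flavours purely through its sampling arguments (illustrative settings only).
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
base = DecisionTreeClassifier(random_state=0)

# Pasting: random subsets of samples, drawn without replacement.
pasting = BaggingClassifier(base, max_samples=0.5, bootstrap=False)

# Bagging: samples drawn with replacement (bootstrap=True is the default).
bagging = BaggingClassifier(base, max_samples=0.5, bootstrap=True)

# Random Subspaces: random subsets of the features, every sample kept.
subspaces = BaggingClassifier(base, bootstrap=False, max_features=0.5)

# Random Patches: random subsets of both samples and features.
patches = BaggingClassifier(base, max_samples=0.5, max_features=0.5)

for name, model in [("pasting", pasting), ("bagging", bagging),
                    ("subspaces", subspaces), ("patches", patches)]:
    print(name, model.fit(X, y).score(X, y))
```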
86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "from sklearn.neighbors import KNeighborsClassifier\n", 97 | "\n", 98 | "m = KNeighborsClassifier(n_neighbors=3)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 10, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "bag = BaggingClassifier(\n", 110 | " m, \n", 111 | " max_samples=.5, \n", 112 | " max_features=2, \n", 113 | " n_jobs=2,\n", 114 | " oob_score=True)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 11, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "BaggingClassifier(base_estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 128 | " metric_params=None, n_jobs=1, n_neighbors=3, p=2,\n", 129 | " weights='uniform'),\n", 130 | " bootstrap=True, bootstrap_features=False, max_features=2,\n", 131 | " max_samples=0.5, n_estimators=10, n_jobs=2, oob_score=True,\n", 132 | " random_state=None, verbose=0, warm_start=False)" 133 | ] 134 | }, 135 | "execution_count": 11, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "bag.fit(X, y)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 12, 147 | "metadata": { 148 | "collapsed": false 149 | }, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "0.92666666666666664" 155 | ] 156 | }, 157 | "execution_count": 12, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "bag.oob_score_" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 13, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "array([0])" 177 | ] 178 | }, 179 | "execution_count": 13, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "bag.predict([X[0]])" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 14, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "array([[ 1., 0., 0.]])" 199 | ] 200 | }, 201 | "execution_count": 14, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "bag.predict_proba([X[0]])" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 24, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "0.95999999999999996" 221 | ] 222 | }, 223 | "execution_count": 24, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "bag.score(X, y)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "## Random Forests\n", 237 | "\n", 238 | "Random forests are somewhat special. They happen to be so frequently used a bagging method that they have become their own method. They are in that way the same as a classic Supervised Estimator with all the base functionality, plus a little extra bagging goodness." 
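The cells below score models on their own training data; as a hedged aside, a cross-validated comparison against a single tree is one way to see the variance reduction the text describes. The DecisionTreeClassifier baseline and the cross_val_score settings are illustrative choices, not part of the notebook.

```python
# Hedged sketch: compare a single tree with a random forest under
# 5-fold cross-validation instead of training-set accuracy.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)

tree = DecisionTreeClassifier(random_state=0)
forest = RandomForestClassifier(n_estimators=100, random_state=0)

print("tree  ", cross_val_score(tree, X, y, cv=5).mean())
print("forest", cross_val_score(forest, X, y, cv=5).mean())

# The fitted forest also exposes per-feature importances.
print(forest.fit(X, y).feature_importances_)
```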
239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 15, 244 | "metadata": { 245 | "collapsed": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "from sklearn.ensemble import RandomForestClassifier\n", 250 | "\n", 251 | "RandomForestClassifier?" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 17, 257 | "metadata": { 258 | "collapsed": false 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "m = RandomForestClassifier(n_estimators=20, oob_score=True)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 18, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/plain": [ 275 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 276 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 277 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 278 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 279 | " n_estimators=20, n_jobs=1, oob_score=True, random_state=None,\n", 280 | " verbose=0, warm_start=False)" 281 | ] 282 | }, 283 | "execution_count": 18, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "m.fit(X, y)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 23, 295 | "metadata": { 296 | "collapsed": false 297 | }, 298 | "outputs": [ 299 | { 300 | "data": { 301 | "text/plain": [ 302 | "array([0])" 303 | ] 304 | }, 305 | "execution_count": 23, 306 | "metadata": {}, 307 | "output_type": "execute_result" 308 | } 309 | ], 310 | "source": [ 311 | "m.predict([X[0]])" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 25, 317 | "metadata": { 318 | "collapsed": false 319 | }, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "0.99333333333333329" 325 | ] 326 | }, 327 | "execution_count": 25, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "m.score(X, y)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "## AdaBoost\n", 341 | "\n", 342 | "The module sklearn.ensemble includes the popular boosting algorithm AdaBoost, introduced in 1995 by Freund and Schapire.\n", 343 | "\n", 344 | "The core principle of AdaBoost is to fit a sequence of weak learners (i.e., models that are only slightly better than random guessing, such as small decision trees) on repeatedly modified versions of the data. The predictions from all of them are then combined through a weighted majority vote (or sum) to produce the final prediction. The data modifications at each so-called boosting iteration consist of applying weights w_1, w_2, ..., w_N to each of the training samples. Initially, those weights are all set to w_i = 1/N, so that the first step simply trains a weak learner on the original data. For each successive iteration, the sample weights are individually modified and the learning algorithm is reapplied to the reweighted data. At a given step, those training examples that were incorrectly predicted by the boosted model induced at the previous step have their weights increased, whereas the weights are decreased for those that were predicted correctly. As iterations proceed, examples that are difficult to predict receive ever-increasing influence. 
Each subsequent weak learner is thereby forced to concentrate on the examples that are missed by the previous ones in the sequence " 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 27, 350 | "metadata": { 351 | "collapsed": true 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "from sklearn.ensemble import AdaBoostClassifier\n", 356 | "\n", 357 | "AdaBoostClassifier?" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 28, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "m = AdaBoostClassifier(base_estimator=None, n_estimators=100)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 30, 374 | "metadata": { 375 | "collapsed": false 376 | }, 377 | "outputs": [ 378 | { 379 | "data": { 380 | "text/plain": [ 381 | "AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,\n", 382 | " learning_rate=1.0, n_estimators=100, random_state=None)" 383 | ] 384 | }, 385 | "execution_count": 30, 386 | "metadata": {}, 387 | "output_type": "execute_result" 388 | } 389 | ], 390 | "source": [ 391 | "m.fit(X, y)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 31, 397 | "metadata": { 398 | "collapsed": false 399 | }, 400 | "outputs": [ 401 | { 402 | "data": { 403 | "text/plain": [ 404 | "0.97333333333333338" 405 | ] 406 | }, 407 | "execution_count": 31, 408 | "metadata": {}, 409 | "output_type": "execute_result" 410 | } 411 | ], 412 | "source": [ 413 | "m.score(X, y)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "## Gradient Tree Boosting\n", 421 | "\n", 422 | "Gradient Tree Boosting or Gradient Boosted Regression Trees (GBRT) is a generalization of boosting to arbitrary differentiable loss functions. GBRT is an accurate and effective off-the-shelf procedure that can be used for both regression and classification problems. Gradient Tree Boosting models are used in a variety of areas including Web search ranking and ecology.\n", 423 | "\n", 424 | "The advantages of GBRT are:\n", 425 | "* Natural handling of data of mixed type (= heterogeneous features)\n", 426 | "* Predictive power\n", 427 | "* Robustness to outliers in output space (via robust loss functions)\n", 428 | "\n", 429 | "The disadvantages of GBRT are:\n", 430 | "\n", 431 | "* Scalability, due to the sequential nature of boosting it can hardly be parallelized.\n", 432 | "\n", 433 | "The module sklearn.ensemble provides methods for both classification and regression via gradient boosted regression trees." 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 32, 439 | "metadata": { 440 | "collapsed": true 441 | }, 442 | "outputs": [], 443 | "source": [ 444 | "from sklearn.ensemble import GradientBoostingClassifier\n", 445 | "\n", 446 | "GradientBoostingClassifier?" 
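Because boosting is sequential, the ensemble's accuracy can be inspected stage by stage as estimators are added. A hedged sketch using AdaBoostClassifier.staged_score on a held-out split follows; the split and the estimator count are illustrative assumptions, not values used in the cells above or below.

```python
# Hedged sketch: watch the boosted ensemble's held-out accuracy evolve
# as boosting iterations are added.
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

boost = AdaBoostClassifier(n_estimators=50).fit(X_tr, y_tr)

# staged_score yields one score per boosting iteration.
for i, score in enumerate(boost.staged_score(X_te, y_te)):
    if i % 10 == 0:
        print(i, round(score, 3))
```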
447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 37, 452 | "metadata": { 453 | "collapsed": false 454 | }, 455 | "outputs": [ 456 | { 457 | "data": { 458 | "text/plain": [ 459 | "0.99333333333333329" 460 | ] 461 | }, 462 | "execution_count": 37, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "m = GradientBoostingClassifier(n_estimators=10)\n", 469 | "\n", 470 | "m.fit(X, y)\n", 471 | "\n", 472 | "m.score(X, y)" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 38, 478 | "metadata": { 479 | "collapsed": false 480 | }, 481 | "outputs": [ 482 | { 483 | "data": { 484 | "text/plain": [ 485 | "1.0" 486 | ] 487 | }, 488 | "execution_count": 38, 489 | "metadata": {}, 490 | "output_type": "execute_result" 491 | } 492 | ], 493 | "source": [ 494 | "m.set_params(n_estimators=20, warm_start=True)\n", 495 | "\n", 496 | "m.fit(X, y)\n", 497 | "\n", 498 | "m.score(X, y)" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 39, 504 | "metadata": { 505 | "collapsed": false 506 | }, 507 | "outputs": [ 508 | { 509 | "data": { 510 | "text/plain": [ 511 | "array([ 0.00444544, 0.03816819, 0.41928686, 0.53809951])" 512 | ] 513 | }, 514 | "execution_count": 39, 515 | "metadata": {}, 516 | "output_type": "execute_result" 517 | } 518 | ], 519 | "source": [ 520 | "m.feature_importances_" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": {}, 526 | "source": [ 527 | "## Voting Classifier\n", 528 | "\n", 529 | "The idea behind the voting classifier implementation is to combine conceptually different machine learning classifiers and use a majority vote or the average predicted probabilities (soft vote) to predict the class labels. Such a classifier can be useful for a set of equally well performing models in order to balance out their individual weaknesses." 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 40, 535 | "metadata": { 536 | "collapsed": true 537 | }, 538 | "outputs": [], 539 | "source": [ 540 | "from sklearn.ensemble import VotingClassifier\n", 541 | "\n", 542 | "VotingClassifier?"
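The cells below use hard (majority) voting; as a hedged complement, this sketch shows soft voting, which averages predicted class probabilities and accepts per-model weights. The weights here are illustrative, not tuned values.

```python
# Hedged sketch: soft voting averages predict_proba across classifiers,
# optionally weighting them.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X, y = load_iris(return_X_y=True)

soft = VotingClassifier(
    estimators=[('lr', LogisticRegression(max_iter=1000)),
                ('rf', RandomForestClassifier(random_state=0)),
                ('gnb', GaussianNB())],
    voting='soft',
    weights=[1, 2, 1])

soft.fit(X, y)
print(soft.predict_proba(X[:2]))  # weighted, averaged class probabilities
print(soft.score(X, y))
```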
543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 43, 548 | "metadata": { 549 | "collapsed": true 550 | }, 551 | "outputs": [], 552 | "source": [ 553 | "from sklearn.linear_model import LogisticRegression\n", 554 | "from sklearn.naive_bayes import GaussianNB\n", 555 | "from sklearn.ensemble import RandomForestClassifier\n", 556 | "\n", 557 | "\n", 558 | "m = VotingClassifier(\n", 559 | " estimators=[('lr', LogisticRegression()), \n", 560 | " ('rf', RandomForestClassifier()), \n", 561 | " ('gnb', GaussianNB())], \n", 562 | " voting='hard')" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 45, 568 | "metadata": { 569 | "collapsed": false 570 | }, 571 | "outputs": [ 572 | { 573 | "name": "stdout", 574 | "output_type": "stream", 575 | "text": [ 576 | "VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", 577 | " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", 578 | " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", 579 | " verbose=0, warm_start=False)), ('rf', RandomF...lse, random_state=None,\n", 580 | " verbose=0, warm_start=False)), ('gnb', GaussianNB(priors=None))],\n", 581 | " n_jobs=1, voting='hard', weights=None)\n" 582 | ] 583 | } 584 | ], 585 | "source": [ 586 | "m.fit(X, y)" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 49, 592 | "metadata": { 593 | "collapsed": false 594 | }, 595 | "outputs": [ 596 | { 597 | "data": { 598 | "text/plain": [ 599 | "0.98666666666666669" 600 | ] 601 | }, 602 | "execution_count": 49, 603 | "metadata": {}, 604 | "output_type": "execute_result" 605 | } 606 | ], 607 | "source": [ 608 | "m.score(X, y)" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": null, 614 | "metadata": { 615 | "collapsed": true 616 | }, 617 | "outputs": [], 618 | "source": [] 619 | } 620 | ], 621 | "metadata": { 622 | "kernelspec": { 623 | "display_name": "Python 2", 624 | "language": "python", 625 | "name": "python2" 626 | }, 627 | "language_info": { 628 | "codemirror_mode": { 629 | "name": "ipython", 630 | "version": 2 631 | }, 632 | "file_extension": ".py", 633 | "mimetype": "text/x-python", 634 | "name": "python", 635 | "nbconvert_exporter": "python", 636 | "pygments_lexer": "ipython2", 637 | "version": "2.7.10" 638 | } 639 | }, 640 | "nbformat": 4, 641 | "nbformat_minor": 2 642 | } 643 | -------------------------------------------------------------------------------- /notebooks/Multiclass.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Multiclass and Multi Label Algorithms\n", 11 | "\n", 12 | "The sklearn.multiclass module implements meta-estimators to solve multiclass and multilabel classification problems by decomposing such problems into binary classification problems. Multitarget regression is also supported.\n", 13 | "\n", 14 | "* Multiclass classification means a classification task with more than two classes; e.g., classify a set of images of fruits which may be oranges, apples, or pears. Multiclass classification makes the assumption that each sample is assigned to one and only one label: a fruit can be either an apple or a pear but not both at the same time.\n", 15 | "* Multilabel classification assigns to each sample a set of target labels. 
This can be thought of as predicting properties of a data-point that are not mutually exclusive, such as topics that are relevant for a document. A text might be about any of religion, politics, finance or education at the same time or none of these.\n", 16 | "* Multioutput regression assigns each sample a set of target values. This can be thought of as predicting several properties for each data-point, such as wind direction and magnitude at a certain location.\n", 17 | "* Multioutput-multiclass classification and multi-task classification mean that a single estimator has to handle several joint classification tasks. This is both a generalization of the multi-label classification task, which only considers binary classification, as well as a generalization of the multi-class classification task. The output format is a 2d numpy array or sparse matrix.\n", 18 | "\n", 19 | " The set of labels can be different for each output variable. For instance, a sample could be assigned “pear” for an output variable that takes possible values in a finite set of species such as “pear”, “apple”; and “blue” or “green” for a second output variable that takes possible values in a finite set of colors such as “green”, “red”, “blue”, “yellow”...\n", 20 | "\n", 21 | " This means that any classifiers handling multi-output multiclass or multi-task classification tasks support the multi-label classification task as a special case. Multi-task classification is similar to the multi-output classification task with different model formulations. For more information, see the relevant estimator documentation.\n", 22 | "\n", 23 | "All scikit-learn classifiers are capable of multiclass classification, but the meta-estimators offered by sklearn.multiclass permit changing the way they handle more than two classes because this may have an effect on classifier performance (either in terms of generalization error or required computational resources).\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": { 29 | "deletable": true, 30 | "editable": true 31 | }, 32 | "source": [ 33 | "## Multilabel classification format\n", 34 | "\n", 35 | "In multilabel learning, the joint set of binary classification tasks is expressed with a label binary indicator array: each sample is one row of a 2d array of shape (n_samples, n_classes) with binary values: the ones, i.e. the non-zero elements, correspond to the subset of labels. An array such as np.array([[1, 0, 0], [0, 1, 1], [0, 0, 0]]) represents label 0 in the first sample, labels 1 and 2 in the second sample, and no labels in the third sample.\n", 36 | "\n", 37 | "Producing multilabel data as a list of sets of labels may be more intuitive. The MultiLabelBinarizer transformer can be used to convert between a collection of collections of labels and the indicator format.\n", 38 | "\n", 39 | "This is skipping ahead by a couple of lessons (we have not seen transform before!) 
But keep this in the back of your mind for when we get there and just memorize this for now" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 1, 45 | "metadata": { 46 | "collapsed": false, 47 | "deletable": true, 48 | "editable": true 49 | }, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "array([[0, 0, 1, 1, 1],\n", 55 | " [0, 0, 1, 0, 0],\n", 56 | " [1, 1, 0, 1, 0],\n", 57 | " [1, 1, 1, 1, 1],\n", 58 | " [1, 1, 1, 0, 0]])" 59 | ] 60 | }, 61 | "execution_count": 1, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 68 | "\n", 69 | "y = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]]\n", 70 | "\n", 71 | "MultiLabelBinarizer().fit_transform(y)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": { 77 | "deletable": true, 78 | "editable": true 79 | }, 80 | "source": [ 81 | "## One vs Rest\n", 82 | "\n", 83 | "This strategy, also known as one-vs-all, is implemented in OneVsRestClassifier. The strategy consists in fitting one classifier per class. For each classifier, the class is fitted against all the other classes. In addition to its computational efficiency (only n_classes classifiers are needed), one advantage of this approach is its interpretability. Since each class is represented by one and only one classifier, it is possible to gain knowledge about the class by inspecting its corresponding classifier. This is the most commonly used strategy and is a fair default choice." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 2, 89 | "metadata": { 90 | "collapsed": false, 91 | "deletable": true, 92 | "editable": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "from sklearn import datasets\n", 97 | "\n", 98 | "from sklearn.multiclass import OneVsRestClassifier\n", 99 | "\n", 100 | "OneVsRestClassifier?" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 3, 106 | "metadata": { 107 | "collapsed": true, 108 | "deletable": true, 109 | "editable": true 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "from sklearn.svm import LinearSVC\n", 114 | "\n", 115 | "# Note that this also can OneVsRest\n", 116 | "LinearSVC?" 
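One-vs-the-rest is also the usual route to multilabel classification with the indicator format shown earlier. A minimal sketch follows, assuming make_multilabel_classification and LogisticRegression, neither of which appears in this notebook.

```python
# Hedged sketch: OneVsRestClassifier handles multilabel targets when y is
# the binary indicator matrix described above.
from sklearn.datasets import make_multilabel_classification
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

X_ml, Y_ml = make_multilabel_classification(n_samples=100, n_classes=4,
                                            random_state=0)

ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X_ml, Y_ml)

print(ovr.multilabel_)        # True: indicator-matrix target detected
print(ovr.predict(X_ml[:3]))  # one row of 0/1 labels per sample
```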
117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 4, 122 | "metadata": { 123 | "collapsed": false, 124 | "deletable": true, 125 | "editable": true 126 | }, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 132 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 133 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 134 | " verbose=0),\n", 135 | " n_jobs=1)" 136 | ] 137 | }, 138 | "execution_count": 4, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "X, y = datasets.load_iris(return_X_y=True)\n", 145 | "\n", 146 | "m = OneVsRestClassifier(LinearSVC())\n", 147 | "\n", 148 | "m.fit(X, y)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 5, 154 | "metadata": { 155 | "collapsed": false, 156 | "deletable": true, 157 | "editable": true 158 | }, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "(array([0, 1, 2]), False)" 164 | ] 165 | }, 166 | "execution_count": 5, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "m.classes_, m.multilabel_" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 6, 178 | "metadata": { 179 | "collapsed": false, 180 | "deletable": true, 181 | "editable": true 182 | }, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "0.96666666666666667" 188 | ] 189 | }, 190 | "execution_count": 6, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "m.score(X, y)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 7, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "array([[ 0.18424099, 0.45123591, -0.80793809, -0.45070657],\n", 210 | " [ 0.04859238, -0.88423382, 0.40385695, -0.93606001],\n", 211 | " [-0.85070203, -0.98668927, 1.38090339, 1.86546957]])" 212 | ] 213 | }, 214 | "execution_count": 7, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "m.coef_" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 8, 226 | "metadata": { 227 | "collapsed": false, 228 | "deletable": true, 229 | "editable": true 230 | }, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/plain": [ 235 | "0.96666666666666667" 236 | ] 237 | }, 238 | "execution_count": 8, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "LinearSVC().fit(X, y).score(X, y)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": { 250 | "deletable": true, 251 | "editable": true 252 | }, 253 | "source": [ 254 | "## One Vs One\n", 255 | "\n", 256 | "OneVsOneClassifier constructs one classifier per pair of classes. At prediction time, the class which received the most votes is selected. In the event of a tie (among two classes with an equal number of votes), it selects the class with the highest aggregate classification confidence by summing over the pair-wise classification confidence levels computed by the underlying binary classifiers.\n", 257 | "\n", 258 | "Since it requires to fit n_classes * (n_classes - 1) / 2 classifiers, this method is usually slower than one-vs-the-rest, due to its O(n_classes^2) complexity. 
However, this method may be advantageous for algorithms such as kernel algorithms which don’t scale well with n_samples. This is because each individual learning problem only involves a small subset of the data whereas, with one-vs-the-rest, the complete dataset is used n_classes times." 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 9, 264 | "metadata": { 265 | "collapsed": true, 266 | "deletable": true, 267 | "editable": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "from sklearn.multiclass import OneVsOneClassifier\n", 272 | "\n", 273 | "OneVsOneClassifier?" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 10, 279 | "metadata": { 280 | "collapsed": false, 281 | "deletable": true, 282 | "editable": true 283 | }, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "OneVsOneClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 289 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 290 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 291 | " verbose=0),\n", 292 | " n_jobs=1)" 293 | ] 294 | }, 295 | "execution_count": 10, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "m = OneVsOneClassifier(\n", 302 | " LinearSVC())\n", 303 | "\n", 304 | "m.fit(X, y)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 11, 310 | "metadata": { 311 | "collapsed": false, 312 | "deletable": true, 313 | "editable": true 314 | }, 315 | "outputs": [ 316 | { 317 | "data": { 318 | "text/plain": [ 319 | "(LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 320 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 321 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 322 | " verbose=0),\n", 323 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 324 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 325 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 326 | " verbose=0),\n", 327 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 328 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 329 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 330 | " verbose=0))" 331 | ] 332 | }, 333 | "execution_count": 11, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "m.estimators_" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 12, 345 | "metadata": { 346 | "collapsed": false, 347 | "deletable": true, 348 | "editable": true 349 | }, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "0.97999999999999998" 355 | ] 356 | }, 357 | "execution_count": 12, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "m.score(X, y)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": { 369 | "deletable": true, 370 | "editable": true 371 | }, 372 | "source": [ 373 | "## Error Correcting Output Codes\n", 374 | "\n", 375 | "Output-code based strategies are fairly different from one-vs-the-rest and one-vs-one. With these strategies, each class is represented in a Euclidean space, where each dimension can only be 0 or 1. Another way to put it is that each class is represented by a binary code (an array of 0 and 1). 
The matrix which keeps track of the location/code of each class is called the code book. The code size is the dimensionality of the aforementioned space. Intuitively, each class should be represented by a code as unique as possible and a good code book should be designed to optimize classification accuracy.\n", 376 | "\n", 377 | "At fitting time, one binary classifier per bit in the code book is fitted. At prediction time, the classifiers are used to project new points in the class space and the class closest to the points is chosen.\n", 378 | "\n", 379 | "In OutputCodeClassifier, the code_size attribute allows the user to control the number of classifiers which will be used. It is a percentage of the total number of classes.\n", 380 | "\n", 381 | "A number between 0 and 1 will require fewer classifiers than one-vs-the-rest. In theory, log2(n_classes) / n_classes is sufficient to represent each class unambiguously. However, in practice, it may not lead to good accuracy since log2(n_classes) is much smaller than n_classes.\n", 382 | "\n", 383 | "A number greater than 1 will require more classifiers than one-vs-the-rest. In this case, some classifiers will in theory correct for the mistakes made by other classifiers, hence the name “error-correcting”. In practice, however, this may not happen as classifier mistakes will typically be correlated. The error-correcting output codes have a similar effect to bagging." 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 13, 389 | "metadata": { 390 | "collapsed": true, 391 | "deletable": true, 392 | "editable": true 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "from sklearn.multiclass import OutputCodeClassifier\n", 397 | "\n", 398 | "OutputCodeClassifier?" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 14, 404 | "metadata": { 405 | "collapsed": false, 406 | "deletable": true, 407 | "editable": true 408 | }, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "text/plain": [ 413 | "OutputCodeClassifier(code_size=2,\n", 414 | " estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 415 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 416 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 417 | " verbose=0),\n", 418 | " n_jobs=1, random_state=None)" 419 | ] 420 | }, 421 | "execution_count": 14, 422 | "metadata": {}, 423 | "output_type": "execute_result" 424 | } 425 | ], 426 | "source": [ 427 | "m = OutputCodeClassifier(LinearSVC(), code_size=2)\n", 428 | "\n", 429 | "m.fit(X, y)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 15, 435 | "metadata": { 436 | "collapsed": false, 437 | "deletable": true, 438 | "editable": true 439 | }, 440 | "outputs": [ 441 | { 442 | "data": { 443 | "text/plain": [ 444 | "[LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 445 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 446 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 447 | " verbose=0),\n", 448 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 449 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 450 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 451 | " verbose=0),\n", 452 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 453 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 454 | " multi_class='ovr', penalty='l2', random_state=None, 
tol=0.0001,\n", 455 | " verbose=0),\n", 456 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 457 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 458 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 459 | " verbose=0),\n", 460 | " _ConstantPredictor(),\n", 461 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 462 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 463 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 464 | " verbose=0)]" 465 | ] 466 | }, 467 | "execution_count": 15, 468 | "metadata": {}, 469 | "output_type": "execute_result" 470 | } 471 | ], 472 | "source": [ 473 | "m.estimators_" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 16, 479 | "metadata": { 480 | "collapsed": false, 481 | "deletable": true, 482 | "editable": true 483 | }, 484 | "outputs": [ 485 | { 486 | "data": { 487 | "text/plain": [ 488 | "0.97999999999999998" 489 | ] 490 | }, 491 | "execution_count": 16, 492 | "metadata": {}, 493 | "output_type": "execute_result" 494 | } 495 | ], 496 | "source": [ 497 | "m.score(X, y)" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": { 503 | "deletable": true, 504 | "editable": true 505 | }, 506 | "source": [ 507 | "# Multiple Output Regression and Classification\n", 508 | "\n", 509 | "Multioutput regression support can be added to any regressor with MultiOutputRegressor. This strategy consists of fitting one regressor per target. Since each target is represented by exactly one regressor it is possible to gain knowledge about the target by inspecting its corresponding regressor. As MultiOutputRegressor fits one regressor per target it cannot take advantage of correlations between targets.\n", 510 | "\n", 511 | "Multioutput classification support can be added to any classifier with MultiOutputClassifier. This strategy consists of fitting one classifier per target. This allows multiple target variable classifications. The purpose of this class is to extend estimators to be able to estimate a series of target functions (f1,f2,f3...,fn) that are trained on a single X predictor matrix to predict a series of responses (y1,y2,y3...,yn)."
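The paragraph above notes that each target gets its own regressor which can be inspected. A hedged sketch of that inspection follows; LinearRegression is swapped in purely so the per-target coefficients are easy to read, and the dataset shape is an illustrative assumption rather than the setup of the cell below.

```python
# Hedged sketch: MultiOutputRegressor exposes one fitted regressor per
# target column through estimators_, which enables per-target inspection.
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

X_r, y_r = make_regression(n_samples=100, n_features=5, n_targets=3,
                           random_state=1)

mor = MultiOutputRegressor(LinearRegression()).fit(X_r, y_r)

# One LinearRegression per target; each has its own coefficients.
for i, est in enumerate(mor.estimators_):
    print("target", i, "coef:", est.coef_.round(2))
```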
512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 17, 517 | "metadata": { 518 | "collapsed": false, 519 | "deletable": true, 520 | "editable": true 521 | }, 522 | "outputs": [ 523 | { 524 | "data": { 525 | "text/plain": [ 526 | "0.99999999911789184" 527 | ] 528 | }, 529 | "execution_count": 17, 530 | "metadata": {}, 531 | "output_type": "execute_result" 532 | } 533 | ], 534 | "source": [ 535 | "from sklearn.datasets import make_regression\n", 536 | "\n", 537 | "from sklearn.multioutput import MultiOutputRegressor\n", 538 | "\n", 539 | "from sklearn.ensemble import GradientBoostingRegressor\n", 540 | "\n", 541 | "X, y = make_regression(n_samples=10, n_targets=3, random_state=1)\n", 542 | "\n", 543 | "MultiOutputRegressor(\n", 544 | " GradientBoostingRegressor(random_state=0)).fit(X, y).score(X, y)" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": { 551 | "collapsed": true, 552 | "deletable": true, 553 | "editable": true 554 | }, 555 | "outputs": [], 556 | "source": [] 557 | } 558 | ], 559 | "metadata": { 560 | "kernelspec": { 561 | "display_name": "Python 2", 562 | "language": "python", 563 | "name": "python2" 564 | }, 565 | "language_info": { 566 | "codemirror_mode": { 567 | "name": "ipython", 568 | "version": 2 569 | }, 570 | "file_extension": ".py", 571 | "mimetype": "text/x-python", 572 | "name": "python", 573 | "nbconvert_exporter": "python", 574 | "pygments_lexer": "ipython2", 575 | "version": "2.7.10" 576 | } 577 | }, 578 | "nbformat": 4, 579 | "nbformat_minor": 2 580 | } 581 | --------------------------------------------------------------------------------
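As a closing sketch, the three meta-strategies demonstrated in this notebook can be compared under cross-validation rather than training-set accuracy; the cv=5 setting and random_state are illustrative assumptions.

```python
# Hedged sketch: cross-validated comparison of the one-vs-rest, one-vs-one
# and error-correcting output-code strategies on iris.
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import (OneVsOneClassifier, OneVsRestClassifier,
                                OutputCodeClassifier)
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)

strategies = {
    "one-vs-rest": OneVsRestClassifier(LinearSVC()),
    "one-vs-one": OneVsOneClassifier(LinearSVC()),
    "output-code": OutputCodeClassifier(LinearSVC(), code_size=2,
                                        random_state=0),
}

for name, clf in strategies.items():
    print(name, cross_val_score(clf, X, y, cv=5).mean().round(3))
```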