├── README.md
├── notebooks
│   ├── model.pkl
│   ├── .ipynb_checkpoints
│   │   ├── DensityEstimation-checkpoint.ipynb
│   │   ├── ModelPersistence-checkpoint.ipynb
│   │   ├── Feature Transformation-checkpoint.ipynb
│   │   ├── NovelyDetection-checkpoint.ipynb
│   │   ├── PipelinesAndFeatureUnions-checkpoint.ipynb
│   │   ├── FeatureSelection-checkpoint.ipynb
│   │   ├── FeatureExtraction-checkpoint.ipynb
│   │   ├── CrossValidation-checkpoint.ipynb
│   │   ├── Multiclass-checkpoint.ipynb
│   │   └── EnsembleMethods-checkpoint.ipynb
│   ├── ModelPersistence.ipynb
│   ├── DensityEstimation.ipynb
│   ├── NovelyDetection.ipynb
│   ├── FeatureTransformation.ipynb
│   ├── PipelinesAndFeatureUnions.ipynb
│   ├── FeatureExtraction.ipynb
│   ├── FeatureSelection.ipynb
│   ├── CrossValidation.ipynb
│   └── Multiclass.ipynb
├── LICENSE
└── requirements.txt

/README.md:
--------------------------------------------------------------------------------
1 | # bit-of-data-science-and-scikit-learn
2 |
--------------------------------------------------------------------------------
/notebooks/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/knathanieltucker/bit-of-data-science-and-scikit-learn/HEAD/notebooks/model.pkl
--------------------------------------------------------------------------------
/notebooks/.ipynb_checkpoints/DensityEstimation-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 K. Nathaniel Tucker
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | alabaster==0.7.10
2 | appdirs==1.4.3
3 | appnope==0.1.0
4 | Babel==2.4.0
5 | backports-abc==0.5
6 | backports.shutil-get-terminal-size==1.0.0
7 | bleach==2.0.0
8 | certifi==2017.1.23
9 | configparser==3.5.0
10 | cycler==0.10.0
11 | decorator==4.0.11
12 | docutils==0.13.1
13 | entrypoints==0.2.2
14 | enum34==1.1.6
15 | functools32==3.2.3.post2
16 | futures==3.0.5
17 | html5lib==0.999999999
18 | imagesize==0.7.1
19 | ipykernel==4.5.2
20 | ipyparallel==6.0.2
21 | ipython==5.3.0
22 | ipython-genutils==0.2.0
23 | ipywidgets==6.0.0
24 | Jinja2==2.9.5
25 | jsonschema==2.6.0
26 | jupyter-client==5.0.0
27 | jupyter-core==4.3.0
28 | MarkupSafe==1.0
29 | matplotlib==2.0.0
30 | mistune==0.7.4
31 | nbconvert==5.1.1
32 | nbformat==4.3.0
33 | nose==1.3.7
34 | notebook==4.4.1
35 | numpy==1.12.1
36 | olefile==0.44
37 | packaging==16.8
38 | pandocfilters==1.4.1
39 | pathlib2==2.2.1
40 | pexpect==4.2.1
41 | pickleshare==0.7.4
42 | Pillow==4.0.0
43 | prompt-toolkit==1.0.14
44 | ptyprocess==0.5.1
45 | Pygments==2.2.0
46 | pyparsing==2.2.0
47 | python-dateutil==2.6.0
48 | pytz==2016.10
49 | pyzmq==16.0.2
50 | qtconsole==4.2.1
51 | requests==2.13.0
52 | scandir==1.5
53 | scikit-learn==0.18.1
54 | scipy==0.19.0
55 | simplegeneric==0.8.1
56 | singledispatch==3.4.0.3
57 | six==1.10.0
58 | snowballstemmer==1.2.1
59 | Sphinx==1.5.3
60 | subprocess32==3.2.7
61 | terminado==0.6
62 | testpath==0.3
63 | tornado==4.4.2
64 | traitlets==4.3.2
65 | wcwidth==0.1.7
66 | webencodings==0.5
67 | widgetsnbextension==2.0.0
68 |
--------------------------------------------------------------------------------
/notebooks/.ipynb_checkpoints/ModelPersistence-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "## Model Persistence\n",
8 |     "\n",
9 |     "After training a scikit-learn model, it is desirable to have a way to persist the model for future use without having to retrain. The following section gives you an example of how to persist a model with pickle.
We’ll also review a few security and maintainability issues when working with pickle serialization.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": false 17 | }, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 23 | " decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n", 24 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 25 | " tol=0.001, verbose=False)" 26 | ] 27 | }, 28 | "execution_count": 1, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "from sklearn import svm\n", 35 | "from sklearn import datasets\n", 36 | "clf = svm.SVC()\n", 37 | "iris = datasets.load_iris()\n", 38 | "X, y = iris.data, iris.target\n", 39 | "clf.fit(X, y) " 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "array([0])" 53 | ] 54 | }, 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "output_type": "execute_result" 58 | } 59 | ], 60 | "source": [ 61 | "import pickle\n", 62 | "s = pickle.dumps(clf)\n", 63 | "clf2 = pickle.loads(s)\n", 64 | "clf2.predict(X[0:1])\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "0" 78 | ] 79 | }, 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "y[0]" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "In the specific case of the scikit, it may be more interesting to use joblib’s replacement of pickle (joblib.dump & joblib.load), which is more efficient on objects that carry large numpy arrays internally as is often the case for fitted scikit-learn estimators, but can only pickle to the disk and not to a string:" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 4, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "['model.pkl']" 107 | ] 108 | }, 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "from sklearn.externals import joblib\n", 116 | "\n", 117 | "joblib.dump(clf, 'model.pkl') " 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "clf = joblib.load('model.pkl') " 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [] 139 | } 140 | ], 141 | "metadata": { 142 | "kernelspec": { 143 | "display_name": "Python 2", 144 | "language": "python", 145 | "name": "python2" 146 | }, 147 | "language_info": { 148 | "codemirror_mode": { 149 | "name": "ipython", 150 | "version": 2 151 | }, 152 | "file_extension": ".py", 153 | "mimetype": "text/x-python", 154 | "name": "python", 155 | "nbconvert_exporter": "python", 156 | "pygments_lexer": "ipython2", 157 | "version": "2.7.10" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 2 162 | } 163 | -------------------------------------------------------------------------------- 
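A note on the persistence example above: `sklearn.externals.joblib` has since been deprecated and removed from scikit-learn in favor of the standalone `joblib` package, pickled models are not guaranteed to load across scikit-learn versions, and neither `pickle.loads` nor `joblib.load` should ever be run on data from an untrusted source, because unpickling can execute arbitrary code. Below is a minimal sketch of the same workflow using the standalone `joblib` package (assumed to be installed alongside scikit-learn; this snippet is not part of the original notebooks).

import joblib  # standalone package; replaces the removed sklearn.externals.joblib
from sklearn import datasets, svm

# Fit the same classifier the notebook uses
X, y = datasets.load_iris(return_X_y=True)
clf = svm.SVC().fit(X, y)

# Persist to disk and reload; only load files you trust, since unpickling
# can execute arbitrary code, and reload under the same scikit-learn version.
joblib.dump(clf, 'model.pkl')
clf2 = joblib.load('model.pkl')
print(clf2.predict(X[:1]))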
/notebooks/ModelPersistence.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "## Model Persistence\n", 11 | "\n", 12 | "After training a scikit-learn model, it is desirable to have a way to persist the model for future use without having to retrain. The following section gives you an example of how to persist a model with pickle. We’ll also review a few security and maintainability issues when working with pickle serialization.\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": false, 20 | "deletable": true, 21 | "editable": true 22 | }, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 28 | " decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n", 29 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 30 | " tol=0.001, verbose=False)" 31 | ] 32 | }, 33 | "execution_count": 1, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "from sklearn import svm\n", 40 | "from sklearn import datasets\n", 41 | "clf = svm.SVC()\n", 42 | "iris = datasets.load_iris()\n", 43 | "X, y = iris.data, iris.target\n", 44 | "clf.fit(X, y) " 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": { 51 | "collapsed": false, 52 | "deletable": true, 53 | "editable": true 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "array([0])" 60 | ] 61 | }, 62 | "execution_count": 2, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "import pickle\n", 69 | "s = pickle.dumps(clf)\n", 70 | "clf2 = pickle.loads(s)\n", 71 | "clf2.predict(X[0:1])\n" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": { 78 | "collapsed": false, 79 | "deletable": true, 80 | "editable": true 81 | }, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "0" 87 | ] 88 | }, 89 | "execution_count": 3, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "y[0]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": { 101 | "deletable": true, 102 | "editable": true 103 | }, 104 | "source": [ 105 | "In the specific case of the scikit, it may be more interesting to use joblib’s replacement of pickle (joblib.dump & joblib.load), which is more efficient on objects that carry large numpy arrays internally as is often the case for fitted scikit-learn estimators, but can only pickle to the disk and not to a string:" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 4, 111 | "metadata": { 112 | "collapsed": false, 113 | "deletable": true, 114 | "editable": true 115 | }, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "['model.pkl']" 121 | ] 122 | }, 123 | "execution_count": 4, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "from sklearn.externals import joblib\n", 130 | "\n", 131 | "joblib.dump(clf, 'model.pkl') " 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 5, 137 | "metadata": { 138 | "collapsed": true, 139 | "deletable": true, 140 | "editable": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "clf = 
joblib.load('model.pkl') " 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": true, 152 | "deletable": true, 153 | "editable": true 154 | }, 155 | "outputs": [], 156 | "source": [] 157 | } 158 | ], 159 | "metadata": { 160 | "kernelspec": { 161 | "display_name": "Python 2", 162 | "language": "python", 163 | "name": "python2" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": { 167 | "name": "ipython", 168 | "version": 2 169 | }, 170 | "file_extension": ".py", 171 | "mimetype": "text/x-python", 172 | "name": "python", 173 | "nbconvert_exporter": "python", 174 | "pygments_lexer": "ipython2", 175 | "version": "2.7.10" 176 | } 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 2 180 | } 181 | -------------------------------------------------------------------------------- /notebooks/DensityEstimation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Density Estimation\n", 11 | "\n", 12 | "Density estimation walks the line between unsupervised learning, feature engineering, and data modeling. Some of the most popular and useful density estimation techniques are mixture models such as Gaussian Mixtures (sklearn.mixture.GaussianMixture), and neighbor-based approaches such as the kernel density estimate (sklearn.neighbors.KernelDensity). Gaussian Mixtures are discussed more fully in the context of clustering, because the technique is also useful as an unsupervised clustering scheme.\n", 13 | "\n", 14 | "Density estimation is a very simple concept, and most people are already familiar with one common density estimation technique: the histogram." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "deletable": true, 21 | "editable": true 22 | }, 23 | "source": [ 24 | "## Kernel Density Estimation\n", 25 | "\n", 26 | "Kernel density estimation in scikit-learn is implemented in the sklearn.neighbors.KernelDensity estimator, which uses the Ball Tree or KD Tree for efficient queries (see Nearest Neighbors for a discussion of these). Though the above example uses a 1D data set for simplicity, kernel density estimation can be performed in any number of dimensions, though in practice the curse of dimensionality causes its performance to degrade in high dimensions.\n", 27 | "\n", 28 | "The kernel density estimator can be used with any of the valid distance metrics (see sklearn.neighbors.DistanceMetric for a list of available metrics), though the results are properly normalized only for the Euclidean metric. One particularly useful metric is the Haversine distance which measures the angular distance between points on a sphere." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "from sklearn.neighbors.kde import KernelDensity\n", 40 | "\n", 41 | "KernelDensity?" 
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": { 48 | "collapsed": false, 49 | "deletable": true, 50 | "editable": true 51 | }, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "array([-10562.91076071])" 57 | ] 58 | }, 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "from sklearn.neighbors.kde import KernelDensity\n", 66 | "import numpy as np\n", 67 | "\n", 68 | "X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n", 69 | "kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(X)\n", 70 | "\n", 71 | "kde.score_samples([[32,4]])" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": { 78 | "collapsed": false, 79 | "deletable": true, 80 | "editable": true 81 | }, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "array([[ 2.21052437, 1.09216422]])" 87 | ] 88 | }, 89 | "execution_count": 4, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "kde.sample(1)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": { 102 | "collapsed": true, 103 | "deletable": true, 104 | "editable": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "from sklearn.datasets import load_iris\n", 109 | "\n", 110 | "X, y = load_iris(return_X_y=True)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 6, 116 | "metadata": { 117 | "collapsed": false, 118 | "deletable": true, 119 | "editable": true 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "[-3.8262878]\n", 127 | "[-8.13952384]\n", 128 | "[-12.91720053]\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "estimators = []\n", 134 | "for c in [0, 1, 2]:\n", 135 | " m = KernelDensity().fit(X[y == c])\n", 136 | " estimators.append(m)\n", 137 | " \n", 138 | "for estimator in estimators:\n", 139 | " print estimator.score_samples([X[0]])" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": true, 147 | "deletable": true, 148 | "editable": true 149 | }, 150 | "outputs": [], 151 | "source": [] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "collapsed": true, 158 | "deletable": true, 159 | "editable": true 160 | }, 161 | "outputs": [], 162 | "source": [] 163 | } 164 | ], 165 | "metadata": { 166 | "kernelspec": { 167 | "display_name": "Python 2", 168 | "language": "python", 169 | "name": "python2" 170 | }, 171 | "language_info": { 172 | "codemirror_mode": { 173 | "name": "ipython", 174 | "version": 2 175 | }, 176 | "file_extension": ".py", 177 | "mimetype": "text/x-python", 178 | "name": "python", 179 | "nbconvert_exporter": "python", 180 | "pygments_lexer": "ipython2", 181 | "version": "2.7.10" 182 | } 183 | }, 184 | "nbformat": 4, 185 | "nbformat_minor": 2 186 | } 187 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/Feature Transformation-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Transformation\n", 8 | "\n", 9 | "I am going to show off only two parts of the massive quantity of code in the unsupervised learning section of sklearn. 
And they can be put into this single bucket:\n", 10 | "\n", 11 | "* Feature Transformation\n", 12 | "* Exploratory Data Analysis\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "## Clustering\n", 20 | "\n", 21 | "Clustering of unlabeled data can be performed with the module sklearn.cluster.\n", 22 | "Each clustering algorithm comes in two variants: a class, that implements the fit method to learn the clusters on train data, and a function, that, given train data, returns an array of integer labels corresponding to the different clusters. For the class, the labels over the training data can be found in the labels_ attribute." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "#### Kmeans\n", 30 | "\n", 31 | "The KMeans algorithm clusters data by trying to separate samples in n groups of equal variance, minimizing a criterion known as the inertia or within-cluster sum-of-squares. This algorithm requires the number of clusters to be specified. It scales well to large number of samples and has been used across a large range of application areas in many different fields.\n", 32 | "\n", 33 | "Let's check out how it is used" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 24, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "from sklearn.cluster import KMeans\n", 45 | "\n", 46 | "KMeans?" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 12, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "from sklearn.datasets import load_iris\n", 58 | "\n", 59 | "X, y = load_iris(return_X_y=True)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 13, 65 | "metadata": { 66 | "collapsed": true 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "cluster = KMeans(n_clusters=3)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 14, 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,\n", 84 | " n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',\n", 85 | " random_state=None, tol=0.0001, verbose=0)" 86 | ] 87 | }, 88 | "execution_count": 14, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "cluster.fit(X)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 16, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 108 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 109 | " 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 110 | " 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 111 | " 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1,\n", 112 | " 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,\n", 113 | " 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1], dtype=int32)" 114 | ] 115 | }, 116 | "execution_count": 16, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "cluster.predict(X)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 21, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | 
"source": [ 133 | "from sklearn.tree import DecisionTreeClassifier\n", 134 | "\n", 135 | "m = DecisionTreeClassifier(max_depth=2)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 22, 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,\n", 149 | " max_features=None, max_leaf_nodes=None,\n", 150 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 151 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 152 | " presort=False, random_state=None, splitter='best')" 153 | ] 154 | }, 155 | "execution_count": 22, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "m.fit(cluster.predict(X)[:, None], y)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 23, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "0.89333333333333331" 175 | ] 176 | }, 177 | "execution_count": 23, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "m.score(cluster.predict(X)[:, None], y)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Principal component analysis (PCA)\n", 191 | "\n", 192 | "PCA is used to decompose a multivariate dataset in a set of successive orthogonal components that explain a maximum amount of the variance. In scikit-learn, PCA is implemented as a transformer object that learns n components in its fit method, and can be used on new data to project it on these components.\n", 193 | "\n", 194 | "The optional parameter whiten=True makes it possible to project the data onto the singular space while scaling each component to unit variance. This is often useful if the models down-stream make strong assumptions on the isotropy of the signal: this is for example the case for Support Vector Machines with the RBF kernel and the K-Means clustering algorithm." 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 25, 200 | "metadata": { 201 | "collapsed": true 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "from sklearn.decomposition import PCA\n", 206 | "\n", 207 | "PCA?" 
208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 29, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "(150, 2)" 221 | ] 222 | }, 223 | "execution_count": 29, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "from sklearn.svm import SVC\n", 230 | "\n", 231 | "pca = PCA(n_components=2)\n", 232 | "\n", 233 | "X_pca = pca.fit_transform(X)\n", 234 | "\n", 235 | "X_pca.shape" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 28, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [ 245 | { 246 | "data": { 247 | "text/plain": [ 248 | "0.95333333333333337" 249 | ] 250 | }, 251 | "execution_count": 28, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "SVC().fit(X_pca, y).score(X_pca, y)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [] 268 | } 269 | ], 270 | "metadata": { 271 | "kernelspec": { 272 | "display_name": "Python 2", 273 | "language": "python", 274 | "name": "python2" 275 | }, 276 | "language_info": { 277 | "codemirror_mode": { 278 | "name": "ipython", 279 | "version": 2 280 | }, 281 | "file_extension": ".py", 282 | "mimetype": "text/x-python", 283 | "name": "python", 284 | "nbconvert_exporter": "python", 285 | "pygments_lexer": "ipython2", 286 | "version": "2.7.10" 287 | } 288 | }, 289 | "nbformat": 4, 290 | "nbformat_minor": 2 291 | } 292 | -------------------------------------------------------------------------------- /notebooks/NovelyDetection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Novely and Outlier Detection\n", 8 | "\n", 9 | "Many applications require being able to decide whether a new observation belongs to the same distribution as existing observations (it is an inlier), or should be considered as different (it is an outlier). Often, this ability is used to clean real data sets. Two important distinction must be made:\n", 10 | "\n", 11 | "* novelty detection:\n", 12 | " \tThe training data is not polluted by outliers, and we are interested in detecting anomalies in new observations.\n", 13 | "* outlier detection:\n", 14 | " \tThe training data contains outliers, and we need to fit the central mode of the training data, ignoring the deviant observations.\n", 15 | "\n", 16 | "The scikit-learn project provides a set of machine learning tools that can be used both for novelty or outliers detection. This strategy is implemented with objects learning in an unsupervised way from the data:\n", 17 | "\n", 18 | "`estimator.fit(X_train)`\n", 19 | "\n", 20 | "new observations can then be sorted as inliers or outliers with a predict method:\n", 21 | "\n", 22 | "`estimator.predict(X_test)`\n", 23 | "\n", 24 | "Inliers are labeled 1, while outliers are labeled -1." 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Novelty Detection\n", 32 | "\n", 33 | "Consider a data set of n observations from the same distribution described by p features. Consider now that we add one more observation to that data set. Is the new observation so different from the others that we can doubt it is regular? (i.e. 
does it come from the same distribution?) Or on the contrary, is it so similar to the other that we cannot distinguish it from the original observations? This is the question addressed by the novelty detection tools and methods.\n", 34 | "\n", 35 | "In general, it is about to learn a rough, close frontier delimiting the contour of the initial observations distribution, plotted in embedding p-dimensional space. Then, if further observations lay within the frontier-delimited subspace, they are considered as coming from the same population than the initial observations. Otherwise, if they lay outside the frontier, we can say that they are abnormal with a given confidence in our assessment.\n", 36 | "\n", 37 | "The One-Class SVM has been introduced by Schölkopf et al. for that purpose and implemented in the Support Vector Machines module in the svm.OneClassSVM object. It requires the choice of a kernel and a scalar parameter to define a frontier. The RBF kernel is usually chosen although there exists no exact formula or algorithm to set its bandwidth parameter. This is the default in the scikit-learn implementation. The \\nu parameter, also known as the margin of the One-Class SVM, corresponds to the probability of finding a new, but regular, observation outside the frontier." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 1, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "from sklearn.svm import OneClassSVM\n", 49 | "\n", 50 | "OneClassSVM?" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 17, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "import numpy as np\n", 62 | "\n", 63 | "X = 0.3 * np.random.randn(100, 2)\n", 64 | "X_train = np.r_[X + 2, X - 2]\n", 65 | "# Generate some regular novel observations\n", 66 | "X = 0.3 * np.random.randn(20, 2)\n", 67 | "X_test = np.r_[X + 2, X - 2]\n", 68 | "# Generate some abnormal novel observations\n", 69 | "X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 27, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "clf = OneClassSVM(nu=0.1, kernel=\"rbf\", gamma=0.1)\n", 81 | "clf.fit(X_train)\n", 82 | "y_pred_train = clf.predict(X_train)\n", 83 | "y_pred_test = clf.predict(X_test)\n", 84 | "y_pred_outliers = clf.predict(X_outliers)\n", 85 | "n_error_train = y_pred_train[y_pred_train == -1].size\n", 86 | "n_error_test = y_pred_test[y_pred_test == -1].size\n", 87 | "n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 29, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "(20, 6, 2)" 101 | ] 102 | }, 103 | "execution_count": 29, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "n_error_train, n_error_test, n_error_outliers" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "## Outlier Detection\n", 117 | "\n", 118 | "Outlier detection is similar to novelty detection in the sense that the goal is to separate a core of regular observations from some polluting ones, called “outliers”. Yet, in the case of outlier detection, we don’t have a clean data set representing the population of regular observations that can be used to train any tool." 
119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "#### Isolation Forest\n", 126 | "\n", 127 | "One efficient way of performing outlier detection in high-dimensional datasets is to use random forests. The ensemble.IsolationForest ‘isolates’ observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature.\n", 128 | "\n", 129 | "Since recursive partitioning can be represented by a tree structure, the number of splittings required to isolate a sample is equivalent to the path length from the root node to the terminating node.\n", 130 | "This path length, averaged over a forest of such random trees, is a measure of normality and our decision function.\n", 131 | "\n", 132 | "Random partitioning produces noticeably shorter paths for anomalies. Hence, when a forest of random trees collectively produce shorter path lengths for particular samples, they are highly likely to be anomalies.\n" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 30, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "from sklearn.ensemble import IsolationForest\n", 144 | "\n", 145 | "rng = np.random.RandomState(42)\n", 146 | "\n", 147 | "# Generate train data\n", 148 | "X = 0.3 * rng.randn(100, 2)\n", 149 | "X_train = np.r_[X + 2, X - 2]\n", 150 | "# Generate some regular novel observations\n", 151 | "X = 0.3 * rng.randn(20, 2)\n", 152 | "X_test = np.r_[X + 2, X - 2]\n", 153 | "# Generate some abnormal novel observations\n", 154 | "X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))\n", 155 | "\n", 156 | "# fit the model\n", 157 | "clf = IsolationForest(max_samples=100, random_state=rng)\n", 158 | "clf.fit(X_train)\n", 159 | "y_pred_train = clf.predict(X_train)\n", 160 | "y_pred_test = clf.predict(X_test)\n", 161 | "y_pred_outliers = clf.predict(X_outliers)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 33, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "0" 175 | ] 176 | }, 177 | "execution_count": 33, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "y_pred_outliers[y_pred_outliers == 1].size" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": true 191 | }, 192 | "outputs": [], 193 | "source": [] 194 | } 195 | ], 196 | "metadata": { 197 | "kernelspec": { 198 | "display_name": "Python 2", 199 | "language": "python", 200 | "name": "python2" 201 | }, 202 | "language_info": { 203 | "codemirror_mode": { 204 | "name": "ipython", 205 | "version": 2 206 | }, 207 | "file_extension": ".py", 208 | "mimetype": "text/x-python", 209 | "name": "python", 210 | "nbconvert_exporter": "python", 211 | "pygments_lexer": "ipython2", 212 | "version": "2.7.10" 213 | } 214 | }, 215 | "nbformat": 4, 216 | "nbformat_minor": 2 217 | } 218 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/NovelyDetection-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Novely and Outlier Detection\n", 8 | "\n", 9 | "Many applications require being able to decide whether a new observation belongs to the same distribution 
as existing observations (it is an inlier), or should be considered as different (it is an outlier). Often, this ability is used to clean real data sets. Two important distinction must be made:\n", 10 | "\n", 11 | "* novelty detection:\n", 12 | " \tThe training data is not polluted by outliers, and we are interested in detecting anomalies in new observations.\n", 13 | "* outlier detection:\n", 14 | " \tThe training data contains outliers, and we need to fit the central mode of the training data, ignoring the deviant observations.\n", 15 | "\n", 16 | "The scikit-learn project provides a set of machine learning tools that can be used both for novelty or outliers detection. This strategy is implemented with objects learning in an unsupervised way from the data:\n", 17 | "\n", 18 | "`estimator.fit(X_train)`\n", 19 | "\n", 20 | "new observations can then be sorted as inliers or outliers with a predict method:\n", 21 | "\n", 22 | "`estimator.predict(X_test)`\n", 23 | "\n", 24 | "Inliers are labeled 1, while outliers are labeled -1." 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Novelty Detection\n", 32 | "\n", 33 | "Consider a data set of n observations from the same distribution described by p features. Consider now that we add one more observation to that data set. Is the new observation so different from the others that we can doubt it is regular? (i.e. does it come from the same distribution?) Or on the contrary, is it so similar to the other that we cannot distinguish it from the original observations? This is the question addressed by the novelty detection tools and methods.\n", 34 | "\n", 35 | "In general, it is about to learn a rough, close frontier delimiting the contour of the initial observations distribution, plotted in embedding p-dimensional space. Then, if further observations lay within the frontier-delimited subspace, they are considered as coming from the same population than the initial observations. Otherwise, if they lay outside the frontier, we can say that they are abnormal with a given confidence in our assessment.\n", 36 | "\n", 37 | "The One-Class SVM has been introduced by Schölkopf et al. for that purpose and implemented in the Support Vector Machines module in the svm.OneClassSVM object. It requires the choice of a kernel and a scalar parameter to define a frontier. The RBF kernel is usually chosen although there exists no exact formula or algorithm to set its bandwidth parameter. This is the default in the scikit-learn implementation. The \\nu parameter, also known as the margin of the One-Class SVM, corresponds to the probability of finding a new, but regular, observation outside the frontier." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 1, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "from sklearn.svm import OneClassSVM\n", 49 | "\n", 50 | "OneClassSVM?" 
51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 17, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "import numpy as np\n", 62 | "\n", 63 | "X = 0.3 * np.random.randn(100, 2)\n", 64 | "X_train = np.r_[X + 2, X - 2]\n", 65 | "# Generate some regular novel observations\n", 66 | "X = 0.3 * np.random.randn(20, 2)\n", 67 | "X_test = np.r_[X + 2, X - 2]\n", 68 | "# Generate some abnormal novel observations\n", 69 | "X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 27, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "clf = OneClassSVM(nu=0.1, kernel=\"rbf\", gamma=0.1)\n", 81 | "clf.fit(X_train)\n", 82 | "y_pred_train = clf.predict(X_train)\n", 83 | "y_pred_test = clf.predict(X_test)\n", 84 | "y_pred_outliers = clf.predict(X_outliers)\n", 85 | "n_error_train = y_pred_train[y_pred_train == -1].size\n", 86 | "n_error_test = y_pred_test[y_pred_test == -1].size\n", 87 | "n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 29, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "(20, 6, 2)" 101 | ] 102 | }, 103 | "execution_count": 29, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "n_error_train, n_error_test, n_error_outliers" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "## Outlier Detection\n", 117 | "\n", 118 | "Outlier detection is similar to novelty detection in the sense that the goal is to separate a core of regular observations from some polluting ones, called “outliers”. Yet, in the case of outlier detection, we don’t have a clean data set representing the population of regular observations that can be used to train any tool." 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "#### Isolation Forest\n", 126 | "\n", 127 | "One efficient way of performing outlier detection in high-dimensional datasets is to use random forests. The ensemble.IsolationForest ‘isolates’ observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature.\n", 128 | "\n", 129 | "Since recursive partitioning can be represented by a tree structure, the number of splittings required to isolate a sample is equivalent to the path length from the root node to the terminating node.\n", 130 | "This path length, averaged over a forest of such random trees, is a measure of normality and our decision function.\n", 131 | "\n", 132 | "Random partitioning produces noticeably shorter paths for anomalies. 
Hence, when a forest of random trees collectively produce shorter path lengths for particular samples, they are highly likely to be anomalies.\n" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 30, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "from sklearn.ensemble import IsolationForest\n", 144 | "\n", 145 | "rng = np.random.RandomState(42)\n", 146 | "\n", 147 | "# Generate train data\n", 148 | "X = 0.3 * rng.randn(100, 2)\n", 149 | "X_train = np.r_[X + 2, X - 2]\n", 150 | "# Generate some regular novel observations\n", 151 | "X = 0.3 * rng.randn(20, 2)\n", 152 | "X_test = np.r_[X + 2, X - 2]\n", 153 | "# Generate some abnormal novel observations\n", 154 | "X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))\n", 155 | "\n", 156 | "# fit the model\n", 157 | "clf = IsolationForest(max_samples=100, random_state=rng)\n", 158 | "clf.fit(X_train)\n", 159 | "y_pred_train = clf.predict(X_train)\n", 160 | "y_pred_test = clf.predict(X_test)\n", 161 | "y_pred_outliers = clf.predict(X_outliers)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 33, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "0" 175 | ] 176 | }, 177 | "execution_count": 33, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "y_pred_outliers[y_pred_outliers == 1].size" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": true 191 | }, 192 | "outputs": [], 193 | "source": [] 194 | } 195 | ], 196 | "metadata": { 197 | "kernelspec": { 198 | "display_name": "Python 2", 199 | "language": "python", 200 | "name": "python2" 201 | }, 202 | "language_info": { 203 | "codemirror_mode": { 204 | "name": "ipython", 205 | "version": 2 206 | }, 207 | "file_extension": ".py", 208 | "mimetype": "text/x-python", 209 | "name": "python", 210 | "nbconvert_exporter": "python", 211 | "pygments_lexer": "ipython2", 212 | "version": "2.7.10" 213 | } 214 | }, 215 | "nbformat": 4, 216 | "nbformat_minor": 2 217 | } 218 | -------------------------------------------------------------------------------- /notebooks/FeatureTransformation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Feature Transformation\n", 11 | "\n", 12 | "I am going to show off only two parts of the massive quantity of code in the unsupervised learning section of sklearn. And they can be put into this single bucket:\n", 13 | "\n", 14 | "* Feature Transformation\n", 15 | "* Exploratory Data Analysis\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "deletable": true, 22 | "editable": true 23 | }, 24 | "source": [ 25 | "## Clustering\n", 26 | "\n", 27 | "Clustering of unlabeled data can be performed with the module sklearn.cluster.\n", 28 | "Each clustering algorithm comes in two variants: a class, that implements the fit method to learn the clusters on train data, and a function, that, given train data, returns an array of integer labels corresponding to the different clusters. For the class, the labels over the training data can be found in the labels_ attribute." 
29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "deletable": true, 35 | "editable": true 36 | }, 37 | "source": [ 38 | "#### Kmeans\n", 39 | "\n", 40 | "The KMeans algorithm clusters data by trying to separate samples in n groups of equal variance, minimizing a criterion known as the inertia or within-cluster sum-of-squares. This algorithm requires the number of clusters to be specified. It scales well to large number of samples and has been used across a large range of application areas in many different fields.\n", 41 | "\n", 42 | "Let's check out how it is used" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 1, 48 | "metadata": { 49 | "collapsed": true, 50 | "deletable": true, 51 | "editable": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "from sklearn.cluster import KMeans\n", 56 | "\n", 57 | "KMeans?" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": { 64 | "collapsed": true, 65 | "deletable": true, 66 | "editable": true 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "from sklearn.datasets import load_iris\n", 71 | "\n", 72 | "X, y = load_iris(return_X_y=True)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 3, 78 | "metadata": { 79 | "collapsed": true, 80 | "deletable": true, 81 | "editable": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "cluster = KMeans(n_clusters=3)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": { 92 | "collapsed": false, 93 | "deletable": true, 94 | "editable": true 95 | }, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,\n", 101 | " n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',\n", 102 | " random_state=None, tol=0.0001, verbose=0)" 103 | ] 104 | }, 105 | "execution_count": 4, 106 | "metadata": {}, 107 | "output_type": "execute_result" 108 | } 109 | ], 110 | "source": [ 111 | "cluster.fit(X)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 5, 117 | "metadata": { 118 | "collapsed": false, 119 | "deletable": true, 120 | "editable": true 121 | }, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 127 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 128 | " 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 129 | " 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 130 | " 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1,\n", 131 | " 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,\n", 132 | " 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1], dtype=int32)" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "cluster.predict(X)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 6, 147 | "metadata": { 148 | "collapsed": false, 149 | "deletable": true, 150 | "editable": true 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "from sklearn.tree import DecisionTreeClassifier\n", 155 | "\n", 156 | "m = DecisionTreeClassifier(max_depth=2)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 7, 162 | "metadata": { 163 | "collapsed": false, 164 | "deletable": true, 165 | "editable": true 166 | }, 167 | "outputs": [ 168 
| { 169 | "data": { 170 | "text/plain": [ 171 | "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,\n", 172 | " max_features=None, max_leaf_nodes=None,\n", 173 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 174 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 175 | " presort=False, random_state=None, splitter='best')" 176 | ] 177 | }, 178 | "execution_count": 7, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "m.fit(cluster.predict(X)[:, None], y)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 8, 190 | "metadata": { 191 | "collapsed": false, 192 | "deletable": true, 193 | "editable": true 194 | }, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "0.89333333333333331" 200 | ] 201 | }, 202 | "execution_count": 8, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "m.score(cluster.predict(X)[:, None], y)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": { 214 | "deletable": true, 215 | "editable": true 216 | }, 217 | "source": [ 218 | "## Principal component analysis (PCA)\n", 219 | "\n", 220 | "PCA is used to decompose a multivariate dataset in a set of successive orthogonal components that explain a maximum amount of the variance. In scikit-learn, PCA is implemented as a transformer object that learns n components in its fit method, and can be used on new data to project it on these components.\n", 221 | "\n", 222 | "The optional parameter whiten=True makes it possible to project the data onto the singular space while scaling each component to unit variance. This is often useful if the models down-stream make strong assumptions on the isotropy of the signal: this is for example the case for Support Vector Machines with the RBF kernel and the K-Means clustering algorithm." 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 9, 228 | "metadata": { 229 | "collapsed": true, 230 | "deletable": true, 231 | "editable": true 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "from sklearn.decomposition import PCA\n", 236 | "\n", 237 | "PCA?" 
238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 10, 243 | "metadata": { 244 | "collapsed": false, 245 | "deletable": true, 246 | "editable": true 247 | }, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "(150, 2)" 253 | ] 254 | }, 255 | "execution_count": 10, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "from sklearn.svm import SVC\n", 262 | "\n", 263 | "pca = PCA(n_components=2)\n", 264 | "\n", 265 | "X_pca = pca.fit_transform(X)\n", 266 | "\n", 267 | "X_pca.shape" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 11, 273 | "metadata": { 274 | "collapsed": false, 275 | "deletable": true, 276 | "editable": true 277 | }, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "0.95333333333333337" 283 | ] 284 | }, 285 | "execution_count": 11, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "SVC().fit(X_pca, y).score(X_pca, y)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "collapsed": true, 299 | "deletable": true, 300 | "editable": true 301 | }, 302 | "outputs": [], 303 | "source": [] 304 | } 305 | ], 306 | "metadata": { 307 | "kernelspec": { 308 | "display_name": "Python 2", 309 | "language": "python", 310 | "name": "python2" 311 | }, 312 | "language_info": { 313 | "codemirror_mode": { 314 | "name": "ipython", 315 | "version": 2 316 | }, 317 | "file_extension": ".py", 318 | "mimetype": "text/x-python", 319 | "name": "python", 320 | "nbconvert_exporter": "python", 321 | "pygments_lexer": "ipython2", 322 | "version": "2.7.10" 323 | } 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 2 327 | } 328 | -------------------------------------------------------------------------------- /notebooks/PipelinesAndFeatureUnions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "## Pipelines\n", 11 | "\n", 12 | "Pipeline can be used to chain multiple estimators into one. This is useful as there is often a fixed sequence of steps in processing the data, for example feature selection, normalization and classification. Pipeline serves two purposes here:\n", 13 | "\n", 14 | "* Convenience: You only have to call fit and predict once on your data to fit a whole sequence of estimators.\n", 15 | "* Joint parameter selection: You can grid search over parameters of all estimators in the pipeline at once.\n", 16 | "\n", 17 | "All estimators in a pipeline, except the last one, must be transformers (i.e. must have a transform method). The last estimator may be any type (transformer, classifier, etc.).\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": { 24 | "collapsed": true, 25 | "deletable": true, 26 | "editable": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "from sklearn.pipeline import Pipeline\n", 31 | "\n", 32 | "Pipeline?" 
33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "collapsed": false, 40 | "deletable": true, 41 | "editable": true 42 | }, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 48 | " svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 49 | " decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n", 50 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 51 | " tol=0.001, verbose=False))])" 52 | ] 53 | }, 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "from sklearn.svm import SVC\n", 61 | "from sklearn.decomposition import PCA\n", 62 | "estimators = [('reduce_dim', PCA(n_components=2)), ('clf', SVC())]\n", 63 | "pipe = Pipeline(estimators)\n", 64 | "pipe \n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": true, 72 | "deletable": true, 73 | "editable": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "from sklearn.datasets import load_iris\n", 78 | "\n", 79 | "X, y = load_iris(return_X_y=True)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": { 86 | "collapsed": false, 87 | "deletable": true, 88 | "editable": true 89 | }, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/plain": [ 94 | "0.95333333333333337" 95 | ] 96 | }, 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "# Notice no need to PCA the Xs in the score!\n", 104 | "pipe.fit(X, y).score(X, y)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": { 110 | "deletable": true, 111 | "editable": true 112 | }, 113 | "source": [ 114 | "The utility function make_pipeline is a shorthand for constructing pipelines; it takes a variable number of estimators and returns a pipeline, filling in the names automatically:" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 5, 120 | "metadata": { 121 | "collapsed": false, 122 | "deletable": true, 123 | "editable": true 124 | }, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "Pipeline(steps=[('binarizer', Binarizer(copy=True, threshold=0.0)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])" 130 | ] 131 | }, 132 | "execution_count": 5, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "from sklearn.pipeline import make_pipeline\n", 139 | "from sklearn.naive_bayes import MultinomialNB\n", 140 | "from sklearn.preprocessing import Binarizer\n", 141 | "make_pipeline(Binarizer(), MultinomialNB()) \n" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 6, 147 | "metadata": { 148 | "collapsed": false, 149 | "deletable": true, 150 | "editable": true 151 | }, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "('reduce_dim',\n", 157 | " PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 158 | " svd_solver='auto', tol=0.0, whiten=False))" 159 | ] 160 | }, 161 | "execution_count": 6, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "pipe.steps[0]" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | 
"execution_count": 7, 173 | "metadata": { 174 | "collapsed": false, 175 | "deletable": true, 176 | "editable": true 177 | }, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 183 | " svd_solver='auto', tol=0.0, whiten=False)" 184 | ] 185 | }, 186 | "execution_count": 7, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "pipe.named_steps['reduce_dim']" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 8, 198 | "metadata": { 199 | "collapsed": false, 200 | "deletable": true, 201 | "editable": true 202 | }, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 208 | " svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,\n", 209 | " decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n", 210 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 211 | " tol=0.001, verbose=False))])" 212 | ] 213 | }, 214 | "execution_count": 8, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "pipe.set_params(clf__C=10) " 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 9, 226 | "metadata": { 227 | "collapsed": true, 228 | "deletable": true, 229 | "editable": true 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "from sklearn.model_selection import GridSearchCV\n", 234 | "params = dict(reduce_dim__n_components=[2, 5, 10],\n", 235 | " clf__C=[0.1, 10, 100])\n", 236 | "grid_search = GridSearchCV(pipe, param_grid=params)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 10, 242 | "metadata": { 243 | "collapsed": true, 244 | "deletable": true, 245 | "editable": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "from sklearn.linear_model import LogisticRegression\n", 250 | "params = dict(reduce_dim=[None, PCA(5), PCA(10)],\n", 251 | " clf=[SVC(), LogisticRegression()],\n", 252 | " clf__C=[0.1, 10, 100])\n", 253 | "grid_search = GridSearchCV(pipe, param_grid=params)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "deletable": true, 260 | "editable": true 261 | }, 262 | "source": [ 263 | "## Feature Union\n", 264 | "\n", 265 | "FeatureUnion combines several transformer objects into a new transformer that combines their output. A FeatureUnion takes a list of transformer objects. During fitting, each of these is fit to the data independently. For transforming data, the transformers are applied in parallel, and the sample vectors they output are concatenated end-to-end into larger vectors.\n", 266 | "\n", 267 | "FeatureUnion serves the same purposes as Pipeline - convenience and joint parameter estimation and validation.\n", 268 | "\n", 269 | "FeatureUnion and Pipeline can be combined to create complex models.\n", 270 | "\n", 271 | "(A FeatureUnion has no way of checking whether two transformers might produce identical features. 
It only produces a union when the feature sets are disjoint, and making sure they are the caller’s responsibility.)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": { 278 | "collapsed": false, 279 | "deletable": true, 280 | "editable": true 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "from sklearn.pipeline import FeatureUnion\n", 285 | "from sklearn.decomposition import PCA\n", 286 | "from sklearn.decomposition import KernelPCA\n", 287 | "estimators = [('linear_pca', PCA()), ('kernel_pca', KernelPCA())]\n", 288 | "combined = FeatureUnion(estimators)\n", 289 | "combined \n" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": { 296 | "collapsed": false, 297 | "deletable": true, 298 | "editable": true 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "combined.fit_transform(X).shape" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": { 309 | "collapsed": false, 310 | "deletable": true, 311 | "editable": true 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "combined.set_params(kernel_pca=None) " 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": false, 323 | "deletable": true, 324 | "editable": true 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "combined.fit_transform(X).shape" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": { 335 | "collapsed": true, 336 | "deletable": true, 337 | "editable": true 338 | }, 339 | "outputs": [], 340 | "source": [] 341 | } 342 | ], 343 | "metadata": { 344 | "kernelspec": { 345 | "display_name": "Python 2", 346 | "language": "python", 347 | "name": "python2" 348 | }, 349 | "language_info": { 350 | "codemirror_mode": { 351 | "name": "ipython", 352 | "version": 2 353 | }, 354 | "file_extension": ".py", 355 | "mimetype": "text/x-python", 356 | "name": "python", 357 | "nbconvert_exporter": "python", 358 | "pygments_lexer": "ipython2", 359 | "version": "2.7.10" 360 | } 361 | }, 362 | "nbformat": 4, 363 | "nbformat_minor": 2 364 | } 365 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/PipelinesAndFeatureUnions-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Pipelines\n", 8 | "\n", 9 | "Pipeline can be used to chain multiple estimators into one. This is useful as there is often a fixed sequence of steps in processing the data, for example feature selection, normalization and classification. Pipeline serves two purposes here:\n", 10 | "\n", 11 | "* Convenience: You only have to call fit and predict once on your data to fit a whole sequence of estimators.\n", 12 | "* Joint parameter selection: You can grid search over parameters of all estimators in the pipeline at once.\n", 13 | "\n", 14 | "All estimators in a pipeline, except the last one, must be transformers (i.e. must have a transform method). The last estimator may be any type (transformer, classifier, etc.).\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from sklearn.pipeline import Pipeline\n", 26 | "\n", 27 | "Pipeline?" 
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 41 | " svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 42 | " decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n", 43 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 44 | " tol=0.001, verbose=False))])" 45 | ] 46 | }, 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "from sklearn.svm import SVC\n", 54 | "from sklearn.decomposition import PCA\n", 55 | "estimators = [('reduce_dim', PCA(n_components=2)), ('clf', SVC())]\n", 56 | "pipe = Pipeline(estimators)\n", 57 | "pipe \n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 11, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "from sklearn.datasets import load_iris\n", 69 | "\n", 70 | "X, y = load_iris(return_X_y=True)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 12, 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "0.97333333333333338" 84 | ] 85 | }, 86 | "execution_count": 12, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "# Notice no need to PCA the Xs in the score!\n", 93 | "pipe.fit(X, y).score(X, y)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "The utility function make_pipeline is a shorthand for constructing pipelines; it takes a variable number of estimators and returns a pipeline, filling in the names automatically:" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 4, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "Pipeline(steps=[('binarizer', Binarizer(copy=True, threshold=0.0)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])" 114 | ] 115 | }, 116 | "execution_count": 4, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "from sklearn.pipeline import make_pipeline\n", 123 | "from sklearn.naive_bayes import MultinomialNB\n", 124 | "from sklearn.preprocessing import Binarizer\n", 125 | "make_pipeline(Binarizer(), MultinomialNB()) \n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 5, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "('reduce_dim',\n", 139 | " PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 140 | " svd_solver='auto', tol=0.0, whiten=False))" 141 | ] 142 | }, 143 | "execution_count": 5, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "pipe.steps[0]" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 163 | " svd_solver='auto', tol=0.0, whiten=False)" 164 | ] 165 | }, 166 | 
"execution_count": 6, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "pipe.named_steps['reduce_dim']" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 7, 178 | "metadata": { 179 | "collapsed": false 180 | }, 181 | "outputs": [ 182 | { 183 | "data": { 184 | "text/plain": [ 185 | "Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 186 | " svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,\n", 187 | " decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n", 188 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 189 | " tol=0.001, verbose=False))])" 190 | ] 191 | }, 192 | "execution_count": 7, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "pipe.set_params(clf__C=10) " 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 8, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "from sklearn.model_selection import GridSearchCV\n", 210 | "params = dict(reduce_dim__n_components=[2, 5, 10],\n", 211 | " clf__C=[0.1, 10, 100])\n", 212 | "grid_search = GridSearchCV(pipe, param_grid=params)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 9, 218 | "metadata": { 219 | "collapsed": true 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "from sklearn.linear_model import LogisticRegression\n", 224 | "params = dict(reduce_dim=[None, PCA(5), PCA(10)],\n", 225 | " clf=[SVC(), LogisticRegression()],\n", 226 | " clf__C=[0.1, 10, 100])\n", 227 | "grid_search = GridSearchCV(pipe, param_grid=params)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "## Feature Union\n", 235 | "\n", 236 | "FeatureUnion combines several transformer objects into a new transformer that combines their output. A FeatureUnion takes a list of transformer objects. During fitting, each of these is fit to the data independently. For transforming data, the transformers are applied in parallel, and the sample vectors they output are concatenated end-to-end into larger vectors.\n", 237 | "\n", 238 | "FeatureUnion serves the same purposes as Pipeline - convenience and joint parameter estimation and validation.\n", 239 | "\n", 240 | "FeatureUnion and Pipeline can be combined to create complex models.\n", 241 | "\n", 242 | "(A FeatureUnion has no way of checking whether two transformers might produce identical features. 
It only produces a union when the feature sets are disjoint, and making sure they are the caller’s responsibility.)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 17, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "FeatureUnion(n_jobs=1,\n", 256 | " transformer_list=[('linear_pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,\n", 257 | " svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',\n", 258 | " fit_inverse_transform=False, gamma=None, kernel='linear',\n", 259 | " kernel_params=None, max_iter=None, n_components=None, n_jobs=1,\n", 260 | " random_state=None, remove_zero_eig=False, tol=0))],\n", 261 | " transformer_weights=None)" 262 | ] 263 | }, 264 | "execution_count": 17, 265 | "metadata": {}, 266 | "output_type": "execute_result" 267 | } 268 | ], 269 | "source": [ 270 | "from sklearn.pipeline import FeatureUnion\n", 271 | "from sklearn.decomposition import PCA\n", 272 | "from sklearn.decomposition import KernelPCA\n", 273 | "estimators = [('linear_pca', PCA()), ('kernel_pca', KernelPCA())]\n", 274 | "combined = FeatureUnion(estimators)\n", 275 | "combined \n" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 18, 281 | "metadata": { 282 | "collapsed": false 283 | }, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "(150, 78)" 289 | ] 290 | }, 291 | "execution_count": 18, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "combined.fit_transform(X).shape" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 19, 303 | "metadata": { 304 | "collapsed": false 305 | }, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "FeatureUnion(n_jobs=1,\n", 311 | " transformer_list=[('linear_pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,\n", 312 | " svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', None)],\n", 313 | " transformer_weights=None)" 314 | ] 315 | }, 316 | "execution_count": 19, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "combined.set_params(kernel_pca=None) " 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 20, 328 | "metadata": { 329 | "collapsed": false 330 | }, 331 | "outputs": [ 332 | { 333 | "data": { 334 | "text/plain": [ 335 | "(150, 4)" 336 | ] 337 | }, 338 | "execution_count": 20, 339 | "metadata": {}, 340 | "output_type": "execute_result" 341 | } 342 | ], 343 | "source": [ 344 | "combined.fit_transform(X).shape" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": { 351 | "collapsed": true 352 | }, 353 | "outputs": [], 354 | "source": [] 355 | } 356 | ], 357 | "metadata": { 358 | "kernelspec": { 359 | "display_name": "Python 2", 360 | "language": "python", 361 | "name": "python2" 362 | }, 363 | "language_info": { 364 | "codemirror_mode": { 365 | "name": "ipython", 366 | "version": 2 367 | }, 368 | "file_extension": ".py", 369 | "mimetype": "text/x-python", 370 | "name": "python", 371 | "nbconvert_exporter": "python", 372 | "pygments_lexer": "ipython2", 373 | "version": "2.7.10" 374 | } 375 | }, 376 | "nbformat": 4, 377 | "nbformat_minor": 2 378 | } 379 | -------------------------------------------------------------------------------- 
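[Editor's note: the FeatureUnion cells above only union two PCA variants. A minimal sketch, assuming the same iris X and y, of the common pattern of concatenating heterogeneous transformers and then nesting the union inside a Pipeline.]

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)

# Concatenate 2 PCA components with the single best univariate feature -> 3 columns.
union = FeatureUnion([('pca', PCA(n_components=2)),
                      ('kbest', SelectKBest(k=1))])
print(union.fit_transform(X, y).shape)   # (150, 3)

# The union is itself a transformer, so it can serve as a pipeline step.
model = Pipeline([('features', union), ('clf', SVC())])
print(model.fit(X, y).score(X, y))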
/notebooks/.ipynb_checkpoints/FeatureSelection-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Selection\n", 8 | "\n", 9 | "The classes in the sklearn.feature_selection module can be used for feature selection/dimensionality reduction on sample sets, either to improve estimators’ accuracy scores or to boost their performance on very high-dimensional datasets." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Remove Low Var Features\n", 17 | "\n", 18 | "VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn’t meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in all samples.\n", 19 | "\n", 20 | "Again we are starting to see fit and fit_transform pop up again. Sklearn provides a ton of functionality that's not just prediction. Some of the functionality is preprocessing the data. Again these are like models (they can only rely on the training data) but don't really predict anything. Thus they do have a fit method, but don't have a predict method. We will see two examples of this type of paradigm below." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "array([[0, 1],\n", 34 | " [1, 0],\n", 35 | " [0, 0],\n", 36 | " [1, 1],\n", 37 | " [1, 0],\n", 38 | " [1, 1]])" 39 | ] 40 | }, 41 | "execution_count": 1, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "from sklearn.feature_selection import VarianceThreshold\n", 48 | "\n", 49 | "X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]\n", 50 | "\n", 51 | "sel = VarianceThreshold(threshold=(.8 * (1 - .8)))\n", 52 | "\n", 53 | "sel.fit(X)\n", 54 | "\n", 55 | "sel.transform(X)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Univariate Feature Selection\n", 63 | "\n", 64 | "Univariate feature selection works by selecting the best features based on univariate statistical tests. It can be seen as a preprocessing step to an estimator. Scikit-learn exposes feature selection routines as objects that implement the transform method:\n", 65 | "* SelectKBest removes all but the k highest scoring features\n", 66 | "* SelectPercentile removes all but a user-specified highest scoring percentage of features\n", 67 | "* using common univariate statistical tests for each feature: false positive rate SelectFpr, false discovery rate SelectFdr, or family wise error SelectFwe.\n", 68 | "* GenericUnivariateSelect allows to perform univariate feature selection with a configurable strategy. This allows to select the best univariate selection strategy with hyper-parameter search estimator.\n", 69 | "\n", 70 | "These objects take as input a scoring function that returns univariate scores and p-values (or only scores for SelectKBest and SelectPercentile):\n", 71 | "\n", 72 | "* For regression: f_regression, mutual_info_regression\n", 73 | "* For classification: chi2, f_classif, mutual_info_classif\n", 74 | "\n", 75 | "The methods based on F-test estimate the degree of linear dependency between two random variables. 
On the other hand, mutual information methods can capture any kind of statistical dependency, but being nonparametric, they require more samples for accurate estimation." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 2, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "from sklearn.datasets import load_iris\n", 87 | "from sklearn.feature_selection import SelectKBest\n", 88 | "from sklearn.feature_selection import chi2\n", 89 | "\n", 90 | "SelectKBest?" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 5, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "SelectKBest(k=2, score_func=)" 104 | ] 105 | }, 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "X, y = load_iris(return_X_y=True)\n", 113 | "\n", 114 | "sel = SelectKBest(chi2, k=2)\n", 115 | "\n", 116 | "sel.fit(X, y)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 7, 122 | "metadata": { 123 | "collapsed": false 124 | }, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "(150, 2)" 130 | ] 131 | }, 132 | "execution_count": 7, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "sel.transform(X).shape" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "array([ 10.81782088, 3.59449902, 116.16984746, 67.24482759])" 152 | ] 153 | }, 154 | "execution_count": 8, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "sel.scores_" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "## Recursive feature elimination\n", 168 | "\n", 169 | "Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and weights are assigned to each one of them. Then, features whose absolute weights are the smallest are pruned from the current set features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.\n", 170 | "\n", 171 | "So it is very important to normalize these features in linear models!" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 14, 177 | "metadata": { 178 | "collapsed": true 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "from sklearn.ensemble import RandomForestClassifier\n", 183 | "from sklearn.feature_selection import RFECV\n", 184 | "\n", 185 | "RFECV?" 
186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 15, 191 | "metadata": { 192 | "collapsed": true 193 | }, 194 | "outputs": [], 195 | "source": [ 196 | "m = RFECV(RandomForestClassifier(), scoring='accuracy')" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 16, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "RFECV(cv=None,\n", 210 | " estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 211 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 212 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 213 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 214 | " n_estimators=10, n_jobs=1, oob_score=False, random_state=None,\n", 215 | " verbose=0, warm_start=False),\n", 216 | " n_jobs=1, scoring='accuracy', step=1, verbose=0)" 217 | ] 218 | }, 219 | "execution_count": 16, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "m.fit(X, y)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## Feature selection using SelectFromModel\n", 233 | "\n", 234 | "SelectFromModel is a meta-transformer that can be used along with any estimator that has a coef_ or feature_importances_ attribute after fitting. The features are considered unimportant and removed, if the corresponding coef_ or feature_importances_ values are below the provided threshold parameter. Apart from specifying the threshold numerically, there are built-in heuristics for finding a threshold using a string argument. Available heuristics are “mean”, “median” and float multiples of these like “0.1*mean”.\n", 235 | "\n", 236 | "For examples on how it is to be used refer to the sections below." 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 18, 242 | "metadata": { 243 | "collapsed": true 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "from sklearn.svm import LinearSVC\n", 248 | "from sklearn.feature_selection import SelectFromModel\n", 249 | "\n", 250 | "SelectFromModel?" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 19, 256 | "metadata": { 257 | "collapsed": false 258 | }, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/plain": [ 263 | "SelectFromModel(estimator=LinearSVC(C=0.01, class_weight=None, dual=False, fit_intercept=True,\n", 264 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 265 | " multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,\n", 266 | " verbose=0),\n", 267 | " prefit=False, threshold=None)" 268 | ] 269 | }, 270 | "execution_count": 19, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "m = SelectFromModel(LinearSVC(C=0.01, penalty='l1', dual=False))\n", 277 | "\n", 278 | "m.fit(X, y)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 22, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "(150, 3)" 292 | ] 293 | }, 294 | "execution_count": 22, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "m.transform(X).shape" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "A little bit more complex!" 
308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 28, 313 | "metadata": { 314 | "collapsed": false 315 | }, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "(506, 13)\n" 322 | ] 323 | }, 324 | { 325 | "data": { 326 | "text/plain": [ 327 | "(506, 10)" 328 | ] 329 | }, 330 | "execution_count": 28, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "from sklearn.linear_model import LassoCV\n", 337 | "from sklearn.datasets import load_boston\n", 338 | "\n", 339 | "X, y = load_boston(return_X_y=True)\n", 340 | "\n", 341 | "print X.shape\n", 342 | "\n", 343 | "m = SelectFromModel(LassoCV())\n", 344 | "\n", 345 | "m.fit(X, y)\n", 346 | "\n", 347 | "m.transform(X).shape" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": { 354 | "collapsed": false 355 | }, 356 | "outputs": [], 357 | "source": [] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": { 363 | "collapsed": true 364 | }, 365 | "outputs": [], 366 | "source": [] 367 | } 368 | ], 369 | "metadata": { 370 | "kernelspec": { 371 | "display_name": "Python 2", 372 | "language": "python", 373 | "name": "python2" 374 | }, 375 | "language_info": { 376 | "codemirror_mode": { 377 | "name": "ipython", 378 | "version": 2 379 | }, 380 | "file_extension": ".py", 381 | "mimetype": "text/x-python", 382 | "name": "python", 383 | "nbconvert_exporter": "python", 384 | "pygments_lexer": "ipython2", 385 | "version": "2.7.10" 386 | } 387 | }, 388 | "nbformat": 4, 389 | "nbformat_minor": 2 390 | } 391 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/FeatureExtraction-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Extraction\n", 8 | "\n", 9 | "The sklearn.feature_extraction module can be used to extract features in a format supported by machine learning algorithms from datasets consisting of formats such as text and image.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Loading features from dicts\n", 17 | "\n", 18 | "The class DictVectorizer can be used to convert feature arrays represented as lists of standard Python dict objects to the NumPy/SciPy representation used by scikit-learn estimators.\n", 19 | "\n", 20 | "While not particularly fast to process, Python’s dict has the advantages of being convenient to use, being sparse (absent features need not be stored) and storing feature names in addition to values.\n", 21 | "\n", 22 | "DictVectorizer implements what is called one-of-K or “one-hot” coding for categorical (aka nominal, discrete) features. Categorical features are “attribute-value” pairs where the value is restricted to a list of discrete of possibilities without ordering (e.g. 
topic identifiers, types of objects, tags, names...).\n", 23 | "\n", 24 | "In the following, “city” is a categorical attribute while “temperature” is a traditional numerical feature:" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "array([[ 1., 0., 0., 33.],\n", 38 | " [ 0., 1., 0., 12.],\n", 39 | " [ 0., 0., 1., 18.]])" 40 | ] 41 | }, 42 | "execution_count": 1, 43 | "metadata": {}, 44 | "output_type": "execute_result" 45 | } 46 | ], 47 | "source": [ 48 | "measurements = [\n", 49 | " {'city': 'Dubai', 'temperature': 33.},\n", 50 | " {'city': 'London', 'temperature': 12.},\n", 51 | " {'city': 'San Fransisco', 'temperature': 18.},\n", 52 | "]\n", 53 | "\n", 54 | "from sklearn.feature_extraction import DictVectorizer\n", 55 | "vec = DictVectorizer()\n", 56 | "\n", 57 | "vec.fit_transform(measurements).toarray()\n", 58 | "\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 2, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature']" 72 | ] 73 | }, 74 | "execution_count": 2, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "\n", 81 | "vec.get_feature_names()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Text feature extraction\n", 89 | "\n", 90 | "Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect numerical feature vectors with a fixed size rather than the raw text documents with variable length.\n", 91 | "\n", 92 | "In order to address this, scikit-learn provides utilities for the most common ways to extract numerical features from text content, namely:\n", 93 | "\n", 94 | "* tokenizing strings and giving an integer id for each possible token, for instance by using white-spaces and punctuation as token separators.\n", 95 | "* counting the occurrences of tokens in each document.\n", 96 | "* normalizing and weighting with diminishing importance tokens that occur in the majority of samples / documents.\n", 97 | "\n", 98 | "In this scheme, features and samples are defined as follows:\n", 99 | "\n", 100 | "* each individual token occurrence frequency (normalized or not) is treated as a feature.\n", 101 | "* the vector of all the token frequencies for a given document is considered a multivariate sample.\n", 102 | "\n", 103 | "A corpus of documents can thus be represented by a matrix with one row per document and one column per token (e.g. word) occurring in the corpus.\n", 104 | "\n", 105 | "We call vectorization the general process of turning a collection of text documents into numerical feature vectors. This specific strategy (tokenization, counting and normalization) is called the Bag of Words or “Bag of n-grams” representation. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document." 
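[Editor's note: one property of the DictVectorizer section above worth spelling out — feature names never seen during fit are silently dropped at transform time. A small sketch with hypothetical toy dicts; 'Paris' is an example value invented for illustration.]

from sklearn.feature_extraction import DictVectorizer

train = [{'city': 'Dubai', 'temperature': 33.},
         {'city': 'London', 'temperature': 12.}]
vec = DictVectorizer()
vec.fit(train)

# 'Paris' was never seen while fitting, so no one-hot column exists for it
# and only the temperature survives in the transformed row.
print(vec.transform([{'city': 'Paris', 'temperature': 20.}]).toarray())
print(vec.get_feature_names())   # ['city=Dubai', 'city=London', 'temperature']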
106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "CountVectorizer implements both tokenization and occurrence counting in a single class:" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 5, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "from sklearn.feature_extraction.text import CountVectorizer\n", 124 | "\n", 125 | "CountVectorizer?" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 4, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',\n", 139 | " dtype=, encoding=u'utf-8', input=u'content',\n", 140 | " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", 141 | " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", 142 | " strip_accents=None, token_pattern=u'(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 143 | " tokenizer=None, vocabulary=None)" 144 | ] 145 | }, 146 | "execution_count": 4, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "vectorizer = CountVectorizer(min_df=1)\n", 153 | "vectorizer " 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 6, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "<4x9 sparse matrix of type ''\n", 167 | "\twith 19 stored elements in Compressed Sparse Row format>" 168 | ] 169 | }, 170 | "execution_count": 6, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "corpus = [\n", 177 | " 'This is the first document.',\n", 178 | " 'This is the second second document.',\n", 179 | " 'And the third one.',\n", 180 | " 'Is this the first document?',\n", 181 | "]\n", 182 | "X = vectorizer.fit_transform(corpus)\n", 183 | "X \n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 7, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "array([[0, 1, 1, 1, 0, 0, 1, 0, 1],\n", 197 | " [0, 1, 0, 1, 0, 2, 1, 0, 1],\n", 198 | " [1, 0, 0, 0, 1, 0, 1, 1, 0],\n", 199 | " [0, 1, 1, 1, 0, 0, 1, 0, 1]])" 200 | ] 201 | }, 202 | "execution_count": 7, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "X.toarray()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 9, 214 | "metadata": { 215 | "collapsed": false 216 | }, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "text/plain": [ 221 | "[u'this', u'is', u'text', u'document', u'to', u'analyze']" 222 | ] 223 | }, 224 | "execution_count": 9, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | } 228 | ], 229 | "source": [ 230 | "analyze = vectorizer.build_analyzer()\n", 231 | "analyze(\"This is a text document to analyze.\")" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 10, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/plain": [ 244 | "[u'and',\n", 245 | " u'document',\n", 246 | " u'first',\n", 247 | " u'is',\n", 248 | " u'one',\n", 249 | " u'second',\n", 250 | " u'the',\n", 251 | " u'third',\n", 252 | " u'this']" 253 | ] 254 | }, 255 | "execution_count": 10, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | 
"source": [ 261 | "vectorizer.get_feature_names()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 11, 267 | "metadata": { 268 | "collapsed": false 269 | }, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "1" 275 | ] 276 | }, 277 | "execution_count": 11, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "vectorizer.vocabulary_.get('document')" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 12, 289 | "metadata": { 290 | "collapsed": false 291 | }, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])" 297 | ] 298 | }, 299 | "execution_count": 12, 300 | "metadata": {}, 301 | "output_type": "execute_result" 302 | } 303 | ], 304 | "source": [ 305 | "vectorizer.transform(['Something completely new.']).toarray()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 13, 311 | "metadata": { 312 | "collapsed": false 313 | }, 314 | "outputs": [ 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "TfidfTransformer(norm=u'l2', smooth_idf=False, sublinear_tf=False,\n", 319 | " use_idf=True)" 320 | ] 321 | }, 322 | "execution_count": 13, 323 | "metadata": {}, 324 | "output_type": "execute_result" 325 | } 326 | ], 327 | "source": [ 328 | "from sklearn.feature_extraction.text import TfidfTransformer\n", 329 | "transformer = TfidfTransformer(smooth_idf=False)\n", 330 | "transformer \n" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 14, 336 | "metadata": { 337 | "collapsed": false 338 | }, 339 | "outputs": [ 340 | { 341 | "data": { 342 | "text/plain": [ 343 | "<6x3 sparse matrix of type ''\n", 344 | "\twith 9 stored elements in Compressed Sparse Row format>" 345 | ] 346 | }, 347 | "execution_count": 14, 348 | "metadata": {}, 349 | "output_type": "execute_result" 350 | } 351 | ], 352 | "source": [ 353 | "counts = [[3, 0, 1],\n", 354 | " [2, 0, 0],\n", 355 | " [3, 0, 0],\n", 356 | " [4, 0, 0],\n", 357 | " [3, 2, 0],\n", 358 | " [3, 0, 2]]\n", 359 | "\n", 360 | "tfidf = transformer.fit_transform(counts)\n", 361 | "tfidf \n" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 15, 367 | "metadata": { 368 | "collapsed": false 369 | }, 370 | "outputs": [ 371 | { 372 | "data": { 373 | "text/plain": [ 374 | "array([[ 0.81940995, 0. , 0.57320793],\n", 375 | " [ 1. , 0. , 0. ],\n", 376 | " [ 1. , 0. , 0. ],\n", 377 | " [ 1. , 0. , 0. ],\n", 378 | " [ 0.47330339, 0.88089948, 0. ],\n", 379 | " [ 0.58149261, 0. , 0.81355169]])" 380 | ] 381 | }, 382 | "execution_count": 15, 383 | "metadata": {}, 384 | "output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "tfidf.toarray() " 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 17, 394 | "metadata": { 395 | "collapsed": true 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 400 | "\n", 401 | "TfidfVectorizer?" 
402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": { 408 | "collapsed": true 409 | }, 410 | "outputs": [], 411 | "source": [] 412 | } 413 | ], 414 | "metadata": { 415 | "kernelspec": { 416 | "display_name": "Python 2", 417 | "language": "python", 418 | "name": "python2" 419 | }, 420 | "language_info": { 421 | "codemirror_mode": { 422 | "name": "ipython", 423 | "version": 2 424 | }, 425 | "file_extension": ".py", 426 | "mimetype": "text/x-python", 427 | "name": "python", 428 | "nbconvert_exporter": "python", 429 | "pygments_lexer": "ipython2", 430 | "version": "2.7.10" 431 | } 432 | }, 433 | "nbformat": 4, 434 | "nbformat_minor": 2 435 | } 436 | -------------------------------------------------------------------------------- /notebooks/FeatureExtraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Feature Extraction\n", 11 | "\n", 12 | "The sklearn.feature_extraction module can be used to extract features in a format supported by machine learning algorithms from datasets consisting of formats such as text and image.\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": { 18 | "deletable": true, 19 | "editable": true 20 | }, 21 | "source": [ 22 | "## Loading features from dicts\n", 23 | "\n", 24 | "The class DictVectorizer can be used to convert feature arrays represented as lists of standard Python dict objects to the NumPy/SciPy representation used by scikit-learn estimators.\n", 25 | "\n", 26 | "While not particularly fast to process, Python’s dict has the advantages of being convenient to use, being sparse (absent features need not be stored) and storing feature names in addition to values.\n", 27 | "\n", 28 | "DictVectorizer implements what is called one-of-K or “one-hot” coding for categorical (aka nominal, discrete) features. Categorical features are “attribute-value” pairs where the value is restricted to a list of discrete of possibilities without ordering (e.g. 
topic identifiers, types of objects, tags, names...).\n", 29 | "\n", 30 | "In the following, “city” is a categorical attribute while “temperature” is a traditional numerical feature:" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 1, 36 | "metadata": { 37 | "collapsed": false, 38 | "deletable": true, 39 | "editable": true 40 | }, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "array([[ 1., 0., 0., 33.],\n", 46 | " [ 0., 1., 0., 12.],\n", 47 | " [ 0., 0., 1., 18.]])" 48 | ] 49 | }, 50 | "execution_count": 1, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "measurements = [\n", 57 | " {'city': 'Dubai', 'temperature': 33.},\n", 58 | " {'city': 'London', 'temperature': 12.},\n", 59 | " {'city': 'San Fransisco', 'temperature': 18.},\n", 60 | "]\n", 61 | "\n", 62 | "from sklearn.feature_extraction import DictVectorizer\n", 63 | "vec = DictVectorizer()\n", 64 | "\n", 65 | "vec.fit_transform(measurements).toarray()\n", 66 | "\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 2, 72 | "metadata": { 73 | "collapsed": false, 74 | "deletable": true, 75 | "editable": true 76 | }, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature']" 82 | ] 83 | }, 84 | "execution_count": 2, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "\n", 91 | "vec.get_feature_names()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "deletable": true, 98 | "editable": true 99 | }, 100 | "source": [ 101 | "## Text feature extraction\n", 102 | "\n", 103 | "Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect numerical feature vectors with a fixed size rather than the raw text documents with variable length.\n", 104 | "\n", 105 | "In order to address this, scikit-learn provides utilities for the most common ways to extract numerical features from text content, namely:\n", 106 | "\n", 107 | "* tokenizing strings and giving an integer id for each possible token, for instance by using white-spaces and punctuation as token separators.\n", 108 | "* counting the occurrences of tokens in each document.\n", 109 | "* normalizing and weighting with diminishing importance tokens that occur in the majority of samples / documents.\n", 110 | "\n", 111 | "In this scheme, features and samples are defined as follows:\n", 112 | "\n", 113 | "* each individual token occurrence frequency (normalized or not) is treated as a feature.\n", 114 | "* the vector of all the token frequencies for a given document is considered a multivariate sample.\n", 115 | "\n", 116 | "A corpus of documents can thus be represented by a matrix with one row per document and one column per token (e.g. word) occurring in the corpus.\n", 117 | "\n", 118 | "We call vectorization the general process of turning a collection of text documents into numerical feature vectors. This specific strategy (tokenization, counting and normalization) is called the Bag of Words or “Bag of n-grams” representation. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document." 
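[Editor's note: the "Bag of n-grams" wording above can be made concrete. A small sketch on a two-document toy corpus (a trimmed version of the corpus used in the next cells) that counts word bigrams as well as single words, so some local ordering survives.]

from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'This is the first document.',
    'Is this the first document?',
]

# ngram_range=(1, 2) keeps the unigrams and adds adjacent word pairs as extra
# features, so 'is this' and 'this is' become distinct columns.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
X2 = bigram_vectorizer.fit_transform(corpus)
print(bigram_vectorizer.get_feature_names())
print(X2.toarray())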
119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": { 124 | "deletable": true, 125 | "editable": true 126 | }, 127 | "source": [ 128 | "CountVectorizer implements both tokenization and occurrence counting in a single class:" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 3, 134 | "metadata": { 135 | "collapsed": true, 136 | "deletable": true, 137 | "editable": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "from sklearn.feature_extraction.text import CountVectorizer\n", 142 | "\n", 143 | "CountVectorizer?" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 4, 149 | "metadata": { 150 | "collapsed": false, 151 | "deletable": true, 152 | "editable": true 153 | }, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',\n", 159 | " dtype=, encoding=u'utf-8', input=u'content',\n", 160 | " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", 161 | " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", 162 | " strip_accents=None, token_pattern=u'(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 163 | " tokenizer=None, vocabulary=None)" 164 | ] 165 | }, 166 | "execution_count": 4, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "vectorizer = CountVectorizer(min_df=1)\n", 173 | "vectorizer " 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 5, 179 | "metadata": { 180 | "collapsed": false, 181 | "deletable": true, 182 | "editable": true 183 | }, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "<4x9 sparse matrix of type ''\n", 189 | "\twith 19 stored elements in Compressed Sparse Row format>" 190 | ] 191 | }, 192 | "execution_count": 5, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "corpus = [\n", 199 | " 'This is the first document.',\n", 200 | " 'This is the second second document.',\n", 201 | " 'And the third one.',\n", 202 | " 'Is this the first document?',\n", 203 | "]\n", 204 | "X = vectorizer.fit_transform(corpus)\n", 205 | "X \n" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 6, 211 | "metadata": { 212 | "collapsed": false, 213 | "deletable": true, 214 | "editable": true 215 | }, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "array([[0, 1, 1, 1, 0, 0, 1, 0, 1],\n", 221 | " [0, 1, 0, 1, 0, 2, 1, 0, 1],\n", 222 | " [1, 0, 0, 0, 1, 0, 1, 1, 0],\n", 223 | " [0, 1, 1, 1, 0, 0, 1, 0, 1]])" 224 | ] 225 | }, 226 | "execution_count": 6, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "X.toarray()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 7, 238 | "metadata": { 239 | "collapsed": false, 240 | "deletable": true, 241 | "editable": true 242 | }, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "[u'this', u'is', u'text', u'document', u'to', u'analyze']" 248 | ] 249 | }, 250 | "execution_count": 7, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "analyze = vectorizer.build_analyzer()\n", 257 | "analyze(\"This is a text document to analyze.\")" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 8, 263 | "metadata": { 264 | "collapsed": false, 265 | "deletable": true, 266 | "editable": true 267 | }, 268 | "outputs": [ 269 | { 270 | "data": { 
271 | "text/plain": [ 272 | "[u'and',\n", 273 | " u'document',\n", 274 | " u'first',\n", 275 | " u'is',\n", 276 | " u'one',\n", 277 | " u'second',\n", 278 | " u'the',\n", 279 | " u'third',\n", 280 | " u'this']" 281 | ] 282 | }, 283 | "execution_count": 8, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "vectorizer.get_feature_names()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 9, 295 | "metadata": { 296 | "collapsed": false, 297 | "deletable": true, 298 | "editable": true 299 | }, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "1" 305 | ] 306 | }, 307 | "execution_count": 9, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "vectorizer.vocabulary_.get('document')" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 10, 319 | "metadata": { 320 | "collapsed": false, 321 | "deletable": true, 322 | "editable": true 323 | }, 324 | "outputs": [ 325 | { 326 | "data": { 327 | "text/plain": [ 328 | "array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])" 329 | ] 330 | }, 331 | "execution_count": 10, 332 | "metadata": {}, 333 | "output_type": "execute_result" 334 | } 335 | ], 336 | "source": [ 337 | "vectorizer.transform(['Something completely new.']).toarray()" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 11, 343 | "metadata": { 344 | "collapsed": false, 345 | "deletable": true, 346 | "editable": true 347 | }, 348 | "outputs": [ 349 | { 350 | "data": { 351 | "text/plain": [ 352 | "TfidfTransformer(norm=u'l2', smooth_idf=False, sublinear_tf=False,\n", 353 | " use_idf=True)" 354 | ] 355 | }, 356 | "execution_count": 11, 357 | "metadata": {}, 358 | "output_type": "execute_result" 359 | } 360 | ], 361 | "source": [ 362 | "from sklearn.feature_extraction.text import TfidfTransformer\n", 363 | "transformer = TfidfTransformer(smooth_idf=False)\n", 364 | "transformer \n" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 12, 370 | "metadata": { 371 | "collapsed": false, 372 | "deletable": true, 373 | "editable": true 374 | }, 375 | "outputs": [ 376 | { 377 | "data": { 378 | "text/plain": [ 379 | "<6x3 sparse matrix of type ''\n", 380 | "\twith 9 stored elements in Compressed Sparse Row format>" 381 | ] 382 | }, 383 | "execution_count": 12, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "counts = [[3, 0, 1],\n", 390 | " [2, 0, 0],\n", 391 | " [3, 0, 0],\n", 392 | " [4, 0, 0],\n", 393 | " [3, 2, 0],\n", 394 | " [3, 0, 2]]\n", 395 | "\n", 396 | "tfidf = transformer.fit_transform(counts)\n", 397 | "tfidf \n" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 13, 403 | "metadata": { 404 | "collapsed": false, 405 | "deletable": true, 406 | "editable": true 407 | }, 408 | "outputs": [ 409 | { 410 | "data": { 411 | "text/plain": [ 412 | "array([[ 0.81940995, 0. , 0.57320793],\n", 413 | " [ 1. , 0. , 0. ],\n", 414 | " [ 1. , 0. , 0. ],\n", 415 | " [ 1. , 0. , 0. ],\n", 416 | " [ 0.47330339, 0.88089948, 0. ],\n", 417 | " [ 0.58149261, 0. 
, 0.81355169]])" 418 | ] 419 | }, 420 | "execution_count": 13, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "tfidf.toarray() " 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 14, 432 | "metadata": { 433 | "collapsed": true, 434 | "deletable": true, 435 | "editable": true 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 440 | "\n", 441 | "TfidfVectorizer?" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": { 448 | "collapsed": true, 449 | "deletable": true, 450 | "editable": true 451 | }, 452 | "outputs": [], 453 | "source": [] 454 | } 455 | ], 456 | "metadata": { 457 | "kernelspec": { 458 | "display_name": "Python 2", 459 | "language": "python", 460 | "name": "python2" 461 | }, 462 | "language_info": { 463 | "codemirror_mode": { 464 | "name": "ipython", 465 | "version": 2 466 | }, 467 | "file_extension": ".py", 468 | "mimetype": "text/x-python", 469 | "name": "python", 470 | "nbconvert_exporter": "python", 471 | "pygments_lexer": "ipython2", 472 | "version": "2.7.10" 473 | } 474 | }, 475 | "nbformat": 4, 476 | "nbformat_minor": 2 477 | } 478 | -------------------------------------------------------------------------------- /notebooks/FeatureSelection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Feature Selection\n", 11 | "\n", 12 | "The classes in the sklearn.feature_selection module can be used for feature selection/dimensionality reduction on sample sets, either to improve estimators’ accuracy scores or to boost their performance on very high-dimensional datasets." 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": { 18 | "deletable": true, 19 | "editable": true 20 | }, 21 | "source": [ 22 | "## Remove Low Var Features\n", 23 | "\n", 24 | "VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn’t meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in all samples.\n", 25 | "\n", 26 | "Again we are starting to see fit and fit_transform pop up again. Sklearn provides a ton of functionality that's not just prediction. Some of the functionality is preprocessing the data. Again these are like models (they can only rely on the training data) but don't really predict anything. Thus they do have a fit method, but don't have a predict method. We will see two examples of this type of paradigm below." 
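[Editor's note: before the cells below, a quick check of where their 0.8 * (1 - 0.8) threshold comes from. For a boolean (Bernoulli) feature the variance is p(1 - p), so this cutoff removes columns that take the same value in more than 80% of samples. A small sketch on the same toy X used in the next cell.]

import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]

print(np.var(X, axis=0))           # per-column variances: ~[0.139, 0.222, 0.25]
threshold = .8 * (1 - .8)          # 0.16, the Bernoulli variance at p = 0.8
sel = VarianceThreshold(threshold=threshold)
print(sel.fit_transform(X).shape)  # (6, 2): only the first column falls below 0.16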
27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "metadata": { 33 | "collapsed": false, 34 | "deletable": true, 35 | "editable": true 36 | }, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "array([[0, 1],\n", 42 | " [1, 0],\n", 43 | " [0, 0],\n", 44 | " [1, 1],\n", 45 | " [1, 0],\n", 46 | " [1, 1]])" 47 | ] 48 | }, 49 | "execution_count": 1, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "from sklearn.feature_selection import VarianceThreshold\n", 56 | "\n", 57 | "X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]\n", 58 | "\n", 59 | "sel = VarianceThreshold(threshold=(.8 * (1 - .8)))\n", 60 | "\n", 61 | "sel.fit(X)\n", 62 | "\n", 63 | "sel.transform(X)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 2, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "array([[0, 1],\n", 77 | " [1, 0],\n", 78 | " [0, 0],\n", 79 | " [1, 1],\n", 80 | " [1, 0],\n", 81 | " [1, 1]])" 82 | ] 83 | }, 84 | "execution_count": 2, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "sel.fit_transform(X)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "deletable": true, 97 | "editable": true 98 | }, 99 | "source": [ 100 | "## Univariate Feature Selection\n", 101 | "\n", 102 | "Univariate feature selection works by selecting the best features based on univariate statistical tests. It can be seen as a preprocessing step to an estimator. Scikit-learn exposes feature selection routines as objects that implement the transform method:\n", 103 | "* SelectKBest removes all but the k highest scoring features\n", 104 | "* SelectPercentile removes all but a user-specified highest scoring percentage of features\n", 105 | "* using common univariate statistical tests for each feature: false positive rate SelectFpr, false discovery rate SelectFdr, or family wise error SelectFwe.\n", 106 | "* GenericUnivariateSelect allows to perform univariate feature selection with a configurable strategy. This allows to select the best univariate selection strategy with hyper-parameter search estimator.\n", 107 | "\n", 108 | "These objects take as input a scoring function that returns univariate scores and p-values (or only scores for SelectKBest and SelectPercentile):\n", 109 | "\n", 110 | "* For regression: f_regression, mutual_info_regression\n", 111 | "* For classification: chi2, f_classif, mutual_info_classif\n", 112 | "\n", 113 | "The methods based on F-test estimate the degree of linear dependency between two random variables. On the other hand, mutual information methods can capture any kind of statistical dependency, but being nonparametric, they require more samples for accurate estimation." 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 3, 119 | "metadata": { 120 | "collapsed": true, 121 | "deletable": true, 122 | "editable": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "from sklearn.datasets import load_iris\n", 127 | "from sklearn.feature_selection import SelectKBest\n", 128 | "from sklearn.feature_selection import chi2\n", 129 | "\n", 130 | "SelectKBest?" 
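[Editor's note: a companion sketch to the SelectKBest cells that follow, using SelectPercentile with f_classif to show the score / p-value pair the explanation above mentions. It loads iris itself so it stands alone.]

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, f_classif

X, y = load_iris(return_X_y=True)

# Keep the top half of the features ranked by the ANOVA F-test.
sel = SelectPercentile(f_classif, percentile=50)
X_new = sel.fit_transform(X, y)
print(X_new.shape)   # (150, 2)
print(sel.scores_)   # one F statistic per original feature
print(sel.pvalues_)  # and the matching p-values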
131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 4, 136 | "metadata": { 137 | "collapsed": false, 138 | "deletable": true, 139 | "editable": true 140 | }, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "SelectKBest(k=2, score_func=)" 146 | ] 147 | }, 148 | "execution_count": 4, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "X, y = load_iris(return_X_y=True)\n", 155 | "\n", 156 | "sel = SelectKBest(chi2, k=2)\n", 157 | "\n", 158 | "sel.fit(X, y)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 5, 164 | "metadata": { 165 | "collapsed": false, 166 | "deletable": true, 167 | "editable": true 168 | }, 169 | "outputs": [ 170 | { 171 | "data": { 172 | "text/plain": [ 173 | "(150, 2)" 174 | ] 175 | }, 176 | "execution_count": 5, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "sel.transform(X).shape" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 6, 188 | "metadata": { 189 | "collapsed": false, 190 | "deletable": true, 191 | "editable": true 192 | }, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "array([ 10.81782088, 3.59449902, 116.16984746, 67.24482759])" 198 | ] 199 | }, 200 | "execution_count": 6, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "sel.scores_" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "deletable": true, 213 | "editable": true 214 | }, 215 | "source": [ 216 | "## Recursive feature elimination\n", 217 | "\n", 218 | "Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and weights are assigned to each one of them. Then, features whose absolute weights are the smallest are pruned from the current set features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.\n", 219 | "\n", 220 | "So it is very important to normalize these features in linear models!" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 7, 226 | "metadata": { 227 | "collapsed": true, 228 | "deletable": true, 229 | "editable": true 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "from sklearn.ensemble import RandomForestClassifier\n", 234 | "from sklearn.feature_selection import RFECV\n", 235 | "\n", 236 | "RFECV?" 
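RFECV in the next cells folds cross-validation into the elimination loop so the number of features is chosen for you. Plain RFE, where you pick the number of surviving features yourself, is a useful reference point; a hedged sketch (the estimator and n_features_to_select are illustrative assumptions):

```python
from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)

# keep 2 features, pruning the weakest coefficient at each step
rfe = RFE(LogisticRegression(), n_features_to_select=2, step=1)
rfe.fit(X, y)
print(rfe.support_)   # boolean mask of the selected features
print(rfe.ranking_)   # ranking of 1 marks a selected feature
```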
237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 8, 242 | "metadata": { 243 | "collapsed": true, 244 | "deletable": true, 245 | "editable": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "m = RFECV(RandomForestClassifier(), scoring='accuracy')" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 9, 255 | "metadata": { 256 | "collapsed": false, 257 | "deletable": true, 258 | "editable": true 259 | }, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "RFECV(cv=None,\n", 265 | " estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 266 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 267 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 268 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 269 | " n_estimators=10, n_jobs=1, oob_score=False, random_state=None,\n", 270 | " verbose=0, warm_start=False),\n", 271 | " n_jobs=1, scoring='accuracy', step=1, verbose=0)" 272 | ] 273 | }, 274 | "execution_count": 9, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "m.fit(X, y)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 11, 286 | "metadata": { 287 | "collapsed": false 288 | }, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/plain": [ 293 | "0.99333333333333329" 294 | ] 295 | }, 296 | "execution_count": 11, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "m.score(X, y)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": { 308 | "deletable": true, 309 | "editable": true 310 | }, 311 | "source": [ 312 | "## Feature selection using SelectFromModel\n", 313 | "\n", 314 | "SelectFromModel is a meta-transformer that can be used along with any estimator that has a coef_ or feature_importances_ attribute after fitting. The features are considered unimportant and removed, if the corresponding coef_ or feature_importances_ values are below the provided threshold parameter. Apart from specifying the threshold numerically, there are built-in heuristics for finding a threshold using a string argument. Available heuristics are “mean”, “median” and float multiples of these like “0.1*mean”.\n", 315 | "\n", 316 | "For examples on how it is to be used refer to the sections below." 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 13, 322 | "metadata": { 323 | "collapsed": true, 324 | "deletable": true, 325 | "editable": true 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "from sklearn.svm import LinearSVC\n", 330 | "from sklearn.feature_selection import SelectFromModel\n", 331 | "\n", 332 | "SelectFromModel?" 
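The next cells pass a LinearSVC into SelectFromModel and rely on the default threshold. As a hedged sketch of the string heuristics mentioned above, the same idea with a tree-based importance and threshold="median" might look like this (the estimator and threshold choice are illustrative):

```python
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X, y = load_iris(return_X_y=True)

# drop every feature whose importance falls below the median importance,
# so roughly half of the features survive
sfm = SelectFromModel(RandomForestClassifier(), threshold="median")
print(sfm.fit_transform(X, y).shape)
```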
333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 14, 338 | "metadata": { 339 | "collapsed": false, 340 | "deletable": true, 341 | "editable": true 342 | }, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "SelectFromModel(estimator=LinearSVC(C=0.01, class_weight=None, dual=False, fit_intercept=True,\n", 348 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 349 | " multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,\n", 350 | " verbose=0),\n", 351 | " prefit=False, threshold=None)" 352 | ] 353 | }, 354 | "execution_count": 14, 355 | "metadata": {}, 356 | "output_type": "execute_result" 357 | } 358 | ], 359 | "source": [ 360 | "m = SelectFromModel(LinearSVC(C=0.01, penalty='l1', dual=False))\n", 361 | "\n", 362 | "m.fit(X, y)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 15, 368 | "metadata": { 369 | "collapsed": false, 370 | "deletable": true, 371 | "editable": true 372 | }, 373 | "outputs": [ 374 | { 375 | "data": { 376 | "text/plain": [ 377 | "(150, 3)" 378 | ] 379 | }, 380 | "execution_count": 15, 381 | "metadata": {}, 382 | "output_type": "execute_result" 383 | } 384 | ], 385 | "source": [ 386 | "m.transform(X).shape" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": { 392 | "deletable": true, 393 | "editable": true 394 | }, 395 | "source": [ 396 | "A little bit more complex!" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 16, 402 | "metadata": { 403 | "collapsed": false, 404 | "deletable": true, 405 | "editable": true 406 | }, 407 | "outputs": [ 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "(506, 13)\n" 413 | ] 414 | }, 415 | { 416 | "data": { 417 | "text/plain": [ 418 | "(506, 10)" 419 | ] 420 | }, 421 | "execution_count": 16, 422 | "metadata": {}, 423 | "output_type": "execute_result" 424 | } 425 | ], 426 | "source": [ 427 | "from sklearn.linear_model import LassoCV\n", 428 | "from sklearn.datasets import load_boston\n", 429 | "\n", 430 | "X, y = load_boston(return_X_y=True)\n", 431 | "\n", 432 | "print X.shape\n", 433 | "\n", 434 | "m = SelectFromModel(LassoCV())\n", 435 | "\n", 436 | "m.fit(X, y)\n", 437 | "\n", 438 | "m.transform(X).shape" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": { 445 | "collapsed": false, 446 | "deletable": true, 447 | "editable": true 448 | }, 449 | "outputs": [], 450 | "source": [] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": { 456 | "collapsed": true, 457 | "deletable": true, 458 | "editable": true 459 | }, 460 | "outputs": [], 461 | "source": [] 462 | } 463 | ], 464 | "metadata": { 465 | "kernelspec": { 466 | "display_name": "Python 2", 467 | "language": "python", 468 | "name": "python2" 469 | }, 470 | "language_info": { 471 | "codemirror_mode": { 472 | "name": "ipython", 473 | "version": 2 474 | }, 475 | "file_extension": ".py", 476 | "mimetype": "text/x-python", 477 | "name": "python", 478 | "nbconvert_exporter": "python", 479 | "pygments_lexer": "ipython2", 480 | "version": "2.7.10" 481 | } 482 | }, 483 | "nbformat": 4, 484 | "nbformat_minor": 2 485 | } 486 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/CrossValidation-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | 
"metadata": {}, 6 | "source": [ 7 | "# Cross-validation: evaluating estimator performance\n", 8 | "\n", 9 | "Learning the parameters of a prediction function and testing it on the same data is a methodological mistake: a model that would just repeat the labels of the samples that it has just seen would have a perfect score but would fail to predict anything useful on yet-unseen data. This situation is called overfitting. To avoid it, it is common practice when performing a (supervised) machine learning experiment to hold out part of the available data as a test set X_test, y_test. Note that the word “experiment” is not intended to denote academic use only, because even in commercial settings machine learning usually starts out experimentally.\n", 10 | "\n", 11 | "In scikit-learn a random split into training and test sets can be quickly computed with the train_test_split helper function. Let’s load the iris data set to fit a linear support vector machine on it:" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "((150, 4), (150,))" 25 | ] 26 | }, 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "import numpy as np\n", 34 | "from sklearn.model_selection import train_test_split\n", 35 | "from sklearn import datasets\n", 36 | "from sklearn import svm\n", 37 | "\n", 38 | "iris = datasets.load_iris()\n", 39 | "iris.data.shape, iris.target.shape" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "train_test_split?" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/plain": [ 63 | "((90, 4), (90,))" 64 | ] 65 | }, 66 | "execution_count": 3, 67 | "metadata": {}, 68 | "output_type": "execute_result" 69 | } 70 | ], 71 | "source": [ 72 | "X_train, X_test, y_train, y_test = train_test_split(\n", 73 | " iris.data, iris.target, test_size=0.4, random_state=0)\n", 74 | "\n", 75 | "X_train.shape, y_train.shape" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "((60, 4), (60,))" 89 | ] 90 | }, 91 | "execution_count": 4, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "X_test.shape, y_test.shape" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 5, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "0.96666666666666667" 111 | ] 112 | }, 113 | "execution_count": 5, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)\n", 120 | "clf.score(X_test, y_test) " 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "When evaluating different settings (“hyperparameters”) for estimators, such as the C setting that must be manually set for an SVM, there is still a risk of overfitting on the test set because the parameters can be tweaked until the estimator performs optimally. 
This way, knowledge about the test set can “leak” into the model and evaluation metrics no longer report on generalization performance. To solve this problem, yet another part of the dataset can be held out as a so-called “validation set”: training proceeds on the training set, after which evaluation is done on the validation set, and when the experiment seems to be successful, final evaluation can be done on the test set.\n", 128 | "\n", 129 | "However, by partitioning the available data into three sets, we drastically reduce the number of samples which can be used for learning the model, and the results can depend on a particular random choice for the pair of (train, validation) sets.\n", 130 | "\n", 131 | "A solution to this problem is a procedure called cross-validation (CV for short). A test set should still be held out for final evaluation, but the validation set is no longer needed when doing CV. In the basic approach, called k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:\n", 132 | "\n", 133 | "* A model is trained using k-1 of the folds as training data;\n", 134 | "* the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy).\n", 135 | "\n", 136 | "The performance measure reported by k-fold cross-validation is then the average of the values computed in the loop. This approach can be computationally expensive, but does not waste too much data (as it is the case when fixing an arbitrary test set), which is a major advantage in problem such as inverse inference where the number of samples is very small." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 6, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "from sklearn.model_selection import cross_val_score\n", 148 | "\n", 149 | "cross_val_score?" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 7, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "array([ 0.96666667, 1. , 0.96666667, 0.96666667, 1. ])" 163 | ] 164 | }, 165 | "execution_count": 7, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "clf = svm.SVC(kernel='linear', C=1)\n", 172 | "\n", 173 | "scores = cross_val_score(clf, iris.data, iris.target, cv=5)\n", 174 | "\n", 175 | "scores" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 8, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "Accuracy: 0.98 (+/- 0.03)\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 12, 200 | "metadata": { 201 | "collapsed": false 202 | }, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "array([ 0.96658312, 1. , 0.96658312, 0.96658312, 1. 
])" 208 | ] 209 | }, 210 | "execution_count": 12, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "from sklearn import metrics\n", 217 | "\n", 218 | "scores = cross_val_score(\n", 219 | " clf, iris.data, iris.target, cv=5, scoring='f1_macro')\n", 220 | "\n", 221 | "scores" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 13, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "from sklearn.model_selection import ShuffleSplit\n", 233 | "\n", 234 | "ShuffleSplit?" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 14, 240 | "metadata": { 241 | "collapsed": false 242 | }, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "array([ 0.97777778, 0.97777778, 1. ])" 248 | ] 249 | }, 250 | "execution_count": 14, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "n_samples = iris.data.shape[0]\n", 257 | "\n", 258 | "cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)\n", 259 | "\n", 260 | "cross_val_score(clf, iris.data, iris.target, cv=cv)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 18, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "from sklearn.model_selection import cross_val_predict\n", 272 | "\n", 273 | "cross_val_predict?" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 19, 279 | "metadata": { 280 | "collapsed": false 281 | }, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "(150,)" 287 | ] 288 | }, 289 | "execution_count": 19, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "predicted = cross_val_predict(clf, iris.data, iris.target, cv=10)\n", 296 | "\n", 297 | "predicted.shape" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 20, 303 | "metadata": { 304 | "collapsed": false 305 | }, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "0.97333333333333338" 311 | ] 312 | }, 313 | "execution_count": 20, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "metrics.accuracy_score(iris.target, predicted) " 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "## Cross validation iterators\n", 327 | "\n", 328 | "The following sections list utilities to generate indices that can be used to generate dataset splits according to different cross validation strategies.\n", 329 | "\n", 330 | "Assuming that some data is Independent Identically Distributed (i.i.d.) is making the assumption that all samples stem from the same generative process and that the generative process is assumed to have no memory of past generated samples.\n", 331 | "\n", 332 | "The following cross-validators can be used in such cases." 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 23, 338 | "metadata": { 339 | "collapsed": false 340 | }, 341 | "outputs": [], 342 | "source": [ 343 | "from sklearn.model_selection import KFold\n", 344 | "\n", 345 | "KFold?" 
346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 28, 351 | "metadata": { 352 | "collapsed": false 353 | }, 354 | "outputs": [ 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "[2 3] [0 1]\n", 360 | "[0 1] [2 3]\n" 361 | ] 362 | } 363 | ], 364 | "source": [ 365 | "kf = KFold(n_splits=2, shuffle=True)\n", 366 | "\n", 367 | "X = [\"a\", \"b\", \"c\", \"d\"]\n", 368 | "for train, test in kf.split(X):\n", 369 | " print(\"%s %s\" % (train, test))" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "#### Stratification\n", 377 | "\n", 378 | "Some classification problems can exhibit a large imbalance in the distribution of the target classes: for instance there could be several times more negative samples than positive samples. In such cases it is recommended to use stratified sampling as implemented in StratifiedKFold and StratifiedShuffleSplit to ensure that relative class frequencies is approximately preserved in each train and validation fold." 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 24, 384 | "metadata": { 385 | "collapsed": true 386 | }, 387 | "outputs": [], 388 | "source": [ 389 | "from sklearn.model_selection import StratifiedKFold\n", 390 | "\n", 391 | "StratifiedKFold?" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 25, 397 | "metadata": { 398 | "collapsed": false 399 | }, 400 | "outputs": [ 401 | { 402 | "name": "stdout", 403 | "output_type": "stream", 404 | "text": [ 405 | "[2 3 6 7 8 9] [0 1 4 5]\n", 406 | "[0 1 3 4 5 8 9] [2 6 7]\n", 407 | "[0 1 2 4 5 6 7] [3 8 9]\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "X = np.ones(10)\n", 413 | "y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]\n", 414 | "skf = StratifiedKFold(n_splits=3)\n", 415 | "for train, test in skf.split(X, y):\n", 416 | " print(\"%s %s\" % (train, test))\n" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "#### Grouped Data\n", 424 | "\n", 425 | "The i.i.d. assumption is broken if the underlying generative process yield groups of dependent samples.\n", 426 | "\n", 427 | "Such a grouping of data is domain specific. An example would be when there is medical data collected from multiple patients, with multiple samples taken from each patient. And such data is likely to be dependent on the individual group. In our example, the patient id for each sample will be its group identifier.\n", 428 | "\n", 429 | "In this case we would like to know if a model trained on a particular set of groups generalizes well to the unseen groups. To measure this, we need to ensure that all the samples in the validation fold come from groups that are not represented at all in the paired training fold.\n", 430 | "\n", 431 | "The following cross-validation splitters can be used to do that. The grouping identifier for the samples is specified via the groups parameter." 
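GroupKFold is demonstrated next; LeaveOneGroupOut is another group-aware splitter, holding out exactly one group per split. A hedged sketch reusing the same kind of toy groups (the X and y values are illustrative):

```python
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut

X = np.arange(10).reshape(10, 1)
y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]

# every sample from one group goes to the test side of its split
logo = LeaveOneGroupOut()
for train, test in logo.split(X, y, groups=groups):
    print("%s %s" % (train, test))
```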
432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 26, 437 | "metadata": { 438 | "collapsed": false 439 | }, 440 | "outputs": [ 441 | { 442 | "name": "stdout", 443 | "output_type": "stream", 444 | "text": [ 445 | "[0 1 2 3 4 5] [6 7 8 9]\n", 446 | "[0 1 2 6 7 8 9] [3 4 5]\n", 447 | "[3 4 5 6 7 8 9] [0 1 2]\n" 448 | ] 449 | } 450 | ], 451 | "source": [ 452 | "from sklearn.model_selection import GroupKFold\n", 453 | "\n", 454 | "X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]\n", 455 | "y = [\"a\", \"b\", \"b\", \"b\", \"c\", \"c\", \"c\", \"d\", \"d\", \"d\"]\n", 456 | "groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]\n", 457 | "\n", 458 | "gkf = GroupKFold(n_splits=3)\n", 459 | "for train, test in gkf.split(X, y, groups=groups):\n", 460 | " print(\"%s %s\" % (train, test))\n" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "#### Time Series Split\n", 468 | "\n", 469 | "TimeSeriesSplit is a variation of k-fold which returns first k folds as train set and the (k+1) th fold as test set. Note that unlike standard cross-validation methods, successive training sets are supersets of those that come before them. Also, it adds all surplus data to the first training partition, which is always used to train the model.\n", 470 | "\n", 471 | "This class can be used to cross-validate time series data samples that are observed at fixed time intervals." 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 27, 477 | "metadata": { 478 | "collapsed": false 479 | }, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "TimeSeriesSplit(n_splits=3)\n", 486 | "[0 1 2] [3]\n", 487 | "[0 1 2 3] [4]\n", 488 | "[0 1 2 3 4] [5]\n" 489 | ] 490 | } 491 | ], 492 | "source": [ 493 | "from sklearn.model_selection import TimeSeriesSplit\n", 494 | "\n", 495 | "X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])\n", 496 | "y = np.array([1, 2, 3, 4, 5, 6])\n", 497 | "tscv = TimeSeriesSplit(n_splits=3)\n", 498 | "print(tscv) \n", 499 | "\n", 500 | "for train, test in tscv.split(X):\n", 501 | " print(\"%s %s\" % (train, test))" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": { 508 | "collapsed": true 509 | }, 510 | "outputs": [], 511 | "source": [] 512 | } 513 | ], 514 | "metadata": { 515 | "kernelspec": { 516 | "display_name": "Python 2", 517 | "language": "python", 518 | "name": "python2" 519 | }, 520 | "language_info": { 521 | "codemirror_mode": { 522 | "name": "ipython", 523 | "version": 2 524 | }, 525 | "file_extension": ".py", 526 | "mimetype": "text/x-python", 527 | "name": "python", 528 | "nbconvert_exporter": "python", 529 | "pygments_lexer": "ipython2", 530 | "version": "2.7.10" 531 | } 532 | }, 533 | "nbformat": 4, 534 | "nbformat_minor": 2 535 | } 536 | -------------------------------------------------------------------------------- /notebooks/CrossValidation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Cross-validation: evaluating estimator performance\n", 11 | "\n", 12 | "Learning the parameters of a prediction function and testing it on the same data is a methodological mistake: a model that would just repeat the labels of the samples that it has just seen would have a perfect score but would fail to predict 
anything useful on yet-unseen data. This situation is called overfitting. To avoid it, it is common practice when performing a (supervised) machine learning experiment to hold out part of the available data as a test set X_test, y_test. Note that the word “experiment” is not intended to denote academic use only, because even in commercial settings machine learning usually starts out experimentally.\n", 13 | "\n", 14 | "In scikit-learn a random split into training and test sets can be quickly computed with the train_test_split helper function. Let’s load the iris data set to fit a linear support vector machine on it:" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false, 22 | "deletable": true, 23 | "editable": true 24 | }, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/plain": [ 29 | "((150, 4), (150,))" 30 | ] 31 | }, 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "output_type": "execute_result" 35 | } 36 | ], 37 | "source": [ 38 | "import numpy as np\n", 39 | "from sklearn.model_selection import train_test_split\n", 40 | "from sklearn import datasets\n", 41 | "from sklearn import svm\n", 42 | "\n", 43 | "iris = datasets.load_iris()\n", 44 | "iris.data.shape, iris.target.shape" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": { 51 | "collapsed": true, 52 | "deletable": true, 53 | "editable": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "train_test_split?" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": { 64 | "collapsed": false, 65 | "deletable": true, 66 | "editable": true 67 | }, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "((90, 4), (90,))" 73 | ] 74 | }, 75 | "execution_count": 3, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "X_train, X_test, y_train, y_test = train_test_split(\n", 82 | " iris.data, iris.target, test_size=0.4, random_state=0)\n", 83 | "\n", 84 | "X_train.shape, y_train.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 4, 90 | "metadata": { 91 | "collapsed": false, 92 | "deletable": true, 93 | "editable": true 94 | }, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "((60, 4), (60,))" 100 | ] 101 | }, 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "X_test.shape, y_test.shape" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "metadata": { 115 | "collapsed": false, 116 | "deletable": true, 117 | "editable": true 118 | }, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "0.96666666666666667" 124 | ] 125 | }, 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)\n", 133 | "clf.score(X_test, y_test) " 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 6, 139 | "metadata": { 140 | "collapsed": false 141 | }, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "0.98888888888888893" 147 | ] 148 | }, 149 | "execution_count": 6, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "clf.score(X_train, y_train)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "deletable": true, 162 | "editable": true 
163 | }, 164 | "source": [ 165 | "When evaluating different settings (“hyperparameters”) for estimators, such as the C setting that must be manually set for an SVM, there is still a risk of overfitting on the test set because the parameters can be tweaked until the estimator performs optimally. This way, knowledge about the test set can “leak” into the model and evaluation metrics no longer report on generalization performance. To solve this problem, yet another part of the dataset can be held out as a so-called “validation set”: training proceeds on the training set, after which evaluation is done on the validation set, and when the experiment seems to be successful, final evaluation can be done on the test set.\n", 166 | "\n", 167 | "However, by partitioning the available data into three sets, we drastically reduce the number of samples which can be used for learning the model, and the results can depend on a particular random choice for the pair of (train, validation) sets.\n", 168 | "\n", 169 | "A solution to this problem is a procedure called cross-validation (CV for short). A test set should still be held out for final evaluation, but the validation set is no longer needed when doing CV. In the basic approach, called k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:\n", 170 | "\n", 171 | "* A model is trained using k-1 of the folds as training data;\n", 172 | "* the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy).\n", 173 | "\n", 174 | "The performance measure reported by k-fold cross-validation is then the average of the values computed in the loop. This approach can be computationally expensive, but does not waste too much data (as it is the case when fixing an arbitrary test set), which is a major advantage in problem such as inverse inference where the number of samples is very small." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 7, 180 | "metadata": { 181 | "collapsed": true, 182 | "deletable": true, 183 | "editable": true 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "from sklearn.model_selection import cross_val_score\n", 188 | "\n", 189 | "cross_val_score?" 
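One standard way to package the tuning loop described above is GridSearchCV, which cross-validates each candidate value of C and keeps the best one; a hedged sketch (the C grid and cv value are illustrative):

```python
from sklearn import datasets, svm
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()

# tune C by cross-validation; a separate test set should still be
# held out for the final evaluation
search = GridSearchCV(svm.SVC(kernel='linear'),
                      param_grid={'C': [0.1, 1, 10]}, cv=5)
search.fit(iris.data, iris.target)
print(search.best_params_)
print(search.best_score_)
```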
190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 10, 195 | "metadata": { 196 | "collapsed": false, 197 | "deletable": true, 198 | "editable": true 199 | }, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "array([ 0.98666667, 0.94666667])" 205 | ] 206 | }, 207 | "execution_count": 10, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "clf = svm.SVC(kernel='linear', C=1)\n", 214 | "\n", 215 | "scores = cross_val_score(clf, iris.data, iris.target, cv=2)\n", 216 | "\n", 217 | "scores" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 11, 223 | "metadata": { 224 | "collapsed": false, 225 | "deletable": true, 226 | "editable": true 227 | }, 228 | "outputs": [ 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | "Accuracy: 0.97 (+/- 0.04)\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 12, 244 | "metadata": { 245 | "collapsed": false, 246 | "deletable": true, 247 | "editable": true 248 | }, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/plain": [ 253 | "array([ 0.96658312, 1. , 0.96658312, 0.96658312, 1. ])" 254 | ] 255 | }, 256 | "execution_count": 12, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "from sklearn import metrics\n", 263 | "\n", 264 | "scores = cross_val_score(\n", 265 | " clf, iris.data, iris.target, cv=5, scoring='f1_macro')\n", 266 | "\n", 267 | "scores" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 13, 273 | "metadata": { 274 | "collapsed": true, 275 | "deletable": true, 276 | "editable": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "from sklearn.model_selection import ShuffleSplit\n", 281 | "\n", 282 | "ShuffleSplit?" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 14, 288 | "metadata": { 289 | "collapsed": false, 290 | "deletable": true, 291 | "editable": true 292 | }, 293 | "outputs": [ 294 | { 295 | "data": { 296 | "text/plain": [ 297 | "array([ 0.97777778, 0.97777778, 1. ])" 298 | ] 299 | }, 300 | "execution_count": 14, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "n_samples = iris.data.shape[0]\n", 307 | "\n", 308 | "cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)\n", 309 | "\n", 310 | "cross_val_score(clf, iris.data, iris.target, cv=cv)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 15, 316 | "metadata": { 317 | "collapsed": false, 318 | "deletable": true, 319 | "editable": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "from sklearn.model_selection import cross_val_predict\n", 324 | "\n", 325 | "cross_val_predict?" 
326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 16, 331 | "metadata": { 332 | "collapsed": false, 333 | "deletable": true, 334 | "editable": true 335 | }, 336 | "outputs": [ 337 | { 338 | "data": { 339 | "text/plain": [ 340 | "(150,)" 341 | ] 342 | }, 343 | "execution_count": 16, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "predicted = cross_val_predict(clf, iris.data, iris.target, cv=10)\n", 350 | "\n", 351 | "predicted.shape" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 17, 357 | "metadata": { 358 | "collapsed": false, 359 | "deletable": true, 360 | "editable": true 361 | }, 362 | "outputs": [ 363 | { 364 | "data": { 365 | "text/plain": [ 366 | "0.97333333333333338" 367 | ] 368 | }, 369 | "execution_count": 17, 370 | "metadata": {}, 371 | "output_type": "execute_result" 372 | } 373 | ], 374 | "source": [ 375 | "metrics.accuracy_score(iris.target, predicted) " 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "from sklearn.linear_model import LassoCV" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": { 392 | "deletable": true, 393 | "editable": true 394 | }, 395 | "source": [ 396 | "## Cross validation iterators\n", 397 | "\n", 398 | "The following sections list utilities to generate indices that can be used to generate dataset splits according to different cross validation strategies.\n", 399 | "\n", 400 | "Assuming that some data is Independent Identically Distributed (i.i.d.) is making the assumption that all samples stem from the same generative process and that the generative process is assumed to have no memory of past generated samples.\n", 401 | "\n", 402 | "The following cross-validators can be used in such cases." 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 18, 408 | "metadata": { 409 | "collapsed": false, 410 | "deletable": true, 411 | "editable": true 412 | }, 413 | "outputs": [], 414 | "source": [ 415 | "from sklearn.model_selection import KFold\n", 416 | "\n", 417 | "KFold?" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 20, 423 | "metadata": { 424 | "collapsed": false, 425 | "deletable": true, 426 | "editable": true 427 | }, 428 | "outputs": [ 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "[0 1 3] [2]\n", 434 | "[0 2 3] [1]\n", 435 | "[0 1 2] [3]\n", 436 | "[1 2 3] [0]\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "kf = KFold(n_splits=4, shuffle=True)\n", 442 | "\n", 443 | "X = [\"a\", \"b\", \"c\", \"d\"]\n", 444 | "for train, test in kf.split(X):\n", 445 | " print(\"%s %s\" % (train, test))" 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": { 451 | "deletable": true, 452 | "editable": true 453 | }, 454 | "source": [ 455 | "#### Stratification\n", 456 | "\n", 457 | "Some classification problems can exhibit a large imbalance in the distribution of the target classes: for instance there could be several times more negative samples than positive samples. In such cases it is recommended to use stratified sampling as implemented in StratifiedKFold and StratifiedShuffleSplit to ensure that relative class frequencies is approximately preserved in each train and validation fold." 
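StratifiedKFold is demonstrated next; StratifiedShuffleSplit, the other class mentioned above, draws randomized splits while approximately preserving the class ratio. A hedged sketch (the split count and test_size are illustrative):

```python
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

X = np.ones(10)
y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

# class proportions are approximately preserved in every test fold
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
for train, test in sss.split(X, y):
    print("%s %s" % (train, test))
```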
458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 21, 463 | "metadata": { 464 | "collapsed": true, 465 | "deletable": true, 466 | "editable": true 467 | }, 468 | "outputs": [], 469 | "source": [ 470 | "from sklearn.model_selection import StratifiedKFold\n", 471 | "\n", 472 | "StratifiedKFold?" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 22, 478 | "metadata": { 479 | "collapsed": false, 480 | "deletable": true, 481 | "editable": true 482 | }, 483 | "outputs": [ 484 | { 485 | "name": "stdout", 486 | "output_type": "stream", 487 | "text": [ 488 | "[2 3 6 7 8 9] [0 1 4 5]\n", 489 | "[0 1 3 4 5 8 9] [2 6 7]\n", 490 | "[0 1 2 4 5 6 7] [3 8 9]\n" 491 | ] 492 | } 493 | ], 494 | "source": [ 495 | "X = np.ones(10)\n", 496 | "y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]\n", 497 | "skf = StratifiedKFold(n_splits=3)\n", 498 | "for train, test in skf.split(X, y):\n", 499 | " print(\"%s %s\" % (train, test))\n" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": { 505 | "deletable": true, 506 | "editable": true 507 | }, 508 | "source": [ 509 | "#### Grouped Data\n", 510 | "\n", 511 | "The i.i.d. assumption is broken if the underlying generative process yield groups of dependent samples.\n", 512 | "\n", 513 | "Such a grouping of data is domain specific. An example would be when there is medical data collected from multiple patients, with multiple samples taken from each patient. And such data is likely to be dependent on the individual group. In our example, the patient id for each sample will be its group identifier.\n", 514 | "\n", 515 | "In this case we would like to know if a model trained on a particular set of groups generalizes well to the unseen groups. To measure this, we need to ensure that all the samples in the validation fold come from groups that are not represented at all in the paired training fold.\n", 516 | "\n", 517 | "The following cross-validation splitters can be used to do that. The grouping identifier for the samples is specified via the groups parameter." 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 23, 523 | "metadata": { 524 | "collapsed": false, 525 | "deletable": true, 526 | "editable": true 527 | }, 528 | "outputs": [ 529 | { 530 | "name": "stdout", 531 | "output_type": "stream", 532 | "text": [ 533 | "[0 1 2 3 4 5] [6 7 8 9]\n", 534 | "[0 1 2 6 7 8 9] [3 4 5]\n", 535 | "[3 4 5 6 7 8 9] [0 1 2]\n" 536 | ] 537 | } 538 | ], 539 | "source": [ 540 | "from sklearn.model_selection import GroupKFold\n", 541 | "\n", 542 | "X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]\n", 543 | "y = [\"a\", \"b\", \"b\", \"b\", \"c\", \"c\", \"c\", \"d\", \"d\", \"d\"]\n", 544 | "groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]\n", 545 | "\n", 546 | "gkf = GroupKFold(n_splits=3)\n", 547 | "for train, test in gkf.split(X, y, groups=groups):\n", 548 | " print(\"%s %s\" % (train, test))\n" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": { 554 | "deletable": true, 555 | "editable": true 556 | }, 557 | "source": [ 558 | "#### Time Series Split\n", 559 | "\n", 560 | "TimeSeriesSplit is a variation of k-fold which returns first k folds as train set and the (k+1) th fold as test set. Note that unlike standard cross-validation methods, successive training sets are supersets of those that come before them. 
Also, it adds all surplus data to the first training partition, which is always used to train the model.\n", 561 | "\n", 562 | "This class can be used to cross-validate time series data samples that are observed at fixed time intervals." 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 24, 568 | "metadata": { 569 | "collapsed": false, 570 | "deletable": true, 571 | "editable": true 572 | }, 573 | "outputs": [ 574 | { 575 | "name": "stdout", 576 | "output_type": "stream", 577 | "text": [ 578 | "TimeSeriesSplit(n_splits=3)\n", 579 | "[0 1 2] [3]\n", 580 | "[0 1 2 3] [4]\n", 581 | "[0 1 2 3 4] [5]\n" 582 | ] 583 | } 584 | ], 585 | "source": [ 586 | "from sklearn.model_selection import TimeSeriesSplit\n", 587 | "\n", 588 | "X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])\n", 589 | "y = np.array([1, 2, 3, 4, 5, 6])\n", 590 | "tscv = TimeSeriesSplit(n_splits=3)\n", 591 | "print(tscv) \n", 592 | "\n", 593 | "for train, test in tscv.split(X):\n", 594 | " print(\"%s %s\" % (train, test))" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": null, 600 | "metadata": { 601 | "collapsed": true, 602 | "deletable": true, 603 | "editable": true 604 | }, 605 | "outputs": [], 606 | "source": [] 607 | } 608 | ], 609 | "metadata": { 610 | "kernelspec": { 611 | "display_name": "Python 2", 612 | "language": "python", 613 | "name": "python2" 614 | }, 615 | "language_info": { 616 | "codemirror_mode": { 617 | "name": "ipython", 618 | "version": 2 619 | }, 620 | "file_extension": ".py", 621 | "mimetype": "text/x-python", 622 | "name": "python", 623 | "nbconvert_exporter": "python", 624 | "pygments_lexer": "ipython2", 625 | "version": "2.7.10" 626 | } 627 | }, 628 | "nbformat": 4, 629 | "nbformat_minor": 2 630 | } 631 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/Multiclass-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Multiclass and Multi Label Algorithms\n", 8 | "\n", 9 | "The sklearn.multiclass module implements meta-estimators to solve multiclass and multilabel classification problems by decomposing such problems into binary classification problems. Multitarget regression is also supported.\n", 10 | "\n", 11 | "* Multiclass classification means a classification task with more than two classes; e.g., classify a set of images of fruits which may be oranges, apples, or pears. Multiclass classification makes the assumption that each sample is assigned to one and only one label: a fruit can be either an apple or a pear but not both at the same time.\n", 12 | "* Multilabel classification assigns to each sample a set of target labels. This can be thought as predicting properties of a data-point that are not mutually exclusive, such as topics that are relevant for a document. A text might be about any of religion, politics, finance or education at the same time or none of these.\n", 13 | "* Multioutput regression assigns each sample a set of target values. This can be thought of as predicting several properties for each data-point, such as wind direction and magnitude at a certain location.\n", 14 | "* Multioutput-multiclass classification and multi-task classification means that a single estimator has to handle several joint classification tasks. 
This is both a generalization of the multi-label classification task, which only considers binary classification, as well as a generalization of the multi-class classification task. The output format is a 2d numpy array or sparse matrix.\n", 15 | "\n", 16 | " The set of labels can be different for each output variable. For instance, a sample could be assigned “pear” for an output variable that takes possible values in a finite set of species such as “pear”, “apple”; and “blue” or “green” for a second output variable that takes possible values in a finite set of colors such as “green”, “red”, “blue”, “yellow”...\n", 17 | "\n", 18 | " This means that any classifiers handling multi-output multiclass or multi-task classification tasks, support the multi-label classification task as a special case. Multi-task classification is similar to the multi-output classification task with different model formulations. For more information, see the relevant estimator documentation.\n", 19 | "\n", 20 | "All scikit-learn classifiers are capable of multiclass classification, but the meta-estimators offered by sklearn.multiclass permit changing the way they handle more than two classes because this may have an effect on classifier performance (either in terms of generalization error or required computational resources).\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Multilabel classification format\n", 28 | "\n", 29 | "In multilabel learning, the joint set of binary classification tasks is expressed with label binary indicator array: each sample is one row of a 2d array of shape (n_samples, n_classes) with binary values: the one, i.e. the non zero elements, corresponds to the subset of labels. An array such as np.array([[1, 0, 0], [0, 1, 1], [0, 0, 0]]) represents label 0 in the first sample, labels 1 and 2 in the second sample, and no labels in the third sample.\n", 30 | "\n", 31 | "Producing multilabel data as a list of sets of labels may be more intuitive. The MultiLabelBinarizer transformer can be used to convert between a collection of collections of labels and the indicator format.\n", 32 | "\n", 33 | "This is skipping ahead by a couple of lessons (we have not seen transform before!) But keep this in the back of your mind for when we get there and just memorize this for now" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 1, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "array([[0, 0, 1, 1, 1],\n", 47 | " [0, 0, 1, 0, 0],\n", 48 | " [1, 1, 0, 1, 0],\n", 49 | " [1, 1, 1, 1, 1],\n", 50 | " [1, 1, 1, 0, 0]])" 51 | ] 52 | }, 53 | "execution_count": 1, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 60 | "\n", 61 | "y = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]]\n", 62 | "\n", 63 | "MultiLabelBinarizer().fit_transform(y)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## One vs Rest\n", 71 | "\n", 72 | "This strategy, also known as one-vs-all, is implemented in OneVsRestClassifier. The strategy consists in fitting one classifier per class. For each classifier, the class is fitted against all the other classes. In addition to its computational efficiency (only n_classes classifiers are needed), one advantage of this approach is its interpretability. 
Since each class is represented by one and only one classifier, it is possible to gain knowledge about the class by inspecting its corresponding classifier. This is the most commonly used strategy and is a fair default choice." 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 7, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "from sklearn import datasets\n", 84 | "\n", 85 | "from sklearn.multiclass import OneVsRestClassifier\n", 86 | "\n", 87 | "OneVsRestClassifier?" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 6, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "from sklearn.svm import LinearSVC\n", 99 | "\n", 100 | "# Note that this also can OneVsRest\n", 101 | "LinearSVC?" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 8, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 115 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 116 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 117 | " verbose=0),\n", 118 | " n_jobs=1)" 119 | ] 120 | }, 121 | "execution_count": 8, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "X, y = datasets.load_iris(return_X_y=True)\n", 128 | "\n", 129 | "m = OneVsRestClassifier(LinearSVC())\n", 130 | "\n", 131 | "m.fit(X, y)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 9, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "(array([0, 1, 2]), False)" 145 | ] 146 | }, 147 | "execution_count": 9, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "m.classes_, m.multilabel_" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 11, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "0.96666666666666667" 167 | ] 168 | }, 169 | "execution_count": 11, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "m.score(X, y)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 12, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "0.96666666666666667" 189 | ] 190 | }, 191 | "execution_count": 12, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "LinearSVC().fit(X, y).score(X, y)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## One Vs One\n", 205 | "\n", 206 | "OneVsOneClassifier constructs one classifier per pair of classes. At prediction time, the class which received the most votes is selected. In the event of a tie (among two classes with an equal number of votes), it selects the class with the highest aggregate classification confidence by summing over the pair-wise classification confidence levels computed by the underlying binary classifiers.\n", 207 | "\n", 208 | "Since it requires to fit n_classes * (n_classes - 1) / 2 classifiers, this method is usually slower than one-vs-the-rest, due to its O(n_classes^2) complexity. 
However, this method may be advantageous for algorithms such as kernel algorithms which don’t scale well with n_samples. This is because each individual learning problem only involves a small subset of the data whereas, with one-vs-the-rest, the complete dataset is used n_classes times." 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 13, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "from sklearn.multiclass import OneVsOneClassifier\n", 220 | "\n", 221 | "OneVsOneClassifier?" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 14, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "OneVsOneClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 235 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 236 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 237 | " verbose=0),\n", 238 | " n_jobs=1)" 239 | ] 240 | }, 241 | "execution_count": 14, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "m = OneVsOneClassifier(\n", 248 | " LinearSVC())\n", 249 | "\n", 250 | "m.fit(X, y)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 15, 256 | "metadata": { 257 | "collapsed": false 258 | }, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/plain": [ 263 | "(LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 264 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 265 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 266 | " verbose=0),\n", 267 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 268 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 269 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 270 | " verbose=0),\n", 271 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 272 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 273 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 274 | " verbose=0))" 275 | ] 276 | }, 277 | "execution_count": 15, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "m.estimators_" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 16, 289 | "metadata": { 290 | "collapsed": false 291 | }, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "0.97999999999999998" 297 | ] 298 | }, 299 | "execution_count": 16, 300 | "metadata": {}, 301 | "output_type": "execute_result" 302 | } 303 | ], 304 | "source": [ 305 | "m.score(X, y)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "## Error Correcting Output Codes\n", 313 | "\n", 314 | "Output-code based strategies are fairly different from one-vs-the-rest and one-vs-one. With these strategies, each class is represented in a Euclidean space, where each dimension can only be 0 or 1. Another way to put it is that each class is represented by a binary code (an array of 0 and 1). The matrix which keeps track of the location/code of each class is called the code book. The code size is the dimensionality of the aforementioned space. 
Intuitively, each class should be represented by a code as unique as possible and a good code book should be designed to optimize classification accuracy.\n", 315 | "\n", 316 | "At fitting time, one binary classifier per bit in the code book is fitted. At prediction time, the classifiers are used to project new points in the class space and the class closest to the points is chosen.\n", 317 | "\n", 318 | "In OutputCodeClassifier, the code_size attribute allows the user to control the number of classifiers which will be used. It is a percentage of the total number of classes.\n", 319 | "\n", 320 | "A number between 0 and 1 will require fewer classifiers than one-vs-the-rest. In theory, log2(n_classes) / n_classes is sufficient to represent each class unambiguously. However, in practice, it may not lead to good accuracy since log2(n_classes) is much smaller than n_classes.\n", 321 | "\n", 322 | "A number greater than 1 will require more classifiers than one-vs-the-rest. In this case, some classifiers will in theory correct for the mistakes made by other classifiers, hence the name “error-correcting”. In practice, however, this may not happen as classifier mistakes will typically be correlated. The error-correcting output codes have a similar effect to bagging." 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 17, 328 | "metadata": { 329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "from sklearn.multiclass import OutputCodeClassifier\n", 334 | "\n", 335 | "OutputCodeClassifier?" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 21, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/plain": [ 348 | "OutputCodeClassifier(code_size=2,\n", 349 | " estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 350 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 351 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 352 | " verbose=0),\n", 353 | " n_jobs=1, random_state=None)" 354 | ] 355 | }, 356 | "execution_count": 21, 357 | "metadata": {}, 358 | "output_type": "execute_result" 359 | } 360 | ], 361 | "source": [ 362 | "m = OutputCodeClassifier(LinearSVC(), code_size=2)\n", 363 | "\n", 364 | "m.fit(X, y)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 22, 370 | "metadata": { 371 | "collapsed": false 372 | }, 373 | "outputs": [ 374 | { 375 | "data": { 376 | "text/plain": [ 377 | "[LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 378 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 379 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 380 | " verbose=0),\n", 381 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 382 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 383 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 384 | " verbose=0),\n", 385 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 386 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 387 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 388 | " verbose=0),\n", 389 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 390 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 391 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 392 | " verbose=0),\n", 393 | " 
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 394 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 395 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 396 | " verbose=0),\n", 397 | " _ConstantPredictor()]" 398 | ] 399 | }, 400 | "execution_count": 22, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "m.estimators_" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 23, 412 | "metadata": { 413 | "collapsed": false 414 | }, 415 | "outputs": [ 416 | { 417 | "data": { 418 | "text/plain": [ 419 | "0.96666666666666667" 420 | ] 421 | }, 422 | "execution_count": 23, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "m.score(X, y)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "# Multiple Output Regression and Classification\n", 436 | "\n", 437 | "Multioutput regression support can be added to any regressor with MultiOutputRegressor. This strategy consists of fitting one regressor per target. Since each target is represented by exactly one regressor it is possible to gain knowledge about the target by inspecting its corresponding regressor. As MultiOutputRegressor fits one regressor per target it cannot take advantage of correlations between targets.\n", 438 | "\n", 439 | "Multioutput classification support can be added to any classifier with MultiOutputClassifier. This strategy consists of fitting one classifier per target. This allows multiple target variable classifications. The purpose of this class is to extend estimators to be able to estimate a series of target functions (f1,f2,f3...,fn) that are trained on a single X predictor matrix to predict a series of responses (y1,y2,y3...,yn)."
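The cell that follows demonstrates MultiOutputRegressor only, so here is a minimal companion sketch of MultiOutputClassifier. The make_multilabel_classification dataset and the RandomForestClassifier base estimator are illustrative assumptions, not anything used in this notebook.

```python
# Hedged sketch (not from the notebook): MultiOutputClassifier fits one
# classifier per output column of y. Dataset and base estimator are
# illustrative choices.
from sklearn.datasets import make_multilabel_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# y has shape (n_samples, n_outputs); each column is a separate target.
X_mo, y_mo = make_multilabel_classification(n_samples=100, n_classes=3,
                                            random_state=0)

clf = MultiOutputClassifier(RandomForestClassifier(random_state=0))
clf.fit(X_mo, y_mo)

print(clf.predict(X_mo[:2]))   # one label vector per sample
print(len(clf.estimators_))    # one fitted classifier per output column
```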
440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 25, 445 | "metadata": { 446 | "collapsed": false 447 | }, 448 | "outputs": [ 449 | { 450 | "data": { 451 | "text/plain": [ 452 | "0.99999999911789184" 453 | ] 454 | }, 455 | "execution_count": 25, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | } 459 | ], 460 | "source": [ 461 | "from sklearn.datasets import make_regression\n", 462 | "\n", 463 | "from sklearn.multioutput import MultiOutputRegressor\n", 464 | "\n", 465 | "from sklearn.ensemble import GradientBoostingRegressor\n", 466 | "\n", 467 | "X, y = make_regression(n_samples=10, n_targets=3, random_state=1)\n", 468 | "\n", 469 | "MultiOutputRegressor(\n", 470 | " GradientBoostingRegressor(random_state=0)).fit(X, y).score(X, y)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": { 477 | "collapsed": true 478 | }, 479 | "outputs": [], 480 | "source": [] 481 | } 482 | ], 483 | "metadata": { 484 | "kernelspec": { 485 | "display_name": "Python 2", 486 | "language": "python", 487 | "name": "python2" 488 | }, 489 | "language_info": { 490 | "codemirror_mode": { 491 | "name": "ipython", 492 | "version": 2 493 | }, 494 | "file_extension": ".py", 495 | "mimetype": "text/x-python", 496 | "name": "python", 497 | "nbconvert_exporter": "python", 498 | "pygments_lexer": "ipython2", 499 | "version": "2.7.10" 500 | } 501 | }, 502 | "nbformat": 4, 503 | "nbformat_minor": 2 504 | } 505 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/EnsembleMethods-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Ensemble Methods\n", 8 | "\n", 9 | "The goal of ensemble methods is to combine the predictions of several base estimators built with a given learning algorithm in order to improve generalizability / robustness over a single estimator.\n", 10 | "\n", 11 | "Two families of ensemble methods are usually distinguished:\n", 12 | "* In averaging methods, the driving principle is to build several estimators independently and then to average their predictions. On average, the combined estimator is usually better than any of the single base estimator because its variance is reduced.\n", 13 | "\n", 14 | "Examples: Bagging methods, Forests of randomized trees, ...\n", 15 | "\n", 16 | "* By contrast, in boosting methods, base estimators are built sequentially and one tries to reduce the bias of the combined estimator. The motivation is to combine several weak models to produce a powerful ensemble.\n", 17 | "\n", 18 | "Examples: AdaBoost, Gradient Tree Boosting, ..." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Bagging Meta Estimator\n", 26 | "\n", 27 | "In ensemble algorithms, bagging methods form a class of algorithms which build several instances of a black-box estimator on random subsets of the original training set and then aggregate their individual predictions to form a final prediction. These methods are used as a way to reduce the variance of a base estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it. In many cases, bagging methods constitute a very simple way to improve with respect to a single model, without making it necessary to adapt the underlying base algorithm. 
As they provide a way to reduce overfitting, bagging methods work best with strong and complex models (e.g., fully developed decision trees), in contrast with boosting methods which usually work best with weak models (e.g., shallow decision trees).\n", 28 | "\n", 29 | "Bagging methods come in many flavours but mostly differ from each other by the way they draw random subsets of the training set:\n", 30 | "\n", 31 | "* When random subsets of the dataset are drawn as random subsets of the samples, then this algorithm is known as Pasting\n", 32 | "* When samples are drawn with replacement, then the method is known as Bagging\n", 33 | "* When random subsets of the dataset are drawn as random subsets of the features, then the method is known as Random Subspaces\n", 34 | "* Finally, when base estimators are built on subsets of both samples and features, then the method is known as Random Patches\n", 35 | "\n", 36 | "In scikit-learn, bagging methods are offered as a unified BaggingClassifier meta-estimator (resp. BaggingRegressor), taking as input a user-specified base estimator along with parameters specifying the strategy to draw random subsets. In particular, max_samples and max_features control the size of the subsets (in terms of samples and features), while bootstrap and bootstrap_features control whether samples and features are drawn with or without replacement. When using a subset of the available samples the generalization accuracy can be estimated with the out-of-bag samples by setting oob_score=True. " 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "To get started, let's import some data" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "import sklearn.datasets as datasets\n", 55 | "\n", 56 | "X, y = datasets.load_iris(return_X_y=True)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Notice that we again see parallelization!\n", 64 | "\n", 65 | "Next let's check out the features of the BaggingClassifier (the BaggingRegressor is very similar)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "from sklearn.ensemble import BaggingClassifier\n", 77 | "\n", 78 | "BaggingClassifier?" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "Now that we have a feel for it, let's pair it with a classifier. And for this we will use KNN." 
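Before the KNN pairing below, here is a sketch of how the four flavours listed above map onto BaggingClassifier's sampling arguments, under the parameter semantics just described; the DecisionTreeClassifier base estimator and the 0.5 fractions are illustrative assumptions, not settings taken from this notebook.

```python
# Hedged sketch: the same BaggingClassifier meta-estimator expresses all four
# flavours purely through its sampling arguments (illustrative settings only).
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
base = DecisionTreeClassifier(random_state=0)

# Pasting: random subsets of samples, drawn without replacement.
pasting = BaggingClassifier(base, max_samples=0.5, bootstrap=False)

# Bagging: samples drawn with replacement (bootstrap=True is the default).
bagging = BaggingClassifier(base, max_samples=0.5, bootstrap=True)

# Random Subspaces: random subsets of the features, every sample kept.
subspaces = BaggingClassifier(base, bootstrap=False, max_features=0.5)

# Random Patches: random subsets of both samples and features.
patches = BaggingClassifier(base, max_samples=0.5, max_features=0.5)

for name, model in [("pasting", pasting), ("bagging", bagging),
                    ("subspaces", subspaces), ("patches", patches)]:
    print(name, model.fit(X, y).score(X, y))
```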
86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "from sklearn.neighbors import KNeighborsClassifier\n", 97 | "\n", 98 | "m = KNeighborsClassifier(n_neighbors=3)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 10, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "bag = BaggingClassifier(\n", 110 | " m, \n", 111 | " max_samples=.5, \n", 112 | " max_features=2, \n", 113 | " n_jobs=2,\n", 114 | " oob_score=True)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 11, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "BaggingClassifier(base_estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 128 | " metric_params=None, n_jobs=1, n_neighbors=3, p=2,\n", 129 | " weights='uniform'),\n", 130 | " bootstrap=True, bootstrap_features=False, max_features=2,\n", 131 | " max_samples=0.5, n_estimators=10, n_jobs=2, oob_score=True,\n", 132 | " random_state=None, verbose=0, warm_start=False)" 133 | ] 134 | }, 135 | "execution_count": 11, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "bag.fit(X, y)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 12, 147 | "metadata": { 148 | "collapsed": false 149 | }, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "0.92666666666666664" 155 | ] 156 | }, 157 | "execution_count": 12, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "bag.oob_score_" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 13, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "array([0])" 177 | ] 178 | }, 179 | "execution_count": 13, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "bag.predict([X[0]])" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 14, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "array([[ 1., 0., 0.]])" 199 | ] 200 | }, 201 | "execution_count": 14, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "bag.predict_proba([X[0]])" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 24, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "0.95999999999999996" 221 | ] 222 | }, 223 | "execution_count": 24, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "bag.score(X, y)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "## Random Forests\n", 237 | "\n", 238 | "Random forests are somewhat special. They happen to be so frequently used a bagging method that they have become their own method. They are in that way the same as a classic Supervised Estimator with all the base functionality, plus a little extra bagging goodness." 
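The cells below score models on their own training data; as a hedged aside, a cross-validated comparison against a single tree is one way to see the variance reduction the text describes. The DecisionTreeClassifier baseline and the cross_val_score settings are illustrative choices, not part of the notebook.

```python
# Hedged sketch: compare a single tree with a random forest under
# 5-fold cross-validation instead of training-set accuracy.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)

tree = DecisionTreeClassifier(random_state=0)
forest = RandomForestClassifier(n_estimators=100, random_state=0)

print("tree  ", cross_val_score(tree, X, y, cv=5).mean())
print("forest", cross_val_score(forest, X, y, cv=5).mean())

# The fitted forest also exposes per-feature importances.
print(forest.fit(X, y).feature_importances_)
```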
239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 15, 244 | "metadata": { 245 | "collapsed": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "from sklearn.ensemble import RandomForestClassifier\n", 250 | "\n", 251 | "RandomForestClassifier?" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 17, 257 | "metadata": { 258 | "collapsed": false 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "m = RandomForestClassifier(n_estimators=20, oob_score=True)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 18, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/plain": [ 275 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 276 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 277 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 278 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 279 | " n_estimators=20, n_jobs=1, oob_score=True, random_state=None,\n", 280 | " verbose=0, warm_start=False)" 281 | ] 282 | }, 283 | "execution_count": 18, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "m.fit(X, y)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 23, 295 | "metadata": { 296 | "collapsed": false 297 | }, 298 | "outputs": [ 299 | { 300 | "data": { 301 | "text/plain": [ 302 | "array([0])" 303 | ] 304 | }, 305 | "execution_count": 23, 306 | "metadata": {}, 307 | "output_type": "execute_result" 308 | } 309 | ], 310 | "source": [ 311 | "m.predict([X[0]])" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 25, 317 | "metadata": { 318 | "collapsed": false 319 | }, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "0.99333333333333329" 325 | ] 326 | }, 327 | "execution_count": 25, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "m.score(X, y)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "## AdaBoost\n", 341 | "\n", 342 | "The module sklearn.ensemble includes the popular boosting algorithm AdaBoost, introduced in 1995 by Freund and Schapire.\n", 343 | "\n", 344 | "The core principle of AdaBoost is to fit a sequence of weak learners (i.e., models that are only slightly better than random guessing, such as small decision trees) on repeatedly modified versions of the data. The predictions from all of them are then combined through a weighted majority vote (or sum) to produce the final prediction. The data modifications at each so-called boosting iteration consist of applying weights w_1, w_2, ..., w_N to each of the training samples. Initially, those weights are all set to w_i = 1/N, so that the first step simply trains a weak learner on the original data. For each successive iteration, the sample weights are individually modified and the learning algorithm is reapplied to the reweighted data. At a given step, those training examples that were incorrectly predicted by the boosted model induced at the previous step have their weights increased, whereas the weights are decreased for those that were predicted correctly. As iterations proceed, examples that are difficult to predict receive ever-increasing influence. 
Each subsequent weak learner is thereby forced to concentrate on the examples that are missed by the previous ones in the sequence " 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 27, 350 | "metadata": { 351 | "collapsed": true 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "from sklearn.ensemble import AdaBoostClassifier\n", 356 | "\n", 357 | "AdaBoostClassifier?" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 28, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "m = AdaBoostClassifier(base_estimator=None, n_estimators=100)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 30, 374 | "metadata": { 375 | "collapsed": false 376 | }, 377 | "outputs": [ 378 | { 379 | "data": { 380 | "text/plain": [ 381 | "AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,\n", 382 | " learning_rate=1.0, n_estimators=100, random_state=None)" 383 | ] 384 | }, 385 | "execution_count": 30, 386 | "metadata": {}, 387 | "output_type": "execute_result" 388 | } 389 | ], 390 | "source": [ 391 | "m.fit(X, y)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 31, 397 | "metadata": { 398 | "collapsed": false 399 | }, 400 | "outputs": [ 401 | { 402 | "data": { 403 | "text/plain": [ 404 | "0.97333333333333338" 405 | ] 406 | }, 407 | "execution_count": 31, 408 | "metadata": {}, 409 | "output_type": "execute_result" 410 | } 411 | ], 412 | "source": [ 413 | "m.score(X, y)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "## Gradient Tree Boosting\n", 421 | "\n", 422 | "Gradient Tree Boosting or Gradient Boosted Regression Trees (GBRT) is a generalization of boosting to arbitrary differentiable loss functions. GBRT is an accurate and effective off-the-shelf procedure that can be used for both regression and classification problems. Gradient Tree Boosting models are used in a variety of areas including Web search ranking and ecology.\n", 423 | "\n", 424 | "The advantages of GBRT are:\n", 425 | "* Natural handling of data of mixed type (= heterogeneous features)\n", 426 | "* Predictive power\n", 427 | "* Robustness to outliers in output space (via robust loss functions)\n", 428 | "\n", 429 | "The disadvantages of GBRT are:\n", 430 | "\n", 431 | "* Scalability, due to the sequential nature of boosting it can hardly be parallelized.\n", 432 | "\n", 433 | "The module sklearn.ensemble provides methods for both classification and regression via gradient boosted regression trees." 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 32, 439 | "metadata": { 440 | "collapsed": true 441 | }, 442 | "outputs": [], 443 | "source": [ 444 | "from sklearn.ensemble import GradientBoostingClassifier\n", 445 | "\n", 446 | "GradientBoostingClassifier?" 
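Because boosting is sequential, the ensemble's accuracy can be inspected stage by stage as estimators are added. A hedged sketch using AdaBoostClassifier.staged_score on a held-out split follows; the split and the estimator count are illustrative assumptions, not values used in the cells above or below.

```python
# Hedged sketch: watch the boosted ensemble's held-out accuracy evolve
# as boosting iterations are added.
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

boost = AdaBoostClassifier(n_estimators=50).fit(X_tr, y_tr)

# staged_score yields one score per boosting iteration.
for i, score in enumerate(boost.staged_score(X_te, y_te)):
    if i % 10 == 0:
        print(i, round(score, 3))
```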
447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 37, 452 | "metadata": { 453 | "collapsed": false 454 | }, 455 | "outputs": [ 456 | { 457 | "data": { 458 | "text/plain": [ 459 | "0.99333333333333329" 460 | ] 461 | }, 462 | "execution_count": 37, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "m = GradientBoostingClassifier(n_estimators=10)\n", 469 | "\n", 470 | "m.fit(X, y)\n", 471 | "\n", 472 | "m.score(X, y)" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 38, 478 | "metadata": { 479 | "collapsed": false 480 | }, 481 | "outputs": [ 482 | { 483 | "data": { 484 | "text/plain": [ 485 | "1.0" 486 | ] 487 | }, 488 | "execution_count": 38, 489 | "metadata": {}, 490 | "output_type": "execute_result" 491 | } 492 | ], 493 | "source": [ 494 | "m.set_params(n_estimators=20, warm_start=True)\n", 495 | "\n", 496 | "m.fit(X, y)\n", 497 | "\n", 498 | "m.score(X, y)" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 39, 504 | "metadata": { 505 | "collapsed": false 506 | }, 507 | "outputs": [ 508 | { 509 | "data": { 510 | "text/plain": [ 511 | "array([ 0.00444544, 0.03816819, 0.41928686, 0.53809951])" 512 | ] 513 | }, 514 | "execution_count": 39, 515 | "metadata": {}, 516 | "output_type": "execute_result" 517 | } 518 | ], 519 | "source": [ 520 | "m.feature_importances_" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": {}, 526 | "source": [ 527 | "## Voting Classifier\n", 528 | "\n", 529 | "The idea behind the voting classifier implementation is to combine conceptually different machine learning classifiers and use a majority vote or the average predicted probabilities (soft vote) to predict the class labels. Such a classifier can be useful for a set of equally well performing models in order to balance out their individual weaknesses." 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 40, 535 | "metadata": { 536 | "collapsed": true 537 | }, 538 | "outputs": [], 539 | "source": [ 540 | "from sklearn.ensemble import VotingClassifier\n", 541 | "\n", 542 | "VotingClassifier?"
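The cells below use hard (majority) voting; as a hedged complement, this sketch shows soft voting, which averages predicted class probabilities and accepts per-model weights. The weights here are illustrative, not tuned values.

```python
# Hedged sketch: soft voting averages predict_proba across classifiers,
# optionally weighting them.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X, y = load_iris(return_X_y=True)

soft = VotingClassifier(
    estimators=[('lr', LogisticRegression(max_iter=1000)),
                ('rf', RandomForestClassifier(random_state=0)),
                ('gnb', GaussianNB())],
    voting='soft',
    weights=[1, 2, 1])

soft.fit(X, y)
print(soft.predict_proba(X[:2]))  # weighted, averaged class probabilities
print(soft.score(X, y))
```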
543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 43, 548 | "metadata": { 549 | "collapsed": true 550 | }, 551 | "outputs": [], 552 | "source": [ 553 | "from sklearn.linear_model import LogisticRegression\n", 554 | "from sklearn.naive_bayes import GaussianNB\n", 555 | "from sklearn.ensemble import RandomForestClassifier\n", 556 | "\n", 557 | "\n", 558 | "m = VotingClassifier(\n", 559 | " estimators=[('lr', LogisticRegression()), \n", 560 | " ('rf', RandomForestClassifier()), \n", 561 | " ('gnb', GaussianNB())], \n", 562 | " voting='hard')" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 45, 568 | "metadata": { 569 | "collapsed": false 570 | }, 571 | "outputs": [ 572 | { 573 | "name": "stdout", 574 | "output_type": "stream", 575 | "text": [ 576 | "VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", 577 | " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", 578 | " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", 579 | " verbose=0, warm_start=False)), ('rf', RandomF...lse, random_state=None,\n", 580 | " verbose=0, warm_start=False)), ('gnb', GaussianNB(priors=None))],\n", 581 | " n_jobs=1, voting='hard', weights=None)\n" 582 | ] 583 | } 584 | ], 585 | "source": [ 586 | "m.fit(X, y)" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 49, 592 | "metadata": { 593 | "collapsed": false 594 | }, 595 | "outputs": [ 596 | { 597 | "data": { 598 | "text/plain": [ 599 | "0.98666666666666669" 600 | ] 601 | }, 602 | "execution_count": 49, 603 | "metadata": {}, 604 | "output_type": "execute_result" 605 | } 606 | ], 607 | "source": [ 608 | "m.score(X, y)" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": null, 614 | "metadata": { 615 | "collapsed": true 616 | }, 617 | "outputs": [], 618 | "source": [] 619 | } 620 | ], 621 | "metadata": { 622 | "kernelspec": { 623 | "display_name": "Python 2", 624 | "language": "python", 625 | "name": "python2" 626 | }, 627 | "language_info": { 628 | "codemirror_mode": { 629 | "name": "ipython", 630 | "version": 2 631 | }, 632 | "file_extension": ".py", 633 | "mimetype": "text/x-python", 634 | "name": "python", 635 | "nbconvert_exporter": "python", 636 | "pygments_lexer": "ipython2", 637 | "version": "2.7.10" 638 | } 639 | }, 640 | "nbformat": 4, 641 | "nbformat_minor": 2 642 | } 643 | -------------------------------------------------------------------------------- /notebooks/Multiclass.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Multiclass and Multi Label Algorithms\n", 11 | "\n", 12 | "The sklearn.multiclass module implements meta-estimators to solve multiclass and multilabel classification problems by decomposing such problems into binary classification problems. Multitarget regression is also supported.\n", 13 | "\n", 14 | "* Multiclass classification means a classification task with more than two classes; e.g., classify a set of images of fruits which may be oranges, apples, or pears. Multiclass classification makes the assumption that each sample is assigned to one and only one label: a fruit can be either an apple or a pear but not both at the same time.\n", 15 | "* Multilabel classification assigns to each sample a set of target labels. 
This can be thought of as predicting properties of a data-point that are not mutually exclusive, such as topics that are relevant for a document. A text might be about any of religion, politics, finance or education at the same time or none of these.\n", 16 | "* Multioutput regression assigns each sample a set of target values. This can be thought of as predicting several properties for each data-point, such as wind direction and magnitude at a certain location.\n", 17 | "* Multioutput-multiclass classification and multi-task classification mean that a single estimator has to handle several joint classification tasks. This is both a generalization of the multi-label classification task, which only considers binary classification, as well as a generalization of the multi-class classification task. The output format is a 2d numpy array or sparse matrix.\n", 18 | "\n", 19 | " The set of labels can be different for each output variable. For instance, a sample could be assigned “pear” for an output variable that takes possible values in a finite set of species such as “pear”, “apple”; and “blue” or “green” for a second output variable that takes possible values in a finite set of colors such as “green”, “red”, “blue”, “yellow”...\n", 20 | "\n", 21 | " This means that any classifiers handling multi-output multiclass or multi-task classification tasks support the multi-label classification task as a special case. Multi-task classification is similar to the multi-output classification task with different model formulations. For more information, see the relevant estimator documentation.\n", 22 | "\n", 23 | "All scikit-learn classifiers are capable of multiclass classification, but the meta-estimators offered by sklearn.multiclass permit changing the way they handle more than two classes because this may have an effect on classifier performance (either in terms of generalization error or required computational resources).\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": { 29 | "deletable": true, 30 | "editable": true 31 | }, 32 | "source": [ 33 | "## Multilabel classification format\n", 34 | "\n", 35 | "In multilabel learning, the joint set of binary classification tasks is expressed with a label binary indicator array: each sample is one row of a 2d array of shape (n_samples, n_classes) with binary values: the ones, i.e. the non-zero elements, correspond to the subset of labels. An array such as np.array([[1, 0, 0], [0, 1, 1], [0, 0, 0]]) represents label 0 in the first sample, labels 1 and 2 in the second sample, and no labels in the third sample.\n", 36 | "\n", 37 | "Producing multilabel data as a list of sets of labels may be more intuitive. The MultiLabelBinarizer transformer can be used to convert between a collection of collections of labels and the indicator format.\n", 38 | "\n", 39 | "This is skipping ahead by a couple of lessons (we have not seen transform before!) 
But keep this in the back of your mind for when we get there and just memorize this for now" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 1, 45 | "metadata": { 46 | "collapsed": false, 47 | "deletable": true, 48 | "editable": true 49 | }, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "array([[0, 0, 1, 1, 1],\n", 55 | " [0, 0, 1, 0, 0],\n", 56 | " [1, 1, 0, 1, 0],\n", 57 | " [1, 1, 1, 1, 1],\n", 58 | " [1, 1, 1, 0, 0]])" 59 | ] 60 | }, 61 | "execution_count": 1, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 68 | "\n", 69 | "y = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]]\n", 70 | "\n", 71 | "MultiLabelBinarizer().fit_transform(y)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": { 77 | "deletable": true, 78 | "editable": true 79 | }, 80 | "source": [ 81 | "## One vs Rest\n", 82 | "\n", 83 | "This strategy, also known as one-vs-all, is implemented in OneVsRestClassifier. The strategy consists in fitting one classifier per class. For each classifier, the class is fitted against all the other classes. In addition to its computational efficiency (only n_classes classifiers are needed), one advantage of this approach is its interpretability. Since each class is represented by one and only one classifier, it is possible to gain knowledge about the class by inspecting its corresponding classifier. This is the most commonly used strategy and is a fair default choice." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 2, 89 | "metadata": { 90 | "collapsed": false, 91 | "deletable": true, 92 | "editable": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "from sklearn import datasets\n", 97 | "\n", 98 | "from sklearn.multiclass import OneVsRestClassifier\n", 99 | "\n", 100 | "OneVsRestClassifier?" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 3, 106 | "metadata": { 107 | "collapsed": true, 108 | "deletable": true, 109 | "editable": true 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "from sklearn.svm import LinearSVC\n", 114 | "\n", 115 | "# Note that this also can OneVsRest\n", 116 | "LinearSVC?" 
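One-vs-the-rest is also the usual route to multilabel classification with the indicator format shown earlier. A minimal sketch follows, assuming make_multilabel_classification and LogisticRegression, neither of which appears in this notebook.

```python
# Hedged sketch: OneVsRestClassifier handles multilabel targets when y is
# the binary indicator matrix described above.
from sklearn.datasets import make_multilabel_classification
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

X_ml, Y_ml = make_multilabel_classification(n_samples=100, n_classes=4,
                                            random_state=0)

ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X_ml, Y_ml)

print(ovr.multilabel_)        # True: indicator-matrix target detected
print(ovr.predict(X_ml[:3]))  # one row of 0/1 labels per sample
```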
117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 4, 122 | "metadata": { 123 | "collapsed": false, 124 | "deletable": true, 125 | "editable": true 126 | }, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 132 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 133 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 134 | " verbose=0),\n", 135 | " n_jobs=1)" 136 | ] 137 | }, 138 | "execution_count": 4, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "X, y = datasets.load_iris(return_X_y=True)\n", 145 | "\n", 146 | "m = OneVsRestClassifier(LinearSVC())\n", 147 | "\n", 148 | "m.fit(X, y)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 5, 154 | "metadata": { 155 | "collapsed": false, 156 | "deletable": true, 157 | "editable": true 158 | }, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "(array([0, 1, 2]), False)" 164 | ] 165 | }, 166 | "execution_count": 5, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "m.classes_, m.multilabel_" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 6, 178 | "metadata": { 179 | "collapsed": false, 180 | "deletable": true, 181 | "editable": true 182 | }, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "0.96666666666666667" 188 | ] 189 | }, 190 | "execution_count": 6, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "m.score(X, y)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 7, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "array([[ 0.18424099, 0.45123591, -0.80793809, -0.45070657],\n", 210 | " [ 0.04859238, -0.88423382, 0.40385695, -0.93606001],\n", 211 | " [-0.85070203, -0.98668927, 1.38090339, 1.86546957]])" 212 | ] 213 | }, 214 | "execution_count": 7, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "m.coef_" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 8, 226 | "metadata": { 227 | "collapsed": false, 228 | "deletable": true, 229 | "editable": true 230 | }, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/plain": [ 235 | "0.96666666666666667" 236 | ] 237 | }, 238 | "execution_count": 8, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "LinearSVC().fit(X, y).score(X, y)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": { 250 | "deletable": true, 251 | "editable": true 252 | }, 253 | "source": [ 254 | "## One Vs One\n", 255 | "\n", 256 | "OneVsOneClassifier constructs one classifier per pair of classes. At prediction time, the class which received the most votes is selected. In the event of a tie (among two classes with an equal number of votes), it selects the class with the highest aggregate classification confidence by summing over the pair-wise classification confidence levels computed by the underlying binary classifiers.\n", 257 | "\n", 258 | "Since it requires to fit n_classes * (n_classes - 1) / 2 classifiers, this method is usually slower than one-vs-the-rest, due to its O(n_classes^2) complexity. 
However, this method may be advantageous for algorithms such as kernel algorithms which don’t scale well with n_samples. This is because each individual learning problem only involves a small subset of the data whereas, with one-vs-the-rest, the complete dataset is used n_classes times." 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 9, 264 | "metadata": { 265 | "collapsed": true, 266 | "deletable": true, 267 | "editable": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "from sklearn.multiclass import OneVsOneClassifier\n", 272 | "\n", 273 | "OneVsOneClassifier?" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 10, 279 | "metadata": { 280 | "collapsed": false, 281 | "deletable": true, 282 | "editable": true 283 | }, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "OneVsOneClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 289 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 290 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 291 | " verbose=0),\n", 292 | " n_jobs=1)" 293 | ] 294 | }, 295 | "execution_count": 10, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "m = OneVsOneClassifier(\n", 302 | " LinearSVC())\n", 303 | "\n", 304 | "m.fit(X, y)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 11, 310 | "metadata": { 311 | "collapsed": false, 312 | "deletable": true, 313 | "editable": true 314 | }, 315 | "outputs": [ 316 | { 317 | "data": { 318 | "text/plain": [ 319 | "(LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 320 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 321 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 322 | " verbose=0),\n", 323 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 324 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 325 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 326 | " verbose=0),\n", 327 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 328 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 329 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 330 | " verbose=0))" 331 | ] 332 | }, 333 | "execution_count": 11, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "m.estimators_" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 12, 345 | "metadata": { 346 | "collapsed": false, 347 | "deletable": true, 348 | "editable": true 349 | }, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "0.97999999999999998" 355 | ] 356 | }, 357 | "execution_count": 12, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "m.score(X, y)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": { 369 | "deletable": true, 370 | "editable": true 371 | }, 372 | "source": [ 373 | "## Error Correcting Output Codes\n", 374 | "\n", 375 | "Output-code based strategies are fairly different from one-vs-the-rest and one-vs-one. With these strategies, each class is represented in a Euclidean space, where each dimension can only be 0 or 1. Another way to put it is that each class is represented by a binary code (an array of 0 and 1). 
The matrix which keeps track of the location/code of each class is called the code book. The code size is the dimensionality of the aforementioned space. Intuitively, each class should be represented by a code as unique as possible and a good code book should be designed to optimize classification accuracy.\n", 376 | "\n", 377 | "At fitting time, one binary classifier per bit in the code book is fitted. At prediction time, the classifiers are used to project new points in the class space and the class closest to the points is chosen.\n", 378 | "\n", 379 | "In OutputCodeClassifier, the code_size attribute allows the user to control the number of classifiers which will be used. It is a percentage of the total number of classes.\n", 380 | "\n", 381 | "A number between 0 and 1 will require fewer classifiers than one-vs-the-rest. In theory, log2(n_classes) / n_classes is sufficient to represent each class unambiguously. However, in practice, it may not lead to good accuracy since log2(n_classes) is much smaller than n_classes.\n", 382 | "\n", 383 | "A number greater than 1 will require more classifiers than one-vs-the-rest. In this case, some classifiers will in theory correct for the mistakes made by other classifiers, hence the name “error-correcting”. In practice, however, this may not happen as classifier mistakes will typically be correlated. The error-correcting output codes have a similar effect to bagging." 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 13, 389 | "metadata": { 390 | "collapsed": true, 391 | "deletable": true, 392 | "editable": true 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "from sklearn.multiclass import OutputCodeClassifier\n", 397 | "\n", 398 | "OutputCodeClassifier?" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 14, 404 | "metadata": { 405 | "collapsed": false, 406 | "deletable": true, 407 | "editable": true 408 | }, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "text/plain": [ 413 | "OutputCodeClassifier(code_size=2,\n", 414 | " estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 415 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 416 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 417 | " verbose=0),\n", 418 | " n_jobs=1, random_state=None)" 419 | ] 420 | }, 421 | "execution_count": 14, 422 | "metadata": {}, 423 | "output_type": "execute_result" 424 | } 425 | ], 426 | "source": [ 427 | "m = OutputCodeClassifier(LinearSVC(), code_size=2)\n", 428 | "\n", 429 | "m.fit(X, y)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 15, 435 | "metadata": { 436 | "collapsed": false, 437 | "deletable": true, 438 | "editable": true 439 | }, 440 | "outputs": [ 441 | { 442 | "data": { 443 | "text/plain": [ 444 | "[LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 445 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 446 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 447 | " verbose=0),\n", 448 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 449 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 450 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 451 | " verbose=0),\n", 452 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 453 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 454 | " multi_class='ovr', penalty='l2', random_state=None, 
tol=0.0001,\n", 455 | " verbose=0),\n", 456 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 457 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 458 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 459 | " verbose=0),\n", 460 | " _ConstantPredictor(),\n", 461 | " LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 462 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 463 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 464 | " verbose=0)]" 465 | ] 466 | }, 467 | "execution_count": 15, 468 | "metadata": {}, 469 | "output_type": "execute_result" 470 | } 471 | ], 472 | "source": [ 473 | "m.estimators_" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 16, 479 | "metadata": { 480 | "collapsed": false, 481 | "deletable": true, 482 | "editable": true 483 | }, 484 | "outputs": [ 485 | { 486 | "data": { 487 | "text/plain": [ 488 | "0.97999999999999998" 489 | ] 490 | }, 491 | "execution_count": 16, 492 | "metadata": {}, 493 | "output_type": "execute_result" 494 | } 495 | ], 496 | "source": [ 497 | "m.score(X, y)" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": { 503 | "deletable": true, 504 | "editable": true 505 | }, 506 | "source": [ 507 | "# Multiple Output Regression and Classification\n", 508 | "\n", 509 | "Multioutput regression support can be added to any regressor with MultiOutputRegressor. This strategy consists of fitting one regressor per target. Since each target is represented by exactly one regressor it is possible to gain knowledge about the target by inspecting its corresponding regressor. As MultiOutputRegressor fits one regressor per target it cannot take advantage of correlations between targets.\n", 510 | "\n", 511 | "Multioutput classification support can be added to any classifier with MultiOutputClassifier. This strategy consists of fitting one classifier per target. This allows multiple target variable classifications. The purpose of this class is to extend estimators to be able to estimate a series of target functions (f1,f2,f3...,fn) that are trained on a single X predictor matrix to predict a series of responses (y1,y2,y3...,yn)."
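The paragraph above notes that each target gets its own regressor which can be inspected. A hedged sketch of that inspection follows; LinearRegression is swapped in purely so the per-target coefficients are easy to read, and the dataset shape is an illustrative assumption rather than the setup of the cell below.

```python
# Hedged sketch: MultiOutputRegressor exposes one fitted regressor per
# target column through estimators_, which enables per-target inspection.
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

X_r, y_r = make_regression(n_samples=100, n_features=5, n_targets=3,
                           random_state=1)

mor = MultiOutputRegressor(LinearRegression()).fit(X_r, y_r)

# One LinearRegression per target; each has its own coefficients.
for i, est in enumerate(mor.estimators_):
    print("target", i, "coef:", est.coef_.round(2))
```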
512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 17, 517 | "metadata": { 518 | "collapsed": false, 519 | "deletable": true, 520 | "editable": true 521 | }, 522 | "outputs": [ 523 | { 524 | "data": { 525 | "text/plain": [ 526 | "0.99999999911789184" 527 | ] 528 | }, 529 | "execution_count": 17, 530 | "metadata": {}, 531 | "output_type": "execute_result" 532 | } 533 | ], 534 | "source": [ 535 | "from sklearn.datasets import make_regression\n", 536 | "\n", 537 | "from sklearn.multioutput import MultiOutputRegressor\n", 538 | "\n", 539 | "from sklearn.ensemble import GradientBoostingRegressor\n", 540 | "\n", 541 | "X, y = make_regression(n_samples=10, n_targets=3, random_state=1)\n", 542 | "\n", 543 | "MultiOutputRegressor(\n", 544 | " GradientBoostingRegressor(random_state=0)).fit(X, y).score(X, y)" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": { 551 | "collapsed": true, 552 | "deletable": true, 553 | "editable": true 554 | }, 555 | "outputs": [], 556 | "source": [] 557 | } 558 | ], 559 | "metadata": { 560 | "kernelspec": { 561 | "display_name": "Python 2", 562 | "language": "python", 563 | "name": "python2" 564 | }, 565 | "language_info": { 566 | "codemirror_mode": { 567 | "name": "ipython", 568 | "version": 2 569 | }, 570 | "file_extension": ".py", 571 | "mimetype": "text/x-python", 572 | "name": "python", 573 | "nbconvert_exporter": "python", 574 | "pygments_lexer": "ipython2", 575 | "version": "2.7.10" 576 | } 577 | }, 578 | "nbformat": 4, 579 | "nbformat_minor": 2 580 | } 581 | --------------------------------------------------------------------------------
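As a closing sketch, the three meta-strategies demonstrated in this notebook can be compared under cross-validation rather than training-set accuracy; the cv=5 setting and random_state are illustrative assumptions.

```python
# Hedged sketch: cross-validated comparison of the one-vs-rest, one-vs-one
# and error-correcting output-code strategies on iris.
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import (OneVsOneClassifier, OneVsRestClassifier,
                                OutputCodeClassifier)
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)

strategies = {
    "one-vs-rest": OneVsRestClassifier(LinearSVC()),
    "one-vs-one": OneVsOneClassifier(LinearSVC()),
    "output-code": OutputCodeClassifier(LinearSVC(), code_size=2,
                                        random_state=0),
}

for name, clf in strategies.items():
    print(name, cross_val_score(clf, X, y, cv=5).mean().round(3))
```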