├── README.md └── datacourse ├── machine-learning ├── GridSearchCv and Pipeline.ipynb ├── ML_Anomaly_Detection.ipynb ├── ML_Classification.ipynb ├── ML_Clustering.ipynb ├── ML_Dimension_Reduction.ipynb ├── ML_FeatureEngineering.ipynb ├── ML_Intro_ML.ipynb ├── ML_K_Nearest_Neighbors.ipynb ├── ML_LinearRegression.ipynb ├── ML_Metrics.ipynb ├── ML_ModelSelection.ipynb ├── ML_Natural_Language_Processing.ipynb ├── ML_Scikit_Learn.ipynb ├── ML_Support_Vector_Machines.ipynb ├── ML_Time_Series.ipynb ├── ML_Tree_Based_Models.ipynb └── imputation.ipynb └── miniprojects ├── in.ipynb ├── ml.ipynb └── nlp.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # WQU-ML-Unit-2 2 | This repository contains an information about the lectures notes and exercises solutions of WorldQuant University Machine learning and statistics. 3 | -------------------------------------------------------------------------------- /datacourse/machine-learning/GridSearchCv and Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from sklearn.datasets import fetch_california_housing\n", 10 | "from sklearn.preprocessing import StandardScaler\n", 11 | "from sklearn.model_selection import GridSearchCV, train_test_split\n", 12 | "from sklearn.linear_model import Ridge\n", 13 | "from sklearn.pipeline import Pipeline\n", 14 | "import numpy as np" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "X = fetch_california_housing()['data']\n", 24 | "y = fetch_california_housing()['target']\n", 25 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "0.5943141338604155\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "pipe = Pipeline(steps=[('scaler', StandardScaler()), ('regressor', Ridge())])\n", 43 | "pipe.fit(X_train, y_train)\n", 44 | "print(pipe.score(X_test, y_test))" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 4, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/plain": [ 55 | "{'memory': None,\n", 56 | " 'steps': [('scaler',\n", 57 | " StandardScaler(copy=True, with_mean=True, with_std=True)),\n", 58 | " ('regressor',\n", 59 | " Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n", 60 | " normalize=False, random_state=None, solver='auto', tol=0.001))],\n", 61 | " 'verbose': False,\n", 62 | " 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True),\n", 63 | " 'regressor': Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n", 64 | " normalize=False, random_state=None, solver='auto', tol=0.001),\n", 65 | " 'scaler__copy': True,\n", 66 | " 'scaler__with_mean': True,\n", 67 | " 'scaler__with_std': True,\n", 68 | " 'regressor__alpha': 1.0,\n", 69 | " 'regressor__copy_X': True,\n", 70 | " 'regressor__fit_intercept': True,\n", 71 | " 'regressor__max_iter': None,\n", 72 | " 'regressor__normalize': False,\n", 73 | " 'regressor__random_state': None,\n", 74 | " 'regressor__solver': 'auto',\n", 75 | " 'regressor__tol': 0.001}" 76 | ] 77 | }, 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "output_type": 
"execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "pipe.get_params()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 5, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "Fitting 5 folds for each of 20 candidates, totalling 100 fits\n" 97 | ] 98 | }, 99 | { 100 | "name": "stderr", 101 | "output_type": "stream", 102 | "text": [ 103 | "[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.\n", 104 | "[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed: 4.6s finished\n" 105 | ] 106 | }, 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "GridSearchCV(cv=5, error_score='raise-deprecating',\n", 111 | " estimator=Pipeline(memory=None,\n", 112 | " steps=[('scaler',\n", 113 | " StandardScaler(copy=True,\n", 114 | " with_mean=True,\n", 115 | " with_std=True)),\n", 116 | " ('regressor',\n", 117 | " Ridge(alpha=1.0, copy_X=True,\n", 118 | " fit_intercept=True, max_iter=None,\n", 119 | " normalize=False,\n", 120 | " random_state=None, solver='auto',\n", 121 | " tol=0.001))],\n", 122 | " verbose=False),\n", 123 | " iid='warn', n_jobs=2,\n", 124 | " param_grid={'regressor__al...0e-03, 8.85866790e-03,\n", 125 | " 1.83298071e-02, 3.79269019e-02, 7.84759970e-02, 1.62377674e-01,\n", 126 | " 3.35981829e-01, 6.95192796e-01, 1.43844989e+00, 2.97635144e+00,\n", 127 | " 6.15848211e+00, 1.27427499e+01, 2.63665090e+01, 5.45559478e+01,\n", 128 | " 1.12883789e+02, 2.33572147e+02, 4.83293024e+02, 1.00000000e+03])},\n", 129 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", 130 | " scoring=None, verbose=True)" 131 | ] 132 | }, 133 | "execution_count": 5, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "param_grid = {'regressor__alpha': np.logspace(-3, 3, 20)}\n", 140 | "grid_search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=2, verbose=True)\n", 141 | "grid_search.fit(X_train, y_train)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 6, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "Best parameter: {'regressor__alpha': 12.742749857031322}\n", 154 | "Best score: 0.6053956962874548\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "print(\"Best parameter:\", grid_search.best_params_)\n", 160 | "print(\"Best score: {}\".format(grid_search.best_score_))" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 7, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "from tempfile import mkdtemp\n", 170 | "from shutil import rmtree\n", 171 | "cachedir = mkdtemp() # creates a temp. 
directory\n", 172 | "pipe_cache = Pipeline(steps=[('scaler', StandardScaler()), ('regressor', Ridge())], memory=cachedir)\n", 173 | "pipe_cache.fit(X_train, y_train)\n", 174 | "rmtree(cachedir)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 10, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "Fitting 5 folds for each of 20 candidates, totalling 100 fits\n" 187 | ] 188 | }, 189 | { 190 | "name": "stderr", 191 | "output_type": "stream", 192 | "text": [ 193 | "[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.\n", 194 | "[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed: 1.1s finished\n" 195 | ] 196 | }, 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "Pipeline(memory=None,\n", 201 | " steps=[('scaler',\n", 202 | " StandardScaler(copy=True, with_mean=True, with_std=True)),\n", 203 | " ('grid_search',\n", 204 | " GridSearchCV(cv=5, error_score='raise-deprecating',\n", 205 | " estimator=Ridge(alpha=1.0, copy_X=True,\n", 206 | " fit_intercept=True, max_iter=None,\n", 207 | " normalize=False,\n", 208 | " random_state=None, solver='auto',\n", 209 | " tol=0.001),\n", 210 | " iid='warn', n_jobs=2,\n", 211 | " param_grid={'alpha': array([1.00000000e-03...90e-03,\n", 212 | " 1.83298071e-02, 3.79269019e-02, 7.84759970e-02, 1.62377674e-01,\n", 213 | " 3.35981829e-01, 6.95192796e-01, 1.43844989e+00, 2.97635144e+00,\n", 214 | " 6.15848211e+00, 1.27427499e+01, 2.63665090e+01, 5.45559478e+01,\n", 215 | " 1.12883789e+02, 2.33572147e+02, 4.83293024e+02, 1.00000000e+03])},\n", 216 | " pre_dispatch='2*n_jobs', refit=True,\n", 217 | " return_train_score=False, scoring=None,\n", 218 | " verbose=1))],\n", 219 | " verbose=False)" 220 | ] 221 | }, 222 | "execution_count": 10, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "param_grid = {\"alpha\": np.logspace(-3, 3, 20)}\n", 229 | "grid_search = GridSearchCV(Ridge(), param_grid,cv=5, n_jobs=2, verbose=1)\n", 230 | "pipe_2 = Pipeline(steps=[('scaler', StandardScaler()), ('grid_search', grid_search)])\n", 231 | "pipe_2.fit(X_train, y_train)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 13, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "data": { 241 | "text/plain": [ 242 | "{'alpha': 26.366508987303554}" 243 | ] 244 | }, 245 | "execution_count": 13, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "pipe_2.named_steps['grid_search'].best_params_" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 17, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "name": "stdout", 261 | "output_type": "stream", 262 | "text": [ 263 | "Fitting 5 folds for each of 100 candidates, totalling 500 fits\n" 264 | ] 265 | }, 266 | { 267 | "name": "stderr", 268 | "output_type": "stream", 269 | "text": [ 270 | "[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.\n", 271 | "[Parallel(n_jobs=2)]: Done 110 tasks | elapsed: 5.4s\n", 272 | "[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed: 15.1s finished\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "from sklearn.decomposition import PCA\n", 278 | "cachedir = mkdtemp() # creates a temp. 
directory\n", 279 | "pipe_3 = Pipeline(steps=[('scaler', StandardScaler()), ('dim-red', PCA()), ('regressor', Ridge())], memory=cachedir)\n", 280 | "param_grid = {\"dim-red__n_components\": [2, 3, 4, 5, 6],\n", 281 | " \"regressor__alpha\": np.logspace(-3, 3, 20)}\n", 282 | "grid_search = GridSearchCV(pipe_3, param_grid, cv=5, n_jobs=2, verbose=1)\n", 283 | "grid_search.fit(X_train, y_train)\n", 284 | "rmtree(cachedir)\n" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 18, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "{'dim-red__n_components': 6, 'regressor__alpha': 26.366508987303554}" 296 | ] 297 | }, 298 | "execution_count": 18, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "grid_search.best_params_" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 20, 310 | "metadata": {}, 311 | "outputs": [ 312 | { 313 | "data": { 314 | "text/plain": [ 315 | "0.5110075108397308" 316 | ] 317 | }, 318 | "execution_count": 20, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "grid_search.best_score_" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 22, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "name": "stdout", 334 | "output_type": "stream", 335 | "text": [ 336 | "Fitting 5 folds for each of 20 candidates, totalling 100 fits\n" 337 | ] 338 | }, 339 | { 340 | "name": "stderr", 341 | "output_type": "stream", 342 | "text": [ 343 | "[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.\n", 344 | "[Parallel(n_jobs=2)]: Done 80 tasks | elapsed: 5.4s\n", 345 | "[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed: 5.7s finished\n" 346 | ] 347 | }, 348 | { 349 | "data": { 350 | "text/plain": [ 351 | "RandomizedSearchCV(cv=5, error_score='raise-deprecating',\n", 352 | " estimator=Pipeline(memory='/tmp/tmpqrm12nxn',\n", 353 | " steps=[('scaler',\n", 354 | " StandardScaler(copy=True,\n", 355 | " with_mean=True,\n", 356 | " with_std=True)),\n", 357 | " ('dim-red',\n", 358 | " PCA(copy=True,\n", 359 | " iterated_power='auto',\n", 360 | " n_components=None,\n", 361 | " random_state=None,\n", 362 | " svd_solver='auto', tol=0.0,\n", 363 | " whiten=False)),\n", 364 | " ('regressor',\n", 365 | " Ridge(alpha=1.0, copy_X=True,\n", 366 | " fit_interce...\n", 367 | " 'regressor__alpha': array([1.00000000e-03, 2.06913808e-03, 4.28133240e-03, 8.85866790e-03,\n", 368 | " 1.83298071e-02, 3.79269019e-02, 7.84759970e-02, 1.62377674e-01,\n", 369 | " 3.35981829e-01, 6.95192796e-01, 1.43844989e+00, 2.97635144e+00,\n", 370 | " 6.15848211e+00, 1.27427499e+01, 2.63665090e+01, 5.45559478e+01,\n", 371 | " 1.12883789e+02, 2.33572147e+02, 4.83293024e+02, 1.00000000e+03])},\n", 372 | " pre_dispatch='2*n_jobs', random_state=None, refit=True,\n", 373 | " return_train_score=False, scoring=None, verbose=1)" 374 | ] 375 | }, 376 | "execution_count": 22, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "from sklearn.model_selection import RandomizedSearchCV\n", 383 | "random_search = RandomizedSearchCV(pipe_3, param_grid, cv=5, n_jobs=2, verbose=1, n_iter=20)\n", 384 | "random_search.fit(X_train, y_train)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 24, 390 | "metadata": {}, 391 | "outputs": [ 392 | { 393 | "data": { 394 | "text/plain": [ 395 | "{'regressor__alpha': 2.976351441631316, 
'dim-red__n_components': 6}" 396 | ] 397 | }, 398 | "execution_count": 24, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "random_search.best_params_" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 25, 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "data": { 414 | "text/plain": [ 415 | "0.5110048497000744" 416 | ] 417 | }, 418 | "execution_count": 25, 419 | "metadata": {}, 420 | "output_type": "execute_result" 421 | } 422 | ], 423 | "source": [ 424 | "random_search.best_score_" 425 | ] 426 | } 427 | ], 428 | "metadata": { 429 | "kernelspec": { 430 | "display_name": "Python 3", 431 | "language": "python", 432 | "name": "python3" 433 | }, 434 | "language_info": { 435 | "codemirror_mode": { 436 | "name": "ipython", 437 | "version": 3 438 | }, 439 | "file_extension": ".py", 440 | "mimetype": "text/x-python", 441 | "name": "python", 442 | "nbconvert_exporter": "python", 443 | "pygments_lexer": "ipython3", 444 | "version": "3.7.3" 445 | } 446 | }, 447 | "nbformat": 4, 448 | "nbformat_minor": 2 449 | } 450 | -------------------------------------------------------------------------------- /datacourse/machine-learning/ML_Anomaly_Detection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "init_cell": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%logstop\n", 12 | "%logstart -rtq ~/.logs/ML_Anomaly_Detection.py append\n", 13 | "%matplotlib inline\n", 14 | "import matplotlib\n", 15 | "import seaborn as sns\n", 16 | "sns.set()\n", 17 | "matplotlib.rcParams['figure.dpi'] = 144" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import matplotlib.pyplot as plt\n", 27 | "import numpy as np\n", 28 | "import pandas as pd" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Anomaly Detection\n", 36 | "\n", 37 | "\n", 38 | "\n", 39 | "Often times, we need to find abnormal and unusual values in our data set. This process is referred to as **anomaly detection**. Identifying these data points can serve multiple purposes, such as\n", 40 | "\n", 41 | "1. Removing outliers in a training set before fitting a machine learning model.\n", 42 | "1. Analyzing a set of observations to identify whether something is wrong with the process. For example, are there any anomalous server logs that _may_ indicate a security breach.\n", 43 | "\n", 44 | "The application of distinguishing whether something is abnormal or not falls into two divisions, **outlier** or **novelty detection**. These two terms are often lumped together but do have distinct definitions.\n", 45 | "\n", 46 | "* **Outlier Detection:** The process of identifying observations that deviate substantially from the rest. Outlier detection models are trained with data sets that include outliers; it is not \"clean\". The model learns how much a point can deviate to be classified as an outlier.\n", 47 | "\n", 48 | "* **Novelty Detection:** The process of identifying novel points by training a model with a data set that is not \"polluted\" with outliers. The model learns a boundary, or boundaries, the encompasses all normal/regular points. 
Any points that reside outside of these boundaries are new and thus novel.\n", 49 | "\n", 50 | "The distinction is subtle but certain algorithms are referred to as either outlier or novelty detection. However, in practice, both classes can work well regardless if the application is purely novelty or outlier detection. A further discussion of novelty versus outlier detection can be read [here](https://scikit-learn.org/stable/modules/outlier_detection.html). Anomaly detection is _usually_ an unsupervised machine learning technique because rarely do we have labels for the observations. As such, the algorithms for detecting anomalies will rely purely on features of the observations.\n", 51 | "\n", 52 | "In this notebook, we will go over two popular algorithms for outlier and novelty detection before working on a case study using time series data." 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Implementation in `scikit-learn`\n", 60 | "\n", 61 | "In `scikit-learn`, anomaly detection algorithms are unsupervised learners. We will discuss two models for anomaly detection in `scikit-learn`, **one-class SVM** and **isolation forest**. Both are unsupervised learning models with a similar interface; the two key methods are:\n", 62 | "\n", 63 | "* `fit(X)`: fit/train the model with data set `X`.\n", 64 | "* `predict(X)`: determine whether the observations in `X` are inliers `1` or outliers `-1`.\n", 65 | "* `decision_function(X)`: score/metric used to determine whether a point is an inlier/outlier.\n", 66 | "\n", 67 | "Note, the interpretation of the output of the `decision_function` method is algorithm specific." 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## One-class Support Vector Machines\n", 75 | "\n", 76 | "The support vector machine classifier can be tweaked to serve novelty detection applications, referred to as one-class support vector machines. Consider a binary classification situation where all points in the training set belong to the same class, hence the name of the algorithm. Because all training points are in the same class, it is assumed that there are no outliers. The one-class SVM is a novelty detector. \n", 77 | "\n", 78 | "The points are transformed to a higher dimensional space where you have the freedom to locate the origin of the coordinates. The algorithm's task becomes to locate a hyperplane in the space that best separates the data from the origin. The catch is that hyperplane must go through the origin and located on the origin is the only member of the second class. The algorithm works by pushing as many of the vectors in the training set away from the origin in the feature space. As before, the model includes slack variable for vectors that violate the margin. The algorithm is prevented from pushing the vectors infinitely far away from the origin as the single member of the second class always resides on the hyperplane, incurring a large penalty when the origin is very far away from most of the training points. The algorithm has to find the best balance between origin separation and margin violations from the training set. The image below illustrates an example of the algorithm. 
For visual purposes, we have only used two dimensions but in practice the algorithm works in a large dimensional space to achieve better separation.\n", 79 | "\n", 80 | "" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "The governing equation and constraints are\n", 88 | "\n", 89 | "$$\n", 90 | "\\min_{\\beta, \\zeta, \\rho} \\frac{1}{2} \\|\\beta\\|^2 + \\frac{1}{\\nu n}\\sum_{j=1}^n \\zeta_j - \\rho \\\\\n", 91 | "\\mbox{subject to } \\left\\{ \\begin{array} {cl} \n", 92 | " h(x_{j\\cdot}) \\cdot \\beta \\ge \\rho -\\zeta_j & \\\\\n", 93 | " \\zeta_j \\ge 0 ,\n", 94 | "\\end{array}\\right.\n", 95 | "$$\n", 96 | "\n", 97 | "where $h(x_j)$ is a kernel function, $\\rho$ is distance from the origin to the hyperplane, and $\\nu$ is a hyperparameter that is the upper bound for the fraction of training error and the lower bound for the fraction of support vectors. Notice how the constraint is forcing points to be at least $\\rho$ away from the margin, lest it incurs a margin violation as $\\zeta$ must be set large enough to satisfy the inequality.\n", 98 | "\n", 99 | "Let's use the one-class SVM to see how it works. We will be using the wine data set provided by `scikit-learn`. The details of the data set are not important, only that it has 178 observations with 13 numerical features.\n", 100 | "\n", 101 | "**Question**\n", 102 | "* What technique should we use on the wine data set to easily visualize our analysis?" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "from sklearn.datasets import load_wine\n", 112 | "from sklearn.preprocessing import StandardScaler\n", 113 | "from sklearn.decomposition import PCA\n", 114 | "from sklearn.pipeline import Pipeline\n", 115 | "\n", 116 | "# load data set\n", 117 | "data = load_wine()\n", 118 | "X = data['data']\n", 119 | "\n", 120 | "# truncate to two variables\n", 121 | "pipe = Pipeline([('scaler', StandardScaler()), ('dim_red', PCA(n_components=2))])\n", 122 | "Xt = pipe.fit_transform(X)\n", 123 | "\n", 124 | "# generate novel/outlier points\n", 125 | "np.random.seed(1)\n", 126 | "theta = 2*np.pi*np.random.random(10)\n", 127 | "X_test = np.vstack((4*np.cos(theta) + np.random.random(10), 4*np.sin(theta) + np.random.random(10)))\n", 128 | "\n", 129 | "plt.scatter(*Xt.T)\n", 130 | "plt.scatter(*X_test, c='red')\n", 131 | "plt.xlabel('$\\\\xi_1$')\n", 132 | "plt.ylabel('$\\\\xi_2$');\n", 133 | "plt.legend([\"training set\", \"novel/outliers\"]);" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "The visualization below plots the data along with the boundaries that determine whether a point is consider novel or not. The filled contour lines in the plot represent values of the decision function of the one-class SVM. The `decision_function` method reports the signed distance (negative means on the wrong side) between the point and the hyperplane. The visualization allows you to modify $\\nu$, the upper bound for the false positive rate. You can also consider $\\nu$ as the probability of having a new but regular observation outside the region defining regular points. As $\\nu$ decreases, the area encompassing the regular points increases. As with the standard kernelized SVM, you can change the kernel function, but `rbf` usually works the best." 
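As a minimal illustration of the shared detector interface described above (`fit`, `predict`, `decision_function`), the sketch below trains a `OneClassSVM` on a small synthetic sample; `X_train_demo` and `X_new` are made-up arrays used only for illustration and are not part of the wine analysis.

```python
import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(0)
X_train_demo = rng.normal(size=(200, 2))     # "regular" points only (novelty detection setting)
X_new = np.array([[0.1, -0.2], [4.0, 4.0]])  # one typical point, one far-away point

clf = OneClassSVM(nu=0.05, gamma='auto')
clf.fit(X_train_demo)

print(clf.predict(X_new))            # +1 = inlier, -1 = novelty
print(clf.decision_function(X_new))  # signed distance to the learned boundary
```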
141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "from sklearn.svm import OneClassSVM\n", 150 | "from ipywidgets import interact, FloatSlider\n", 151 | "\n", 152 | "def plot_one_class_svm(X, X_test):\n", 153 | " def plotter(nu=0.95):\n", 154 | " clf = OneClassSVM(nu=nu, gamma='auto')\n", 155 | " clf.fit(X)\n", 156 | " y_pred = clf.predict(X)\n", 157 | " fp_rate = (y_pred == -1).sum()/len(X)\n", 158 | " \n", 159 | " X1, X2 = np.meshgrid(np.linspace(-5, 5), np.linspace(-5, 5))\n", 160 | " y_proba = clf.decision_function(np.hstack((X1.reshape(-1, 1), X2.reshape(-1, 1))))\n", 161 | " Z = y_proba.reshape(50, 50)\n", 162 | " \n", 163 | " fig = plt.figure(figsize=(8, 5), facecolor='w', edgecolor='k')\n", 164 | " plt.contourf(X1, X2, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.Blues_r)\n", 165 | " plt.colorbar()\n", 166 | " a = plt.contour(X1, X2, Z, levels=[0], linewidths=2, colors='black') \n", 167 | " b1 = plt.scatter(*X.T, c='blue')\n", 168 | " b2 = plt.scatter(*X_test, c='red')\n", 169 | " plt.title(\"false positive rate: {:g}\".format(fp_rate))\n", 170 | " plt.legend([a.collections[0], b1, b2], [\"boundary\", \" true inliers\", \"true outliers\"], frameon=True, \n", 171 | " loc=\"lower left\")\n", 172 | " return plotter\n", 173 | "\n", 174 | "nu_slider = FloatSlider(min=0.01, max=0.99, step=0.01, value=0.5, description='$\\\\nu$')\n", 175 | "interact(plot_one_class_svm(Xt, X_test), nu=nu_slider);" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "## Isolation Forest\n", 183 | "\n", 184 | "Isolation forests is an outlier detection algorithm that uses decision trees. The principle isolation forest works on is that outliers are points with features that are considerably different than the rest of the data, the inliers. Consider a data set in a $p$-dimensional space. The inliers will be closer together while the outliers will be farther apart. As we have seen, decision trees divide up a $p$-dimensional space using orthogonal cuts. \n", 185 | "\n", 186 | "Consider a decision tree that is constructed by making random cuts with randomly chosen features. The tree is allowed to grow until all points have been isolated. It will be easier to isolate or box in the outliers than the inliers. In other words, less splits are required to isolate outliers compared to the inliers and outlier nodes will reside closer to the root node. The process of constructing a tree with random cuts is repeated to create an ensemble, hence the term forest in the name of the algorithm. For each data point, the average path/splits required to isolate the point is a metric for the regularity or normality of the point. While the algorithm could have adopted a more sophisticated manner to isolate points, making random splits is computationally cheap and averaging across all trees considers the multiple manners to isolate the data. The two key hyperparameters are\n", 187 | "\n", 188 | "* `n_estimators`: The number of trees to use in the ensemble.\n", 189 | "* `contamination`: The fraction of outliers in the data set.\n", 190 | "\n", 191 | "The `decision_function` method returns a score for a set of observations, a negative or positive score means the observation is labeled as an outlier or inlier, respectively. 
This score is related to the path length, number of splits, to isolate each observation, averaged across all trees in the forest, but with an offset,\n", 192 | "\n", 193 | "$$\n", 194 | "\\text{score} = \\text{mean path length} - \\text{offset},\n", 195 | "$$\n", 196 | "\n", 197 | "where the offset is chosen based on the set contamination level. For example, if the contamination fraction was set to 0.2, then the offset if chosen such that 20% of the training data have a negative score. Let's visualize the result of using an isolation forest on the wine data set." 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "from sklearn.ensemble import IsolationForest\n", 207 | "\n", 208 | "def plot_isolation_forest(X, X_test):\n", 209 | " def plotter(contamination=0.2):\n", 210 | " clf = IsolationForest(n_estimators=100, contamination=contamination, behaviour='new')\n", 211 | " clf.fit(X)\n", 212 | " \n", 213 | " y_pred = clf.predict(X)\n", 214 | " outlier_rate = (y_pred == -1).sum()/len(X)\n", 215 | " \n", 216 | " X1, X2 = np.meshgrid(np.linspace(-5, 5), np.linspace(-5, 5))\n", 217 | " y_proba = clf.decision_function(np.hstack((X1.reshape(-1, 1), X2.reshape(-1, 1))))\n", 218 | " Z = y_proba.reshape(50, 50)\n", 219 | " \n", 220 | " fig = plt.figure(figsize=(8, 5), facecolor='w', edgecolor='k')\n", 221 | " plt.contourf(X1, X2, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.Blues_r)\n", 222 | " plt.colorbar()\n", 223 | " a = plt.contour(X1, X2, Z, levels=[0], linewidths=2, colors='black') \n", 224 | " b1 = plt.scatter(*X.T, c='blue')\n", 225 | " b2 = plt.scatter(*X_test, c='red')\n", 226 | " plt.title(\"outlier fraction: {:g}\".format(outlier_rate))\n", 227 | " plt.legend([a.collections[0], b1, b2], [\"boundary\", \" true inliers\", \"true outliers\"], frameon=True, \n", 228 | " loc=\"lower left\") \n", 229 | " return plotter\n", 230 | "\n", 231 | "cont_slider = FloatSlider(min=0.01, max=0.5, value=0.1, step=0.01, description=\"fraction\")\n", 232 | "interact(plot_isolation_forest(Xt, X_test), contamination=cont_slider);" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "**Questions**\n", 240 | "\n", 241 | "* Which algorithm, isolation forest or one-class SVM, has a better training time complexity? What influenced your decision?\n", 242 | "* What advantages of decision trees in general would still be present in the isolation forest algorithm?" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "## Comparison of one-class SVM and isolation forest\n", 250 | "\n", 251 | "Here are a few things to be aware between one-class SVM and isolation forest.\n", 252 | "\n", 253 | "* Both algorithms are capable of properly modeling multi-modal data sets.\n", 254 | "* One class SVM is sensitive to outliers, making it more appropriate for novelty detection, when the training data is not contaminated with outliers.\n", 255 | "* Since the splits of the decision tree are chosen at random, isolation forest is faster to train.\n", 256 | "* In general, SVM are slow to train, especially with respect to the training set size.\n", 257 | "\n", 258 | "Additionally, the two methods inherit the pros and cons of their parent algorithm. 
There are other outlier and novelty detection algorithms available in `scikit-learn` and a comparison and overview of other methods are outlined [here](https://scikit-learn.org/stable/auto_examples/plot_anomaly_comparison.html#sphx-glr-auto-examples-plot-anomaly-comparison-py)." 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "## Anomaly detection in a time series\n", 266 | "\n", 267 | "Anomaly detection can be applied to a time series where we want to create a baseline model and determine the deviation of the observations with the baseline. If the deviation is large enough, the observation is deemed anomalous and is flagged. In general, novelty and outlier detection does not tell us _why_ something is possibility an outlier, the conditions and causes that led to an unusual observation. For example, if we are observing server logs, anomalous observations may be a result of some equipment or code breakdown or something malignant like a security breach.\n", 268 | "\n", 269 | "In this case study, we analyze appliance energy use for a 4.5 month time period. Data was collected at a sampling rate of 10 minutes. Given the large variability in energy usage at a sampling rate of 10 minutes, we will resample the time series at an hourly interval. More of the data set can be learned [here](https://archive.ics.uci.edu/ml/datasets/Appliances+energy+prediction)." 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "df = pd.read_csv(\"data/energy_data.csv\", parse_dates=[\"date\"]) \n", 279 | "df = df.set_index('date')\n", 280 | "df_hourly = df.resample(\"H\").mean() # resample hourly\n", 281 | "\n", 282 | "energy = df_hourly['Appliances']\n", 283 | "energy.plot()\n", 284 | "plt.ylabel(\"energy (Wh)\");" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "**Questions**\n", 292 | "* What should our plan of attack be for analyzing the time series?\n", 293 | "* What do you observe in the time series?" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### Fourier Analysis\n", 301 | "\n", 302 | "While it is hard to tell, there are periodic behaviors in the time series. We can better spotlight the dominant frequencies that support the time series using Fourier analysis." 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "from scipy import fftpack\n", 312 | "\n", 313 | "sampling_rate = (energy.index[1] - energy.index[0]).total_seconds()\n", 314 | "sampling_rate = sampling_rate / (60 * 60 * 24) # day\n", 315 | "\n", 316 | "Y = fftpack.fft(energy - energy.mean())\n", 317 | "freq = np.linspace(0, 1/sampling_rate, len(Y))\n", 318 | "\n", 319 | "plt.plot(freq[:len(freq)//2], np.abs(Y[:len(Y)//2]))\n", 320 | "plt.xlabel(\"cycles per day\")\n", 321 | "plt.ylabel(\"Fourier transform\");" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "The time series has four dominant frequencies: daily, twice-daily, three times a day, and four times a day. In other words, 6, 8, 12, and 24 hour periods. Given our everyday experience, we probably would have anticipated these frequencies/periods but it is reassuring that they can be revealed via Fourier analysis." 
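To spell out the arithmetic behind those periods: the frequency axis above is in cycles per day, so a peak at $f$ cycles per day corresponds to a period of $24/f$ hours. A trivial check:

```python
# Frequency (cycles per day) -> period (hours) for the four dominant peaks
for cycles_per_day in [1, 2, 3, 4]:
    print("{} cycle(s) per day -> {:g} hour period".format(cycles_per_day, 24 / cycles_per_day))
```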
329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "### Incorporating day of the week\n", 336 | "\n", 337 | "With time series data, a commonly generated feature is the day of the week for the observations. It might be tempting to capture day of the week behavior using a sinusoidal component but let's analyze the energy usage for each day of the week to understand how to best model it." 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "df_day_of_week = pd.DataFrame({'day': energy.index.dayofweek, 'count': energy.values})\n", 347 | "grouped_by_day = df_day_of_week.groupby('day')\n", 348 | "\n", 349 | "grouped_by_day.mean().plot(kind='bar')\n", 350 | "plt.xticks(range(7), ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']);" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "The energy usage as a function of the day of the week fluctuates but is not periodic; it is best captured using one hot encoded features. We will include one hot encoded features for each day of the week to our baseline model." 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "### Initial baseline model\n", 365 | "\n", 366 | "We will need to create some custom transformers to work with our pandas times series data. Specifically, we need a transformer to extract out the indices, create Fourier components, and transform `datetime` objects into a unit of time. A common reference point used to translate a date into a unit of time is [Unix time](https://en.wikipedia.org/wiki/Unix_time). It is defined as the time since 00:00:00 Thursday, 1 January 1970 Coordinated Universal Time (UTC), minus leap seconds." 
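As a quick sanity check of this convention, pandas exposes the nanoseconds elapsed since the Unix epoch through `Timestamp.value`, which is exactly the quantity the `EpochTime` transformer below rescales; the particular timestamp used here is arbitrary.

```python
import pandas as pd

ts = pd.Timestamp("2016-03-01 12:00:00")
print(ts.value)                          # nanoseconds since 1970-01-01 00:00:00 UTC
print(ts.value / 1e9 / 60 / 60)          # the same instant expressed in hours since the epoch
print(pd.Timestamp("1970-01-01").value)  # 0 by definition
```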
367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 376 | "\n", 377 | "class IndexSelector(BaseEstimator, TransformerMixin):\n", 378 | "\n", 379 | " def __init__(self):\n", 380 | " \"\"\"Return indices of a data frame for use in other estimators.\"\"\"\n", 381 | " pass\n", 382 | "\n", 383 | " def fit(self, df, y=None):\n", 384 | " return self\n", 385 | "\n", 386 | " def transform(self, df):\n", 387 | " return df.index\n", 388 | "\n", 389 | "class FourierComponents(BaseEstimator, TransformerMixin):\n", 390 | "\n", 391 | " def __init__(self, freqs):\n", 392 | " \"\"\"Create features based on sin(2*pi*f*t) and cos(2*pi*f*t).\"\"\"\n", 393 | " self.freqs = freqs\n", 394 | "\n", 395 | " def fit(self, X, y=None):\n", 396 | " return self\n", 397 | "\n", 398 | " def transform(self, X):\n", 399 | " Xt = np.zeros((X.shape[0], 2 * len(self.freqs)))\n", 400 | " t_0 = X[0]\n", 401 | " for i, f in enumerate(self.freqs):\n", 402 | " Xt[:, 2 * i] = np.cos(2 * np.pi * f * (X)).reshape(-1)\n", 403 | " Xt[:, 2 * i + 1] = np.sin(2 * np.pi * f * (X)).reshape(-1)\n", 404 | "\n", 405 | " return Xt\n", 406 | "\n", 407 | "class EpochTime(BaseEstimator, TransformerMixin):\n", 408 | "\n", 409 | " def __init__(self, unit):\n", 410 | " \"\"\"Transform datetime object to some unit of time since the start of the epoch.\"\"\"\n", 411 | " self.unit = unit\n", 412 | "\n", 413 | " def fit(self, X, y=None):\n", 414 | " return self\n", 415 | "\n", 416 | " def transform(self, X):\n", 417 | " epoch_time = np.array([x.value for x in X])\n", 418 | "\n", 419 | " if self.unit == \"seconds\":\n", 420 | " return epoch_time / (1000000000)\n", 421 | " elif self.unit == \"minutes\":\n", 422 | " return epoch_time / (1000000000) / 60\n", 423 | " elif self.unit == \"hours\":\n", 424 | " return epoch_time / (1000000000) / 60 / 60\n", 425 | " else:\n", 426 | " return epoch_time\n", 427 | " \n", 428 | "class DayOfWeek(BaseEstimator, TransformerMixin):\n", 429 | "\n", 430 | " def __init__(self):\n", 431 | " \"\"\"Determine the day of the week for datetime objects.\"\"\"\n", 432 | " pass\n", 433 | "\n", 434 | " def fit(self, X, y=None):\n", 435 | " return self\n", 436 | "\n", 437 | " def transform(self, X):\n", 438 | " return np.array([x.dayofweek for x in X]).reshape(-1, 1)" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "Some additional useful functions for our analysis." 
446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "def ts_train_test_split(df, cutoff, target):\n", 455 | " \"\"\"Perform a train/test split on a data frame based on a cutoff date.\"\"\"\n", 456 | " \n", 457 | " ind = df.index < cutoff\n", 458 | " \n", 459 | " df_train = df.loc[ind]\n", 460 | " df_test = df.loc[~ind]\n", 461 | " y_train = df.loc[ind, target]\n", 462 | " y_test = df.loc[~ind, target]\n", 463 | " \n", 464 | " return df_train, df_test, y_train, y_test\n", 465 | "\n", 466 | "def plot_results(df, y_pred):\n", 467 | " \"\"\"Plot predicted results and residuals.\"\"\"\n", 468 | " \n", 469 | " plt.plot(df.index, y_pred, '-r')\n", 470 | " energy.plot()\n", 471 | " plt.ylabel('energy (Wh)')\n", 472 | " plt.legend(['true', 'predicted'])\n", 473 | " plt.show();\n", 474 | "\n", 475 | " plt.plot(resd)\n", 476 | " plt.ylabel('residual');" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "from sklearn.linear_model import LinearRegression\n", 486 | "from sklearn.pipeline import FeatureUnion, Pipeline\n", 487 | "from sklearn.preprocessing import OneHotEncoder\n", 488 | "\n", 489 | "# perform train/test split\n", 490 | "cutoff = \"Mar-2016\" # roughly corresponding to 10% of the data\n", 491 | "df_train, df_test, y_train, y_test = ts_train_test_split(df_hourly, cutoff, 'Appliances')\n", 492 | "\n", 493 | "# construct and train model\n", 494 | "freqs = np.array([1, 2, 3]) / 24 / 60 # 24, 12, and 8 hour periods\n", 495 | "selector = IndexSelector()\n", 496 | "epoch_time = EpochTime(\"minutes\")\n", 497 | "fourier_components = FourierComponents(freqs)\n", 498 | "one_hot = OneHotEncoder(sparse=False, categories='auto')\n", 499 | "lr = LinearRegression()\n", 500 | "\n", 501 | "fourier = Pipeline([(\"time\", epoch_time),\n", 502 | " (\"sine_cosine\", fourier_components)])\n", 503 | "day_of_week = Pipeline([(\"day\", DayOfWeek()),\n", 504 | " (\"encoder\", one_hot)])\n", 505 | "union = FeatureUnion([(\"fourier\", fourier),\n", 506 | " (\"day_of_week\", day_of_week)])\n", 507 | "\n", 508 | "pipe = Pipeline([(\"indices\", selector),\n", 509 | " (\"union\", union),\n", 510 | " (\"regressor\", lr)])\n", 511 | "pipe.fit(df_train, y_train)\n", 512 | "\n", 513 | "# make predictions\n", 514 | "y_pred = pipe.predict(df_hourly)\n", 515 | "resd = energy - y_pred\n", 516 | "print(\"Test set R^2: {:g}\".format(pipe.score(df_test, y_test)))\n", 517 | "plot_results(df_hourly, y_pred)" 518 | ] 519 | }, 520 | { 521 | "cell_type": "markdown", 522 | "metadata": {}, 523 | "source": [ 524 | "It is very apparent that the initial baseline model is not adequate for the time series. The residuals reveal:\n", 525 | "\n", 526 | "1. No long term trends.\n", 527 | "1. The time series has a lot of shock events, large increases in energy use, probably as a result of sudden and short use of an appliance.\n", 528 | "\n", 529 | "Let's next analyze the residuals for any temporal correlations." 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "metadata": {}, 535 | "source": [ 536 | "### Noise based features\n", 537 | "\n", 538 | "The first thing we want to unveil is the correlation of past residuals with current values. An autocorrelation plot will inform us of the characteristic time scale of the process to guide us when generating noise based features." 
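Alongside the plot, the correlation at specific lags can be read off numerically with `Series.autocorr`; this sketch assumes the residual series `resd` computed from the baseline model above is still in scope.

```python
# Autocorrelation of the baseline residuals at a few chosen lags (in hours).
for lag in [1, 10, 20, 48]:
    print("lag {:>2} h: autocorrelation = {:.3f}".format(lag, resd.autocorr(lag=lag)))
```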
539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": {}, 545 | "outputs": [], 546 | "source": [ 547 | "from pandas.plotting import autocorrelation_plot\n", 548 | "\n", 549 | "autocorrelation_plot(resd)\n", 550 | "plt.xlabel('Lag (hour)')\n", 551 | "plt.xlim([0, 50]);" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "The time scale appears to be anywhere from 10 to 20 hours. Let's incorporate window based features from our residuals to improve our model." 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": {}, 565 | "outputs": [], 566 | "source": [ 567 | "from sklearn.base import RegressorMixin\n", 568 | "\n", 569 | "class ResidualFeatures(BaseEstimator, TransformerMixin):\n", 570 | " def __init__(self, window=100):\n", 571 | " \"\"\"Generate features based on window statistics of past noise/residuals.\"\"\"\n", 572 | " self.window = window\n", 573 | " \n", 574 | " def fit(self, X, y=None):\n", 575 | " return self\n", 576 | " \n", 577 | " def transform(self, X):\n", 578 | " df = pd.DataFrame()\n", 579 | " df['residual'] = X\n", 580 | " df['prior'] = df['residual'].shift(1)\n", 581 | " df['mean'] = df['residual'].rolling(window=self.window).mean()\n", 582 | " df['diff'] = df['residual'].diff().rolling(window=self.window).mean()\n", 583 | " df = df.fillna(method='bfill')\n", 584 | " \n", 585 | " return df\n", 586 | " \n", 587 | "class FullModel(BaseEstimator, RegressorMixin):\n", 588 | " def __init__(self, baseline, residual_model, steps=20):\n", 589 | " \"\"\"Combine a baseline and residual model to predict any number of steps in the future.\"\"\"\n", 590 | " \n", 591 | " self.baseline = baseline\n", 592 | " self.residual_model = residual_model\n", 593 | " self.steps = steps\n", 594 | " \n", 595 | " def fit(self, X, y):\n", 596 | " self.baseline.fit(X, y)\n", 597 | " resd = y - self.baseline.predict(X)\n", 598 | " self.residual_model.fit(resd.iloc[:-self.steps], resd.shift(-self.steps).dropna())\n", 599 | " \n", 600 | " return self\n", 601 | " \n", 602 | " def predict(self, X):\n", 603 | " y_b = pd.Series(self.baseline.predict(X), index=X.index)\n", 604 | " resd = X['Appliances'] - y_b\n", 605 | " resd_pred = pd.Series(self.residual_model.predict(resd), index=X.index)\n", 606 | " resd_pred = resd_pred.shift(self.steps)\n", 607 | " y_pred = y_b + resd_pred\n", 608 | " \n", 609 | " return y_pred" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "from sklearn.metrics import r2_score\n", 619 | "\n", 620 | "# construct residual model\n", 621 | "resd_train = y_train - pipe.predict(df_train)\n", 622 | "residual_feats = ResidualFeatures(window=20)\n", 623 | "residual_model = Pipeline([('residual_features', residual_feats), ('regressor', LinearRegression())])\n", 624 | " \n", 625 | "# construct and train full model\n", 626 | "full_model = FullModel(pipe, residual_model, steps=1)\n", 627 | "full_model.fit(df_train, y_train)\n", 628 | "\n", 629 | "# make predictions\n", 630 | "y_pred = full_model.predict(df_hourly)\n", 631 | "resd = energy - y_pred\n", 632 | "ind = resd.index > cutoff\n", 633 | "print(\"Test set R^2: {:g}\".format(r2_score(energy.loc[ind], y_pred.loc[ind])))\n", 634 | "plot_results(df_hourly, y_pred)" 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": {}, 640 | "source": [ 641 | "Admittedly, our baseline model is 
not great at predicting future energy use. However, we can still utilize our baseline model for anomaly detection. Our analysis will focus on the final residuals of our baseline model. If an observation deviates significantly from the baseline, it will be flagged. The plot below illustrates the distribution and autocorrelation for our final residuals." 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "metadata": {}, 648 | "outputs": [], 649 | "source": [ 650 | "resd.hist(bins=50, density=True);\n", 651 | "plt.show()\n", 652 | "\n", 653 | "autocorrelation_plot(resd.dropna())\n", 654 | "plt.xlabel(\"Lag (hours)\")\n", 655 | "plt.xlim([0, 100]);" 656 | ] 657 | }, 658 | { 659 | "cell_type": "markdown", 660 | "metadata": {}, 661 | "source": [ 662 | "**Questions**\n", 663 | "* What conclusion can you make from the autocorrelation plot?\n", 664 | "* What do you observe in the distribution of residuals?\n", 665 | "* What is a good basis to use when determining whether the magnitude of a deviation is large?" 666 | ] 667 | }, 668 | { 669 | "cell_type": "markdown", 670 | "metadata": {}, 671 | "source": [ 672 | "## z-Score\n", 673 | "\n", 674 | "Since there is little temporal correlation with residual values, we can assume that the residuals are independently sampled from the same distribution. Given this probabilistic perspective, we can quantify the degree of anomaly to each observation *if* we know the distribution the residuals are being sampled from. If the distribution has one peak, there is a lower probability of observing values far from the peak. The z-score is a relative measure of how far away a value is from the mean, normalized by the standard deviation.\n", 675 | "\n", 676 | "$$\n", 677 | "z = \\frac{x - \\mu}{\\sigma},\n", 678 | "$$\n", 679 | "\n", 680 | "where $\\mu$ and $\\sigma$ are the mean and standard deviation of distribution. The larger the magnitude of the z-score, the lower the probability of observing the value. Exact percentages can only be known if we know the distribution. When the distribution is a normal or Gaussian distribution given by\n", 681 | "\n", 682 | "$$\n", 683 | "p(x) = \\frac{1}{\\sqrt{2\\pi \\sigma}}\\exp\\left(-\\frac{(x-\\mu)^2}{2\\sigma^2} \\right),\n", 684 | "$$\n", 685 | "\n", 686 | "68% of the values will reside within $z = \\pm 1$. 95% and 99.7% of the values will reside in an interval of $z = \\pm 2$ and $z = \\pm 3$, respectively.\n", 687 | "\n", 688 | "Strictly speaking, our distribution of our residuals are not normal but it has a single peak and not heavily skewed. The general idea of the greater the magnitude of the z-score the more anomalous the observation is still valid. Let's calculate the z-score for each residual and display the results. Since our distribution is slightly skewed towards positive values, we will use a different z-score cutoff whether the residual is negative or positive." 
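The 68%, 95%, and 99.7% figures quoted above can be verified directly from the standard normal CDF, for example with `scipy.stats.norm`:

```python
from scipy.stats import norm

# Probability mass of a standard normal distribution within +/- z
for z_cut in [1, 2, 3]:
    frac = norm.cdf(z_cut) - norm.cdf(-z_cut)
    print("within z = +/-{}: {:.1%}".format(z_cut, frac))
```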
689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": null, 694 | "metadata": {}, 695 | "outputs": [], 696 | "source": [ 697 | "z = (resd - resd.mean())/ resd.std()\n", 698 | "z.plot()\n", 699 | "pd.Series(3, index=resd.index).plot(color=\"r\")\n", 700 | "pd.Series(-2, index=resd.index).plot(color=\"r\")\n", 701 | "plt.ylabel(\"z-score\")\n", 702 | "plt.legend([\"residual\", \"z-score cutoff\"]);" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "metadata": {}, 709 | "outputs": [], 710 | "source": [ 711 | "def find_anomalies(z, cutoff_lower=-2, cutoff_upper=2):\n", 712 | " ind_lower = z < cutoff_lower\n", 713 | " ind_upper = z > cutoff_upper\n", 714 | " \n", 715 | " return z[ind_lower | ind_upper]\n", 716 | "\n", 717 | "find_anomalies(z, cutoff_lower=-2, cutoff_upper=3)" 718 | ] 719 | }, 720 | { 721 | "cell_type": "markdown", 722 | "metadata": {}, 723 | "source": [ 724 | "**Questions**\n", 725 | "* How should we decide the appropriate z-score cutoff? What are important ideas and consequences should we consider?\n", 726 | "* If we are concerned at identifying as much of the anomalies as possible, how should the z-score cutoff be set?" 727 | ] 728 | }, 729 | { 730 | "cell_type": "markdown", 731 | "metadata": {}, 732 | "source": [ 733 | "## Rolling z-score\n", 734 | "\n", 735 | "The calculation of the z-score relied on the entire time series for calculating the mean and standard deviation. For anomaly detection with time series, we will usually be streaming observations and the entire series will not be available. Instead, we can calculate the z-score on a window of observations rather than the entire time history. Using a window has the advantage of \n", 736 | "\n", 737 | "1. Not having to hold in memory a large amount of data.\n", 738 | "2. Reflecting the fact that it is better to use recent values to judge whether an observation is anomalous.\n", 739 | "3. Being more adaptive to recent changes in the process.\n", 740 | "\n", 741 | "Let's modify the z-score calculation to only use a window of observations." 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": null, 747 | "metadata": {}, 748 | "outputs": [], 749 | "source": [ 750 | "def rolling_z_score(x, window=20):\n", 751 | " rolling = x.rolling(window=window)\n", 752 | " mean_roll = rolling.mean().shift(1) # shift to not include current value\n", 753 | " std_roll = rolling.std().shift(1)\n", 754 | " \n", 755 | " return (x - mean_roll) / std_roll\n", 756 | "\n", 757 | "z_roll = rolling_z_score(resd, window=20)\n", 758 | "z_roll.plot()\n", 759 | "pd.Series(3, index=resd.index).plot(color=\"r\")\n", 760 | "pd.Series(-2, index=resd.index).plot(color=\"r\")\n", 761 | "plt.ylabel(\"z-score\")\n", 762 | "plt.legend([\"residual\", \"z-score cutoff\"]);\n", 763 | "\n", 764 | "find_anomalies(z_roll, cutoff_lower=-2, cutoff_upper=3)" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "## Exercises" 772 | ] 773 | }, 774 | { 775 | "cell_type": "markdown", 776 | "metadata": {}, 777 | "source": [ 778 | "1. Can you improve on the baseline model for the case study? Note, there are other recorded values in the data set such as the home temperature and humidity. These other time series may help improve the $R^2$ value of the model.\n", 779 | "1. Package an anomaly detector for our time series case study into a class. What should be some hyperparameters to the model? 
Note, starting with version 0.20, `scikit-learn` has the `OutlierMixin` class that your custom class could inherit form." 780 | ] 781 | }, 782 | { 783 | "cell_type": "markdown", 784 | "metadata": {}, 785 | "source": [ 786 | "*Copyright © 2020 The Data Incubator. All rights reserved.*" 787 | ] 788 | } 789 | ], 790 | "metadata": { 791 | "kernelspec": { 792 | "display_name": "Python 3", 793 | "language": "python", 794 | "name": "python3" 795 | }, 796 | "nbclean": true 797 | }, 798 | "nbformat": 4, 799 | "nbformat_minor": 0 800 | } 801 | -------------------------------------------------------------------------------- /datacourse/machine-learning/ML_Clustering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "init_cell": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%logstop\n", 12 | "%logstart -rtq ~/.logs/ML_Clustering.py append\n", 13 | "%matplotlib inline\n", 14 | "import matplotlib\n", 15 | "import seaborn as sns\n", 16 | "sns.set()\n", 17 | "matplotlib.rcParams['figure.dpi'] = 144" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import matplotlib.pyplot as plt\n", 27 | "import numpy as np" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "# Clustering\n", 35 | "\n", 36 | "Clustering is a branch of unsupervised machine learning where the goal is to identify groups or clusters in your data set without the use of labels. Clustering should not be considered the same as classification; we are not trying make predictions on observations from a set of classes. In clustering, you are identifying a set of similar data points and calling the resulting set a cluster.\n", 37 | "\n", 38 | "Let's consider an example of clustering. You may have a data set characterizing your customers like demographic information and personal preferences. A supervised machine learning application would be to determine whether a person will buy a product. However, an _unsupervised_ machine learning application would be to identify several groups or types of customers. With these groups identified, you can analyze the groups and build profiles describing the groups. For example, one group tends to include people from the ages 20 to 25 who like the outdoors. With these profiles, you can pass that information and analysis to the marketing team to create different advertisements to best attract each group.\n", 39 | "\n", 40 | "In this notebook, we will discuss clustering metrics and two common algorithms for clustering. In the code below, we demonstrate the result of determining three clusters in a data set." 
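For the second exercise, one possible starting point (a sketch only; the class name, default cutoffs, and the assumption that the input is a pandas Series of baseline residuals are all placeholders) is to wrap the rolling z-score logic above in an estimator that inherits from `OutlierMixin`:

```python
import numpy as np
from sklearn.base import BaseEstimator, OutlierMixin

class RollingZScoreDetector(BaseEstimator, OutlierMixin):
    """Flag points whose rolling z-score falls outside the given cutoffs."""

    def __init__(self, window=20, cutoff_lower=-2, cutoff_upper=3):
        self.window = window
        self.cutoff_lower = cutoff_lower
        self.cutoff_upper = cutoff_upper

    def fit(self, X, y=None):
        # Nothing is learned here; the method exists for API compatibility.
        return self

    def predict(self, X):
        # X is assumed to be a pandas Series of baseline-model residuals.
        rolling = X.rolling(window=self.window)
        z = (X - rolling.mean().shift(1)) / rolling.std().shift(1)
        is_outlier = (z < self.cutoff_lower) | (z > self.cutoff_upper)
        return np.where(is_outlier, -1, 1)  # scikit-learn convention: -1 outlier, +1 inlier
```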
41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "from sklearn.datasets import make_blobs\n", 50 | "from sklearn.cluster import KMeans\n", 51 | "\n", 52 | "# generate data\n", 53 | "X, _ = make_blobs(n_samples=300, n_features=2, centers=3, cluster_std=2, random_state=0)\n", 54 | "\n", 55 | "# fit cluster model and assign points to clusters\n", 56 | "kmeans = KMeans(n_clusters=3)\n", 57 | "kmeans.fit(X)\n", 58 | "clusters = kmeans.predict(X)\n", 59 | "\n", 60 | "plt.scatter(*X.T, c=clusters, cmap='viridis');" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "In `scikit-learn`, clustering models have the same interface as predictors, albeit they do not make predictions in the same sense as classifiers but rather perform assignments. To train a clustering model object with data set `X`, simply invoke the `fit(X)`. To assign clusters, use `predict(X)`." 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Metrics for clustering\n", 75 | "\n", 76 | "Because clustering is unsupervised learning, we cannot rely on metrics based on labels; there is not a right or wrong answer. However, it is still possible to derive metrics to evaluate the performance of clustering algorithms. More care and analysis will be required when comparing multiple clustering results due to the nature of the derived metrics. Previously in supervised machine learning, we could objectively say one model will perform better because it has a lower test error.\n", 77 | "\n", 78 | "Two common metrics for clustering are\n", 79 | "\n", 80 | "1. **Inertia**: the within cluster sum of square distance\n", 81 | "1. **Silhouette Coefficient**: a measure of how dense and separated are the clusters\n", 82 | "\n", 83 | "Mathematically, inertia is equal to\n", 84 | "\n", 85 | "$$ \\sum_{k} \\sum_{X_j \\in C_k} \\| X_j - \\mu_k \\|^2, $$\n", 86 | "\n", 87 | "where $\\mu_k$ is the centroid of cluster $k$ and $C_k$ is the set of points assigned to cluster $k$. Basically, the inertia is the sum of the distance of each point to the centroid or center of its assigned cluster. A lower inertia means the points assigned to the clusters are closer to the centroid.\n", 88 | "\n", 89 | "The silhouette coefficient is a property assigned to each data point. It is equal to\n", 90 | "\n", 91 | "$$ \\frac{b - a}{\\max(a, b)}, $$\n", 92 | "\n", 93 | "where $a$ is the distance between a point and centroid of its assigned cluster and $b$ is the distance between the point and the centroid of the nearest neighboring cluster (i.e. the closest cluster the point is not assigned to). The silhouette coefficient ranges from -1 to 1. If a point is really close to the centroid of its assigned cluster, then $a \\ll b$ and the silhouette coefficient will be approximately equal to 1. If the reverse is true, $a \\gg b$, then the coefficient will be -1. If the point could have been assigned to either cluster, its coefficient will be 0. Maximizing the silhouette coefficient will prioritize dense and highly separated clusters as dense clusters will have a low $a$ value and having clusters well separated from each other will increase $b$." 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## $K$-Means Clustering\n", 101 | "\n", 102 | "The $K$-means algorithms seeks to find $K$ clusters within a data set. 
The clusters are chosen to reduce the inertia; the objective function is\n", 103 | "\n", 104 | "$$ \\min_{C_k} \\sum_{k} \\sum_{X_j \\in C_k} \\| X_j - \\mu_k \\|^2. $$\n", 105 | "\n", 106 | "The centroid of a cluster $\\mu_k$ is equal to\n", 107 | "\n", 108 | "$$ \\mu_k = \\frac{1}{|C_k|} \\sum_{X_j \\in C_k} X_j, $$\n", 109 | "\n", 110 | "where $|C_k|$ is the number of points in cluster $k$. The equation says that the components of the centroid are equal to the _mean_ of each feature/component of all points assigned to the cluster, hence the name of the algorithm. The training algorithm for $K$-means is straightforward. After seeding the algorithm, choosing the starting locations of each cluster's centroid,\n", 111 | "\n", 112 | "1. Assign each point to a cluster based on which cluster centroid it is closest to\n", 113 | "1. Calculate the centroid of the resulting cluster using the points that have been assigned to the cluster\n", 114 | "1. Repeat the above steps until convergence is reached\n", 115 | "\n", 116 | "Let's create an interactive plot that allows users to walk through the iterations involved in the algorithm." 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "from ipywidgets import interact\n", 126 | "\n", 127 | "def plot_kmeans_steps(X, n_clusters, seeds=np.array([[0, 4], [4, 2], [-2, 4]])):\n", 128 | "    def func(step=0):\n", 129 | "        iters = step // 2\n", 130 | "        \n", 131 | "        if iters:\n", 132 | "            kmeans = KMeans(n_clusters=n_clusters, max_iter=iters, n_init=1, init=seeds)\n", 133 | "            kmeans.fit(X)\n", 134 | "            centroids = kmeans.cluster_centers_\n", 135 | "            labels = kmeans.labels_\n", 136 | "        else:\n", 137 | "            centroids = seeds\n", 138 | "            labels = '0.5'\n", 139 | "        if step % 2:\n", 140 | "            kmeans = KMeans(n_clusters=n_clusters, max_iter=iters+1, n_init=1, init=seeds)\n", 141 | "            kmeans.fit(X)\n", 142 | "            labels = kmeans.labels_\n", 143 | "        \n", 144 | "        plt.scatter(*X.T, c=labels, cmap='viridis', alpha=0.5)\n", 145 | "        plt.scatter(*centroids.T, c=range(n_clusters), cmap='viridis', marker='*', s=150, \n", 146 | "                    linewidths=1, edgecolors='k')\n", 147 | "        plt.title(['Set Centroids', 'Assign Clusters'][step % 2])\n", 148 | "    \n", 149 | "    return func\n", 150 | "\n", 151 | "interact(plot_kmeans_steps(X, n_clusters=3), step=(0, 10));" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### Implementation details\n", 159 | "\n", 160 | "There are several things to keep in mind about the $K$-means algorithm. While the algorithm is guaranteed to converge, it is not guaranteed to converge to the _global_ minimum. It is a greedy algorithm; it is relatively quick to run but we are at risk of obtaining a suboptimal solution. Different starting positions may result in different solutions. To counteract suboptimal solutions, `KMeans` by default runs the algorithm ten times and chooses the best result. The number of runs is controlled by `n_init`. The algorithm for the initial centroid locations is controlled by the keyword `init`; the default is `'k-means++'`, which has been shown to work well and results in faster convergence. You can read more about the `k-means++` algorithm [here](https://en.wikipedia.org/wiki/K-means%2B%2B)."
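, "\n", "As a minimal sketch (an addition to the original notebook, reusing the `X` array from the blobs example above), these keywords can be set explicitly and the resulting inertia compared; exact numbers depend on the random seed:\n", "\n", "```python\n", "# one random initialization vs. the default k-means++ initialization with 10 restarts\n", "kmeans_single = KMeans(n_clusters=3, init='random', n_init=1, random_state=1).fit(X)\n", "kmeans_multi = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=1).fit(X)\n", "print('single random start inertia:', kmeans_single.inertia_)\n", "print('k-means++ (10 restarts) inertia:', kmeans_multi.inertia_)\n", "```"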
161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "**Questions**\n", 168 | "* What is the general behavior of the inertia as we increase the number of clusters?\n", 169 | "* Is it important to scale our data set when using $K$-means? If so, what `scikit-learn` tool would we use?" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "## Choosing the number of clusters\n", 177 | "\n", 178 | "The number of clusters is a hyperparameter to clustering models; it is set prior to training. How does one choose the optimal number of clusters? We expect that as we use more clusters to group our observations, the inertia goes down because the points will be closer to their cluster's centroid. What is the inertia when the number of clusters is equal to the number of observations? If every point is its own cluster, the inertia is zero. Choosing the number of clusters that yields the lowest inertia therefore results in a meaningless solution. Instead, we need to identify at what point increasing the number of clusters no longer results in an appreciable drop in inertia (the point of \"diminishing returns\"). Identifying this region is accomplished using an \"elbow plot\", named because the graph looks like a bent arm. Let's construct an elbow plot with the California housing data set." 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "from sklearn.datasets import fetch_california_housing\n", 188 | "from sklearn.cluster import KMeans\n", 189 | "from sklearn.preprocessing import StandardScaler\n", 190 | "from sklearn.utils import shuffle\n", 191 | "\n", 192 | "# import and shuffle the data\n", 193 | "data = fetch_california_housing()\n", 194 | "X = data['data']\n", 195 | "X_shuffled = shuffle(X, random_state=0)\n", 196 | "\n", 197 | "scaler = StandardScaler() # need to scale data to make kmeans work properly\n", 198 | "Xt = scaler.fit_transform(X_shuffled[:500]) # down sample for speed\n", 199 | "n_clusters = range(1, 200, 2)\n", 200 | "inertia = []\n", 201 | "\n", 202 | "for n in n_clusters:\n", 203 | "    kmeans = KMeans(n_clusters=n)\n", 204 | "    kmeans.fit(Xt)\n", 205 | "    inertia.append(kmeans.inertia_)\n", 206 | "\n", 207 | "plt.plot(n_clusters, np.array(inertia)/inertia[0])  # convert the list to an array so the division broadcasts\n", 208 | "plt.hlines(0.1, n_clusters[0], n_clusters[-1], 'r', linestyles='dashed')\n", 209 | "plt.hlines(0.05, n_clusters[0], n_clusters[-1], 'r', linestyles='dashed')\n", 210 | "plt.xlabel('clusters')\n", 211 | "plt.ylabel('relative inertia')\n", 212 | "plt.legend(['inertia', '10% relative inertia', '5% relative inertia']);" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "The elbow plot shows us that there is little drop in inertia after about 50 or 100 clusters, with most of the drop occurring before 25. As discussed earlier, one cannot look at this plot and definitively say whether using 50 clusters is better than using 100 clusters. Instead, we justify our choice of cluster size given the analysis and understand there is some arbitrariness in the decision. Further, rarely do machine learning problems exist in a bubble; there are usually constraints imposed by business and financial considerations. For example, in the clustering of our consumer data for advertisement purposes, it may only be financially practical to identify no more than 10 groups."
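, "\n", "One way to make the _diminishing returns_ judgment reproducible is to pick the smallest number of clusters whose relative inertia falls below a chosen threshold. A rough sketch (not part of the original notebook; it reuses the `inertia` list and `n_clusters` range from the cell above, and the 10% cutoff is an arbitrary choice):\n", "\n", "```python\n", "rel_inertia = np.array(inertia) / inertia[0]\n", "k_10pct = next(k for k, r in zip(n_clusters, rel_inertia) if r < 0.10)\n", "print('smallest k with relative inertia below 10%:', k_10pct)\n", "```"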
220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "## Elongated clusters\n", 227 | "\n", 228 | "Consider applying $K$-means to a data set where points are clustered in elongated shapes. We can visualize the data and manually identify the clusters. Let's see what happens when we try to use $K$-means with a data set consisting of points arranged in elongated shapes." 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "def elongated(clustering=None):\n", 238 | "    X, _ = make_blobs(n_samples=300, n_features=2, cluster_std=0.5, random_state=0)\n", 239 | "\n", 240 | "    # elongate the data\n", 241 | "    Xt = np.dot(X, [[0.6, -0.64], [-0.4, 0.9]])\n", 242 | "\n", 243 | "    if clustering is None:\n", 244 | "        plt.scatter(*Xt.T, cmap='viridis')\n", 245 | "    else:\n", 246 | "        kmeans = KMeans(n_clusters=3)\n", 247 | "        kmeans.fit(Xt)\n", 248 | "        clusters = kmeans.predict(Xt)\n", 249 | "        plt.scatter(*Xt.T, c=clusters, cmap='viridis');\n", 250 | "    \n", 251 | "interact(elongated, clustering=[None, 'kmeans']);" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "$K$-means did not identify what we would consider to be the obvious clusters in the data set. Remember, the clusters are chosen to reduce inertia, which is a measure of how compact the clusters are; minimizing it favors roughly isotropic clusters." 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "## Gaussian mixture models\n", 266 | "\n", 267 | "Gaussian mixture models are an alternative to $K$-means clustering. As seen previously, $K$-means fails to identify clusters with elongated shapes. Gaussian mixture models offer additional capabilities compared to $K$-means: cluster definitions are probabilistic, and the model can capture elongated shapes. Gaussian mixture models define a probability density function over the feature space by adding or \"mixing\" several multivariate Gaussians. \n", 268 | "\n", 269 | "To understand Gaussian mixture models, let's first define and plot a multivariate Gaussian. A multivariate Gaussian is simply a Gaussian distribution for more than one dimension. As with the one dimensional Gaussian, it has two parameters. The mean $\\mu_i$ is now a vector since we have multiple dimensions and the variance has been replaced with the **covariance** matrix. Not only does the covariance matrix control how spread out the distribution is, but also how correlated the variables are. If the variables are highly correlated, then the distribution will look stretched. The covariance matrix contains all possible pair-wise covariance values for all variables. For the two dimensional case, the covariance matrix has three different parameters: the variance of each variable, $\\sigma^2_{x}$ and $\\sigma^2_{y}$, and the covariance between the two variables, $\\mathrm{cov}(x, y)$.\n", 270 | "\n", 271 | "$$ \\Sigma = \\left[ \\begin{array}{cc} \n", 272 | "\\sigma^2_{x} & \\mathrm{cov}(x, y) \\\\\n", 273 | "\\mathrm{cov}(x, y) & \\sigma^2_{y} \\\\\n", 274 | "\\end{array} \\right]$$\n", 275 | "\n", 276 | "Let's build an interactive plot where we can see the effect of modifying the covariance matrix for a bivariate Gaussian. We can adjust all three parameters involved in the covariance matrix. Note, the correlation coefficient $\\rho$ is just a normalization of the covariance. 
In this case, it is equal to $\\rho = \\mathrm{cov}(x, y)/ (\\sigma_{x} \\sigma_{y})$." 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "from ipywidgets import FloatSlider\n", 286 | "import scipy as sp\n", 287 | "\n", 288 | "def multivariate_gaussian(corr=0, sigma_x=1, sigma_y=1):\n", 289 | "    cov = sigma_x*sigma_y*np.array([[sigma_x/sigma_y, corr], [corr, sigma_y/sigma_x]])  # diagonal terms give sigma_x**2 and sigma_y**2\n", 290 | "    dist = sp.stats.multivariate_normal(mean=[0, 0], cov=cov)\n", 291 | "    X = dist.rvs(1000, random_state=0)\n", 292 | "    \n", 293 | "    n = 100\n", 294 | "    xlims = [-4, 4]\n", 295 | "    ylims = [-4, 4]\n", 296 | "    X1, X2 = np.meshgrid(np.linspace(*xlims, n), np.linspace(*ylims, n))\n", 297 | "    proba = dist.pdf(np.hstack((X1.reshape(-1, 1), X2.reshape(-1, 1))))\n", 298 | "    \n", 299 | "    plt.scatter(*X.T, alpha=0.25)\n", 300 | "    plt.contour(X1, X2, proba.reshape(100, 100))\n", 301 | "    plt.xlim(xlims)\n", 302 | "    plt.ylim(ylims)\n", 303 | "    plt.xlabel('$x$')\n", 304 | "    plt.ylabel('$y$')\n", 305 | "\n", 306 | "corr_slider = FloatSlider(min=-0.99, max=0.99, value=0., step=0.01, description='$\\\\rho$')\n", 307 | "sigma_x_slider = FloatSlider(min=0.5, max=1.5, value=1., step=0.01, description='$\\sigma_x$')\n", 308 | "sigma_y_slider = FloatSlider(min=0.5, max=1.5, value=1., step=0.01, description='$\\sigma_y$')\n", 309 | "interact(multivariate_gaussian, corr=corr_slider, sigma_x=sigma_x_slider, sigma_y=sigma_y_slider);" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "Notice how the covariance matrix controls the elongation of the distribution. It is this \"stretching\" capability that enables Gaussian mixture models to create anisotropic clusters. The probability density across our feature space is defined as\n", 317 | "\n", 318 | "$$ p(x_j) = \\sum_k \\phi_k \\mathcal{N}(x_j; \\mu_k, \\Sigma_k), $$\n", 319 | "\n", 320 | "where $\\mathcal{N}(x_j; \\mu_k, \\Sigma_k)$ is a multivariate Gaussian parameterized by $\\mu_k$ and $\\Sigma_k$, and $\\phi_k$ is the mixing weight of component $k$ (the weights are non-negative and sum to one). When we fit a mixture model to a training set, we are trying to determine the $\\phi_k$, $\\mu_k$, and $\\Sigma_k$ that maximize the likelihood function\n", 321 | "\n", 322 | "$$ L(\\phi_k, \\mu_k, \\Sigma_k) = \\prod_j p(x_j).$$\n", 323 | "\n", 324 | "The most common algorithm used to fit Gaussian mixture models is the expectation-maximization algorithm, a two-step iterative scheme. More about the algorithm can be read [here]( https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm). As with other complicated models, the additional complexity results in slower training times, and Gaussian mixture models can be difficult to train on large data sets."
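, "\n", "One practical consequence of the probabilistic formulation is that a fitted mixture model can report _soft_ assignments, i.e., the probability that each observation belongs to each component, rather than only a hard label. A minimal, self-contained sketch (an illustration added here, not code from the original notebook; the variable names are arbitrary):\n", "\n", "```python\n", "from sklearn.datasets import make_blobs\n", "from sklearn.mixture import GaussianMixture\n", "\n", "X_demo, _ = make_blobs(n_samples=300, centers=3, random_state=0)\n", "gm_demo = GaussianMixture(n_components=3, random_state=0).fit(X_demo)\n", "\n", "# one row per observation, one probability per mixture component\n", "print(gm_demo.predict_proba(X_demo[:3]).round(3))\n", "```"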
325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "from sklearn.mixture import GaussianMixture\n", 334 | "\n", 335 | "X, _ = make_blobs(n_samples=300, n_features=2, cluster_std=0.5, random_state=0)\n", 336 | "Xt = np.dot(X, [[0.6, -0.64], [-0.4, 0.9]])\n", 337 | "\n", 338 | "gm = GaussianMixture(n_components=3, covariance_type='full')\n", 339 | "gm.fit(Xt)\n", 340 | "clusters = gm.predict(Xt)\n", 341 | "\n", 342 | "xlims = [-3, 2]\n", 343 | "ylims = [-2, 5]\n", 344 | "X1, X2 = np.meshgrid(np.linspace(*xlims, 100), np.linspace(*ylims, 100))\n", 345 | "proba = gm.score_samples(np.hstack((X1.reshape(-1, 1), X2.reshape(-1, 1))))\n", 346 | "\n", 347 | "plt.contour(X1, X2, np.log(-proba.reshape(100, 100)), 10)\n", 348 | "plt.colorbar()\n", 349 | "plt.xlim(xlims)\n", 350 | "plt.ylim(ylims)\n", 351 | "plt.xlabel('$x_1$')\n", 352 | "plt.ylabel('$x_2$')\n", 353 | "plt.scatter(*Xt.T, c=clusters, cmap='viridis')\n", 354 | "plt.title('Negative Log Likelihood');" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "## Exercises" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "1. The silhouette coefficient can also be used for choosing the number of clusters. `scikit-learn` has a function to calculate the silhouette coefficient: `from sklearn.metrics import silhouette_score`. Compare the silhouette coefficient as a function of the number of clusters with the elbow plot displayed in the notebook.\n", 369 | "\n", 370 | "1. For the California housing data, create an elbow plot using Gaussian mixture models. What would be a good choice for the number of clusters? How does the training time compare with $K$-means?" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "*Copyright © 2020 The Data Incubator. All rights reserved.*" 378 | ] 379 | } 380 | ], 381 | "metadata": { 382 | "kernelspec": { 383 | "display_name": "Python 3", 384 | "language": "python", 385 | "name": "python3" 386 | }, 387 | "nbclean": true 388 | }, 389 | "nbformat": 4, 390 | "nbformat_minor": 0 391 | } 392 | -------------------------------------------------------------------------------- /datacourse/machine-learning/ML_Dimension_Reduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "init_cell": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%logstop\n", 12 | "%logstart -rtq ~/.logs/ML_Dimension_Reduction.py append\n", 13 | "%matplotlib inline\n", 14 | "import matplotlib\n", 15 | "import seaborn as sns\n", 16 | "sns.set()\n", 17 | "matplotlib.rcParams['figure.dpi'] = 144" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import matplotlib.pyplot as plt\n", 27 | "import numpy as np" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "# Dimension Reduction\n", 35 | "\n", 36 | "Dimension reduction is an unsupervised learning technique; no labels are used for the process. The goal of dimension reduction is to take observations characterized by a set of features and reduce the number of features. Hence, we are _reducing_ the dimension of our data points. 
For example, instead of using 100 features to characterize each observation, dimension reduction techniques allow us to represent the data using a truncated set, e.g., using 10 features instead of the original 100. While reducing the dimension will result in some information loss, the algorithms we will discuss aim to keep this loss at a minimum.\n", 37 | "\n", 38 | "Several applications of dimension reduction are:\n", 39 | "\n", 40 | "1. Reducing file sizes\n", 41 | "1. Visualizing high dimensional data sets\n", 42 | "1. Faster training and predicting times for supervised machine learning models\n", 43 | "1. Generating a better, truncated, set of new features to represent our data\n", 44 | "\n", 45 | "The notebook will discuss three commonly used dimension reduction techniques and how they are implemented in `scikit-learn`." 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## Mathematics of dimension reduction\n", 53 | "\n", 54 | "Dimension reduction techniques work by first creating a new set of dimensions/axes and _projecting_ the data to the new space. The process of projecting is a matrix multiplication,\n", 55 | "\n", 56 | "$$ X' = XP,$$\n", 57 | "\n", 58 | "where $X$ is the matrix of our original data, $n$ observations and $p$ columns/features, $X'$ is the matrix of our data in the new space, and $P$ is the matrix that projects our data onto the new feature space. $P$ has $p$ rows and $p$ columns where each column is a vector that represents a new dimension. The vectors are ordered from most important to least important with regards to capturing the variation in the data. If we only include the first $m$ columns of $P$, then the matrix multiplication will project our data onto a lower dimensional space. The matrix multiplication of an $n$ by $p$ matrix with a $p$ by $m$ matrix will result in an $n$ by $m$ matrix; our transformed data set has fewer features,\n", 59 | "\n", 60 | "$$ X' = X \\tilde{P}, $$\n", 61 | "\n", 62 | "where $\\tilde{P}$ is the truncated form of $P$ that has $m$ columns where $m < p$. The dimension reduction algorithms work by finding $P$ given an objective function. The objective is typically to construct the projection matrix $P$ such that the truncated form $\\tilde{P}$ still retains the majority of the information in our data set." 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Principal component analysis\n", 70 | "\n", 71 | "Principal component analysis (PCA) is a dimension reduction technique that takes a data set characterized by a set of possibly correlated features and generates a new set of features that are uncorrelated. It is used as a dimension reduction technique because the new set of uncorrelated features is chosen to be efficient in terms of capturing the variance in the data set.\n", 72 | "\n", 73 | "Let's examine a case where we have a data set of only two dimensions. In practice, PCA is rarely used when the dimension of the data set is already low. However, it is easier to illustrate the method when we have two or three dimensions."
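, "\n", "As a quick aside, here is a small `numpy` sketch (added for illustration, not from the original notebook) of the generic projection $X' = X\\tilde{P}$ described in the previous section; the projection matrix used here is an arbitrary example, not one produced by any particular algorithm:\n", "\n", "```python\n", "import numpy as np\n", "\n", "rng = np.random.RandomState(0)\n", "X_toy = rng.randn(5, 3)            # n = 5 observations, p = 3 features\n", "P_tilde = np.array([[1.0, 0.0],    # keep only m = 2 of the new directions\n", "                    [0.0, 1.0],\n", "                    [0.0, 0.0]])\n", "X_prime = X_toy @ P_tilde          # (n x p) times (p x m) -> (n x m)\n", "print(X_prime.shape)               # (5, 2)\n", "```"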
74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "np.random.seed(0)\n", 83 | "x1 = np.linspace(0, 1, 500)\n", 84 | "x2 = 2*x1 + 1 + 0.2*np.random.randn(500)\n", 85 | "X = np.vstack((x1, x2)).T\n", 86 | "\n", 87 | "plt.scatter(*X.T, alpha=0.25)\n", 88 | "plt.plot(x1, 2*x1 + 1, '--k', linewidth=2)\n", 89 | "plt.xlabel('$x_1$')\n", 90 | "plt.ylabel('$x_2$');" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "The data plotted is characterized by two dimensions; however, most of the variation does not occur along either of the two dimensions. Most of the points \"follow\" along the direction plotted as the dashed line. The variables $x_1$ and $x_2$ are highly correlated; as $x_1$ increases, in general, so does $x_2$ and vice versa.\n", 98 | "\n", 99 | "Instead of using the original two features, $x_1$ and $x_2$, perhaps we can use a different set of features, $\\xi_1$ and $\\xi_2$. The first chosen feature $\\xi_1$ should be aligned in the direction of greatest variation while the second will be _orthogonal_ to the first. The new axes/dimensions are referred to as _principal components_. Let's visualize the data set but using the principal components $\\xi_1$ and $\\xi_2$ rather than the original features." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "from sklearn.decomposition import PCA\n", 109 | "\n", 110 | "pca = PCA(n_components=2)\n", 111 | "Xt = pca.fit_transform(X)\n", 112 | "\n", 113 | "xi_1_max, xi_2_max = Xt.max(axis=0)\n", 114 | "xi_1_min, xi_2_min = Xt.min(axis=0)\n", 115 | "\n", 116 | "plt.hlines(0, xi_1_min, xi_1_max, linestyles='--')\n", 117 | "plt.vlines(0, xi_2_min, xi_2_max, linestyles='--')\n", 118 | "\n", 119 | "plt.scatter(*Xt.T, alpha=0.25)\n", 120 | "plt.xlim([-1.75, 1.75])\n", 121 | "plt.ylim([-1.75, 1.75])\n", 122 | "plt.xlabel('$\\\\xi _1$')\n", 123 | "plt.ylabel('$\\\\xi _2$');" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "In the figure, we can clearly observe that $\\xi_1$ is the dimension with the largest variation. In the PCA algorithm, $\\xi_1$ is chosen to capture as much of the variation as possible, with $\\xi_2$ picking up the rest of the remaining variation. Now, if we want to use one dimension to describe our data, we would keep $\\xi_1$ and drop $\\xi_2$, ensuring we keep as much of the information in our data set as possible using just one dimension. Further, notice how the new dimensions are not correlated. As we move from lower to higher values of $\\xi_1$, $\\xi_2$ does not predictably increase or decrease.\n", 131 | "\n", 132 | "In the visualization below, you can represent the data points in the space of either one (reduced) or two principal components or project back onto the original space _after_ reducing the dimension."
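, "\n", "The projection back onto the original space in the widget below is done with `inverse_transform`. A short sketch of the same round trip without the interactivity (added for illustration; it reuses `X` and the `PCA` import from the cells above):\n", "\n", "```python\n", "pca_1 = PCA(n_components=1)\n", "X_reduced = pca_1.fit_transform(X)            # keep only the first principal component\n", "X_back = pca_1.inverse_transform(X_reduced)   # approximate the data in the original space\n", "print(X_reduced.shape, X_back.shape)          # (500, 1) (500, 2)\n", "```"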
133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "from ipywidgets import interact, fixed\n", 142 | "\n", 143 | "np.random.seed(0)\n", 144 | "ind = np.random.choice(Xt.shape[0], 50)\n", 145 | "\n", 146 | "def reduce_dim(X, Xt, step='one PC'):\n", 147 | " if step == 'original space': \n", 148 | " pca = PCA(n_components=1)\n", 149 | " X_t = pca.fit_transform(X)\n", 150 | " plt.scatter(*pca.inverse_transform(X_t[ind, :]).T)\n", 151 | " plt.scatter(*X[ind, :].T, c='b', alpha=0.1)\n", 152 | "\n", 153 | " plt.xlabel('$x_1$')\n", 154 | " plt.ylabel('$x_2$');\n", 155 | " \n", 156 | " return \n", 157 | " \n", 158 | " elif step == 'two PC':\n", 159 | " plt.scatter(*Xt[ind, :].T)\n", 160 | "\n", 161 | " for x in Xt[ind, :]:\n", 162 | " plt.vlines(x[0], 0, x[1], linestyles='--') \n", 163 | " else:\n", 164 | " plt.scatter(Xt[ind, 0], np.zeros(50))\n", 165 | " plt.scatter(*Xt[ind, :].T, alpha=0.1, c='b')\n", 166 | "\n", 167 | " plt.xlim([-1.75, 1.75])\n", 168 | " plt.ylim([-0.5, 0.5])\n", 169 | " plt.xlabel('$\\\\xi _1$')\n", 170 | " plt.ylabel('$\\\\xi _2$')\n", 171 | " \n", 172 | "interact(reduce_dim, X=fixed(X), Xt=fixed(Xt), step=['two PC', 'one PC', 'original space']);" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "## PCA in `scikit-learn`\n", 180 | "\n", 181 | "In `scikit-learn`, dimension reduction algorithms are transformers. The choice of having these algorithms as transformers makes sense since they apply a transformation on the data set. Let's illustrate the syntax for the PCA algorithm in `scikit-learn`. Note, other dimension reduction techniques in `scikit-learn` will have the same interface. For most of these algorithms, the data needs to be centered and scaled to work properly. `PCA` automatically centers the data but **does not** scale it. `StandardScaler` is often used for preprocessing the data prior to applying PCA." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "from sklearn.datasets import fetch_california_housing\n", 191 | "from sklearn.decomposition import PCA\n", 192 | "from sklearn.preprocessing import StandardScaler\n", 193 | "\n", 194 | "data = fetch_california_housing()\n", 195 | "X = data['data']\n", 196 | "\n", 197 | "scaler = StandardScaler()\n", 198 | "X_scaled = scaler.fit_transform(X)\n", 199 | "pca = PCA(n_components=4)\n", 200 | "Xt = pca.fit_transform(X_scaled)\n", 201 | "\n", 202 | "print(\"number of dimension before reduction: {}\".format(X_scaled.shape[-1]))\n", 203 | "print(\"number of dimension after reduction: {}\".format(Xt.shape[-1]))" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "When the `fit` method is called, the transformer learns the matrix $\\tilde{P}$ to use for truncating our data set given the number of features/components, `n_components`, we want to have for our transformed data set. The matrix $\\tilde{P}^T$, where $T$ signifies the transpose, is stored in the attribute `components_` of the PCA transformer object.\n", 211 | "\n", 212 | "In the example above, we have gone from 8 to 4 dimensions. However, we don't know how much of the original information we have retained. 
With a trained PCA object, the explained variance of each new component is stored in the `explained_variance_` and `explained_variance_ratio_` attributes, the latter being normalized by the total variance." 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "print(\"explained variance ratio: {}\".format(pca.explained_variance_ratio_))\n", 222 | "print(\"cumulative explained variance ratio: {}\".format(np.cumsum(pca.explained_variance_ratio_)[-1]))" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "Thus, with just 4 components/features, we are able to capture about 77% of the variance of the original full order data set. We could also calculate the total explained variance by using the `inverse_transform` method. After transforming our data to obtain the reduced form, we can apply the inversion to obtain the approximation of our data in the original feature space. Then, we can calculate the resulting variance." 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "print(\"retained variance: {}\".format(pca.inverse_transform(Xt).var()))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "## Implementation details of PCA\n", 246 | "\n", 247 | "We have not discussed exactly how PCA obtains the new features. The matrix $\\tilde{P}$ is chosen such that \n", 248 | "\n", 249 | "$$ \\| X_c - X'\\tilde{P}^T \\|_2 $$\n", 250 | "\n", 251 | "is minimized. The subscript $c$ refers to the centered data set. The product $X'\\tilde{P}^T$ is the reconstruction of our data onto the original feature space. There are several algorithms to solve for the principal components but a popular one involves applying singular value decomposition. Singular value decomposition (SVD) is an algorithm to decompose a matrix into a product of three matrices,\n", 252 | "\n", 253 | "$$ X_c = U \\Sigma P^T. $$\n", 254 | "\n", 255 | "You can envision that the matrix $X_c$ represents a transformation that can be broken into three steps: an initial rotation $P^T$, a scaling $\\Sigma$, and a final rotation $U$. By applying SVD on $X_c$, the matrix $P$ is solved for. The matrix $\\Sigma$ is a diagonal matrix, a matrix with non-zero values only along the diagonal, \n", 256 | "\n", 257 | "$$ \\Sigma = \\left[ \\begin{array}{ccc} \n", 258 | "\\sigma_1 & \\\\\n", 259 | "& \\sigma_2 & \\\\\n", 260 | "&& \\ddots & \\\\\n", 261 | "&&&\n", 262 | "\\end{array} \\right]$$\n", 263 | "\n", 264 | "The diagonal values are ordered such that $|\\sigma_1| \\ge |\\sigma_2| \\ge \\cdots |\\sigma_{p-1}| \\ge |\\sigma_p|$. The larger the absolute value of $\\sigma$, the greater the amount of variation in that direction/component. Thus, to generate $\\tilde{P}$ to truncate the data set, the first $m$ components/columns of $P$ are kept." 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## Choosing the number of components\n", 272 | "\n", 273 | "How does one choose the best number of components to use? The answer is not clear cut; using more components will increase the explained variance but using too many will defeat the purpose of reducing the number of dimensions. 
The best way to determine a good number of components to use is to construct a plot of the cumulative explained variance versus the number of components. We need to identify at what point increasing the number of components no longer gives an appreciable gain in explained variance, the point of diminishing returns. Identifying this region is accomplished using an \"elbow plot\", named because of the resemblance of an arm with a bent elbow. Let's create the elbow plot for the California data and see how many components we need to keep to explain at least 90% of the variance." 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "X_scaled = scaler.fit_transform(X)\n", 283 | "p = X_scaled.shape[-1]\n", 284 | "pca = PCA(n_components=p)\n", 285 | "pca.fit(X_scaled)\n", 286 | "cumulative_explained_var = np.cumsum(pca.explained_variance_ratio_)\n", 287 | "\n", 288 | "plt.plot(range(1, p + 1), cumulative_explained_var)\n", 289 | "plt.hlines(0.9, 1, p+1, linestyles='--')\n", 290 | "plt.hlines(0.99, 1, p+1, linestyles='--')\n", 291 | "plt.xlabel('number of components')\n", 292 | "plt.ylabel('cumulative explained variance');" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "It appears we only need to use 5 and 6 components if we want to retain 90% and 99% of the variance, respectively. Note, we usually see a more dramatic reduction in dimension when we have more features. With more features, we are more likely to have a lot of redundant information and correlated features." 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "## Truncated Singular Value Decomposition\n", 307 | "\n", 308 | "To apply PCA, the data set needs to be centered, i.e., the features need to have zero mean. Centering the data becomes a problem when we are representing our data using a sparse matrix. To center the data, you need to subtract a value from each entry in the matrix; all zero entries become non-zero. If we have a sparse matrix, an alternative is to use the `TruncatedSVD` class. `TruncatedSVD` transformer objects work the same as `PCA` but do not center the data prior to finding the principal components." 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "## Non-negative matrix factorization\n", 316 | "\n", 317 | "In certain applications, our data only has non-negative values. For example, in natural language processing, the bag-of-words model yields a matrix of only non-negative values. In these applications, it is important that any dimension reduction scheme preserves the non-negative nature of any resulting matrices, keeping our analysis interpretable. A variation of PCA with the added constraint that the derived matrices are non-negative is called non-negative matrix factorization (NMF).\n", 318 | "\n", 319 | "NMF is often used in the field of topic modeling: identifying the major topics/ideas in a corpus. We will apply NMF to the newsgroup data set, http://qwone.com/~jason/20Newsgroups/. The first step is to transform our text data into a structured form. We will use the `TfidfVectorizer` transformer, which creates a data set where our features are words and each entry is a weighted frequency of a particular word. We will formally discuss the field of natural language processing in a separate notebook. 
When applying NMF, the resulting new dimensions represent a collection of words, our old features, which we can refer to as a topic. For each derived new component, we can display the top words that contribute most to that new dimension. With those top words identified, we can look to see what topic or concept each new feature represents." 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "from sklearn.datasets import fetch_20newsgroups\n", 329 | "from sklearn.decomposition import NMF\n", 330 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 331 | "from sklearn.pipeline import Pipeline\n", 332 | "\n", 333 | "data = fetch_20newsgroups(shuffle=True, remove=('headers', 'footers', 'quotes'))\n", 334 | "X = data['data']\n", 335 | "\n", 336 | "n_topics = 10\n", 337 | "n_top_words = 20\n", 338 | "\n", 339 | "tfidf = TfidfVectorizer(stop_words='english')\n", 340 | "nmf = NMF(n_components=n_topics, random_state=0)\n", 341 | "pipe = Pipeline([('vectorizer', tfidf), ('dim-red', nmf)])\n", 342 | "pipe.fit(X)\n", 343 | "\n", 344 | "feature_names = tfidf.get_feature_names()\n", 345 | "\n", 346 | "for i, topic in enumerate(nmf.components_):\n", 347 | "    print(\"Topic: {}\".format(i))\n", 348 | "    indices = topic.argsort()[:-n_top_words-1:-1]  # indices of the n_top_words largest weights, in descending order\n", 349 | "    top_words = [feature_names[ind] for ind in indices]\n", 350 | "    print(\" \".join(top_words), \"\\n\")" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "From the analysis, we can see that topic 1 represents \"computers\" while topic 2 represents \"Christianity\"." 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "## Using PCA with a supervised model\n", 365 | "\n", 366 | "A common usage of PCA is to truncate the number of dimensions so that the training and predicting times of a supervised machine learning model will be significantly faster. For example, for decision trees, the training and prediction time complexity with respect to the number of features is $O(p)$. Thus, reducing our features by half will reduce our training time by half as well. Let's see the effect of using PCA in conjunction with decision trees on a synthetic classification data set."
367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "from shutil import rmtree\n", 376 | "from tempfile import mkdtemp\n", 377 | "import time\n", 378 | "\n", 379 | "from sklearn.datasets import make_classification\n", 380 | "from sklearn.model_selection import GridSearchCV, train_test_split\n", 381 | "from sklearn.tree import DecisionTreeClassifier\n", 382 | "\n", 383 | "X, y = make_classification(n_samples=10000, n_features=100, n_informative=10, random_state=0)\n", 384 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", 385 | "\n", 386 | "scaler = StandardScaler()\n", 387 | "pca = PCA(n_components=10)\n", 388 | "tree = DecisionTreeClassifier()\n", 389 | "\n", 390 | "cache = mkdtemp()\n", 391 | "pipe = Pipeline([('scaler', scaler), ('dim-red', pca), ('clf', tree)], memory=cache)\n", 392 | "param_grid = {'clf__max_depth': range(2, 20)}\n", 393 | "grid_search = GridSearchCV(pipe, param_grid, cv=3, n_jobs=2)\n", 394 | "\n", 395 | "t_0 = time.time()\n", 396 | "grid_search.fit(X_train, y_train)\n", 397 | "t_elapsed = time.time() - t_0\n", 398 | "\n", 399 | "print(\"training time: {:g} seconds\".format(t_elapsed))\n", 400 | "print(\"test accuracy: {}\".format(grid_search.score(X_test, y_test)))\n", 401 | "rmtree(cache)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "pipe = Pipeline([('scaler', scaler), ('clf', tree)], memory=cache)\n", 411 | "param_grid = {'clf__max_depth': range(2, 20)}\n", 412 | "grid_search = GridSearchCV(pipe, param_grid, cv=3, n_jobs=2)\n", 413 | "\n", 414 | "t_0 = time.time()\n", 415 | "grid_search.fit(X_train, y_train)\n", 416 | "t_elapsed = time.time() - t_0\n", 417 | "\n", 418 | "print(\"training time {:g} seconds\".format(t_elapsed))\n", 419 | "print(\"test accuracy {}\".format(grid_search.score(X_test, y_test)))\n", 420 | "rmtree(cache)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "By transforming the data set to have 10 rather than 100 dimensions, the training time is reduced by a third. However, the cost of faster training is a loss of accuracy." 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "## Dimension reduction for visualization\n", 435 | "\n", 436 | "Another use for dimension reduction is the visualization of high dimensional data sets. It is difficult to visualize more than two or three dimensions. One approach is to choose two or three variables when plotting. However, this approach will only visualize the relationship of the data for the chosen variables. While we cannot visualize the entire relationship for all the variables in our data set, we can generate two or three new features that will capture as much of the variation as possible, more than just using two or three variables in the original set. Let's visualize the iris data set which has four components by using two generated features."
437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "from sklearn.datasets import load_iris\n", 446 | "\n", 447 | "data = load_iris()\n", 448 | "X = data['data']\n", 449 | "y = data['target']\n", 450 | "\n", 451 | "pca = PCA(n_components=2)\n", 452 | "pipe = Pipeline([('scaler', StandardScaler()), ('dim-red', pca)])\n", 453 | "Xt = pipe.fit_transform(X)\n", 454 | "\n", 455 | "plt.scatter(*Xt.T, c=y, cmap='viridis')\n", 456 | "plt.xlabel('$\\\\xi_1$')\n", 457 | "plt.ylabel('$\\\\xi_2$');" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "explained_var = np.cumsum(pca.explained_variance_ratio_)\n", 467 | "print('explained variance with two dimensions: {}'.format(explained_var[-1]))" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "## Exercises\n", 475 | "\n", 476 | "For the following exercises, use the Olivetti face data set, a set of 400 images of the faces of 40 individuals. Each image has 4096 pixels, representing our features. The data set can be retrieved by using the `fetch_olivetti_faces` function in the `sklearn.datasets` module.\n", 477 | "\n", 478 | "1. Apply NMF to generate new features and visualize them. Using `matplotlib`, images can be visualized using the `plt.imshow` function.\n", 479 | "1. Train a supervised machine learning model to classify the images. Repeat but use a dimension reduction technique. Compare both the test score and the time required to train the model.\n", 480 | "1. In the demonstration of using PCA in conjunction with supervised machine learning, we did not _simultaneously_ tune the decision tree classifier and the number of components. Tune both of these simultaneously." 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "*Copyright © 2020 The Data Incubator. All rights reserved.*" 488 | ] 489 | } 490 | ], 491 | "metadata": { 492 | "kernelspec": { 493 | "display_name": "Python 3", 494 | "language": "python", 495 | "name": "python3" 496 | }, 497 | "nbclean": true 498 | }, 499 | "nbformat": 4, 500 | "nbformat_minor": 0 501 | } 502 | -------------------------------------------------------------------------------- /datacourse/machine-learning/ML_Metrics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 25, 6 | "metadata": { 7 | "init_cell": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%logstop\n", 12 | "%logstart -rtq ~/.logs/ML_Metrics.py append\n", 13 | "%matplotlib inline\n", 14 | "import matplotlib\n", 15 | "import seaborn as sns\n", 16 | "sns.set()\n", 17 | "matplotlib.rcParams['figure.dpi'] = 144" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 13, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import matplotlib.pyplot as plt\n", 27 | "import pandas as pd\n", 28 | "import sklearn.datasets" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Metrics for supervised machine learning\n", 36 | "\n", 37 | "The general problem supervised machine learning seeks to solve is to map a measurement of several variables to a target value or class. 
For example, we might use supervised machine learning to transcribe spoken language to text, predict home values based on neighborhood amenities, or detect fraudulent transactions. In order to assess whether our model is succeeding, we need to formally define what success is for the given task. In this notebook, we will explore several common **metrics** for model performance." 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Mathematics of supervised learning\n", 45 | "\n", 46 | "For most machine-learning problems, our model receives a vector of **features**, $X$, and maps it to some predicted label, $y$. In order to train our model, we will need many **observations** (i.e. measurements) and their associated labels. We can assemble these observations into a matrix.\n", 47 | "\n", 48 | "$$ f(X_{ij}) \\approx y_i $$\n", 49 | "\n", 50 | "We'll use the California housing data set as an example. The California housing data set has measurements of average house age, average number of rooms, location, and other qualities for various census blocks of California." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 14, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "--2021-01-27 15:07:38-- http://dataincubator-wqu.s3.amazonaws.com/caldata/cal_housing.pkz\n", 63 | "Resolving dataincubator-wqu.s3.amazonaws.com (dataincubator-wqu.s3.amazonaws.com)... 52.217.13.44\n", 64 | "Connecting to dataincubator-wqu.s3.amazonaws.com (dataincubator-wqu.s3.amazonaws.com)|52.217.13.44|:80... connected.\n", 65 | "HTTP request sent, awaiting response... 200 OK\n", 66 | "Length: 366863 (358K) [binary/octet-stream]\n", 67 | "Saving to: ‘/home/jovyan/scikit_learn_data/cal_housing.pkz’\n", 68 | "\n", 69 | "cal_housing.pkz 100%[===================>] 358.26K --.-KB/s in 0.04s \n", 70 | "\n", 71 | "2021-01-27 15:07:38 (9.86 MB/s) - ‘/home/jovyan/scikit_learn_data/cal_housing.pkz’ saved [366863/366863]\n", 72 | "\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "!wget http://dataincubator-wqu.s3.amazonaws.com/caldata/cal_housing.pkz -nc -P ~/scikit_learn_data/" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 15, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | ".. _california_housing_dataset:\n", 90 | "\n", 91 | "California Housing dataset\n", 92 | "--------------------------\n", 93 | "\n", 94 | "**Data Set Characteristics:**\n", 95 | "\n", 96 | " :Number of Instances: 20640\n", 97 | "\n", 98 | " :Number of Attributes: 8 numeric, predictive attributes and the target\n", 99 | "\n", 100 | " :Attribute Information:\n", 101 | " - MedInc median income in block\n", 102 | " - HouseAge median house age in block\n", 103 | " - AveRooms average number of rooms\n", 104 | " - AveBedrms average number of bedrooms\n", 105 | " - Population block population\n", 106 | " - AveOccup average house occupancy\n", 107 | " - Latitude house block latitude\n", 108 | " - Longitude house block longitude\n", 109 | "\n", 110 | " :Missing Attribute Values: None\n", 111 | "\n", 112 | "This dataset was obtained from the StatLib repository.\n", 113 | "http://lib.stat.cmu.edu/datasets/\n", 114 | "\n", 115 | "The target variable is the median house value for California districts.\n", 116 | "\n", 117 | "This dataset was derived from the 1990 U.S. census, using one row per census\n", 118 | "block group. 
A block group is the smallest geographical unit for which the U.S.\n", 119 | "Census Bureau publishes sample data (a block group typically has a population\n", 120 | "of 600 to 3,000 people).\n", 121 | "\n", 122 | "It can be downloaded/loaded using the\n", 123 | ":func:`sklearn.datasets.fetch_california_housing` function.\n", 124 | "\n", 125 | ".. topic:: References\n", 126 | "\n", 127 | " - Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,\n", 128 | " Statistics and Probability Letters, 33 (1997) 291-297\n", 129 | "\n" 130 | ] 131 | }, 132 | { 133 | "data": { 134 | "text/html": [ 135 | "
[HTML table rendering stripped in this dump; the DataFrame head is shown in the text/plain output below]" 222 | ], 223 | "text/plain": [ 224 | "   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \\\n", 225 | "0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   \n", 226 | "1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   \n", 227 | "2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   \n", 228 | "3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   \n", 229 | "4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   \n", 230 | "\n", 231 | "   Longitude  \n", 232 | "0    -122.23  \n", 233 | "1    -122.22  \n", 234 | "2    -122.24  \n", 235 | "3    -122.25  \n", 236 | "4    -122.25  " 237 | ] 238 | }, 239 | "execution_count": 15, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | } 243 | ], 244 | "source": [ 245 | "from sklearn.datasets import fetch_california_housing\n", 246 | "\n", 247 | "cali_data = fetch_california_housing()\n", 248 | "\n", 249 | "print(cali_data.DESCR)\n", 250 | "\n", 251 | "cali_df = pd.DataFrame(cali_data.data, columns=cali_data.feature_names)\n", 252 | "cali_df.head()" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "In the above dataframe, each column is a feature (i.e. a variable) and each row is an observation (i.e. a measurement). Said another way, things like median income and average number of rooms are features, while each census block for which we have a measurement of the features is an observation. We also have a vector of target labels, which is the median home value for each neighborhood. Altogether we have 8 features and 20,640 observations with labels." 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 17, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "name": "stdout", 269 | "output_type": "stream", 270 | "text": [ 271 | "(20640, 8)\n", 272 | "(20640,)\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "print(cali_data.data.shape)\n", 278 | "print(cali_data.target.shape)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "When we design a model to predict real number values (e.g. home price), our model is a **regression model**. Alternatively, we could design a model to predict categorical labels, such as \"expensive neighborhood\" and \"inexpensive neighborhood\". This would be a **classification model**. Most supervised machine learning tasks fall into the category of **regression** or **classification**. In either case we have to define a metric that quantifies what we mean by $\\approx$ in the equation above.\n", 286 | "\n", 287 | "We use our metric to define a **cost function** (let's call it $C$). To carry out gradient descent, we numerically evaluate the derivative of $C$ with respect to our model parameters.\n", 288 | "\n", 289 | "$$ \\frac{dC}{d\\Theta} = \\nabla_{\\Theta} C = \\left(\\frac{\\partial C}{\\partial \\Theta_1}, \\frac{\\partial C}{\\partial \\Theta_2}, ...\\right) $$\n", 290 | "\n", 291 | "Often the cost function, $C$, will be the same as our metric, but sometimes it may have additional terms, which we will explore later." 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "## Metrics for regression tasks\n", 299 | "\n", 300 | "In the [notebook on linear regression](ML_LinearRegression.ipynb) we introduced mean squared error (MSE) as a metric for how our trend line was performing. This led us to define a cost function, a scalar function that depends on our model parameters. We minimized the cost function using gradient descent. 
Depending on what problem we are trying to solve and what we want to optimize, we may choose different metrics.\n", 301 | "\n", 302 | "**Mean squared error** (MSE) is one of the most common metrics for regression:\n", 303 | "\n", 304 | "$$ \\frac{1}{n}\\sum_i\\left[f(X_i) - y_i\\right]^2 $$\n", 305 | "\n", 306 | "We squared the error terms ($f(X_i) - y_i$) because we didn't care whether they were positive or negative. We could have also addressed this concern by taking the absolute value, which would lead to the **mean absolute error** (MAE)\n", 307 | "\n", 308 | "$$ \\frac{1}{n}\\sum_i|f(X_i) - y_i| $$\n", 309 | "\n", 310 | "When we minimize the MAE by adjusting our model parameters, our model will be less strongly affected by outliers than if we used the MSE. This is because the error terms from outliers (which will generally be large) enter into the MAE as linear terms rather than being squared.\n", 311 | "\n", 312 | "Another common metric for regression is $R^2$, also known as the **coefficient of determination**. The $R^2$ quantifies how our model's MSE compares to a naive model in which we always predict the mean $y$ value, $\\bar{y}$.\n", 313 | "\n", 314 | "$$ 1 - \\frac{\\sum_i \\left[f(X_i) - y_i\\right]^2}{\\sum_i\\left(\\bar{y} - y_i\\right)^2} $$\n", 315 | "\n", 316 | "If our $R^2 < 0$ we know our model is very bad, because the MSE is larger than the MSE of the mean model.\n", 317 | "\n", 318 | "One important consideration when choosing a metric is how it scales with the data." 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 20, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "MSE: 0.176074\n", 331 | "MAE: 0.344767\n", 332 | "R^2: 0.776682\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "from sklearn import metrics\n", 338 | "import numpy as np\n", 339 | "\n", 340 | "y = np.random.randn(10)\n", 341 | "y_pred = y + .5 * np.random.randn(10)\n", 342 | "\n", 343 | "print('MSE: %f' % metrics.mean_squared_error(y, y_pred))\n", 344 | "print('MAE: %f' % metrics.mean_absolute_error(y, y_pred))\n", 345 | "print('R^2: %f' % metrics.r2_score(y, y_pred))" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 23, 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "name": "stdout", 355 | "output_type": "stream", 356 | "text": [ 357 | "MSE: 0.704295\n", 358 | "MAE: 0.689535\n", 359 | "R^2: 0.776682\n" 360 | ] 361 | } 362 | ], 363 | "source": [ 364 | "# rescale y\n", 365 | "\n", 366 | "y = 2 * y\n", 367 | "y_pred = 2 * y_pred\n", 368 | "\n", 369 | "print('MSE: %f' % metrics.mean_squared_error(y, y_pred))\n", 370 | "print('MAE: %f' % metrics.mean_absolute_error(y, y_pred))\n", 371 | "print('R^2: %f' % metrics.r2_score(y, y_pred))" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "## Metrics for classification tasks\n", 379 | "\n", 380 | "The metrics for regression rely on calculating an error term (i.e. the difference between our prediction and the ground truth). We can't do this for a classification task, so we will need to define entirely different metrics for classification. 
Let's start with the possible outcomes when we make a prediction.\n", 381 | "\n", 382 | "| | Actual positive | Actual negative |\n", 383 | "|------------------------|:---------------:|:---------------:|\n", 384 | "| **Predicted positive** | True positive | False positive |\n", 385 | "| **Predicted negative** | False negative | True negative |\n", 386 | "\n", 387 | "We have four possible outcomes we can use to build our metrics. We'll consider only three possibilities (though many more metrics have been defined).\n", 388 | "\n", 389 | "**Accuracy** is the most intuitive: it is the proportion of predictions that are correct. We add up the true positives and true negatives and divide by the total number of predictions.\n", 390 | "\n", 391 | "Accuracy suffers in tasks with class imbalance. For instance, in fraud detection, actual positives are very rare. Therefore, we could get high accuracy by simply always predicting negative. If only 0.1% of all observations are actually positive, then a model that always predicts negative gets 99.9% accuracy, even though this is clearly a bad model for detecting fraud.\n", 392 | "\n", 393 | "This example illustrates that we often care about one class more than another. For instance, if we think a transaction is fraudulent, we might waste some resources investigating it, but missing a case of fraud could cost much more. In this case we most want to avoid false negatives.\n", 394 | "\n", 395 | "**Recall** is the count of true positives divided by the count of _actual positives_. Recall will be close to 1 as long as the count of false negatives is low, even if there are not many actual positives.\n", 396 | "\n", 397 | "On the other hand, if a fraud case goes to trial, we do not want to punish a defendant unfairly. In that case it's important to avoid false positives. **Precision** is the count of true positives divided by the count of positive predictions. As long as the count of false positives is low, precision will be close to 1." 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "y = [0, 0, 1, 0, 1, 1, 0, 1]\n", 407 | "y_pred = [0, 1, 1, 0, 1, 1, 0, 1]\n", 408 | "\n", 409 | "print('Accuracy: %f' % metrics.accuracy_score(y, y_pred))\n", 410 | "print('Recall: %f' % metrics.recall_score(y, y_pred))\n", 411 | "print('Precision: %f' % metrics.precision_score(y, y_pred))" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "## Precision-recall tradeoff\n", 419 | "\n", 420 | "There is a tradeoff between precision and recall as we adjust our model, exchanging positive predictions for negative predictions.\n", 421 | "\n", 422 | "Often our classification model won't predict whether an observation is in one class or another, but rather will predict the _probability_ of the observation being in one class or the other. We choose a threshold probability, above which we will predict the observation is in the positive class, and below which we'll predict negative."
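, "\n", "To make the role of the threshold concrete, here is a tiny self-contained sketch (added for illustration; the probabilities are made up). Lowering the threshold produces more positive predictions (typically better recall, worse precision), while raising it does the opposite:\n", "\n", "```python\n", "import numpy as np\n", "\n", "p_pred_demo = np.array([0.1, 0.4, 0.6, 0.9])   # hypothetical predicted probabilities\n", "threshold = 0.5                                # moving this trades precision against recall\n", "y_pred_demo = (p_pred_demo > threshold).astype(int)\n", "print(y_pred_demo)                             # [0 0 1 1]\n", "```"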
423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "p_pred = np.linspace(0, 1, 1000)\n", 432 | "y = np.random.binomial(1, p_pred)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "precisions, recalls, thresholds = metrics.precision_recall_curve(y, p_pred)\n", 442 | "\n", 443 | "plt.plot(recalls, precisions)\n", 444 | "plt.xlabel('Recall')\n", 445 | "plt.ylabel('Precision')\n", 446 | "plt.title('Precision v. Recall');" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "We can summarize this curve in a single number: the area under the curve. If our model were perfect, precision and recall would both be 1 regardless of threshold, so the area under the curve would be 1. If our model was always wrong, the precision and recall would both be 0 regardless of threshold, so the area under the curve would be 0. The better our model is, _regardless of threshold_, the closer the area under the curve is to 1. We eventually need to a choose a threshold and may choose to prioritize precision or recall, but the **area under the precision-recall curve** (AUC), is a very useful metric for assessing model performance in classification tasks." 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "*Copyright © 2020 The Data Incubator. All rights reserved.*" 461 | ] 462 | } 463 | ], 464 | "metadata": { 465 | "kernelspec": { 466 | "display_name": "Python 3", 467 | "language": "python", 468 | "name": "python3" 469 | }, 470 | "language_info": { 471 | "codemirror_mode": { 472 | "name": "ipython", 473 | "version": 3 474 | }, 475 | "file_extension": ".py", 476 | "mimetype": "text/x-python", 477 | "name": "python", 478 | "nbconvert_exporter": "python", 479 | "pygments_lexer": "ipython3", 480 | "version": "3.7.3" 481 | }, 482 | "nbclean": true 483 | }, 484 | "nbformat": 4, 485 | "nbformat_minor": 1 486 | } 487 | -------------------------------------------------------------------------------- /datacourse/machine-learning/ML_Support_Vector_Machines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "init_cell": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%logstop\n", 12 | "%logstart -rtq ~/.logs/ML_Support_Vector_Machines.py append\n", 13 | "%matplotlib inline\n", 14 | "import matplotlib\n", 15 | "import seaborn as sns\n", 16 | "sns.set()\n", 17 | "matplotlib.rcParams['figure.dpi'] = 144" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import matplotlib.pyplot as plt\n", 27 | "import numpy as np\n", 28 | "from sklearn import svm" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Support Vector Machines" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Support vector machines are a popular class of machine learning models that were developed in the 1990s. They are capable of both linear and non-linear classification and can also be used for regression and anomaly/outlier detection. 
They work well for a wide class of problems but are generally used for problems with small or medium-sized data sets. In this notebook, we will start off with a simple classifier model and extend and improve it to ultimately arrive at what is referred to as a support vector machine (SVM)." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Hard margin classifier\n", 50 | "\n", 51 | "A hard margin classifier is a model that uses a hyperplane to completely separate two classes. A hyperplane is a subspace with one fewer dimension than the ambient space. For example, the hyperplane of a two dimensional space is a line and the hyperplane of a three dimensional space is a plane. It is helpful to consider each observation in our data set as existing in a $p$-dimensional space where $p$ is the number of features (columns) in our data. A hyperplane is simply a generalization of a plane in $p$-dimensional spaces. Take a look at the figure below, where we plot three such hyperplanes to separate two classes." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from sklearn.datasets import make_blobs\n", 61 | "\n", 62 | "X, y = make_blobs(centers=[[1, 1], [-1, -1]], cluster_std=0.4, random_state=0)\n", 63 | "x = np.linspace(-2, 2, 100)\n", 64 | "\n", 65 | "plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.bwr)\n", 66 | "plt.plot(x, -x+0.25, '--k')\n", 67 | "plt.plot(x, -0.25*x-0.3, 'r--')\n", 68 | "plt.plot(x, -1.5*x+1, 'b--')\n", 69 | "plt.xlim([-2, 2])\n", 70 | "plt.ylim([-2, 2])\n", 71 | "plt.xlabel('$x_1$')\n", 72 | "plt.ylabel('$x_2$');" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "**Question**\n", 80 | "* Of the three hyperplanes, which one should you choose to separate the two classes? What motivated your decision?" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "We not only want a classifier that will completely separate both classes but one that is situated an equal distance from the two classes. Such a classifier will likely be a better decision boundary for new data. Our model should not only seek complete separation of our classes but also create the largest _margin_ between the two classes. To find the classifier that results in the largest margin, we need to first define the equation of a hyperplane and understand how to calculate distances. The equation for a $p$-dimensional hyperplane is\n", 88 | "\n", 89 | "$$ x \\cdot \\tilde{\\beta} + \\tilde{\\beta}_0 = 0. $$\n", 90 | "\n", 91 | "$\\tilde{\\beta}$ defines the hyperplane and the set of $x$ that satisfies the above equation lies on the plane. Note, both $x$ and $\\tilde{\\beta}$ are $p$-dimensional vectors. For our classifier, we require that all points or *vectors* are on the correct side of the dividing hyperplane. If we enforce $||\\tilde{\\beta}|| = 1$, then the result of $||x \\cdot \\tilde{\\beta} + \\tilde{\\beta}_0||$ will be equal to the distance between the vector and the hyperplane. For our two classes, let's assign a value of $\\pm 1$, where $y_j = +1$ and $y_j = -1$ are observations above and below the hyperplane, respectively. 
Under this convention, we seek to find a hyperplane that will satisfy the following criterion for all observations\n", 92 | "\n", 93 | "$$ y_j(x_j \\cdot \\tilde{\\beta} + \\tilde{\\beta}_0) \\geq 0.$$ \n", 94 | "\n", 95 | "The value of the term inside the parentheses will be positive for vectors located above the hyperplane and negative for vectors located below. Given our convention for the label values of $y_j = \\pm 1$, the inequality will be satisfied so long as the hyperplane perfectly separates the two classes. " 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Determining the maximum margin\n", 103 | "\n", 104 | "For a given problem, there could be various hyperplanes that satisfy the above inequality; the three classifiers shown above all satisfy the inequality. We want the classifier that creates the largest possible margin, so we will need to include the following constraint,\n", 105 | "\n", 106 | "$$ y_j(x_j \\cdot \\tilde{\\beta} + \\tilde{\\beta}_0) \\geq M,$$\n", 107 | "\n", 108 | "where $M$ is the size of the margin. Remember, the term $x_j \\cdot \\tilde{\\beta} + \\tilde{\\beta}_0$ is the distance between a vector and the hyperplane and we require that all vectors are at least a distance $M$ away. For simplicity, we will define $\\beta \\equiv \\tilde{\\beta}/M$ and $\\beta_0 \\equiv \\tilde{\\beta}_0/M$ which results in $\\| \\beta \\|_2 = 1/M$. To maximize the margin, we need to minimize $\\| \\beta \\|_2$. The hyperplane resulting in the largest margin can be found by solving\n", 109 | "\n", 110 | "$$ \\min_{\\beta, \\beta_0} \\frac{1}{2} \\|\\beta \\|_2, $$\n", 111 | "\n", 112 | "with the constraint\n", 113 | "\n", 114 | "$$ y_j(x_j \\cdot \\beta + \\beta_0) \\geq 1.$$\n", 115 | "\n", 116 | "Let's train the hard margin classifier on the data previously displayed." 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "from ipywidgets import interact, IntSlider, FloatSlider, fixed\n", 126 | "\n", 127 | "def plot_svc_interact(X, y):\n", 128 | "    def plotter(log_C=1):\n", 129 | "        clf = svm.SVC(C=10**log_C, kernel='linear')\n", 130 | "        clf.fit(X, y)\n", 131 | "        \n", 132 | "        beta = clf.coef_[0]\n", 133 | "        beta_0 = clf.intercept_\n", 134 | "        slope = -beta[0]/beta[1]\n", 135 | "        intercept = -beta_0/beta[1]\n", 136 | "        \n", 137 | "        x_max = np.ceil(np.abs(X).max())\n", 138 | "        x = np.linspace(-x_max, x_max, 100)\n", 139 | "        margin_bound_1 = 1/beta[1] + slope*x + intercept\n", 140 | "        margin_bound_2 = -1/beta[1] + slope*x + intercept\n", 141 | "\n", 142 | "        plt.plot(x, slope*x + intercept, 'k')\n", 143 | "        plt.fill_between(x, margin_bound_1, margin_bound_2, color='k', alpha=0.25, linewidth=0)\n", 144 | "        plt.scatter(*clf.support_vectors_.T, s=100, c='y')\n", 145 | "        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.bwr)\n", 146 | "        plt.axis([-x_max, x_max, -x_max, x_max])\n", 147 | "\n", 148 | "    return plotter\n", 149 | "\n", 150 | "plot_svc_interact(X, y)(log_C=2)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "The figure displays the largest margin possible without having vectors inside the margin. The figure highlights two vectors, the vectors that prevent the margin from expanding. These vectors are referred to as _support vectors_ because they support the margin structure. You can think of the margin boundaries as a wall or fence and the support vectors help maintain or support the structure. 
The support vectors are the only vectors in the training set that influence the choice of hyperplane. Changing the values of the other vectors will not affect the margin, so long as they stay out of the margin. " 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "**Questions**\n", 165 | "* The choice of margin was dictated by only a small number of training observations. Would you expect this classifier to be prone to bias or variance error?\n", 166 | "* What preprocessing should we apply to our data to make the algorithm work best?" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## Soft margin classifier\n", 174 | "\n", 175 | "It is not always possible to completely linearly separate the two classes. We need to relax the constraint that there be no margin violations. The result is what is called the soft-margin classifier. As before, we still look to create the largest possible margin but the model will incur a penalty for vectors that reside in the margin or are on the wrong side of the hyperplane. Mathematically, we are going to construct a margin such that\n", 176 | "\n", 177 | "$$ \\min_{\\beta, \\beta_0} \\frac{1}{2}\\| \\beta \\|_2 + C \\sum_j \\zeta_j, $$\n", 178 | "\n", 179 | "with the constraints\n", 180 | "\n", 181 | "$$ y_j(x_j \\cdot \\beta + \\beta_0) \\geq (1 - \\zeta_j), $$\n", 182 | "\n", 183 | "$$ \\zeta_j \\geq 0.$$\n", 184 | "\n", 185 | "The severity of all the violations is controlled by the hyperparameter $C$ and the magnitude of the penalty for each vector is proportional to $\\zeta_j$. The objective function we want to minimize has two parts, one that seeks the largest margin and another that aims to reduce penalties from margin violations. Our constraint is slightly different from the hard margin classifier as it needs to consider that vectors may reside inside the margin. \n", 186 | "\n", 187 | "Each vector will have its own value of $\\zeta$. If a vector does not reside inside the margin and is on the right side of the hyperplane, then $\\zeta=0$ and we have the constraint for our hard margin classifier. These vectors contribute no penalty to the cost function we want to minimize. If a vector is inside the margin, then $\\zeta$ needs to be greater than 0 to still satisfy the constraint. If a vector lies on the hyperplane, then $x_j \\cdot \\beta + \\beta_0 = 0$ and $\\zeta $ must be at least equal to 1 to satisfy the constraint. If the vector is on the wrong side of the hyperplane, then $\\zeta$ for that vector needs to be greater than 1.\n", 188 | "\n", 189 | "Determining the hyperplane coefficients of the soft margin classifier involves solving a convex quadratic minimization with linear constraints. It can be solved using quadratic programming solvers and the time complexity will be $O(np)$. The soft-margin classifier in `scikit-learn` is available using the `svm.LinearSVC` class.\n", 190 | "\n", 191 | "**Questions**\n", 192 | "* What other model had a cost function composed of two \"competing\" terms? Can you relate these terms to bias vs. variance?\n", 193 | "* $C$ is a hyperparameter we must tune. How does changing $C$ affect variance and the number of support vectors?" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "The soft margin classifier uses the hinge loss function, named because it resembles a hinge. There is no loss so long as a threshold is not exceeded. 
Beyond the threshold, the loss ramps up linearly. See the figure below for an illustration of the hinge loss function. Negative distance means the observation is on the wrong side of the hyperplane." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "x = np.linspace(-1, 2, 100)\n", 210 | "hinge_loss = lambda x: -(x-1) if x-1 < 0 else 0\n", 211 | "\n", 212 | "plt.plot(x, list(map(hinge_loss, x)))\n", 213 | "plt.xlabel(\"$y(x\\cdot\\\\beta + \\\\beta_0)$\")\n", 214 | "plt.ylabel('loss');" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "We will train the soft margin classifier on a data set that is not completely linearly separable. The interactive visualization allows you to modify the hyperparameter $C$. Consider the effect of increasing and decreasing $C$. Note, for tuning purposes, it's best to use a logarithmic scale." 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "from sklearn.datasets import make_blobs\n", 231 | "\n", 232 | "X, y = make_blobs(centers=[[1, 1], [-1, -1]], cluster_std=1.5, random_state=0, n_samples=200)\n", 233 | "\n", 234 | "log_C_slider = FloatSlider(min=-4, max=2, step=0.25, value=0, description='$\\log(C)$')\n", 235 | "interact(plot_svc_interact(X, y), log_C=log_C_slider);" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "## Kernels for non-linear classification\n", 243 | "\n", 244 | "Using a hyperplane to separate the two classes will have limited performance as most problems require a non-linear decision boundary. One approach to overcome this limitation is to engineer non-linear features using the original features. Essentially, we are projecting our data onto a higher dimensional space where a linear classifier will perform substantially better. Consider the example below." 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "from sklearn.datasets import make_circles\n", 254 | "\n", 255 | "X, y = make_circles(n_samples=200, noise=0.2, factor=0.25, random_state=0)\n", 256 | "plt.scatter(*X.T, c=y, cmap=plt.cm.bwr)\n", 257 | "plt.xlabel('$x_1$')\n", 258 | "plt.ylabel('$x_2$');" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "We clearly cannot linearly separate the two classes. However, we can create a new feature $x_3 = \\sqrt{x_1^2 + x_2^2}$, the distance from the origin. With the new feature, we are projecting our data onto a higher dimensional space."
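, "\n",
    "\n",
    "As a quick check, here is a minimal sketch (not part of the original notebook) that builds the distance feature explicitly and fits a linear SVM on the augmented data, assuming `X` and `y` from the `make_circles` cell above:\n",
    "\n",
    "```python\n",
    "import numpy as np\n",
    "from sklearn import svm\n",
    "\n",
    "x_3 = np.sqrt(X[:, 0]**2 + X[:, 1]**2)      # distance from the origin\n",
    "X_new = np.hstack((X, x_3.reshape(-1, 1)))  # append x_3 as a third feature\n",
    "\n",
    "clf = svm.SVC(kernel='linear')\n",
    "clf.fit(X_new, y)\n",
    "print(clf.score(X_new, y))  # training accuracy should be high once x_3 is added\n",
    "```"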
266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "from mpl_toolkits.mplot3d import Axes3D\n", 275 | "\n", 276 | "def plot_projection(X, y):\n", 277 | "    XX, YY = np.meshgrid(np.linspace(-1, 1, 20), np.linspace(-1, 1, 20))\n", 278 | "    ZZ = 0.6*np.ones((20, 20))\n", 279 | "    x_3 = (X[:, 0]**2 + X[:, 1]**2)**0.5\n", 280 | "    X_new = np.hstack((X, x_3.reshape(-1, 1)))\n", 281 | "\n", 282 | "    def plotter(elev=30, azim=30):\n", 283 | "        fig = plt.figure()\n", 284 | "        ax = plt.axes(projection='3d')\n", 285 | "        ax.scatter(*X_new.T, c=y, cmap=plt.cm.bwr)\n", 286 | "        ax.plot_surface(XX, YY, ZZ, alpha=0.2);\n", 287 | "        ax.view_init(elev, azim)\n", 288 | "        ax.set_xlabel('$x_1$')\n", 289 | "        ax.set_ylabel('$x_2$')\n", 290 | "        ax.set_zlabel('$x_3$')\n", 291 | "\n", 292 | "    return plotter\n", 293 | "\n", 294 | "interact(plot_projection(X, y), elev=(0, 360), azim=(0, 360));" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "In this higher dimension, it is possible for a hyperplane to adequately divide the two classes. The resulting decision boundary on the original, lower dimensional space is non-linear. Check out this [visualization](https://www.youtube.com/watch?v=3liCbRZPrZA) for another example of using projections to help the performance of the classifier.\n", 302 | "\n", 303 | "In the example, we introduced the new non-linear term directly into the data set. However, the way the objective function is solved allows us to _indirectly_ apply the projection, using what is called the kernel trick. A kernel is a function that creates the implicit mapping/projection to a higher dimensional space. The advantages of using a kernel trick are:\n", 304 | "\n", 305 | "* No direct feature generation that increases the size of the data set.\n", 306 | "* We can readily swap out and try different kernels to see which one performs the best.\n", 307 | "\n", 308 | "With the kernel, we can now refer to our model as a support vector machine. More of the mathematical details about the kernel trick are explained at the end of the notebook." 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "### Choices of kernels\n", 316 | "\n", 317 | "There are several popular choices of kernels; they are the polynomial, sigmoid, and Gaussian radial basis function (RBF). In `scikit-learn`, the choice of kernel is controlled by the keyword argument `kernel`. The table below lists these kernels, their functional forms, and the corresponding `kernel` keyword argument.\n", 318 | "\n", 349 | "
| Kernel Name | $K(x,x')$ | `kernel` keyword argument |\n| :--- | :--- | :--- |\n| Linear Kernel | $x \cdot x'$ | `'linear'` |\n| $d$-th Degree Polynomial | $(r + \gamma\, x \cdot x')^d$ | `'poly'` |\n| Gaussian RBF | $\exp(-\gamma\, \Vert x - x' \Vert_2^2)$ | `'rbf'` |\n| Sigmoid | $\tanh(\gamma\, x \cdot x' + r)$ | `'sigmoid'` |
\n", 350 | "\n", 351 | "Usually, the radial basis function kernel will perform the best and there's only one additional hyperparameter to tune, $\\gamma$. An interesting note is that using the radial basis function kernel is equivalent to projecting our vectors to an infinite dimensional space. It would not be possible to use the radial basis function kernel without the kernel trick.\n", 352 | "\n", 353 | "The following blocks of code create an interactive visualization where you can experiment with different kernels and hyperparameter values for a situation where a non-linear decision boundary is required." 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "def plot_decision_boundary(X_train, X_test, y_train, y_test):\n", 363 | " def plotter(kernel='linear', log_gamma=1, log_C=1, deg=1, coef0=1):\n", 364 | " clf = svm.SVC(C=10**log_C, kernel=kernel, gamma=10**log_gamma, coef0=coef0, probability=True)\n", 365 | " clf.fit(X_train, y_train)\n", 366 | " \n", 367 | " X1, X2 = np.meshgrid(np.linspace(-2, 3), np.linspace(-2, 2))\n", 368 | " y_proba = clf.predict_proba(np.hstack((X1.reshape(-1, 1), X2.reshape(-1, 1))))[:, 1]\n", 369 | " plt.contourf(X1, X2, y_proba.reshape(50, 50), 16, cmap=plt.cm.bwr, alpha=0.75)\n", 370 | " plt.colorbar()\n", 371 | "\n", 372 | " accuracy = clf.score(X_test, y_test)\n", 373 | " plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='white', cmap=plt.cm.bwr)\n", 374 | " plt.xlabel('$x_1$')\n", 375 | " plt.ylabel('$x_2$')\n", 376 | " plt.title('test set accuracy: {}'.format(accuracy));\n", 377 | "\n", 378 | " return plotter" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "from sklearn.datasets import make_moons\n", 388 | "from sklearn.model_selection import train_test_split\n", 389 | "\n", 390 | "X, y = make_moons(400, noise=0.25, random_state=0)\n", 391 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)\n", 392 | "\n", 393 | "log_C_slider = FloatSlider(min=-4, max=4, step=0.25, value=0, description='$\\log(C)$')\n", 394 | "log_gamma_slider = FloatSlider(min=-3, max=2, step=0.01, value=0, description='$\\log(\\gamma$)')\n", 395 | "deg_slider = IntSlider(min=1, max=4, step=1, value=2, description='$d$')\n", 396 | "coef0_slider = FloatSlider(min=-100, max=100, step=0.1, value=0, description='$r$')\n", 397 | "\n", 398 | "interact(plot_decision_boundary(X_train, X_test, y_train, y_test),\n", 399 | " log_C=log_C_slider,\n", 400 | " log_gamma=log_gamma_slider, \n", 401 | " kernel=['rbf', 'linear', 'sigmoid', 'poly'],\n", 402 | " deg=deg_slider,\n", 403 | " coef0=coef0_slider);" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "## Comparison with logistic regression\n", 411 | "\n", 412 | "The SVM models resembles that of logistic regression. They are both linear binary classifiers, if we ignore the kernelized version. The difference between the two methods is the cost function they use to determine model parameters. The logistic regression uses the log loss while SVM uses the hinge loss. Both functions are plotted below." 
413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "x = np.linspace(-6, 4, 100)\n", 422 | "hinge_loss = lambda x: -(x-1) if x < 1 else 0\n", 423 | "log_loss = np.log(1+np.exp(-x))\n", 424 | "\n", 425 | "plt.plot(x, list(map(hinge_loss, x)))\n", 426 | "plt.plot(x, log_loss, '--r')\n", 427 | "plt.xlabel(\"$y(x \\cdot \\\\beta + \\\\beta_0)$\")\n", 428 | "plt.ylabel('loss');" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "The two cost functions have the same limiting behavior. \n", 436 | "\n", 437 | "* If an observation is on the correct side of the hyperplane and very far away, a large **positive** value of $y(x\\cdot\\beta + \\beta_0)$, it will have _nearly_ zero loss for the log loss and exactly zero for the hinge loss.\n", 438 | "\n", 439 | "* If an observation is on the wrong side of the hyperplane and very far away, a large **negative** value of $y(x\\cdot\\beta + \\beta_0)$, the log loss penalty is linear with respect to the distance to the hyperplane. The hinge loss penalty is always linear.\n", 440 | "\n", 441 | "The matching limiting behavior, as $z \\to \\pm \\infty$, can be observed in the equations for the loss functions,\n", 442 | "\n", 443 | "$$ C_\\text{log} = \\ln(1 + \\exp(-z)), $$\n", 444 | "\n", 445 | "$$ C_\\text{hinge} = \\max(0, 1 - z) $$\n", 446 | "where\n", 447 | "$$ z = y(x\\cdot \\beta + \\beta_0).$$\n", 448 | "\n", 449 | "The difference occurs in the intermediate zone. SVM uses a threshold; if the observation is not inside the margin and is on the right side of the hyperplane, there is no penalty. It does not matter how far away from the hyperplane the observation is located, so long as it is still on the correct side and not in the margin. This allows the model to generalize better. For logistic regression, there will always be non-zero loss. Since every observation will have a cost, the model will need to \"satisfy\" each observation with regards to where to locate the hyperplane. This hurts the model's ability to generalize. Note, a regularization term is often added to the logistic regression cost function, helping it generalize. Despite these differences, logistic regression and linear SVM often achieve similar results.\n", 450 | "\n", 451 | "Here are some things to consider when choosing which of the two models to use.\n", 452 | "\n", 453 | "* If calculating probabilities is important, use **logistic regression** as it is a probabilistic model.\n", 454 | "* If the data is sufficiently linearly separable, both models can be used but **SVM** may work better in the presence of outliers.\n", 455 | "* If the two classes are not linearly separable, use **SVM** with a kernel.\n", 456 | "* If there is a large number of observations, 50,000 - 100,000, and a small number of features, it is best to manually create new features and use **logistic regression** or **linear SVM**. Kernelized SVM is slow to train with a large number of observations.\n", 457 | "\n", 458 | "The following visualization lets you use either logistic regression or a linear SVM. You can control the separation of the clusters as well as the presence of an outlier. Notice how the SVM works better than logistic regression when there is an outlier."
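, "\n",
    "\n",
    "If the interactive widget below does not render, a rough non-interactive sketch of the same comparison (not part of the original notebook; the outlier location mirrors the helper function that follows) is:\n",
    "\n",
    "```python\n",
    "import numpy as np\n",
    "from sklearn.datasets import make_blobs\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn import svm\n",
    "\n",
    "X, y = make_blobs(centers=[[1, 1], [-1, -1]], cluster_std=0.8, random_state=0)\n",
    "X = np.vstack((X, [[-1.5, 0.0]]))  # a single outlier labeled with the first class\n",
    "y = np.hstack((y, [0]))\n",
    "\n",
    "for name, clf in [('logistic regression', LogisticRegression(solver='lbfgs')),\n",
    "                  ('linear SVM', svm.SVC(kernel='linear'))]:\n",
    "    clf.fit(X, y)\n",
    "    # the SVM's hyperplane is typically shifted less by the single outlier\n",
    "    print(name, clf.coef_[0], clf.intercept_)\n",
    "```"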
459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "from sklearn.linear_model import LogisticRegression\n", 468 | "\n", 469 | "def plot_svc_vs_lr(cluster_std=0.8, log_C=1, model='logistic regression', outlier=False):\n", 470 | " X, y = make_blobs(centers=[[1, 1], [-1, -1]], cluster_std=cluster_std, random_state=0)\n", 471 | "\n", 472 | " if outlier:\n", 473 | " X = np.vstack((X, [-1.5, 0.]))\n", 474 | " y = np.hstack((y, [0]))\n", 475 | "\n", 476 | " name_to_clf = {'logistic regression': LogisticRegression(C=10**log_C, solver='lbfgs'),\n", 477 | " 'SVM': svm.SVC(C=10**log_C, kernel='linear')}\n", 478 | " \n", 479 | " clf = name_to_clf[model]\n", 480 | " clf.fit(X, y)\n", 481 | " \n", 482 | " beta = clf.coef_[0]\n", 483 | " beta_0 = clf.intercept_\n", 484 | " slope = -beta[0]/beta[1]\n", 485 | " intercept = -beta_0/beta[1]\n", 486 | " \n", 487 | " x_max = np.ceil(np.abs(X).max())\n", 488 | " x = np.linspace(-x_max, x_max, 100)\n", 489 | "\n", 490 | " plt.plot(x, slope*x + intercept, 'k')\n", 491 | " plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.bwr)\n", 492 | " plt.axis([-x_max, x_max, -x_max, x_max])" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [ 501 | "log_C_slider = FloatSlider(min=-4, max=4, step=0.25, value=1, description='$\\log(C)$')\n", 502 | "cluster_std_slider = FloatSlider(min=0.2, max=1.0, step=0.05, value=0.8, description='cluster $\\sigma$')\n", 503 | "\n", 504 | "interact(plot_svc_vs_lr,\n", 505 | " cluster_std=cluster_std_slider,\n", 506 | " log_C=log_C_slider,\n", 507 | " model=['logistic regression', 'SVM']);" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "## SVM for regression\n", 515 | "\n", 516 | "SVMs can be also be used for regression with some modifications to the constrained optimization. Instead of constructing a margin that avoids penalties incurred by vectors residing inside the margin, we train a model that includes as many vectors *inside* the margin as possible. Now, vectors that are inside the margin carry no penalty but will incur one if they are outside of the margin. Similarly as before, the penalty ramps up linearly the farther away the vector is from the edge of the margin. Instead of a hinge loss, the SVM regressor uses a well-loss cost function, shown below." 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "eps = 0.25\n", 526 | "x = np.linspace(-1, 1, 100)\n", 527 | "well_loss = list(map(lambda x: abs(x)-eps if abs(x) > eps else 0, x))\n", 528 | "square_loss = x**2\n", 529 | "\n", 530 | "plt.plot(x, well_loss)\n", 531 | "plt.plot(x, square_loss)\n", 532 | "plt.xlabel('distance from the center')\n", 533 | "plt.ylabel('loss')\n", 534 | "plt.legend(['well loss', 'square loss']);" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "The new optimization is\n", 542 | "$$ \\min_{\\beta, \\beta_0, \\zeta_j, \\zeta^*_j} \\frac{1}{2} \\|\\beta \\|_2 + C \\sum_j (\\zeta_j + \\zeta^*_j) $$\n", 543 | "with the following constraints\n", 544 | "$$ y_j - \\beta \\cdot x_j - \\beta_0 \\leq \\epsilon + \\zeta_i, $$\n", 545 | "$$ \\beta \\cdot x_j + \\beta_0 - y_j \\leq \\epsilon + \\zeta^*_i, $$\n", 546 | "$$ \\zeta_i, \\zeta^* \\geq 0. 
$$\n", 547 | "\n", 548 | "The optimization problem is very similar as before but now we have two $\\zeta$ values for each vector since our model will incur a penalty if a vector resides on either side of the margin. The hyperparameter $\\epsilon$ determines the thickness of the margin and $C$ acts as the same way as with SVM classifier. In `scikit-learn` the linear SVM regressor is accessed from `svm.LinearSVR` while the kernelized SVM is accessed via `svm.SVR`.\n", 549 | "\n", 550 | "The following interactive visualization allows you to see the effect of alternating $C$ and $\\epsilon$ on a data set with a linear behavior." 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "def plot_svr_interact(X, y):\n", 560 | " def plotter(epsilon=0.5, log_C=2):\n", 561 | " rgr = svm.SVR(kernel='linear', epsilon=epsilon, C=10**log_C)\n", 562 | " rgr.fit(X, y)\n", 563 | " \n", 564 | " y_pred = rgr.predict(X)\n", 565 | " ind = np.abs(y - y_pred) >= epsilon\n", 566 | "\n", 567 | " plt.scatter(X[ind], y[ind], s=100, color='y')\n", 568 | " plt.scatter(X, y)\n", 569 | " plt.fill_between(X.reshape(-1,), y_pred - epsilon, y_pred + epsilon, alpha=0.25, color='k', linewidth=0)\n", 570 | " plt.plot(X, y_pred, '-k')\n", 571 | " plt.xlabel('$x$')\n", 572 | " plt.ylabel('$y$')\n", 573 | "\n", 574 | " return plotter" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": null, 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "np.random.seed(0)\n", 584 | "x = np.linspace(-1, 1, 100)\n", 585 | "y = 2*x + 1 + 0.5*np.random.randn(100)\n", 586 | "\n", 587 | "log_C_slider = FloatSlider(min=-3, max=1, step=0.05, value=-1, description='$\\log(C)$')\n", 588 | "epsilon_slider = FloatSlider(min=0.05, max=2, step=0.05, value=0.5, description='$\\epsilon$')\n", 589 | "interact(plot_svr_interact(x.reshape(-1, 1), y), log_C=log_C_slider, epsilon=epsilon_slider);" 590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "metadata": {}, 595 | "source": [ 596 | "## Exercises\n", 597 | "\n", 598 | "1. Use `SVR` for the California housing data; you should experiment with using different kernels. What kernel works the best?\n", 599 | "1. Support vector regression (with a linear kernel) and linear regression are similar but use a different cost function; we compared both loss functions above. Given the loss functions, which model would you think will work better with the presence of outliers? Test out your answer by using both support vector regression and linear regression to fit a line through a data set that has an outlier.\n", 600 | "\n", 601 | "```python\n", 602 | "np.random.seed(0)\n", 603 | "x = np.linspace(-1, 1, 100)\n", 604 | "y = 2*x + 1 + 0.5*np.random.randn(100)\n", 605 | "\n", 606 | "# include outlier\n", 607 | "X = np.vstack((x.reshape(-1, 1), [-1]))\n", 608 | "y = np.hstack((y, [3]))\n", 609 | "```\n" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "metadata": {}, 615 | "source": [ 616 | "## Appendix: Lagrangian dual formulation\n", 617 | "\n", 618 | "Instead of solving the originally formulated optimization problem, we can reformulate the problem to construct what is called a dual problem. The original formulation is referred to as the primal. We can use Lagrangian multipliers which transforms the constrained minimization problem into an unconstrained problem. 
Under certain conditions, the solution of the dual problem is the same as the solution of the primal. These conditions are met by our original quadratic optimization with linear constraints. Thus, we can either solve the primal or the dual problem and get the same result. The purpose of reformulating the problem will become more apparent in the next section.\n", 619 | "\n", 620 | "The dual formulation of the soft-margin classifier is\n", 621 | "$$ \\min_{\\alpha_j} \\frac{1}{2} \\sum_{j'} \\sum_{j} \\alpha_{j'} \\alpha_j y_j y_{j'} x_j \\cdot x_{j'} - \\sum_{j} \\alpha_j, $$\n", 622 | "subject to\n", 623 | "$$ \\alpha_j \\geq 0, $$\n", 624 | "\n", 625 | "$$ \\sum_j y_j \\alpha_j = 0, $$\n", 626 | "\n", 627 | "$$ 0 \\leq \\alpha_j < C. $$\n", 628 | "\n", 629 | "Once solved, the coefficients of the hyperplane are\n", 630 | "$$ \\beta = \\sum_j \\alpha_j y_j x_j.$$\n", 631 | "\n", 632 | "Here, $\\alpha_j$ are the Lagrangian multipliers. Only the support vectors, the vectors that lie on or violate the margin, have a non-zero multiplier. Given the equation for calculating the hyperplane coefficients with the multipliers, it becomes clear that only vectors with a non-zero multiplier contribute to the construction of the hyperplane. The mathematics is in agreement with our earlier statement that only the support vectors decide the chosen hyperplane." 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": {}, 638 | "source": [ 639 | "## Appendix: The kernel trick\n", 640 | "\n", 641 | "As discussed earlier, you can introduce new non-linear terms into your data set directly. For example, if you have two features $x_1$ and $x_2$, you can introduce polynomial terms such as $x_1^2$, $x_1x_2$, and $x_2^2$. The dual formulation that applies the mapping to generate non-linear features is\n", 642 | "\n", 643 | "$$ L_D = \\frac{1}{2} \\sum_{j'} \\sum_{j} \\alpha_{j'} \\alpha_j y_j y_{j'} h(x_j) \\cdot h(x_{j'}) - \\sum_{j} \\alpha_j, $$\n", 644 | "\n", 645 | "where $h(x_j)$ is a function that projects the original vectors to the new higher dimensional space. However, if we have a large set of features, the number of new features becomes too large, even if we only include terms of degree 2. In the dual formulation, only the result of the dot product of the vectors matters. Instead of expanding the dimensions of our vectors and then taking the dot product, we can define a function $K$ such that \n", 646 | "\n", 647 | "$$ K(x_j, x_{j'}) = h(x_{j})\\cdot h(x_{j'}).$$ \n", 648 | "\n", 649 | "This function is referred to as a kernel. The result of using a kernel on the dot product of the vectors in the original space is mathematically equivalent to explicitly transforming our vectors and then taking the dot product. The kernel function _indirectly_ applies the feature transformation and avoids creating vectors of very large dimensions. The advantage of solving the problem using the dual formulation is that it allows for the use of the kernel trick. With the kernel, we can now refer to our model as a support vector machine. The kernelized form of the equation we want to minimize is\n", 650 | "\n", 651 | "$$ L_D = \\frac{1}{2} \\sum_{j'} \\sum_{j} \\alpha_{j'} \\alpha_j y_j y_{j'} K(x_j, x_{j'}) - \\sum_{j} \\alpha_j, $$\n", 652 | "\n", 653 | "Solving for the hyperplane coefficients using the dual formulation is $O(n^2p)$ to $O(n^3p)$. The training time complexity does not scale well with an increasing number of observations. 
Because of training time complexity of SVMs, they are not useful when working with large data set. There is no hard cutoff, but `scikit-learn` recommends against using a SVM with a data set of more than 100,000 samples. The class `svm.SVC` provides the kernelized form of the model, solved using the dual formulation." 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": {}, 659 | "source": [ 660 | "*Copyright © 2020 The Data Incubator. All rights reserved.*" 661 | ] 662 | } 663 | ], 664 | "metadata": { 665 | "kernelspec": { 666 | "display_name": "Python 3", 667 | "language": "python", 668 | "name": "python3" 669 | }, 670 | "nbclean": true 671 | }, 672 | "nbformat": 4, 673 | "nbformat_minor": 0 674 | } 675 | -------------------------------------------------------------------------------- /datacourse/machine-learning/imputation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /datacourse/miniprojects/in.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "init_cell": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%logstop\n", 12 | "%logstart -rtq ~/.logs/in.py append\n", 13 | "%matplotlib inline\n", 14 | "import matplotlib\n", 15 | "import seaborn as sns\n", 16 | "sns.set()\n", 17 | "matplotlib.rcParams['figure.dpi'] = 144" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 3, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "from static_grader import grader" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "# IN Miniproject\n", 34 | "\n", 35 | "The objective of this miniproject to teach you about our grading system. We will do this by solving two easy problems. \n", 36 | "\n", 37 | "Our grader is a custom bit of code that needs to imported as above in order to work. Questions are scored by calling `grader.score.question_name(answer)`. The `question_name` method be different for each question, and will be provided for you. The form of `answer` will vary question to question. Sometimes you will be asked to submit a fixed value; other times you will be asked to submit a function which will compute something based on some (hidden) input.\n", 38 | "\n", 39 | "Your answer will be graded automatically and the score returned to you on a 0 to 1 scale. (Sometimes you can do even better than 1!) We will count your highest grade, so feel free to try out different solutions!" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "# Problem 1\n", 47 | "\n", 48 | "Compute the five smallest positive even numbers. Provide these in a list.\n", 49 | "\n", 50 | "In this case, the answer is a fixed value, so you may compute (or hard-code) the answer and submit it.\n", 51 | "Note that we have provided a \"dummy solution\" below. This dummy solution illustrates the correct format of the solution (i.e. a list of 5 numbers). If you are encountering an error when you try to submit a solution to the grader, double check that your answer has the same structure as the dummy solution." 
52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "even_numbers = [2, 4, 6, 8, 10]" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 5, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "==================\n", 73 | "Your score: 1.0\n", 74 | "==================\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "grader.score.in__problem1(even_numbers)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "# Problem 2\n", 87 | "\n", 88 | "Define a function that accepts a list (called `numbers` below) as input and return a list where each element is multiplied by 10. The grader will supply the argument `numbers` to the function when you run the `grader.score.in__problem2` method below.\n", 89 | "\n", 90 | "In this case, you need to write a function that will work for arbitrary input.\n", 91 | "Before submitting your function to the grader, you may want to check that it returns the output that you expect by evaluating code similar to the following:\n", 92 | "\n", 93 | "```python\n", 94 | "\n", 95 | "test_numbers = [1, 2, 3]\n", 96 | "mult(test_numbers)\n", 97 | "```" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 6, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "def mult(numbers):\n", 107 | " return [n * 10 for n in len(numbers)]" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 7, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "ename": "TypeError", 117 | "evalue": "'int' object is not iterable", 118 | "output_type": "error", 119 | "traceback": [ 120 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 121 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 122 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mgrader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0min__problem2\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 123 | "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/static_grader/grader.py\u001b[0m in \u001b[0;36mfunc\u001b[0;34m(*args, **kw)\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 94\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 95\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 124 
| "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/static_grader/grader.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, question_name, func)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0mtest_cases\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 90\u001b[0;31m \u001b[0mtest_cases_grading\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquestion_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_cases\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 91\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 125 | "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/static_grader/grader.py\u001b[0m in \u001b[0;36mtest_cases_grading\u001b[0;34m(question_name, func, test_cases)\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtest_case\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtest_cases\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minspect\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misroutine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 42\u001b[0;31m \u001b[0msub_res\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mtest_case\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'args'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mtest_case\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'kwargs'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 43\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtest_case\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'args'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtest_case\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'kwargs'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0msub_res\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 126 | "\u001b[0;32m\u001b[0m in \u001b[0;36mmult\u001b[0;34m(numbers)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnumbers\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mn\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m10\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mn\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnumbers\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 127 | "\u001b[0;31mTypeError\u001b[0m: 'int' object is not iterable" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "grader.score.in__problem2(mult)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "*Copyright © 2020 The Data Incubator. All rights reserved.*" 140 | ] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.7.3" 160 | }, 161 | "nbclean": true 162 | }, 163 | "nbformat": 4, 164 | "nbformat_minor": 1 165 | } 166 | -------------------------------------------------------------------------------- /datacourse/miniprojects/nlp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 35, 6 | "metadata": { 7 | "init_cell": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%logstop\n", 12 | "%logstart -rtq ~/.logs/nlp.py append\n", 13 | "%matplotlib inline\n", 14 | "import matplotlib\n", 15 | "import seaborn as sns\n", 16 | "sns.set()\n", 17 | "matplotlib.rcParams['figure.dpi'] = 144" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 3, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "from static_grader import grader" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "# NLP Miniproject" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Introduction\n", 41 | "\n", 42 | "The objective of this miniproject is to gain experience with natural language processing and how to use text data to train a machine learning model to make predictions. For the miniproject, we will be working with product review text from Amazon. The reviews are for only products in the \"Electronics\" category. The objective is to train a model to predict the rating, ranging from 1 to 5 stars.\n", 43 | "\n", 44 | "## Scoring\n", 45 | "\n", 46 | "For most of the questions, you will be asked to submit the `predict` method of your trained model to the grader. The grader will use the passed `predict` method to evaluate how your model performs on a test set with respect to a reference model. The grader uses the [R2-score](https://scikit-learn.org/stable/modules/model_evaluation.html#r2-score) for model evaluation. If your model performs better than the reference solution, then you can score higher than 1.0. For the last question, you will submit the result of an analysis and your passed answer will be compared directly to the reference solution.\n", 47 | "\n", 48 | "## Downloading and loading the data\n", 49 | "\n", 50 | "The data set is available on Amazon S3 and comes as a compressed file where each line is a JSON object. To load the data set, we will need to use the `gzip` library to open the file and decode each JSON into a Python dictionary. In the end, we have a list of dictionaries, where each dictionary represents an observation." 
51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 4, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stderr", 60 | "output_type": "stream", 61 | "text": [ 62 | "mkdir: cannot create directory ‘data’: File exists\n", 63 | "File ‘./data/amazon_electronics_reviews_training.json.gz’ already there; not retrieving.\n", 64 | "\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "%%bash\n", 70 | "mkdir data\n", 71 | "wget http://dataincubator-wqu.s3.amazonaws.com/mldata/amazon_electronics_reviews_training.json.gz -nc -P ./data" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "import gzip\n", 81 | "import ujson as json\n", 82 | "\n", 83 | "with gzip.open(\"data/amazon_electronics_reviews_training.json.gz\", \"r\") as f: \n", 84 | " data = [json.loads(line) for line in f]" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "The ratings are stored in the keyword `\"overall\"`. You should create an array of the ratings for each review, preferably using list comprehensions." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 6, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "[{'reviewerID': 'A238V1XTSK9NFE',\n", 103 | " 'asin': 'B00004VX3T',\n", 104 | " 'reviewerName': 'Andrew Lynn',\n", 105 | " 'helpful': [2, 2],\n", 106 | " 'reviewText': \"I bought this mouse to use with my laptop because I don't like those little touchpads. I could not be happier.Since it's USB, I can plug it in with the computer already on and expect it to work automatically. Since it's optical (the new kind, not to be confused with the old Sun optical mice that required a special checkered mouse pad) it works on most surfaces, including my pant legs, my couch, and random tables that I put my laptop down on. It's also light and durable, features that help with portability.The wheel is surprisingly useful. In addition to scrolling, it controls zoom and pan in programs like Autocad and 3D Studio Max. I can no longer bear using either of these programs without it.One complaint - the software included with the Internet navigation features is useless. Don't bother installing it if you have a newer Windows version that automatically supports wheel mice. Just plug it in and use it - it's that easy.\",\n", 107 | " 'overall': 5.0,\n", 108 | " 'summary': 'Excellent mouse for laptop users',\n", 109 | " 'unixReviewTime': 1007942400,\n", 110 | " 'reviewTime': '12 10, 2001'}]" 111 | ] 112 | }, 113 | "execution_count": 6, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "data[:1]" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 7, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "ratings = [d['overall'] for d in data]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "**Note**, the test set used by the grader is in the same format as that of `data`, a list of dictionaries. Your trained model needs to accept data in the same format. Thus, you should use `Pipeline` when constructing your model so that all necessary transformation needed are encapsulated into a single estimator object." 
136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "## Question 1: Bag of words model\n", 143 | "\n", 144 | "Construct a machine learning model trained on word counts using the bag of words algorithm. Remember, the bag of words is implemented with `CountVectorizer`. Some things you should consider:\n", 145 | "\n", 146 | "* The reference solution uses a linear model and you should as well; use either `Ridge` or `SGDRegressor`.\n", 147 | "* The text review is stored in the key `\"reviewText\"`. You will need to construct a custom transformer to extract out the value of this key. It will be the first step in your pipeline.\n", 148 | "* Consider what hyperparameters you will need to tune for your model.\n", 149 | "* Subsampling the training data will boost training times, which will be helpful when determining the best hyperparameters to use. Note, your final model will perform best if it is trained on the full data set.\n", 150 | "* Including stop words may help with performance." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 8, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "reviewText = [text['reviewText'] for text in data]" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 9, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "data": { 169 | "text/plain": [ 170 | "'One by one, all of the discs went bad within a 6 months period. It was a real pain. As a result, I would tend to not buy Memorex discs again.'" 171 | ] 172 | }, 173 | "execution_count": 9, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "reviewText[1]" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 10, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 189 | "class keyTransformer(BaseEstimator, TransformerMixin):\n", 190 | " def __init__(self, key):\n", 191 | " self.key = key\n", 192 | " \n", 193 | " def fit(self, X, y=None):\n", 194 | " return self\n", 195 | " def transform(self, X):\n", 196 | " return [text[self.key] for text in X]" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 11, 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "name": "stdout", 206 | "output_type": "stream", 207 | "text": [ 208 | "I only use the remote control twice a year - at Christmas to take group photos of my family and my wife's family - but this little gadget is worth every penny. I used to set the timer, then start it running while I ran around to get in the picture. It's so much easier to trigger the shutter by remote control. 
I've used it with a D200 and D300, and it works perfectly.\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "key = keyTransformer(\"reviewText\")\n", 214 | "key_trs = key.fit_transform(data)\n", 215 | "print(key_trs[-1])" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 12, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "import spacy\n", 225 | "\n", 226 | "# load text processing pipeline\n", 227 | "nlp = spacy.load('en')" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 13, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "from spacy.lang.en import STOP_WORDS\n" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 14, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "# we need to generate the lemmas of the stop words\n", 246 | "stop_words_str = \" \".join(STOP_WORDS) " 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 15, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "stop_words_lemma = set(word.lemma_ for word in nlp(stop_words_str))" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 16, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer\n", 265 | "from sklearn.pipeline import Pipeline\n", 266 | "from sklearn.linear_model import Ridge" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 17, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "# create and train pipeline\n", 276 | "vectorizer = CountVectorizer()\n", 277 | "\n", 278 | "bag_of_words_model = Pipeline([\n", 279 | " ('selector', keyTransformer('reviewText')),\n", 280 | " ('vectorizer', vectorizer),\n", 281 | " ('regressor', Ridge(alpha=0.8))\n", 282 | "])\n", 283 | "\n", 284 | "bag_of_words_model.fit(data, ratings);" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 18, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": "stream", 295 | "text": [ 296 | "==================\n", 297 | "Your score: 1.0503340548115643\n", 298 | "==================\n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "grader.score.nlp__bag_of_words_model(bag_of_words_model.predict)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "## Question 2: Normalized model\n", 311 | "\n", 312 | "Using raw counts will not be as effective as using normalized counts. There are several ways to normalize raw counts; the `HashingVectorizer` class has the keyword `norm`, and there are also the `TfidfTransformer` and `TfidfVectorizer` classes that perform tf-idf weighting on the counts. Apply normalization to your model to improve performance." 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 19, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "HashingVectorizer?"
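The recorded solution for this question uses `HashingVectorizer` (next cell). Purely to illustrate the tf-idf route that the prompt also mentions, a variant might look like the sketch below; `tfidf_model` is an illustrative name, `alpha=0.8` simply mirrors the earlier choice, and `keyTransformer`, `data`, and `ratings` are reused from above.

```python
# Illustrative sketch only -- not the recorded solution for this question.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

tfidf_model = Pipeline([
    ("selector", keyTransformer("reviewText")),  # custom transformer defined earlier
    ("vectorizer", TfidfVectorizer()),           # tf-idf weighting with l2 normalization by default
    ("regressor", Ridge(alpha=0.8)),
])
# tfidf_model.fit(data, ratings) would train the tf-idf variant on the same inputs.
```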
322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 20, 327 | "metadata": {}, 328 | "outputs": [ 329 | { 330 | "data": { 331 | "text/plain": [ 332 | "Pipeline(memory=None,\n", 333 | " steps=[('selector', keyTransformer(key='reviewText')),\n", 334 | " ('vectorizer',\n", 335 | " HashingVectorizer(alternate_sign=True, analyzer='word',\n", 336 | " binary=False, decode_error='strict',\n", 337 | " dtype=<class 'numpy.float64'>,\n", 338 | " encoding='utf-8', input='content',\n", 339 | " lowercase=True, n_features=1048576,\n", 340 | " ngram_range=(1, 1), norm='l2',\n", 341 | " preprocessor=None, stop_words=None,\n", 342 | " strip_accents=None,\n", 343 | " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 344 | " tokenizer=None)),\n", 345 | " ('regressor',\n", 346 | " Ridge(alpha=0.8, copy_X=True, fit_intercept=True,\n", 347 | " max_iter=None, normalize=False, random_state=None,\n", 348 | " solver='auto', tol=0.001))],\n", 349 | " verbose=False)" 350 | ] 351 | }, 352 | "execution_count": 20, 353 | "metadata": {}, 354 | "output_type": "execute_result" 355 | } 356 | ], 357 | "source": [ 358 | "normalized_model = Pipeline([\n", 359 | " ('selector', keyTransformer('reviewText')),\n", 360 | " ('vectorizer', HashingVectorizer()),\n", 361 | " ('regressor', Ridge(alpha=0.8))\n", 362 | "])\n", 363 | "\n", 364 | "normalized_model.fit(data, ratings)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 21, 370 | "metadata": {}, 371 | "outputs": [ 372 | { 373 | "name": "stdout", 374 | "output_type": "stream", 375 | "text": [ 376 | "==================\n", 377 | "Your score: 1.0384302489344495\n", 378 | "==================\n" 379 | ] 380 | } 381 | ], 382 | "source": [ 383 | "grader.score.nlp__normalized_model(normalized_model.predict)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "## Question 3: Bigrams model\n", 391 | "\n", 392 | "Model performance may increase when you include additional features generated by counting bigrams. Include bigrams in your model. When using more features, the risk of overfitting increases. Make sure you try to minimize overfitting as much as possible."
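One way to keep the larger bigram feature space from overfitting is to cross-validate the `Ridge` regularization strength with `GridSearchCV`. The sketch below is illustrative only: the grid, the 3-fold CV, and the names `bigram_pipe` and `bigram_search` are assumptions rather than the settings behind the recorded score, and `keyTransformer`, `data`, and `ratings` are reused from above.

```python
# Hedged sketch: tune alpha to control overfitting; all values are illustrative.
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

bigram_pipe = Pipeline([
    ("selector", keyTransformer("reviewText")),
    ("vectorizer", HashingVectorizer(ngram_range=(1, 2))),  # unigrams + bigrams
    ("regressor", Ridge()),
])

param_grid = {"regressor__alpha": np.logspace(-1, 2, 6)}  # regularization strengths to try
bigram_search = GridSearchCV(bigram_pipe, param_grid, cv=3, n_jobs=2)
# bigram_search.fit(data, ratings) would pick the alpha with the best cross-validated score;
# bigram_search.best_estimator_ could then be submitted to the grader.
```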
393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 22, 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "text/plain": [ 403 | "Pipeline(memory=None,\n", 404 | " steps=[('selector', keyTransformer(key='reviewText')),\n", 405 | " ('vectorizer',\n", 406 | " HashingVectorizer(alternate_sign=True, analyzer='word',\n", 407 | " binary=False, decode_error='strict',\n", 408 | " dtype=<class 'numpy.float64'>,\n", 409 | " encoding='utf-8', input='content',\n", 410 | " lowercase=True, n_features=1048576,\n", 411 | " ngram_range=(1, 2), norm='l2',\n", 412 | " preprocessor=None,\n", 413 | " stop_words={'a', 'about', 'above'...\n", 414 | " 'also', 'although', 'always',\n", 415 | " 'am', 'among', 'amongst',\n", 416 | " 'amount', 'an', 'and', 'another',\n", 417 | " 'any', 'anyhow', 'anyone',\n", 418 | " 'anything', 'anyway', 'anywhere',\n", 419 | " 'are', ...},\n", 420 | " strip_accents=None,\n", 421 | " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 422 | " tokenizer=None)),\n", 423 | " ('regressor',\n", 424 | " Ridge(alpha=0.8, copy_X=True, fit_intercept=True,\n", 425 | " max_iter=None, normalize=False, random_state=None,\n", 426 | " solver='auto', tol=0.001))],\n", 427 | " verbose=False)" 428 | ] 429 | }, 430 | "execution_count": 22, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "\n", 437 | "bigrams_model = Pipeline([\n", 438 | " ('selector', keyTransformer('reviewText')),\n", 439 | " ('vectorizer', HashingVectorizer(stop_words=STOP_WORDS, ngram_range=(1, 2), tokenizer=None)),\n", 440 | " ('regressor', Ridge(alpha=0.8))\n", 441 | "])\n", 442 | "bigrams_model.fit(data, ratings)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 23, 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "name": "stdout", 452 | "output_type": "stream", 453 | "text": [ 454 | "==================\n", 455 | "Your score: 1.0293249602697048\n", 456 | "==================\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | "grader.score.nlp__bigrams_model(bigrams_model.predict)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": {}, 467 | "source": [ 468 | "## Question 4: Polarity analysis\n", 469 | "\n", 470 | "Let's derive some insight from our analysis. We want to determine the most polarizing words in the corpus of reviews. In other words, we want to identify words that strongly signal a review is either positive or negative. For example, we understand a word like \"terrible\" will mostly appear in negative rather than positive reviews. The naive Bayes model calculates probabilities such as $P(\\text{terrible } | \\text{ negative})$, the probability that the word \"terrible\" appears in the text given that the review is negative. Using these probabilities, we can derive a polarity score for each counted word,\n", 471 | "\n", 472 | "$$\n", 473 | "\\text{polarity} = \\log\\left(\\frac{P(\\text{word } | \\text{ positive})}{P(\\text{word } | \\text{ negative})}\\right).\n", 474 | "$$ \n", 475 | "\n", 476 | "The polarity analysis is an example where a simpler model offers more interpretability than a more complicated model. For this question, you are asked to determine the top twenty-five words with the largest positive **and** largest negative polarity, for a total of fifty words. For this analysis, you should:\n", 477 | "\n", 478 | "1. Use the naive Bayes model, `MultinomialNB`.\n", 479 | "1. Use tf-idf weighting.\n", 480 | "1. 
Remove stop words.\n", 481 | "\n", 482 | "A trained naive Bayes model stores the log of the probabilities in the attribute `feature_log_prob_`. It is a NumPy array of shape (number of classes, number of features). You will need the mapping from feature index to word. For this problem, you will use a different data set; it has been processed to only include reviews with one and five stars. You can download it below." 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 24, 488 | "metadata": {}, 489 | "outputs": [ 490 | { 491 | "name": "stderr", 492 | "output_type": "stream", 493 | "text": [ 494 | "File ‘./data/amazon_one_and_five_star_reviews.json.gz’ already there; not retrieving.\n", 495 | "\n" 496 | ] 497 | } 498 | ], 499 | "source": [ 500 | "%%bash\n", 501 | "wget http://dataincubator-wqu.s3.amazonaws.com/mldata/amazon_one_and_five_star_reviews.json.gz -nc -P ./data" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "To avoid memory issues, we can delete the earlier data set." 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 25, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "del data, ratings" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 26, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "import numpy as np\n", 527 | "from sklearn.naive_bayes import MultinomialNB\n", 528 | "\n", 529 | "with gzip.open(\"data/amazon_one_and_five_star_reviews.json.gz\", \"r\") as f:\n", 530 | " data_polarity = [json.loads(line) for line in f]\n", 531 | "\n", 532 | "ratings = [d['overall'] for d in data_polarity]\n" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 28, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "# lemmatize the stop words\n", 542 | "stop_words_str = \" \".join(STOP_WORDS)\n", 543 | "stop_words_lemma = set(word.lemma_ for word in nlp(stop_words_str))\n", 544 | "\n", 545 | "# create and train pipeline\n", 546 | "tfidf = TfidfVectorizer(stop_words=stop_words_lemma, tokenizer=None)  # stop-word-aware vectorizer that could replace the plain one below\n", 547 | "\n", 548 | "pipe = Pipeline([(\"selector\", keyTransformer(\"reviewText\")),\n", 549 | " (\"vectorizer\", TfidfVectorizer()),\n", 550 | " (\"cls\", MultinomialNB())])\n" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 29, 556 | "metadata": {}, 557 | "outputs": [ 558 | { 559 | "data": { 560 | "text/plain": [ 561 | "Pipeline(memory=None,\n", 562 | " steps=[('selector', keyTransformer(key='reviewText')),\n", 563 | " ('vectorizer',\n", 564 | " TfidfVectorizer(analyzer='word', binary=False,\n", 565 | " decode_error='strict',\n", 566 | " dtype=<class 'numpy.float64'>,\n", 567 | " encoding='utf-8', input='content',\n", 568 | " lowercase=True, max_df=1.0, max_features=None,\n", 569 | " min_df=1, ngram_range=(1, 1), norm='l2',\n", 570 | " preprocessor=None, smooth_idf=True,\n", 571 | " stop_words=None, strip_accents=None,\n", 572 | " sublinear_tf=False,\n", 573 | " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 574 | " tokenizer=None, use_idf=True,\n", 575 | " vocabulary=None)),\n", 576 | " ('cls',\n", 577 | " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],\n", 578 | " verbose=False)" 579 | ] 580 | }, 581 | "execution_count": 29, 582 | "metadata": {}, 583 | "output_type": "execute_result" 584 | } 585 | ], 586 | "source": [ 587 | "pipe.fit(data_polarity, ratings)\n" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 30, 593 | "metadata": {}, 594 | "outputs": [],
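Before reading probabilities out of the fitted pipeline, it helps to confirm which row of `feature_log_prob_` belongs to which rating: `MultinomialNB` orders its rows by sorted class label, so with one- and five-star reviews row 0 corresponds to 1.0 (negative) and row 1 to 5.0 (positive), and the polarity defined above is the row for 5.0 minus the row for 1.0. A quick illustrative check (not part of the recorded run):

```python
# Illustrative sanity check on class ordering; `pipe` is the fitted pipeline above.
clf = pipe["cls"]
print(clf.classes_)                  # expected: [1. 5.]
print(clf.feature_log_prob_.shape)   # (n_classes, n_features)
```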
"outputs": [], 595 | "source": [ 596 | "# get feature from the model\n", 597 | "feature = pipe['vectorizer'].get_feature_names()\n", 598 | "\n", 599 | "# Get the log probability from the model\n", 600 | "log_prob = pipe['cls'].feature_log_prob_\n", 601 | "log_prob_polarity = log_prob[0, :] - log_prob[1, :]\n" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 31, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "# combine feature and log polarity\n", 611 | "polar = sorted(list(zip(log_prob_polarity, feature)))\n", 612 | "most_polar = polar[:25] + polar[-25:]\n", 613 | "top_50 = [term for score, term in most_polar]" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 33, 619 | "metadata": {}, 620 | "outputs": [ 621 | { 622 | "name": "stdout", 623 | "output_type": "stream", 624 | "text": [ 625 | "==================\n", 626 | "Your score: 1.0\n", 627 | "==================\n" 628 | ] 629 | } 630 | ], 631 | "source": [ 632 | "grader.score.nlp__most_polar(top_50)" 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": {}, 638 | "source": [ 639 | "## Question 5: Topic modeling [optional]\n", 640 | "\n", 641 | "Topic modeling is the analysis of determining the key topics or themes in a corpus. With respect to machine learning, topic modeling is an unsupervised technique. One way to uncover the main topics in a corpus is to use [non-negative matrix factorization](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html). For this question, use non-negative matrix factorization to determine the top ten words for the first twenty topics. You should submit your answer as a list of lists. What topics exist in the reviews?" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "metadata": {}, 648 | "outputs": [], 649 | "source": [ 650 | "from sklearn.decomposition import NMF\n", 651 | " " 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "*Copyright © 2020 The Data Incubator. All rights reserved.*" 659 | ] 660 | } 661 | ], 662 | "metadata": { 663 | "kernelspec": { 664 | "display_name": "Python 3", 665 | "language": "python", 666 | "name": "python3" 667 | }, 668 | "language_info": { 669 | "codemirror_mode": { 670 | "name": "ipython", 671 | "version": 3 672 | }, 673 | "file_extension": ".py", 674 | "mimetype": "text/x-python", 675 | "name": "python", 676 | "nbconvert_exporter": "python", 677 | "pygments_lexer": "ipython3", 678 | "version": "3.7.3" 679 | }, 680 | "nbclean": true 681 | }, 682 | "nbformat": 4, 683 | "nbformat_minor": 1 684 | } 685 | --------------------------------------------------------------------------------