├── .gitignore ├── README.md ├── conda-reqs.txt └── notebooks ├── 1.1 - Loading Data into Python.ipynb ├── 1.2 - A new data set - exploratory analysis.ipynb ├── 1.3 - Getting data in the right shape - preprocessing and cleaning.ipynb ├── 2.1 - Tokenization.ipynb ├── 2.2 - Stop-word and punctuation removal.ipynb ├── 2.3 - Text Normalization.ipynb ├── 2.4 - Calculating Word Frequencies.ipynb ├── 3.2 - Regression analysis - predicting a quantity.ipynb ├── 3.3 - Binary Classification - predicting a label (out of two).ipynb ├── 3.4 - Multi-class Classification - predicting a label (out of many).ipynb ├── 3.5 - Cluster Analysis - grouping similar items.ipynb ├── 4.1 - Time Series Analysis.ipynb ├── 4.2 - Building a Movie Recommendation System.ipynb ├── AirPassengers.csv ├── data.csv ├── data_no_header.csv ├── movie.json ├── movies-90s.jsonl └── some_file.txt /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .ipynb_checkpoints 3 | *.pickle 4 | 5 | .DS_Store 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Practical Python Data Science Techniques 2 | ======================================== 3 | 4 | Companion code for my video course on [Practical Python Data Science Techniques](https://www.packtpub.com/application-development/practical-python-data-science-techniques-video), published by Packt Publishing. 5 | 6 | Videos on [PacktPub's Mapt](https://www.packtpub.com/application-development/practical-python-data-science-techniques-video) (the publisher) 7 | 8 | Videos on [O'Reilly's Safari Online](https://www.safaribooksonline.com/library/view/practical-python-data/9781788294294/) 9 | 10 | 11 | Setting up the environment 12 | ----- 13 | 14 | Requirement: Python `3.6` 15 | 16 | We suggest using the [Anaconda distribution](https://continuum.io/downloads "Download Anaconda Python").
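If you already have Anaconda installed, you can confirm that the `conda` command is available with `conda --version` (an optional check; it assumes `conda` is on your PATH).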
17 | 18 | Create and activate a `conda` environment: 19 | 20 | conda create --name packt-py36 python=3.6 --yes 21 | source activate packt-py36 # Linux / macOS 22 | activate packt-py36 # Windows 23 | 24 | Install the libraries used in the course: 25 | 26 | conda install --file conda-reqs.txt --yes 27 | 28 | Launch Jupyter notebook: 29 | 30 | jupyter notebook 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /conda-reqs.txt: -------------------------------------------------------------------------------- 1 | pandas==0.20.1 2 | numpy==1.11.3 3 | nltk==3.2.2 4 | scikit-learn==0.18.1 5 | scipy==0.19.0 6 | matplotlib==2.0.0 7 | statsmodels==0.8.0 8 | jupyter==1.0.0 9 | -------------------------------------------------------------------------------- /notebooks/1.1 - Loading Data into Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 1.1 Loading Data into Python" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Opening and reading files" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "%cat some_file.txt" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "fname = 'some_file.txt'\n", 37 | "\n", 38 | "f = open(fname, 'r')\n", 39 | "content = f.read()\n", 40 | "f.close()\n", 41 | "\n", 42 | "print(content)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "collapsed": false 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "fname = 'some_file.txt'\n", 54 | "with open(fname, 'r') as f:\n", 55 | " content = f.read()\n", 56 | "\n", 57 | "print(content)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "fname = 'some_file.txt'\n", 69 | "with open(fname, 'r') as f:\n", 70 | " content = f.readlines()\n", 71 | "\n", 72 | "print(content)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "collapsed": false, 80 | "scrolled": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "fname = 'some_file.txt'\n", 85 | "with open(fname, 'r') as f:\n", 86 | " for line in f:\n", 87 | " print(line)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "fname = 'some_file.txt'\n", 99 | "with open(fname, 'r') as f:\n", 100 | " for i, line in enumerate(f):\n", 101 | " print(\"Line {}: {}\".format(i, line.strip()))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### JSON\n", 109 | "\n", 110 | "JavaScript Object Notation\n", 111 | "\n", 112 | "Good for data serialization and communication between services" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "%cat movie.json" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | 
"outputs": [], 133 | "source": [ 134 | "import json\n", 135 | "\n", 136 | "fname = 'movie.json'\n", 137 | "with open(fname, 'r') as f:\n", 138 | " content = f.read()\n", 139 | " movie = json.loads(content)\n", 140 | "\n", 141 | "movie" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": { 148 | "collapsed": true 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "import json\n", 153 | "\n", 154 | "fname = 'movie.json'\n", 155 | "with open(fname, 'r') as f:\n", 156 | " movie_alt = json.load(f)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "movie == movie_alt" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "collapsed": false 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "print(json.dumps(movie, indent=4))" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": { 185 | "collapsed": false 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "%cat movies-90s.jsonl" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "collapsed": false 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "import json\n", 201 | "\n", 202 | "fname = 'movies-90s.jsonl'\n", 203 | "\n", 204 | "with open(fname, 'r') as f:\n", 205 | " for line in f:\n", 206 | " try:\n", 207 | " movie = json.loads(line)\n", 208 | " print(movie['title'])\n", 209 | " except: \n", 210 | " ...\n" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "### CSV files\n", 218 | "\n", 219 | "Comma Separated Values\n", 220 | "\n", 221 | "This format is very common for import/export for spreadsheet and databases" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "%cat data.csv" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "import csv\n", 244 | "\n", 245 | "fname = 'data.csv'\n", 246 | "\n", 247 | "with open(fname, 'r') as f:\n", 248 | " data_reader = csv.reader(f, delimiter=',')\n", 249 | " headers = next(data_reader)\n", 250 | " print(\"Headers = {}\".format(headers))\n", 251 | " for line in data_reader:\n", 252 | " print(line)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": false 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "fname = 'data_no_header.csv'\n", 264 | "\n", 265 | "with open(fname, 'r') as f:\n", 266 | " data_reader = csv.reader(f, delimiter=',')\n", 267 | " for line in data_reader:\n", 268 | " print(line)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "fname = 'data.csv'\n", 280 | "\n", 281 | "with open(fname, 'r') as f:\n", 282 | " data_reader = csv.reader(f, delimiter=',')\n", 283 | " headers = next(data_reader)\n", 284 | " data = []\n", 285 | " for line in data_reader:\n", 286 | " item = {headers[i]: value for i, value in enumerate(line)}\n", 287 | " data.append(item)\n", 288 | "\n", 289 | "data" 290 | ] 291 | }, 292 | { 293 | "cell_type": 
"markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "### Pickles: Python object serialization" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": { 303 | "collapsed": false 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "with open('movie.json', 'r') as f:\n", 308 | " content = f.read()\n", 309 | " data = json.loads(content)\n", 310 | "\n", 311 | "data" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": { 318 | "collapsed": false 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "type(data)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "import pickle \n", 334 | "\n", 335 | "with open('data.pickle', 'wb') as f:\n", 336 | " pickle.dump(data, f)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": { 343 | "collapsed": false 344 | }, 345 | "outputs": [], 346 | "source": [ 347 | "%cat data.pickle" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": { 354 | "collapsed": false 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "with open('data.pickle', 'rb') as f:\n", 359 | " data = pickle.load(f)\n", 360 | "\n", 361 | "data" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": { 368 | "collapsed": false 369 | }, 370 | "outputs": [], 371 | "source": [ 372 | "type(data)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": { 378 | "collapsed": true 379 | }, 380 | "source": [ 381 | "### Loading JSON and CSV into pandas" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": { 388 | "collapsed": true 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "import pandas as pd" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": { 399 | "collapsed": false 400 | }, 401 | "outputs": [], 402 | "source": [ 403 | "%cat movie.json" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": { 410 | "collapsed": false 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "data = pd.read_json('movie.json')\n", 415 | "data.head()" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": { 422 | "collapsed": false 423 | }, 424 | "outputs": [], 425 | "source": [ 426 | "%cat movies-90s.jsonl" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": { 433 | "collapsed": false 434 | }, 435 | "outputs": [], 436 | "source": [ 437 | "data = pd.read_json('movies-90s.jsonl', lines=True)\n", 438 | "data.head()" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": { 445 | "collapsed": false 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "%cat data.csv" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": { 456 | "collapsed": false 457 | }, 458 | "outputs": [], 459 | "source": [ 460 | "data = pd.read_csv('data.csv')\n", 461 | "data.head()" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": { 468 | "collapsed": true 469 | }, 470 | "outputs": [], 471 | "source": [] 472 | } 473 | ], 474 | "metadata": { 475 | "kernelspec": { 476 | 
"display_name": "Python 3", 477 | "language": "python", 478 | "name": "python3" 479 | }, 480 | "language_info": { 481 | "codemirror_mode": { 482 | "name": "ipython", 483 | "version": 3 484 | }, 485 | "file_extension": ".py", 486 | "mimetype": "text/x-python", 487 | "name": "python", 488 | "nbconvert_exporter": "python", 489 | "pygments_lexer": "ipython3", 490 | "version": "3.6.0" 491 | } 492 | }, 493 | "nbformat": 4, 494 | "nbformat_minor": 2 495 | } 496 | -------------------------------------------------------------------------------- /notebooks/1.2 - A new data set - exploratory analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## A New Data Set: Exploratory Analysis\n", 8 | "\n", 9 | "Data set: [Kaggle Titanic Disaster](https://www.kaggle.com/c/titanic/data)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "data_file = '~/data/titanic/train.csv'" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "import pandas as pd\n", 32 | "\n", 33 | "data = pd.read_csv(data_file)\n", 34 | "\n", 35 | "len(data)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "data.head()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "data.count()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "data.describe()" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "data[['Age', 'Fare']].describe()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": false 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "data['Age'].min(), data['Age'].max(), data['Age'].mean()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "data['Sex'].value_counts()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "data['Sex'].value_counts() / len(data) * 100" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "data['Survived'].value_counts()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "data['Pclass'].value_counts()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "data['Pclass'].value_counts().sort_index()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | 
"execution_count": null, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "data['Age']. value_counts()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "bins = [0, 18, 25, 35, 45, 55, 65, 75, 80]\n", 168 | "\n", 169 | "data['AgeGroup'] = pd.cut(data['Age'], bins)\n", 170 | "\n", 171 | "data['AgeGroup'].value_counts().sort_index()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "%matplotlib inline\n", 183 | "\n", 184 | "data['AgeGroup'].value_counts().sort_index().plot(kind='bar')" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Python 3", 200 | "language": "python", 201 | "name": "python3" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.6.0" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 2 218 | } 219 | -------------------------------------------------------------------------------- /notebooks/1.3 - Getting data in the right shape - preprocessing and cleaning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 1.3 Getting Data in the Right Shape: Pre-processing and Cleaning" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "data_file = '~/data/titanic/train.csv'" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "import pandas as pd\n", 30 | "import numpy as np\n", 31 | "\n", 32 | "data = pd.read_csv(data_file)\n", 33 | "\n", 34 | "data.head()" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "len(data)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "data.count()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "data.duplicated().value_counts()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "data['Ticket'].duplicated().value_counts()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "data[data['Ticket'].duplicated()].head()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": 
false 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "data[data['Ticket'] == '349909']" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "#### Dropping rows with missing data" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "data.dropna().head()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "data.dropna(how='all').head()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "#### Dropping columns with missing data" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "data['XYZ'] = np.nan\n", 148 | "data.head()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "collapsed": false 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "data = data.dropna(how='all', axis=1)\n", 160 | "data.head()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "#### Filling in missing data" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "collapsed": false 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "data['Age'].count()" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": { 185 | "collapsed": false 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "data['Age'].mean()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "collapsed": false 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "data['Age'] = data['Age'].fillna(-1000)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "collapsed": false 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "data['Age'].mean()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "#### Transforming data" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "data['Embarked'].value_counts()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "data['Embarked'].count()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "collapsed": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "data['Embarked'] = data['Embarked'].fillna('U')" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "collapsed": false 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "port_names = {\n", 263 | " 'S': 'Southampton',\n", 264 | " 'C': 'Cherbourg',\n", 265 | " 'Q': 'Queenstown',\n", 266 | " 'U': 'Unknown'\n", 267 | "}\n", 268 | "\n", 269 | "data['Embarked'] = data['Embarked'].map(lambda x: port_names[x])\n", 270 | "\n", 271 | "# .map() for Series, .applymap() for DataFrame\n", 272 | "\n", 273 | "data.head()" 274 | ] 
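}, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A small illustrative aside: Series.map() also accepts a dict directly;\n", "# values missing from the dict become NaN instead of raising KeyError,\n", "# unlike the lambda above ('X' is a made-up code to show this behaviour).\n", "pd.Series(['S', 'C', 'Q', 'X']).map(port_names)" ]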
275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "collapsed": true 281 | }, 282 | "outputs": [], 283 | "source": [] 284 | } 285 | ], 286 | "metadata": { 287 | "kernelspec": { 288 | "display_name": "Python 3", 289 | "language": "python", 290 | "name": "python3" 291 | }, 292 | "language_info": { 293 | "codemirror_mode": { 294 | "name": "ipython", 295 | "version": 3 296 | }, 297 | "file_extension": ".py", 298 | "mimetype": "text/x-python", 299 | "name": "python", 300 | "nbconvert_exporter": "python", 301 | "pygments_lexer": "ipython3", 302 | "version": "3.6.0" 303 | } 304 | }, 305 | "nbformat": 4, 306 | "nbformat_minor": 2 307 | } 308 | -------------------------------------------------------------------------------- /notebooks/2.1 - Tokenization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 2.1 - Tokenization" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Word tokenization" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "s = \"The quick brown fox jumped over the lazy dog\"\n", 26 | "\n", 27 | "s.split()" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "s = \"The quick brown fox, and a lazy dog\"\n", 39 | "\n", 40 | "s.split()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "from nltk.tokenize import word_tokenize\n", 52 | "\n", 53 | "s = \"The quick brown fox, and a lazy dog\"\n", 54 | "\n", 55 | "word_tokenize(s)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "s = \"Dr. Smith is visiting the patient\"\n", 67 | "\n", 68 | "word_tokenize(s)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "s = \"USA vs. U.S.A.\"\n", 80 | "\n", 81 | "word_tokenize(s)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "s = \"USA vs. U.S.A. and more words.\"\n", 93 | "\n", 94 | "word_tokenize(s)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "#### Sentence tokenization a.k.a. sentence segmentation" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "s = \"Text with many sentences. This is a sentence.\"\n", 113 | "\n", 114 | "word_tokenize(s)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "from nltk.tokenize import sent_tokenize\n", 126 | "\n", 127 | "s = \"Text with many sentences. 
This is a sentence.\"\n", 128 | "\n", 129 | "sent_tokenize(s)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "s = \"Text with many sentences. This is a sentence.\"\n", 141 | "\n", 142 | "for sentence in sent_tokenize(s):\n", 143 | " print(word_tokenize(sentence))" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "#### Different data domains (e.g. Twitter)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "collapsed": false 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "s = 'Hi @marcobonzanini just an example! :D http://example.com #NLP'\n", 162 | "\n", 163 | "word_tokenize(s)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "from nltk.tokenize import TweetTokenizer\n", 175 | "\n", 176 | "tokenizer = TweetTokenizer()\n", 177 | "\n", 178 | "s = 'Hi @marcobonzanini just an example! :D http://example.com #NLP'\n", 179 | "tokenizer.tokenize(s)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "#### Phrases\n", 187 | "\n", 188 | "Capture concepts like \"quick brown fox\", \"good movie\" or \"nice restaurant\"" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "from nltk import bigrams, trigrams, ngrams\n", 200 | "\n", 201 | "s = \"The quick brown fox jumped over the lazy dog\"\n", 202 | "tokens = word_tokenize(s)\n", 203 | "\n", 204 | "list(bigrams(tokens))" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": false 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "list(trigrams(tokens))" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "collapsed": false 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "list(ngrams(tokens, 2))" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "kernelspec": { 241 | "display_name": "Python 3", 242 | "language": "python", 243 | "name": "python3" 244 | }, 245 | "language_info": { 246 | "codemirror_mode": { 247 | "name": "ipython", 248 | "version": 3 249 | }, 250 | "file_extension": ".py", 251 | "mimetype": "text/x-python", 252 | "name": "python", 253 | "nbconvert_exporter": "python", 254 | "pygments_lexer": "ipython3", 255 | "version": "3.6.0" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 2 260 | } 261 | -------------------------------------------------------------------------------- /notebooks/2.2 - Stop-word and punctuation removal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Stop-word and punctuation removal" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Stop-words" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": false 
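}, "outputs": [], "source": [ "# The stop-word list ships as an NLTK corpus; if it is not installed yet,\n", "# download it once (outside this notebook) with:\n", "#     $ python -m nltk.downloader stopwords" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false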
22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from nltk.corpus import stopwords\n", 26 | "\n", 27 | "stop_list = stopwords.words('english')\n", 28 | "\n", 29 | "stop_list" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "len(stop_list)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "#### Punctuation" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "from string import punctuation\n", 59 | "\n", 60 | "punctuation" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "list(punctuation)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "stop_list += list(punctuation)\n", 83 | "\n", 84 | "len(stop_list)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": false 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "stop_list" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "#### Adding custom words" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "stop_list += ['rt', 'via'] # custom list\n", 114 | "\n", 115 | "len(stop_list)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "#### Fun with Unicode\n", 123 | "\n", 124 | "Unicode categories: https://en.wikipedia.org/wiki/Unicode_character_property#General_Category\n", 125 | "\n", 126 | "Punctuation categories are labelled as P*" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "collapsed": true 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "from unicodedata import category" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "category('A') # Letter, uppercase" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "collapsed": false 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "category('a') # Letter, lowercase" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "category('.') # Punctuation, other" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "dashes = ['‒', '–', '—', '―', '⁓'] # https://en.wikipedia.org/wiki/Dash#Common_dashes\n", 182 | "\n", 183 | "'-' in dashes" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "for d in dashes:\n", 195 | " print(category(d))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | 
"collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "def is_unicode_punct(token):\n", 207 | " try:\n", 208 | " return category(token).startswith('P')\n", 209 | " except TypeError:\n", 210 | " return False" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "is_unicode_punct('A')" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "is_unicode_punct('.')" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "is_unicode_punct('HELLOOO')" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "#### Putting everything together" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": { 257 | "collapsed": false 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "from nltk.tokenize import word_tokenize\n", 262 | "\n", 263 | "text = \"\"\"Python is a widely used high-level programming\n", 264 | "language for general-purpose programming,\n", 265 | "created by Guido van Rossum and first released in 1991.\"\"\"\n", 266 | "# text from https://en.wikipedia.org/wiki/Python_(programming_language)\n", 267 | "\n", 268 | "tokens = word_tokenize(text)\n", 269 | "\n", 270 | "tokens" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "collapsed": false 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "tokens_no_stop = [t for t in tokens\n", 282 | " if t not in stop_list and not is_unicode_punct(t)]\n", 283 | "\n", 284 | "tokens_no_stop" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": { 291 | "collapsed": true 292 | }, 293 | "outputs": [], 294 | "source": [] 295 | } 296 | ], 297 | "metadata": { 298 | "kernelspec": { 299 | "display_name": "Python 3", 300 | "language": "python", 301 | "name": "python3" 302 | }, 303 | "language_info": { 304 | "codemirror_mode": { 305 | "name": "ipython", 306 | "version": 3 307 | }, 308 | "file_extension": ".py", 309 | "mimetype": "text/x-python", 310 | "name": "python", 311 | "nbconvert_exporter": "python", 312 | "pygments_lexer": "ipython3", 313 | "version": "3.6.0" 314 | } 315 | }, 316 | "nbformat": 4, 317 | "nbformat_minor": 2 318 | } 319 | -------------------------------------------------------------------------------- /notebooks/2.3 - Text Normalization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 2.3 - Text Normalization" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Case normalization" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "'Python' == 'python'" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "'Python'.lower() == 'python'.lower()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 
| "#### Stemming" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "from nltk.stem import PorterStemmer\n", 55 | "\n", 56 | "stemmer = PorterStemmer()\n", 57 | "\n", 58 | "stemmer.stem('fish')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "stemmer.stem('fishing')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "stemmer.stem('fishes')" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "#### Lemmatization\n", 88 | "\n", 89 | "Requires Part-of-Speech (POS) tag\n", 90 | "\n", 91 | "- a = adjective\n", 92 | "- v = verb\n", 93 | "- n = noun\n", 94 | "- r = adverb" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "from nltk.stem import WordNetLemmatizer\n", 106 | "\n", 107 | "lemmatizer = WordNetLemmatizer()\n", 108 | "\n", 109 | "lemmatizer.lemmatize('having')" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "lemmatizer.lemmatize('having', pos='v')" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "lemmatizer.lemmatize('have', pos='v')" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "lemmatizer.lemmatize('had', pos='v')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "lemmatizer.lemmatize('be', pos='v')" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "lemmatizer.lemmatize('am', pos='v')" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "collapsed": false 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "lemmatizer.lemmatize('was', pos='v')" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "#### Synonym mapping" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "synonyms = {\n", 194 | " 'large': 'big',\n", 195 | " 'purchase': 'buy',\n", 196 | "}\n", 197 | "\n", 198 | "text = \"I want to purchase a large book on Big Data\"\n", 199 | "\n", 200 | "tokens = text.lower().split()\n", 201 | "\n", 202 | "tokens" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "collapsed": false 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "new_text = [synonyms.get(t, t) for t in tokens]\n", 214 | "\n", 215 | "new_text" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | 
"metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [] 226 | } 227 | ], 228 | "metadata": { 229 | "kernelspec": { 230 | "display_name": "Python 3", 231 | "language": "python", 232 | "name": "python3" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 3 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | "pygments_lexer": "ipython3", 244 | "version": "3.6.0" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 2 249 | } 250 | -------------------------------------------------------------------------------- /notebooks/2.4 - Calculating Word Frequencies.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 2.4 - Calculating Word Frequencies" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Get the data\n", 15 | "\n", 16 | "`$ python -m nltk.downloader moview_reviews`" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "from nltk.corpus import movie_reviews" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "movie_reviews.fileids()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "len(movie_reviews.fileids())" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "len(movie_reviews.fileids('pos')), len(movie_reviews.fileids('neg'))" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "pos_reviews = [movie_reviews.words(fileid)\n", 72 | " for fileid in movie_reviews.fileids('pos')]\n", 73 | "\n", 74 | "neg_reviews = [movie_reviews.words(fileid)\n", 75 | " for fileid in movie_reviews.fileids('neg')]" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "#### Term frequencies" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "one_review = pos_reviews[10]\n", 94 | "\n", 95 | "from collections import Counter\n", 96 | "c = Counter(one_review)\n", 97 | "\n", 98 | "c.most_common(20)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "from nltk.corpus import stopwords\n", 110 | "from string import punctuation\n", 111 | "\n", 112 | "stop_list = stopwords.words('english') + list(punctuation)\n", 113 | "\n", 114 | "one_review_no_stop = [word for word in one_review if word not in stop_list]\n", 115 | "\n", 116 | "c = Counter(one_review_no_stop)\n", 117 | "\n", 118 | "c.most_common(20)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "#### Frequencies across the whole collection" 126 | ] 127 | }, 128 | { 129 | 
"cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "from itertools import chain\n", 137 | "\n", 138 | "all_positive = list(chain(*pos_reviews))\n", 139 | "all_negative = list(chain(*neg_reviews))\n", 140 | "\n", 141 | "all_positive" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": { 148 | "collapsed": false 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "total_freq = Counter(all_positive)\n", 153 | "\n", 154 | "total_freq.most_common(20)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "all_positive_no_stop = [t for t in all_positive if t not in stop_list]\n", 166 | "all_negative_no_stop = [t for t in all_negative if t not in stop_list]\n", 167 | "\n", 168 | "total_freq_no_stop = Counter(all_positive_no_stop)\n", 169 | "\n", 170 | "total_freq_no_stop.most_common(20)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "from nltk import FreqDist\n", 182 | "\n", 183 | "f = FreqDist(all_positive)\n", 184 | "\n", 185 | "f.most_common(20)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [], 195 | "source": [ 196 | "%matplotlib inline\n", 197 | "\n", 198 | "f.plot(30)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "f = FreqDist(all_positive_no_stop)\n", 210 | "\n", 211 | "f.most_common(20)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": { 218 | "collapsed": false 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "f.plot(30)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "f = FreqDist(all_negative)\n", 234 | "\n", 235 | "f.plot(30)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "f = FreqDist(all_negative_no_stop)\n", 247 | "\n", 248 | "f.plot(30)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "#### Zipf's Law\n", 256 | "\n", 257 | "https://en.wikipedia.org/wiki/Zipf%27s_law" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [] 268 | } 269 | ], 270 | "metadata": { 271 | "kernelspec": { 272 | "display_name": "Python 3", 273 | "language": "python", 274 | "name": "python3" 275 | }, 276 | "language_info": { 277 | "codemirror_mode": { 278 | "name": "ipython", 279 | "version": 3 280 | }, 281 | "file_extension": ".py", 282 | "mimetype": "text/x-python", 283 | "name": "python", 284 | "nbconvert_exporter": "python", 285 | "pygments_lexer": "ipython3", 286 | "version": "3.6.0" 287 | } 288 | }, 289 | "nbformat": 4, 290 | "nbformat_minor": 2 291 | } 292 | -------------------------------------------------------------------------------- /notebooks/3.2 - 
Regression analysis - predicting a quantity.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 3.2 - Regression Analysis - Predicting a Quantity\n", 8 | "\n", 9 | "Data set: Boston House Prices (shipped with scikit-learn)\n", 10 | "\n", 11 | "Full data set and docs: http://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "#### Load up the data" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "from sklearn.datasets import load_boston\n", 30 | "\n", 31 | "boston = load_boston()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "print(boston.DESCR)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "collapsed": false 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "boston.data[0]" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "boston.feature_names" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "#### Quick look at the data" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "import pandas as pd\n", 83 | "\n", 84 | "data = pd.DataFrame(boston.data, columns=boston.feature_names)\n", 85 | "\n", 86 | "data.head()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "data['PRICE'] = boston.target\n", 98 | "\n", 99 | "data.head()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "#### House price prediction" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "X = data[['RM']] # only one feature first\n", 118 | "Y = data['PRICE']" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "from sklearn.model_selection import train_test_split\n", 130 | "\n", 131 | "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "from sklearn.linear_model import LinearRegression\n", 143 | "\n", 144 | "model = LinearRegression()\n", 145 | "model.fit(X_train, Y_train)\n", 146 | "\n", 147 | "Y_prediction = model.predict(X_test)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "Y_prediction[0]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | 
"outputs": [], 168 | "source": [ 169 | "Y_test.values[0]" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "#### Evaluation\n", 177 | "\n", 178 | "Mean Squared Error\n", 179 | "\n", 180 | "$MSE = \\frac{1}{n} \\sum_{i=1}^{n} (\\hat{Y}_{i} - Y_{i})^2$\n", 181 | "\n", 182 | "$\\hat{Y}_{i}$ = prediction on ith sample\n", 183 | "\n", 184 | "$Y_{i}$ = true value for ith sample" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "%matplotlib inline\n", 196 | "\n", 197 | "import matplotlib.pyplot as plt\n", 198 | "\n", 199 | "plt.scatter(Y_test, Y_prediction)\n", 200 | "plt.xlabel(\"Prices: $Y_{test}$\")\n", 201 | "plt.ylabel(\"Predicted prices: $Y_{predicted}$\")\n", 202 | "plt.title(\"Prices vs Predicted prices\")" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "collapsed": false 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "from sklearn.metrics import mean_squared_error\n", 214 | "\n", 215 | "mse = mean_squared_error(Y_test, Y_prediction)\n", 216 | "\n", 217 | "mse" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "collapsed": false 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "X = data.drop('PRICE', axis=1) # all features\n", 229 | "\n", 230 | "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "collapsed": false 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "model = LinearRegression()\n", 242 | "model.fit(X_train, Y_train)\n", 243 | "\n", 244 | "Y_prediction = model.predict(X_test)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "plt.scatter(Y_test, Y_prediction)\n", 256 | "plt.xlabel(\"Prices: $Y_i$\")\n", 257 | "plt.ylabel(\"Predicted prices: $\\hat{Y}_i$\")\n", 258 | "plt.title(\"Prices vs Predicted prices: $Y_i$ vs $\\hat{Y}_i$\")" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "collapsed": false 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "mse = mean_squared_error(Y_test, Y_prediction)\n", 270 | "\n", 271 | "mse" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": { 278 | "collapsed": true 279 | }, 280 | "outputs": [], 281 | "source": [] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.6.0" 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 2 305 | } 306 | -------------------------------------------------------------------------------- /notebooks/3.3 - Binary Classification - predicting a label (out of two).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 
6 | "source": [ 7 | "## 3.3 - Binary Classification - predicting a label (out of two)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Get the data\n", 15 | "\n", 16 | "`$ python -m nltk.downloader movie_reviews`" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "from nltk.corpus import movie_reviews as data" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "data.categories()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "data.fileids()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "data.raw('neg/cv029_19943.txt')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "len(data.fileids('pos')), len(data.fileids('neg'))" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "#### Preparing the data" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "corpus = [data.raw(fileid) for fileid in data.fileids('pos')]\n", 90 | "corpus += [data.raw(fileid) for fileid in data.fileids('neg')]\n", 91 | "\n", 92 | "target = ['pos'] * 1000 # ['pos', 'pos', ... 
x1000]\n", 93 | "target += ['neg'] * 1000" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "TF = Term Frequency\n", 101 | "\n", 102 | "IDF = Inverse Document Frequency" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 114 | "\n", 115 | "vectorizer = TfidfVectorizer(min_df=5, max_df=0.8)\n", 116 | "\n", 117 | "X = vectorizer.fit_transform(corpus)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "X.shape" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "#### First Attempt at Classification" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "collapsed": true 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "from sklearn.model_selection import train_test_split\n", 147 | "\n", 148 | "X_train, X_test, Y_train, Y_test = train_test_split(X, target, test_size=0.2, random_state=0)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "from sklearn.svm import LinearSVC\n", 160 | "\n", 161 | "classifier = LinearSVC()\n", 162 | "classifier.fit(X_train, Y_train)\n", 163 | "\n", 164 | "Y_pred = classifier.predict(X_test)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "#### Evaluation" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "from sklearn.metrics import precision_score, recall_score, f1_score\n", 183 | "from sklearn.metrics import classification_report\n", 184 | "\n", 185 | "print(\"Precision: {}\".format(precision_score(Y_test, Y_pred, average='macro')))\n", 186 | "print(\"Recall: {}\".format(recall_score(Y_test, Y_pred, average='macro')))\n", 187 | "print(\"F1-Score: {}\".format(f1_score(Y_test, Y_pred, average='macro')))" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "print(classification_report(Y_test, Y_pred, digits=4))" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "#### Cross-validation" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "from sklearn.model_selection import ShuffleSplit\n", 217 | "from sklearn.model_selection import cross_val_score\n", 218 | "\n", 219 | "cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)\n", 220 | "\n", 221 | "cross_val_score(classifier, X, target, cv=cv, scoring='f1_macro')" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "cross_val_score(classifier, X, target, cv=cv, scoring='f1_macro').mean()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 
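"collapsed": false
},
"outputs": [],
"source": [
 "# (Editor's sketch) cross_val_score returns one score per split, so\n",
 "# reporting the spread alongside the mean is more informative than\n",
 "# the mean alone.\n",
 "scores = cross_val_score(classifier, X, target, cv=cv, scoring='f1_macro')\n",
 "print(\"F1 (macro): {:.4f} +/- {:.4f}\".format(scores.mean(), scores.std()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {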
239 | "collapsed": false 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "cross_val_score(classifier, X, target, cv=cv, scoring='precision_macro').mean()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "cross_val_score(classifier, X, target, cv=cv, scoring='recall_macro').mean()" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "collapsed": false 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "from sklearn.model_selection import KFold\n", 266 | "\n", 267 | "cv = KFold(n_splits=10, shuffle=True, random_state=0)\n", 268 | "\n", 269 | "cross_val_score(classifier, X, target, cv=cv, scoring='f1_macro')" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": { 276 | "collapsed": false 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "cross_val_score(classifier, X, target, cv=cv, scoring='f1_macro').mean()" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": { 287 | "collapsed": true 288 | }, 289 | "outputs": [], 290 | "source": [] 291 | } 292 | ], 293 | "metadata": { 294 | "kernelspec": { 295 | "display_name": "Python 3", 296 | "language": "python", 297 | "name": "python3" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 3 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython3", 309 | "version": "3.6.0" 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 2 314 | } 315 | -------------------------------------------------------------------------------- /notebooks/3.4 - Multi-class Classification - predicting a label (out of many).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 3.4 - Multi-class classification - predicting a label (out of many)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Get the data" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from sklearn.datasets import load_digits\n", 26 | "\n", 27 | "digits = load_digits()" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "print(digits.DESCR)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "len(digits.images)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "digits.images[50]" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "digits.images[50].shape" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "digits.data[50]" 83 | ] 84 | }, 85 | { 86 | 
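"cell_type": "markdown",
"metadata": {},
"source": [
 "*(Editor's sketch)* `digits.data` is simply each 8x8 entry of `digits.images` flattened into a 64-element vector; a quick check:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
 "collapsed": false
},
"outputs": [],
"source": [
 "import numpy as np\n",
 "\n",
 "# the flattened image should match the corresponding data row exactly\n",
 "np.array_equal(digits.images[50].ravel(), digits.data[50])"
]
},
{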
"cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "digits.data[50].shape" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "%matplotlib inline\n", 105 | "\n", 106 | "import matplotlib.pyplot as plt\n", 107 | "\n", 108 | "plt.gray()\n", 109 | "plt.matshow(digits.images[50]) " 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "digits.target[50]" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "#### Classification" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "from sklearn.model_selection import train_test_split\n", 139 | "\n", 140 | "X_train, X_test, Y_train, Y_test = train_test_split(digits.data, digits.target, test_size=0.2, random_state=0)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "from sklearn.svm import LinearSVC\n", 152 | "\n", 153 | "classifier = LinearSVC(random_state=0) # one-vs-rest by default\n", 154 | "classifier.fit(X_train, Y_train)\n", 155 | "\n", 156 | "Y_pred = classifier.predict(X_test)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "from sklearn.metrics import classification_report\n", 168 | "\n", 169 | "print(classification_report(Y_test, Y_pred))" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "from sklearn.multiclass import OneVsOneClassifier\n", 181 | "\n", 182 | "classifier = OneVsOneClassifier(LinearSVC(random_state=0))\n", 183 | "classifier.fit(X_train, Y_train)\n", 184 | "\n", 185 | "Y_pred = classifier.predict(X_test)\n", 186 | "\n", 187 | "print(classification_report(Y_test, Y_pred))" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "#### Cross-validation" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "collapsed": false 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "from sklearn.model_selection import KFold\n", 206 | "from sklearn.model_selection import cross_val_score\n", 207 | "\n", 208 | "cv = KFold(n_splits=10, shuffle=True, random_state=0)\n", 209 | "\n", 210 | "cross_val_score(classifier, digits.data, digits.target, cv=cv, scoring='f1_macro').mean()" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "cross_val_score(classifier, digits.data, digits.target, cv=cv, scoring='f1_micro').mean()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 3", 237 | 
"language": "python", 238 | "name": "python3" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 3 244 | }, 245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython3", 250 | "version": "3.6.0" 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 2 255 | } 256 | -------------------------------------------------------------------------------- /notebooks/3.5 - Cluster Analysis - grouping similar items.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 3.5 - Cluster Analysis - grouping similar items" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Get the data" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "\n", 27 | "data = np.random.rand(100, 2)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "%matplotlib inline\n", 39 | "\n", 40 | "import matplotlib.pyplot as plt\n", 41 | "\n", 42 | "x = [item[0] for item in data]\n", 43 | "y = [item[1] for item in data]\n", 44 | "\n", 45 | "# x, y = zip(*data)\n", 46 | "\n", 47 | "plt.scatter(x, y)\n", 48 | "plt.show()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "#### Clustering" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "from sklearn.cluster import KMeans\n", 67 | "\n", 68 | "estimator = KMeans(n_clusters=4)\n", 69 | "estimator.fit(data)\n", 70 | "\n", 71 | "colours = ['r', 'g', 'b', 'y'] # red, green, blue, yellow\n", 72 | "\n", 73 | "predicted_colours = [colours[label] for label in estimator.labels_]\n", 74 | "\n", 75 | "plt.scatter(x, y, c=predicted_colours)\n", 76 | "plt.show()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [] 87 | } 88 | ], 89 | "metadata": { 90 | "kernelspec": { 91 | "display_name": "Python 3", 92 | "language": "python", 93 | "name": "python3" 94 | }, 95 | "language_info": { 96 | "codemirror_mode": { 97 | "name": "ipython", 98 | "version": 3 99 | }, 100 | "file_extension": ".py", 101 | "mimetype": "text/x-python", 102 | "name": "python", 103 | "nbconvert_exporter": "python", 104 | "pygments_lexer": "ipython3", 105 | "version": "3.6.0" 106 | } 107 | }, 108 | "nbformat": 4, 109 | "nbformat_minor": 2 110 | } 111 | -------------------------------------------------------------------------------- /notebooks/4.1 - Time Series Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 4.1 - Time Series Analysis" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Get the data\n", 15 | "\n", 16 | "Data from: https://datamarket.com/data/set/22u3/international-airline-passengers-monthly-totals-in-thousands-jan-49-dec-60 \n", 17 | "\n", 18 | "Ref: Time 
Series Analysis: Forecasting and Control, by Box and Jenkins (1970); later editions co-authored with Reinsel." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "import pandas as pd\n", 30 | "\n", 31 | "fname = 'AirPassengers.csv'\n", 32 | "\n", 33 | "data = pd.read_csv(fname)\n", 34 | "\n", 35 | "data.head()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "data.isnull().sum()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "data.dtypes" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "data['Month'] = pd.to_datetime(data['Month'])\n", 69 | "data.head()" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "data.dtypes" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "collapsed": false 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "data['Month'].dt.year.head()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "data = data.set_index('Month')\n", 103 | "data.head()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "%matplotlib inline\n", 115 | "\n", 116 | "data.plot(grid=True)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": false 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "from datetime import datetime\n", 128 | "\n", 129 | "start_date = datetime(1959, 1, 1)\n", 130 | "end_date = datetime(1960, 12, 1)\n", 131 | "data[(start_date <= data.index) & (data.index <= end_date)].plot(grid=True)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": { 137 | "collapsed": true 138 | }, 139 | "source": [ 140 | "#### Time Series Decomposition\n", 141 | "\n", 142 | "Additive model\n", 143 | "\n", 144 | "Y(t) = Trend(t) + Seasonality(t) + Residual(t)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": false 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "import statsmodels.api as sm\n", 156 | "\n", 157 | "decomposition = sm.tsa.seasonal_decompose(data, model='additive')\n", 158 | "fig = decomposition.plot()\n", 159 | "\n", 160 | "# outside a notebook, call plt.show() after decomposition.plot() to display the figure" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "import matplotlib\n", 172 | "\n", 173 | "matplotlib.rcParams['figure.figsize'] = [12.0, 8.0] # double up default plot size" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [], 183 | "source": [ 184 | "import matplotlib.pyplot as plt\n", 185 | "import matplotlib.dates as mdates\n",
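"# (Editor's sketch) before plotting, a quick check of the additive model\n",
"# stated above: trend + seasonality + residual should reconstruct the\n",
"# observed series (the edges are NaN because the trend comes from a\n",
"# centred moving average, hence nanmax).\n",
"import numpy as np\n",
"recon = (decomposition.trend.values + decomposition.seasonal.values\n",
"         + decomposition.resid.values).ravel()\n",
"print(np.nanmax(np.abs(recon - data.values.ravel())))  # ~0, up to float noise\n",
"\n",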
187 | "fig, ax = plt.subplots()\n", 188 | "ax.grid(True)\n", 189 | "\n", 190 | "year = mdates.YearLocator(month=1)\n", 191 | "month = mdates.MonthLocator(interval=3)\n", 192 | "year_format = mdates.DateFormatter(\"%Y\")\n", 193 | "month_format = mdates.DateFormatter(\"%m\")\n", 194 | "\n", 195 | "ax.xaxis.set_minor_locator(month)\n", 196 | "\n", 197 | "ax.xaxis.grid(True, which='minor')\n", 198 | "ax.xaxis.set_major_locator(year)\n", 199 | "ax.xaxis.set_major_formatter(year_format)\n", 200 | "\n", 201 | "plt.plot(data.index, data['AirPassengers'], c='blue')\n", 202 | "plt.plot(decomposition.trend.index, decomposition.trend, c='red')" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.6.0" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 2 236 | } 237 | -------------------------------------------------------------------------------- /notebooks/4.2 - Building a Movie Recommendation System.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 4.2 - Building a Movie Recommendation System" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Get the data" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "ratings_data = \"../../../data/ml-100k/u.data\"\n", 26 | "movies_data = \"../../../data/ml-100k/u.item\"" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from collections import defaultdict\n", 38 | "\n", 39 | "user_ratings = defaultdict(dict)\n", 40 | "movie_ratings = defaultdict(dict)\n", 41 | "\n", 42 | "with open(ratings_data, 'r') as f:\n", 43 | " for line in f:\n", 44 | " user, movie, stars, _ = line.split('\\t')\n", 45 | " user_ratings[user][movie] = float(stars)\n", 46 | " movie_ratings[movie][user] = float(stars)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "len(user_ratings)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "len(movie_ratings)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "user_ratings[\"1\"] # userID = 1" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": false 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "movies = {}\n", 91 | "with open(movies_data, 'r', encoding=\"latin-1\") as f:\n", 92 | " for line in f:\n", 93 | " movie_id, title, *_ = 
line.split('|')\n", 94 | " movies[movie_id] = title\n", 95 | " \n", 96 | "len(movies)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "movies[\"127\"], movies[\"187\"], movies[\"29\"] # movie ID = 127, 187, 29" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "movie_ratings[\"127\"]" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "sum(movie_ratings[\"127\"].values()) / len(movie_ratings[\"127\"])" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "import pandas as pd\n", 141 | "import numpy as np\n", 142 | "\n", 143 | "ratings = pd.read_csv(ratings_data, sep='\\t', names=['user', 'movie', 'rating', 'timestamp'])\n", 144 | "\n", 145 | "ratings.head()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "ratings.shape" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "n_movies = ratings[\"movie\"].unique().shape\n", 168 | "n_movies" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "collapsed": false 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "n_users = ratings[\"user\"].unique().shape\n", 180 | "n_users" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": { 187 | "collapsed": false 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "data_matrix = np.zeros((ratings.user.max(), ratings.movie.max()))" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "for item in ratings.itertuples():\n", 203 | " data_matrix[item.user-1, item.movie-1] = item.rating" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "data_matrix" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "collapsed": false 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "data_matrix.shape" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "#### Distance / Similarity\n", 233 | "\n", 234 | "https://en.wikipedia.org/wiki/Euclidean_distance\n", 235 | "\n", 236 | "$\\mbox{euclidean}(x, y) = \\big{|}\\big{|} x - y \\big{|}\\big{|}_{2} = \\sqrt{\\sum_{i=0}^{n} (x_{i} - y_{i})^{2}}$\n", 237 | "\n", 238 | "https://en.wikipedia.org/wiki/Cosine_similarity\n", 239 | "\n", 240 | "$\\mbox{cosine}(x, y) = 1 - \\frac{x \\cdot y}{|| x ||_{2} || y ||_{2}}$, i.e. 
one minus cosine similarity" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "from scipy.spatial.distance import cosine\n", 252 | "\n", 253 | "cosine(data_matrix[:, 126], data_matrix[:, 186]) # Godfather vs Godfather II" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "collapsed": false 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "cosine(data_matrix[:, 126], data_matrix[:, 28]) # Godfather vs Batman Forever" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": { 271 | "collapsed": false 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "cosine(data_matrix[0, :], data_matrix[2, :]) # user 1 vs user 3" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": { 282 | "collapsed": false 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "cosine(data_matrix[0, :], data_matrix[915, :]) # user 1 vs user 916" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": { 293 | "collapsed": false 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "from sklearn.model_selection import train_test_split\n", 298 | "\n", 299 | "train_data, test_data = train_test_split(data_matrix, test_size=0.2)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": { 306 | "collapsed": false 307 | }, 308 | "outputs": [], 309 | "source": [ 310 | "train_data.shape, test_data.shape" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": { 317 | "collapsed": true 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "from sklearn.metrics.pairwise import pairwise_distances\n", 322 | "\n", 323 | "user_distance = pairwise_distances(train_data, metric='cosine')\n", 324 | "item_distance = pairwise_distances(train_data.T, metric='cosine')" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": { 331 | "collapsed": false 332 | }, 333 | "outputs": [], 334 | "source": [ 335 | "user_distance" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "user_similarity = 1 - user_distance\n", 347 | "item_similarity = 1 - item_distance" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": { 354 | "collapsed": false 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "user_similarity.shape, item_similarity.shape" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "collapsed": false 366 | }, 367 | "outputs": [], 368 | "source": [ 369 | "train_data.shape" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "#### Prediction\n", 377 | "\n", 378 | "$r_{u,i}$ = rating user u gave to item i\n", 379 | "\n", 380 | "$\\hat{r}_{u,i}$ = rating prediction for user u and item i\n", 381 | "\n", 382 | "$\\mbox{sim}(u, v)$ = similarity between user u and user v\n", 383 | "\n", 384 | "$\\hat{r}_{u,i} = \\frac{\\sum_{v} \\mbox{sim}(u, v)r_{v,i}}{\\sum_{v} \\big{|}\\mbox{sim}(u, v)\\big{|}}$" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": { 391 | 
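"collapsed": false
},
"outputs": [],
"source": [
 "# (Editor's sketch) the weighted-average formula above on a toy case:\n",
 "# two neighbours with similarities 0.9 and 0.1 who rated the item 5 and 1.\n",
 "import numpy as np\n",
 "\n",
 "sim = np.array([0.9, 0.1])\n",
 "r = np.array([5.0, 1.0])\n",
 "\n",
 "(sim * r).sum() / np.abs(sim).sum()  # 4.6, pulled towards the similar neighbour"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {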
"collapsed": false 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "def make_user_prediction(data, u_similarity):\n", 396 | " return u_similarity.dot(data) / np.array([np.abs(u_similarity).sum(axis=1)]).T\n", 397 | "\n", 398 | "def make_item_prediction(data, i_similarity):\n", 399 | " return data.dot(i_similarity) / np.array([np.abs(i_similarity).sum(axis=1)])\n", 400 | "\n", 401 | "user_pred = make_user_prediction(train_data, user_similarity)\n", 402 | "item_pred = make_item_prediction(train_data, item_similarity)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": { 409 | "collapsed": false 410 | }, 411 | "outputs": [], 412 | "source": [ 413 | "user_pred.shape" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": { 420 | "collapsed": false 421 | }, 422 | "outputs": [], 423 | "source": [ 424 | "item_pred.shape" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": { 431 | "collapsed": false 432 | }, 433 | "outputs": [], 434 | "source": [ 435 | "from sklearn.metrics import mean_squared_error\n", 436 | "\n", 437 | "def matrix_mse(prediction, actual):\n", 438 | " prediction = prediction[actual.nonzero()].flatten() # ignore zero terms\n", 439 | " actual = actual[actual.nonzero()].flatten()\n", 440 | " return mean_squared_error(prediction, actual)\n", 441 | "\n", 442 | "matrix_mse(user_pred, train_data)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": { 449 | "collapsed": false 450 | }, 451 | "outputs": [], 452 | "source": [ 453 | "matrix_mse(item_pred, train_data)" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": { 460 | "collapsed": true 461 | }, 462 | "outputs": [], 463 | "source": [] 464 | } 465 | ], 466 | "metadata": { 467 | "kernelspec": { 468 | "display_name": "Python 3", 469 | "language": "python", 470 | "name": "python3" 471 | }, 472 | "language_info": { 473 | "codemirror_mode": { 474 | "name": "ipython", 475 | "version": 3 476 | }, 477 | "file_extension": ".py", 478 | "mimetype": "text/x-python", 479 | "name": "python", 480 | "nbconvert_exporter": "python", 481 | "pygments_lexer": "ipython3", 482 | "version": "3.6.0" 483 | } 484 | }, 485 | "nbformat": 4, 486 | "nbformat_minor": 2 487 | } 488 | -------------------------------------------------------------------------------- /notebooks/AirPassengers.csv: -------------------------------------------------------------------------------- 1 | Month,AirPassengers 2 | 1949-01,112 3 | 1949-02,118 4 | 1949-03,132 5 | 1949-04,129 6 | 1949-05,121 7 | 1949-06,135 8 | 1949-07,148 9 | 1949-08,148 10 | 1949-09,136 11 | 1949-10,119 12 | 1949-11,104 13 | 1949-12,118 14 | 1950-01,115 15 | 1950-02,126 16 | 1950-03,141 17 | 1950-04,135 18 | 1950-05,125 19 | 1950-06,149 20 | 1950-07,170 21 | 1950-08,170 22 | 1950-09,158 23 | 1950-10,133 24 | 1950-11,114 25 | 1950-12,140 26 | 1951-01,145 27 | 1951-02,150 28 | 1951-03,178 29 | 1951-04,163 30 | 1951-05,172 31 | 1951-06,178 32 | 1951-07,199 33 | 1951-08,199 34 | 1951-09,184 35 | 1951-10,162 36 | 1951-11,146 37 | 1951-12,166 38 | 1952-01,171 39 | 1952-02,180 40 | 1952-03,193 41 | 1952-04,181 42 | 1952-05,183 43 | 1952-06,218 44 | 1952-07,230 45 | 1952-08,242 46 | 1952-09,209 47 | 1952-10,191 48 | 1952-11,172 49 | 1952-12,194 50 | 1953-01,196 51 | 1953-02,196 52 | 1953-03,236 53 | 1953-04,235 54 | 1953-05,229 55 | 1953-06,243 56 | 1953-07,264 57 | 1953-08,272 
58 | 1953-09,237 59 | 1953-10,211 60 | 1953-11,180 61 | 1953-12,201 62 | 1954-01,204 63 | 1954-02,188 64 | 1954-03,235 65 | 1954-04,227 66 | 1954-05,234 67 | 1954-06,264 68 | 1954-07,302 69 | 1954-08,293 70 | 1954-09,259 71 | 1954-10,229 72 | 1954-11,203 73 | 1954-12,229 74 | 1955-01,242 75 | 1955-02,233 76 | 1955-03,267 77 | 1955-04,269 78 | 1955-05,270 79 | 1955-06,315 80 | 1955-07,364 81 | 1955-08,347 82 | 1955-09,312 83 | 1955-10,274 84 | 1955-11,237 85 | 1955-12,278 86 | 1956-01,284 87 | 1956-02,277 88 | 1956-03,317 89 | 1956-04,313 90 | 1956-05,318 91 | 1956-06,374 92 | 1956-07,413 93 | 1956-08,405 94 | 1956-09,355 95 | 1956-10,306 96 | 1956-11,271 97 | 1956-12,306 98 | 1957-01,315 99 | 1957-02,301 100 | 1957-03,356 101 | 1957-04,348 102 | 1957-05,355 103 | 1957-06,422 104 | 1957-07,465 105 | 1957-08,467 106 | 1957-09,404 107 | 1957-10,347 108 | 1957-11,305 109 | 1957-12,336 110 | 1958-01,340 111 | 1958-02,318 112 | 1958-03,362 113 | 1958-04,348 114 | 1958-05,363 115 | 1958-06,435 116 | 1958-07,491 117 | 1958-08,505 118 | 1958-09,404 119 | 1958-10,359 120 | 1958-11,310 121 | 1958-12,337 122 | 1959-01,360 123 | 1959-02,342 124 | 1959-03,406 125 | 1959-04,396 126 | 1959-05,420 127 | 1959-06,472 128 | 1959-07,548 129 | 1959-08,559 130 | 1959-09,463 131 | 1959-10,407 132 | 1959-11,362 133 | 1959-12,405 134 | 1960-01,417 135 | 1960-02,391 136 | 1960-03,419 137 | 1960-04,461 138 | 1960-05,472 139 | 1960-06,535 140 | 1960-07,622 141 | 1960-08,606 142 | 1960-09,508 143 | 1960-10,461 144 | 1960-11,390 145 | 1960-12,432 146 | -------------------------------------------------------------------------------- /notebooks/data.csv: -------------------------------------------------------------------------------- 1 | "NAME","AGE","LANGUAGE" 2 | "Alice",30,"English" 3 | "Bob",25,"Spanish" 4 | "Charlie",35,"French" 5 | -------------------------------------------------------------------------------- /notebooks/data_no_header.csv: -------------------------------------------------------------------------------- 1 | "Alice",30,"English" 2 | "Bob",25,"Spanish" 3 | "Charlie",35,"French" 4 | -------------------------------------------------------------------------------- /notebooks/movie.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Fight Club", 3 | "watched": true, 4 | "year": 1999, 5 | "actors": [ 6 | "Brad Pitt", 7 | "Edward Norton", 8 | "Helena Bonham Carter" 9 | ] 10 | } -------------------------------------------------------------------------------- /notebooks/movies-90s.jsonl: -------------------------------------------------------------------------------- 1 | {"title": "Fight Club", "year": 1999, "actors": ["Brad Pitt", "Edward Norton", "Helena Bonham Carter"], "watched": true} 2 | {"title": "Goodfellas", "year": 1990, "actors": ["Robert De Niro", "Ray Liotta", "Joe Pesci"], "watched": true} 3 | {"title": "Forrest Gump", "year": 1994, "actors": ["Tom Hanks", "Robin Wright"], "watched": true} 4 | 5 | -------------------------------------------------------------------------------- /notebooks/some_file.txt: -------------------------------------------------------------------------------- 1 | This is some file 2 | It has a few line 3 | This is the last line 4 | --------------------------------------------------------------------------------