├── .gitignore ├── Chapter01 ├── .keep └── Pandas.ipynb ├── Chapter02 ├── .keep ├── README.md ├── classification.ipynb └── regression.ipynb ├── Chapter03 ├── .keep ├── Linear Classifiers.ipynb ├── Linear Regression.ipynb └── README.md ├── Chapter04 ├── .keep ├── README.md ├── category encoding.ipynb ├── imputation.ipynb └── scaling et al.ipynb ├── Chapter05 ├── .keep ├── Digits Embedding.ipynb ├── KNN Digit.ipynb └── README.md ├── Chapter06 ├── .keep ├── README.md ├── Text Classification.ipynb └── Tokenization.ipynb ├── Chapter07 ├── .keep ├── Activation Functions.ipynb ├── Convolutions.ipynb ├── Neural Networks.ipynb └── README.md ├── Chapter08 ├── .keep ├── Car Prices.ipynb └── Hastie Classifier.ipynb ├── Chapter09 ├── .keep ├── Multi Target & Calibration.ipynb └── Target Scaling.ipynb ├── Chapter10 ├── .keep └── Imbalanced Data.ipynb ├── Chapter11 ├── .keep └── Clustering.ipynb ├── Chapter12 ├── .keep └── Outliers.ipynb ├── Chapter13 ├── .keep ├── README.md ├── RecSys.ipynb ├── artist_recommender.pkl └── recsys.pkl ├── README.md └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .DS_Store 107 | **/.DS_Store 108 | 109 | # Data sets 110 | chapters/ch06/data/** 111 | -------------------------------------------------------------------------------- /Chapter01/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter01/.keep -------------------------------------------------------------------------------- /Chapter01/Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas\n", 8 | "\n", 9 | "Pandas is an open source library providing data analysis tools for the Python programming 
language. If this definition doesn't say much, you can think of Pandas for now as Python's answer to spreadsheets. That said, it should now be clear why I decided to dedicate this section to Pandas: you will be using it to create or load the data you are going to use with scikit-learn. You will also use it to analyse and visualize the data and to alter the values of columns before applying your machine learning algorithms to them." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 9, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# It's common to import pandas as pd\n", 19 | "\n", 20 | "import pandas as pd" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 10, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# This is how to create a dataframe \n", 30 | "\n", 31 | "polygons_data_frame = pd.DataFrame(\n", 32 | " {\n", 33 | " 'Name': ['Triangle', 'Quadrilateral', 'Pentagon', 'Hexagon'],\n", 34 | " 'Sides': [3, 4, 5, 6],\n", 35 | " }\n", 36 | ")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 11, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/html": [ 47 | "
\n", 48 | "\n", 61 | "\n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | "
NameSides
0Triangle3
1Quadrilateral4
2Pentagon5
\n", 87 | "
" 88 | ], 89 | "text/plain": [ 90 | " Name Sides\n", 91 | "0 Triangle 3\n", 92 | "1 Quadrilateral 4\n", 93 | "2 Pentagon 5" 94 | ] 95 | }, 96 | "execution_count": 11, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "# Showing first 3 rows of the dataframe \n", 103 | "\n", 104 | "polygons_data_frame.head(3)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 12, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# Creating another dataframe\n", 114 | "\n", 115 | "polygons = {\n", 116 | " 'Name': [\n", 117 | " 'Triangle', 'Quadrilateral', 'Pentagon', 'Hexagon', 'Heptagon', 'Octagon', \n", 118 | " 'Nonagon', 'Decagon', 'Hendecagon', 'Dodecagon', 'Tridecagon', 'Tetradecagon'\n", 119 | " ],\n", 120 | " 'Sides': range(3, 15, 1),\n", 121 | "}\n", 122 | "\n", 123 | "polygons_data_frame = pd.DataFrame(polygons)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 13, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/html": [ 134 | "
\n", 135 | "\n", 148 | "\n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | "
NameSides
0Triangle3
1Quadrilateral4
2Pentagon5
3Hexagon6
4Heptagon7
5Octagon8
6Nonagon9
7Decagon10
8Hendecagon11
9Dodecagon12
10Tridecagon13
11Tetradecagon14
\n", 219 | "
" 220 | ], 221 | "text/plain": [ 222 | " Name Sides\n", 223 | "0 Triangle 3\n", 224 | "1 Quadrilateral 4\n", 225 | "2 Pentagon 5\n", 226 | "3 Hexagon 6\n", 227 | "4 Heptagon 7\n", 228 | "5 Octagon 8\n", 229 | "6 Nonagon 9\n", 230 | "7 Decagon 10\n", 231 | "8 Hendecagon 11\n", 232 | "9 Dodecagon 12\n", 233 | "10 Tridecagon 13\n", 234 | "11 Tetradecagon 14" 235 | ] 236 | }, 237 | "execution_count": 13, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "polygons_data_frame" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 14, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "data": { 253 | "text/html": [ 254 | "
\n", 255 | "\n", 268 | "\n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | "
NameSides
7Decagon10
9Dodecagon12
8Hendecagon11
4Heptagon7
3Hexagon6
\n", 304 | "
" 305 | ], 306 | "text/plain": [ 307 | " Name Sides\n", 308 | "7 Decagon 10\n", 309 | "9 Dodecagon 12\n", 310 | "8 Hendecagon 11\n", 311 | "4 Heptagon 7\n", 312 | "3 Hexagon 6" 313 | ] 314 | }, 315 | "execution_count": 14, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "polygons_data_frame.sort_values('Name').head(5)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 15, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "polygons_data_frame['Length of Name'] = polygons_data_frame['Name'].apply(len) " 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 16, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "# polygons_data_frame['Length of Name'] = polygons_data_frame['Name'].apply(lambda n: len(n)**2)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 17, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/html": [ 350 | "
\n", 351 | "\n", 364 | "\n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | "
NameSidesLength of Name
3Hexagon67
5Octagon87
6Nonagon97
7Decagon107
0Triangle38
2Pentagon58
4Heptagon78
9Dodecagon129
8Hendecagon1110
10Tridecagon1310
11Tetradecagon1412
1Quadrilateral413
\n", 448 | "
" 449 | ], 450 | "text/plain": [ 451 | " Name Sides Length of Name\n", 452 | "3 Hexagon 6 7\n", 453 | "5 Octagon 8 7\n", 454 | "6 Nonagon 9 7\n", 455 | "7 Decagon 10 7\n", 456 | "0 Triangle 3 8\n", 457 | "2 Pentagon 5 8\n", 458 | "4 Heptagon 7 8\n", 459 | "9 Dodecagon 12 9\n", 460 | "8 Hendecagon 11 10\n", 461 | "10 Tridecagon 13 10\n", 462 | "11 Tetradecagon 14 12\n", 463 | "1 Quadrilateral 4 13" 464 | ] 465 | }, 466 | "execution_count": 17, 467 | "metadata": {}, 468 | "output_type": "execute_result" 469 | } 470 | ], 471 | "source": [ 472 | "polygons_data_frame.sort_values('Length of Name')" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "# Comparing apply's performance to vectorization " 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 18, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "# s = pd.Series(['hello world' for i in range(100000)])" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 19, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "# %timeit -n100 s.str.len()" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 20, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "# %timeit -n100 s.apply(lambda x: len(x))" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "# Scatter Plots" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 21, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [ 522 | "import matplotlib.pyplot as plt" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 22, 528 | "metadata": {}, 529 | "outputs": [ 530 | { 531 | "data": { 532 | "text/plain": [ 533 | "" 534 | ] 535 | }, 536 | "execution_count": 22, 537 | "metadata": {}, 538 | "output_type": "execute_result" 539 | }, 540 | { 541 | "data": { 542 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAacAAAEXCAYAAAAJJYvtAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAelUlEQVR4nO3de5gdVZnv8e8vN5pL6KCJGEwIMLYooqBCxGsyXjA6OaKZ0QOiEmWCjGcAPahHvACjRw4MXmB0BscIJAgTVASFcNDwgCYqSCaDgEEIGyHQwQCN0J1waTok7/xR1clmpy/V3bt2Vff+fZ5nP71rVe1a79p9efdatbqWIgIzM7MyGVd0AGZmZrWcnMzMrHScnMzMrHScnMzMrHScnMzMrHScnMzMrHScnKzuJJ0p6d5BjpkrKSTNaFRczULSeklfyunccyWtlbRF0q/yqMMMnJxsiCTtKumrkiqSnpH0uKT/lHRy1WFfB44oKsY8SVoo6bmi4wCQ9CVJ6xtc7QXArcABwIK+Dkg/nISkH/Wx7zlJC/MN0caCCUUHYKPOBcBfA6cAtwN7Aq8B9u09ICKeBJ4sJDrLWxtwVkS0D3JcN/B3ko6IiN81IC4bY9xzsqF6H3BuRPw0Iu6PiNsjYklEfKX3gL6G9SSdJGmDpKcl/YKqZFZ1zOskrZD0pKQOSVdKmlW1f4akn0h6TFK3pPskfbavICWNk/SgpC/UlO8i6QlJf59uv1nSbyVtTh+3S3rXSN4gSUdLui2Ncb2kb0ravWr/ryR9X9KXJT2c9j4vkbRHTfxnpe/Dk5Iul/Sp3l5b2vv4KjAr7aWEpDOrwpgk6fz03I9I+pakAT+MSjpQ0rVpfU9KukbSS9N9cyUFMB64JK1v4QCnewj4CUkveqA6T0nfqyfT9+JySdOr9vcO/75H0s1pb/2/JL0yffwm/ZlaLemgmnMP+PNk5ebkZEO1EZgn6QVZXyDpKOBbwDeBQ4EfAefWHHMQsBK4GTgMeBuwFbheUkt62L8BrcA7gJcDxwMb+qozIrYBlwIfqdl1FNAC/Dj9Y301cAvw2vRxJvB01rb10daFJL3LbwAHAR9N4/1uzaF/B7wAmAscDcwH/k/V/k8BJwP/m6Rnuho4vWr/D4FzSNo/PX1UJ4KTSL5Xr0+f/yNw3ABx7wqsIHlv5qSPPYCfS5oE3JTWQXqu6WkMA/k8cLikvx3kuM8ArwLeT/Kh5fI+jvka8EXgdUAPsIzkfT6jquziqvZk+XmyMosIP/zI/ADeBDxA8ot+B/A9kt6Uqo45E7i3avs3wGU15/k6EMCMdHsJcHnNMbuQJIr3pdu3A2cOIdaXp3UcXlW2HFiWPt8r3T93COdcCDw3wP71wIk1ZW9N69kr3f4VcHvNMRcAN1dtPwR8teaYy6vrBr4ErO8nhqtryq7rbXc/cR+fvtdTq8r2Bp4BPlpVFsCHB3mPtn//ST6UVICJ6fZzwMIBXvuatI6XpNtz0+33VR3zgbTsb6vK3p+W7ZH158mPcj/cc7IhiYjfAn8FvAVYSvIH7Argaknq52UHkXzyrvabmu3DgfdXDSk9CfyF5JN8W3rMecAXJN0i6RxJbx0k1rtJehwfAZD0IuBdwCXp/ieA7wO/kHSdpM9LOnDgd6B/kqYBs4Bv1rTjuvSQl1YdfnvNy/9M8l4iqRXYB6i9VnPzEMK5rb/z9+OVwB8j4rHegoh4BFiX7huurwJTgX/oa2c6bPcLSe2SNrPj56J2+K36/Xo4/XpHH2UvSr9m+XmyEnNysiGLiOci4qaI+EZEHEXSm5hP0kMYrnHAD0iG/aofLyNJIETExSR/tL5LMqx0naRLBznvJcDRkiYCHwIeIxm+6m3LIpJhoetJhrLWSvrECNoAyWSR6jYcQvIH8Q9Vx/bUvDbY+fdxJEsGZDl/7iLicZIhudPTpLudpH2B/0/S0zuaZPjtvenuSTWn2lJ92gHKxlV9HfD
nycrNycnq4a7064v62f9H4I01ZW+q2V4DvBr4U0TcW/N4ovegiNgYERdHxEdJhqKOlbTnALEtI7lONY/k+s9lEbG1+oCIWBsR34yIdwMXAicMcL5+pT2NduDAPtpwb0R0ZzxPF0lP5w01u2qn5/eQTFCohzuBgyRN7S2QtDdwILB2hOf+NrCZ5JpRtcOBXYFPRcRvI2IdA/fuhiLTz5OVl6eS25BIWknyB38N0EEyVHUW0An8sp+XfYNkAsJqkk/Kb2bniQpnkQzBXSrp/PTc+5Fczzo/Iu6T9J309etIhmcWkCSDzf3FGxGPS7oW+ArJJ+ftkwLSmWiLgGvS8+xDMlx5a4b34dA+iteS/AG+UNITwM9IPt2/Anh3RAylR/YN4J8k9Q5N/g1wJM/vTd0PvFjSG0iu6zwdEcOdzPEfJBMufqhkBqRIrgs+xOATHwYUEc+msyYv5vkfiCsk7TlV0mUkPczT+zjFcAz681Sneiwn7jnZUF0HHMuOJHExyR+ZN1Vfr6gWEVcBpwKfI7lOcCzPn5lGRNxF0rvaA/gFSW9rMckn6870MJFcd1oLrAJ2J/mjP9jw11KSxHRbRFQPrT1FMtx2OXAPydTnm0hmow1kPPD7Ph5TI+IHwAdJhjlXA/9JMkHgoUHOWes84DvA+em5jyBJWNW9r58CPwauJfnj+7kh1rFdRDxDkvyeJXlvV5K8P/MionaIcDguJ7lutP26ZETcQTKT8BMk3+/PkMxSHLGMP09WYhr899rMykDSRcAhEfG6omMxy5uH9cxKSNI+JNOjf0kybf9/kFwzG6xXZzYmuOdkVkLpZIQfklzUbwHuBb4dEYsLDcysQZyczMysdDwhwszMSqfU15y6urrcrTMzG+NaW1t3uruMe05mZlY6Tk5mZlY6Tk4NUKlUig6hEG53c3G7m0ve7XZyMjOz0nFyMjOz0nFyMjOz0mlIcpJ0kaRHJa2tKvuqpDsk3SZpRXq7FjMzs4b1nJaQrKdT7dyIeHVEHEqydHa9bpVvZmajXEOSU0SsAh6vKdtUtbk7I1v1s5TWb9rCkcs7WLCmhSOXd/DA5i2Dv8jMzIq95iTpa5LaSdb3GXM9pxNWdbK6o4f27nGs7uhh0UovI2NmlkXDbvwqaT9geUQc3Me+04CWiDijurz69kWj8X8JFqxpob17R/6f2bKNKw/LtFK3mdmY1tbWtv15X7cvKsu99S4jWVn1jP4OqG7IaDF9XQft3TsWEZ0+uYW2tpkFRtRYlUplVH7fRsrtbi5udz4KG9aTVN2qo4C7i4olL4vnTGH2tEnMbNnG7GmTWDxnStEhmZmNCg3pOUlaBswFpkraQNJDeo+kA4FtwAPAiY2IpZFmTZ7IivnT0k8YzdNjMjMbqYYkp4g4po/iCxtRt5mZjT6+Q4SZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZVOQ5KTpIskPSppbVXZuZLulnSHpKskTWlELGZmVn6N6jktAebVlF0PHBwRrwbuAU5rUCxmZlZyDUlOEbEKeLymbEVEPJdu/g6Y0YhYzMxs+NZv2sKRyztYsKaFI5d38MDmLbnUU5ZrTh8Hris6CDMzG9gJqzpZ3dFDe/c4Vnf0sGhlZy71KCJyOfFOFUn7Acsj4uCa8i8ChwELoiaYrq6u7duVSqUBUZqZ2UAWrGmhvXtHv2ZmyzauPKx7yOdpa2vb/ry1tVW1+ycMM766kLQQmA+8vTYx1apuyGhTqVRGdfzD5XY3F7e7OUxf10F7d8+O7ckttLXNrHs9hQ3rSZoHfA54b0Q8XVQcZmaW3eI5U5g9bRIzW7Yxe9okFs/JZ6J1Q3pOkpYBc4G
pkjYAZ5DMztsFuF4SwO8i4sRGxGNmZsMza/JEVsyflvYY699j6tWQ5BQRx/RRfGEj6jYzs9GnLLP1zMzMtnNyMjOz0nFyMjOz0nFyMjOz0nFyMjOz0nFyMjOz0nFyMjOz0nFyMjOz0nFyMjOz0nFyMjOz0nFyMjOz0nFyMjOz0nFyMjOz0nFyMjOz0smcnCS9U9KFkq5Jtw+T9Lb8QjMzs2aVKTlJOgm4AKgAb02LnwH+b05xmZlZE8vac/oU8I6IOBvYlpbdDRyYS1RmZtbUsianyUB7+jzSrxOBnrpHZGZmTS9rcloFfL6m7GTgl/UNx8zMDCZkPO4k4BpJi4DJktYBm4H5uUVmZmZNK1NyioiNkg4HZgP7kgzxrY6IbQO/0szMbOiy9pyIiABuSR9mZma5yTqV/BBJN0p6XFJP+tgiyRMizMys7rL2nJYBPyGZBPFMfuGYmZlln633YuD0iFgbEX+qfmR5saSLJD0qaW1V2Qck3Slpm6TDhhO8mZmNTVmT01LgQyOoZwkwr6ZsLbCAZJq6mZnZdlmH9c4Gbpb0BeCR6h0RMej99SJilaT9asruApCUMQQzM2sWWZPTFcD9wFX4mpOZmeVMyQzxQQ6SNgMvjIhhz85Le07LI+LgmvJfAZ+JiDW1r+nq6toeXKVSGW7VZmZWMm1tbduft7a27jSElrXn9GvgIOC2+oQ1dNUNGW0qlcqojn+43O7m4nY3l7zbnTU53Q+skHQVO19zOr3uUZmZWVPLmpx2A64FJgEzh1qJpGXAXGCqpA3AGcDjwLeBacC1km6LiHcN9dxmZjb2ZL233sdGUklEHNPPrqtGcl4zMxubMt9bD0DSZGAqsP3iVUTcV++gzMysuWVKTpIOAi4DDiFZbFDsWHRwfD6hmZlZs8p6h4h/I1lY8AXAJmAv4N+B43KKy8zMmljWYb1DgHdGxBZJioguSZ8luQXRpfmFZ2ZmzShrz6kbmJg+f0zSvulrX5hLVGZm1tSyJqdfAx9Mn18BXAesBG7MIygzM2tuWaeSf7Bq8wvAncAewCV5BGVmZs1tSFPJASJiG/CDHGIxMzMDBklOki4a5PUREcfXMR4zM7NBe04P9VO+G8k08r0AJyczM6urAZNTRHy5elvSBOBEkutOtwJfyi80MzNrVlnvEDEOWAh8GdgAHBMRK3OMy8zMmtigyUnS0cBXgC7gkxFxXe5RmZlZUxtsQsTtwD7APwNXAiHpgOpjfONXMzOrt8F6Tq9Kv54DnE3V3chTgW/8amZmdTbYhIisd5AwMzOrGycfMzMrHScnMzMrHScnMzMrnX6Tk6RDGhmImZlZr4F6Tr/ufSKp0oBYzMzMgIFn63VKmg/8EZguaX92nkru/3MyM7O6Gyg5nQKcB8wi6WH9qY9j/H9OZmZWd/0O60XEVRHx0oiYCDwdEeP6eDgxmZlZ3WWdrfdCSG4AK2l6eiPYzCRdJOlRSWuryl4g6XpJlfTrXkM5p5mZjV1Zk8wuki4BuknWeHpG0lJJrRlfvwSYV1P2eeCGiGgDbki3zczMMienbwO7AwcDu5Lcc2834F+yvDgiVgGP1xQfBSxNny8F3pcxFjMzANZv2sKRyzt47U8e5sjlHTyweUvRITVEM7Q7a3KaB3wkIu6JiGcj4h7gY+zcGxqKvSNiY/r8YWDvEZzLzJrQCas6Wd3Rw32btrK6o4dFKzuLDqkhmqHdmRYbJBnOmwY8UFU2FXi2HkFEREiKgY6pVEb3v1qN9viHy+1uLo1u98bNLVR/xt64ubuQ997tHrq2trYB92dNTt8Hrpf0TZIENQv4NPC9YUcGj0iaHhEbJU0HHh3o4MEaUmaVSmVUxz9cbndzKaLd09d10N7ds2N7cgttbTMbGoPbnY+syelrwJ+BD5EsPvhnkgUILxpB3VcDx5GsE3Uc8LMRnMvMmtDiOVNYtLKTx57dytRdxrN4zpSiQ2qIZmh3puQUEUGSiIaVjCQtA+YCUyVtAM4
gSUo/knQ8SW/sg8M5t5k1r1mTJ7Ji/rSiw2i4Zmh31p7TiETEMf3sensj6jczs9HFS2aYmVnpODmZmVnpODmZmVnpZLrmJGkSsBA4FNijel9EfLT+YZmZWTPLOiFiKXAIcA3wSH7hmJmZZU9O84D9I2Ls3SPDzMxKJ+s1pweBXfIMxMzMrFe/PSdJb6vavAT4maTzqRnWi4gbc4rNzMya1EDDehf2UXZWzXYAB9QvHDMzswGSU0Ts38hAzMzMemW65iSpz5uySrqyvuGYmZllnxDx1/2Uz61THGZmZtsNOJVc0lfSp5Oqnvc6gOcvPmhmZlYXg/2fU+/qVeOqnkMyEaIdODOHmMzMrMkNmJwi4mMAkm6KiMWNCcnMzJpd1jtE3CCprynjzwIbI2JbHWMyM7MmlzU53UsylAegqucA2yRdDXwyInzfPTMzG7Gss/UWAf8BvAxoAQ4EfgB8EngVSZL71zwCNDOz5pO15/RPwEsjojvdvlfSJ4F7IuLfJS0EKnkEaGZmzSdrz2kcsF9N2b7A+PT5U2RPdGZmZgPKmlDOA26UdDHJFPIZwMfScoD3ADfXPzwzM2tGmZJTRPyzpDuADwCvBTYCx0fEz9P9PwV+mluUZmbWVDIPxaWJ6Oc5xmJmZgZkTE6SJgELgUOBPar3RcRH6x+WmZk1s6w9p6XAIcA11Cw2OFKSTiGZqi5gcUScN8hLzMxsjMuanOYB+0dEZz0rl3QwSWKaDfQAP5e0PCLurWc9Zmb1tn7TFk5Y1cnGzS1MX9fB4jlTmDV5YtFhjRlZp5I/COySQ/2vAG6JiKcj4jlgJbAgh3rMzOrqhFWdrO7oob17HKs7eli0sq6f3ZueImLwg6RTSWbqnU/NsF5E3DjsyqVXAD8D3gA8A9wArImIkwC6urq2B1ep+H98zaw8Fqxpob17x+f7mS3buPKw7gFeYdXa2tq2P29tbVXt/qzDev+Yfj2rpjxI1nUaloi4S9I5wAqSf+S9Ddja17HVDRltKpXKqI5/uNzu5tJs7Z6+roP27p4d25NbaGubOcArxpa8v99Z/89p/7wCiIgLgQsBJJ0FbMirLjOzelk8ZwqLVnaycXM30ye3sHjOlKJDGlMy/5+TpInAEcA+EfFDSbsDRMRTIwlA0osi4lFJ+5JcbzpiJOczM2uEWZMnsmL+tLQH0Tw9pkbJ+n9OrwKuJlm/aQbwQ2AOcBzwP0cYw08kvRDYAvyves8INDOz0Sdrz+kC4PSI+IGkJ9KylcCIV8eNiLeM9BxmZja2ZJ1K/krg0vR5wPbhvF3zCMrMzJpb1uS0HnhddYGk2SQr5JqZmdVV1mG9LwPXSvouMEnSacCJJHd3MDMzq6tMPaeIWE5yC6NpJNeaZgELImJFjrGZmVmTGsqSGb8HPtm7LWm8pK9ExOm5RGZmZk0r6zWnvkwAvlivQMzMzHqNJDlBssyFmZlZXY00OQ1+11gzM7MhGvCak6S3DbB7Up1jMTMzAwafEHHhIPsfrFcgZmZmvQZMTnnejdzMzKw/I73mZGZmVndOTmZmVjpOTmZmVjpOTmZmVjpOTmZmVjpOTmZmVjpOTmZmVjpOTmZmVjpOTmZmVjpOTmZmVjpOTmZmVjpOTmZmVjpOTmZmVjqFJydJn5Z0p6S1kpZJaik6JjMzK9Zg6znlStJLgJOBgyLiGUk/Ao4GltSrjvWbtnDCqk4ee3YrU3cZz+I5U5g1eWK9Tl9abndztdtsrCm850SSIHeVNAHYDfhzPU9+wqpOVnf0cN+mrazu6GHRys56nr603O7marfZWKOIKDYA6RTga8AzwIqIOLZ3X1dX1/bgKpXKsM6/YE0L7d07cvDMlm1ceVj3sOMdLdzuRLO022y0aWtr2/68tbVVtfuLHtbbCzgK2B/oBH4s6cMRcWntsdUNGYrp6zpo7+7ZsT25hba2mcMLeJgqlcqw4x8utzvdbpJ2l4Hb3VzybnfRw3rvAO6PiI6I2AJcCbyxnhU
snjOF2dMmccCe45k9bRKL50yp5+lLy+1urnabjTWF9pyAB4EjJO1GMqz3dmBNPSuYNXkiK+ZPq+cpRwW328xGs0J7ThFxC3AFcCvwhzSe7xUZk5mZFa/onhMRcQZwRtFxmJlZeRR9zcnMzGwnTk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6Tk5mZlY6hSYnSQdKuq3qsUnSp4qMyczMijehyMojYh1wKICk8cBDwFVFxmRmZsUrNDnVeDvwp4h4oOhAbGTWb9rCCas62bi5henrOlg8ZwqzJk8sOqzcFdnu3rofe3YrU3cZX0jdzfb9tnyV6ZrT0cCyooOwkTthVSerO3po7x7H6o4eFq3sLDqkhiiy3b1137dpa2F1N9v32/JVip6TpEnAe4HT+jumUqk0LqAcjPb4h2Lj5haqP/ds3NzdFO0vst3NWndZNFt7e42k3W1tbQPuL0VyAt4N3BoRj/R3wGANKbNKpTKq4x+q6es6aO/u2bE9uYW2tpkFRtQYRba7Wesug2b7/e6Vd7vLMqx3DB7SGzMWz5nC7GmTmNmyjdnTJrF4zpSiQ2qIItvdW/cBe44vrO5m+35bvhQRxQYg7Q48CBwQEV3V+7q6uooNrk78yaq5uN3Nxe0eudbWVtWWFT6sFxFPAS8sOg4zMyuPsgzrmZmZbefkZGZmpePkZGZmpePkZGZmpVP4bL2BjJXZemZm1r++Zuu552RmZqXj5GRmZqVT6mE9MzNrTu45mZlZ6Tg55UzSeEm/l7S86FgaRdIUSVdIulvSXZLeUHRMjSDp05LulLRW0jJJLUXHlBdJF0l6VNLaqrIXSLpeUiX9uleRMeahn3afm/6s3yHpKklj7uaCfbW7at+pkkLS1HrW6eSUv1OAu4oOosHOB34eES8HDqEJ2i/pJcDJwGERcTAwnmSNsrFqCTCvpuzzwA0R0QbckG6PNUvYud3XAwdHxKuBexhg6Z9RbAk7txtJM4EjSe6PWldOTjmSNAP4G+D7RcfSKJJagbcCFwJERE9ENMvqcxOAXSVNAHYD/lxwPLmJiFXA4zXFRwFL0+dLgfc1NKgG6KvdEbEiIp5LN38HzGh4YDnr5/sN8C3gc0DdJy84OeXrPJJv3LaiA2mg/YEO4OJ0OPP76Z3nx7SIeAj4OsknyI1AV0SsKDaqhts7Ijamzx8G9i4ymIJ8HLiu6CAaQdJRwEMRcXse53dyyomk+cCjEfFfRcfSYBOA1wIXRMRrgKcYm8M7z5NeXzmKJDnvA+wu6cPFRlWcSKYBN9VUYElfBJ4DLis6lrxJ2g34AnB6XnU4OeXnTcB7Ja0HLgfeJunSYkNqiA3Ahoi4Jd2+giRZjXXvAO6PiI6I2AJcCbyx4Jga7RFJ0wHSr48WHE/DSFoIzAeOjeb4/5y/Ivkgdnv6N24GcKukF9erAiennETEaRExIyL2I7kwfmNEjPlP0hHxMNAu6cC06O3AHwsMqVEeBI6QtJskkbR7zE8EqXE1cFz6/DjgZwXG0jCS5pEM3783Ip4uOp5GiIg/RMSLImK/9G/cBuC16e9/XTg5WR5OAi6TdAdwKHBWwfHkLu0pXgHcCvyB5Hfre4UGlSNJy4CbgQMlbZB0PHA28E5JFZKe5NlFxpiHftr9HWAycL2k2yR9t9Agc9BPu/Otszl6oGZmNpq452RmZqXj5GRmZqXj5GRmZqXj5GRmZqXj5GRmZqXj5GTWQJKOldTvbY0k/UrS3zcyJrMycnIyy4GkN0u6SVKXpMcl/VbS4RFxWUQcWXR8ZmU3oegAzMYaSXsCy4F/AH4ETALeAjxbZFxmo4l7Tmb19zKAiFgWEVsj4pl0WYU7JC2U9JveAyW9M12orkv
SdwBVn0jSx9MFG5+Q9AtJs9JySfpWugDcJkl/kHRwQ1tpliMnJ7P6uwfYKmmppHf3tyJsunLolcCXgKnAn0huGNy7/yiSOz8vAKYBvwaWpbuPJFk362VAK/BB4C+5tMasAE5OZnUWEZuAN5MsGbEY6JB0taTa9Y3eA9wZEVekdzI/j2QdpF4nAv8vIu5KF7M7Czg07T1tIbmf28tJbkN2V9VaSmajnpOTWQ7SZLEwImYAB5Os8XRezWH7AO1Vr4nqbWAWcL6kTkmdJCuRCnhJRNxIcsPRfwUelfS99FqX2Zjg5GSWs4i4G1hCkqSqbQRm9m6kS23MrNrfDnwiIqZUPXaNiJvS8/5LRLwOOIhkeO+zOTbDrKGcnMzqTNLLJZ0qaUa6PRM4BvhdzaHXAq+UtEDSBOBkoHqxtu8Cp0l6ZXqeVkkfSJ8fLun1kiaSrDbcDWzLtWFmDeTkZFZ/m4HXA7dIeookKa0FTq0+KCIeAz5Asu7RX4A24LdV+68CzgEul7QpPce70917klzPegJ4IH39ufk1yayxvJ6TmZmVjntOZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOk5OZmZWOv8NrV++/ZFYCUgAAAAASUVORK5CYII=\n", 543 | "text/plain": [ 544 | "
" 545 | ] 546 | }, 547 | "metadata": { 548 | "needs_background": "light" 549 | }, 550 | "output_type": "display_data" 551 | } 552 | ], 553 | "source": [ 554 | "polygons_data_frame.plot(\n", 555 | " title='Sides vs Length of Name',\n", 556 | " kind='scatter',\n", 557 | " x='Sides',\n", 558 | " y='Length of Name',\n", 559 | ")" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 23, 565 | "metadata": {}, 566 | "outputs": [ 567 | { 568 | "name": "stderr", 569 | "output_type": "stream", 570 | "text": [ 571 | "/Users/tarek/anaconda3/envs/scikitbook/lib/python3.6/site-packages/ipykernel_launcher.py:11: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.\n", 572 | " # This is added back by InteractiveShellApp.init_path()\n" 573 | ] 574 | }, 575 | { 576 | "data": { 577 | "image/png": "\n", 578 | "text/plain": [ 579 | "
" 580 | ] 581 | }, 582 | "metadata": { 583 | "needs_background": "light" 584 | }, 585 | "output_type": "display_data" 586 | } 587 | ], 588 | "source": [ 589 | "fig, ax = plt.subplots(figsize=(12, 6));\n", 590 | "\n", 591 | "polygons_data_frame.plot(\n", 592 | " title='Sides vs Length of Name',\n", 593 | " kind='scatter',\n", 594 | " x='Sides',\n", 595 | " y='Length of Name',\n", 596 | " ax=ax,\n", 597 | ")\n", 598 | "\n", 599 | "fig.show()" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 24, 605 | "metadata": {}, 606 | "outputs": [ 607 | { 608 | "data": { 609 | "text/html": [ 610 | "
\n", 611 | "\n", 624 | "\n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | "
NameSidesLength of Name
9Dodecagon129
8Hendecagon1110
2Pentagon58
\n", 654 | "
" 655 | ], 656 | "text/plain": [ 657 | " Name Sides Length of Name\n", 658 | "9 Dodecagon 12 9\n", 659 | "8 Hendecagon 11 10\n", 660 | "2 Pentagon 5 8" 661 | ] 662 | }, 663 | "execution_count": 24, 664 | "metadata": {}, 665 | "output_type": "execute_result" 666 | } 667 | ], 668 | "source": [ 669 | "polygons_data_frame.sample(n=3)" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": null, 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [] 685 | } 686 | ], 687 | "metadata": { 688 | "hide_input": false, 689 | "kernelspec": { 690 | "display_name": "Python 3", 691 | "language": "python", 692 | "name": "python3" 693 | }, 694 | "language_info": { 695 | "codemirror_mode": { 696 | "name": "ipython", 697 | "version": 3 698 | }, 699 | "file_extension": ".py", 700 | "mimetype": "text/x-python", 701 | "name": "python", 702 | "nbconvert_exporter": "python", 703 | "pygments_lexer": "ipython3", 704 | "version": "3.6.9" 705 | }, 706 | "toc": { 707 | "base_numbering": 1, 708 | "nav_menu": {}, 709 | "number_sections": true, 710 | "sideBar": true, 711 | "skip_h1_title": false, 712 | "title_cell": "Table of Contents", 713 | "title_sidebar": "Contents", 714 | "toc_cell": false, 715 | "toc_position": {}, 716 | "toc_section_display": true, 717 | "toc_window_display": false 718 | } 719 | }, 720 | "nbformat": 4, 721 | "nbformat_minor": 2 722 | } 723 | -------------------------------------------------------------------------------- /Chapter02/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter02/.keep -------------------------------------------------------------------------------- /Chapter02/README.md: 
-------------------------------------------------------------------------------- 1 | # Chapter 2: Making Decisions with Trees 2 | 3 | This chapter will introduce you to the first supervised learning algorithm in this book - decision trees. It is introduced early on because it is a versatile and easy-to-understand algorithm. You will also see later on that it is used as the building block for numerous advanced algorithms, such as Random Forest and Gradient Boosted Trees. 4 | 5 | In each chapter, you will learn about general machine learning and statistical concepts alongside the main topic of the chapter. In this one, you will get to know about data splitting, model evaluation and hyper-parameter tuning. 6 | 7 | By the end of this chapter, you will have a very good understanding of the following topics: 8 | 9 | - How do decision trees learn? 10 | - What are the best ways to split your data? 11 | - How to use cross-validation to get more reliable scores? 12 | - What are hyper-parameters and how to tune them? 13 | - Visualising the tree's decision boundaries. 14 | - Using decision trees for regression. 
15 | 16 | 17 | -------------------------------------------------------------------------------- /Chapter02/regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Decision Trees: Regression" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Creating Height Dataset" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import numpy as np\n", 34 | "import pandas as pd\n", 35 | "\n", 36 | "n = 200\n", 37 | "\n", 38 | "height_pop1_f = np.random.normal(loc=155, scale=4, size=n)\n", 39 | "height_pop1_m = np.random.normal(loc=175, scale=5, size=n)\n", 40 | "height_pop2_f = np.random.normal(loc=165, scale=15, size=n)\n", 41 | "height_pop2_m = np.random.normal(loc=185, scale=12, size=n)\n", 42 | "\n", 43 | "height_f = np.concatenate([height_pop1_f, height_pop2_f])\n", 44 | "height_m = np.concatenate([height_pop1_m, height_pop2_m])\n", 45 | "\n", 46 | "df_height = pd.DataFrame(\n", 47 | " {\n", 48 | " 'Gender': [1 for i in range(height_f.size)] + [2 for i in range(height_m.size)],\n", 49 | " 'Height': np.concatenate((height_f, height_m))\n", 50 | " }\n", 51 | ")\n", 52 | "\n", 53 | "# df_height['Gender (text)'] = df_height['Gender'].apply(lambda g: {1: 'F', 2: 'M'}.get(g, 'N/A'))" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/html": [ 64 | "
\n", 65 | "\n", 78 | "\n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | "
GenderHeight
2521195.379347
7162181.509895
1341163.911061
\n", 104 | "
" 105 | ], 106 | "text/plain": [ 107 | " Gender Height\n", 108 | "252 1 195.379347\n", 109 | "716 2 181.509895\n", 110 | "134 1 163.911061" 111 | ] 112 | }, 113 | "execution_count": 4, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "df_height.sample(3)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "name": "stderr", 129 | "output_type": "stream", 130 | "text": [ 131 | "/Users/tarek/anaconda3/envs/scikitbook/lib/python3.6/site-packages/ipykernel_launcher.py:17: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.\n" 132 | ] 133 | }, 134 | { 135 | "data": { 136 | "image/png": "\n", 137 | "text/plain": [ 138 | "
" 139 | ] 140 | }, 141 | "metadata": { 142 | "needs_background": "light" 143 | }, 144 | "output_type": "display_data" 145 | } 146 | ], 147 | "source": [ 148 | "fig, ax = plt.subplots(1, 1, figsize=(10, 5))\n", 149 | "\n", 150 | "df_height[df_height['Gender'] == 1]['Height'].plot(\n", 151 | " label='Female', kind='hist', \n", 152 | " bins=10, alpha=0.7, ax=ax\n", 153 | ")\n", 154 | "df_height[df_height['Gender'] == 2]['Height'].plot(\n", 155 | " label='Male', kind='hist', \n", 156 | " bins=10, alpha=0.7, ax=ax\n", 157 | ")\n", 158 | "\n", 159 | "# ax.set_xlim(140, 200)\n", 160 | "ax.legend()\n", 161 | "ax.set_title('Height Distribution')\n", 162 | "ax.set_xlabel('Height (in centimeters)')\n", 163 | "\n", 164 | "fig.show()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 6, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/html": [ 175 | "
\n", 176 | "\n", 193 | "\n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | "
Height
meanmedian
Gender
1160.9157.0
2179.4177.3
\n", 223 | "
" 224 | ], 225 | "text/plain": [ 226 | " Height \n", 227 | " mean median\n", 228 | "Gender \n", 229 | "1 160.9 157.0\n", 230 | "2 179.4 177.3" 231 | ] 232 | }, 233 | "execution_count": 6, 234 | "metadata": {}, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "# df_height['Gender (text)'] = df_height['Gender'].apply(lambda g: {1: 'F', 2: 'M'}.get(g, 'N/A'))\n", 240 | "\n", 241 | "df_height.groupby('Gender')[['Height']].agg([np.mean, np.median]).round(1)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## Splitting Data then applying the Regressor" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 7, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "data": { 258 | "text/plain": [ 259 | "DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,\n", 260 | " max_features=None, max_leaf_nodes=None,\n", 261 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 262 | " min_samples_leaf=1, min_samples_split=2,\n", 263 | " min_weight_fraction_leaf=0.0, presort='deprecated',\n", 264 | " random_state=None, splitter='best')" 265 | ] 266 | }, 267 | "execution_count": 7, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "from sklearn.model_selection import train_test_split\n", 274 | "\n", 275 | "df_train, df_test = train_test_split(df_height, test_size=0.3, random_state=22)\n", 276 | "\n", 277 | "x_train = df_train[['Gender']]\n", 278 | "x_test = df_test[['Gender']]\n", 279 | "\n", 280 | "y_train = df_train['Height']\n", 281 | "y_test = df_test['Height']\n", 282 | "\n", 283 | "from sklearn.tree import DecisionTreeRegressor\n", 284 | "\n", 285 | "clf = DecisionTreeRegressor()\n", 286 | "\n", 287 | "clf.fit(x_train, y_train)\n", 288 | "\n" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 8, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | 
"output_type": "stream", 299 | "text": [ 300 | "\n", 301 | "criterion=mse:\n", 302 | "\n", 303 | "|--- Gender <= 1.5\n", 304 | "| |--- value: [160.8]\n", 305 | "|--- Gender > 1.5\n", 306 | "| |--- value: [179.7]\n", 307 | "\n", 308 | "MSE: 131.8\n", 309 | "MAE: 8.8\n", 310 | "\n", 311 | "criterion=mae:\n", 312 | "\n", 313 | "|--- Gender <= 1.5\n", 314 | "| |--- value: [157.2]\n", 315 | "|--- Gender > 1.5\n", 316 | "| |--- value: [177.7]\n", 317 | "\n", 318 | "MSE: 140.3\n", 319 | "MAE: 8.1\n" 320 | ] 321 | } 322 | ], 323 | "source": [ 324 | "from sklearn.tree import export_text\n", 325 | "from sklearn.metrics import mean_squared_error, mean_absolute_error\n", 326 | "\n", 327 | "\n", 328 | "for criterion in ['mse', 'mae']:\n", 329 | " rgrsr = DecisionTreeRegressor(criterion=criterion)\n", 330 | " rgrsr.fit(x_train, y_train)\n", 331 | "\n", 332 | " print(f'\\ncriterion={criterion}:\\n')\n", 333 | " print(export_text(rgrsr, feature_names=['Gender'], spacing=3, decimals=1))\n", 334 | " \n", 335 | " y_test_pred = rgrsr.predict(x_test)\n", 336 | " \n", 337 | " print('MSE:', round(mean_squared_error(y_test, y_test_pred), 1))\n", 338 | " print('MAE:', round(mean_absolute_error(y_test, y_test_pred), 1))" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 9, 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "text/plain": [ 349 | "230 189.715860\n", 350 | "140 149.980960\n", 351 | "172 150.570729\n", 352 | "782 193.298542\n", 353 | "406 172.337001\n", 354 | " ... 
\n", 355 | "491 172.981764\n", 356 | "502 180.858260\n", 357 | "358 167.625938\n", 358 | "356 157.478166\n", 359 | "132 154.342254\n", 360 | "Name: Height, Length: 560, dtype: float64" 361 | ] 362 | }, 363 | "execution_count": 9, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "y_train" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 10, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "name": "stdout", 379 | "output_type": "stream", 380 | "text": [ 381 | "\n", 382 | "Emphasis on below 150:\n", 383 | "\n", 384 | "|--- Gender <= 1.5\n", 385 | "| |--- value: [152.7]\n", 386 | "|--- Gender > 1.5\n", 387 | "| |--- value: [179.7]\n", 388 | "\n", 389 | "MSE: 170.5\n", 390 | "MAE: 9.0\n", 391 | "\n", 392 | "Emphasis on above 150:\n", 393 | "\n", 394 | "|--- Gender <= 1.5\n", 395 | "| |--- value: [162.6]\n", 396 | "|--- Gender > 1.5\n", 397 | "| |--- value: [179.7]\n", 398 | "\n", 399 | "MSE: 132.7\n", 400 | "MAE: 9.1\n" 401 | ] 402 | } 403 | ], 404 | "source": [ 405 | "from sklearn.tree import export_text\n", 406 | "from sklearn.metrics import mean_squared_error, mean_absolute_error\n", 407 | "\n", 408 | "\n", 409 | "for who_gets_more_weight in ['below 150', 'above 150']:\n", 410 | " rgrsr = DecisionTreeRegressor(criterion='mse')\n", 411 | " if who_gets_more_weight == 'below 150':\n", 412 | " sample_weight = y_train.apply(lambda h: 10 if h <= 150 else 1)\n", 413 | " else:\n", 414 | " sample_weight = y_train.apply(lambda h: 10 if h > 150 else 1)\n", 415 | " rgrsr.fit(x_train, y_train, sample_weight=sample_weight)\n", 416 | "\n", 417 | " print(f'\\nEmphasis on {who_gets_more_weight}:\\n')\n", 418 | " print(export_text(rgrsr, feature_names=['Gender'], spacing=3, decimals=1))\n", 419 | " \n", 420 | " y_test_pred = rgrsr.predict(x_test)\n", 421 | " \n", 422 | " print('MSE:', round(mean_squared_error(y_test, y_test_pred), 1))\n", 423 | " print('MAE:', round(mean_absolute_error(y_test, 
y_test_pred), 1))" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [] 432 | } 433 | ], 434 | "metadata": { 435 | "kernelspec": { 436 | "display_name": "Python 3", 437 | "language": "python", 438 | "name": "python3" 439 | }, 440 | "language_info": { 441 | "codemirror_mode": { 442 | "name": "ipython", 443 | "version": 3 444 | }, 445 | "file_extension": ".py", 446 | "mimetype": "text/x-python", 447 | "name": "python", 448 | "nbconvert_exporter": "python", 449 | "pygments_lexer": "ipython3", 450 | "version": "3.6.9" 451 | } 452 | }, 453 | "nbformat": 4, 454 | "nbformat_minor": 2 455 | } 456 | -------------------------------------------------------------------------------- /Chapter03/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter03/.keep -------------------------------------------------------------------------------- /Chapter03/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 3: Making Decisions with Linear Equations 2 | 3 | The linear models are possibly the most commonly used algorithms in statistics and machine learning. They are used for both regression and classification. Thus, in this chapter we will start by looking into the basic least-squares algorithm, then will move on to more advanced algorithms as the chapter progresses. 4 | 5 | The secondary topics that you will get introduced to in parallel to the linear model are regularisation and regression intervals. Regularisation is a very powerful concept that you will meet over and over again throughout your machine learning journey. Thus, I decided to introduce it early on in the book. 
The concept of regression intervals is also a very useful tool to quantify your uncertainty about your predictions. 6 | 7 | By the end of this chapter, you will have a very good understanding of the following topics: 8 | - Understanding linear models and their history 9 | - Learn about regression model evaluation criteria (MSE, MAE and R^2) 10 | - How to engineer new features and find their importance (e.g. Polynomial features) 11 | - What is regularisation? What are solvers? 12 | - Using your first Generalised Linear Model, i.e. Logistic regression 13 | - Additional linear models (Stochastic Gradient Descent, Elastic-net, RANSAC, etc) 14 | - Finding regression intervals 15 | 16 | 17 | -------------------------------------------------------------------------------- /Chapter04/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter04/.keep -------------------------------------------------------------------------------- /Chapter04/README.md: -------------------------------------------------------------------------------- 1 | # Preparing Your Data 2 | 3 | In real life, it will often be the case that the data is not as clean as you would like it to be. Sometimes, even clean data can still be preprocessed in ways to make things easier for our machine learning algorithm. 
4 | 5 | -------------------------------------------------------------------------------- /Chapter04/category encoding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Category Encoding" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 26, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pip install category_encoders" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 27, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "import pandas as pd" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 28, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "import category_encoders as ce" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 29, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "df = pd.DataFrame(\n", 45 | " {\n", 46 | " 'Size': np.random.choice(['XS', 'S', 'M', 'L', 'XL', 'XXL'], 10),\n", 47 | " 'Brand': np.random.choice(['Nike', 'Puma', 'Adidas', 'Le Coq', 'Reebok'], 10),\n", 48 | " }\n", 49 | ")" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 30, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/html": [ 60 | "
\n", 61 | "\n", 74 | "\n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | "
SizeBrand
5LPuma
7XSAdidas
3XLPuma
0LReebok
\n", 105 | "
" 106 | ], 107 | "text/plain": [ 108 | " Size Brand\n", 109 | "5 L Puma\n", 110 | "7 XS Adidas\n", 111 | "3 XL Puma\n", 112 | "0 L Reebok" 113 | ] 114 | }, 115 | "execution_count": 30, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "df.sample(n=4)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "# Splitting the Dataset" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 31, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "from sklearn.model_selection import train_test_split\n", 138 | "df_train, df_test = train_test_split(df, test_size=0.5)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 32, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "(['XS', 'S', 'XXL', 'L'], ['XL', 'XS', 'S', 'L'])" 150 | ] 151 | }, 152 | "execution_count": 32, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "df_train['Size'].value_counts().index.values.tolist(), df_test['Size'].value_counts().index.values.tolist(), " 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 33, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "(['Puma', 'Nike', 'Adidas'], ['Adidas', 'Le Coq', 'Puma', 'Reebok'])" 170 | ] 171 | }, 172 | "execution_count": 33, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "df_train['Brand'].value_counts().index.values.tolist(), df_test['Brand'].value_counts().index.values.tolist(), " 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "# One-Hot Encoding" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 34, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "from category_encoders.one_hot 
import OneHotEncoder\n", 195 | "encoder = OneHotEncoder(use_cat_names=True, handle_unknown='return_nan')\n", 196 | "x_train = encoder.fit_transform(df_train)\n", 197 | "x_test = encoder.transform(df_test)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 35, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "data": { 207 | "text/html": [ 208 | "
\n", 209 | "\n", 222 | "\n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | "
SizeBrand
8XXLAdidas
4XSNike
5LPuma
2XSNike
1SPuma
\n", 258 | "
" 259 | ], 260 | "text/plain": [ 261 | " Size Brand\n", 262 | "8 XXL Adidas\n", 263 | "4 XS Nike\n", 264 | "5 L Puma\n", 265 | "2 XS Nike\n", 266 | "1 S Puma" 267 | ] 268 | }, 269 | "execution_count": 35, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "df_train" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 36, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/html": [ 286 | "
\n", 287 | "\n", 300 | "\n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | "
SizeBrand
6SLe Coq
9XLAdidas
3XLPuma
0LReebok
7XSAdidas
\n", 336 | "
" 337 | ], 338 | "text/plain": [ 339 | " Size Brand\n", 340 | "6 S Le Coq\n", 341 | "9 XL Adidas\n", 342 | "3 XL Puma\n", 343 | "0 L Reebok\n", 344 | "7 XS Adidas" 345 | ] 346 | }, 347 | "execution_count": 36, 348 | "metadata": {}, 349 | "output_type": "execute_result" 350 | } 351 | ], 352 | "source": [ 353 | "df_test" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 37, 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "data": { 363 | "text/html": [ 364 | "
\n", 365 | "\n", 378 | "\n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | "
Size_XXLSize_XSSize_LSize_SBrand_AdidasBrand_NikeBrand_Puma
60.00.00.01.0NaNNaNNaN
9NaNNaNNaNNaN1.00.00.0
3NaNNaNNaNNaN0.00.01.0
00.00.01.00.0NaNNaNNaN
70.01.00.00.01.00.00.0
\n", 444 | "
" 445 | ], 446 | "text/plain": [ 447 | " Size_XXL Size_XS Size_L Size_S Brand_Adidas Brand_Nike Brand_Puma\n", 448 | "6 0.0 0.0 0.0 1.0 NaN NaN NaN\n", 449 | "9 NaN NaN NaN NaN 1.0 0.0 0.0\n", 450 | "3 NaN NaN NaN NaN 0.0 0.0 1.0\n", 451 | "0 0.0 0.0 1.0 0.0 NaN NaN NaN\n", 452 | "7 0.0 1.0 0.0 0.0 1.0 0.0 0.0" 453 | ] 454 | }, 455 | "execution_count": 37, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | } 459 | ], 460 | "source": [ 461 | "x_test" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": {}, 467 | "source": [ 468 | "Setting the unknown values to zero" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 38, 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "encoder = ce.one_hot.OneHotEncoder(use_cat_names=True, handle_unknown='value')\n", 478 | "x_train = encoder.fit_transform(df_train)\n", 479 | "x_test = encoder.transform(df_test)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 39, 485 | "metadata": {}, 486 | "outputs": [ 487 | { 488 | "data": { 489 | "text/html": [ 490 | "
\n", 491 | "\n", 504 | "\n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | "
Size_XXLSize_XSSize_LSize_SBrand_AdidasBrand_NikeBrand_Puma
00010000
70100100
60001000
\n", 550 | "
" 551 | ], 552 | "text/plain": [ 553 | " Size_XXL Size_XS Size_L Size_S Brand_Adidas Brand_Nike Brand_Puma\n", 554 | "0 0 0 1 0 0 0 0\n", 555 | "7 0 1 0 0 1 0 0\n", 556 | "6 0 0 0 1 0 0 0" 557 | ] 558 | }, 559 | "execution_count": 39, 560 | "metadata": {}, 561 | "output_type": "execute_result" 562 | } 563 | ], 564 | "source": [ 565 | "x_test.sample(n=3)" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "# Ordinal Encoder" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 40, 578 | "metadata": {}, 579 | "outputs": [], 580 | "source": [ 581 | "df_size = df[['Size']].copy()\n", 582 | "df_size_train, df_size_test = train_test_split(df_size, test_size=0.5)\n", 583 | "\n", 584 | "from category_encoders.ordinal import OrdinalEncoder\n", 585 | "\n", 586 | "oencoder = OrdinalEncoder(\n", 587 | " mapping= [\n", 588 | " {\n", 589 | " 'col': 'Size', \n", 590 | " 'mapping': {'XS': 1, 'S': 2, 'M': 3, 'L': 4, 'XL': 5}\n", 591 | " }\n", 592 | " ]\n", 593 | ")\n", 594 | "\n", 595 | "df_train.loc[:, 'Size [Ordinal Encoded]'] = oencoder.fit_transform(df_train['Size'])['Size'].values\n", 596 | "df_test.loc[:, 'Size [Ordinal Encoded]'] = oencoder.transform(df_test['Size'])['Size'].values" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 41, 602 | "metadata": {}, 603 | "outputs": [ 604 | { 605 | "data": { 606 | "text/html": [ 607 | "
\n", 608 | "\n", 621 | "\n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | "
SizeBrandSize [Ordinal Encoded]
6SLe Coq2
9XLAdidas5
3XLPuma5
0LReebok4
7XSAdidas1
\n", 663 | "
" 664 | ], 665 | "text/plain": [ 666 | " Size Brand Size [Ordinal Encoded]\n", 667 | "6 S Le Coq 2\n", 668 | "9 XL Adidas 5\n", 669 | "3 XL Puma 5\n", 670 | "0 L Reebok 4\n", 671 | "7 XS Adidas 1" 672 | ] 673 | }, 674 | "execution_count": 41, 675 | "metadata": {}, 676 | "output_type": "execute_result" 677 | } 678 | ], 679 | "source": [ 680 | "df_test.head(5)" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": null, 686 | "metadata": {}, 687 | "outputs": [], 688 | "source": [] 689 | } 690 | ], 691 | "metadata": { 692 | "kernelspec": { 693 | "display_name": "Python 3", 694 | "language": "python", 695 | "name": "python3" 696 | }, 697 | "language_info": { 698 | "codemirror_mode": { 699 | "name": "ipython", 700 | "version": 3 701 | }, 702 | "file_extension": ".py", 703 | "mimetype": "text/x-python", 704 | "name": "python", 705 | "nbconvert_exporter": "python", 706 | "pygments_lexer": "ipython3", 707 | "version": "3.6.9" 708 | } 709 | }, 710 | "nbformat": 4, 711 | "nbformat_minor": 2 712 | } 713 | -------------------------------------------------------------------------------- /Chapter05/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter05/.keep -------------------------------------------------------------------------------- /Chapter05/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter05/README.md -------------------------------------------------------------------------------- /Chapter06/.keep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter06/.keep -------------------------------------------------------------------------------- /Chapter06/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter06/README.md -------------------------------------------------------------------------------- /Chapter06/Tokenization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tokenization" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "lines = [\n", 17 | " 'How to tokenize?\\nLike a boss.',\n", 18 | " 'Google is accessible via http://www.google.com',\n", 19 | " '1000 new followers! #TwitterFamous',\n", 20 | "]" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "['How to tokenize?\\nLike a boss.',\n", 32 | " 'Google is accessible via http://www.google.com',\n", 33 | " '1000 new followers! 
#TwitterFamous']" 34 | ] 35 | }, 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "lines" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "['How', 'to', 'tokenize?', 'Like', 'a', 'boss.']\n", 55 | "['Google', 'is', 'accessible', 'via', 'http://www.google.com']\n", 56 | "['1000', 'new', 'followers!', '#TwitterFamous']\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "for line in lines:\n", 62 | " print(line.split())" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "['How', 'to', 'tokenize', 'Like', 'a', 'boss']\n", 75 | "['Google', 'is', 'accessible', 'via', 'http', 'www', 'google', 'com']\n", 76 | "['1000', 'new', 'followers', 'TwitterFamous']\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "import re\n", 82 | "\n", 83 | "_token_pattern = r\"\\w+\"\n", 84 | "token_pattern = re.compile(_token_pattern)\n", 85 | " \n", 86 | "for line in lines:\n", 87 | " print(token_pattern.findall(line))" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "['how', 'to', 'tokenize', 'like', 'a', 'boss']\n", 100 | "['google', 'is', 'accessible', 'via', '_url_']\n", 101 | "['_num_', 'new', 'followers', '_hashtag_']\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "_token_pattern = r\"\\w+\"\n", 107 | "token_pattern = re.compile(_token_pattern)\n", 108 | "\n", 109 | "def tokenizer(line):\n", 110 | " line = line.lower()\n", 111 | " line = re.sub(r'http[s]?://[\\w\\/\\-\\.\\?]+','_url_', line)\n", 112 | " line = re.sub(r'\\d+:\\d+','_time_', line)\n", 113 | " line = re.sub(r'#\\w+', 
'_hashtag_', line)\n", 114 | " line = re.sub(r'\\d+','_num_', line)\n", 115 | " return token_pattern.findall(line)\n", 116 | "\n", 117 | "for line in lines:\n", 118 | " print(tokenizer(line))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "['How', 'to', 'tokenize', 'Like', 'boss']\n", 131 | "['Google', 'is', 'accessible', 'via', 'http', 'www', 'google', 'com']\n", 132 | "['1000', 'new', 'followers', 'TwitterFamous']\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "import re\n", 138 | "\n", 139 | "_token_pattern = r\"(?u)\\b\\w\\w+\\b\"\n", 140 | "token_pattern = re.compile(_token_pattern)\n", 141 | " \n", 142 | "for line in lines:\n", 143 | " print(token_pattern.findall(line))" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "from sklearn.feature_extraction.text import CountVectorizer\n", 153 | "\n", 154 | "vec = CountVectorizer(lowercase=True, tokenizer=tokenizer)\n", 155 | "\n", 156 | "x = vec.fit_transform(lines)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 8, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "['how', 'to', 'tokenize', 'like', 'boss', 'google', 'is', 'accessible', 'via', '_url_', '_num_', 'new', 'followers', '_hashtag_']\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "print(list(vec.vocabulary_.keys()))" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 9, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/html": [ 184 | "
\n", 185 | "\n", 198 | "\n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | "
_hashtag__num__url_accessiblebossfollowersgooglehowislikenewtotokenizevia
doc-id
000001001010110
100110010100001
211000100001000
\n", 289 | "
" 290 | ], 291 | "text/plain": [ 292 | " _hashtag_ _num_ _url_ accessible boss followers google how is \\\n", 293 | "doc-id \n", 294 | "0 0 0 0 0 1 0 0 1 0 \n", 295 | "1 0 0 1 1 0 0 1 0 1 \n", 296 | "2 1 1 0 0 0 1 0 0 0 \n", 297 | "\n", 298 | " like new to tokenize via \n", 299 | "doc-id \n", 300 | "0 1 0 1 1 0 \n", 301 | "1 0 0 0 0 1 \n", 302 | "2 0 1 0 0 0 " 303 | ] 304 | }, 305 | "execution_count": 9, 306 | "metadata": {}, 307 | "output_type": "execute_result" 308 | } 309 | ], 310 | "source": [ 311 | "df = pd.DataFrame(\n", 312 | " x.todense(), \n", 313 | " columns=vec.get_feature_names(),\n", 314 | ")\n", 315 | "\n", 316 | "df.index.name = 'doc-id'\n", 317 | "\n", 318 | "df" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 10, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "flight_delayed_lines = [\n", 328 | " 'Flight was delayed, I am not happy',\n", 329 | " 'Flight was not delayed, I am happy'\n", 330 | "]" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 11, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/html": [ 341 | "
\n", 342 | "\n", 355 | "\n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | "
amdelayedflighthappynotwas
doc-id
0111111
1111111
\n", 397 | "
" 398 | ], 399 | "text/plain": [ 400 | " am delayed flight happy not was\n", 401 | "doc-id \n", 402 | "0 1 1 1 1 1 1\n", 403 | "1 1 1 1 1 1 1" 404 | ] 405 | }, 406 | "execution_count": 11, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "from sklearn.feature_extraction.text import CountVectorizer\n", 413 | "\n", 414 | "vec = CountVectorizer(ngram_range=(1,1))\n", 415 | "\n", 416 | "x = vec.fit_transform(flight_delayed_lines)\n", 417 | "\n", 418 | "df = pd.DataFrame(\n", 419 | " x.todense(), \n", 420 | " columns=vec.get_feature_names(),\n", 421 | ")\n", 422 | "\n", 423 | "df.index.name = 'doc-id'\n", 424 | "\n", 425 | "df" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 12, 431 | "metadata": {}, 432 | "outputs": [ 433 | { 434 | "data": { 435 | "text/html": [ 436 | "
\n", 437 | "\n", 450 | "\n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | "
am happyam notdelayed amflight wasnot delayednot happywas delayedwas not
doc-id
001110110
110111001
\n", 500 | "
" 501 | ], 502 | "text/plain": [ 503 | " am happy am not delayed am flight was not delayed not happy \\\n", 504 | "doc-id \n", 505 | "0 0 1 1 1 0 1 \n", 506 | "1 1 0 1 1 1 0 \n", 507 | "\n", 508 | " was delayed was not \n", 509 | "doc-id \n", 510 | "0 1 0 \n", 511 | "1 0 1 " 512 | ] 513 | }, 514 | "execution_count": 12, 515 | "metadata": {}, 516 | "output_type": "execute_result" 517 | } 518 | ], 519 | "source": [ 520 | "from sklearn.feature_extraction.text import CountVectorizer\n", 521 | "\n", 522 | "vec = CountVectorizer(ngram_range=(2,2))\n", 523 | "\n", 524 | "x = vec.fit_transform(flight_delayed_lines)\n", 525 | "\n", 526 | "df = pd.DataFrame(\n", 527 | " x.todense(), \n", 528 | " columns=vec.get_feature_names(),\n", 529 | ")\n", 530 | "\n", 531 | "df.index.name = 'doc-id'\n", 532 | "\n", 533 | "df" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 13, 539 | "metadata": {}, 540 | "outputs": [ 541 | { 542 | "data": { 543 | "text/html": [ 544 | "
\n", 545 | "\n", 558 | "\n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | "
amdelhapi anotwas, iam ham nappy...notot dot hs des not det hat wawasyed,
doc-id
01111111011...1011001111
11111111101...1100110111
\n", 660 | "

2 rows × 37 columns

\n", 661 | "
" 662 | ], 663 | "text/plain": [ 664 | " am del hap i a not was , i am h am n appy ... not \\\n", 665 | "doc-id ... \n", 666 | "0 1 1 1 1 1 1 1 0 1 1 ... 1 \n", 667 | "1 1 1 1 1 1 1 1 1 0 1 ... 1 \n", 668 | "\n", 669 | " ot d ot h s de s no t de t ha t wa was yed, \n", 670 | "doc-id \n", 671 | "0 0 1 1 0 0 1 1 1 1 \n", 672 | "1 1 0 0 1 1 0 1 1 1 \n", 673 | "\n", 674 | "[2 rows x 37 columns]" 675 | ] 676 | }, 677 | "execution_count": 13, 678 | "metadata": {}, 679 | "output_type": "execute_result" 680 | } 681 | ], 682 | "source": [ 683 | "from sklearn.feature_extraction.text import CountVectorizer\n", 684 | "\n", 685 | "vec = CountVectorizer(analyzer='char', ngram_range=(4,4))\n", 686 | "\n", 687 | "x = vec.fit_transform(flight_delayed_lines)\n", 688 | "\n", 689 | "df = pd.DataFrame(\n", 690 | " x.todense(), \n", 691 | " columns=vec.get_feature_names(),\n", 692 | ")\n", 693 | "\n", 694 | "df.index.name = 'doc-id'\n", 695 | "\n", 696 | "df" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": 14, 702 | "metadata": {}, 703 | "outputs": [], 704 | "source": [ 705 | "lines_fruits = [\n", 706 | " 'I like apples',\n", 707 | " 'I like oranges',\n", 708 | " 'I like pears',\n", 709 | "]" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 15, 715 | "metadata": {}, 716 | "outputs": [], 717 | "source": [ 718 | "from IPython.display import display_html\n", 719 | "\n", 720 | "def display_side_by_side(*args):\n", 721 | " \n", 722 | " html_str=''\n", 723 | " \n", 724 | " for df in args:\n", 725 | " html_str += df.to_html()\n", 726 | " html_str += ''.join([' ' for i in range(20)])\n", 727 | " \n", 728 | " html_str = html_str.replace('table','table style=\"display:inline;\"')\n", 729 | " \n", 730 | " display_html(html_str, raw=True)" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": 16, 736 | "metadata": { 737 | "scrolled": false 738 | }, 739 | "outputs": [ 740 | { 741 | "data": { 742 | "text/html": [ 743 | 
"\n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | "
applesilikeorangespears
CountVectorizer
01.01.01.00.00.0
10.01.01.01.00.0
20.01.01.00.01.0
                    \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | "
applesilikeorangespears
TfidfVectorizer
00.770.450.450.000.00
10.000.450.450.770.00
20.000.450.450.000.77
                    " 834 | ] 835 | }, 836 | "metadata": {}, 837 | "output_type": "display_data" 838 | } 839 | ], 840 | "source": [ 841 | "from sklearn.feature_extraction.text import CountVectorizer\n", 842 | "\n", 843 | "vec = CountVectorizer(token_pattern=r'\\w+')\n", 844 | "\n", 845 | "x = vec.fit_transform(lines_fruits)\n", 846 | "\n", 847 | "df1 = pd.DataFrame(\n", 848 | " x.todense().astype(float).round(2), \n", 849 | " columns=vec.get_feature_names(),\n", 850 | ")\n", 851 | "\n", 852 | "df1.index.name = 'CountVectorizer'\n", 853 | "\n", 854 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 855 | "\n", 856 | "vec = TfidfVectorizer(token_pattern=r'\\w+')\n", 857 | "\n", 858 | "x = vec.fit_transform(lines_fruits)\n", 859 | "\n", 860 | "df2 = pd.DataFrame(\n", 861 | " x.todense().round(2), \n", 862 | " columns=vec.get_feature_names(),\n", 863 | ")\n", 864 | "\n", 865 | "df2.index.name = 'TfidfVectorizer'\n", 866 | "\n", 867 | "\n", 868 | "display_side_by_side(df1, df2)" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": 17, 874 | "metadata": {}, 875 | "outputs": [ 876 | { 877 | "data": { 878 | "text/html": [ 879 | "
\n", 880 | "\n", 893 | "\n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | "
applesilikeorangespears
CountVectorizer
01.01.01.00.00.0
10.01.01.01.00.0
\n", 931 | "
" 932 | ], 933 | "text/plain": [ 934 | " apples i like oranges pears\n", 935 | "CountVectorizer \n", 936 | "0 1.0 1.0 1.0 0.0 0.0\n", 937 | "1 0.0 1.0 1.0 1.0 0.0" 938 | ] 939 | }, 940 | "execution_count": 17, 941 | "metadata": {}, 942 | "output_type": "execute_result" 943 | } 944 | ], 945 | "source": [ 946 | "from sklearn.feature_extraction.text import CountVectorizer\n", 947 | "\n", 948 | "vec = CountVectorizer(token_pattern=r'\\w+')\n", 949 | "\n", 950 | "x = vec.fit_transform(lines_fruits)\n", 951 | "\n", 952 | "df1 = pd.DataFrame(\n", 953 | " x.todense().astype(float).round(2), \n", 954 | " columns=vec.get_feature_names(),\n", 955 | ")\n", 956 | "\n", 957 | "df1.index.name = 'CountVectorizer'\n", 958 | "\n", 959 | "df1.head(2)" 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": 18, 965 | "metadata": {}, 966 | "outputs": [ 967 | { 968 | "data": { 969 | "text/html": [ 970 | "
\n", 971 | "\n", 984 | "\n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | "
applesilikeorangespears
doc-id
00.7674950.4532950.4532950.0000000.000000
10.0000000.4532950.4532950.7674950.000000
20.0000000.4532950.4532950.0000000.767495
\n", 1030 | "
" 1031 | ], 1032 | "text/plain": [ 1033 | " apples i like oranges pears\n", 1034 | "doc-id \n", 1035 | "0 0.767495 0.453295 0.453295 0.000000 0.000000\n", 1036 | "1 0.000000 0.453295 0.453295 0.767495 0.000000\n", 1037 | "2 0.000000 0.453295 0.453295 0.000000 0.767495" 1038 | ] 1039 | }, 1040 | "execution_count": 18, 1041 | "metadata": {}, 1042 | "output_type": "execute_result" 1043 | } 1044 | ], 1045 | "source": [ 1046 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 1047 | "\n", 1048 | "vec = TfidfVectorizer(token_pattern=r'\\w+')\n", 1049 | "\n", 1050 | "x = vec.fit_transform(lines_fruits)\n", 1051 | "\n", 1052 | "df = pd.DataFrame(\n", 1053 | " x.todense(), \n", 1054 | " columns=vec.get_feature_names(),\n", 1055 | ")\n", 1056 | "\n", 1057 | "df.index.name = 'doc-id'\n", 1058 | "\n", 1059 | "df" 1060 | ] 1061 | }, 1062 | { 1063 | "cell_type": "code", 1064 | "execution_count": 19, 1065 | "metadata": {}, 1066 | "outputs": [], 1067 | "source": [ 1068 | "import spacy\n", 1069 | "\n", 1070 | "nlp = spacy.load('en_core_web_md')\n", 1071 | "\n", 1072 | "\n", 1073 | "terms = ['I', 'like', 'apples', 'oranges', 'pears']\n", 1074 | "vectors = [\n", 1075 | " nlp(term).vector.tolist() for term in terms\n", 1076 | "]" 1077 | ] 1078 | }, 1079 | { 1080 | "cell_type": "code", 1081 | "execution_count": 20, 1082 | "metadata": {}, 1083 | "outputs": [ 1084 | { 1085 | "data": { 1086 | "text/plain": [ 1087 | "300" 1088 | ] 1089 | }, 1090 | "execution_count": 20, 1091 | "metadata": {}, 1092 | "output_type": "execute_result" 1093 | } 1094 | ], 1095 | "source": [ 1096 | "len(vectors[terms.index('apples')])" 1097 | ] 1098 | }, 1099 | { 1100 | "cell_type": "code", 1101 | "execution_count": 21, 1102 | "metadata": {}, 1103 | "outputs": [ 1104 | { 1105 | "data": { 1106 | "text/plain": [ 1107 | "0 -0.633400\n", 1108 | "1 0.189810\n", 1109 | "2 -0.535440\n", 1110 | "3 -0.526580\n", 1111 | "4 -0.300010\n", 1112 | " ... 
\n", 1113 | "295 0.068773\n", 1114 | "296 -0.238810\n", 1115 | "297 -1.178400\n", 1116 | "298 0.255040\n", 1117 | "299 0.611710\n", 1118 | "Name: apples, Length: 300, dtype: float64" 1119 | ] 1120 | }, 1121 | "execution_count": 21, 1122 | "metadata": {}, 1123 | "output_type": "execute_result" 1124 | } 1125 | ], 1126 | "source": [ 1127 | "pd.Series(vectors[terms.index('apples')]).rename('apples')" 1128 | ] 1129 | }, 1130 | { 1131 | "cell_type": "code", 1132 | "execution_count": 22, 1133 | "metadata": {}, 1134 | "outputs": [ 1135 | { 1136 | "data": { 1137 | "text/html": [ 1138 | "
\n", 1139 | "\n", 1152 | "\n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | "
Ilikeapplesorangespears
I0.0000000.4445090.7955730.8117590.795573
like0.4445090.0000000.6701290.7228250.670129
apples0.7955730.6701290.0000000.2219060.000000
oranges0.8117590.7228250.2219060.0000000.221906
pears0.7955730.6701290.0000000.2219060.000000
\n", 1206 | "
" 1207 | ], 1208 | "text/plain": [ 1209 | " I like apples oranges pears\n", 1210 | "I 0.000000 0.444509 0.795573 0.811759 0.795573\n", 1211 | "like 0.444509 0.000000 0.670129 0.722825 0.670129\n", 1212 | "apples 0.795573 0.670129 0.000000 0.221906 0.000000\n", 1213 | "oranges 0.811759 0.722825 0.221906 0.000000 0.221906\n", 1214 | "pears 0.795573 0.670129 0.000000 0.221906 0.000000" 1215 | ] 1216 | }, 1217 | "execution_count": 22, 1218 | "metadata": {}, 1219 | "output_type": "execute_result" 1220 | } 1221 | ], 1222 | "source": [ 1223 | "from sklearn.metrics.pairwise import cosine_similarity\n", 1224 | "from sklearn.metrics.pairwise import cosine_distances\n", 1225 | "\n", 1226 | "pd.DataFrame(\n", 1227 | " cosine_distances(vectors),\n", 1228 | " index=terms,\n", 1229 | " columns=terms,\n", 1230 | ")" 1231 | ] 1232 | }, 1233 | { 1234 | "cell_type": "code", 1235 | "execution_count": 23, 1236 | "metadata": {}, 1237 | "outputs": [ 1238 | { 1239 | "data": { 1240 | "text/html": [ 1241 | "\n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | "
I like apples oranges pears
I10.560.20.190.2
like0.5610.330.280.33
apples0.20.3310.781
oranges0.190.280.7810.78
pears0.20.3310.781
" 1359 | ], 1360 | "text/plain": [ 1361 | "" 1362 | ] 1363 | }, 1364 | "execution_count": 23, 1365 | "metadata": {}, 1366 | "output_type": "execute_result" 1367 | } 1368 | ], 1369 | "source": [ 1370 | "import seaborn as sns\n", 1371 | "from sklearn.metrics.pairwise import cosine_similarity\n", 1372 | "\n", 1373 | "cm = sns.light_palette(\"Gray\", as_cmap=True)\n", 1374 | "\n", 1375 | "pd.DataFrame(\n", 1376 | " cosine_similarity(vectors),\n", 1377 | " index=terms,\n", 1378 | " columns=terms,\n", 1379 | ").round(2).style.background_gradient(cmap=cm)" 1380 | ] 1381 | }, 1382 | { 1383 | "cell_type": "code", 1384 | "execution_count": null, 1385 | "metadata": {}, 1386 | "outputs": [], 1387 | "source": [] 1388 | }, 1389 | { 1390 | "cell_type": "code", 1391 | "execution_count": null, 1392 | "metadata": {}, 1393 | "outputs": [], 1394 | "source": [] 1395 | }, 1396 | { 1397 | "cell_type": "code", 1398 | "execution_count": null, 1399 | "metadata": {}, 1400 | "outputs": [], 1401 | "source": [] 1402 | } 1403 | ], 1404 | "metadata": { 1405 | "kernelspec": { 1406 | "display_name": "Python 3", 1407 | "language": "python", 1408 | "name": "python3" 1409 | }, 1410 | "language_info": { 1411 | "codemirror_mode": { 1412 | "name": "ipython", 1413 | "version": 3 1414 | }, 1415 | "file_extension": ".py", 1416 | "mimetype": "text/x-python", 1417 | "name": "python", 1418 | "nbconvert_exporter": "python", 1419 | "pygments_lexer": "ipython3", 1420 | "version": "3.6.9" 1421 | } 1422 | }, 1423 | "nbformat": 4, 1424 | "nbformat_minor": 2 1425 | } 1426 | -------------------------------------------------------------------------------- /Chapter07/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter07/.keep -------------------------------------------------------------------------------- 
/Chapter07/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 7: Neural Networks; Here Comes the Deep Learning 2 | 3 | The term deep learning refers to deep Artificial Neural Networks (ANNs). The latter concept comes in different forms and shapes. In this chapter, we are going to cover one subset of feedforward neural networks known as the Multilayer Perceptron (MLP). It is one of the most commonly used types and is implemented by scikit-learn. As its name suggests, it is composed of multiple layers, and it is a feedforward network as there are no cyclic connections between its layers. The more layers there are, the deeper the network is. These deep networks can exist in multiple forms, such as MLP, Convolutional Neural Networks (CNNs), or Long Short-Term Memory (LSTM). The latter two are not implemented by scikit-learn, yet this will not stop us from discussing the main concepts behind CNNs and manually mimicking them using the tools from the scientific Python ecosystem. 
4 | 5 | In this chapter, we are going to cover the following topics: 6 | 7 | - Getting to know MLP 8 | - Classifying items of clothing 9 | - Untangling convolutions 10 | - MLP regressors 11 | -------------------------------------------------------------------------------- /Chapter08/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter08/.keep -------------------------------------------------------------------------------- /Chapter09/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter09/.keep -------------------------------------------------------------------------------- /Chapter09/Target Scaling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Target Scaling" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 14, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "x = np.random.uniform(low=5, high=20, size=100)\n", 17 | "e = np.random.normal(loc=0, scale=0.5, size=100)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 15, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "y = (x + e) ** 3 " 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 16, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stderr", 36 | "output_type": "stream", 37 | "text": [ 38 | "/Users/tarek/anaconda3/envs/scikitbook/lib/python3.6/site-packages/ipykernel_launcher.py:10: UserWarning: Matplotlib is currently using 
module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.\n", 39 | " # Remove the CWD from sys.path while we load stuff.\n" 40 | ] 41 | }, 42 | { 43 | "data": { 44 | "image/png": "\n", 45 | "text/plain": [ 46 | "
" 47 | ] 48 | }, 49 | "metadata": { 50 | "needs_background": "light" 51 | }, 52 | "output_type": "display_data" 53 | } 54 | ], 55 | "source": [ 56 | "fig, ax = plt.subplots(1, 1, figsize=(10, 8))\n", 57 | "\n", 58 | "pd.DataFrame({'x': x, 'y': y}).plot(\n", 59 | " title='Y is non-linear',\n", 60 | " kind='scatter',\n", 61 | " x='x', y='y', \n", 62 | " color='k', ax=ax\n", 63 | ")\n", 64 | "\n", 65 | "fig.show()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 17, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "x = x.reshape((x.shape[0],1))" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 18, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "from sklearn.model_selection import train_test_split\n", 84 | "\n", 85 | "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 19, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "from sklearn.linear_model import Ridge\n", 95 | "\n", 96 | "rgs = Ridge()\n", 97 | "rgs.fit(x_train, y_train)\n", 98 | "y_pred = rgs.predict(x_test)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 20, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "MAE=704, R2=0.883232\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "from sklearn.metrics import mean_absolute_error\n", 116 | "from sklearn.metrics import r2_score\n", 117 | "\n", 118 | "print(\n", 119 | " 'MAE={:.0f}, R2={:2f}'.format(\n", 120 | " mean_absolute_error(y_test, y_pred),\n", 121 | " r2_score(y_test, y_pred),\n", 122 | " )\n", 123 | ")" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 21, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "class YTransformer:\n", 133 | " \n", 134 | " def __init__(self, power=1):\n", 135 | " self.power = power\n", 136 | " 
\n", 137 | " def fit(self, x, y):\n", 138 | " pass\n", 139 | " \n", 140 | " def transform(self, x, y):\n", 141 | " return x, np.power(y, self.power)\n", 142 | " \n", 143 | " def inverse_transform(self, x, y):\n", 144 | " return x, np.power(y, 1/self.power)\n", 145 | " \n", 146 | " \n", 147 | " def fit_transform(self, x, y):\n", 148 | " return self.transform(x, y)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 22, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "name": "stdout", 158 | "output_type": "stream", 159 | "text": [ 160 | "Trasfomed y^1.00: MAE=704, R2=0.88\n", 161 | "Trasfomed y^0.50: MAE=378, R2=0.96\n", 162 | "Trasfomed y^0.33: MAE=339, R2=0.96\n", 163 | "Trasfomed y^0.25: MAE=359, R2=0.95\n", 164 | "Trasfomed y^0.20: MAE=400, R2=0.93\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "from sklearn.linear_model import Ridge\n", 170 | "from sklearn.metrics import mean_absolute_error\n", 171 | "from sklearn.metrics import r2_score\n", 172 | "\n", 173 | "for power in [1, 1/2, 1/3, 1/4, 1/5]:\n", 174 | "\n", 175 | " yt = YTransformer(power)\n", 176 | " _, y_train_t = yt.fit_transform(None, y_train)\n", 177 | " _, y_test_t = yt.transform(None, y_test)\n", 178 | " \n", 179 | " rgs = Ridge()\n", 180 | "\n", 181 | " rgs.fit(x_train, y_train_t)\n", 182 | " y_pred_t = rgs.predict(x_test)\n", 183 | " \n", 184 | " _, y_pred = yt.inverse_transform(None, y_pred_t)\n", 185 | "\n", 186 | " print(\n", 187 | " 'Trasfomed y^{:.2f}: MAE={:.0f}, R2={:.2f}'.format(\n", 188 | " power,\n", 189 | " mean_absolute_error(y_test, y_pred),\n", 190 | " r2_score(y_test, y_pred),\n", 191 | " )\n", 192 | " )" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [] 201 | } 202 | ], 203 | "metadata": { 204 | "kernelspec": { 205 | "display_name": "Python 3", 206 | "language": "python", 207 | "name": "python3" 208 | }, 209 | "language_info": { 210 | 
"codemirror_mode": { 211 | "name": "ipython", 212 | "version": 3 213 | }, 214 | "file_extension": ".py", 215 | "mimetype": "text/x-python", 216 | "name": "python", 217 | "nbconvert_exporter": "python", 218 | "pygments_lexer": "ipython3", 219 | "version": "3.6.9" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 2 224 | } 225 | -------------------------------------------------------------------------------- /Chapter10/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter10/.keep -------------------------------------------------------------------------------- /Chapter11/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter11/.keep -------------------------------------------------------------------------------- /Chapter12/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter12/.keep -------------------------------------------------------------------------------- /Chapter13/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter13/.keep -------------------------------------------------------------------------------- /Chapter13/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 13: 
Recommender System; Getting to Know Their Taste 2 | 3 | In this chapter, we will learn about the different approaches used by recommender systems. We will mainly use a sister library to scikit-learn called Surprise. Surprise is a toolkit that implements different collaborative filtering algorithms. So, we will start by learning the differences between the collaborative filtering algorithms and the content-based filtering algorithms used in a recommendation engine. We will also learn how to package our trained models to be used by other software without the need for retraining. The following topics will be discussed here: 4 | 5 | - The different recommendation paradigms 6 | - Downloading Surprise and the dataset 7 | - Using KNN-inspired algorithms 8 | - Using baseline algorithms 9 | - Using singular value decomposition 10 | - Deploying machine learning models in production 11 | -------------------------------------------------------------------------------- /Chapter13/artist_recommender.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter13/artist_recommender.pkl -------------------------------------------------------------------------------- /Chapter13/recsys.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Machine-Learning-with-scikit-learn-and-Scientific-Python-Toolkits/2dc4f5164d75adfc298767f14f5aeafe45d0b385/Chapter13/recsys.pkl -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Hands-On Machine Learning with Scikit-Learn and Scientific Python Toolkits 5 | 6 | Hands-On Machine Learning with Scikit-Learn and Scientific Python 
Toolkits 7 | 8 | This is the code repository for [Hands-On Machine Learning with Scikit-Learn and Scientific Python Toolkits](https://www.packtpub.com/data/hands-on-machine-learning-with-scikit-learn?utm_source=github&utm_medium=repository&utm_campaign=9781838826048), published by Packt. 9 | 10 | **A practical guide to implementing supervised and unsupervised machine learning algorithms in Python** 11 | 12 | ## What is this book about? 13 | Machine learning is applied everywhere, from business to research and academia, while Scikit-Learn is a versatile library that is popular among machine learning practitioners. This book serves as a practical guide for anyone looking to provide hands-on machine learning solutions with Scikit-Learn and Python toolkits. 14 | 15 | The book begins with an explanation of machine learning concepts and fundamentals, and strikes a balance between theoretical concepts and their applications. Each chapter covers a different set of algorithms, and shows you how to use them to solve real-life problems. You’ll also learn various key supervised and unsupervised machine learning algorithms using practical examples. Whether it is an instance-based learning algorithm, Bayesian estimation, a deep neural network, a tree-based ensemble, or a recommendation system, you’ll gain a thorough understanding of its theory and learn when to apply it. As you advance, you’ll learn how to deal with unlabeled data and when to use different clustering and anomaly detection algorithms. 16 | 17 | By the end of this machine learning book, you’ll have learnt how to take a data-driven approach to provide end-to-end machine learning solutions. You’ll also have discovered how to formulate the problem at hand, prepare required data, and evaluate and deploy models in production. 18 | 19 | This book goes beyond Scikit-Learn, and introduces you to complementary libraries such as NumPy, Pandas, SpaCy, imbalanced-learn, and Scikit-Surprise. 
The theoretical knowledge in this book should also prepare you to use libraries not mentioned here such as TensorFlow and PyTorch. 20 | 21 | In this repo, you will find the code examples used in the book. I also include here parts of the code omitted in the book, such as the data visualization styling, additional formatting, etc. 22 | 23 | This book covers the following exciting features: 24 | * Understand when to use supervised, unsupervised, or reinforcement learning algorithms 25 | * Find out how to collect and prepare your data for machine learning tasks 26 | * Tackle imbalanced data and optimize your algorithm for a bias or variance tradeoff 27 | * Apply supervised and unsupervised algorithms to overcome various machine learning challenges 28 | * Employ best practices for tuning your algorithm’s hyperparameters 29 | * Discover how to use neural networks for classification and regression 30 | * Build, evaluate, and deploy your machine learning solutions to production 31 | 32 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1838826041) today! 33 | 34 | https://www.packtpub.com/ 35 | 36 | ## Book Reviews 37 | 38 | ### From [GoodReads](https://www.goodreads.com/book/show/54539914-hands-on-machine-learning-with-scikit-learn-and-scientific-python-toolki): 39 | 40 | Ali Faizan rated it: 5 out of 5 stars. 41 | > For a machine learning noob like me, it was pleasing to see that the book did not dive straight into the nitty-gritty of machine learning algorithms: it first established the raison d’être for machine learning and cohesively captured the whole gamut of developing a machine learning model. This helped me quite a bit to understand the bigger picture later on in the book where it demonstrated the practical use of various machine learning algorithms. I'll happily recommend this book to anyone interested in scikit-learn, and machine learning in general too 42 | 43 | Paul Schmidt rated it: 5 out of 5 stars.
44 | > This book is information rich with practical examples. I whom never read or touched this area was suprised to learn the weight that data analysis had on machine learning. Yes, this book also teaches you about data analysis. Throughout the chapters you learn what not to do when building machine learning and deep learning models. The author teaches you what not to do by analysing the data at hand and improving the models upon that knowledge. The book is very information rich and can easily be reread from chapter to chapter. There are some things to keep in mind, this book is not for python beginners and i urge you to know some of the basics from the pandas and matplotlib modules. In other words this book is strongly recommended. 45 | 46 | ### From [Amazon](https://www.amazon.com/Machine-Learning-scikit-learn-Scientific-Toolkits-ebook/dp/B08BTFY8YW/): 47 | 48 | Przemyslaw Chojecki rated it: 5 out of 5 stars. 49 | > If you've already did a couple of data science projects, had a basic understanding of Python, did some visualisation and want to go deeper into some details of what it means to analyse data, then this book is for you. This is a practical guide to both supervised and unsupervised learning with plenty of examples in code. The main focus is on imperfect data and how to make sense of these imperfections through various machine learning algorithms. The author discusses standard data science algorithms using scikit-learn library which gives a coherent overview of the subjest. You will learn decision trees, KNN classification, Naive Bayes and much more; applied to classical datasets like Iris dataset, Boston housing prices or Fashion-MNIST. Recommended for beginning data scientists! 50 | 51 | 52 | Adam Powell rated it: 5 out of 5 stars. 53 | > The perfect read for an analyst that wants to transition into machine learning. It broadly covers all the key algorithms with an insightful practitioner's perspective. Highly recommended! 
54 | 55 | ## Instructions and Navigation 56 | All of the code is organized into folders. 57 | 58 | The code will look like the following: 59 | ``` 60 | import numpy as np 61 | import scipy as sp 62 | import pandas as pd 63 | import seaborn as sns 64 | import matplotlib.pyplot as plt 65 | 66 | ``` 67 | 68 | **Following is what you need for this book:** 69 | This book is for machine learning data scientists who want to master the theoretical and practical sides of machine learning algorithms and understand how to use them to solve real-life problems. Working knowledge of Python and a basic understanding of underlying mathematical and statistical concepts are required. Nevertheless, this book will walk you through the new concepts to cater to both new and experienced data scientists. 70 | 71 | With the following software and hardware list you can run all code files present in the book (Chapters 1-13). 72 | 73 | ### Software and Hardware List 74 | 75 | | Chapter | Software required | OS required | 76 | | -------- | -------------------------------------------------------------------------------------| -----------------------------------| 77 | | 1 - 13 | Python 3.x, Jupyter Notebook/Google Colab | Windows, Mac OS X, and Linux (Any) | 78 | 79 | 80 | # Running the code 81 | 82 | You will need Python 3.x installed on your computer. It is good practice to set up a virtual environment to install the required libraries into. It's up to you whether you wish to use Python's venv module, the virtual environment provided by Anaconda, or any other option you like. We'll be using pip to install the libraries needed in the book, but once more, it is up to you whether you prefer to use conda or any other alternative.
83 | 84 | We suggest you create a conda environment first, then install the required libraries there: 85 | 86 | ``` 87 | conda create -n scikitbook python=3.6 88 | conda activate scikitbook 89 | pip install --upgrade -r requirements.txt 90 | ``` 91 | 92 | You only need to do the above steps once. 93 | Then to activate the environment: 94 | 95 | ``` 96 | conda activate scikitbook 97 | ``` 98 | 99 | And to run Jupyter: 100 | 101 | ``` 102 | jupyter notebook 103 | ``` 104 | 105 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://static.packt-cdn.com/downloads/9781838826048_ColorImages.pdf). 106 | 107 | 108 | ### Related products 109 | * Python Machine Learning - Third Edition [[Packt]](https://www.packtpub.com/data/python-machine-learning-third-edition?utm_source=github&utm_medium=repository&utm_campaign=9781789955750) [[Amazon]](https://www.amazon.com/dp/1789955750) 110 | 111 | * Mastering Machine Learning Algorithms - Second Edition [[Packt]](https://www.packtpub.com/data/mastering-machine-learning-algorithms-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781838820299) [[Amazon]](https://www.amazon.com/dp/B0843PMXPV) 112 | 113 | ## Get to Know the Author 114 | **Tarek Amr** 115 | has 8 years of experience in data science and machine learning. After finishing his postgraduate degree at the University of East Anglia, he worked in a number of startups and scale-up companies in Egypt and in the Netherlands. This is his second data-related book. His previous book is about data visualization using D3.js. He enjoys giving talks and writing about different computer science and business concepts and explaining them to a wider audience. He can be reached on Twitter at [@gr33ndata](https://twitter.com/gr33ndata). He is happy to respond to all questions related to this book.
Feel free to reach out to him if any parts of the book need clarification or if you would like to discuss any of the concepts there in more detail. 116 | 117 | You can also find the book's page on Goodreads [here](https://www.goodreads.com/book/show/54539914-hands-on-machine-learning-with-scikit-learn-and-scientific-python-toolki); your book reviews are highly appreciated. 118 | 119 | # Book Citation 120 | 121 | Please make sure to cite the book if you use it in your research: 122 | 123 | BibTeX: 124 | 125 | ``` 126 | @book{amr2020hands, 127 | title={Hands-On Machine Learning with scikit-learn and Scientific Python Toolkits: A practical guide to implementing supervised and unsupervised machine learning algorithms in Python}, 128 | author={Amr, Tarek}, 129 | isbn={9781838823580}, 130 | url={https://books.google.nl/books?id=GlbzDwAAQBAJ}, 131 | year={2020}, 132 | publisher={Packt Publishing, Limited} 133 | } 134 | ``` 135 | 136 | ### Suggestions and Feedback 137 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions. 138 | ### Download a free PDF 139 | 140 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link below to claim your free PDF.
141 |

https://packt.link/free-ebook/9781838826048

142 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | attrs==19.3.0 3 | backcall==0.1.0 4 | bleach==3.3.0 5 | blis==0.4.1 6 | catalogue==1.0.0 7 | category-encoders==2.1.0 8 | certifi==2024.7.4 9 | chardet==3.0.4 10 | cycler==0.10.0 11 | cymem==2.0.3 12 | decorator==4.4.1 13 | defusedxml==0.6.0 14 | en-core-web-md==2.2.5 15 | entrypoints==0.3 16 | idna==3.7 17 | imageio==2.6.1 18 | imbalanced-learn==0.6.2 19 | importlib-metadata==0.23 20 | ipykernel==5.1.3 21 | ipython==8.10.0 22 | ipython-genutils==0.2.0 23 | ipywidgets==7.5.1 24 | jedi==0.15.1 25 | Jinja2==3.1.5 26 | joblib==1.2.0 27 | jsonschema==3.2.0 28 | jupyter==1.0.0 29 | jupyter-client==5.3.4 30 | jupyter-console==6.0.0 31 | jupyter-core==4.11.2 32 | kiwisolver==1.1.0 33 | MarkupSafe==1.1.1 34 | matplotlib==3.1.2 35 | mistune==2.0.3 36 | mkl-fft==1.0.15 37 | mkl-random==1.1.0 38 | mkl-service==2.3.0 39 | more-itertools==7.2.0 40 | murmurhash==1.0.2 41 | nbconvert==6.5.1 42 | nbformat==4.4.0 43 | networkx==2.4 44 | numpy==1.22.0 45 | notebook==6.4.12 46 | pandas==0.25.3 47 | pandocfilters==1.4.2 48 | parso==0.5.1 49 | patsy==0.5.1 50 | pexpect==4.7.0 51 | pickleshare==0.7.5 52 | Pillow==10.3.0 53 | plac==1.1.3 54 | preshed==3.0.2 55 | prometheus-client==0.7.1 56 | prompt-toolkit==2.0.10 57 | ptyprocess==0.6.0 58 | Pygments==2.15.0 59 | pyparsing==2.4.5 60 | pyrsistent==0.15.5 61 | python-dateutil==2.8.1 62 | pytz==2019.3 63 | PyWavelets==1.1.1 64 | pyzmq==18.1.1 65 | qtconsole==4.6.0 66 | requests==2.32.2 67 | scikit-image==0.16.2 68 | scikit-learn==0.22 69 | scipy==1.10.0 70 | seaborn==0.9.0 71 | Send2Trash==1.5.0 72 | six==1.13.0 73 | spacy==2.2.3 74 | srsly==1.0.1 75 | statsmodels==0.10.2 76 | terminado==0.8.3 77 | testpath==0.4.4 78 | thinc==7.3.1 79 | tornado==6.4.1 80 | tqdm==4.66.3 81 | traitlets==4.3.3 82 | urllib3==1.26.19 83 | wasabi==0.6.0 84 
| wcwidth==0.1.7 85 | webencodings==0.5.1 86 | widgetsnbextension==3.5.1 87 | zipp==3.19.1 88 | --------------------------------------------------------------------------------