├── .gitignore ├── README.md ├── docker_setup.sh ├── imgs ├── JupyterLabInterface.png ├── JupyterLabURL.png ├── chapter5notebook.png └── docker_shell_run.png └── notebooks ├── appendixA ├── Appendix_A.ipynb └── Appendix_A_intro.ipynb ├── ch05 └── Chapter5.ipynb ├── ch06 └── Chapter6.ipynb ├── ch07 ├── Chapter7_1.ipynb └── Chapter7_2.ipynb ├── ch08 ├── Chapter8_1.dbc ├── Chapter8_1.html └── Chapter8_1.ipynb ├── ch09 ├── CleanCode.py ├── UnitTestExample.py ├── WoT.py └── __init__.py ├── ch10 └── Chapter10_1.ipynb ├── ch11 └── Chapter11.ipynb ├── ch12 └── Chapter12.ipynb ├── ch13 ├── Chapter_13.dbc ├── Chapter_13.html └── Chapter_13.scala ├── ch14 └── Chapter14.ipynb ├── ch15 ├── Chapter15_1.ipynb └── Chapter15_2.ipynb └── ch16 ├── Chapter16_1.dbc ├── Chapter16_1.html ├── Chapter16_1.ipynb ├── Chapter16_2.dbc ├── Chapter16_2.html └── Chapter16_2.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ML-Engineering 2 | Reference code base for ML Engineering in Action, Manning Publications 3 | Author: Ben Wilson 4 | 5 | 6 | ### About this repo 7 | This is a companion to the Manning book Machine Learning Engineering in Action. 8 | Within this repo are two separate types of notebooks, linked to the examples shown in chapters within the book. 9 | The formats of these notebooks come in several different flavors, depending on the type of examples that they are covering: 10 | - Jupyter notebooks for 'standalone Python' 11 | - PySpark Databricks archive notebooks (these can be imported into Databricks or Databricks Community Edition (free of charge)) 12 | - PySpark html notebook representations (these can be loaded into any web browser for visualization) 13 | - Scala Spark Databricks notebooks (in .dbc, .html, and pure .scala formats) 14 | 15 | For the Jupyter notebooks, a pre-configured bash script is provided at the root level of this directory that will generate a docker image and automatically start the created container for you to rapidly get started with these notebooks. 
16 | 17 | 18 | ### Getting Started with Docker 19 | To utilize the pre-built environment and follow along with the examples in the book with additional notes and code that weren't included in the book, we first need Docker. 20 | 21 | There are a number of different ways to acquire Docker. Please visit their [website](https://docs.docker.com/get-docker/) for instructions on installing the desktop GUI and the engine. 22 | 23 | #### Creating the image 24 | The file [here](/docker_setup.sh) will, when executed through a bash command in your linux terminal, create the container to execute the Jupyter notebooks in this repo. 25 | The script will link the notebooks folder of this repo to the docker environment through a mounted volume (so changes stay synchronized with your local machine), download the required data to execute the code in the notebooks, and 26 | install the necessary dependencies to get Jupyter working (as well as some required libraries that are not part of the Anaconda runtime). 27 | ```text 28 | NOTE: Within the bash script is a variable named 'port' that will allow you to customize 29 | the access port that Jupyter and Docker will use to allow you to utilize a Jupyter notebook 30 | from your local web browser. If you currently have Jupyter running on your machine with the 31 | general 'default port' of 8888, this configuration utilizes 8887. Feel free to change it 32 | if there is a conflict. 33 | ``` 34 | Once the shell script is executed, as shown below, the container will be constructed for your use. 35 | 36 | ![Running the bash script](./imgs/docker_shell_run.png?raw=true) 37 | 38 | #### Getting Jupyter to start up 39 | At the end of the container creation process, your terminal will have a URL that you can paste 40 | into your web browser of choice, as shown below. 41 | 42 | ![JupyterLab URL](./imgs/JupyterLabURL.png) 43 | 44 | After copying one of these URLs (I typically stick to the local host 127.0.0.1/ one), paste it 45 | into a browser. 
You'll have all of the notebooks available that are part of the chapters of 46 | ML Engineering in Action. 47 | 48 | ![JupyterLab Interface](./imgs/JupyterLabInterface.png) 49 | 50 | Navigating within each of the chapters (the ones that have supported standalone Python Jupterlab 51 | examples; the Spark ones will not load here!) will give you the notebook links that you can click 52 | on and open the notebook in a new tab for reading, running, modification, and anything else 53 | you'd like to do. 54 | 55 | ![Example Notebook](./imgs/chapter5notebook.png) -------------------------------------------------------------------------------- /docker_setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # A shell script to initialize a new docker container that will host jupyter notebooks with the conda environment. 3 | # Many thanks to Jas Bali for making this so much better than what I originally had here. -Ben 4 | set -x 5 | port=8887 6 | dataset_repo_folder="$(mktemp -d)/tmp-datasets-folder" 7 | dataset_folder="$dataset_repo_folder/datasets" 8 | final_dataset_folder="$PWD/notebooks/TCPD/datasets" 9 | mkdir -p $dataset_repo_folder 10 | 11 | echo "Cloning datasets into folder: $dataset_repo_folder" 12 | git clone https://github.com/alan-turing-institute/TCPD $dataset_repo_folder 13 | 14 | rm -r $final_dataset_folder 15 | echo "Copying datasets from $dataset_folder into $final_dataset_folder" 16 | mkdir -p $final_dataset_folder 17 | cp -r "$dataset_folder/" "$final_dataset_folder/" 18 | 19 | echo "Starting Jupyter notebooks"s 20 | 21 | docker run -i --name=MLEngineeringInAction \ 22 | -v $(PWD)/notebooks:/opt/notebooks -t \ 23 | -p $port:$port continuumio/anaconda3 bin/bash \ 24 | -c "/opt/conda/bin/conda install jupyter -y --quiet && \ 25 | /opt/conda/bin/conda install -c conda-forge hyperopt=0.2.5 -y --quiet && \ 26 | mkdir -p /opt/notebooks && \ 27 | /opt/conda/bin/jupyter notebook 
--notebook-dir=/opt/notebooks \ 28 | --ip='*' --port=$port --no-browser --allow-root" 29 | -------------------------------------------------------------------------------- /imgs/JupyterLabInterface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BenWilson2/ML-Engineering/0fc05f4b876b26bbacc85bcb11c7c2aef517cd20/imgs/JupyterLabInterface.png -------------------------------------------------------------------------------- /imgs/JupyterLabURL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BenWilson2/ML-Engineering/0fc05f4b876b26bbacc85bcb11c7c2aef517cd20/imgs/JupyterLabURL.png -------------------------------------------------------------------------------- /imgs/chapter5notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BenWilson2/ML-Engineering/0fc05f4b876b26bbacc85bcb11c7c2aef517cd20/imgs/chapter5notebook.png -------------------------------------------------------------------------------- /imgs/docker_shell_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BenWilson2/ML-Engineering/0fc05f4b876b26bbacc85bcb11c7c2aef517cd20/imgs/docker_shell_run.png -------------------------------------------------------------------------------- /notebooks/ch07/Chapter7_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Chapter 7, ML Engineering\n", 8 | "##### Author: Ben Wilson\n", 9 | "\n", 10 | "In this notebook, we'll be following along with the code listings shown in Chapter 7 for the stand-alone (local VM) portion. This covers section 7.1, listings 7.1 through 7.3.
\n", 11 | "For the remainder of the listing references in section 7.1, you can refer to the project notebook for the local VM implementation for hyperopt tuning of the forecasting problem in the companion notebook to this entitled, \"Chapter7_Local_Hyperopt_Forecasting_Notebook\". That notebook will serve as a full-implementation guide to the remainder of the code listings in section 7.1." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from matplotlib import pyplot as plt\n", 21 | "import random\n", 22 | "from hyperopt import hp, tpe, Trials, fmin" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Listing 7.1 Hyperopt fundamentals: the objective function\n", 30 | "In these first 3 listings, we're going to take a look at what hyperopt is actually doing and how it's a bit different from other implementations of hyperparameter tuning. We'll be comparing how the other algorithms (Random Search and Grid Search) fare against hyperopt from an accuracy standpoint and see what comes of our results.
\n", 31 | "To start off, we need a function to optimize. Listing 7.1 below is building a very simple 4th order polynomial that will, based on a value of x that is passed in, provide a loss metric in the form of a reduction factor to the 'y' value based on the submitted x. " 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import numpy as np\n", 41 | "def objective_function(x):\n", 42 | " func = np.poly1d([1, -3, -88, 112, -5])\n", 43 | " return func(x) * 0.01\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "For fun, let's see what this equation yields if we plot it in the space of [-100:100]" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Get a sorted list between -100 and 100 at every 0.1 increment\n", 60 | "x_values_big = np.arange(-100, 100, 0.1)\n", 61 | "\n", 62 | "# Get the y value for the x values defined above using list comprehension shorthand\n", 63 | "y_values_big = [objective_function(x) for x in x_values_big]\n", 64 | "\n", 65 | "# For those of you who prefer lambda calculus... (and more efficient execution)\n", 66 | "y_values_lambda_big = (lambda x: objective_function(x))(x_values_big)\n", 67 | "\n", 68 | "# Just to placate anyone who wonders if they do the same thing.\n", 69 | "np.testing.assert_array_equal(np.array(y_values_big), y_values_lambda_big)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Ok, hold on... what's with the lambda stuff? Who wants functional programming concepts and partial functions in their code base for ML?

\n", 77 | "Professional ML Engineers do.
\n", 78 | "That is, when it's called for.
\n", 79 | "... and here's why." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "# let's make a few more x values here...\n", 89 | "big_x_test = np.arange(-100, 100, 0.0001)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 5, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "Let's make this many: 2000000\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "print(\"Let's make this many: {}\".format(len(big_x_test)))" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 6, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "CPU times: user 1min 18s, sys: 298 ms, total: 1min 18s\n", 119 | "Wall time: 1min 18s\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "%time list_comp_test = [objective_function(x) for x in big_x_test]" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 7, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "CPU times: user 38.9 ms, sys: 33.8 ms, total: 72.7 ms\n", 137 | "Wall time: 72.6 ms\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "%time lambda_test = (lambda x: objective_function(x))(big_x_test)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "That's... quite the difference.
\n", 150 | "Feel free to play around with the value specified in the definition for big_x_test and see if you can melt your CPU and see the non-linear relationship in performance between a list comprehension and a lambda.
\n", 151 | "Note: There are many times that a list comprehension will actually out-perform lambda tasks. But in this particular case the lambda itself is not what makes the difference: it simply hands the entire numpy array to objective_function in a single call, so the polynomial is evaluated by numpy's vectorized, compiled routines instead of through one Python-level function call per element, which is what the list comprehension does. That is why operating on the whole numpy array is MUCH faster than traversing it element by element with a comprehension. These differences only really come into play when you're doing large-scale operations such as this." 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 8, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "CPU times: user 434 ms, sys: 70.7 ms, total: 505 ms\n", 164 | "Wall time: 502 ms\n" 165 | ] 166 | }, 167 | { 168 | "data": { 169 | "text/plain": [ 170 | "[]" 171 | ] 172 | }, 173 | "execution_count": 8, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | }, 177 | { 178 | "data": { 179 | "image/png": "\n", 180 | "text/plain": [ 181 | "
" 182 | ] 183 | }, 184 | "metadata": { 185 | "needs_background": "light" 186 | }, 187 | "output_type": "display_data" 188 | } 189 | ], 190 | "source": [ 191 | "# Evaluate the wall-clock speed of plotting the list comprehension\n", 192 | "%time plt.plot(big_x_test, list_comp_test)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 9, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "CPU times: user 71 ms, sys: 44 ms, total: 115 ms\n", 205 | "Wall time: 116 ms\n" 206 | ] 207 | }, 208 | { 209 | "data": { 210 | "text/plain": [ 211 | "[]" 212 | ] 213 | }, 214 | "execution_count": 9, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | }, 218 | { 219 | "data": { 220 | "image/png": "\n", 221 | "text/plain": [ 222 | "
" 223 | ] 224 | }, 225 | "metadata": { 226 | "needs_background": "light" 227 | }, 228 | "output_type": "display_data" 229 | } 230 | ], 231 | "source": [ 232 | "# And evaluate the wall-clock of plotting the lambda implementation\n", 233 | "%time plt.plot(big_x_test, lambda_test)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | ".... and for those who will state, \"But lambda and list comprehensions are lazily evaluated in Py3.x....\", let's ensure that they're materialized through a forced execution by plotting their values in pyplot. This comes down to the fact that one is a numpy array and one is a list (because the initial calculation to generate the collections above in the first block of timeit cells actually wasn't lazy)." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 10, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "[]" 252 | ] 253 | }, 254 | "execution_count": 10, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | }, 258 | { 259 | "data": { 260 | "image/png": "\n", 261 | "text/plain": [ 262 | "
" 263 | ] 264 | }, 265 | "metadata": { 266 | "needs_background": "light" 267 | }, 268 | "output_type": "display_data" 269 | } 270 | ], 271 | "source": [ 272 | "y_lambda = (lambda x: objective_function(x))(x_values_big)\n", 273 | "plt.plot(x_values_big, y_lambda)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "So... that's not super helpful for seeing the nuances of this equation. The resolution of the plot doesn't allow us to see where the actual minimum value is." 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "Let's see what the representation will be for this equation if we generate values against this function (in the x space before it becomes very large in the y space) so that we can see the challenge that these different methods of searching for a global minimum will have. This will help to inform our search range for the different approaches as well." 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 11, 293 | "metadata": {}, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/plain": [ 298 | "[]" 299 | ] 300 | }, 301 | "execution_count": 11, 302 | "metadata": {}, 303 | "output_type": "execute_result" 304 | }, 305 | { 306 | "data": { 307 | "image/png": "\n", 308 | "text/plain": [ 309 | "
" 310 | ] 311 | }, 312 | "metadata": { 313 | "needs_background": "light" 314 | }, 315 | "output_type": "display_data" 316 | } 317 | ], 318 | "source": [ 319 | "# Now, let's see it zoomed in to the range where we can see what's going on with minimum range of values for y...\n", 320 | "x_axis_values = np.arange(-12.0, 12.0, 0.01).tolist()\n", 321 | "y_values = (lambda x: objective_function(x))(x_axis_values)\n", 322 | "plt.plot(x_axis_values, y_values)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "Now, I know what you might be thinking... (it's what I'd be thinking if someone showed me this as well, likely...)
\n", 330 | "\"But Ben, dude, you can just get the minimum value directly from the data.\"
\n", 331 | "To which I would reply...
\n", 332 | "\"Shhhh... I'm just trying to make a point here. Let's play pretend and think of this as a supervised learning problem where our vector has 17 dimensions and there's no way that our human minds can figure out how to minimize the function.\"
\n", 333 | "But, to humor us both, here's the actual minimum x value." 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 12, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "data": { 343 | "text/plain": [ 344 | "7.569999999999581" 345 | ] 346 | }, 347 | "execution_count": 12, 348 | "metadata": {}, 349 | "output_type": "execute_result" 350 | } 351 | ], 352 | "source": [ 353 | "x_axis_values[np.argmin(y_values)]" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "So, how would this look as a grid search problem?

\n", 361 | "We would likely, when trying to do grid search over this space, select some collection of points to try, get the results, and then rank sort the return values, returning the lowest one. (Coincidentally that's exactly how grid search implementations actually work)." 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 13, 367 | "metadata": {}, 368 | "outputs": [ 369 | { 370 | "data": { 371 | "text/plain": [ 372 | "'-12, -11, -10, -9, -8.5, -8.0, -7.5, -7.0, -6.5, -6.0, -5.5, -5.0, -4.5, -4.0, -3.5, -3.0, -2.5, -2.0, -1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 10.0, 11.0, 12.0'" 373 | ] 374 | }, 375 | "execution_count": 13, 376 | "metadata": {}, 377 | "output_type": "execute_result" 378 | } 379 | ], 380 | "source": [ 381 | "# our_grid = [-12, -11, -10, -9, -8.5, -8, ... ok, I'm not typing any more of this.]\n", 382 | "our_grid = np.arange(-12, -8, 1).tolist() + np.arange(-8.5, 9.0, 0.5).tolist() + np.arange(9.0, 13, 1).tolist()\n", 383 | "\", \".join(str(g) for g in our_grid)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "Ok, so we have a grid to search over now. Let's see what the results of calling our function and getting a minima will be for this." 
391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 14, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "name": "stdout", 400 | "output_type": "stream", 401 | "text": [ 402 | "Our minimum x value for our grid search is: 7.5 based on a y min of -22.165625000000002\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "grid_search_test = (lambda x: objective_function(x))(our_grid)\n", 408 | "print(\"Our minimum x value for our grid search is: {} based on a y min of {}\".format(\n", 409 | " our_grid[np.argmin(grid_search_test)], grid_search_test.min()))" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "It was close. Definitely close. But is it close enough? Let's try a random search to see how that fares.
\n", 417 | "We'll do the same exact search space (-12, 12) and give it 1000 iterations. " 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 15, 423 | "metadata": {}, 424 | "outputs": [ 425 | { 426 | "data": { 427 | "text/plain": [ 428 | "(array([ 98., 84., 107., 106., 103., 106., 112., 95., 97., 92.]),\n", 429 | " array([-1.19695019e+01, -9.57358465e+00, -7.17766744e+00, -4.78175022e+00,\n", 430 | " -2.38583301e+00, 1.00842068e-02, 2.40600142e+00, 4.80191864e+00,\n", 431 | " 7.19783585e+00, 9.59375307e+00, 1.19896703e+01]),\n", 432 | " )" 433 | ] 434 | }, 435 | "execution_count": 15, 436 | "metadata": {}, 437 | "output_type": "execute_result" 438 | }, 439 | { 440 | "data": { 441 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAMg0lEQVR4nO3df6idh13H8ffHxm2uc5iQmxjbYioEtRPc5DKnBZlks3WTJRMCGUyCFuIfnU4RJNM/KoxBFX8ibhC3ugvOllA3EjaZi1dH8Z/O27W4tllJWWuaNSZ3Dn/+0dnt6x/3WbmmNz/OOffc0/u97xeE5zzPOc8935Obvu9zn3vP01QVkqRevmPWA0iS1p9xl6SGjLskNWTcJakh4y5JDW2b9QAAO3furL179856DEnaVB555JGvVdXcWve9IuK+d+9elpaWZj2GJG0qSf7lSvd5WkaSGjLuktSQcZekhoy7JDVk3CWpIeMuSQ0Zd0lqyLhLUkPGXZIaekW8Q1XSy+099pmZPfez975zZs+t9eGRuyQ1ZNwlqSHjLkkNGXdJasi4S1JDxl2SGjLuktSQcZekhnwT0ybkm1skXYtH7pLUkEfuGsmsvmuY5XcMs/xOSRqXR+6S1JBxl6SGjLskNWTcJakhf6CqTcEfakqj8chdkhoy7pLU0DXjnuS+JJeSPL5q244kp5OcHZbbV933gSRPJ3kqyR3TGlySdGXXc+T+ceDOy7YdAxarah+wOKyT5DbgMPCGYZ8PJ7lh3aaVJF2Xa8a9qh4Cvn7Z5gPAwnB7ATi4avsDVfVCVT0DPA28eZ1mlSRdp3HPue+uqgsAw3LXsP0m4LlVjzs/bHuZJEeTLCVZWl5eHnMMSdJa1vsHqlljW631wKo6XlXzVTU/Nze3zmNI0tY2btwvJtkDMCwvDdvPA7esetzNwPPjjydJGse4cT8FHBluHwFOrtp+OMmrk9wK7AO+MNmIkqRRXfMdqknuB94K7ExyHrgHuBc4keQu4BxwCKCqnkhyAngSeBG4u6q+OaXZX7IVL0MrSVdzzbhX1XuucNf+Kzz+Q8CHJhlK0tbkgdr68R2qktSQcZekhoy7JDXkJX8lvYyXWN78PHKXpIaMuyQ1ZNwlqSHjLkkNGXdJasjflpG05c3yt4Om9e5Yj9wlqSHjLkkNGXdJasi4S1JDxl2SGjLuktSQcZekhoy7JDVk3CWpIeMuSQ0Zd0lqyLhLUkPGXZIa8qqQE/D/Mynpl
cojd0lqyLhLUkPGXZIaMu6S1NBEcU/y60meSPJ4kvuTvCbJjiSnk5wdltvXa1hJ0vUZO+5JbgJ+FZivqh8BbgAOA8eAxaraBywO65KkDTTpaZltwHcl2Qa8FngeOAAsDPcvAAcnfA5J0ojGjntVfRX4feAccAH4j6r6HLC7qi4Mj7kA7Fpr/yRHkywlWVpeXh53DEnSGiY5LbOdlaP0W4HvA25M8t7r3b+qjlfVfFXNz83NjTuGJGkNk5yWeRvwTFUtV9X/Ap8EfhK4mGQPwLC8NPmYkqRRTBL3c8Bbkrw2SYD9wBngFHBkeMwR4ORkI0qSRjX2tWWq6uEkDwJfBF4EHgWOA68DTiS5i5UvAIfWY1BJ0vWb6MJhVXUPcM9lm19g5ShekjQjvkNVkhoy7pLUkHGXpIaMuyQ1ZNwlqSHjLkkNGXdJasi4S1JDxl2SGjLuktSQcZekhoy7JDVk3CWpIeMuSQ0Zd0lqyLhLUkPGXZIaMu6S1JBxl6SGjLskNWTcJakh4y5JDRl3SWrIuEtSQ8Zdkhoy7pLUkHGXpIaMuyQ1NFHck3xPkgeTfDnJmSQ/kWRHktNJzg7L7es1rCTp+kx65P4nwGer6oeAHwXOAMeAxaraBywO65KkDTR23JO8Hvgp4GMAVfWNqvp34ACwMDxsATg46ZCSpNFMcuT+A8Ay8BdJHk3y0SQ3Arur6gLAsNy11s5JjiZZSrK0vLw8wRiSpMtNEvdtwI8BH6mqNwH/wwinYKrqeFXNV9X83NzcBGNIki43SdzPA+er6uFh/UFWYn8xyR6AYXlpshElSaMaO+5V9a/Ac0l+cNi0H3gSOAUcGbYdAU5ONKEkaWTbJtz/V4BPJHkV8BXgF1n5gnEiyV3AOeDQhM8hSRrRRHGvqseA+TXu2j/Jx5UkTcZ3qEpSQ8Zdkhoy7pLUkHGXpIaMuyQ1ZNwlqSHjLkkNGXdJasi4S1JDxl2SGjLuktSQcZekhoy7JDVk3CWpIeMuSQ0Zd0lqyLhLUkPGXZIaMu6S1JBxl6SGjLskNWTcJakh4y5JDRl3SWrIuEtSQ8Zdkhoy7pLUkHGXpIYmjnuSG5I8muTTw/qOJKeTnB2W2ycfU5I0ivU4cn8/cGbV+jFgsar2AYvDuiRpA00U9yQ3A+8EPrpq8wFgYbi9AByc5DkkSaOb9Mj9j4HfBL61atvuqroAMCx3rbVjkqNJlpIsLS8vTziGJGm1seOe5OeAS1X1yDj7V9Xxqpqvqvm5ublxx5AkrWHbBPveDrwryTuA1wCvT/KXwMUke6rqQpI9wKX1GFSSdP3GPnKvqg9U1c1VtRc4DPx9Vb0XOAUcGR52BDg58ZSSpJFM4/fc7wXenuQs8PZhXZK0gSY5LfOSqvo88Pnh9r8B+9fj40qSxuM7VCWpIeMuSQ0Zd0lqyLhLUkPGXZIaMu6S1JBxl6SGjLskNWTcJakh4y5JDRl3SWrIuEtSQ8Zdkhoy7pLUkHGXpIaMuyQ1ZNwlqSHjLkkNGXdJasi4S1JDxl2SGjLuktSQcZekhoy7JDVk3CWpIeMuSQ0Zd0lqyLhLUkNjxz3JLUn+IcmZJE8kef+wfUeS00nODsvt6zeuJOl6THLk/iLwG1X1w8BbgLuT3AYcAxarah+wOKxLkjbQ2HGvqgtV9cXh9n8BZ4CbgAPAwvCwBeDgpENKkkazLufck+wF3gQ8DOyuqguw8gUA2HWFfY4mWUqytLy8vB5jSJIGE8c9yeuAvwZ+rar+83r3q6rjVTVfVfNzc3OTjiFJWmWiuCf5TlbC/omq+uSw+WKSPcP9e4BLk40oSRrVJL8tE+BjwJmq+sNVd50Cjgy3jwAnxx9PkjSObRPsezvwC8CXkjw2bPst4F7gRJK7gHPAoclGlCSNauy4V9U/ArnC3fvH/biSpMn5DlVJasi4S1JDxl2SGjLuktSQcZekhoy7JDVk3CWpIeMuSQ0Zd0lqyLhLUkPGXZIaMu6S1JBxl6SGjLskN
WTcJakh4y5JDRl3SWrIuEtSQ8Zdkhoy7pLUkHGXpIaMuyQ1ZNwlqSHjLkkNGXdJasi4S1JDxl2SGjLuktTQ1OKe5M4kTyV5OsmxaT2PJOnlphL3JDcAfwb8LHAb8J4kt03juSRJLzetI/c3A09X1Veq6hvAA8CBKT2XJOky26b0cW8Cnlu1fh748dUPSHIUODqs/neSp6Y0y3rZCXxt1kPMyFZ+7bC1X/9Wfu2wAa8/vzvR7t9/pTumFfessa3+30rVceD4lJ5/3SVZqqr5Wc8xC1v5tcPWfv1b+bXD5n790zotcx64ZdX6zcDzU3ouSdJlphX3fwL2Jbk1yauAw8CpKT2XJOkyUzktU1UvJnkf8LfADcB9VfXENJ5rA22aU0hTsJVfO2zt17+VXzts4tefqrr2oyRJm4rvUJWkhoy7JDVk3K8iyaEkTyT5VpL5y+77wHBphaeS3DGrGTdKkt9J8tUkjw1/3jHrmaZtq19CI8mzSb40fL6XZj3PNCW5L8mlJI+v2rYjyekkZ4fl9lnOOCrjfnWPAz8PPLR643AphcPAG4A7gQ8Pl1zo7o+q6o3Dn7+Z9TDT5CU0XvLTw+d7U/6u9wg+zsp/y6sdAxarah+wOKxvGsb9KqrqTFWt9c7ZA8ADVfVCVT0DPM3KJRfUh5fQ2EKq6iHg65dtPgAsDLcXgIMbOtSEjPt41rq8wk0zmmUjvS/JPw/fwm6qb1HHsFU/x6sV8LkkjwyXC9lqdlfVBYBhuWvG84xkWpcf2DSS/B3wvWvc9dtVdfJKu62xbdP/TunV/i6AjwAfZOV1fhD4A+CXNm66Ddfyczyi26vq+SS7gNNJvjwc4WoT2PJxr6q3jbFby8srXO/fRZI/Bz495XFmreXneBRV9fywvJTkU6ycqtpKcb+YZE9VXUiyB7g064FG4WmZ8ZwCDid5dZJbgX3AF2Y801QN/7i/7d2s/LC5sy19CY0kNyb57m/fBn6G/p/zy50Cjgy3jwBX+k7+FWnLH7lfTZJ3A38KzAGfSfJYVd1RVU8kOQE8CbwI3F1V35zlrBvg95K8kZVTE88Cvzzbcaar6SU0RrEb+FQSWOnEX1XVZ2c70vQkuR94K7AzyXngHuBe4ESSu4BzwKHZTTg6Lz8gSQ15WkaSGjLuktSQcZekhoy7JDVk3CWpIeMuSQ0Zd0lq6P8Ad8vYjBxtEpkAAAAASUVORK5CYII=\n", 442 | "text/plain": [ 443 | "
" 444 | ] 445 | }, 446 | "metadata": { 447 | "needs_background": "light" 448 | }, 449 | "output_type": "display_data" 450 | } 451 | ], 452 | "source": [ 453 | "# We're going to use uniform search space here since that's how most people use the default values of a random search\n", 454 | "# for hyperparameter tuning. (There are other options for most APIs though with different distributions)\n", 455 | "random_search_x = [random.uniform(-12.0, 12.0) for x in range(1000)]\n", 456 | "plt.hist(random_search_x)" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "And now, let's build the collection and see what happens for optimization." 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 16, 469 | "metadata": {}, 470 | "outputs": [ 471 | { 472 | "name": "stdout", 473 | "output_type": "stream", 474 | "text": [ 475 | "Our minimum x value for our random search is: 7.559268444196029 based on a y min of -22.1750297705125\n" 476 | ] 477 | } 478 | ], 479 | "source": [ 480 | "random_search_test = (lambda x: objective_function(x))(random_search_x)\n", 481 | "print(\"Our minimum x value for our random search is: {} based on a y min of {}\".format(\n", 482 | " random_search_x[np.argmin(random_search_test)], random_search_test.min()))" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "Wow, that was MUCH closer, right?
\n", 490 | "It would take a bit longer than the grid search (particularly if we were doing something more complicated than basic linear algebra for our 'model') by searching through 1000 iterations.
\n", 491 | "Let's see how many iterations it would take to get something VERY close to the global minima, though.\n", 492 | ">> NOTE: this algorithm for search space is non-deterministic. If you run this more than once, you're likely going to get different answers. That's how random number generators work." 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 17, 498 | "metadata": {}, 499 | "outputs": [ 500 | { 501 | "name": "stdout", 502 | "output_type": "stream", 503 | "text": [ 504 | "Our minimum x value for our random search (ridiculous test) is: 7.572346332547564 based on a y min of -22.175349266642858\n" 505 | ] 506 | } 507 | ], 508 | "source": [ 509 | "random_search_x_ridiculous = [random.uniform(-12.0, 12.0) for x in range(1000000)]\n", 510 | "random_search_test_ridiculous = (lambda x: objective_function(x))(random_search_x_ridiculous)\n", 511 | "print(\"Our minimum x value for our random search (ridiculous test) is: {} based on a y min of {}\".format(\n", 512 | " random_search_x_ridiculous[np.argmin(random_search_test_ridiculous)], random_search_test_ridiculous.min()))" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "Now that... that is impressive. We're VERY close to the actual confirmed global minima for this function.
\n", 520 | "\n", 521 | "Let's see how hyperopt does, though..." 522 | ] 523 | }, 524 | { 525 | "cell_type": "markdown", 526 | "metadata": {}, 527 | "source": [ 528 | "### Listing 7.2 Hyperopt search space for a one-dimensional polynomial optimization pattern\n", 529 | "This is the code listing from the book for this section (minus the import statement for hyperopt modules which is at the top of this notebook)." 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 18, 535 | "metadata": {}, 536 | "outputs": [ 537 | { 538 | "name": "stdout", 539 | "output_type": "stream", 540 | "text": [ 541 | "100%|██████████| 1000/1000 [00:09<00:00, 103.58trial/s, best loss: -22.175349000150717]\n" 542 | ] 543 | } 544 | ], 545 | "source": [ 546 | "# To make it fair, we're going to use a uniform search space as well.\n", 547 | "optimization_space = hp.uniform('x', -12, 12)\n", 548 | "\n", 549 | "trials = Trials()\n", 550 | "\n", 551 | "trial_estimator = fmin(fn=objective_function, \n", 552 | " space=optimization_space, \n", 553 | " algo=tpe.suggest,\n", 554 | " trials=trials,\n", 555 | " max_evals=1000 \n", 556 | " ) " 557 | ] 558 | }, 559 | { 560 | "cell_type": "markdown", 561 | "metadata": {}, 562 | "source": [ 563 | "### Listing 7.3 Demonstration of hyperopt performance in minimizing a simple polynomial function\n", 564 | "And here is where we are going to let hyperopt find the minima as best it can..." 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 19, 570 | "metadata": {}, 571 | "outputs": [ 572 | { 573 | "data": { 574 | "image/png": "\n", 575 | "text/plain": [ 576 | "
" 577 | ] 578 | }, 579 | "metadata": {}, 580 | "output_type": "display_data" 581 | } 582 | ], 583 | "source": [ 584 | "rng = np.arange(-12.0, 12.0, 0.01).tolist()\n", 585 | "values = (lambda x: objective_function(x))(rng)\n", 586 | "with plt.style.context(style='seaborn'):\n", 587 | " fig, ax = plt.subplots(1, 1, figsize=(5.5, 4))\n", 588 | " ax.plot(rng, values)\n", 589 | " ax.set_title('Objective function')\n", 590 | " ax.scatter(x=trial_estimator['x'], y=trials.average_best_error(), marker='o', s=100)\n", 591 | " bbox_text = 'Hyperopt calculated minimum value\\nx: {}'.format(trial_estimator['x'])\n", 592 | " arrow = dict(facecolor='darkblue', shrink=0.01, connectionstyle='angle3,angleA=90,angleB=45')\n", 593 | " bbox_conf = dict(boxstyle='round,pad=0.5', fc='ivory', ec='grey', lw=0.8)\n", 594 | " conf = dict(xycoords='data', textcoords='axes fraction', arrowprops=arrow, bbox=bbox_conf, ha='left', va='center', fontsize=12)\n", 595 | " ax.annotate(bbox_text, xy=(trial_estimator['x'], trials.average_best_error()), xytext=(0.3, 0.8), **conf)\n", 596 | " fig.tight_layout()\n", 597 | " plt.savefig('objective_func.png', format='png', dpi=1000)" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 20, 603 | "metadata": {}, 604 | "outputs": [ 605 | { 606 | "name": "stdout", 607 | "output_type": "stream", 608 | "text": [ 609 | "Grid Search absolute error: 0.06999999999958106\n", 610 | "Random Search absolute error: 0.01073155580355234\n", 611 | "Random Search Ridiculous absolute error: 0.00234633254798311\n", 612 | "Hyperopt absolute error: 0.002698416416993865\n" 613 | ] 614 | } 615 | ], 616 | "source": [ 617 | "ground_truth = x_axis_values[np.argmin(y_values)]\n", 618 | "grid_search_error = np.abs(our_grid[np.argmin(grid_search_test)] - ground_truth)\n", 619 | "random_search_error = np.abs(random_search_x[np.argmin(random_search_test)] - ground_truth)\n", 620 | "random_search_ridiculous_error = np.abs(\n", 621 | " 
random_search_x_ridiculous[np.argmin(random_search_test_ridiculous)] - ground_truth)\n", 622 | "hyperopt_error = np.abs(trial_estimator['x'] - ground_truth)\n", 623 | "\n", 624 | "print(\"Grid Search absolute error: {}\".format(grid_search_error))\n", 625 | "print(\"Random Search absolute error: {}\".format(random_search_error))\n", 626 | "print(\"Random Search Ridiculous absolute error: {}\".format(random_search_ridiculous_error))\n", 627 | "print(\"Hyperopt absolute error: {}\".format(hyperopt_error))" 628 | ] 629 | }, 630 | { 631 | "cell_type": "markdown", 632 | "metadata": {}, 633 | "source": [ 634 | "I think we can safely agree that Grid search may not be the most optimal way to tune models. It's slow, inefficient, and requires many runs to 'fine tune' the search space to get something approximating a 'well tuned model'.
\n", 635 | "The interesting results here are between hyperopt and random search.

\n", 636 | "While the 'Ridiculous (1 million iterations)' Random Search optimization beat hyperopt, it did so by a relatively small margin. However, it did so by performing 999,000 more function evaluations than hyperopt did. Doesn't really seem like it would scale to your real-world supervised learning problem, does it?
\n", 637 | "If we compare hyperopt to the results from the iterations-matched random search with the same exact search space using the same sampling distribution (uniform), hyperopt beat random search soundly.
\n", 638 | "> This is why hyperopt was invented and why it's so very powerful. It gives fantastic results in far fewer iterations than other implementations that are widely used in the ML community.\n", 639 | "\n", 640 | ">> NOTE: Since the Random Search and HyperOpt both are using sampling algorithms that rely on random float selection, even though from the same spaces, these algorithms are non-deterministic. Each time that you run them you will get different results from the optimization. However, with enough iterations over these runs, you will find that the general performance will hold true across these different implementations for finding the global minima of this function." 641 | ] 642 | } 643 | ], 644 | "metadata": { 645 | "kernelspec": { 646 | "display_name": "Python 3", 647 | "language": "python", 648 | "name": "python3" 649 | }, 650 | "language_info": { 651 | "codemirror_mode": { 652 | "name": "ipython", 653 | "version": 3 654 | }, 655 | "file_extension": ".py", 656 | "mimetype": "text/x-python", 657 | "name": "python", 658 | "nbconvert_exporter": "python", 659 | "pygments_lexer": "ipython3", 660 | "version": "3.8.3" 661 | } 662 | }, 663 | "nbformat": 4, 664 | "nbformat_minor": 4 665 | } 666 | -------------------------------------------------------------------------------- /notebooks/ch08/Chapter8_1.dbc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BenWilson2/ML-Engineering/0fc05f4b876b26bbacc85bcb11c7c2aef517cd20/notebooks/ch08/Chapter8_1.dbc -------------------------------------------------------------------------------- /notebooks/ch09/CleanCode.py: -------------------------------------------------------------------------------- 1 | import warnings as warn 2 | import pandas as pd 3 | import numpy as np 4 | import scipy 5 | import scipy.stats as stat 6 | from scipy.stats import shapiro, normaltest, anderson 7 | import matplotlib.pyplot as plt 8 | from statsmodels.graphics.gofplots 
class DistributionAnalysis(object):
    """Normality checks and best-fit search over SciPy's continuous distributions.

    Parameters
    ----------
    series : pandas.Series
        Observations to analyse. The plotting helpers call ``series.plot`` and
        ``_get_series_bins`` reads ``series.index``, so a pandas Series is
        needed for those; the static helpers work without one.
    histogram_bins : int or sequence
        ``bins`` argument handed to ``numpy.histogram`` when building the
        empirical density that candidate distributions are scored against.
    **kwargs
        Optional overrides: ``series_name`` (legend label, default ``'data'``),
        ``plot_bins`` (default 200), ``best_plot_size`` (default ``(20, 16)``),
        ``all_plot_size`` (default ``(24, 30)``) and ``alpha`` (p-value
        threshold used by ``check_normalcy``, default 0.05).
    """

    def __init__(self, series, histogram_bins, **kwargs):
        self.series = series
        self.histogram_bins = histogram_bins
        self.series_name = kwargs.get('series_name', 'data')
        self.plot_bins = kwargs.get('plot_bins', 200)
        self.best_plot_size = kwargs.get('best_plot_size', (20, 16))
        self.all_plot_size = kwargs.get('all_plot_size', (24, 30))
        # Quantiles that bound the x-range over which fitted curves are drawn.
        self.MIN_BOUNDARY = 0.001
        self.MAX_BOUNDARY = 0.999
        self.ALPHA = kwargs.get('alpha', 0.05)

    def _get_series_bins(self):
        """Return the number of points used to draw a fitted curve.

        This is the ceiling of the largest index label of ``self.series``.
        """
        return int(np.ceil(self.series.index.values.max()))

    @staticmethod
    def _get_distributions():
        """Return the names of the continuous distributions available in ``scipy.stats``.

        The previous implementation sliced ``scipy.__version__`` by character
        position, which mis-reads multi-digit components such as "1.10.1".
        The names are now looked up directly, with a public-API fallback
        should the private registry ever disappear.
        """
        registry = getattr(stat, '_continuous_distns', None)
        names = getattr(registry, '_distn_names', None)
        if names:
            return list(names)
        # Fallback: every rv_continuous instance exported by scipy.stats.
        return sorted(name for name, candidate in vars(stat).items()
                      if isinstance(candidate, stat.rv_continuous))

    @staticmethod
    def _extract_params(params):
        """Split a ``dist.fit`` result into shape arguments, location and scale."""
        return {'arguments': params[:-2], 'location': params[-2], 'scale': params[-1]}

    @staticmethod
    def _generate_boundaries(distribution, parameters, x):
        """Return the quantile (``ppf``) of ``distribution`` at probability ``x``.

        ``parameters`` is a dict in the shape produced by ``_extract_params``.
        """
        # Unpacking an empty ``arguments`` tuple is a no-op, so one call covers
        # distributions with and without shape parameters.
        return distribution.ppf(x, *parameters['arguments'],
                                loc=parameters['location'], scale=parameters['scale'])

    @staticmethod
    def _build_pdf(x, distribution, parameters):
        """Evaluate the probability density of ``distribution`` at ``x``."""
        return distribution.pdf(x, *parameters['arguments'],
                                loc=parameters['location'], scale=parameters['scale'])

    def plot_normalcy(self):
        """Draw a Q-Q plot of the series against a standardized reference line."""
        # Imported locally so this method does not depend on a module-level name.
        from statsmodels.graphics.gofplots import qqplot
        qqplot(self.series, line='s')

    def check_normalcy(self):
        """Run Shapiro-Wilk, D'Agostino and Anderson-Darling tests on the series.

        Returns
        -------
        str
            A multi-line, human-readable report of the three tests.
        """

        def significance_test(value, threshold):
            return "Data set {} normally distributed from".format('is' if value > threshold else 'is not')

        shapiro_stat, shapiro_p_value = shapiro(self.series)
        dagostino_stat, dagostino_p_value = normaltest(self.series)
        anderson_stat, anderson_crit_vals, anderson_significance_levels = anderson(self.series)

        shapiro_statement = (
            "Shapiro-Wilk stat: {:.4f}\n"
            "Shapiro-Wilk test p-Value: {:.4f}\n"
            "{} Shapiro-Wilk Test"
        ).format(shapiro_stat, shapiro_p_value, significance_test(shapiro_p_value, self.ALPHA))

        dagostino_statement = (
            "\nD'Agostino stat: {:.4f}\n"
            "D'Agostino test p-Value: {:.4f}\n"
            "{} D'Agostino Test"
        ).format(dagostino_stat, dagostino_p_value, significance_test(dagostino_p_value, self.ALPHA))

        anderson_parts = ['\nAnderson statistic: {:.4f}'.format(anderson_stat)]
        for critical_value, significance_level in zip(anderson_crit_vals, anderson_significance_levels):
            # The verdict is "is" when the critical value exceeds the statistic.
            anderson_parts.append(
                "\nFor signifance level {} of Anderson-Darling test: {} the evaluation. Critical value: {}".format(
                    significance_level, significance_test(critical_value, anderson_stat), critical_value))

        return "{}{}{}".format(shapiro_statement, dagostino_statement, ''.join(anderson_parts))

    def _calculate_fit_loss(self, x, y, dist):
        """Fit ``dist`` to the series and score it against the empirical density.

        Parameters
        ----------
        x : array-like
            Histogram bin centres at which the fitted pdf is evaluated.
        y : array-like
            Empirical density at those bin centres.
        dist : scipy.stats.rv_continuous
            Candidate distribution.

        Returns
        -------
        tuple
            ``(sum of squared errors, fitted parameter tuple)``.
        """
        with warn.catch_warnings():
            warn.filterwarnings('ignore')
            # Fit on the observations themselves. Fitting on the evenly spaced
            # bin centres (the previous behaviour) ignores how the data is
            # actually distributed across those bins.
            estimated_distribution = dist.fit(self.series)
            params = self._extract_params(estimated_distribution)
            pdf = self._build_pdf(x, dist, params)
        return np.sum(np.power(y - pdf, 2.0)), estimated_distribution

    def _generate_probability_distribution(self, distribution, parameters, bins):
        """Return the fitted pdf as a Series over ``bins`` evenly spaced points.

        The points span the ``MIN_BOUNDARY``..``MAX_BOUNDARY`` quantiles of the
        fitted distribution; the Series is indexed by x and holds the density.
        """
        starting_point = self._generate_boundaries(distribution, parameters, self.MIN_BOUNDARY)
        ending_point = self._generate_boundaries(distribution, parameters, self.MAX_BOUNDARY)
        x = np.linspace(starting_point, ending_point, bins)
        y = self._build_pdf(x, distribution, parameters)
        return pd.Series(y, x)

    def find_distribution_fit(self):
        """Fit every available continuous distribution and pick the lowest-loss one.

        Returns
        -------
        dict
            ``best_distribution``, ``best_loss``, ``best_params`` (a dict with
            ``arguments`` / ``location`` / ``scale``) and ``all_results``
            (per-distribution ``hist``, ``loss`` and ``params``).
        """
        y_hist, x_hist_raw = np.histogram(self.series, self.histogram_bins, density=True)
        # Convert the bin edges into bin centres.
        x_hist = (x_hist_raw + np.roll(x_hist_raw, -1))[:-1] / 2.
        full_distribution_results = {}

        best_loss = np.inf
        best_fit = stat.norm
        best_params = (0., 1.)
        for dist in self._get_distributions():
            candidate = getattr(stat, dist)
            loss, parameters = self._calculate_fit_loss(x_hist, y_hist, candidate)
            full_distribution_results[dist] = {'hist': candidate,
                                               'loss': loss,
                                               'params': self._extract_params(parameters)}
            # NaN losses fail both comparisons and are therefore never selected.
            if best_loss > loss > 0:
                best_loss = loss
                best_fit = candidate
                best_params = parameters
        return {'best_distribution': best_fit,
                'best_loss': best_loss,
                'best_params': self._extract_params(best_params),
                'all_results': full_distribution_results}

    def plot_best_fit(self):
        """Plot the series histogram with the best-fitting pdf overlaid; return the figure."""
        fits = self.find_distribution_fit()
        best_fit_distribution = fits['best_distribution']
        best_fit_parameters = fits['best_params']
        distribution_series = self._generate_probability_distribution(best_fit_distribution,
                                                                      best_fit_parameters,
                                                                      self._get_series_bins())
        with plt.style.context(style='seaborn'):
            fig = plt.figure(figsize=self.best_plot_size)
            # ``density`` replaces the ``normed`` keyword that Matplotlib removed.
            ax = self.series.plot(kind='hist', bins=self.plot_bins, density=True,
                                  alpha=0.5, label=self.series_name, legend=True)
            distribution_series.plot(lw=3, label=best_fit_distribution.__class__.__name__, legend=True, ax=ax)
            ax.legend(loc='best')
        return fig

    def plot_all_fits(self):
        """Plot the series histogram with every fitted pdf overlaid; return the figure."""
        fits = self.find_distribution_fit()
        series_bins = self._get_series_bins()

        with warn.catch_warnings():
            warn.filterwarnings('ignore')
            with plt.style.context(style='seaborn'):
                fig = plt.figure(figsize=self.all_plot_size)
                ax = self.series.plot(kind='hist', bins=self.plot_bins, density=True, alpha=0.5,
                                      label=self.series_name, legend=True)
                # Remember the histogram's limits so poorly fitting curves
                # cannot rescale the axes.
                y_max = ax.get_ylim()
                x_max = ax.get_xlim()
                for dist in fits['all_results']:
                    hist = fits['all_results'][dist]
                    distribution_data = self._generate_probability_distribution(hist['hist'],
                                                                                hist['params'],
                                                                                series_bins)
                    distribution_data.plot(lw=2, label=dist, alpha=0.6, ax=ax)
                ax.legend(loc='best')
                ax.set_ylim(y_max)
                ax.set_xlim(x_max)
        return fig


# ---------------------------------------------------------------------------
# notebooks/ch09/UnitTestExample.py
# ---------------------------------------------------------------------------
def test_generate_boundaries():
    """Check that the 1% / 99% quantiles of a standard normal are generated."""
    expected_low_norm = -2.3263478740408408
    expected_high_norm = 2.3263478740408408
    boundary_arguments = {'location': 0, 'scale': 1, 'arguments': ()}
    test_object = DistributionAnalysis(np.arange(0, 100), 10)
    normal_distribution_low = test_object._generate_boundaries(stat.norm,
                                                               boundary_arguments,
                                                               0.01)
    normal_distribution_high = test_object._generate_boundaries(stat.norm,
                                                                boundary_arguments,
                                                                0.99)
    # Compare with a tolerance: exact float equality would tie the test to one
    # SciPy build's last-bit rounding.
    assert np.isclose(normal_distribution_low, expected_low_norm, rtol=0.0, atol=1e-9), \
        'Normal Dist low boundary: {} does not match expected: {}' \
        .format(normal_distribution_low, expected_low_norm)
    assert np.isclose(normal_distribution_high, expected_high_norm, rtol=0.0, atol=1e-9), \
        'Normal Dist high boundary: {} does not match expected: {}' \
        .format(normal_distribution_high, expected_high_norm)


if __name__ == '__main__':
    test_generate_boundaries()
    print('tests passed')
import warnings as warn
import pandas as pd
import numpy as np
import scipy.stats as stat
from scipy.stats import shapiro, normaltest, anderson
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot

### NOTE: This is for demonstration purposes ONLY. DO NOT run this
### if you want to use your computer for the next few hours.

# Single-file script: load the price column, run three normality tests, fit
# every SciPy continuous distribution to it, then plot the best fit on top of
# the histogram and draw a Q-Q plot.

data = pd.read_csv('/sf-airbnb-clean.csv')
series = data['price']

# --- normality tests --------------------------------------------------------
# The statistic gets its own name so the imported ``shapiro`` function is not
# overwritten by its own result.
shapiro_stat, pval = shapiro(series)
print('Shapiro score: ' + str(shapiro_stat) + ' with pvalue: ' + str(pval))
dagastino, pval = normaltest(series)
print("D'Agostino score: " + str(dagastino) + " with pvalue: " + str(pval))
anderson_stat, crit, sig = anderson(series)
print("Anderson statistic: " + str(anderson_stat))
anderson_rep = list(zip(list(crit), list(sig)))
for i in anderson_rep:
    # Each pair is (critical value, significance level); the labels used to be
    # printed the wrong way round.
    print('Significance: ' + str(i[1]) + ' Crit level: ' + str(i[0]))

# --- empirical density ------------------------------------------------------
bins = int(np.ceil(series.index.values.max()))
y, x = np.histogram(series, 200, density=True)
x = (x + np.roll(x, -1))[:-1] / 2.  # bin edges -> bin centres

# --- brute-force search for the best-fitting distribution --------------------
bl = np.inf      # best loss so far
bf = stat.norm   # best distribution so far
bp = (0., 1.)    # parameters of the best distribution so far
with warn.catch_warnings():
    warn.filterwarnings('ignore')
    fam = stat._continuous_distns._distn_names
    for d in fam:
        h = getattr(stat, d)
        f = h.fit(series)
        pdf = h.pdf(x, loc=f[-2], scale=f[-1], *f[:-2])
        loss = np.sum(np.power(y - pdf, 2.))
        if bl > loss > 0:
            bl = loss
            bf = h
            bp = f

# --- curve of the winning distribution ---------------------------------------
start = bf.ppf(0.001, *bp[:-2], loc=bp[-2], scale=bp[-1])
end = bf.ppf(0.999, *bp[:-2], loc=bp[-2], scale=bp[-1])
xd = np.linspace(start, end, bins)
yd = bf.pdf(xd, loc=bp[-2], scale=bp[-1], *bp[:-2])
hdist = pd.Series(yd, xd)

# --- plots --------------------------------------------------------------------
with warn.catch_warnings():
    warn.filterwarnings('ignore')
    with plt.style.context(style='seaborn'):
        fig = plt.figure(figsize=(16,12))
        # ``density`` replaces the ``normed`` keyword that Matplotlib removed.
        ax = series.plot(kind='hist', bins=100, density=True, alpha=0.5, label='Airbnb SF Price', legend=True)
        ymax = ax.get_ylim()
        xmax = ax.get_xlim()
        hdist.plot(lw=3, label='best dist ' + bf.__class__.__name__, legend=True, ax=ax)
        ax.legend(loc='best')
        ax.set_xlim(xmax)
        ax.set_ylim(ymax)
qqplot(series, line='s')
// MAGIC # Chapter 13 ML Engineering in Action
// MAGIC Author: Ben Wilson

// COMMAND ----------

// MAGIC %md
// MAGIC ## ML Development Hubris
// MAGIC In this notebook, we'll be looking at some patterns of development that can cause problems in projects. Specifically, issues that arise from unintentionally obfuscated (overly complex) code, prematurely / incorrectly optimized code, and the complexities introduced from early generalization of a code base.

// COMMAND ----------

// MAGIC %md Before we get into the examples that are in the book, we need to generate some data for the use cases here. Keeping with the common theme of this part of the book, we're talking about dogs again, as well as the one thing that they care about more than anything: food.

// COMMAND ----------

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{DecisionTreeClassifier, DecisionTreeClassificationModel}
import org.apache.spark.ml.feature.{StringIndexer, StringIndexerModel, VectorAssembler, IndexToString}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.evaluation.{
  BinaryClassificationEvaluator,
  MulticlassClassificationEvaluator}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import scala.util.Random
import scala.reflect.ClassTag
import scala.collection.mutable.ArrayBuffer

// COMMAND ----------

/** One synthetic row of the demo data set. */
case class Dogs(age: Int,
                weight: Double,
                favorite_food: String,
                breed: String,
                good_boy_or_girl: String,
                hungry: Boolean)

/** Category vocabularies and distribution parameters used by the generator. */
case object CoreData {
  def dogBreeds: Seq[String] =
    Seq(
      "Husky",
      "GermanShepherd",
      "Dalmation",
      "Pug",
      "Malamute",
      "Akita",
      "BelgianMalinois",
      "Chinook",
      "Estrela",
      "Doberman",
      "Mastiff"
    )
  def foods: Seq[String] =
    Seq(
      "Kibble",
      "Spaghetti",
      "Labneh",
      "Steak",
      "Hummus",
      "Fajitas",
      "BeoufBourgignon",
      "Bolognese"
    )
  def goodness: Seq[String] =
    Seq("yes", "no", "sometimes", "yesWhenFoodAvailable")
  def hungry: Seq[Boolean] = Seq(true, false)
  def ageSigma = 3
  def ageMean = 2
  def weightSigma = 12
  def weightMean = 60
}

/** Seeded random-column helpers shared by the data generators. */
trait DogUtility {

  /** Draws `dogs` values uniformly, with replacement, from `a`. */
  def getDoggoData[T: ClassTag](a: Seq[T], dogs: Int, seed: Long): Seq[T] = {
    val rnd = new Random(seed)
    Seq.fill(dogs)(a(rnd.nextInt(a.size)))
  }

  /**
    * Draws `dogs` integers from |N(mean, sigma)|, rounded up to the next
    * whole number.
    */
  def getDistributedIntData(sigma: Double,
                            mean: Double,
                            dogs: Int,
                            seed: Long): Seq[Int] = {
    val rnd = new Random(seed)
    (0 until dogs).map(
      _ => math.ceil(math.abs(rnd.nextGaussian() * sigma + mean)).toInt
    )
  }

  /**
    * Draws `dogs` doubles from |N(mean, sigma)|, rounded to two decimals.
    *
    * The sample is scaled by 100 only after the mean has been added. The
    * earlier form, `gaussian * sigma * 100 + mean`, multiplied only the
    * spread, which shrank the mean by a factor of 100 once the result was
    * divided by 100 again.
    */
  def getDistributedDoubleData(sigma: Double,
                               mean: Double,
                               dogs: Int,
                               seed: Long): Seq[Double] = {
    val rnd = new Random(seed)
    (0 until dogs).map(
      _ =>
        math
          .round(math.abs(rnd.nextGaussian() * sigma + mean) * 100)
          .toDouble / 100
    )
  }

}

/**
  * Builds the synthetic dog data set.
  *
  * Every column is generated from a fresh `Random(seed)` with the same seed,
  * so the columns are not statistically independent of one another.
  */
object DogDataGeneration extends DogUtility {

  /** Returns `rows` generated `Dogs` records as a DataFrame. */
  def generateData(rows: Int, seed: Long): DataFrame = {

    val ageData = getDistributedIntData(CoreData.ageSigma, CoreData.ageMean, rows, seed)
    val weightData = getDistributedDoubleData(CoreData.weightSigma, CoreData.weightMean, rows, seed)
    val foodData = getDoggoData(CoreData.foods, rows, seed)
    val breedData = getDoggoData(CoreData.dogBreeds, rows, seed)
    val goodData = getDoggoData(CoreData.goodness, rows, seed)
    val hungryData = getDoggoData(CoreData.hungry, rows, seed)
    val collection = (0 until rows).toArray.map(x => {
      Dogs(ageData(x), weightData(x), foodData(x), breedData(x), goodData(x), hungryData(x))
    }).toSeq
    collection.toDF()
  }

  /** Returns the same generated columns as raw sequences, one per column. */
  def generateColl(rows: Int, seed: Long) = {

    val ageData = getDistributedIntData(CoreData.ageSigma, CoreData.ageMean, rows, seed)
    val weightData = getDistributedDoubleData(CoreData.weightSigma, CoreData.weightMean, rows, seed)
    val foodData = getDoggoData(CoreData.foods, rows, seed)
    val breedData = getDoggoData(CoreData.dogBreeds, rows, seed)
    val goodData = getDoggoData(CoreData.goodness, rows, seed)
    val hungryData = getDoggoData(CoreData.hungry, rows, seed)
    val collection = Seq(ageData, weightData, foodData, breedData, goodData, hungryData)
    collection
  }

}

// COMMAND ----------

// MAGIC %md ```Note: the object DogDataGeneration is not a very optimal way of generating data. We will be covering a more optimal methodology in listing 13.10```

// COMMAND ----------

// MAGIC %md To get the initial data set for listing 13.1, let's use the sub-optimal generator above to build it.

// COMMAND ----------

// The boolean label is converted to a string, and every Husky is forced to be
// hungry and "yesWhenFoodAvailable".
val dataLarger = DogDataGeneration.generateData(100000, 42L)
  .withColumn("hungry", when(col("hungry"), "true").otherwise("false"))
  .withColumn("hungry", when(col("breed") === "Husky", "true").otherwise(col("hungry")))
  .withColumn("good_boy_or_girl", when(col("breed") === "Husky", "yesWhenFoodAvailable")
              .otherwise(col("good_boy_or_girl")))

// COMMAND ----------

// MAGIC %md
// MAGIC ### Listing 13.1 Imperative model prototype
// MAGIC In this code listing, we see an example of building a fairly standard SparkML DecisionTreeClassifier model. The code style, highly imperative in nature, is indicative of how a lot of demos are written, reference implementations, and how a large amount of experimentation work is conducted.
// COMMAND ----------

// Listing 13.1: every stage is declared by hand as a notebook-level value.
val DATA_SOURCE = dataLarger

// The three categorical features share one configuration: unseen categories
// are kept, and the encoded column is named "<input>_si".
val Seq(indexerFood, indexerBreed, indexerGood) =
  Seq("favorite_food", "breed", "good_boy_or_girl").map { featureColumn =>
    val featureIndexer = new StringIndexer()
    featureIndexer.setInputCol(featureColumn)
    featureIndexer.setOutputCol(s"${featureColumn}_si")
    featureIndexer.setHandleInvalid("keep")
    featureIndexer.fit(DATA_SOURCE)
  }

// The label indexer raises an error on an unseen value instead of keeping it.
val indexerHungry = {
  val labelIndexer = new StringIndexer()
  labelIndexer.setInputCol("hungry")
  labelIndexer.setOutputCol("hungry_si")
  labelIndexer.setHandleInvalid("error")
  labelIndexer.fit(DATA_SOURCE)
}

// Unseeded 75 / 25 split into training and test sets.
val Array(train, test) = DATA_SOURCE.randomSplit(Array(0.75, 0.25))

// Maps the numeric prediction back to the original "hungry" label text.
val indexerLabelConversion = new IndexToString()
  .setLabels(indexerHungry.labelsArray(0))
  .setOutputCol("predictionLabel")
  .setInputCol("prediction")

val assembler = new VectorAssembler()
  .setOutputCol("features")
  .setInputCols(
    Array("age", "weight", "favorite_food_si", "breed_si", "good_boy_or_girl_si")
  )

val decisionTreeModel = new DecisionTreeClassifier()
  .setFeaturesCol("features")
  .setLabelCol("hungry_si")
  .setMaxDepth(6)
  .setImpurity("gini")
  .setMinInfoGain(1e-4)
  .setMinWeightFractionPerNode(0.05)
  .setMinInstancesPerNode(5)

val pipeline = new Pipeline()
  .setStages(
    Array(
      indexerFood,
      indexerBreed,
      indexerGood,
      indexerHungry,
      assembler,
      decisionTreeModel,
      indexerLabelConversion
    )
  )

val model = pipeline.fit(train)

val predictions = model.transform(test)

// Area under the ROC curve on the held-out split.
val lossMetric = new BinaryClassificationEvaluator()
  .setMetricName("areaUnderROC")
  .setRawPredictionCol("prediction")
  .setLabelCol("hungry_si")
  .evaluate(predictions)

// COMMAND ----------

// MAGIC %md ### Listing 13.2 Overly complex model prototype
// MAGIC What happens if someone who is trying to 'get fancy' tries their hand at building a prototype? What if they just want to show off their coding skills (flexing) to the rest of the team? Just how convoluted could this potentially become?

// COMMAND ----------

/** A fitted pipeline together with its score on the held-out split. */
case class ModelReturn(
  pipeline: PipelineModel,
  metric: Double
)

/**
  * Builds, fits and scores a decision-tree classification pipeline.
  *
  * @param data         full data set; it is split into train / test internally
  * @param trainPercent fraction of `data` used for training; the remainder
  *                     (`1.0 - trainPercent`) is used for scoring
  * @param labelCol     name of the label column, which must be a string column
  */
class BuildDecisionTree(data: DataFrame,
                        trainPercent: Double,
                        labelCol: String) {

  final val LABEL_COL = "label"
  final val FEATURES_COL = "features"
  final val PREDICTION_COL = "prediction"
  final val SCORING_METRIC = "areaUnderROC"

  /** Fits one StringIndexer per string-typed column other than the label. */
  private def constructIndexers(): Array[StringIndexerModel] = {
    data.schema
      .collect {
        case field if field.dataType == StringType && field.name != labelCol =>
          field.name
      }
      .map { columnName =>
        new StringIndexer()
          .setInputCol(columnName)
          .setOutputCol(s"${columnName}_si")
          .setHandleInvalid("keep")
          .fit(data)
      }
      .toArray

  }

  /**
    * Fits the StringIndexer that encodes the label column into `LABEL_COL`.
    *
    * @throws NoSuchElementException if `labelCol` is not a string column of `data`
    */
  private def indexLabel(): StringIndexerModel = {
    data.schema
      .collectFirst {
        case field if field.name == labelCol && field.dataType == StringType =>
          field.name
      }
      .map { columnName =>
        new StringIndexer()
          .setInputCol(columnName)
          .setOutputCol(LABEL_COL)
          .setHandleInvalid("error")
          .fit(data)
      }
      .getOrElse(
        // Same exception type that `.head` on an empty result used to raise,
        // but with a message that names the actual problem.
        throw new NoSuchElementException(
          s"Label column '$labelCol' was not found as a StringType column in the supplied DataFrame"
        )
      )
  }

  /** Builds the stage that turns numeric predictions back into label text. */
  private def labelInversion(
    labelIndexer: StringIndexerModel
  ): IndexToString = {
    new IndexToString()
      .setInputCol(PREDICTION_COL)
      .setOutputCol(s"${LABEL_COL}_${PREDICTION_COL}")
      .setLabels(labelIndexer.labelsArray(0))
  }

  /**
    * Assembles the feature vector from every non-label column, using the
    * indexed (`*_si`) column in place of each string column.
    */
  private def buildVector(
    featureIndexers: Array[StringIndexerModel]
  ): VectorAssembler = {

    // Exact match: a substring test would also drop any feature whose name
    // merely contains the label column's name.
    val featureSchema = data.schema.names.filterNot(_ == labelCol)
    val updatedSchema = featureIndexers.map(_.getInputCol)
    val features = featureSchema.filterNot(name => updatedSchema.contains(name)) ++ featureIndexers
      .map(_.getOutputCol)
    new VectorAssembler()
      .setInputCols(features)
      .setOutputCol(FEATURES_COL)
  }

  /** Configures the (unfitted) decision tree classifier. */
  private def buildDecisionTree(): DecisionTreeClassifier = {
    new DecisionTreeClassifier()
      .setLabelCol(LABEL_COL)
      .setFeaturesCol(FEATURES_COL)
      .setImpurity("entropy")
      .setMinInfoGain(1e-7)
      .setMaxDepth(6)
      .setMinInstancesPerNode(5)
  }

  /** Scores the fitted pipeline on `testData` using `SCORING_METRIC`. */
  private def scorePipeline(testData: DataFrame, pipeline: PipelineModel): Double = {
    new BinaryClassificationEvaluator()
      .setLabelCol(LABEL_COL)
      .setRawPredictionCol(PREDICTION_COL)
      .setMetricName(SCORING_METRIC)
      .evaluate(pipeline.transform(testData))
  }

  /** Fits the full pipeline on the training split and scores it on the rest. */
  def buildPipeline(): ModelReturn = {

    val featureIndexers = constructIndexers()
    val labelIndexer = indexLabel()
    val vectorAssembler = buildVector(featureIndexers)
    val Array(train, test) = data.randomSplit(Array(trainPercent, 1.0-trainPercent))
    val pipeline = new Pipeline()
      .setStages(
        featureIndexers ++
        Array(
          labelIndexer,
          vectorAssembler,
          buildDecisionTree(),
          labelInversion(labelIndexer)
        )
      )
      .fit(train)

    ModelReturn(pipeline, scorePipeline(test, pipeline))

  }

}

/** Companion providing `BuildDecisionTree(...)` construction without `new`. */
object BuildDecisionTree {
  def apply(data: DataFrame,
            trainPercent: Double,
            labelCol: String): BuildDecisionTree =
    new BuildDecisionTree(data, trainPercent, labelCol)
}

// COMMAND ----------

val build = BuildDecisionTree(DATA_SOURCE, 0.75, "hungry").buildPipeline()
display(build.pipeline.transform(dataLarger))

// COMMAND ----------

// MAGIC %md I don't know about you, but I wouldn't want to have to modify this code during the process of testing different features, adapting to model
architecture changes, and adding in additional feature engineering steps. It's complicated, tightly-coupled, and looks more like generic framework code than job code.
350 | // MAGIC Were this a final-stage build of the code, written after the features and all of the modeling steps had been finalized, it could potentially make some sense from a final production version standpoint (and even then, it's very difficult to test). However, in an early phase (which this clearly is, given the level of customization involved), this introduces more chaos than it prevents. It's just far too over-engineered to be useful. 351 | 352 | // COMMAND ---------- 353 | 354 | // MAGIC %md As a bonus (this isn't covered in the text of the book), here's an example of a truly over-engineered analysis of the extracted information from a decision tree model. As you peruse this code, try to imagine having to update a part of it during a development cycle to support additional functionality.
// MAGIC Code like this belongs firmly embedded within a utility framework and should be fully external to any project work. It's a useful bit of code for analyzing the results of a tree-based algorithm, but it's far too complex and singularly focused / generic to be put into the solution repo for a project.

// COMMAND ----------

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.tree.{ContinuousSplit, InternalNode, Node, Split}
import org.apache.spark.ml.classification.{
  DecisionTreeClassificationModel,
  RandomForestClassificationModel
}
import org.apache.spark.ml.tree.{
  CategoricalSplit,
  ContinuousSplit,
  InternalNode,
  Node
}
import org.json4s.jackson.Serialization
import org.json4s.jackson.Serialization.writePretty
import org.json4s.{Formats, FullTypeHints}

/**
  * Recursive description of one node of a decision tree.
  *
  * As populated by `ModelDecisionExtractor.getSplitRuleSet`, the optional
  * fields are `None` for leaves, and additionally:
  *
  * @param featureIndex             index of the feature the node splits on
  * @param informationGain          gain recorded for the node's split
  * @param continuousSplitThreshold threshold of the split; set only for continuous splits
  * @param treeNodeType             "node" for an internal node, "leaf" otherwise
  * @param splitType                "continuous" or "categorical"
  * @param leftNodeCategories       left-branch categories; set only for categorical splits
  * @param rightNodeCategories      right-branch categories; set only for categorical splits
  * @param leftChild                description of the left child
  * @param rightChild               description of the right child
  * @param prediction               the node's prediction value
  */
case class NodeData(
  featureIndex: Option[Int],
  informationGain: Option[Double],
  continuousSplitThreshold: Option[Double],
  treeNodeType: String,
  splitType: Option[String],
  leftNodeCategories: Option[Array[Double]],
  rightNodeCategories: Option[Array[Double]],
  leftChild: Option[NodeData],
  rightChild: Option[NodeData],
  prediction: Double
)

/** Kind of object handed in for analysis: a bare model or a whole pipeline. */
object PayloadType extends Enumeration {
  type PayloadType = Value
  val MODEL, PIPELINE = Value
}

/** Kind of tree node: an internal (splitting) node or a leaf. */
object NodeType extends Enumeration {
  type NodeType = Value
  val NODE, LEAF = Value
}

/** Kind of split held by an internal node. */
object SplitType extends Enumeration {
  type SplitType = Value
  val CONTINUOUS, CATEGORICAL = Value
}

object PayloadDetermination {

  import PayloadType._

  /**
    * Classifies `value` as `PIPELINE` when it is a `PipelineModel`, and as
    * `MODEL` for any other value.
    */
  def payloadType[T](value: T): PayloadType = {
    value match {
      case _: PipelineModel => PIPELINE
      case _                => MODEL
    }
  }

}

// NOTE(review): presumably `tree` is the tree's position within an ensemble
// and `data` its root node; no usage is visible here, so confirm.
case class TreesReport(tree: Int, data: NodeData)
/**
  * Pairs a feature name with the text fragment that identifies that feature's
  * numeric index inside a serialized tree.
  *
  * @param featureName     name that should replace the numeric index
  * @param replacementText fragment to search for (e.g. `featureIndex" : 3`)
  */
case class FeatureIndexRenamingStructure(featureName: String,
                                         replacementText: String)

/**
  * Rewrites every `featureIndex" : <n>` fragment of `debugStatement` to
  * `feature Index" : "<name>"`, where `<name>` is `features(n)`.
  *
  * The search text is matched literally and must not be followed by another
  * digit, so the fragment for index 1 no longer rewrites the front of 10..19.
  * The replacement is escaped so feature names containing `$` or `\` are
  * inserted verbatim.
  *
  * @param debugStatement text that contains the index fragments
  * @param features       feature names, positioned by feature index
  * @return the text with indices replaced by feature names
  */
def refactorDebugTree(debugStatement: String,
                      features: Array[String]): String = {

  val featureMapping = features.zipWithIndex.map {
    case (name, idx) =>
      FeatureIndexRenamingStructure(name, s"""featureIndex" : $idx""")
  }

  featureMapping.foldLeft(debugStatement) {
    case (debugString, field) =>
      debugString.replaceAll(
        // literal match of the fragment, refusing to match a longer number
        java.util.regex.Pattern.quote(field.replacementText) + "(?!\\d)",
        java.util.regex.Matcher.quoteReplacement(
          s"""feature Index" : "${field.featureName}""""
        )
      )
  }

}

/** Classifies tree nodes and splits into the NodeType / SplitType values. */
object NodeDetermination {

  import NodeType._
  import SplitType._

  /** NODE for an InternalNode, LEAF for anything else. */
  def nodeType(node: Node): NodeType = node match {
    case _: InternalNode => NODE
    case _               => LEAF
  }

  /** CONTINUOUS for a ContinuousSplit, CATEGORICAL for anything else. */
  def splitType(split: Split): SplitType = split match {
    case _: ContinuousSplit => CONTINUOUS
    case _                  => CATEGORICAL
  }

}


/** Evidence type that restricts the extractor methods to supported models. */
class TreeModelExtractor[T]
object TreeModelExtractor {
  implicit object DecisionTreeClassifierExtractor
      extends TreeModelExtractor[DecisionTreeClassificationModel]
  implicit object RandomForestClassificationExtractor
      extends TreeModelExtractor[RandomForestClassificationModel]
}


/**
  * Walks the nodes of a decision tree / random forest classification model,
  * captures each node as NodeData, and renders the result as pretty-printed
  * JSON with numeric feature indices replaced by feature names.
  */
object ModelDecisionExtractor {

  import NodeType._
  import SplitType._

  /** Matches one serialized feature-index field and captures the whole index. */
  private val FeatureIndexField = """"featureIndex" : (\d+)""".r

  /**
    * Replaces each `"featureIndex" : <n>` with `"featureIndex" : "<name>"` in
    * a single pass. Capturing the complete number avoids the prefix collision
    * of per-index substitution (index 1 vs. 10..19). Indices with no matching
    * name are left untouched.
    */
  private def renameFeatureIndices(json: String,
                                   featureVectorNames: Array[String]): String =
    FeatureIndexField.replaceAllIn(
      json,
      m => {
        val rewritten = scala.util
          .Try(m.group(1).toInt)
          .toOption
          .flatMap(idx => featureVectorNames.lift(idx))
          .map(name => s""""featureIndex" : "$name"""")
          .getOrElse(m.matched)
        // names may contain '$' or '\', which are special in replacement text
        scala.util.matching.Regex.quoteReplacement(rewritten)
      }
    )

  /**
    * Recursively converts a tree node and all of its descendants to NodeData.
    * Internal nodes carry split details and children; leaves carry only the
    * prediction.
    */
  private def getSplitRuleSet(treeNode: Node): NodeData =
    NodeDetermination.nodeType(treeNode) match {
      case NODE =>
        val internal = treeNode.asInstanceOf[InternalNode]
        val split = internal.split
        val isContinuous = NodeDetermination.splitType(split) == CONTINUOUS
        // anything that is not continuous is treated as a CategoricalSplit
        val categorical =
          if (isContinuous) None
          else Some(split.asInstanceOf[CategoricalSplit])

        NodeData(
          featureIndex = Some(split.featureIndex),
          informationGain = Some(internal.gain),
          continuousSplitThreshold =
            if (isContinuous)
              Some(split.asInstanceOf[ContinuousSplit].threshold)
            else None,
          treeNodeType = "node",
          splitType = Some(if (isContinuous) "continuous" else "categorical"),
          leftNodeCategories = categorical.map(_.leftCategories),
          rightNodeCategories = categorical.map(_.rightCategories),
          leftChild = Some(getSplitRuleSet(internal.leftChild)),
          rightChild = Some(getSplitRuleSet(internal.rightChild)),
          prediction = treeNode.prediction
        )
      case _ =>
        NodeData(
          featureIndex = None,
          informationGain = None,
          continuousSplitThreshold = None,
          treeNodeType = "leaf",
          splitType = None,
          leftNodeCategories = None,
          rightNodeCategories = None,
          leftChild = None,
          rightChild = None,
          prediction = treeNode.prediction
        )
    }

  /** Pretty-printed JSON for a single tree. */
  private def rulesToString(rules: NodeData): String = {
    // NOTE(review): NodeData.getClass is the class of the companion object,
    // not classOf[NodeData] -- confirm that this type hint is what is intended.
    implicit val jsonFormat: Formats =
      Serialization.formats(hints = FullTypeHints(List(NodeData.getClass)))
    writePretty(rules)
  }

  /** JSON for a single tree with feature indices replaced by names. */
  private def replaceFeatureIndicesWithText(
    rules: NodeData,
    featureVectorNames: Array[String]
  ): String =
    renameFeatureIndices(rulesToString(rules), featureVectorNames)

  /**
    * One JSON document per tree in the model, with feature names substituted.
    *
    * @param model              a decision tree or random forest classifier
    * @param featureVectorNames feature names, positioned by feature index
    */
  def extractModel[T: TreeModelExtractor](
    model: T,
    featureVectorNames: Array[String]
  ): Array[String] =
    model match {
      case tree: DecisionTreeClassificationModel =>
        Array(
          replaceFeatureIndicesWithText(
            getSplitRuleSet(tree.rootNode),
            featureVectorNames
          )
        )
      case forest: RandomForestClassificationModel =>
        forest.trees.map(
          t =>
            replaceFeatureIndicesWithText(
              getSplitRuleSet(t.rootNode),
              featureVectorNames
            )
        )
    }

  /** Pretty-printed JSON for a single indexed tree report. */
  private def rulesToStringAlt(rules: TreesReport): String = {
    implicit val jsonFormat: Formats =
      Serialization.formats(hints = FullTypeHints(List(NodeData.getClass)))
    writePretty(rules)
  }

  /** JSON for a single indexed tree report with feature names substituted. */
  private def replaceFeatureIndicesWithTextAlt(
    rules: TreesReport,
    featureVectorNames: Array[String]
  ): String =
    renameFeatureIndices(rulesToStringAlt(rules), featureVectorNames)

  /**
    * Same as extractModel, but each document is a TreesReport that also
    * carries the position of the tree within the model.
    */
  def extractModelAlt[T: TreeModelExtractor](
    model: T,
    featureVectorNames: Array[String]
  ): Array[String] =
    extractTreesData(model, featureVectorNames)
      .map(replaceFeatureIndicesWithTextAlt(_, featureVectorNames))


  /**
    * Builds one TreesReport (tree position, node data) per tree in the model.
    * `featureVectorNames` is accepted for signature parity with the other
    * extractors; it is not used here.
    */
  def extractTreesData[T: TreeModelExtractor](
    model: T,
    featureVectorNames: Array[String]
  ): Array[TreesReport] =
    model match {
      case tree: DecisionTreeClassificationModel =>
        Array(TreesReport(0, getSplitRuleSet(tree.rootNode)))
      case forest: RandomForestClassificationModel =>
        forest.trees.zipWithIndex.map {
          case (t, idx) => TreesReport(idx, getSplitRuleSet(t.rootNode))
        }
    }

  /** Pretty-printed JSON for a whole array of tree reports. */
  def blockConvertToJSON(rules: Array[TreesReport]): String = {
    implicit val jsonFormat: Formats =
      Serialization.formats(hints = FullTypeHints(List(TreesReport.getClass)))
    writePretty(rules)
  }

  /** JSON for a whole array of tree reports with feature names substituted. */
  def blockReplaceJSONText(rules: Array[TreesReport],
                           featureVectorNames: Array[String]): String =
    renameFeatureIndices(blockConvertToJSON(rules), featureVectorNames)

}

// COMMAND ----------

// MAGIC %md And using the above monstrosity would look like this:

// COMMAND ----------

val builtModel = build.pipeline.stages.takeRight(2).head.asInstanceOf[DecisionTreeClassificationModel]
val features = build.pipeline.stages.takeRight(3).head.asInstanceOf[VectorAssembler].getInputCols
val treesReport = ModelDecisionExtractor.extractTreesData(builtModel, features)
ModelDecisionExtractor.blockReplaceJSONText(treesReport, features)

// COMMAND ----------

// MAGIC %md Useful? Sure, I guess. Should something like this be in your ML solution code base? That would be a resounding 'no'.
703 | // MAGIC ```[NOTE]```: the model is garbage. It's supposed to be. We generated silly data that utilizes random sampling with a common seed value. If you see a modeling solution that is using real-world data that can achieve 100% classification accuracy, you're either leaking your label or the problem is so simple that you shouldn't be using supervised ML to solve it. 704 | 705 | // COMMAND ---------- 706 | 707 | // MAGIC %md ### Listing 13.3 Imperative casting 708 | // MAGIC In the following few listings, we're going to look at some relatively innocuous operations (the casting of a data type within a Spark DataFrame) through the lens of different approaches.
// MAGIC In this listing, we'll see the imperative design that is generally used for demonstration in API docs. It's clear, concise, easy to follow, and as simple an approach as is possible. The only downside to it is that the constants are hard-coded in the code, which can prove complex if these references are spread throughout a large code base.

// COMMAND ----------

/**
  * Casts three hard-coded columns: "age" and "weight" to double, "hungry" to
  * string. Every other column is returned unchanged.
  */
def simple(df: DataFrame): DataFrame = {
  val ageAsDouble = df.withColumn("age", col("age").cast("double"))
  val weightAsDouble =
    ageAsDouble.withColumn("weight", col("weight").cast("double"))
  weightAsDouble.withColumn("hungry", col("hungry").cast("string"))
}

// COMMAND ----------

dataLarger.printSchema

// COMMAND ----------

simple(dataLarger).printSchema

// COMMAND ----------

// MAGIC %md ### Listing 13.4 A hacker's attempt at casting columns
// MAGIC Here we see some pretty unintelligible code. It's overly complex not due to what it's doing, but rather in how it's doing it. It's also incredibly non-performant by using a mutable object definition around the DataFrame. The usage of the function is also merely migrating the constant values of column and type references to another place in the code. This does nothing to solve any problems and is just indicative of someone trying to look smart and failing in an epic manner.
// MAGIC Be on the lookout for implementations like this. If the code is introducing complexity without reducing the computational or space complexity, then it's just useless code that's hard to read, hard to debug, and really challenging to expand upon.

// COMMAND ----------

/**
  * Casts columns by reassigning a mutable DataFrame reference once per
  * (column name, type name) pair. Kept in this form on purpose: it is the
  * example the surrounding text criticizes.
  *
  * @param df          source DataFrame
  * @param castChanges (column name, Spark SQL type name) pairs, applied in order
  */
def hacker(df: DataFrame, castChanges: List[(String, String)]): DataFrame = {
  var mutated = df
  castChanges.foreach { x =>
    mutated = mutated.withColumn(x._1, mutated(x._1).cast(x._2))
  }
  mutated
}

// COMMAND ----------

hacker(dataLarger, List(("age", "double"), ("weight", "double"), ("hungry", "string"))).printSchema

// COMMAND ----------

// MAGIC %md Yikes! Look at that function signature! Who wants to pass in a list of string tuples to a function?! That's just insanity.

// COMMAND ----------

// MAGIC %md ### Listing 13.5 A pure functional programming approach
// MAGIC Here we have 'the mystic'. Someone who is attempting to adhere to a set of design principles (in this case, FP, but in other cases it might be a strict adherence to a particular design theory (e.g. factory patterns) in OO). The code is cleaner than the hacker's code, certainly, but it's still really challenging to call the function due to the weird nature of using a collection of tuples.

// COMMAND ----------

/**
  * Casts columns by folding the (column name, DataType) pairs over the
  * DataFrame.
  *
  * Each column is resolved from the fold accumulator rather than from the
  * original `df`, so every cast is applied to the current state of the data
  * (this matters when the same column is listed more than once) and the
  * function is consistent with `hacker` and `madScientist`.
  *
  * @param df          source DataFrame
  * @param castChanges (column name, target type) pairs, applied in order
  */
def mystic(df: DataFrame,
           castChanges: List[(String, DataType)]): DataFrame = {
  castChanges.foldLeft(df) {
    case (data, (c, t)) => data.withColumn(c, data(c).cast(t))
  }
}

// COMMAND ----------

mystic(dataLarger, List(("age", DoubleType), ("weight", DoubleType), ("hungry", StringType))).printSchema

// COMMAND ----------

// MAGIC %md The implementation is clearly better than the previous 2 listings, but the signature is still just relocating complexity to a different place.
// MAGIC There are also a LOT of things that could go wrong here if the end-user isn't aware of how to perform casting conversion to complex types (Arrays, Maps, etc.).

// COMMAND ----------

// MAGIC %md ### Listing 13.6 The show-off's casting implementation
// MAGIC Here we have someone who really, desperately wants people to think that they're skilled. They're going to approach the problem in a way that they think is going to cover a lot of potential issues that might arise from converting types, but in the process of building their solution they just end up going down a rabbit hole of complexity that generates a confusing mess of temporary references and bloated redundant code.

// COMMAND ----------

// Types treated as numeric by the casting helpers.
// NOTE: the `DecimalType` entry is the companion object, not a concrete
// DecimalType(precision, scale) instance, so `numTypes.contains` never matches
// a real decimal column. Decimal columns are matched by type where this list
// is consulted.
final val numTypes = List(IntegerType, FloatType, DoubleType, LongType, DecimalType, ShortType)

/**
  * Casts every column whose type is in `numTypes` (or is any decimal type) to
  * double, and every other column to string.
  *
  * The two-pass structure (schema -> temporary "n"/"s" tag -> cast) is kept
  * deliberately: it is the design discussed in the text that follows.
  */
def showOff(df: DataFrame): DataFrame = {
  df.schema
    .map(
      s =>
        s.dataType match {
          // concrete decimal instances never equal the companion in numTypes
          case _: DecimalType            => s.name -> "n"
          case x if numTypes.contains(x) => s.name -> "n"
          case _                         => s.name -> "s"
        }
    )
    .foldLeft(df) {
      case (df, x) =>
        df.withColumn(x._1, df(x._1).cast(x._2 match {
          case "n" => "double"
          case _   => "string"
        }))
    }
}

// COMMAND ----------

showOff(dataLarger).printSchema

// COMMAND ----------

// MAGIC %md While the approach here is better from an instantiation standpoint (namely that it doesn't require configuring a list of tuples and tries to automate conversions to support the goal of casting numeric types to Doubles and the Boolean type of the target column to a string), the internals of the showOff function are convoluted and unnecessarily complex. The first map over the schema is completely useless (creating a mapping of the name to a temporary string representation of the data type) and the foldLeft utilizes positional notation for referring to the values within the map.
Not to mention that this implementation will absolutely detonate if any complex types are present in the DataFrame (Array, Map, Vector) and create confusing exceptions for an end-user of the function. It's sloppy, amateurish, and incredibly complicated to read. F- for effort. 807 | 808 | // COMMAND ---------- 809 | 810 | // MAGIC %md ### Listing 13.7 A slightly more sophisticated casting implementation 811 | // MAGIC This code block is approaching something more akin to what a production-grade implementation for automating casting should look like. As I mentioned in the chapter, though, this in and of itself can be highly obfuscated if no one else on the team is familiar with what is going on here. Because it's so dense, efficient, and involves matching directly on the iterated collection of schema values, it can be very confusing for people who are more accustomed to imperative programming styles. 812 | // MAGIC Again, it's important to verify that the team is familiar with FP styles such as this before just blindly submitting PRs that contain code like this. If this is the development standard that the team chooses to employ, make sure that the entire team is taught and mentored in this paradigm. Provide examples of simple use cases that are relevant to the type of operations that project work will require. Hold hackathons. Just make sure that you're not smugly expecting people to figure something like this out with no guidance. 
// COMMAND ----------

/**
  * Normalizes column types by folding over the schema:
  *   - integer columns are left as they are,
  *   - decimal columns and the other types listed in `numTypes` become double,
  *   - array and map columns keep their type,
  *   - everything else becomes string.
  *
  * Decimal columns are matched by type because a concrete
  * DecimalType(precision, scale) never equals the `DecimalType` companion
  * object stored in `numTypes`; without that case they would be cast to
  * string.
  */
def madScientist(df: DataFrame): DataFrame = {
  df.schema.foldLeft(df) {
    case (accum, s) =>
      accum.withColumn(s.name, accum(s.name).cast(s.dataType match {
        case x: IntegerType                     => x
        case _: DecimalType                     => DoubleType
        case x if numTypes.contains(x)          => DoubleType
        case ArrayType(_, _) | MapType(_, _, _) => s.dataType
        case _                                  => StringType
      }))
  }
}

// COMMAND ----------

madScientist(dataLarger).printSchema

// COMMAND ----------

// MAGIC %md ### Listing 13.8 Configuration and common structures for data generator
// MAGIC This is the same code as above (before listing 13.1) repeated here for reference, with one correction: `getDistributedDoubleData` now applies the two-decimal scaling after the mean has been added, so the configured mean is honored.

// COMMAND ----------

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag
import scala.util.Random

/** One generated row of dog data. */
case class Dogs(age: Int,
                weight: Double,
                favorite_food: String,
                breed: String,
                good_boy_or_girl: String,
                hungry: Boolean)

/** Value pools and distribution parameters used by the data generators. */
case object CoreData {
  def dogBreeds: Seq[String] = Seq(
    "Husky", "GermanShepherd", "Dalmation", "Pug", "Malamute", "Akita",
    "BelgianMalinois", "Chinook", "Estrela", "Doberman", "Mastiff")
  def foods: Seq[String] = Seq("Kibble", "Spaghetti", "Labneh", "Steak",
    "Hummus", "Fajitas", "BeoufBourgignon", "Bolognese")
  def goodness: Seq[String] = Seq("yes", "no", "sometimes", "yesWhenFoodAvailable")
  def hungry: Seq[Boolean] = Seq(true, false)
  def ageSigma = 3
  def ageMean = 2
  def weightSigma = 12
  def weightMean = 60
}

/** Seeded random series generators shared by the data generator objects. */
trait DogUtility {
  lazy val spark: SparkSession = SparkSession.builder().getOrCreate()

  /**
    * Draws `dogs` values uniformly at random (with replacement) from `a`.
    *
    * @param a    pool of candidate values
    * @param dogs number of values to draw
    * @param seed seed for the random generator; the same seed gives the same draws
    */
  def getDoggoData[T: ClassTag](a: Seq[T], dogs: Int, seed: Long): Seq[T] = {
    val rnd = new Random(seed)
    Seq.fill(dogs)(a(rnd.nextInt(a.size)))
  }

  /**
    * Generates `dogs` integers as ceil(|gaussian * sigma + mean|).
    */
  def getDistributedIntData(sigma: Double,
                            mean: Double,
                            dogs: Int,
                            seed: Long): Seq[Int] = {
    val rnd = new Random(seed)
    (0 until dogs).map(
      _ => math.ceil(math.abs(rnd.nextGaussian() * sigma + mean)).toInt
    )
  }

  /**
    * Generates `dogs` doubles as |gaussian * sigma + mean|, rounded to two
    * decimal places.
    */
  def getDistributedDoubleData(sigma: Double, mean: Double, dogs: Int,
                               seed: Long): Seq[Double] = {
    val rnd = new Random(seed)
    // scale by 100 only after the mean is added; scaling the gaussian term
    // alone divides the effective mean by 100 once the result is scaled back
    (0 until dogs).map(
      _ =>
        math.round(math.abs(rnd.nextGaussian() * sigma + mean) * 100)
          .toDouble / 100
    )
  }
}

// COMMAND ----------

// MAGIC %md ### Listing 13.9 An overly complex and incorrectly optimized data generator
// MAGIC Here we're looking at an eager early-optimization effort. Individual portions of it are optimized for performance (the generators for individual univariate series), but the key part of the final section (the creation of the structures to be cast into a Spark DataFrame) is incredibly non-performant. Each positional lookup into a linked-list-backed Sequence forces a scan from the beginning of that collection, making the total runtime complexity of this approach something close to O(n²).
// COMMAND ----------


/**
  * Data generator that describes every column with a DogInfo record, builds
  * one series per column, and then assembles the rows by positional lookup
  * into each series. The per-row lookups are intentionally left as they are:
  * this is the slow implementation that the surrounding text demonstrates.
  */
object PrematureOptimization extends DogUtility {

  import spark.implicits._

  /** Description of a single generated column. */
  case class DogInfo(columnName: String,
                     stringData: Either[Seq[String], Seq[Boolean]],
                     sigmaData: Option[Double],
                     meanData: Option[Double],
                     valueType: String)

  /** Column description for a series drawn from a pool of string values. */
  private def stringColumn(name: String, pool: Seq[String]): DogInfo =
    DogInfo(name, Left(pool), Some(0.0), Some(0.0), "String")

  /** The six column descriptions, in the order they are generated. */
  def dogDataConstruct: Seq[DogInfo] = {
    val age = DogInfo(
      columnName = "age",
      stringData = Left(Seq("")),
      sigmaData = Some(CoreData.ageSigma),
      meanData = Some(CoreData.ageMean),
      valueType = "Int"
    )
    val weight = DogInfo(
      columnName = "weight",
      stringData = Left(Seq("")),
      sigmaData = Some(CoreData.weightSigma),
      meanData = Some(CoreData.weightMean),
      valueType = "Double"
    )
    val hungry = DogInfo(
      columnName = "hungry",
      stringData = Right(CoreData.hungry),
      sigmaData = Some(CoreData.ageSigma),
      meanData = Some(CoreData.ageMean),
      valueType = "Boolean"
    )
    Seq(
      age,
      weight,
      stringColumn("food", CoreData.foods),
      stringColumn("breed", CoreData.dogBreeds),
      stringColumn("good", CoreData.goodness),
      hungry
    )
  }

  /**
    * Generates `rows` rows of dog data as a DataFrame.
    *
    * @param rows number of rows to generate
    * @param seed seed handed to every series generator
    */
  def generateOptimizedData(rows: Int, seed: Long): DataFrame = {

    // column name -> generated series, dispatched on the declared value type
    val seriesByColumn = dogDataConstruct.map { info =>
      val series = info.valueType match {
        case "Int" =>
          getDistributedIntData(info.sigmaData.get, info.meanData.get, rows, seed)
        case "Double" =>
          getDistributedDoubleData(info.sigmaData.get, info.meanData.get, rows, seed)
        case "String" =>
          getDoggoData(info.stringData.left.get, rows, seed)
        case _ =>
          getDoggoData(info.stringData.right.get, rows, seed)
      }
      // This should be cast to an IndexedSeq in order to speed things up
      info.columnName -> series
    }.toMap

    // one positional lookup per column for every row
    val dogs = (0 until rows).toArray.map { row =>
      Dogs(
        seriesByColumn("age")(row).asInstanceOf[Int],
        seriesByColumn("weight")(row).asInstanceOf[Double],
        seriesByColumn("food")(row).asInstanceOf[String],
        seriesByColumn("breed")(row).asInstanceOf[String],
        seriesByColumn("good")(row).asInstanceOf[String],
        seriesByColumn("hungry")(row).asInstanceOf[Boolean]
      )
    }.toSeq

    dogs.toDF()
  }

}

// COMMAND ----------

// MAGIC %md Let's demonstrate how terrible this is for performance.

// COMMAND ----------

PrematureOptimization.generateOptimizedData(5000, 42L)

// COMMAND ----------

PrematureOptimization.generateOptimizedData(50000, 42L)

// COMMAND ----------

// MAGIC %md Setting aside the failure in proper optimization, the fact that this data generator is built early on in a project for testing of performance means that, throughout the experimentation phases of feature generation and testing, this code is going to need to be constantly updated and refactored. What happens when different data types are added? What happens when interactions between features would like to be explored? In order to generate the data within this code base, extensive refactoring to the point of needing a full rewrite will be occurring often and taking a long time to execute.

// COMMAND ----------

// MAGIC %md ### Listing 13.10 A far more performant data generator
// MAGIC This is a bit different from the generator defined at the top of this notebook. It is both more performant than that one and certainly faster to execute than the preceding one in listing 13.9. The main difference in this one is the significantly simpler generation of the individual series (which could actually use a bit more of a refactor to make it truly usable for testing rapidly changing data formats) and only a single location to update for the generated schema definition.
// COMMAND ----------

/**
  * Data generator that builds all column series first and then regroups the
  * values by row position to assemble the rows.
  */
object ConfusingButOptimizedDogData extends DogUtility {

  import spark.implicits._

  /**
    * Builds the six column series in Dogs field order: age, weight, food,
    * breed, goodness, hungry. Every series is generated with the same seed.
    */
  private def generateCollections(rows: Int, seed: Long): ArrayBuffer[Seq[Any]] = {

    val series = ArrayBuffer[Seq[Any]](
      getDistributedIntData(CoreData.ageSigma, CoreData.ageMean, rows, seed),
      getDistributedDoubleData(CoreData.weightSigma, CoreData.weightMean, rows, seed)
    )

    for (pool <- Seq(CoreData.foods, CoreData.dogBreeds, CoreData.goodness, CoreData.hungry)) {
      series += getDoggoData(pool, rows, seed)
    }

    series
  }

  /**
    * Generates `rows` rows of dog data as a DataFrame, then applies the
    * breed-specific overrides: "hungry" becomes a "true"/"false" string, and
    * every Husky is hungry and good only when food is available.
    *
    * @param rows number of rows to generate
    * @param seed seed handed to every series generator
    */
  def buildDogDF(rows: Int, seed: Long): DataFrame = {

    // tag each value with its row position, then gather the values of a row;
    // within a group the values stay in column order
    val valuesByRow = generateCollections(rows, seed)
      .flatMap(_.zipWithIndex)
      .groupBy(_._2)
      .values

    val dogs = valuesByRow.map { cells =>
      Dogs(
        cells(0)._1.asInstanceOf[Int],
        cells(1)._1.asInstanceOf[Double],
        cells(2)._1.asInstanceOf[String],
        cells(3)._1.asInstanceOf[String],
        cells(4)._1.asInstanceOf[String],
        cells(5)._1.asInstanceOf[Boolean]
      )
    }.toSeq

    val isHusky = col("breed") === "Husky"

    dogs.toDF()
      .withColumn("hungry", when(col("hungry"), "true").otherwise("false"))
      .withColumn("hungry", when(isHusky, "true").otherwise(col("hungry")))
      .withColumn("good_boy_or_girl",
        when(isHusky, "yesWhenFoodAvailable").otherwise(col("good_boy_or_girl")))
  }
}

// COMMAND ----------

ConfusingButOptimizedDogData.buildDogDF(5000, 42L)

// COMMAND ----------

ConfusingButOptimizedDogData.buildDogDF(50000, 42L)

--------------------------------------------------------------------------------
/notebooks/ch16/Chapter16_1.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BenWilson2/ML-Engineering/0fc05f4b876b26bbacc85bcb11c7c2aef517cd20/notebooks/ch16/Chapter16_1.dbc -------------------------------------------------------------------------------- /notebooks/ch16/Chapter16_2.dbc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BenWilson2/ML-Engineering/0fc05f4b876b26bbacc85bcb11c7c2aef517cd20/notebooks/ch16/Chapter16_2.dbc --------------------------------------------------------------------------------