├── .dask └── config.yaml ├── .gitignore ├── 01-Intro.ipynb ├── 02-Extract.ipynb ├── 03-Features-Modeling.ipynb ├── 04-Lab-Modeling.ipynb ├── 04a-Solution-Modeling.ipynb ├── 05-Tuning.ipynb ├── 06-Lab-Tuning.ipynb ├── 07-Scoring-Orchestration.ipynb ├── 08-RaySGD-MLflow.ipynb ├── 09-Wrapup.ipynb ├── README.md ├── binder ├── apt.txt ├── environment.yml ├── jupyterlab-workspace.json ├── postBuild └── start ├── data ├── california │ ├── _SUCCESS │ ├── _committed_2595799468439767928 │ ├── _started_2595799468439767928 │ ├── part-00000-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-105-1-c000.snappy.parquet │ ├── part-00001-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-106-1-c000.snappy.parquet │ ├── part-00002-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-107-1-c000.snappy.parquet │ ├── part-00003-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-108-1-c000.snappy.parquet │ ├── part-00004-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-109-1-c000.snappy.parquet │ ├── part-00005-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-110-1-c000.snappy.parquet │ ├── part-00006-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-111-1-c000.snappy.parquet │ └── part-00007-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-112-1-c000.snappy.parquet ├── diamonds.csv └── powerplant.csv └── images ├── cpv1.mp4 ├── dask-array.svg ├── dask-dataframe.svg ├── data.jpg ├── flow-analyze.png ├── flow-base.png ├── flow-extract.png ├── flow-model.png ├── flow-transform.png ├── largest.jpg └── psf-logo@2x.png /.dask/config.yaml: -------------------------------------------------------------------------------- 1 | distributed: 2 | dashboard: 3 | link: "{JUPYTERHUB_BASE_URL}user/{JUPYTERHUB_USER}/proxy/{port}/status" 4 | 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | *.tune_metadata 131 | checkpoints/ 132 | -------------------------------------------------------------------------------- /01-Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Techniques for Data Science with Big Datasets\n", 8 | "\n", 9 | "\n", 10 | "\n", 11 | "## Well... that sounds awfully vague, doesn't it?\n", 12 | "\n", 13 | "__Welcome to large-scale data engineering and data science in 2020__\n", 14 | "* End-to-end, \"single product\" platforms are no longer the leading options\n", 15 | "* In the open-source world, end-to-end may not even be possible for the near future\n", 16 | "\n", 17 | "__What does this mean in concrete terms?__\n", 18 | "* Focusing on OSS, Hadoop and Spark can no longer support our end-to-end needs\n", 19 | "* We need -- and want -- to learn how to assemble a suite of best-of-breed tools for data science with newer, simpler tools like\n", 20 | " * Dask\n", 21 | " * Ray\n", 22 | " * Horovod and others\n", 23 | "* ... 
while still using key features of mature tools like\n", 24 | "  * SparkSQL\n", 25 | "  * Hive\n", 26 | "  * Airflow and more\n", 27 | "  \n", 28 | "__As architects and practitioners, we need to assemble and leverage a suite of tools chosen for power and simplicity__\n", 29 | "\n", 30 | "This class is designed to help you become confident\n", 31 | "* making those tool choices\n", 32 | "* communicating about them with your team\n", 33 | "* migrating away from legacy systems to meet modern data science needs\n", 34 | "\n", 35 | "This class is *not* designed to\n", 36 | "* Go in depth on the APIs or internals of any specific tools (there's just not enough time)\n", 37 | "* \"Sell you\" on any specific open-source project or product\n", 38 | "  * We want to get comfortable discussing strengths and weaknesses, and then you can choose a solution that is right for you\n", 39 | "  \n", 40 | "*We'll make more room for questions and discussion than in most of my classes (which are heavier on code and internals)*" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## Catching up to large-scale data science in 2020: what's changed?\n", 48 | "\n", 49 | "A brief recap:\n", 50 | "* 2016 - broad adoption across industries of R, PyData, Apache Spark\n", 51 | "* 2017 - broad rise of deep learning\n", 52 | "* 2018-2019 - decline of Hadoop/Spark for data science\n", 53 | "* 2020-2021 - new open tools and hybrid architectures\n", 54 | "\n", 55 | "__Theme: best-of-breed__\n", 56 | "\n", 57 | "https://www.oreilly.com/radar/why-best-of-breed-is-a-better-choice-than-all-in-one-platforms-for-data-science/" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Interactive Survey\n", 65 | "\n", 66 | "* What size datasets do you typically work with?\n", 67 | "* Where is (most of) your data stored?\n", 68 | "* How do you get data out of your data lake?\n", 69 | "* What tools do you typically use for\n", 70 | "  * feature engineering\n", 71 | "  * modeling" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## The changing definition of large-scale data\n", 79 | "\n", 80 | "__Compute power has grown, but datasets have not__\n", 81 | "\n", 82 | "*(figure: KDnuggets poll results -- largest dataset analyzed)*\n", 83 | "\n", 84 | "\n", 85 | "*Source: https://www.kdnuggets.com/2020/07/poll-largest-dataset-analyzed-results.html*\n", 86 | "\n", 87 | "The gap between the largest ML datasets in use and the 
largest tractable on a single node (no cluster) has changed dramatically\n", 88 | "* Resulting in new definitions for small, medium, and big data\n", 89 | "* Avoid \"big data\" tools and their taxes when you can\n", 90 | "\n", 91 | "Some \"medium data\" approaches\n", 92 | "* Downsample\n", 93 | "* XGBoost external memory (out-of-core)\n", 94 | "* TF/PyTorch data loaders\n", 95 | "* sklearn + `partial_fit` (incrementalizable) algorithms\n", 96 | " * Simplify with Dask, though Dask not strictly necessary\n", 97 | "* Apache Arrow / PyArrow (https://arrow.apache.org/docs/python/memory.html)\n", 98 | "* Honorable mention for feature engineering: Vaex" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "## Roadmap for large-scale tooling journey\n", 106 | "\n", 107 | "" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": { 120 | "display_name": "Python 3", 121 | "language": "python", 122 | "name": "python3" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.7.0" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 4 139 | } 140 | -------------------------------------------------------------------------------- /02-Extract.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Acquiring data (extraction)\n", 8 | "\n", 9 | "\n", 10 | "\n", 11 | "> Note: in some organizations, there is a data discovery system, like https://www.amundsen.io/amundsen/ upstream from this step. 
We're not covering that area due to scope constraints\n" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Goal: use SQL to efficiently retrieve data for further work\n", 19 | "\n", 20 | "### Legacy Tools\n", 21 | "\n", 22 | "Mostly: Apache Hive\n", 23 | "\n", 24 | "### Current Tools\n", 25 | "\n", 26 | "* SparkSQL\n", 27 | "* Presto\n", 28 | "* *Hive Metastore*\n", 29 | "\n", 30 | "### Rising/Future Tools\n", 31 | "\n", 32 | "* Kartothek, Intake\n", 33 | "* BlazingSQL\n", 34 | "* Dask-SQL\n", 35 | "\n", 36 | "*There are more non-SQL options, but support for SQL is a requirement in most large organizations, so we're sticking with SQL-capable tools for now*\n" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "import pyspark" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "spark = pyspark.sql.SparkSession.builder.appName(\"demo\").getOrCreate()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "spark.sql(\"SELECT * FROM parquet.`data/california`\").show()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "query = \"\"\"\n", 73 | "SELECT origin, mean(delay) as delay, count(1) \n", 74 | "FROM parquet.`data/california` \n", 75 | "GROUP BY origin\n", 76 | "HAVING count(1) > 500\n", 77 | "ORDER BY delay DESC\n", 78 | "\"\"\"\n", 79 | "spark.sql(query).show()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "query = \"\"\"\n", 89 | "SELECT *\n", 90 | "FROM parquet.`data/california` \n", 91 | "WHERE origin in (\n", 92 | " SELECT origin \n", 93 | " FROM parquet.`data/california` \n", 94 | " GROUP BY origin \n", 95 | " HAVING count(1) > 500\n", 96 | ")\n", 97 | "\"\"\"\n", 98 | "spark.sql(query).write.mode('overwrite').option('header', 'true').csv('data/refined_flights/')" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "! 
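ls data/refined_flights/            # hypothetical quick check (our addition): Spark writes one CSV part file per partition, plus a _SUCCESS marker\n",
    "! 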
head data/refined_flights/*.csv" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": { 120 | "display_name": "Python 3", 121 | "language": "python", 122 | "name": "python3" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.7.0" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 4 139 | } 140 | -------------------------------------------------------------------------------- /03-Features-Modeling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Engineering\n", 8 | "\n", 9 | "### Main flavors of data and feature engineering\n", 10 | "* Tabular: Dataframe model\n", 11 | " * \"Typical\" business data tables\n", 12 | "* Batch/Tensor/Vector: Array model\n", 13 | " * Numeric data, timeseries, scientific data, audio, images, video, geodata, etc.\n", 14 | "* Natural language\n", 15 | " * Batches of strings\n", 16 | " * Transformed into array data through NLP-specific techniques\n", 17 | " \n", 18 | "\n", 19 | "\n", 20 | "### \"Must-haves\" for feature engineering on large data\n", 21 | "\n", 22 | "* Some data representation for the large dataset\n", 23 | " * Likely distributed, out-of-core, lazy, streaming, etc.\n", 24 | "* Mechanism to load data from standard formats and locations into the representation\n", 25 | " * E.g., loading HDF5 in S3 or Parquet in HDFS\n", 26 | "* APIs to apply feature engineering transformations\n", 27 | " * Mathematical operations\n", 28 | " * String, date, etc.\n", 29 | " * Custom (\"user-defined\")\n", 30 | "* Integration to a modeling framework and/or ability to write to standard formats\n", 31 | "\n", 32 | "### \"Nice-to-haves\"\n", 33 | "\n", 34 | "* Intuitive data representation: similar to \"small data\" tooling\n", 35 | "* APIs that resemble those of the most common industry-standard libraries\n", 36 | "* Both modeling integration *and* ability to write out transformed data" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Rise of Python\n", 51 | "\n", 52 | "Python has become the *lingua franca* or dominant cross-cutting language for data science.\n", 53 | "\n", 54 | ">\n", 55 | "> __Note__ this is not to imply Python is the best or only language, or that other languages might not be intrinsically better or even, in the future, more successful. 
\n", 56 | ">\n", 57 | "> There are wonderful things to be said for languages from Rust to R to Julia to many others, but for baseline data science capability and versatility in commercial enterprises today, it's Python\n", 58 | ">\n", 59 | "\n", 60 | "So we can turn to Python and look at the dominant libraries and tools within that ecosystem\n", 61 | "* Tabular data: Pandas\n", 62 | "* Array data: NumPy and derivatives like CuPy, JAX.numpy, etc.\n", 63 | "* Basic modeling: scikit-learn, XGBoost, etc.\n", 64 | "* Deep learning: PyTorch, Tensorflow\n", 65 | "* NLP: SpaCy, NLTK, Huggingface, etc.\n", 66 | "\n", 67 | "As we get into further parts of the workflow, like hyperparameter tuning or reinforcement learning there are more choices. \n", 68 | "\n", 69 | "For time reasons, we're going to stick to this core workflow of extraction through modeling and tuning, and not continue on into MLOps and deploment architectures, or meta-modeling platforms for experimentation, feature and provenance tracking, etc. That would be a bit too much to take on!\n", 70 | "\n", 71 | "__Bottom line__: We want a data representation and APIs that are fairly close to the Pandas / NumPy / scikit-learn (SciPy) workflow. And we want elegant bridges into things like PyTorch, XGBoost, NLP tools, and tuning tools.\n", 72 | "\n", 73 | "## Dask: SciPy at Scale\n", 74 | "\n", 75 | "Luckily, Dask is well placed to solve this problem. \n", 76 | "\n", 77 | "While enterprises were still wrestling with JVM-based tools over the past 5 years, scientists, researchers, and others in the PyData and SciPy communities were building Dask, a pure-Python distributed compute platform that integrates deeply with all of the standard SciPy tools.\n", 78 | "\n", 79 | "__What does this mean?__\n", 80 | "\n", 81 | "We can take many of our local workflows to large-scale data via Dask with fairly minimal effort -- because under the hood, Dask is designed to use those \"small data\" structures in federation to create arbitrarily large ones." 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "
\n", 89 | " \n", 90 | "\n", 91 | "\n", 92 | "
" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "As an added bonus, due to the Dask architecture, it can leverage GPU-enabled versions of the underlying libraries.\n", 100 | "* GPU + NumPy => CuPy\n", 101 | "* GPU + Pandas => cuDF (RAPIDS CUDA dataframe)\n", 102 | "* GPU + scikit-learn => cuML (RAPIDS CUDA algorithms)\n", 103 | "etc.\n", 104 | "\n", 105 | "### Using Dask for Feature Transformation\n", 106 | "\n", 107 | "* We need to be able to load data in a standard format\n", 108 | "* Manipulate it using dataframe or array APIs\n", 109 | "* Write it and/or pass it efficiently to a modeling framework" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "from dask import dataframe as ddf\n", 119 | "from dask.distributed import Client\n", 120 | "\n", 121 | "client = Client(n_workers=2, threads_per_worker=1, memory_limit='1GB')\n", 122 | "\n", 123 | "client" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "df = ddf.read_csv('data/diamonds.csv')\n", 133 | "df" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "df.head()" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "df = df.drop(columns=['Unnamed: 0'])\n", 152 | "df = df.categorize()\n", 153 | "\n", 154 | "df" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "prepared = ddf.reshape.get_dummies(df)\n", 164 | "\n", 165 | "prepared.head()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "# Modeling\n", 173 | "\n", 174 | "\n", 175 | "\n", 176 | "If Dask makes an easy choice for some feature engineering and preprocessing, we're back into the deep end making choices for modeling.\n", 177 | "\n", 178 | "Why?\n", 179 | "\n", 180 | "Simply put, different kinds of modeling are handled best by different tools, so we have a lot of choices to make.\n", 181 | "\n", 182 | "* \"Classic\" ML\n", 183 | " * Dask\n", 184 | " * Dask ML\n", 185 | " * XGBoost (with or without Dask)\n", 186 | "* Unsupervised learning and dimensionality reduction\n", 187 | " * Dask supports some algorithms\n", 188 | " * For others, we may want to scale a deep-learning tool (PyTorch/Tensorflow)\n", 189 | " * Horovod\n", 190 | " * Ray SGD\n", 191 | "* Deep learning (scaling PyTorch/TF easily)\n", 192 | " * Horovod\n", 193 | " * Ray SGD\n", 194 | " * Ray RLlib for deep reinforcement learning\n", 195 | "* Simulations and agent-based models\n", 196 | " * Ray for stateful-agent simulations\n", 197 | " * Dask Actors may be an option\n", 198 | "\n", 199 | "\n", 200 | "## Example: Linear Model with Dask" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "y = prepared.price.to_dask_array(lengths=True)\n", 210 | "arr = prepared.drop('price', axis=1).to_dask_array(lengths=True)\n", 211 | "\n", 212 | "arr[:4]" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "arr[:4].compute()" 222 | ] 223 | }, 224 | { 
225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "from dask_ml.model_selection import train_test_split\n", 231 | "\n", 232 | "X_train, X_test, y_train, y_test = train_test_split(arr, y, test_size=0.1)\n", 233 | "\n", 234 | "X_train" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "from dask_ml.linear_model import LinearRegression\n", 244 | "\n", 245 | "lr = LinearRegression(solver='lbfgs', max_iter=10)\n", 246 | "lr_model = lr.fit(X_train, y_train)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "y_predicted = lr_model.predict(X_test)\n", 256 | "\n", 257 | "y_predicted" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "from dask_ml.metrics import mean_squared_error\n", 267 | "from math import sqrt\n", 268 | "\n", 269 | "sqrt(mean_squared_error(y_test, y_predicted))" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "client.close()" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "## What is Ray?\n", 286 | "\n", 287 | "Ray (https://ray.io/) is a scale-out computing system designed for high-throughput, resilient stateful-actor algorithms. Ray was design at UC Berkeley's RISE lab under the supervision of some of the same team that created Apache Spark. \n", 288 | "\n", 289 | "Ray supports a number of languages at the API layer (Python and Java today) while most of the engine is C++. Ray's stateful actor support makes it strong in a number of key areas, like distributed SGD and reinforcement learning.\n", 290 | "\n", 291 | "Let's try a reinforcement learning example!\n", 292 | "\n", 293 | "> __Reinforcement Learning__ is a family of techniques that train *agents* to act in an *environment* to maximize *reward*. Famous examples include agents that can play chess, go, or Atari games ... but the field is hot because those agents can also be robots learning to do work, autonomous vehicles driving, or even virtual salesmen learning to get the best price possible from a customer.\n", 294 | "\n", 295 | "Ray treats deep reinforcement learning (RL + deep learning) as a top-level use case and includes libraries that encapsulate many of the most popular algorithms.\n", 296 | "\n", 297 | "Here, to create a simple example, we'll use __Deep Q-Learning__ (a foundational deep RL algorithm) to learn OpenAI's \"cart-pole\" (https://gym.openai.com/envs/CartPole-v1/) environment, which you can visualize like this:\n", 298 | "\n", 299 | "