├── binder
│   ├── postBuild
│   └── environment.yml
├── worker.yml
├── .gitignore
└── test.ipynb

/binder/postBuild:
--------------------------------------------------------------------------------
#!/bin/bash

jupyter serverextension enable --py nbserverproxy --sys-prefix

jupyter labextension install @jupyter-widgets/jupyterlab-manager \
                             dask-labextension

#EOF

--------------------------------------------------------------------------------
/binder/environment.yml:
--------------------------------------------------------------------------------
name: pangeo
channels:
  - conda-forge
  - defaults
dependencies:
  - python=3.6
  - pyarrow
  - xarray
  - gcsfs
  - dask
  - dask-kubernetes
  - distributed
  - nbserverproxy
  - jupyter
  - ipywidgets
  - nodejs
  - jupyterlab=0.35
  - matplotlib

--------------------------------------------------------------------------------
/worker.yml:
--------------------------------------------------------------------------------
# worker.yml

kind: Pod
metadata:
  labels:
    foo: bar
spec:
  restartPolicy: Never
  containers:
  - image: daskdev/dask:latest
    imagePullPolicy: IfNotPresent
    args: [dask-worker, --nthreads, '2', --no-bokeh, --memory-limit, 6GB, --death-timeout, '60']
    name: dask
    env:
      - name: EXTRA_PIP_PACKAGES
        value: gcsfs fastparquet distributed
    resources:
      limits:
        cpu: "1"
        memory: 6G
      requests:
        cpu: "1"
        memory: 6G

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--------------------------------------------------------------------------------
/test.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "Dask Dataframes on NYC Taxi Data\n",
    "================================\n",
    "\n",
    "*(Pandas and Dask logos)*\n",
    "\n",
    "In this section we will learn how to ...\n",
    "\n",
    "- use Dask Dataframe to scale Pandas workloads\n",
    "- call `.compute` and `.persist` to trigger computation\n",
    "- start and scale a Dask cluster on Kubernetes\n",
    "- interpret dashboard plots\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(\"ignore\", message=\"numpy.dtype size changed\")\n",
    "warnings.filterwarnings(\"ignore\", message=\"numpy.ufunc size changed\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## We have several CSV files in cloud storage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "from gcsfs import GCSFileSystem\n",
    "gcs = GCSFileSystem()\n",
    "\n",
    "sorted(gcs.glob('anaconda-public-data/nyc-taxi/csv/2015/yellow_*.csv'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read a subset with Pandas\n",
    "\n",
    "The full dataset is too big to fit in memory on a single machine, so we pull out the first million rows to get a first impression."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "with gcs.open('anaconda-public-data/nyc-taxi/csv/2015/yellow_tripdata_2015-01.csv') as f:\n",
    "    df = pd.read_csv(f, nrows=1000000, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Investigate the subset as normal"
   ]
  },
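  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For example, here are a few ordinary Pandas computations we might try on this subset (one possible sketch; the exercise further below repeats them with Dask dataframe)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How many passengers total (in this subset)?\n",
    "df.passenger_count.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The average trip distance for rides with a single passenger\n",
    "df[df.passenger_count == 1].trip_distance.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The average trip distance grouped by passenger counts\n",
    "df.groupby(df.passenger_count).trip_distance.mean()"
   ]
  },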
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! unzip ngrok-stable-linux-amd64.zip"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Start a Dask Cluster\n",
    "\n",
    "Your notebook is conveniently attached to a Kubernetes cluster, so you can start a Dask cluster using the [dask-kubernetes](https://kubernetes.dask.org/en/latest/) project.\n",
    "\n",
    "For more information on deploying Dask on different cluster technologies, see [Dask's deployment documentation](https://docs.dask.org/en/latest/setup.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask_kubernetes import KubeCluster\n",
    "cluster = KubeCluster.from_yaml('worker.yml')\n",
    "cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.distributed import Client\n",
    "\n",
    "client = Client(cluster)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create Dask dataframe around all of the data\n",
    "\n",
    "Before, we loaded only a subset of one CSV file. Now let's use Dask dataframe to read all of the files.\n",
    "\n",
    "For more information, you can read [Dask's documentation for creating dataframes](http://docs.dask.org/en/latest/dataframe-create.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import dask.dataframe as dd\n",
    "\n",
    "df = dd.read_csv('gcs://anaconda-public-data/nyc-taxi/csv/2015/yellow_*.csv',\n",
    "                 parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])\n",
    "df = df.persist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Dask dataframes look like Pandas dataframes, and support most of the common Pandas methods."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.passenger_count.sum().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Investigate laziness and use the `.compute()` method\n",
    "\n",
    "Note that the expression `df.passenger_count.sum()` does not execute anything by itself; we had to call `.compute()` above to get a result. Dask dataframes are *lazy* by default, so they only evaluate when we tell them to.\n",
    "\n",
    "There are two ways to trigger computation:\n",
    "\n",
    "- `result = result.compute()`: triggers computation and stores the result into local memory as a Pandas object.\n",
    "\n",
    "  You should use this with *small* results that will fit into memory.\n",
    "- `result = result.persist()`: triggers computation and stores the result into distributed memory, returning another Dask dataframe object.\n",
    "\n",
    "  You should use this with *large* results that you want to stage in distributed memory for repeated computation."
   ]
  },
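  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick illustration (a minimal sketch using the `df` defined above), we can build a lazy expression first and then trigger it explicitly:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Building the expression is nearly instant and does no real work yet\n",
    "total = df.passenger_count.sum()\n",
    "total"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# .compute() runs the work on the cluster and returns a plain number\n",
    "total.compute()"
   ]
  },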
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### *Exercise*: Run the Pandas computations above with Dask dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example usage\n",
    "import distributed\n",
    "import dask.array as da\n",
    "\n",
    "# Connect dask to the cluster\n",
    "client = distributed.Client(cluster)\n",
    "\n",
    "# Create an array and calculate the mean\n",
    "array = da.ones((1000, 1000, 100), chunks=(100, 100, 10))\n",
    "print(array.mean().compute()) # Should print 1.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The average trip distance for rides with a single passenger\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The average trip distance grouped by passenger counts\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Persist data in memory\n",
    "\n",
    "When we started this notebook, we ran the following lines to create our dataframe.\n",
    "\n",
    "```python\n",
    "df = dd.read_csv('gcs://anaconda-public-data/nyc-taxi/csv/2015/yellow_*.csv',\n",
    "                 parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])\n",
    "df = df.persist()\n",
    "```\n",
    "\n",
    "In particular, we called `df = df.persist()` to load all of the CSV data into distributed memory. Having this data in memory made our subsequent computations fast.\n",
    "\n",
    "In this section we're going to reset our cluster and run the same computations, but without persisting our data in memory. What happens to our computation times? Why?"
   ]
  },
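  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One way to make the comparison concrete (optional, and assuming the persisted `df` from above is still defined) is to record a baseline timing with IPython's `%time` magic before we reset the cluster:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline timing while the data is persisted in distributed memory\n",
    "%time df.passenger_count.sum().compute()"
   ]
  },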
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "client.restart()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = dd.read_csv('gcs://anaconda-public-data/nyc-taxi/csv/2015/yellow_*.csv',\n",
    "                 parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How many passengers total?\n",
    "df.passenger_count.sum().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The average trip distance for rides with a single passenger\n",
    "df2 = df[df.passenger_count == 1]\n",
    "df2.trip_distance.mean().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The average trip distance grouped by passenger counts\n",
    "df.groupby(df.passenger_count).trip_distance.mean().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### *Exercise*: What did our workers spend their time doing?\n",
    "\n",
    "To answer this question, look at the Task Stream dashboard plot. It will tell you the activity on each core of your cluster (y-axis) over time (x-axis). You can hover over each rectangle of this plot to determine what kind of task it was. What kinds of tasks are most common and take up the most time?\n",
    "\n",
    "*Extra*: if you're ahead of the group you might also want to look at the Profile dashboard plot. You can access this by selecting the orange Dask icon on the left side of your JupyterLab page. The profile plot is an interactive [Flame graph](http://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.persist() # we persist our data again, just to make future sections faster"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can check the type of the Dask dataframe itself, and the type of each partition using the `map_partitions` method."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.map_partitions(type).compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Divisions and the Index\n",
    "\n",
    "Just like Pandas, Dask Dataframe has an *index*, a special column that indexes the rows of our dataframe. In Dask this index has an additional purpose: it serves as a sorted partitioning of our data. This makes some algorithms more efficient. In this section, we'll sort our data by time and dive into the index a bit more deeply.\n",
    "\n",
    "First, notice that our index is not particularly informative. This is common when you load a dataset from CSV data, which generally doesn't store index or sorting information.\n",
    "\n",
    "Let's set a new index to be the pickup time. Sorting in parallel is hard, so this is an expensive operation."
   ]
  },
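  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One quick way to see this (an optional check): `known_divisions` reports whether Dask currently knows how the rows are partitioned along the index. It is `False` for data read straight from CSV, and it should become `True` once we set the index below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Divisions are unknown for data read straight from CSV files\n",
    "df.known_divisions"
   ]
  },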
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2 = df.set_index('tpep_pickup_datetime').persist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Our dataframe is split into roughly as many partitions as before, but now we know the time range of each partition. Internally, the divisions between partitions are stored in the `divisions` attribute."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2.divisions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Close things when you're done\n",
    "\n",
    "Before you move on to the next notebook, please close down your current cluster.\n",
    "\n",
    "Alternatively, you can restart this notebook's kernel by pressing the `\"0\"` key twice."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cluster.close();"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

--------------------------------------------------------------------------------