├── binder
│   ├── postBuild
│   └── environment.yml
├── worker.yml
├── .gitignore
└── test.ipynb

/binder/postBuild:
--------------------------------------------------------------------------------
#!/bin/bash

jupyter serverextension enable --py nbserverproxy --sys-prefix

jupyter labextension install @jupyter-widgets/jupyterlab-manager \
                             dask-labextension

#EOF

--------------------------------------------------------------------------------
/binder/environment.yml:
--------------------------------------------------------------------------------
name: pangeo
channels:
  - conda-forge
  - defaults
dependencies:
  - python=3.6
  - pyarrow
  - xarray
  - gcsfs
  - dask
  - dask-kubernetes
  - distributed
  - nbserverproxy
  - jupyter
  - ipywidgets
  - nodejs
  - jupyterlab=0.35
  - matplotlib

--------------------------------------------------------------------------------
/worker.yml:
--------------------------------------------------------------------------------
# worker.yml

kind: Pod
metadata:
  labels:
    foo: bar
spec:
  restartPolicy: Never
  containers:
  - image: daskdev/dask:latest
    imagePullPolicy: IfNotPresent
    args: [dask-worker, --nthreads, '2', --no-bokeh, --memory-limit, 6GB, --death-timeout, '60']
    name: dask
    env:
      - name: EXTRA_PIP_PACKAGES
        value: gcsfs fastparquet distributed
    resources:
      limits:
        cpu: "1"
        memory: 6G
      requests:
        cpu: "1"
        memory: 6G

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--------------------------------------------------------------------------------
/test.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "Dask Dataframes on NYC Taxi Data\n",
    "================================\n",
    "\n",
    "*(Pandas and Dask logos)*\n",
    "\n",
    "In this section we will learn how to ...\n",
    "\n",
    "- use Dask Dataframe to scale Pandas workloads\n",
    "- call `.compute` and `.persist` to trigger computation\n",
    "- start and scale a Dask cluster on Kubernetes\n",
    "- interpret dashboard plots\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(\"ignore\", message=\"numpy.dtype size changed\")\n",
    "warnings.filterwarnings(\"ignore\", message=\"numpy.ufunc size changed\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## We have several CSV files in cloud storage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "from gcsfs import GCSFileSystem\n",
    "gcs = GCSFileSystem()\n",
    "\n",
    "sorted(gcs.glob('anaconda-public-data/nyc-taxi/csv/2015/yellow_*.csv'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read a subset with Pandas\n",
    "\n",
    "The full dataset is too big to fit in memory on a single machine, so we pull out the first million rows to get a first impression."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "with gcs.open('anaconda-public-data/nyc-taxi/csv/2015/yellow_tripdata_2015-01.csv') as f:\n",
    "    df = pd.read_csv(f, nrows=1000000, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Investigate the subset as normal"
   ]
  },
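  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For example, here are a few ordinary Pandas computations we might try on this subset (one possible sketch; the exercise further below repeats them with Dask dataframe)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How many passengers total (in this subset)?\n",
    "df.passenger_count.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The average trip distance for rides with a single passenger\n",
    "df[df.passenger_count == 1].trip_distance.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The average trip distance grouped by passenger counts\n",
    "df.groupby(df.passenger_count).trip_distance.mean()"
   ]
  },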
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! unzip ngrok-stable-linux-amd64.zip"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Start a Dask Cluster\n",
    "\n",
    "Your notebook is conveniently attached to a Kubernetes cluster, so you can start a Dask cluster using the [dask-kubernetes](https://kubernetes.dask.org/en/latest/) project.\n",
    "\n",
    "For more information on deploying Dask on different cluster technologies, see [Dask's deployment documentation](https://docs.dask.org/en/latest/setup.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask_kubernetes import KubeCluster\n",
    "cluster = KubeCluster.from_yaml('worker.yml')\n",
    "cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.distributed import Client\n",
    "\n",
    "client = Client(cluster)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create Dask dataframe around all of the data\n",
    "\n",
    "Before, we loaded only a subset of one CSV file. Now let's use Dask dataframe to read all of the files.\n",
    "\n",
    "For more information, you can read [Dask's documentation for creating dataframes](http://docs.dask.org/en/latest/dataframe-create.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import dask.dataframe as dd\n",
    "\n",
    "df = dd.read_csv('gcs://anaconda-public-data/nyc-taxi/csv/2015/yellow_*.csv',\n",
    "                 parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])\n",
    "df = df.persist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Dask dataframes look like Pandas dataframes, and support most of the common Pandas methods."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.passenger_count.sum().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Investigate laziness and use the `.compute()` method\n",
    "\n",
    "Note that the expression `df.passenger_count.sum()` does not execute anything by itself; we had to call `.compute()` above to get a result. Dask dataframes are *lazy* by default, so they only evaluate when we tell them to.\n",
    "\n",
    "There are two ways to trigger computation:\n",
    "\n",
    "- `result = result.compute()`: triggers computation and stores the result into local memory as a Pandas object.\n",
    "\n",
    "  You should use this with *small* results that will fit into memory.\n",
    "- `result = result.persist()`: triggers computation and stores the result into distributed memory, returning another Dask dataframe object.\n",
    "\n",
    "  You should use this with *large* results that you want to stage in distributed memory for repeated computation."
   ]
  },
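  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick illustration (a minimal sketch using the `df` defined above), we can build a lazy expression first and then trigger it explicitly:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Building the expression is nearly instant and does no real work yet\n",
    "total = df.passenger_count.sum()\n",
    "total"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# .compute() runs the work on the cluster and returns a plain number\n",
    "total.compute()"
   ]
  },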
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### *Exercise*: Run the Pandas computations above with Dask dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example usage\n",
    "import distributed\n",
    "import dask.array as da\n",
    "\n",
    "# Connect dask to the cluster\n",
    "client = distributed.Client(cluster)\n",
    "\n",
    "# Create an array and calculate the mean\n",
    "array = da.ones((1000, 1000, 100), chunks=(100, 100, 10))\n",
    "print(array.mean().compute()) # Should print 1.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The average trip distance for rides with a single passenger\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The average trip distance grouped by passenger counts\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Persist data in memory\n",
    "\n",
    "When we started this notebook, we ran the following lines to create our dataframe.\n",
    "\n",
    "```python\n",
    "df = dd.read_csv('gcs://anaconda-public-data/nyc-taxi/csv/2015/yellow_*.csv',\n",
    "                 parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])\n",
    "df = df.persist()\n",
    "```\n",
    "\n",
    "In particular, we called `df = df.persist()` to load all of the CSV data into distributed memory. Having this data in memory made our subsequent computations fast.\n",
    "\n",
    "In this section we're going to reset our cluster and run the same computations, but without persisting our data in memory. What happens to our computation times? Why?"
   ]
  },
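  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One way to make the comparison concrete (optional, and assuming the persisted `df` from above is still defined) is to record a baseline timing with IPython's `%time` magic before we reset the cluster:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline timing while the data is persisted in distributed memory\n",
    "%time df.passenger_count.sum().compute()"
   ]
  },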
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "client.restart()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = dd.read_csv('gcs://anaconda-public-data/nyc-taxi/csv/2015/yellow_*.csv',\n",
    "                 parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How many passengers total?\n",
    "df.passenger_count.sum().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The average trip distance for rides with a single passenger\n",
    "df2 = df[df.passenger_count == 1]\n",
    "df2.trip_distance.mean().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The average trip distance grouped by passenger counts\n",
    "df.groupby(df.passenger_count).trip_distance.mean().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### *Exercise*: What did our workers spend their time doing?\n",
    "\n",
    "To answer this question, look at the Task Stream dashboard plot. It will tell you the activity on each core of your cluster (y-axis) over time (x-axis). You can hover over each rectangle of this plot to determine what kind of task it was. What kinds of tasks are most common and take up the most time?\n",
    "\n",
    "*Extra*: if you're ahead of the group you might also want to look at the Profile dashboard plot. You can access this by selecting the orange Dask icon on the left side of your JupyterLab page. The profile plot is an interactive [Flame graph](http://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.persist() # we persist our data again, just to make future sections faster"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can check the type of the Dask dataframe itself, and the type of each partition using the `map_partitions` method."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.map_partitions(type).compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Divisions and the Index\n",
    "\n",
    "Just like Pandas, Dask Dataframe has an *index*, a special column that indexes the rows of our dataframe. In Dask this index has an additional purpose: it serves as a sorted partitioning of our data. This makes some algorithms more efficient. In this section, we'll sort our data by time and dive into the index a bit more deeply.\n",
    "\n",
    "First, notice that our index is not particularly informative. This is common when you load a dataset from CSV data, which generally doesn't store index or sorting information.\n",
    "\n",
    "Let's set a new index to be the pickup time. Sorting in parallel is hard, so this is an expensive operation."
   ]
  },
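  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One quick way to see this (an optional check): `known_divisions` reports whether Dask currently knows how the rows are partitioned along the index. It is `False` for data read straight from CSV, and it should become `True` once we set the index below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Divisions are unknown for data read straight from CSV files\n",
    "df.known_divisions"
   ]
  },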
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2 = df.set_index('tpep_pickup_datetime').persist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Our dataframe is split into roughly as many partitions as before, but now we know the time range of each partition. Internally, the divisions between partitions are stored in the `divisions` attribute."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2.divisions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Close things when you're done\n",
    "\n",
    "Before you move on to the next notebook, please close down your current cluster.\n",
    "\n",
    "Alternatively, you can restart this notebook's kernel by pressing the `\"0\"` key twice."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cluster.close();"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

--------------------------------------------------------------------------------