├── .gitignore
├── README.md
├── docker-compose.yml
├── down.sh
├── get_and_clean_data.py
├── notebooks
    ├── container_tweet.png
    ├── dask-dataframe.ipynb
    ├── data
    │   └── .gitkeep
    ├── docker-for-data-science.ipynb
    └── pstree.png
└── up.sh


/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | Food_Inspections.csv
3 | notebooks/data/food*
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Introduction to Docker (for Data Scientists)
 2 | 
 3 | This repository contains notebooks and code for my talk at the PyData Chicago Meetup on November 7, 2016.
 4 | 
 5 | This talk
 6 | 
 7 |   * introduces software containers and why they are useful to data scientists
 8 |   * takes a look at the parts of the linux kernel that make software containers possible
 9 |   * introduces Docker as a platform for using software containers
10 |   * shows an example of running dask distributed in separate containers on a single host
11 | 
12 | Run these notebooks in a container:
13 | 
14 | ```shell
15 | docker run -d \
16 |            -p 8888:8888 \ 
17 |            -v ./notebooks:/notebooks \
18 |            -w /notebooks \
19 |            jseabold/dask-jupyter \
20 |            bash -c "jupyter notebook --no-browser --ip='*'"
21 | ```
22 | 
23 | The images are built from my [dockerfiles repo](https://github.com/jseabold/dockerfiles).
24 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: "2"
 2 | services:
 3 |   jupyter:
 4 |     image: jseabold/dask-jupyter:latest
 5 |     command: ["bash", "-c", "jupyter notebook --no-browser --ip='*'"]
 6 |     volumes:
 7 |       - ./notebooks:/notebooks
 8 |     working_dir: /notebooks
 9 |     ports:
10 |       - "8888:8888"
11 |     networks:
12 |       - distributed
13 |     links:
14 |       - dask-scheduler
15 |     container_name: jupyter
16 | 
17 |   dask-scheduler:
18 |     image: jseabold/dask-scheduler:latest
19 |     ports:
20 |       - "8786:8786"
21 |       - "9786:9786"
22 |       - "8787:8787"
23 |     command: ["bash", "-c", "dask-scheduler --host dask-scheduler"]
24 |     networks:
25 |       - distributed
26 |     container_name: dask-scheduler
27 | 
28 |   dask-worker:
29 |     image: jseabold/dask-worker:latest
30 |     command: ["bash", "-c", "dask-worker dask-scheduler:8786 --nthreads 1 --nprocs 1"]
31 |     mem_limit: 1500m
32 |     volumes:
33 |       - ./notebooks:/notebooks
34 |     networks:
35 |       - distributed
36 |     links:
37 |       - dask-scheduler
38 | 
39 | networks:
40 |   distributed:
41 |     driver: bridge
42 | 


--------------------------------------------------------------------------------
/down.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | 
3 | docker-compose -p dask-distributed stop
4 | docker-compose -p dask-distributed rm -f
5 | 


--------------------------------------------------------------------------------
/get_and_clean_data.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import requests
 3 | 
 4 | url = ("https://data.cityofchicago.org/api/views/4ijn-s7e5/rows.csv?"
 5 |        "accessType=DOWNLOAD")
 6 | 
 7 | response = requests.get(url, stream=True)
 8 | 
 9 | with open("Food_Inspections.csv", "wb") as fout:
10 |     for chunk in response.iter_content(32 * 1024):
11 |         fout.write(chunk)
12 | 
13 | df = pd.read_csv("Food_Inspection.csv")
14 | 
15 | for year, group in df.groupby(df['Inspection Date'].dt.year):
16 |     group.to_csv("notebooks/data/food_inspection_{}.csv.gz".format(year),
17 |                  compression='gzip', index=False)
18 | 


--------------------------------------------------------------------------------
/notebooks/container_tweet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jseabold/pydata-chi-docker/369f9cf93dccbea3d1d31118cbe33934b46f4de5/notebooks/container_tweet.png


--------------------------------------------------------------------------------
/notebooks/dask-dataframe.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "[Bokeh Web UI](http://localhost:8787)"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {
 14 |     "collapsed": false
 15 |    },
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "from pprint import pprint\n",
 19 |     "\n",
 20 |     "from dask.distributed import Client\n",
 21 |     "client = Client('dask-scheduler:8786')"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "metadata": {
 28 |     "collapsed": false
 29 |    },
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "client"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "metadata": {
 39 |     "collapsed": false
 40 |    },
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "pprint(client.scheduler_info())"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": null,
 49 |    "metadata": {
 50 |     "collapsed": false
 51 |    },
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "client.list_datasets()"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "markdown",
 59 |    "metadata": {},
 60 |    "source": [
 61 |     "# Dask DataFrame"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "metadata": {
 68 |     "collapsed": true
 69 |    },
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "import dask.dataframe as dd"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "metadata": {
 79 |     "collapsed": false
 80 |    },
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "!ls data/*.csv.gz"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": null,
 89 |    "metadata": {
 90 |     "collapsed": false
 91 |    },
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "df = dd.read_csv(\"/notebooks/data/food_inspection_201*\", \n",
 95 |     "                 blocksize=None, \n",
 96 |     "                 compression='gzip', \n",
 97 |     "                 parse_dates=['Inspection Date'])"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "metadata": {
104 |     "collapsed": false
105 |    },
106 |    "outputs": [],
107 |    "source": [
108 |     "df"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "metadata": {
115 |     "collapsed": false
116 |    },
117 |    "outputs": [],
118 |    "source": [
119 |     "df.head()"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": null,
125 |    "metadata": {
126 |     "collapsed": true
127 |    },
128 |    "outputs": [],
129 |    "source": [
130 |     "failed = df[df.Results == 'Fail']"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": null,
136 |    "metadata": {
137 |     "collapsed": false
138 |    },
139 |    "outputs": [],
140 |    "source": [
141 |     "years = failed.year.unique()\n",
142 |     "years = years.compute()\n",
143 |     "years = years.values.tolist()"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": null,
149 |    "metadata": {
150 |     "collapsed": false
151 |    },
152 |    "outputs": [],
153 |    "source": [
154 |     "divisions = years + [2016]\n",
155 |     "divisions"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {
162 |     "collapsed": false
163 |    },
164 |    "outputs": [],
165 |    "source": [
166 |     "failed = failed.set_partition('year', divisions=divisions)"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "metadata": {
173 |     "collapsed": false
174 |    },
175 |    "outputs": [],
176 |    "source": [
177 |     "failed.known_divisions"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": null,
183 |    "metadata": {
184 |     "collapsed": false
185 |    },
186 |    "outputs": [],
187 |    "source": [
188 |     "failed.head()"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": null,
194 |    "metadata": {
195 |     "collapsed": true
196 |    },
197 |    "outputs": [],
198 |    "source": [
199 |     "def violators(df, n=3):\n",
200 |     "    size = lambda x: len(x)\n",
201 |     "    return (df.groupby('Address').\n",
202 |     "            apply(lambda x: len(x)).\n",
203 |     "            nlargest(n=n).\n",
204 |     "            to_frame(name='violations').\n",
205 |     "            reset_index())"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": null,
211 |    "metadata": {
212 |     "collapsed": false
213 |    },
214 |    "outputs": [],
215 |    "source": [
216 |     "result = (failed.groupby(failed.index).\n",
217 |     "          apply(violators, \n",
218 |     "                meta=[('Address', object), \n",
219 |     "                      ('Violations', int)]))\n",
220 |     "result"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": null,
226 |    "metadata": {
227 |     "collapsed": true
228 |    },
229 |    "outputs": [],
230 |    "source": [
231 |     "worst = result.compute()"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": null,
237 |    "metadata": {
238 |     "collapsed": false
239 |    },
240 |    "outputs": [],
241 |    "source": [
242 |     "worst.index.names = ['Year', 'Offender']"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": null,
248 |    "metadata": {
249 |     "collapsed": false
250 |    },
251 |    "outputs": [],
252 |    "source": [
253 |     "worst.sort_index(level=0)"
254 |    ]
255 |   }
256 |  ],
257 |  "metadata": {
258 |   "kernelspec": {
259 |    "display_name": "Python 3",
260 |    "language": "python",
261 |    "name": "python3"
262 |   },
263 |   "language_info": {
264 |    "codemirror_mode": {
265 |     "name": "ipython",
266 |     "version": 3
267 |    },
268 |    "file_extension": ".py",
269 |    "mimetype": "text/x-python",
270 |    "name": "python",
271 |    "nbconvert_exporter": "python",
272 |    "pygments_lexer": "ipython3",
273 |    "version": "3.5.2"
274 |   }
275 |  },
276 |  "nbformat": 4,
277 |  "nbformat_minor": 1
278 | }
279 | 


--------------------------------------------------------------------------------
/notebooks/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jseabold/pydata-chi-docker/369f9cf93dccbea3d1d31118cbe33934b46f4de5/notebooks/data/.gitkeep


--------------------------------------------------------------------------------
/notebooks/docker-for-data-science.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 9,
  6 |    "metadata": {
  7 |     "slideshow": {
  8 |      "slide_type": "skip"
  9 |     }
 10 |    },
 11 |    "outputs": [
 12 |     {
 13 |      "data": {
 14 |       "text/plain": [
 15 |        "{'height': 800, 'width': 1200}"
 16 |       ]
 17 |      },
 18 |      "execution_count": 9,
 19 |      "metadata": {},
 20 |      "output_type": "execute_result"
 21 |     }
 22 |    ],
 23 |    "source": [
 24 |     "from traitlets.config.manager import BaseJSONConfigManager\n",
 25 |     "path = \"/.jupyter/nbconfig\"\n",
 26 |     "cm = BaseJSONConfigManager(config_dir=path)\n",
 27 |     "\n",
 28 |     "cm.update('livereveal', {\n",
 29 |     "              'width': 1200,\n",
 30 |     "              'height': 800,\n",
 31 |     "})"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "markdown",
 36 |    "metadata": {
 37 |     "slideshow": {
 38 |      "slide_type": "slide"
 39 |     }
 40 |    },
 41 |    "source": [
 42 |     "# About Me\n",
 43 |     "\n",
 44 |     "* Lead Data Scientist at [Civis Analytics](https://civisanalytics.com/)\n",
 45 |     "* [@jseabold](https://twitter.com/jseabold/) on Twitter\n",
 46 |     "* [jseabold](https://github.com/jseabold) on GitHub\n",
 47 |     "* [https://github.com/jseabold/pydata-chi-docker](https://github.com/jseabold/pydata-chi-docker)"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "markdown",
 52 |    "metadata": {
 53 |     "slideshow": {
 54 |      "slide_type": "slide"
 55 |     }
 56 |    },
 57 |    "source": [
 58 |     "# Docker for Data Science"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {
 64 |     "slideshow": {
 65 |      "slide_type": "subslide"
 66 |     }
 67 |    },
 68 |    "source": [
 69 |     "## Docker Introduction\n",
 70 |     "\n",
 71 |     "* Docker is a platform for running applications in software containers\n",
 72 |     "* Containers are an implementation of operating-system-level virtualization\n",
 73 |     "* Enabled by features in the Linux kernel"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "markdown",
 78 |    "metadata": {
 79 |     "slideshow": {
 80 |      "slide_type": "subslide"
 81 |     }
 82 |    },
 83 |    "source": [
 84 |     "## What does using Docker offer?\n",
 85 |     "\n",
 86 |     "* Reproducibility\n",
 87 |     "* Portability / working environments\n",
 88 |     "* Reduces need for complex installations\n",
 89 |     "* Easier testing / debugging\n",
 90 |     "* Resource management\n",
 91 |     "* Easier networking between services"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "metadata": {
 97 |     "slideshow": {
 98 |      "slide_type": "subslide"
 99 |     }
100 |    },
101 |    "source": [
102 |     "# That sounds like a Virtual Machine"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {
108 |     "slideshow": {
109 |      "slide_type": "subslide"
110 |     }
111 |    },
112 |    "source": [
113 |     "## Example Workflow\n",
114 |     "\n",
115 |     "```shell\n",
116 |     "docker build -t jseabold/dask-jupyter .\n",
117 |     "docker push jseabold/dask-jupyter\n",
118 |     "docker run --detach \\\n",
119 |     "           --publish 8888:8888 \\\n",
120 |     "           --volume $(pwd)/notebooks:/notebooks \\\n",
121 |     "           --working-dir /notebooks \\\n",
122 |     "           jseabold/dask-jupyter\n",
123 |     "```"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "markdown",
128 |    "metadata": {
129 |     "slideshow": {
130 |      "slide_type": "subslide"
131 |     }
132 |    },
133 |    "source": [
134 |     "## Why not Virtual Machines?\n",
135 |     "\n",
136 |     "* A computer simulated in software\n",
137 |     "* Kind of slow\n",
138 |     "* Pretty big\n",
139 |     "* Takes time to provision, bring down, resume, etc.\n",
140 |     "* The ~\\*~\\*~ cloud ~\\*~\\*~\n",
141 |     "  * Cheap and easy to provision new machines\n",
142 |     "  * Happens more often\n",
143 |     "  * Services start to be spread across hosts"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {
149 |     "slideshow": {
150 |      "slide_type": "subslide"
151 |     }
152 |    },
153 |    "source": [
154 |     "# History of OS-level Virtualization\n",
155 |     "\n",
156 |     "* chroot (1979)\n",
157 |     "  - Change the apparent root directory for the current running process and its children\n",
158 |     "* namespaces (2002)\n",
159 |     "* Solaris Containers \"chroot on steroids\" (2004)\n",
160 |     "* control groups (google, 2006)\n",
161 |     "  * LXC (2009)\n",
162 |     "* copy-on-write\n",
163 |     "* Linux Containers (LXC, 2008)\n",
164 |     "* Docker (2013)"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "markdown",
169 |    "metadata": {
170 |     "slideshow": {
171 |      "slide_type": "subslide"
172 |     }
173 |    },
174 |    "source": [
175 |     "## What Makes Up a Container\n",
176 |     "\n",
177 |     "* Control groups\n",
178 |     "* Namespaces\n",
179 |     "* copy-on-write storage"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "markdown",
184 |    "metadata": {
185 |     "slideshow": {
186 |      "slide_type": "subslide"
187 |     }
188 |    },
189 |    "source": [
190 |     "# True Confessions\n",
191 |     "\n",
192 |     "![Containers How Do They Work](container_tweet.png)\n"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {
198 |     "slideshow": {
199 |      "slide_type": "slide"
200 |     }
201 |    },
202 |    "source": [
203 |     "# Linux Nuts and Bolts\n"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "markdown",
208 |    "metadata": {
209 |     "slideshow": {
210 |      "slide_type": "subslide"
211 |     }
212 |    },
213 |    "source": [
214 |     "## Linux Process Model\n",
215 |     "\n",
216 |     "* A process, or *task*, is an executing instance of a program\n",
217 |     "* New processes are created (cloned) by the system call *fork*\n",
218 |     "  * This *copies* the current process and creates a child process with a link to the current parent process\n",
219 |     "  * The address space comes along / only copied on modification\n",
220 |     "* Python exposes these OS primitives in `os` and `multiprocessing`, for example\n",
221 |     "* Processes cannot live in isolation\n",
222 |     "  * Every process has a parent (with one exception)"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "markdown",
227 |    "metadata": {
228 |     "slideshow": {
229 |      "slide_type": "subslide"
230 |     }
231 |    },
232 |    "source": [
233 |     "## Initialization Process\n",
234 |     "\n",
235 |     "* What happens when you boot up the linux operating system?\n",
236 |     "* The kernel finds the initialization process and starts it\n",
237 |     "  * Traditionally, **init** \n",
238 |     "  * Now, commonly, **systemd**\n",
239 |     "* Daemon running in the background \n",
240 |     "* Direct or indirect ancestor of all processes"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "code",
245 |    "execution_count": 1,
246 |    "metadata": {
247 |     "slideshow": {
248 |      "slide_type": "subslide"
249 |     }
250 |    },
251 |    "outputs": [
252 |     {
253 |      "name": "stdout",
254 |      "output_type": "stream",
255 |      "text": [
256 |       "systemd\r\n"
257 |      ]
258 |     }
259 |    ],
260 |    "source": [
261 |     "!ps -q 1 -o comm="
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "markdown",
266 |    "metadata": {
267 |     "slideshow": {
268 |      "slide_type": "subslide"
269 |     }
270 |    },
271 |    "source": [
272 |     "![pstree](pstree.png)"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "markdown",
277 |    "metadata": {
278 |     "slideshow": {
279 |      "slide_type": "slide"
280 |     }
281 |    },
282 |    "source": [
283 |     "# Control Groups"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "markdown",
288 |    "metadata": {
289 |     "slideshow": {
290 |      "slide_type": "subslide"
291 |     }
292 |    },
293 |    "source": [
294 |     "## What are cgroups\n",
295 |     "\n",
296 |     "<div style=\"font-size: 90%; line-height: 115%\">\n",
297 |     "<br /><br />\n",
298 |     "\"Control Groups provide a mechanism for aggregating / partitioning sets of\n",
299 |     "tasks, and all their future children, into hierarchical groups with\n",
300 |     "specialized behaviour.\"\n",
301 |     "</div>\n",
302 |     "\n",
303 |     "* Allow allocation of resources among processes\n",
304 |     "* Includes metering, limiting, and accounting for resources\n",
305 |     "* Similar to processes \n",
306 |     "  * hierarchical\n",
307 |     "  * inherit from parent cgroups\n",
308 |     "  * *But* many different ones can exist simultaneously"
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "markdown",
313 |    "metadata": {
314 |     "slideshow": {
315 |      "slide_type": "subslide"
316 |     }
317 |    },
318 |    "source": [
319 |     "## University Server Example\n",
320 |     "\n",
321 |     "<div style=\"font-size: 75%\">\n",
322 |     "```\n",
323 |     "CPU :          \"Top cpuset\"\n",
324 |     "                 /       \\\n",
325 |     "         CPUSet1         CPUSet2\n",
326 |     "            |               |\n",
327 |     "         (Professors)    (Students)\n",
328 |     "\n",
329 |     "         In addition (system tasks) are attached to topcpuset (so\n",
330 |     "         that they can run anywhere) with a limit of 20%\n",
331 |     "\n",
332 |     "Memory : Professors (50%), Students (30%), system (20%)\n",
333 |     "\n",
334 |     "Disk : Professors (50%), Students (30%), system (20%)\n",
335 |     "\n",
336 |     "Network : WWW browsing (20%), Network File System (60%), others (20%)\n",
337 |     "                        / \\\n",
338 |     "        Professors (15%)  students (5%)\n",
339 |     "```\n",
340 |     "</div>"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "markdown",
345 |    "metadata": {
346 |     "slideshow": {
347 |      "slide_type": "subslide"
348 |     }
349 |    },
350 |    "source": [
351 |     "## cgroup subsystems\n",
352 |     "\n",
353 |     "* The cgroup hierarchies are connected to one or more **subsystems**\n",
354 |     "* blkio\n",
355 |     "* cpu / cpuset\n",
356 |     "* devices\n",
357 |     "* memory\n",
358 |     "* net_cls / net_prio\n",
359 |     "* ..."
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "markdown",
364 |    "metadata": {
365 |     "slideshow": {
366 |      "slide_type": "subslide"
367 |     }
368 |    },
369 |    "source": [
370 |     "## cpu cgroup\n",
371 |     "\n",
372 |     "* group processes together\n",
373 |     "* you can set weights per cgroup that OS scheduler takes into account\n",
374 |     "* can't set limits\n",
375 |     "  * It doesn't make sense\n",
376 |     "  * CPU architecture (different registers, different instructions, doesn't make sense)"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "markdown",
381 |    "metadata": {
382 |     "slideshow": {
383 |      "slide_type": "subslide"
384 |     }
385 |    },
386 |    "source": [
387 |     "## cpuset cgroup\n",
388 |     "\n",
389 |     "* processor affinity\n",
390 |     "* pin groups to specific CPUS\n",
391 |     "* reserve CPUs for specific apps"
392 |    ]
393 |   },
394 |   {
395 |    "cell_type": "markdown",
396 |    "metadata": {
397 |     "slideshow": {
398 |      "slide_type": "subslide"
399 |     }
400 |    },
401 |    "source": [
402 |     "## memory cgroup\n",
403 |     "\n",
404 |     "* limits are optional -- soft and hard limits\n",
405 |     "* soft limits are not enforced\n",
406 |     "  * when pressure is strong, it looks at the cgroups above the soft limit, then you get pages taken from you by the kernel\n",
407 |     "* limits can be set for different kinds of memory\n",
408 |     "  * physical (RAM), kernel (dentries), total (SWAP)\n",
409 |     "* hard limit -- process gets killed on the cgroup level\n",
410 |     "  * it kills the process in this container\n",
411 |     "  * this is why you want to have one service per-container"
412 |    ]
413 |   },
414 |   {
415 |    "cell_type": "markdown",
416 |    "metadata": {
417 |     "slideshow": {
418 |      "slide_type": "subslide"
419 |     }
420 |    },
421 |    "source": [
422 |     "## blkio cgroup\n",
423 |     "\n",
424 |     "* keeps track of IO for ea. grou\n",
425 |     "* per block devices\n",
426 |     "* read vs write\n",
427 |     "* sync vs async\n",
428 |     "* set throttle (limits) for each group\n",
429 |     "* set relative weights for each group"
430 |    ]
431 |   },
432 |   {
433 |    "cell_type": "markdown",
434 |    "metadata": {
435 |     "slideshow": {
436 |      "slide_type": "subslide"
437 |     }
438 |    },
439 |    "source": [
440 |     "## net_cls and net_prio cgroup\n",
441 |     "\n",
442 |     "* net_cls allows tagging of network packets with their origin cgroup\n",
443 |     "* net_prio allows setting the priority of cgroups for different network interfaces"
444 |    ]
445 |   },
446 |   {
447 |    "cell_type": "markdown",
448 |    "metadata": {
449 |     "slideshow": {
450 |      "slide_type": "subslide"
451 |     }
452 |    },
453 |    "source": [
454 |     "## devices cgroup\n",
455 |     "\n",
456 |     "* What tasks can use what device\n",
457 |     "* Typically, things like\n",
458 |     "  * /dev/{tty,zero,random,null}\n",
459 |     "  * /dev/net/tun\n",
460 |     "  * /dev/fuse\n",
461 |     "  * /dev/dri (GPU)"
462 |    ]
463 |   },
464 |   {
465 |    "cell_type": "markdown",
466 |    "metadata": {
467 |     "slideshow": {
468 |      "slide_type": "subslide"
469 |     }
470 |    },
471 |    "source": [
472 |     "## freezer cgroup\n",
473 |     "\n",
474 |     "* Like SIGSTOP on the container\n",
475 |     "* freeze/thaw a group of processes\n",
476 |     "* process migration\n",
477 |     "* cluster batch scheduling and process migration"
478 |    ]
479 |   },
480 |   {
481 |    "cell_type": "markdown",
482 |    "metadata": {
483 |     "slideshow": {
484 |      "slide_type": "slide"
485 |     }
486 |    },
487 |    "source": [
488 |     "# Linux Namespaces"
489 |    ]
490 |   },
491 |   {
492 |    "cell_type": "markdown",
493 |    "metadata": {
494 |     "slideshow": {
495 |      "slide_type": "subslide"
496 |     }
497 |    },
498 |    "source": [
499 |     "# What are namespaces?\n",
500 |     "\n",
501 |     "* If cgroups limit what you can use, namespaces limit what you can view\n",
502 |     "* Takes a global resource and makes it look like processes have their own\n",
503 |     "* Namespaces\n",
504 |     "  * pid (processes)\n",
505 |     "  * net (network stack)\n",
506 |     "  * mnt (filesystem and mount points)\n",
507 |     "  * uts (hostname)\n",
508 |     "  * ipc (interprocess communication)\n",
509 |     "  * user (user)\n",
510 |     "* each process is in one namespace of each type"
511 |    ]
512 |   },
513 |   {
514 |    "cell_type": "markdown",
515 |    "metadata": {
516 |     "slideshow": {
517 |      "slide_type": "subslide"
518 |     }
519 |    },
520 |    "source": [
521 |     "## pid namespace\n",
522 |     "\n",
523 |     "* see only other process in your pid namespace\n",
524 |     "* pid in and outside of the container"
525 |    ]
526 |   },
527 |   {
528 |    "cell_type": "markdown",
529 |    "metadata": {
530 |     "slideshow": {
531 |      "slide_type": "subslide"
532 |     }
533 |    },
534 |    "source": [
535 |     "## network namespace\n",
536 |     "\n",
537 |     "* processes within a given network namespace get their own private network stack, including\n",
538 |     "  * network interfaces (including lo)\n",
539 |     "  * routing tables\n",
540 |     "  * iptables routes\n",
541 |     "  * sockets (ss, netstate)\n",
542 |     "* you can move a network interface across netns\n",
543 |     "  * have a container that sets up a vpn connection and then moves it across containers"
544 |    ]
545 |   },
546 |   {
547 |    "cell_type": "markdown",
548 |    "metadata": {
549 |     "slideshow": {
550 |      "slide_type": "subslide"
551 |     }
552 |    },
553 |    "source": [
554 |     "## mnt namespace \n",
555 |     "\n",
556 |     "* Processes can have their own root fs\n",
557 |     "* Processes also have \"private\" mounts\n",
558 |     "  * /tmp (scoped per user, per service)\n",
559 |     "* Mounts can be private or shared\n",
560 |     "* Can't pass a mount from a namespace to another"
561 |    ]
562 |   },
563 |   {
564 |    "cell_type": "markdown",
565 |    "metadata": {
566 |     "slideshow": {
567 |      "slide_type": "subslide"
568 |     }
569 |    },
570 |    "source": [
571 |     "## uts namespace\n",
572 |     "\n",
573 |     "* can have your own hostname\n",
574 |     "* isolating kernel and version identifiers"
575 |    ]
576 |   },
577 |   {
578 |    "cell_type": "markdown",
579 |    "metadata": {
580 |     "slideshow": {
581 |      "slide_type": "subslide"
582 |     }
583 |    },
584 |    "source": [
585 |     "## ipc namespace\n",
586 |     "\n",
587 |     "* System V and posix IPC\n",
588 |     "* allows a process to have its own \n",
589 |     "  * IPC semaphores\n",
590 |     "  * IPC message queues\n",
591 |     "  * IPC shared memory\n",
592 |     "* without risk of conflict with other instances"
593 |    ]
594 |   },
595 |   {
596 |    "cell_type": "markdown",
597 |    "metadata": {
598 |     "slideshow": {
599 |      "slide_type": "subslide"
600 |     }
601 |    },
602 |    "source": [
603 |     "## user namespace \n",
604 |     "\n",
605 |     "* map UID/GID inside the container to outside\n",
606 |     "* This is as big topic\n",
607 |     "  * Only recently added to Docker\n",
608 |     "* UIDs 0-1999 in the container are mapped to 10000->11999 on host, etc.\n",
609 |     "* UID in containers becomes irrelevant. Just use UID 0 in the container\n",
610 |     "* It gets squashed to a non-privileged user outside\n",
611 |     "  * Volumes *gotcha*"
612 |    ]
613 |   },
614 |   {
615 |    "cell_type": "markdown",
616 |    "metadata": {
617 |     "slideshow": {
618 |      "slide_type": "slide"
619 |     }
620 |    },
621 |    "source": [
622 |     "# Union Filesystem"
623 |    ]
624 |   },
625 |   {
626 |    "cell_type": "markdown",
627 |    "metadata": {
628 |     "slideshow": {
629 |      "slide_type": "subslide"
630 |     }
631 |    },
632 |    "source": [
633 |     "# What is a Union FS?\n",
634 |     "\n",
635 |     "* This is what makes containers lightweight\n",
636 |     "* Allows different parts of a filesystem to be overlaid as transparent layers\n",
637 |     "* Create a new container instantly instead of copying the whole filesystem\n",
638 |     "* Storage drive keeps track of what has changed\n",
639 |     "* Options\n",
640 |     "  * AUFS, overlay (file level)\n",
641 |     "  * device mapper (block level)\n",
642 |     "  * BTRFS, ZFS (filesystem level)"
643 |    ]
644 |   },
645 |   {
646 |    "cell_type": "markdown",
647 |    "metadata": {
648 |     "slideshow": {
649 |      "slide_type": "slide"
650 |     }
651 |    },
652 |    "source": [
653 |     "# Docker"
654 |    ]
655 |   },
656 |   {
657 |    "cell_type": "markdown",
658 |    "metadata": {
659 |     "slideshow": {
660 |      "slide_type": "subslide"
661 |     }
662 |    },
663 |    "source": [
664 |     "## What is Docker\n",
665 |     "\n",
666 |     "\n",
667 |     "* Docker is a platform that provides abstractions for working with containers and a container runtime\n",
668 |     "* It is not the only way to manage software containers (!)"
669 |    ]
670 |   },
671 |   {
672 |    "cell_type": "markdown",
673 |    "metadata": {
674 |     "slideshow": {
675 |      "slide_type": "subslide"
676 |     }
677 |    },
678 |    "source": [
679 |     "# Docker Architecture\n",
680 |     "\n",
681 |     "![docker architecture](https://docs.docker.com/engine/article-img/architecture.svg)"
682 |    ]
683 |   },
684 |   {
685 |    "cell_type": "markdown",
686 |    "metadata": {
687 |     "slideshow": {
688 |      "slide_type": "subslide"
689 |     }
690 |    },
691 |    "source": [
692 |     "# Docker Images\n",
693 |     "\n",
694 |     "* Read-only template from which containers are instantiated\n",
695 |     "* Images consist of *layers*\n",
696 |     "  * these layers can be shared\n",
697 |     "* The [Union file system](https://en.wikipedia.org/wiki/UnionFS) combines the layers into an image\n",
698 |     "* The image layers are part of what makes docker lightweight\n",
699 |     "* Updating one layer does not need to update other layers"
700 |    ]
701 |   },
702 |   {
703 |    "cell_type": "markdown",
704 |    "metadata": {
705 |     "slideshow": {
706 |      "slide_type": "subslide"
707 |     }
708 |    },
709 |    "source": [
710 |     "## Dockerfile\n",
711 |     "\n",
712 |     "\n",
713 |     "<div style=\"font-size:75%\"><br />\n",
714 |     "```\n",
715 |     "FROM continuumio/miniconda3:4.1.11\n",
716 |     "MAINTAINER <jsseabold>\n",
717 |     "\n",
718 |     "RUN conda update -y conda && \\\n",
719 |     "    conda install -y -c conda-forge -c defaults --show-channel-urls --override-channels \\\n",
720 |     "    conda-build\n",
721 |     "\n",
722 |     "COPY requirements.txt /bootstrap/requirements.txt\n",
723 |     "\n",
724 |     "RUN conda install -y -c conda-forge -c defaults --file \\\n",
725 |     "    /bootstrap/requirements.txt && \\\n",
726 |     "    conda install -c damianavila82 rise && \\\n",
727 |     "    conda clean -tipsy\n",
728 |     "    \n",
729 |     "RUN pip install --user graphviz\n",
730 |     "\n",
731 |     "RUN jupyter nbextension enable --py widgetsnbexdtension && \\\n",
732 |     "    jupyter nbextension install --py rise && \\\n",
733 |     "    jupyter nbextension enable --py rise\n",
734 |     "    \n",
735 |     "EXPOSE 8888\n",
736 |     "\n",
737 |     "ENTRYPOINT [\"bash\", \"-c\", \"jupyter-notebook --no-browser --ip='*'\"]\n",
738 |     "```\n",
739 |     "</div>"
740 |    ]
741 |   },
742 |   {
743 |    "cell_type": "markdown",
744 |    "metadata": {
745 |     "slideshow": {
746 |      "slide_type": "subslide"
747 |     }
748 |    },
749 |    "source": [
750 |     "## FROM\n",
751 |     "\n",
752 |     "```\n",
753 |     "FROM continuumio/miniconda3:4.1.11\n",
754 |     "```\n",
755 |     "\n",
756 |     "* Every Dockerfile needs to start with a `FROM` instruction\n",
757 |     "* Specifies the *base image* and *tag*\n",
758 |     "* Common examples: `ubuntu:16.04`, `debian:jessie`\n",
759 |     "  * debian is recommended as a best practice\n",
760 |     "* docker maintains a list of [Official Repositories](https://hub.docker.com/explore/)"
761 |    ]
762 |   },
763 |   {
764 |    "cell_type": "markdown",
765 |    "metadata": {
766 |     "slideshow": {
767 |      "slide_type": "subslide"
768 |     }
769 |    },
770 |    "source": [
771 |     "## RUN\n",
772 |     "\n",
773 |     "```\n",
774 |     "RUN conda update -y conda && \\\n",
775 |     "    conda install -y -c conda-forge \\\n",
776 |     "                  -c defaults \\\n",
777 |     "                  --show-channel-urls \\\n",
778 |     "                  --override-channels \\\n",
779 |     "    conda-build\n",
780 |     "```\n",
781 |     "\n",
782 |     "* The RUN instruction will execute any commands in a new layer on top of the current image and commit the results\n",
783 |     "* The resulting committed image will be used for the next step in the Dockerfile"
784 |    ]
785 |   },
786 |   {
787 |    "cell_type": "markdown",
788 |    "metadata": {
789 |     "slideshow": {
790 |      "slide_type": "subslide"
791 |     }
792 |    },
793 |    "source": [
794 |     "## RUN\n",
795 |     "\n",
796 |     "* Two forms\n",
797 |     "  * shell form runs the command in a shell `/bin/sh -c`\n",
798 |     "```\n",
799 |     "RUN <command>\n",
800 |     "```\n",
801 |     "  * exec mode\n",
802 |     "```\n",
803 |     "RUN [\"executable\", \"param1\", \"param2\"]\n",
804 |     "```\n",
805 |     "\n",
806 |     "* Since each instruction is a layers, you want to group commands (and do cleanup)"
807 |    ]
808 |   },
809 |   {
810 |    "cell_type": "markdown",
811 |    "metadata": {
812 |     "slideshow": {
813 |      "slide_type": "subslide"
814 |     }
815 |    },
816 |    "source": [
817 |     "## COPY / ADD\n",
818 |     "\n",
819 |     "```\n",
820 |     "COPY requirements.txt /bootstrap/requirements.txt\n",
821 |     "```\n",
822 |     "\n",
823 |     "* The COPY instruction copies new files from the build context and adds them to the filesystem of the container\n",
824 |     "* building an image takes place in a *build context*, most often the directory that contains the Dockerfile\n",
825 |     "* The files must be in the build context\n",
826 |     "  * `COPY ../something` is not valid\n",
827 |     "* ADD is similar to copy but has support for local-only tar extraction and remote URLs\n",
828 |     "* COPY is preferred\n"
829 |    ]
830 |   },
831 |   {
832 |    "cell_type": "markdown",
833 |    "metadata": {
834 |     "slideshow": {
835 |      "slide_type": "subslide"
836 |     }
837 |    },
838 |    "source": [
839 |     "## EXPOSE\n",
840 |     "\n",
841 |     "```\n",
842 |     "EXPOSE 8888\n",
843 |     "```\n",
844 |     "\n",
845 |     "* The EXPOSE instruction informs the container to listen on the specified port\n",
846 |     "* You must use the `--publish` flag to `docker run` to make these ports accessible to the host"
847 |    ]
848 |   },
849 |   {
850 |    "cell_type": "markdown",
851 |    "metadata": {
852 |     "slideshow": {
853 |      "slide_type": "subslide"
854 |     }
855 |    },
856 |    "source": [
857 |     "## CMD / ENTRYPOINT\n",
858 |     "\n",
859 |     "* This is what is executed when you run the container\n",
860 |     "* [Understand how CMD and ENTRYPOINT Interact](https://docs.docker.com/engine/reference/builder/#/understand-how-cmd-and-entrypoint-interact)\n",
861 |     "* Specify at least one\n",
862 |     "  * ENTRYPOINT to treat the container like an executable\n",
863 |     "  * CMD for default arguments to ENTRYPOINT"
864 |    ]
865 |   },
866 |   {
867 |    "cell_type": "markdown",
868 |    "metadata": {
869 |     "slideshow": {
870 |      "slide_type": "subslide"
871 |     }
872 |    },
873 |    "source": [
874 |     "## VOLUME\n",
875 |     "\n",
876 |     "* Docker volumes are a big topic\n",
877 |     "* Launching a container, we have a series of read-only layers with a read-write layer mounted last\n",
878 |     "* When you make changes to a file, that file is copied, but the underlying file exists still in the image\n",
879 |     "* Practically, this means that changes do not persist when you delete a container\n",
880 |     "* *Docker Volumes* exist outside the UFS\n",
881 |     "* You can mount from the host to the container outside the UFS, using the `--volume` flag for `docker run`"
882 |    ]
883 |   },
884 |   {
885 |    "cell_type": "markdown",
886 |    "metadata": {
887 |     "slideshow": {
888 |      "slide_type": "slide"
889 |     }
890 |    },
891 |    "source": [
892 |     "# Putting It All Together"
893 |    ]
894 |   },
895 |   {
896 |    "cell_type": "markdown",
897 |    "metadata": {
898 |     "slideshow": {
899 |      "slide_type": "subslide"
900 |     }
901 |    },
902 |    "source": [
903 |     "## Docker Compose\n",
904 |     "\n",
905 |     "* A tool for building more complex, multi-container applications\n",
906 |     "* Use a single-command to spin up this applications"
907 |    ]
908 |   },
909 |   {
910 |    "cell_type": "markdown",
911 |    "metadata": {
912 |     "slideshow": {
913 |      "slide_type": "subslide"
914 |     }
915 |    },
916 |    "source": [
917 |     "## Dask Distributed\n",
918 |     "\n",
919 |     "* Dask-Distributed defined using a docker-compose file\n",
920 |     "\n",
921 |     "```\n",
922 |     "docker-compose --project-name dask-distributed \\\n",
923 |     "               up -d\n",
924 |     "docker-compose -p dask-distributed scale dask-worker=4\n",
925 |     "```\n",
926 |     "\n",
927 |     "* Very useful for proto-typing running dask applications in a truly distributed environment"
928 |    ]
929 |   },
930 |   {
931 |    "cell_type": "markdown",
932 |    "metadata": {
933 |     "slideshow": {
934 |      "slide_type": "subslide"
935 |     }
936 |    },
937 |    "source": [
938 |     "# Python Tools\n",
939 |     "\n",
940 |     "* [nsenter](https://github.com/zalando/python-nsenter)\n",
941 |     "  * Enter namespaces with a context manager\n",
942 |     "* [docker-py](https://github.com/docker/docker-py)\n",
943 |     "  * Python docker client"
944 |    ]
945 |   },
946 |   {
947 |    "cell_type": "markdown",
948 |    "metadata": {
949 |     "slideshow": {
950 |      "slide_type": "slide"
951 |     }
952 |    },
953 |    "source": [
954 |     "# Resources\n",
955 |     "\n",
956 |     "* [Redhat's Resource Management Guide](https://access.redhat.com/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/Resource_Management_Guide/index.html)\n",
957 |     "* [Kernel Documentation for cgroups v1](https://www.kernel.org/doc/Documentation/cgroup-v1/)\n",
958 |     "* [cgroups, namespaces, and beyond: what are containers made from](https://www.youtube.com/watch?v=sK5i-N34im8)\n",
959 |     "* [Deep dive into Docker storage drivers](https://jpetazzo.github.io/assets/2015-03-03-not-so-deep-dive-into-docker-storage-drivers.html#1)\n",
960 |     "* [namespace man page](http://man7.org/linux/man-pages/man7/namespaces.7.html)\n",
961 |     "* [Docker documentation](https://docs.docker.com)\n",
962 |     "* [Best practices for writing Dockerfiles](https://docs.docker.com/engine/userguide/eng-image/dockerfile_best-practices/)\n",
963 |     "* [You Could Have Invented Containers: An Explanatory Fantasy](https://medium.com/@gtrevorjay/you-could-have-invented-container-runtimes-an-explanatory-fantasy-764c5b389bd3#.svjwa71rv)"
964 |    ]
965 |   }
966 |  ],
967 |  "metadata": {
968 |   "celltoolbar": "Slideshow",
969 |   "kernelspec": {
970 |    "display_name": "Python 3",
971 |    "language": "python",
972 |    "name": "python3"
973 |   },
974 |   "language_info": {
975 |    "codemirror_mode": {
976 |     "name": "ipython",
977 |     "version": 3
978 |    },
979 |    "file_extension": ".py",
980 |    "mimetype": "text/x-python",
981 |    "name": "python",
982 |    "nbconvert_exporter": "python",
983 |    "pygments_lexer": "ipython3",
984 |    "version": "3.7.6"
985 |   }
986 |  },
987 |  "nbformat": 4,
988 |  "nbformat_minor": 1
989 | }
990 | 


--------------------------------------------------------------------------------
/notebooks/pstree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jseabold/pydata-chi-docker/369f9cf93dccbea3d1d31118cbe33934b46f4de5/notebooks/pstree.png


--------------------------------------------------------------------------------
/up.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | 
3 | docker-compose -p dask-distributed up -d
4 | docker-compose -p dask-distributed scale dask-worker=4
5 | 


--------------------------------------------------------------------------------