├── .gitignore
├── README.md
├── docker-compose.yml
├── down.sh
├── get_and_clean_data.py
├── notebooks
│   ├── container_tweet.png
│   ├── dask-dataframe.ipynb
│   ├── data
│   │   └── .gitkeep
│   ├── docker-for-data-science.ipynb
│   └── pstree.png
└── up.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | Food_Inspections.csv
3 | notebooks/data/food*
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction to Docker (for Data Scientists)
2 |
3 | This repository contains notebooks and code for my talk at the PyData Chicago Meetup on November 7, 2016.
4 |
5 | This talk
6 |
7 | * introduces software containers and why they are useful to data scientists
8 | * takes a look at the parts of the Linux kernel that make software containers possible
9 | * introduces Docker as a platform for using software containers
10 | * shows an example of running dask.distributed in separate containers on a single host
11 |
12 | Run these notebooks in a container:
13 |
14 | ```shell
15 | docker run -d \
16 | -p 8888:8888 \
17 |     -v $(pwd)/notebooks:/notebooks \
18 | -w /notebooks \
19 | jseabold/dask-jupyter \
20 | bash -c "jupyter notebook --no-browser --ip='*'"
21 | ```
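
To run the full dask.distributed demo (Jupyter, the dask scheduler, and a set
of workers in separate containers), use the helper scripts, which wrap
`docker-compose`:

```shell
./up.sh    # docker-compose up -d, then scale dask-worker=4
./down.sh  # stop and remove the demo containers
```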
22 |
23 | The images are built from my [dockerfiles repo](https://github.com/jseabold/dockerfiles).
24 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "2"
2 | services:
3 | jupyter:
4 | image: jseabold/dask-jupyter:latest
5 | command: ["bash", "-c", "jupyter notebook --no-browser --ip='*'"]
6 | volumes:
7 | - ./notebooks:/notebooks
8 | working_dir: /notebooks
9 | ports:
10 | - "8888:8888"
11 | networks:
12 | - distributed
13 | links:
14 | - dask-scheduler
15 | container_name: jupyter
16 |
17 | dask-scheduler:
18 | image: jseabold/dask-scheduler:latest
19 | ports:
20 | - "8786:8786"
21 | - "9786:9786"
22 | - "8787:8787"
23 | command: ["bash", "-c", "dask-scheduler --host dask-scheduler"]
24 | networks:
25 | - distributed
26 | container_name: dask-scheduler
27 |
28 | dask-worker:
29 | image: jseabold/dask-worker:latest
30 | command: ["bash", "-c", "dask-worker dask-scheduler:8786 --nthreads 1 --nprocs 1"]
31 | mem_limit: 1500m
32 | volumes:
33 | - ./notebooks:/notebooks
34 | networks:
35 | - distributed
36 | links:
37 | - dask-scheduler
38 |
39 | networks:
40 | distributed:
41 | driver: bridge
42 |
--------------------------------------------------------------------------------
/down.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | docker-compose -p dask-distributed stop
4 | docker-compose -p dask-distributed rm -f
5 |
--------------------------------------------------------------------------------
/get_and_clean_data.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import requests
3 |
4 | url = ("https://data.cityofchicago.org/api/views/4ijn-s7e5/rows.csv?"
5 | "accessType=DOWNLOAD")
6 |
7 | response = requests.get(url, stream=True)
8 |
9 | with open("Food_Inspections.csv", "wb") as fout:
10 | for chunk in response.iter_content(32 * 1024):
11 | fout.write(chunk)
12 |
13 | df = pd.read_csv("Food_Inspections.csv", parse_dates=['Inspection Date'])
14 |
15 | for year, group in df.groupby(df['Inspection Date'].dt.year):
16 | group.to_csv("notebooks/data/food_inspection_{}.csv.gz".format(year),
17 | compression='gzip', index=False)
18 |
--------------------------------------------------------------------------------
/notebooks/container_tweet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jseabold/pydata-chi-docker/369f9cf93dccbea3d1d31118cbe33934b46f4de5/notebooks/container_tweet.png
--------------------------------------------------------------------------------
/notebooks/dask-dataframe.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "[Bokeh Web UI](http://localhost:8787)"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [],
17 | "source": [
18 | "from pprint import pprint\n",
19 | "\n",
20 | "from dask.distributed import Client\n",
21 | "client = Client('dask-scheduler:8786')"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {
28 | "collapsed": false
29 | },
30 | "outputs": [],
31 | "source": [
32 | "client"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {
39 | "collapsed": false
40 | },
41 | "outputs": [],
42 | "source": [
43 | "pprint(client.scheduler_info())"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {
50 | "collapsed": false
51 | },
52 | "outputs": [],
53 | "source": [
54 | "client.list_datasets()"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "# Dask DataFrame"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {
68 | "collapsed": true
69 | },
70 | "outputs": [],
71 | "source": [
72 | "import dask.dataframe as dd"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {
79 | "collapsed": false
80 | },
81 | "outputs": [],
82 | "source": [
83 | "!ls data/*.csv.gz"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [],
93 | "source": [
94 | "df = dd.read_csv(\"/notebooks/data/food_inspection_201*\", \n",
95 | " blocksize=None, \n",
96 | " compression='gzip', \n",
97 | " parse_dates=['Inspection Date'])"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {
104 | "collapsed": false
105 | },
106 | "outputs": [],
107 | "source": [
108 | "df"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {
115 | "collapsed": false
116 | },
117 | "outputs": [],
118 | "source": [
119 | "df.head()"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {
126 | "collapsed": true
127 | },
128 | "outputs": [],
129 | "source": [
130 | "failed = df[df.Results == 'Fail']"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {
137 | "collapsed": false
138 | },
139 | "outputs": [],
140 | "source": [
141 | "years = failed.year.unique()\n",
142 | "years = years.compute()\n",
143 | "years = years.values.tolist()"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {
150 | "collapsed": false
151 | },
152 | "outputs": [],
153 | "source": [
154 | "divisions = years + [2016]\n",
155 | "divisions"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {
162 | "collapsed": false
163 | },
164 | "outputs": [],
165 | "source": [
166 | "failed = failed.set_partition('year', divisions=divisions)"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {
173 | "collapsed": false
174 | },
175 | "outputs": [],
176 | "source": [
177 | "failed.known_divisions"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {
184 | "collapsed": false
185 | },
186 | "outputs": [],
187 | "source": [
188 | "failed.head()"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {
195 | "collapsed": true
196 | },
197 | "outputs": [],
198 | "source": [
199 | "def violators(df, n=3):\n",
200 | " size = lambda x: len(x)\n",
201 | " return (df.groupby('Address').\n",
202 | " apply(lambda x: len(x)).\n",
203 | " nlargest(n=n).\n",
204 | " to_frame(name='violations').\n",
205 | " reset_index())"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {
212 | "collapsed": false
213 | },
214 | "outputs": [],
215 | "source": [
216 | "result = (failed.groupby(failed.index).\n",
217 | " apply(violators, \n",
218 | " meta=[('Address', object), \n",
219 | " ('Violations', int)]))\n",
220 | "result"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {
227 | "collapsed": true
228 | },
229 | "outputs": [],
230 | "source": [
231 | "worst = result.compute()"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "metadata": {
238 | "collapsed": false
239 | },
240 | "outputs": [],
241 | "source": [
242 | "worst.index.names = ['Year', 'Offender']"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {
249 | "collapsed": false
250 | },
251 | "outputs": [],
252 | "source": [
253 | "worst.sort_index(level=0)"
254 | ]
255 | }
256 | ],
257 | "metadata": {
258 | "kernelspec": {
259 | "display_name": "Python 3",
260 | "language": "python",
261 | "name": "python3"
262 | },
263 | "language_info": {
264 | "codemirror_mode": {
265 | "name": "ipython",
266 | "version": 3
267 | },
268 | "file_extension": ".py",
269 | "mimetype": "text/x-python",
270 | "name": "python",
271 | "nbconvert_exporter": "python",
272 | "pygments_lexer": "ipython3",
273 | "version": "3.5.2"
274 | }
275 | },
276 | "nbformat": 4,
277 | "nbformat_minor": 1
278 | }
279 |
--------------------------------------------------------------------------------
/notebooks/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jseabold/pydata-chi-docker/369f9cf93dccbea3d1d31118cbe33934b46f4de5/notebooks/data/.gitkeep
--------------------------------------------------------------------------------
/notebooks/docker-for-data-science.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 9,
6 | "metadata": {
7 | "slideshow": {
8 | "slide_type": "skip"
9 | }
10 | },
11 | "outputs": [
12 | {
13 | "data": {
14 | "text/plain": [
15 | "{'height': 800, 'width': 1200}"
16 | ]
17 | },
18 | "execution_count": 9,
19 | "metadata": {},
20 | "output_type": "execute_result"
21 | }
22 | ],
23 | "source": [
24 | "from traitlets.config.manager import BaseJSONConfigManager\n",
25 | "path = \"/.jupyter/nbconfig\"\n",
26 | "cm = BaseJSONConfigManager(config_dir=path)\n",
27 | "\n",
28 | "cm.update('livereveal', {\n",
29 | " 'width': 1200,\n",
30 | " 'height': 800,\n",
31 | "})"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {
37 | "slideshow": {
38 | "slide_type": "slide"
39 | }
40 | },
41 | "source": [
42 | "# About Me\n",
43 | "\n",
44 | "* Lead Data Scientist at [Civis Analytics](https://civisanalytics.com/)\n",
45 | "* [@jseabold](https://twitter.com/jseabold/) on Twitter\n",
46 | "* [jseabold](https://github.com/jseabold) on GitHub\n",
47 | "* [https://github.com/jseabold/pydata-chi-docker](https://github.com/jseabold/pydata-chi-docker)"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {
53 | "slideshow": {
54 | "slide_type": "slide"
55 | }
56 | },
57 | "source": [
58 | "# Docker for Data Science"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {
64 | "slideshow": {
65 | "slide_type": "subslide"
66 | }
67 | },
68 | "source": [
69 | "## Docker Introduction\n",
70 | "\n",
71 | "* Docker is a platform for running applications in software containers\n",
72 | "* Containers are an implementation of operating-system-level virtualization\n",
73 | "* Enabled by features in the Linux kernel"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {
79 | "slideshow": {
80 | "slide_type": "subslide"
81 | }
82 | },
83 | "source": [
84 | "## What does using Docker offer?\n",
85 | "\n",
86 | "* Reproducibility\n",
87 | "* Portability / working environments\n",
88 | "* Reduces need for complex installations\n",
89 | "* Easier testing / debugging\n",
90 | "* Resource management\n",
91 | "* Easier networking between services"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {
97 | "slideshow": {
98 | "slide_type": "subslide"
99 | }
100 | },
101 | "source": [
102 | "# That sounds like a Virtual Machine"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {
108 | "slideshow": {
109 | "slide_type": "subslide"
110 | }
111 | },
112 | "source": [
113 | "## Example Workflow\n",
114 | "\n",
115 | "```shell\n",
116 | "docker build -t jseabold/dask-jupyter .\n",
117 | "docker push jseabold/dask-jupyter\n",
118 | "docker run --detach \\\n",
119 | " --publish 8888:8888 \\\n",
120 | " --volume $(pwd)/notebooks:/notebooks \\\n",
121 | " --working-dir /notebooks \\\n",
122 | " jseabold/dask-jupyter\n",
123 | "```"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {
129 | "slideshow": {
130 | "slide_type": "subslide"
131 | }
132 | },
133 | "source": [
134 | "## Why not Virtual Machines?\n",
135 | "\n",
136 | "* A computer simulated in software\n",
137 | "* Kind of slow\n",
138 | "* Pretty big\n",
139 | "* Takes time to provision, bring down, resume, etc.\n",
140 | "* The ~\\*~\\*~ cloud ~\\*~\\*~\n",
141 | " * Cheap and easy to provision new machines\n",
142 | " * Happens more often\n",
143 | " * Services start to be spread across hosts"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {
149 | "slideshow": {
150 | "slide_type": "subslide"
151 | }
152 | },
153 | "source": [
154 | "# History of OS-level Virtualization\n",
155 | "\n",
156 | "* chroot (1979)\n",
157 | " - Change the apparent root directory for the current running process and its children\n",
158 | "* namespaces (2002)\n",
159 | "* Solaris Containers \"chroot on steroids\" (2004)\n",
160 | "* control groups (google, 2006)\n",
161 | " * LXC (2009)\n",
162 | "* copy-on-write\n",
163 | "* Linux Containers (LXC, 2008)\n",
164 | "* Docker (2013)"
165 | ]
166 | },
167 | {
168 | "cell_type": "markdown",
169 | "metadata": {
170 | "slideshow": {
171 | "slide_type": "subslide"
172 | }
173 | },
174 | "source": [
175 | "## What Makes Up a Container\n",
176 | "\n",
177 | "* Control groups\n",
178 | "* Namespaces\n",
179 | "* copy-on-write storage"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {
185 | "slideshow": {
186 | "slide_type": "subslide"
187 | }
188 | },
189 | "source": [
190 | "# True Confessions\n",
191 | "\n",
192 | "\n"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {
198 | "slideshow": {
199 | "slide_type": "slide"
200 | }
201 | },
202 | "source": [
203 | "# Linux Nuts and Bolts\n"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {
209 | "slideshow": {
210 | "slide_type": "subslide"
211 | }
212 | },
213 | "source": [
214 | "## Linux Process Model\n",
215 | "\n",
216 | "* A process, or *task*, is an executing instance of a program\n",
217 | "* New processes are created (cloned) by the system call *fork*\n",
218 | " * This *copies* the current process and creates a child process with a link to the current parent process\n",
219 | " * The address space comes along / only copied on modification\n",
220 | "* Python exposes these OS primitives in `os` and `multiprocessing`, for example\n",
221 | "* Processes cannot live in isolation\n",
222 | " * Every process has a parent (with one exception)"
223 | ]
224 | },
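  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "You can watch fork in action from a shell (a sketch; `$BASHPID` assumes bash, and `pstree` the psmisc package):\n",
    "\n",
    "```shell\n",
    "echo $$             # the pid of the current shell\n",
    "( echo $BASHPID )   # a subshell is a forked child, so it reports a new pid\n",
    "pstree -p $$        # the process tree rooted at this shell\n",
    "```"
   ]
  },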
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {
228 | "slideshow": {
229 | "slide_type": "subslide"
230 | }
231 | },
232 | "source": [
233 | "## Initialization Process\n",
234 | "\n",
235 | "* What happens when you boot up the linux operating system?\n",
236 | "* The kernel finds the initialization process and starts it\n",
237 | " * Traditionally, **init** \n",
238 | " * Now, commonly, **systemd**\n",
239 | "* Daemon running in the background \n",
240 | "* Direct or indirect ancestor of all processes"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 1,
246 | "metadata": {
247 | "slideshow": {
248 | "slide_type": "subslide"
249 | }
250 | },
251 | "outputs": [
252 | {
253 | "name": "stdout",
254 | "output_type": "stream",
255 | "text": [
256 | "systemd\r\n"
257 | ]
258 | }
259 | ],
260 | "source": [
261 | "!ps -q 1 -o comm="
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {
267 | "slideshow": {
268 | "slide_type": "subslide"
269 | }
270 | },
271 | "source": [
272 | ""
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {
278 | "slideshow": {
279 | "slide_type": "slide"
280 | }
281 | },
282 | "source": [
283 | "# Control Groups"
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "metadata": {
289 | "slideshow": {
290 | "slide_type": "subslide"
291 | }
292 | },
293 | "source": [
294 | "## What are cgroups\n",
295 | "\n",
296 | "
\n",
297 | "
\n",
298 | "\"Control Groups provide a mechanism for aggregating / partitioning sets of\n",
299 | "tasks, and all their future children, into hierarchical groups with\n",
300 | "specialized behaviour.\"\n",
301 | "
\n",
302 | "\n",
303 | "* Allow allocation of resources among processes\n",
304 | "* Includes metering, limiting, and accounting for resources\n",
305 | "* Similar to processes \n",
306 | " * hierarchical\n",
307 | " * inherit from parent cgroups\n",
308 | " * *But* many different ones can exist simultaneously"
309 | ]
310 | },
311 | {
312 | "cell_type": "markdown",
313 | "metadata": {
314 | "slideshow": {
315 | "slide_type": "subslide"
316 | }
317 | },
318 | "source": [
319 | "## University Server Example\n",
320 | "\n",
321 | "\n",
322 | "```\n",
323 | "CPU : \"Top cpuset\"\n",
324 | " / \\\n",
325 | " CPUSet1 CPUSet2\n",
326 | " | |\n",
327 | " (Professors) (Students)\n",
328 | "\n",
329 | " In addition (system tasks) are attached to topcpuset (so\n",
330 | " that they can run anywhere) with a limit of 20%\n",
331 | "\n",
332 | "Memory : Professors (50%), Students (30%), system (20%)\n",
333 | "\n",
334 | "Disk : Professors (50%), Students (30%), system (20%)\n",
335 | "\n",
336 | "Network : WWW browsing (20%), Network File System (60%), others (20%)\n",
337 | " / \\\n",
338 | " Professors (15%) students (5%)\n",
339 | "```\n",
340 | "
"
341 | ]
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "metadata": {
346 | "slideshow": {
347 | "slide_type": "subslide"
348 | }
349 | },
350 | "source": [
351 | "## cgroup subsystems\n",
352 | "\n",
353 | "* The cgroup hierarchies are connected to one or more **subsystems**\n",
354 | "* blkio\n",
355 | "* cpu / cpuset\n",
356 | "* devices\n",
357 | "* memory\n",
358 | "* net_cls / net_prio\n",
359 | "* ..."
360 | ]
361 | },
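  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "A quick way to poke at these from a shell (a sketch; paths assume cgroup v1, as on a 2016-era kernel):\n",
    "\n",
    "```shell\n",
    "ls /sys/fs/cgroup        # one mounted hierarchy per subsystem\n",
    "cat /proc/self/cgroup    # which cgroups the current shell belongs to\n",
    "```"
   ]
  },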
362 | {
363 | "cell_type": "markdown",
364 | "metadata": {
365 | "slideshow": {
366 | "slide_type": "subslide"
367 | }
368 | },
369 | "source": [
370 | "## cpu cgroup\n",
371 | "\n",
372 | "* group processes together\n",
373 | "* you can set weights per cgroup that OS scheduler takes into account\n",
374 | "* can't set limits\n",
375 | " * It doesn't make sense\n",
376 | " * CPU architecture (different registers, different instructions, doesn't make sense)"
377 | ]
378 | },
379 | {
380 | "cell_type": "markdown",
381 | "metadata": {
382 | "slideshow": {
383 | "slide_type": "subslide"
384 | }
385 | },
386 | "source": [
387 | "## cpuset cgroup\n",
388 | "\n",
389 | "* processor affinity\n",
390 | "* pin groups to specific CPUS\n",
391 | "* reserve CPUs for specific apps"
392 | ]
393 | },
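  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "Docker exposes both of these cgroups directly as `docker run` flags; a sketch:\n",
    "\n",
    "```shell\n",
    "# cpu cgroup: a relative weight; cpuset cgroup: pin to CPUs 0 and 1\n",
    "docker run --rm --cpu-shares 512 --cpuset-cpus 0,1 ubuntu:16.04 nproc\n",
    "```"
   ]
  },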
394 | {
395 | "cell_type": "markdown",
396 | "metadata": {
397 | "slideshow": {
398 | "slide_type": "subslide"
399 | }
400 | },
401 | "source": [
402 | "## memory cgroup\n",
403 | "\n",
404 | "* limits are optional -- soft and hard limits\n",
405 | "* soft limits are not enforced\n",
406 | " * when pressure is strong, it looks at the cgroups above the soft limit, then you get pages taken from you by the kernel\n",
407 | "* limits can be set for different kinds of memory\n",
408 | " * physical (RAM), kernel (dentries), total (SWAP)\n",
409 | "* hard limit -- process gets killed on the cgroup level\n",
410 | " * it kills the process in this container\n",
411 | " * this is why you want to have one service per-container"
412 | ]
413 | },
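  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "This is what backs `docker run --memory` (and the `mem_limit` on the dask-worker service in this repo's docker-compose.yml); a sketch, assuming cgroup v1:\n",
    "\n",
    "```shell\n",
    "docker run --rm --memory 1500m ubuntu:16.04 \\\n",
    "    cat /sys/fs/cgroup/memory/memory.limit_in_bytes\n",
    "```"
   ]
  },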
414 | {
415 | "cell_type": "markdown",
416 | "metadata": {
417 | "slideshow": {
418 | "slide_type": "subslide"
419 | }
420 | },
421 | "source": [
422 | "## blkio cgroup\n",
423 | "\n",
424 | "* keeps track of IO for ea. grou\n",
425 | "* per block devices\n",
426 | "* read vs write\n",
427 | "* sync vs async\n",
428 | "* set throttle (limits) for each group\n",
429 | "* set relative weights for each group"
430 | ]
431 | },
432 | {
433 | "cell_type": "markdown",
434 | "metadata": {
435 | "slideshow": {
436 | "slide_type": "subslide"
437 | }
438 | },
439 | "source": [
440 | "## net_cls and net_prio cgroup\n",
441 | "\n",
442 | "* net_cls allows tagging of network packets with their origin cgroup\n",
443 | "* net_prio allows setting the priority of cgroups for different network interfaces"
444 | ]
445 | },
446 | {
447 | "cell_type": "markdown",
448 | "metadata": {
449 | "slideshow": {
450 | "slide_type": "subslide"
451 | }
452 | },
453 | "source": [
454 | "## devices cgroup\n",
455 | "\n",
456 | "* What tasks can use what device\n",
457 | "* Typically, things like\n",
458 | " * /dev/{tty,zero,random,null}\n",
459 | " * /dev/net/tun\n",
460 | " * /dev/fuse\n",
461 | " * /dev/dri (GPU)"
462 | ]
463 | },
464 | {
465 | "cell_type": "markdown",
466 | "metadata": {
467 | "slideshow": {
468 | "slide_type": "subslide"
469 | }
470 | },
471 | "source": [
472 | "## freezer cgroup\n",
473 | "\n",
474 | "* Like SIGSTOP on the container\n",
475 | "* freeze/thaw a group of processes\n",
476 | "* process migration\n",
477 | "* cluster batch scheduling and process migration"
478 | ]
479 | },
480 | {
481 | "cell_type": "markdown",
482 | "metadata": {
483 | "slideshow": {
484 | "slide_type": "slide"
485 | }
486 | },
487 | "source": [
488 | "# Linux Namespaces"
489 | ]
490 | },
491 | {
492 | "cell_type": "markdown",
493 | "metadata": {
494 | "slideshow": {
495 | "slide_type": "subslide"
496 | }
497 | },
498 | "source": [
499 | "# What are namespaces?\n",
500 | "\n",
501 | "* If cgroups limit what you can use, namespaces limit what you can view\n",
502 | "* Takes a global resource and makes it look like processes have their own\n",
503 | "* Namespaces\n",
504 | " * pid (processes)\n",
505 | " * net (network stack)\n",
506 | " * mnt (filesystem and mount points)\n",
507 | " * uts (hostname)\n",
508 | " * ipc (interprocess communication)\n",
509 | " * user (user)\n",
510 | "* each process is in one namespace of each type"
511 | ]
512 | },
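  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "Each namespace a process belongs to is visible as a symlink under `/proc`:\n",
    "\n",
    "```shell\n",
    "ls -l /proc/self/ns    # pid, net, mnt, uts, ipc, user, ...\n",
    "```"
   ]
  },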
513 | {
514 | "cell_type": "markdown",
515 | "metadata": {
516 | "slideshow": {
517 | "slide_type": "subslide"
518 | }
519 | },
520 | "source": [
521 | "## pid namespace\n",
522 | "\n",
523 | "* see only other process in your pid namespace\n",
524 | "* pid in and outside of the container"
525 | ]
526 | },
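  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "A sketch with `unshare` from util-linux (needs root):\n",
    "\n",
    "```shell\n",
    "# new pid + mount namespaces; remount /proc so ps reflects the namespace\n",
    "sudo unshare --pid --fork --mount-proc bash\n",
    "ps aux    # this shell is pid 1 and sees only its own children\n",
    "```"
   ]
  },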
527 | {
528 | "cell_type": "markdown",
529 | "metadata": {
530 | "slideshow": {
531 | "slide_type": "subslide"
532 | }
533 | },
534 | "source": [
535 | "## network namespace\n",
536 | "\n",
537 | "* processes within a given network namespace get their own private network stack, including\n",
538 | " * network interfaces (including lo)\n",
539 | " * routing tables\n",
540 | " * iptables routes\n",
541 | " * sockets (ss, netstate)\n",
542 | "* you can move a network interface across netns\n",
543 | " * have a container that sets up a vpn connection and then moves it across containers"
544 | ]
545 | },
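  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "The iproute2 tooling makes network namespaces easy to play with from a shell (a sketch; needs root):\n",
    "\n",
    "```shell\n",
    "sudo ip netns add demo\n",
    "sudo ip netns exec demo ip link    # only a down loopback interface\n",
    "sudo ip netns delete demo\n",
    "```"
   ]
  },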
546 | {
547 | "cell_type": "markdown",
548 | "metadata": {
549 | "slideshow": {
550 | "slide_type": "subslide"
551 | }
552 | },
553 | "source": [
554 | "## mnt namespace \n",
555 | "\n",
556 | "* Processes can have their own root fs\n",
557 | "* Processes also have \"private\" mounts\n",
558 | " * /tmp (scoped per user, per service)\n",
559 | "* Mounts can be private or shared\n",
560 | "* Can't pass a mount from a namespace to another"
561 | ]
562 | },
563 | {
564 | "cell_type": "markdown",
565 | "metadata": {
566 | "slideshow": {
567 | "slide_type": "subslide"
568 | }
569 | },
570 | "source": [
571 | "## uts namespace\n",
572 | "\n",
573 | "* can have your own hostname\n",
574 | "* isolating kernel and version identifiers"
575 | ]
576 | },
577 | {
578 | "cell_type": "markdown",
579 | "metadata": {
580 | "slideshow": {
581 | "slide_type": "subslide"
582 | }
583 | },
584 | "source": [
585 | "## ipc namespace\n",
586 | "\n",
587 | "* System V and posix IPC\n",
588 | "* allows a process to have its own \n",
589 | " * IPC semaphores\n",
590 | " * IPC message queues\n",
591 | " * IPC shared memory\n",
592 | "* without risk of conflict with other instances"
593 | ]
594 | },
595 | {
596 | "cell_type": "markdown",
597 | "metadata": {
598 | "slideshow": {
599 | "slide_type": "subslide"
600 | }
601 | },
602 | "source": [
603 | "## user namespace \n",
604 | "\n",
605 | "* map UID/GID inside the container to outside\n",
606 | "* This is as big topic\n",
607 | " * Only recently added to Docker\n",
608 | "* UIDs 0-1999 in the container are mapped to 10000->11999 on host, etc.\n",
609 | "* UID in containers becomes irrelevant. Just use UID 0 in the container\n",
610 | "* It gets squashed to a non-privileged user outside\n",
611 | " * Volumes *gotcha*"
612 | ]
613 | },
614 | {
615 | "cell_type": "markdown",
616 | "metadata": {
617 | "slideshow": {
618 | "slide_type": "slide"
619 | }
620 | },
621 | "source": [
622 | "# Union Filesystem"
623 | ]
624 | },
625 | {
626 | "cell_type": "markdown",
627 | "metadata": {
628 | "slideshow": {
629 | "slide_type": "subslide"
630 | }
631 | },
632 | "source": [
633 | "# What is a Union FS?\n",
634 | "\n",
635 | "* This is what makes containers lightweight\n",
636 | "* Allows different parts of a filesystem to be overlaid as transparent layers\n",
637 | "* Create a new container instantly instead of copying the whole filesystem\n",
638 | "* Storage drive keeps track of what has changed\n",
639 | "* Options\n",
640 | " * AUFS, overlay (file level)\n",
641 | " * device mapper (block level)\n",
642 | " * BTRFS, ZFS (filesystem level)"
643 | ]
644 | },
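  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "You can ask the Docker daemon which storage driver it is using:\n",
    "\n",
    "```shell\n",
    "docker info | grep -i 'storage driver'    # e.g. aufs, overlay, devicemapper\n",
    "```"
   ]
  },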
645 | {
646 | "cell_type": "markdown",
647 | "metadata": {
648 | "slideshow": {
649 | "slide_type": "slide"
650 | }
651 | },
652 | "source": [
653 | "# Docker"
654 | ]
655 | },
656 | {
657 | "cell_type": "markdown",
658 | "metadata": {
659 | "slideshow": {
660 | "slide_type": "subslide"
661 | }
662 | },
663 | "source": [
664 | "## What is Docker\n",
665 | "\n",
666 | "\n",
667 | "* Docker is a platform that provides abstractions for working with containers and a container runtime\n",
668 | "* It is not the only way to manage software containers (!)"
669 | ]
670 | },
671 | {
672 | "cell_type": "markdown",
673 | "metadata": {
674 | "slideshow": {
675 | "slide_type": "subslide"
676 | }
677 | },
678 | "source": [
679 | "# Docker Architecture\n",
680 | "\n",
681 | ""
682 | ]
683 | },
684 | {
685 | "cell_type": "markdown",
686 | "metadata": {
687 | "slideshow": {
688 | "slide_type": "subslide"
689 | }
690 | },
691 | "source": [
692 | "# Docker Images\n",
693 | "\n",
694 | "* Read-only template from which containers are instantiated\n",
695 | "* Images consist of *layers*\n",
696 | " * these layers can be shared\n",
697 | "* The [Union file system](https://en.wikipedia.org/wiki/UnionFS) combines the layers into an image\n",
698 | "* The image layers are part of what makes docker lightweight\n",
699 | "* Updating one layer does not need to update other layers"
700 | ]
701 | },
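  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "`docker history` shows the stack of layers behind an image, one per Dockerfile instruction:\n",
    "\n",
    "```shell\n",
    "docker history jseabold/dask-jupyter\n",
    "```"
   ]
  },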
702 | {
703 | "cell_type": "markdown",
704 | "metadata": {
705 | "slideshow": {
706 | "slide_type": "subslide"
707 | }
708 | },
709 | "source": [
710 | "## Dockerfile\n",
711 | "\n",
712 | "\n",
713 | "
\n",
714 | "```\n",
715 | "FROM continuumio/miniconda3:4.1.11\n",
716 | "MAINTAINER \n",
717 | "\n",
718 | "RUN conda update -y conda && \\\n",
719 | " conda install -y -c conda-forge -c defaults --show-channel-urls --override-channels \\\n",
720 | " conda-build\n",
721 | "\n",
722 | "COPY requirements.txt /bootstrap/requirements.txt\n",
723 | "\n",
724 | "RUN conda install -y -c conda-forge -c defaults --file \\\n",
725 | " /bootstrap/requirements.txt && \\\n",
726 | " conda install -c damianavila82 rise && \\\n",
727 | " conda clean -tipsy\n",
728 | " \n",
729 | "RUN pip install --user graphviz\n",
730 | "\n",
731 | "RUN jupyter nbextension enable --py widgetsnbexdtension && \\\n",
732 | " jupyter nbextension install --py rise && \\\n",
733 | " jupyter nbextension enable --py rise\n",
734 | " \n",
735 | "EXPOSE 8888\n",
736 | "\n",
737 | "ENTRYPOINT [\"bash\", \"-c\", \"jupyter-notebook --no-browser --ip='*'\"]\n",
738 | "```\n",
739 | "
"
740 | ]
741 | },
742 | {
743 | "cell_type": "markdown",
744 | "metadata": {
745 | "slideshow": {
746 | "slide_type": "subslide"
747 | }
748 | },
749 | "source": [
750 | "## FROM\n",
751 | "\n",
752 | "```\n",
753 | "FROM continuumio/miniconda3:4.1.11\n",
754 | "```\n",
755 | "\n",
756 | "* Every Dockerfile needs to start with a `FROM` instruction\n",
757 | "* Specifies the *base image* and *tag*\n",
758 | "* Common examples: `ubuntu:16.04`, `debian:jessie`\n",
759 | " * debian is recommended as a best practice\n",
760 | "* docker maintains a list of [Official Repositories](https://hub.docker.com/explore/)"
761 | ]
762 | },
763 | {
764 | "cell_type": "markdown",
765 | "metadata": {
766 | "slideshow": {
767 | "slide_type": "subslide"
768 | }
769 | },
770 | "source": [
771 | "## RUN\n",
772 | "\n",
773 | "```\n",
774 | "RUN conda update -y conda && \\\n",
775 | " conda install -y -c conda-forge \\\n",
776 | " -c defaults \\\n",
777 | " --show-channel-urls \\\n",
778 | " --override-channels \\\n",
779 | " conda-build\n",
780 | "```\n",
781 | "\n",
782 | "* The RUN instruction will execute any commands in a new layer on top of the current image and commit the results\n",
783 | "* The resulting committed image will be used for the next step in the Dockerfile"
784 | ]
785 | },
786 | {
787 | "cell_type": "markdown",
788 | "metadata": {
789 | "slideshow": {
790 | "slide_type": "subslide"
791 | }
792 | },
793 | "source": [
794 | "## RUN\n",
795 | "\n",
796 | "* Two forms\n",
797 | " * shell form runs the command in a shell `/bin/sh -c`\n",
798 | "```\n",
799 | "RUN \n",
800 | "```\n",
801 | " * exec mode\n",
802 | "```\n",
803 | "RUN [\"executable\", \"param1\", \"param2\"]\n",
804 | "```\n",
805 | "\n",
806 | "* Since each instruction is a layers, you want to group commands (and do cleanup)"
807 | ]
808 | },
809 | {
810 | "cell_type": "markdown",
811 | "metadata": {
812 | "slideshow": {
813 | "slide_type": "subslide"
814 | }
815 | },
816 | "source": [
817 | "## COPY / ADD\n",
818 | "\n",
819 | "```\n",
820 | "COPY requirements.txt /bootstrap/requirements.txt\n",
821 | "```\n",
822 | "\n",
823 | "* The COPY instruction copies new files from the build context and adds them to the filesystem of the container\n",
824 | "* building an image takes place in a *build context*, most often the directory that contains the Dockerfile\n",
825 | "* The files must be in the build context\n",
826 | " * `COPY ../something` is not valid\n",
827 | "* ADD is similar to copy but has support for local-only tar extraction and remote URLs\n",
828 | "* COPY is preferred\n"
829 | ]
830 | },
831 | {
832 | "cell_type": "markdown",
833 | "metadata": {
834 | "slideshow": {
835 | "slide_type": "subslide"
836 | }
837 | },
838 | "source": [
839 | "## EXPOSE\n",
840 | "\n",
841 | "```\n",
842 | "EXPOSE 8888\n",
843 | "```\n",
844 | "\n",
845 | "* The EXPOSE instruction informs the container to listen on the specified port\n",
846 | "* You must use the `--publish` flag to `docker run` to make these ports accessible to the host"
847 | ]
848 | },
849 | {
850 | "cell_type": "markdown",
851 | "metadata": {
852 | "slideshow": {
853 | "slide_type": "subslide"
854 | }
855 | },
856 | "source": [
857 | "## CMD / ENTRYPOINT\n",
858 | "\n",
859 | "* This is what is executed when you run the container\n",
860 | "* [Understand how CMD and ENTRYPOINT Interact](https://docs.docker.com/engine/reference/builder/#/understand-how-cmd-and-entrypoint-interact)\n",
861 | "* Specify at least one\n",
862 | " * ENTRYPOINT to treat the container like an executable\n",
863 | " * CMD for default arguments to ENTRYPOINT"
864 | ]
865 | },
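  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "A sketch of how overrides work (`ubuntu:16.04`'s default CMD is `/bin/bash`):\n",
    "\n",
    "```shell\n",
    "docker run --rm ubuntu:16.04                    # runs the image's default CMD\n",
    "docker run --rm ubuntu:16.04 echo override      # arguments replace CMD\n",
    "docker run --rm --entrypoint date ubuntu:16.04  # --entrypoint replaces ENTRYPOINT\n",
    "```"
   ]
  },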
866 | {
867 | "cell_type": "markdown",
868 | "metadata": {
869 | "slideshow": {
870 | "slide_type": "subslide"
871 | }
872 | },
873 | "source": [
874 | "## VOLUME\n",
875 | "\n",
876 | "* Docker volumes are a big topic\n",
877 | "* Launching a container, we have a series of read-only layers with a read-write layer mounted last\n",
878 | "* When you make changes to a file, that file is copied, but the underlying file exists still in the image\n",
879 | "* Practically, this means that changes do not persist when you delete a container\n",
880 | "* *Docker Volumes* exist outside the UFS\n",
881 | "* You can mount from the host to the container outside the UFS, using the `--volume` flag for `docker run`"
882 | ]
883 | },
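  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "A sketch of both flavors, a bind mount and a named volume:\n",
    "\n",
    "```shell\n",
    "# bind-mount a host directory, as this repo's docker-compose.yml does\n",
    "docker run --rm -v $(pwd)/notebooks:/notebooks ubuntu:16.04 ls /notebooks\n",
    "\n",
    "# a named volume managed by the docker daemon; survives container removal\n",
    "docker volume create scratch\n",
    "docker run --rm -v scratch:/scratch ubuntu:16.04 touch /scratch/persists\n",
    "```"
   ]
  },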
884 | {
885 | "cell_type": "markdown",
886 | "metadata": {
887 | "slideshow": {
888 | "slide_type": "slide"
889 | }
890 | },
891 | "source": [
892 | "# Putting It All Together"
893 | ]
894 | },
895 | {
896 | "cell_type": "markdown",
897 | "metadata": {
898 | "slideshow": {
899 | "slide_type": "subslide"
900 | }
901 | },
902 | "source": [
903 | "## Docker Compose\n",
904 | "\n",
905 | "* A tool for building more complex, multi-container applications\n",
906 | "* Use a single-command to spin up this applications"
907 | ]
908 | },
909 | {
910 | "cell_type": "markdown",
911 | "metadata": {
912 | "slideshow": {
913 | "slide_type": "subslide"
914 | }
915 | },
916 | "source": [
917 | "## Dask Distributed\n",
918 | "\n",
919 | "* Dask-Distributed defined using a docker-compose file\n",
920 | "\n",
921 | "```\n",
922 | "docker-compose --project-name dask-distributed \\\n",
923 | " up -d\n",
924 | "docker-compose -p dask-distributed scale dask-worker=4\n",
925 | "```\n",
926 | "\n",
927 | "* Very useful for proto-typing running dask applications in a truly distributed environment"
928 | ]
929 | },
930 | {
931 | "cell_type": "markdown",
932 | "metadata": {
933 | "slideshow": {
934 | "slide_type": "subslide"
935 | }
936 | },
937 | "source": [
938 | "# Python Tools\n",
939 | "\n",
940 | "* [nsenter](https://github.com/zalando/python-nsenter)\n",
941 | " * Enter namespaces with a context manager\n",
942 | "* [docker-py](https://github.com/docker/docker-py)\n",
943 | " * Python docker client"
944 | ]
945 | },
946 | {
947 | "cell_type": "markdown",
948 | "metadata": {
949 | "slideshow": {
950 | "slide_type": "slide"
951 | }
952 | },
953 | "source": [
954 | "# Resources\n",
955 | "\n",
956 | "* [Redhat's Resource Management Guide](https://access.redhat.com/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/Resource_Management_Guide/index.html)\n",
957 | "* [Kernel Documentation for cgroups v1](https://www.kernel.org/doc/Documentation/cgroup-v1/)\n",
958 | "* [cgroups, namespaces, and beyond: what are containers made from](https://www.youtube.com/watch?v=sK5i-N34im8)\n",
959 | "* [Deep dive into Docker storage drivers](https://jpetazzo.github.io/assets/2015-03-03-not-so-deep-dive-into-docker-storage-drivers.html#1)\n",
960 | "* [namespace man page](http://man7.org/linux/man-pages/man7/namespaces.7.html)\n",
961 | "* [Docker documentation](https://docs.docker.com)\n",
962 | "* [Best practices for writing Dockerfiles](https://docs.docker.com/engine/userguide/eng-image/dockerfile_best-practices/)\n",
963 | "* [You Could Have Invented Containers: An Explanatory Fantasy](https://medium.com/@gtrevorjay/you-could-have-invented-container-runtimes-an-explanatory-fantasy-764c5b389bd3#.svjwa71rv)"
964 | ]
965 | }
966 | ],
967 | "metadata": {
968 | "celltoolbar": "Slideshow",
969 | "kernelspec": {
970 | "display_name": "Python 3",
971 | "language": "python",
972 | "name": "python3"
973 | },
974 | "language_info": {
975 | "codemirror_mode": {
976 | "name": "ipython",
977 | "version": 3
978 | },
979 | "file_extension": ".py",
980 | "mimetype": "text/x-python",
981 | "name": "python",
982 | "nbconvert_exporter": "python",
983 | "pygments_lexer": "ipython3",
984 | "version": "3.7.6"
985 | }
986 | },
987 | "nbformat": 4,
988 | "nbformat_minor": 1
989 | }
990 |
--------------------------------------------------------------------------------
/notebooks/pstree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jseabold/pydata-chi-docker/369f9cf93dccbea3d1d31118cbe33934b46f4de5/notebooks/pstree.png
--------------------------------------------------------------------------------
/up.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | docker-compose -p dask-distributed up -d
4 | docker-compose -p dask-distributed scale dask-worker=4
5 |
--------------------------------------------------------------------------------