├── .gitignore ├── README.md ├── docker-compose.yml ├── down.sh ├── get_and_clean_data.py ├── notebooks ├── container_tweet.png ├── dask-dataframe.ipynb ├── data │ └── .gitkeep ├── docker-for-data-science.ipynb └── pstree.png └── up.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | Food_Inspections.csv 3 | notebooks/data/food* 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction to Docker (for Data Scientists) 2 | 3 | This repository contains notebooks and code for my talk at the PyData Chicago Meetup on November 7, 2016. 4 | 5 | This talk 6 | 7 | * introduces software containers and why they are useful to data scientists 8 | * takes a look at the parts of the linux kernel that make software containers possible 9 | * introduces Docker as a platform for using software containers 10 | * shows an example of running dask distributed in separate containers on a single host 11 | 12 | Run these notebooks in a container: 13 | 14 | ```shell 15 | docker run -d \ 16 | -p 8888:8888 \ 17 | -v ./notebooks:/notebooks \ 18 | -w /notebooks \ 19 | jseabold/dask-jupyter \ 20 | bash -c "jupyter notebook --no-browser --ip='*'" 21 | ``` 22 | 23 | The images are built from my [dockerfiles repo](https://github.com/jseabold/dockerfiles). 24 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | services: 3 | jupyter: 4 | image: jseabold/dask-jupyter:latest 5 | command: ["bash", "-c", "jupyter notebook --no-browser --ip='*'"] 6 | volumes: 7 | - ./notebooks:/notebooks 8 | working_dir: /notebooks 9 | ports: 10 | - "8888:8888" 11 | networks: 12 | - distributed 13 | links: 14 | - dask-scheduler 15 | container_name: jupyter 16 | 17 | dask-scheduler: 18 | image: jseabold/dask-scheduler:latest 19 | ports: 20 | - "8786:8786" 21 | - "9786:9786" 22 | - "8787:8787" 23 | command: ["bash", "-c", "dask-scheduler --host dask-scheduler"] 24 | networks: 25 | - distributed 26 | container_name: dask-scheduler 27 | 28 | dask-worker: 29 | image: jseabold/dask-worker:latest 30 | command: ["bash", "-c", "dask-worker dask-scheduler:8786 --nthreads 1 --nprocs 1"] 31 | mem_limit: 1500m 32 | volumes: 33 | - ./notebooks:/notebooks 34 | networks: 35 | - distributed 36 | links: 37 | - dask-scheduler 38 | 39 | networks: 40 | distributed: 41 | driver: bridge 42 | -------------------------------------------------------------------------------- /down.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | docker-compose -p dask-distributed stop 4 | docker-compose -p dask-distributed rm -f 5 | -------------------------------------------------------------------------------- /get_and_clean_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import requests 3 | 4 | url = ("https://data.cityofchicago.org/api/views/4ijn-s7e5/rows.csv?" 5 | "accessType=DOWNLOAD") 6 | 7 | response = requests.get(url, stream=True) 8 | 9 | with open("Food_Inspections.csv", "wb") as fout: 10 | for chunk in response.iter_content(32 * 1024): 11 | fout.write(chunk) 12 | 13 | df = pd.read_csv("Food_Inspection.csv") 14 | 15 | for year, group in df.groupby(df['Inspection Date'].dt.year): 16 | group.to_csv("notebooks/data/food_inspection_{}.csv.gz".format(year), 17 | compression='gzip', index=False) 18 | -------------------------------------------------------------------------------- /notebooks/container_tweet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jseabold/pydata-chi-docker/369f9cf93dccbea3d1d31118cbe33934b46f4de5/notebooks/container_tweet.png -------------------------------------------------------------------------------- /notebooks/dask-dataframe.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "[Bokeh Web UI](http://localhost:8787)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from pprint import pprint\n", 19 | "\n", 20 | "from dask.distributed import Client\n", 21 | "client = Client('dask-scheduler:8786')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "client" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "pprint(client.scheduler_info())" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "client.list_datasets()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "# Dask DataFrame" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "collapsed": true 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "import dask.dataframe as dd" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "!ls data/*.csv.gz" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "df = dd.read_csv(\"/notebooks/data/food_inspection_201*\", \n", 95 | " blocksize=None, \n", 96 | " compression='gzip', \n", 97 | " parse_dates=['Inspection Date'])" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "df" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "df.head()" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "failed = df[df.Results == 'Fail']" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "years = failed.year.unique()\n", 142 | "years = years.compute()\n", 143 | "years = years.values.tolist()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "divisions = years + [2016]\n", 155 | "divisions" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "collapsed": false 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "failed = failed.set_partition('year', divisions=divisions)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": false 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "failed.known_divisions" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": false 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "failed.head()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "collapsed": true 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "def violators(df, n=3):\n", 200 | " size = lambda x: len(x)\n", 201 | " return (df.groupby('Address').\n", 202 | " apply(lambda x: len(x)).\n", 203 | " nlargest(n=n).\n", 204 | " to_frame(name='violations').\n", 205 | " reset_index())" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "result = (failed.groupby(failed.index).\n", 217 | " apply(violators, \n", 218 | " meta=[('Address', object), \n", 219 | " ('Violations', int)]))\n", 220 | "result" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": true 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "worst = result.compute()" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "worst.index.names = ['Year', 'Offender']" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "worst.sort_index(level=0)" 254 | ] 255 | } 256 | ], 257 | "metadata": { 258 | "kernelspec": { 259 | "display_name": "Python 3", 260 | "language": "python", 261 | "name": "python3" 262 | }, 263 | "language_info": { 264 | "codemirror_mode": { 265 | "name": "ipython", 266 | "version": 3 267 | }, 268 | "file_extension": ".py", 269 | "mimetype": "text/x-python", 270 | "name": "python", 271 | "nbconvert_exporter": "python", 272 | "pygments_lexer": "ipython3", 273 | "version": "3.5.2" 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 1 278 | } 279 | -------------------------------------------------------------------------------- /notebooks/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jseabold/pydata-chi-docker/369f9cf93dccbea3d1d31118cbe33934b46f4de5/notebooks/data/.gitkeep -------------------------------------------------------------------------------- /notebooks/docker-for-data-science.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "metadata": { 7 | "slideshow": { 8 | "slide_type": "skip" 9 | } 10 | }, 11 | "outputs": [ 12 | { 13 | "data": { 14 | "text/plain": [ 15 | "{'height': 800, 'width': 1200}" 16 | ] 17 | }, 18 | "execution_count": 9, 19 | "metadata": {}, 20 | "output_type": "execute_result" 21 | } 22 | ], 23 | "source": [ 24 | "from traitlets.config.manager import BaseJSONConfigManager\n", 25 | "path = \"/.jupyter/nbconfig\"\n", 26 | "cm = BaseJSONConfigManager(config_dir=path)\n", 27 | "\n", 28 | "cm.update('livereveal', {\n", 29 | " 'width': 1200,\n", 30 | " 'height': 800,\n", 31 | "})" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "slideshow": { 38 | "slide_type": "slide" 39 | } 40 | }, 41 | "source": [ 42 | "# About Me\n", 43 | "\n", 44 | "* Lead Data Scientist at [Civis Analytics](https://civisanalytics.com/)\n", 45 | "* [@jseabold](https://twitter.com/jseabold/) on Twitter\n", 46 | "* [jseabold](https://github.com/jseabold) on GitHub\n", 47 | "* [https://github.com/jseabold/pydata-chi-docker](https://github.com/jseabold/pydata-chi-docker)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "slideshow": { 54 | "slide_type": "slide" 55 | } 56 | }, 57 | "source": [ 58 | "# Docker for Data Science" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": { 64 | "slideshow": { 65 | "slide_type": "subslide" 66 | } 67 | }, 68 | "source": [ 69 | "## Docker Introduction\n", 70 | "\n", 71 | "* Docker is a platform for running applications in software containers\n", 72 | "* Containers are an implementation of operating-system-level virtualization\n", 73 | "* Enabled by features in the Linux kernel" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "slideshow": { 80 | "slide_type": "subslide" 81 | } 82 | }, 83 | "source": [ 84 | "## What does using Docker offer?\n", 85 | "\n", 86 | "* Reproducibility\n", 87 | "* Portability / working environments\n", 88 | "* Reduces need for complex installations\n", 89 | "* Easier testing / debugging\n", 90 | "* Resource management\n", 91 | "* Easier networking between services" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "slideshow": { 98 | "slide_type": "subslide" 99 | } 100 | }, 101 | "source": [ 102 | "# That sounds like a Virtual Machine" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "slideshow": { 109 | "slide_type": "subslide" 110 | } 111 | }, 112 | "source": [ 113 | "## Example Workflow\n", 114 | "\n", 115 | "```shell\n", 116 | "docker build -t jseabold/dask-jupyter .\n", 117 | "docker push jseabold/dask-jupyter\n", 118 | "docker run --detach \\\n", 119 | " --publish 8888:8888 \\\n", 120 | " --volume $(pwd)/notebooks:/notebooks \\\n", 121 | " --working-dir /notebooks \\\n", 122 | " jseabold/dask-jupyter\n", 123 | "```" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": { 129 | "slideshow": { 130 | "slide_type": "subslide" 131 | } 132 | }, 133 | "source": [ 134 | "## Why not Virtual Machines?\n", 135 | "\n", 136 | "* A computer simulated in software\n", 137 | "* Kind of slow\n", 138 | "* Pretty big\n", 139 | "* Takes time to provision, bring down, resume, etc.\n", 140 | "* The ~\\*~\\*~ cloud ~\\*~\\*~\n", 141 | " * Cheap and easy to provision new machines\n", 142 | " * Happens more often\n", 143 | " * Services start to be spread across hosts" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "slideshow": { 150 | "slide_type": "subslide" 151 | } 152 | }, 153 | "source": [ 154 | "# History of OS-level Virtualization\n", 155 | "\n", 156 | "* chroot (1979)\n", 157 | " - Change the apparent root directory for the current running process and its children\n", 158 | "* namespaces (2002)\n", 159 | "* Solaris Containers \"chroot on steroids\" (2004)\n", 160 | "* control groups (google, 2006)\n", 161 | " * LXC (2009)\n", 162 | "* copy-on-write\n", 163 | "* Linux Containers (LXC, 2008)\n", 164 | "* Docker (2013)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "slideshow": { 171 | "slide_type": "subslide" 172 | } 173 | }, 174 | "source": [ 175 | "## What Makes Up a Container\n", 176 | "\n", 177 | "* Control groups\n", 178 | "* Namespaces\n", 179 | "* copy-on-write storage" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": { 185 | "slideshow": { 186 | "slide_type": "subslide" 187 | } 188 | }, 189 | "source": [ 190 | "# True Confessions\n", 191 | "\n", 192 | "![Containers How Do They Work](container_tweet.png)\n" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": { 198 | "slideshow": { 199 | "slide_type": "slide" 200 | } 201 | }, 202 | "source": [ 203 | "# Linux Nuts and Bolts\n" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": { 209 | "slideshow": { 210 | "slide_type": "subslide" 211 | } 212 | }, 213 | "source": [ 214 | "## Linux Process Model\n", 215 | "\n", 216 | "* A process, or *task*, is an executing instance of a program\n", 217 | "* New processes are created (cloned) by the system call *fork*\n", 218 | " * This *copies* the current process and creates a child process with a link to the current parent process\n", 219 | " * The address space comes along / only copied on modification\n", 220 | "* Python exposes these OS primitives in `os` and `multiprocessing`, for example\n", 221 | "* Processes cannot live in isolation\n", 222 | " * Every process has a parent (with one exception)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": { 228 | "slideshow": { 229 | "slide_type": "subslide" 230 | } 231 | }, 232 | "source": [ 233 | "## Initialization Process\n", 234 | "\n", 235 | "* What happens when you boot up the linux operating system?\n", 236 | "* The kernel finds the initialization process and starts it\n", 237 | " * Traditionally, **init** \n", 238 | " * Now, commonly, **systemd**\n", 239 | "* Daemon running in the background \n", 240 | "* Direct or indirect ancestor of all processes" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 1, 246 | "metadata": { 247 | "slideshow": { 248 | "slide_type": "subslide" 249 | } 250 | }, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "systemd\r\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "!ps -q 1 -o comm=" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": { 267 | "slideshow": { 268 | "slide_type": "subslide" 269 | } 270 | }, 271 | "source": [ 272 | "![pstree](pstree.png)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": { 278 | "slideshow": { 279 | "slide_type": "slide" 280 | } 281 | }, 282 | "source": [ 283 | "# Control Groups" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": { 289 | "slideshow": { 290 | "slide_type": "subslide" 291 | } 292 | }, 293 | "source": [ 294 | "## What are cgroups\n", 295 | "\n", 296 | "
\n", 297 | "

\n", 298 | "\"Control Groups provide a mechanism for aggregating / partitioning sets of\n", 299 | "tasks, and all their future children, into hierarchical groups with\n", 300 | "specialized behaviour.\"\n", 301 | "
\n", 302 | "\n", 303 | "* Allow allocation of resources among processes\n", 304 | "* Includes metering, limiting, and accounting for resources\n", 305 | "* Similar to processes \n", 306 | " * hierarchical\n", 307 | " * inherit from parent cgroups\n", 308 | " * *But* many different ones can exist simultaneously" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": { 314 | "slideshow": { 315 | "slide_type": "subslide" 316 | } 317 | }, 318 | "source": [ 319 | "## University Server Example\n", 320 | "\n", 321 | "
\n", 322 | "```\n", 323 | "CPU : \"Top cpuset\"\n", 324 | " / \\\n", 325 | " CPUSet1 CPUSet2\n", 326 | " | |\n", 327 | " (Professors) (Students)\n", 328 | "\n", 329 | " In addition (system tasks) are attached to topcpuset (so\n", 330 | " that they can run anywhere) with a limit of 20%\n", 331 | "\n", 332 | "Memory : Professors (50%), Students (30%), system (20%)\n", 333 | "\n", 334 | "Disk : Professors (50%), Students (30%), system (20%)\n", 335 | "\n", 336 | "Network : WWW browsing (20%), Network File System (60%), others (20%)\n", 337 | " / \\\n", 338 | " Professors (15%) students (5%)\n", 339 | "```\n", 340 | "
" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": { 346 | "slideshow": { 347 | "slide_type": "subslide" 348 | } 349 | }, 350 | "source": [ 351 | "## cgroup subsystems\n", 352 | "\n", 353 | "* The cgroup hierarchies are connected to one or more **subsystems**\n", 354 | "* blkio\n", 355 | "* cpu / cpuset\n", 356 | "* devices\n", 357 | "* memory\n", 358 | "* net_cls / net_prio\n", 359 | "* ..." 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": { 365 | "slideshow": { 366 | "slide_type": "subslide" 367 | } 368 | }, 369 | "source": [ 370 | "## cpu cgroup\n", 371 | "\n", 372 | "* group processes together\n", 373 | "* you can set weights per cgroup that OS scheduler takes into account\n", 374 | "* can't set limits\n", 375 | " * It doesn't make sense\n", 376 | " * CPU architecture (different registers, different instructions, doesn't make sense)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": { 382 | "slideshow": { 383 | "slide_type": "subslide" 384 | } 385 | }, 386 | "source": [ 387 | "## cpuset cgroup\n", 388 | "\n", 389 | "* processor affinity\n", 390 | "* pin groups to specific CPUS\n", 391 | "* reserve CPUs for specific apps" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": { 397 | "slideshow": { 398 | "slide_type": "subslide" 399 | } 400 | }, 401 | "source": [ 402 | "## memory cgroup\n", 403 | "\n", 404 | "* limits are optional -- soft and hard limits\n", 405 | "* soft limits are not enforced\n", 406 | " * when pressure is strong, it looks at the cgroups above the soft limit, then you get pages taken from you by the kernel\n", 407 | "* limits can be set for different kinds of memory\n", 408 | " * physical (RAM), kernel (dentries), total (SWAP)\n", 409 | "* hard limit -- process gets killed on the cgroup level\n", 410 | " * it kills the process in this container\n", 411 | " * this is why you want to have one service per-container" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": { 417 | "slideshow": { 418 | "slide_type": "subslide" 419 | } 420 | }, 421 | "source": [ 422 | "## blkio cgroup\n", 423 | "\n", 424 | "* keeps track of IO for ea. grou\n", 425 | "* per block devices\n", 426 | "* read vs write\n", 427 | "* sync vs async\n", 428 | "* set throttle (limits) for each group\n", 429 | "* set relative weights for each group" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": { 435 | "slideshow": { 436 | "slide_type": "subslide" 437 | } 438 | }, 439 | "source": [ 440 | "## net_cls and net_prio cgroup\n", 441 | "\n", 442 | "* net_cls allows tagging of network packets with their origin cgroup\n", 443 | "* net_prio allows setting the priority of cgroups for different network interfaces" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": { 449 | "slideshow": { 450 | "slide_type": "subslide" 451 | } 452 | }, 453 | "source": [ 454 | "## devices cgroup\n", 455 | "\n", 456 | "* What tasks can use what device\n", 457 | "* Typically, things like\n", 458 | " * /dev/{tty,zero,random,null}\n", 459 | " * /dev/net/tun\n", 460 | " * /dev/fuse\n", 461 | " * /dev/dri (GPU)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": { 467 | "slideshow": { 468 | "slide_type": "subslide" 469 | } 470 | }, 471 | "source": [ 472 | "## freezer cgroup\n", 473 | "\n", 474 | "* Like SIGSTOP on the container\n", 475 | "* freeze/thaw a group of processes\n", 476 | "* process migration\n", 477 | "* cluster batch scheduling and process migration" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": { 483 | "slideshow": { 484 | "slide_type": "slide" 485 | } 486 | }, 487 | "source": [ 488 | "# Linux Namespaces" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": { 494 | "slideshow": { 495 | "slide_type": "subslide" 496 | } 497 | }, 498 | "source": [ 499 | "# What are namespaces?\n", 500 | "\n", 501 | "* If cgroups limit what you can use, namespaces limit what you can view\n", 502 | "* Takes a global resource and makes it look like processes have their own\n", 503 | "* Namespaces\n", 504 | " * pid (processes)\n", 505 | " * net (network stack)\n", 506 | " * mnt (filesystem and mount points)\n", 507 | " * uts (hostname)\n", 508 | " * ipc (interprocess communication)\n", 509 | " * user (user)\n", 510 | "* each process is in one namespace of each type" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": { 516 | "slideshow": { 517 | "slide_type": "subslide" 518 | } 519 | }, 520 | "source": [ 521 | "## pid namespace\n", 522 | "\n", 523 | "* see only other process in your pid namespace\n", 524 | "* pid in and outside of the container" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": { 530 | "slideshow": { 531 | "slide_type": "subslide" 532 | } 533 | }, 534 | "source": [ 535 | "## network namespace\n", 536 | "\n", 537 | "* processes within a given network namespace get their own private network stack, including\n", 538 | " * network interfaces (including lo)\n", 539 | " * routing tables\n", 540 | " * iptables routes\n", 541 | " * sockets (ss, netstate)\n", 542 | "* you can move a network interface across netns\n", 543 | " * have a container that sets up a vpn connection and then moves it across containers" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": { 549 | "slideshow": { 550 | "slide_type": "subslide" 551 | } 552 | }, 553 | "source": [ 554 | "## mnt namespace \n", 555 | "\n", 556 | "* Processes can have their own root fs\n", 557 | "* Processes also have \"private\" mounts\n", 558 | " * /tmp (scoped per user, per service)\n", 559 | "* Mounts can be private or shared\n", 560 | "* Can't pass a mount from a namespace to another" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": { 566 | "slideshow": { 567 | "slide_type": "subslide" 568 | } 569 | }, 570 | "source": [ 571 | "## uts namespace\n", 572 | "\n", 573 | "* can have your own hostname\n", 574 | "* isolating kernel and version identifiers" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": { 580 | "slideshow": { 581 | "slide_type": "subslide" 582 | } 583 | }, 584 | "source": [ 585 | "## ipc namespace\n", 586 | "\n", 587 | "* System V and posix IPC\n", 588 | "* allows a process to have its own \n", 589 | " * IPC semaphores\n", 590 | " * IPC message queues\n", 591 | " * IPC shared memory\n", 592 | "* without risk of conflict with other instances" 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": { 598 | "slideshow": { 599 | "slide_type": "subslide" 600 | } 601 | }, 602 | "source": [ 603 | "## user namespace \n", 604 | "\n", 605 | "* map UID/GID inside the container to outside\n", 606 | "* This is as big topic\n", 607 | " * Only recently added to Docker\n", 608 | "* UIDs 0-1999 in the container are mapped to 10000->11999 on host, etc.\n", 609 | "* UID in containers becomes irrelevant. Just use UID 0 in the container\n", 610 | "* It gets squashed to a non-privileged user outside\n", 611 | " * Volumes *gotcha*" 612 | ] 613 | }, 614 | { 615 | "cell_type": "markdown", 616 | "metadata": { 617 | "slideshow": { 618 | "slide_type": "slide" 619 | } 620 | }, 621 | "source": [ 622 | "# Union Filesystem" 623 | ] 624 | }, 625 | { 626 | "cell_type": "markdown", 627 | "metadata": { 628 | "slideshow": { 629 | "slide_type": "subslide" 630 | } 631 | }, 632 | "source": [ 633 | "# What is a Union FS?\n", 634 | "\n", 635 | "* This is what makes containers lightweight\n", 636 | "* Allows different parts of a filesystem to be overlaid as transparent layers\n", 637 | "* Create a new container instantly instead of copying the whole filesystem\n", 638 | "* Storage drive keeps track of what has changed\n", 639 | "* Options\n", 640 | " * AUFS, overlay (file level)\n", 641 | " * device mapper (block level)\n", 642 | " * BTRFS, ZFS (filesystem level)" 643 | ] 644 | }, 645 | { 646 | "cell_type": "markdown", 647 | "metadata": { 648 | "slideshow": { 649 | "slide_type": "slide" 650 | } 651 | }, 652 | "source": [ 653 | "# Docker" 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": { 659 | "slideshow": { 660 | "slide_type": "subslide" 661 | } 662 | }, 663 | "source": [ 664 | "## What is Docker\n", 665 | "\n", 666 | "\n", 667 | "* Docker is a platform that provides abstractions for working with containers and a container runtime\n", 668 | "* It is not the only way to manage software containers (!)" 669 | ] 670 | }, 671 | { 672 | "cell_type": "markdown", 673 | "metadata": { 674 | "slideshow": { 675 | "slide_type": "subslide" 676 | } 677 | }, 678 | "source": [ 679 | "# Docker Architecture\n", 680 | "\n", 681 | "![docker architecture](https://docs.docker.com/engine/article-img/architecture.svg)" 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "metadata": { 687 | "slideshow": { 688 | "slide_type": "subslide" 689 | } 690 | }, 691 | "source": [ 692 | "# Docker Images\n", 693 | "\n", 694 | "* Read-only template from which containers are instantiated\n", 695 | "* Images consist of *layers*\n", 696 | " * these layers can be shared\n", 697 | "* The [Union file system](https://en.wikipedia.org/wiki/UnionFS) combines the layers into an image\n", 698 | "* The image layers are part of what makes docker lightweight\n", 699 | "* Updating one layer does not need to update other layers" 700 | ] 701 | }, 702 | { 703 | "cell_type": "markdown", 704 | "metadata": { 705 | "slideshow": { 706 | "slide_type": "subslide" 707 | } 708 | }, 709 | "source": [ 710 | "## Dockerfile\n", 711 | "\n", 712 | "\n", 713 | "

\n", 714 | "```\n", 715 | "FROM continuumio/miniconda3:4.1.11\n", 716 | "MAINTAINER \n", 717 | "\n", 718 | "RUN conda update -y conda && \\\n", 719 | " conda install -y -c conda-forge -c defaults --show-channel-urls --override-channels \\\n", 720 | " conda-build\n", 721 | "\n", 722 | "COPY requirements.txt /bootstrap/requirements.txt\n", 723 | "\n", 724 | "RUN conda install -y -c conda-forge -c defaults --file \\\n", 725 | " /bootstrap/requirements.txt && \\\n", 726 | " conda install -c damianavila82 rise && \\\n", 727 | " conda clean -tipsy\n", 728 | " \n", 729 | "RUN pip install --user graphviz\n", 730 | "\n", 731 | "RUN jupyter nbextension enable --py widgetsnbexdtension && \\\n", 732 | " jupyter nbextension install --py rise && \\\n", 733 | " jupyter nbextension enable --py rise\n", 734 | " \n", 735 | "EXPOSE 8888\n", 736 | "\n", 737 | "ENTRYPOINT [\"bash\", \"-c\", \"jupyter-notebook --no-browser --ip='*'\"]\n", 738 | "```\n", 739 | "
" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": { 745 | "slideshow": { 746 | "slide_type": "subslide" 747 | } 748 | }, 749 | "source": [ 750 | "## FROM\n", 751 | "\n", 752 | "```\n", 753 | "FROM continuumio/miniconda3:4.1.11\n", 754 | "```\n", 755 | "\n", 756 | "* Every Dockerfile needs to start with a `FROM` instruction\n", 757 | "* Specifies the *base image* and *tag*\n", 758 | "* Common examples: `ubuntu:16.04`, `debian:jessie`\n", 759 | " * debian is recommended as a best practice\n", 760 | "* docker maintains a list of [Official Repositories](https://hub.docker.com/explore/)" 761 | ] 762 | }, 763 | { 764 | "cell_type": "markdown", 765 | "metadata": { 766 | "slideshow": { 767 | "slide_type": "subslide" 768 | } 769 | }, 770 | "source": [ 771 | "## RUN\n", 772 | "\n", 773 | "```\n", 774 | "RUN conda update -y conda && \\\n", 775 | " conda install -y -c conda-forge \\\n", 776 | " -c defaults \\\n", 777 | " --show-channel-urls \\\n", 778 | " --override-channels \\\n", 779 | " conda-build\n", 780 | "```\n", 781 | "\n", 782 | "* The RUN instruction will execute any commands in a new layer on top of the current image and commit the results\n", 783 | "* The resulting committed image will be used for the next step in the Dockerfile" 784 | ] 785 | }, 786 | { 787 | "cell_type": "markdown", 788 | "metadata": { 789 | "slideshow": { 790 | "slide_type": "subslide" 791 | } 792 | }, 793 | "source": [ 794 | "## RUN\n", 795 | "\n", 796 | "* Two forms\n", 797 | " * shell form runs the command in a shell `/bin/sh -c`\n", 798 | "```\n", 799 | "RUN \n", 800 | "```\n", 801 | " * exec mode\n", 802 | "```\n", 803 | "RUN [\"executable\", \"param1\", \"param2\"]\n", 804 | "```\n", 805 | "\n", 806 | "* Since each instruction is a layers, you want to group commands (and do cleanup)" 807 | ] 808 | }, 809 | { 810 | "cell_type": "markdown", 811 | "metadata": { 812 | "slideshow": { 813 | "slide_type": "subslide" 814 | } 815 | }, 816 | "source": [ 817 | "## COPY / ADD\n", 818 | "\n", 819 | "```\n", 820 | "COPY requirements.txt /bootstrap/requirements.txt\n", 821 | "```\n", 822 | "\n", 823 | "* The COPY instruction copies new files from the build context and adds them to the filesystem of the container\n", 824 | "* building an image takes place in a *build context*, most often the directory that contains the Dockerfile\n", 825 | "* The files must be in the build context\n", 826 | " * `COPY ../something` is not valid\n", 827 | "* ADD is similar to copy but has support for local-only tar extraction and remote URLs\n", 828 | "* COPY is preferred\n" 829 | ] 830 | }, 831 | { 832 | "cell_type": "markdown", 833 | "metadata": { 834 | "slideshow": { 835 | "slide_type": "subslide" 836 | } 837 | }, 838 | "source": [ 839 | "## EXPOSE\n", 840 | "\n", 841 | "```\n", 842 | "EXPOSE 8888\n", 843 | "```\n", 844 | "\n", 845 | "* The EXPOSE instruction informs the container to listen on the specified port\n", 846 | "* You must use the `--publish` flag to `docker run` to make these ports accessible to the host" 847 | ] 848 | }, 849 | { 850 | "cell_type": "markdown", 851 | "metadata": { 852 | "slideshow": { 853 | "slide_type": "subslide" 854 | } 855 | }, 856 | "source": [ 857 | "## CMD / ENTRYPOINT\n", 858 | "\n", 859 | "* This is what is executed when you run the container\n", 860 | "* [Understand how CMD and ENTRYPOINT Interact](https://docs.docker.com/engine/reference/builder/#/understand-how-cmd-and-entrypoint-interact)\n", 861 | "* Specify at least one\n", 862 | " * ENTRYPOINT to treat the container like an executable\n", 863 | " * CMD for default arguments to ENTRYPOINT" 864 | ] 865 | }, 866 | { 867 | "cell_type": "markdown", 868 | "metadata": { 869 | "slideshow": { 870 | "slide_type": "subslide" 871 | } 872 | }, 873 | "source": [ 874 | "## VOLUME\n", 875 | "\n", 876 | "* Docker volumes are a big topic\n", 877 | "* Launching a container, we have a series of read-only layers with a read-write layer mounted last\n", 878 | "* When you make changes to a file, that file is copied, but the underlying file exists still in the image\n", 879 | "* Practically, this means that changes do not persist when you delete a container\n", 880 | "* *Docker Volumes* exist outside the UFS\n", 881 | "* You can mount from the host to the container outside the UFS, using the `--volume` flag for `docker run`" 882 | ] 883 | }, 884 | { 885 | "cell_type": "markdown", 886 | "metadata": { 887 | "slideshow": { 888 | "slide_type": "slide" 889 | } 890 | }, 891 | "source": [ 892 | "# Putting It All Together" 893 | ] 894 | }, 895 | { 896 | "cell_type": "markdown", 897 | "metadata": { 898 | "slideshow": { 899 | "slide_type": "subslide" 900 | } 901 | }, 902 | "source": [ 903 | "## Docker Compose\n", 904 | "\n", 905 | "* A tool for building more complex, multi-container applications\n", 906 | "* Use a single-command to spin up this applications" 907 | ] 908 | }, 909 | { 910 | "cell_type": "markdown", 911 | "metadata": { 912 | "slideshow": { 913 | "slide_type": "subslide" 914 | } 915 | }, 916 | "source": [ 917 | "## Dask Distributed\n", 918 | "\n", 919 | "* Dask-Distributed defined using a docker-compose file\n", 920 | "\n", 921 | "```\n", 922 | "docker-compose --project-name dask-distributed \\\n", 923 | " up -d\n", 924 | "docker-compose -p dask-distributed scale dask-worker=4\n", 925 | "```\n", 926 | "\n", 927 | "* Very useful for proto-typing running dask applications in a truly distributed environment" 928 | ] 929 | }, 930 | { 931 | "cell_type": "markdown", 932 | "metadata": { 933 | "slideshow": { 934 | "slide_type": "subslide" 935 | } 936 | }, 937 | "source": [ 938 | "# Python Tools\n", 939 | "\n", 940 | "* [nsenter](https://github.com/zalando/python-nsenter)\n", 941 | " * Enter namespaces with a context manager\n", 942 | "* [docker-py](https://github.com/docker/docker-py)\n", 943 | " * Python docker client" 944 | ] 945 | }, 946 | { 947 | "cell_type": "markdown", 948 | "metadata": { 949 | "slideshow": { 950 | "slide_type": "slide" 951 | } 952 | }, 953 | "source": [ 954 | "# Resources\n", 955 | "\n", 956 | "* [Redhat's Resource Management Guide](https://access.redhat.com/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/Resource_Management_Guide/index.html)\n", 957 | "* [Kernel Documentation for cgroups v1](https://www.kernel.org/doc/Documentation/cgroup-v1/)\n", 958 | "* [cgroups, namespaces, and beyond: what are containers made from](https://www.youtube.com/watch?v=sK5i-N34im8)\n", 959 | "* [Deep dive into Docker storage drivers](https://jpetazzo.github.io/assets/2015-03-03-not-so-deep-dive-into-docker-storage-drivers.html#1)\n", 960 | "* [namespace man page](http://man7.org/linux/man-pages/man7/namespaces.7.html)\n", 961 | "* [Docker documentation](https://docs.docker.com)\n", 962 | "* [Best practices for writing Dockerfiles](https://docs.docker.com/engine/userguide/eng-image/dockerfile_best-practices/)\n", 963 | "* [You Could Have Invented Containers: An Explanatory Fantasy](https://medium.com/@gtrevorjay/you-could-have-invented-container-runtimes-an-explanatory-fantasy-764c5b389bd3#.svjwa71rv)" 964 | ] 965 | } 966 | ], 967 | "metadata": { 968 | "celltoolbar": "Slideshow", 969 | "kernelspec": { 970 | "display_name": "Python 3", 971 | "language": "python", 972 | "name": "python3" 973 | }, 974 | "language_info": { 975 | "codemirror_mode": { 976 | "name": "ipython", 977 | "version": 3 978 | }, 979 | "file_extension": ".py", 980 | "mimetype": "text/x-python", 981 | "name": "python", 982 | "nbconvert_exporter": "python", 983 | "pygments_lexer": "ipython3", 984 | "version": "3.7.6" 985 | } 986 | }, 987 | "nbformat": 4, 988 | "nbformat_minor": 1 989 | } 990 | -------------------------------------------------------------------------------- /notebooks/pstree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jseabold/pydata-chi-docker/369f9cf93dccbea3d1d31118cbe33934b46f4de5/notebooks/pstree.png -------------------------------------------------------------------------------- /up.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | docker-compose -p dask-distributed up -d 4 | docker-compose -p dask-distributed scale dask-worker=4 5 | --------------------------------------------------------------------------------