├── data
│   ├── .gitkeep
│   └── weather-small
│       ├── 2014-01-01.hdf5
│       ├── 2014-01-02.hdf5
│       ├── 2014-01-03.hdf5
│       ├── 2014-01-04.hdf5
│       ├── 2014-01-05.hdf5
│       ├── 2014-01-06.hdf5
│       ├── 2014-01-07.hdf5
│       ├── 2014-01-08.hdf5
│       ├── 2014-01-09.hdf5
│       ├── 2014-01-10.hdf5
│       ├── 2014-01-11.hdf5
│       ├── 2014-01-12.hdf5
│       ├── 2014-01-13.hdf5
│       ├── 2014-01-14.hdf5
│       ├── 2014-01-15.hdf5
│       ├── 2014-01-16.hdf5
│       ├── 2014-01-17.hdf5
│       ├── 2014-01-18.hdf5
│       ├── 2014-01-19.hdf5
│       ├── 2014-01-20.hdf5
│       ├── 2014-01-21.hdf5
│       ├── 2014-01-22.hdf5
│       ├── 2014-01-23.hdf5
│       ├── 2014-01-24.hdf5
│       ├── 2014-01-25.hdf5
│       ├── 2014-01-26.hdf5
│       ├── 2014-01-27.hdf5
│       ├── 2014-01-28.hdf5
│       ├── 2014-01-29.hdf5
│       ├── 2014-01-30.hdf5
│       └── 2014-01-31.hdf5
├── images
│   ├── barn.png
│   ├── barn-2.png
│   ├── mydask.png
│   ├── fail-case.gif
│   ├── race-car.png
│   ├── dask-side-1.png
│   ├── dask-side-2.png
│   ├── dask-zones.gif
│   ├── dask-zones.png
│   ├── task-graph.png
│   ├── dask_custom_parallel.png
│   └── mapreduce_vs_taskscheduling.png
├── README.md
├── .gitignore
├── environment.yml
├── LICENSE
├── prep_data.py
├── prep-alt.py
├── 03_array.ipynb
├── 05_delayed.ipynb
├── 02_dataframe.ipynb
└── 04_machine_learning.ipynb
/data/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/images/barn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/images/barn.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # intro-to-dask
2 | Resources for the Introduction to Dask e-learning tutorial
3 |
--------------------------------------------------------------------------------
/images/barn-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/images/barn-2.png
--------------------------------------------------------------------------------
/images/mydask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/images/mydask.png
--------------------------------------------------------------------------------
/images/fail-case.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/images/fail-case.gif
--------------------------------------------------------------------------------
/images/race-car.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/images/race-car.png
--------------------------------------------------------------------------------
/images/dask-side-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/images/dask-side-1.png
--------------------------------------------------------------------------------
/images/dask-side-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/images/dask-side-2.png
--------------------------------------------------------------------------------
/images/dask-zones.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/images/dask-zones.gif
--------------------------------------------------------------------------------
/images/dask-zones.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/images/dask-zones.png
--------------------------------------------------------------------------------
/images/task-graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/images/task-graph.png
--------------------------------------------------------------------------------
/images/dask_custom_parallel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/images/dask_custom_parallel.png
--------------------------------------------------------------------------------
/data/weather-small/2014-01-01.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-01.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-02.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-02.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-03.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-03.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-04.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-04.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-05.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-05.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-06.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-06.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-07.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-07.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-08.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-08.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-09.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-09.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-10.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-10.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-11.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-11.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-12.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-12.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-13.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-13.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-14.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-14.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-15.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-15.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-16.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-16.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-17.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-17.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-18.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-18.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-19.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-19.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-20.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-20.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-21.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-21.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-22.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-22.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-23.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-23.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-24.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-24.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-25.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-25.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-26.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-26.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-27.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-27.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-28.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-28.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-29.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-29.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-30.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-30.hdf5
--------------------------------------------------------------------------------
/data/weather-small/2014-01-31.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/data/weather-small/2014-01-31.hdf5
--------------------------------------------------------------------------------
/images/mapreduce_vs_taskscheduling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coiled/dask-elearning/main/images/mapreduce_vs_taskscheduling.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Ignore Mac DS_Store files
3 | .DS_Store
4 |
5 | # Jupyter Notebook
6 | .ipynb_checkpoints
7 |
8 | # data folder content
9 | data/*.csv
10 | data/*tar.gz
11 | data/nycflights
12 | data/random.hdf5
13 | data/weather-big
14 |
15 | #dask graph
16 | mydask.png
17 |
18 | #dask worker space
19 | dask-worker-space
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: dask-elearn
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | - python=3.9
6 | - bokeh=2.4
7 | - dask=2022.6.1
8 | - dask-labextension
9 | - distributed=2022.6.1
10 | - matplotlib
11 | - pandas=1.4
12 | - pip
13 | - python-graphviz
14 | - ipycytoscape
15 | - pyarrow
16 | - s3fs
17 | - jupyterlab=3
18 | - h5py
19 | - holidays
20 | - scikit-learn>=0.22.1
21 | - scikit-image>=0.15.0
22 | - dask-ml
23 | - ipywidgets>=7.5
24 | - zarr
25 | - coiled
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2022, Richard Pelgrim
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/prep_data.py:
--------------------------------------------------------------------------------
  1 | # This script was modified from the original: https://github.com/coiled/pydata-global-dask/blob/master/prep.py
2 | import time
3 | import sys
4 | import argparse
5 | import os
6 | from glob import glob
7 | import tarfile
8 | import urllib.request
9 |
10 | import pandas as pd
11 |
12 |
13 | DATASETS = ["flights", "all"]
14 | here = os.path.dirname(__file__)
15 | data_dir = os.path.abspath(os.path.join(here, "data"))
16 |
17 | print(f"{data_dir=}")
18 |
19 | def parse_args(args=None):
20 | parser = argparse.ArgumentParser(
21 | description="Downloads, generates and prepares data for the Dask tutorial."
22 | )
23 | parser.add_argument(
24 | "--no-ssl-verify",
25 | dest="no_ssl_verify",
26 | action="store_true",
27 | default=False,
28 | help="Disables SSL verification.",
29 | )
30 | parser.add_argument(
31 | "--small",
32 | action="store_true",
33 | default=None,
34 | help="Whether to use smaller example datasets. Checks DASK_TUTORIAL_SMALL environment variable if not specified.",
35 | )
36 | parser.add_argument(
37 | "-d", "--dataset", choices=DATASETS, help="Datasets to generate.", default="all"
38 | )
39 |
40 | return parser.parse_args(args)
41 |
42 |
43 | if not os.path.exists(data_dir):
44 | raise OSError(
45 | "data/ directory not found, aborting data preparation. "
46 | 'Restore it with "git checkout data" from the base '
47 | "directory."
48 | )
49 |
50 |
51 | def flights(small=None):
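    """Download and extract the NYC flights CSV data into data/nycflights; in small mode, truncate each CSV to 1,000 lines."""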
52 | start = time.time()
53 | flights_raw = os.path.join(data_dir, "nycflights.tar.gz")
54 | flightdir = os.path.join(data_dir, "nycflights")
55 | if small is None:
56 | small = bool(os.environ.get("DASK_TUTORIAL_SMALL", False))
57 |
58 | if small:
59 | N = 500
60 | else:
61 | N = 10_000
62 |
63 | if not os.path.exists(flights_raw):
64 | print("- Downloading NYC Flights dataset... ", end="", flush=True)
65 | url = "https://storage.googleapis.com/dask-tutorial-data/nycflights.tar.gz"
66 | urllib.request.urlretrieve(url, flights_raw)
67 | print("done", flush=True)
68 |
69 | if not os.path.exists(flightdir):
70 | print("- Extracting flight data... ", end="", flush=True)
71 | tar_path = os.path.join(data_dir, "nycflights.tar.gz")
72 | with tarfile.open(tar_path, mode="r:gz") as flights:
73 | flights.extractall(data_dir)
74 |
75 | if small:
76 | for path in glob(os.path.join(data_dir, "nycflights", "*.csv")):
77 | with open(path, "r") as f:
78 | lines = f.readlines()[:1000]
79 |
80 | with open(path, "w") as f:
81 | f.writelines(lines)
82 |
83 | print("done", flush=True)
84 |
85 | else:
86 | return
87 |
88 | end = time.time()
89 | print("** Created flights dataset! in {:0.2f}s**".format(end - start))
90 |
91 |
92 | def main(args=None):
93 | args = parse_args(args)
94 |
95 | if args.no_ssl_verify:
96 | print("- Disabling SSL Verification... ", end="", flush=True)
97 | import ssl
98 |
99 | ssl._create_default_https_context = ssl._create_unverified_context
100 | print("done", flush=True)
101 |
102 | if args.dataset == "flights" or args.dataset == "all":
103 | flights(args.small)
104 |
105 |
106 | if __name__ == "__main__":
107 | sys.exit(main())
108 |
--------------------------------------------------------------------------------
/prep-alt.py:
--------------------------------------------------------------------------------
1 | import time
2 | import sys
3 | import argparse
4 | import os
5 | from glob import glob
6 | import json
7 | import gzip
8 | import tarfile
9 | import urllib.request
10 |
11 | import h5py
12 | import numpy as np
13 | import pandas as pd
14 | import holidays
15 | from skimage.transform import resize
16 |
17 |
18 | DATASETS = ["random", "weather", "flights", "all"]
19 | here = os.path.dirname(__file__)
20 | data_dir = os.path.abspath(os.path.join(here, 'data'))
21 |
22 |
23 | def parse_args(args=None):
24 | parser = argparse.ArgumentParser(description='Downloads, generates and prepares data for the Dask tutorial.')
25 | parser.add_argument('--no-ssl-verify', dest='no_ssl_verify', action='store_true',
26 | default=False, help='Disables SSL verification.')
27 | parser.add_argument("--small", action="store_true", default=None,
28 | help="Whether to use smaller example datasets. Checks DASK_TUTORIAL_SMALL environment variable if not specified.")
29 | parser.add_argument("-d", "--dataset", choices=DATASETS, help="Datasets to generate.", default="all")
30 |
31 | return parser.parse_args(args)
32 |
33 |
34 |
35 | if not os.path.exists(data_dir):
36 | raise OSError('data/ directory not found, aborting data preparation. ' \
37 | 'Restore it with "git checkout data" from the base ' \
38 | 'directory.')
39 |
40 |
41 | def holiday():
42 | holidays_dir = os.path.join(data_dir, "holidays")
43 | if os.path.exists(holidays_dir):
44 | return
45 |
46 | years = [
47 | 1990, 1991, 1992, 1993, 1994,
48 | 1995, 1996, 1997, 1998, 1999
49 | ]
50 | holidays_dict = holidays.US(years=years)
51 | us_holidays = pd.DataFrame(
52 | data={
53 | "Date": holidays_dict.keys(),
54 | "holiday": holidays_dict.values()
55 | },
56 | )
57 | us_holidays = us_holidays.assign(
58 | Date=us_holidays.Date.astype("datetime64[ns]"))
59 | us_holidays.to_parquet(holidays_dir)
60 | print("Created holidays data.")
61 |
62 |
63 | def flights(small=None):
64 | start = time.time()
65 | flights_raw = os.path.join(data_dir, 'nycflights.tar.gz')
66 | flightdir = os.path.join(data_dir, 'nycflights')
67 | jsondir = os.path.join(data_dir, 'flightjson')
68 | if small is None:
69 | small = bool(os.environ.get("DASK_TUTORIAL_SMALL", False))
70 |
71 | if small:
72 | N = 500
73 | else:
74 | N = 10_000
75 |
76 | if not os.path.exists(flights_raw):
77 | print("- Downloading NYC Flights dataset... ", end='', flush=True)
78 | url = "https://storage.googleapis.com/dask-tutorial-data/nycflights.tar.gz"
79 | urllib.request.urlretrieve(url, flights_raw)
80 | print("done", flush=True)
81 |
82 | if not os.path.exists(flightdir):
83 | print("- Extracting flight data... ", end='', flush=True)
84 | tar_path = os.path.join(data_dir, 'nycflights.tar.gz')
85 | with tarfile.open(tar_path, mode='r:gz') as flights:
86 | flights.extractall('data/')
87 |
88 | if small:
89 | for path in glob(os.path.join(data_dir, "nycflights", "*.csv")):
90 | with open(path, 'r') as f:
91 | lines = f.readlines()[:1000]
92 |
93 | with open(path, 'w') as f:
94 | f.writelines(lines)
95 |
96 | print("done", flush=True)
97 |
98 | if not os.path.exists(jsondir):
99 | print("- Creating json data... ", end='', flush=True)
100 | os.mkdir(jsondir)
101 | for path in glob(os.path.join(data_dir, 'nycflights', '*.csv')):
102 | prefix = os.path.splitext(os.path.basename(path))[0]
103 | df = pd.read_csv(path, nrows=N)
104 | df.to_json(os.path.join(data_dir, 'flightjson', prefix + '.json'),
105 | orient='records', lines=True)
106 | print("done", flush=True)
107 | else:
108 | return
109 |
110 | end = time.time()
111 | print("** Created flights dataset! in {:0.2f}s**".format(end - start))
112 |
113 | def random_array(small=None):
114 | if small is None:
115 | small = bool(os.environ.get("DASK_TUTORIAL_SMALL", False))
116 |
117 | if small:
118 | blocksize = 5000
119 | else:
120 | blocksize = 1000000
121 |
122 | nblocks = 1000
123 | shape = nblocks * blocksize
124 |
125 | t0 = time.time()
126 | if os.path.exists(os.path.join(data_dir, 'random.hdf5')):
127 | return
128 |
129 | with h5py.File(os.path.join(data_dir, 'random.hdf5'), mode='w') as f:
130 | dset = f.create_dataset('/x', shape=(shape,), dtype='f4')
131 | for i in range(0, shape, blocksize):
132 | dset[i: i + blocksize] = np.random.exponential(size=blocksize)
133 |
134 | t1 = time.time()
135 | print("Created random data for array exercise in {:0.2f}s".format(t1 - t0))
136 |
137 |
138 | def create_weather(small=None):
139 | t0 = time.time()
140 | if small is None:
141 | small = bool(os.environ.get("DASK_TUTORIAL_SMALL", False))
142 |
143 | if small:
144 | growth = 1
145 | else:
146 | growth = 32
147 | filenames = sorted(glob(os.path.join(data_dir, 'weather-small', '*.hdf5')))
148 |
149 | if not filenames:
150 | ws_dir = os.path.join(data_dir, 'weather-small')
151 | raise ValueError('Did not find any hdf5 files in {}'.format(ws_dir))
152 |
153 | if not os.path.exists(os.path.join(data_dir, 'weather-big')):
154 | os.mkdir(os.path.join(data_dir, 'weather-big'))
155 |
156 | if all(os.path.exists(fn.replace('small', 'big')) for fn in filenames):
157 | return
158 |
159 | for fn in filenames:
160 | with h5py.File(fn, mode='r') as f:
161 | x = f['/t2m'][:]
162 |
163 | if small:
164 | y = x
165 | chunks = (180, 180)
166 | else:
167 | y = resize(x, (x.shape[0] * growth, x.shape[1] * growth), mode='constant')
168 | chunks = (500, 500)
169 |
170 | out_fn = os.path.join(data_dir, 'weather-big', os.path.split(fn)[-1])
171 |
172 | with h5py.File(out_fn, mode='w') as f:
173 | f.create_dataset('/t2m', data=y, chunks=chunks)
174 | t1 = time.time()
175 | print("Created weather dataset in {:0.2f}s".format(t1 - t0))
176 |
177 |
178 | def main(args=None):
179 | args = parse_args(args)
180 |
181 | if (args.no_ssl_verify):
182 | print("- Disabling SSL Verification... ", end='', flush=True)
183 | import ssl
184 | ssl._create_default_https_context = ssl._create_unverified_context
185 | print("done", flush=True)
186 |
187 | if args.dataset == "random" or args.dataset == "all":
188 | random_array(args.small)
189 | if args.dataset == "weather" or args.dataset == "all":
190 | create_weather(args.small)
191 | if args.dataset == "flights" or args.dataset == "all":
192 | flights(args.small)
193 | holiday()
194 |
195 |
196 | if __name__ == '__main__':
197 | sys.exit(main())
198 |
--------------------------------------------------------------------------------
/03_array.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "# Arrays"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "Dask array provides a parallel, larger-than-memory implementation of NumPy. \n",
24 | "\n",
25 | "It will look and feel a lot like NumPy, but does not suffer from the same scalability limitations.\n",
26 | "\n",
27 | "\n",
28 | "
\n",
29 | "\n",
30 | "\n",
31 | "* **Parallel**: Uses all of the cores on your computer\n",
32 | "* **Larger-than-memory**: Lets you work on datasets that are larger than your available memory by breaking up your array into many small pieces, operating on those pieces in an order that minimizes the memory footprint of your computation, and effectively streaming data from disk.\n",
33 | "* **Blocked Algorithms**: Perform large computations by performing many smaller computations\n",
34 | "\n",
35 | "In this notebook, we'll build some understanding by implementing some blocked algorithms from scratch. We'll then use Dask Array to analyze large datasets, in parallel, using a familiar NumPy-like API.\n"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "## Blocked Algorithms"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "A *blocked algorithm* executes on a large dataset by breaking it up into many small blocks.\n",
50 | "\n",
51 | "For example, consider taking the sum of a billion numbers. We might instead break up the array into 1,000 chunks, each of size 1,000,000, take the sum of each chunk, and then take the sum of the intermediate sums.\n",
52 | "\n",
  53 |     "We achieve the intended result (one sum of one billion numbers) by performing many smaller computations (one thousand sums of one million numbers each, followed by another sum of one thousand numbers).\n",
54 | "\n",
55 | "We do exactly this with Python and NumPy in the following example:"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "### Create data"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "%run prep-alt.py -d random"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "# Load data with h5py\n",
81 | "# this creates a pointer to the data, but does not actually load\n",
82 | "import os\n",
83 | "\n",
84 | "import h5py\n",
85 | "\n",
86 | "f = h5py.File(os.path.join(\"data\", \"random.hdf5\"), mode=\"r\")\n",
87 | "dset = f[\"/x\"]"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "dset"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "### Example: Compute sum using blocked algorithm"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "Before using dask, let's consider the concept of blocked algorithms. We can compute the sum of a large number of elements by loading them chunk-by-chunk, and keeping a running total.\n",
111 | "\n",
112 | "Here we compute the sum of this large array on disk by \n",
113 | "\n",
114 | "1. Computing the sum of each 1,000,000 sized chunk of the array\n",
115 | "2. Computing the sum of the 1,000 intermediate sums\n",
116 | "\n",
 117 |     "Note that both the loading and the summing happen sequentially in the notebook kernel."
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "%%time\n",
127 | "# Compute sum of large array, one million numbers at a time\n",
128 | "sums = []\n",
129 | "for i in range(0, 1_000_000_000, 1_000_000):\n",
130 | " chunk = dset[i : i + 1_000_000] # pull out numpy array\n",
131 | " sums.append(chunk.sum())\n",
132 | "\n",
133 | "total = sum(sums)\n",
134 | "print(total)"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 | "`dask.array` contains these algorithms\n",
142 | "--------------------------------------------\n",
143 | "\n",
144 | "Dask.array is a NumPy-like library that does these kinds of tricks to operate on large datasets that don't fit into memory. It extends beyond the linear problems discussed above to full N-Dimensional algorithms and a decent subset of the NumPy interface."
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "**Create `dask.array` object**"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "You can create a `dask.array` `Array` object with the `da.from_array` function. This function accepts\n",
159 | "\n",
160 | "1. `data`: Any object that supports NumPy slicing, like `dset`\n",
161 | "2. `chunks`: A chunk size to tell us how to block up our array, like `(1_000_000,)`"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": [
170 | "import dask.array as da\n",
171 | "\n",
172 | "x = da.from_array(dset, chunks=(1_000_000,))\n",
173 | "x"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "**Manipulate `dask.array` object as you would a numpy array**"
181 | ]
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "metadata": {},
186 | "source": [
 187 |     "Now that we have an `Array`, we can perform standard numpy-style computations like arithmetic, mathematics, slicing, reductions, etc.\n",
188 | "\n",
189 | "The interface is familiar, but the actual work is different. `dask_array.sum()` builds an expression of the computation. It does not do the computation yet. `numpy_array.sum()` computes the sum immediately."
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {},
196 | "outputs": [],
197 | "source": [
198 | "result = x.sum()\n",
199 | "result"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "**Compute result**"
207 | ]
208 | },
209 | {
210 | "cell_type": "markdown",
211 | "metadata": {},
212 | "source": [
213 | "Dask.array objects are lazily evaluated. Operations like `.sum` build up a graph of blocked tasks to execute. \n",
214 | "\n",
215 | "We ask for the final result with a call to `.compute()`. This triggers the actual computation."
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "result.compute()"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "### Example: Compute the mean"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {},
237 | "source": [
238 | "This is a small change to the example above."
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": [
247 | "x.mean().compute()"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {},
253 | "source": [
254 | "Does this match your result from before?"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {},
260 | "source": [
261 | "### Example: Compute the standard deviation\n",
262 | "\n",
263 | "Again, this follows regular NumPy syntax, except for the added `.compute()`"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": null,
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "x.std().compute()"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "## Exercise: Meteorological data"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "%run prep-alt.py -d weather"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "There is 2GB of weather data in HDF5 files in `data/weather-big/*.hdf5`. We'll use the `h5py` library to interact with this data and `dask.array` to compute on it.\n",
296 | "\n",
297 | "Our goal is to visualize the average temperature on the surface of the Earth for this month. This will require a mean over all of this data. We'll do this in the following steps\n",
298 | "\n",
299 | "1. Create `h5py.Dataset` objects for each of the days of data on disk (`dsets`)\n",
300 | "2. Wrap these with `da.from_array` calls \n",
301 | "3. Stack these datasets along time with a call to `da.stack`\n",
302 | "4. Compute the mean along the newly stacked time axis with the `.mean()` method\n",
303 | "5. Visualize the result with `matplotlib.pyplot.imshow`"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": null,
309 | "metadata": {},
310 | "outputs": [],
311 | "source": [
312 | "import os\n",
313 | "from glob import glob\n",
314 | "\n",
315 | "import h5py\n",
316 | "\n",
317 | "filenames = sorted(glob(os.path.join(\"data\", \"weather-big\", \"*.hdf5\")))\n",
318 | "dsets = [h5py.File(filename, mode=\"r\")[\"/t2m\"] for filename in filenames]\n",
319 | "dsets[0]"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": null,
325 | "metadata": {},
326 | "outputs": [],
327 | "source": [
 328 |     "type(dsets[0])"
329 | ]
330 | },
331 | {
332 | "cell_type": "markdown",
333 | "metadata": {},
334 | "source": [
335 | "### Exercise 1: Integrate with `dask.array`"
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "metadata": {},
341 | "source": [
342 | "Make a list of `dask.array` objects out of your list of `h5py.Dataset` objects using the `da.from_array` function with a chunk size of `(500, 500)`."
343 | ]
344 | },
345 | {
346 | "cell_type": "markdown",
347 | "metadata": {},
348 | "source": [
349 | "Uncomment and run the cell below to see the solution."
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": null,
355 | "metadata": {
356 | "jupyter": {
357 | "source_hidden": true
358 | },
359 | "tags": []
360 | },
361 | "outputs": [],
362 | "source": [
363 | "arrays = [da.from_array(dset, chunks=(500, 500)) for dset in dsets]\n",
364 | "arrays"
365 | ]
366 | },
367 | {
368 | "cell_type": "markdown",
369 | "metadata": {},
370 | "source": [
371 | "### Exercise 2: Stack list of `dask.array` objects into a single `dask.array` object with `da.stack`"
372 | ]
373 | },
374 | {
375 | "cell_type": "markdown",
376 | "metadata": {},
377 | "source": [
378 | "Stack these along the first axis so that the shape of the resulting array is `(31, 5760, 11520)`."
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {
385 | "jupyter": {
386 | "source_hidden": true
387 | },
388 | "tags": []
389 | },
390 | "outputs": [],
391 | "source": [
392 | "x = da.stack(arrays, axis=0)\n",
393 | "x"
394 | ]
395 | },
396 | {
397 | "cell_type": "markdown",
398 | "metadata": {},
399 | "source": [
400 | "### Exercise 3: Plot the mean of this array along the time (`0th`) axis\n",
401 | "\n",
402 | "Complete the following:\n",
403 | "\n",
404 | "```python\n",
405 | "result = ...\n",
406 | "fig = plt.figure(figsize=(16, 8))\n",
407 | "plt.imshow(result, cmap='RdBu_r')\n",
408 | "```"
409 | ]
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": null,
414 | "metadata": {},
415 | "outputs": [],
416 | "source": [
417 | "import matplotlib.pyplot as plt"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": null,
423 | "metadata": {
424 | "jupyter": {
425 | "source_hidden": true
426 | },
427 | "tags": [
428 | "raises-exception"
429 | ]
430 | },
431 | "outputs": [],
432 | "source": [
433 | "result = x.mean(axis=0)\n",
434 | "fig = plt.figure(figsize=(16, 8))\n",
435 | "plt.imshow(result, cmap=\"RdBu_r\");"
436 | ]
437 | },
438 | {
439 | "cell_type": "markdown",
440 | "metadata": {},
441 | "source": [
442 | "Performance comparison\n",
443 | "---------------------------\n",
444 | "\n",
445 | "The following experiment was performed on a personal laptop with 16GB of RAM and 8 CPU cores. Your performance may vary. If you attempt the NumPy version then please ensure that you have more than 4GB of main memory."
446 | ]
447 | },
448 | {
449 | "cell_type": "markdown",
450 | "metadata": {},
451 | "source": [
452 | "**NumPy: ~7s, Needs gigabytes of memory**"
453 | ]
454 | },
455 | {
456 | "cell_type": "markdown",
457 | "metadata": {},
458 | "source": [
459 | "```python\n",
 460 |     "%%time\n",
 461 |     "import numpy as np\n",
 462 |     "\n",
463 | "x = np.random.normal(10, 0.1, size=(20000, 20000)) \n",
464 | "y = x.mean(axis=0)[::100] \n",
465 | "y \n",
466 | "\n",
467 | "CPU times: user 6.73 s, sys: 331 ms, total: 7.16 s\n",
468 | "Wall time: 7.11 s\n",
469 | "```"
470 | ]
471 | },
472 | {
473 | "cell_type": "markdown",
474 | "metadata": {},
475 | "source": [
476 | "**Dask Array: ~1.5s, Needs megabytes of memory**"
477 | ]
478 | },
479 | {
480 | "cell_type": "markdown",
481 | "metadata": {},
482 | "source": [
483 | "```python\n",
 484 |     "%%time\n",
 485 |     "import dask.array as da\n",
 486 |     "\n",
487 | "x = da.random.normal(10, 0.1, size=(20000, 20000), chunks=(1000, 1000))\n",
488 | "y = x.mean(axis=0)[::100] \n",
489 | "y.compute() \n",
490 | "\n",
491 | "CPU times: user 635 ms, sys: 119 ms, total: 754 ms\n",
492 | "Wall time: 1.69 s\n",
493 | "```"
494 | ]
495 | },
496 | {
497 | "cell_type": "markdown",
498 | "metadata": {},
499 | "source": [
500 | "**Discussion**"
501 | ]
502 | },
503 | {
504 | "cell_type": "markdown",
505 | "metadata": {},
506 | "source": [
 507 |     "Dask finished faster, but used more total CPU time, because the chunked layout of the array let Dask transparently parallelize the computation across multiple cores."
508 | ]
509 | }
510 | ],
511 | "metadata": {
512 | "anaconda-cloud": {},
513 | "kernelspec": {
514 | "display_name": "Python 3 (ipykernel)",
515 | "language": "python",
516 | "name": "python3"
517 | },
518 | "language_info": {
519 | "codemirror_mode": {
520 | "name": "ipython",
521 | "version": 3
522 | },
523 | "file_extension": ".py",
524 | "mimetype": "text/x-python",
525 | "name": "python",
526 | "nbconvert_exporter": "python",
527 | "pygments_lexer": "ipython3",
528 | "version": "3.9.13"
529 | }
530 | },
531 | "nbformat": 4,
532 | "nbformat_minor": 4
533 | }
534 |
--------------------------------------------------------------------------------
/05_delayed.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "ab072a3e",
6 | "metadata": {},
7 | "source": [
8 | "
\n",
11 | "\n",
12 | "# Dask Delayed\n",
13 | "\n",
 14 |     "Dask DataFrames, Dask Arrays and Dask-ML are parallel versions of PyData libraries you likely know and love. But sometimes we encounter problems that could benefit from parallel computing but that do not fit neatly into a DataFrame, Array or machine-learning workflow.\n",
15 | "\n",
16 | "Dask delayed is an interface that can be used to parallelize existing Python code and custom algorithms. \n",
17 | "\n",
 18 |     "A first step in determining whether we can use `dask.delayed` is to identify some level of parallelism that we haven't exploited yet; if there is, `dask.delayed` can take care of it for us. \n",
19 | "\n",
 20 |     "The following two functions perform simple computations, and we use `sleep` to simulate work. "
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "id": "ddfb8053",
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "from time import sleep\n",
31 | "\n",
32 | "def inc(x):\n",
33 | " \"\"\"Increments x by one\"\"\"\n",
34 | " sleep(1)\n",
35 | " return x + 1\n",
36 | "\n",
37 | "def add(x, y):\n",
38 | " \"\"\"Adds x and y\"\"\"\n",
39 | " sleep(1)\n",
40 | " return x + y"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "id": "54956d17",
46 | "metadata": {},
47 | "source": [
48 | "Let's do some operations and time these functions using the `%%time` magic at the beginning of the cell. "
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "id": "45bb4aa1",
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "%%time\n",
59 | "\n",
60 | "x = inc(1)\n",
61 | "y = inc(2)\n",
62 | "z = add(x, y)"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "id": "f7c10bb2",
68 | "metadata": {},
69 | "source": [
 70 |     "The execution of the cell above took three seconds; this happens because we are calling each function sequentially. The computations above can be represented by the following graph:\n",
71 | "\n",
 72 |     "*(figure: task graph showing the two `inc` calls feeding into `add`)*\n",
75 | "\n",
 76 |     "From looking at the task graph, the opportunity for parallelization is more evident, since the two calls to the `inc` function are completely independent of one another. Let's explore how `dask.delayed` can help us with this.\n",
77 | "\n",
78 | "\n",
79 | "### `dask.delayed` \n",
80 | "\n",
81 | "We can use `dask.delayed` to transform the `inc` and `add` functions into \"lazy\" versions of themselves. "
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "id": "acfae103",
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "from dask import delayed"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "id": "99e3dbf8",
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "%%time\n",
102 | "\n",
103 | "# x = inc(1)\n",
104 | "# y = inc(2)\n",
105 | "# z = add(x, y)\n",
106 | "\n",
107 | "a = delayed(inc)(1)\n",
108 | "b = delayed(inc)(2)\n",
109 | "c = delayed(add)(a, b)"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "id": "f12638d6",
115 | "metadata": {},
116 | "source": [
 117 |     "When we call the `delayed` version of a function and pass it arguments, the original function isn't actually called yet; that's why the execution finishes very quickly. Instead, a `Delayed` object is created, which keeps track of the function to call and the arguments to pass to it. \n",
118 | "\n",
 119 |     "If we inspect `c`, we will notice that instead of having the value five, we have what is called a `Delayed` object."
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "id": "d885f706",
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "print(c)"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "id": "37007eaf",
135 | "metadata": {},
136 | "source": [
 137 |     "We can visualize this object's task graph by doing:"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "id": "063f5c18",
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "c.visualize()"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "id": "4c2386a1",
153 | "metadata": {},
154 | "source": [
155 | "Up to this point the object `c` holds all the information we need to compute the result. We can evaluate the result with `.compute()`."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "id": "41558b67",
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "%%time\n",
166 | "\n",
167 | "c.compute()"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "id": "ed58cb96",
173 | "metadata": {},
174 | "source": [
 175 |     "Notice that the computation now took 2s instead of 3s; this is because the two `inc` computations run in parallel. "
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "id": "556904bb",
181 | "metadata": {},
182 | "source": [
183 | "## Parallelizing a `for`-loop\n",
184 | "\n",
 185 |     "When we perform the same group of operations multiple times in the form of a `for`-loop, there is a chance that we can perform these computations in parallel. For example, the following serial code can be parallelized using `delayed`: "
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "id": "cad984eb",
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "data = list(range(8))"
196 | ]
197 | },
198 | {
199 | "cell_type": "markdown",
200 | "id": "594b7744",
201 | "metadata": {},
202 | "source": [
203 | "#### Sequential code"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "id": "44fa16ea",
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "%%time\n",
214 | "results = []\n",
215 | "for i in data:\n",
216 | " y = inc(i) # do something here\n",
217 | " results.append(y)\n",
218 | " \n",
219 | "total = sum(results) # do something here"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "id": "ed5cceae",
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "print(f'{total = }')"
230 | ]
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "id": "8a68dff0",
235 | "metadata": {},
236 | "source": [
237 | "### Exercise 1 \n",
238 | "\n",
 239 |     "Notice that both the `inc` and `sum` operations can be delayed. Use `delayed` to parallelize the sequential code above, compute the `total`, and time it using `%%time`. "
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "id": "836ccbe1",
245 | "metadata": {},
246 | "source": [
247 | "Uncomment and run the cell below to see the solution."
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "id": "47f2ee0b",
254 | "metadata": {
255 | "jupyter": {
256 | "source_hidden": true
257 | },
258 | "tags": []
259 | },
260 | "outputs": [],
261 | "source": [
262 | "results = []\n",
263 | "for i in data:\n",
264 | " y = delayed(inc)(i) \n",
265 | " results.append(y)\n",
266 | " \n",
267 | "total = delayed(sum)(results)"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "id": "7394312f",
273 | "metadata": {},
274 | "source": [
 275 |     "In the code above, the `sum` step does not run in parallel, but it depends on each of the `inc` steps, which is why it needs the `delayed` decorator too. The `inc` steps will be parallelized, then aggregated with the `sum` step.\n",
 276 |     "\n",
 277 |     "Notice that we can apply `delayed` to built-in functions, as we did with `sum` in the code above. "
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "id": "9ab8ec9d",
284 | "metadata": {},
285 | "outputs": [],
286 | "source": [
287 | "total"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "id": "85ccafbf",
294 | "metadata": {},
295 | "outputs": [],
296 | "source": [
297 | "total.visualize()"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "id": "bd495500",
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "%%time\n",
308 | "total.compute()"
309 | ]
310 | },
311 | {
312 | "cell_type": "markdown",
313 | "id": "72ab18d7",
314 | "metadata": {},
315 | "source": [
316 | "### The `@delayed` syntax \n",
317 | "\n",
 318 |     "The `delayed` decorator can also be used by \"decorating\" the function you want to parallelize with `@delayed`."
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "id": "1faf2ba1",
325 | "metadata": {},
326 | "outputs": [],
327 | "source": [
328 | "@delayed \n",
329 | "def double(x):\n",
 330 |     "    \"\"\"Doubles x\"\"\"\n",
331 | " sleep(1)\n",
332 | " return 2*x "
333 | ]
334 | },
335 | {
336 | "cell_type": "markdown",
337 | "id": "409f9557",
338 | "metadata": {},
339 | "source": [
340 | "Then when we call this new `double` function we obtain a delayed object:"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "id": "4501eeda",
347 | "metadata": {},
348 | "outputs": [],
349 | "source": [
350 | "d = double(4)\n",
351 | "print(d)"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": null,
357 | "id": "1a313693",
358 | "metadata": {},
359 | "outputs": [],
360 | "source": [
361 | "%%time\n",
362 | "d.compute()"
363 | ]
364 | },
365 | {
366 | "cell_type": "markdown",
367 | "id": "24a13213",
368 | "metadata": {},
369 | "source": [
370 | "### Exercise 2\n",
371 | "\n",
 372 |     "Using the `@delayed` decorator, create the parallel versions of `inc` and `add`:\n",
373 | "\n",
374 | "```python\n",
375 | "def inc(x):\n",
376 | " \"\"\"Increments x by one\"\"\"\n",
377 | " sleep(1)\n",
378 | " return x + 1\n",
379 | "\n",
380 | "def add(x, y):\n",
381 | " \"\"\"Adds x and y\"\"\"\n",
382 | " sleep(1)\n",
383 | " return x + y\n",
384 | "```"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": null,
390 | "id": "7d2847d2",
391 | "metadata": {
392 | "jupyter": {
393 | "source_hidden": true
394 | },
395 | "tags": []
396 | },
397 | "outputs": [],
398 | "source": [
399 | "@delayed\n",
400 | "def inc(x):\n",
401 | " \"\"\"Increments x by one\"\"\"\n",
402 | " sleep(1)\n",
403 | " return x + 1\n",
404 | "\n",
405 | "@delayed\n",
406 | "def add(x, y):\n",
407 | " \"\"\"Adds x and y\"\"\"\n",
408 | " sleep(1)\n",
409 | " return x + y"
410 | ]
411 | },
412 | {
413 | "cell_type": "markdown",
414 | "id": "4a12256b",
415 | "metadata": {},
416 | "source": [
417 | "``Delayed`` objects support several standard Python operations, each of which creates another ``Delayed`` object representing the result:\n",
418 | "\n",
419 | "- Arithmetic operators, e.g. `*`, `-`, `+`\n",
420 | "- Item access and slicing, e.g. `x[0]`, `x[1:3]`\n",
421 | "- Attribute access, e.g. `x.size`\n",
422 | "- Method calls, e.g. `x.index(0)`\n",
423 | "\n",
424 | "For example you can do:"
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": null,
430 | "id": "e5ed731f",
431 | "metadata": {},
432 | "outputs": [],
433 | "source": [
434 | "result = (inc(5) * inc(7)) + (inc(3) * inc(2))\n",
435 | "result.visualize()"
436 | ]
437 | },
438 | {
439 | "cell_type": "code",
440 | "execution_count": null,
441 | "id": "cacd838b",
442 | "metadata": {},
443 | "outputs": [],
444 | "source": [
445 | "%%time\n",
446 | "result.compute()"
447 | ]
448 | },
449 | {
450 | "cell_type": "markdown",
451 | "id": "d1ca4b69",
452 | "metadata": {},
453 | "source": [
454 | "## Another for-loop example "
455 | ]
456 | },
457 | {
458 | "cell_type": "markdown",
459 | "id": "58dcb089",
460 | "metadata": {},
461 | "source": [
 462 |     "Let's say we want to perform some operations like `inc`, `double` and `add` on a list of data, and finally aggregate all the results. We can use our `delayed`-decorated functions to perform these computations faster. \n",
 463 |     "The serial version of the code below would take approximately 24 seconds; let's see how long the parallel version takes:"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": null,
469 | "id": "7b6b24b1",
470 | "metadata": {},
471 | "outputs": [],
472 | "source": [
473 | "data = list(range(8))\n",
474 | "\n",
475 | "output = []\n",
476 | "for x in data:\n",
477 | " a = inc(x) #parallel version\n",
478 | " b = double(x) #parallel version\n",
479 | " c = add(a, b) #parallel version\n",
480 | " output.append(c)\n",
481 | "\n",
482 | "total = delayed(sum)(output)\n",
483 | "total"
484 | ]
485 | },
486 | {
487 | "cell_type": "markdown",
488 | "id": "3e12ae2b",
489 | "metadata": {},
490 | "source": [
 491 |     "Notice that `inc`, `double` and `add` in the code above are already the parallel versions, since we decorated them with `@delayed`."
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "execution_count": null,
497 | "id": "e0f2d3e5",
498 | "metadata": {},
499 | "outputs": [],
500 | "source": [
501 | "total.visualize()"
502 | ]
503 | },
504 | {
505 | "cell_type": "markdown",
506 | "id": "57f38fc4",
507 | "metadata": {},
508 | "source": [
509 | "### Exercise: How long will this task graph take to compute on a machine with 8 cores?"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": null,
515 | "id": "f80ffe62",
516 | "metadata": {},
517 | "outputs": [],
518 | "source": [
519 | "%%time\n",
520 | "total.compute()"
521 | ]
522 | },
523 | {
524 | "cell_type": "markdown",
525 | "id": "f2ead99e",
526 | "metadata": {},
527 | "source": [
528 | "## Extra resources\n",
529 | "\n",
530 | "For more examples on `dask.delayed` check:\n",
531 | "- Main Dask tutorial: [Delayed lesson](https://github.com/dask/dask-tutorial/blob/main/01_dask.delayed.ipynb)\n",
532 | "- More examples on Delayed: [PyData global - Dask tutorial - Delayed](https://github.com/coiled/pydata-global-dask/blob/master/1-delayed.ipynb)\n",
533 | "- Short screencast on Dask delayed: [How to parallelize Python code with Dask Delayed (3min)](https://www.youtube.com/watch?v=-EUlNJI2QYs)\n",
534 | "- [Dask Delayed documentation](https://docs.dask.org/en/latest/delayed.html)\n",
535 | "- [Delayed Best Practices](https://docs.dask.org/en/latest/delayed-best-practices.html)\n"
536 | ]
537 | }
538 | ],
539 | "metadata": {
540 | "kernelspec": {
541 | "display_name": "Python 3 (ipykernel)",
542 | "language": "python",
543 | "name": "python3"
544 | },
545 | "language_info": {
546 | "codemirror_mode": {
547 | "name": "ipython",
548 | "version": 3
549 | },
550 | "file_extension": ".py",
551 | "mimetype": "text/x-python",
552 | "name": "python",
553 | "nbconvert_exporter": "python",
554 | "pygments_lexer": "ipython3",
555 | "version": "3.9.13"
556 | }
557 | },
558 | "nbformat": 4,
559 | "nbformat_minor": 5
560 | }
561 |
--------------------------------------------------------------------------------
/02_dataframe.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "8f20d516",
6 | "metadata": {
7 | "tags": []
8 | },
9 | "source": [
10 | "
\n",
13 | "\n",
14 | "# Process Tabular Data with Dask DataFrame\n",
15 | "In this notebook we will learn about the [Dask DataFrame](https://docs.dask.org/en/latest/dataframe.html), a tabular DataFrame interface based on pandas that will automatically build parallel computations."
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "id": "595e457f",
21 | "metadata": {
22 | "tags": []
23 | },
24 | "source": [
25 | "## When to use Dask DataFrames\n",
26 | "\n",
27 | "Pandas is great for tabular datasets that fit in memory. If your data fits in memory then you should use Pandas. **Dask becomes useful when the dataset you want to analyze is larger than your machine's RAM** where you would normally run into `MemoryError`s.\n",
28 | "\n",
29 | "```python\n",
30 | " MemoryError: ...\n",
31 | "```\n",
32 | "\n",
33 | "This also means:\n",
34 | "\n",
35 | "## Don't use Dask DataFrames if you don't need to!\n",
36 | "Distributed computing brings a lot of additional complexity into the mix and will **incur overhead**. If your dataset and computations fit comfortably within your local resources **this overhead will may be larger than the performance gain** you'll get by using Dask. In that case, stick with non-distributed libraries like pandas, numpy and scikit-learn. "
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "id": "389b5226",
42 | "metadata": {
43 | "tags": []
44 | },
45 | "source": [
46 | "## About this notebook\n",
47 | "During this tutorial, we will work with a dataset containg NYC flight data. This dataset is only about 200MB on disk so that you can download it in a reasonable time and exercises finish quickly, but Dask Dataframes will scale to datasets much larger than the memory on your local machine. "
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "id": "cd61890f",
53 | "metadata": {},
54 | "source": [
55 | "## Getting started with Dask DataFrames\n",
56 | "\n",
57 | "Let's use Dask DataFrame's to explore the NYC flight dataset. Dask's `read_csv` function supports wildcard characters like `\"*\"` which can be used to load an entire directory of CSV files."
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "id": "c69dd555",
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "%run prep_data.py -d flights"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "id": "5c3b20b3",
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "import os\n",
78 | "\n",
79 | "files = os.path.join('data', 'nycflights', '*.csv')\n",
80 | "files"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "id": "11b23870",
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "import dask.dataframe as dd"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "id": "4a7df915",
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "ddf = dd.read_csv(files,\n",
101 | " parse_dates={'Date': [0, 1, 2]},\n",
102 | " dtype={\"TailNum\": str,\n",
103 | " \"CRSElapsedTime\": float,\n",
104 | " \"Cancelled\": bool})\n",
105 | "ddf"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "id": "c61959ff",
111 | "metadata": {},
112 | "source": [
113 | "Notice that the representation of the dataframe object contains no data - Dask has just done enough to read the start of the first file, and infer the column names and dtypes.\n",
114 | "\n",
115 | "**Dask is lazy!**"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "id": "d714511c",
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "ddf.columns"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "id": "5a24944c",
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "ddf.dtypes"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "id": "b4a86ccb",
141 | "metadata": {},
142 | "source": [
143 | "Dask DataFrames have an `.npartitions` attribute which tells you how many partitions make up a Dask DataFrame.\n",
144 | "\n",
145 | "Dask is able to process larger-than-memory datasets by cutting computations into smaller parts and processing those in parallel."
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "id": "b04c8168",
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "ddf.npartitions"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "id": "047d2fb2",
161 | "metadata": {
162 | "tags": []
163 | },
164 | "source": [
165 | "## The pandas Look & Feel\n",
166 | "Dask DataFrames implement a well-used portion of the Pandas API in a way that allows for parallel and out-of-core computation. This means that a lot of Dask DataFrame code will look and feel familiar to pandas users: \n",
167 | "\n",
168 | "```python\n",
169 | "import pandas as pd import dask.dataframe as dd\n",
170 | "df = pd.read_csv('2015-01-01.csv') df = dd.read_csv('2015-*-*.csv')\n",
171 | "df.groupby(df.user_id).value.mean() df.groupby(df.user_id).value.mean().compute()\n",
172 | "```"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "id": "c87fe45a",
178 | "metadata": {
179 | "tags": []
180 | },
181 | "source": [
182 | "This is because, internally, **a Dask DataFrame is composed of many pandas DataFrames**: \n",
183 | "\n",
184 | "
\n",
185 | "\n",
186 | "Dask DataFrames are divided into different **partitions** where each partition is a pandas DataFrame. This is why driving the Dask car *can feel* like you're still driving the pandas car: Dask is performing a bunch of regular pandas operations on regular pandas objects under the hood.\n",
187 | "\n",
188 | "But don't forget that you've entered the world of distributed computing now -- which means you've added a lot more complexity to the mix. You now need to consider things like concurrency, state, data duplicates, data loss, etc.\n",
189 | "\n",
190 | "Luckily, with a high-level Collection like DataFrames, Dask handles most of these complicated questions for you. "
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "id": "67a6d01f",
196 | "metadata": {
197 | "tags": []
198 | },
199 | "source": [
200 | "## pandas-like Computations\n",
201 | "\n",
202 | "Let's see this in action with a more involved example. Let's compute the largest flight departure delay.\n",
203 | "\n",
204 | "In pandas we could do this by iterating over each file to find the individual maximums and then find the final maximum over the individual maximums.\n",
205 | "\n",
206 | "```python\n",
207 | "import pandas as pd\n",
208 | "\n",
209 | "files = os.listdir(os.path.join('data', 'nycflights'))\n",
210 | "\n",
211 | "maxes = []\n",
212 | "\n",
213 | "for file in files:\n",
214 | " df = pd.read_csv(os.path.join('data', 'nycflights', file))\n",
215 | " maxes.append(df.DepDelay.max())\n",
216 | "\n",
217 | "final_max = max(maxes)\n",
218 | "```\n",
219 | "\n",
220 | "Thankfully, we can do this with Dask DataFrames using pandas-like code:"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "id": "a21dc382",
227 | "metadata": {},
228 | "outputs": [],
229 | "source": [
230 | "max_delay = ddf[\"DepDelay\"].max()"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "id": "d3dce768",
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "max_delay"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "id": "83c59405",
246 | "metadata": {},
247 | "source": [
248 | "The above cell looks exactly like what we would do using pandas...but the result does not! \n",
249 | "\n",
250 | "Instead of the actual result of the computation, we only get some schematic information. This is because Dask DataFrames are lazily evaluated. This means that **no computation happens unless you explicitly tell Dask to do so** by calling `.compute()`.\n",
251 | "\n",
252 | "Before actually performing a computation, dask first constructs a task graph that it can use to optimize computing the result in parallel. You can think of a task graph as the recipe or routemap that contains all the necessary instructions to arrive at the final result. Once you call `.compute()`, Dask will execute the instructions contained in the task graph and perform computations in parallel.\n",
253 | "\n",
254 | "Let's look at the task graph to get a feel for how Dask's blocked algorithms work:"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "id": "e9c5128d",
261 | "metadata": {},
262 | "outputs": [],
263 | "source": [
264 | "max_delay.visualize()"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "id": "f0acae36",
270 | "metadata": {},
271 | "source": [
272 | "Some things to note:\n",
273 | "\n",
274 | "1. Up until this point everything is lazy. To evaluate the result for `max_delay`, call its `compute()` method:\n",
275 | "2. Dask will delete intermediate results (like the full pandas DataFrame for each file) as soon as possible.\n",
276 | " - This lets us handle datasets that are larger than memory\n",
277 | " - This means that repeated computations will have to load all of the data in each time "
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "id": "92bb3578",
284 | "metadata": {},
285 | "outputs": [],
286 | "source": [
287 | "%%time \n",
288 | "max_delay.compute()"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "id": "05540de7",
294 | "metadata": {
295 | "tags": []
296 | },
297 | "source": [
298 | "## More Dask DataFrame computations\n",
299 | "\n",
300 | "Let's see couple of examples on how the API for Dask DataFrames is the same than Pandas. If you are comfortable with Pandas, the following operations will look very familiar, except we will need to add the `compute()` to get the results wanted.\n",
301 | "\n",
302 | "### Example 1: Total of non-cancelled flights taken\n",
303 | "\n",
304 | "Notice that there is a column in our DataFrame called `\"Cancelled\"` that is a boolean. "
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "id": "0883849a",
311 | "metadata": {
312 | "jupyter": {
313 | "source_hidden": true
314 | },
315 | "tags": []
316 | },
317 | "outputs": [],
318 | "source": [
319 | "(~ddf[\"Cancelled\"]).sum().compute()"
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "id": "5df23079",
325 | "metadata": {
326 | "tags": []
327 | },
328 | "source": [
329 | "### Example 2: Total of non-cancelled flights taken by airport\n",
330 | "\n",
331 | "We should select the non-canceled flights, use the operation `groupby` on the `\"Origin\"` column and finally use `count` to get the detailed per airport."
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "id": "7b01ecd4",
338 | "metadata": {
339 | "jupyter": {
340 | "source_hidden": true
341 | },
342 | "tags": []
343 | },
344 | "outputs": [],
345 | "source": [
346 | "ddf[~ddf[\"Cancelled\"]].groupby(\"Origin\")[\"Origin\"].count().compute()"
347 | ]
348 | },
349 | {
350 | "cell_type": "markdown",
351 | "id": "d7dec6b4",
352 | "metadata": {
353 | "tags": []
354 | },
355 | "source": [
356 | "### Exercise 1: What is the average departure delay from each airport?"
357 | ]
358 | },
359 | {
360 | "cell_type": "markdown",
361 | "id": "b5668216",
362 | "metadata": {},
363 | "source": [
364 | "Uncomment and run the cell below to see the solution."
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": null,
370 | "id": "6319550d",
371 | "metadata": {
372 | "jupyter": {
373 | "source_hidden": true
374 | },
375 | "tags": []
376 | },
377 | "outputs": [],
378 | "source": [
379 | "ddf.groupby(\"Origin\")[\"DepDelay\"].mean().compute()"
380 | ]
381 | },
382 | {
383 | "cell_type": "markdown",
384 | "id": "230c37b2",
385 | "metadata": {
386 | "tags": []
387 | },
388 | "source": [
389 | "### Exercise 2: What day of the week has the worst average departure delay?\n",
390 | "Uncomment and run the cell below to see the solution."
391 | ]
392 | },
393 | {
394 | "cell_type": "code",
395 | "execution_count": null,
396 | "id": "4dacd560",
397 | "metadata": {
398 | "jupyter": {
399 | "source_hidden": true
400 | },
401 | "tags": []
402 | },
403 | "outputs": [],
404 | "source": [
405 | "ddf.groupby(\"DayOfWeek\")[\"DepDelay\"].mean().idxmax().compute()"
406 | ]
407 | },
408 | {
409 | "cell_type": "markdown",
410 | "id": "b0da1c43",
411 | "metadata": {
412 | "tags": []
413 | },
414 | "source": [
415 | "## Working with Partitions\n",
416 | "Dask DataFrames implements a large portion of the pandas API by simply performing pandas methods on its partitions (which are pandas objects). \n",
417 | "\n",
418 | "### Mapping Functions with `map_partitions`\n",
419 | "However, sometimes you might want to manipulate your Dask DataFrame with a custom function. You can use the `map_partitions` method for this.\n",
420 | "\n",
421 | "Imagine you find out that there was a 2-minute error in the `DepDelay` column.\n",
422 | "\n",
423 | "Let's create a pandas `apply` function that will subtract 2 from every input:"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "id": "ca97e0c4",
430 | "metadata": {},
431 | "outputs": [],
432 | "source": [
433 | "def subtract_2(df):\n",
434 | " return df.apply(lambda x: x-2)"
435 | ]
436 | },
437 | {
438 | "cell_type": "markdown",
439 | "id": "af5ffd7a",
440 | "metadata": {},
441 | "source": [
442 | "We can then map this function over all of our partitions:"
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": null,
448 | "id": "c5db73a2",
449 | "metadata": {},
450 | "outputs": [],
451 | "source": [
452 | "ddf[\"Adjusted_DepDelay\"] = ddf[\"DepDelay\"].map_partitions(subtract_2)"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": null,
458 | "id": "c6f79e39",
459 | "metadata": {},
460 | "outputs": [],
461 | "source": [
462 | "ddf[[\"DepDelay\", \"Adjusted_DepDelay\"]].head()"
463 | ]
464 | },
465 | {
466 | "cell_type": "markdown",
467 | "id": "45884f8b",
468 | "metadata": {
469 | "tags": []
470 | },
471 | "source": [
472 | "## Performance tip: When to call .compute()\n",
473 | "\n",
474 | "In the examples and exercises above, we sometimes perform the same operation more than once (e.g. `read_csv`). Dask DataFrames hashes the arguments, allowing duplicate computations to be shared, and only computed once. You can use `dask.compute()` to merge task graphs of multiple operations.\n",
475 | "\n",
476 | "For example, let's compute the mean and standard deviation for departure delay of all non-canceled flights. Since Dask operations are lazy, those values aren't the final results until we `compute` them. They're just the recipe required to get the result.\n",
477 | "\n",
478 | "If we compute them with two calls to compute, there is no sharing of intermediate computations."
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": null,
484 | "id": "540952b2",
485 | "metadata": {},
486 | "outputs": [],
487 | "source": [
488 | "non_cancelled = ddf[~ddf[\"Cancelled\"]]\n",
489 | "mean_delay = non_cancelled[\"DepDelay\"].mean()\n",
490 | "std_delay = non_cancelled[\"DepDelay\"].std()"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": null,
496 | "id": "da852348",
497 | "metadata": {},
498 | "outputs": [],
499 | "source": [
500 | "%%time\n",
501 | "mean_delay_result = mean_delay.compute()\n",
502 | "std_delay_result = std_delay.compute()"
503 | ]
504 | },
505 | {
506 | "cell_type": "markdown",
507 | "id": "7ff6c61e",
508 | "metadata": {},
509 | "source": [
510 | "Now, let's see how long it takes if we try computing `mean_delay` and `std_delay` with a single `compute()` call."
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": null,
516 | "id": "1514304e",
517 | "metadata": {},
518 | "outputs": [],
519 | "source": [
520 | "import dask"
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": null,
526 | "id": "eefc4c22",
527 | "metadata": {},
528 | "outputs": [],
529 | "source": [
530 | "%%time\n",
531 | "mean_delay_res, std_delay_res = dask.compute(mean_delay, std_delay)"
532 | ]
533 | },
534 | {
535 | "cell_type": "markdown",
536 | "id": "48db683a",
537 | "metadata": {},
538 | "source": [
539 | "Using `dask.compute` takes roughly 1/2 the time. This is because the task graphs for both results are merged when calling `dask.compute`, allowing shared operations (like `read_csv`) to only be done once instead of twice. In particular, using `dask.compute` only does the following once:\n",
540 | "\n",
541 | "- The calls to `read_csv`\n",
542 | "- The filter (`df[~df[\"Cancelled\"]]`)\n",
543 | "- The `\"DepDelay\"` column indexing\n",
544 | "- Some of the necessary reductions (`sum`, `count`)\n",
545 | "\n",
546 | "To see what the merged task graphs between multiple results look like (and what's shared), you can use the `dask.visualize` function:"
547 | ]
548 | },
549 | {
550 | "cell_type": "code",
551 | "execution_count": null,
552 | "id": "fc371ed9",
553 | "metadata": {},
554 | "outputs": [],
555 | "source": [
556 | "dask.visualize(mean_delay, std_delay)"
557 | ]
558 | },
559 | {
560 | "cell_type": "markdown",
561 | "id": "4b983d2a",
562 | "metadata": {
563 | "tags": []
564 | },
565 | "source": [
566 | "# Extra resources\n",
567 | "\n",
568 | "- Explore applying custom code to Dask DataFrames: [Dask Tutorial DataFrames](https://github.com/dask/dask-tutorial/blob/main/04_dataframe.ipynb)\n",
569 | "- [Dask DataFrame documentation](https://docs.dask.org/en/latest/dataframe.html)\n",
570 | "- [Dask DataFrame examples](https://examples.dask.org/dataframe.html)"
571 | ]
572 | }
573 | ],
574 | "metadata": {
575 | "kernelspec": {
576 | "display_name": "Python 3 (ipykernel)",
577 | "language": "python",
578 | "name": "python3"
579 | },
580 | "language_info": {
581 | "codemirror_mode": {
582 | "name": "ipython",
583 | "version": 3
584 | },
585 | "file_extension": ".py",
586 | "mimetype": "text/x-python",
587 | "name": "python",
588 | "nbconvert_exporter": "python",
589 | "pygments_lexer": "ipython3",
590 | "version": "3.9.13"
591 | }
592 | },
593 | "nbformat": 4,
594 | "nbformat_minor": 5
595 | }
596 |
--------------------------------------------------------------------------------
/04_machine_learning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "a81512fd",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import warnings\n",
11 | "warnings.filterwarnings('ignore')"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "8bbaf545",
17 | "metadata": {},
18 | "source": [
19 | "
\n",
22 | "\n",
23 | "# Parallel and Distributed Machine Learning\n",
24 | "So far we have seen how Dask makes data analysis scalable with parallelization via Dask DataFrames and Dask Array. Let's now see how [Dask-ML](https://ml.dask.org/) allows us to do machine learning in a parallel and distributed manner. Note, machine learning is really just a special case of data analysis (one that automates analytical model building), so the 💪 Dask gains 💪 we've seen will apply here as well!\n",
25 | "\n",
26 | "> If you'd like a refresher on the difference between parallel and distributed computing, [here's a good discussion on StackExchange](https://cs.stackexchange.com/questions/1580/distributed-vs-parallel-computing). You can also check out [The Beginner's Guide to Distributed Computing](https://towardsdatascience.com/the-beginners-guide-to-distributed-computing-6d6833796318).\n",
27 | "\n",
28 | "### What we'll cover:\n",
29 | "1. Types of scaling problems in ML\n",
30 | "2. Scale Scikit-Learn with Joblib+Dask (compute-bound)\n",
31 | "3. Scale Scikit-Learn with Dask-ML (memory-bound)\n",
32 | "4. Scale XGBoost with Dask"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "id": "9e3c11a4",
38 | "metadata": {},
39 | "source": [
40 | "## Types of scaling problems in machine learning\n",
41 | "\n",
42 | "There are two main types of scaling challenges you can run into in your machine learning workflow: scaling the **size of your data** and scaling the **size of your model**. That is:\n",
43 | "\n",
44 | "1. **Memory-bound problems**: Data is larger than RAM, and sampling isn't an option.\n",
45 | "2. **CPU-bound problems**: Data fits in RAM, but training takes too long. Many hyperparameter combinations, a large ensemble of many models, etc.\n",
46 | "\n",
47 | "Here's a handy diagram for visualizing these problems:"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "id": "e6f1145d",
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "from IPython.display import Image\n",
58 | "Image(url=\"images/dask-zones.png\", width=400)"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "id": "4572e129",
64 | "metadata": {},
65 | "source": [
66 | "In the bottom-left quadrant, your datasets are not too large (they fit comfortably in RAM) and your model is not too large either. When these conditions are met, you are much better off using something like scikit-learn, XGBoost, and similar libraries. You don't need to leverage multiple machines in a distributed manner with a library like Dask-ML. However, if you are in any of the other quadrants, distributed machine learning is the way to go.\n",
67 | "\n",
68 | "Summarizing: \n",
69 | "\n",
70 | "* For in-memory problems, just use scikit-learn (or your favorite ML library).\n",
71 | "* For large models, use `dask` and `joblib` together with your favorite scikit-learn estimator.\n",
72 | "* For large datasets, use `dask_ml` or `dask-xgboost` estimators."
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "id": "f6571f76",
78 | "metadata": {},
79 | "source": [
80 | "## Scikit-Learn Refresher\n",
81 | "\n",
82 | "
\n",
85 | "\n",
86 | "In this section, we'll quickly run through a typical Scikit-Learn workflow:\n",
87 | "\n",
88 | "* Load some data (in this case, we'll generate it)\n",
89 | "* Import the Scikit-Learn module for our chosen ML algorithm\n",
90 | "* Create an estimator for that algorithm and fit it with our data\n",
91 | "* Inspect the learned attributes\n",
92 | "* Check the accuracy of our model\n"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "id": "800cf0f5",
98 | "metadata": {},
99 | "source": [
100 | "### Generate some random data"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "id": "ea55b66a",
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "from sklearn.datasets import make_classification\n",
111 | "\n",
112 | "# Generate data\n",
113 | "X, y = make_classification(n_samples=10000, n_features=4, random_state=0)"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "id": "18a0577b",
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "# Let's take a look at X\n",
124 | "X[:8]"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "id": "06d00f43",
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "# Let's take a look at y\n",
135 | "y[:8]"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "id": "4328bcf5",
141 | "metadata": {},
142 | "source": [
143 | "### Fitting a SVC\n",
144 | "\n",
145 | "For this example, we will fit a [Support Vector Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)."
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "id": "84aca93d",
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "from sklearn.svm import SVC\n",
156 | "\n",
157 | "estimator = SVC(random_state=0)\n",
158 | "estimator.fit(X, y)"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "id": "a8e4d61c",
164 | "metadata": {},
165 | "source": [
166 | "We can inspect the learned features by taking a look a the `support_vectors_`:"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "id": "51398313",
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "estimator.support_vectors_[:4]"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "id": "71e11c69",
182 | "metadata": {},
183 | "source": [
184 | "And we check the accuracy:"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "id": "d2ab53c2",
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "estimator.score(X, y)"
195 | ]
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "id": "c655296b",
200 | "metadata": {},
201 | "source": [
202 | "### Hyperparameter Optimization\n",
203 | "\n",
204 | "There are a few ways to learn the best *hyper*parameters while training. One is `GridSearchCV`.\n",
205 | "As the name implies, this does a brute-force search over a grid of hyperparameter combinations. Scikit-learn provides tools to automatically find the best parameter combinations via cross-validation (which is the \"CV\" in `GridSearchCV`)."
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "id": "619a7741",
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "from sklearn.model_selection import GridSearchCV"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "id": "e73bfac5",
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "%%time\n",
226 | "estimator = SVC(gamma='auto', random_state=0, probability=True)\n",
227 | "param_grid = {\n",
228 | " 'C': [0.001, 10.0],\n",
229 | " 'kernel': ['rbf', 'poly'],\n",
230 | "}\n",
231 | "\n",
232 | "# Brute-force search over a grid of hyperparameter combinations\n",
233 | "grid_search = GridSearchCV(estimator, param_grid, verbose=2, cv=2)\n",
234 | "grid_search.fit(X, y)"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "id": "39201c6f",
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "grid_search.best_params_, grid_search.best_score_"
245 | ]
246 | },
247 | {
248 | "cell_type": "markdown",
249 | "id": "3ba9f067",
250 | "metadata": {},
251 | "source": [
252 | "## Compute Bound: Single-machine parallelism with Joblib\n",
253 | "With Joblib, we can say that Scikit-Learn has *single-machine* parallelism.\n",
254 | "\n",
255 | "**Any Scikit-Learn estimator that can operate in parallel exposes an `n_jobs` keyword**, which tells you how many tasks to run in parallel. Specifying `n_jobs=-1` jobs means running the maximum possible number of tasks in parallel."
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "id": "be7a0201",
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "%%time\n",
266 | "grid_search = GridSearchCV(estimator, param_grid, verbose=2, cv=2, n_jobs=-1)\n",
267 | "grid_search.fit(X, y)"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "id": "bfd9d5ec",
273 | "metadata": {},
274 | "source": [
275 | "Notice that the computation above it is faster than before. "
276 | ]
277 | },
278 | {
279 | "cell_type": "markdown",
280 | "id": "903b2750",
281 | "metadata": {},
282 | "source": [
283 | "## Compute Bound: Multi-machine parallelism with Dask\n",
284 | "\n",
285 | "In this section we'll see how Dask (plus Joblib and Scikit-Learn) gives us multi-machine parallelism. Here's what our grid search graph would look like if we allowed Dask to schedule our training \"jobs\" over multiple machines in our cluster:\n",
286 | "\n",
287 | "Dask can talk to Scikit-Learn (via Joblib) so that our *Dask cluster* is used to train a model. \n",
288 | "\n"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "id": "e089cf6d",
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "from dask.distributed import Client\n",
299 | "\n",
300 | "# create local Dask cluster with 8 workers (cores)\n",
301 | "client = Client(n_workers=8)\n",
302 | "client"
303 | ]
304 | },
305 | {
306 | "cell_type": "markdown",
307 | "id": "d9f29710",
308 | "metadata": {},
309 | "source": [
310 | "**Note:** Click on Cluster Info, to see more details about the cluster. You can see the configuration of the cluster and some other specs. \n",
311 | "\n",
312 | "We can expand our problem by specifying more hyperparameters before training, and see how using `dask` as backend can help us. "
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "id": "7805b120",
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "param_grid = {\n",
323 | " 'C': [0.001, 0.1, 1.0, 2.5, 5, 10.0],\n",
324 | " 'kernel': ['rbf', 'poly', 'linear'],\n",
325 | " 'shrinking': [True, False],\n",
326 | "}\n",
327 | "\n",
328 | "grid_search = GridSearchCV(estimator, param_grid, verbose=2, cv=2, n_jobs=-1)"
329 | ]
330 | },
331 | {
332 | "cell_type": "markdown",
333 | "id": "a0089245",
334 | "metadata": {},
335 | "source": [
336 | "### Dask parallel backend\n",
337 | "\n",
338 | "We can fit our estimator with multi-machine parallelism by quickly *switching to a Dask parallel backend* when using joblib. "
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": null,
344 | "id": "5b678611",
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "import joblib"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "id": "74c38449",
355 | "metadata": {},
356 | "outputs": [],
357 | "source": [
358 | "%%time\n",
359 | "with joblib.parallel_backend(\"dask\", scatter=[X, y]):\n",
360 | " grid_search.fit(X, y)"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "id": "097c1a48",
366 | "metadata": {},
367 | "source": [
368 | "**What just happened?**\n",
369 | "\n",
370 | "Dask-ML developers worked with the Scikit-Learn and Joblib developers to implement a Dask parallel backend. So internally, scikit-learn now talks to Joblib, and Joblib talks to Dask, and Dask is what handles scheduling all of those tasks on multiple machines.\n",
371 | "\n",
372 | "The best parameters and best score:"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": null,
378 | "id": "fc5d3872",
379 | "metadata": {},
380 | "outputs": [],
381 | "source": [
382 | "grid_search.best_params_, grid_search.best_score_"
383 | ]
384 | },
385 | {
386 | "cell_type": "markdown",
387 | "id": "18c91175",
388 | "metadata": {
389 | "tags": []
390 | },
391 | "source": [
392 | "## But that was cheating...sort of"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": null,
398 | "id": "64b9305b-f555-488e-862f-d5bde9c8deb9",
399 | "metadata": {},
400 | "outputs": [],
401 | "source": [
402 | "import coiled \n",
403 | "cluster = coiled.Cluster(\n",
404 | " name=\"intro-to-dask\",\n",
405 | " n_workers=10,\n",
406 | " worker_memory=\"16GiB\",\n",
407 | " package_sync=True,\n",
408 | ")"
409 | ]
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": null,
414 | "id": "2a607d55-1856-4e08-af21-a4cac02c419f",
415 | "metadata": {},
416 | "outputs": [],
417 | "source": [
418 | "from distributed import Client\n",
419 | "client = Client(cluster)"
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": null,
425 | "id": "db2bdab2",
426 | "metadata": {},
427 | "outputs": [],
428 | "source": [
429 | "%%time\n",
430 | "with joblib.parallel_backend(\"dask\", scatter=[X, y]):\n",
431 | " grid_search.fit(X, y)"
432 | ]
433 | },
434 | {
435 | "cell_type": "markdown",
436 | "id": "3c66b64d",
437 | "metadata": {},
438 | "source": [
439 | "## Memory Bound: Parallel Machine Learning with Dask-ML\n",
440 | "\n",
441 | "We have seen how to work with larger models, but sometimes you'll want to train on a larger than memory dataset. `dask-ml` has implemented estimators that work well on Dask `Arrays` and `DataFrames` that may be larger than your machine's RAM."
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": null,
447 | "id": "74bb3918",
448 | "metadata": {},
449 | "outputs": [],
450 | "source": [
451 | "from dask_ml.datasets import make_regression\n",
452 | "from dask_ml.linear_model import LinearRegression\n",
453 | "from dask_ml.model_selection import train_test_split"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": null,
459 | "id": "24a58ef5",
460 | "metadata": {},
461 | "outputs": [],
462 | "source": [
463 | "# create synthetic regression data\n",
464 | "X, y = make_regression(n_samples=10_000, chunks=100)"
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": null,
470 | "id": "9b56cc9a",
471 | "metadata": {},
472 | "outputs": [],
473 | "source": [
474 | "X"
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": null,
480 | "id": "e62aad50",
481 | "metadata": {},
482 | "outputs": [],
483 | "source": [
484 | "# create train/test splits\n",
485 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21, test_size=0.3, convert_mixed_types=True)"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": null,
491 | "id": "e19137a9",
492 | "metadata": {},
493 | "outputs": [],
494 | "source": [
495 | "# instantiate model\n",
496 | "lr = LinearRegression()"
497 | ]
498 | },
499 | {
500 | "cell_type": "markdown",
501 | "id": "dcfbb685",
502 | "metadata": {},
503 | "source": [
504 | "### Exercise:\n",
505 | "Can you fit this parallel Dask-ML `LinearRegression()` model on the training data?"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": null,
511 | "id": "44f48412",
512 | "metadata": {},
513 | "outputs": [],
514 | "source": [
515 | "# %load solutions/ml-ex-1.py\n",
516 | "lr.fit(X_train, y_train)"
517 | ]
518 | },
519 | {
520 | "cell_type": "markdown",
521 | "id": "f8b8a1f3",
522 | "metadata": {},
523 | "source": [
524 | "### Exercise:\n",
525 | "Can you make predictions with this `LinearRegression()` model?"
526 | ]
527 | },
528 | {
529 | "cell_type": "code",
530 | "execution_count": null,
531 | "id": "a4cd999d",
532 | "metadata": {},
533 | "outputs": [],
534 | "source": [
535 | "# %load solutions/ml-ex-2.py\n",
536 | "y_pred = lr.predict(X_test)"
537 | ]
538 | },
539 | {
540 | "cell_type": "code",
541 | "execution_count": null,
542 | "id": "a027213e",
543 | "metadata": {},
544 | "outputs": [],
545 | "source": [
546 | "lr.score(X,y)"
547 | ]
548 | },
549 | {
550 | "cell_type": "markdown",
551 | "id": "630a511a",
552 | "metadata": {},
553 | "source": [
554 | "## Training XGBoost in Parallel"
555 | ]
556 | },
557 | {
558 | "cell_type": "markdown",
559 | "id": "6f2e8162",
560 | "metadata": {},
561 | "source": [
562 | "Dask-ML implements some of the most popular machine learning algorithms for parallel processing, but not all of them.\n",
563 | "\n",
564 | "For XGBoost, the maintainers of Dask and XGBoost took a different approach: they built a Dask Backend for XGBoost so you can run XGBoost in parallel with Dask straight from your normal XGBoost library.\n",
565 | "\n",
566 | "Running an XGBoost model with the distributed Dask backend requires minimal changes to your regular XGBoost code:\n",
567 | "\n",
568 | "```python\n",
569 | "import xgboost as xgb\n",
570 | "\n",
571 | "# Create the XGBoost DMatrix for our training and testing splits\n",
572 | "dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)\n",
573 | "dtest = xgb.dask.DaskDMatrix(client, X_test, y_test)\n",
574 | "\n",
575 | "# Set model parameters (XGBoost defaults)\n",
576 | "params = {\n",
577 | " \"max_depth\": 6,\n",
578 | " \"gamma\": 0,\n",
579 | " \"eta\": 0.3,\n",
580 | " \"min_child_weight\": 30,\n",
581 | " \"objective\": \"reg:squarederror\",\n",
582 | " \"grow_policy\": \"depthwise\"\n",
583 | "}\n",
584 | "\n",
585 | "# train the model\n",
586 | "output = xgb.dask.train(\n",
587 | " client, params, dtrain, num_boost_round=4,\n",
588 | " evals=[(dtrain, 'train')]\n",
589 | ")\n",
590 | "\n",
591 | "# make predictions\n",
592 | "y_pred = xgb.dask.predict(client, output, dtest)\n",
593 | "```\n",
594 | "\n",
595 | "See [this step-by-step tutorial](https://coiled.io/blog/dask-xgboost-python-example/) if you're interested to learn more."
596 | ]
597 | },
598 | {
599 | "cell_type": "markdown",
600 | "id": "7e378834",
601 | "metadata": {},
602 | "source": [
603 | "## Extra resources:\n",
604 | "\n",
605 | "- [Dask-ML documentation](https://ml.dask.org/)\n",
606 | "- [Getting started with Coiled](https://docs.coiled.io/user_guide/getting_started.html)"
607 | ]
608 | }
609 | ],
610 | "metadata": {
611 | "kernelspec": {
612 | "display_name": "Python 3 (ipykernel)",
613 | "language": "python",
614 | "name": "python3"
615 | },
616 | "language_info": {
617 | "codemirror_mode": {
618 | "name": "ipython",
619 | "version": 3
620 | },
621 | "file_extension": ".py",
622 | "mimetype": "text/x-python",
623 | "name": "python",
624 | "nbconvert_exporter": "python",
625 | "pygments_lexer": "ipython3",
626 | "version": "3.9.13"
627 | }
628 | },
629 | "nbformat": 4,
630 | "nbformat_minor": 5
631 | }
632 |
--------------------------------------------------------------------------------