├── .gitignore
├── Chapter 10.ipynb
├── Chapter 11
│   ├── Chapter 11.ipynb
│   ├── notebook
│   │   ├── Dockerfile
│   │   ├── build.sh
│   │   ├── requirements.txt
│   │   └── start-notebook.sh
│   ├── scheduler
│   │   ├── Dockerfile
│   │   └── scheduler-start.sh
│   └── worker
│       ├── Dockerfile
│       ├── build.sh
│       ├── requirements.txt
│       └── worker-start.sh
├── Chapter 2.ipynb
├── Chapter 3.ipynb
├── Chapter 4.ipynb
├── Chapter 5.ipynb
├── Chapter 6.ipynb
├── Chapter 7.ipynb
├── Chapter 8.ipynb
├── Chapter 9.ipynb
├── README.md
├── nyc-average-monthly-temp.csv
└── nyc-temp-data.csv

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 | 
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
--------------------------------------------------------------------------------
/Chapter 10.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# Data Science with Python and Dask\n",
8 |     "## Chapter 10: Machine Learning with Dask-ML"
9 |    ]
10 |   },
11 |   {
12 |    "cell_type": "markdown",
13 |    "metadata": {},
14 |    "source": [
15 |     "### Section 10.1"
16 |    ]
17 |   },
18 |   {
19 |    "cell_type": "code",
20 |    "execution_count": 1,
21 |    "metadata": {},
22 |    "outputs": [],
23 |    "source": [
24 |     "# Listing 10.1\n",
25 |     "import dask.bag as bag\n",
26 |     "import os\n",
27 |     "from dask.diagnostics import ProgressBar\n",
28 |     "\n",
29 |     "os.chdir('/Users/jesse/Documents')\n",
30 |     "raw_data = bag.read_text('foods.txt')\n",
31 |     "\n",
32 |     "def get_next_part(file, start_index, span_index=0, blocksize=1024):\n",
33 |     "    file.seek(start_index)\n",
34 |     "    buffer = file.read(blocksize + span_index).decode('cp1252')\n",
35 |     "    delimiter_position = buffer.find('\\n\\n')\n",
36 |     "    if delimiter_position == -1:\n",
37 |     "        return get_next_part(file, start_index, span_index + blocksize)\n",
38 |     "    
else:\n", 39 | " file.seek(start_index)\n", 40 | " return start_index, delimiter_position\n", 41 | " \n", 42 | "def get_item(filename, start_index, delimiter_position, encoding='cp1252'):\n", 43 | " with open(filename, 'rb') as file_handle:\n", 44 | " file_handle.seek(start_index)\n", 45 | " text = file_handle.read(delimiter_position).decode(encoding)\n", 46 | " elements = text.strip().split('\\n')\n", 47 | " key_value_pairs = [(element.split(': ')[0], element.split(': ')[1]) \n", 48 | " if len(element.split(': ')) > 1 \n", 49 | " else ('unknown', element) \n", 50 | " for element in elements]\n", 51 | " return dict(key_value_pairs)\n", 52 | " \n", 53 | "with open('foods.txt', 'rb') as file_handle:\n", 54 | " size = file_handle.seek(0,2) - 1\n", 55 | " more_data = True\n", 56 | " output = []\n", 57 | " current_position = next_position = 0\n", 58 | " while more_data:\n", 59 | " if current_position >= size:\n", 60 | " more_data = False\n", 61 | " else:\n", 62 | " current_position, next_position = get_next_part(file_handle, current_position, 0)\n", 63 | " output.append((current_position, next_position))\n", 64 | " current_position = current_position + next_position + 2\n", 65 | " \n", 66 | "reviews = bag.from_sequence(output).map(lambda x: get_item('foods.txt', x[0], x[1]))\n", 67 | "\n", 68 | "def tag_positive_negative_by_score(element):\n", 69 | " if float(element['review/score']) > 3:\n", 70 | " element['review/sentiment'] = 'positive'\n", 71 | " else:\n", 72 | " element['review/sentiment'] = 'negative'\n", 73 | " return element\n", 74 | "\n", 75 | "tagged_reviews = reviews.map(tag_positive_negative_by_score)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "### Section 10.1.1" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 7, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# Listing 10.2\n", 92 | "from nltk.corpus import stopwords\n", 93 | "from nltk.tokenize import RegexpTokenizer\n", 94 | "from functools import partial\n", 95 | "\n", 96 | "tokenizer = RegexpTokenizer(r'\\w+')\n", 97 | "\n", 98 | "def extract_reviews(element):\n", 99 | " element['review/tokens'] = element['review/text'].lower()\n", 100 | " return element\n", 101 | "\n", 102 | "def tokenize_reviews(element):\n", 103 | " element['review/tokens'] = tokenizer.tokenize(element['review/tokens'])\n", 104 | " return element\n", 105 | "\n", 106 | "def filter_stopword(word, stopwords):\n", 107 | " return word not in stopwords\n", 108 | "\n", 109 | "def filter_stopwords(element, stopwords):\n", 110 | " element['review/tokens'] = list(filter(partial(filter_stopword, stopwords=stopwords), element['review/tokens']))\n", 111 | " return element\n", 112 | "\n", 113 | "stopword_set = set(stopwords.words('english'))\n", 114 | "more_stopwords = {'br', 'amazon', 'com', 'http', 'www', 'href', 'gp'}\n", 115 | "all_stopwords = stopword_set.union(more_stopwords)\n", 116 | "\n", 117 | "review_extracted_text = tagged_reviews.map(extract_reviews)\n", 118 | "review_tokens = review_extracted_text.map(tokenize_reviews)\n", 119 | "review_text_clean = review_tokens.map(partial(filter_stopwords, stopwords=all_stopwords))" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 8, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "[########################################] | 100% Completed | 34.8s\n" 132 | ] 133 | }, 134 | { 135 | "data": { 136 | "text/plain": [ 137 | 
"114290" 138 | ] 139 | }, 140 | "execution_count": 8, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "# Listing 10.3\n", 147 | "def extract_tokens(element):\n", 148 | " return element['review/tokens']\n", 149 | "\n", 150 | "extracted_tokens = review_text_clean.map(extract_tokens)\n", 151 | "unique_tokens = extracted_tokens.flatten().distinct()\n", 152 | "\n", 153 | "with ProgressBar():\n", 154 | " number_of_tokens = unique_tokens.count().compute()\n", 155 | "number_of_tokens" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 9, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "[########################################] | 100% Completed | 49.4s\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "# Listing 10.4\n", 173 | "def count(accumulator, element):\n", 174 | " return accumulator + 1\n", 175 | "\n", 176 | "def combine(total_1, total_2):\n", 177 | " return total_1 + total_2\n", 178 | "\n", 179 | "with ProgressBar():\n", 180 | " token_counts = extracted_tokens.flatten().foldby(lambda x: x, count, 0, combine, 0).compute()\n", 181 | " \n", 182 | "top_tokens = sorted(token_counts, key=lambda x: x[1], reverse=True)\n", 183 | "top_100_tokens = list(map(lambda x: x[0], top_tokens[:100]))" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 11, 189 | "metadata": { 190 | "scrolled": false 191 | }, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "({'target': 1,\n", 197 | " 'features': array([1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 198 | " 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,\n", 199 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 200 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,\n", 201 | " 0, 0, 0, 0, 0, 0, 0, 0])},\n", 202 | " {'target': 0,\n", 203 | " 'features': array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 204 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 205 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 206 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 207 | " 0, 0, 0, 0, 0, 0, 0, 0])},\n", 208 | " {'target': 1,\n", 209 | " 'features': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 210 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,\n", 211 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,\n", 212 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 213 | " 0, 0, 0, 0, 0, 0, 0, 0])},\n", 214 | " {'target': 0,\n", 215 | " 'features': array([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 216 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,\n", 217 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,\n", 218 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 219 | " 0, 0, 0, 0, 0, 0, 0, 0])},\n", 220 | " {'target': 1,\n", 221 | " 'features': array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,\n", 222 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 223 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 224 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0,\n", 225 | " 0, 0, 0, 0, 0, 0, 0, 0])})" 226 | ] 227 | }, 228 | "execution_count": 11, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "# Listing 10.5\n", 235 | "import numpy as np\n", 236 | "def vectorize_tokens(element):\n", 237 | " vectorized_tokens = np.where(np.isin(top_100_tokens, element['review/tokens']), 1, 0)\n", 238 | " element['review/token_vector'] = vectorized_tokens\n", 239 | " return element\n", 240 | "\n", 241 | "def prep_model_data(element):\n", 242 | " return {'target': 1 if element['review/sentiment'] == 'positive' else 0,\n", 243 | " 'features': element['review/token_vector']}\n", 244 | "\n", 245 | "model_data = review_text_clean.map(vectorize_tokens).map(prep_model_data)\n", 246 | "\n", 247 | "model_data.take(5)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 12, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "# Listing 10.6\n", 257 | "from dask import array as dask_array\n", 258 | "def stacker(partition):\n", 259 | " return dask_array.concatenate([element for element in partition])\n", 260 | "\n", 261 | "with ProgressBar():\n", 262 | " feature_arrays = model_data.pluck('features').map(lambda x: dask_array.from_array(x, 1000).reshape(1,-1)).reduction(perpartition=stacker, aggregate=stacker)\n", 263 | " feature_array = feature_arrays.compute()\n", 264 | "feature_array" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 14, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "[########################################] | 100% Completed | 5min 32.8s\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "# Listing 10.7\n", 282 | "with ProgressBar():\n", 283 | " feature_array.rechunk(5000).to_zarr('sentiment_feature_array.zarr')\n", 284 | " feature_array = dask_array.from_zarr('sentiment_feature_array.zarr')\n", 285 | " \n", 286 | "with ProgressBar():\n", 287 | " target_arrays = model_data.pluck('target').map(lambda x: dask_array.from_array(x, 1000).reshape(-1,1)).reduction(perpartition=stacker, aggregate=stacker)\n", 288 | " target_arrays.compute().rechunk(5000).to_zarr('sentiment_target_array.zarr')\n", 289 | " target_array = dask_array.from_zarr('sentiment_target_array.zarr')" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "### Section 10.1.2" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 16, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "[########################################] | 100% Completed | 0.9s\n", 309 | "[#################### ] | 50% Completed | 1.6s" 310 | ] 311 | }, 312 | { 313 | "name": "stderr", 314 | "output_type": "stream", 315 | "text": [ 316 | "/anaconda3/lib/python3.6/site-packages/dask_glm/utils.py:52: RuntimeWarning: overflow encountered in exp\n", 317 | " return np.exp(A)\n" 318 | ] 319 | }, 320 | { 321 | "name": "stdout", 322 | "output_type": "stream", 323 | "text": [ 324 | "[########################################] | 100% Completed | 3.6s\n", 325 | "[########################################] | 100% Completed | 3.8s\n", 326 | "[########################################] | 100% Completed | 3.9s\n", 327 | "[########################################] | 100% Completed | 3.8s\n", 328 | "[########################################] | 100% Completed | 3.6s\n", 329 | 
"[########################################] | 100% Completed | 3.9s\n", 330 | "[########################################] | 100% Completed | 3.7s\n", 331 | "[########################################] | 100% Completed | 3.5s\n", 332 | "[########################################] | 100% Completed | 3.7s\n", 333 | "[########################################] | 100% Completed | 4.0s\n", 334 | "[########################################] | 100% Completed | 4.0s\n", 335 | "[########################################] | 100% Completed | 4.0s\n", 336 | "[########################################] | 100% Completed | 3.8s\n", 337 | "[########################################] | 100% Completed | 3.9s\n", 338 | "[########################################] | 100% Completed | 3.7s\n", 339 | "[########################################] | 100% Completed | 3.5s\n", 340 | "[########################################] | 100% Completed | 3.7s\n", 341 | "[########################################] | 100% Completed | 3.8s\n", 342 | "[########################################] | 100% Completed | 3.6s\n", 343 | "[########################################] | 100% Completed | 3.6s\n", 344 | "[########################################] | 100% Completed | 3.6s\n", 345 | "[########################################] | 100% Completed | 3.7s\n", 346 | "[########################################] | 100% Completed | 3.6s\n", 347 | "[########################################] | 100% Completed | 3.4s\n", 348 | "[########################################] | 100% Completed | 3.5s\n", 349 | "[########################################] | 100% Completed | 3.7s\n", 350 | "[########################################] | 100% Completed | 3.7s\n", 351 | "[########################################] | 100% Completed | 3.6s\n", 352 | "[########################################] | 100% Completed | 3.6s\n", 353 | "[########################################] | 100% Completed | 3.7s\n", 354 | "[########################################] | 100% Completed | 3.8s\n", 355 | "[########################################] | 100% Completed | 3.9s\n", 356 | "[########################################] | 100% Completed | 3.9s\n", 357 | "[########################################] | 100% Completed | 4.0s\n", 358 | "[########################################] | 100% Completed | 4.0s\n", 359 | "[########################################] | 100% Completed | 3.7s\n", 360 | "[########################################] | 100% Completed | 3.5s\n", 361 | "[########################################] | 100% Completed | 3.7s\n", 362 | "[########################################] | 100% Completed | 3.9s\n", 363 | "[########################################] | 100% Completed | 3.9s\n", 364 | "[########################################] | 100% Completed | 3.8s\n", 365 | "[########################################] | 100% Completed | 3.6s\n", 366 | "[########################################] | 100% Completed | 3.6s\n", 367 | "[########################################] | 100% Completed | 3.6s\n", 368 | "[########################################] | 100% Completed | 3.9s\n", 369 | "[########################################] | 100% Completed | 3.5s\n", 370 | "[########################################] | 100% Completed | 3.5s\n", 371 | "[########################################] | 100% Completed | 3.4s\n", 372 | "[########################################] | 100% Completed | 3.5s\n", 373 | "[########################################] | 100% Completed | 3.6s\n", 374 | "[########################################] 
| 100% Completed | 3.6s\n", 375 | "[########################################] | 100% Completed | 3.6s\n", 376 | "[########################################] | 100% Completed | 3.6s\n", 377 | "[########################################] | 100% Completed | 3.6s\n", 378 | "[########################################] | 100% Completed | 3.7s\n", 379 | "[########################################] | 100% Completed | 3.7s\n", 380 | "[########################################] | 100% Completed | 3.7s\n", 381 | "[########################################] | 100% Completed | 3.8s\n", 382 | "[########################################] | 100% Completed | 3.6s\n", 383 | "[########################################] | 100% Completed | 3.8s\n", 384 | "[########################################] | 100% Completed | 3.9s\n", 385 | "[########################################] | 100% Completed | 3.9s\n", 386 | "[########################################] | 100% Completed | 4.1s\n", 387 | "[########################################] | 100% Completed | 3.9s\n", 388 | "[########################################] | 100% Completed | 3.6s\n", 389 | "[########################################] | 100% Completed | 3.8s\n", 390 | "[########################################] | 100% Completed | 3.9s\n", 391 | "[########################################] | 100% Completed | 4.1s\n", 392 | "[########################################] | 100% Completed | 3.8s\n", 393 | "[########################################] | 100% Completed | 3.6s\n", 394 | "[########################################] | 100% Completed | 3.8s\n", 395 | "[########################################] | 100% Completed | 3.7s\n", 396 | "[########################################] | 100% Completed | 3.5s\n", 397 | "[########################################] | 100% Completed | 3.7s\n", 398 | "[########################################] | 100% Completed | 3.7s\n", 399 | "[########################################] | 100% Completed | 3.6s\n", 400 | "[########################################] | 100% Completed | 3.8s\n", 401 | "[########################################] | 100% Completed | 4.0s\n", 402 | "[########################################] | 100% Completed | 3.9s\n", 403 | "[########################################] | 100% Completed | 3.9s\n", 404 | "[########################################] | 100% Completed | 3.8s\n", 405 | "[########################################] | 100% Completed | 4.0s\n", 406 | "[########################################] | 100% Completed | 4.2s\n", 407 | "[########################################] | 100% Completed | 3.7s\n", 408 | "[########################################] | 100% Completed | 3.6s\n", 409 | "[########################################] | 100% Completed | 3.8s\n", 410 | "[########################################] | 100% Completed | 4.0s\n", 411 | "[########################################] | 100% Completed | 3.8s\n", 412 | "[########################################] | 100% Completed | 3.8s\n", 413 | "[########################################] | 100% Completed | 3.6s\n", 414 | "[########################################] | 100% Completed | 3.4s\n", 415 | "[########################################] | 100% Completed | 3.4s\n", 416 | "[########################################] | 100% Completed | 3.4s\n", 417 | "[########################################] | 100% Completed | 3.6s\n", 418 | "[########################################] | 100% Completed | 3.5s\n", 419 | "[########################################] | 100% Completed | 3.8s\n", 420 | 
"[########################################] | 100% Completed | 5.7s\n", 421 | "[########################################] | 100% Completed | 4.0s\n", 422 | "[########################################] | 100% Completed | 4.5s\n", 423 | "[########################################] | 100% Completed | 5.1s\n" 424 | ] 425 | } 426 | ], 427 | "source": [ 428 | "# Listing 10.8\n", 429 | "from dask_ml.linear_model import LogisticRegression\n", 430 | "from dask_ml.model_selection import train_test_split\n", 431 | "\n", 432 | "X = feature_array\n", 433 | "y = target_array.flatten()\n", 434 | "\n", 435 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", 436 | "\n", 437 | "lr = LogisticRegression()\n", 438 | "\n", 439 | "with ProgressBar():\n", 440 | " lr.fit(X_train, y_train)" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "### Section 10.2.1" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 17, 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "data": { 457 | "text/plain": [ 458 | "0.79629173556626676" 459 | ] 460 | }, 461 | "execution_count": 17, 462 | "metadata": {}, 463 | "output_type": "execute_result" 464 | } 465 | ], 466 | "source": [ 467 | "# Listing 10.9\n", 468 | "lr.score(X_test, y_test).compute()" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "### Section 10.2.2" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 18, 481 | "metadata": {}, 482 | "outputs": [ 483 | { 484 | "name": "stdout", 485 | "output_type": "stream", 486 | "text": [ 487 | "[########################################] | 100% Completed | 2.1s\n" 488 | ] 489 | } 490 | ], 491 | "source": [ 492 | "# Listing 10.10\n", 493 | "from sklearn.naive_bayes import BernoulliNB\n", 494 | "from dask_ml.wrappers import Incremental\n", 495 | "\n", 496 | "nb = BernoulliNB()\n", 497 | "\n", 498 | "parallel_nb = Incremental(nb)\n", 499 | "\n", 500 | "with ProgressBar():\n", 501 | " parallel_nb.fit(X_train, y_train, classes=[0,1])" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 19, 507 | "metadata": {}, 508 | "outputs": [ 509 | { 510 | "data": { 511 | "text/plain": [ 512 | "0.78886817014389754" 513 | ] 514 | }, 515 | "execution_count": 19, 516 | "metadata": {}, 517 | "output_type": "execute_result" 518 | } 519 | ], 520 | "source": [ 521 | "# Listing 10.11\n", 522 | "parallel_nb.score(X_test, y_test)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "markdown", 527 | "metadata": {}, 528 | "source": [ 529 | "### Section 10.2.3" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 20, 535 | "metadata": {}, 536 | "outputs": [ 537 | { 538 | "name": "stdout", 539 | "output_type": "stream", 540 | "text": [ 541 | "[########################################] | 100% Completed | 23min 24.1s\n" 542 | ] 543 | } 544 | ], 545 | "source": [ 546 | "# Listing 10.12\n", 547 | "from dask_ml.model_selection import GridSearchCV\n", 548 | "\n", 549 | "parameters = {'penalty': ['l1', 'l2'], 'C': [0.5, 1, 2]}\n", 550 | "\n", 551 | "lr = LogisticRegression()\n", 552 | "tuned_lr = GridSearchCV(lr, parameters)\n", 553 | "\n", 554 | "with ProgressBar():\n", 555 | " tuned_lr.fit(X_train, y_train) " 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 21, 561 | "metadata": { 562 | "scrolled": false 563 | }, 564 | "outputs": [ 565 | { 566 | "name": "stderr", 567 | "output_type": "stream", 568 | "text": [ 569 | 
"/anaconda3/lib/python3.6/site-packages/dask_ml/model_selection/utils.py:121: FutureWarning: You are accessing a training score ('split0_train_score'), which will not be available by default any more in sklearn 0.21. If you need training scores, please set return_train_score=True\n", 570 | " warnings.warn(*warn_args, **warn_kwargs)\n", 571 | "/anaconda3/lib/python3.6/site-packages/dask_ml/model_selection/utils.py:121: FutureWarning: You are accessing a training score ('split1_train_score'), which will not be available by default any more in sklearn 0.21. If you need training scores, please set return_train_score=True\n", 572 | " warnings.warn(*warn_args, **warn_kwargs)\n", 573 | "/anaconda3/lib/python3.6/site-packages/dask_ml/model_selection/utils.py:121: FutureWarning: You are accessing a training score ('split2_train_score'), which will not be available by default any more in sklearn 0.21. If you need training scores, please set return_train_score=True\n", 574 | " warnings.warn(*warn_args, **warn_kwargs)\n", 575 | "/anaconda3/lib/python3.6/site-packages/dask_ml/model_selection/utils.py:121: FutureWarning: You are accessing a training score ('mean_train_score'), which will not be available by default any more in sklearn 0.21. If you need training scores, please set return_train_score=True\n", 576 | " warnings.warn(*warn_args, **warn_kwargs)\n", 577 | "/anaconda3/lib/python3.6/site-packages/dask_ml/model_selection/utils.py:121: FutureWarning: You are accessing a training score ('std_train_score'), which will not be available by default any more in sklearn 0.21. If you need training scores, please set return_train_score=True\n", 578 | " warnings.warn(*warn_args, **warn_kwargs)\n" 579 | ] 580 | }, 581 | { 582 | "data": { 583 | "text/html": [ 584 | "
\n", 585 | "\n", 598 | "\n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | "
paramsmean_fit_timestd_fit_timemean_score_timestd_score_timesplit0_test_scoresplit1_test_scoresplit2_test_scoremean_test_scorestd_test_scorerank_test_scoresplit0_train_scoresplit1_train_scoresplit2_train_scoremean_train_scorestd_train_scoreparam_Cparam_penalty
0{'C': 0.5, 'penalty': 'l1'}1308.97891911.5486240.3470880.0445400.7902910.7939380.7970870.7937720.00277740.7956710.7941520.7926040.7941420.0012520.5l1
1{'C': 0.5, 'penalty': 'l2'}143.8654032.2767770.6267230.1457280.7908010.7937150.7969870.7938340.00252710.7960810.7940080.7922640.7941180.0015600.5l2
2{'C': 1, 'penalty': 'l1'}1211.64914672.0248620.6390210.2759570.7906890.7935510.7965590.7936000.00239760.7960140.7937240.7921820.7939730.0015741l1
3{'C': 1, 'penalty': 'l2'}74.9624111.9686210.5535800.0689790.7908010.7937150.7969870.7938340.00252710.7960810.7940080.7922670.7941190.0015591l2
4{'C': 2, 'penalty': 'l1'}608.80257658.2263980.3159400.1228150.7907010.7935920.7968350.7937090.00250550.7960200.7938290.7922550.7940350.0015442l1
5{'C': 2, 'penalty': 'l2'}101.7554547.5133330.5536640.0673460.7908010.7937150.7969870.7938340.00252710.7960810.7940080.7922670.7941190.0015592l2
\n", 751 | "
" 752 | ], 753 | "text/plain": [ 754 | " params mean_fit_time std_fit_time mean_score_time \\\n", 755 | "0 {'C': 0.5, 'penalty': 'l1'} 1308.978919 11.548624 0.347088 \n", 756 | "1 {'C': 0.5, 'penalty': 'l2'} 143.865403 2.276777 0.626723 \n", 757 | "2 {'C': 1, 'penalty': 'l1'} 1211.649146 72.024862 0.639021 \n", 758 | "3 {'C': 1, 'penalty': 'l2'} 74.962411 1.968621 0.553580 \n", 759 | "4 {'C': 2, 'penalty': 'l1'} 608.802576 58.226398 0.315940 \n", 760 | "5 {'C': 2, 'penalty': 'l2'} 101.755454 7.513333 0.553664 \n", 761 | "\n", 762 | " std_score_time split0_test_score split1_test_score split2_test_score \\\n", 763 | "0 0.044540 0.790291 0.793938 0.797087 \n", 764 | "1 0.145728 0.790801 0.793715 0.796987 \n", 765 | "2 0.275957 0.790689 0.793551 0.796559 \n", 766 | "3 0.068979 0.790801 0.793715 0.796987 \n", 767 | "4 0.122815 0.790701 0.793592 0.796835 \n", 768 | "5 0.067346 0.790801 0.793715 0.796987 \n", 769 | "\n", 770 | " mean_test_score std_test_score rank_test_score split0_train_score \\\n", 771 | "0 0.793772 0.002777 4 0.795671 \n", 772 | "1 0.793834 0.002527 1 0.796081 \n", 773 | "2 0.793600 0.002397 6 0.796014 \n", 774 | "3 0.793834 0.002527 1 0.796081 \n", 775 | "4 0.793709 0.002505 5 0.796020 \n", 776 | "5 0.793834 0.002527 1 0.796081 \n", 777 | "\n", 778 | " split1_train_score split2_train_score mean_train_score std_train_score \\\n", 779 | "0 0.794152 0.792604 0.794142 0.001252 \n", 780 | "1 0.794008 0.792264 0.794118 0.001560 \n", 781 | "2 0.793724 0.792182 0.793973 0.001574 \n", 782 | "3 0.794008 0.792267 0.794119 0.001559 \n", 783 | "4 0.793829 0.792255 0.794035 0.001544 \n", 784 | "5 0.794008 0.792267 0.794119 0.001559 \n", 785 | "\n", 786 | " param_C param_penalty \n", 787 | "0 0.5 l1 \n", 788 | "1 0.5 l2 \n", 789 | "2 1 l1 \n", 790 | "3 1 l2 \n", 791 | "4 2 l1 \n", 792 | "5 2 l2 " 793 | ] 794 | }, 795 | "execution_count": 21, 796 | "metadata": {}, 797 | "output_type": "execute_result" 798 | } 799 | ], 800 | "source": [ 801 | "# Listing 10.13\n", 802 | "import pandas as pd\n", 803 | "pd.DataFrame(tuned_lr.cv_results_)" 804 | ] 805 | }, 806 | { 807 | "cell_type": "markdown", 808 | "metadata": {}, 809 | "source": [ 810 | "### Section 10.3" 811 | ] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "execution_count": 22, 816 | "metadata": {}, 817 | "outputs": [], 818 | "source": [ 819 | "# Listing 10.14\n", 820 | "import dill\n", 821 | "with open('naive_bayes_model.pkl', 'wb') as file:\n", 822 | " dill.dump(parallel_nb, file)" 823 | ] 824 | }, 825 | { 826 | "cell_type": "code", 827 | "execution_count": 23, 828 | "metadata": {}, 829 | "outputs": [ 830 | { 831 | "data": { 832 | "text/plain": [ 833 | "array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 834 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 835 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 836 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,\n", 837 | " 1, 1, 1, 1, 1, 1, 1, 1])" 838 | ] 839 | }, 840 | "execution_count": 23, 841 | "metadata": {}, 842 | "output_type": "execute_result" 843 | } 844 | ], 845 | "source": [ 846 | "# Listing 10.15\n", 847 | "with open('naive_bayes_model.pkl', 'rb') as file:\n", 848 | " nb = dill.load(file)\n", 849 | "nb.predict(np.random.randint(0,2,(100,100)))" 850 | ] 851 | } 852 | ], 853 | "metadata": { 854 | "kernelspec": { 855 | "display_name": "Python 3", 856 | "language": "python", 857 | "name": "python3" 858 | }, 859 | "language_info": { 860 | "codemirror_mode": { 861 | 
"name": "ipython", 862 | "version": 3 863 | }, 864 | "file_extension": ".py", 865 | "mimetype": "text/x-python", 866 | "name": "python", 867 | "nbconvert_exporter": "python", 868 | "pygments_lexer": "ipython3", 869 | "version": "3.6.8" 870 | } 871 | }, 872 | "nbformat": 4, 873 | "nbformat_minor": 2 874 | } 875 | -------------------------------------------------------------------------------- /Chapter 11/Chapter 11.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Science with Python and Dask\n", 8 | "## Chapter 11: Scaling and Deploying Dask" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Section 11.2" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 5, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/html": [ 26 | "\n", 27 | "\n", 28 | "\n", 35 | "\n", 43 | "\n", 44 | "
\n", 29 | "

Client

\n", 30 | "\n", 34 | "
\n", 36 | "

Cluster

\n", 37 | "
    \n", 38 | "
  • Workers: 6
  • \n", 39 | "
  • Cores: 6
  • \n", 40 | "
  • Memory: 6.20 GB
  • \n", 41 | "
\n", 42 | "
" 45 | ], 46 | "text/plain": [ 47 | "" 48 | ] 49 | }, 50 | "execution_count": 5, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "# Listing 11.2\n", 57 | "from dask.distributed import Client, progress\n", 58 | "client = Client()\n", 59 | "client" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# Listing 11.3\n", 69 | "from dask import array as da\n", 70 | "feature_array = da.from_zarr('/data/sentiment_feature_array.zarr')\n", 71 | "target_array = da.from_zarr('/data/sentiment_target_array.zarr')" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 14, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "from dask_ml.linear_model import LogisticRegression\n", 81 | "from dask_ml.model_selection import train_test_split\n", 82 | "\n", 83 | "X = feature_array\n", 84 | "y = target_array.flatten()\n", 85 | "\n", 86 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", 87 | "\n", 88 | "lr = LogisticRegression()\n", 89 | "\n", 90 | "status = lr.fit(X_train, y_train)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "### Scenario 2" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 15, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "0.7962917355662668" 109 | ] 110 | }, 111 | "execution_count": 15, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "lr.score(X_test, y_test).compute()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 16, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "Incremental(estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),\n", 129 | " random_state=None, scoring=None, shuffle_blocks=True)" 130 | ] 131 | }, 132 | "execution_count": 16, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "from sklearn.naive_bayes import BernoulliNB\n", 139 | "from dask_ml.wrappers import Incremental\n", 140 | "\n", 141 | "nb = BernoulliNB()\n", 142 | "\n", 143 | "parallel_nb = Incremental(nb)\n", 144 | "\n", 145 | "parallel_nb.fit(X_train, y_train, classes=[0,1])" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 17, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "0.7888681701438975" 157 | ] 158 | }, 159 | "execution_count": 17, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "parallel_nb.score(X_test, y_test)" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.6.8" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 2 190 | } 191 | -------------------------------------------------------------------------------- /Chapter 11/notebook/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM daskdev/dask-notebook 2 | USER 
root 3 | COPY requirements.txt build.sh ./ 4 | COPY start-notebook.sh /opt/app 5 | RUN sh build.sh 6 | RUN rm build.sh 7 | EXPOSE 8888 8 | CMD ["sh","/opt/app/start-notebook.sh"] 9 | -------------------------------------------------------------------------------- /Chapter 11/notebook/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | sudo apt-get update 4 | sudo apt-get -y install build-essential 5 | echo "Getting Python packages..." 6 | pip install -U --no-cache-dir -r requirements.txt 7 | rm requirements.txt 8 | echo "Done!" 9 | -------------------------------------------------------------------------------- /Chapter 11/notebook/requirements.txt: -------------------------------------------------------------------------------- 1 | blosc 2 | zarr 3 | dask-ml 4 | -------------------------------------------------------------------------------- /Chapter 11/notebook/start-notebook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Get the scheduler name from EFS 3 | scheduler=$(cat /data/.scheduler) 4 | echo "Setting scheduler name to $scheduler" 5 | export DASK_SCHEDULER_ADDRESS="tcp://$scheduler:8786" 6 | 7 | # Start the notebook server 8 | start.sh jupyter lab 9 | -------------------------------------------------------------------------------- /Chapter 11/scheduler/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM daskdev/dask 2 | 3 | COPY scheduler-start.sh ./ 4 | 5 | EXPOSE 8786 6 | EXPOSE 8787 7 | 8 | CMD ["sh","scheduler-start.sh"] 9 | -------------------------------------------------------------------------------- /Chapter 11/scheduler/scheduler-start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Write the hostname of the scheduler to the EFS system 4 | hostname=$(hostname) 5 | echo "Setting scheduler hostname to $hostname" 6 | hostname > /data/.scheduler 7 | 8 | # Start the scheduler 9 | echo "Starting Dask Scheduler..." 10 | dask-scheduler 11 | -------------------------------------------------------------------------------- /Chapter 11/worker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM daskdev/dask 2 | 3 | USER root 4 | 5 | # Install dependencies 6 | COPY requirements.txt build.sh worker-start.sh ./ 7 | RUN sh build.sh 8 | RUN rm build.sh 9 | 10 | CMD ["sh", "worker-start.sh"] 11 | -------------------------------------------------------------------------------- /Chapter 11/worker/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | apt-get update 4 | apt-get -y install build-essential 5 | echo "Getting Python packages..." 6 | pip install -U --no-cache-dir -r requirements.txt 7 | rm requirements.txt 8 | echo "Done!" 
9 | -------------------------------------------------------------------------------- /Chapter 11/worker/requirements.txt: -------------------------------------------------------------------------------- 1 | blosc 2 | zarr 3 | dask-ml 4 | -------------------------------------------------------------------------------- /Chapter 11/worker/worker-start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Get the scheduler name from EFS 3 | scheduler=$(cat /data/.scheduler) 4 | echo "Setting scheduler hostname to $scheduler" 5 | echo "Starting Dask worker..." 6 | dask-worker --worker-port 8000 tcp://$scheduler:8786 7 | -------------------------------------------------------------------------------- /Chapter 3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Science with Python and Dask\n", 8 | "## Chapter 3 - Introducing Dask DataFrames" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Section 3.1" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# Listing 3.1\n", 25 | "person_IDs = [1,2,3]\n", 26 | "person_last_names = ['Smith', 'Williams', 'Williams']\n", 27 | "person_first_names = ['John', 'Bill', 'Jane']\n", 28 | "person_DOBs = ['1982-10-06', '1990-07-04', '1989-05-06']" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "### Section 3.2.1" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# Listing 3.2\n", 45 | "import pandas as pd\n", 46 | "import dask.dataframe as dd\n", 47 | "\n", 48 | "# Creating all the data as lists\n", 49 | "person_IDs = [1,2,3,4,5,6,7,8,9,10]\n", 50 | "person_last_names = ['Smith', 'Williams', 'Williams','Jackson','Johnson','Smith','Anderson','Christiansen','Carter','Davidson']\n", 51 | "person_first_names = ['John', 'Bill', 'Jane','Cathy','Stuart','James','Felicity','Liam','Nancy','Christina']\n", 52 | "person_DOBs = ['1982-10-06', '1990-07-04', '1989-05-06', '1974-01-24', '1995-06-05', '1984-04-16', '1976-09-15', '1992-10-02', '1986-02-05', '1993-08-11']\n", 53 | "\n", 54 | "# Storing the data in a Pandas DataFrame\n", 55 | "people_pandas_df = pd.DataFrame({'Person ID': person_IDs, \n", 56 | " 'Last Name': person_last_names, \n", 57 | " 'First Name': person_first_names,\n", 58 | " 'Date of Birth': person_DOBs},\n", 59 | " columns=['Person ID', 'Last Name', 'First Name', 'Date of Birth'])\n", 60 | "\n", 61 | "# Converting the Pandas DataFrame to a Dask DataFrame\n", 62 | "people_dask_df = dd.from_pandas(people_pandas_df, npartitions=2)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "(0, 5, 9)\n", 75 | "2\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "# Listing 3.3\n", 81 | "print(people_dask_df.divisions)\n", 82 | "print(people_dask_df.npartitions)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "0 5\n", 94 | "1 5\n", 95 | "dtype: int64" 96 | ] 97 | }, 98 | "execution_count": 4, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 
102 | ], 103 | "source": [ 104 | "# Listing 3.4\n", 105 | "people_dask_df.map_partitions(lambda x: len(x)).compute()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 8, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "0 3\n", 118 | "1 5\n", 119 | "dtype: int64\n", 120 | "0 8\n", 121 | "dtype: int64\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "# Listing 3.5\n", 127 | "people_filtered = people_dask_df[people_dask_df['Last Name'] != 'Williams']\n", 128 | "print(people_filtered.map_partitions(lambda x: len(x)).compute())\n", 129 | "\n", 130 | "people_filtered_reduced = people_filtered.repartition(npartitions=1)\n", 131 | "print(people_filtered_reduced.map_partitions(lambda x: len(x)).compute())" 132 | ] 133 | } 134 | ], 135 | "metadata": { 136 | "kernelspec": { 137 | "display_name": "Python 3", 138 | "language": "python", 139 | "name": "python3" 140 | }, 141 | "language_info": { 142 | "codemirror_mode": { 143 | "name": "ipython", 144 | "version": 3 145 | }, 146 | "file_extension": ".py", 147 | "mimetype": "text/x-python", 148 | "name": "python", 149 | "nbconvert_exporter": "python", 150 | "pygments_lexer": "ipython3", 151 | "version": "3.6.8" 152 | } 153 | }, 154 | "nbformat": 4, 155 | "nbformat_minor": 2 156 | } 157 | -------------------------------------------------------------------------------- /Chapter 5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Science with Python and Dask\n", 8 | "## Chapter 5: Cleaning and Transforming DataFrames" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "# Before beginning, set your working directory to where the data resides\n", 18 | "import os\n", 19 | "os.chdir('/Users/jesse/Documents')" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Intro Section" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# Listing 5.1\n", 36 | "import dask.dataframe as dd\n", 37 | "from dask.diagnostics import ProgressBar\n", 38 | "import numpy as np\n", 39 | "\n", 40 | "dtypes = {\n", 41 | " 'Date First Observed': np.str,\n", 42 | " 'Days Parking In Effect ': np.str,\n", 43 | " 'Double Parking Violation': np.str,\n", 44 | " 'Feet From Curb': np.float32,\n", 45 | " 'From Hours In Effect': np.str,\n", 46 | " 'House Number': np.str,\n", 47 | " 'Hydrant Violation': np.str,\n", 48 | " 'Intersecting Street': np.str,\n", 49 | " 'Issue Date': np.str,\n", 50 | " 'Issuer Code': np.float32,\n", 51 | " 'Issuer Command': np.str,\n", 52 | " 'Issuer Precinct': np.float32,\n", 53 | " 'Issuer Squad': np.str,\n", 54 | " 'Issuing Agency': np.str,\n", 55 | " 'Law Section': np.float32,\n", 56 | " 'Meter Number': np.str,\n", 57 | " 'No Standing or Stopping Violation': np.str,\n", 58 | " 'Plate ID': np.str,\n", 59 | " 'Plate Type': np.str,\n", 60 | " 'Registration State': np.str,\n", 61 | " 'Street Code1': np.uint32,\n", 62 | " 'Street Code2': np.uint32,\n", 63 | " 'Street Code3': np.uint32,\n", 64 | " 'Street Name': np.str,\n", 65 | " 'Sub Division': np.str,\n", 66 | " 'Summons Number': np.uint32,\n", 67 | " 'Time First Observed': np.str,\n", 68 | " 'To Hours In Effect': np.str,\n", 69 | " 'Unregistered 
Vehicle?': np.str,\n", 70 | " 'Vehicle Body Type': np.str,\n", 71 | " 'Vehicle Color': np.str,\n", 72 | " 'Vehicle Expiration Date': np.str,\n", 73 | " 'Vehicle Make': np.str,\n", 74 | " 'Vehicle Year': np.float32,\n", 75 | " 'Violation Code': np.uint16,\n", 76 | " 'Violation County': np.str,\n", 77 | " 'Violation Description': np.str,\n", 78 | " 'Violation In Front Of Or Opposite': np.str,\n", 79 | " 'Violation Legal Code': np.str,\n", 80 | " 'Violation Location': np.str,\n", 81 | " 'Violation Post Code': np.str,\n", 82 | " 'Violation Precinct': np.float32,\n", 83 | " 'Violation Time': np.str\n", 84 | "}\n", 85 | "\n", 86 | "nyc_data_raw = dd.read_csv('nyc-parking-tickets/*.csv', dtype=dtypes, usecols=dtypes.keys())" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "### Section 5.1.1" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# Listing 5.2\n", 103 | "with ProgressBar():\n", 104 | " display(nyc_data_raw['Plate ID'].head())" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# Listing 5.3\n", 114 | "with ProgressBar():\n", 115 | " display(nyc_data_raw[['Plate ID', 'Registration State']].head())" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# Listing 5.4\n", 125 | "columns_to_select = ['Plate ID', 'Registration State']\n", 126 | "\n", 127 | "with ProgressBar():\n", 128 | " display(nyc_data_raw[columns_to_select].head())" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Section 5.1.2" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# Listing 5.5\n", 145 | "with ProgressBar():\n", 146 | " display(nyc_data_raw.drop('Violation Code', axis=1).head())" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "# Listing 5.6\n", 156 | "violationColumnNames = list(filter(lambda columnName: 'Violation' in columnName, nyc_data_raw.columns))\n", 157 | "\n", 158 | "with ProgressBar():\n", 159 | " display(nyc_data_raw.drop(violationColumnNames, axis=1).head())" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "### Section 5.1.3" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# Listing 5.7\n", 176 | "nyc_data_renamed = nyc_data_raw.rename(columns={'Plate ID':'License Plate'})\n", 177 | "nyc_data_renamed" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "### Section 5.1.4" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "# Listing 5.8\n", 194 | "with ProgressBar():\n", 195 | " display(nyc_data_raw.loc[56].head(1))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "# Listing 5.9\n", 205 | "with ProgressBar():\n", 206 | " display(nyc_data_raw.loc[100:200].head(100))" 207 | ] 208 | }, 209 | { 210 | 
"cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# Listing 5.10\n", 216 | "with ProgressBar():\n", 217 | " some_rows = nyc_data_raw.loc[100:200].head(100)\n", 218 | "some_rows.drop(range(100, 200, 2))" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "### Section 5.2.1" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "# Listing 5.11\n", 235 | "missing_values = nyc_data_raw.isnull().sum()\n", 236 | "with ProgressBar():\n", 237 | " percent_missing = ((missing_values / nyc_data_raw.index.size) * 100).compute()\n", 238 | "percent_missing" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "### Section 5.2.2" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "# Listing 5.12\n", 255 | "columns_to_drop = list(percent_missing[percent_missing >= 50].index)\n", 256 | "nyc_data_clean_stage1 = nyc_data_raw.drop(columns_to_drop, axis=1)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "### Section 5.2.3" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "# Listing 5.13\n", 273 | "with ProgressBar():\n", 274 | " count_of_vehicle_colors = nyc_data_clean_stage1['Vehicle Color'].value_counts().compute()\n", 275 | "most_common_color = count_of_vehicle_colors.sort_values(ascending=False).index[0]\n", 276 | "\n", 277 | "# Fill missing vehicle color with the most common color\n", 278 | "nyc_data_clean_stage2 = nyc_data_clean_stage1.fillna({'Vehicle Color': most_common_color})" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "### Section 5.2.4" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "# Listing 5.14\n", 295 | "\n", 296 | "# Updated to compensate for bug identified in https://github.com/dask/dask/issues/5854\n", 297 | "\n", 298 | "# Old code:\n", 299 | "# rows_to_drop = list(percent_missing[(percent_missing > 0) & (percent_missing < 5)].index)\n", 300 | "# nyc_data_clean_stage3 = nyc_data_clean_stage2.dropna(subset=rows_to_drop)\n", 301 | "\n", 302 | "# New code splits the rows to drop into two separate lists and chains the dropna methods to drop all the columns we want\n", 303 | "rows_to_drop1 =['Plate ID', 'Vehicle Body Type', 'Vehicle Make', 'Vehicle Expiration Date', 'Violation Precinct', 'Issuer Precinct', 'Issuer Code', 'Violation Time', 'Street Name']\n", 304 | "rows_to_drop2 =['Date First Observed', 'Law Section', 'Sub Division', 'Vehicle Color', 'Vehicle Year', 'Feet From Curb']\n", 305 | "nyc_data_clean_stage3 = nyc_data_clean_stage2.dropna(subset=rows_to_drop1).dropna(subset=rows_to_drop2)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### Section 5.2.5" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "# Listing 5.15\n", 322 | "remaining_columns_to_clean = list(percent_missing[(percent_missing >= 5) & (percent_missing < 50)].index)\n", 323 | 
"nyc_data_raw.dtypes[remaining_columns_to_clean]" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "# Listing 5.16\n", 333 | "unknown_default_dict = dict(map(lambda columnName: (columnName, 'Unknown'), remaining_columns_to_clean))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "# Listing 5.17\n", 343 | "nyc_data_clean_stage4 = nyc_data_clean_stage3.fillna(unknown_default_dict)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "# Listing 5.18\n", 353 | "with ProgressBar():\n", 354 | " print(nyc_data_clean_stage4.isnull().sum().compute())\n", 355 | " nyc_data_clean_stage4.persist()" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "### Section 5.3" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "# Listing 5.19\n", 372 | "with ProgressBar():\n", 373 | " license_plate_types = nyc_data_clean_stage4['Plate Type'].value_counts().compute()\n", 374 | "license_plate_types" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [ 383 | "# Listing 5.20\n", 384 | "condition = nyc_data_clean_stage4['Plate Type'].isin(['PAS', 'COM'])\n", 385 | "plate_type_masked = nyc_data_clean_stage4['Plate Type'].where(condition, 'Other')\n", 386 | "nyc_data_recode_stage1 = nyc_data_clean_stage4.drop('Plate Type', axis=1)\n", 387 | "nyc_data_recode_stage2 = nyc_data_recode_stage1.assign(PlateType=plate_type_masked)\n", 388 | "nyc_data_recode_stage3 = nyc_data_recode_stage2.rename(columns={'PlateType':'Plate Type'})" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "# Listing 5.21\n", 398 | "with ProgressBar():\n", 399 | " display(nyc_data_recode_stage3['Plate Type'].value_counts().compute())" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "# Listing 5.22\n", 409 | "single_color = list(count_of_vehicle_colors[count_of_vehicle_colors == 1].index)\n", 410 | "condition = nyc_data_clean_stage4['Vehicle Color'].isin(single_color)\n", 411 | "vehicle_color_masked = nyc_data_clean_stage4['Vehicle Color'].mask(condition, 'Other')\n", 412 | "nyc_data_recode_stage4 = nyc_data_recode_stage3.drop('Vehicle Color', axis=1)\n", 413 | "nyc_data_recode_stage5 = nyc_data_recode_stage4.assign(VehicleColor=vehicle_color_masked)\n", 414 | "nyc_data_recode_stage6 = nyc_data_recode_stage5.rename(columns={'VehicleColor':'Vehicle Color'})" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "### Section 5.4" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "# Listing 5.23\n", 431 | "from datetime import datetime\n", 432 | "issue_date_parsed = nyc_data_recode_stage6['Issue Date'].apply(lambda x: datetime.strptime(x, \"%m/%d/%Y\"), meta=datetime)\n", 433 | "nyc_data_derived_stage1 = nyc_data_recode_stage6.drop('Issue Date', axis=1)\n", 434 | 
"nyc_data_derived_stage2 = nyc_data_derived_stage1.assign(IssueDate=issue_date_parsed)\n", 435 | "nyc_data_derived_stage3 = nyc_data_derived_stage2.rename(columns={'IssueDate':'Issue Date'})" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "# Listing 5.24\n", 445 | "with ProgressBar():\n", 446 | " display(nyc_data_derived_stage3['Issue Date'].head())" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [ 455 | "# Listing 5.25\n", 456 | "issue_date_month_year = nyc_data_derived_stage3['Issue Date'].apply(lambda dt: dt.strftime(\"%Y%m\"), meta=str)\n", 457 | "nyc_data_derived_stage4 = nyc_data_derived_stage3.assign(IssueMonthYear=issue_date_month_year)\n", 458 | "nyc_data_derived_stage5 = nyc_data_derived_stage4.rename(columns={'IssueMonthYear':'Citation Issued Month Year'})" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "# Listing 5.26\n", 468 | "with ProgressBar():\n", 469 | " display(nyc_data_derived_stage5['Citation Issued Month Year'].head())" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "### Section 5.5.1" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "# Listing 5.27\n", 486 | "months = ['201310','201410','201510','201610','201710']\n", 487 | "condition = nyc_data_derived_stage5['Citation Issued Month Year'].isin(months)\n", 488 | "october_citations = nyc_data_derived_stage5[condition]\n", 489 | "\n", 490 | "with ProgressBar():\n", 491 | " display(october_citations.head())" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "# Listing 5.28\n", 501 | "bound_date = '2016-4-25'\n", 502 | "condition = nyc_data_derived_stage5['Issue Date'] > bound_date\n", 503 | "citations_after_bound = nyc_data_derived_stage5[condition]\n", 504 | "\n", 505 | "with ProgressBar():\n", 506 | " display(citations_after_bound.head())" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "### Section 5.5.1" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [ 522 | "# Listing 5.29\n", 523 | "with ProgressBar():\n", 524 | " condition = (nyc_data_derived_stage5['Issue Date'] > '2014-01-01') & (nyc_data_derived_stage5['Issue Date'] <= '2017-12-31')\n", 525 | " nyc_data_filtered = nyc_data_derived_stage5[condition]\n", 526 | " nyc_data_new_index = nyc_data_filtered.set_index('Citation Issued Month Year')" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "# Listing 5.30\n", 536 | "years = ['2014', '2015', '2016', '2017']\n", 537 | "months = ['01','02','03','04','05','06','07','08','09','10','11','12']\n", 538 | "divisions = [year + month for year in years for month in months]\n", 539 | "\n", 540 | "with ProgressBar():\n", 541 | " nyc_data_new_index.repartition(divisions=divisions).to_parquet('nyc_data_date_index', compression='snappy')\n", 542 | " \n", 543 | "nyc_data_new_index = dd.read_parquet('nyc_data_date_index')" 544 | ] 545 | }, 
546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "### Section 5.6.1" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "# Listing 5.31\n", 560 | "import pandas as pd\n", 561 | "nyc_temps = pd.read_csv('nyc-temp-data.csv')\n", 562 | "\n", 563 | "# Filtered out only the relevant months from the temperature data to accelerate the join\n", 564 | "nyc_temps_filtered = nyc_temps[nyc_temps.monthYear.isin(divisions)]\n", 565 | "\n", 566 | "nyc_temps_indexed = nyc_temps_filtered.set_index(nyc_temps_filtered.monthYear.astype(str))\n", 567 | "nyc_data_with_temps = nyc_data_new_index.join(nyc_temps_indexed, how='inner')\n", 568 | "\n", 569 | "with ProgressBar():\n", 570 | " display(nyc_data_with_temps.head(15))" 571 | ] 572 | }, 573 | { 574 | "cell_type": "markdown", 575 | "metadata": {}, 576 | "source": [ 577 | "### Section 5.6.2" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "# Listing 5.32\n", 587 | "fy16 = dd.read_csv('nyc-parking-tickets/Parking_Violations_Issued_-_Fiscal_Year_2016.csv', dtype=dtypes, usecols=dtypes.keys())\n", 588 | "fy17 = dd.read_csv('nyc-parking-tickets/Parking_Violations_Issued_-_Fiscal_Year_2017.csv', dtype=dtypes, usecols=dtypes.keys())\n", 589 | "\n", 590 | "fy1617 = fy16.append(fy17)\n", 591 | "\n", 592 | "with ProgressBar():\n", 593 | " print(fy16['Summons Number'].count().compute())\n", 594 | "\n", 595 | "with ProgressBar():\n", 596 | " print(fy17['Summons Number'].count().compute())\n", 597 | "\n", 598 | "with ProgressBar():\n", 599 | " print(fy1617['Summons Number'].count().compute())\n" 600 | ] 601 | }, 602 | { 603 | "cell_type": "markdown", 604 | "metadata": {}, 605 | "source": [ 606 | "### Section 5.7.1" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "metadata": {}, 613 | "outputs": [], 614 | "source": [ 615 | "# Listing 5.33\n", 616 | "with ProgressBar():\n", 617 | " if not os.path.exists('nyc-final-csv'):\n", 618 | " os.makedirs('nyc-final-csv') \n", 619 | " nyc_data_with_temps.repartition(npartitions=1).to_csv('nyc-final-csv/part*.csv')" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [ 628 | "# Listing 5.34\n", 629 | "with ProgressBar():\n", 630 | " if not os.path.exists('nyc-final-csv-compressed'):\n", 631 | " os.makedirs('nyc-final-csv-compressed')\n", 632 | " nyc_data_with_temps.to_csv(\n", 633 | " filename='nyc-final-csv-compressed/*', \n", 634 | " compression='gzip', \n", 635 | " sep='|', \n", 636 | " na_rep='NULL', \n", 637 | " header=False, \n", 638 | " index=False)" 639 | ] 640 | }, 641 | { 642 | "cell_type": "markdown", 643 | "metadata": {}, 644 | "source": [ 645 | "### Section 5.7.2" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": { 652 | "scrolled": false 653 | }, 654 | "outputs": [], 655 | "source": [ 656 | "# Listing 5.35\n", 657 | "# Added reset_index as later versions of Dask raise an error stating the index column can't be found\n", 658 | "with ProgressBar():\n", 659 | " nyc_data_with_temps.reset_index(drop=True).to_parquet('nyc_final', compression='snappy')" 660 | ] 661 | } 662 | ], 663 | "metadata": { 664 | "kernelspec": { 665 | "display_name": "Python 3", 666 | "language": "python", 667 | "name": "python3" 668 | },
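One note on the join in Listing 5.31: the temperature lookup stays a plain pandas DataFrame, and joining a Dask DataFrame against a small in-memory frame on the index is cheap because the pandas side can simply be handed to every partition without a shuffle. A minimal sketch of the pattern with invented stand-in frames:

```python
# Sketch (synthetic data): index join of a Dask frame with a small pandas lookup table
import pandas as pd
import dask.dataframe as dd

idx = pd.Index(['201401', '201402'], name='monthYear')
citations = dd.from_pandas(pd.DataFrame({'citations': [10, 20]}, index=idx), npartitions=2)
temps = pd.DataFrame({'Temp': [28.1, 31.6]}, index=idx)

joined = citations.join(temps, how='inner')  # pandas side is shipped to each partition
print(joined.compute())
```

669 | 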
"language_info": { 670 | "codemirror_mode": { 671 | "name": "ipython", 672 | "version": 3 673 | }, 674 | "file_extension": ".py", 675 | "mimetype": "text/x-python", 676 | "name": "python", 677 | "nbconvert_exporter": "python", 678 | "pygments_lexer": "ipython3", 679 | "version": "3.7.7" 680 | } 681 | }, 682 | "nbformat": 4, 683 | "nbformat_minor": 2 684 | } 685 | -------------------------------------------------------------------------------- /Chapter 6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Science with Python and Dask\n", 8 | "## Chapter 6: Summarizing and Analyzing DataFrames" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "# Before beginning, set your working directory to where the data resides\n", 18 | "import os\n", 19 | "os.chdir('/Users/jesse/Documents')" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Section 6.1.2" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# Listing 6.1\n", 36 | "import dask.dataframe as dd\n", 37 | "import pyarrow\n", 38 | "from dask.diagnostics import ProgressBar\n", 39 | "\n", 40 | "nyc_data = dd.read_parquet('nyc_final', engine='pyarrow')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# Listing 6.2\n", 50 | "with ProgressBar():\n", 51 | " vehicle_age_by_year = nyc_data['Vehicle Year'].value_counts().compute()\n", 52 | "vehicle_age_by_year" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Listing 6.3\n", 62 | "with ProgressBar():\n", 63 | " condition = (nyc_data['Vehicle Year'] > 0) & (nyc_data['Vehicle Year'] <= 2018)\n", 64 | " vehicle_age_by_year = nyc_data[condition]['Vehicle Year'].value_counts().compute().sort_index()\n", 65 | "vehicle_age_by_year" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# Listing 6.4\n", 75 | "nyc_data_filtered = nyc_data[condition]\n", 76 | "\n", 77 | "def age_calculation(row):\n", 78 | " return int(row['Issue Date'].year - row['Vehicle Year'])\n", 79 | "\n", 80 | "vehicle_age = nyc_data_filtered.apply(age_calculation, axis=1, meta=('Vehicle Age', 'int'))\n", 81 | "\n", 82 | "nyc_data_vehicle_age_stg1 = nyc_data_filtered.assign(VehicleAge=vehicle_age)\n", 83 | "nyc_data_vehicle_age_stg2 = nyc_data_vehicle_age_stg1.rename(columns={'VehicleAge':'Vehicle Age'})\n", 84 | "\n", 85 | "nyc_data_with_vehicle_age = nyc_data_vehicle_age_stg2[nyc_data_vehicle_age_stg2['Vehicle Age'] >= 0]" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# Listing 6.5\n", 95 | "with ProgressBar():\n", 96 | " files = nyc_data_with_vehicle_age.to_parquet('nyc_data_vehicleAge', engine='pyarrow')\n", 97 | "\n", 98 | "nyc_data_with_vehicle_age = dd.read_parquet('nyc_data_vehicleAge', engine='pyarrow')" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "# Listing 6.6\n", 108 | "from dask.array import 
stats as dask_stats\n", 109 | "with ProgressBar():\n", 110 | " mean = nyc_data_with_vehicle_age['Vehicle Age'].mean().compute()\n", 111 | " stdev = nyc_data_with_vehicle_age['Vehicle Age'].std().compute()\n", 112 | " minimum = nyc_data_with_vehicle_age['Vehicle Age'].min().compute()\n", 113 | " maximum = nyc_data_with_vehicle_age['Vehicle Age'].max().compute()\n", 114 | " skewness = float(dask_stats.skew(nyc_data_with_vehicle_age['Vehicle Age'].values).compute())" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### Section 6.1.3" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "# Listing 6.7\n", 131 | "with ProgressBar():\n", 132 | " descriptive_stats = nyc_data_with_vehicle_age['Vehicle Age'].describe().compute()\n", 133 | "descriptive_stats.round(2)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "### Section 6.2.2" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "# Listing 6.8\n", 150 | "import pandas as pd\n", 151 | "\n", 152 | "years = ['2014', '2015', '2016', '2017']\n", 153 | "months = ['01','02','03','04','05','06','07','08','09','10','11','12']\n", 154 | "years_months = [year + month for year in years for month in months]\n", 155 | "\n", 156 | "sort_order = pd.Series(range(len(years_months)), index=years_months, name='custom_sort')\n", 157 | "\n", 158 | "def sort_by_months(dataframe, order):\n", 159 | " return dataframe.join(order).sort_values('custom_sort').drop('custom_sort', axis=1)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# Listing 6.9\n", 169 | "with ProgressBar():\n", 170 | " nyc_data_by_month = nyc_data.groupby('monthYear')\n", 171 | " citations_per_month = nyc_data_by_month['Summons Number'].count().compute()\n", 172 | "sort_by_months(citations_per_month.to_frame(), sort_order)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "# Listing 6.10\n", 182 | "with ProgressBar():\n", 183 | " condition = ~nyc_data['monthYear'].isin(['201707','201708','201709','201710','201711','201712'])\n", 184 | " nyc_data_filtered = nyc_data[condition]\n", 185 | " citations_and_temps = nyc_data_filtered.groupby('monthYear').agg({'Summons Number': 'count', 'Temp': 'mean'})\n", 186 | " correlation_matrix = citations_and_temps.corr().compute()\n", 187 | "correlation_matrix" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "### Section 6.3.2" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "# Listing 6.11\n", 204 | "nyc_data_with_vehicle_age = dd.read_parquet('nyc_data_vehicleAge', engine='pyarrow')\n", 205 | "\n", 206 | "nyc_data_filtered = nyc_data_with_vehicle_age[nyc_data_with_vehicle_age ['Plate Type'].isin(['PAS','COM'])]" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# Listing 6.12\n", 216 | "with ProgressBar():\n", 217 | " N = nyc_data_filtered['Vehicle Age'].count().compute()\n", 218 | " p = nyc_data_filtered['Plate 
Type'].unique().count().compute()\n", 219 | "brown_forsythe_left = (N - p) / (p - 1)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# Listing 6.13\n", 229 | "with ProgressBar():\n", 230 | " passenger_vehicles = nyc_data_filtered[nyc_data_filtered['Plate Type'] == 'PAS']\n", 231 | " commercial_vehicles = nyc_data_filtered[nyc_data_filtered['Plate Type'] == 'COM']\n", 232 | " median_PAS = passenger_vehicles['Vehicle Age'].quantile(0.5).compute()\n", 233 | " median_COM = commercial_vehicles['Vehicle Age'].quantile(0.5).compute()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "# Listing 6.14\n", 243 | "def absolute_deviation_from_median(row):\n", 244 | " if row['Plate Type'] == 'PAS':\n", 245 | " return abs(row['Vehicle Age'] - median_PAS)\n", 246 | " else:\n", 247 | " return abs(row['Vehicle Age'] - median_COM)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "# Listing 6.15\n", 257 | "absolute_deviation = nyc_data_filtered.apply(absolute_deviation_from_median, axis=1, meta=('x', 'float32'))\n", 258 | "\n", 259 | "nyc_data_age_type_test_stg1 = nyc_data_filtered.assign(MedianDifferences = absolute_deviation)\n", 260 | "nyc_data_age_type_test = nyc_data_age_type_test_stg1.rename(columns={'MedianDifferences':'Median Difference'})" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "# Listing 6.16\n", 270 | "with ProgressBar():\n", 271 | " group_means = nyc_data_age_type_test.groupby('Plate Type')['Median Difference'].mean().compute()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "# Listing 6.17\n", 281 | "def group_mean_variance(row):\n", 282 | " if row['Plate Type'] == 'PAS':\n", 283 | " return (row['Median Difference'] - group_means['PAS'])**2\n", 284 | " else:\n", 285 | " return (row['Median Difference'] - group_means['COM'])**2\n", 286 | " \n", 287 | "group_mean_variances = nyc_data_age_type_test.apply(group_mean_variance, axis=1, meta=('x', 'float32'))\n", 288 | "\n", 289 | "nyc_data_age_type_test_gmv_stg1 = nyc_data_age_type_test.assign(GroupMeanVariances = group_mean_variances)\n", 290 | "nyc_data_age_type_test_gmv = nyc_data_age_type_test_gmv_stg1.rename(columns={'GroupMeanVariances':'Group Mean Variance'})" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "# Listing 6.18\n", 300 | "with ProgressBar():\n", 301 | " brown_forsythe_right_denominator = nyc_data_age_type_test_gmv['Group Mean Variance'].sum().compute()" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "# Listing 6.19\n", 311 | "with ProgressBar():\n", 312 | " grand_mean = nyc_data_age_type_test['Median Difference'].mean().compute()" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "# Listing 6.20\n", 322 | "brown_forsythe_aggregation = dd.Aggregation(\n", 323 | " 'Brown_Forsythe',\n", 324 | " lambda chunk: (chunk.count(), 
chunk.sum()),\n", 325 | " lambda chunk_count, chunk_sum: (chunk_count.sum(), chunk_sum.sum()),\n", 326 | " lambda group_count, group_sum: group_count * (((group_sum / group_count) - grand_mean)**2)\n", 327 | ")" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "# Listing 6.21\n", 337 | "with ProgressBar():\n", 338 | " group_variances = nyc_data_age_type_test.groupby('Plate Type').agg({'Median Difference': brown_forsythe_aggregation}).compute()" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "# Listing 6.22\n", 348 | "brown_forsythe_right_numerator = group_variances.sum()[0]" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [ 357 | "# Listing 6.23\n", 358 | "F_statistic = brown_forsythe_left * (brown_forsythe_right_numerator / brown_forsythe_right_denominator)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "# Listing 6.24\n", 368 | "import scipy.stats as stats\n", 369 | "alpha = 0.05\n", 370 | "df1 = p - 1\n", 371 | "df2 = N - p\n", 372 | "F_critical = stats.f.ppf(q=1-alpha, dfn=df1, dfd=df2)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "# Listing 6.25\n", 382 | "print(\"Using the Brown-Forsythe Test for Equal Variance\")\n", 383 | "print(\"The Null Hypothesis states: the variance is constant among groups\")\n", 384 | "print(\"The Alternative Hypothesis states: the variance is not constant among groups\")\n", 385 | "print(\"At a confidence level of \" + str(alpha) + \", the F statistic was \" + str(F_statistic) + \" and the F critical value was \" + str(F_critical) + \".\")\n", 386 | "if F_statistic > F_critical:\n", 387 | " print(\"We can reject the null hypothesis. Set equal_var to False.\")\n", 388 | "else:\n", 389 | " print(\"We fail to reject the null hypothesis. 
Set equal_var to True.\")" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "# Listing 6.26\n", 399 | "with ProgressBar():\n", 400 | " pas = passenger_vehicles['Vehicle Age'].values.compute()\n", 401 | " com = commercial_vehicles['Vehicle Age'].values.compute()" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "# Listing 6.27\n", 411 | "stats.ttest_ind(pas, com, equal_var=False)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "### Section 6.4.1" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "# Listing 6.28\n", 428 | "with ProgressBar():\n", 429 | " condition = ~nyc_data['monthYear'].isin(['201707','201708','201709','201710','201711','201712'])\n", 430 | " nyc_data_filtered = nyc_data[condition]\n", 431 | " citations_by_month = nyc_data_filtered.groupby(nyc_data_filtered.index)['Summons Number'].count()" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "### Section 6.4.2" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "# Listing 6.29\n", 448 | "with ProgressBar():\n", 449 | " three_month_SMA = citations_by_month.rolling(3).mean().compute()" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "# Listing 6.30\n", 459 | "citations_by_month.rolling(3, center=True).mean().head()" 460 | ] 461 | } 462 | ], 463 | "metadata": { 464 | "kernelspec": { 465 | "display_name": "Python 3", 466 | "language": "python", 467 | "name": "python3" 468 | }, 469 | "language_info": { 470 | "codemirror_mode": { 471 | "name": "ipython", 472 | "version": 3 473 | }, 474 | "file_extension": ".py", 475 | "mimetype": "text/x-python", 476 | "name": "python", 477 | "nbconvert_exporter": "python", 478 | "pygments_lexer": "ipython3", 479 | "version": "3.7.7" 480 | } 481 | }, 482 | "nbformat": 4, 483 | "nbformat_minor": 2 484 | } 485 | -------------------------------------------------------------------------------- /Chapter 7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Science with Python and Dask\n", 8 | "## Chapter 7: Visualizing DataFrames with Seaborn" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Section 7.2.1" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# Listing 7.1\n", 25 | "import dask.dataframe as dd\n", 26 | "import pyarrow\n", 27 | "from dask.diagnostics import ProgressBar\n", 28 | "import os\n", 29 | "import seaborn as sns\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "\n", 32 | "# Set working directory and read in the data\n", 33 | "os.chdir('/Users/jesse/Documents')\n", 34 | "\n", 35 | "# Updated 6-AUG-2020\n", 36 | "# In an earlier update in chapter 5, the index was dropped to solve an error writing the final parquet file using newer versions of Dask\n", 37 | "# This got rid of the Citation Issued Month Year 
column, which is referenced multiple times in this notebook\n", 38 | "# Added a rename of the monthYear column to fix this \n", 39 | "nyc_data = dd.read_parquet('nyc_final', engine='pyarrow').rename(columns={'monthYear':'Citation Issued Month Year'})" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# Listing 7.2 (updated: the filter values below use the %Y%m format the column was given in chapter 5, matching the equivalent filter in chapter 6)\n", 49 | "row_filter = ~nyc_data['Citation Issued Month Year'].isin(['201707','201708','201709','201710','201711','201712'])\n", 50 | "nyc_data_filtered = nyc_data[row_filter]\n", 51 | "\n", 52 | "citationsAndTemps = nyc_data_filtered.groupby('Citation Issued Month Year').agg({'Summons Number': 'count', 'Temp': 'mean'})" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Listing 7.3\n", 62 | "# Plot a scatter plot of Temp vs. Number of Citations\n", 63 | "sns.set(style=\"whitegrid\")\n", 64 | "f, ax = plt.subplots(figsize=(10, 10))\n", 65 | "sns.despine(f, left=True, bottom=True)\n", 66 | "\n", 67 | "with ProgressBar():\n", 68 | " sns.scatterplot(x=\"Temp\", y=\"Summons Number\",\n", 69 | " data=citationsAndTemps.compute(), ax=ax)\n", 70 | " plt.ylim(ymin=0)\n", 71 | " plt.xlim(xmin=0)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### Section 7.2.2" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "# Listing 7.4\n", 88 | "# Add a robust regression line to the scatter plot using regplot\n", 89 | "sns.set(style=\"whitegrid\")\n", 90 | "f, ax = plt.subplots(figsize=(10, 10))\n", 91 | "sns.despine(f, left=True, bottom=True)\n", 92 | "\n", 93 | "with ProgressBar():\n", 94 | " sns.regplot(x=\"Temp\", y=\"Summons Number\",\n", 95 | " data=citationsAndTemps.compute(), ax=ax,\n", 96 | " robust=True)\n", 97 | " plt.ylim(ymin=0)\n", 98 | " plt.xlim(xmin=0)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "### Section 7.2.3" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "# Listing 7.5\n", 115 | "# Try a non-linear fit by setting the order parameter to 2 (roughly parabolic shape)\n", 116 | "sns.set(style=\"whitegrid\")\n", 117 | "f, ax = plt.subplots(figsize=(10, 10))\n", 118 | "sns.despine(f, left=True, bottom=True)\n", 119 | "\n", 120 | "with ProgressBar():\n", 121 | " sns.regplot(x=\"Temp\", y=\"Summons Number\",\n", 122 | " data=citationsAndTemps.compute(), ax=ax,\n", 123 | " order=2)\n", 124 | " plt.ylim(ymin=0)\n", 125 | " plt.xlim(xmin=0)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "### Section 7.3.1" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "# Listing 7.6\n", 142 | "# Read in the data and filter down to the six most popular vehicle colors\n", 143 | "nyc_data_withVehicleAge = dd.read_parquet('nyc_data_vehicleAge', engine='pyarrow')\n", 144 | "\n", 145 | "row_filter = nyc_data_withVehicleAge['Vehicle Color'].isin(['BLACK','WHITE','GREY','RED','GREEN','BLUE'])\n", 146 | "column_filter = ['Vehicle Age','Vehicle Color']\n", 147 | "\n", 148 | "ages_and_colors = nyc_data_withVehicleAge[row_filter][column_filter]" 149 | ] 150 | },
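Every plot in this chapter follows the same reduce-then-plot shape: Seaborn only understands in-memory pandas objects, so the Dask side first shrinks the data with a groupby aggregation or a sample, and the .compute() inside the plotting call hands over the small result. A minimal sketch of that shape with invented data:

```python
# Sketch (synthetic data): aggregate with Dask, then pass the computed pandas result to seaborn
import pandas as pd
import dask.dataframe as dd
import seaborn as sns
import matplotlib.pyplot as plt

pdf = pd.DataFrame({'group': list('ababab'), 'x': [1, 2, 3, 4, 5, 6], 'y': [2, 1, 4, 3, 6, 5]})
ddf = dd.from_pandas(pdf, npartitions=2)

small = ddf.groupby('group').agg({'x': 'mean', 'y': 'mean'})  # lazy, but tiny once computed

f, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x='x', y='y', data=small.compute(), ax=ax)    # .compute() returns pandas
plt.show()
```

151 | { 152 | 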
"cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "# Listing 7.7\n", 158 | "# Get a count of how many vehicle citations match our criteria\n", 159 | "with ProgressBar():\n", 160 | " print(ages_and_colors.count().compute())" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "# Listing 7.8\n", 170 | "# Plot a violinplot to show the distributions of vehicle ages grouped by vehicle color\n", 171 | "sns.set(style=\"whitegrid\")\n", 172 | "f, ax = plt.subplots(figsize=(10, 10))\n", 173 | "sns.despine(f, left=True, bottom=True)\n", 174 | "\n", 175 | "group_order = [\"RED\", \"GREEN\", \"BLUE\", \"BLACK\", \"WHITE\", \"GREY\"]\n", 176 | "\n", 177 | "with ProgressBar():\n", 178 | " sns.violinplot(x=\"Vehicle Color\", y=\"Vehicle Age\", data=ages_and_colors.compute(), order=group_order, palette=group_order, ax=ax)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "### Section 7.3.2" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "# Listing 7.9\n", 195 | "# Plot a violinplot of a random sample\n", 196 | "sample = ages_and_colors.sample(frac=0.01)\n", 197 | "\n", 198 | "sns.set(style=\"whitegrid\")\n", 199 | "f, ax = plt.subplots(figsize=(10, 10))\n", 200 | "sns.despine(f, left=True, bottom=True)\n", 201 | "\n", 202 | "with ProgressBar():\n", 203 | " sns.violinplot(x=\"Vehicle Color\", y=\"Vehicle Age\", data=sample.compute(), order=group_order, palette=group_order, ax=ax)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "### Section 7.4" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "# Listing 7.10\n", 220 | "from datetime import datetime\n", 221 | "nyc_data_filtered = nyc_data[nyc_data['Issue Date'] < datetime(2017,1,1)]\n", 222 | "\n", 223 | "day_of_week = nyc_data_filtered['Issue Date'].apply(lambda x: x.strftime(\"%A\"), meta=str)\n", 224 | "\n", 225 | "month_of_year = nyc_data_filtered['Issue Date'].apply(lambda x: x.strftime(\"%B\"), meta=str)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "# Listing 7.11\n", 235 | "# Add the columns back to the DataFrame\n", 236 | "nyc_data_with_dates_raw = nyc_data_filtered.assign(DayOfWeek = day_of_week).assign(MonthOfYear = month_of_year)\n", 237 | "column_map = {'DayOfWeek': 'Day of Week', 'MonthOfYear': 'Month of Year'}\n", 238 | "nyc_data_with_dates = nyc_data_with_dates_raw.rename(columns=column_map)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "# Listing 7.12\n", 248 | "# Count up the summons by month of year/day of week\n", 249 | "with ProgressBar():\n", 250 | " summons_by_mydw = nyc_data_with_dates.groupby(['Day of Week', 'Month of Year'])['Summons Number'].count().compute()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "# Listing 7.13\n", 260 | "# Create a pivot table from the result\n", 261 | "heatmap_data = summons_by_mydw.reset_index().pivot(\"Month of Year\", 
\"Day of Week\", \"Summons Number\")" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "# Listing 7.14\n", 271 | "# Create a list of months and weekdays for sorting the data in the heatmap\n", 272 | "months = ['January','February','March','April','May','June','July','August','September','October','November','December']\n", 273 | "weekdays = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']\n", 274 | "\n", 275 | "# Draw a heatmap with Day of Week on the x axis and Month of Year on the Y axis\n", 276 | "f, ax = plt.subplots(figsize=(10, 10))\n", 277 | "sns.heatmap(heatmap_data.loc[months,weekdays], annot=True, fmt=\"d\", linewidths=1, cmap='Greys', ax=ax)" 278 | ] 279 | } 280 | ], 281 | "metadata": { 282 | "kernelspec": { 283 | "display_name": "Python 3", 284 | "language": "python", 285 | "name": "python3" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 3 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": "text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | "pygments_lexer": "ipython3", 297 | "version": "3.7.7" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 2 302 | } 303 | -------------------------------------------------------------------------------- /Chapter 9.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Science with Python and Dask\n", 8 | "## Chapter 9: Working with Bags and Arrays" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Section 9.1" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "dask.bag" 27 | ] 28 | }, 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "# Listing 9.1\n", 36 | "# Import bag and read in the data\n", 37 | "import dask.bag as bag\n", 38 | "import os\n", 39 | "\n", 40 | "os.chdir('/Users/jesse/Documents')\n", 41 | "raw_data = bag.read_text('foods.txt')\n", 42 | "raw_data" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "### Section 9.1.1" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "('product/productId: B001E4KFG0\\n',\n", 61 | " 'review/userId: A3SGXH7AUHU8GW\\n',\n", 62 | " 'review/profileName: delmartian\\n',\n", 63 | " 'review/helpfulness: 1/1\\n',\n", 64 | " 'review/score: 5.0\\n',\n", 65 | " 'review/time: 1303862400\\n',\n", 66 | " 'review/summary: Good Quality Dog Food\\n',\n", 67 | " 'review/text: I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. 
My Labrador is finicky and she appreciates this product better than most.\\n',\n", 68 | " '\\n',\n", 69 | " 'product/productId: B00813GRG4\\n')" 70 | ] 71 | }, 72 | "execution_count": 2, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "# Listing 9.2\n", 79 | "# Take a small sample of the first few elements of the bag\n", 80 | "raw_data.take(10)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "### Section 9.1.2" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "ename": "UnicodeDecodeError", 97 | "evalue": "'utf-8' codec can't decode byte 0xce in position 2620: invalid continuation byte", 98 | "output_type": "error", 99 | "traceback": [ 100 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 101 | "\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)", 102 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# If we try to count across the file, we might run into an encoding error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mraw_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 103 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/base.py\u001b[0m in \u001b[0;36mcompute\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 154\u001b[0m \u001b[0mdask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbase\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 155\u001b[0m \"\"\"\n\u001b[0;32m--> 156\u001b[0;31m \u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraverse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 157\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 104 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/base.py\u001b[0m in \u001b[0;36mcompute\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 400\u001b[0m \u001b[0mkeys\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__dask_keys__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcollections\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 401\u001b[0m \u001b[0mpostcomputes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__dask_postcompute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcollections\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 402\u001b[0;31m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mschedule\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdsk\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mkeys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 403\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrepack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpostcomputes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 404\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 105 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/multiprocessing.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, **kwargs)\u001b[0m\n\u001b[1;32m 175\u001b[0m \u001b[0mget_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_process_get_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdumps\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdumps\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloads\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 176\u001b[0m \u001b[0mpack_exception\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpack_exception\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 177\u001b[0;31m raise_exception=reraise, **kwargs)\n\u001b[0m\u001b[1;32m 178\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcleanup\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 106 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36mget_async\u001b[0;34m(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)\u001b[0m\n\u001b[1;32m 503\u001b[0m \u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Re-execute locally\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 505\u001b[0;31m \u001b[0mraise_exception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 506\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mworker_id\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres_info\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 507\u001b[0m \u001b[0mstate\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cache'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 107 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/compatibility.py\u001b[0m in \u001b[0;36mreraise\u001b[0;34m(exc, tb)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0mreraise\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__traceback__\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 69\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 108 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36mexecute_task\u001b[0;34m()\u001b[0m\n\u001b[1;32m 272\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask_info\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 274\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 275\u001b[0m \u001b[0mid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_id\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdumps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 109 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36m_execute_task\u001b[0;34m()\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mistask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0margs2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m 
\u001b[0mishashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 110 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mistask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0margs2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mishashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 111 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36m_execute_task\u001b[0;34m()\u001b[0m\n\u001b[1;32m 249\u001b[0m \"\"\"\n\u001b[1;32m 250\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 251\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 252\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mistask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 112 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 249\u001b[0m \"\"\"\n\u001b[1;32m 250\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 251\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0;34m[\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 252\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mistask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 113 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36m_execute_task\u001b[0;34m()\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 254\u001b[0m \u001b[0margs2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 255\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 256\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mishashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 257\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 114 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/bag/core.py\u001b[0m in \u001b[0;36mempty_safe_apply\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2070\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_last\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2071\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mno_result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2072\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpart\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2073\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_last\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpart\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2074\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mno_result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 115 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/cytoolz/itertoolz.pyx\u001b[0m in \u001b[0;36mcytoolz.itertoolz.count\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1057\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1058\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1059\u001b[0;31m \u001b[0mcpdef\u001b[0m \u001b[0mobject\u001b[0m \u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobject\u001b[0m \u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1060\u001b[0m \"\"\"\n\u001b[1;32m 1061\u001b[0m \u001b[0mCount\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mnumber\u001b[0m \u001b[0mof\u001b[0m \u001b[0mitems\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mseq\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 116 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/cytoolz/itertoolz.pyx\u001b[0m in \u001b[0;36mcytoolz.itertoolz.count\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1071\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1072\u001b[0m \u001b[0mcdef\u001b[0m \u001b[0mPy_ssize_t\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1073\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mseq\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1074\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1075\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 117 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/bag/text.py\u001b[0m in \u001b[0;36mfile_to_blocks\u001b[0;34m()\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfile_to_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlazy_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mlazy_file\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 88\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 89\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 118 | "\u001b[0;32m/anaconda3/lib/python3.6/codecs.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m()\u001b[0m\n\u001b[1;32m 319\u001b[0m \u001b[0;31m# decode input (taking the buffer into account)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuffer\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 321\u001b[0;31m \u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconsumed\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_buffer_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfinal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 322\u001b[0m \u001b[0;31m# keep undecoded input until the next call\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 323\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuffer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mconsumed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 119 | "\u001b[0;31mUnicodeDecodeError\u001b[0m: 'utf-8' codec can't decode byte 0xce in position 2620: invalid continuation byte" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "# Listing 9.3\n", 125 | "# If we try to count across the file, we might run into an encoding error\n", 126 | "raw_data.count().compute()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 4, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "5116093" 138 | ] 139 | }, 140 | "execution_count": 4, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "# Listing 9.4\n", 147 | "raw_data = bag.read_text('foods.txt', encoding='cp1252')\n", 148 | "raw_data.count().compute()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### Section 9.1.3" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 6, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# Listing 9.5\n", 165 | "from dask.delayed import delayed\n", 166 | "\n", 167 | "def get_next_part(file, start_index, span_index=0, blocksize=1024):\n", 168 | " file.seek(start_index)\n", 169 | " buffer = file.read(blocksize + span_index).decode('cp1252')\n", 170 | " delimiter_position = buffer.find('\\n\\n')\n", 171 | " if delimiter_position == -1:\n", 172 | " return get_next_part(file, start_index, span_index + blocksize)\n", 173 | " else:\n", 174 | " file.seek(start_index)\n", 175 | " return start_index, delimiter_position" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 8, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "# Listing 9.6\n", 185 | "with open('foods.txt', 'rb') as file_handle:\n", 186 | " size = file_handle.seek(0,2) - 1\n", 187 | " more_data = True\n", 188 | " output = []\n", 189 | " current_position = next_position = 0\n", 190 | " while more_data:\n", 191 | " if current_position >= size:\n", 192 | " more_data = False\n", 193 | " else:\n", 194 | " current_position, next_position = get_next_part(file_handle, current_position, 0)\n", 195 | " output.append((current_position, next_position))\n", 196 | " current_position = current_position + next_position + 2" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 7, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "# Listing 9.7\n", 206 | "def get_item(filename, start_index, delimiter_position, encoding='cp1252'):\n", 207 | " with open(filename, 'rb') as file_handle:\n", 208 | " file_handle.seek(start_index)\n", 209 | " text = file_handle.read(delimiter_position).decode(encoding)\n", 210 | " elements = text.strip().split('\\n')\n", 211 | " key_value_pairs = [(element.split(': ')[0], element.split(': ')[1]) \n", 212 | " if len(element.split(': ')) > 1 \n", 213 | " else ('unknown', element) \n", 214 | " for element in elements]\n", 215 | " return dict(key_value_pairs)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 9, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "# Listing 9.8\n", 225 | "reviews = bag.from_sequence(output).map(lambda x: get_item('foods.txt', x[0], x[1]))" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | 
"execution_count": 10, 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "data": { 235 | "text/plain": [ 236 | "({'product/productId': 'B001E4KFG0',\n", 237 | " 'review/userId': 'A3SGXH7AUHU8GW',\n", 238 | " 'review/profileName': 'delmartian',\n", 239 | " 'review/helpfulness': '1/1',\n", 240 | " 'review/score': '5.0',\n", 241 | " 'review/time': '1303862400',\n", 242 | " 'review/summary': 'Good Quality Dog Food',\n", 243 | " 'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.'},\n", 244 | " {'product/productId': 'B00813GRG4',\n", 245 | " 'review/userId': 'A1D87F6ZCVE5NK',\n", 246 | " 'review/profileName': 'dll pa',\n", 247 | " 'review/helpfulness': '0/0',\n", 248 | " 'review/score': '1.0',\n", 249 | " 'review/time': '1346976000',\n", 250 | " 'review/summary': 'Not as Advertised',\n", 251 | " 'review/text': 'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as \"Jumbo\".'})" 252 | ] 253 | }, 254 | "execution_count": 10, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "# Listing 9.9\n", 261 | "reviews.take(2)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 11, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "name": "stdout", 271 | "output_type": "stream", 272 | "text": [ 273 | "[########################################] | 100% Completed | 8.7s\n" 274 | ] 275 | }, 276 | { 277 | "data": { 278 | "text/plain": [ 279 | "568454" 280 | ] 281 | }, 282 | "execution_count": 11, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "# Listing 9.10\n", 289 | "from dask.diagnostics import ProgressBar\n", 290 | "\n", 291 | "with ProgressBar():\n", 292 | " count = reviews.count().compute()\n", 293 | "count" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### Section 9.2.1" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 12, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "# Listing 9.11\n", 310 | "def get_score(element):\n", 311 | " score_numeric = float(element['review/score'])\n", 312 | " return score_numeric" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 13, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/plain": [ 323 | "(5.0, 1.0, 4.0, 2.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0)" 324 | ] 325 | }, 326 | "execution_count": 13, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "# Listing 9.12\n", 333 | "review_scores = reviews.map(get_score)\n", 334 | "review_scores.take(10)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 14, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "data": { 344 | "text/plain": [ 345 | "({'product/productId': 'B001E4KFG0',\n", 346 | " 'review/userId': 'A3SGXH7AUHU8GW',\n", 347 | " 'review/profileName': 'delmartian',\n", 348 | " 'review/helpfulness': '1/1',\n", 349 | " 'review/score': '5.0',\n", 350 | " 'review/time': '1303862400',\n", 351 | " 'review/summary': 'Good Quality Dog Food',\n", 352 | " 'review/text': 'I have bought several of the Vitality 
canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.',\n", 353 | " 'review/sentiment': 'positive'},\n", 354 | " {'product/productId': 'B00813GRG4',\n", 355 | " 'review/userId': 'A1D87F6ZCVE5NK',\n", 356 | " 'review/profileName': 'dll pa',\n", 357 | " 'review/helpfulness': '0/0',\n", 358 | " 'review/score': '1.0',\n", 359 | " 'review/time': '1346976000',\n", 360 | " 'review/summary': 'Not as Advertised',\n", 361 | " 'review/text': 'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as \"Jumbo\".',\n", 362 | " 'review/sentiment': 'negative'},\n", 363 | " {'product/productId': 'B000LQOCH0',\n", 364 | " 'review/userId': 'ABXLMWJIXXAIN',\n", 365 | " 'review/profileName': 'Natalia Corres \"Natalia Corres\"',\n", 366 | " 'review/helpfulness': '1/1',\n", 367 | " 'review/score': '4.0',\n", 368 | " 'review/time': '1219017600',\n", 369 | " 'review/summary': '\"Delight\" says it all',\n", 370 | " 'review/text': 'This is a confection that has been around a few centuries. It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar. And it is a tiny mouthful of heaven. Not too chewy, and very flavorful. I highly recommend this yummy treat. If you are familiar with the story of C.S. Lewis\\' \"The Lion, The Witch, and The Wardrobe\" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',\n", 371 | " 'review/sentiment': 'positive'})" 372 | ] 373 | }, 374 | "execution_count": 14, 375 | "metadata": {}, 376 | "output_type": "execute_result" 377 | } 378 | ], 379 | "source": [ 380 | "# Listing 9.13\n", 381 | "def tag_positive_negative_by_score(element):\n", 382 | " if float(element['review/score']) > 3:\n", 383 | " element['review/sentiment'] = 'positive'\n", 384 | " else:\n", 385 | " element['review/sentiment'] = 'negative'\n", 386 | " return element\n", 387 | "\n", 388 | "reviews.map(tag_positive_negative_by_score).take(3)" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 15, 394 | "metadata": {}, 395 | "outputs": [ 396 | { 397 | "data": { 398 | "text/plain": [ 399 | "({'product/productId': 'B001E4KFG0',\n", 400 | " 'review/userId': 'A3SGXH7AUHU8GW',\n", 401 | " 'review/profileName': 'delmartian',\n", 402 | " 'review/helpfulness': '1/1',\n", 403 | " 'review/score': '5.0',\n", 404 | " 'review/time': '1303862400',\n", 405 | " 'review/summary': 'Good Quality Dog Food',\n", 406 | " 'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. 
My Labrador is finicky and she appreciates this product better than most.'},)" 407 | ] 408 | }, 409 | "execution_count": 15, 410 | "metadata": {}, 411 | "output_type": "execute_result" 412 | } 413 | ], 414 | "source": [ 415 | "# Listing 9.14\n", 416 | "reviews.take(1)" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "### Section 9.2.2" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 16, 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "name": "stderr", 433 | "output_type": "stream", 434 | "text": [ 435 | "/anaconda3/lib/python3.6/site-packages/dask/bag/core.py:2089: UserWarning: Insufficient elements for `take`. 5 elements requested, only 1 elements available. Try passing larger `npartitions` to `take`.\n", 436 | " \"larger `npartitions` to `take`.\".format(n, len(r)))\n" 437 | ] 438 | }, 439 | { 440 | "data": { 441 | "text/plain": [ 442 | "({'product/productId': 'B001E4KFG0',\n", 443 | " 'review/userId': 'A3SGXH7AUHU8GW',\n", 444 | " 'review/profileName': 'delmartian',\n", 445 | " 'review/helpfulness': '1/1',\n", 446 | " 'review/score': '5.0',\n", 447 | " 'review/time': '1303862400',\n", 448 | " 'review/summary': 'Good Quality Dog Food',\n", 449 | " 'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.'},)" 450 | ] 451 | }, 452 | "execution_count": 16, 453 | "metadata": {}, 454 | "output_type": "execute_result" 455 | } 456 | ], 457 | "source": [ 458 | "# Listing 9.15\n", 459 | "specific_item = reviews.filter(lambda element: element['product/productId'] == 'B001E4KFG0')\n", 460 | "specific_item.take(5)" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 17, 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "data": { 470 | "text/plain": [ 471 | "({'product/productId': 'B001E4KFG0',\n", 472 | " 'review/userId': 'A3SGXH7AUHU8GW',\n", 473 | " 'review/profileName': 'delmartian',\n", 474 | " 'review/helpfulness': '1/1',\n", 475 | " 'review/score': '5.0',\n", 476 | " 'review/time': '1303862400',\n", 477 | " 'review/summary': 'Good Quality Dog Food',\n", 478 | " 'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.'},\n", 479 | " {'product/productId': 'B00171APVA',\n", 480 | " 'review/userId': 'A21BT40VZCCYT4',\n", 481 | " 'review/profileName': 'Carol A. Reed',\n", 482 | " 'review/helpfulness': '0/0',\n", 483 | " 'review/score': '5.0',\n", 484 | " 'review/time': '1351209600',\n", 485 | " 'review/summary': 'Healthy Dog Food',\n", 486 | " 'review/text': 'This is a very healthy dog food. Good for their digestion. Also good for small puppies. 
My dog eats her required amount at every feeding.'},\n", 487 | " {'product/productId': 'B0019CW0HE',\n", 488 | " 'review/userId': 'A1FD9E5C06UB6B',\n", 489 | " 'review/profileName': 'BRENDA DEMERS',\n", 490 | " 'review/helpfulness': '5/5',\n", 491 | " 'review/score': '3.0',\n", 492 | " 'review/time': '1301011200',\n", 493 | " 'review/summary': 'Natural Balance Lamb and Rice',\n", 494 | " 'review/text': 'While my dogs like all of the flavors that we have tried of this dog food, for some reason their itching increased when I tried the lamb and rice. I have some very itchy dogs and am giving them a limited ingredient dog food to try to help. The duck and sweet potato cut down on the itching significantly, but when we tried lamb and rice they started itching more once again. I like Natural Balance for the quality ingredients.'},\n", 495 | " {'product/productId': 'B0019CW0HE',\n", 496 | " 'review/userId': 'AK2CXHH9VRZ2A',\n", 497 | " 'review/profileName': 'I. GLENN',\n", 498 | " 'review/helpfulness': '4/4',\n", 499 | " 'review/score': '3.0',\n", 500 | " 'review/time': '1313193600',\n", 501 | " 'review/summary': 'INCREASED MY DOGS ITCHING',\n", 502 | " 'review/text': 'Awesome dog food. However, when given to my \"Boston\", who has severe reactions to some food ingredients; his itching increased to violent jumping out of bed at night, scratching. As soon as I changed to a different formula, the scratching stopped. So glad Natural Balance has other choices. I guess you have to try each, until you find what\\'s best for your pet.'},\n", 503 | " {'product/productId': 'B0019CW0HE',\n", 504 | " 'review/userId': 'A25BGFRHYHEZKK',\n", 505 | " 'review/profileName': \"Toby's mom\",\n", 506 | " 'review/helpfulness': '4/4',\n", 507 | " 'review/score': '5.0',\n", 508 | " 'review/time': '1292889600',\n", 509 | " 'review/summary': 'Great food!',\n", 510 | " 'review/text': 'We have three dogs and all of them love this food! We bought it specifically for one of our dogs who has food allergies and it works great for him, no more hot spots or tummy problems.
I LOVE that it ships right to our door with free shipping.'})" 511 | ] 512 | }, 513 | "execution_count": 17, 514 | "metadata": {}, 515 | "output_type": "execute_result" 516 | } 517 | ], 518 | "source": [ 519 | "# Listing 9.16\n", 520 | "keyword = reviews.filter(lambda element: 'dog' in element['review/text'])\n", 521 | "keyword.take(5)" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 18, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "# Listing 9.17\n", 531 | "def is_helpful(element):\n", 532 | " helpfulness = element['review/helpfulness'].strip().split('/')\n", 533 | " number_of_helpful_votes = float(helpfulness[0])\n", 534 | " number_of_total_votes = float(helpfulness[1])\n", 535 | " # Watch for divide by 0 errors\n", 536 | " if number_of_total_votes >= 1:\n", 537 | " return number_of_helpful_votes / number_of_total_votes > 0.75\n", 538 | " else:\n", 539 | " return False" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": 19, 545 | "metadata": {}, 546 | "outputs": [ 547 | { 548 | "data": { 549 | "text/plain": [ 550 | "({'product/productId': 'B001E4KFG0',\n", 551 | " 'review/userId': 'A3SGXH7AUHU8GW',\n", 552 | " 'review/profileName': 'delmartian',\n", 553 | " 'review/helpfulness': '1/1',\n", 554 | " 'review/score': '5.0',\n", 555 | " 'review/time': '1303862400',\n", 556 | " 'review/summary': 'Good Quality Dog Food',\n", 557 | " 'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.'},\n", 558 | " {'product/productId': 'B000LQOCH0',\n", 559 | " 'review/userId': 'ABXLMWJIXXAIN',\n", 560 | " 'review/profileName': 'Natalia Corres \"Natalia Corres\"',\n", 561 | " 'review/helpfulness': '1/1',\n", 562 | " 'review/score': '4.0',\n", 563 | " 'review/time': '1219017600',\n", 564 | " 'review/summary': '\"Delight\" says it all',\n", 565 | " 'review/text': 'This is a confection that has been around a few centuries. It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar. And it is a tiny mouthful of heaven. Not too chewy, and very flavorful. I highly recommend this yummy treat. If you are familiar with the story of C.S. 
Lewis\\' \"The Lion, The Witch, and The Wardrobe\" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.'})" 566 | ] 567 | }, 568 | "execution_count": 19, 569 | "metadata": {}, 570 | "output_type": "execute_result" 571 | } 572 | ], 573 | "source": [ 574 | "# Listing 9.18\n", 575 | "helpful_reviews = reviews.filter(is_helpful)\n", 576 | "helpful_reviews.take(2)" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": {}, 582 | "source": [ 583 | "### Section 9.2.3" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 20, 589 | "metadata": {}, 590 | "outputs": [], 591 | "source": [ 592 | "# Listing 9.19\n", 593 | "helpful_review_scores = helpful_reviews.map(get_score)\n", 594 | "\n", 595 | "with ProgressBar():\n", 596 | " all_mean = review_scores.mean().compute()\n", 597 | " helpful_mean = helpful_review_scores.mean().compute()\n", 598 | " \n", 599 | "print(f\"Mean Score of All Reviews: {round(all_mean, 2)}\\nMean Score of Helpful Reviews: {round(helpful_mean,2)}\")" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 24, 605 | "metadata": {}, 606 | "outputs": [ 607 | { 608 | "name": "stdout", 609 | "output_type": "stream", 610 | "text": [ 611 | "[########################################] | 100% Completed | 10.8s\n", 612 | "[########################################] | 100% Completed | 9.9s\n", 613 | "Mean Length of Helpful Reviews: 459.36\n", 614 | "Mean Length of Unhelpful Reviews: 379.32\n" 615 | ] 616 | } 617 | ], 618 | "source": [ 619 | "# Listing 9.20\n", 620 | "def get_length(element):\n", 621 | " return len(element['review/text'])\n", 622 | "\n", 623 | "with ProgressBar():\n", 624 | " review_length_helpful = helpful_reviews.map(get_length).mean().compute()\n", 625 | " review_length_unhelpful = reviews.filter(lambda review: not is_helpful(review)).map(get_length).mean().compute()\n", 626 | "print(f\"Mean Length of Helpful Reviews: {round(review_length_helpful, 2)}\\nMean Length of Unhelpful Reviews: {round(review_length_unhelpful,2)}\")" 627 | ] 628 | }, 629 | { 630 | "cell_type": "markdown", 631 | "metadata": {}, 632 | "source": [ 633 | "### Section 9.2.4" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": 22, 639 | "metadata": {}, 640 | "outputs": [ 641 | { 642 | "name": "stdout", 643 | "output_type": "stream", 644 | "text": [ 645 | "[########################################] | 100% Completed | 9.0s\n" 646 | ] 647 | }, 648 | { 649 | "data": { 650 | "text/plain": [ 651 | "[(5.0, 363122), (1.0, 52268), (4.0, 80655), (2.0, 29769), (3.0, 42640)]" 652 | ] 653 | }, 654 | "execution_count": 22, 655 | "metadata": {}, 656 | "output_type": "execute_result" 657 | } 658 | ], 659 | "source": [ 660 | "# Listing 9.21\n", 661 | "def count(accumulator, element):\n", 662 | " return accumulator + 1\n", 663 | "\n", 664 | "def combine(total1, total2):\n", 665 | " return total1 + total2\n", 666 | "\n", 667 | "with ProgressBar():\n", 668 | " count_of_reviews_by_score = reviews.foldby(get_score, count, 0, combine, 0).compute()\n", 669 | "count_of_reviews_by_score" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "# Listing 9.22\n", 679 | "# Listing 9.21 displays the following output:\n", 680 | "# [(5.0, 363122), (1.0, 52268), (4.0, 80655), (2.0, 29769), (3.0, 42640)]" 681 | ] 682 | }, 683 | { 684 | "cell_type": "markdown", 685 | "metadata": {}, 686 | "source": [ 687 | "### 
Section 9.3" 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": 16, 693 | "metadata": {}, 694 | "outputs": [], 695 | "source": [ 696 | "# Listing 9.23\n", 697 | "def get_score_and_helpfulness(element):\n", 698 | " score_numeric = float(element['review/score'])\n", 699 | " helpfulness = element['review/helpfulness'].strip().split('/')\n", 700 | " number_of_helpful_votes = float(helpfulness[0])\n", 701 | " number_of_total_votes = float(helpfulness[1])\n", 702 | " # Watch for divide by 0 errors\n", 703 | " if number_of_total_votes > 0:\n", 704 | " helpfulness_percent = number_of_helpful_votes / number_of_total_votes\n", 705 | " else:\n", 706 | " helpfulness_percent = 0.\n", 707 | " return (score_numeric, helpfulness_percent)" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "metadata": {}, 714 | "outputs": [], 715 | "source": [ 716 | "# Listing 9.24\n", 717 | "scores_and_helpfulness = reviews.map(get_score_and_helpfulness).to_dataframe(meta={'Review Scores': float, 'Helpfulness Percent': float})" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "metadata": {}, 724 | "outputs": [], 725 | "source": [ 726 | "# Listing 9.25\n", 727 | "with ProgressBar():\n", 728 | " scores_and_helpfulness_stats = scores_and_helpfulness.describe().compute()\n", 729 | "scores_and_helpfulness_stats" 730 | ] 731 | }, 732 | { 733 | "cell_type": "markdown", 734 | "metadata": {}, 735 | "source": [ 736 | "### Section 9.4.2" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": 28, 742 | "metadata": {}, 743 | "outputs": [ 744 | { 745 | "data": { 746 | "text/plain": [ 747 | "(['bought',\n", 748 | " 'several',\n", 749 | " 'vitality',\n", 750 | " 'canned',\n", 751 | " 'dog',\n", 752 | " 'food',\n", 753 | " 'products',\n", 754 | " 'found',\n", 755 | " 'good',\n", 756 | " 'quality',\n", 757 | " 'product',\n", 758 | " 'looks',\n", 759 | " 'like',\n", 760 | " 'stew',\n", 761 | " 'processed',\n", 762 | " 'meat',\n", 763 | " 'smells',\n", 764 | " 'better',\n", 765 | " 'labrador',\n", 766 | " 'finicky',\n", 767 | " 'appreciates',\n", 768 | " 'product',\n", 769 | " 'better'],)" 770 | ] 771 | }, 772 | "execution_count": 28, 773 | "metadata": {}, 774 | "output_type": "execute_result" 775 | } 776 | ], 777 | "source": [ 778 | "# Listing 9.26\n", 779 | "from nltk.corpus import stopwords \n", 780 | "from nltk.tokenize import RegexpTokenizer\n", 781 | "from functools import partial\n", 782 | "\n", 783 | "tokenizer = RegexpTokenizer(r'\\w+')\n", 784 | "\n", 785 | "def extract_reviews(element):\n", 786 | " return element['review/text'].lower()\n", 787 | "\n", 788 | "def filter_stopword(word, stopwords):\n", 789 | " return word not in stopwords\n", 790 | "\n", 791 | "def filter_stopwords(tokens, stopwords):\n", 792 | " return list(filter(partial(filter_stopword, stopwords=stopwords), tokens))\n", 793 | "\n", 794 | "stopword_set = set(stopwords.words('english'))" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": null, 800 | "metadata": {}, 801 | "outputs": [], 802 | "source": [ 803 | "# Listing 9.27\n", 804 | "review_text = reviews.map(extract_reviews)\n", 805 | "review_text_tokens = review_text.map(tokenizer.tokenize)\n", 806 | "review_text_clean = review_text_tokens.map(partial(filter_stopwords, stopwords=stopword_set))\n", 807 | "review_text_clean.take(1)" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 29, 813 | "metadata": {}, 814 | "outputs": [ 815 | { 816 
| "data": { 817 | "text/plain": [ 818 | "({('appreciates', 'product'),\n", 819 | " ('better', 'labrador'),\n", 820 | " ('bought', 'several'),\n", 821 | " ('canned', 'dog'),\n", 822 | " ('dog', 'food'),\n", 823 | " ('finicky', 'appreciates'),\n", 824 | " ('food', 'products'),\n", 825 | " ('found', 'good'),\n", 826 | " ('good', 'quality'),\n", 827 | " ('labrador', 'finicky'),\n", 828 | " ('like', 'stew'),\n", 829 | " ('looks', 'like'),\n", 830 | " ('meat', 'smells'),\n", 831 | " ('processed', 'meat'),\n", 832 | " ('product', 'better'),\n", 833 | " ('product', 'looks'),\n", 834 | " ('products', 'found'),\n", 835 | " ('quality', 'product'),\n", 836 | " ('several', 'vitality'),\n", 837 | " ('smells', 'better'),\n", 838 | " ('stew', 'processed'),\n", 839 | " ('vitality', 'canned')},\n", 840 | " {('actually', 'small'),\n", 841 | " ('arrived', 'labeled'),\n", 842 | " ('error', 'vendor'),\n", 843 | " ('intended', 'represent'),\n", 844 | " ('jumbo', 'salted'),\n", 845 | " ('labeled', 'jumbo'),\n", 846 | " ('peanuts', 'actually'),\n", 847 | " ('peanuts', 'peanuts'),\n", 848 | " ('product', 'arrived'),\n", 849 | " ('product', 'jumbo'),\n", 850 | " ('represent', 'product'),\n", 851 | " ('salted', 'peanuts'),\n", 852 | " ('sized', 'unsalted'),\n", 853 | " ('small', 'sized'),\n", 854 | " ('sure', 'error'),\n", 855 | " ('unsalted', 'sure'),\n", 856 | " ('vendor', 'intended')})" 857 | ] 858 | }, 859 | "execution_count": 29, 860 | "metadata": {}, 861 | "output_type": "execute_result" 862 | } 863 | ], 864 | "source": [ 865 | "# Listing 9.28\n", 866 | "def make_bigrams(tokens):\n", 867 | " return set(nltk.bigrams(tokens))\n", 868 | "\n", 869 | "review_bigrams = review_text_clean.map(make_bigrams)\n", 870 | "review_bigrams.take(2)" 871 | ] 872 | }, 873 | { 874 | "cell_type": "code", 875 | "execution_count": 30, 876 | "metadata": {}, 877 | "outputs": [ 878 | { 879 | "data": { 880 | "text/plain": [ 881 | "(('product', 'better'),\n", 882 | " ('finicky', 'appreciates'),\n", 883 | " ('meat', 'smells'),\n", 884 | " ('looks', 'like'),\n", 885 | " ('good', 'quality'),\n", 886 | " ('vitality', 'canned'),\n", 887 | " ('like', 'stew'),\n", 888 | " ('processed', 'meat'),\n", 889 | " ('labrador', 'finicky'),\n", 890 | " ('several', 'vitality'))" 891 | ] 892 | }, 893 | "execution_count": 30, 894 | "metadata": {}, 895 | "output_type": "execute_result" 896 | } 897 | ], 898 | "source": [ 899 | "# Listing 9.29\n", 900 | "all_bigrams = review_bigrams.flatten()\n", 901 | "all_bigrams.take(10)" 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": 31, 907 | "metadata": {}, 908 | "outputs": [ 909 | { 910 | "name": "stdout", 911 | "output_type": "stream", 912 | "text": [ 913 | "[########################################] | 100% Completed | 11min 7.6s\n" 914 | ] 915 | }, 916 | { 917 | "data": { 918 | "text/plain": [ 919 | "[(('br', 'br'), 103258),\n", 920 | " (('amazon', 'com'), 15142),\n", 921 | " (('highly', 'recommend'), 14017),\n", 922 | " (('taste', 'like'), 13251),\n", 923 | " (('gluten', 'free'), 11641),\n", 924 | " (('grocery', 'store'), 11627),\n", 925 | " (('k', 'cups'), 11102),\n", 926 | " (('much', 'better'), 10681),\n", 927 | " (('http', 'www'), 10575),\n", 928 | " (('www', 'amazon'), 10517)]" 929 | ] 930 | }, 931 | "execution_count": 31, 932 | "metadata": {}, 933 | "output_type": "execute_result" 934 | } 935 | ], 936 | "source": [ 937 | "# Listing 9.30\n", 938 | "with ProgressBar():\n", 939 | " top10_bigrams = all_bigrams.foldby(lambda x: x, count, 0, combine, 0).topk(10, key=lambda x: 
x[1]).compute()\n", 940 | "top10_bigrams" 941 | ] 942 | }, 943 | { 944 | "cell_type": "code", 945 | "execution_count": 32, 946 | "metadata": {}, 947 | "outputs": [ 948 | { 949 | "name": "stdout", 950 | "output_type": "stream", 951 | "text": [ 952 | "[########################################] | 100% Completed | 11min 19.9s\n" 953 | ] 954 | }, 955 | { 956 | "data": { 957 | "text/plain": [ 958 | "[(('highly', 'recommend'), 14024),\n", 959 | " (('taste', 'like'), 13343),\n", 960 | " (('gluten', 'free'), 11641),\n", 961 | " (('grocery', 'store'), 11630),\n", 962 | " (('k', 'cups'), 11102),\n", 963 | " (('much', 'better'), 10695),\n", 964 | " (('tastes', 'like'), 10471),\n", 965 | " (('great', 'product'), 9192),\n", 966 | " (('cup', 'coffee'), 8988),\n", 967 | " (('really', 'good'), 8897)]" 968 | ] 969 | }, 970 | "execution_count": 32, 971 | "metadata": {}, 972 | "output_type": "execute_result" 973 | } 974 | ], 975 | "source": [ 976 | "# Listing 9.31\n", 977 | "more_stopwords = {'br', 'amazon', 'com', 'http', 'www', 'href', 'gp'}\n", 978 | "all_stopwords = stopword_set.union(more_stopwords)\n", 979 | "\n", 980 | "filtered_bigrams = review_text_tokens.map(partial(filter_stopwords, stopwords=all_stopwords)).map(make_bigrams).flatten()\n", 981 | "\n", 982 | "with ProgressBar():\n", 983 | " top10_bigrams = filtered_bigrams.foldby(lambda x: x, count, 0, combine, 0).topk(10, key=lambda x: x[1]).compute()\n", 984 | "top10_bigrams" 985 | ] 986 | }, 987 | { 988 | "cell_type": "markdown", 989 | "metadata": {}, 990 | "source": [ 991 | "### Section 9.4.3" 992 | ] 993 | }, 994 | { 995 | "cell_type": "code", 996 | "execution_count": 61, 997 | "metadata": {}, 998 | "outputs": [ 999 | { 1000 | "name": "stdout", 1001 | "output_type": "stream", 1002 | "text": [ 1003 | "[########################################] | 100% Completed | 2min 25.9s\n" 1004 | ] 1005 | }, 1006 | { 1007 | "data": { 1008 | "text/plain": [ 1009 | "[(('taste', 'like'), 3352),\n", 1010 | " (('tastes', 'like'), 2858),\n", 1011 | " (('waste', 'money'), 2262),\n", 1012 | " (('k', 'cups'), 1892),\n", 1013 | " (('much', 'better'), 1659),\n", 1014 | " (('thought', 'would'), 1604),\n", 1015 | " (('tasted', 'like'), 1515),\n", 1016 | " (('grocery', 'store'), 1489),\n", 1017 | " (('would', 'recommend'), 1445),\n", 1018 | " (('taste', 'good'), 1408)]" 1019 | ] 1020 | }, 1021 | "execution_count": 61, 1022 | "metadata": {}, 1023 | "output_type": "execute_result" 1024 | } 1025 | ], 1026 | "source": [ 1027 | "# Listing 9.32\n", 1028 | "negative_review_text = reviews.filter(lambda review: float(review['review/score']) < 3).map(extract_reviews)\n", 1029 | "negative_review_text_tokens = negative_review_text.map(tokenizer.tokenize)\n", 1030 | "negative_review_text_clean = negative_review_text_tokens.map(partial(filter_stopwords, stopwords=all_stopwords))\n", 1031 | "negative_review_bigrams = negative_review_text_clean.map(make_bigrams)\n", 1032 | "negative_bigrams = negative_review_bigrams.flatten()\n", 1033 | "\n", 1034 | "with ProgressBar():\n", 1035 | " top10_negative_bigrams = negative_bigrams.foldby(lambda x: x, count, 0, combine, 0).topk(10, key=lambda x: x[1]).compute()\n", 1036 | "top10_negative_bigrams" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "code", 1041 | "execution_count": null, 1042 | "metadata": {}, 1043 | "outputs": [], 1044 | "source": [] 1045 | } 1046 | ], 1047 | "metadata": { 1048 | "kernelspec": { 1049 | "display_name": "Python 3", 1050 | "language": "python", 1051 | "name": "python3" 1052 | }, 1053 | "language_info": { 1054 | 
"codemirror_mode": { 1055 | "name": "ipython", 1056 | "version": 3 1057 | }, 1058 | "file_extension": ".py", 1059 | "mimetype": "text/x-python", 1060 | "name": "python", 1061 | "nbconvert_exporter": "python", 1062 | "pygments_lexer": "ipython3", 1063 | "version": "3.6.8" 1064 | } 1065 | }, 1066 | "nbformat": 4, 1067 | "nbformat_minor": 2 1068 | } 1069 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Science with Python and Dask 2 | Companion Notebooks and Data for Data Science with Python and Dask from Manning Publications by Jesse C. Daniel 3 | 4 | https://www.manning.com/books/data-science-with-python-and-dask 5 | -------------------------------------------------------------------------------- /nyc-average-monthly-temp.csv: -------------------------------------------------------------------------------- 1 | Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec, 2 | 2000,31.3,37.3,47.2,51,63.5,71.3,72.3,72.4,66,57,45.3,31.1, 3 | 2001,33.6,35.9,39.6,53.9,63.6,72.9,73.1,78.7,67.7,58.5,52.7,44.1, 4 | 2002,39.9,40.6,44.1,56.1,60.7,71.5,78.8,77.7,70.2,55.2,46,36, 5 | 2003,27.5,30.1,43.1,49.8,58.7,68.4,75.8,76.7,67.9,55.1,50,37.6, 6 | 2004,24.7,35,43.5,53.6,65.2,71.2,74.5,74.2,69.3,56,48.2,38.4, 7 | 2005,31.3,36.5,39.5,55.1,58.9,74,77.5,79.7,73.3,57.9,49.6,35.3, 8 | 2006,40.9,35.7,43.1,55.7,63.1,71,77.9,75.8,66.6,56.2,51.9,43.6, 9 | 2007,37.5,28.3,42.2,50.3,65.2,71.4,75,74,70.3,63.6,45.4,37, 10 | 2008,36.5,35.8,42.6,55,60.1,74,78.4,73.8,68.8,55.1,45.9,38.1, 11 | 2009,27.9,36.7,42.4,54.5,62.5,67.5,72.7,75.7,66.3,55,51.1,35.9, 12 | 2010,32.5,33.1,48.2,57.9,65.3,74.7,81.3,77.4,71.1,58.1,47.9,32.8, 13 | 2011,29.7,36,42.3,54.3,64.5,72.3,80.2,75.3,70,57.1,51.9,43.3, 14 | 2012,37.3,40.9,50.9,54.8,65.1,71,78.8,76.7,68.8,58,43.9,41.5, 15 | 2013,35.1,33.9,40.1,53,62.8,72.7,79.8,74.6,67.9,60.2,45.3,38.5, 16 | 2014,28.6,31.6,37.7,52.3,64,72.5,76.1,74.5,69.7,59.6,45.3,40.5, 17 | 2015,29.9,23.9,38.1,54.3,68.5,71.2,78.8,79,74.5,58,52.8,50.8, 18 | 2016,34.5,37.7,48.9,53.3,62.8,72.3,78.7,79.2,71.8,58.8,49.8,38.3, 19 | 2017,38,41.6,39.2,57.2,61.1,72,76.8,74,70.5,64.1,46.6,35, 20 | ,,,,,,,,,,,,, -------------------------------------------------------------------------------- /nyc-temp-data.csv: -------------------------------------------------------------------------------- 1 | Temp,monthYear 2 | 31.3,200001 3 | 37.3,200002 4 | 47.2,200003 5 | 51,200004 6 | 63.5,200005 7 | 71.3,200006 8 | 72.3,200007 9 | 72.4,200008 10 | 66,200009 11 | 57,200010 12 | 45.3,200011 13 | 31.1,200012 14 | 33.6,200101 15 | 35.9,200102 16 | 39.6,200103 17 | 53.9,200104 18 | 63.6,200105 19 | 72.9,200106 20 | 73.1,200107 21 | 78.7,200108 22 | 67.7,200109 23 | 58.5,200110 24 | 52.7,200111 25 | 44.1,200112 26 | 39.9,200201 27 | 40.6,200202 28 | 44.1,200203 29 | 56.1,200204 30 | 60.7,200205 31 | 71.5,200206 32 | 78.8,200207 33 | 77.7,200208 34 | 70.2,200209 35 | 55.2,200210 36 | 46,200211 37 | 36,200212 38 | 27.5,200301 39 | 30.1,200302 40 | 43.1,200303 41 | 49.8,200304 42 | 58.7,200305 43 | 68.4,200306 44 | 75.8,200307 45 | 76.7,200308 46 | 67.9,200309 47 | 55.1,200310 48 | 50,200311 49 | 37.6,200312 50 | 24.7,200401 51 | 35,200402 52 | 43.5,200403 53 | 53.6,200404 54 | 65.2,200405 55 | 71.2,200406 56 | 74.5,200407 57 | 74.2,200408 58 | 69.3,200409 59 | 56,200410 60 | 48.2,200411 61 | 38.4,200412 62 | 31.3,200501 63 | 36.5,200502 64 | 39.5,200503 65 | 55.1,200504 66 | 58.9,200505 67 | 74,200506 68 | 
77.5,200507 69 | 79.7,200508 70 | 73.3,200509 71 | 57.9,200510 72 | 49.6,200511 73 | 35.3,200512 74 | 40.9,200601 75 | 35.7,200602 76 | 43.1,200603 77 | 55.7,200604 78 | 63.1,200605 79 | 71,200606 80 | 77.9,200607 81 | 75.8,200608 82 | 66.6,200609 83 | 56.2,200610 84 | 51.9,200611 85 | 43.6,200612 86 | 37.5,200701 87 | 28.3,200702 88 | 42.2,200703 89 | 50.3,200704 90 | 65.2,200705 91 | 71.4,200706 92 | 75,200707 93 | 74,200708 94 | 70.3,200709 95 | 63.6,200710 96 | 45.4,200711 97 | 37,200712 98 | 36.5,200801 99 | 35.8,200802 100 | 42.6,200803 101 | 55,200804 102 | 60.1,200805 103 | 74,200806 104 | 78.4,200807 105 | 73.8,200808 106 | 68.8,200809 107 | 55.1,200810 108 | 45.9,200811 109 | 38.1,200812 110 | 27.9,200901 111 | 36.7,200902 112 | 42.4,200903 113 | 54.5,200904 114 | 62.5,200905 115 | 67.5,200906 116 | 72.7,200907 117 | 75.7,200908 118 | 66.3,200909 119 | 55,200910 120 | 51.1,200911 121 | 35.9,200912 122 | 32.5,201001 123 | 33.1,201002 124 | 48.2,201003 125 | 57.9,201004 126 | 65.3,201005 127 | 74.7,201006 128 | 81.3,201007 129 | 77.4,201008 130 | 71.1,201009 131 | 58.1,201010 132 | 47.9,201011 133 | 32.8,201012 134 | 29.7,201101 135 | 36,201102 136 | 42.3,201103 137 | 54.3,201104 138 | 64.5,201105 139 | 72.3,201106 140 | 80.2,201107 141 | 75.3,201108 142 | 70,201109 143 | 57.1,201110 144 | 51.9,201111 145 | 43.3,201112 146 | 37.3,201201 147 | 40.9,201202 148 | 50.9,201203 149 | 54.8,201204 150 | 65.1,201205 151 | 71,201206 152 | 78.8,201207 153 | 76.7,201208 154 | 68.8,201209 155 | 58,201210 156 | 43.9,201211 157 | 41.5,201212 158 | 35.1,201301 159 | 33.9,201302 160 | 40.1,201303 161 | 53,201304 162 | 62.8,201305 163 | 72.7,201306 164 | 79.8,201307 165 | 74.6,201308 166 | 67.9,201309 167 | 60.2,201310 168 | 45.3,201311 169 | 38.5,201312 170 | 28.6,201401 171 | 31.6,201402 172 | 37.7,201403 173 | 52.3,201404 174 | 64,201405 175 | 72.5,201406 176 | 76.1,201407 177 | 74.5,201408 178 | 69.7,201409 179 | 59.6,201410 180 | 45.3,201411 181 | 40.5,201412 182 | 29.9,201501 183 | 23.9,201502 184 | 38.1,201503 185 | 54.3,201504 186 | 68.5,201505 187 | 71.2,201506 188 | 78.8,201507 189 | 79,201508 190 | 74.5,201509 191 | 58,201510 192 | 52.8,201511 193 | 50.8,201512 194 | 34.5,201601 195 | 37.7,201602 196 | 48.9,201603 197 | 53.3,201604 198 | 62.8,201605 199 | 72.3,201606 200 | 78.7,201607 201 | 79.2,201608 202 | 71.8,201609 203 | 58.8,201610 204 | 49.8,201611 205 | 38.3,201612 206 | 38,201701 207 | 41.6,201702 208 | 39.2,201703 209 | 57.2,201704 210 | 61.1,201705 211 | 72,201706 212 | 76.8,201707 213 | 74,201708 214 | 70.5,201709 215 | 64.1,201710 216 | 46.6,201711 217 | 35,201712 --------------------------------------------------------------------------------
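
nyc-temp-data.csv above stores one temperature reading per row, with the month encoded as YYYYMM in the monthYear column. A minimal sketch of loading it with Dask and averaging the monthly readings by year — this snippet is illustrative only and not part of the book's notebooks; it assumes the CSV sits in the current working directory:

import dask.dataframe as dd

# nyc-temp-data.csv has two columns, Temp (float) and monthYear (YYYYMM integer),
# as shown in the dump above.
df = dd.read_csv('nyc-temp-data.csv')

# Integer-divide the YYYYMM code by 100 to recover the year,
# then average the twelve monthly readings for each year.
df['year'] = df['monthYear'] // 100
yearly_means = df.groupby('year')['Temp'].mean().compute()
print(yearly_means.head())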
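
For reference, the per-key counting idiom used in Listings 9.21 and 9.30 of the Chapter 9 notebook above — a count function applied element-by-element within each partition, plus a combine function that merges the partial totals — works on any bag. A minimal self-contained sketch of the same pattern, using made-up sample scores rather than the book's review data:

import dask.bag as bag

def count(accumulator, element):
    # Within a partition: add one to the running total for this key.
    return accumulator + 1

def combine(total1, total2):
    # Across partitions: merge two partial totals for the same key.
    return total1 + total2

scores = bag.from_sequence([5.0, 1.0, 4.0, 5.0, 3.0, 5.0], npartitions=2)
print(scores.foldby(lambda x: x, count, 0, combine, 0).compute())
# e.g. [(5.0, 3), (1.0, 1), (4.0, 1), (3.0, 1)] -- ordering may vary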