├── Chapter 11
│   ├── notebook
│   │   ├── requirements.txt
│   │   ├── Dockerfile
│   │   ├── build.sh
│   │   └── start-notebook.sh
│   ├── worker
│   │   ├── requirements.txt
│   │   ├── Dockerfile
│   │   ├── build.sh
│   │   └── worker-start.sh
│   ├── scheduler
│   │   ├── Dockerfile
│   │   └── scheduler-start.sh
│   └── Chapter 11.ipynb
├── README.md
├── nyc-average-monthly-temp.csv
├── Chapter 3.ipynb
├── nyc-temp-data.csv
├── Chapter 7.ipynb
├── Chapter 6.ipynb
├── Chapter 5.ipynb
├── Chapter 10.ipynb
└── Chapter 9.ipynb
/Chapter 11/notebook/requirements.txt:
--------------------------------------------------------------------------------
1 | blosc
2 | zarr
3 | dask-ml
4 |
--------------------------------------------------------------------------------
/Chapter 11/worker/requirements.txt:
--------------------------------------------------------------------------------
1 | blosc
2 | zarr
3 | dask-ml
4 |
--------------------------------------------------------------------------------
/Chapter 11/scheduler/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM daskdev/dask
2 |
3 | COPY scheduler-start.sh ./
4 |
5 | EXPOSE 8786
6 | EXPOSE 8787
7 |
8 | CMD ["sh","scheduler-start.sh"]
9 |
--------------------------------------------------------------------------------
/Chapter 11/worker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM daskdev/dask
2 |
3 | USER root
4 |
5 | # Install dependencies
6 | COPY requirements.txt build.sh worker-start.sh ./
7 | RUN sh build.sh
8 | RUN rm build.sh
9 |
10 | CMD ["sh", "worker-start.sh"]
11 |
--------------------------------------------------------------------------------
/Chapter 11/notebook/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM daskdev/dask-notebook
2 | USER root
3 | COPY requirements.txt build.sh ./
4 | COPY start-notebook.sh /opt/app/
5 | RUN sh build.sh
6 | RUN rm build.sh
7 | EXPOSE 8888
8 | CMD ["sh","/opt/app/start-notebook.sh"]
9 |
--------------------------------------------------------------------------------
/Chapter 11/worker/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 | apt-get update
4 | apt-get -y install build-essential
5 | echo "Getting Python packages..."
6 | pip install -U --no-cache-dir -r requirements.txt
7 | rm requirements.txt
8 | echo "Done!"
9 |
--------------------------------------------------------------------------------
/Chapter 11/notebook/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 | sudo apt-get update
4 | sudo apt-get -y install build-essential
5 | echo "Getting Python packages..."
6 | pip install -U --no-cache-dir -r requirements.txt
7 | rm requirements.txt
8 | echo "Done!"
9 |
--------------------------------------------------------------------------------
/Chapter 11/worker/worker-start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Get the scheduler name from EFS
3 | scheduler=$(cat /data/.scheduler)
4 | echo "Setting scheduler hostname to $scheduler"
5 | echo "Starting Dask worker..."
6 | dask-worker --worker-port 8000 tcp://$scheduler:8786
7 |
--------------------------------------------------------------------------------
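
Note: pinning the worker to --worker-port 8000 keeps firewall and security-group rules predictable across hosts. Once workers register with the scheduler, you can list them from any connected client. A minimal sketch (assumes a reachable scheduler, e.g. via the DASK_SCHEDULER_ADDRESS variable exported by start-notebook.sh below):

    from dask.distributed import Client

    client = Client()  # address taken from DASK_SCHEDULER_ADDRESS
    # scheduler_info() reports each registered worker and the port it bound
    for address in client.scheduler_info()['workers']:
        print(address)
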
/Chapter 11/notebook/start-notebook.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Get the scheduler name from EFS
3 | scheduler=$(cat /data/.scheduler)
4 | echo "Setting scheduler name to $scheduler"
5 | export DASK_SCHEDULER_ADDRESS="tcp://$scheduler:8786"
6 |
7 | # Start the notebook server
8 | start.sh jupyter lab
9 |
--------------------------------------------------------------------------------
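
Note: because start-notebook.sh exports DASK_SCHEDULER_ADDRESS before launching Jupyter, a bare Client() inside the notebook attaches to the remote cluster rather than starting a local one; this is what lets Listing 11.2 in Chapter 11.ipynb call Client() with no arguments. A minimal sketch:

    from dask.distributed import Client

    # No address argument: dask.distributed falls back to the
    # DASK_SCHEDULER_ADDRESS environment variable set by start-notebook.sh.
    client = Client()
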
/Chapter 11/scheduler/scheduler-start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Write the hostname of the scheduler to the EFS system
4 | hostname=$(hostname)
5 | echo "Setting scheduler hostname to $hostname"
6 | echo "$hostname" > /data/.scheduler
7 |
8 | # Start the scheduler
9 | echo "Starting Dask Scheduler..."
10 | dask-scheduler
11 |
--------------------------------------------------------------------------------
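
Note: the containers rendezvous through the shared /data volume (EFS): the scheduler writes its hostname to /data/.scheduler, and worker-start.sh and start-notebook.sh read it back. A minimal sketch of the same handshake done from Python, assuming the shared mount is present:

    from dask.distributed import Client

    # Read the hostname written by scheduler-start.sh and connect directly.
    with open('/data/.scheduler') as f:
        scheduler_host = f.read().strip()

    client = Client('tcp://{}:8786'.format(scheduler_host))
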
/README.md:
--------------------------------------------------------------------------------
1 | # 『파이썬과 대스크를 활용한 고성능 데이터 분석』 (Data Science with Python and Dask, Korean Edition)
2 | #### Subtitle: from analyzing, visualizing, and modeling large datasets to packaging and deploying distributed apps
3 |
4 | This repository contains the source code and example data referenced in 『파이썬과 대스크를 활용한 고성능 데이터 분석』, the Korean edition published by Hanbit. The original book is Data Science with Python and Dask (author: Jesse Daniel); its source code can be downloaded [here](https://www.manning.com/books/data-science-with-python-and-dask).
5 |
--------------------------------------------------------------------------------
/nyc-average-monthly-temp.csv:
--------------------------------------------------------------------------------
1 | Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,
2 | 2000,31.3,37.3,47.2,51,63.5,71.3,72.3,72.4,66,57,45.3,31.1,
3 | 2001,33.6,35.9,39.6,53.9,63.6,72.9,73.1,78.7,67.7,58.5,52.7,44.1,
4 | 2002,39.9,40.6,44.1,56.1,60.7,71.5,78.8,77.7,70.2,55.2,46,36,
5 | 2003,27.5,30.1,43.1,49.8,58.7,68.4,75.8,76.7,67.9,55.1,50,37.6,
6 | 2004,24.7,35,43.5,53.6,65.2,71.2,74.5,74.2,69.3,56,48.2,38.4,
7 | 2005,31.3,36.5,39.5,55.1,58.9,74,77.5,79.7,73.3,57.9,49.6,35.3,
8 | 2006,40.9,35.7,43.1,55.7,63.1,71,77.9,75.8,66.6,56.2,51.9,43.6,
9 | 2007,37.5,28.3,42.2,50.3,65.2,71.4,75,74,70.3,63.6,45.4,37,
10 | 2008,36.5,35.8,42.6,55,60.1,74,78.4,73.8,68.8,55.1,45.9,38.1,
11 | 2009,27.9,36.7,42.4,54.5,62.5,67.5,72.7,75.7,66.3,55,51.1,35.9,
12 | 2010,32.5,33.1,48.2,57.9,65.3,74.7,81.3,77.4,71.1,58.1,47.9,32.8,
13 | 2011,29.7,36,42.3,54.3,64.5,72.3,80.2,75.3,70,57.1,51.9,43.3,
14 | 2012,37.3,40.9,50.9,54.8,65.1,71,78.8,76.7,68.8,58,43.9,41.5,
15 | 2013,35.1,33.9,40.1,53,62.8,72.7,79.8,74.6,67.9,60.2,45.3,38.5,
16 | 2014,28.6,31.6,37.7,52.3,64,72.5,76.1,74.5,69.7,59.6,45.3,40.5,
17 | 2015,29.9,23.9,38.1,54.3,68.5,71.2,78.8,79,74.5,58,52.8,50.8,
18 | 2016,34.5,37.7,48.9,53.3,62.8,72.3,78.7,79.2,71.8,58.8,49.8,38.3,
19 | 2017,38,41.6,39.2,57.2,61.1,72,76.8,74,70.5,64.1,46.6,35,
20 | ,,,,,,,,,,,,,
--------------------------------------------------------------------------------
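
Note: every data row above ends with a trailing comma and the file ends with a blank row, so a naive read produces an empty "Unnamed" column and a row of NaNs. A minimal cleanup sketch (hypothetical usage; the notebooks in this repo read the long-format nyc-temp-data.csv instead):

    import pandas as pd

    temps = pd.read_csv('nyc-average-monthly-temp.csv')
    temps = temps.dropna(how='all')  # drop the blank final row
    temps = temps.loc[:, ~temps.columns.str.startswith('Unnamed')]  # drop the empty trailing column
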
/Chapter 3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Science with Python and Dask\n",
8 | "## Chapter 3 - Introducing Dask DataFrames"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "### Section 3.1"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "# Listing 3.1\n",
25 | "person_IDs = [1,2,3]\n",
26 | "person_last_names = ['Smith', 'Williams', 'Williams']\n",
27 | "person_first_names = ['John', 'Bill', 'Jane']\n",
28 | "person_DOBs = ['1982-10-06', '1990-07-04', '1989-05-06']"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "### Section 3.2.1"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 2,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "# Listing 3.2\n",
45 | "import pandas as pd\n",
46 | "import dask.dataframe as dd\n",
47 | "\n",
48 | "# Creating all the data as lists\n",
49 | "person_IDs = [1,2,3,4,5,6,7,8,9,10]\n",
50 | "person_last_names = ['Smith', 'Williams', 'Williams','Jackson','Johnson','Smith','Anderson','Christiansen','Carter','Davidson']\n",
51 | "person_first_names = ['John', 'Bill', 'Jane','Cathy','Stuart','James','Felicity','Liam','Nancy','Christina']\n",
52 | "person_DOBs = ['1982-10-06', '1990-07-04', '1989-05-06', '1974-01-24', '1995-06-05', '1984-04-16', '1976-09-15', '1992-10-02', '1986-02-05', '1993-08-11']\n",
53 | "\n",
54 | "# Storing the data in a Pandas DataFrame\n",
55 | "people_pandas_df = pd.DataFrame({'Person ID': person_IDs, \n",
56 | " 'Last Name': person_last_names, \n",
57 | " 'First Name': person_first_names,\n",
58 | " 'Date of Birth': person_DOBs},\n",
59 | " columns=['Person ID', 'Last Name', 'First Name', 'Date of Birth'])\n",
60 | "\n",
61 | "# Converting the Pandas DataFrame to a Dask DataFrame\n",
62 | "people_dask_df = dd.from_pandas(people_pandas_df, npartitions=2)"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 3,
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "name": "stdout",
72 | "output_type": "stream",
73 | "text": [
74 | "(0, 5, 9)\n",
75 | "2\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "# Listing 3.3\n",
81 | "print(people_dask_df.divisions)\n",
82 | "print(people_dask_df.npartitions)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 4,
88 | "metadata": {},
89 | "outputs": [
90 | {
91 | "data": {
92 | "text/plain": [
93 | "0 5\n",
94 | "1 5\n",
95 | "dtype: int64"
96 | ]
97 | },
98 | "execution_count": 4,
99 | "metadata": {},
100 | "output_type": "execute_result"
101 | }
102 | ],
103 | "source": [
104 | "# Listing 3.4\n",
105 | "people_dask_df.map_partitions(lambda x: len(x)).compute()"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 8,
111 | "metadata": {},
112 | "outputs": [
113 | {
114 | "name": "stdout",
115 | "output_type": "stream",
116 | "text": [
117 | "0 3\n",
118 | "1 5\n",
119 | "dtype: int64\n",
120 | "0 8\n",
121 | "dtype: int64\n"
122 | ]
123 | }
124 | ],
125 | "source": [
126 | "# Listing 3.5\n",
127 | "people_filtered = people_dask_df[people_dask_df['Last Name'] != 'Williams']\n",
128 | "print(people_filtered.map_partitions(lambda x: len(x)).compute())\n",
129 | "\n",
130 | "people_filtered_reduced = people_filtered.repartition(npartitions=1)\n",
131 | "print(people_filtered_reduced.map_partitions(lambda x: len(x)).compute())"
132 | ]
133 | }
134 | ],
135 | "metadata": {
136 | "kernelspec": {
137 | "display_name": "Python 3",
138 | "language": "python",
139 | "name": "python3"
140 | },
141 | "language_info": {
142 | "codemirror_mode": {
143 | "name": "ipython",
144 | "version": 3
145 | },
146 | "file_extension": ".py",
147 | "mimetype": "text/x-python",
148 | "name": "python",
149 | "nbconvert_exporter": "python",
150 | "pygments_lexer": "ipython3",
151 | "version": "3.6.8"
152 | }
153 | },
154 | "nbformat": 4,
155 | "nbformat_minor": 2
156 | }
157 |
--------------------------------------------------------------------------------
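
Note: a short follow-on sketch (not in the book) showing why the known divisions from Listing 3.3 matter: with divisions (0, 5, 9), rows 0-4 live in partition 0 and rows 5-9 in partition 1, so .loc can route a lookup to a single partition instead of scanning both. Assumes people_dask_df from Listing 3.2:

    # Only partition 1 is read to satisfy this lookup.
    people_dask_df.loc[7].compute()
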
/nyc-temp-data.csv:
--------------------------------------------------------------------------------
1 | ,Temp,monthYear
2 | 0,31.3,01-2000
3 | 1,33.6,01-2001
4 | 2,39.9,01-2002
5 | 3,27.5,01-2003
6 | 4,24.7,01-2004
7 | 5,31.3,01-2005
8 | 6,40.9,01-2006
9 | 7,37.5,01-2007
10 | 8,36.5,01-2008
11 | 9,27.9,01-2009
12 | 10,32.5,01-2010
13 | 11,29.7,01-2011
14 | 12,37.3,01-2012
15 | 13,35.1,01-2013
16 | 14,28.6,01-2014
17 | 15,29.9,01-2015
18 | 16,34.5,01-2016
19 | 17,38.0,01-2017
20 | 18,37.3,02-2000
21 | 19,35.9,02-2001
22 | 20,40.6,02-2002
23 | 21,30.1,02-2003
24 | 22,35.0,02-2004
25 | 23,36.5,02-2005
26 | 24,35.7,02-2006
27 | 25,28.3,02-2007
28 | 26,35.8,02-2008
29 | 27,36.7,02-2009
30 | 28,33.1,02-2010
31 | 29,36.0,02-2011
32 | 30,40.9,02-2012
33 | 31,33.9,02-2013
34 | 32,31.6,02-2014
35 | 33,23.9,02-2015
36 | 34,37.7,02-2016
37 | 35,41.6,02-2017
38 | 36,47.2,03-2000
39 | 37,39.6,03-2001
40 | 38,44.1,03-2002
41 | 39,43.1,03-2003
42 | 40,43.5,03-2004
43 | 41,39.5,03-2005
44 | 42,43.1,03-2006
45 | 43,42.2,03-2007
46 | 44,42.6,03-2008
47 | 45,42.4,03-2009
48 | 46,48.2,03-2010
49 | 47,42.3,03-2011
50 | 48,50.9,03-2012
51 | 49,40.1,03-2013
52 | 50,37.7,03-2014
53 | 51,38.1,03-2015
54 | 52,48.9,03-2016
55 | 53,39.2,03-2017
56 | 54,51.0,04-2000
57 | 55,53.9,04-2001
58 | 56,56.1,04-2002
59 | 57,49.8,04-2003
60 | 58,53.6,04-2004
61 | 59,55.1,04-2005
62 | 60,55.7,04-2006
63 | 61,50.3,04-2007
64 | 62,55.0,04-2008
65 | 63,54.5,04-2009
66 | 64,57.9,04-2010
67 | 65,54.3,04-2011
68 | 66,54.8,04-2012
69 | 67,53.0,04-2013
70 | 68,52.3,04-2014
71 | 69,54.3,04-2015
72 | 70,53.3,04-2016
73 | 71,57.2,04-2017
74 | 72,63.5,05-2000
75 | 73,63.6,05-2001
76 | 74,60.7,05-2002
77 | 75,58.7,05-2003
78 | 76,65.2,05-2004
79 | 77,58.9,05-2005
80 | 78,63.1,05-2006
81 | 79,65.2,05-2007
82 | 80,60.1,05-2008
83 | 81,62.5,05-2009
84 | 82,65.3,05-2010
85 | 83,64.5,05-2011
86 | 84,65.1,05-2012
87 | 85,62.8,05-2013
88 | 86,64.0,05-2014
89 | 87,68.5,05-2015
90 | 88,62.8,05-2016
91 | 89,61.1,05-2017
92 | 90,71.3,06-2000
93 | 91,72.9,06-2001
94 | 92,71.5,06-2002
95 | 93,68.4,06-2003
96 | 94,71.2,06-2004
97 | 95,74.0,06-2005
98 | 96,71.0,06-2006
99 | 97,71.4,06-2007
100 | 98,74.0,06-2008
101 | 99,67.5,06-2009
102 | 100,74.7,06-2010
103 | 101,72.3,06-2011
104 | 102,71.0,06-2012
105 | 103,72.7,06-2013
106 | 104,72.5,06-2014
107 | 105,71.2,06-2015
108 | 106,72.3,06-2016
109 | 107,72.0,06-2017
110 | 108,72.3,07-2000
111 | 109,73.1,07-2001
112 | 110,78.8,07-2002
113 | 111,75.8,07-2003
114 | 112,74.5,07-2004
115 | 113,77.5,07-2005
116 | 114,77.9,07-2006
117 | 115,75.0,07-2007
118 | 116,78.4,07-2008
119 | 117,72.7,07-2009
120 | 118,81.3,07-2010
121 | 119,80.2,07-2011
122 | 120,78.8,07-2012
123 | 121,79.8,07-2013
124 | 122,76.1,07-2014
125 | 123,78.8,07-2015
126 | 124,78.7,07-2016
127 | 125,76.8,07-2017
128 | 126,72.4,08-2000
129 | 127,78.7,08-2001
130 | 128,77.7,08-2002
131 | 129,76.7,08-2003
132 | 130,74.2,08-2004
133 | 131,79.7,08-2005
134 | 132,75.8,08-2006
135 | 133,74.0,08-2007
136 | 134,73.8,08-2008
137 | 135,75.7,08-2009
138 | 136,77.4,08-2010
139 | 137,75.3,08-2011
140 | 138,76.7,08-2012
141 | 139,74.6,08-2013
142 | 140,74.5,08-2014
143 | 141,79.0,08-2015
144 | 142,79.2,08-2016
145 | 143,74.0,08-2017
146 | 144,66.0,09-2000
147 | 145,67.7,09-2001
148 | 146,70.2,09-2002
149 | 147,67.9,09-2003
150 | 148,69.3,09-2004
151 | 149,73.3,09-2005
152 | 150,66.6,09-2006
153 | 151,70.3,09-2007
154 | 152,68.8,09-2008
155 | 153,66.3,09-2009
156 | 154,71.1,09-2010
157 | 155,70.0,09-2011
158 | 156,68.8,09-2012
159 | 157,67.9,09-2013
160 | 158,69.7,09-2014
161 | 159,74.5,09-2015
162 | 160,71.8,09-2016
163 | 161,70.5,09-2017
164 | 162,57.0,10-2000
165 | 163,58.5,10-2001
166 | 164,55.2,10-2002
167 | 165,55.1,10-2003
168 | 166,56.0,10-2004
169 | 167,57.9,10-2005
170 | 168,56.2,10-2006
171 | 169,63.6,10-2007
172 | 170,55.1,10-2008
173 | 171,55.0,10-2009
174 | 172,58.1,10-2010
175 | 173,57.1,10-2011
176 | 174,58.0,10-2012
177 | 175,60.2,10-2013
178 | 176,59.6,10-2014
179 | 177,58.0,10-2015
180 | 178,58.8,10-2016
181 | 179,64.1,10-2017
182 | 180,45.3,11-2000
183 | 181,52.7,11-2001
184 | 182,46.0,11-2002
185 | 183,50.0,11-2003
186 | 184,48.2,11-2004
187 | 185,49.6,11-2005
188 | 186,51.9,11-2006
189 | 187,45.4,11-2007
190 | 188,45.9,11-2008
191 | 189,51.1,11-2009
192 | 190,47.9,11-2010
193 | 191,51.9,11-2011
194 | 192,43.9,11-2012
195 | 193,45.3,11-2013
196 | 194,45.3,11-2014
197 | 195,52.8,11-2015
198 | 196,49.8,11-2016
199 | 197,46.6,11-2017
200 | 198,31.1,12-2000
201 | 199,44.1,12-2001
202 | 200,36.0,12-2002
203 | 201,37.6,12-2003
204 | 202,38.4,12-2004
205 | 203,35.3,12-2005
206 | 204,43.6,12-2006
207 | 205,37.0,12-2007
208 | 206,38.1,12-2008
209 | 207,35.9,12-2009
210 | 208,32.8,12-2010
211 | 209,43.3,12-2011
212 | 210,41.5,12-2012
213 | 211,38.5,12-2013
214 | 212,40.5,12-2014
215 | 213,50.8,12-2015
216 | 214,38.3,12-2016
217 | 215,35.0,12-2017
218 |
--------------------------------------------------------------------------------
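
Note: judging by its shape and row order (all Januaries, then all Februaries, and so on), nyc-temp-data.csv appears to be an unpivoted copy of nyc-average-monthly-temp.csv. A derivation sketch under that assumption:

    import pandas as pd

    months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

    # Same cleanup as for the wide file: drop the blank row and empty column.
    wide = pd.read_csv('nyc-average-monthly-temp.csv').dropna(how='all')
    wide = wide.loc[:, ~wide.columns.str.startswith('Unnamed')]

    # Unpivot to one row per (month, year) and rebuild the MM-YYYY key.
    long = wide.melt(id_vars='Year', value_vars=months, var_name='Month', value_name='Temp')
    long['monthYear'] = long['Month'].map(
        {m: '%02d' % (i + 1) for i, m in enumerate(months)}) + '-' + long['Year'].astype(int).astype(str)
    long[['Temp', 'monthYear']].to_csv('nyc-temp-data.csv')
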
/Chapter 11/Chapter 11.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Science with Python and Dask\n",
8 | "## Chapter 11: Scaling and Deploying Dask"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "### Section 11.2"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 5,
21 | "metadata": {},
22 | "outputs": [
23 | {
24 | "data": {
25 | "text/html": [
26 | "
\n",
27 | "\n",
28 | "\n",
29 | "Client\n",
30 | "\n",
34 | " | \n",
35 | "\n",
36 | "Cluster\n",
37 | "\n",
38 | " - Workers: 6
\n",
39 | " - Cores: 6
\n",
40 | " - Memory: 6.20 GB
\n",
41 | " \n",
42 | " | \n",
43 | "
\n",
44 | "
"
45 | ],
46 | "text/plain": [
47 | ""
48 | ]
49 | },
50 | "execution_count": 5,
51 | "metadata": {},
52 | "output_type": "execute_result"
53 | }
54 | ],
55 | "source": [
56 | "# Listing 11.2\n",
57 | "from dask.distributed import Client, progress\n",
58 | "client = Client()\n",
59 | "client"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 2,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "# Listing 11.3\n",
69 | "from dask import array as da\n",
70 | "feature_array = da.from_zarr('/data/sentiment_feature_array.zarr')\n",
71 | "target_array = da.from_zarr('/data/sentiment_target_array.zarr')"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 14,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "from dask_ml.linear_model import LogisticRegression\n",
81 | "from dask_ml.model_selection import train_test_split\n",
82 | "\n",
83 | "X = feature_array\n",
84 | "y = target_array.flatten()\n",
85 | "\n",
86 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n",
87 | "\n",
88 | "lr = LogisticRegression()\n",
89 | "\n",
90 | "status = lr.fit(X_train, y_train)"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "### Scenario 2"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 15,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "data": {
107 | "text/plain": [
108 | "0.7962917355662668"
109 | ]
110 | },
111 | "execution_count": 15,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "lr.score(X_test, y_test).compute()"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 16,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "data": {
127 | "text/plain": [
128 | "Incremental(estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),\n",
129 | " random_state=None, scoring=None, shuffle_blocks=True)"
130 | ]
131 | },
132 | "execution_count": 16,
133 | "metadata": {},
134 | "output_type": "execute_result"
135 | }
136 | ],
137 | "source": [
138 | "from sklearn.naive_bayes import BernoulliNB\n",
139 | "from dask_ml.wrappers import Incremental\n",
140 | "\n",
141 | "nb = BernoulliNB()\n",
142 | "\n",
143 | "parallel_nb = Incremental(nb)\n",
144 | "\n",
145 | "parallel_nb.fit(X_train, y_train, classes=[0,1])"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 17,
151 | "metadata": {},
152 | "outputs": [
153 | {
154 | "data": {
155 | "text/plain": [
156 | "0.7888681701438975"
157 | ]
158 | },
159 | "execution_count": 17,
160 | "metadata": {},
161 | "output_type": "execute_result"
162 | }
163 | ],
164 | "source": [
165 | "parallel_nb.score(X_test, y_test)"
166 | ]
167 | }
168 | ],
169 | "metadata": {
170 | "kernelspec": {
171 | "display_name": "Python 3",
172 | "language": "python",
173 | "name": "python3"
174 | },
175 | "language_info": {
176 | "codemirror_mode": {
177 | "name": "ipython",
178 | "version": 3
179 | },
180 | "file_extension": ".py",
181 | "mimetype": "text/x-python",
182 | "name": "python",
183 | "nbconvert_exporter": "python",
184 | "pygments_lexer": "ipython3",
185 | "version": "3.6.8"
186 | }
187 | },
188 | "nbformat": 4,
189 | "nbformat_minor": 2
190 | }
191 |
--------------------------------------------------------------------------------
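
Note: Listing 11.2 imports progress alongside Client, but the notebook never uses it. A small sketch of what it is for (assumes the client from Listing 11.2 and the arrays from Listing 11.3): persist pins the data in distributed memory, and progress renders a progress bar while the cluster works, the distributed counterpart of the ProgressBar context manager used in earlier chapters.

    # Keep the training data in cluster memory and watch the work complete.
    X_train = X_train.persist()
    progress(X_train)
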
/Chapter 7.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Science with Python and Dask\n",
8 | "## Chapter 7: Visualizing DataFrames with Seaborn"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "### Section 7.2.1"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": null,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "# Listing 7.1\n",
25 | "import dask.dataframe as dd\n",
26 | "import pyarrow\n",
27 | "from dask.diagnostics import ProgressBar\n",
28 | "import os\n",
29 | "import seaborn as sns\n",
30 | "import matplotlib.pyplot as plt\n",
31 | "\n",
32 | "# Set working directory and read in the data\n",
33 | "os.chdir('/Users/jesse/Documents')\n",
34 | "nyc_data = dd.read_parquet('nyc_final', engine='pyarrow')"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# Listing 7.2\n",
44 | "row_filter = ~nyc_data['Citation Issued Month Year'].isin(['07-2017','08-2017','09-2017','10-2017','11-2017','12-2017'])\n",
45 | "nyc_data_filtered = nyc_data[row_filter]\n",
46 | "\n",
47 | "citationsAndTemps = nyc_data_filtered.groupby('Citation Issued Month Year').agg({'Summons Number': 'count', 'Temp': 'mean'})"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "# Listing 7.3\n",
57 | "# Plot a scatter plot of Temp vs. Number of Citations\n",
58 | "sns.set(style=\"whitegrid\")\n",
59 | "f, ax = plt.subplots(figsize=(10, 10))\n",
60 | "sns.despine(f, left=True, bottom=True)\n",
61 | "\n",
62 | "with ProgressBar():\n",
63 | " sns.scatterplot(x=\"Temp\", y=\"Summons Number\",\n",
64 | " data=citationsAndTemps.compute(), ax=ax)\n",
65 | " plt.ylim(ymin=0)\n",
66 | " plt.xlim(xmin=0)"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "### Section 7.2.2"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "# Listing 7.4\n",
83 | "# Add a robust regression line to the scatter plot using regplot\n",
84 | "sns.set(style=\"whitegrid\")\n",
85 | "f, ax = plt.subplots(figsize=(10, 10))\n",
86 | "sns.despine(f, left=True, bottom=True)\n",
87 | "\n",
88 | "with ProgressBar():\n",
89 | " sns.regplot(x=\"Temp\", y=\"Summons Number\",\n",
90 | " data=citationsAndTemps.compute(), ax=ax,\n",
91 | " robust=True)\n",
92 | " plt.ylim(ymin=0)\n",
93 | " plt.xlim(xmin=0)"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "### Section 7.2.3"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "# Listing 7.5\n",
110 | "# Try a non-linear fit by setting the order parameter to 2 (roughly parabolic shape)\n",
111 | "sns.set(style=\"whitegrid\")\n",
112 | "f, ax = plt.subplots(figsize=(10, 10))\n",
113 | "sns.despine(f, left=True, bottom=True)\n",
114 | "\n",
115 | "with ProgressBar():\n",
116 | " sns.regplot(x=\"Temp\", y=\"Summons Number\",\n",
117 | " data=citationsAndTemps.compute(), ax=ax,\n",
118 | " order=2)\n",
119 | " plt.ylim(ymin=0)\n",
120 | " plt.xlim(xmin=0)"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "### Section 7.3.1"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "# Listing 7.6\n",
137 | "# Read in the data and filter down to the six most popular vehicle colors\n",
138 | "nyc_data_withVehicleAge = dd.read_parquet('nyc_data_vehicleAge', engine='pyarrow')\n",
139 | "\n",
140 | "row_filter = nyc_data_withVehicleAge['Vehicle Color'].isin(['BLACK','WHITE','GREY','RED','GREEN','BLUE'])\n",
141 | "column_filter = ['Vehicle Age','Vehicle Color']\n",
142 | "\n",
143 | "ages_and_colors = nyc_data_withVehicleAge[row_filter][column_filter]"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "# Listing 7.7\n",
153 | "# Get a count of how many vehicle citations match our criteria\n",
154 | "with ProgressBar():\n",
155 | " print(ages_and_colors.count().compute())"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "# Listing 7.8\n",
165 | "# Plot a violinplot to show the distributions of vehicle ages grouped by vehicle color\n",
166 | "sns.set(style=\"whitegrid\")\n",
167 | "f, ax = plt.subplots(figsize=(10, 10))\n",
168 | "sns.despine(f, left=True, bottom=True)\n",
169 | "\n",
170 | "group_order = [\"RED\", \"GREEN\", \"BLUE\", \"BLACK\", \"WHITE\", \"GREY\"]\n",
171 | "\n",
172 | "with ProgressBar():\n",
173 | " sns.violinplot(x=\"Vehicle Color\", y=\"Vehicle Age\", data=ages_and_colors.compute(), order=group_order, palette=group_order, ax=ax)"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "### Section 7.3.2"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "# Listing 7.9\n",
190 | "# Plot a violinplot of a random sample\n",
191 | "sample = ages_and_colors.sample(frac=0.01)\n",
192 | "\n",
193 | "sns.set(style=\"whitegrid\")\n",
194 | "f, ax = plt.subplots(figsize=(10, 10))\n",
195 | "sns.despine(f, left=True, bottom=True)\n",
196 | "\n",
197 | "with ProgressBar():\n",
198 | " sns.violinplot(x=\"Vehicle Color\", y=\"Vehicle Age\", data=sample.compute(), order=group_order, palette=group_order, ax=ax)"
199 | ]
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "metadata": {},
204 | "source": [
205 | "### Section 7.4"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "# Listing 7.10\n",
215 | "from datetime import datetime\n",
216 | "nyc_data_filtered = nyc_data[nyc_data['Issue Date'] < datetime(2017,1,1)]\n",
217 | "\n",
218 | "day_of_week = nyc_data_filtered['Issue Date'].apply(lambda x: x.strftime(\"%A\"), meta=str)\n",
219 | "\n",
220 | "month_of_year = nyc_data_filtered['Issue Date'].apply(lambda x: x.strftime(\"%B\"), meta=str)"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "# Listing 7.11\n",
230 | "# Add the columns back to the DataFrame\n",
231 | "nyc_data_with_dates_raw = nyc_data_filtered.assign(DayOfWeek = day_of_week).assign(MonthOfYear = month_of_year)\n",
232 | "column_map = {'DayOfWeek': 'Day of Week', 'MonthOfYear': 'Month of Year'}\n",
233 | "nyc_data_with_dates = nyc_data_with_dates_raw.rename(columns=column_map)"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "# Listing 7.12\n",
243 | "# Count up the summons by month of year/day of week\n",
244 | "with ProgressBar():\n",
245 | " summons_by_mydw = nyc_data_with_dates.groupby(['Day of Week', 'Month of Year'])['Summons Number'].count().compute()"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "# Listing 7.13\n",
255 | "# Create a pivot table from the result\n",
256 | "heatmap_data = summons_by_mydw.reset_index().pivot(\"Month of Year\", \"Day of Week\", \"Summons Number\")"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": null,
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "# Listing 7.14\n",
266 | "# Create a list of months and weekdays for sorting the data in the heatmap\n",
267 | "months = ['January','February','March','April','May','June','July','August','September','October','November','December']\n",
268 | "weekdays = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']\n",
269 | "\n",
270 | "# Draw a heatmap with Day of Week on the x axis and Month of Year on the Y axis\n",
271 | "f, ax = plt.subplots(figsize=(10, 10))\n",
272 | "sns.heatmap(heatmap_data.loc[months,weekdays], annot=True, fmt=\"d\", linewidths=1, cmap='Greys', ax=ax)"
273 | ]
274 | }
275 | ],
276 | "metadata": {
277 | "kernelspec": {
278 | "display_name": "Python 3",
279 | "language": "python",
280 | "name": "python3"
281 | },
282 | "language_info": {
283 | "codemirror_mode": {
284 | "name": "ipython",
285 | "version": 3
286 | },
287 | "file_extension": ".py",
288 | "mimetype": "text/x-python",
289 | "name": "python",
290 | "nbconvert_exporter": "python",
291 | "pygments_lexer": "ipython3",
292 | "version": "3.6.8"
293 | }
294 | },
295 | "nbformat": 4,
296 | "nbformat_minor": 2
297 | }
298 |
--------------------------------------------------------------------------------
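
Note: Listings 7.3-7.5 each call citationsAndTemps.compute() inside the plotting cell, so the same aggregation is recomputed three times. A minimal variant (same results, just reorganized; assumes the objects from Listings 7.1-7.2) materializes the pandas DataFrame once and reuses it:

    with ProgressBar():
        plot_data = citationsAndTemps.compute()  # aggregate once

    f, ax = plt.subplots(figsize=(10, 10))
    sns.scatterplot(x="Temp", y="Summons Number", data=plot_data, ax=ax)
    sns.regplot(x="Temp", y="Summons Number", data=plot_data, ax=ax, robust=True)
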
/Chapter 6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Science with Python and Dask\n",
8 | "## Chapter 6: Summarizing and Analyzing DataFrames"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "# Before beginning, set your working directory to where the data resides\n",
18 | "import os\n",
19 | "os.chdir('/Users/jesse/Documents')"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "### Section 6.1.2"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "# Listing 6.1\n",
36 | "import dask.dataframe as dd\n",
37 | "import pyarrow\n",
38 | "from dask.diagnostics import ProgressBar\n",
39 | "\n",
40 | "nyc_data = dd.read_parquet('nyc_final2', engine='pyarrow')"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "# Listing 6.2\n",
50 | "with ProgressBar():\n",
51 | " vehicle_age_by_year = nyc_data['Vehicle Year'].value_counts().compute()\n",
52 | "vehicle_age_by_year"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# Listing 6.3\n",
62 | "with ProgressBar():\n",
63 | " condition = (nyc_data['Vehicle Year'] > 0) & (nyc_data['Vehicle Year'] <= 2018)\n",
64 | " vehicle_age_by_year = nyc_data[condition]['Vehicle Year'].value_counts().compute().sort_index()\n",
65 | "vehicle_age_by_year"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "# Listing 6.4\n",
75 | "nyc_data_filtered = nyc_data[condition]\n",
76 | "\n",
77 | "def age_calculation(row):\n",
78 | " return int(row['Issue Date'].year - row['Vehicle Year'])\n",
79 | "\n",
80 | "vehicle_age = nyc_data_filtered.apply(age_calculation, axis=1, meta=('Vehicle Age', 'int'))\n",
81 | "\n",
82 | "nyc_data_vehicle_age_stg1 = nyc_data_filtered.assign(VehicleAge=vehicle_age)\n",
83 | "nyc_data_vehicle_age_stg2 = nyc_data_vehicle_age_stg1.rename(columns={'VehicleAge':'Vehicle Age'})\n",
84 | "\n",
85 | "nyc_data_with_vehicle_age = nyc_data_vehicle_age_stg2[nyc_data_vehicle_age_stg2['Vehicle Age'] >= 0]"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "# Listing 6.5\n",
95 | "with ProgressBar():\n",
96 | " files = nyc_data_with_vehicle_age.to_parquet('nyc_data_vehicleAge', engine='pyarrow')\n",
97 | "\n",
98 | "nyc_data_with_vehicle_age = dd.read_parquet('nyc_data_vehicleAge', engine='pyarrow')"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "# Listing 6.6\n",
108 | "from dask.array import stats as dask_stats\n",
109 | "with ProgressBar():\n",
110 | " mean = nyc_data_with_vehicle_age['Vehicle Age'].mean().compute()\n",
111 | " stdev = nyc_data_with_vehicle_age['Vehicle Age'].std().compute()\n",
112 | " minimum = nyc_data_with_vehicle_age['Vehicle Age'].min().compute()\n",
113 | " maximum = nyc_data_with_vehicle_age['Vehicle Age'].max().compute()\n",
114 | " skewness = float(dask_stats.skew(nyc_data_with_vehicle_age['Vehicle Age'].values).compute())"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "### Section 6.1.3"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "# Listing 6.7\n",
131 | "with ProgressBar():\n",
132 | " descriptive_stats = nyc_data_with_vehicle_age['Vehicle Age'].describe().compute()\n",
133 | "descriptive_stats.round(2)"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "### Section 6.2.2"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "# Listing 6.8\n",
150 | "import pandas as pd\n",
151 | "\n",
152 | "years = ['2014', '2015', '2016', '2017']\n",
153 | "months = ['01','02','03','04','05','06','07','08','09','10','11','12']\n",
154 | "years_months = [year + month for year in years for month in months]\n",
155 | "\n",
156 | "sort_order = pd.Series(range(len(years_months)), index=years_months, name='custom_sort')\n",
157 | "\n",
158 | "def sort_by_months(dataframe, order):\n",
159 | " return dataframe.join(order).sort_values('custom_sort').drop('custom_sort', axis=1)"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "# Listing 6.9\n",
169 | "with ProgressBar():\n",
170 | " nyc_data_by_month = nyc_data.groupby('monthYear')\n",
171 | " citations_per_month = nyc_data_by_month['Summons Number'].count().compute()\n",
172 | "sort_by_months(citations_per_month.to_frame(), sort_order)"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "# Listing 6.10\n",
182 | "with ProgressBar():\n",
183 | " condition = ~nyc_data['monthYear'].isin(['201707','201708','201709','201710','201711','201712'])\n",
184 | " nyc_data_filtered = nyc_data[condition]\n",
185 | " citations_and_temps = nyc_data_filtered.groupby('monthYear').agg({'Summons Number': 'count', 'Temp': 'mean'})\n",
186 | " correlation_matrix = citations_and_temps.corr().compute()\n",
187 | "correlation_matrix"
188 | ]
189 | },
190 | {
191 | "cell_type": "markdown",
192 | "metadata": {},
193 | "source": [
194 | "### Section 6.3.2"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "# Listing 6.11\n",
204 | "nyc_data_with_vehicle_age = dd.read_parquet('nyc_data_vehicleAge', engine='pyarrow')\n",
205 | "\n",
206 | "nyc_data_filtered = nyc_data_with_vehicle_age[nyc_data_with_vehicle_age ['Plate Type'].isin(['PAS','COM'])]"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "# Listing 6.12\n",
216 | "with ProgressBar():\n",
217 | " N = nyc_data_filtered['Vehicle Age'].count().compute()\n",
218 | " p = nyc_data_filtered['Plate Type'].unique().count().compute()\n",
219 | "brown_forsythe_left = (N - p) / (p - 1)"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {},
226 | "outputs": [],
227 | "source": [
228 | "# Listing 6.13\n",
229 | "with ProgressBar():\n",
230 | " passenger_vehicles = nyc_data_filtered[nyc_data_filtered['Plate Type'] == 'PAS']\n",
231 | " commercial_vehicles = nyc_data_filtered[nyc_data_filtered['Plate Type'] == 'COM']\n",
232 | " median_PAS = passenger_vehicles['Vehicle Age'].quantile(0.5).compute()\n",
233 | " median_COM = commercial_vehicles['Vehicle Age'].quantile(0.5).compute()"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "# Listing 6.14\n",
243 | "def absolute_deviation_from_median(row):\n",
244 | " if row['Plate Type'] == 'PAS':\n",
245 | " return abs(row['Vehicle Age'] - median_PAS)\n",
246 | " else:\n",
247 | " return abs(row['Vehicle Age'] - median_COM)"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "# Listing 6.15\n",
257 | "absolute_deviation = nyc_data_filtered.apply(absolute_deviation_from_median, axis=1, meta=('x', 'float32'))\n",
258 | "\n",
259 | "nyc_data_age_type_test_stg1 = nyc_data_filtered.assign(MedianDifferences = absolute_deviation)\n",
260 | "nyc_data_age_type_test = nyc_data_age_type_test_stg1.rename(columns={'MedianDifferences':'Median Difference'})"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "# Listing 6.16\n",
270 | "with ProgressBar():\n",
271 | " group_means = nyc_data_age_type_test.groupby('Plate Type')['Median Difference'].mean().compute()"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {},
278 | "outputs": [],
279 | "source": [
280 | "# Listing 6.17\n",
281 | "def group_mean_variance(row):\n",
282 | " if row['Plate Type'] == 'PAS':\n",
283 | " return (row['Median Difference'] - group_means['PAS'])**2\n",
284 | " else:\n",
285 | " return (row['Median Difference'] - group_means['COM'])**2\n",
286 | " \n",
287 | "group_mean_variances = nyc_data_age_type_test.apply(group_mean_variance, axis=1, meta=('x', 'float32'))\n",
288 | "\n",
289 | "nyc_data_age_type_test_gmv_stg1 = nyc_data_age_type_test.assign(GroupMeanVariances = group_mean_variances)\n",
290 | "nyc_data_age_type_test_gmv = nyc_data_age_type_test_gmv_stg1.rename(columns={'GroupMeanVariances':'Group Mean Variance'})"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": null,
296 | "metadata": {},
297 | "outputs": [],
298 | "source": [
299 | "# Listing 6.18\n",
300 | "with ProgressBar():\n",
301 | " brown_forsythe_right_denominator = nyc_data_age_type_test_gmv['Group Mean Variance'].sum().compute()"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "metadata": {},
308 | "outputs": [],
309 | "source": [
310 | "# Listing 6.19\n",
311 | "with ProgressBar():\n",
312 | " grand_mean = nyc_data_age_type_test['Median Difference'].mean().compute()"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "# Listing 6.20\n",
322 | "brown_forsythe_aggregation = dd.Aggregation(\n",
323 | " 'Brown_Forsythe',\n",
324 | " lambda chunk: (chunk.count(), chunk.sum()),\n",
325 | " lambda chunk_count, chunk_sum: (chunk_count.sum(), chunk_sum.sum()),\n",
326 | " lambda group_count, group_sum: group_count * (((group_sum / group_count) - grand_mean)**2)\n",
327 | ")"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "# Listing 6.21\n",
337 | "with ProgressBar():\n",
338 | " group_variances = nyc_data_age_type_test.groupby('Plate Type').agg({'Median Difference': brown_forsythe_aggregation}).compute()"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": null,
344 | "metadata": {},
345 | "outputs": [],
346 | "source": [
347 | "# Listing 6.22\n",
348 | "brown_forsythe_right_numerator = group_variances.sum()[0]"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {},
355 | "outputs": [],
356 | "source": [
357 | "# Listing 6.23\n",
358 | "F_statistic = brown_forsythe_left * (brown_forsythe_right_numerator / brown_forsythe_right_denominator)"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {},
365 | "outputs": [],
366 | "source": [
367 | "# Listing 6.24\n",
368 | "import scipy.stats as stats\n",
369 | "alpha = 0.05\n",
370 | "df1 = p - 1\n",
371 | "df2 = N - p\n",
372 | "F_critical = stats.f.ppf(q=1-alpha, dfn=df1, dfd=df2)"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": null,
378 | "metadata": {},
379 | "outputs": [],
380 | "source": [
381 | "# Listing 6.25\n",
382 | "print(\"Using the Brown-Forsythe Test for Equal Variance\")\n",
383 | "print(\"The Null Hypothesis states: the variance is constant among groups\")\n",
384 | "print(\"The Alternative Hypothesis states: the variance is not constant among groups\")\n",
385 | "print(\"At a confidence level of \" + str(alpha) + \", the F statistic was \" + str(F_statistic) + \" and the F critical value was \" + str(F_critical) + \".\")\n",
386 | "if F_statistic > F_critical:\n",
387 | " print(\"We can reject the null hypothesis. Set equal_var to False.\")\n",
388 | "else:\n",
389 | " print(\"We fail to reject the null hypothesis. Set equal_var to True.\")"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": null,
395 | "metadata": {},
396 | "outputs": [],
397 | "source": [
398 | "# Listing 6.26\n",
399 | "with ProgressBar():\n",
400 | " pas = passenger_vehicles['Vehicle Age'].values.compute()\n",
401 | " com = commercial_vehicles['Vehicle Age'].values.compute()"
402 | ]
403 | },
404 | {
405 | "cell_type": "code",
406 | "execution_count": null,
407 | "metadata": {},
408 | "outputs": [],
409 | "source": [
410 | "# Listing 6.27\n",
411 | "stats.ttest_ind(pas, com, equal_var=False)"
412 | ]
413 | },
414 | {
415 | "cell_type": "markdown",
416 | "metadata": {},
417 | "source": [
418 | "### Section 6.4.1"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": null,
424 | "metadata": {},
425 | "outputs": [],
426 | "source": [
427 | "# Listing 6.28\n",
428 | "with ProgressBar():\n",
429 | " condition = ~nyc_data['monthYear'].isin(['201707','201708','201709','201710','201711','201712'])\n",
430 | " nyc_data_filtered = nyc_data[condition]\n",
431 | " citations_by_month = nyc_data_filtered.groupby(nyc_data_filtered.index)['Summons Number'].count()"
432 | ]
433 | },
434 | {
435 | "cell_type": "markdown",
436 | "metadata": {},
437 | "source": [
438 | "### Section 6.4.2"
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "execution_count": null,
444 | "metadata": {},
445 | "outputs": [],
446 | "source": [
447 | "# Listing 6.29\n",
448 | "with ProgressBar():\n",
449 | " three_month_SMA = citations_by_month.rolling(3).mean().compute()"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": null,
455 | "metadata": {},
456 | "outputs": [],
457 | "source": [
458 | "# Listing 6.30\n",
459 | "citations_by_month.rolling(3, center=True).mean().head()"
460 | ]
461 | }
462 | ],
463 | "metadata": {
464 | "kernelspec": {
465 | "display_name": "Python 3",
466 | "language": "python",
467 | "name": "python3"
468 | },
469 | "language_info": {
470 | "codemirror_mode": {
471 | "name": "ipython",
472 | "version": 3
473 | },
474 | "file_extension": ".py",
475 | "mimetype": "text/x-python",
476 | "name": "python",
477 | "nbconvert_exporter": "python",
478 | "pygments_lexer": "ipython3",
479 | "version": "3.6.8"
480 | }
481 | },
482 | "nbformat": 4,
483 | "nbformat_minor": 2
484 | }
485 |
--------------------------------------------------------------------------------
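
Note: Listings 6.12-6.23 build the Brown-Forsythe statistic by hand with Dask aggregations. As a cross-check (not from the book), SciPy ships the same test: levene with center='median' is the Brown-Forsythe variant, so running it on the materialized samples from Listing 6.26 should closely match the manual F statistic:

    import scipy.stats as stats

    # Brown-Forsythe = Levene's test centered on the median.
    statistic, p_value = stats.levene(pas, com, center='median')
    print(statistic, p_value)
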
/Chapter 5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Science with Python and Dask\n",
8 | "## Chapter 5: Cleaning and Transforming DataFrames"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "# Before beginning, set your working directory to where the data resides\n",
18 | "import os\n",
19 | "os.chdir('/Users/jesse/Documents')"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "### Intro Section"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "# Listing 5.1\n",
36 | "import dask.dataframe as dd\n",
37 | "from dask.diagnostics import ProgressBar\n",
38 | "import numpy as np\n",
39 | "\n",
40 | "dtypes = {\n",
41 | " 'Date First Observed': np.str,\n",
42 | " 'Days Parking In Effect ': np.str,\n",
43 | " 'Double Parking Violation': np.str,\n",
44 | " 'Feet From Curb': np.float32,\n",
45 | " 'From Hours In Effect': np.str,\n",
46 | " 'House Number': np.str,\n",
47 | " 'Hydrant Violation': np.str,\n",
48 | " 'Intersecting Street': np.str,\n",
49 | " 'Issue Date': np.str,\n",
50 | " 'Issuer Code': np.float32,\n",
51 | " 'Issuer Command': np.str,\n",
52 | " 'Issuer Precinct': np.float32,\n",
53 | " 'Issuer Squad': np.str,\n",
54 | " 'Issuing Agency': np.str,\n",
55 | " 'Law Section': np.float32,\n",
56 | " 'Meter Number': np.str,\n",
57 | " 'No Standing or Stopping Violation': np.str,\n",
58 | " 'Plate ID': np.str,\n",
59 | " 'Plate Type': np.str,\n",
60 | " 'Registration State': np.str,\n",
61 | " 'Street Code1': np.uint32,\n",
62 | " 'Street Code2': np.uint32,\n",
63 | " 'Street Code3': np.uint32,\n",
64 | " 'Street Name': np.str,\n",
65 | " 'Sub Division': np.str,\n",
66 | " 'Summons Number': np.uint32,\n",
67 | " 'Time First Observed': np.str,\n",
68 | " 'To Hours In Effect': np.str,\n",
69 | " 'Unregistered Vehicle?': np.str,\n",
70 | " 'Vehicle Body Type': np.str,\n",
71 | " 'Vehicle Color': np.str,\n",
72 | " 'Vehicle Expiration Date': np.str,\n",
73 | " 'Vehicle Make': np.str,\n",
74 | " 'Vehicle Year': np.float32,\n",
75 | " 'Violation Code': np.uint16,\n",
76 | " 'Violation County': np.str,\n",
77 | " 'Violation Description': np.str,\n",
78 | " 'Violation In Front Of Or Opposite': np.str,\n",
79 | " 'Violation Legal Code': np.str,\n",
80 | " 'Violation Location': np.str,\n",
81 | " 'Violation Post Code': np.str,\n",
82 | " 'Violation Precinct': np.float32,\n",
83 | " 'Violation Time': np.str\n",
84 | "}\n",
85 | "\n",
86 | "nyc_data_raw = dd.read_csv('nyc-parking-tickets/*.csv', dtype=dtypes, usecols=dtypes.keys())"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "### Section 5.1.1"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "# Listing 5.2\n",
103 | "with ProgressBar():\n",
104 | " display(nyc_data_raw['Plate ID'].head())"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "# Listing 5.3\n",
114 | "with ProgressBar():\n",
115 | " display(nyc_data_raw[['Plate ID', 'Registration State']].head())"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "# Listing 5.4\n",
125 | "columns_to_select = ['Plate ID', 'Registration State']\n",
126 | "\n",
127 | "with ProgressBar():\n",
128 | " display(nyc_data_raw[columns_to_select].head())"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### Section 5.1.2"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "# Listing 5.5\n",
145 | "with ProgressBar():\n",
146 | " display(nyc_data_raw.drop('Violation Code', axis=1).head())"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "# Listing 5.6\n",
156 | "violationColumnNames = list(filter(lambda columnName: 'Violation' in columnName, nyc_data_raw.columns))\n",
157 | "\n",
158 | "with ProgressBar():\n",
159 | " display(nyc_data_raw.drop(violationColumnNames, axis=1).head())"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "### Section 5.1.3"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "# Listing 5.7\n",
176 | "nyc_data_renamed = nyc_data_raw.rename(columns={'Plate ID':'License Plate'})\n",
177 | "nyc_data_renamed"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "### Section 5.1.4"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "# Listing 5.8\n",
194 | "with ProgressBar():\n",
195 | " display(nyc_data_raw.loc[56].head(1))"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "# Listing 5.9\n",
205 | "with ProgressBar():\n",
206 | " display(nyc_data_raw.loc[100:200].head(100))"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "# Listing 5.10\n",
216 | "with ProgressBar():\n",
217 | " some_rows = nyc_data_raw.loc[100:200].head(100)\n",
218 | "some_rows.drop(range(100, 200, 2))"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {},
224 | "source": [
225 | "### Section 5.2.1"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": null,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "# Listing 5.11\n",
235 | "missing_values = nyc_data_raw.isnull().sum()\n",
236 | "with ProgressBar():\n",
237 | " percent_missing = ((missing_values / nyc_data_raw.index.size) * 100).compute()\n",
238 | "percent_missing"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {},
244 | "source": [
245 | "### Section 5.2.2"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "# Listing 5.12\n",
255 | "columns_to_drop = list(percent_missing[percent_missing >= 50].index)\n",
256 | "nyc_data_clean_stage1 = nyc_data_raw.drop(columns_to_drop, axis=1)"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "### Section 5.2.3"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": null,
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "# Listing 5.13\n",
273 | "with ProgressBar():\n",
274 | " count_of_vehicle_colors = nyc_data_clean_stage1['Vehicle Color'].value_counts().compute()\n",
275 | "most_common_color = count_of_vehicle_colors.sort_values(ascending=False).index[0]\n",
276 | "\n",
277 | "# Fill missing vehicle color with the most common color\n",
278 | "nyc_data_clean_stage2 = nyc_data_clean_stage1.fillna({'Vehicle Color': most_common_color})"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {},
284 | "source": [
285 | "### Section 5.2.4"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "# Listing 5.14\n",
295 | "rows_to_drop = list(percent_missing[(percent_missing > 0) & (percent_missing < 5)].index)\n",
296 | "nyc_data_clean_stage3 = nyc_data_clean_stage2.dropna(subset=rows_to_drop)"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "metadata": {},
302 | "source": [
303 | "### Section 5.2.5"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": null,
309 | "metadata": {},
310 | "outputs": [],
311 | "source": [
312 | "# Listing 5.15\n",
313 | "remaining_columns_to_clean = list(percent_missing[(percent_missing >= 5) & (percent_missing < 50)].index)\n",
314 | "nyc_data_raw.dtypes[remaining_columns_to_clean]"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": null,
320 | "metadata": {},
321 | "outputs": [],
322 | "source": [
323 | "# Listing 5.16\n",
324 | "unknown_default_dict = dict(map(lambda columnName: (columnName, 'Unknown'), remaining_columns_to_clean))"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "# Listing 5.17\n",
334 | "nyc_data_clean_stage4 = nyc_data_clean_stage3.fillna(unknown_default_dict)"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": null,
340 | "metadata": {},
341 | "outputs": [],
342 | "source": [
343 | "# Listing 5.18\n",
344 | "with ProgressBar():\n",
345 | " print(nyc_data_clean_stage4.isnull().sum().compute())\n",
346 | " nyc_data_clean_stage4.persist()"
347 | ]
348 | },
349 | {
350 | "cell_type": "markdown",
351 | "metadata": {},
352 | "source": [
353 | "### Section 5.3"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {},
360 | "outputs": [],
361 | "source": [
362 | "# Listing 5.19\n",
363 | "with ProgressBar():\n",
364 | " license_plate_types = nyc_data_clean_stage4['Plate Type'].value_counts().compute()\n",
365 | "license_plate_types"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "metadata": {},
372 | "outputs": [],
373 | "source": [
374 | "# Listing 5.20\n",
375 | "condition = nyc_data_clean_stage4['Plate Type'].isin(['PAS', 'COM'])\n",
376 | "plate_type_masked = nyc_data_clean_stage4['Plate Type'].where(condition, 'Other')\n",
377 | "nyc_data_recode_stage1 = nyc_data_clean_stage4.drop('Plate Type', axis=1)\n",
378 | "nyc_data_recode_stage2 = nyc_data_recode_stage1.assign(PlateType=plate_type_masked)\n",
379 | "nyc_data_recode_stage3 = nyc_data_recode_stage2.rename(columns={'PlateType':'Plate Type'})"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": null,
385 | "metadata": {},
386 | "outputs": [],
387 | "source": [
388 | "# Listing 5.21\n",
389 | "with ProgressBar():\n",
390 | " display(nyc_data_recode_stage3['Plate Type'].value_counts().compute())"
391 | ]
392 | },
393 | {
394 | "cell_type": "code",
395 | "execution_count": null,
396 | "metadata": {},
397 | "outputs": [],
398 | "source": [
399 | "# Listing 5.22\n",
400 | "single_color = list(count_of_vehicle_colors[count_of_vehicle_colors == 1].index)\n",
401 | "condition = nyc_data_clean_stage4['Vehicle Color'].isin(single_color)\n",
402 | "vehicle_color_masked = nyc_data_clean_stage4['Vehicle Color'].mask(condition, 'Other')\n",
403 | "nyc_data_recode_stage4 = nyc_data_recode_stage3.drop('Vehicle Color', axis=1)\n",
404 | "nyc_data_recode_stage5 = nyc_data_recode_stage4.assign(VehicleColor=vehicle_color_masked)\n",
405 | "nyc_data_recode_stage6 = nyc_data_recode_stage5.rename(columns={'VehicleColor':'Vehicle Color'})"
406 | ]
407 | },
408 | {
409 | "cell_type": "markdown",
410 | "metadata": {},
411 | "source": [
412 | "### Section 5.4"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": null,
418 | "metadata": {},
419 | "outputs": [],
420 | "source": [
421 | "# Listing 5.23\n",
422 | "from datetime import datetime\n",
423 | "issue_date_parsed = nyc_data_recode_stage6['Issue Date'].apply(lambda x: datetime.strptime(x, \"%m/%d/%Y\"), meta=datetime)\n",
424 | "nyc_data_derived_stage1 = nyc_data_recode_stage6.drop('Issue Date', axis=1)\n",
425 | "nyc_data_derived_stage2 = nyc_data_derived_stage1.assign(IssueDate=issue_date_parsed)\n",
426 | "nyc_data_derived_stage3 = nyc_data_derived_stage2.rename(columns={'IssueDate':'Issue Date'})"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": null,
432 | "metadata": {},
433 | "outputs": [],
434 | "source": [
435 | "# Listing 5.24\n",
436 | "with ProgressBar():\n",
437 | " display(nyc_data_derived_stage3['Issue Date'].head())"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": null,
443 | "metadata": {},
444 | "outputs": [],
445 | "source": [
446 | "# Listing 5.25\n",
447 | "issue_date_month_year = nyc_data_derived_stage3['Issue Date'].apply(lambda dt: dt.strftime(\"%Y%m\"), meta=int)\n",
448 | "nyc_data_derived_stage4 = nyc_data_derived_stage3.assign(IssueMonthYear=issue_date_month_year)\n",
449 | "nyc_data_derived_stage5 = nyc_data_derived_stage4.rename(columns={'IssueMonthYear':'Citation Issued Month Year'})"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": null,
455 | "metadata": {},
456 | "outputs": [],
457 | "source": [
458 | "# Listing 5.26\n",
459 | "with ProgressBar():\n",
460 | " display(nyc_data_derived_stage5['Citation Issued Month Year'].head())"
461 | ]
462 | },
463 | {
464 | "cell_type": "markdown",
465 | "metadata": {},
466 | "source": [
467 | "### Section 5.5.1"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": null,
473 | "metadata": {},
474 | "outputs": [],
475 | "source": [
476 | "# Listing 5.27\n",
477 | "months = ['201310','201410','201510','201610','201710']\n",
478 | "condition = nyc_data_derived_stage5['Citation Issued Month Year'].isin(months)\n",
479 | "october_citations = nyc_data_derived_stage5[condition]\n",
480 | "\n",
481 | "with ProgressBar():\n",
482 | " display(october_citations.head())"
483 | ]
484 | },
485 | {
486 | "cell_type": "code",
487 | "execution_count": null,
488 | "metadata": {},
489 | "outputs": [],
490 | "source": [
491 | "# Listing 5.28\n",
492 | "bound_date = '2016-4-25'\n",
493 | "condition = nyc_data_derived_stage5['Issue Date'] > bound_date\n",
494 | "citations_after_bound = nyc_data_derived_stage5[condition]\n",
495 | "\n",
496 | "with ProgressBar():\n",
497 | " display(citations_after_bound.head())"
498 | ]
499 | },
500 | {
501 | "cell_type": "markdown",
502 | "metadata": {},
503 | "source": [
504 | "### Section 5.5.1"
505 | ]
506 | },
507 | {
508 | "cell_type": "code",
509 | "execution_count": null,
510 | "metadata": {},
511 | "outputs": [],
512 | "source": [
513 | "# Listing 5.29\n",
514 | "with ProgressBar():\n",
515 | " condition = (nyc_data_derived_stage5['Issue Date'] > '2014-01-01') & (nyc_data_derived_stage5['Issue Date'] <= '2017-12-31')\n",
516 | " nyc_data_filtered = nyc_data_derived_stage5[condition]\n",
517 | " nyc_data_new_index = nyc_data_filtered.set_index('Citation Issued Month Year')"
518 | ]
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": null,
523 | "metadata": {},
524 | "outputs": [],
525 | "source": [
526 | "# Listing 5.30\n",
527 | "years = ['2014', '2015', '2016', '2017']\n",
528 | "months = ['01','02','03','04','05','06','07','08','09','10','11','12']\n",
529 | "divisions = [year + month for year in years for month in months]\n",
530 | "\n",
531 | "with ProgressBar():\n",
532 | " nyc_data_new_index.repartition(divisions=divisions).to_parquet('nyc_data_date_index', compression='snappy')\n",
533 | " \n",
534 | "nyc_data_new_index = dd.read_parquet('nyc_data_date_index')"
535 | ]
536 | },
537 | {
538 | "cell_type": "markdown",
539 | "metadata": {},
540 | "source": [
541 | "### Section 5.6.1"
542 | ]
543 | },
544 | {
545 | "cell_type": "code",
546 | "execution_count": null,
547 | "metadata": {},
548 | "outputs": [],
549 | "source": [
550 | "# Listing 5.31\n",
551 | "import pandas as pd\n",
552 | "nyc_temps = pd.read_csv('nyc-temp-data.csv')\n",
553 | "nyc_temps_indexed = nyc_temps.set_index(nyc_temps.monthYear.astype(str))\n",
554 | "\n",
555 | "nyc_data_with_temps = nyc_data_new_index.join(nyc_temps_indexed, how='inner')\n",
556 | "\n",
557 | "with ProgressBar():\n",
558 | " display(nyc_data_with_temps.head(15))"
559 | ]
560 | },
561 | {
562 | "cell_type": "markdown",
563 | "metadata": {},
564 | "source": [
565 | "### Section 5.6.2"
566 | ]
567 | },
568 | {
569 | "cell_type": "code",
570 | "execution_count": null,
571 | "metadata": {},
572 | "outputs": [],
573 | "source": [
574 | "# Listing 5.32\n",
575 | "fy16 = dd.read_csv('nyc-parking-tickets/Parking_Violations_Issued_-_Fiscal_Year_2016.csv', dtype=dtypes, usecols=dtypes.keys())\n",
576 | "fy17 = dd.read_csv('nyc-parking-tickets/Parking_Violations_Issued_-_Fiscal_Year_2017.csv', dtype=dtypes, usecols=dtypes.keys())\n",
577 | "\n",
578 | "fy1617 = fy16.append(fy17)\n",
579 | "\n",
580 | "with ProgressBar():\n",
581 | " print(fy16['Summons Number'].count().compute())\n",
582 | "\n",
583 | "with ProgressBar():\n",
584 | " print(fy17['Summons Number'].count().compute())\n",
585 | "\n",
586 | "with ProgressBar():\n",
587 | " print(fy1617['Summons Number'].count().compute())\n"
588 | ]
589 | },
590 | {
591 | "cell_type": "markdown",
592 | "metadata": {},
593 | "source": [
594 | "### Section 5.7.1"
595 | ]
596 | },
597 | {
598 | "cell_type": "code",
599 | "execution_count": null,
600 | "metadata": {},
601 | "outputs": [],
602 | "source": [
603 | "# Listing 5.33\n",
604 | "with ProgressBar():\n",
605 | " if not os.path.exists('nyc-final-csv'):\n",
606 | " os.makedirs('nyc-final-csv') \n",
607 | " nyc_data_with_temps.repartition(npartitions=1).to_csv('nyc-final-csv/part*.csv')"
608 | ]
609 | },
610 | {
611 | "cell_type": "code",
612 | "execution_count": null,
613 | "metadata": {},
614 | "outputs": [],
615 | "source": [
616 | "# Listing 5.33\n",
617 | "with ProgressBar():\n",
618 | " if not os.path.exists('nyc-final-csv-compressed'):\n",
619 | " os.makedirs('nyc-final-csv-compressed')\n",
620 | " nyc_data_with_temps.to_csv(\n",
621 | " filename='nyc-final-csv-compressed/*', \n",
622 | " compression='gzip', \n",
623 | " sep='|', \n",
624 | " na_rep='NULL', \n",
625 | " header=False, \n",
626 | " index=False)"
627 | ]
628 | },
629 | {
630 | "cell_type": "markdown",
631 | "metadata": {},
632 | "source": [
633 | "### Listing 5.7.2"
634 | ]
635 | },
636 | {
637 | "cell_type": "code",
638 | "execution_count": null,
639 | "metadata": {},
640 | "outputs": [],
641 | "source": [
642 | "# Listing 5.35\n",
643 | "with ProgressBar():\n",
644 | " nyc_data_with_temps.to_parquet('nyc_final', compression='snappy')"
645 | ]
646 | }
647 | ],
648 | "metadata": {
649 | "kernelspec": {
650 | "display_name": "Python 3",
651 | "language": "python",
652 | "name": "python3"
653 | },
654 | "language_info": {
655 | "codemirror_mode": {
656 | "name": "ipython",
657 | "version": 3
658 | },
659 | "file_extension": ".py",
660 | "mimetype": "text/x-python",
661 | "name": "python",
662 | "nbconvert_exporter": "python",
663 | "pygments_lexer": "ipython3",
664 | "version": "3.6.8"
665 | }
666 | },
667 | "nbformat": 4,
668 | "nbformat_minor": 2
669 | }
670 |
--------------------------------------------------------------------------------
/Chapter 10.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Science with Python and Dask\n",
8 | "## Chapter 10: Machine Learning with Dask-ML"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "### Section 10.1"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "# Listing 10.1\n",
25 | "import dask.bag as bag\n",
26 | "import os\n",
27 | "from dask.diagnostics import ProgressBar\n",
28 | "\n",
29 | "os.chdir('/Users/jesse/Documents')\n",
30 | "raw_data = bag.read_text('foods.txt')\n",
31 | "\n",
32 | "def get_next_part(file, start_index, span_index=0, blocksize=1024):\n",
33 | " file.seek(start_index)\n",
34 | " buffer = file.read(blocksize + span_index).decode('cp1252')\n",
35 | " delimiter_position = buffer.find('\\n\\n')\n",
36 | " if delimiter_position == -1:\n",
37 | " return get_next_part(file, start_index, span_index + blocksize)\n",
38 | " else:\n",
39 | " file.seek(start_index)\n",
40 | " return start_index, delimiter_position\n",
41 | " \n",
42 | "def get_item(filename, start_index, delimiter_position, encoding='cp1252'):\n",
43 | " with open(filename, 'rb') as file_handle:\n",
44 | " file_handle.seek(start_index)\n",
45 | " text = file_handle.read(delimiter_position).decode(encoding)\n",
46 | " elements = text.strip().split('\\n')\n",
47 | " key_value_pairs = [(element.split(': ')[0], element.split(': ')[1]) \n",
48 | " if len(element.split(': ')) > 1 \n",
49 | " else ('unknown', element) \n",
50 | " for element in elements]\n",
51 | " return dict(key_value_pairs)\n",
52 | " \n",
53 | "with open('foods.txt', 'rb') as file_handle:\n",
54 | " size = file_handle.seek(0,2) - 1\n",
55 | " more_data = True\n",
56 | " output = []\n",
57 | " current_position = next_position = 0\n",
58 | " while more_data:\n",
59 | " if current_position >= size:\n",
60 | " more_data = False\n",
61 | " else:\n",
62 | " current_position, next_position = get_next_part(file_handle, current_position, 0)\n",
63 | " output.append((current_position, next_position))\n",
64 | " current_position = current_position + next_position + 2\n",
65 | " \n",
66 | "reviews = bag.from_sequence(output).map(lambda x: get_item('foods.txt', x[0], x[1]))\n",
67 | "\n",
68 | "def tag_positive_negative_by_score(element):\n",
69 | " if float(element['review/score']) > 3:\n",
70 | " element['review/sentiment'] = 'positive'\n",
71 | " else:\n",
72 | " element['review/sentiment'] = 'negative'\n",
73 | " return element\n",
74 | "\n",
75 | "tagged_reviews = reviews.map(tag_positive_negative_by_score)"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "### Section 10.1.1"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 7,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "# Listing 10.2\n",
92 | "from nltk.corpus import stopwords\n",
93 | "from nltk.tokenize import RegexpTokenizer\n",
94 | "from functools import partial\n",
95 | "\n",
96 | "tokenizer = RegexpTokenizer(r'\\w+')\n",
97 | "\n",
98 | "def extract_reviews(element):\n",
99 | " element['review/tokens'] = element['review/text'].lower()\n",
100 | " return element\n",
101 | "\n",
102 | "def tokenize_reviews(element):\n",
103 | " element['review/tokens'] = tokenizer.tokenize(element['review/tokens'])\n",
104 | " return element\n",
105 | "\n",
106 | "def filter_stopword(word, stopwords):\n",
107 | " return word not in stopwords\n",
108 | "\n",
109 | "def filter_stopwords(element, stopwords):\n",
110 | " element['review/tokens'] = list(filter(partial(filter_stopword, stopwords=stopwords), element['review/tokens']))\n",
111 | " return element\n",
112 | "\n",
113 | "stopword_set = set(stopwords.words('english'))\n",
114 | "more_stopwords = {'br', 'amazon', 'com', 'http', 'www', 'href', 'gp'}\n",
115 | "all_stopwords = stopword_set.union(more_stopwords)\n",
116 | "\n",
117 | "review_extracted_text = tagged_reviews.map(extract_reviews)\n",
118 | "review_tokens = review_extracted_text.map(tokenize_reviews)\n",
119 | "review_text_clean = review_tokens.map(partial(filter_stopwords, stopwords=all_stopwords))"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 8,
125 | "metadata": {},
126 | "outputs": [
127 | {
128 | "name": "stdout",
129 | "output_type": "stream",
130 | "text": [
131 | "[########################################] | 100% Completed | 34.8s\n"
132 | ]
133 | },
134 | {
135 | "data": {
136 | "text/plain": [
137 | "114290"
138 | ]
139 | },
140 | "execution_count": 8,
141 | "metadata": {},
142 | "output_type": "execute_result"
143 | }
144 | ],
145 | "source": [
146 | "# Listing 10.3\n",
147 | "def extract_tokens(element):\n",
148 | " return element['review/tokens']\n",
149 | "\n",
150 | "extracted_tokens = review_text_clean.map(extract_tokens)\n",
151 | "unique_tokens = extracted_tokens.flatten().distinct()\n",
152 | "\n",
153 | "with ProgressBar():\n",
154 | " number_of_tokens = unique_tokens.count().compute()\n",
155 | "number_of_tokens"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 9,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "name": "stdout",
165 | "output_type": "stream",
166 | "text": [
167 | "[########################################] | 100% Completed | 49.4s\n"
168 | ]
169 | }
170 | ],
171 | "source": [
172 | "# Listing 10.4\n",
173 | "def count(accumulator, element):\n",
174 | " return accumulator + 1\n",
175 | "\n",
176 | "def combine(total_1, total_2):\n",
177 | " return total_1 + total_2\n",
178 | "\n",
179 | "with ProgressBar():\n",
180 | " token_counts = extracted_tokens.flatten().foldby(lambda x: x, count, 0, combine, 0).compute()\n",
181 | " \n",
182 | "top_tokens = sorted(token_counts, key=lambda x: x[1], reverse=True)\n",
183 | "top_100_tokens = list(map(lambda x: x[0], top_tokens[:100]))"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 11,
189 | "metadata": {
190 | "scrolled": false
191 | },
192 | "outputs": [
193 | {
194 | "data": {
195 | "text/plain": [
196 | "({'target': 1,\n",
197 | " 'features': array([1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
198 | " 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,\n",
199 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
200 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,\n",
201 | " 0, 0, 0, 0, 0, 0, 0, 0])},\n",
202 | " {'target': 0,\n",
203 | " 'features': array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
204 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
205 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
206 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
207 | " 0, 0, 0, 0, 0, 0, 0, 0])},\n",
208 | " {'target': 1,\n",
209 | " 'features': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
210 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,\n",
211 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,\n",
212 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
213 | " 0, 0, 0, 0, 0, 0, 0, 0])},\n",
214 | " {'target': 0,\n",
215 | " 'features': array([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
216 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,\n",
217 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,\n",
218 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
219 | " 0, 0, 0, 0, 0, 0, 0, 0])},\n",
220 | " {'target': 1,\n",
221 | " 'features': array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,\n",
222 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
223 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
224 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
225 | " 0, 0, 0, 0, 0, 0, 0, 0])})"
226 | ]
227 | },
228 | "execution_count": 11,
229 | "metadata": {},
230 | "output_type": "execute_result"
231 | }
232 | ],
233 | "source": [
234 | "# Listing 10.5\n",
235 | "import numpy as np\n",
236 | "def vectorize_tokens(element):\n",
237 | " vectorized_tokens = np.where(np.isin(top_100_tokens, element['review/tokens']), 1, 0)\n",
238 | " element['review/token_vector'] = vectorized_tokens\n",
239 | " return element\n",
240 | "\n",
241 | "def prep_model_data(element):\n",
242 | " return {'target': 1 if element['review/sentiment'] == 'positive' else 0,\n",
243 | " 'features': element['review/token_vector']}\n",
244 | "\n",
245 | "model_data = review_text_clean.map(vectorize_tokens).map(prep_model_data)\n",
246 | "\n",
247 | "model_data.take(5)"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 12,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "# Listing 10.6\n",
257 | "from dask import array as dask_array\n",
258 | "def stacker(partition):\n",
259 | " return dask_array.concatenate([element for element in partition])\n",
260 | "\n",
261 | "with ProgressBar():\n",
262 | " feature_arrays = model_data.pluck('features').map(lambda x: dask_array.from_array(x, 1000).reshape(1,-1)).reduction(perpartition=stacker, aggregate=stacker)\n",
263 | " feature_array = feature_arrays.compute()\n",
264 | "feature_array"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 14,
270 | "metadata": {},
271 | "outputs": [
272 | {
273 | "name": "stdout",
274 | "output_type": "stream",
275 | "text": [
276 | "[########################################] | 100% Completed | 5min 32.8s\n"
277 | ]
278 | }
279 | ],
280 | "source": [
281 | "# Listing 10.7\n",
282 | "with ProgressBar():\n",
283 | " feature_array.rechunk(5000).to_zarr('sentiment_feature_array.zarr')\n",
284 | " feature_array = dask_array.from_zarr('sentiment_feature_array.zarr')\n",
285 | " \n",
286 | "with ProgressBar():\n",
287 | " target_arrays = model_data.pluck('target').map(lambda x: dask_array.from_array(x, 1000).reshape(-1,1)).reduction(perpartition=stacker, aggregate=stacker)\n",
288 | " target_arrays.compute().rechunk(5000).to_zarr('sentiment_target_array.zarr')\n",
289 | " target_array = dask_array.from_zarr('sentiment_target_array.zarr')"
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "metadata": {},
295 | "source": [
296 | "### Section 10.1.2"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 16,
302 | "metadata": {},
303 | "outputs": [
304 | {
305 | "name": "stdout",
306 | "output_type": "stream",
307 | "text": [
308 | "[########################################] | 100% Completed | 0.9s\n",
309 | "[#################### ] | 50% Completed | 1.6s"
310 | ]
311 | },
312 | {
313 | "name": "stderr",
314 | "output_type": "stream",
315 | "text": [
316 | "/anaconda3/lib/python3.6/site-packages/dask_glm/utils.py:52: RuntimeWarning: overflow encountered in exp\n",
317 | " return np.exp(A)\n"
318 | ]
319 | },
320 | {
321 | "name": "stdout",
322 | "output_type": "stream",
323 | "text": [
324 | "[########################################] | 100% Completed | 3.6s\n",
325 | "[########################################] | 100% Completed | 3.8s\n",
326 | "[########################################] | 100% Completed | 3.9s\n",
327 | "[########################################] | 100% Completed | 3.8s\n",
328 | "[########################################] | 100% Completed | 3.6s\n",
329 | "[########################################] | 100% Completed | 3.9s\n",
330 | "[########################################] | 100% Completed | 3.7s\n",
331 | "[########################################] | 100% Completed | 3.5s\n",
332 | "[########################################] | 100% Completed | 3.7s\n",
333 | "[########################################] | 100% Completed | 4.0s\n",
334 | "[########################################] | 100% Completed | 4.0s\n",
335 | "[########################################] | 100% Completed | 4.0s\n",
336 | "[########################################] | 100% Completed | 3.8s\n",
337 | "[########################################] | 100% Completed | 3.9s\n",
338 | "[########################################] | 100% Completed | 3.7s\n",
339 | "[########################################] | 100% Completed | 3.5s\n",
340 | "[########################################] | 100% Completed | 3.7s\n",
341 | "[########################################] | 100% Completed | 3.8s\n",
342 | "[########################################] | 100% Completed | 3.6s\n",
343 | "[########################################] | 100% Completed | 3.6s\n",
344 | "[########################################] | 100% Completed | 3.6s\n",
345 | "[########################################] | 100% Completed | 3.7s\n",
346 | "[########################################] | 100% Completed | 3.6s\n",
347 | "[########################################] | 100% Completed | 3.4s\n",
348 | "[########################################] | 100% Completed | 3.5s\n",
349 | "[########################################] | 100% Completed | 3.7s\n",
350 | "[########################################] | 100% Completed | 3.7s\n",
351 | "[########################################] | 100% Completed | 3.6s\n",
352 | "[########################################] | 100% Completed | 3.6s\n",
353 | "[########################################] | 100% Completed | 3.7s\n",
354 | "[########################################] | 100% Completed | 3.8s\n",
355 | "[########################################] | 100% Completed | 3.9s\n",
356 | "[########################################] | 100% Completed | 3.9s\n",
357 | "[########################################] | 100% Completed | 4.0s\n",
358 | "[########################################] | 100% Completed | 4.0s\n",
359 | "[########################################] | 100% Completed | 3.7s\n",
360 | "[########################################] | 100% Completed | 3.5s\n",
361 | "[########################################] | 100% Completed | 3.7s\n",
362 | "[########################################] | 100% Completed | 3.9s\n",
363 | "[########################################] | 100% Completed | 3.9s\n",
364 | "[########################################] | 100% Completed | 3.8s\n",
365 | "[########################################] | 100% Completed | 3.6s\n",
366 | "[########################################] | 100% Completed | 3.6s\n",
367 | "[########################################] | 100% Completed | 3.6s\n",
368 | "[########################################] | 100% Completed | 3.9s\n",
369 | "[########################################] | 100% Completed | 3.5s\n",
370 | "[########################################] | 100% Completed | 3.5s\n",
371 | "[########################################] | 100% Completed | 3.4s\n",
372 | "[########################################] | 100% Completed | 3.5s\n",
373 | "[########################################] | 100% Completed | 3.6s\n",
374 | "[########################################] | 100% Completed | 3.6s\n",
375 | "[########################################] | 100% Completed | 3.6s\n",
376 | "[########################################] | 100% Completed | 3.6s\n",
377 | "[########################################] | 100% Completed | 3.6s\n",
378 | "[########################################] | 100% Completed | 3.7s\n",
379 | "[########################################] | 100% Completed | 3.7s\n",
380 | "[########################################] | 100% Completed | 3.7s\n",
381 | "[########################################] | 100% Completed | 3.8s\n",
382 | "[########################################] | 100% Completed | 3.6s\n",
383 | "[########################################] | 100% Completed | 3.8s\n",
384 | "[########################################] | 100% Completed | 3.9s\n",
385 | "[########################################] | 100% Completed | 3.9s\n",
386 | "[########################################] | 100% Completed | 4.1s\n",
387 | "[########################################] | 100% Completed | 3.9s\n",
388 | "[########################################] | 100% Completed | 3.6s\n",
389 | "[########################################] | 100% Completed | 3.8s\n",
390 | "[########################################] | 100% Completed | 3.9s\n",
391 | "[########################################] | 100% Completed | 4.1s\n",
392 | "[########################################] | 100% Completed | 3.8s\n",
393 | "[########################################] | 100% Completed | 3.6s\n",
394 | "[########################################] | 100% Completed | 3.8s\n",
395 | "[########################################] | 100% Completed | 3.7s\n",
396 | "[########################################] | 100% Completed | 3.5s\n",
397 | "[########################################] | 100% Completed | 3.7s\n",
398 | "[########################################] | 100% Completed | 3.7s\n",
399 | "[########################################] | 100% Completed | 3.6s\n",
400 | "[########################################] | 100% Completed | 3.8s\n",
401 | "[########################################] | 100% Completed | 4.0s\n",
402 | "[########################################] | 100% Completed | 3.9s\n",
403 | "[########################################] | 100% Completed | 3.9s\n",
404 | "[########################################] | 100% Completed | 3.8s\n",
405 | "[########################################] | 100% Completed | 4.0s\n",
406 | "[########################################] | 100% Completed | 4.2s\n",
407 | "[########################################] | 100% Completed | 3.7s\n",
408 | "[########################################] | 100% Completed | 3.6s\n",
409 | "[########################################] | 100% Completed | 3.8s\n",
410 | "[########################################] | 100% Completed | 4.0s\n",
411 | "[########################################] | 100% Completed | 3.8s\n",
412 | "[########################################] | 100% Completed | 3.8s\n",
413 | "[########################################] | 100% Completed | 3.6s\n",
414 | "[########################################] | 100% Completed | 3.4s\n",
415 | "[########################################] | 100% Completed | 3.4s\n",
416 | "[########################################] | 100% Completed | 3.4s\n",
417 | "[########################################] | 100% Completed | 3.6s\n",
418 | "[########################################] | 100% Completed | 3.5s\n",
419 | "[########################################] | 100% Completed | 3.8s\n",
420 | "[########################################] | 100% Completed | 5.7s\n",
421 | "[########################################] | 100% Completed | 4.0s\n",
422 | "[########################################] | 100% Completed | 4.5s\n",
423 | "[########################################] | 100% Completed | 5.1s\n"
424 | ]
425 | }
426 | ],
427 | "source": [
428 | "# Listing 10.8\n",
429 | "from dask_ml.linear_model import LogisticRegression\n",
430 | "from dask_ml.model_selection import train_test_split\n",
431 | "\n",
432 | "X = feature_array\n",
433 | "y = target_array.flatten()\n",
434 | "\n",
435 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n",
436 | "\n",
437 | "lr = LogisticRegression()\n",
438 | "\n",
439 | "with ProgressBar():\n",
440 | " lr.fit(X_train, y_train)"
441 | ]
442 | },
443 | {
444 | "cell_type": "markdown",
445 | "metadata": {},
446 | "source": [
447 | "### Section 10.2.1"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": 17,
453 | "metadata": {},
454 | "outputs": [
455 | {
456 | "data": {
457 | "text/plain": [
458 | "0.79629173556626676"
459 | ]
460 | },
461 | "execution_count": 17,
462 | "metadata": {},
463 | "output_type": "execute_result"
464 | }
465 | ],
466 | "source": [
467 | "# Listing 10.9\n",
468 | "lr.score(X_test, y_test).compute()"
469 | ]
470 | },
471 | {
472 | "cell_type": "markdown",
473 | "metadata": {},
474 | "source": [
475 | "### Section 10.2.2"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": 18,
481 | "metadata": {},
482 | "outputs": [
483 | {
484 | "name": "stdout",
485 | "output_type": "stream",
486 | "text": [
487 | "[########################################] | 100% Completed | 2.1s\n"
488 | ]
489 | }
490 | ],
491 | "source": [
492 | "# Listing 10.10\n",
493 | "from sklearn.naive_bayes import BernoulliNB\n",
494 | "from dask_ml.wrappers import Incremental\n",
495 | "\n",
496 | "nb = BernoulliNB()\n",
497 | "\n",
498 | "parallel_nb = Incremental(nb)\n",
499 | "\n",
500 | "with ProgressBar():\n",
501 | " parallel_nb.fit(X_train, y_train, classes=[0,1])"
502 | ]
503 | },
504 | {
505 | "cell_type": "code",
506 | "execution_count": 19,
507 | "metadata": {},
508 | "outputs": [
509 | {
510 | "data": {
511 | "text/plain": [
512 | "0.78886817014389754"
513 | ]
514 | },
515 | "execution_count": 19,
516 | "metadata": {},
517 | "output_type": "execute_result"
518 | }
519 | ],
520 | "source": [
521 | "# Listing 10.11\n",
522 | "parallel_nb.score(X_test, y_test)"
523 | ]
524 | },
525 | {
526 | "cell_type": "markdown",
527 | "metadata": {},
528 | "source": [
529 | "### Section 10.2.3"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": 20,
535 | "metadata": {},
536 | "outputs": [
537 | {
538 | "name": "stdout",
539 | "output_type": "stream",
540 | "text": [
541 | "[########################################] | 100% Completed | 23min 24.1s\n"
542 | ]
543 | }
544 | ],
545 | "source": [
546 | "# Listing 10.12\n",
547 | "from dask_ml.model_selection import GridSearchCV\n",
548 | "\n",
549 | "parameters = {'penalty': ['l1', 'l2'], 'C': [0.5, 1, 2]}\n",
550 | "\n",
551 | "lr = LogisticRegression()\n",
552 | "tuned_lr = GridSearchCV(lr, parameters)\n",
553 | "\n",
554 | "with ProgressBar():\n",
555 | " tuned_lr.fit(X_train, y_train) "
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "execution_count": 21,
561 | "metadata": {
562 | "scrolled": false
563 | },
564 | "outputs": [
565 | {
566 | "name": "stderr",
567 | "output_type": "stream",
568 | "text": [
569 | "/anaconda3/lib/python3.6/site-packages/dask_ml/model_selection/utils.py:121: FutureWarning: You are accessing a training score ('split0_train_score'), which will not be available by default any more in sklearn 0.21. If you need training scores, please set return_train_score=True\n",
570 | " warnings.warn(*warn_args, **warn_kwargs)\n",
571 | "/anaconda3/lib/python3.6/site-packages/dask_ml/model_selection/utils.py:121: FutureWarning: You are accessing a training score ('split1_train_score'), which will not be available by default any more in sklearn 0.21. If you need training scores, please set return_train_score=True\n",
572 | " warnings.warn(*warn_args, **warn_kwargs)\n",
573 | "/anaconda3/lib/python3.6/site-packages/dask_ml/model_selection/utils.py:121: FutureWarning: You are accessing a training score ('split2_train_score'), which will not be available by default any more in sklearn 0.21. If you need training scores, please set return_train_score=True\n",
574 | " warnings.warn(*warn_args, **warn_kwargs)\n",
575 | "/anaconda3/lib/python3.6/site-packages/dask_ml/model_selection/utils.py:121: FutureWarning: You are accessing a training score ('mean_train_score'), which will not be available by default any more in sklearn 0.21. If you need training scores, please set return_train_score=True\n",
576 | " warnings.warn(*warn_args, **warn_kwargs)\n",
577 | "/anaconda3/lib/python3.6/site-packages/dask_ml/model_selection/utils.py:121: FutureWarning: You are accessing a training score ('std_train_score'), which will not be available by default any more in sklearn 0.21. If you need training scores, please set return_train_score=True\n",
578 | " warnings.warn(*warn_args, **warn_kwargs)\n"
579 | ]
580 | },
581 | {
582 | "data": {
583 | "text/html": [
584 | "\n",
585 | "\n",
598 | "
\n",
599 | " \n",
600 | " \n",
601 | " | \n",
602 | " params | \n",
603 | " mean_fit_time | \n",
604 | " std_fit_time | \n",
605 | " mean_score_time | \n",
606 | " std_score_time | \n",
607 | " split0_test_score | \n",
608 | " split1_test_score | \n",
609 | " split2_test_score | \n",
610 | " mean_test_score | \n",
611 | " std_test_score | \n",
612 | " rank_test_score | \n",
613 | " split0_train_score | \n",
614 | " split1_train_score | \n",
615 | " split2_train_score | \n",
616 | " mean_train_score | \n",
617 | " std_train_score | \n",
618 | " param_C | \n",
619 | " param_penalty | \n",
620 | "
\n",
621 | " \n",
622 | " \n",
623 | " \n",
624 | " | 0 | \n",
625 | " {'C': 0.5, 'penalty': 'l1'} | \n",
626 | " 1308.978919 | \n",
627 | " 11.548624 | \n",
628 | " 0.347088 | \n",
629 | " 0.044540 | \n",
630 | " 0.790291 | \n",
631 | " 0.793938 | \n",
632 | " 0.797087 | \n",
633 | " 0.793772 | \n",
634 | " 0.002777 | \n",
635 | " 4 | \n",
636 | " 0.795671 | \n",
637 | " 0.794152 | \n",
638 | " 0.792604 | \n",
639 | " 0.794142 | \n",
640 | " 0.001252 | \n",
641 | " 0.5 | \n",
642 | " l1 | \n",
643 | "
\n",
644 | " \n",
645 | " | 1 | \n",
646 | " {'C': 0.5, 'penalty': 'l2'} | \n",
647 | " 143.865403 | \n",
648 | " 2.276777 | \n",
649 | " 0.626723 | \n",
650 | " 0.145728 | \n",
651 | " 0.790801 | \n",
652 | " 0.793715 | \n",
653 | " 0.796987 | \n",
654 | " 0.793834 | \n",
655 | " 0.002527 | \n",
656 | " 1 | \n",
657 | " 0.796081 | \n",
658 | " 0.794008 | \n",
659 | " 0.792264 | \n",
660 | " 0.794118 | \n",
661 | " 0.001560 | \n",
662 | " 0.5 | \n",
663 | " l2 | \n",
664 | "
\n",
665 | " \n",
666 | " | 2 | \n",
667 | " {'C': 1, 'penalty': 'l1'} | \n",
668 | " 1211.649146 | \n",
669 | " 72.024862 | \n",
670 | " 0.639021 | \n",
671 | " 0.275957 | \n",
672 | " 0.790689 | \n",
673 | " 0.793551 | \n",
674 | " 0.796559 | \n",
675 | " 0.793600 | \n",
676 | " 0.002397 | \n",
677 | " 6 | \n",
678 | " 0.796014 | \n",
679 | " 0.793724 | \n",
680 | " 0.792182 | \n",
681 | " 0.793973 | \n",
682 | " 0.001574 | \n",
683 | " 1 | \n",
684 | " l1 | \n",
685 | "
\n",
686 | " \n",
687 | " | 3 | \n",
688 | " {'C': 1, 'penalty': 'l2'} | \n",
689 | " 74.962411 | \n",
690 | " 1.968621 | \n",
691 | " 0.553580 | \n",
692 | " 0.068979 | \n",
693 | " 0.790801 | \n",
694 | " 0.793715 | \n",
695 | " 0.796987 | \n",
696 | " 0.793834 | \n",
697 | " 0.002527 | \n",
698 | " 1 | \n",
699 | " 0.796081 | \n",
700 | " 0.794008 | \n",
701 | " 0.792267 | \n",
702 | " 0.794119 | \n",
703 | " 0.001559 | \n",
704 | " 1 | \n",
705 | " l2 | \n",
706 | "
\n",
707 | " \n",
708 | " | 4 | \n",
709 | " {'C': 2, 'penalty': 'l1'} | \n",
710 | " 608.802576 | \n",
711 | " 58.226398 | \n",
712 | " 0.315940 | \n",
713 | " 0.122815 | \n",
714 | " 0.790701 | \n",
715 | " 0.793592 | \n",
716 | " 0.796835 | \n",
717 | " 0.793709 | \n",
718 | " 0.002505 | \n",
719 | " 5 | \n",
720 | " 0.796020 | \n",
721 | " 0.793829 | \n",
722 | " 0.792255 | \n",
723 | " 0.794035 | \n",
724 | " 0.001544 | \n",
725 | " 2 | \n",
726 | " l1 | \n",
727 | "
\n",
728 | " \n",
729 | " | 5 | \n",
730 | " {'C': 2, 'penalty': 'l2'} | \n",
731 | " 101.755454 | \n",
732 | " 7.513333 | \n",
733 | " 0.553664 | \n",
734 | " 0.067346 | \n",
735 | " 0.790801 | \n",
736 | " 0.793715 | \n",
737 | " 0.796987 | \n",
738 | " 0.793834 | \n",
739 | " 0.002527 | \n",
740 | " 1 | \n",
741 | " 0.796081 | \n",
742 | " 0.794008 | \n",
743 | " 0.792267 | \n",
744 | " 0.794119 | \n",
745 | " 0.001559 | \n",
746 | " 2 | \n",
747 | " l2 | \n",
748 | "
\n",
749 | " \n",
750 | "
\n",
751 | "
"
752 | ],
753 | "text/plain": [
754 | " params mean_fit_time std_fit_time mean_score_time \\\n",
755 | "0 {'C': 0.5, 'penalty': 'l1'} 1308.978919 11.548624 0.347088 \n",
756 | "1 {'C': 0.5, 'penalty': 'l2'} 143.865403 2.276777 0.626723 \n",
757 | "2 {'C': 1, 'penalty': 'l1'} 1211.649146 72.024862 0.639021 \n",
758 | "3 {'C': 1, 'penalty': 'l2'} 74.962411 1.968621 0.553580 \n",
759 | "4 {'C': 2, 'penalty': 'l1'} 608.802576 58.226398 0.315940 \n",
760 | "5 {'C': 2, 'penalty': 'l2'} 101.755454 7.513333 0.553664 \n",
761 | "\n",
762 | " std_score_time split0_test_score split1_test_score split2_test_score \\\n",
763 | "0 0.044540 0.790291 0.793938 0.797087 \n",
764 | "1 0.145728 0.790801 0.793715 0.796987 \n",
765 | "2 0.275957 0.790689 0.793551 0.796559 \n",
766 | "3 0.068979 0.790801 0.793715 0.796987 \n",
767 | "4 0.122815 0.790701 0.793592 0.796835 \n",
768 | "5 0.067346 0.790801 0.793715 0.796987 \n",
769 | "\n",
770 | " mean_test_score std_test_score rank_test_score split0_train_score \\\n",
771 | "0 0.793772 0.002777 4 0.795671 \n",
772 | "1 0.793834 0.002527 1 0.796081 \n",
773 | "2 0.793600 0.002397 6 0.796014 \n",
774 | "3 0.793834 0.002527 1 0.796081 \n",
775 | "4 0.793709 0.002505 5 0.796020 \n",
776 | "5 0.793834 0.002527 1 0.796081 \n",
777 | "\n",
778 | " split1_train_score split2_train_score mean_train_score std_train_score \\\n",
779 | "0 0.794152 0.792604 0.794142 0.001252 \n",
780 | "1 0.794008 0.792264 0.794118 0.001560 \n",
781 | "2 0.793724 0.792182 0.793973 0.001574 \n",
782 | "3 0.794008 0.792267 0.794119 0.001559 \n",
783 | "4 0.793829 0.792255 0.794035 0.001544 \n",
784 | "5 0.794008 0.792267 0.794119 0.001559 \n",
785 | "\n",
786 | " param_C param_penalty \n",
787 | "0 0.5 l1 \n",
788 | "1 0.5 l2 \n",
789 | "2 1 l1 \n",
790 | "3 1 l2 \n",
791 | "4 2 l1 \n",
792 | "5 2 l2 "
793 | ]
794 | },
795 | "execution_count": 21,
796 | "metadata": {},
797 | "output_type": "execute_result"
798 | }
799 | ],
800 | "source": [
801 | "# Listing 10.13\n",
802 | "import pandas as pd\n",
803 | "pd.DataFrame(tuned_lr.cv_results_)"
804 | ]
805 | },
806 | {
807 | "cell_type": "markdown",
808 | "metadata": {},
809 | "source": [
810 | "### Section 10.3"
811 | ]
812 | },
813 | {
814 | "cell_type": "code",
815 | "execution_count": 22,
816 | "metadata": {},
817 | "outputs": [],
818 | "source": [
819 | "# Listing 10.14\n",
820 | "import dill\n",
821 | "with open('naive_bayes_model.pkl', 'wb') as file:\n",
822 | " dill.dump(parallel_nb, file)"
823 | ]
824 | },
825 | {
826 | "cell_type": "code",
827 | "execution_count": 23,
828 | "metadata": {},
829 | "outputs": [
830 | {
831 | "data": {
832 | "text/plain": [
833 | "array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
834 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
835 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
836 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,\n",
837 | " 1, 1, 1, 1, 1, 1, 1, 1])"
838 | ]
839 | },
840 | "execution_count": 23,
841 | "metadata": {},
842 | "output_type": "execute_result"
843 | }
844 | ],
845 | "source": [
846 | "# Listing 10.15\n",
847 | "with open('naive_bayes_model.pkl', 'rb') as file:\n",
848 | " nb = dill.load(file)\n",
849 | "nb.predict(np.random.randint(0,2,(100,100)))"
850 | ]
851 | }
852 | ],
853 | "metadata": {
854 | "kernelspec": {
855 | "display_name": "Python 3",
856 | "language": "python",
857 | "name": "python3"
858 | },
859 | "language_info": {
860 | "codemirror_mode": {
861 | "name": "ipython",
862 | "version": 3
863 | },
864 | "file_extension": ".py",
865 | "mimetype": "text/x-python",
866 | "name": "python",
867 | "nbconvert_exporter": "python",
868 | "pygments_lexer": "ipython3",
869 | "version": "3.6.8"
870 | }
871 | },
872 | "nbformat": 4,
873 | "nbformat_minor": 2
874 | }
875 |
--------------------------------------------------------------------------------
/Chapter 9.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Science with Python and Dask\n",
8 | "## Chapter 9: Working with Bags and Arrays"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "### Section 9.1"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [
23 | {
24 | "data": {
25 | "text/plain": [
26 | "dask.bag"
27 | ]
28 | },
29 | "execution_count": 1,
30 | "metadata": {},
31 | "output_type": "execute_result"
32 | }
33 | ],
34 | "source": [
35 | "# Listing 9.1\n",
36 | "# Import bag and read in the data\n",
37 | "import dask.bag as bag\n",
38 | "import os\n",
39 | "\n",
40 | "os.chdir('/Users/jesse/Documents')\n",
41 | "raw_data = bag.read_text('foods.txt')\n",
42 | "raw_data"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "### Section 9.1.1"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 2,
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "data": {
59 | "text/plain": [
60 | "('product/productId: B001E4KFG0\\n',\n",
61 | " 'review/userId: A3SGXH7AUHU8GW\\n',\n",
62 | " 'review/profileName: delmartian\\n',\n",
63 | " 'review/helpfulness: 1/1\\n',\n",
64 | " 'review/score: 5.0\\n',\n",
65 | " 'review/time: 1303862400\\n',\n",
66 | " 'review/summary: Good Quality Dog Food\\n',\n",
67 | " 'review/text: I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.\\n',\n",
68 | " '\\n',\n",
69 | " 'product/productId: B00813GRG4\\n')"
70 | ]
71 | },
72 | "execution_count": 2,
73 | "metadata": {},
74 | "output_type": "execute_result"
75 | }
76 | ],
77 | "source": [
78 | "# Listing 9.2\n",
79 | "# Take a small sample of the first few elements of the bag\n",
80 | "raw_data.take(10)"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "### Section 9.1.2"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 3,
93 | "metadata": {},
94 | "outputs": [
95 | {
96 | "ename": "UnicodeDecodeError",
97 | "evalue": "'utf-8' codec can't decode byte 0xce in position 2620: invalid continuation byte",
98 | "output_type": "error",
99 | "traceback": [
100 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
101 | "\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)",
102 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# If we try to count across the file, we might run into an encoding error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mraw_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
103 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/base.py\u001b[0m in \u001b[0;36mcompute\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 154\u001b[0m \u001b[0mdask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbase\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 155\u001b[0m \"\"\"\n\u001b[0;32m--> 156\u001b[0;31m \u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraverse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 157\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
104 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/base.py\u001b[0m in \u001b[0;36mcompute\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 400\u001b[0m \u001b[0mkeys\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__dask_keys__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcollections\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 401\u001b[0m \u001b[0mpostcomputes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__dask_postcompute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcollections\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 402\u001b[0;31m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mschedule\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdsk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 403\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrepack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpostcomputes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 404\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
105 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/multiprocessing.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, **kwargs)\u001b[0m\n\u001b[1;32m 175\u001b[0m \u001b[0mget_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_process_get_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdumps\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdumps\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloads\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 176\u001b[0m \u001b[0mpack_exception\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpack_exception\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 177\u001b[0;31m raise_exception=reraise, **kwargs)\n\u001b[0m\u001b[1;32m 178\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcleanup\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
106 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36mget_async\u001b[0;34m(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)\u001b[0m\n\u001b[1;32m 503\u001b[0m \u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Re-execute locally\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 505\u001b[0;31m \u001b[0mraise_exception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 506\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mworker_id\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres_info\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 507\u001b[0m \u001b[0mstate\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cache'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
107 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/compatibility.py\u001b[0m in \u001b[0;36mreraise\u001b[0;34m(exc, tb)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mreraise\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__traceback__\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 69\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
108 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36mexecute_task\u001b[0;34m()\u001b[0m\n\u001b[1;32m 272\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask_info\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 274\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 275\u001b[0m \u001b[0mid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_id\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdumps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
109 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36m_execute_task\u001b[0;34m()\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mistask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0margs2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mishashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
110 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mistask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0margs2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mishashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
111 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36m_execute_task\u001b[0;34m()\u001b[0m\n\u001b[1;32m 249\u001b[0m \"\"\"\n\u001b[1;32m 250\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 251\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 252\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mistask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
112 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 249\u001b[0m \"\"\"\n\u001b[1;32m 250\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 251\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 252\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mistask\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
113 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36m_execute_task\u001b[0;34m()\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 254\u001b[0m \u001b[0margs2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 255\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 256\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mishashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 257\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
114 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/bag/core.py\u001b[0m in \u001b[0;36mempty_safe_apply\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2070\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_last\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2071\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mno_result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2072\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpart\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2073\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_last\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpart\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2074\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mno_result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
115 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/cytoolz/itertoolz.pyx\u001b[0m in \u001b[0;36mcytoolz.itertoolz.count\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1057\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1058\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1059\u001b[0;31m \u001b[0mcpdef\u001b[0m \u001b[0mobject\u001b[0m \u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobject\u001b[0m \u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1060\u001b[0m \"\"\"\n\u001b[1;32m 1061\u001b[0m \u001b[0mCount\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mnumber\u001b[0m \u001b[0mof\u001b[0m \u001b[0mitems\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mseq\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
116 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/cytoolz/itertoolz.pyx\u001b[0m in \u001b[0;36mcytoolz.itertoolz.count\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1071\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1072\u001b[0m \u001b[0mcdef\u001b[0m \u001b[0mPy_ssize_t\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1073\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mseq\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1074\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1075\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
117 | "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/dask/bag/text.py\u001b[0m in \u001b[0;36mfile_to_blocks\u001b[0;34m()\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfile_to_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlazy_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mlazy_file\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 88\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 89\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
118 | "\u001b[0;32m/anaconda3/lib/python3.6/codecs.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m()\u001b[0m\n\u001b[1;32m 319\u001b[0m \u001b[0;31m# decode input (taking the buffer into account)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuffer\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 321\u001b[0;31m \u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconsumed\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_buffer_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfinal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 322\u001b[0m \u001b[0;31m# keep undecoded input until the next call\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 323\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuffer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mconsumed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
119 | "\u001b[0;31mUnicodeDecodeError\u001b[0m: 'utf-8' codec can't decode byte 0xce in position 2620: invalid continuation byte"
120 | ]
121 | }
122 | ],
123 | "source": [
124 | "# Listing 9.3\n",
125 | "# If we try to count across the file, we might run into an encoding error\n",
126 | "raw_data.count().compute()"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 4,
132 | "metadata": {},
133 | "outputs": [
134 | {
135 | "data": {
136 | "text/plain": [
137 | "5116093"
138 | ]
139 | },
140 | "execution_count": 4,
141 | "metadata": {},
142 | "output_type": "execute_result"
143 | }
144 | ],
145 | "source": [
146 | "# Listing 9.4\n",
147 | "raw_data = bag.read_text('foods.txt', encoding='cp1252')\n",
148 | "raw_data.count().compute()"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "### Section 9.1.3"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 6,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "# Listing 9.5\n",
165 | "from dask.delayed import delayed\n",
166 | "\n",
167 | "def get_next_part(file, start_index, span_index=0, blocksize=1024):\n",
168 | " file.seek(start_index)\n",
169 | " buffer = file.read(blocksize + span_index).decode('cp1252')\n",
170 | " delimiter_position = buffer.find('\\n\\n')\n",
171 | " if delimiter_position == -1:\n",
172 | " return get_next_part(file, start_index, span_index + blocksize)\n",
173 | " else:\n",
174 | " file.seek(start_index)\n",
175 | " return start_index, delimiter_position"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 8,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "# Listing 9.6\n",
185 | "with open('foods.txt', 'rb') as file_handle:\n",
186 | " size = file_handle.seek(0,2) - 1\n",
187 | " more_data = True\n",
188 | " output = []\n",
189 | " current_position = next_position = 0\n",
190 | " while more_data:\n",
191 | " if current_position >= size:\n",
192 | " more_data = False\n",
193 | " else:\n",
194 | " current_position, next_position = get_next_part(file_handle, current_position, 0)\n",
195 | " output.append((current_position, next_position))\n",
196 | " current_position = current_position + next_position + 2"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 7,
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "# Listing 9.7\n",
206 | "def get_item(filename, start_index, delimiter_position, encoding='cp1252'):\n",
207 | " with open(filename, 'rb') as file_handle:\n",
208 | " file_handle.seek(start_index)\n",
209 | " text = file_handle.read(delimiter_position).decode(encoding)\n",
210 | " elements = text.strip().split('\\n')\n",
211 | " key_value_pairs = [(element.split(': ')[0], element.split(': ')[1]) \n",
212 | " if len(element.split(': ')) > 1 \n",
213 | " else ('unknown', element) \n",
214 | " for element in elements]\n",
215 | " return dict(key_value_pairs)"
216 | ]
217 | },
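218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 | "A quick sanity check (my sketch, not one of the book's listings): `get_next_part` and `get_item` can be exercised on a tiny synthetic file. `sample.txt` below is a hypothetical throwaway file, not the review data."
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "# Hedged sketch: run the delimiter scan and parser over a made-up two-review file\n",
232 | "sample = 'review/score: 5.0\\nreview/text: great\\n\\nreview/score: 1.0\\nreview/text: bad\\n\\n'\n",
233 | "with open('sample.txt', 'wb') as f:\n",
234 | "    f.write(sample.encode('cp1252'))\n",
235 | "\n",
236 | "with open('sample.txt', 'rb') as fh:\n",
237 | "    size = fh.seek(0, 2) - 1\n",
238 | "    position, parts = 0, []\n",
239 | "    while position < size:\n",
240 | "        start, length = get_next_part(fh, position, 0)\n",
241 | "        parts.append((start, length))\n",
242 | "        position = start + length + 2\n",
243 | "\n",
244 | "# each (start, length) pair should parse back into a review dict\n",
245 | "[get_item('sample.txt', start, length) for start, length in parts]"
246 | ]
247 | },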
218 | {
219 | "cell_type": "code",
220 | "execution_count": 9,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "# Listing 9.8\n",
225 | "reviews = bag.from_sequence(output).map(lambda x: get_item('foods.txt', x[0], x[1]))"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 10,
231 | "metadata": {},
232 | "outputs": [
233 | {
234 | "data": {
235 | "text/plain": [
236 | "({'product/productId': 'B001E4KFG0',\n",
237 | " 'review/userId': 'A3SGXH7AUHU8GW',\n",
238 | " 'review/profileName': 'delmartian',\n",
239 | " 'review/helpfulness': '1/1',\n",
240 | " 'review/score': '5.0',\n",
241 | " 'review/time': '1303862400',\n",
242 | " 'review/summary': 'Good Quality Dog Food',\n",
243 | " 'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.'},\n",
244 | " {'product/productId': 'B00813GRG4',\n",
245 | " 'review/userId': 'A1D87F6ZCVE5NK',\n",
246 | " 'review/profileName': 'dll pa',\n",
247 | " 'review/helpfulness': '0/0',\n",
248 | " 'review/score': '1.0',\n",
249 | " 'review/time': '1346976000',\n",
250 | " 'review/summary': 'Not as Advertised',\n",
251 | " 'review/text': 'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as \"Jumbo\".'})"
252 | ]
253 | },
254 | "execution_count": 10,
255 | "metadata": {},
256 | "output_type": "execute_result"
257 | }
258 | ],
259 | "source": [
260 | "# Listing 9.9\n",
261 | "reviews.take(2)"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 11,
267 | "metadata": {},
268 | "outputs": [
269 | {
270 | "name": "stdout",
271 | "output_type": "stream",
272 | "text": [
273 | "[########################################] | 100% Completed | 8.7s\n"
274 | ]
275 | },
276 | {
277 | "data": {
278 | "text/plain": [
279 | "568454"
280 | ]
281 | },
282 | "execution_count": 11,
283 | "metadata": {},
284 | "output_type": "execute_result"
285 | }
286 | ],
287 | "source": [
288 | "# Listing 9.10\n",
289 | "from dask.diagnostics import ProgressBar\n",
290 | "\n",
291 | "with ProgressBar():\n",
292 | " count = reviews.count().compute()\n",
293 | "count"
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {},
299 | "source": [
300 | "### Section 9.2.1"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 12,
306 | "metadata": {},
307 | "outputs": [],
308 | "source": [
309 | "# Listing 9.11\n",
310 | "def get_score(element):\n",
311 | " score_numeric = float(element['review/score'])\n",
312 | " return score_numeric"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 13,
318 | "metadata": {},
319 | "outputs": [
320 | {
321 | "data": {
322 | "text/plain": [
323 | "(5.0, 1.0, 4.0, 2.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0)"
324 | ]
325 | },
326 | "execution_count": 13,
327 | "metadata": {},
328 | "output_type": "execute_result"
329 | }
330 | ],
331 | "source": [
332 | "# Listing 9.12\n",
333 | "review_scores = reviews.map(get_score)\n",
334 | "review_scores.take(10)"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 14,
340 | "metadata": {},
341 | "outputs": [
342 | {
343 | "data": {
344 | "text/plain": [
345 | "({'product/productId': 'B001E4KFG0',\n",
346 | " 'review/userId': 'A3SGXH7AUHU8GW',\n",
347 | " 'review/profileName': 'delmartian',\n",
348 | " 'review/helpfulness': '1/1',\n",
349 | " 'review/score': '5.0',\n",
350 | " 'review/time': '1303862400',\n",
351 | " 'review/summary': 'Good Quality Dog Food',\n",
352 | " 'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.',\n",
353 | " 'review/sentiment': 'positive'},\n",
354 | " {'product/productId': 'B00813GRG4',\n",
355 | " 'review/userId': 'A1D87F6ZCVE5NK',\n",
356 | " 'review/profileName': 'dll pa',\n",
357 | " 'review/helpfulness': '0/0',\n",
358 | " 'review/score': '1.0',\n",
359 | " 'review/time': '1346976000',\n",
360 | " 'review/summary': 'Not as Advertised',\n",
361 | " 'review/text': 'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as \"Jumbo\".',\n",
362 | " 'review/sentiment': 'negative'},\n",
363 | " {'product/productId': 'B000LQOCH0',\n",
364 | " 'review/userId': 'ABXLMWJIXXAIN',\n",
365 | " 'review/profileName': 'Natalia Corres \"Natalia Corres\"',\n",
366 | " 'review/helpfulness': '1/1',\n",
367 | " 'review/score': '4.0',\n",
368 | " 'review/time': '1219017600',\n",
369 | " 'review/summary': '\"Delight\" says it all',\n",
370 | " 'review/text': 'This is a confection that has been around a few centuries. It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar. And it is a tiny mouthful of heaven. Not too chewy, and very flavorful. I highly recommend this yummy treat. If you are familiar with the story of C.S. Lewis\\' \"The Lion, The Witch, and The Wardrobe\" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',\n",
371 | " 'review/sentiment': 'positive'})"
372 | ]
373 | },
374 | "execution_count": 14,
375 | "metadata": {},
376 | "output_type": "execute_result"
377 | }
378 | ],
379 | "source": [
380 | "# Listing 9.13\n",
381 | "def tag_positive_negative_by_score(element):\n",
382 | " if float(element['review/score']) > 3:\n",
383 | " element['review/sentiment'] = 'positive'\n",
384 | " else:\n",
385 | " element['review/sentiment'] = 'negative'\n",
386 | " return element\n",
387 | "\n",
388 | "reviews.map(tag_positive_negative_by_score).take(3)"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 15,
394 | "metadata": {},
395 | "outputs": [
396 | {
397 | "data": {
398 | "text/plain": [
399 | "({'product/productId': 'B001E4KFG0',\n",
400 | " 'review/userId': 'A3SGXH7AUHU8GW',\n",
401 | " 'review/profileName': 'delmartian',\n",
402 | " 'review/helpfulness': '1/1',\n",
403 | " 'review/score': '5.0',\n",
404 | " 'review/time': '1303862400',\n",
405 | " 'review/summary': 'Good Quality Dog Food',\n",
406 | " 'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.'},)"
407 | ]
408 | },
409 | "execution_count": 15,
410 | "metadata": {},
411 | "output_type": "execute_result"
412 | }
413 | ],
414 | "source": [
415 | "# Listing 9.14\n",
416 | "reviews.take(1)"
417 | ]
418 | },
419 | {
420 | "cell_type": "markdown",
421 | "metadata": {},
422 | "source": [
423 | "### Section 9.2.2"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 16,
429 | "metadata": {},
430 | "outputs": [
431 | {
432 | "name": "stderr",
433 | "output_type": "stream",
434 | "text": [
435 | "/anaconda3/lib/python3.6/site-packages/dask/bag/core.py:2089: UserWarning: Insufficient elements for `take`. 5 elements requested, only 1 elements available. Try passing larger `npartitions` to `take`.\n",
436 | " \"larger `npartitions` to `take`.\".format(n, len(r)))\n"
437 | ]
438 | },
439 | {
440 | "data": {
441 | "text/plain": [
442 | "({'product/productId': 'B001E4KFG0',\n",
443 | " 'review/userId': 'A3SGXH7AUHU8GW',\n",
444 | " 'review/profileName': 'delmartian',\n",
445 | " 'review/helpfulness': '1/1',\n",
446 | " 'review/score': '5.0',\n",
447 | " 'review/time': '1303862400',\n",
448 | " 'review/summary': 'Good Quality Dog Food',\n",
449 | " 'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.'},)"
450 | ]
451 | },
452 | "execution_count": 16,
453 | "metadata": {},
454 | "output_type": "execute_result"
455 | }
456 | ],
457 | "source": [
458 | "# Listing 9.15\n",
459 | "specific_item = reviews.filter(lambda element: element['product/productId'] == 'B001E4KFG0')\n",
460 | "specific_item.take(5)"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 17,
466 | "metadata": {},
467 | "outputs": [
468 | {
469 | "data": {
470 | "text/plain": [
471 | "({'product/productId': 'B001E4KFG0',\n",
472 | " 'review/userId': 'A3SGXH7AUHU8GW',\n",
473 | " 'review/profileName': 'delmartian',\n",
474 | " 'review/helpfulness': '1/1',\n",
475 | " 'review/score': '5.0',\n",
476 | " 'review/time': '1303862400',\n",
477 | " 'review/summary': 'Good Quality Dog Food',\n",
478 | " 'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.'},\n",
479 | " {'product/productId': 'B00171APVA',\n",
480 | " 'review/userId': 'A21BT40VZCCYT4',\n",
481 | " 'review/profileName': 'Carol A. Reed',\n",
482 | " 'review/helpfulness': '0/0',\n",
483 | " 'review/score': '5.0',\n",
484 | " 'review/time': '1351209600',\n",
485 | " 'review/summary': 'Healthy Dog Food',\n",
486 | " 'review/text': 'This is a very healthy dog food. Good for their digestion. Also good for small puppies. My dog eats her required amount at every feeding.'},\n",
487 | " {'product/productId': 'B0019CW0HE',\n",
488 | " 'review/userId': 'A1FD9E5C06UB6B',\n",
489 | " 'review/profileName': 'BRENDA DEMERS',\n",
490 | " 'review/helpfulness': '5/5',\n",
491 | " 'review/score': '3.0',\n",
492 | " 'review/time': '1301011200',\n",
493 | " 'review/summary': 'Natural Balance Lamb and Rice',\n",
494 | " 'review/text': 'While my dogs like all of the flavors that we have tried of this dog food, for some reason their itching increased when I tried the lamb and rice. I have some very itchy dogs and am giving them a limited ingredient dog food to try to help. The duck and sweet potato cut down on the itching significantly, but when we tried lamb and rice they started itching more once again. I like Natural Balance for the quality ingredients.'},\n",
495 | " {'product/productId': 'B0019CW0HE',\n",
496 | " 'review/userId': 'AK2CXHH9VRZ2A',\n",
497 | " 'review/profileName': 'I. GLENN',\n",
498 | " 'review/helpfulness': '4/4',\n",
499 | " 'review/score': '3.0',\n",
500 | " 'review/time': '1313193600',\n",
501 | " 'review/summary': 'INCREASED MY DOGS ITCHING',\n",
502 | " 'review/text': 'Awesome dog food. However, when given to my \"Boston\", who has severe reactions to some food ingredients; his itching increased to violent jumping out of bed at night, scratching. As soon as I changed to a different formula, the scratching stopped. So glad Natural Balance has other choices. I guess you have to try each, until you find what\\'s best for your pet.'},\n",
503 | " {'product/productId': 'B0019CW0HE',\n",
504 | " 'review/userId': 'A25BGFRHYHEZKK',\n",
505 | " 'review/profileName': \"Toby's mom\",\n",
506 | " 'review/helpfulness': '4/4',\n",
507 | " 'review/score': '5.0',\n",
508 | " 'review/time': '1292889600',\n",
509 | " 'review/summary': 'Great food!',\n",
510 | " 'review/text': 'We have three dogs and all of them love this food! We bought it specifically for one of our dogs who has food allergies and it works great for him, no more hot spots or tummy problems.
I LOVE that it ships right to our door with free shipping.'})"
511 | ]
512 | },
513 | "execution_count": 17,
514 | "metadata": {},
515 | "output_type": "execute_result"
516 | }
517 | ],
518 | "source": [
519 | "# Listing 9.16\n",
520 | "keyword = reviews.filter(lambda element: 'dog' in element['review/text'])\n",
521 | "keyword.take(5)"
522 | ]
523 | },
524 | {
525 | "cell_type": "code",
526 | "execution_count": 18,
527 | "metadata": {},
528 | "outputs": [],
529 | "source": [
530 | "# Listing 9.17\n",
531 | "def is_helpful(element):\n",
532 | " helpfulness = element['review/helpfulness'].strip().split('/')\n",
533 | " number_of_helpful_votes = float(helpfulness[0])\n",
534 | " number_of_total_votes = float(helpfulness[1])\n",
535 | " # Watch for divide by 0 errors\n",
536 | " if number_of_total_votes >= 1:\n",
537 | " return number_of_helpful_votes / number_of_total_votes > 0.75\n",
538 | " else:\n",
539 | " return False"
540 | ]
541 | },
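542 | {
543 | "cell_type": "markdown",
544 | "metadata": {},
545 | "source": [
546 | "A hedged spot check of `is_helpful` on two stub records (values made up for illustration):"
547 | ]
548 | },
549 | {
550 | "cell_type": "code",
551 | "execution_count": null,
552 | "metadata": {},
553 | "outputs": [],
554 | "source": [
555 | "# Hedged spot check on stub records: 4/5 = 0.8 clears the 0.75 bar,\n",
556 | "# and '0/0' exercises the divide-by-zero guard\n",
557 | "is_helpful({'review/helpfulness': '4/5'}), is_helpful({'review/helpfulness': '0/0'})"
558 | ]
559 | },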
542 | {
543 | "cell_type": "code",
544 | "execution_count": 19,
545 | "metadata": {},
546 | "outputs": [
547 | {
548 | "data": {
549 | "text/plain": [
550 | "({'product/productId': 'B001E4KFG0',\n",
551 | " 'review/userId': 'A3SGXH7AUHU8GW',\n",
552 | " 'review/profileName': 'delmartian',\n",
553 | " 'review/helpfulness': '1/1',\n",
554 | " 'review/score': '5.0',\n",
555 | " 'review/time': '1303862400',\n",
556 | " 'review/summary': 'Good Quality Dog Food',\n",
557 | " 'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.'},\n",
558 | " {'product/productId': 'B000LQOCH0',\n",
559 | " 'review/userId': 'ABXLMWJIXXAIN',\n",
560 | " 'review/profileName': 'Natalia Corres \"Natalia Corres\"',\n",
561 | " 'review/helpfulness': '1/1',\n",
562 | " 'review/score': '4.0',\n",
563 | " 'review/time': '1219017600',\n",
564 | " 'review/summary': '\"Delight\" says it all',\n",
565 | " 'review/text': 'This is a confection that has been around a few centuries. It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar. And it is a tiny mouthful of heaven. Not too chewy, and very flavorful. I highly recommend this yummy treat. If you are familiar with the story of C.S. Lewis\\' \"The Lion, The Witch, and The Wardrobe\" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.'})"
566 | ]
567 | },
568 | "execution_count": 19,
569 | "metadata": {},
570 | "output_type": "execute_result"
571 | }
572 | ],
573 | "source": [
574 | "# Listing 9.18\n",
575 | "helpful_reviews = reviews.filter(is_helpful)\n",
576 | "helpful_reviews.take(2)"
577 | ]
578 | },
579 | {
580 | "cell_type": "markdown",
581 | "metadata": {},
582 | "source": [
583 | "### Section 9.2.3"
584 | ]
585 | },
586 | {
587 | "cell_type": "code",
588 | "execution_count": 20,
589 | "metadata": {},
590 | "outputs": [],
591 | "source": [
592 | "# Listing 9.19\n",
593 | "helpful_review_scores = helpful_reviews.map(get_score)\n",
594 | "\n",
595 | "with ProgressBar():\n",
596 | " all_mean = review_scores.mean().compute()\n",
597 | " helpful_mean = helpful_review_scores.mean().compute()\n",
598 | " \n",
599 | "print(f\"Mean Score of All Reviews: {round(all_mean, 2)}\\nMean Score of Helpful Reviews: {round(helpful_mean,2)}\")"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": 24,
605 | "metadata": {},
606 | "outputs": [
607 | {
608 | "name": "stdout",
609 | "output_type": "stream",
610 | "text": [
611 | "[########################################] | 100% Completed | 10.8s\n",
612 | "[########################################] | 100% Completed | 9.9s\n",
613 | "Mean Length of Helpful Reviews: 459.36\n",
614 | "Mean Length of Unhelpful Reviews: 379.32\n"
615 | ]
616 | }
617 | ],
618 | "source": [
619 | "# Listing 9.20\n",
620 | "def get_length(element):\n",
621 | " return len(element['review/text'])\n",
622 | "\n",
623 | "with ProgressBar():\n",
624 | " review_length_helpful = helpful_reviews.map(get_length).mean().compute()\n",
625 | " review_length_unhelpful = reviews.filter(lambda review: not is_helpful(review)).map(get_length).mean().compute()\n",
626 | "print(f\"Mean Length of Helpful Reviews: {round(review_length_helpful, 2)}\\nMean Length of Unhelpful Reviews: {round(review_length_unhelpful,2)}\")"
627 | ]
628 | },
629 | {
630 | "cell_type": "markdown",
631 | "metadata": {},
632 | "source": [
633 | "### Section 9.2.4"
634 | ]
635 | },
636 | {
637 | "cell_type": "code",
638 | "execution_count": 22,
639 | "metadata": {},
640 | "outputs": [
641 | {
642 | "name": "stdout",
643 | "output_type": "stream",
644 | "text": [
645 | "[########################################] | 100% Completed | 9.0s\n"
646 | ]
647 | },
648 | {
649 | "data": {
650 | "text/plain": [
651 | "[(5.0, 363122), (1.0, 52268), (4.0, 80655), (2.0, 29769), (3.0, 42640)]"
652 | ]
653 | },
654 | "execution_count": 22,
655 | "metadata": {},
656 | "output_type": "execute_result"
657 | }
658 | ],
659 | "source": [
660 | "# Listing 9.21\n",
661 | "def count(accumulator, element):\n",
662 | " return accumulator + 1\n",
663 | "\n",
664 | "def combine(total1, total2):\n",
665 | " return total1 + total2\n",
666 | "\n",
667 | "with ProgressBar():\n",
668 | " count_of_reviews_by_score = reviews.foldby(get_score, count, 0, combine, 0).compute()\n",
669 | "count_of_reviews_by_score"
670 | ]
671 | },
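672 | {
673 | "cell_type": "markdown",
674 | "metadata": {},
675 | "source": [
676 | "For intuition (my sketch, not a book listing): the `foldby` call above amounts to a per-key fold, where `count` bumps an accumulator for each element and `combine` adds the per-partition totals together. A pure-Python equivalent:"
677 | ]
678 | },
679 | {
680 | "cell_type": "code",
681 | "execution_count": null,
682 | "metadata": {},
683 | "outputs": [],
684 | "source": [
685 | "# Hedged pure-Python sketch of the per-key fold (illustration only)\n",
686 | "from collections import defaultdict\n",
687 | "\n",
688 | "def foldby_counts(items, key):\n",
689 | "    totals = defaultdict(int)\n",
690 | "    for item in items:\n",
691 | "        # count: increment this key's accumulator\n",
692 | "        totals[key(item)] += 1\n",
693 | "    # combine would simply add per-partition totals together\n",
694 | "    return list(totals.items())\n",
695 | "\n",
696 | "foldby_counts([5.0, 1.0, 5.0, 4.0], key=lambda score: score)"
697 | ]
698 | },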
672 | {
673 | "cell_type": "code",
674 | "execution_count": null,
675 | "metadata": {},
676 | "outputs": [],
677 | "source": [
678 | "# Listing 9.22\n",
679 | "# Listing 9.21 displays the following output:\n",
680 | "# [(5.0, 363122), (1.0, 52268), (4.0, 80655), (2.0, 29769), (3.0, 42640)]"
681 | ]
682 | },
683 | {
684 | "cell_type": "markdown",
685 | "metadata": {},
686 | "source": [
687 | "### Section 9.3"
688 | ]
689 | },
690 | {
691 | "cell_type": "code",
692 | "execution_count": 16,
693 | "metadata": {},
694 | "outputs": [],
695 | "source": [
696 | "# Listing 9.23\n",
697 | "def get_score_and_helpfulness(element):\n",
698 | " score_numeric = float(element['review/score'])\n",
699 | " helpfulness = element['review/helpfulness'].strip().split('/')\n",
700 | " number_of_helpful_votes = float(helpfulness[0])\n",
701 | " number_of_total_votes = float(helpfulness[1])\n",
702 | " # Watch for divide by 0 errors\n",
703 | " if number_of_total_votes > 0:\n",
704 | " helpfulness_percent = number_of_helpful_votes / number_of_total_votes\n",
705 | " else:\n",
706 | " helpfulness_percent = 0.\n",
707 | " return (score_numeric, helpfulness_percent)"
708 | ]
709 | },
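710 | {
711 | "cell_type": "markdown",
712 | "metadata": {},
713 | "source": [
714 | "A quick hedged check on a stub record (values made up for illustration) shows the (score, helpfulness) tuple shape the DataFrame conversion will receive:"
715 | ]
716 | },
717 | {
718 | "cell_type": "code",
719 | "execution_count": null,
720 | "metadata": {},
721 | "outputs": [],
722 | "source": [
723 | "# Hedged spot check on a stub record (values made up for illustration)\n",
724 | "get_score_and_helpfulness({'review/score': '4.0', 'review/helpfulness': '3/4'})"
725 | ]
726 | },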
710 | {
711 | "cell_type": "code",
712 | "execution_count": null,
713 | "metadata": {},
714 | "outputs": [],
715 | "source": [
716 | "# Listing 9.24\n",
717 | "scores_and_helpfulness = reviews.map(get_score_and_helpfulness).to_dataframe(meta={'Review Scores': float, 'Helpfulness Percent': float})"
718 | ]
719 | },
720 | {
721 | "cell_type": "code",
722 | "execution_count": null,
723 | "metadata": {},
724 | "outputs": [],
725 | "source": [
726 | "# Listing 9.25\n",
727 | "with ProgressBar():\n",
728 | " scores_and_helpfulness_stats = scores_and_helpfulness.describe().compute()\n",
729 | "scores_and_helpfulness_stats"
730 | ]
731 | },
732 | {
733 | "cell_type": "markdown",
734 | "metadata": {},
735 | "source": [
736 | "### Section 9.4.2"
737 | ]
738 | },
739 | {
740 | "cell_type": "code",
741 | "execution_count": 28,
742 | "metadata": {},
743 | "outputs": [
744 | {
745 | "data": {
746 | "text/plain": [
747 | "(['bought',\n",
748 | " 'several',\n",
749 | " 'vitality',\n",
750 | " 'canned',\n",
751 | " 'dog',\n",
752 | " 'food',\n",
753 | " 'products',\n",
754 | " 'found',\n",
755 | " 'good',\n",
756 | " 'quality',\n",
757 | " 'product',\n",
758 | " 'looks',\n",
759 | " 'like',\n",
760 | " 'stew',\n",
761 | " 'processed',\n",
762 | " 'meat',\n",
763 | " 'smells',\n",
764 | " 'better',\n",
765 | " 'labrador',\n",
766 | " 'finicky',\n",
767 | " 'appreciates',\n",
768 | " 'product',\n",
769 | " 'better'],)"
770 | ]
771 | },
772 | "execution_count": 28,
773 | "metadata": {},
774 | "output_type": "execute_result"
775 | }
776 | ],
777 | "source": [
778 | "# Listing 9.26\n",
779 | "from nltk.corpus import stopwords \n",
780 | "from nltk.tokenize import RegexpTokenizer\n",
781 | "from functools import partial\n",
782 | "\n",
783 | "tokenizer = RegexpTokenizer(r'\\w+')\n",
784 | "\n",
785 | "def extract_reviews(element):\n",
786 | " return element['review/text'].lower()\n",
787 | "\n",
788 | "def filter_stopword(word, stopwords):\n",
789 | " return word not in stopwords\n",
790 | "\n",
791 | "def filter_stopwords(tokens, stopwords):\n",
792 | " return list(filter(partial(filter_stopword, stopwords=stopwords), tokens))\n",
793 | "\n",
794 | "stopword_set = set(stopwords.words('english'))"
795 | ]
796 | },
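797 | {
798 | "cell_type": "markdown",
799 | "metadata": {},
800 | "source": [
801 | "Before mapping over the whole bag, the stopword filter can be tried on a hand-made token list (my example, assuming NLTK's English stopword list); only the content words should survive:"
802 | ]
803 | },
804 | {
805 | "cell_type": "code",
806 | "execution_count": null,
807 | "metadata": {},
808 | "outputs": [],
809 | "source": [
810 | "# Hedged example on a hand-made token list\n",
811 | "filter_stopwords(['this', 'is', 'a', 'finicky', 'labrador'], stopword_set)"
812 | ]
813 | },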
797 | {
798 | "cell_type": "code",
799 | "execution_count": null,
800 | "metadata": {},
801 | "outputs": [],
802 | "source": [
803 | "# Listing 9.27\n",
804 | "review_text = reviews.map(extract_reviews)\n",
805 | "review_text_tokens = review_text.map(tokenizer.tokenize)\n",
806 | "review_text_clean = review_text_tokens.map(partial(filter_stopwords, stopwords=stopword_set))\n",
807 | "review_text_clean.take(1)"
808 | ]
809 | },
810 | {
811 | "cell_type": "code",
812 | "execution_count": 29,
813 | "metadata": {},
814 | "outputs": [
815 | {
816 | "data": {
817 | "text/plain": [
818 | "({('appreciates', 'product'),\n",
819 | " ('better', 'labrador'),\n",
820 | " ('bought', 'several'),\n",
821 | " ('canned', 'dog'),\n",
822 | " ('dog', 'food'),\n",
823 | " ('finicky', 'appreciates'),\n",
824 | " ('food', 'products'),\n",
825 | " ('found', 'good'),\n",
826 | " ('good', 'quality'),\n",
827 | " ('labrador', 'finicky'),\n",
828 | " ('like', 'stew'),\n",
829 | " ('looks', 'like'),\n",
830 | " ('meat', 'smells'),\n",
831 | " ('processed', 'meat'),\n",
832 | " ('product', 'better'),\n",
833 | " ('product', 'looks'),\n",
834 | " ('products', 'found'),\n",
835 | " ('quality', 'product'),\n",
836 | " ('several', 'vitality'),\n",
837 | " ('smells', 'better'),\n",
838 | " ('stew', 'processed'),\n",
839 | " ('vitality', 'canned')},\n",
840 | " {('actually', 'small'),\n",
841 | " ('arrived', 'labeled'),\n",
842 | " ('error', 'vendor'),\n",
843 | " ('intended', 'represent'),\n",
844 | " ('jumbo', 'salted'),\n",
845 | " ('labeled', 'jumbo'),\n",
846 | " ('peanuts', 'actually'),\n",
847 | " ('peanuts', 'peanuts'),\n",
848 | " ('product', 'arrived'),\n",
849 | " ('product', 'jumbo'),\n",
850 | " ('represent', 'product'),\n",
851 | " ('salted', 'peanuts'),\n",
852 | " ('sized', 'unsalted'),\n",
853 | " ('small', 'sized'),\n",
854 | " ('sure', 'error'),\n",
855 | " ('unsalted', 'sure'),\n",
856 | " ('vendor', 'intended')})"
857 | ]
858 | },
859 | "execution_count": 29,
860 | "metadata": {},
861 | "output_type": "execute_result"
862 | }
863 | ],
864 | "source": [
865 | "# Listing 9.28\n",
866 | "def make_bigrams(tokens):\n",
867 | " return set(nltk.bigrams(tokens))\n",
868 | "\n",
869 | "review_bigrams = review_text_clean.map(make_bigrams)\n",
870 | "review_bigrams.take(2)"
871 | ]
872 | },
873 | {
874 | "cell_type": "code",
875 | "execution_count": 30,
876 | "metadata": {},
877 | "outputs": [
878 | {
879 | "data": {
880 | "text/plain": [
881 | "(('product', 'better'),\n",
882 | " ('finicky', 'appreciates'),\n",
883 | " ('meat', 'smells'),\n",
884 | " ('looks', 'like'),\n",
885 | " ('good', 'quality'),\n",
886 | " ('vitality', 'canned'),\n",
887 | " ('like', 'stew'),\n",
888 | " ('processed', 'meat'),\n",
889 | " ('labrador', 'finicky'),\n",
890 | " ('several', 'vitality'))"
891 | ]
892 | },
893 | "execution_count": 30,
894 | "metadata": {},
895 | "output_type": "execute_result"
896 | }
897 | ],
898 | "source": [
899 | "# Listing 9.29\n",
900 | "all_bigrams = review_bigrams.flatten()\n",
901 | "all_bigrams.take(10)"
902 | ]
903 | },
904 | {
905 | "cell_type": "code",
906 | "execution_count": 31,
907 | "metadata": {},
908 | "outputs": [
909 | {
910 | "name": "stdout",
911 | "output_type": "stream",
912 | "text": [
913 | "[########################################] | 100% Completed | 11min 7.6s\n"
914 | ]
915 | },
916 | {
917 | "data": {
918 | "text/plain": [
919 | "[(('br', 'br'), 103258),\n",
920 | " (('amazon', 'com'), 15142),\n",
921 | " (('highly', 'recommend'), 14017),\n",
922 | " (('taste', 'like'), 13251),\n",
923 | " (('gluten', 'free'), 11641),\n",
924 | " (('grocery', 'store'), 11627),\n",
925 | " (('k', 'cups'), 11102),\n",
926 | " (('much', 'better'), 10681),\n",
927 | " (('http', 'www'), 10575),\n",
928 | " (('www', 'amazon'), 10517)]"
929 | ]
930 | },
931 | "execution_count": 31,
932 | "metadata": {},
933 | "output_type": "execute_result"
934 | }
935 | ],
936 | "source": [
937 | "# Listing 9.30\n",
938 | "with ProgressBar():\n",
939 | " top10_bigrams = all_bigrams.foldby(lambda x: x, count, 0, combine, 0).topk(10, key=lambda x: x[1]).compute()\n",
940 | "top10_bigrams"
941 | ]
942 | },
943 | {
944 | "cell_type": "code",
945 | "execution_count": 32,
946 | "metadata": {},
947 | "outputs": [
948 | {
949 | "name": "stdout",
950 | "output_type": "stream",
951 | "text": [
952 | "[########################################] | 100% Completed | 11min 19.9s\n"
953 | ]
954 | },
955 | {
956 | "data": {
957 | "text/plain": [
958 | "[(('highly', 'recommend'), 14024),\n",
959 | " (('taste', 'like'), 13343),\n",
960 | " (('gluten', 'free'), 11641),\n",
961 | " (('grocery', 'store'), 11630),\n",
962 | " (('k', 'cups'), 11102),\n",
963 | " (('much', 'better'), 10695),\n",
964 | " (('tastes', 'like'), 10471),\n",
965 | " (('great', 'product'), 9192),\n",
966 | " (('cup', 'coffee'), 8988),\n",
967 | " (('really', 'good'), 8897)]"
968 | ]
969 | },
970 | "execution_count": 32,
971 | "metadata": {},
972 | "output_type": "execute_result"
973 | }
974 | ],
975 | "source": [
976 | "# Listing 9.31\n",
977 | "more_stopwords = {'br', 'amazon', 'com', 'http', 'www', 'href', 'gp'}\n",
978 | "all_stopwords = stopword_set.union(more_stopwords)\n",
979 | "\n",
980 | "filtered_bigrams = review_text_tokens.map(partial(filter_stopwords, stopwords=all_stopwords)).map(make_bigrams).flatten()\n",
981 | "\n",
982 | "with ProgressBar():\n",
983 | " top10_bigrams = filtered_bigrams.foldby(lambda x: x, count, 0, combine, 0).topk(10, key=lambda x: x[1]).compute()\n",
984 | "top10_bigrams"
985 | ]
986 | },
987 | {
988 | "cell_type": "markdown",
989 | "metadata": {},
990 | "source": [
991 | "### Section 9.4.3"
992 | ]
993 | },
994 | {
995 | "cell_type": "code",
996 | "execution_count": 61,
997 | "metadata": {},
998 | "outputs": [
999 | {
1000 | "name": "stdout",
1001 | "output_type": "stream",
1002 | "text": [
1003 | "[########################################] | 100% Completed | 2min 25.9s\n"
1004 | ]
1005 | },
1006 | {
1007 | "data": {
1008 | "text/plain": [
1009 | "[(('taste', 'like'), 3352),\n",
1010 | " (('tastes', 'like'), 2858),\n",
1011 | " (('waste', 'money'), 2262),\n",
1012 | " (('k', 'cups'), 1892),\n",
1013 | " (('much', 'better'), 1659),\n",
1014 | " (('thought', 'would'), 1604),\n",
1015 | " (('tasted', 'like'), 1515),\n",
1016 | " (('grocery', 'store'), 1489),\n",
1017 | " (('would', 'recommend'), 1445),\n",
1018 | " (('taste', 'good'), 1408)]"
1019 | ]
1020 | },
1021 | "execution_count": 61,
1022 | "metadata": {},
1023 | "output_type": "execute_result"
1024 | }
1025 | ],
1026 | "source": [
1027 | "# Listing 9.32\n",
1028 | "negative_review_text = reviews.filter(lambda review: float(review['review/score']) < 3).map(extract_reviews)\n",
1029 | "negative_review_text_tokens = negative_review_text.map(tokenizer.tokenize)\n",
1030 | "negative_review_text_clean = negative_review_text_tokens.map(partial(filter_stopwords, stopwords=all_stopwords))\n",
1031 | "negative_review_bigrams = negative_review_text_clean.map(make_bigrams)\n",
1032 | "negative_bigrams = negative_review_bigrams.flatten()\n",
1033 | "\n",
1034 | "with ProgressBar():\n",
1035 | " top10_negative_bigrams = negative_bigrams.foldby(lambda x: x, count, 0, combine, 0).topk(10, key=lambda x: x[1]).compute()\n",
1036 | "top10_negative_bigrams"
1037 | ]
1038 | },
1039 | {
1040 | "cell_type": "code",
1041 | "execution_count": null,
1042 | "metadata": {},
1043 | "outputs": [],
1044 | "source": []
1045 | }
1046 | ],
1047 | "metadata": {
1048 | "kernelspec": {
1049 | "display_name": "Python 3",
1050 | "language": "python",
1051 | "name": "python3"
1052 | },
1053 | "language_info": {
1054 | "codemirror_mode": {
1055 | "name": "ipython",
1056 | "version": 3
1057 | },
1058 | "file_extension": ".py",
1059 | "mimetype": "text/x-python",
1060 | "name": "python",
1061 | "nbconvert_exporter": "python",
1062 | "pygments_lexer": "ipython3",
1063 | "version": "3.6.8"
1064 | }
1065 | },
1066 | "nbformat": 4,
1067 | "nbformat_minor": 2
1068 | }
1069 |
--------------------------------------------------------------------------------