├── .DS_Store
├── Course_Notes.zip
├── Data Set Generator (remove me the future!)
│   ├── .ipynb_checkpoints
│   │   └── Creating Fake Data-checkpoint.ipynb
│   ├── Creating Fake Data.ipynb
│   ├── DataSets
│   │   ├── Facebook_metrics.txt
│   │   ├── customer_churn.csv
│   │   ├── dataset_Facebook.csv
│   │   ├── dog_food.csv
│   │   └── hack_data.csv
│   ├── fake_customers.csv
│   ├── hello.csv
│   ├── new_customers.csv
│   └── test.csv
├── Python-Crash-Course
│   ├── .ipynb_checkpoints
│   │   ├── Python Crash Course Exercises - Solutions-checkpoint.ipynb
│   │   └── Python Crash Course Exercises-checkpoint.ipynb
│   ├── Python Crash Course Exercises - Solutions.ipynb
│   ├── Python Crash Course Exercises.ipynb
│   └── Python Crash Course.ipynb
├── README.md
├── Spark Streaming
│   ├── .ipynb_checkpoints
│   │   └── Introduction to Spark Streaming-checkpoint.ipynb
│   ├── Introduction to Spark Streaming.ipynb
│   └── TweetRead.py
├── Spark_DataFrame_Project_Exercise
│   ├── Spark DataFrames Project Exercise - SOLUTIONS.ipynb
│   ├── Spark DataFrames Project Exercise.ipynb
│   └── walmart_stock.csv
├── Spark_DataFrames
│   ├── .ipynb_checkpoints
│   │   ├── DataFrame_Basic_Operations-checkpoint.ipynb
│   │   ├── DataFrame_Basics-checkpoint.ipynb
│   │   ├── Dates_and_Timestamps-checkpoint.ipynb
│   │   ├── GroupBy_and_Aggregate_Functions-checkpoint.ipynb
│   │   ├── Missing_Data-checkpoint.ipynb
│   │   ├── Spark DataFrames Project Exercise - SOLUTIONS-checkpoint.ipynb
│   │   └── Spark DataFrames Project Exercise-checkpoint.ipynb
│   ├── ContainsNull.csv
│   ├── DataFrame_Basic_Operations.ipynb
│   ├── DataFrame_Basics.ipynb
│   ├── Dates_and_Timestamps.ipynb
│   ├── GroupBy_and_Aggregate_Functions.ipynb
│   ├── Missing_Data.ipynb
│   ├── appl_stock.csv
│   ├── people.json
│   └── sales_info.csv
└── Spark_for_Machine_Learning
    ├── Clustering
    │   ├── .ipynb_checkpoints
    │   │   ├── Clustering Code Along-checkpoint.ipynb
    │   │   ├── Clustering_Code_Example-checkpoint.ipynb
    │   │   ├── Clustering_Consulting_Project-checkpoint.ipynb
    │   │   ├── Clustering_Consulting_Project_SOLUTIONS-checkpoint.ipynb
    │   │   └── Random_Forest_Doc_Example-checkpoint.ipynb
    │   ├── Clustering Code Along.ipynb
    │   ├── Clustering_Code_Example.ipynb
    │   ├── Clustering_Consulting_Project.ipynb
    │   ├── Clustering_Consulting_Project_SOLUTIONS.ipynb
    │   ├── hack_data.csv
    │   ├── sample_kmeans_data.txt
    │   ├── seeds_dataset.csv
    │   └── seeds_dataset.txt
    ├── Linear_Regression
    │   ├── .ipynb_checkpoints
    │   │   ├── Data_Transformations-checkpoint.ipynb
    │   │   ├── Linear_Regression_Code_Along-checkpoint.ipynb
    │   │   ├── Linear_Regression_Consulting_Project-checkpoint.ipynb
    │   │   ├── Linear_Regression_Consulting_Project_SOLUTIONS-checkpoint.ipynb
    │   │   └── Linear_Regression_Example-checkpoint.ipynb
    │   ├── Data_Transformations.ipynb
    │   ├── Ecommerce_Customers.csv
    │   ├── Linear_Regression_Code_Along.ipynb
    │   ├── Linear_Regression_Consulting_Project.ipynb
    │   ├── Linear_Regression_Consulting_Project_SOLUTIONS.ipynb
    │   ├── Linear_Regression_Example.ipynb
    │   ├── cruise_ship_info.csv
    │   ├── fake_customers.csv
    │   └── sample_linear_regression_data.txt
    ├── Logistic_Regression
    │   ├── .ipynb_checkpoints
    │   │   ├── Log_regression_Code_Along-checkpoint.ipynb
    │   │   ├── Logistic_Regression_Consulting_Project-checkpoint.ipynb
    │   │   ├── Logistic_Regression_Consulting_Project_SOLUTIONS-checkpoint.ipynb
    │   │   └── Logistic_Regression_Example-checkpoint.ipynb
    │   ├── Logistic_Regression_Consulting_Project.ipynb
    │   ├── Logistic_Regression_Consulting_Project_SOLUTIONS.ipynb
    │   ├── Logistic_Regression_Example.ipynb
    │   ├── Titanic_Log_Regression_Code_Along.ipynb
    │   ├── customer_churn.csv
    │   ├── new_customers.csv
    │   ├── sample_libsvm_data.txt
    │   └── titanic.csv
    ├── Natural_Language_Processing
    │   ├── .ipynb_checkpoints
    │   │   ├── NLP_Code_Along-checkpoint.ipynb
    │   │   └── Tools_for_NLP-checkpoint.ipynb
    │   ├── NLP_Code_Along.ipynb
    │   ├── Tools_for_NLP.ipynb
    │   └── smsspamcollection
    │       ├── SMSSpamCollection
    │       └── readme
    ├── Recommender_Systems
    │   ├── .ipynb_checkpoints
    │   │   ├── Consulting Project - Recommender Systems-checkpoint.ipynb
    │   │   └── Recommender_Code_Along-checkpoint.ipynb
    │   ├── Consulting Project - Recommender Systems.ipynb
    │   ├── Meal_Info.csv
    │   ├── Recommender_Code_Along.ipynb
    │   └── movielens_ratings.csv
    └── Tree_Methods
        ├── .ipynb_checkpoints
        │   ├── Tree Methods Code Along-checkpoint.ipynb
        │   ├── Tree_Methods_Consulting_Project-checkpoint.ipynb
        │   ├── Tree_Methods_Consulting_Project_SOLUTION-checkpoint.ipynb
        │   └── Tree_Methods_Doc_Example-checkpoint.ipynb
        ├── College.csv
        ├── Tree Methods Code Along.ipynb
        ├── Tree_Methods_Consulting_Project.ipynb
        ├── Tree_Methods_Consulting_Project_SOLUTION.ipynb
        ├── Tree_Methods_Doc_Example.ipynb
        ├── dog_food.csv
        └── sample_libsvm_data.txt
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuperJohn/spark-and-python-for-big-data-with-pyspark/2571210837c00e6315a9d93f0cd1dc35e2955375/.DS_Store
--------------------------------------------------------------------------------
/Course_Notes.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuperJohn/spark-and-python-for-big-data-with-pyspark/2571210837c00e6315a9d93f0cd1dc35e2955375/Course_Notes.zip
--------------------------------------------------------------------------------
/Data Set Generator (remove me the future!)/DataSets/Facebook_metrics.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuperJohn/spark-and-python-for-big-data-with-pyspark/2571210837c00e6315a9d93f0cd1dc35e2955375/Data Set Generator (remove me the future!)/DataSets/Facebook_metrics.txt
--------------------------------------------------------------------------------
/Data Set Generator (remove me the future!)/DataSets/dog_food.csv:
--------------------------------------------------------------------------------
1 | A,B,C,D,Spoiled
2 | 4,2,12.0,3,1
3 | 5,6,12.0,7,1
4 | 6,2,13.0,6,1
5 | 4,2,12.0,1,1
6 | 4,2,12.0,3,1
7 | 10,3,13.0,9,1
8 | 8,5,14.0,5,1
9 | 5,8,12.0,8,1
10 | 6,5,12.0,9,1
11 | 3,3,12.0,1,1
12 | 9,8,11.0,3,1
13 | 1,10,12.0,3,1
14 | 1,5,13.0,10,1
15 | 2,10,12.0,6,1
16 | 1,10,11.0,4,1
17 | 5,3,12.0,2,1
18 | 4,9,11.0,8,1
19 | 5,1,11.0,1,1
20 | 4,9,12.0,10,1
21 | 5,8,10.0,9,1
22 | 5,7,11.0,9,1
23 | 4,10,13.0,8,1
24 | 10,5,12.0,9,1
25 | 2,4,13.0,4,1
26 | 1,4,13.0,10,1
27 | 1,8,12.0,1,1
28 | 2,10,13.0,4,1
29 | 6,2,12.0,4,1
30 | 8,2,13.0,3,1
31 | 6,4,12.0,2,1
32 | 3,2,11.0,9,1
33 | 10,6,12.0,10,1
34 | 9,5,13.0,3,1
35 | 9,2,12.0,5,1
36 | 2,6,13.0,9,1
37 | 4,2,12.0,10,1
38 | 4,3,12.0,6,1
39 | 7,1,12.0,1,1
40 | 1,7,11.0,10,1
41 | 9,2,11.0,10,1
42 | 2,6,12.0,2,1
43 | 9,4,11.0,5,1
44 | 6,2,11.0,10,1
45 | 3,10,11.0,4,1
46 | 6,9,11.0,2,1
47 | 10,6,11.0,9,1
48 | 6,7,11.0,9,1
49 | 7,2,13.0,8,1
50 | 9,2,13.0,5,1
51 | 8,7,12.0,6,1
52 | 9,1,12.0,9,1
53 | 3,5,14.0,3,1
54 | 7,1,11.0,3,1
55 | 5,9,12.0,7,1
56 | 3,10,12.0,7,1
57 | 9,8,13.0,9,1
58 | 10,9,12.0,9,1
59 | 10,7,11.0,2,1
60 | 10,3,11.0,1,1
61 | 2,4,11.0,8,1
62 | 10,3,13.0,4,1
63 | 5,1,14.0,8,1
64 | 8,8,11.0,4,1
65 | 4,8,14.0,1,1
66 | 5,1,12.0,7,1
67 | 6,8,11.0,2,1
68 | 1,1,13.0,3,1
69 | 9,3,12.0,10,1
70 | 6,1,11.0,7,1
71 | 7,5,10.0,1,1
72 | 10,2,12.0,2,1
73 | 2,3,13.0,1,1
74 | 5,8,12.0,2,1
75 | 10,6,12.0,10,1
76 | 9,1,11.0,6,1
77 | 10,10,14.0,7,1
78 | 1,5,12.0,10,1
79 | 10,1,11.0,2,1
80 | 1,1,12.0,2,1
81 | 10,3,13.0,7,1
82 | 1,6,11.0,10,1
83 | 9,4,12.0,3,1
84 | 10,9,12.0,5,1
85 | 10,8,11.0,2,1
86 | 5,3,9.0,2,1
87 | 3,7,12.0,10,1
88 | 4,9,12.0,8,1
89 | 5,1,11.0,2,1
90 | 10,9,11.0,9,1
91 | 10,7,11.0,6,1
92 | 8,2,13.0,10,1
93 | 7,7,11.0,3,1
94 | 9,10,11.0,5,1
95 | 5,2,12.0,8,1
96 | 1,1,10.0,8,1
97 | 5,5,12.0,8,1
98 | 9,6,12.0,1,1
99 | 4,6,12.0,2,1
100 | 1,1,12.0,4,1
101 | 9,3,11.0,10,1
102 | 3,2,12.0,6,1
103 | 2,4,11.0,9,1
104 | 8,1,12.0,10,1
105 | 10,6,11.0,6,1
106 | 8,9,12.0,2,1
107 | 2,3,12.0,3,1
108 | 4,6,14.0,4,1
109 | 3,4,12.0,4,1
110 | 9,5,12.0,5,1
111 | 10,5,13.0,2,1
112 | 8,2,10.0,6,1
113 | 10,5,11.0,2,1
114 | 10,1,11.0,3,1
115 | 7,6,13.0,3,1
116 | 8,9,14.0,4,1
117 | 8,8,14.0,7,1
118 | 1,9,11.0,10,1
119 | 2,9,10.0,3,1
120 | 4,9,13.0,4,1
121 | 10,10,12.0,7,1
122 | 8,9,12.0,7,1
123 | 9,7,12.0,1,1
124 | 3,6,13.0,5,1
125 | 4,5,12.0,3,1
126 | 1,7,11.0,9,1
127 | 4,6,12.0,9,1
128 | 8,10,13.0,3,1
129 | 5,4,12.0,5,1
130 | 9,4,12.0,6,1
131 | 3,4,12.0,5,1
132 | 7,7,11.0,4,1
133 | 6,2,12.0,6,1
134 | 2,8,11.0,1,1
135 | 4,4,10.0,3,1
136 | 3,7,12.0,9,1
137 | 10,3,12.0,7,1
138 | 3,1,12.0,7,1
139 | 2,4,13.0,10,1
140 | 6,3,12.0,2,1
141 | 7,2,14.0,4,1
142 | 4,2,8.0,9,0
143 | 4,8,9.0,1,0
144 | 10,8,8.0,6,0
145 | 8,6,9.0,4,0
146 | 7,2,7.0,8,0
147 | 3,3,9.0,5,0
148 | 4,10,8.0,9,0
149 | 4,7,10.0,7,0
150 | 1,7,8.0,2,0
151 | 10,7,8.0,5,0
152 | 10,5,9.0,1,0
153 | 5,7,10.0,10,0
154 | 2,8,6.0,9,0
155 | 4,1,7.0,5,0
156 | 4,6,9.0,7,0
157 | 2,2,9.0,8,0
158 | 6,7,6.0,9,0
159 | 5,7,7.0,2,0
160 | 7,1,7.0,5,0
161 | 8,1,8.0,3,0
162 | 1,6,8.0,1,0
163 | 4,5,9.0,8,0
164 | 8,10,8.0,3,0
165 | 4,9,8.0,2,0
166 | 2,9,6.0,4,0
167 | 8,10,8.0,9,0
168 | 3,6,8.0,1,0
169 | 5,6,9.0,8,0
170 | 5,2,8.0,10,0
171 | 9,7,6.0,7,0
172 | 3,8,6.0,10,0
173 | 3,3,8.0,9,0
174 | 3,4,10.0,2,0
175 | 6,8,8.0,9,0
176 | 1,4,8.0,7,0
177 | 6,9,7.0,10,0
178 | 10,6,8.0,6,0
179 | 9,4,7.0,10,0
180 | 9,2,10.0,3,0
181 | 6,8,8.0,6,0
182 | 10,5,7.0,4,0
183 | 4,8,8.0,7,0
184 | 5,6,6.0,9,0
185 | 2,1,10.0,7,0
186 | 6,4,7.0,4,0
187 | 6,8,9.0,4,0
188 | 3,3,8.0,3,0
189 | 3,5,10.0,6,0
190 | 3,3,9.0,9,0
191 | 7,7,8.0,9,0
192 | 6,8,7.0,10,0
193 | 7,3,7.0,7,0
194 | 5,7,9.0,2,0
195 | 4,9,8.0,10,0
196 | 9,9,7.0,4,0
197 | 6,9,6.0,1,0
198 | 4,2,10.0,10,0
199 | 8,10,8.0,3,0
200 | 1,7,8.0,4,0
201 | 3,2,9.0,1,0
202 | 9,9,9.0,6,0
203 | 4,10,5.0,4,0
204 | 9,3,7.0,5,0
205 | 9,1,9.0,3,0
206 | 4,6,7.0,2,0
207 | 4,5,8.0,5,0
208 | 5,7,6.0,6,0
209 | 10,6,9.0,3,0
210 | 6,6,8.0,10,0
211 | 3,7,9.0,7,0
212 | 8,10,8.0,2,0
213 | 5,2,8.0,3,0
214 | 5,7,7.0,5,0
215 | 10,9,8.0,2,0
216 | 4,4,8.0,7,0
217 | 1,4,9.0,6,0
218 | 8,2,9.0,10,0
219 | 9,6,9.0,5,0
220 | 7,6,7.0,7,0
221 | 1,2,9.0,4,0
222 | 1,8,7.0,10,0
223 | 6,2,8.0,9,0
224 | 9,5,7.0,8,0
225 | 8,7,8.0,6,0
226 | 5,7,8.0,9,0
227 | 8,4,9.0,1,0
228 | 6,1,9.0,3,0
229 | 9,7,8.0,9,0
230 | 2,9,7.0,10,0
231 | 2,4,8.0,5,0
232 | 10,3,8.0,8,0
233 | 7,9,8.0,8,0
234 | 6,6,8.0,2,0
235 | 1,5,8.0,10,0
236 | 10,1,9.0,9,0
237 | 8,1,9.0,2,0
238 | 10,9,8.0,6,0
239 | 5,10,7.0,1,0
240 | 3,6,7.0,8,0
241 | 4,10,10.0,5,0
242 | 2,1,7.0,9,0
243 | 9,2,9.0,9,0
244 | 3,9,8.0,9,0
245 | 2,3,6.0,9,0
246 | 3,9,8.0,6,0
247 | 10,7,9.0,1,0
248 | 10,10,6.0,4,0
249 | 8,5,9.0,5,0
250 | 7,2,8.0,1,0
251 | 7,2,8.0,9,0
252 | 6,9,7.0,2,0
253 | 1,4,9.0,3,0
254 | 10,9,9.0,10,0
255 | 4,3,8.0,8,0
256 | 8,7,6.0,6,0
257 | 5,7,8.0,3,0
258 | 8,6,8.0,3,0
259 | 3,2,6.0,10,0
260 | 4,2,6.0,5,0
261 | 10,6,8.0,7,0
262 | 3,6,8.0,3,0
263 | 2,2,8.0,1,0
264 | 1,9,10.0,6,0
265 | 9,6,8.0,7,0
266 | 4,5,9.0,5,0
267 | 3,5,8.0,6,0
268 | 4,5,8.0,10,0
269 | 9,4,9.0,4,0
270 | 9,4,7.0,6,0
271 | 7,6,8.0,10,0
272 | 9,10,11.0,2,0
273 | 3,4,9.0,5,0
274 | 2,10,9.0,2,0
275 | 10,9,8.0,2,0
276 | 4,6,9.0,4,0
277 | 4,10,7.0,10,0
278 | 9,1,9.0,8,0
279 | 3,10,8.0,6,0
280 | 8,5,9.0,3,0
281 | 8,5,7.0,5,0
282 | 1,8,6.0,6,0
283 | 8,8,6.0,8,0
284 | 4,8,7.0,3,0
285 | 9,3,8.0,7,0
286 | 10,8,7.0,3,0
287 | 2,10,6.0,4,0
288 | 2,5,9.0,5,0
289 | 10,7,9.0,4,0
290 | 3,10,9.0,8,0
291 | 9,2,7.0,3,0
292 | 7,4,6.0,4,0
293 | 3,4,8.0,7,0
294 | 4,7,8.0,3,0
295 | 10,9,8.0,10,0
296 | 4,6,5.0,6,0
297 | 10,2,9.0,7,0
298 | 9,8,9.0,10,0
299 | 7,10,8.0,2,0
300 | 5,5,6.0,1,0
301 | 8,4,7.0,6,0
302 | 5,5,7.0,9,0
303 | 7,2,9.0,9,0
304 | 9,4,9.0,3,0
305 | 5,5,7.0,3,0
306 | 2,7,7.0,4,0
307 | 4,5,9.0,8,0
308 | 1,8,8.0,6,0
309 | 5,6,9.0,5,0
310 | 3,6,8.0,3,0
311 | 7,2,9.0,5,0
312 | 10,9,10.0,6,0
313 | 4,7,10.0,6,0
314 | 1,9,9.0,7,0
315 | 1,7,7.0,2,0
316 | 1,9,7.0,5,0
317 | 2,8,9.0,4,0
318 | 5,4,8.0,2,0
319 | 1,7,7.0,6,0
320 | 2,1,8.0,9,0
321 | 2,6,9.0,4,0
322 | 1,6,8.0,9,0
323 | 1,4,8.0,5,0
324 | 10,6,8.0,5,0
325 | 6,4,6.0,4,0
326 | 2,1,9.0,1,0
327 | 8,6,9.0,10,0
328 | 5,6,7.0,9,0
329 | 10,10,7.0,1,0
330 | 2,9,10.0,6,0
331 | 9,6,10.0,2,0
332 | 3,5,9.0,3,0
333 | 5,10,8.0,3,0
334 | 1,3,9.0,8,0
335 | 8,8,8.0,7,0
336 | 6,1,8.0,3,0
337 | 4,9,9.0,2,0
338 | 2,9,10.0,3,0
339 | 1,5,8.0,5,0
340 | 5,6,8.0,8,0
341 | 6,10,9.0,2,0
342 | 9,6,8.0,9,0
343 | 1,8,8.0,7,0
344 | 8,2,8.0,8,0
345 | 3,6,8.0,5,0
346 | 9,2,9.0,6,0
347 | 7,10,5.0,6,0
348 | 2,5,8.0,3,0
349 | 9,2,10.0,7,0
350 | 5,9,8.0,9,0
351 | 1,6,8.0,3,0
352 | 7,4,8.0,3,0
353 | 8,5,8.0,5,0
354 | 5,9,7.0,3,0
355 | 9,6,8.0,5,0
356 | 3,1,8.0,5,0
357 | 5,8,9.0,9,0
358 | 2,5,8.0,3,0
359 | 5,6,8.0,6,0
360 | 2,5,8.0,1,0
361 | 6,2,11.0,10,0
362 | 2,6,6.0,9,0
363 | 4,4,6.0,8,0
364 | 2,7,8.0,9,0
365 | 5,2,7.0,9,0
366 | 6,10,8.0,3,0
367 | 4,6,7.0,5,0
368 | 2,8,8.0,6,0
369 | 6,2,8.0,3,0
370 | 8,10,9.0,8,0
371 | 5,9,8.0,5,0
372 | 9,2,9.0,8,0
373 | 5,10,8.0,6,0
374 | 10,6,8.0,3,0
375 | 6,6,9.0,6,0
376 | 6,3,10.0,5,0
377 | 1,3,8.0,5,0
378 | 2,3,9.0,3,0
379 | 2,6,8.0,8,0
380 | 8,4,9.0,10,0
381 | 8,7,6.0,7,0
382 | 2,6,8.0,10,0
383 | 7,2,9.0,3,0
384 | 7,9,6.0,2,0
385 | 2,10,8.0,8,0
386 | 5,2,9.0,9,0
387 | 2,8,9.0,10,0
388 | 8,4,6.0,8,0
389 | 7,3,10.0,7,0
390 | 9,9,8.0,7,0
391 | 8,4,8.0,1,0
392 | 9,2,6.0,8,0
393 | 8,6,8.0,2,0
394 | 9,7,8.0,2,0
395 | 4,3,9.0,6,0
396 | 2,1,8.0,9,0
397 | 9,4,7.0,9,0
398 | 4,2,9.0,2,0
399 | 10,3,8.0,2,0
400 | 9,2,10.0,5,0
401 | 10,7,7.0,7,0
402 | 2,3,7.0,10,0
403 | 10,1,7.0,4,0
404 | 3,3,7.0,5,0
405 | 10,1,7.0,4,0
406 | 5,4,8.0,7,0
407 | 7,3,7.0,8,0
408 | 10,9,7.0,4,0
409 | 5,7,8.0,9,0
410 | 5,9,7.0,5,0
411 | 4,6,7.0,5,0
412 | 4,2,8.0,9,0
413 | 8,3,7.0,4,0
414 | 3,5,9.0,6,0
415 | 4,3,8.0,10,0
416 | 1,6,7.0,8,0
417 | 8,5,8.0,6,0
418 | 9,10,7.0,6,0
419 | 8,9,8.0,1,0
420 | 9,10,8.0,8,0
421 | 3,10,8.0,2,0
422 | 8,10,10.0,7,0
423 | 2,1,10.0,7,0
424 | 5,10,8.0,8,0
425 | 4,9,7.0,7,0
426 | 9,3,7.0,7,0
427 | 5,7,8.0,6,0
428 | 8,7,9.0,3,0
429 | 2,2,7.0,8,0
430 | 6,6,9.0,9,0
431 | 4,2,8.0,4,0
432 | 3,9,7.0,9,0
433 | 7,9,6.0,5,0
434 | 5,3,7.0,5,0
435 | 4,4,9.0,1,0
436 | 6,9,8.0,5,0
437 | 10,10,8.0,1,0
438 | 2,6,8.0,6,0
439 | 10,10,9.0,5,0
440 | 5,9,9.0,6,0
441 | 3,2,8.0,9,0
442 | 10,10,9.0,3,0
443 | 4,7,9.0,4,0
444 | 4,4,7.0,1,0
445 | 5,8,8.0,5,0
446 | 2,3,8.0,3,0
447 | 6,4,9.0,2,0
448 | 2,9,9.0,10,0
449 | 3,6,8.0,2,0
450 | 3,2,10.0,10,0
451 | 2,2,8.0,1,0
452 | 9,6,9.0,1,0
453 | 6,5,6.0,2,0
454 | 3,6,8.0,1,0
455 | 3,3,8.0,6,0
456 | 2,10,9.0,2,0
457 | 8,9,8.0,9,0
458 | 7,4,10.0,4,0
459 | 6,6,7.0,8,0
460 | 5,3,7.0,7,0
461 | 6,7,7.0,6,0
462 | 9,1,9.0,5,0
463 | 10,9,9.0,1,0
464 | 10,4,8.0,3,0
465 | 1,2,9.0,1,0
466 | 2,1,9.0,1,0
467 | 6,1,7.0,9,0
468 | 1,5,8.0,3,0
469 | 2,8,8.0,4,0
470 | 1,8,8.0,8,0
471 | 3,1,9.0,7,0
472 | 3,9,7.0,6,0
473 | 8,1,7.0,4,0
474 | 10,4,9.0,8,0
475 | 2,5,7.0,6,0
476 | 10,6,8.0,5,0
477 | 6,1,9.0,7,0
478 | 6,10,7.0,10,0
479 | 2,10,8.0,3,0
480 | 1,4,8.0,1,0
481 | 8,9,9.0,4,0
482 | 10,10,7.0,4,0
483 | 8,3,7.0,9,0
484 | 2,2,9.0,8,0
485 | 9,5,10.0,10,0
486 | 2,2,6.0,10,0
487 | 8,3,6.0,6,0
488 | 6,4,9.0,10,0
489 | 1,3,8.0,3,0
490 | 6,6,8.0,3,0
491 | 1,9,7.0,4,0
492 |
--------------------------------------------------------------------------------
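
A note on dog_food.csv (above): columns A-D are preservative amounts and Spoiled is a 0/1 label; eyeballing the dump, high values of C line up almost perfectly with Spoiled = 1. The minimal PySpark sketch below (not part of the repo; it mirrors the Tree_Methods consulting project pattern) surfaces this via random forest feature importances:

    from pyspark.sql import SparkSession
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.classification import RandomForestClassifier

    spark = SparkSession.builder.appName('dog_food').getOrCreate()

    # header=True keeps the A,B,C,D,Spoiled names; inferSchema=True types them
    data = spark.read.csv('dog_food.csv', header=True, inferSchema=True)

    # fold the four preservative columns into a single 'features' vector
    assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'], outputCol='features')

    rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features')
    model = rfc.fit(assembler.transform(data))

    # importances are indexed in inputCols order (0=A ... 3=D); expect C to dominate
    print(model.featureImportances)
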
/Data Set Generator (remove me the future!)/fake_customers.csv:
--------------------------------------------------------------------------------
1 | Names,Age,Phone,Location,Company,Lot,Sales
2 | Chelsea Taylor,46.0,1-431-660-1615x8629,"064 Stone Neck Apt. 766
3 | East Debrabury, FM 63246",Bentley-Waller,07 bz,0
4 | Pamela Williams,38.0,(101)883-0724x491,"5182 Emily Spurs
5 | West Lindsey, PA 79975",Gomez Group,21 cB,0
6 | Kristi Sandoval,41.0,+99(4)3518374928,"367 Nelson Gardens Apt. 209
7 | Ochoaview, MT 25437","Thomas, Brown and Stewart",25 to,0
8 | Ashley Morris,45.0,939-770-5901x336,"66532 Harris Loop
9 | West Susan, PR 68272-6257","Banks, Mendez and Reyes",46 rn,0
10 | Dwayne Nguyen,48.0,468-328-7711,"418 Martin Mall
11 | New John, MN 64235",Phelps-Bentley,97 lr,0
12 | Benjamin Nelson,43.0,257.443.9817x9922,"Unit 2069 Box 9542
13 | DPO AA 81875-0608",Madden-Murphy,76 YB,0
14 | Tanya Mcdonald,40.0,985.525.6864x365,"PSC 1888, Box 7629
15 | APO AE 68066-4189",Morgan-Wilson,74 HU,0
16 | Ashley Mullins,34.0,231-482-7034x4744,"9819 Flores Orchard Apt. 954
17 | Markchester, NE 71752-6833","Hall, Romero and Marshall",75 Ty,0
18 | David Hutchinson,39.0,932.142.2276,"Unit 8564 Box 6806
19 | DPO AE 41715",Hanna Ltd,84 Ho,0
20 | Kayla Arnold,31.0,550.464.0343x938,"9296 Matthew Oval Apt. 429
21 | Thomasborough, NJ 22056-5974",Bradley-Schwartz,74 lz,0
22 | Nathan Castaneda,37.0,498.517.0898x258,"02452 Dawn Tunnel Apt. 012
23 | Rodriguezmouth, MA 80967-6806",Young and Sons,51 AM,0
24 | Keith Nelson,46.0,1-434-023-4677,"6309 Dustin Heights
25 | Joseville, UT 00298-1977",Rodriguez Ltd,32 yr,0
26 | Kathleen Weaver,22.0,920-001-7389,"822 Smith Lodge Apt. 921
27 | Tonichester, KY 49154","Key, Johnson and Hunt",72 Uv,0
28 | Kevin Thomas,37.0,(536)901-0070x33732,"Unit 8732 Box 8363
29 | DPO AA 80979-6530",Patterson-Burton,69 mk,0
30 | Seth Lutz,38.0,1-689-306-8881x37712,"510 Michael Field
31 | East Kimberly, DE 21409",Kelley Inc,29 Ts,0
32 |
--------------------------------------------------------------------------------
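
Note that the quoted Location values in fake_customers.csv span multiple physical lines. A naive line-oriented reader will split those records apart; the hedged sketch below (assuming Spark 2.2+, where the multiLine CSV option is available) parses them correctly:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName('fake_customers').getOrCreate()

    # multiLine lets quoted fields span physical lines; without it, each
    # embedded newline in Location would start a bogus new record
    df = spark.read.csv('fake_customers.csv', header=True, inferSchema=True,
                        multiLine=True)
    df.select('Names', 'Location', 'Company').show(3, truncate=False)
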
/Data Set Generator (remove me the future!)/new_customers.csv:
--------------------------------------------------------------------------------
1 | Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company
2 | Andrew Mccall,37.0,9935.53,1,7.71,8.0,2011-08-29 18:37:54,"38612 Johnny Stravenue
3 | Nataliebury, WI 15717-8316",King Ltd,
4 | Michele Wright,23.0,7526.94,1,9.28,15.0,2013-07-22 18:19:54,"21083 Nicole Junction Suite 332
5 | Youngport, ME 23686-4381",Cannon-Benson
6 | Jeremy Chang,65.0,100.0,1,1.0,15.0,2006-12-11 07:48:13,"085 Austin Views
7 | Lake Julialand, WY 63726-4298",Barron-Robertson
8 | Megan Ferguson,32.0,6487.5,0,9.4,14.0,2016-10-28 05:32:13,"922 Wright Branch
9 | North Cynthialand, NC 64721",Sexton-Golden
10 | Taylor Young,32.0,13147.71,1,10.0,8.0,2012-03-20 00:36:46,"Unit 0789 Box 0734
11 | DPO AP 39702",Wood LLC,
12 | Jessica Drake,22.0,8445.26,1,3.46,14.0,2011-02-04 19:29:27,"1148 Tina Stravenue Apt. 978
13 | South Carlos, TX 21222-9221",Parks-Robbins,
14 |
--------------------------------------------------------------------------------
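
new_customers.csv has the churn-project schema but no label, so it serves as the scoring set for the Logistic Regression consulting project. A hedged sketch of the intended flow (assuming customer_churn.csv carries the same columns plus a 0/1 Churn label, as in the course data):

    from pyspark.sql import SparkSession
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.classification import LogisticRegression

    spark = SparkSession.builder.appName('churn').getOrCreate()

    train = spark.read.csv('customer_churn.csv', header=True, inferSchema=True,
                           multiLine=True)
    new = spark.read.csv('new_customers.csv', header=True, inferSchema=True,
                         multiLine=True)

    # numeric columns shared by both files become the feature vector
    assembler = VectorAssembler(
        inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'],
        outputCol='features')

    lr = LogisticRegression(labelCol='Churn', featuresCol='features')
    model = lr.fit(assembler.transform(train))

    # prediction == 1.0 flags the new accounts that look likely to churn
    model.transform(assembler.transform(new)).select('Names', 'prediction').show()
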
/Python-Crash-Course/.ipynb_checkpoints/Python Crash Course Exercises - Solutions-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Python Crash Course Exercises - Solutions\n",
8 | "\n",
9 | "This is an optional exercise to test your understanding of Python Basics. If you find this extremely challenging, then you probably are not ready for the rest of this course yet and don't have enough programming experience to continue. I would suggest you take another course more geared towards complete beginners, such as [Complete Python Bootcamp]()"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "## Exercises\n",
17 | "\n",
18 | "Answer the questions or complete the tasks outlined in bold below, use the specific method described if applicable."
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "** What is 7 to the power of 4?**"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 1,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "2401"
39 | ]
40 | },
41 | "execution_count": 1,
42 | "metadata": {},
43 | "output_type": "execute_result"
44 | }
45 | ],
46 | "source": [
47 | "7 **4"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "** Split this string:**\n",
55 | "\n",
56 | " s = \"Hi there Sam!\"\n",
57 | " \n",
58 | "**into a list. **"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {
65 | "collapsed": true
66 | },
67 | "outputs": [],
68 | "source": [
69 | "s = 'Hi there Sam!'"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 3,
75 | "metadata": {
76 | "collapsed": false
77 | },
78 | "outputs": [
79 | {
80 | "data": {
81 | "text/plain": [
82 | "['Hi', 'there', 'dad!']"
83 | ]
84 | },
85 | "execution_count": 3,
86 | "metadata": {},
87 | "output_type": "execute_result"
88 | }
89 | ],
90 | "source": [
91 | "s.split()"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "** Given the variables:**\n",
99 | "\n",
100 | " planet = \"Earth\"\n",
101 | " diameter = 12742\n",
102 | "\n",
103 | "** Use .format() to print the following string: **\n",
104 | "\n",
105 | " The diameter of Earth is 12742 kilometers."
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 5,
111 | "metadata": {
112 | "collapsed": true
113 | },
114 | "outputs": [],
115 | "source": [
116 | "planet = \"Earth\"\n",
117 | "diameter = 12742"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 6,
123 | "metadata": {
124 | "collapsed": false
125 | },
126 | "outputs": [
127 | {
128 | "name": "stdout",
129 | "output_type": "stream",
130 | "text": [
131 | "The diameter of Earth is 12742 kilometers.\n"
132 | ]
133 | }
134 | ],
135 | "source": [
136 | "print(\"The diameter of {} is {} kilometers.\".format(planet,diameter))"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "** Given this nested list, use indexing to grab the word \"hello\" **"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 7,
149 | "metadata": {
150 | "collapsed": true
151 | },
152 | "outputs": [],
153 | "source": [
154 | "lst = [1,2,[3,4],[5,[100,200,['hello']],23,11],1,7]"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 14,
160 | "metadata": {
161 | "collapsed": false
162 | },
163 | "outputs": [
164 | {
165 | "data": {
166 | "text/plain": [
167 | "'hello'"
168 | ]
169 | },
170 | "execution_count": 14,
171 | "metadata": {},
172 | "output_type": "execute_result"
173 | }
174 | ],
175 | "source": [
176 | "lst[3][1][2][0]"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "** Given this nest dictionary grab the word \"hello\". Be prepared, this will be annoying/tricky **"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 16,
189 | "metadata": {
190 | "collapsed": false
191 | },
192 | "outputs": [],
193 | "source": [
194 | "d = {'k1':[1,2,3,{'tricky':['oh','man','inception',{'target':[1,2,3,'hello']}]}]}"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 22,
200 | "metadata": {
201 | "collapsed": false
202 | },
203 | "outputs": [
204 | {
205 | "data": {
206 | "text/plain": [
207 | "'hello'"
208 | ]
209 | },
210 | "execution_count": 22,
211 | "metadata": {},
212 | "output_type": "execute_result"
213 | }
214 | ],
215 | "source": [
216 | "d['k1'][3]['tricky'][3]['target'][3]"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "** What is the main difference between a tuple and a list? **"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 23,
229 | "metadata": {
230 | "collapsed": true
231 | },
232 | "outputs": [],
233 | "source": [
234 | "# Just answer with text, no code necessary"
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {},
240 | "source": [
241 | "** Create a function that grabs the email website domain from a string in the form: **\n",
242 | "\n",
243 | " user@domain.com\n",
244 | " \n",
245 | "**So for example, passing \"user@domain.com\" would return: domain.com**"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 24,
251 | "metadata": {
252 | "collapsed": true
253 | },
254 | "outputs": [],
255 | "source": [
256 | "def domainGet(email):\n",
257 | " return email.split('@')[-1]"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 26,
263 | "metadata": {
264 | "collapsed": false
265 | },
266 | "outputs": [
267 | {
268 | "data": {
269 | "text/plain": [
270 | "'domain.com'"
271 | ]
272 | },
273 | "execution_count": 26,
274 | "metadata": {},
275 | "output_type": "execute_result"
276 | }
277 | ],
278 | "source": [
279 | "domainGet('user@domain.com')"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "** Create a basic function that returns True if the word 'dog' is contained in the input string. Don't worry about edge cases like a punctuation being attached to the word dog, but do account for capitalization. **"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 27,
292 | "metadata": {
293 | "collapsed": true
294 | },
295 | "outputs": [],
296 | "source": [
297 | "def findDog(st):\n",
298 | " return 'dog' in st.lower().split()"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 28,
304 | "metadata": {
305 | "collapsed": false
306 | },
307 | "outputs": [
308 | {
309 | "data": {
310 | "text/plain": [
311 | "True"
312 | ]
313 | },
314 | "execution_count": 28,
315 | "metadata": {},
316 | "output_type": "execute_result"
317 | }
318 | ],
319 | "source": [
320 | "findDog('Is there a dog here?')"
321 | ]
322 | },
323 | {
324 | "cell_type": "markdown",
325 | "metadata": {},
326 | "source": [
327 | "** Create a function that counts the number of times the word \"dog\" occurs in a string. Again ignore edge cases. **"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 30,
333 | "metadata": {
334 | "collapsed": false
335 | },
336 | "outputs": [],
337 | "source": [
338 | "def countDog(st):\n",
339 | " count = 0\n",
340 | " for word in st.lower().split():\n",
341 | " if word == 'dog':\n",
342 | " count += 1\n",
343 | " return count"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 31,
349 | "metadata": {
350 | "collapsed": false
351 | },
352 | "outputs": [
353 | {
354 | "data": {
355 | "text/plain": [
356 | "2"
357 | ]
358 | },
359 | "execution_count": 31,
360 | "metadata": {},
361 | "output_type": "execute_result"
362 | }
363 | ],
364 | "source": [
365 | "countDog('This dog runs faster than the other dog dude!')"
366 | ]
367 | },
368 | {
369 | "cell_type": "markdown",
370 | "metadata": {},
371 | "source": [
372 | "### Final Problem\n",
373 | "**You are driving a little too fast, and a police officer stops you. Write a function\n",
374 | " to return one of 3 possible results: \"No ticket\", \"Small ticket\", or \"Big Ticket\". \n",
375 | " If your speed is 60 or less, the result is \"No Ticket\". If speed is between 61 \n",
376 | " and 80 inclusive, the result is \"Small Ticket\". If speed is 81 or more, the result is \"Big Ticket\". Unless it is your birthday (encoded as a boolean value in the parameters of the function) -- on your birthday, your speed can be 5 higher in all \n",
377 | " cases. **"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 4,
383 | "metadata": {
384 | "collapsed": true
385 | },
386 | "outputs": [],
387 | "source": [
388 | "def caught_speeding(speed, is_birthday):\n",
389 | " \n",
390 | " if is_birthday:\n",
391 | " speeding = speed - 5\n",
392 | " else:\n",
393 | " speeding = speed\n",
394 | " \n",
395 | " if speeding > 80:\n",
396 | " return 'Big Ticket'\n",
397 | " elif speeding > 60:\n",
398 | " return 'Small Ticket'\n",
399 | " else:\n",
400 | " return 'No Ticket'"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": 5,
406 | "metadata": {
407 | "collapsed": false
408 | },
409 | "outputs": [
410 | {
411 | "data": {
412 | "text/plain": [
413 | "'Small Ticket'"
414 | ]
415 | },
416 | "execution_count": 5,
417 | "metadata": {},
418 | "output_type": "execute_result"
419 | }
420 | ],
421 | "source": [
422 | "caught_speeding(81,True)"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": 6,
428 | "metadata": {
429 | "collapsed": false
430 | },
431 | "outputs": [
432 | {
433 | "data": {
434 | "text/plain": [
435 | "'Big Ticket'"
436 | ]
437 | },
438 | "execution_count": 6,
439 | "metadata": {},
440 | "output_type": "execute_result"
441 | }
442 | ],
443 | "source": [
444 | "caught_speeding(81,False)"
445 | ]
446 | },
447 | {
448 | "cell_type": "markdown",
449 | "metadata": {},
450 | "source": [
451 | "# Great job!"
452 | ]
453 | }
454 | ],
455 | "metadata": {
456 | "anaconda-cloud": {},
457 | "kernelspec": {
458 | "display_name": "Python [default]",
459 | "language": "python",
460 | "name": "python3"
461 | },
462 | "language_info": {
463 | "codemirror_mode": {
464 | "name": "ipython",
465 | "version": 3
466 | },
467 | "file_extension": ".py",
468 | "mimetype": "text/x-python",
469 | "name": "python",
470 | "nbconvert_exporter": "python",
471 | "pygments_lexer": "ipython3",
472 | "version": "3.5.3"
473 | }
474 | },
475 | "nbformat": 4,
476 | "nbformat_minor": 0
477 | }
478 |
--------------------------------------------------------------------------------
/Python-Crash-Course/.ipynb_checkpoints/Python Crash Course Exercises-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Python Crash Course Exercises \n",
8 | "\n",
9 | "This is an optional exercise to test your understanding of Python Basics. If you find this extremely challenging, then you probably are not ready for the rest of this course yet and don't have enough programming experience to continue. I would suggest you take another course more geared towards complete beginners, such as [Complete Python Bootcamp]()"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "## Exercises\n",
17 | "\n",
18 | "Answer the questions or complete the tasks outlined in bold below, use the specific method described if applicable."
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "** What is 7 to the power of 4?**"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 1,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "2401"
39 | ]
40 | },
41 | "execution_count": 1,
42 | "metadata": {},
43 | "output_type": "execute_result"
44 | }
45 | ],
46 | "source": []
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "** Split this string:**\n",
53 | "\n",
54 | " s = \"Hi there Sam!\"\n",
55 | " \n",
56 | "**into a list. **"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 4,
62 | "metadata": {
63 | "collapsed": true
64 | },
65 | "outputs": [],
66 | "source": []
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 3,
71 | "metadata": {
72 | "collapsed": false
73 | },
74 | "outputs": [
75 | {
76 | "data": {
77 | "text/plain": [
78 | "['Hi', 'there', 'dad!']"
79 | ]
80 | },
81 | "execution_count": 3,
82 | "metadata": {},
83 | "output_type": "execute_result"
84 | }
85 | ],
86 | "source": []
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "** Given the variables:**\n",
93 | "\n",
94 | " planet = \"Earth\"\n",
95 | " diameter = 12742\n",
96 | "\n",
97 | "** Use .format() to print the following string: **\n",
98 | "\n",
99 | " The diameter of Earth is 12742 kilometers."
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 5,
105 | "metadata": {
106 | "collapsed": true
107 | },
108 | "outputs": [],
109 | "source": [
110 | "planet = \"Earth\"\n",
111 | "diameter = 12742"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 6,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [
121 | {
122 | "name": "stdout",
123 | "output_type": "stream",
124 | "text": [
125 | "The diameter of Earth is 12742 kilometers.\n"
126 | ]
127 | }
128 | ],
129 | "source": []
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "** Given this nested list, use indexing to grab the word \"hello\" **"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 7,
141 | "metadata": {
142 | "collapsed": true
143 | },
144 | "outputs": [],
145 | "source": [
146 | "lst = [1,2,[3,4],[5,[100,200,['hello']],23,11],1,7]"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 14,
152 | "metadata": {
153 | "collapsed": false
154 | },
155 | "outputs": [
156 | {
157 | "data": {
158 | "text/plain": [
159 | "'hello'"
160 | ]
161 | },
162 | "execution_count": 14,
163 | "metadata": {},
164 | "output_type": "execute_result"
165 | }
166 | ],
167 | "source": []
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "** Given this nest dictionary grab the word \"hello\". Be prepared, this will be annoying/tricky **"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 16,
179 | "metadata": {
180 | "collapsed": false
181 | },
182 | "outputs": [],
183 | "source": [
184 | "d = {'k1':[1,2,3,{'tricky':['oh','man','inception',{'target':[1,2,3,'hello']}]}]}"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 22,
190 | "metadata": {
191 | "collapsed": false
192 | },
193 | "outputs": [
194 | {
195 | "data": {
196 | "text/plain": [
197 | "'hello'"
198 | ]
199 | },
200 | "execution_count": 22,
201 | "metadata": {},
202 | "output_type": "execute_result"
203 | }
204 | ],
205 | "source": []
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "** What is the main difference between a tuple and a list? **"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 23,
217 | "metadata": {
218 | "collapsed": true
219 | },
220 | "outputs": [],
221 | "source": [
222 | "# Just answer with text, no code necessary"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {},
228 | "source": [
229 | "** Create a function that grabs the email website domain from a string in the form: **\n",
230 | "\n",
231 | " user@domain.com\n",
232 | " \n",
233 | "**So for example, passing \"user@domain.com\" would return: domain.com**"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 24,
239 | "metadata": {
240 | "collapsed": true
241 | },
242 | "outputs": [],
243 | "source": []
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 26,
248 | "metadata": {
249 | "collapsed": false
250 | },
251 | "outputs": [
252 | {
253 | "data": {
254 | "text/plain": [
255 | "'domain.com'"
256 | ]
257 | },
258 | "execution_count": 26,
259 | "metadata": {},
260 | "output_type": "execute_result"
261 | }
262 | ],
263 | "source": []
264 | },
265 | {
266 | "cell_type": "markdown",
267 | "metadata": {},
268 | "source": [
269 | "** Create a basic function that returns True if the word 'dog' is contained in the input string. Don't worry about edge cases like a punctuation being attached to the word dog, but do account for capitalization. **"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 27,
275 | "metadata": {
276 | "collapsed": true
277 | },
278 | "outputs": [],
279 | "source": []
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 28,
284 | "metadata": {
285 | "collapsed": false
286 | },
287 | "outputs": [
288 | {
289 | "data": {
290 | "text/plain": [
291 | "True"
292 | ]
293 | },
294 | "execution_count": 28,
295 | "metadata": {},
296 | "output_type": "execute_result"
297 | }
298 | ],
299 | "source": []
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "** Create a function that counts the number of times the word \"dog\" occurs in a string. Again ignore edge cases. **"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": 30,
311 | "metadata": {
312 | "collapsed": false
313 | },
314 | "outputs": [],
315 | "source": []
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 31,
320 | "metadata": {
321 | "collapsed": false
322 | },
323 | "outputs": [
324 | {
325 | "data": {
326 | "text/plain": [
327 | "2"
328 | ]
329 | },
330 | "execution_count": 31,
331 | "metadata": {},
332 | "output_type": "execute_result"
333 | }
334 | ],
335 | "source": []
336 | },
337 | {
338 | "cell_type": "markdown",
339 | "metadata": {},
340 | "source": [
341 | "### Final Problem\n",
342 | "**You are driving a little too fast, and a police officer stops you. Write a function\n",
343 | " to return one of 3 possible results: \"No ticket\", \"Small ticket\", or \"Big Ticket\". \n",
344 | " If your speed is 60 or less, the result is \"No Ticket\". If speed is between 61 \n",
345 | " and 80 inclusive, the result is \"Small Ticket\". If speed is 81 or more, the result is \"Big Ticket\". Unless it is your birthday (encoded as a boolean value in the parameters of the function) -- on your birthday, your speed can be 5 higher in all \n",
346 | " cases. **"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": 4,
352 | "metadata": {
353 | "collapsed": true
354 | },
355 | "outputs": [],
356 | "source": []
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": 5,
361 | "metadata": {
362 | "collapsed": false
363 | },
364 | "outputs": [
365 | {
366 | "data": {
367 | "text/plain": [
368 | "'Small Ticket'"
369 | ]
370 | },
371 | "execution_count": 5,
372 | "metadata": {},
373 | "output_type": "execute_result"
374 | }
375 | ],
376 | "source": []
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 6,
381 | "metadata": {
382 | "collapsed": false
383 | },
384 | "outputs": [
385 | {
386 | "data": {
387 | "text/plain": [
388 | "'Big Ticket'"
389 | ]
390 | },
391 | "execution_count": 6,
392 | "metadata": {},
393 | "output_type": "execute_result"
394 | }
395 | ],
396 | "source": []
397 | },
398 | {
399 | "cell_type": "markdown",
400 | "metadata": {},
401 | "source": [
402 | "# Great job!"
403 | ]
404 | }
405 | ],
406 | "metadata": {
407 | "anaconda-cloud": {},
408 | "kernelspec": {
409 | "display_name": "Python [default]",
410 | "language": "python",
411 | "name": "python3"
412 | },
413 | "language_info": {
414 | "codemirror_mode": {
415 | "name": "ipython",
416 | "version": 3
417 | },
418 | "file_extension": ".py",
419 | "mimetype": "text/x-python",
420 | "name": "python",
421 | "nbconvert_exporter": "python",
422 | "pygments_lexer": "ipython3",
423 | "version": "3.5.3"
424 | }
425 | },
426 | "nbformat": 4,
427 | "nbformat_minor": 0
428 | }
429 |
--------------------------------------------------------------------------------
/Python-Crash-Course/Python Crash Course Exercises - Solutions.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Python Crash Course Exercises - Solutions\n",
8 | "\n",
9 | "This is an optional exercise to test your understanding of Python Basics. If you find this extremely challenging, then you probably are not ready for the rest of this course yet and don't have enough programming experience to continue. I would suggest you take another course more geared towards complete beginners, such as [Complete Python Bootcamp]()"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "## Exercises\n",
17 | "\n",
18 | "Answer the questions or complete the tasks outlined in bold below, use the specific method described if applicable."
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "** What is 7 to the power of 4?**"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 1,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "2401"
39 | ]
40 | },
41 | "execution_count": 1,
42 | "metadata": {},
43 | "output_type": "execute_result"
44 | }
45 | ],
46 | "source": [
47 | "7 **4"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "** Split this string:**\n",
55 | "\n",
56 | " s = \"Hi there Sam!\"\n",
57 | " \n",
58 | "**into a list. **"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {
65 | "collapsed": true
66 | },
67 | "outputs": [],
68 | "source": [
69 | "s = 'Hi there Sam!'"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 3,
75 | "metadata": {
76 | "collapsed": false
77 | },
78 | "outputs": [
79 | {
80 | "data": {
81 | "text/plain": [
82 | "['Hi', 'there', 'dad!']"
83 | ]
84 | },
85 | "execution_count": 3,
86 | "metadata": {},
87 | "output_type": "execute_result"
88 | }
89 | ],
90 | "source": [
91 | "s.split()"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "** Given the variables:**\n",
99 | "\n",
100 | " planet = \"Earth\"\n",
101 | " diameter = 12742\n",
102 | "\n",
103 | "** Use .format() to print the following string: **\n",
104 | "\n",
105 | " The diameter of Earth is 12742 kilometers."
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 5,
111 | "metadata": {
112 | "collapsed": true
113 | },
114 | "outputs": [],
115 | "source": [
116 | "planet = \"Earth\"\n",
117 | "diameter = 12742"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 6,
123 | "metadata": {
124 | "collapsed": false
125 | },
126 | "outputs": [
127 | {
128 | "name": "stdout",
129 | "output_type": "stream",
130 | "text": [
131 | "The diameter of Earth is 12742 kilometers.\n"
132 | ]
133 | }
134 | ],
135 | "source": [
136 | "print(\"The diameter of {} is {} kilometers.\".format(planet,diameter))"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "** Given this nested list, use indexing to grab the word \"hello\" **"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 7,
149 | "metadata": {
150 | "collapsed": true
151 | },
152 | "outputs": [],
153 | "source": [
154 | "lst = [1,2,[3,4],[5,[100,200,['hello']],23,11],1,7]"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 14,
160 | "metadata": {
161 | "collapsed": false
162 | },
163 | "outputs": [
164 | {
165 | "data": {
166 | "text/plain": [
167 | "'hello'"
168 | ]
169 | },
170 | "execution_count": 14,
171 | "metadata": {},
172 | "output_type": "execute_result"
173 | }
174 | ],
175 | "source": [
176 | "lst[3][1][2][0]"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "** Given this nest dictionary grab the word \"hello\". Be prepared, this will be annoying/tricky **"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 16,
189 | "metadata": {
190 | "collapsed": false
191 | },
192 | "outputs": [],
193 | "source": [
194 | "d = {'k1':[1,2,3,{'tricky':['oh','man','inception',{'target':[1,2,3,'hello']}]}]}"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 22,
200 | "metadata": {
201 | "collapsed": false
202 | },
203 | "outputs": [
204 | {
205 | "data": {
206 | "text/plain": [
207 | "'hello'"
208 | ]
209 | },
210 | "execution_count": 22,
211 | "metadata": {},
212 | "output_type": "execute_result"
213 | }
214 | ],
215 | "source": [
216 | "d['k1'][3]['tricky'][3]['target'][3]"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "** What is the main difference between a tuple and a list? **"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 23,
229 | "metadata": {
230 | "collapsed": true
231 | },
232 | "outputs": [],
233 | "source": [
234 | "# Just answer with text, no code necessary"
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {},
240 | "source": [
241 | "** Create a function that grabs the email website domain from a string in the form: **\n",
242 | "\n",
243 | " user@domain.com\n",
244 | " \n",
245 | "**So for example, passing \"user@domain.com\" would return: domain.com**"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 24,
251 | "metadata": {
252 | "collapsed": true
253 | },
254 | "outputs": [],
255 | "source": [
256 | "def domainGet(email):\n",
257 | " return email.split('@')[-1]"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 26,
263 | "metadata": {
264 | "collapsed": false
265 | },
266 | "outputs": [
267 | {
268 | "data": {
269 | "text/plain": [
270 | "'domain.com'"
271 | ]
272 | },
273 | "execution_count": 26,
274 | "metadata": {},
275 | "output_type": "execute_result"
276 | }
277 | ],
278 | "source": [
279 | "domainGet('user@domain.com')"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "** Create a basic function that returns True if the word 'dog' is contained in the input string. Don't worry about edge cases like a punctuation being attached to the word dog, but do account for capitalization. **"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 27,
292 | "metadata": {
293 | "collapsed": true
294 | },
295 | "outputs": [],
296 | "source": [
297 | "def findDog(st):\n",
298 | " return 'dog' in st.lower().split()"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 28,
304 | "metadata": {
305 | "collapsed": false
306 | },
307 | "outputs": [
308 | {
309 | "data": {
310 | "text/plain": [
311 | "True"
312 | ]
313 | },
314 | "execution_count": 28,
315 | "metadata": {},
316 | "output_type": "execute_result"
317 | }
318 | ],
319 | "source": [
320 | "findDog('Is there a dog here?')"
321 | ]
322 | },
323 | {
324 | "cell_type": "markdown",
325 | "metadata": {},
326 | "source": [
327 | "** Create a function that counts the number of times the word \"dog\" occurs in a string. Again ignore edge cases. **"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 30,
333 | "metadata": {
334 | "collapsed": false
335 | },
336 | "outputs": [],
337 | "source": [
338 | "def countDog(st):\n",
339 | " count = 0\n",
340 | " for word in st.lower().split():\n",
341 | " if word == 'dog':\n",
342 | " count += 1\n",
343 | " return count"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 31,
349 | "metadata": {
350 | "collapsed": false
351 | },
352 | "outputs": [
353 | {
354 | "data": {
355 | "text/plain": [
356 | "2"
357 | ]
358 | },
359 | "execution_count": 31,
360 | "metadata": {},
361 | "output_type": "execute_result"
362 | }
363 | ],
364 | "source": [
365 | "countDog('This dog runs faster than the other dog dude!')"
366 | ]
367 | },
368 | {
369 | "cell_type": "markdown",
370 | "metadata": {},
371 | "source": [
372 | "### Final Problem\n",
373 | "**You are driving a little too fast, and a police officer stops you. Write a function\n",
374 | " to return one of 3 possible results: \"No ticket\", \"Small ticket\", or \"Big Ticket\". \n",
375 | " If your speed is 60 or less, the result is \"No Ticket\". If speed is between 61 \n",
376 | " and 80 inclusive, the result is \"Small Ticket\". If speed is 81 or more, the result is \"Big Ticket\". Unless it is your birthday (encoded as a boolean value in the parameters of the function) -- on your birthday, your speed can be 5 higher in all \n",
377 | " cases. **"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 4,
383 | "metadata": {
384 | "collapsed": true
385 | },
386 | "outputs": [],
387 | "source": [
388 | "def caught_speeding(speed, is_birthday):\n",
389 | " \n",
390 | " if is_birthday:\n",
391 | " speeding = speed - 5\n",
392 | " else:\n",
393 | " speeding = speed\n",
394 | " \n",
395 | " if speeding > 80:\n",
396 | " return 'Big Ticket'\n",
397 | " elif speeding > 60:\n",
398 | " return 'Small Ticket'\n",
399 | " else:\n",
400 | " return 'No Ticket'"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": 5,
406 | "metadata": {
407 | "collapsed": false
408 | },
409 | "outputs": [
410 | {
411 | "data": {
412 | "text/plain": [
413 | "'Small Ticket'"
414 | ]
415 | },
416 | "execution_count": 5,
417 | "metadata": {},
418 | "output_type": "execute_result"
419 | }
420 | ],
421 | "source": [
422 | "caught_speeding(81,True)"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": 6,
428 | "metadata": {
429 | "collapsed": false
430 | },
431 | "outputs": [
432 | {
433 | "data": {
434 | "text/plain": [
435 | "'Big Ticket'"
436 | ]
437 | },
438 | "execution_count": 6,
439 | "metadata": {},
440 | "output_type": "execute_result"
441 | }
442 | ],
443 | "source": [
444 | "caught_speeding(81,False)"
445 | ]
446 | },
447 | {
448 | "cell_type": "markdown",
449 | "metadata": {},
450 | "source": [
451 | "# Great job!"
452 | ]
453 | }
454 | ],
455 | "metadata": {
456 | "anaconda-cloud": {},
457 | "kernelspec": {
458 | "display_name": "Python [default]",
459 | "language": "python",
460 | "name": "python3"
461 | },
462 | "language_info": {
463 | "codemirror_mode": {
464 | "name": "ipython",
465 | "version": 3
466 | },
467 | "file_extension": ".py",
468 | "mimetype": "text/x-python",
469 | "name": "python",
470 | "nbconvert_exporter": "python",
471 | "pygments_lexer": "ipython3",
472 | "version": "3.5.3"
473 | }
474 | },
475 | "nbformat": 4,
476 | "nbformat_minor": 0
477 | }
478 |
--------------------------------------------------------------------------------
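
The solution functions above are plain Python, so they can be smoke-tested outside Jupyter. A small, hedged check script (countDog is rewritten with list.count, which is equivalent to the notebook's loop):

    def domainGet(email):
        # everything after the last '@'
        return email.split('@')[-1]

    def findDog(st):
        return 'dog' in st.lower().split()

    def countDog(st):
        return st.lower().split().count('dog')

    def caught_speeding(speed, is_birthday):
        speeding = speed - 5 if is_birthday else speed
        if speeding > 80:
            return 'Big Ticket'
        elif speeding > 60:
            return 'Small Ticket'
        return 'No Ticket'

    assert domainGet('user@domain.com') == 'domain.com'
    assert findDog('Is there a dog here?')
    assert countDog('This dog runs faster than the other dog dude!') == 2
    assert caught_speeding(81, True) == 'Small Ticket'   # birthday allowance
    assert caught_speeding(81, False) == 'Big Ticket'
    assert caught_speeding(65, True) == 'No Ticket'      # 65 - 5 = 60, no ticket
    print('all checks pass')
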
/Python-Crash-Course/Python Crash Course Exercises.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Python Crash Course Exercises \n",
8 | "\n",
9 | "This is an optional exercise to test your understanding of Python Basics. If you find this extremely challenging, then you probably are not ready for the rest of this course yet and don't have enough programming experience to continue. I would suggest you take another course more geared towards complete beginners, such as [Complete Python Bootcamp]()"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "## Exercises\n",
17 | "\n",
18 | "Answer the questions or complete the tasks outlined in bold below, use the specific method described if applicable."
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "** What is 7 to the power of 4?**"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 1,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "2401"
39 | ]
40 | },
41 | "execution_count": 1,
42 | "metadata": {},
43 | "output_type": "execute_result"
44 | }
45 | ],
46 | "source": []
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "** Split this string:**\n",
53 | "\n",
54 | " s = \"Hi there Sam!\"\n",
55 | " \n",
56 | "**into a list. **"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 4,
62 | "metadata": {
63 | "collapsed": true
64 | },
65 | "outputs": [],
66 | "source": []
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 3,
71 | "metadata": {
72 | "collapsed": false
73 | },
74 | "outputs": [
75 | {
76 | "data": {
77 | "text/plain": [
78 | "['Hi', 'there', 'dad!']"
79 | ]
80 | },
81 | "execution_count": 3,
82 | "metadata": {},
83 | "output_type": "execute_result"
84 | }
85 | ],
86 | "source": []
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "** Given the variables:**\n",
93 | "\n",
94 | " planet = \"Earth\"\n",
95 | " diameter = 12742\n",
96 | "\n",
97 | "** Use .format() to print the following string: **\n",
98 | "\n",
99 | " The diameter of Earth is 12742 kilometers."
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 5,
105 | "metadata": {
106 | "collapsed": true
107 | },
108 | "outputs": [],
109 | "source": [
110 | "planet = \"Earth\"\n",
111 | "diameter = 12742"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 6,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [
121 | {
122 | "name": "stdout",
123 | "output_type": "stream",
124 | "text": [
125 | "The diameter of Earth is 12742 kilometers.\n"
126 | ]
127 | }
128 | ],
129 | "source": []
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "** Given this nested list, use indexing to grab the word \"hello\" **"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 7,
141 | "metadata": {
142 | "collapsed": true
143 | },
144 | "outputs": [],
145 | "source": [
146 | "lst = [1,2,[3,4],[5,[100,200,['hello']],23,11],1,7]"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 14,
152 | "metadata": {
153 | "collapsed": false
154 | },
155 | "outputs": [
156 | {
157 | "data": {
158 | "text/plain": [
159 | "'hello'"
160 | ]
161 | },
162 | "execution_count": 14,
163 | "metadata": {},
164 | "output_type": "execute_result"
165 | }
166 | ],
167 | "source": []
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "** Given this nest dictionary grab the word \"hello\". Be prepared, this will be annoying/tricky **"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 16,
179 | "metadata": {
180 | "collapsed": false
181 | },
182 | "outputs": [],
183 | "source": [
184 | "d = {'k1':[1,2,3,{'tricky':['oh','man','inception',{'target':[1,2,3,'hello']}]}]}"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 22,
190 | "metadata": {
191 | "collapsed": false
192 | },
193 | "outputs": [
194 | {
195 | "data": {
196 | "text/plain": [
197 | "'hello'"
198 | ]
199 | },
200 | "execution_count": 22,
201 | "metadata": {},
202 | "output_type": "execute_result"
203 | }
204 | ],
205 | "source": []
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "** What is the main difference between a tuple and a list? **"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 23,
217 | "metadata": {
218 | "collapsed": true
219 | },
220 | "outputs": [],
221 | "source": [
222 | "# Just answer with text, no code necessary"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {},
228 | "source": [
229 | "** Create a function that grabs the email website domain from a string in the form: **\n",
230 | "\n",
231 | " user@domain.com\n",
232 | " \n",
233 | "**So for example, passing \"user@domain.com\" would return: domain.com**"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 24,
239 | "metadata": {
240 | "collapsed": true
241 | },
242 | "outputs": [],
243 | "source": []
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 26,
248 | "metadata": {
249 | "collapsed": false
250 | },
251 | "outputs": [
252 | {
253 | "data": {
254 | "text/plain": [
255 | "'domain.com'"
256 | ]
257 | },
258 | "execution_count": 26,
259 | "metadata": {},
260 | "output_type": "execute_result"
261 | }
262 | ],
263 | "source": []
264 | },
265 | {
266 | "cell_type": "markdown",
267 | "metadata": {},
268 | "source": [
269 | "** Create a basic function that returns True if the word 'dog' is contained in the input string. Don't worry about edge cases like a punctuation being attached to the word dog, but do account for capitalization. **"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 27,
275 | "metadata": {
276 | "collapsed": true
277 | },
278 | "outputs": [],
279 | "source": []
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 28,
284 | "metadata": {
285 | "collapsed": false
286 | },
287 | "outputs": [
288 | {
289 | "data": {
290 | "text/plain": [
291 | "True"
292 | ]
293 | },
294 | "execution_count": 28,
295 | "metadata": {},
296 | "output_type": "execute_result"
297 | }
298 | ],
299 | "source": []
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "** Create a function that counts the number of times the word \"dog\" occurs in a string. Again ignore edge cases. **"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": 30,
311 | "metadata": {
312 | "collapsed": false
313 | },
314 | "outputs": [],
315 | "source": []
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 31,
320 | "metadata": {
321 | "collapsed": false
322 | },
323 | "outputs": [
324 | {
325 | "data": {
326 | "text/plain": [
327 | "2"
328 | ]
329 | },
330 | "execution_count": 31,
331 | "metadata": {},
332 | "output_type": "execute_result"
333 | }
334 | ],
335 | "source": []
336 | },
337 | {
338 | "cell_type": "markdown",
339 | "metadata": {},
340 | "source": [
341 | "### Final Problem\n",
342 | "**You are driving a little too fast, and a police officer stops you. Write a function\n",
343 | " to return one of 3 possible results: \"No ticket\", \"Small ticket\", or \"Big Ticket\". \n",
344 | " If your speed is 60 or less, the result is \"No Ticket\". If speed is between 61 \n",
345 | " and 80 inclusive, the result is \"Small Ticket\". If speed is 81 or more, the result is \"Big Ticket\". Unless it is your birthday (encoded as a boolean value in the parameters of the function) -- on your birthday, your speed can be 5 higher in all \n",
346 | " cases. **"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": 4,
352 | "metadata": {
353 | "collapsed": true
354 | },
355 | "outputs": [],
356 | "source": []
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": 5,
361 | "metadata": {
362 | "collapsed": false
363 | },
364 | "outputs": [
365 | {
366 | "data": {
367 | "text/plain": [
368 | "'Small Ticket'"
369 | ]
370 | },
371 | "execution_count": 5,
372 | "metadata": {},
373 | "output_type": "execute_result"
374 | }
375 | ],
376 | "source": []
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 6,
381 | "metadata": {
382 | "collapsed": false
383 | },
384 | "outputs": [
385 | {
386 | "data": {
387 | "text/plain": [
388 | "'Big Ticket'"
389 | ]
390 | },
391 | "execution_count": 6,
392 | "metadata": {},
393 | "output_type": "execute_result"
394 | }
395 | ],
396 | "source": []
397 | },
398 | {
399 | "cell_type": "markdown",
400 | "metadata": {},
401 | "source": [
402 | "# Great job!"
403 | ]
404 | }
405 | ],
406 | "metadata": {
407 | "anaconda-cloud": {},
408 | "kernelspec": {
409 | "display_name": "Python [default]",
410 | "language": "python",
411 | "name": "python3"
412 | },
413 | "language_info": {
414 | "codemirror_mode": {
415 | "name": "ipython",
416 | "version": 3
417 | },
418 | "file_extension": ".py",
419 | "mimetype": "text/x-python",
420 | "name": "python",
421 | "nbconvert_exporter": "python",
422 | "pygments_lexer": "ipython3",
423 | "version": "3.5.3"
424 | }
425 | },
426 | "nbformat": 4,
427 | "nbformat_minor": 0
428 | }
429 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python-and-Spark-for-Big-Data
2 | Course Notebooks for Python and Spark for Big Data
3 |
4 | Course Outline:
5 |
6 | * Course Introduction
7 | * Promo/Intro Video
8 | * Course Curriculum Overview
9 | * Introduction to Spark, RDDs, and Spark 2.0
10 |
11 | * Course Set-up
12 | * Set-up Overview
13 | * EC2 Installation Guide
14 | * Local Installation Guide with VirtualBox
15 | * Databricks Notebooks
16 | * Unix Command Line Basics and Jupyter Notebook Overview
17 |
18 | * Spark DataFrames
19 | * Spark DataFrames Section Introduction
20 | * Spark DataFrame Basics
21 | * Spark DataFrame Operations
22 | * Groupby and Aggregate Functions
23 | * Missing Data
24 | * Dates and Timestamps
25 |
26 | * Spark DataFrame Project
27 | * DataFrame Project Exercise
28 | * DataFrame Project Exercise Solutions
29 |
30 | * Machine Learning
31 | * Introduction to Machine Learning and ISLR
32 | * Machine Learning with Spark and Python and MLlib
33 | * Consulting Project Approach Overview
34 |
35 | * Linear Regression
36 | * Introduction to Linear Regression
37 | * Discussion on Data Transformations
38 | * Linear Regression with PySpark Example (Car Data)
39 | * Linear Regression Consulting Project (Housing Data)
40 | * Linear Regression Consulting Project Solution
41 |
42 | * Logistic Regression
43 |     * Introduction to Logistic Regression
44 | * Logistic Regression Example
45 | * Logistic Regression Consulting Project (Customer Churn)
46 |     * Logistic Regression Consulting Project Solution
47 |
48 | * Tree Methods
49 | * Introduction to Tree Methods
50 | * Decision Tree and Random Forest Example
51 | * Random Forest Classification Consulting Project - Dog Food Data
52 | * RF Classification Consulting Project Solutions
53 | * RF Regression Project - (Facebook Data)
54 |
55 | * Clustering
56 | * Introduction to K-means Clustering
57 | * Clustering Example - Iris Dataset
58 | * Clustering Consulting Project - Customer Segmentation (Fake Data)
59 | * Clustering Consulting Project Solutions
60 |
61 | * Recommender System
62 | * Introduction to Recommender Systems and Collaborative Filtering
63 | * Code Along Project - MovieLens Dataset
64 |     * Possible Consulting Project - Company Service Reviews
65 |
66 | * Natural Language Processing
67 | * Introduction to Project/NLP/Naive Bayes Model
68 | * What are pipelines?
69 | * Code Along
70 |
71 | * Spark Streaming
72 | * Introduction to Spark Streaming
73 | * Spark Streaming Code-along!
74 |
--------------------------------------------------------------------------------
/Spark Streaming/TweetRead.py:
--------------------------------------------------------------------------------
1 | import tweepy
2 | from tweepy import OAuthHandler
3 | from tweepy import Stream
4 | from tweepy.streaming import StreamListener
5 | import socket
6 | import json
7 |
8 |
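9 | # This script tracks a keyword on the Twitter streaming API and forwards the
10 | # text of each matching tweet over a local TCP socket (localhost:5555), so the
11 | # Introduction to Spark Streaming notebook can consume it.
12 |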
9 | # Set up your credentials
10 | consumer_key=''
11 | consumer_secret=''
12 | access_token =''
13 | access_secret=''
14 |
15 |
16 | class TweetsListener(StreamListener):
17 |
18 | def __init__(self, csocket):
19 | self.client_socket = csocket
20 |
21 | def on_data(self, data):
22 | try:
23 | msg = json.loads( data )
24 | print( msg['text'].encode('utf-8') )
25 | self.client_socket.send( msg['text'].encode('utf-8') )
26 | return True
27 | except BaseException as e:
28 | print("Error on_data: %s" % str(e))
29 | return True
30 |
31 | def on_error(self, status):
32 | print(status)
33 | return True
34 |
35 | def sendData(c_socket):
36 | auth = OAuthHandler(consumer_key, consumer_secret)
37 | auth.set_access_token(access_token, access_secret)
38 |
39 | twitter_stream = Stream(auth, TweetsListener(c_socket))
40 | twitter_stream.filter(track=['soccer'])
41 |
42 | if __name__ == "__main__":
43 | s = socket.socket() # Create a socket object
44 | host = "127.0.0.1" # Get local machine name
45 | port = 5555 # Reserve a port for your service.
46 | s.bind((host, port)) # Bind to the port
47 |
48 | print("Listening on port: %s" % str(port))
49 |
50 | s.listen(5) # Now wait for client connection.
51 | c, addr = s.accept() # Establish connection with client.
52 |
53 | print( "Received request from: " + str( addr ) )
54 |
55 | sendData( c )
--------------------------------------------------------------------------------
/Spark_DataFrames/ContainsNull.csv:
--------------------------------------------------------------------------------
1 | Id,Name,Sales
2 | emp1,John,
3 | emp2,,
4 | emp3,,345.0
5 | emp4,Cindy,456.0
6 |
--------------------------------------------------------------------------------
/Spark_DataFrames/Missing_Data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Missing Data\n",
8 | "\n",
9 | "Often data sources are incomplete, which means you will have missing data, you have 3 basic options for filling in missing data (you will personally have to make the decision for what is the right approach:\n",
10 | "\n",
11 | "* Just keep the missing data points.\n",
12 | "* Drop them missing data points (including the entire row)\n",
13 | "* Fill them in with some other value.\n",
14 | "\n",
15 | "Let's cover examples of each of these methods!"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "source": [
24 | "## Keeping the missing data\n",
25 | "A few machine learning algorithms can easily deal with missing data, let's see what it looks like:"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 1,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "from pyspark.sql import SparkSession\n",
37 | "# May take a little while on a local computer\n",
38 | "spark = SparkSession.builder.appName(\"missingdata\").getOrCreate()"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 2,
44 | "metadata": {
45 | "collapsed": true
46 | },
47 | "outputs": [],
48 | "source": [
49 | "df = spark.read.csv(\"ContainsNull.csv\",header=True,inferSchema=True)"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 3,
55 | "metadata": {
56 | "collapsed": false
57 | },
58 | "outputs": [
59 | {
60 | "name": "stdout",
61 | "output_type": "stream",
62 | "text": [
63 | "+----+-----+-----+\n",
64 | "| Id| Name|Sales|\n",
65 | "+----+-----+-----+\n",
66 | "|emp1| John| null|\n",
67 | "|emp2| null| null|\n",
68 | "|emp3| null|345.0|\n",
69 | "|emp4|Cindy|456.0|\n",
70 | "+----+-----+-----+\n",
71 | "\n"
72 | ]
73 | }
74 | ],
75 | "source": [
76 | "df.show()"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "Notice how the data remains as a null."
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "## Drop the missing data\n",
91 | "\n",
92 | "You can use the .na functions for missing data. The drop command has the following parameters:\n",
93 | "\n",
94 | " df.na.drop(how='any', thresh=None, subset=None)\n",
95 | " \n",
96 | " * param how: 'any' or 'all'.\n",
97 | " \n",
98 | " If 'any', drop a row if it contains any nulls.\n",
99 | " If 'all', drop a row only if all its values are null.\n",
100 | " \n",
101 | " * param thresh: int, default None\n",
102 | " \n",
103 | " If specified, drop rows that have less than `thresh` non-null values.\n",
104 | " This overwrites the `how` parameter.\n",
105 | " \n",
106 | " * param subset: \n",
107 | " optional list of column names to consider."
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 6,
113 | "metadata": {
114 | "collapsed": false
115 | },
116 | "outputs": [
117 | {
118 | "name": "stdout",
119 | "output_type": "stream",
120 | "text": [
121 | "+----+-----+-----+\n",
122 | "| Id| Name|Sales|\n",
123 | "+----+-----+-----+\n",
124 | "|emp4|Cindy|456.0|\n",
125 | "+----+-----+-----+\n",
126 | "\n"
127 | ]
128 | }
129 | ],
130 | "source": [
131 | "# Drop any row that contains missing data\n",
132 | "df.na.drop().show()"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 8,
138 | "metadata": {
139 | "collapsed": false
140 | },
141 | "outputs": [
142 | {
143 | "name": "stdout",
144 | "output_type": "stream",
145 | "text": [
146 | "+----+-----+-----+\n",
147 | "| Id| Name|Sales|\n",
148 | "+----+-----+-----+\n",
149 | "|emp1| John| null|\n",
150 | "|emp3| null|345.0|\n",
151 | "|emp4|Cindy|456.0|\n",
152 | "+----+-----+-----+\n",
153 | "\n"
154 | ]
155 | }
156 | ],
157 | "source": [
158 | "# Has to have at least 2 NON-null values\n",
159 | "df.na.drop(thresh=2).show()"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 9,
165 | "metadata": {
166 | "collapsed": false
167 | },
168 | "outputs": [
169 | {
170 | "name": "stdout",
171 | "output_type": "stream",
172 | "text": [
173 | "+----+-----+-----+\n",
174 | "| Id| Name|Sales|\n",
175 | "+----+-----+-----+\n",
176 | "|emp3| null|345.0|\n",
177 | "|emp4|Cindy|456.0|\n",
178 | "+----+-----+-----+\n",
179 | "\n"
180 | ]
181 | }
182 | ],
183 | "source": [
184 | "df.na.drop(subset=[\"Sales\"]).show()"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 10,
190 | "metadata": {
191 | "collapsed": false
192 | },
193 | "outputs": [
194 | {
195 | "name": "stdout",
196 | "output_type": "stream",
197 | "text": [
198 | "+----+-----+-----+\n",
199 | "| Id| Name|Sales|\n",
200 | "+----+-----+-----+\n",
201 | "|emp4|Cindy|456.0|\n",
202 | "+----+-----+-----+\n",
203 | "\n"
204 | ]
205 | }
206 | ],
207 | "source": [
208 | "df.na.drop(how='any').show()"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 11,
214 | "metadata": {
215 | "collapsed": false
216 | },
217 | "outputs": [
218 | {
219 | "name": "stdout",
220 | "output_type": "stream",
221 | "text": [
222 | "+----+-----+-----+\n",
223 | "| Id| Name|Sales|\n",
224 | "+----+-----+-----+\n",
225 | "|emp1| John| null|\n",
226 | "|emp2| null| null|\n",
227 | "|emp3| null|345.0|\n",
228 | "|emp4|Cindy|456.0|\n",
229 | "+----+-----+-----+\n",
230 | "\n"
231 | ]
232 | }
233 | ],
234 | "source": [
235 | "df.na.drop(how='all').show()"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "## Fill the missing values\n",
243 | "\n",
244 | "We can also fill the missing values with new values. If you have multiple nulls across multiple data types, Spark is actually smart enough to match up the data types. For example:"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 15,
250 | "metadata": {
251 | "collapsed": false
252 | },
253 | "outputs": [
254 | {
255 | "name": "stdout",
256 | "output_type": "stream",
257 | "text": [
258 | "+----+---------+-----+\n",
259 | "| Id| Name|Sales|\n",
260 | "+----+---------+-----+\n",
261 | "|emp1| John| null|\n",
262 | "|emp2|NEW VALUE| null|\n",
263 | "|emp3|NEW VALUE|345.0|\n",
264 | "|emp4| Cindy|456.0|\n",
265 | "+----+---------+-----+\n",
266 | "\n"
267 | ]
268 | }
269 | ],
270 | "source": [
271 | "df.na.fill('NEW VALUE').show()"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": 16,
277 | "metadata": {
278 | "collapsed": false
279 | },
280 | "outputs": [
281 | {
282 | "name": "stdout",
283 | "output_type": "stream",
284 | "text": [
285 | "+----+-----+-----+\n",
286 | "| Id| Name|Sales|\n",
287 | "+----+-----+-----+\n",
288 | "|emp1| John| 0.0|\n",
289 | "|emp2| null| 0.0|\n",
290 | "|emp3| null|345.0|\n",
291 | "|emp4|Cindy|456.0|\n",
292 | "+----+-----+-----+\n",
293 | "\n"
294 | ]
295 | }
296 | ],
297 | "source": [
298 | "df.na.fill(0).show()"
299 | ]
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "Usually you should specify what columns you want to fill with the subset parameter"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": 17,
311 | "metadata": {
312 | "collapsed": false
313 | },
314 | "outputs": [
315 | {
316 | "name": "stdout",
317 | "output_type": "stream",
318 | "text": [
319 | "+----+-------+-----+\n",
320 | "| Id| Name|Sales|\n",
321 | "+----+-------+-----+\n",
322 | "|emp1| John| null|\n",
323 | "|emp2|No Name| null|\n",
324 | "|emp3|No Name|345.0|\n",
325 | "|emp4| Cindy|456.0|\n",
326 | "+----+-------+-----+\n",
327 | "\n"
328 | ]
329 | }
330 | ],
331 | "source": [
332 | "df.na.fill('No Name',subset=['Name']).show()"
333 | ]
334 | },
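335 |   {
336 |    "cell_type": "markdown",
337 |    "metadata": {},
338 |    "source": [
339 |     "You can also pass a dictionary to fill several columns at once, each with its own value:"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "code",
344 |    "execution_count": null,
345 |    "metadata": {
346 |     "collapsed": false
347 |    },
348 |    "outputs": [],
349 |    "source": [
350 |     "# Added example: keys are column names, values are the per-column replacements\n",
351 |     "df.na.fill({'Name': 'No Name', 'Sales': 0.0}).show()"
352 |    ]
353 |   },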
335 | {
336 | "cell_type": "markdown",
337 | "metadata": {},
338 | "source": [
339 | "A very common practice is to fill values with the mean value for the column, for example:"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 23,
345 | "metadata": {
346 | "collapsed": false
347 | },
348 | "outputs": [
349 | {
350 | "data": {
351 | "text/plain": [
352 | "400.5"
353 | ]
354 | },
355 | "execution_count": 23,
356 | "metadata": {},
357 | "output_type": "execute_result"
358 | }
359 | ],
360 | "source": [
361 | "from pyspark.sql.functions import mean\n",
362 | "mean_val = df.select(mean(df['Sales'])).collect()\n",
363 | "\n",
364 | "# Weird nested formatting of Row object!\n",
365 | "mean_val[0][0]"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 24,
371 | "metadata": {
372 | "collapsed": true
373 | },
374 | "outputs": [],
375 | "source": [
376 | "mean_sales = mean_val[0][0]"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 26,
382 | "metadata": {
383 | "collapsed": false
384 | },
385 | "outputs": [
386 | {
387 | "name": "stdout",
388 | "output_type": "stream",
389 | "text": [
390 | "+----+-----+-----+\n",
391 | "| Id| Name|Sales|\n",
392 | "+----+-----+-----+\n",
393 | "|emp1| John|400.5|\n",
394 | "|emp2| null|400.5|\n",
395 | "|emp3| null|345.0|\n",
396 | "|emp4|Cindy|456.0|\n",
397 | "+----+-----+-----+\n",
398 | "\n"
399 | ]
400 | }
401 | ],
402 | "source": [
403 | "df.na.fill(mean_sales,[\"Sales\"]).show()"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 28,
409 | "metadata": {
410 | "collapsed": false
411 | },
412 | "outputs": [
413 | {
414 | "name": "stdout",
415 | "output_type": "stream",
416 | "text": [
417 | "+----+-----+-----+\n",
418 | "| Id| Name|Sales|\n",
419 | "+----+-----+-----+\n",
420 | "|emp1| John|400.5|\n",
421 | "|emp2| null|400.5|\n",
422 | "|emp3| null|345.0|\n",
423 | "|emp4|Cindy|456.0|\n",
424 | "+----+-----+-----+\n",
425 | "\n"
426 | ]
427 | }
428 | ],
429 | "source": [
430 | "# One (very ugly) one-liner\n",
431 | "df.na.fill(df.select(mean(df['Sales'])).collect()[0][0],['Sales']).show()"
432 | ]
433 | },
434 | {
435 | "cell_type": "markdown",
436 | "metadata": {},
437 | "source": [
438 | "That is all we need to know for now!"
439 | ]
440 | }
441 | ],
442 | "metadata": {
443 | "anaconda-cloud": {},
444 | "kernelspec": {
445 | "display_name": "Python [conda root]",
446 | "language": "python",
447 | "name": "conda-root-py"
448 | },
449 | "language_info": {
450 | "codemirror_mode": {
451 | "name": "ipython",
452 | "version": 3
453 | },
454 | "file_extension": ".py",
455 | "mimetype": "text/x-python",
456 | "name": "python",
457 | "nbconvert_exporter": "python",
458 | "pygments_lexer": "ipython3",
459 | "version": "3.5.3"
460 | }
461 | },
462 | "nbformat": 4,
463 | "nbformat_minor": 0
464 | }
465 |
--------------------------------------------------------------------------------
/Spark_DataFrames/people.json:
--------------------------------------------------------------------------------
1 | {"name":"Michael"}
2 | {"name":"Andy", "age":30}
3 | {"name":"Justin", "age":19}
4 |
--------------------------------------------------------------------------------
/Spark_DataFrames/sales_info.csv:
--------------------------------------------------------------------------------
1 | Company,Person,Sales
2 | GOOG,Sam,200
3 | GOOG,Charlie,120
4 | GOOG,Frank,340
5 | MSFT,Tina,600
6 | MSFT,Amy,124
7 | MSFT,Vanessa,243
8 | FB,Carl,870
9 | FB,Sarah,350
10 | APPL,John,250
11 | APPL,Linda, 130
12 | APPL,Mike, 750
13 | APPL, Chris, 350
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Clustering/.ipynb_checkpoints/Clustering_Code_Example-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Clustering Documentation Example\n",
8 | "\n",
9 | "
K-means
\n",
10 | "\n",
11 | "k-means is one of the\n",
12 | "most commonly used clustering algorithms that clusters the data points into a\n",
13 | "predefined number of clusters. The MLlib implementation includes a parallelized\n",
14 | "variant of the k-means++ method\n",
15 | "called kmeans||.
\n",
16 | "\n",
17 | "KMeans
is implemented as an Estimator
and generates a KMeansModel
as the base model.
\n",
18 | "\n",
19 | "\n",
20 | "\n",
21 | "\n",
22 | " \n",
23 | " \n",
24 | " Param name | \n",
25 | " Type(s) | \n",
26 | " Default | \n",
27 | " Description | \n",
28 | "
\n",
29 | " \n",
30 | " \n",
31 | " \n",
32 | " featuresCol | \n",
33 | " Vector | \n",
34 | " \"features\" | \n",
35 | " Feature vector | \n",
36 | "
\n",
37 | " \n",
38 | "
\n",
39 | "\n",
40 | "Output Columns
\n",
41 | "\n",
42 | "\n",
43 | " \n",
44 | " \n",
45 | " Param name | \n",
46 | " Type(s) | \n",
47 | " Default | \n",
48 | " Description | \n",
49 | "
\n",
50 | " \n",
51 | " \n",
52 | " \n",
53 | " predictionCol | \n",
54 | " Int | \n",
55 | " \"prediction\" | \n",
56 | " Predicted cluster center | \n",
57 | "
\n",
58 | " \n",
59 | "
"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 2,
65 | "metadata": {
66 | "collapsed": true
67 | },
68 | "outputs": [],
69 | "source": [
70 | "#Cluster methods Example\n",
71 | "from pyspark.sql import SparkSession\n",
72 | "spark = SparkSession.builder.appName('cluster').getOrCreate()"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 3,
78 | "metadata": {
79 | "collapsed": false
80 | },
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | "Within Set Sum of Squared Errors = 0.11999999999994547\n",
87 | "Cluster Centers: \n",
88 | "[ 9.1 9.1 9.1]\n",
89 | "[ 0.1 0.1 0.1]\n"
90 | ]
91 | }
92 | ],
93 | "source": [
94 | "from pyspark.ml.clustering import KMeans\n",
95 | "\n",
96 | "# Loads data.\n",
97 | "dataset = spark.read.format(\"libsvm\").load(\"sample_kmeans_data.txt\")\n",
98 | "\n",
99 | "# Trains a k-means model.\n",
100 | "kmeans = KMeans().setK(2).setSeed(1)\n",
101 | "model = kmeans.fit(dataset)\n",
102 | "\n",
103 | "# Evaluate clustering by computing Within Set Sum of Squared Errors.\n",
104 | "wssse = model.computeCost(dataset)\n",
105 | "print(\"Within Set Sum of Squared Errors = \" + str(wssse))\n",
106 | "\n",
107 | "# Shows the result.\n",
108 | "centers = model.clusterCenters()\n",
109 | "print(\"Cluster Centers: \")\n",
110 | "for center in centers:\n",
111 | " print(center)"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "Alright let's code through our own example!"
119 | ]
120 | }
121 | ],
122 | "metadata": {
123 | "anaconda-cloud": {},
124 | "kernelspec": {
125 | "display_name": "Python [conda root]",
126 | "language": "python",
127 | "name": "conda-root-py"
128 | },
129 | "language_info": {
130 | "codemirror_mode": {
131 | "name": "ipython",
132 | "version": 3
133 | },
134 | "file_extension": ".py",
135 | "mimetype": "text/x-python",
136 | "name": "python",
137 | "nbconvert_exporter": "python",
138 | "pygments_lexer": "ipython3",
139 | "version": "3.5.3"
140 | }
141 | },
142 | "nbformat": 4,
143 | "nbformat_minor": 0
144 | }
145 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Clustering/.ipynb_checkpoints/Clustering_Consulting_Project-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Clustering Consulting Project \n",
8 | "\n",
9 | "A large technology firm needs your help, they've been hacked! Luckily their forensic engineers have grabbed valuable data about the hacks, including information like session time,locations, wpm typing speed, etc. The forensic engineer relates to you what she has been able to figure out so far, she has been able to grab meta data of each session that the hackers used to connect to their servers. These are the features of the data:\n",
10 | "\n",
11 | "* 'Session_Connection_Time': How long the session lasted in minutes\n",
12 | "* 'Bytes Transferred': Number of MB transferred during session\n",
13 | "* 'Kali_Trace_Used': Indicates if the hacker was using Kali Linux\n",
14 | "* 'Servers_Corrupted': Number of server corrupted during the attack\n",
15 | "* 'Pages_Corrupted': Number of pages illegally accessed\n",
16 | "* 'Location': Location attack came from (Probably useless because the hackers used VPNs)\n",
17 | "* 'WPM_Typing_Speed': Their estimated typing speed based on session logs.\n",
18 | "\n",
19 | "\n",
20 | "The technology firm has 3 potential hackers that perpetrated the attack. Their certain of the first two hackers but they aren't very sure if the third hacker was involved or not. They have requested your help! Can you help figure out whether or not the third suspect had anything to do with the attacks, or was it just two hackers? It's probably not possible to know for sure, but maybe what you've just learned about Clustering can help!\n",
21 | "\n",
22 | "**One last key fact, the forensic engineer knows that the hackers trade off attacks. Meaning they should each have roughly the same amount of attacks. For example if there were 100 total attacks, then in a 2 hacker situation each should have about 50 hacks, in a three hacker situation each would have about 33 hacks. The engineer believes this is the key element to solving this, but doesn't know how to distinguish this unlabeled data into groups of hackers.**"
23 | ]
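24 |   },
25 |   {
26 |    "cell_type": "markdown",
27 |    "metadata": {},
28 |    "source": [
29 |     "Below is a rough sketch (not a full solution) of how the engineer's hint about evenly traded-off attacks could be checked with k-means; the file name and column assumptions are noted in the comments:"
30 |    ]
31 |   },
32 |   {
33 |    "cell_type": "code",
34 |    "execution_count": null,
35 |    "metadata": {
36 |     "collapsed": true
37 |    },
38 |    "outputs": [],
39 |    "source": [
40 |     "# Sketch: fit k-means with k=2 and k=3 and compare how evenly the attacks split\n",
41 |     "# across clusters. Assumes hack_data.csv holds the features listed above, with\n",
42 |     "# every column numeric except the string 'Location' column.\n",
43 |     "from pyspark.sql import SparkSession\n",
44 |     "from pyspark.ml.feature import VectorAssembler\n",
45 |     "from pyspark.ml.clustering import KMeans\n",
46 |     "\n",
47 |     "spark = SparkSession.builder.appName('hack_sketch').getOrCreate()\n",
48 |     "df = spark.read.csv('hack_data.csv', header=True, inferSchema=True)\n",
49 |     "feat_cols = [c for c in df.columns if c != 'Location']\n",
50 |     "data = VectorAssembler(inputCols=feat_cols, outputCol='features').transform(df)\n",
51 |     "for k in [2, 3]:\n",
52 |     "    model = KMeans(featuresCol='features', k=k).fit(data)\n",
53 |     "    model.transform(data).groupBy('prediction').count().show()"
54 |    ]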
24 | }
25 | ],
26 | "metadata": {
27 | "anaconda-cloud": {},
28 | "kernelspec": {
29 | "display_name": "Python [conda root]",
30 | "language": "python",
31 | "name": "conda-root-py"
32 | },
33 | "language_info": {
34 | "codemirror_mode": {
35 | "name": "ipython",
36 | "version": 3
37 | },
38 | "file_extension": ".py",
39 | "mimetype": "text/x-python",
40 | "name": "python",
41 | "nbconvert_exporter": "python",
42 | "pygments_lexer": "ipython3",
43 | "version": "3.5.3"
44 | }
45 | },
46 | "nbformat": 4,
47 | "nbformat_minor": 0
48 | }
49 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Clustering/.ipynb_checkpoints/Random_Forest_Doc_Example-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Random Forest Example\n",
8 | "\n",
9 | "This is just a quick walkthrough of the Documentation's Example of Random Forest:"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {
16 | "collapsed": true
17 | },
18 | "outputs": [],
19 | "source": [
20 | "from pyspark.ml import Pipeline\n",
21 | "from pyspark.ml.classification import RandomForestClassifier\n",
22 | "from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer\n",
23 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "collapsed": true
31 | },
32 | "outputs": [],
33 | "source": [
34 | "# Load and parse the data file, converting it to a DataFrame.\n",
35 | "data = spark.read.format(\"libsvm\").load(\"data/mllib/sample_libsvm_data.txt\")"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "collapsed": true
43 | },
44 | "outputs": [],
45 | "source": [
46 | "# Index labels, adding metadata to the label column.\n",
47 | "# Fit on whole dataset to include all labels in index.\n",
48 | "labelIndexer = StringIndexer(inputCol=\"label\", outputCol=\"indexedLabel\").fit(data)\n",
49 | "\n",
50 | "# Automatically identify categorical features, and index them.\n",
51 | "# Set maxCategories so features with > 4 distinct values are treated as continuous.\n",
52 | "featureIndexer = VectorIndexer(inputCol=\"features\", outputCol=\"indexedFeatures\", maxCategories=4).fit(data)\n",
53 | "\n",
54 | "# Split the data into training and test sets (30% held out for testing)\n",
55 | "(trainingData, testData) = data.randomSplit([0.7, 0.3])\n",
56 | "\n",
57 | "# Train a RandomForest model.\n",
58 | "rf = RandomForestClassifier(labelCol=\"indexedLabel\", featuresCol=\"indexedFeatures\", numTrees=10)\n",
59 | "\n",
60 | "# Convert indexed labels back to original labels.\n",
61 | "labelConverter = IndexToString(inputCol=\"prediction\", outputCol=\"predictedLabel\",\n",
62 | " labels=labelIndexer.labels)\n",
63 | "\n",
64 | "# Chain indexers and forest in a Pipeline\n",
65 | "pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])\n",
66 | "\n",
67 | "# Train model. This also runs the indexers.\n",
68 | "model = pipeline.fit(trainingData)\n",
69 | "\n",
70 | "# Make predictions.\n",
71 | "predictions = model.transform(testData)\n",
72 | "\n",
73 | "# Select example rows to display.\n",
74 | "predictions.select(\"predictedLabel\", \"label\", \"features\").show(5)\n",
75 | "\n",
76 | "# Select (prediction, true label) and compute test error\n",
77 | "evaluator = MulticlassClassificationEvaluator(\n",
78 | " labelCol=\"indexedLabel\", predictionCol=\"prediction\", metricName=\"accuracy\")\n",
79 | "accuracy = evaluator.evaluate(predictions)\n",
80 | "print(\"Test Error = %g\" % (1.0 - accuracy))\n",
81 | "\n",
82 | "rfModel = model.stages[2]\n",
83 | "print(rfModel) # summary only"
84 | ]
85 | }
86 | ],
87 | "metadata": {
88 | "anaconda-cloud": {},
89 | "kernelspec": {
90 | "display_name": "Python [conda root]",
91 | "language": "python",
92 | "name": "conda-root-py"
93 | },
94 | "language_info": {
95 | "codemirror_mode": {
96 | "name": "ipython",
97 | "version": 3
98 | },
99 | "file_extension": ".py",
100 | "mimetype": "text/x-python",
101 | "name": "python",
102 | "nbconvert_exporter": "python",
103 | "pygments_lexer": "ipython3",
104 | "version": "3.5.3"
105 | }
106 | },
107 | "nbformat": 4,
108 | "nbformat_minor": 0
109 | }
110 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Clustering/Clustering Code Along.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Clustering Code Along\n",
8 | "\n",
9 | "We'll be working with a real data set about seeds, from UCI repository: https://archive.ics.uci.edu/ml/datasets/seeds."
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "The examined group comprised kernels belonging to three different varieties of wheat: Kama, Rosa and Canadian, 70 elements each, randomly selected for \n",
17 | "the experiment. High quality visualization of the internal kernel structure was detected using a soft X-ray technique. It is non-destructive and considerably cheaper than other more sophisticated imaging techniques like scanning microscopy or laser technology. The images were recorded on 13x18 cm X-ray KODAK plates. Studies were conducted using combine harvested wheat grain originating from experimental fields, explored at the Institute of Agrophysics of the Polish Academy of Sciences in Lublin. \n",
18 | "\n",
19 | "The data set can be used for the tasks of classification and cluster analysis.\n",
20 | "\n",
21 | "\n",
22 | "Attribute Information:\n",
23 | "\n",
24 | "To construct the data, seven geometric parameters of wheat kernels were measured: \n",
25 | "1. area A, \n",
26 | "2. perimeter P, \n",
27 | "3. compactness C = 4*pi*A/P^2, \n",
28 | "4. length of kernel, \n",
29 | "5. width of kernel, \n",
30 | "6. asymmetry coefficient \n",
31 | "7. length of kernel groove. \n",
32 | "All of these parameters were real-valued continuous.\n",
33 | "\n",
34 | "Let's see if we can cluster them in to 3 groups with K-means!"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 53,
40 | "metadata": {
41 | "collapsed": true
42 | },
43 | "outputs": [],
44 | "source": [
45 | "from pyspark.sql import SparkSession\n",
46 | "spark = SparkSession.builder.appName('cluster').getOrCreate()"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 54,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [],
56 | "source": [
57 | "from pyspark.ml.clustering import KMeans\n",
58 | "\n",
59 | "# Loads data.\n",
60 | "dataset = spark.read.csv(\"seeds_dataset.csv\",header=True,inferSchema=True)"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 55,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)"
74 | ]
75 | },
76 | "execution_count": 55,
77 | "metadata": {},
78 | "output_type": "execute_result"
79 | }
80 | ],
81 | "source": [
82 | "dataset.head()"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 56,
88 | "metadata": {
89 | "collapsed": false
90 | },
91 | "outputs": [
92 | {
93 | "name": "stdout",
94 | "output_type": "stream",
95 | "text": [
96 | "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n",
97 | "|summary| area| perimeter| compactness| length_of_kernel| width_of_kernel|asymmetry_coefficient| length_of_groove|\n",
98 | "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n",
99 | "| count| 210| 210| 210| 210| 210| 210| 210|\n",
100 | "| mean|14.847523809523816|14.559285714285718| 0.8709985714285714| 5.628533333333335| 3.258604761904762| 3.7001999999999997| 5.408071428571429|\n",
101 | "| stddev|2.9096994306873647|1.3059587265640225|0.023629416583846364|0.44306347772644983|0.3777144449065867| 1.5035589702547392|0.49148049910240543|\n",
102 | "| min| 10.59| 12.41| 0.8081| 4.899| 2.63| 0.765| 4.519|\n",
103 | "| max| 21.18| 17.25| 0.9183| 6.675| 4.033| 8.456| 6.55|\n",
104 | "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n",
105 | "\n"
106 | ]
107 | }
108 | ],
109 | "source": [
110 | "dataset.describe().show()"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "## Format the Data"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 57,
123 | "metadata": {
124 | "collapsed": true
125 | },
126 | "outputs": [],
127 | "source": [
128 | "from pyspark.ml.linalg import Vectors\n",
129 | "from pyspark.ml.feature import VectorAssembler"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 58,
135 | "metadata": {
136 | "collapsed": false
137 | },
138 | "outputs": [
139 | {
140 | "data": {
141 | "text/plain": [
142 | "['area',\n",
143 | " 'perimeter',\n",
144 | " 'compactness',\n",
145 | " 'length_of_kernel',\n",
146 | " 'width_of_kernel',\n",
147 | " 'asymmetry_coefficient',\n",
148 | " 'length_of_groove']"
149 | ]
150 | },
151 | "execution_count": 58,
152 | "metadata": {},
153 | "output_type": "execute_result"
154 | }
155 | ],
156 | "source": [
157 | "dataset.columns"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 59,
163 | "metadata": {
164 | "collapsed": true
165 | },
166 | "outputs": [],
167 | "source": [
168 | "vec_assembler = VectorAssembler(inputCols = dataset.columns, outputCol='features')"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 60,
174 | "metadata": {
175 | "collapsed": true
176 | },
177 | "outputs": [],
178 | "source": [
179 | "final_data = vec_assembler.transform(dataset)"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {},
185 | "source": [
186 | "## Scale the Data\n",
187 | "It is a good idea to scale our data to deal with the curse of dimensionality: https://en.wikipedia.org/wiki/Curse_of_dimensionality"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 61,
193 | "metadata": {
194 | "collapsed": true
195 | },
196 | "outputs": [],
197 | "source": [
198 | "from pyspark.ml.feature import StandardScaler"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 62,
204 | "metadata": {
205 | "collapsed": false
206 | },
207 | "outputs": [],
208 | "source": [
209 | "scaler = StandardScaler(inputCol=\"features\", outputCol=\"scaledFeatures\", withStd=True, withMean=False)"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 63,
215 | "metadata": {
216 | "collapsed": true
217 | },
218 | "outputs": [],
219 | "source": [
220 | "# Compute summary statistics by fitting the StandardScaler\n",
221 | "scalerModel = scaler.fit(final_data)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 64,
227 | "metadata": {
228 | "collapsed": false
229 | },
230 | "outputs": [],
231 | "source": [
232 | "# Normalize each feature to have unit standard deviation.\n",
233 | "final_data = scalerModel.transform(final_data)"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "## Train the Model and Evaluate"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 76,
246 | "metadata": {
247 | "collapsed": true
248 | },
249 | "outputs": [],
250 | "source": [
251 | "# Trains a k-means model.\n",
252 | "kmeans = KMeans(featuresCol='scaledFeatures',k=3)\n",
253 | "model = kmeans.fit(final_data)"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 77,
259 | "metadata": {
260 | "collapsed": false
261 | },
262 | "outputs": [
263 | {
264 | "name": "stdout",
265 | "output_type": "stream",
266 | "text": [
267 | "Within Set Sum of Squared Errors = 429.07559671506715\n"
268 | ]
269 | }
270 | ],
271 | "source": [
272 | "# Evaluate clustering by computing Within Set Sum of Squared Errors.\n",
273 | "wssse = model.computeCost(final_data)\n",
274 | "print(\"Within Set Sum of Squared Errors = \" + str(wssse))"
275 | ]
276 | },
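277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "*Note:* `computeCost` was deprecated in Spark 2.4 and removed in Spark 3.0. On newer versions, a minimal sketch of the replacement, the silhouette score from `ClusteringEvaluator` (available since Spark 2.3), would look like this:"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": null,
287 | "metadata": {
288 | "collapsed": false
289 | },
290 | "outputs": [],
291 | "source": [
292 | "# Minimal sketch: evaluate cluster quality with the silhouette score instead of WSSSE.\n",
293 | "# Assumes Spark >= 2.3; on Spark 3.0+ this replaces model.computeCost entirely.\n",
294 | "from pyspark.ml.evaluation import ClusteringEvaluator\n",
295 | "\n",
296 | "predictions = model.transform(final_data)\n",
297 | "evaluator = ClusteringEvaluator(featuresCol='scaledFeatures')\n",
298 | "silhouette = evaluator.evaluate(predictions)\n",
299 | "print('Silhouette with squared euclidean distance = ' + str(silhouette))"
300 | ]
301 | },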
277 | {
278 | "cell_type": "code",
279 | "execution_count": 79,
280 | "metadata": {
281 | "collapsed": false
282 | },
283 | "outputs": [
284 | {
285 | "name": "stdout",
286 | "output_type": "stream",
287 | "text": [
288 | "Cluster Centers: \n",
289 | "[ 6.31670546 12.37109759 37.39491396 13.91155062 9.748067\n",
290 | " 2.39849968 12.2661748 ]\n",
291 | "[ 4.87257659 10.88120146 37.27692543 12.3410157 8.55443412\n",
292 | " 1.81649011 10.32998598]\n",
293 | "[ 4.06105916 10.13979506 35.80536984 11.82133095 7.50395937\n",
294 | " 3.27184732 10.42126018]\n"
295 | ]
296 | }
297 | ],
298 | "source": [
299 | "# Shows the result.\n",
300 | "centers = model.clusterCenters()\n",
301 | "print(\"Cluster Centers: \")\n",
302 | "for center in centers:\n",
303 | " print(center)"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 80,
309 | "metadata": {
310 | "collapsed": false
311 | },
312 | "outputs": [
313 | {
314 | "name": "stdout",
315 | "output_type": "stream",
316 | "text": [
317 | "+----------+\n",
318 | "|prediction|\n",
319 | "+----------+\n",
320 | "| 1|\n",
321 | "| 1|\n",
322 | "| 1|\n",
323 | "| 1|\n",
324 | "| 1|\n",
325 | "| 1|\n",
326 | "| 1|\n",
327 | "| 1|\n",
328 | "| 0|\n",
329 | "| 0|\n",
330 | "| 1|\n",
331 | "| 1|\n",
332 | "| 1|\n",
333 | "| 1|\n",
334 | "| 1|\n",
335 | "| 1|\n",
336 | "| 1|\n",
337 | "| 1|\n",
338 | "| 1|\n",
339 | "| 2|\n",
340 | "+----------+\n",
341 | "only showing top 20 rows\n",
342 | "\n"
343 | ]
344 | }
345 | ],
346 | "source": [
347 | "model.transform(final_data).select('prediction').show()"
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {},
353 | "source": [
354 | "Now you are ready for your consulting Project!\n",
355 | "# Great Job!"
356 | ]
357 | }
358 | ],
359 | "metadata": {
360 | "anaconda-cloud": {},
361 | "kernelspec": {
362 | "display_name": "Python [conda root]",
363 | "language": "python",
364 | "name": "conda-root-py"
365 | },
366 | "language_info": {
367 | "codemirror_mode": {
368 | "name": "ipython",
369 | "version": 3
370 | },
371 | "file_extension": ".py",
372 | "mimetype": "text/x-python",
373 | "name": "python",
374 | "nbconvert_exporter": "python",
375 | "pygments_lexer": "ipython3",
376 | "version": "3.5.3"
377 | }
378 | },
379 | "nbformat": 4,
380 | "nbformat_minor": 0
381 | }
382 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Clustering/Clustering_Code_Example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Clustering Documentation Example\n",
8 | "\n",
9 | "K-means
\n",
10 | "\n",
11 | "k-means is one of the\n",
12 | "most commonly used clustering algorithms that clusters the data points into a\n",
13 | "predefined number of clusters. The MLlib implementation includes a parallelized\n",
14 | "variant of the k-means++ method\n",
15 | "called kmeans||.
\n",
16 | "\n",
17 | "KMeans
is implemented as an Estimator
and generates a KMeansModel
as the base model.
\n",
18 | "\n",
19 | "\n",
20 | "\n",
21 | "\n",
22 | " \n",
23 | " \n",
24 | " Param name | \n",
25 | " Type(s) | \n",
26 | " Default | \n",
27 | " Description | \n",
28 | "
\n",
29 | " \n",
30 | " \n",
31 | " \n",
32 | " featuresCol | \n",
33 | " Vector | \n",
34 | " \"features\" | \n",
35 | " Feature vector | \n",
36 | "
\n",
37 | " \n",
38 | "
\n",
39 | "\n",
40 | "Output Columns
\n",
41 | "\n",
42 | "\n",
43 | " \n",
44 | " \n",
45 | " Param name | \n",
46 | " Type(s) | \n",
47 | " Default | \n",
48 | " Description | \n",
49 | "
\n",
50 | " \n",
51 | " \n",
52 | " \n",
53 | " predictionCol | \n",
54 | " Int | \n",
55 | " \"prediction\" | \n",
56 | " Predicted cluster center | \n",
57 | "
\n",
58 | " \n",
59 | "
"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 2,
65 | "metadata": {
66 | "collapsed": true
67 | },
68 | "outputs": [],
69 | "source": [
70 | "#Cluster methods Example\n",
71 | "from pyspark.sql import SparkSession\n",
72 | "spark = SparkSession.builder.appName('cluster').getOrCreate()"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 3,
78 | "metadata": {
79 | "collapsed": false
80 | },
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | "Within Set Sum of Squared Errors = 0.11999999999994547\n",
87 | "Cluster Centers: \n",
88 | "[ 9.1 9.1 9.1]\n",
89 | "[ 0.1 0.1 0.1]\n"
90 | ]
91 | }
92 | ],
93 | "source": [
94 | "from pyspark.ml.clustering import KMeans\n",
95 | "\n",
96 | "# Loads data.\n",
97 | "dataset = spark.read.format(\"libsvm\").load(\"sample_kmeans_data.txt\")\n",
98 | "\n",
99 | "# Trains a k-means model.\n",
100 | "kmeans = KMeans().setK(2).setSeed(1)\n",
101 | "model = kmeans.fit(dataset)\n",
102 | "\n",
103 | "# Evaluate clustering by computing Within Set Sum of Squared Errors.\n",
104 | "wssse = model.computeCost(dataset)\n",
105 | "print(\"Within Set Sum of Squared Errors = \" + str(wssse))\n",
106 | "\n",
107 | "# Shows the result.\n",
108 | "centers = model.clusterCenters()\n",
109 | "print(\"Cluster Centers: \")\n",
110 | "for center in centers:\n",
111 | " print(center)"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "Alright let's code through our own example!"
119 | ]
120 | }
121 | ],
122 | "metadata": {
123 | "anaconda-cloud": {},
124 | "kernelspec": {
125 | "display_name": "Python [conda root]",
126 | "language": "python",
127 | "name": "conda-root-py"
128 | },
129 | "language_info": {
130 | "codemirror_mode": {
131 | "name": "ipython",
132 | "version": 3
133 | },
134 | "file_extension": ".py",
135 | "mimetype": "text/x-python",
136 | "name": "python",
137 | "nbconvert_exporter": "python",
138 | "pygments_lexer": "ipython3",
139 | "version": "3.5.3"
140 | }
141 | },
142 | "nbformat": 4,
143 | "nbformat_minor": 0
144 | }
145 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Clustering/Clustering_Consulting_Project.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Clustering Consulting Project \n",
8 | "\n",
9 | "A large technology firm needs your help, they've been hacked! Luckily their forensic engineers have grabbed valuable data about the hacks, including information like session time,locations, wpm typing speed, etc. The forensic engineer relates to you what she has been able to figure out so far, she has been able to grab meta data of each session that the hackers used to connect to their servers. These are the features of the data:\n",
10 | "\n",
11 | "* 'Session_Connection_Time': How long the session lasted in minutes\n",
12 | "* 'Bytes Transferred': Number of MB transferred during session\n",
13 | "* 'Kali_Trace_Used': Indicates if the hacker was using Kali Linux\n",
14 | "* 'Servers_Corrupted': Number of server corrupted during the attack\n",
15 | "* 'Pages_Corrupted': Number of pages illegally accessed\n",
16 | "* 'Location': Location attack came from (Probably useless because the hackers used VPNs)\n",
17 | "* 'WPM_Typing_Speed': Their estimated typing speed based on session logs.\n",
18 | "\n",
19 | "\n",
20 | "The technology firm has 3 potential hackers that perpetrated the attack. Their certain of the first two hackers but they aren't very sure if the third hacker was involved or not. They have requested your help! Can you help figure out whether or not the third suspect had anything to do with the attacks, or was it just two hackers? It's probably not possible to know for sure, but maybe what you've just learned about Clustering can help!\n",
21 | "\n",
22 | "**One last key fact, the forensic engineer knows that the hackers trade off attacks. Meaning they should each have roughly the same amount of attacks. For example if there were 100 total attacks, then in a 2 hacker situation each should have about 50 hacks, in a three hacker situation each would have about 33 hacks. The engineer believes this is the key element to solving this, but doesn't know how to distinguish this unlabeled data into groups of hackers.**"
23 | ]
24 | }
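25 | ,
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "*Hint:* a minimal sketch of the even-split check described above, assuming you have already assembled and scaled the features from hack_data.csv into a DataFrame final_data with a 'scaledFeatures' column:"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {
37 | "collapsed": true
38 | },
39 | "outputs": [],
40 | "source": [
41 | "# Minimal sketch (final_data and its 'scaledFeatures' column are assumed, not given).\n",
42 | "# Fit k-means for k=2 and k=3 and compare how evenly the attacks split across clusters.\n",
43 | "from pyspark.ml.clustering import KMeans\n",
44 | "\n",
45 | "for k in [2, 3]:\n",
46 | "    model = KMeans(featuresCol='scaledFeatures', k=k).fit(final_data)\n",
47 | "    model.transform(final_data).groupBy('prediction').count().show()"
48 | ]
49 | }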
25 | ],
26 | "metadata": {
27 | "anaconda-cloud": {},
28 | "kernelspec": {
29 | "display_name": "Python [conda root]",
30 | "language": "python",
31 | "name": "conda-root-py"
32 | },
33 | "language_info": {
34 | "codemirror_mode": {
35 | "name": "ipython",
36 | "version": 3
37 | },
38 | "file_extension": ".py",
39 | "mimetype": "text/x-python",
40 | "name": "python",
41 | "nbconvert_exporter": "python",
42 | "pygments_lexer": "ipython3",
43 | "version": "3.5.3"
44 | }
45 | },
46 | "nbformat": 4,
47 | "nbformat_minor": 0
48 | }
49 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Clustering/sample_kmeans_data.txt:
--------------------------------------------------------------------------------
1 | 0 1:0.0 2:0.0 3:0.0
2 | 1 1:0.1 2:0.1 3:0.1
3 | 2 1:0.2 2:0.2 3:0.2
4 | 3 1:9.0 2:9.0 3:9.0
5 | 4 1:9.1 2:9.1 3:9.1
6 | 5 1:9.2 2:9.2 3:9.2
7 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Clustering/seeds_dataset.csv:
--------------------------------------------------------------------------------
1 | area,perimeter,compactness,length_of_kernel,width_of_kernel,asymmetry_coefficient,length_of_groove
2 | 15.26,14.84,0.871,5.763,3.312,2.221,5.22
3 | 14.88,14.57,0.8811,5.553999999999999,3.333,1.018,4.956
4 | 14.29,14.09,0.905,5.291,3.3369999999999997,2.699,4.825
5 | 13.84,13.94,0.8955,5.324,3.3789999999999996,2.259,4.805
6 | 16.14,14.99,0.9034,5.6579999999999995,3.562,1.355,5.175
7 | 14.38,14.21,0.8951,5.386,3.312,2.4619999999999997,4.956
8 | 14.69,14.49,0.8799,5.563,3.259,3.5860000000000003,5.218999999999999
9 | 14.11,14.1,0.8911,5.42,3.302,2.7,5.0
10 | 16.63,15.46,0.8747,6.053,3.465,2.04,5.877000000000001
11 | 16.44,15.25,0.888,5.8839999999999995,3.505,1.969,5.5329999999999995
12 | 15.26,14.85,0.8696,5.7139999999999995,3.242,4.543,5.314
13 | 14.03,14.16,0.8796,5.438,3.201,1.7169999999999999,5.001
14 | 13.89,14.02,0.888,5.439,3.199,3.986,4.738
15 | 13.78,14.06,0.8759,5.479,3.156,3.136,4.872
16 | 13.74,14.05,0.8744,5.482,3.114,2.932,4.825
17 | 14.59,14.28,0.8993,5.351,3.333,4.185,4.781000000000001
18 | 13.99,13.83,0.9183,5.119,3.383,5.234,4.781000000000001
19 | 15.69,14.75,0.9058,5.527,3.514,1.599,5.046
20 | 14.7,14.21,0.9153,5.205,3.466,1.767,4.649
21 | 12.72,13.57,0.8686,5.226,3.049,4.102,4.914
22 | 14.16,14.4,0.8584,5.6579999999999995,3.1289999999999996,3.072,5.176
23 | 14.11,14.26,0.8722,5.52,3.168,2.688,5.218999999999999
24 | 15.88,14.9,0.8988,5.617999999999999,3.5069999999999997,0.765,5.091
25 | 12.08,13.23,0.8664,5.099,2.9360000000000004,1.415,4.961
26 | 15.01,14.76,0.8657,5.789,3.245,1.791,5.001
27 | 16.19,15.16,0.8849,5.832999999999999,3.4210000000000003,0.903,5.307
28 | 13.02,13.76,0.8641,5.395,3.0260000000000002,3.373,4.825
29 | 12.74,13.67,0.8564,5.395,2.9560000000000004,2.504,4.869
30 | 14.11,14.18,0.882,5.541,3.221,2.7539999999999996,5.038
31 | 13.45,14.02,0.8604,5.516,3.065,3.531,5.0969999999999995
32 | 13.16,13.82,0.8662,5.454,2.975,0.855,5.056
33 | 15.49,14.94,0.8724,5.757000000000001,3.3710000000000004,3.412,5.228
34 | 14.09,14.41,0.8529,5.7170000000000005,3.1860000000000004,3.92,5.2989999999999995
35 | 13.94,14.17,0.8728,5.585,3.15,2.124,5.012
36 | 15.05,14.68,0.8779,5.712000000000001,3.3280000000000003,2.129,5.36
37 | 16.12,15.0,0.9,5.709,3.485,2.27,5.443
38 | 16.2,15.27,0.8734,5.8260000000000005,3.464,2.823,5.527
39 | 17.08,15.38,0.9079,5.832000000000001,3.6830000000000003,2.9560000000000004,5.484
40 | 14.8,14.52,0.8823,5.656000000000001,3.2880000000000003,3.112,5.309
41 | 14.28,14.17,0.8944,5.397,3.298,6.685,5.001
42 | 13.54,13.85,0.8871,5.348,3.156,2.5869999999999997,5.178
43 | 13.5,13.85,0.8852,5.351,3.158,2.249,5.176
44 | 13.16,13.55,0.9009,5.138,3.201,2.461,4.783
45 | 15.5,14.86,0.882,5.877000000000001,3.3960000000000004,4.711,5.528
46 | 15.11,14.54,0.8986,5.579,3.4619999999999997,3.128,5.18
47 | 13.8,14.04,0.8794,5.376,3.155,1.56,4.961
48 | 15.36,14.76,0.8861,5.7010000000000005,3.3930000000000002,1.367,5.132000000000001
49 | 14.99,14.56,0.8883,5.57,3.377,2.958,5.175
50 | 14.79,14.52,0.8819,5.545,3.2910000000000004,2.7039999999999997,5.111000000000001
51 | 14.86,14.67,0.8676,5.678,3.258,2.129,5.351
52 | 14.43,14.4,0.8751,5.585,3.272,3.975,5.144
53 | 15.78,14.91,0.8923,5.6739999999999995,3.4339999999999997,5.593,5.136
54 | 14.49,14.61,0.8538,5.715,3.113,4.1160000000000005,5.396
55 | 14.33,14.28,0.8831,5.504,3.199,3.3280000000000003,5.224
56 | 14.52,14.6,0.8557,5.7410000000000005,3.113,1.4809999999999999,5.487
57 | 15.03,14.77,0.8658,5.702000000000001,3.2119999999999997,1.933,5.439
58 | 14.46,14.35,0.8818,5.388,3.377,2.802,5.044
59 | 14.92,14.43,0.9006,5.3839999999999995,3.412,1.1420000000000001,5.088
60 | 15.38,14.77,0.8857,5.662000000000001,3.4189999999999996,1.999,5.222
61 | 12.11,13.47,0.8392,5.159,3.032,1.5019999999999998,4.519
62 | 11.42,12.86,0.8683,5.008,2.85,2.7,4.607
63 | 11.23,12.63,0.884,4.902,2.8789999999999996,2.269,4.703
64 | 12.36,13.19,0.8923,5.0760000000000005,3.042,3.22,4.605
65 | 13.22,13.84,0.868,5.395,3.07,4.157,5.088
66 | 12.78,13.57,0.8716,5.2620000000000005,3.0260000000000002,1.176,4.782
67 | 12.88,13.5,0.8879,5.138999999999999,3.1189999999999998,2.352,4.607
68 | 14.34,14.37,0.8726,5.63,3.19,1.3130000000000002,5.15
69 | 14.01,14.29,0.8625,5.609,3.158,2.217,5.132000000000001
70 | 14.37,14.39,0.8726,5.569,3.153,1.464,5.3
71 | 12.73,13.75,0.8458,5.412000000000001,2.8819999999999997,3.533,5.067
72 | 17.63,15.98,0.8673,6.191,3.5610000000000004,4.0760000000000005,6.06
73 | 16.84,15.67,0.8623,5.997999999999999,3.484,4.675,5.877000000000001
74 | 17.26,15.73,0.8763,5.978,3.594,4.539,5.791
75 | 19.11,16.26,0.9081,6.154,3.93,2.9360000000000004,6.079
76 | 16.82,15.51,0.8786,6.017,3.486,4.004,5.841
77 | 16.77,15.62,0.8638,5.9270000000000005,3.438,4.92,5.795
78 | 17.32,15.91,0.8599,6.064,3.403,3.824,5.922000000000001
79 | 20.71,17.23,0.8763,6.579,3.8139999999999996,4.4510000000000005,6.4510000000000005
80 | 18.94,16.49,0.875,6.445,3.639,5.064,6.362
81 | 17.12,15.55,0.8892,5.85,3.5660000000000003,2.858,5.746
82 | 16.53,15.34,0.8823,5.875,3.467,5.532,5.88
83 | 18.72,16.19,0.8977,6.006,3.8569999999999998,5.324,5.879
84 | 20.2,16.89,0.8894,6.285,3.864,5.172999999999999,6.187
85 | 19.57,16.74,0.8779,6.3839999999999995,3.772,1.472,6.273
86 | 19.51,16.71,0.878,6.3660000000000005,3.801,2.9619999999999997,6.185
87 | 18.27,16.09,0.887,6.172999999999999,3.6510000000000002,2.443,6.197
88 | 18.88,16.26,0.8969,6.084,3.764,1.649,6.109
89 | 18.98,16.66,0.8590000000000001,6.5489999999999995,3.67,3.6910000000000003,6.497999999999999
90 | 21.18,17.21,0.8989,6.5729999999999995,4.033,5.78,6.231
91 | 20.88,17.05,0.9031,6.45,4.032,5.016,6.321000000000001
92 | 20.1,16.99,0.8746,6.581,3.785,1.955,6.449
93 | 18.76,16.2,0.8984,6.172000000000001,3.7960000000000003,3.12,6.053
94 | 18.81,16.29,0.8906,6.272,3.693,3.237,6.053
95 | 18.59,16.05,0.9066,6.037000000000001,3.86,6.001,5.877000000000001
96 | 18.36,16.52,0.8452,6.666,3.485,4.933,6.4479999999999995
97 | 16.87,15.65,0.8648,6.138999999999999,3.463,3.696,5.9670000000000005
98 | 19.31,16.59,0.8815,6.341,3.81,3.477,6.2379999999999995
99 | 18.98,16.57,0.8687,6.449,3.552,2.144,6.452999999999999
100 | 18.17,16.26,0.8637,6.271,3.512,2.853,6.273
101 | 18.72,16.34,0.8809999999999999,6.218999999999999,3.6839999999999997,2.188,6.097
102 | 16.41,15.25,0.8866,5.718,3.525,4.217,5.617999999999999
103 | 17.99,15.86,0.8992,5.89,3.694,2.068,5.837000000000001
104 | 19.46,16.5,0.8985,6.1129999999999995,3.892,4.308,6.0089999999999995
105 | 19.18,16.63,0.8717,6.369,3.681,3.3569999999999998,6.229
106 | 18.95,16.42,0.8829,6.247999999999999,3.755,3.3680000000000003,6.148
107 | 18.83,16.29,0.8917,6.037000000000001,3.786,2.553,5.879
108 | 18.85,16.17,0.9056,6.152,3.806,2.843,6.2
109 | 17.63,15.86,0.88,6.0329999999999995,3.573,3.747,5.928999999999999
110 | 19.94,16.92,0.8752,6.675,3.763,3.252,6.55
111 | 18.55,16.22,0.8865,6.153,3.674,1.7380000000000002,5.894
112 | 18.45,16.12,0.8921,6.107,3.7689999999999997,2.235,5.794
113 | 19.38,16.72,0.8716,6.303,3.7910000000000004,3.678,5.965
114 | 19.13,16.31,0.9035,6.183,3.9019999999999997,2.109,5.9239999999999995
115 | 19.14,16.61,0.8722,6.2589999999999995,3.737,6.682,6.053
116 | 20.97,17.25,0.8859,6.563,3.991,4.677,6.316
117 | 19.06,16.45,0.8854,6.416,3.719,2.248,6.162999999999999
118 | 18.96,16.2,0.9077,6.051,3.897,4.334,5.75
119 | 19.15,16.45,0.889,6.245,3.815,3.0839999999999996,6.185
120 | 18.89,16.23,0.9008,6.227,3.7689999999999997,3.639,5.966
121 | 20.03,16.9,0.8811,6.492999999999999,3.8569999999999998,3.063,6.32
122 | 20.24,16.91,0.8897,6.315,3.9619999999999997,5.901,6.188
123 | 18.14,16.12,0.8772,6.059,3.563,3.6189999999999998,6.011
124 | 16.17,15.38,0.8588,5.7620000000000005,3.387,4.2860000000000005,5.702999999999999
125 | 18.43,15.97,0.9077,5.98,3.7710000000000004,2.984,5.905
126 | 15.99,14.89,0.9064,5.3629999999999995,3.582,3.3360000000000003,5.144
127 | 18.75,16.18,0.8999,6.111000000000001,3.8689999999999998,4.188,5.992000000000001
128 | 18.65,16.41,0.8698,6.285,3.594,4.391,6.102
129 | 17.98,15.85,0.8993,5.979,3.687,2.2569999999999997,5.919
130 | 20.16,17.03,0.8735,6.513,3.773,1.91,6.185
131 | 17.55,15.66,0.8991,5.791,3.69,5.3660000000000005,5.6610000000000005
132 | 18.3,15.89,0.9108,5.979,3.755,2.8369999999999997,5.962000000000001
133 | 18.94,16.32,0.8942,6.144,3.825,2.908,5.949
134 | 15.38,14.9,0.8706,5.8839999999999995,3.2680000000000002,4.462,5.795
135 | 16.16,15.33,0.8644,5.845,3.395,4.266,5.795
136 | 15.56,14.89,0.8823,5.776,3.408,4.9719999999999995,5.847
137 | 15.38,14.66,0.899,5.477,3.465,3.6,5.439
138 | 17.36,15.76,0.8785,6.145,3.574,3.5260000000000002,5.971
139 | 15.57,15.15,0.8527,5.92,3.2310000000000003,2.64,5.879
140 | 15.6,15.11,0.858,5.832000000000001,3.286,2.725,5.752000000000001
141 | 16.23,15.18,0.885,5.872000000000001,3.472,3.7689999999999997,5.922000000000001
142 | 13.07,13.92,0.848,5.472,2.9939999999999998,5.303999999999999,5.395
143 | 13.32,13.94,0.8613,5.541,3.073,7.035,5.44
144 | 13.34,13.95,0.862,5.388999999999999,3.074,5.995,5.307
145 | 12.22,13.32,0.8652,5.224,2.967,5.468999999999999,5.221
146 | 11.82,13.4,0.8274,5.314,2.7769999999999997,4.471,5.178
147 | 11.21,13.13,0.8167,5.279,2.687,6.169,5.275
148 | 11.43,13.13,0.8335,5.176,2.719,2.221,5.132000000000001
149 | 12.49,13.46,0.8658,5.267,2.967,4.421,5.002
150 | 12.7,13.71,0.8491,5.386,2.911,3.26,5.316
151 | 10.79,12.93,0.8107,5.317,2.648,5.462000000000001,5.194
152 | 11.83,13.23,0.8496,5.263,2.84,5.195,5.307
153 | 12.01,13.52,0.8249,5.405,2.7760000000000002,6.992000000000001,5.27
154 | 12.26,13.6,0.8333,5.4079999999999995,2.833,4.756,5.36
155 | 11.18,13.04,0.8266,5.22,2.693,3.332,5.001
156 | 11.36,13.05,0.8382,5.175,2.755,4.048,5.263
157 | 11.19,13.05,0.8253,5.25,2.675,5.813,5.218999999999999
158 | 11.34,12.87,0.8596,5.053,2.8489999999999998,3.347,5.003
159 | 12.13,13.73,0.8081,5.394,2.745,4.825,5.22
160 | 11.75,13.52,0.8082,5.444,2.678,4.378,5.31
161 | 11.49,13.22,0.8263,5.303999999999999,2.695,5.388,5.31
162 | 12.54,13.67,0.8425,5.4510000000000005,2.8789999999999996,3.082,5.4910000000000005
163 | 12.02,13.33,0.8503,5.35,2.81,4.271,5.308
164 | 12.05,13.41,0.8416,5.267,2.847,4.988,5.046
165 | 12.55,13.57,0.8558,5.332999999999999,2.968,4.419,5.176
166 | 11.14,12.79,0.8558,5.011,2.7939999999999996,6.388,5.0489999999999995
167 | 12.1,13.15,0.8793,5.105,2.9410000000000003,2.201,5.056
168 | 12.44,13.59,0.8462,5.319,2.897,4.9239999999999995,5.27
169 | 12.15,13.45,0.8443,5.417000000000001,2.8369999999999997,3.638,5.337999999999999
170 | 11.35,13.12,0.8291,5.176,2.668,4.337,5.132000000000001
171 | 11.24,13.0,0.8359,5.09,2.715,3.5210000000000004,5.088
172 | 11.02,13.0,0.8189,5.325,2.701,6.735,5.162999999999999
173 | 11.55,13.1,0.8455,5.167000000000001,2.845,6.715,4.956
174 | 11.27,12.97,0.8419,5.088,2.763,4.309,5.0
175 | 11.4,13.08,0.8375,5.136,2.763,5.587999999999999,5.0889999999999995
176 | 10.83,12.96,0.8099,5.278,2.641,5.182,5.185
177 | 10.8,12.57,0.8590000000000001,4.981,2.821,4.773,5.063
178 | 11.26,13.01,0.8355,5.186,2.71,5.335,5.092
179 | 10.74,12.73,0.8329,5.145,2.642,4.702,4.963
180 | 11.48,13.05,0.8473,5.18,2.758,5.876,5.002
181 | 12.21,13.47,0.8453,5.357,2.8930000000000002,1.661,5.178
182 | 11.41,12.95,0.856,5.09,2.775,4.957,4.825
183 | 12.46,13.41,0.8706,5.236000000000001,3.017,4.987,5.147
184 | 12.19,13.36,0.8579,5.24,2.909,4.857,5.1579999999999995
185 | 11.65,13.07,0.8575,5.1080000000000005,2.85,5.209,5.135
186 | 12.89,13.77,0.8541,5.495,3.0260000000000002,6.185,5.316
187 | 11.56,13.31,0.8198,5.3629999999999995,2.6830000000000003,4.062,5.182
188 | 11.81,13.45,0.8198,5.412999999999999,2.716,4.898,5.352
189 | 10.91,12.8,0.8372,5.088,2.675,4.178999999999999,4.956
190 | 11.23,12.82,0.8594,5.0889999999999995,2.821,7.524,4.957
191 | 10.59,12.41,0.8648,4.899,2.787,4.975,4.794
192 | 10.93,12.8,0.8390000000000001,5.046,2.717,5.398,5.045
193 | 11.27,12.86,0.8563,5.091,2.804,3.985,5.001
194 | 11.87,13.02,0.8795,5.132000000000001,2.9530000000000003,3.597,5.132000000000001
195 | 10.82,12.83,0.8256,5.18,2.63,4.853,5.0889999999999995
196 | 12.11,13.27,0.8639,5.236000000000001,2.975,4.132,5.012
197 | 12.8,13.47,0.8859999999999999,5.16,3.1260000000000003,4.873,4.914
198 | 12.79,13.53,0.8786,5.224,3.054,5.483,4.958
199 | 13.37,13.78,0.8849,5.32,3.128,4.67,5.091
200 | 12.62,13.67,0.8481,5.41,2.911,3.306,5.231
201 | 12.76,13.38,0.8964,5.073,3.155,2.8280000000000003,4.83
202 | 12.38,13.44,0.8609,5.218999999999999,2.989,5.472,5.045
203 | 12.67,13.32,0.8977,4.984,3.135,2.3,4.745
204 | 11.18,12.72,0.868,5.0089999999999995,2.81,4.051,4.828
205 | 12.7,13.41,0.8874,5.183,3.091,8.456,5.0
206 | 12.37,13.47,0.8567,5.204,2.96,3.9189999999999996,5.001
207 | 12.19,13.2,0.8783,5.1370000000000005,2.9810000000000003,3.6310000000000002,4.87
208 | 11.23,12.88,0.8511,5.14,2.795,4.325,5.003
209 | 13.2,13.66,0.8883,5.236000000000001,3.2319999999999998,8.315,5.056
210 | 11.84,13.21,0.8521,5.175,2.8360000000000003,3.5980000000000003,5.044
211 | 12.3,13.34,0.8684,5.242999999999999,2.9739999999999998,5.6370000000000005,5.063
212 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Linear_Regression/.ipynb_checkpoints/Data_Transformations-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Transformations\n",
8 | "\n",
9 | "You won't always get data in a convienent format, often you will have to deal with data that is non-numerical, such as customer names, or zipcodes, country names, etc...\n",
10 | "\n",
11 | "A big part of working with data is using your own domain knowledge to build an intuition of how to deal with the data, sometimes the best course of action is to drop the data, other times feature-engineering is a good way to go, or you could try to transform the data into something the Machine Learning Algorithms will understand.\n",
12 | "\n",
13 | "Spark has several built in methods of dealing with thse transformations, check them all out here: http://spark.apache.org/docs/latest/ml-features.html\n",
14 | "\n",
15 | "Let's see some examples of all of this!"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 2,
21 | "metadata": {
22 | "collapsed": true
23 | },
24 | "outputs": [],
25 | "source": [
26 | "from pyspark.sql import SparkSession"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 3,
32 | "metadata": {
33 | "collapsed": true
34 | },
35 | "outputs": [],
36 | "source": [
37 | "spark = SparkSession.builder.appName('data').getOrCreate()"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 4,
43 | "metadata": {
44 | "collapsed": true
45 | },
46 | "outputs": [],
47 | "source": [
48 | "df = spark.read.csv('fake_customers.csv',inferSchema=True,header=True)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 5,
54 | "metadata": {
55 | "collapsed": false
56 | },
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "+-------+----------+-----+\n",
63 | "| Name| Phone|Group|\n",
64 | "+-------+----------+-----+\n",
65 | "| John|4085552424| A|\n",
66 | "| Mike|3105552738| B|\n",
67 | "| Cassie|4085552424| B|\n",
68 | "| Laura|3105552438| B|\n",
69 | "| Sarah|4085551234| A|\n",
70 | "| David|3105557463| C|\n",
71 | "| Zach|4085553987| C|\n",
72 | "| Kiera|3105552938| A|\n",
73 | "| Alexa|4085559467| C|\n",
74 | "|Karissa|3105553475| A|\n",
75 | "+-------+----------+-----+\n",
76 | "\n"
77 | ]
78 | }
79 | ],
80 | "source": [
81 | "df.show()"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "## Data Features\n",
89 | "\n",
90 | "### StringIndexer\n",
91 | "\n",
92 | "We often have to convert string information into numerical information as a categorical feature. This is easily done with the StringIndexer Method:"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 6,
98 | "metadata": {
99 | "collapsed": false
100 | },
101 | "outputs": [
102 | {
103 | "name": "stdout",
104 | "output_type": "stream",
105 | "text": [
106 | "+-------+--------+-------------+\n",
107 | "|user_id|category|categoryIndex|\n",
108 | "+-------+--------+-------------+\n",
109 | "| 0| a| 0.0|\n",
110 | "| 1| b| 2.0|\n",
111 | "| 2| c| 1.0|\n",
112 | "| 3| a| 0.0|\n",
113 | "| 4| a| 0.0|\n",
114 | "| 5| c| 1.0|\n",
115 | "+-------+--------+-------------+\n",
116 | "\n"
117 | ]
118 | }
119 | ],
120 | "source": [
121 | "from pyspark.ml.feature import StringIndexer\n",
122 | "\n",
123 | "df = spark.createDataFrame(\n",
124 | " [(0, \"a\"), (1, \"b\"), (2, \"c\"), (3, \"a\"), (4, \"a\"), (5, \"c\")],\n",
125 | " [\"user_id\", \"category\"])\n",
126 | "\n",
127 | "indexer = StringIndexer(inputCol=\"category\", outputCol=\"categoryIndex\")\n",
128 | "indexed = indexer.fit(df).transform(df)\n",
129 | "indexed.show()"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "The next step would be to encode these categories into \"dummy\" variables."
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {
143 | "collapsed": true
144 | },
145 | "outputs": [],
146 | "source": []
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "### VectorIndexer\n",
153 | "\n",
154 | "VectorAssembler is a transformer that combines a given list of columns into a single vector column. It is useful for combining raw features and features generated by different feature transformers into a single feature vector, in order to train ML models like logistic regression and decision trees. VectorAssembler accepts the following input column types: all numeric types, boolean type, and vector type. In each row, the values of the input columns will be concatenated into a vector in the specified order. \n",
155 | "\n",
156 | "Assume that we have a DataFrame with the columns id, hour, mobile, userFeatures, and clicked:\n",
157 | "\n",
158 | " id | hour | mobile | userFeatures | clicked\n",
159 | " ----|------|--------|------------------|---------\n",
160 | " 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0\n",
161 | " \n",
162 | "userFeatures is a vector column that contains three user features. We want to combine hour, mobile, and userFeatures into a single feature vector called features and use it to predict clicked or not. If we set VectorAssembler’s input columns to hour, mobile, and userFeatures and output column to features, after transformation we should get the following DataFrame:\n",
163 | "\n",
164 | " id | hour | mobile | userFeatures | clicked | features\n",
165 | " ----|------|--------|------------------|---------|-----------------------------\n",
166 | " 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0 | [18.0, 1.0, 0.0, 10.0, 0.5]"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 14,
172 | "metadata": {
173 | "collapsed": false
174 | },
175 | "outputs": [
176 | {
177 | "name": "stdout",
178 | "output_type": "stream",
179 | "text": [
180 | "+---+----+------+--------------+-------+\n",
181 | "| id|hour|mobile| userFeatures|clicked|\n",
182 | "+---+----+------+--------------+-------+\n",
183 | "| 0| 18| 1.0|[0.0,10.0,0.5]| 1.0|\n",
184 | "+---+----+------+--------------+-------+\n",
185 | "\n"
186 | ]
187 | }
188 | ],
189 | "source": [
190 | "from pyspark.ml.linalg import Vectors\n",
191 | "from pyspark.ml.feature import VectorAssembler\n",
192 | "\n",
193 | "dataset = spark.createDataFrame(\n",
194 | " [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],\n",
195 | " [\"id\", \"hour\", \"mobile\", \"userFeatures\", \"clicked\"])\n",
196 | "dataset.show()"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 15,
202 | "metadata": {
203 | "collapsed": false
204 | },
205 | "outputs": [
206 | {
207 | "name": "stdout",
208 | "output_type": "stream",
209 | "text": [
210 | "Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'\n",
211 | "+--------------------+-------+\n",
212 | "| features|clicked|\n",
213 | "+--------------------+-------+\n",
214 | "|[18.0,1.0,0.0,10....| 1.0|\n",
215 | "+--------------------+-------+\n",
216 | "\n"
217 | ]
218 | }
219 | ],
220 | "source": [
221 | "assembler = VectorAssembler(\n",
222 | " inputCols=[\"hour\", \"mobile\", \"userFeatures\"],\n",
223 | " outputCol=\"features\")\n",
224 | "\n",
225 | "output = assembler.transform(dataset)\n",
226 | "print(\"Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'\")\n",
227 | "output.select(\"features\", \"clicked\").show()"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 | "There ar emany more data transformations available, we will cover them once we encounter a need for them, for now these were the most important ones.\n",
235 | "\n",
236 | "Let's continue on to Linear Regression!"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {
243 | "collapsed": true
244 | },
245 | "outputs": [],
246 | "source": []
247 | }
248 | ],
249 | "metadata": {
250 | "anaconda-cloud": {},
251 | "kernelspec": {
252 | "display_name": "Python [conda root]",
253 | "language": "python",
254 | "name": "conda-root-py"
255 | },
256 | "language_info": {
257 | "codemirror_mode": {
258 | "name": "ipython",
259 | "version": 3
260 | },
261 | "file_extension": ".py",
262 | "mimetype": "text/x-python",
263 | "name": "python",
264 | "nbconvert_exporter": "python",
265 | "pygments_lexer": "ipython3",
266 | "version": "3.5.3"
267 | }
268 | },
269 | "nbformat": 4,
270 | "nbformat_minor": 0
271 | }
272 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Linear_Regression/.ipynb_checkpoints/Linear_Regression_Consulting_Project-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Linear Regression Consulting Project"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {
13 | "collapsed": true
14 | },
15 | "source": [
16 | "Congratulations! You've been contracted by Hyundai Heavy Industries to help them build a predictive model for some ships. [Hyundai Heavy Industries](http://www.hyundai.eu/en) is one of the world's largest ship manufacturing companies and builds cruise liners.\n",
17 | "\n",
18 | "You've been flown to their headquarters in Ulsan, South Korea to help them give accurate estimates of how many crew members a ship will require.\n",
19 | "\n",
20 | "They are currently building new ships for some customers and want you to create a model and use it to predict how many crew members the ships will need.\n",
21 | "\n",
22 | "Here is what the data looks like so far:\n",
23 | "\n",
24 | " Description: Measurements of ship size, capacity, crew, and age for 158 cruise\n",
25 | " ships.\n",
26 | "\n",
27 | "\n",
28 | " Variables/Columns\n",
29 | " Ship Name 1-20\n",
30 | " Cruise Line 21-40\n",
31 | " Age (as of 2013) 46-48\n",
32 | " Tonnage (1000s of tons) 50-56\n",
33 | " passengers (100s) 58-64\n",
34 | " Length (100s of feet) 66-72\n",
35 | " Cabins (100s) 74-80\n",
36 | " Passenger Density 82-88\n",
37 | " Crew (100s) 90-96\n",
38 | " \n",
39 | "It is saved in a csv file for you called \"cruise_ship_info.csv\". Your job is to create a regression model that will help predict how many crew members will be needed for future ships. The client also mentioned that they have found that particular cruise lines will differ in acceptable crew counts, so it is most likely an important feature to include in your analysis! \n",
40 | "\n",
41 | "Once you've created the model and tested it for a quick check on how well you can expect it to perform, make sure you take a look at why it performs so well!"
42 | ]
43 | }
44 | ],
45 | "metadata": {
46 | "anaconda-cloud": {},
47 | "kernelspec": {
48 | "display_name": "Python [conda root]",
49 | "language": "python",
50 | "name": "conda-root-py"
51 | },
52 | "language_info": {
53 | "codemirror_mode": {
54 | "name": "ipython",
55 | "version": 3
56 | },
57 | "file_extension": ".py",
58 | "mimetype": "text/x-python",
59 | "name": "python",
60 | "nbconvert_exporter": "python",
61 | "pygments_lexer": "ipython3",
62 | "version": "3.5.3"
63 | }
64 | },
65 | "nbformat": 4,
66 | "nbformat_minor": 0
67 | }
68 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Linear_Regression/Data_Transformations.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Transformations\n",
8 | "\n",
9 | "You won't always get data in a convienent format, often you will have to deal with data that is non-numerical, such as customer names, or zipcodes, country names, etc...\n",
10 | "\n",
11 | "A big part of working with data is using your own domain knowledge to build an intuition of how to deal with the data, sometimes the best course of action is to drop the data, other times feature-engineering is a good way to go, or you could try to transform the data into something the Machine Learning Algorithms will understand.\n",
12 | "\n",
13 | "Spark has several built in methods of dealing with thse transformations, check them all out here: http://spark.apache.org/docs/latest/ml-features.html\n",
14 | "\n",
15 | "Let's see some examples of all of this!"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 2,
21 | "metadata": {
22 | "collapsed": true
23 | },
24 | "outputs": [],
25 | "source": [
26 | "from pyspark.sql import SparkSession"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 3,
32 | "metadata": {
33 | "collapsed": true
34 | },
35 | "outputs": [],
36 | "source": [
37 | "spark = SparkSession.builder.appName('data').getOrCreate()"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 4,
43 | "metadata": {
44 | "collapsed": true
45 | },
46 | "outputs": [],
47 | "source": [
48 | "df = spark.read.csv('fake_customers.csv',inferSchema=True,header=True)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 5,
54 | "metadata": {
55 | "collapsed": false
56 | },
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "+-------+----------+-----+\n",
63 | "| Name| Phone|Group|\n",
64 | "+-------+----------+-----+\n",
65 | "| John|4085552424| A|\n",
66 | "| Mike|3105552738| B|\n",
67 | "| Cassie|4085552424| B|\n",
68 | "| Laura|3105552438| B|\n",
69 | "| Sarah|4085551234| A|\n",
70 | "| David|3105557463| C|\n",
71 | "| Zach|4085553987| C|\n",
72 | "| Kiera|3105552938| A|\n",
73 | "| Alexa|4085559467| C|\n",
74 | "|Karissa|3105553475| A|\n",
75 | "+-------+----------+-----+\n",
76 | "\n"
77 | ]
78 | }
79 | ],
80 | "source": [
81 | "df.show()"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "## Data Features\n",
89 | "\n",
90 | "### StringIndexer\n",
91 | "\n",
92 | "We often have to convert string information into numerical information as a categorical feature. This is easily done with the StringIndexer Method:"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 6,
98 | "metadata": {
99 | "collapsed": false
100 | },
101 | "outputs": [
102 | {
103 | "name": "stdout",
104 | "output_type": "stream",
105 | "text": [
106 | "+-------+--------+-------------+\n",
107 | "|user_id|category|categoryIndex|\n",
108 | "+-------+--------+-------------+\n",
109 | "| 0| a| 0.0|\n",
110 | "| 1| b| 2.0|\n",
111 | "| 2| c| 1.0|\n",
112 | "| 3| a| 0.0|\n",
113 | "| 4| a| 0.0|\n",
114 | "| 5| c| 1.0|\n",
115 | "+-------+--------+-------------+\n",
116 | "\n"
117 | ]
118 | }
119 | ],
120 | "source": [
121 | "from pyspark.ml.feature import StringIndexer\n",
122 | "\n",
123 | "df = spark.createDataFrame(\n",
124 | " [(0, \"a\"), (1, \"b\"), (2, \"c\"), (3, \"a\"), (4, \"a\"), (5, \"c\")],\n",
125 | " [\"user_id\", \"category\"])\n",
126 | "\n",
127 | "indexer = StringIndexer(inputCol=\"category\", outputCol=\"categoryIndex\")\n",
128 | "indexed = indexer.fit(df).transform(df)\n",
129 | "indexed.show()"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "The next step would be to encode these categories into \"dummy\" variables."
137 | ]
138 | },
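139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "A minimal sketch of that encoding step, assuming the Spark 2.x OneHotEncoder transformer (in Spark 3.0+, OneHotEncoder is an Estimator that takes inputCols/outputCols and requires a fit() call):"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {
150 | "collapsed": false
151 | },
152 | "outputs": [],
153 | "source": [
154 | "# Minimal sketch: one-hot encode the indexed categories into sparse \"dummy\" vectors.\n",
155 | "# Assumes Spark 2.x, where OneHotEncoder is a plain Transformer (no fit needed).\n",
156 | "from pyspark.ml.feature import OneHotEncoder\n",
157 | "\n",
158 | "encoder = OneHotEncoder(inputCol=\"categoryIndex\", outputCol=\"categoryVec\")\n",
159 | "encoded = encoder.transform(indexed)\n",
160 | "encoded.show()"
161 | ]
162 | },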
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {
143 | "collapsed": true
144 | },
145 | "outputs": [],
146 | "source": []
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "### VectorIndexer\n",
153 | "\n",
154 | "VectorAssembler is a transformer that combines a given list of columns into a single vector column. It is useful for combining raw features and features generated by different feature transformers into a single feature vector, in order to train ML models like logistic regression and decision trees. VectorAssembler accepts the following input column types: all numeric types, boolean type, and vector type. In each row, the values of the input columns will be concatenated into a vector in the specified order. \n",
155 | "\n",
156 | "Assume that we have a DataFrame with the columns id, hour, mobile, userFeatures, and clicked:\n",
157 | "\n",
158 | " id | hour | mobile | userFeatures | clicked\n",
159 | " ----|------|--------|------------------|---------\n",
160 | " 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0\n",
161 | " \n",
162 | "userFeatures is a vector column that contains three user features. We want to combine hour, mobile, and userFeatures into a single feature vector called features and use it to predict clicked or not. If we set VectorAssembler’s input columns to hour, mobile, and userFeatures and output column to features, after transformation we should get the following DataFrame:\n",
163 | "\n",
164 | " id | hour | mobile | userFeatures | clicked | features\n",
165 | " ----|------|--------|------------------|---------|-----------------------------\n",
166 | " 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0 | [18.0, 1.0, 0.0, 10.0, 0.5]"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 14,
172 | "metadata": {
173 | "collapsed": false
174 | },
175 | "outputs": [
176 | {
177 | "name": "stdout",
178 | "output_type": "stream",
179 | "text": [
180 | "+---+----+------+--------------+-------+\n",
181 | "| id|hour|mobile| userFeatures|clicked|\n",
182 | "+---+----+------+--------------+-------+\n",
183 | "| 0| 18| 1.0|[0.0,10.0,0.5]| 1.0|\n",
184 | "+---+----+------+--------------+-------+\n",
185 | "\n"
186 | ]
187 | }
188 | ],
189 | "source": [
190 | "from pyspark.ml.linalg import Vectors\n",
191 | "from pyspark.ml.feature import VectorAssembler\n",
192 | "\n",
193 | "dataset = spark.createDataFrame(\n",
194 | " [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],\n",
195 | " [\"id\", \"hour\", \"mobile\", \"userFeatures\", \"clicked\"])\n",
196 | "dataset.show()"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 15,
202 | "metadata": {
203 | "collapsed": false
204 | },
205 | "outputs": [
206 | {
207 | "name": "stdout",
208 | "output_type": "stream",
209 | "text": [
210 | "Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'\n",
211 | "+--------------------+-------+\n",
212 | "| features|clicked|\n",
213 | "+--------------------+-------+\n",
214 | "|[18.0,1.0,0.0,10....| 1.0|\n",
215 | "+--------------------+-------+\n",
216 | "\n"
217 | ]
218 | }
219 | ],
220 | "source": [
221 | "assembler = VectorAssembler(\n",
222 | " inputCols=[\"hour\", \"mobile\", \"userFeatures\"],\n",
223 | " outputCol=\"features\")\n",
224 | "\n",
225 | "output = assembler.transform(dataset)\n",
226 | "print(\"Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'\")\n",
227 | "output.select(\"features\", \"clicked\").show()"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 | "There ar emany more data transformations available, we will cover them once we encounter a need for them, for now these were the most important ones.\n",
235 | "\n",
236 | "Let's continue on to Linear Regression!"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {
243 | "collapsed": true
244 | },
245 | "outputs": [],
246 | "source": []
247 | }
248 | ],
249 | "metadata": {
250 | "anaconda-cloud": {},
251 | "kernelspec": {
252 | "display_name": "Python [conda root]",
253 | "language": "python",
254 | "name": "conda-root-py"
255 | },
256 | "language_info": {
257 | "codemirror_mode": {
258 | "name": "ipython",
259 | "version": 3
260 | },
261 | "file_extension": ".py",
262 | "mimetype": "text/x-python",
263 | "name": "python",
264 | "nbconvert_exporter": "python",
265 | "pygments_lexer": "ipython3",
266 | "version": "3.5.3"
267 | }
268 | },
269 | "nbformat": 4,
270 | "nbformat_minor": 0
271 | }
272 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Linear_Regression/Linear_Regression_Consulting_Project.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Linear Regression Consulting Project"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {
13 | "collapsed": true
14 | },
15 | "source": [
16 | "Congratulations! You've been contracted by Hyundai Heavy Industries to help them build a predictive model for some ships. [Hyundai Heavy Industries](http://www.hyundai.eu/en) is one of the world's largest ship manufacturing companies and builds cruise liners.\n",
17 | "\n",
18 | "You've been flown to their headquarters in Ulsan, South Korea to help them give accurate estimates of how many crew members a ship will require.\n",
19 | "\n",
20 | "They are currently building new ships for some customers and want you to create a model and use it to predict how many crew members the ships will need.\n",
21 | "\n",
22 | "Here is what the data looks like so far:\n",
23 | "\n",
24 | " Description: Measurements of ship size, capacity, crew, and age for 158 cruise\n",
25 | " ships.\n",
26 | "\n",
27 | "\n",
28 | " Variables/Columns\n",
29 | " Ship Name 1-20\n",
30 | " Cruise Line 21-40\n",
31 | " Age (as of 2013) 46-48\n",
32 | " Tonnage (1000s of tons) 50-56\n",
33 | " passengers (100s) 58-64\n",
34 | " Length (100s of feet) 66-72\n",
35 | " Cabins (100s) 74-80\n",
36 | " Passenger Density 82-88\n",
37 | " Crew (100s) 90-96\n",
38 | " \n",
39 | "It is saved in a csv file for you called \"cruise_ship_info.csv\". Your job is to create a regression model that will help predict how many crew members will be needed for future ships. The client also mentioned that they have found that particular cruise lines will differ in acceptable crew counts, so it is most likely an important feature to include in your analysis! \n",
40 | "\n",
41 | "Once you've created the model and tested it for a quick check on how well you can expect it to perform, make sure you take a look at why it performs so well!"
42 | ]
43 | }
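44 | ,
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "*Hint:* a minimal sketch of handling the categorical Cruise_line column with StringIndexer before assembling your features (the column names come from cruise_ship_info.csv):"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {
56 | "collapsed": true
57 | },
58 | "outputs": [],
59 | "source": [
60 | "# Minimal sketch: index the categorical cruise line so it can be used as a numeric feature.\n",
61 | "from pyspark.sql import SparkSession\n",
62 | "from pyspark.ml.feature import StringIndexer\n",
63 | "\n",
64 | "spark = SparkSession.builder.appName('cruise').getOrCreate()\n",
65 | "df = spark.read.csv('cruise_ship_info.csv', inferSchema=True, header=True)\n",
66 | "indexer = StringIndexer(inputCol='Cruise_line', outputCol='cruise_cat')\n",
67 | "indexed = indexer.fit(df).transform(df)\n",
68 | "indexed.select('Cruise_line', 'cruise_cat').show(5)"
69 | ]
70 | }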
44 | ],
45 | "metadata": {
46 | "anaconda-cloud": {},
47 | "kernelspec": {
48 | "display_name": "Python [conda root]",
49 | "language": "python",
50 | "name": "conda-root-py"
51 | },
52 | "language_info": {
53 | "codemirror_mode": {
54 | "name": "ipython",
55 | "version": 3
56 | },
57 | "file_extension": ".py",
58 | "mimetype": "text/x-python",
59 | "name": "python",
60 | "nbconvert_exporter": "python",
61 | "pygments_lexer": "ipython3",
62 | "version": "3.5.3"
63 | }
64 | },
65 | "nbformat": 4,
66 | "nbformat_minor": 0
67 | }
68 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Linear_Regression/cruise_ship_info.csv:
--------------------------------------------------------------------------------
1 | Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
2 | Journey,Azamara,6,30.276999999999997,6.94,5.94,3.55,42.64,3.55
3 | Quest,Azamara,6,30.276999999999997,6.94,5.94,3.55,42.64,3.55
4 | Celebration,Carnival,26,47.262,14.86,7.22,7.43,31.8,6.7
5 | Conquest,Carnival,11,110.0,29.74,9.53,14.88,36.99,19.1
6 | Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.0
7 | Ecstasy,Carnival,22,70.367,20.52,8.55,10.2,34.29,9.2
8 | Elation,Carnival,15,70.367,20.52,8.55,10.2,34.29,9.2
9 | Fantasy,Carnival,23,70.367,20.56,8.55,10.22,34.23,9.2
10 | Fascination,Carnival,19,70.367,20.52,8.55,10.2,34.29,9.2
11 | Freedom,Carnival,6,110.23899999999999,37.0,9.51,14.87,29.79,11.5
12 | Glory,Carnival,10,110.0,29.74,9.51,14.87,36.99,11.6
13 | Holiday,Carnival,28,46.052,14.52,7.27,7.26,31.72,6.6
14 | Imagination,Carnival,18,70.367,20.52,8.55,10.2,34.29,9.2
15 | Inspiration,Carnival,17,70.367,20.52,8.55,10.2,34.29,9.2
16 | Legend,Carnival,11,86.0,21.24,9.63,10.62,40.49,9.3
17 | Liberty*,Carnival,8,110.0,29.74,9.51,14.87,36.99,11.6
18 | Miracle,Carnival,9,88.5,21.24,9.63,10.62,41.67,10.3
19 | Paradise,Carnival,15,70.367,20.52,8.55,10.2,34.29,9.2
20 | Pride,Carnival,12,88.5,21.24,9.63,11.62,41.67,9.3
21 | Sensation,Carnival,20,70.367,20.52,8.55,10.2,34.29,9.2
22 | Spirit,Carnival,12,88.5,21.24,9.63,10.56,41.67,10.29
23 | Triumph,Carnival,14,101.509,27.58,8.93,13.21,36.81,10.0
24 | Valor,Carnival,9,110.0,29.74,9.52,14.87,36.99,11.6
25 | Victory,Carnival,13,101.509,27.58,8.93,13.79,36.81,11.5
26 | Century,Celebrity,18,70.60600000000001,17.7,8.15,8.75,39.89,8.58
27 | Constellation,Celebrity,11,91.0,20.32,9.65,9.75,44.78,9.99
28 | Galaxy,Celebrity,17,77.71300000000001,18.9,8.66,9.35,41.12,9.09
29 | Infinity,Celebrity,12,91.0,20.32,9.65,9.75,44.78,9.99
30 | Mercury,Celebrity,16,77.71300000000001,18.82,8.66,9.35,41.29,9.09
31 | Millenium,Celebrity,13,91.0,20.32,9.65,9.75,44.78,9.99
32 | Solstice,Celebrity,5,122.0,28.5,10.33,6.87,34.57,6.7
33 | Summit,Celebrity,12,91.0,20.32,9.65,9.75,44.78,9.99
34 | Xpedition,Celebrity,12,2.329,0.94,2.96,0.45,24.78,0.6
35 | Zenith,Celebrity,21,47.225,13.66,6.82,6.87,34.57,6.7
36 | Allegra,Costa,21,28.43,8.08,6.16,4.1,35.19,4.0
37 | Atlantica,Costa,13,85.619,21.14,9.57,10.56,40.5,9.2
38 | Classica,Costa,22,52.926,13.02,7.18,6.54,40.65,6.17
39 | Europa,Costa,27,53.872,14.94,7.98,7.67,36.06,6.36
40 | Fortuna,Costa,10,105.0,27.2,8.9,13.56,38.6,10.68
41 | Magica,Costa,9,105.0,27.2,8.9,13.56,38.6,10.68
42 | Marina,Costa,23,25.0,7.76,6.22,3.86,32.22,3.85
43 | Mediterranea,Costa,10,86.0,21.14,9.6,10.56,40.68,9.2
44 | Romantica,Costa,20,53.049,13.44,7.22,6.78,39.47,6.0
45 | Serena,Costa,6,112.0,38.0,9.51,15.0,29.47,10.9
46 | Victoria,Costa,17,75.166,19.28,8.28,9.64,38.99,7.66
47 | Serenity,Crystal,10,68.0,10.8,7.9,5.5,62.96,6.36
48 | Symphony,Crystal,18,51.004,9.4,7.81,4.8,54.26,5.45
49 | QueenElizabethII,Cunard,44,70.327,17.91,9.63,9.5,39.27,9.21
50 | QueenMary2,Cunard,10,151.4,26.2,11.32,11.34,57.79,12.53
51 | QueenVictoria,Cunard,6,90.0,20.0,9.64,10.29,45.0,9.0
52 | Magic,Disney,15,83.338,17.5,9.64,8.75,47.62,9.45
53 | Wonder,Disney,14,83.0,17.5,9.64,8.75,47.43,9.45
54 | Amsterdam,Holland_American,13,61.0,13.8,7.8,6.88,44.2,6.0
55 | Eurodam,Holland_American,5,86.0,21.04,9.36,10.22,40.87,8.0
56 | Maasdam,Holland_American,20,55.451,12.64,7.19,6.32,43.87,5.57
57 | Noordam,Holland_American,29,33.92,12.14,7.04,6.07,27.94,5.3
58 | Oosterdam,Holland_American,10,81.76899999999999,18.48,9.59,9.24,44.25,8.42
59 | Prinsendam,Holland_American,25,38.0,7.49,6.74,3.96,50.73,4.6
60 | Rotterdam,Holland_American,16,59.652,13.2,7.77,6.6,45.19,6.44
61 | Ryndam,Holland_American,19,55.451,12.66,7.19,6.33,43.8,5.88
62 | Statendam,Holland_American,20,55.451,12.66,7.19,6.33,43.8,5.88
63 | Veendam,Holland_American,17,55.451,12.66,7.19,6.33,43.8,5.88
64 | Volendam,Holland_American,14,63.0,14.4,7.77,7.2,43.75,5.61
65 | Westerdam,Holland_American,27,53.872,14.94,7.98,7.47,36.06,6.12
66 | Zaandam,Holland_American,13,63.0,14.4,7.77,7.2,43.75,5.31
67 | Zuiderdam,Holland_American,11,85.0,18.48,9.51,9.24,46.0,8.0
68 | Armonia,MSC,12,58.6,15.66,8.24,7.83,37.42,7.0
69 | Fantasia,MSC,5,133.5,39.59,10.93,16.37,33.72,13.13
70 | Lirica,MSC,10,58.825,15.6,8.23,7.65,37.71,7.0
71 | Melody,MSC,31,35.143,12.5,6.69,5.32,28.11,5.35
72 | Musica,MSC,7,89.6,25.5,9.61,12.75,35.14,9.87
73 | Opera,MSC,9,59.058,17.0,7.63,8.5,34.74,7.4
74 | Rhapsody,MSC,36,16.852,9.52,5.41,3.83,17.7,2.97
75 | Sinfonia,MSC,11,58.6,15.66,8.23,7.83,37.42,7.6
76 | Crown,Norwegian,25,34.25,10.52,6.15,5.26,32.56,4.7
77 | Dawn,Norwegian,11,90.0,22.4,9.65,11.2,40.18,11.0
78 | Dream,Norwegian,21,50.76,17.48,7.54,8.74,29.04,6.14
79 | Gem,Norwegian,6,93.0,23.94,9.65,11.97,38.85,11.09
80 | Jewel,Norwegian,8,91.0,22.44,9.65,11.22,40.55,11.0
81 | Majesty,Norwegian,21,38.0,10.56,5.67,5.28,35.98,4.38
82 | PrideofAloha,Norwegian,14,77.104,20.02,8.53,10.01,38.51,8.0
83 | PrideofAmerica,Norwegian,9,81.0,21.44,9.21,10.72,37.78,10.0
84 | Sea,Norwegian,25,42.0,15.04,7.08,7.52,27.93,6.3
85 | Spirit,Norwegian,15,75.33800000000001,19.56,8.79,9.83,38.52,13.0
86 | Star,Norwegian,40,28.0,11.5,6.74,4.0,24.35,3.8
87 | Sun,Norwegian,12,77.104,20.02,8.53,10.01,38.51,9.59
88 | Wind,Norwegian,20,50.76,17.48,7.54,8.74,29.04,6.14
89 | Insignia,Oceania,15,30.276999999999997,6.84,5.94,3.42,44.26,4.0
90 | Nautica,Oceania,13,30.276999999999997,6.84,5.94,3.42,44.26,4.0
91 | Regatta,Oceania,15,30.276999999999997,6.84,5.94,3.42,44.26,4.0
92 | MarcoPolo,Orient,48,22.08,8.26,5.78,4.25,26.73,3.5
93 | Arcadia,P&O,9,85.0,19.68,9.35,9.84,43.19,8.69
94 | Artemis,P&O,29,45.0,11.78,7.54,5.3,38.2,5.2
95 | Aurora,P&O,13,76.0,18.74,8.86,9.39,40.55,8.5
96 | Oceana,P&O,10,77.0,20.16,8.56,9.75,38.19,9.0
97 | Oriana,P&O,18,69.153,18.82,8.53,9.14,36.74,7.94
98 | Ventura,P&O,5,115.0,35.74,9.0,15.32,32.18,12.2
99 | Caribbean,Princess,9,116.0,26.0,9.51,13.0,44.62,11.0
100 | Coral,Princess,11,91.62700000000001,19.74,9.64,9.87,46.42,9.0
101 | Crown,Princess,7,116.0,31.0,9.51,15.57,37.42,12.0
102 | Dawn,Princess,16,77.499,19.5,8.56,10.5,39.74,9.0
103 | Diamond,Princess,9,113.0,26.74,9.51,13.37,42.26,12.38
104 | Emerald,Princess,6,113.0,37.82,9.51,15.57,29.88,12.0
105 | Golden,Princess,12,108.865,27.58,9.51,13.0,39.47,11.0
106 | Grand,Princess,15,108.806,26.0,9.51,13.0,41.85,11.1
107 | Island,Princess,10,91.62700000000001,19.74,9.64,9.87,46.42,9.0
108 | Pacific,Princess,14,30.276999999999997,6.86,5.93,3.44,44.14,3.73
109 | Regal,Princess,22,69.845,15.9,8.03,7.95,43.93,6.96
110 | Royal,Princess,29,44.348,12.0,7.54,6.0,36.96,5.2
111 | Saphire,Princess,9,113.0,26.74,9.51,13.37,42.26,12.38
112 | Sea,Princess,8,77.499,19.5,8.56,9.75,39.74,9.0
113 | Star,Princess,11,108.977,26.02,9.51,13.01,41.88,12.0
114 | Sun,Princess,18,77.499,19.5,8.56,9.75,39.74,9.0
115 | Tahitian,Princess,14,30.276999999999997,6.88,5.93,3.44,44.01,3.73
116 | ExplorerII,Regent_Seven_Seas,27,12.5,3.94,4.36,0.88,31.73,1.46
117 | Mariner,Regent_Seven_Seas,12,50.0,7.0,7.09,3.54,71.43,4.45
118 | Navigator,Regent_Seven_Seas,14,33.0,4.9,5.6,2.45,67.35,3.24
119 | PaulGauguin,Regent_Seven_Seas,16,19.2,3.2,5.13,1.6,60.0,2.11
120 | Voyager,Regent_Seven_Seas,10,46.0,7.0,6.7,1.82,65.71,4.47
121 | Adventure,Royal_Caribbean,12,138.0,31.14,10.2,15.57,44.32,11.85
122 | Brilliance,Royal_Caribbean,11,90.09,25.01,9.62,10.5,36.02,8.48
123 | Empress,Royal_Caribbean,23,48.563,20.2,6.92,8.0,24.04,6.71
124 | Enchantment,Royal_Caribbean,16,74.137,19.5,9.16,9.75,38.02,7.6
125 | Explorer,Royal_Caribbean,13,138.0,31.14,10.2,15.57,44.32,11.76
126 | Freedom,Royal_Caribbean,7,158.0,43.7,11.12,18.0,36.16,13.6
127 | Grandeur,Royal_Caribbean,17,74.137,19.5,9.16,9.75,38.02,7.6
128 | Independence,Royal_Caribbean,5,160.0,36.34,11.12,18.17,44.03,13.6
129 | Jewel,Royal_Caribbean,9,90.09,25.01,9.62,10.94,36.02,8.69
130 | Legend,Royal_Caribbean,18,70.0,18.0,8.67,9.0,38.89,7.2
131 | Liberty,Royal_Caribbean,6,158.0,43.7,11.25,18.0,36.16,13.6
132 | Majesty,Royal_Caribbean,21,73.941,27.44,8.8,11.75,26.95,8.22
133 | Mariner,Royal_Caribbean,10,138.0,31.14,10.2,15.57,44.32,11.85
134 | Monarch,Royal_Caribbean,22,73.941,27.44,8.8,11.77,30.94,8.22
135 | Navigator,Royal_Caribbean,11,138.0,31.14,10.2,15.57,44.32,11.85
136 | Oasis,Royal_Caribbean,4,220.0,54.0,11.82,27.0,40.74,21.0
137 | Radiance,Royal_Caribbean,12,90.09,25.01,9.62,10.5,36.02,8.68
138 | Rhapsody,Royal_Caribbean,16,78.491,24.35,9.15,10.0,32.23,7.65
139 | Serenade,Royal_Caribbean,10,90.09,25.01,9.62,10.5,36.02,8.58
140 | Sovreign,Royal_Caribbean,25,73.192,28.52,8.8,11.38,25.66,8.08
141 | Splendour,Royal_Caribbean,17,70.0,20.76,8.67,9.02,33.72,7.2
142 | Vision,Royal_Caribbean,15,78.491,24.35,9.15,10.0,32.23,6.6
143 | Voyager,Royal_Caribbean,14,138.0,31.14,10.2,15.57,44.32,11.76
144 | Legend,Seabourn,21,10.0,2.08,4.4,1.04,48.08,1.6
145 | Pride,Seabourn,27,10.0,2.08,4.4,1.04,48.08,1.6
146 | Spirit,Seabourn,24,10.0,2.08,4.4,1.04,48.08,1.6
147 | Cloud,Silversea,19,16.8,2.96,5.14,1.48,56.76,2.1
148 | Shadow,Silversea,13,25.0,3.82,5.97,1.94,65.45,2.95
149 | Whisper,Silversea,12,25.0,3.88,5.97,1.94,64.43,2.87
150 | Wind,Silversea,19,16.8,2.96,5.14,1.48,56.76,1.97
151 | Aries,Star,22,3.341,0.66,2.8,0.33,50.62,0.59
152 | Gemini,Star,21,19.093,8.0,5.37,4.0,23.87,4.7
153 | Libra,Star,12,42.0,14.8,7.13,7.4,28.38,6.8
154 | Pisces,Star,24,40.053000000000004,12.87,5.79,7.76,31.12,7.5
155 | Taurus,Star,22,3.341,0.66,2.79,0.33,50.62,0.59
156 | Virgo,Star,14,76.8,19.6,8.79,9.67,39.18,12.0
157 | Spirit,Windstar,25,5.35,1.58,4.4,0.74,33.86,0.88
158 | Star,Windstar,27,5.35,1.67,4.4,0.74,32.04,0.88
159 | Surf,Windstar,23,14.745,3.08,6.17,1.56,47.87,1.8
160 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Linear_Regression/fake_customers.csv:
--------------------------------------------------------------------------------
1 | Name,Phone,Group
2 | John,4085552424,A
3 | Mike,3105552738,B
4 | Cassie,4085552424,B
5 | Laura,3105552438,B
6 | Sarah,4085551234,A
7 | David,3105557463,C
8 | Zach,4085553987,C
9 | Kiera,3105552938,A
10 | Alexa,4085559467,C
11 | Karissa,3105553475,A
12 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Logistic_Regression/.ipynb_checkpoints/Logistic_Regression_Consulting_Project-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Logistic Regression Consulting Project"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {
13 | "collapsed": true
14 | },
15 | "source": [
16 | "## Binary Customer Churn\n",
17 | "\n",
18 | "A marketing agency has many customers that use their service to produce ads for the client/customer websites. They've noticed that they have quite a bit of churn in clients. They basically randomly assign account managers right now, but want you to create a machine learning model that will help predict which customers will churn (stop buying their service) so that they can correctly assign the customers most at risk to churn an account manager. Luckily they have some historical data, can you help them out? Create a classification algorithm that will help classify whether or not a customer churned. Then the company can test this against incoming data for future customers to predict which customers will churn and assign them an account manager.\n",
19 | "\n",
20 | "The data is saved as customer_churn.csv. Here are the fields and their definitions:\n",
21 | "\n",
22 | " Name : Name of the latest contact at Company\n",
23 | " Age: Customer Age\n",
24 | " Total_Purchase: Total Ads Purchased\n",
25 | " Account_Manager: Binary 0=No manager, 1= Account manager assigned\n",
26 | " Years: Totaly Years as a customer\n",
27 | " Num_sites: Number of websites that use the service.\n",
28 | " Onboard_date: Date that the name of the latest contact was onboarded\n",
29 | " Location: Client HQ Address\n",
30 | " Company: Name of Client Company\n",
31 | " \n",
32 | "Once you've created the model and evaluated it, test out the model on some new data (you can think of this almost like a hold-out set) that your client has provided, saved under new_customers.csv. The client wants to know which customers are most likely to churn given this data (they don't have the label yet)."
33 | ]
34 | }
35 | ],
36 | "metadata": {
37 | "anaconda-cloud": {},
38 | "kernelspec": {
39 | "display_name": "Python [conda root]",
40 | "language": "python",
41 | "name": "conda-root-py"
42 | },
43 | "language_info": {
44 | "codemirror_mode": {
45 | "name": "ipython",
46 | "version": 3
47 | },
48 | "file_extension": ".py",
49 | "mimetype": "text/x-python",
50 | "name": "python",
51 | "nbconvert_exporter": "python",
52 | "pygments_lexer": "ipython3",
53 | "version": "3.5.3"
54 | }
55 | },
56 | "nbformat": 4,
57 | "nbformat_minor": 0
58 | }
59 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Logistic_Regression/Logistic_Regression_Consulting_Project.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Logistic Regression Consulting Project"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {
13 | "collapsed": true
14 | },
15 | "source": [
16 | "## Binary Customer Churn\n",
17 | "\n",
18 | "A marketing agency has many customers that use their service to produce ads for the client/customer websites. They've noticed that they have quite a bit of churn in clients. They basically randomly assign account managers right now, but want you to create a machine learning model that will help predict which customers will churn (stop buying their service) so that they can correctly assign the customers most at risk to churn an account manager. Luckily they have some historical data, can you help them out? Create a classification algorithm that will help classify whether or not a customer churned. Then the company can test this against incoming data for future customers to predict which customers will churn and assign them an account manager.\n",
19 | "\n",
20 | "The data is saved as customer_churn.csv. Here are the fields and their definitions:\n",
21 | "\n",
22 | " Name : Name of the latest contact at Company\n",
23 | " Age: Customer Age\n",
24 | " Total_Purchase: Total Ads Purchased\n",
25 | " Account_Manager: Binary 0=No manager, 1= Account manager assigned\n",
26 | " Years: Totaly Years as a customer\n",
27 | " Num_sites: Number of websites that use the service.\n",
28 | " Onboard_date: Date that the name of the latest contact was onboarded\n",
29 | " Location: Client HQ Address\n",
30 | " Company: Name of Client Company\n",
31 | " \n",
32 | "Once you've created the model and evaluated it, test out the model on some new data (you can think of this almost like a hold-out set) that your client has provided, saved under new_customers.csv. The client wants to know which customers are most likely to churn given this data (they don't have the label yet)."
33 | ]
34 | }
35 | ],
36 | "metadata": {
37 | "anaconda-cloud": {},
38 | "kernelspec": {
39 | "display_name": "Python [conda root]",
40 | "language": "python",
41 | "name": "conda-root-py"
42 | },
43 | "language_info": {
44 | "codemirror_mode": {
45 | "name": "ipython",
46 | "version": 3
47 | },
48 | "file_extension": ".py",
49 | "mimetype": "text/x-python",
50 | "name": "python",
51 | "nbconvert_exporter": "python",
52 | "pygments_lexer": "ipython3",
53 | "version": "3.5.3"
54 | }
55 | },
56 | "nbformat": 4,
57 | "nbformat_minor": 0
58 | }
59 |
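
The consulting project notebook above stops at the problem statement. Below is a minimal PySpark sketch of one possible solution, not the course's official one. It assumes the historical file carries a binary label column named Churn and spells the sites column Num_Sites (as in new_customers.csv later in this repo); verify both names against customer_churn.csv before running.

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.appName('churn_sketch').getOrCreate()

# Historical, labeled data ('Churn' as the label column name is an assumption)
data = spark.read.csv('customer_churn.csv', inferSchema=True, header=True)

# Assemble the numeric fields into the single features vector Spark ML expects
assembler = VectorAssembler(
    inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'],
    outputCol='features')
final_data = assembler.transform(data).select('features', 'Churn')

# Standard split / fit / evaluate cycle (AUC on the held-out split)
train, test = final_data.randomSplit([0.7, 0.3])
lr_model = LogisticRegression(labelCol='Churn').fit(train)
auc = BinaryClassificationEvaluator(labelCol='Churn').evaluate(lr_model.transform(test))

# Refit on all labeled data, then score the client's unlabeled prospects
final_model = LogisticRegression(labelCol='Churn').fit(final_data)
new_customers = spark.read.csv('new_customers.csv', inferSchema=True, header=True)
final_model.transform(assembler.transform(new_customers)).select('Company', 'prediction').show()

A prediction of 1.0 flags an account as likely to churn; those are the customers to assign a manager first.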
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Logistic_Regression/Titanic_Log_Regression_Code_Along.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Logistic Regression Code Along\n",
8 | "This is a code along of the famous titanic dataset, its always nice to start off with this dataset because it is an example you will find across pretty much every data analysis language."
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {
15 | "collapsed": true
16 | },
17 | "outputs": [],
18 | "source": [
19 | "from pyspark.sql import SparkSession"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {
26 | "collapsed": true
27 | },
28 | "outputs": [],
29 | "source": [
30 | "spark = SparkSession.builder.appName('myproj').getOrCreate()"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {
37 | "collapsed": true
38 | },
39 | "outputs": [],
40 | "source": [
41 | "data = spark.read.csv('titanic.csv',inferSchema=True,header=True)"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 4,
47 | "metadata": {
48 | "collapsed": false
49 | },
50 | "outputs": [
51 | {
52 | "name": "stdout",
53 | "output_type": "stream",
54 | "text": [
55 | "root\n",
56 | " |-- PassengerId: integer (nullable = true)\n",
57 | " |-- Survived: integer (nullable = true)\n",
58 | " |-- Pclass: integer (nullable = true)\n",
59 | " |-- Name: string (nullable = true)\n",
60 | " |-- Sex: string (nullable = true)\n",
61 | " |-- Age: double (nullable = true)\n",
62 | " |-- SibSp: integer (nullable = true)\n",
63 | " |-- Parch: integer (nullable = true)\n",
64 | " |-- Ticket: string (nullable = true)\n",
65 | " |-- Fare: double (nullable = true)\n",
66 | " |-- Cabin: string (nullable = true)\n",
67 | " |-- Embarked: string (nullable = true)\n",
68 | "\n"
69 | ]
70 | }
71 | ],
72 | "source": [
73 | "data.printSchema()"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 7,
79 | "metadata": {
80 | "collapsed": false
81 | },
82 | "outputs": [
83 | {
84 | "data": {
85 | "text/plain": [
86 | "['PassengerId',\n",
87 | " 'Survived',\n",
88 | " 'Pclass',\n",
89 | " 'Name',\n",
90 | " 'Sex',\n",
91 | " 'Age',\n",
92 | " 'SibSp',\n",
93 | " 'Parch',\n",
94 | " 'Ticket',\n",
95 | " 'Fare',\n",
96 | " 'Cabin',\n",
97 | " 'Embarked']"
98 | ]
99 | },
100 | "execution_count": 7,
101 | "metadata": {},
102 | "output_type": "execute_result"
103 | }
104 | ],
105 | "source": [
106 | "data.columns"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 8,
112 | "metadata": {
113 | "collapsed": true
114 | },
115 | "outputs": [],
116 | "source": [
117 | "my_cols = data.select(['Survived',\n",
118 | " 'Pclass',\n",
119 | " 'Sex',\n",
120 | " 'Age',\n",
121 | " 'SibSp',\n",
122 | " 'Parch',\n",
123 | " 'Fare',\n",
124 | " 'Embarked'])"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 29,
130 | "metadata": {
131 | "collapsed": false
132 | },
133 | "outputs": [],
134 | "source": [
135 | "my_final_data = my_cols.na.drop()"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "### Working with Categorical Columns\n",
143 | "\n",
144 | "Let's break this down into multiple steps to make it all clear."
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 12,
150 | "metadata": {
151 | "collapsed": true
152 | },
153 | "outputs": [],
154 | "source": [
155 | "from pyspark.ml.feature import (VectorAssembler,VectorIndexer,\n",
156 | " OneHotEncoder,StringIndexer)"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 13,
162 | "metadata": {
163 | "collapsed": true
164 | },
165 | "outputs": [],
166 | "source": [
167 | "gender_indexer = StringIndexer(inputCol='Sex',outputCol='SexIndex')\n",
168 | "gender_encoder = OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 14,
174 | "metadata": {
175 | "collapsed": true
176 | },
177 | "outputs": [],
178 | "source": [
179 | "embark_indexer = StringIndexer(inputCol='Embarked',outputCol='EmbarkIndex')\n",
180 | "embark_encoder = OneHotEncoder(inputCol='EmbarkIndex',outputCol='EmbarkVec')"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 15,
186 | "metadata": {
187 | "collapsed": true
188 | },
189 | "outputs": [],
190 | "source": [
191 | "assembler = VectorAssembler(inputCols=['Pclass',\n",
192 | " 'SexVec',\n",
193 | " 'Age',\n",
194 | " 'SibSp',\n",
195 | " 'Parch',\n",
196 | " 'Fare',\n",
197 | " 'EmbarkVec'],outputCol='features')"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 30,
203 | "metadata": {
204 | "collapsed": true
205 | },
206 | "outputs": [],
207 | "source": [
208 | "from pyspark.ml.classification import LogisticRegression"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "## Pipelines \n",
216 | "\n",
217 | "Let's see an example of how to use pipelines (we'll get a lot more practice with these later!)"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 17,
223 | "metadata": {
224 | "collapsed": true
225 | },
226 | "outputs": [],
227 | "source": [
228 | "from pyspark.ml import Pipeline"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 18,
234 | "metadata": {
235 | "collapsed": true
236 | },
237 | "outputs": [],
238 | "source": [
239 | "log_reg_titanic = LogisticRegression(featuresCol='features',labelCol='Survived')"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 19,
245 | "metadata": {
246 | "collapsed": true
247 | },
248 | "outputs": [],
249 | "source": [
250 | "pipeline = Pipeline(stages=[gender_indexer,embark_indexer,\n",
251 | " gender_encoder,embark_encoder,\n",
252 | " assembler,log_reg_titanic])"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 20,
258 | "metadata": {
259 | "collapsed": true
260 | },
261 | "outputs": [],
262 | "source": [
263 | "train_titanic_data, test_titanic_data = my_final_data.randomSplit([0.7,.3])"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 21,
269 | "metadata": {
270 | "collapsed": true
271 | },
272 | "outputs": [],
273 | "source": [
274 | "fit_model = pipeline.fit(train_titanic_data)"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 22,
280 | "metadata": {
281 | "collapsed": true
282 | },
283 | "outputs": [],
284 | "source": [
285 | "results = fit_model.transform(test_titanic_data)"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 23,
291 | "metadata": {
292 | "collapsed": true
293 | },
294 | "outputs": [],
295 | "source": [
296 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 24,
302 | "metadata": {
303 | "collapsed": true
304 | },
305 | "outputs": [],
306 | "source": [
307 | "my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',\n",
308 | " labelCol='Survived')"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 26,
314 | "metadata": {
315 | "collapsed": false
316 | },
317 | "outputs": [
318 | {
319 | "name": "stdout",
320 | "output_type": "stream",
321 | "text": [
322 | "+--------+----------+\n",
323 | "|Survived|prediction|\n",
324 | "+--------+----------+\n",
325 | "| 0| 1.0|\n",
326 | "| 0| 1.0|\n",
327 | "| 0| 1.0|\n",
328 | "| 0| 1.0|\n",
329 | "| 0| 0.0|\n",
330 | "| 0| 1.0|\n",
331 | "| 0| 1.0|\n",
332 | "| 0| 0.0|\n",
333 | "| 0| 0.0|\n",
334 | "| 0| 0.0|\n",
335 | "| 0| 0.0|\n",
336 | "| 0| 0.0|\n",
337 | "| 0| 0.0|\n",
338 | "| 0| 0.0|\n",
339 | "| 0| 0.0|\n",
340 | "| 0| 0.0|\n",
341 | "| 0| 0.0|\n",
342 | "| 0| 1.0|\n",
343 | "| 0| 1.0|\n",
344 | "| 0| 1.0|\n",
345 | "+--------+----------+\n",
346 | "only showing top 20 rows\n",
347 | "\n"
348 | ]
349 | }
350 | ],
351 | "source": [
352 | "results.select('Survived','prediction').show()"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": 27,
358 | "metadata": {
359 | "collapsed": true
360 | },
361 | "outputs": [],
362 | "source": [
363 | "AUC = my_eval.evaluate(results)"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": 28,
369 | "metadata": {
370 | "collapsed": false
371 | },
372 | "outputs": [
373 | {
374 | "data": {
375 | "text/plain": [
376 | "0.7918269230769232"
377 | ]
378 | },
379 | "execution_count": 28,
380 | "metadata": {},
381 | "output_type": "execute_result"
382 | }
383 | ],
384 | "source": [
385 | "AUC"
386 | ]
387 | },
388 | {
389 | "cell_type": "markdown",
390 | "metadata": {},
391 | "source": [
392 | "## Great Job!"
393 | ]
394 | }
395 | ],
396 | "metadata": {
397 | "anaconda-cloud": {},
398 | "kernelspec": {
399 | "display_name": "Python [conda root]",
400 | "language": "python",
401 | "name": "conda-root-py"
402 | },
403 | "language_info": {
404 | "codemirror_mode": {
405 | "name": "ipython",
406 | "version": 3
407 | },
408 | "file_extension": ".py",
409 | "mimetype": "text/x-python",
410 | "name": "python",
411 | "nbconvert_exporter": "python",
412 | "pygments_lexer": "ipython3",
413 | "version": "3.5.3"
414 | }
415 | },
416 | "nbformat": 4,
417 | "nbformat_minor": 0
418 | }
419 |
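
One note on the evaluation cell above: rawPredictionCol='prediction' scores the AUC from the hard 0/1 predictions, which coarsens the metric. The pipeline's output DataFrame also contains a rawPrediction column; evaluating on it ranks rows by the model's scores instead. A minimal variant, reusing the results DataFrame from the notebook:

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# areaUnderROC is the default metric; rawPrediction preserves the ranking
# information that the thresholded prediction column throws away.
raw_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                         labelCol='Survived')
print(raw_eval.evaluate(results))

On Spark 3.x the pipeline itself runs unchanged; OneHotEncoder there is an Estimator rather than a plain Transformer, but Pipeline.fit() takes care of fitting it.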
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Logistic_Regression/new_customers.csv:
--------------------------------------------------------------------------------
1 | Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company
2 | Andrew Mccall,37.0,9935.53,1,7.71,8.0,2011-08-29 18:37:54,"38612 Johnny Stravenue Nataliebury, WI 15717-8316",King Ltd,
3 | Michele Wright,23.0,7526.94,1,9.28,15.0,2013-07-22 18:19:54,"21083 Nicole Junction Suite 332, Youngport, ME 23686-4381",Cannon-Benson
4 | Jeremy Chang,65.0,100.0,1,1.0,15.0,2006-12-11 07:48:13,"085 Austin Views Lake Julialand, WY 63726-4298",Barron-Robertson
5 | Megan Ferguson,32.0,6487.5,0,9.4,14.0,2016-10-28 05:32:13,"922 Wright Branch North Cynthialand, NC 64721",Sexton-Golden
6 | Taylor Young,32.0,13147.71,1,10.0,8.0,2012-03-20 00:36:46,"Unit 0789 Box 0734 DPO AP 39702",Wood LLC,
7 | Jessica Drake,22.0,8445.26,1,3.46,14.0,2011-02-04 19:29:27,"1148 Tina Stravenue Apt. 978 South Carlos TX 21222 9221",Parks-Robbins
8 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Natural_Language_Processing/smsspamcollection/readme:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuperJohn/spark-and-python-for-big-data-with-pyspark/2571210837c00e6315a9d93f0cd1dc35e2955375/Spark_for_Machine_Learning/Natural_Language_Processing/smsspamcollection/readme
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Tree_Methods/.ipynb_checkpoints/Tree_Methods_Consulting_Project-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tree Methods Consulting Project "
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "You've been hired by a dog food company to try to predict why some batches of their dog food are spoiling much quicker than intended! Unfortunately this Dog Food company hasn't upgraded to the latest machinery, meaning that the amounts of the five preservative chemicals they are using can vary a lot, but which is the chemical that has the strongest effect? The dog food company first mixes up a batch of preservative that contains 4 different preservative chemicals (A,B,C,D) and then is completed with a \"filler\" chemical. The food scientists beelive one of the A,B,C, or D preservatives is causing the problem, but need your help to figure out which one!\n",
15 | "Use Machine Learning with RF to find out which parameter had the most predicitive power, thus finding out which chemical causes the early spoiling! So create a model and then find out how you can decide which chemical is the problem!\n",
16 | "\n",
17 | "* Pres_A : Percentage of preservative A in the mix\n",
18 | "* Pres_B : Percentage of preservative B in the mix\n",
19 | "* Pres_C : Percentage of preservative C in the mix\n",
20 | "* Pres_D : Percentage of preservative D in the mix\n",
21 | "* Spoiled: Label indicating whether or not the dog food batch was spoiled.\n",
22 | "___\n",
23 | "\n",
24 | "**Think carefully about what this problem is really asking you to solve. While we will use Machine Learning to solve this, it won't be with your typical train/test split workflow. If this confuses you, skip ahead to the solution code along walk-through!**\n",
25 | "____"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "# Good Luck!"
33 | ]
34 | }
35 | ],
36 | "metadata": {
37 | "anaconda-cloud": {},
38 | "kernelspec": {
39 | "display_name": "Python [conda root]",
40 | "language": "python",
41 | "name": "conda-root-py"
42 | },
43 | "language_info": {
44 | "codemirror_mode": {
45 | "name": "ipython",
46 | "version": 3
47 | },
48 | "file_extension": ".py",
49 | "mimetype": "text/x-python",
50 | "name": "python",
51 | "nbconvert_exporter": "python",
52 | "pygments_lexer": "ipython3",
53 | "version": "3.5.3"
54 | }
55 | },
56 | "nbformat": 4,
57 | "nbformat_minor": 0
58 | }
59 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Tree_Methods/.ipynb_checkpoints/Tree_Methods_Consulting_Project_SOLUTION-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tree Methods Consulting Project - SOLUTION"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "You've been hired by a dog food company to try to predict why some batches of their dog food are spoiling much quicker than intended! Unfortunately this Dog Food company hasn't upgraded to the latest machinery, meaning that the amounts of the five preservative chemicals they are using can vary a lot, but which is the chemical that has the strongest effect? The dog food company first mixes up a batch of preservative that contains 4 different preservative chemicals (A,B,C,D) and then is completed with a \"filler\" chemical. The food scientists beelive one of the A,B,C, or D preservatives is causing the problem, but need your help to figure out which one!\n",
15 | "Use Machine Learning with RF to find out which parameter had the most predicitive power, thus finding out which chemical causes the early spoiling! So create a model and then find out how you can decide which chemical is the problem!\n",
16 | "\n",
17 | "* Pres_A : Percentage of preservative A in the mix\n",
18 | "* Pres_B : Percentage of preservative B in the mix\n",
19 | "* Pres_C : Percentage of preservative C in the mix\n",
20 | "* Pres_D : Percentage of preservative D in the mix\n",
21 | "* Spoiled: Label indicating whether or not the dog food batch was spoiled.\n",
22 | "___\n",
23 | "\n",
24 | "**Think carefully about what this problem is really asking you to solve. While we will use Machine Learning to solve this, it won't be with your typical train/test split workflow. If this confuses you, skip ahead to the solution code along walk-through!**\n",
25 | "____"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 46,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "#Tree methods Example\n",
37 | "from pyspark.sql import SparkSession\n",
38 | "spark = SparkSession.builder.appName('dogfood').getOrCreate()"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 47,
44 | "metadata": {
45 | "collapsed": true
46 | },
47 | "outputs": [],
48 | "source": [
49 | "# Load training data\n",
50 | "data = spark.read.csv('dog_food.csv',inferSchema=True,header=True)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 48,
56 | "metadata": {
57 | "collapsed": false
58 | },
59 | "outputs": [
60 | {
61 | "name": "stdout",
62 | "output_type": "stream",
63 | "text": [
64 | "root\n",
65 | " |-- A: integer (nullable = true)\n",
66 | " |-- B: integer (nullable = true)\n",
67 | " |-- C: double (nullable = true)\n",
68 | " |-- D: integer (nullable = true)\n",
69 | " |-- Spoiled: double (nullable = true)\n",
70 | "\n"
71 | ]
72 | }
73 | ],
74 | "source": [
75 | "data.printSchema()"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 49,
81 | "metadata": {
82 | "collapsed": false
83 | },
84 | "outputs": [
85 | {
86 | "data": {
87 | "text/plain": [
88 | "Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)"
89 | ]
90 | },
91 | "execution_count": 49,
92 | "metadata": {},
93 | "output_type": "execute_result"
94 | }
95 | ],
96 | "source": [
97 | "data.head()"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 50,
103 | "metadata": {
104 | "collapsed": false
105 | },
106 | "outputs": [
107 | {
108 | "name": "stdout",
109 | "output_type": "stream",
110 | "text": [
111 | "+-------+------------------+------------------+------------------+------------------+-------------------+\n",
112 | "|summary| A| B| C| D| Spoiled|\n",
113 | "+-------+------------------+------------------+------------------+------------------+-------------------+\n",
114 | "| count| 490| 490| 490| 490| 490|\n",
115 | "| mean| 5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|\n",
116 | "| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|\n",
117 | "| min| 1| 1| 5.0| 1| 0.0|\n",
118 | "| max| 10| 10| 14.0| 10| 1.0|\n",
119 | "+-------+------------------+------------------+------------------+------------------+-------------------+\n",
120 | "\n"
121 | ]
122 | }
123 | ],
124 | "source": [
125 | "data.describe().show()"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 51,
131 | "metadata": {
132 | "collapsed": true
133 | },
134 | "outputs": [],
135 | "source": [
136 | "# Import VectorAssembler and Vectors\n",
137 | "from pyspark.ml.linalg import Vectors\n",
138 | "from pyspark.ml.feature import VectorAssembler"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 52,
144 | "metadata": {
145 | "collapsed": false
146 | },
147 | "outputs": [
148 | {
149 | "data": {
150 | "text/plain": [
151 | "['A', 'B', 'C', 'D', 'Spoiled']"
152 | ]
153 | },
154 | "execution_count": 52,
155 | "metadata": {},
156 | "output_type": "execute_result"
157 | }
158 | ],
159 | "source": [
160 | "data.columns"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 53,
166 | "metadata": {
167 | "collapsed": false
168 | },
169 | "outputs": [],
170 | "source": [
171 | "assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'],outputCol=\"features\")"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 54,
177 | "metadata": {
178 | "collapsed": true
179 | },
180 | "outputs": [],
181 | "source": [
182 | "output = assembler.transform(data)"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 55,
188 | "metadata": {
189 | "collapsed": true
190 | },
191 | "outputs": [],
192 | "source": [
193 | "from pyspark.ml.classification import RandomForestClassifier,DecisionTreeClassifier"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 56,
199 | "metadata": {
200 | "collapsed": true
201 | },
202 | "outputs": [],
203 | "source": [
204 | "rfc = DecisionTreeClassifier(labelCol='Spoiled',featuresCol='features')"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 57,
210 | "metadata": {
211 | "collapsed": false
212 | },
213 | "outputs": [
214 | {
215 | "name": "stdout",
216 | "output_type": "stream",
217 | "text": [
218 | "root\n",
219 | " |-- A: integer (nullable = true)\n",
220 | " |-- B: integer (nullable = true)\n",
221 | " |-- C: double (nullable = true)\n",
222 | " |-- D: integer (nullable = true)\n",
223 | " |-- Spoiled: double (nullable = true)\n",
224 | " |-- features: vector (nullable = true)\n",
225 | "\n"
226 | ]
227 | }
228 | ],
229 | "source": [
230 | "output.printSchema()"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 58,
236 | "metadata": {
237 | "collapsed": false
238 | },
239 | "outputs": [
240 | {
241 | "data": {
242 | "text/plain": [
243 | "Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)"
244 | ]
245 | },
246 | "execution_count": 58,
247 | "metadata": {},
248 | "output_type": "execute_result"
249 | }
250 | ],
251 | "source": [
252 | "final_data = output.select('features','Spoiled')\n",
253 | "final_data.head()"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 59,
259 | "metadata": {
260 | "collapsed": false
261 | },
262 | "outputs": [],
263 | "source": [
264 | "rfc_model = rfc.fit(final_data)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 60,
270 | "metadata": {
271 | "collapsed": false
272 | },
273 | "outputs": [
274 | {
275 | "data": {
276 | "text/plain": [
277 | "SparseVector(4, {0: 0.0026, 1: 0.0089, 2: 0.9686, 3: 0.0199})"
278 | ]
279 | },
280 | "execution_count": 60,
281 | "metadata": {},
282 | "output_type": "execute_result"
283 | }
284 | ],
285 | "source": [
286 | "rfc_model.featureImportances"
287 | ]
288 | },
289 | {
290 | "cell_type": "markdown",
291 | "metadata": {},
292 | "source": [
293 | "Bingo! Feature at index 2 (Chemical C) is by far the most important feature, meaning it is causing the early spoilage! This is a pretty interesting use of a machine learning model in an alternative way!\n",
294 | "\n",
295 | "# Great Job"
296 | ]
297 | }
298 | ],
299 | "metadata": {
300 | "anaconda-cloud": {},
301 | "kernelspec": {
302 | "display_name": "Python [conda root]",
303 | "language": "python",
304 | "name": "conda-root-py"
305 | },
306 | "language_info": {
307 | "codemirror_mode": {
308 | "name": "ipython",
309 | "version": 3
310 | },
311 | "file_extension": ".py",
312 | "mimetype": "text/x-python",
313 | "name": "python",
314 | "nbconvert_exporter": "python",
315 | "pygments_lexer": "ipython3",
316 | "version": "3.5.3"
317 | }
318 | },
319 | "nbformat": 4,
320 | "nbformat_minor": 0
321 | }
322 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Tree_Methods/Tree_Methods_Consulting_Project.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tree Methods Consulting Project "
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "You've been hired by a dog food company to try to predict why some batches of their dog food are spoiling much quicker than intended! Unfortunately this Dog Food company hasn't upgraded to the latest machinery, meaning that the amounts of the five preservative chemicals they are using can vary a lot, but which is the chemical that has the strongest effect? The dog food company first mixes up a batch of preservative that contains 4 different preservative chemicals (A,B,C,D) and then is completed with a \"filler\" chemical. The food scientists beelive one of the A,B,C, or D preservatives is causing the problem, but need your help to figure out which one!\n",
15 | "Use Machine Learning with RF to find out which parameter had the most predicitive power, thus finding out which chemical causes the early spoiling! So create a model and then find out how you can decide which chemical is the problem!\n",
16 | "\n",
17 | "* Pres_A : Percentage of preservative A in the mix\n",
18 | "* Pres_B : Percentage of preservative B in the mix\n",
19 | "* Pres_C : Percentage of preservative C in the mix\n",
20 | "* Pres_D : Percentage of preservative D in the mix\n",
21 | "* Spoiled: Label indicating whether or not the dog food batch was spoiled.\n",
22 | "___\n",
23 | "\n",
24 | "**Think carefully about what this problem is really asking you to solve. While we will use Machine Learning to solve this, it won't be with your typical train/test split workflow. If this confuses you, skip ahead to the solution code along walk-through!**\n",
25 | "____"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "# Good Luck!"
33 | ]
34 | }
35 | ],
36 | "metadata": {
37 | "anaconda-cloud": {},
38 | "kernelspec": {
39 | "display_name": "Python [conda root]",
40 | "language": "python",
41 | "name": "conda-root-py"
42 | },
43 | "language_info": {
44 | "codemirror_mode": {
45 | "name": "ipython",
46 | "version": 3
47 | },
48 | "file_extension": ".py",
49 | "mimetype": "text/x-python",
50 | "name": "python",
51 | "nbconvert_exporter": "python",
52 | "pygments_lexer": "ipython3",
53 | "version": "3.5.3"
54 | }
55 | },
56 | "nbformat": 4,
57 | "nbformat_minor": 0
58 | }
59 |
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Tree_Methods/Tree_Methods_Consulting_Project_SOLUTION.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tree Methods Consulting Project - SOLUTION"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "You've been hired by a dog food company to try to predict why some batches of their dog food are spoiling much quicker than intended! Unfortunately this Dog Food company hasn't upgraded to the latest machinery, meaning that the amounts of the five preservative chemicals they are using can vary a lot, but which is the chemical that has the strongest effect? The dog food company first mixes up a batch of preservative that contains 4 different preservative chemicals (A,B,C,D) and then is completed with a \"filler\" chemical. The food scientists beelive one of the A,B,C, or D preservatives is causing the problem, but need your help to figure out which one!\n",
15 | "Use Machine Learning with RF to find out which parameter had the most predicitive power, thus finding out which chemical causes the early spoiling! So create a model and then find out how you can decide which chemical is the problem!\n",
16 | "\n",
17 | "* Pres_A : Percentage of preservative A in the mix\n",
18 | "* Pres_B : Percentage of preservative B in the mix\n",
19 | "* Pres_C : Percentage of preservative C in the mix\n",
20 | "* Pres_D : Percentage of preservative D in the mix\n",
21 | "* Spoiled: Label indicating whether or not the dog food batch was spoiled.\n",
22 | "___\n",
23 | "\n",
24 | "**Think carefully about what this problem is really asking you to solve. While we will use Machine Learning to solve this, it won't be with your typical train/test split workflow. If this confuses you, skip ahead to the solution code along walk-through!**\n",
25 | "____"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 46,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "#Tree methods Example\n",
37 | "from pyspark.sql import SparkSession\n",
38 | "spark = SparkSession.builder.appName('dogfood').getOrCreate()"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 47,
44 | "metadata": {
45 | "collapsed": true
46 | },
47 | "outputs": [],
48 | "source": [
49 | "# Load training data\n",
50 | "data = spark.read.csv('dog_food.csv',inferSchema=True,header=True)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 48,
56 | "metadata": {
57 | "collapsed": false
58 | },
59 | "outputs": [
60 | {
61 | "name": "stdout",
62 | "output_type": "stream",
63 | "text": [
64 | "root\n",
65 | " |-- A: integer (nullable = true)\n",
66 | " |-- B: integer (nullable = true)\n",
67 | " |-- C: double (nullable = true)\n",
68 | " |-- D: integer (nullable = true)\n",
69 | " |-- Spoiled: double (nullable = true)\n",
70 | "\n"
71 | ]
72 | }
73 | ],
74 | "source": [
75 | "data.printSchema()"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 49,
81 | "metadata": {
82 | "collapsed": false
83 | },
84 | "outputs": [
85 | {
86 | "data": {
87 | "text/plain": [
88 | "Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)"
89 | ]
90 | },
91 | "execution_count": 49,
92 | "metadata": {},
93 | "output_type": "execute_result"
94 | }
95 | ],
96 | "source": [
97 | "data.head()"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 50,
103 | "metadata": {
104 | "collapsed": false
105 | },
106 | "outputs": [
107 | {
108 | "name": "stdout",
109 | "output_type": "stream",
110 | "text": [
111 | "+-------+------------------+------------------+------------------+------------------+-------------------+\n",
112 | "|summary| A| B| C| D| Spoiled|\n",
113 | "+-------+------------------+------------------+------------------+------------------+-------------------+\n",
114 | "| count| 490| 490| 490| 490| 490|\n",
115 | "| mean| 5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|\n",
116 | "| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|\n",
117 | "| min| 1| 1| 5.0| 1| 0.0|\n",
118 | "| max| 10| 10| 14.0| 10| 1.0|\n",
119 | "+-------+------------------+------------------+------------------+------------------+-------------------+\n",
120 | "\n"
121 | ]
122 | }
123 | ],
124 | "source": [
125 | "data.describe().show()"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 51,
131 | "metadata": {
132 | "collapsed": true
133 | },
134 | "outputs": [],
135 | "source": [
136 | "# Import VectorAssembler and Vectors\n",
137 | "from pyspark.ml.linalg import Vectors\n",
138 | "from pyspark.ml.feature import VectorAssembler"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 52,
144 | "metadata": {
145 | "collapsed": false
146 | },
147 | "outputs": [
148 | {
149 | "data": {
150 | "text/plain": [
151 | "['A', 'B', 'C', 'D', 'Spoiled']"
152 | ]
153 | },
154 | "execution_count": 52,
155 | "metadata": {},
156 | "output_type": "execute_result"
157 | }
158 | ],
159 | "source": [
160 | "data.columns"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 53,
166 | "metadata": {
167 | "collapsed": false
168 | },
169 | "outputs": [],
170 | "source": [
171 | "assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'],outputCol=\"features\")"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 54,
177 | "metadata": {
178 | "collapsed": true
179 | },
180 | "outputs": [],
181 | "source": [
182 | "output = assembler.transform(data)"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 55,
188 | "metadata": {
189 | "collapsed": true
190 | },
191 | "outputs": [],
192 | "source": [
193 | "from pyspark.ml.classification import RandomForestClassifier,DecisionTreeClassifier"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 56,
199 | "metadata": {
200 | "collapsed": true
201 | },
202 | "outputs": [],
203 | "source": [
204 | "rfc = DecisionTreeClassifier(labelCol='Spoiled',featuresCol='features')"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 57,
210 | "metadata": {
211 | "collapsed": false
212 | },
213 | "outputs": [
214 | {
215 | "name": "stdout",
216 | "output_type": "stream",
217 | "text": [
218 | "root\n",
219 | " |-- A: integer (nullable = true)\n",
220 | " |-- B: integer (nullable = true)\n",
221 | " |-- C: double (nullable = true)\n",
222 | " |-- D: integer (nullable = true)\n",
223 | " |-- Spoiled: double (nullable = true)\n",
224 | " |-- features: vector (nullable = true)\n",
225 | "\n"
226 | ]
227 | }
228 | ],
229 | "source": [
230 | "output.printSchema()"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 58,
236 | "metadata": {
237 | "collapsed": false
238 | },
239 | "outputs": [
240 | {
241 | "data": {
242 | "text/plain": [
243 | "Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)"
244 | ]
245 | },
246 | "execution_count": 58,
247 | "metadata": {},
248 | "output_type": "execute_result"
249 | }
250 | ],
251 | "source": [
252 | "final_data = output.select('features','Spoiled')\n",
253 | "final_data.head()"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 59,
259 | "metadata": {
260 | "collapsed": false
261 | },
262 | "outputs": [],
263 | "source": [
264 | "rfc_model = rfc.fit(final_data)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 60,
270 | "metadata": {
271 | "collapsed": false
272 | },
273 | "outputs": [
274 | {
275 | "data": {
276 | "text/plain": [
277 | "SparseVector(4, {0: 0.0026, 1: 0.0089, 2: 0.9686, 3: 0.0199})"
278 | ]
279 | },
280 | "execution_count": 60,
281 | "metadata": {},
282 | "output_type": "execute_result"
283 | }
284 | ],
285 | "source": [
286 | "rfc_model.featureImportances"
287 | ]
288 | },
289 | {
290 | "cell_type": "markdown",
291 | "metadata": {},
292 | "source": [
293 | "Bingo! Feature at index 2 (Chemical C) is by far the most important feature, meaning it is causing the early spoilage! This is a pretty interesting use of a machine learning model in an alternative way!\n",
294 | "\n",
295 | "# Great Job"
296 | ]
297 | }
298 | ],
299 | "metadata": {
300 | "anaconda-cloud": {},
301 | "kernelspec": {
302 | "display_name": "Python [conda root]",
303 | "language": "python",
304 | "name": "conda-root-py"
305 | },
306 | "language_info": {
307 | "codemirror_mode": {
308 | "name": "ipython",
309 | "version": 3
310 | },
311 | "file_extension": ".py",
312 | "mimetype": "text/x-python",
313 | "name": "python",
314 | "nbconvert_exporter": "python",
315 | "pygments_lexer": "ipython3",
316 | "version": "3.5.3"
317 | }
318 | },
319 | "nbformat": 4,
320 | "nbformat_minor": 0
321 | }
322 |
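
One note on the solution above: despite the variable name rfc, it actually fits a DecisionTreeClassifier. A RandomForestClassifier exposes the same featureImportances attribute and reaches the same conclusion; here is a minimal variant reusing final_data from the notebook (numTrees=100 is an arbitrary choice for illustration):

from pyspark.ml.classification import RandomForestClassifier

# Same label/features columns as the solution; averaging importances over
# many trees makes the estimate less sensitive to any single split order.
rf = RandomForestClassifier(labelCol='Spoiled', featuresCol='features', numTrees=100)
rf_model = rf.fit(final_data)
print(rf_model.featureImportances)  # expect index 2 (chemical C) to dominate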
--------------------------------------------------------------------------------
/Spark_for_Machine_Learning/Tree_Methods/dog_food.csv:
--------------------------------------------------------------------------------
1 | A,B,C,D,Spoiled
2 | 4,2,12.0,3,1.0
3 | 5,6,12.0,7,1.0
4 | 6,2,13.0,6,1.0
5 | 4,2,12.0,1,1.0
6 | 4,2,12.0,3,1.0
7 | 10,3,13.0,9,1.0
8 | 8,5,14.0,5,1.0
9 | 5,8,12.0,8,1.0
10 | 6,5,12.0,9,1.0
11 | 3,3,12.0,1,1.0
12 | 9,8,11.0,3,1.0
13 | 1,10,12.0,3,1.0
14 | 1,5,13.0,10,1.0
15 | 2,10,12.0,6,1.0
16 | 1,10,11.0,4,1.0
17 | 5,3,12.0,2,1.0
18 | 4,9,11.0,8,1.0
19 | 5,1,11.0,1,1.0
20 | 4,9,12.0,10,1.0
21 | 5,8,10.0,9,1.0
22 | 5,7,11.0,9,1.0
23 | 4,10,13.0,8,1.0
24 | 10,5,12.0,9,1.0
25 | 2,4,13.0,4,1.0
26 | 1,4,13.0,10,1.0
27 | 1,8,12.0,1,1.0
28 | 2,10,13.0,4,1.0
29 | 6,2,12.0,4,1.0
30 | 8,2,13.0,3,1.0
31 | 6,4,12.0,2,1.0
32 | 3,2,11.0,9,1.0
33 | 10,6,12.0,10,1.0
34 | 9,5,13.0,3,1.0
35 | 9,2,12.0,5,1.0
36 | 2,6,13.0,9,1.0
37 | 4,2,12.0,10,1.0
38 | 4,3,12.0,6,1.0
39 | 7,1,12.0,1,1.0
40 | 1,7,11.0,10,1.0
41 | 9,2,11.0,10,1.0
42 | 2,6,12.0,2,1.0
43 | 9,4,11.0,5,1.0
44 | 6,2,11.0,10,1.0
45 | 3,10,11.0,4,1.0
46 | 6,9,11.0,2,1.0
47 | 10,6,11.0,9,1.0
48 | 6,7,11.0,9,1.0
49 | 7,2,13.0,8,1.0
50 | 9,2,13.0,5,1.0
51 | 8,7,12.0,6,1.0
52 | 9,1,12.0,9,1.0
53 | 3,5,14.0,3,1.0
54 | 7,1,11.0,3,1.0
55 | 5,9,12.0,7,1.0
56 | 3,10,12.0,7,1.0
57 | 9,8,13.0,9,1.0
58 | 10,9,12.0,9,1.0
59 | 10,7,11.0,2,1.0
60 | 10,3,11.0,1,1.0
61 | 2,4,11.0,8,1.0
62 | 10,3,13.0,4,1.0
63 | 5,1,14.0,8,1.0
64 | 8,8,11.0,4,1.0
65 | 4,8,14.0,1,1.0
66 | 5,1,12.0,7,1.0
67 | 6,8,11.0,2,1.0
68 | 1,1,13.0,3,1.0
69 | 9,3,12.0,10,1.0
70 | 6,1,11.0,7,1.0
71 | 7,5,10.0,1,1.0
72 | 10,2,12.0,2,1.0
73 | 2,3,13.0,1,1.0
74 | 5,8,12.0,2,1.0
75 | 10,6,12.0,10,1.0
76 | 9,1,11.0,6,1.0
77 | 10,10,14.0,7,1.0
78 | 1,5,12.0,10,1.0
79 | 10,1,11.0,2,1.0
80 | 1,1,12.0,2,1.0
81 | 10,3,13.0,7,1.0
82 | 1,6,11.0,10,1.0
83 | 9,4,12.0,3,1.0
84 | 10,9,12.0,5,1.0
85 | 10,8,11.0,2,1.0
86 | 5,3,9.0,2,1.0
87 | 3,7,12.0,10,1.0
88 | 4,9,12.0,8,1.0
89 | 5,1,11.0,2,1.0
90 | 10,9,11.0,9,1.0
91 | 10,7,11.0,6,1.0
92 | 8,2,13.0,10,1.0
93 | 7,7,11.0,3,1.0
94 | 9,10,11.0,5,1.0
95 | 5,2,12.0,8,1.0
96 | 1,1,10.0,8,1.0
97 | 5,5,12.0,8,1.0
98 | 9,6,12.0,1,1.0
99 | 4,6,12.0,2,1.0
100 | 1,1,12.0,4,1.0
101 | 9,3,11.0,10,1.0
102 | 3,2,12.0,6,1.0
103 | 2,4,11.0,9,1.0
104 | 8,1,12.0,10,1.0
105 | 10,6,11.0,6,1.0
106 | 8,9,12.0,2,1.0
107 | 2,3,12.0,3,1.0
108 | 4,6,14.0,4,1.0
109 | 3,4,12.0,4,1.0
110 | 9,5,12.0,5,1.0
111 | 10,5,13.0,2,1.0
112 | 8,2,10.0,6,1.0
113 | 10,5,11.0,2,1.0
114 | 10,1,11.0,3,1.0
115 | 7,6,13.0,3,1.0
116 | 8,9,14.0,4,1.0
117 | 8,8,14.0,7,1.0
118 | 1,9,11.0,10,1.0
119 | 2,9,10.0,3,1.0
120 | 4,9,13.0,4,1.0
121 | 10,10,12.0,7,1.0
122 | 8,9,12.0,7,1.0
123 | 9,7,12.0,1,1.0
124 | 3,6,13.0,5,1.0
125 | 4,5,12.0,3,1.0
126 | 1,7,11.0,9,1.0
127 | 4,6,12.0,9,1.0
128 | 8,10,13.0,3,1.0
129 | 5,4,12.0,5,1.0
130 | 9,4,12.0,6,1.0
131 | 3,4,12.0,5,1.0
132 | 7,7,11.0,4,1.0
133 | 6,2,12.0,6,1.0
134 | 2,8,11.0,1,1.0
135 | 4,4,10.0,3,1.0
136 | 3,7,12.0,9,1.0
137 | 10,3,12.0,7,1.0
138 | 3,1,12.0,7,1.0
139 | 2,4,13.0,10,1.0
140 | 6,3,12.0,2,1.0
141 | 7,2,14.0,4,1.0
142 | 4,2,8.0,9,0.0
143 | 4,8,9.0,1,0.0
144 | 10,8,8.0,6,0.0
145 | 8,6,9.0,4,0.0
146 | 7,2,7.0,8,0.0
147 | 3,3,9.0,5,0.0
148 | 4,10,8.0,9,0.0
149 | 4,7,10.0,7,0.0
150 | 1,7,8.0,2,0.0
151 | 10,7,8.0,5,0.0
152 | 10,5,9.0,1,0.0
153 | 5,7,10.0,10,0.0
154 | 2,8,6.0,9,0.0
155 | 4,1,7.0,5,0.0
156 | 4,6,9.0,7,0.0
157 | 2,2,9.0,8,0.0
158 | 6,7,6.0,9,0.0
159 | 5,7,7.0,2,0.0
160 | 7,1,7.0,5,0.0
161 | 8,1,8.0,3,0.0
162 | 1,6,8.0,1,0.0
163 | 4,5,9.0,8,0.0
164 | 8,10,8.0,3,0.0
165 | 4,9,8.0,2,0.0
166 | 2,9,6.0,4,0.0
167 | 8,10,8.0,9,0.0
168 | 3,6,8.0,1,0.0
169 | 5,6,9.0,8,0.0
170 | 5,2,8.0,10,0.0
171 | 9,7,6.0,7,0.0
172 | 3,8,6.0,10,0.0
173 | 3,3,8.0,9,0.0
174 | 3,4,10.0,2,0.0
175 | 6,8,8.0,9,0.0
176 | 1,4,8.0,7,0.0
177 | 6,9,7.0,10,0.0
178 | 10,6,8.0,6,0.0
179 | 9,4,7.0,10,0.0
180 | 9,2,10.0,3,0.0
181 | 6,8,8.0,6,0.0
182 | 10,5,7.0,4,0.0
183 | 4,8,8.0,7,0.0
184 | 5,6,6.0,9,0.0
185 | 2,1,10.0,7,0.0
186 | 6,4,7.0,4,0.0
187 | 6,8,9.0,4,0.0
188 | 3,3,8.0,3,0.0
189 | 3,5,10.0,6,0.0
190 | 3,3,9.0,9,0.0
191 | 7,7,8.0,9,0.0
192 | 6,8,7.0,10,0.0
193 | 7,3,7.0,7,0.0
194 | 5,7,9.0,2,0.0
195 | 4,9,8.0,10,0.0
196 | 9,9,7.0,4,0.0
197 | 6,9,6.0,1,0.0
198 | 4,2,10.0,10,0.0
199 | 8,10,8.0,3,0.0
200 | 1,7,8.0,4,0.0
201 | 3,2,9.0,1,0.0
202 | 9,9,9.0,6,0.0
203 | 4,10,5.0,4,0.0
204 | 9,3,7.0,5,0.0
205 | 9,1,9.0,3,0.0
206 | 4,6,7.0,2,0.0
207 | 4,5,8.0,5,0.0
208 | 5,7,6.0,6,0.0
209 | 10,6,9.0,3,0.0
210 | 6,6,8.0,10,0.0
211 | 3,7,9.0,7,0.0
212 | 8,10,8.0,2,0.0
213 | 5,2,8.0,3,0.0
214 | 5,7,7.0,5,0.0
215 | 10,9,8.0,2,0.0
216 | 4,4,8.0,7,0.0
217 | 1,4,9.0,6,0.0
218 | 8,2,9.0,10,0.0
219 | 9,6,9.0,5,0.0
220 | 7,6,7.0,7,0.0
221 | 1,2,9.0,4,0.0
222 | 1,8,7.0,10,0.0
223 | 6,2,8.0,9,0.0
224 | 9,5,7.0,8,0.0
225 | 8,7,8.0,6,0.0
226 | 5,7,8.0,9,0.0
227 | 8,4,9.0,1,0.0
228 | 6,1,9.0,3,0.0
229 | 9,7,8.0,9,0.0
230 | 2,9,7.0,10,0.0
231 | 2,4,8.0,5,0.0
232 | 10,3,8.0,8,0.0
233 | 7,9,8.0,8,0.0
234 | 6,6,8.0,2,0.0
235 | 1,5,8.0,10,0.0
236 | 10,1,9.0,9,0.0
237 | 8,1,9.0,2,0.0
238 | 10,9,8.0,6,0.0
239 | 5,10,7.0,1,0.0
240 | 3,6,7.0,8,0.0
241 | 4,10,10.0,5,0.0
242 | 2,1,7.0,9,0.0
243 | 9,2,9.0,9,0.0
244 | 3,9,8.0,9,0.0
245 | 2,3,6.0,9,0.0
246 | 3,9,8.0,6,0.0
247 | 10,7,9.0,1,0.0
248 | 10,10,6.0,4,0.0
249 | 8,5,9.0,5,0.0
250 | 7,2,8.0,1,0.0
251 | 7,2,8.0,9,0.0
252 | 6,9,7.0,2,0.0
253 | 1,4,9.0,3,0.0
254 | 10,9,9.0,10,0.0
255 | 4,3,8.0,8,0.0
256 | 8,7,6.0,6,0.0
257 | 5,7,8.0,3,0.0
258 | 8,6,8.0,3,0.0
259 | 3,2,6.0,10,0.0
260 | 4,2,6.0,5,0.0
261 | 10,6,8.0,7,0.0
262 | 3,6,8.0,3,0.0
263 | 2,2,8.0,1,0.0
264 | 1,9,10.0,6,0.0
265 | 9,6,8.0,7,0.0
266 | 4,5,9.0,5,0.0
267 | 3,5,8.0,6,0.0
268 | 4,5,8.0,10,0.0
269 | 9,4,9.0,4,0.0
270 | 9,4,7.0,6,0.0
271 | 7,6,8.0,10,0.0
272 | 9,10,11.0,2,0.0
273 | 3,4,9.0,5,0.0
274 | 2,10,9.0,2,0.0
275 | 10,9,8.0,2,0.0
276 | 4,6,9.0,4,0.0
277 | 4,10,7.0,10,0.0
278 | 9,1,9.0,8,0.0
279 | 3,10,8.0,6,0.0
280 | 8,5,9.0,3,0.0
281 | 8,5,7.0,5,0.0
282 | 1,8,6.0,6,0.0
283 | 8,8,6.0,8,0.0
284 | 4,8,7.0,3,0.0
285 | 9,3,8.0,7,0.0
286 | 10,8,7.0,3,0.0
287 | 2,10,6.0,4,0.0
288 | 2,5,9.0,5,0.0
289 | 10,7,9.0,4,0.0
290 | 3,10,9.0,8,0.0
291 | 9,2,7.0,3,0.0
292 | 7,4,6.0,4,0.0
293 | 3,4,8.0,7,0.0
294 | 4,7,8.0,3,0.0
295 | 10,9,8.0,10,0.0
296 | 4,6,5.0,6,0.0
297 | 10,2,9.0,7,0.0
298 | 9,8,9.0,10,0.0
299 | 7,10,8.0,2,0.0
300 | 5,5,6.0,1,0.0
301 | 8,4,7.0,6,0.0
302 | 5,5,7.0,9,0.0
303 | 7,2,9.0,9,0.0
304 | 9,4,9.0,3,0.0
305 | 5,5,7.0,3,0.0
306 | 2,7,7.0,4,0.0
307 | 4,5,9.0,8,0.0
308 | 1,8,8.0,6,0.0
309 | 5,6,9.0,5,0.0
310 | 3,6,8.0,3,0.0
311 | 7,2,9.0,5,0.0
312 | 10,9,10.0,6,0.0
313 | 4,7,10.0,6,0.0
314 | 1,9,9.0,7,0.0
315 | 1,7,7.0,2,0.0
316 | 1,9,7.0,5,0.0
317 | 2,8,9.0,4,0.0
318 | 5,4,8.0,2,0.0
319 | 1,7,7.0,6,0.0
320 | 2,1,8.0,9,0.0
321 | 2,6,9.0,4,0.0
322 | 1,6,8.0,9,0.0
323 | 1,4,8.0,5,0.0
324 | 10,6,8.0,5,0.0
325 | 6,4,6.0,4,0.0
326 | 2,1,9.0,1,0.0
327 | 8,6,9.0,10,0.0
328 | 5,6,7.0,9,0.0
329 | 10,10,7.0,1,0.0
330 | 2,9,10.0,6,0.0
331 | 9,6,10.0,2,0.0
332 | 3,5,9.0,3,0.0
333 | 5,10,8.0,3,0.0
334 | 1,3,9.0,8,0.0
335 | 8,8,8.0,7,0.0
336 | 6,1,8.0,3,0.0
337 | 4,9,9.0,2,0.0
338 | 2,9,10.0,3,0.0
339 | 1,5,8.0,5,0.0
340 | 5,6,8.0,8,0.0
341 | 6,10,9.0,2,0.0
342 | 9,6,8.0,9,0.0
343 | 1,8,8.0,7,0.0
344 | 8,2,8.0,8,0.0
345 | 3,6,8.0,5,0.0
346 | 9,2,9.0,6,0.0
347 | 7,10,5.0,6,0.0
348 | 2,5,8.0,3,0.0
349 | 9,2,10.0,7,0.0
350 | 5,9,8.0,9,0.0
351 | 1,6,8.0,3,0.0
352 | 7,4,8.0,3,0.0
353 | 8,5,8.0,5,0.0
354 | 5,9,7.0,3,0.0
355 | 9,6,8.0,5,0.0
356 | 3,1,8.0,5,0.0
357 | 5,8,9.0,9,0.0
358 | 2,5,8.0,3,0.0
359 | 5,6,8.0,6,0.0
360 | 2,5,8.0,1,0.0
361 | 6,2,11.0,10,0.0
362 | 2,6,6.0,9,0.0
363 | 4,4,6.0,8,0.0
364 | 2,7,8.0,9,0.0
365 | 5,2,7.0,9,0.0
366 | 6,10,8.0,3,0.0
367 | 4,6,7.0,5,0.0
368 | 2,8,8.0,6,0.0
369 | 6,2,8.0,3,0.0
370 | 8,10,9.0,8,0.0
371 | 5,9,8.0,5,0.0
372 | 9,2,9.0,8,0.0
373 | 5,10,8.0,6,0.0
374 | 10,6,8.0,3,0.0
375 | 6,6,9.0,6,0.0
376 | 6,3,10.0,5,0.0
377 | 1,3,8.0,5,0.0
378 | 2,3,9.0,3,0.0
379 | 2,6,8.0,8,0.0
380 | 8,4,9.0,10,0.0
381 | 8,7,6.0,7,0.0
382 | 2,6,8.0,10,0.0
383 | 7,2,9.0,3,0.0
384 | 7,9,6.0,2,0.0
385 | 2,10,8.0,8,0.0
386 | 5,2,9.0,9,0.0
387 | 2,8,9.0,10,0.0
388 | 8,4,6.0,8,0.0
389 | 7,3,10.0,7,0.0
390 | 9,9,8.0,7,0.0
391 | 8,4,8.0,1,0.0
392 | 9,2,6.0,8,0.0
393 | 8,6,8.0,2,0.0
394 | 9,7,8.0,2,0.0
395 | 4,3,9.0,6,0.0
396 | 2,1,8.0,9,0.0
397 | 9,4,7.0,9,0.0
398 | 4,2,9.0,2,0.0
399 | 10,3,8.0,2,0.0
400 | 9,2,10.0,5,0.0
401 | 10,7,7.0,7,0.0
402 | 2,3,7.0,10,0.0
403 | 10,1,7.0,4,0.0
404 | 3,3,7.0,5,0.0
405 | 10,1,7.0,4,0.0
406 | 5,4,8.0,7,0.0
407 | 7,3,7.0,8,0.0
408 | 10,9,7.0,4,0.0
409 | 5,7,8.0,9,0.0
410 | 5,9,7.0,5,0.0
411 | 4,6,7.0,5,0.0
412 | 4,2,8.0,9,0.0
413 | 8,3,7.0,4,0.0
414 | 3,5,9.0,6,0.0
415 | 4,3,8.0,10,0.0
416 | 1,6,7.0,8,0.0
417 | 8,5,8.0,6,0.0
418 | 9,10,7.0,6,0.0
419 | 8,9,8.0,1,0.0
420 | 9,10,8.0,8,0.0
421 | 3,10,8.0,2,0.0
422 | 8,10,10.0,7,0.0
423 | 2,1,10.0,7,0.0
424 | 5,10,8.0,8,0.0
425 | 4,9,7.0,7,0.0
426 | 9,3,7.0,7,0.0
427 | 5,7,8.0,6,0.0
428 | 8,7,9.0,3,0.0
429 | 2,2,7.0,8,0.0
430 | 6,6,9.0,9,0.0
431 | 4,2,8.0,4,0.0
432 | 3,9,7.0,9,0.0
433 | 7,9,6.0,5,0.0
434 | 5,3,7.0,5,0.0
435 | 4,4,9.0,1,0.0
436 | 6,9,8.0,5,0.0
437 | 10,10,8.0,1,0.0
438 | 2,6,8.0,6,0.0
439 | 10,10,9.0,5,0.0
440 | 5,9,9.0,6,0.0
441 | 3,2,8.0,9,0.0
442 | 10,10,9.0,3,0.0
443 | 4,7,9.0,4,0.0
444 | 4,4,7.0,1,0.0
445 | 5,8,8.0,5,0.0
446 | 2,3,8.0,3,0.0
447 | 6,4,9.0,2,0.0
448 | 2,9,9.0,10,0.0
449 | 3,6,8.0,2,0.0
450 | 3,2,10.0,10,0.0
451 | 2,2,8.0,1,0.0
452 | 9,6,9.0,1,0.0
453 | 6,5,6.0,2,0.0
454 | 3,6,8.0,1,0.0
455 | 3,3,8.0,6,0.0
456 | 2,10,9.0,2,0.0
457 | 8,9,8.0,9,0.0
458 | 7,4,10.0,4,0.0
459 | 6,6,7.0,8,0.0
460 | 5,3,7.0,7,0.0
461 | 6,7,7.0,6,0.0
462 | 9,1,9.0,5,0.0
463 | 10,9,9.0,1,0.0
464 | 10,4,8.0,3,0.0
465 | 1,2,9.0,1,0.0
466 | 2,1,9.0,1,0.0
467 | 6,1,7.0,9,0.0
468 | 1,5,8.0,3,0.0
469 | 2,8,8.0,4,0.0
470 | 1,8,8.0,8,0.0
471 | 3,1,9.0,7,0.0
472 | 3,9,7.0,6,0.0
473 | 8,1,7.0,4,0.0
474 | 10,4,9.0,8,0.0
475 | 2,5,7.0,6,0.0
476 | 10,6,8.0,5,0.0
477 | 6,1,9.0,7,0.0
478 | 6,10,7.0,10,0.0
479 | 2,10,8.0,3,0.0
480 | 1,4,8.0,1,0.0
481 | 8,9,9.0,4,0.0
482 | 10,10,7.0,4,0.0
483 | 8,3,7.0,9,0.0
484 | 2,2,9.0,8,0.0
485 | 9,5,10.0,10,0.0
486 | 2,2,6.0,10,0.0
487 | 8,3,6.0,6,0.0
488 | 6,4,9.0,10,0.0
489 | 1,3,8.0,3,0.0
490 | 6,6,8.0,3,0.0
491 | 1,9,7.0,4,0.0
492 |
--------------------------------------------------------------------------------