├── .DS_Store ├── Course_Notes.zip ├── Data Set Generator (remove me the future!) ├── .ipynb_checkpoints │ └── Creating Fake Data-checkpoint.ipynb ├── Creating Fake Data.ipynb ├── DataSets │ ├── Facebook_metrics.txt │ ├── customer_churn.csv │ ├── dataset_Facebook.csv │ ├── dog_food.csv │ └── hack_data.csv ├── fake_customers.csv ├── hello.csv ├── new_customers.csv └── test.csv ├── Python-Crash-Course ├── .ipynb_checkpoints │ ├── Python Crash Course Exercises - Solutions-checkpoint.ipynb │ └── Python Crash Course Exercises-checkpoint.ipynb ├── Python Crash Course Exercises - Solutions.ipynb ├── Python Crash Course Exercises.ipynb └── Python Crash Course.ipynb ├── README.md ├── Spark Streaming ├── .ipynb_checkpoints │ └── Introduction to Spark Streaming-checkpoint.ipynb ├── Introduction to Spark Streaming.ipynb └── TweetRead.py ├── Spark_DataFrame_Project_Exercise ├── Spark DataFrames Project Exercise - SOLUTIONS.ipynb ├── Spark DataFrames Project Exercise.ipynb └── walmart_stock.csv ├── Spark_DataFrames ├── .ipynb_checkpoints │ ├── DataFrame_Basic_Operations-checkpoint.ipynb │ ├── DataFrame_Basics-checkpoint.ipynb │ ├── Dates_and_Timestamps-checkpoint.ipynb │ ├── GroupBy_and_Aggregate_Functions-checkpoint.ipynb │ ├── Missing_Data-checkpoint.ipynb │ ├── Spark DataFrames Project Exercise - SOLUTIONS-checkpoint.ipynb │ └── Spark DataFrames Project Exercise-checkpoint.ipynb ├── ContainsNull.csv ├── DataFrame_Basic_Operations.ipynb ├── DataFrame_Basics.ipynb ├── Dates_and_Timestamps.ipynb ├── GroupBy_and_Aggregate_Functions.ipynb ├── Missing_Data.ipynb ├── appl_stock.csv ├── people.json └── sales_info.csv └── Spark_for_Machine_Learning ├── Clustering ├── .ipynb_checkpoints │ ├── Clustering Code Along-checkpoint.ipynb │ ├── Clustering_Code_Example-checkpoint.ipynb │ ├── Clustering_Consulting_Project-checkpoint.ipynb │ ├── Clustering_Consulting_Project_SOLUTIONS-checkpoint.ipynb │ └── Random_Forest_Doc_Example-checkpoint.ipynb ├── Clustering Code Along.ipynb ├── Clustering_Code_Example.ipynb ├── Clustering_Consulting_Project.ipynb ├── Clustering_Consulting_Project_SOLUTIONS.ipynb ├── hack_data.csv ├── sample_kmeans_data.txt ├── seeds_dataset.csv └── seeds_dataset.txt ├── Linear_Regression ├── .ipynb_checkpoints │ ├── Data_Transformations-checkpoint.ipynb │ ├── Linear_Regression_Code_Along-checkpoint.ipynb │ ├── Linear_Regression_Consulting_Project-checkpoint.ipynb │ ├── Linear_Regression_Consulting_Project_SOLUTIONS-checkpoint.ipynb │ └── Linear_Regression_Example-checkpoint.ipynb ├── Data_Transformations.ipynb ├── Ecommerce_Customers.csv ├── Linear_Regression_Code_Along.ipynb ├── Linear_Regression_Consulting_Project.ipynb ├── Linear_Regression_Consulting_Project_SOLUTIONS.ipynb ├── Linear_Regression_Example.ipynb ├── cruise_ship_info.csv ├── fake_customers.csv └── sample_linear_regression_data.txt ├── Logistic_Regression ├── .ipynb_checkpoints │ ├── Log_regression_Code_Along-checkpoint.ipynb │ ├── Logistic_Regression_Consulting_Project-checkpoint.ipynb │ ├── Logistic_Regression_Consulting_Project_SOLUTIONS-checkpoint.ipynb │ └── Logistic_Regression_Example-checkpoint.ipynb ├── Logistic_Regression_Consulting_Project.ipynb ├── Logistic_Regression_Consulting_Project_SOLUTIONS.ipynb ├── Logistic_Regression_Example.ipynb ├── Titanic_Log_Regression_Code_Along.ipynb ├── customer_churn.csv ├── new_customers.csv ├── sample_libsvm_data.txt └── titanic.csv ├── Natural_Language_Processing ├── .ipynb_checkpoints │ ├── NLP_Code_Along-checkpoint.ipynb │ └── Tools_for_NLP-checkpoint.ipynb ├── NLP_Code_Along.ipynb 
├── Tools_for_NLP.ipynb └── smsspamcollection │ ├── SMSSpamCollection │ └── readme ├── Recommender_Systems ├── .ipynb_checkpoints │ ├── Consulting Project - Recommender Systems-checkpoint.ipynb │ └── Recommender_Code_Along-checkpoint.ipynb ├── Consulting Project - Recommender Systems.ipynb ├── Meal_Info.csv ├── Recommender_Code_Along.ipynb └── movielens_ratings.csv └── Tree_Methods ├── .ipynb_checkpoints ├── Tree Methods Code Along-checkpoint.ipynb ├── Tree_Methods_Consulting_Project-checkpoint.ipynb ├── Tree_Methods_Consulting_Project_SOLUTION-checkpoint.ipynb └── Tree_Methods_Doc_Example-checkpoint.ipynb ├── College.csv ├── Tree Methods Code Along.ipynb ├── Tree_Methods_Consulting_Project.ipynb ├── Tree_Methods_Consulting_Project_SOLUTION.ipynb ├── Tree_Methods_Doc_Example.ipynb ├── dog_food.csv └── sample_libsvm_data.txt /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SuperJohn/spark-and-python-for-big-data-with-pyspark/2571210837c00e6315a9d93f0cd1dc35e2955375/.DS_Store -------------------------------------------------------------------------------- /Course_Notes.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SuperJohn/spark-and-python-for-big-data-with-pyspark/2571210837c00e6315a9d93f0cd1dc35e2955375/Course_Notes.zip -------------------------------------------------------------------------------- /Data Set Generator (remove me the future!)/DataSets/Facebook_metrics.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SuperJohn/spark-and-python-for-big-data-with-pyspark/2571210837c00e6315a9d93f0cd1dc35e2955375/Data Set Generator (remove me the future!)/DataSets/Facebook_metrics.txt -------------------------------------------------------------------------------- /Data Set Generator (remove me the future!)/DataSets/dog_food.csv: -------------------------------------------------------------------------------- 1 | A,B,C,D,Spoiled 2 | 4,2,12.0,3,1 3 | 5,6,12.0,7,1 4 | 6,2,13.0,6,1 5 | 4,2,12.0,1,1 6 | 4,2,12.0,3,1 7 | 10,3,13.0,9,1 8 | 8,5,14.0,5,1 9 | 5,8,12.0,8,1 10 | 6,5,12.0,9,1 11 | 3,3,12.0,1,1 12 | 9,8,11.0,3,1 13 | 1,10,12.0,3,1 14 | 1,5,13.0,10,1 15 | 2,10,12.0,6,1 16 | 1,10,11.0,4,1 17 | 5,3,12.0,2,1 18 | 4,9,11.0,8,1 19 | 5,1,11.0,1,1 20 | 4,9,12.0,10,1 21 | 5,8,10.0,9,1 22 | 5,7,11.0,9,1 23 | 4,10,13.0,8,1 24 | 10,5,12.0,9,1 25 | 2,4,13.0,4,1 26 | 1,4,13.0,10,1 27 | 1,8,12.0,1,1 28 | 2,10,13.0,4,1 29 | 6,2,12.0,4,1 30 | 8,2,13.0,3,1 31 | 6,4,12.0,2,1 32 | 3,2,11.0,9,1 33 | 10,6,12.0,10,1 34 | 9,5,13.0,3,1 35 | 9,2,12.0,5,1 36 | 2,6,13.0,9,1 37 | 4,2,12.0,10,1 38 | 4,3,12.0,6,1 39 | 7,1,12.0,1,1 40 | 1,7,11.0,10,1 41 | 9,2,11.0,10,1 42 | 2,6,12.0,2,1 43 | 9,4,11.0,5,1 44 | 6,2,11.0,10,1 45 | 3,10,11.0,4,1 46 | 6,9,11.0,2,1 47 | 10,6,11.0,9,1 48 | 6,7,11.0,9,1 49 | 7,2,13.0,8,1 50 | 9,2,13.0,5,1 51 | 8,7,12.0,6,1 52 | 9,1,12.0,9,1 53 | 3,5,14.0,3,1 54 | 7,1,11.0,3,1 55 | 5,9,12.0,7,1 56 | 3,10,12.0,7,1 57 | 9,8,13.0,9,1 58 | 10,9,12.0,9,1 59 | 10,7,11.0,2,1 60 | 10,3,11.0,1,1 61 | 2,4,11.0,8,1 62 | 10,3,13.0,4,1 63 | 5,1,14.0,8,1 64 | 8,8,11.0,4,1 65 | 4,8,14.0,1,1 66 | 5,1,12.0,7,1 67 | 6,8,11.0,2,1 68 | 1,1,13.0,3,1 69 | 9,3,12.0,10,1 70 | 6,1,11.0,7,1 71 | 7,5,10.0,1,1 72 | 10,2,12.0,2,1 73 | 2,3,13.0,1,1 74 | 5,8,12.0,2,1 75 | 10,6,12.0,10,1 76 | 9,1,11.0,6,1 77 | 10,10,14.0,7,1 78 | 1,5,12.0,10,1 79 | 10,1,11.0,2,1 80 | 1,1,12.0,2,1 81 | 10,3,13.0,7,1 82 | 
1,6,11.0,10,1 83 | 9,4,12.0,3,1 84 | 10,9,12.0,5,1 85 | 10,8,11.0,2,1 86 | 5,3,9.0,2,1 87 | 3,7,12.0,10,1 88 | 4,9,12.0,8,1 89 | 5,1,11.0,2,1 90 | 10,9,11.0,9,1 91 | 10,7,11.0,6,1 92 | 8,2,13.0,10,1 93 | 7,7,11.0,3,1 94 | 9,10,11.0,5,1 95 | 5,2,12.0,8,1 96 | 1,1,10.0,8,1 97 | 5,5,12.0,8,1 98 | 9,6,12.0,1,1 99 | 4,6,12.0,2,1 100 | 1,1,12.0,4,1 101 | 9,3,11.0,10,1 102 | 3,2,12.0,6,1 103 | 2,4,11.0,9,1 104 | 8,1,12.0,10,1 105 | 10,6,11.0,6,1 106 | 8,9,12.0,2,1 107 | 2,3,12.0,3,1 108 | 4,6,14.0,4,1 109 | 3,4,12.0,4,1 110 | 9,5,12.0,5,1 111 | 10,5,13.0,2,1 112 | 8,2,10.0,6,1 113 | 10,5,11.0,2,1 114 | 10,1,11.0,3,1 115 | 7,6,13.0,3,1 116 | 8,9,14.0,4,1 117 | 8,8,14.0,7,1 118 | 1,9,11.0,10,1 119 | 2,9,10.0,3,1 120 | 4,9,13.0,4,1 121 | 10,10,12.0,7,1 122 | 8,9,12.0,7,1 123 | 9,7,12.0,1,1 124 | 3,6,13.0,5,1 125 | 4,5,12.0,3,1 126 | 1,7,11.0,9,1 127 | 4,6,12.0,9,1 128 | 8,10,13.0,3,1 129 | 5,4,12.0,5,1 130 | 9,4,12.0,6,1 131 | 3,4,12.0,5,1 132 | 7,7,11.0,4,1 133 | 6,2,12.0,6,1 134 | 2,8,11.0,1,1 135 | 4,4,10.0,3,1 136 | 3,7,12.0,9,1 137 | 10,3,12.0,7,1 138 | 3,1,12.0,7,1 139 | 2,4,13.0,10,1 140 | 6,3,12.0,2,1 141 | 7,2,14.0,4,1 142 | 4,2,8.0,9,0 143 | 4,8,9.0,1,0 144 | 10,8,8.0,6,0 145 | 8,6,9.0,4,0 146 | 7,2,7.0,8,0 147 | 3,3,9.0,5,0 148 | 4,10,8.0,9,0 149 | 4,7,10.0,7,0 150 | 1,7,8.0,2,0 151 | 10,7,8.0,5,0 152 | 10,5,9.0,1,0 153 | 5,7,10.0,10,0 154 | 2,8,6.0,9,0 155 | 4,1,7.0,5,0 156 | 4,6,9.0,7,0 157 | 2,2,9.0,8,0 158 | 6,7,6.0,9,0 159 | 5,7,7.0,2,0 160 | 7,1,7.0,5,0 161 | 8,1,8.0,3,0 162 | 1,6,8.0,1,0 163 | 4,5,9.0,8,0 164 | 8,10,8.0,3,0 165 | 4,9,8.0,2,0 166 | 2,9,6.0,4,0 167 | 8,10,8.0,9,0 168 | 3,6,8.0,1,0 169 | 5,6,9.0,8,0 170 | 5,2,8.0,10,0 171 | 9,7,6.0,7,0 172 | 3,8,6.0,10,0 173 | 3,3,8.0,9,0 174 | 3,4,10.0,2,0 175 | 6,8,8.0,9,0 176 | 1,4,8.0,7,0 177 | 6,9,7.0,10,0 178 | 10,6,8.0,6,0 179 | 9,4,7.0,10,0 180 | 9,2,10.0,3,0 181 | 6,8,8.0,6,0 182 | 10,5,7.0,4,0 183 | 4,8,8.0,7,0 184 | 5,6,6.0,9,0 185 | 2,1,10.0,7,0 186 | 6,4,7.0,4,0 187 | 6,8,9.0,4,0 188 | 3,3,8.0,3,0 189 | 3,5,10.0,6,0 190 | 3,3,9.0,9,0 191 | 7,7,8.0,9,0 192 | 6,8,7.0,10,0 193 | 7,3,7.0,7,0 194 | 5,7,9.0,2,0 195 | 4,9,8.0,10,0 196 | 9,9,7.0,4,0 197 | 6,9,6.0,1,0 198 | 4,2,10.0,10,0 199 | 8,10,8.0,3,0 200 | 1,7,8.0,4,0 201 | 3,2,9.0,1,0 202 | 9,9,9.0,6,0 203 | 4,10,5.0,4,0 204 | 9,3,7.0,5,0 205 | 9,1,9.0,3,0 206 | 4,6,7.0,2,0 207 | 4,5,8.0,5,0 208 | 5,7,6.0,6,0 209 | 10,6,9.0,3,0 210 | 6,6,8.0,10,0 211 | 3,7,9.0,7,0 212 | 8,10,8.0,2,0 213 | 5,2,8.0,3,0 214 | 5,7,7.0,5,0 215 | 10,9,8.0,2,0 216 | 4,4,8.0,7,0 217 | 1,4,9.0,6,0 218 | 8,2,9.0,10,0 219 | 9,6,9.0,5,0 220 | 7,6,7.0,7,0 221 | 1,2,9.0,4,0 222 | 1,8,7.0,10,0 223 | 6,2,8.0,9,0 224 | 9,5,7.0,8,0 225 | 8,7,8.0,6,0 226 | 5,7,8.0,9,0 227 | 8,4,9.0,1,0 228 | 6,1,9.0,3,0 229 | 9,7,8.0,9,0 230 | 2,9,7.0,10,0 231 | 2,4,8.0,5,0 232 | 10,3,8.0,8,0 233 | 7,9,8.0,8,0 234 | 6,6,8.0,2,0 235 | 1,5,8.0,10,0 236 | 10,1,9.0,9,0 237 | 8,1,9.0,2,0 238 | 10,9,8.0,6,0 239 | 5,10,7.0,1,0 240 | 3,6,7.0,8,0 241 | 4,10,10.0,5,0 242 | 2,1,7.0,9,0 243 | 9,2,9.0,9,0 244 | 3,9,8.0,9,0 245 | 2,3,6.0,9,0 246 | 3,9,8.0,6,0 247 | 10,7,9.0,1,0 248 | 10,10,6.0,4,0 249 | 8,5,9.0,5,0 250 | 7,2,8.0,1,0 251 | 7,2,8.0,9,0 252 | 6,9,7.0,2,0 253 | 1,4,9.0,3,0 254 | 10,9,9.0,10,0 255 | 4,3,8.0,8,0 256 | 8,7,6.0,6,0 257 | 5,7,8.0,3,0 258 | 8,6,8.0,3,0 259 | 3,2,6.0,10,0 260 | 4,2,6.0,5,0 261 | 10,6,8.0,7,0 262 | 3,6,8.0,3,0 263 | 2,2,8.0,1,0 264 | 1,9,10.0,6,0 265 | 9,6,8.0,7,0 266 | 4,5,9.0,5,0 267 | 3,5,8.0,6,0 268 | 4,5,8.0,10,0 269 | 9,4,9.0,4,0 270 | 9,4,7.0,6,0 271 | 7,6,8.0,10,0 272 | 9,10,11.0,2,0 273 | 
3,4,9.0,5,0 274 | 2,10,9.0,2,0 275 | 10,9,8.0,2,0 276 | 4,6,9.0,4,0 277 | 4,10,7.0,10,0 278 | 9,1,9.0,8,0 279 | 3,10,8.0,6,0 280 | 8,5,9.0,3,0 281 | 8,5,7.0,5,0 282 | 1,8,6.0,6,0 283 | 8,8,6.0,8,0 284 | 4,8,7.0,3,0 285 | 9,3,8.0,7,0 286 | 10,8,7.0,3,0 287 | 2,10,6.0,4,0 288 | 2,5,9.0,5,0 289 | 10,7,9.0,4,0 290 | 3,10,9.0,8,0 291 | 9,2,7.0,3,0 292 | 7,4,6.0,4,0 293 | 3,4,8.0,7,0 294 | 4,7,8.0,3,0 295 | 10,9,8.0,10,0 296 | 4,6,5.0,6,0 297 | 10,2,9.0,7,0 298 | 9,8,9.0,10,0 299 | 7,10,8.0,2,0 300 | 5,5,6.0,1,0 301 | 8,4,7.0,6,0 302 | 5,5,7.0,9,0 303 | 7,2,9.0,9,0 304 | 9,4,9.0,3,0 305 | 5,5,7.0,3,0 306 | 2,7,7.0,4,0 307 | 4,5,9.0,8,0 308 | 1,8,8.0,6,0 309 | 5,6,9.0,5,0 310 | 3,6,8.0,3,0 311 | 7,2,9.0,5,0 312 | 10,9,10.0,6,0 313 | 4,7,10.0,6,0 314 | 1,9,9.0,7,0 315 | 1,7,7.0,2,0 316 | 1,9,7.0,5,0 317 | 2,8,9.0,4,0 318 | 5,4,8.0,2,0 319 | 1,7,7.0,6,0 320 | 2,1,8.0,9,0 321 | 2,6,9.0,4,0 322 | 1,6,8.0,9,0 323 | 1,4,8.0,5,0 324 | 10,6,8.0,5,0 325 | 6,4,6.0,4,0 326 | 2,1,9.0,1,0 327 | 8,6,9.0,10,0 328 | 5,6,7.0,9,0 329 | 10,10,7.0,1,0 330 | 2,9,10.0,6,0 331 | 9,6,10.0,2,0 332 | 3,5,9.0,3,0 333 | 5,10,8.0,3,0 334 | 1,3,9.0,8,0 335 | 8,8,8.0,7,0 336 | 6,1,8.0,3,0 337 | 4,9,9.0,2,0 338 | 2,9,10.0,3,0 339 | 1,5,8.0,5,0 340 | 5,6,8.0,8,0 341 | 6,10,9.0,2,0 342 | 9,6,8.0,9,0 343 | 1,8,8.0,7,0 344 | 8,2,8.0,8,0 345 | 3,6,8.0,5,0 346 | 9,2,9.0,6,0 347 | 7,10,5.0,6,0 348 | 2,5,8.0,3,0 349 | 9,2,10.0,7,0 350 | 5,9,8.0,9,0 351 | 1,6,8.0,3,0 352 | 7,4,8.0,3,0 353 | 8,5,8.0,5,0 354 | 5,9,7.0,3,0 355 | 9,6,8.0,5,0 356 | 3,1,8.0,5,0 357 | 5,8,9.0,9,0 358 | 2,5,8.0,3,0 359 | 5,6,8.0,6,0 360 | 2,5,8.0,1,0 361 | 6,2,11.0,10,0 362 | 2,6,6.0,9,0 363 | 4,4,6.0,8,0 364 | 2,7,8.0,9,0 365 | 5,2,7.0,9,0 366 | 6,10,8.0,3,0 367 | 4,6,7.0,5,0 368 | 2,8,8.0,6,0 369 | 6,2,8.0,3,0 370 | 8,10,9.0,8,0 371 | 5,9,8.0,5,0 372 | 9,2,9.0,8,0 373 | 5,10,8.0,6,0 374 | 10,6,8.0,3,0 375 | 6,6,9.0,6,0 376 | 6,3,10.0,5,0 377 | 1,3,8.0,5,0 378 | 2,3,9.0,3,0 379 | 2,6,8.0,8,0 380 | 8,4,9.0,10,0 381 | 8,7,6.0,7,0 382 | 2,6,8.0,10,0 383 | 7,2,9.0,3,0 384 | 7,9,6.0,2,0 385 | 2,10,8.0,8,0 386 | 5,2,9.0,9,0 387 | 2,8,9.0,10,0 388 | 8,4,6.0,8,0 389 | 7,3,10.0,7,0 390 | 9,9,8.0,7,0 391 | 8,4,8.0,1,0 392 | 9,2,6.0,8,0 393 | 8,6,8.0,2,0 394 | 9,7,8.0,2,0 395 | 4,3,9.0,6,0 396 | 2,1,8.0,9,0 397 | 9,4,7.0,9,0 398 | 4,2,9.0,2,0 399 | 10,3,8.0,2,0 400 | 9,2,10.0,5,0 401 | 10,7,7.0,7,0 402 | 2,3,7.0,10,0 403 | 10,1,7.0,4,0 404 | 3,3,7.0,5,0 405 | 10,1,7.0,4,0 406 | 5,4,8.0,7,0 407 | 7,3,7.0,8,0 408 | 10,9,7.0,4,0 409 | 5,7,8.0,9,0 410 | 5,9,7.0,5,0 411 | 4,6,7.0,5,0 412 | 4,2,8.0,9,0 413 | 8,3,7.0,4,0 414 | 3,5,9.0,6,0 415 | 4,3,8.0,10,0 416 | 1,6,7.0,8,0 417 | 8,5,8.0,6,0 418 | 9,10,7.0,6,0 419 | 8,9,8.0,1,0 420 | 9,10,8.0,8,0 421 | 3,10,8.0,2,0 422 | 8,10,10.0,7,0 423 | 2,1,10.0,7,0 424 | 5,10,8.0,8,0 425 | 4,9,7.0,7,0 426 | 9,3,7.0,7,0 427 | 5,7,8.0,6,0 428 | 8,7,9.0,3,0 429 | 2,2,7.0,8,0 430 | 6,6,9.0,9,0 431 | 4,2,8.0,4,0 432 | 3,9,7.0,9,0 433 | 7,9,6.0,5,0 434 | 5,3,7.0,5,0 435 | 4,4,9.0,1,0 436 | 6,9,8.0,5,0 437 | 10,10,8.0,1,0 438 | 2,6,8.0,6,0 439 | 10,10,9.0,5,0 440 | 5,9,9.0,6,0 441 | 3,2,8.0,9,0 442 | 10,10,9.0,3,0 443 | 4,7,9.0,4,0 444 | 4,4,7.0,1,0 445 | 5,8,8.0,5,0 446 | 2,3,8.0,3,0 447 | 6,4,9.0,2,0 448 | 2,9,9.0,10,0 449 | 3,6,8.0,2,0 450 | 3,2,10.0,10,0 451 | 2,2,8.0,1,0 452 | 9,6,9.0,1,0 453 | 6,5,6.0,2,0 454 | 3,6,8.0,1,0 455 | 3,3,8.0,6,0 456 | 2,10,9.0,2,0 457 | 8,9,8.0,9,0 458 | 7,4,10.0,4,0 459 | 6,6,7.0,8,0 460 | 5,3,7.0,7,0 461 | 6,7,7.0,6,0 462 | 9,1,9.0,5,0 463 | 10,9,9.0,1,0 464 | 10,4,8.0,3,0 465 | 1,2,9.0,1,0 466 | 2,1,9.0,1,0 
467 | 6,1,7.0,9,0 468 | 1,5,8.0,3,0 469 | 2,8,8.0,4,0 470 | 1,8,8.0,8,0 471 | 3,1,9.0,7,0 472 | 3,9,7.0,6,0 473 | 8,1,7.0,4,0 474 | 10,4,9.0,8,0 475 | 2,5,7.0,6,0 476 | 10,6,8.0,5,0 477 | 6,1,9.0,7,0 478 | 6,10,7.0,10,0 479 | 2,10,8.0,3,0 480 | 1,4,8.0,1,0 481 | 8,9,9.0,4,0 482 | 10,10,7.0,4,0 483 | 8,3,7.0,9,0 484 | 2,2,9.0,8,0 485 | 9,5,10.0,10,0 486 | 2,2,6.0,10,0 487 | 8,3,6.0,6,0 488 | 6,4,9.0,10,0 489 | 1,3,8.0,3,0 490 | 6,6,8.0,3,0 491 | 1,9,7.0,4,0 492 | -------------------------------------------------------------------------------- /Data Set Generator (remove me the future!)/fake_customers.csv: -------------------------------------------------------------------------------- 1 | Names,Age,Phone,Location,Company,Lot,Sales 2 | Chelsea Taylor,46.0,1-431-660-1615x8629,"064 Stone Neck Apt. 766 3 | East Debrabury, FM 63246",Bentley-Waller,07 bz,0 4 | Pamela Williams,38.0,(101)883-0724x491,"5182 Emily Spurs 5 | West Lindsey, PA 79975",Gomez Group,21 cB,0 6 | Kristi Sandoval,41.0,+99(4)3518374928,"367 Nelson Gardens Apt. 209 7 | Ochoaview, MT 25437","Thomas, Brown and Stewart",25 to,0 8 | Ashley Morris,45.0,939-770-5901x336,"66532 Harris Loop 9 | West Susan, PR 68272-6257","Banks, Mendez and Reyes",46 rn,0 10 | Dwayne Nguyen,48.0,468-328-7711,"418 Martin Mall 11 | New John, MN 64235",Phelps-Bentley,97 lr,0 12 | Benjamin Nelson,43.0,257.443.9817x9922,"Unit 2069 Box 9542 13 | DPO AA 81875-0608",Madden-Murphy,76 YB,0 14 | Tanya Mcdonald,40.0,985.525.6864x365,"PSC 1888, Box 7629 15 | APO AE 68066-4189",Morgan-Wilson,74 HU,0 16 | Ashley Mullins,34.0,231-482-7034x4744,"9819 Flores Orchard Apt. 954 17 | Markchester, NE 71752-6833","Hall, Romero and Marshall",75 Ty,0 18 | David Hutchinson,39.0,932.142.2276,"Unit 8564 Box 6806 19 | DPO AE 41715",Hanna Ltd,84 Ho,0 20 | Kayla Arnold,31.0,550.464.0343x938,"9296 Matthew Oval Apt. 429 21 | Thomasborough, NJ 22056-5974",Bradley-Schwartz,74 lz,0 22 | Nathan Castaneda,37.0,498.517.0898x258,"02452 Dawn Tunnel Apt. 012 23 | Rodriguezmouth, MA 80967-6806",Young and Sons,51 AM,0 24 | Keith Nelson,46.0,1-434-023-4677,"6309 Dustin Heights 25 | Joseville, UT 00298-1977",Rodriguez Ltd,32 yr,0 26 | Kathleen Weaver,22.0,920-001-7389,"822 Smith Lodge Apt. 921 27 | Tonichester, KY 49154","Key, Johnson and Hunt",72 Uv,0 28 | Kevin Thomas,37.0,(536)901-0070x33732,"Unit 8732 Box 8363 29 | DPO AA 80979-6530",Patterson-Burton,69 mk,0 30 | Seth Lutz,38.0,1-689-306-8881x37712,"510 Michael Field 31 | East Kimberly, DE 21409",Kelley Inc,29 Ts,0 32 | -------------------------------------------------------------------------------- /Data Set Generator (remove me the future!)/new_customers.csv: -------------------------------------------------------------------------------- 1 | Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company 2 | Andrew Mccall,37.0,9935.53,1,7.71,8.0,2011-08-29 18:37:54,"38612 Johnny Stravenue 3 | Nataliebury, WI 15717-8316",King Ltd, 4 | Michele Wright,23.0,7526.94,1,9.28,15.0,2013-07-22 18:19:54,"21083 Nicole Junction Suite 332 5 | Youngport, ME 23686-4381",Cannon-Benson 6 | Jeremy Chang,65.0,100.0,1,1.0,15.0,2006-12-11 07:48:13,"085 Austin Views 7 | Lake Julialand, WY 63726-4298",Barron-Robertson 8 | Megan Ferguson,32.0,6487.5,0,9.4,14.0,2016-10-28 05:32:13,"922 Wright Branch 9 | North Cynthialand, NC 64721",Sexton-Golden 10 | Taylor Young,32.0,13147.71,1,10.0,8.0,2012-03-20 00:36:46,"Unit 0789 Box 0734 11 | DPO AP 39702",Wood LLC, 12 | Jessica Drake,22.0,8445.26,1,3.46,14.0,2011-02-04 19:29:27,"1148 Tina Stravenue Apt. 
978 13 | South Carlos, TX 21222-9221",Parks-Robbins, 14 | -------------------------------------------------------------------------------- /Python-Crash-Course/Python Crash Course Exercises - Solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python Crash Course Exercises - Solutions\n", 8 | "\n", 9 | "This is an optional exercise to test your understanding of Python Basics. If you find this extremely challenging, then you probably are not ready for the rest of this course yet and don't have enough programming experience to continue.
I would suggest you take another course more geared towards complete beginners, such as [Complete Python Bootcamp]()" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Exercises\n", 17 | "\n", 18 | "Answer the questions or complete the tasks outlined in bold below; use the specific method described if applicable." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "** What is 7 to the power of 4?**" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "2401" 39 | ] 40 | }, 41 | "execution_count": 1, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "7**4" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "** Split this string:**\n", 55 | "\n", 56 | " s = \"Hi there Sam!\"\n", 57 | " \n", 58 | "**into a list. **" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "s = 'Hi there Sam!'" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 3, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "['Hi', 'there', 'Sam!']" 83 | ] 84 | }, 85 | "execution_count": 3, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "s.split()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "** Given the variables:**\n", 99 | "\n", 100 | " planet = \"Earth\"\n", 101 | " diameter = 12742\n", 102 | "\n", 103 | "** Use .format() to print the following string: **\n", 104 | "\n", 105 | " The diameter of Earth is 12742 kilometers." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 5, 111 | "metadata": { 112 | "collapsed": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "planet = \"Earth\"\n", 117 | "diameter = 12742" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "The diameter of Earth is 12742 kilometers.\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "print(\"The diameter of {} is {} kilometers.\".format(planet,diameter))" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "** Given this nested list, use indexing to grab the word \"hello\" **" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": { 150 | "collapsed": true 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "lst = [1,2,[3,4],[5,[100,200,['hello']],23,11],1,7]" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 14, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "'hello'" 168 | ] 169 | }, 170 | "execution_count": 14, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "lst[3][1][2][0]" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "** Given this nested dictionary grab the word \"hello\".
Be prepared, this will be annoying/tricky **" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 16, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "d = {'k1':[1,2,3,{'tricky':['oh','man','inception',{'target':[1,2,3,'hello']}]}]}" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 22, 200 | "metadata": { 201 | "collapsed": false 202 | }, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "'hello'" 208 | ] 209 | }, 210 | "execution_count": 22, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "d['k1'][3]['tricky'][3]['target'][3]" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "** What is the main difference between a tuple and a list? **" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 23, 229 | "metadata": { 230 | "collapsed": true 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "# Just answer with text, no code necessary" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "** Create a function that grabs the email website domain from a string in the form: **\n", 242 | "\n", 243 | " user@domain.com\n", 244 | " \n", 245 | "**So for example, passing \"user@domain.com\" would return: domain.com**" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 24, 251 | "metadata": { 252 | "collapsed": true 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "def domainGet(email):\n", 257 | " return email.split('@')[-1]" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 26, 263 | "metadata": { 264 | "collapsed": false 265 | }, 266 | "outputs": [ 267 | { 268 | "data": { 269 | "text/plain": [ 270 | "'domain.com'" 271 | ] 272 | }, 273 | "execution_count": 26, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "domainGet('user@domain.com')" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "** Create a basic function that returns True if the word 'dog' is contained in the input string. Don't worry about edge cases like a punctuation being attached to the word dog, but do account for capitalization. **" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 27, 292 | "metadata": { 293 | "collapsed": true 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "def findDog(st):\n", 298 | " return 'dog' in st.lower().split()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 28, 304 | "metadata": { 305 | "collapsed": false 306 | }, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "True" 312 | ] 313 | }, 314 | "execution_count": 28, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "findDog('Is there a dog here?')" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "** Create a function that counts the number of times the word \"dog\" occurs in a string. Again ignore edge cases. 
**" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 30, 333 | "metadata": { 334 | "collapsed": false 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "def countDog(st):\n", 339 | " count = 0\n", 340 | " for word in st.lower().split():\n", 341 | " if word == 'dog':\n", 342 | " count += 1\n", 343 | " return count" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 31, 349 | "metadata": { 350 | "collapsed": false 351 | }, 352 | "outputs": [ 353 | { 354 | "data": { 355 | "text/plain": [ 356 | "2" 357 | ] 358 | }, 359 | "execution_count": 31, 360 | "metadata": {}, 361 | "output_type": "execute_result" 362 | } 363 | ], 364 | "source": [ 365 | "countDog('This dog runs faster than the other dog dude!')" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "### Final Problem\n", 373 | "**You are driving a little too fast, and a police officer stops you. Write a function\n", 374 | " to return one of 3 possible results: \"No ticket\", \"Small ticket\", or \"Big Ticket\". \n", 375 | " If your speed is 60 or less, the result is \"No Ticket\". If speed is between 61 \n", 376 | " and 80 inclusive, the result is \"Small Ticket\". If speed is 81 or more, the result is \"Big Ticket\". Unless it is your birthday (encoded as a boolean value in the parameters of the function) -- on your birthday, your speed can be 5 higher in all \n", 377 | " cases. **" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 4, 383 | "metadata": { 384 | "collapsed": true 385 | }, 386 | "outputs": [], 387 | "source": [ 388 | "def caught_speeding(speed, is_birthday):\n", 389 | " \n", 390 | " if is_birthday:\n", 391 | " speeding = speed - 5\n", 392 | " else:\n", 393 | " speeding = speed\n", 394 | " \n", 395 | " if speeding > 80:\n", 396 | " return 'Big Ticket'\n", 397 | " elif speeding > 60:\n", 398 | " return 'Small Ticket'\n", 399 | " else:\n", 400 | " return 'No Ticket'" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 5, 406 | "metadata": { 407 | "collapsed": false 408 | }, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "text/plain": [ 413 | "'Small Ticket'" 414 | ] 415 | }, 416 | "execution_count": 5, 417 | "metadata": {}, 418 | "output_type": "execute_result" 419 | } 420 | ], 421 | "source": [ 422 | "caught_speeding(81,True)" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 6, 428 | "metadata": { 429 | "collapsed": false 430 | }, 431 | "outputs": [ 432 | { 433 | "data": { 434 | "text/plain": [ 435 | "'Big Ticket'" 436 | ] 437 | }, 438 | "execution_count": 6, 439 | "metadata": {}, 440 | "output_type": "execute_result" 441 | } 442 | ], 443 | "source": [ 444 | "caught_speeding(81,False)" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "# Great job!" 
452 | ] 453 | } 454 | ], 455 | "metadata": { 456 | "anaconda-cloud": {}, 457 | "kernelspec": { 458 | "display_name": "Python [default]", 459 | "language": "python", 460 | "name": "python3" 461 | }, 462 | "language_info": { 463 | "codemirror_mode": { 464 | "name": "ipython", 465 | "version": 3 466 | }, 467 | "file_extension": ".py", 468 | "mimetype": "text/x-python", 469 | "name": "python", 470 | "nbconvert_exporter": "python", 471 | "pygments_lexer": "ipython3", 472 | "version": "3.5.3" 473 | } 474 | }, 475 | "nbformat": 4, 476 | "nbformat_minor": 0 477 | } 478 | -------------------------------------------------------------------------------- /Python-Crash-Course/Python Crash Course Exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python Crash Course Exercises \n", 8 | "\n", 9 | "This is an optional exercise to test your understanding of Python Basics. If you find this extremely challenging, then you probably are not ready for the rest of this course yet and don't have enough programming experience to continue. I would suggest you take another course more geared towards complete beginners, such as [Complete Python Bootcamp]()" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Exercises\n", 17 | "\n", 18 | "Answer the questions or complete the tasks outlined in bold below; use the specific method described if applicable." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "** What is 7 to the power of 4?**" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "2401" 39 | ] 40 | }, 41 | "execution_count": 1, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "** Split this string:**\n", 53 | "\n", 54 | " s = \"Hi there Sam!\"\n", 55 | " \n", 56 | "**into a list. **" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "['Hi', 'there', 'Sam!']" 79 | ] 80 | }, 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "** Given the variables:**\n", 93 | "\n", 94 | " planet = \"Earth\"\n", 95 | " diameter = 12742\n", 96 | "\n", 97 | "** Use .format() to print the following string: **\n", 98 | "\n", 99 | " The diameter of Earth is 12742 kilometers."
100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "planet = \"Earth\"\n", 111 | "diameter = 12742" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "The diameter of Earth is 12742 kilometers.\n" 126 | ] 127 | } 128 | ], 129 | "source": [] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "** Given this nested list, use indexing to grab the word \"hello\" **" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 7, 141 | "metadata": { 142 | "collapsed": true 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "lst = [1,2,[3,4],[5,[100,200,['hello']],23,11],1,7]" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 14, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "'hello'" 160 | ] 161 | }, 162 | "execution_count": 14, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "** Given this nested dictionary grab the word \"hello\". Be prepared, this will be annoying/tricky **" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 16, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [], 183 | "source": [ 184 | "d = {'k1':[1,2,3,{'tricky':['oh','man','inception',{'target':[1,2,3,'hello']}]}]}" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 22, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "'hello'" 198 | ] 199 | }, 200 | "execution_count": 22, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "** What is the main difference between a tuple and a list?
**" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 23, 217 | "metadata": { 218 | "collapsed": true 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "# Just answer with text, no code necessary" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "** Create a function that grabs the email website domain from a string in the form: **\n", 230 | "\n", 231 | " user@domain.com\n", 232 | " \n", 233 | "**So for example, passing \"user@domain.com\" would return: domain.com**" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 24, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 26, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "'domain.com'" 256 | ] 257 | }, 258 | "execution_count": 26, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "** Create a basic function that returns True if the word 'dog' is contained in the input string. Don't worry about edge cases like a punctuation being attached to the word dog, but do account for capitalization. **" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 27, 275 | "metadata": { 276 | "collapsed": true 277 | }, 278 | "outputs": [], 279 | "source": [] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 28, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "True" 292 | ] 293 | }, 294 | "execution_count": 28, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "** Create a function that counts the number of times the word \"dog\" occurs in a string. Again ignore edge cases. **" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 30, 311 | "metadata": { 312 | "collapsed": false 313 | }, 314 | "outputs": [], 315 | "source": [] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 31, 320 | "metadata": { 321 | "collapsed": false 322 | }, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "text/plain": [ 327 | "2" 328 | ] 329 | }, 330 | "execution_count": 31, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "### Final Problem\n", 342 | "**You are driving a little too fast, and a police officer stops you. Write a function\n", 343 | " to return one of 3 possible results: \"No ticket\", \"Small ticket\", or \"Big Ticket\". \n", 344 | " If your speed is 60 or less, the result is \"No Ticket\". If speed is between 61 \n", 345 | " and 80 inclusive, the result is \"Small Ticket\". If speed is 81 or more, the result is \"Big Ticket\". Unless it is your birthday (encoded as a boolean value in the parameters of the function) -- on your birthday, your speed can be 5 higher in all \n", 346 | " cases. 
**" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 4, 352 | "metadata": { 353 | "collapsed": true 354 | }, 355 | "outputs": [], 356 | "source": [] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 5, 361 | "metadata": { 362 | "collapsed": false 363 | }, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/plain": [ 368 | "'Small Ticket'" 369 | ] 370 | }, 371 | "execution_count": 5, 372 | "metadata": {}, 373 | "output_type": "execute_result" 374 | } 375 | ], 376 | "source": [] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 6, 381 | "metadata": { 382 | "collapsed": false 383 | }, 384 | "outputs": [ 385 | { 386 | "data": { 387 | "text/plain": [ 388 | "'Big Ticket'" 389 | ] 390 | }, 391 | "execution_count": 6, 392 | "metadata": {}, 393 | "output_type": "execute_result" 394 | } 395 | ], 396 | "source": [] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "# Great job!" 403 | ] 404 | } 405 | ], 406 | "metadata": { 407 | "anaconda-cloud": {}, 408 | "kernelspec": { 409 | "display_name": "Python [default]", 410 | "language": "python", 411 | "name": "python3" 412 | }, 413 | "language_info": { 414 | "codemirror_mode": { 415 | "name": "ipython", 416 | "version": 3 417 | }, 418 | "file_extension": ".py", 419 | "mimetype": "text/x-python", 420 | "name": "python", 421 | "nbconvert_exporter": "python", 422 | "pygments_lexer": "ipython3", 423 | "version": "3.5.3" 424 | } 425 | }, 426 | "nbformat": 4, 427 | "nbformat_minor": 0 428 | } 429 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-and-Spark-for-Big-Data 2 | Course Notebooks for Python and Spark for Big Data 3 | 4 | Course Outline: 5 | 6 | * Course Introduction 7 | * Promo/Intro Video 8 | * Course Curriculum Overview 9 | * Introduction to Spark, RDDs, and Spark 2.0 10 | 11 | * Course Set-up 12 | * Set-up Overview 13 | * EC2 Installation Guide 14 | * Local Installation Guide with VirtualBox 15 | * Databricks Notebooks 16 | * Unix Command Line Basics and Jupyter Notebook Overview 17 | 18 | * Spark DataFrames 19 | * Spark DataFrames Section Introduction 20 | * Spark DataFrame Basics 21 | * Spark DataFrame Operations 22 | * Groupby and Aggregate Functions 23 | * Missing Data 24 | * Dates and Timestamps 25 | 26 | * Spark DataFrame Project 27 | * DataFrame Project Exercise 28 | * DataFrame Project Exercise Solutions 29 | 30 | * Machine Learning 31 | * Introduction to Machine Learning and ISLR 32 | * Machine Learning with Spark and Python and MLlib 33 | * Consulting Project Approach Overview 34 | 35 | * Linear Regression 36 | * Introduction to Linear Regression 37 | * Discussion on Data Transformations 38 | * Linear Regression with PySpark Example (Car Data) 39 | * Linear Regression Consulting Project (Housing Data) 40 | * Linear Regression Consulting Project Solution 41 | 42 | * Logistic Regression 43 | * Introduction to Logisitic Regression 44 | * Logistic Regression Example 45 | * Logistic Regression Consulting Project (Customer Churn) 46 | * Logistic Regression Consluting Project Solution 47 | 48 | * Tree Methods 49 | * Introduction to Tree Methods 50 | * Decision Tree and Random Forest Example 51 | * Random Forest Classification Consulting Project - Dog Food Data 52 | * RF Classification Consulting Project Solutions 53 | * RF Regression Project - (Facebook Data) 54 | 55 | * Clustering 56 | * 
398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "# Great job!" 403 | ] 404 | } 405 | ], 406 | "metadata": { 407 | "anaconda-cloud": {}, 408 | "kernelspec": { 409 | "display_name": "Python [default]", 410 | "language": "python", 411 | "name": "python3" 412 | }, 413 | "language_info": { 414 | "codemirror_mode": { 415 | "name": "ipython", 416 | "version": 3 417 | }, 418 | "file_extension": ".py", 419 | "mimetype": "text/x-python", 420 | "name": "python", 421 | "nbconvert_exporter": "python", 422 | "pygments_lexer": "ipython3", 423 | "version": "3.5.3" 424 | } 425 | }, 426 | "nbformat": 4, 427 | "nbformat_minor": 0 428 | } 429 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-and-Spark-for-Big-Data 2 | Course Notebooks for Python and Spark for Big Data 3 | 4 | Course Outline: 5 | 6 | * Course Introduction 7 | * Promo/Intro Video 8 | * Course Curriculum Overview 9 | * Introduction to Spark, RDDs, and Spark 2.0 10 | 11 | * Course Set-up 12 | * Set-up Overview 13 | * EC2 Installation Guide 14 | * Local Installation Guide with VirtualBox 15 | * Databricks Notebooks 16 | * Unix Command Line Basics and Jupyter Notebook Overview 17 | 18 | * Spark DataFrames 19 | * Spark DataFrames Section Introduction 20 | * Spark DataFrame Basics 21 | * Spark DataFrame Operations 22 | * Groupby and Aggregate Functions 23 | * Missing Data 24 | * Dates and Timestamps 25 | 26 | * Spark DataFrame Project 27 | * DataFrame Project Exercise 28 | * DataFrame Project Exercise Solutions 29 | 30 | * Machine Learning 31 | * Introduction to Machine Learning and ISLR 32 | * Machine Learning with Spark and Python and MLlib 33 | * Consulting Project Approach Overview 34 | 35 | * Linear Regression 36 | * Introduction to Linear Regression 37 | * Discussion on Data Transformations 38 | * Linear Regression with PySpark Example (Car Data) 39 | * Linear Regression Consulting Project (Housing Data) 40 | * Linear Regression Consulting Project Solution 41 | 42 | * Logistic Regression 43 | * Introduction to Logistic Regression 44 | * Logistic Regression Example 45 | * Logistic Regression Consulting Project (Customer Churn) 46 | * Logistic Regression Consulting Project Solution 47 | 48 | * Tree Methods 49 | * Introduction to Tree Methods 50 | * Decision Tree and Random Forest Example 51 | * Random Forest Classification Consulting Project - Dog Food Data 52 | * RF Classification Consulting Project Solutions 53 | * RF Regression Project - (Facebook Data) 54 | 55 | * Clustering 56 | * Introduction to K-means Clustering 57 | * Clustering Example - Iris Dataset 58 | * Clustering Consulting Project - Customer Segmentation (Fake Data) 59 | * Clustering Consulting Project Solutions 60 | 61 | * Recommender System 62 | * Introduction to Recommender Systems and Collaborative Filtering 63 | * Code Along Project - MovieLens Dataset 64 | * Possible Consulting Project - Company Service Reviews 65 | 66 | * Natural Language Processing 67 | * Introduction to Project/NLP/Naive Bayes Model 68 | * What are pipelines? 69 | * Code Along 70 | 71 | * Spark Streaming 72 | * Introduction to Spark Streaming 73 | * Spark Streaming Code-along! 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /Spark Streaming/TweetRead.py: -------------------------------------------------------------------------------- 1 | import tweepy 2 | from tweepy import OAuthHandler 3 | from tweepy import Stream 4 | from tweepy.streaming import StreamListener 5 | import socket 6 | import json 7 | 8 | 9 | # Set up your credentials 10 | consumer_key='' 11 | consumer_secret='' 12 | access_token ='' 13 | access_secret='' 14 | 15 | 16 | class TweetsListener(StreamListener): 17 | 18 | def __init__(self, csocket): 19 | self.client_socket = csocket 20 | 21 | def on_data(self, data): 22 | try: 23 | msg = json.loads( data ) 24 | print( msg['text'].encode('utf-8') ) 25 | self.client_socket.send( msg['text'].encode('utf-8') ) 26 | return True 27 | except BaseException as e: 28 | print("Error on_data: %s" % str(e)) 29 | return True 30 | 31 | def on_error(self, status): 32 | print(status) 33 | return True 34 | 35 | def sendData(c_socket): 36 | auth = OAuthHandler(consumer_key, consumer_secret) 37 | auth.set_access_token(access_token, access_secret) 38 | 39 | twitter_stream = Stream(auth, TweetsListener(c_socket)) 40 | twitter_stream.filter(track=['soccer']) 41 | 42 | if __name__ == "__main__": 43 | s = socket.socket() # Create a socket object 44 | host = "127.0.0.1" # Local loopback address 45 | port = 5555 # Reserve a port for your service. 46 | s.bind((host, port)) # Bind to the port 47 | 48 | print("Listening on port: %s" % str(port)) 49 | 50 | s.listen(5) # Now wait for client connection. 51 | c, addr = s.accept() # Establish connection with client. 52 | 53 | print( "Received request from: " + str( addr ) ) 54 | 55 | sendData( c ) -------------------------------------------------------------------------------- /Spark_DataFrames/.ipynb_checkpoints/Missing_Data-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Missing Data\n", 8 | "\n", 9 | "Often data sources are incomplete, which means you will have missing data. You have 3 basic options for dealing with missing data (you will personally have to decide on the right approach):\n", 10 | "\n", 11 | "* Just keep the missing data points.\n", 12 | "* Drop the missing data points (including the entire row).\n", 13 | "* Fill them in with some other value.\n", 14 | "\n", 15 | "Let's cover examples of each of these methods!" 16 | ] 17 | },
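One subtlety worth previewing before the demos below: in `df.na.drop`, a non-`None` `thresh` overrides `how`. A minimal sketch, assuming the same `df` loaded from `ContainsNull.csv` later in this notebook:

```python
# thresh counts NON-null values per row; when it is set, `how` is ignored.
# emp2 has only 1 non-null value (its Id), so it is dropped here even though
# how='all' on its own would have kept it.
df.na.drop(how='all', thresh=2).show()
```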
18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "source": [ 24 | "## Keeping the missing data\n", 25 | "A few machine learning algorithms can easily deal with missing data; let's see what it looks like:" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "from pyspark.sql import SparkSession\n", 37 | "# May take a little while on a local computer\n", 38 | "spark = SparkSession.builder.appName(\"missingdata\").getOrCreate()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "df = spark.read.csv(\"ContainsNull.csv\",header=True,inferSchema=True)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "+----+-----+-----+\n", 64 | "| Id| Name|Sales|\n", 65 | "+----+-----+-----+\n", 66 | "|emp1| John| null|\n", 67 | "|emp2| null| null|\n", 68 | "|emp3| null|345.0|\n", 69 | "|emp4|Cindy|456.0|\n", 70 | "+----+-----+-----+\n", 71 | "\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "df.show()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "Notice how the data remains as a null." 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## Drop the missing data\n", 91 | "\n", 92 | "You can use the .na functions for missing data. The drop command has the following parameters:\n", 93 | "\n", 94 | " df.na.drop(how='any', thresh=None, subset=None)\n", 95 | " \n", 96 | " * param how: 'any' or 'all'.\n", 97 | " \n", 98 | " If 'any', drop a row if it contains any nulls.\n", 99 | " If 'all', drop a row only if all its values are null.\n", 100 | " \n", 101 | " * param thresh: int, default None\n", 102 | " \n", 103 | " If specified, drop rows that have less than `thresh` non-null values.\n", 104 | " This overwrites the `how` parameter.\n", 105 | " \n", 106 | " * param subset: \n", 107 | " optional list of column names to consider." 
108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "+----+-----+-----+\n", 122 | "| Id| Name|Sales|\n", 123 | "+----+-----+-----+\n", 124 | "|emp4|Cindy|456.0|\n", 125 | "+----+-----+-----+\n", 126 | "\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "# Drop any row that contains missing data\n", 132 | "df.na.drop().show()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 8, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "+----+-----+-----+\n", 147 | "| Id| Name|Sales|\n", 148 | "+----+-----+-----+\n", 149 | "|emp1| John| null|\n", 150 | "|emp3| null|345.0|\n", 151 | "|emp4|Cindy|456.0|\n", 152 | "+----+-----+-----+\n", 153 | "\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "# Has to have at least 2 NON-null values\n", 159 | "df.na.drop(thresh=2).show()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 9, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "+----+-----+-----+\n", 174 | "| Id| Name|Sales|\n", 175 | "+----+-----+-----+\n", 176 | "|emp3| null|345.0|\n", 177 | "|emp4|Cindy|456.0|\n", 178 | "+----+-----+-----+\n", 179 | "\n" 180 | ] 181 | } 182 | ], 183 | "source": [ 184 | "df.na.drop(subset=[\"Sales\"]).show()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 10, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "+----+-----+-----+\n", 199 | "| Id| Name|Sales|\n", 200 | "+----+-----+-----+\n", 201 | "|emp4|Cindy|456.0|\n", 202 | "+----+-----+-----+\n", 203 | "\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "df.na.drop(how='any').show()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 11, 214 | "metadata": { 215 | "collapsed": false 216 | }, 217 | "outputs": [ 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "+----+-----+-----+\n", 223 | "| Id| Name|Sales|\n", 224 | "+----+-----+-----+\n", 225 | "|emp1| John| null|\n", 226 | "|emp2| null| null|\n", 227 | "|emp3| null|345.0|\n", 228 | "|emp4|Cindy|456.0|\n", 229 | "+----+-----+-----+\n", 230 | "\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "df.na.drop(how='all').show()" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "## Fill the missing values\n", 243 | "\n", 244 | "We can also fill the missing values with new values. If you have multiple nulls across multiple data types, Spark is actually smart enough to match up the data types. 
For example:" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 15, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "+----+---------+-----+\n", 259 | "| Id| Name|Sales|\n", 260 | "+----+---------+-----+\n", 261 | "|emp1| John| null|\n", 262 | "|emp2|NEW VALUE| null|\n", 263 | "|emp3|NEW VALUE|345.0|\n", 264 | "|emp4| Cindy|456.0|\n", 265 | "+----+---------+-----+\n", 266 | "\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "df.na.fill('NEW VALUE').show()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 16, 277 | "metadata": { 278 | "collapsed": false 279 | }, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "+----+-----+-----+\n", 286 | "| Id| Name|Sales|\n", 287 | "+----+-----+-----+\n", 288 | "|emp1| John| 0.0|\n", 289 | "|emp2| null| 0.0|\n", 290 | "|emp3| null|345.0|\n", 291 | "|emp4|Cindy|456.0|\n", 292 | "+----+-----+-----+\n", 293 | "\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "df.na.fill(0).show()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "Usually you should specify what columns you want to fill with the subset parameter" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 17, 311 | "metadata": { 312 | "collapsed": false 313 | }, 314 | "outputs": [ 315 | { 316 | "name": "stdout", 317 | "output_type": "stream", 318 | "text": [ 319 | "+----+-------+-----+\n", 320 | "| Id| Name|Sales|\n", 321 | "+----+-------+-----+\n", 322 | "|emp1| John| null|\n", 323 | "|emp2|No Name| null|\n", 324 | "|emp3|No Name|345.0|\n", 325 | "|emp4| Cindy|456.0|\n", 326 | "+----+-------+-----+\n", 327 | "\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "df.na.fill('No Name',subset=['Name']).show()" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "A very common practice is to fill values with the mean value for the column, for example:" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 23, 345 | "metadata": { 346 | "collapsed": false 347 | }, 348 | "outputs": [ 349 | { 350 | "data": { 351 | "text/plain": [ 352 | "400.5" 353 | ] 354 | }, 355 | "execution_count": 23, 356 | "metadata": {}, 357 | "output_type": "execute_result" 358 | } 359 | ], 360 | "source": [ 361 | "from pyspark.sql.functions import mean\n", 362 | "mean_val = df.select(mean(df['Sales'])).collect()\n", 363 | "\n", 364 | "# Weird nested formatting of Row object!\n", 365 | "mean_val[0][0]" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 24, 371 | "metadata": { 372 | "collapsed": true 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "mean_sales = mean_val[0][0]" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 26, 382 | "metadata": { 383 | "collapsed": false 384 | }, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "+----+-----+-----+\n", 391 | "| Id| Name|Sales|\n", 392 | "+----+-----+-----+\n", 393 | "|emp1| John|400.5|\n", 394 | "|emp2| null|400.5|\n", 395 | "|emp3| null|345.0|\n", 396 | "|emp4|Cindy|456.0|\n", 397 | "+----+-----+-----+\n", 398 | "\n" 399 | ] 400 | } 401 | ], 402 | "source": [ 403 | "df.na.fill(mean_sales,[\"Sales\"]).show()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 28, 409 | 
"metadata": { 410 | "collapsed": false 411 | }, 412 | "outputs": [ 413 | { 414 | "name": "stdout", 415 | "output_type": "stream", 416 | "text": [ 417 | "+----+-----+-----+\n", 418 | "| Id| Name|Sales|\n", 419 | "+----+-----+-----+\n", 420 | "|emp1| John|400.5|\n", 421 | "|emp2| null|400.5|\n", 422 | "|emp3| null|345.0|\n", 423 | "|emp4|Cindy|456.0|\n", 424 | "+----+-----+-----+\n", 425 | "\n" 426 | ] 427 | } 428 | ], 429 | "source": [ 430 | "# One (very ugly) one-liner\n", 431 | "df.na.fill(df.select(mean(df['Sales'])).collect()[0][0],['Sales']).show()" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "That is all we need to know for now!" 439 | ] 440 | } 441 | ], 442 | "metadata": { 443 | "anaconda-cloud": {}, 444 | "kernelspec": { 445 | "display_name": "Python [conda root]", 446 | "language": "python", 447 | "name": "conda-root-py" 448 | }, 449 | "language_info": { 450 | "codemirror_mode": { 451 | "name": "ipython", 452 | "version": 3 453 | }, 454 | "file_extension": ".py", 455 | "mimetype": "text/x-python", 456 | "name": "python", 457 | "nbconvert_exporter": "python", 458 | "pygments_lexer": "ipython3", 459 | "version": "3.5.3" 460 | } 461 | }, 462 | "nbformat": 4, 463 | "nbformat_minor": 0 464 | } 465 | -------------------------------------------------------------------------------- /Spark_DataFrames/ContainsNull.csv: -------------------------------------------------------------------------------- 1 | Id,Name,Sales 2 | emp1,John, 3 | emp2,, 4 | emp3,,345.0 5 | emp4,Cindy,456.0 6 | -------------------------------------------------------------------------------- /Spark_DataFrames/Missing_Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Missing Data\n", 8 | "\n", 9 | "Often data sources are incomplete, which means you will have missing data, you have 3 basic options for filling in missing data (you will personally have to make the decision for what is the right approach:\n", 10 | "\n", 11 | "* Just keep the missing data points.\n", 12 | "* Drop them missing data points (including the entire row)\n", 13 | "* Fill them in with some other value.\n", 14 | "\n", 15 | "Let's cover examples of each of these methods!" 
18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "source": [ 24 | "## Keeping the missing data\n", 25 | "A few machine learning algorithms can easily deal with missing data; let's see what it looks like:" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "from pyspark.sql import SparkSession\n", 37 | "# May take a little while on a local computer\n", 38 | "spark = SparkSession.builder.appName(\"missingdata\").getOrCreate()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "df = spark.read.csv(\"ContainsNull.csv\",header=True,inferSchema=True)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "+----+-----+-----+\n", 64 | "| Id| Name|Sales|\n", 65 | "+----+-----+-----+\n", 66 | "|emp1| John| null|\n", 67 | "|emp2| null| null|\n", 68 | "|emp3| null|345.0|\n", 69 | "|emp4|Cindy|456.0|\n", 70 | "+----+-----+-----+\n", 71 | "\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "df.show()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "Notice how the data remains as a null." 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## Drop the missing data\n", 91 | "\n", 92 | "You can use the .na functions for missing data. The drop command has the following parameters:\n", 93 | "\n", 94 | " df.na.drop(how='any', thresh=None, subset=None)\n", 95 | " \n", 96 | " * param how: 'any' or 'all'.\n", 97 | " \n", 98 | " If 'any', drop a row if it contains any nulls.\n", 99 | " If 'all', drop a row only if all its values are null.\n", 100 | " \n", 101 | " * param thresh: int, default None\n", 102 | " \n", 103 | " If specified, drop rows that have less than `thresh` non-null values.\n", 104 | " This overwrites the `how` parameter.\n", 105 | " \n", 106 | " * param subset: \n", 107 | " optional list of column names to consider." 
108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "+----+-----+-----+\n", 122 | "| Id| Name|Sales|\n", 123 | "+----+-----+-----+\n", 124 | "|emp4|Cindy|456.0|\n", 125 | "+----+-----+-----+\n", 126 | "\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "# Drop any row that contains missing data\n", 132 | "df.na.drop().show()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 8, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "+----+-----+-----+\n", 147 | "| Id| Name|Sales|\n", 148 | "+----+-----+-----+\n", 149 | "|emp1| John| null|\n", 150 | "|emp3| null|345.0|\n", 151 | "|emp4|Cindy|456.0|\n", 152 | "+----+-----+-----+\n", 153 | "\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "# Has to have at least 2 NON-null values\n", 159 | "df.na.drop(thresh=2).show()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 9, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "+----+-----+-----+\n", 174 | "| Id| Name|Sales|\n", 175 | "+----+-----+-----+\n", 176 | "|emp3| null|345.0|\n", 177 | "|emp4|Cindy|456.0|\n", 178 | "+----+-----+-----+\n", 179 | "\n" 180 | ] 181 | } 182 | ], 183 | "source": [ 184 | "df.na.drop(subset=[\"Sales\"]).show()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 10, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "+----+-----+-----+\n", 199 | "| Id| Name|Sales|\n", 200 | "+----+-----+-----+\n", 201 | "|emp4|Cindy|456.0|\n", 202 | "+----+-----+-----+\n", 203 | "\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "df.na.drop(how='any').show()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 11, 214 | "metadata": { 215 | "collapsed": false 216 | }, 217 | "outputs": [ 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "+----+-----+-----+\n", 223 | "| Id| Name|Sales|\n", 224 | "+----+-----+-----+\n", 225 | "|emp1| John| null|\n", 226 | "|emp2| null| null|\n", 227 | "|emp3| null|345.0|\n", 228 | "|emp4|Cindy|456.0|\n", 229 | "+----+-----+-----+\n", 230 | "\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "df.na.drop(how='all').show()" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "## Fill the missing values\n", 243 | "\n", 244 | "We can also fill the missing values with new values. If you have multiple nulls across multiple data types, Spark is actually smart enough to match up the data types. 
For example:" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 15, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "+----+---------+-----+\n", 259 | "| Id| Name|Sales|\n", 260 | "+----+---------+-----+\n", 261 | "|emp1| John| null|\n", 262 | "|emp2|NEW VALUE| null|\n", 263 | "|emp3|NEW VALUE|345.0|\n", 264 | "|emp4| Cindy|456.0|\n", 265 | "+----+---------+-----+\n", 266 | "\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "df.na.fill('NEW VALUE').show()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 16, 277 | "metadata": { 278 | "collapsed": false 279 | }, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "+----+-----+-----+\n", 286 | "| Id| Name|Sales|\n", 287 | "+----+-----+-----+\n", 288 | "|emp1| John| 0.0|\n", 289 | "|emp2| null| 0.0|\n", 290 | "|emp3| null|345.0|\n", 291 | "|emp4|Cindy|456.0|\n", 292 | "+----+-----+-----+\n", 293 | "\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "df.na.fill(0).show()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "Usually you should specify what columns you want to fill with the subset parameter" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 17, 311 | "metadata": { 312 | "collapsed": false 313 | }, 314 | "outputs": [ 315 | { 316 | "name": "stdout", 317 | "output_type": "stream", 318 | "text": [ 319 | "+----+-------+-----+\n", 320 | "| Id| Name|Sales|\n", 321 | "+----+-------+-----+\n", 322 | "|emp1| John| null|\n", 323 | "|emp2|No Name| null|\n", 324 | "|emp3|No Name|345.0|\n", 325 | "|emp4| Cindy|456.0|\n", 326 | "+----+-------+-----+\n", 327 | "\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "df.na.fill('No Name',subset=['Name']).show()" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "A very common practice is to fill values with the mean value for the column, for example:" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 23, 345 | "metadata": { 346 | "collapsed": false 347 | }, 348 | "outputs": [ 349 | { 350 | "data": { 351 | "text/plain": [ 352 | "400.5" 353 | ] 354 | }, 355 | "execution_count": 23, 356 | "metadata": {}, 357 | "output_type": "execute_result" 358 | } 359 | ], 360 | "source": [ 361 | "from pyspark.sql.functions import mean\n", 362 | "mean_val = df.select(mean(df['Sales'])).collect()\n", 363 | "\n", 364 | "# Weird nested formatting of Row object!\n", 365 | "mean_val[0][0]" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 24, 371 | "metadata": { 372 | "collapsed": true 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "mean_sales = mean_val[0][0]" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 26, 382 | "metadata": { 383 | "collapsed": false 384 | }, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "+----+-----+-----+\n", 391 | "| Id| Name|Sales|\n", 392 | "+----+-----+-----+\n", 393 | "|emp1| John|400.5|\n", 394 | "|emp2| null|400.5|\n", 395 | "|emp3| null|345.0|\n", 396 | "|emp4|Cindy|456.0|\n", 397 | "+----+-----+-----+\n", 398 | "\n" 399 | ] 400 | } 401 | ], 402 | "source": [ 403 | "df.na.fill(mean_sales,[\"Sales\"]).show()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 28, 409 | 
"metadata": { 410 | "collapsed": false 411 | }, 412 | "outputs": [ 413 | { 414 | "name": "stdout", 415 | "output_type": "stream", 416 | "text": [ 417 | "+----+-----+-----+\n", 418 | "| Id| Name|Sales|\n", 419 | "+----+-----+-----+\n", 420 | "|emp1| John|400.5|\n", 421 | "|emp2| null|400.5|\n", 422 | "|emp3| null|345.0|\n", 423 | "|emp4|Cindy|456.0|\n", 424 | "+----+-----+-----+\n", 425 | "\n" 426 | ] 427 | } 428 | ], 429 | "source": [ 430 | "# One (very ugly) one-liner\n", 431 | "df.na.fill(df.select(mean(df['Sales'])).collect()[0][0],['Sales']).show()" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "That is all we need to know for now!" 439 | ] 440 | } 441 | ], 442 | "metadata": { 443 | "anaconda-cloud": {}, 444 | "kernelspec": { 445 | "display_name": "Python [conda root]", 446 | "language": "python", 447 | "name": "conda-root-py" 448 | }, 449 | "language_info": { 450 | "codemirror_mode": { 451 | "name": "ipython", 452 | "version": 3 453 | }, 454 | "file_extension": ".py", 455 | "mimetype": "text/x-python", 456 | "name": "python", 457 | "nbconvert_exporter": "python", 458 | "pygments_lexer": "ipython3", 459 | "version": "3.5.3" 460 | } 461 | }, 462 | "nbformat": 4, 463 | "nbformat_minor": 0 464 | } 465 | -------------------------------------------------------------------------------- /Spark_DataFrames/people.json: -------------------------------------------------------------------------------- 1 | {"name":"Michael"} 2 | {"name":"Andy", "age":30} 3 | {"name":"Justin", "age":19} 4 | -------------------------------------------------------------------------------- /Spark_DataFrames/sales_info.csv: -------------------------------------------------------------------------------- 1 | Company,Person,Sales 2 | GOOG,Sam,200 3 | GOOG,Charlie,120 4 | GOOG,Frank,340 5 | MSFT,Tina,600 6 | MSFT,Amy,124 7 | MSFT,Vanessa,243 8 | FB,Carl,870 9 | FB,Sarah,350 10 | APPL,John,250 11 | APPL,Linda, 130 12 | APPL,Mike, 750 13 | APPL, Chris, 350 -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Clustering/.ipynb_checkpoints/Clustering Code Along-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Clustering Code Along\n", 8 | "\n", 9 | "We'll be working with a real data set about seeds, from UCI repository: https://archive.ics.uci.edu/ml/datasets/seeds." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "The examined group comprised kernels belonging to three different varieties of wheat: Kama, Rosa and Canadian, 70 elements each, randomly selected for \n", 17 | "the experiment. High quality visualization of the internal kernel structure was detected using a soft X-ray technique. It is non-destructive and considerably cheaper than other more sophisticated imaging techniques like scanning microscopy or laser technology. The images were recorded on 13x18 cm X-ray KODAK plates. Studies were conducted using combine harvested wheat grain originating from experimental fields, explored at the Institute of Agrophysics of the Polish Academy of Sciences in Lublin. 
\n", 18 | "\n", 19 | "The data set can be used for the tasks of classification and cluster analysis.\n", 20 | "\n", 21 | "\n", 22 | "Attribute Information:\n", 23 | "\n", 24 | "To construct the data, seven geometric parameters of wheat kernels were measured: \n", 25 | "1. area A, \n", 26 | "2. perimeter P, \n", 27 | "3. compactness C = 4*pi*A/P^2, \n", 28 | "4. length of kernel, \n", 29 | "5. width of kernel, \n", 30 | "6. asymmetry coefficient \n", 31 | "7. length of kernel groove. \n", 32 | "All of these parameters were real-valued continuous.\n", 33 | "\n", 34 | "Let's see if we can cluster them in to 3 groups with K-means!" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 53, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "from pyspark.sql import SparkSession\n", 46 | "spark = SparkSession.builder.appName('cluster').getOrCreate()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 54, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "from pyspark.ml.clustering import KMeans\n", 58 | "\n", 59 | "# Loads data.\n", 60 | "dataset = spark.read.csv(\"seeds_dataset.csv\",header=True,inferSchema=True)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 55, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)" 74 | ] 75 | }, 76 | "execution_count": 55, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "dataset.head()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 56, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n", 97 | "|summary| area| perimeter| compactness| length_of_kernel| width_of_kernel|asymmetry_coefficient| length_of_groove|\n", 98 | "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n", 99 | "| count| 210| 210| 210| 210| 210| 210| 210|\n", 100 | "| mean|14.847523809523816|14.559285714285718| 0.8709985714285714| 5.628533333333335| 3.258604761904762| 3.7001999999999997| 5.408071428571429|\n", 101 | "| stddev|2.9096994306873647|1.3059587265640225|0.023629416583846364|0.44306347772644983|0.3777144449065867| 1.5035589702547392|0.49148049910240543|\n", 102 | "| min| 10.59| 12.41| 0.8081| 4.899| 2.63| 0.765| 4.519|\n", 103 | "| max| 21.18| 17.25| 0.9183| 6.675| 4.033| 8.456| 6.55|\n", 104 | "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n", 105 | "\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "dataset.describe().show()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "## Format the Data" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 57, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "from pyspark.ml.linalg import Vectors\n", 129 | "from pyspark.ml.feature import 
VectorAssembler" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 58, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "['area',\n", 143 | " 'perimeter',\n", 144 | " 'compactness',\n", 145 | " 'length_of_kernel',\n", 146 | " 'width_of_kernel',\n", 147 | " 'asymmetry_coefficient',\n", 148 | " 'length_of_groove']" 149 | ] 150 | }, 151 | "execution_count": 58, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "dataset.columns" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 59, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "vec_assembler = VectorAssembler(inputCols = dataset.columns, outputCol='features')" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 60, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "final_data = vec_assembler.transform(dataset)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "## Scale the Data\n", 187 | "It is a good idea to scale our data to deal with the curse of dimensionality: https://en.wikipedia.org/wiki/Curse_of_dimensionality" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 61, 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "from pyspark.ml.feature import StandardScaler" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 62, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "scaler = StandardScaler(inputCol=\"features\", outputCol=\"scaledFeatures\", withStd=True, withMean=False)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 63, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "# Compute summary statistics by fitting the StandardScaler\n", 221 | "scalerModel = scaler.fit(final_data)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 64, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "# Normalize each feature to have unit standard deviation.\n", 233 | "final_data = scalerModel.transform(final_data)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "## Train the Model and Evaluate" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 76, 246 | "metadata": { 247 | "collapsed": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "# Trains a k-means model.\n", 252 | "kmeans = KMeans(featuresCol='scaledFeatures',k=3)\n", 253 | "model = kmeans.fit(final_data)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 77, 259 | "metadata": { 260 | "collapsed": false 261 | }, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "Within Set Sum of Squared Errors = 429.07559671506715\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "# Evaluate clustering by computing Within Set Sum of Squared Errors.\n", 273 | "wssse = model.computeCost(final_data)\n", 274 | "print(\"Within Set Sum of Squared Errors = \" + str(wssse))" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 79, 280 | "metadata": { 281 | "collapsed": false 282 
| }, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "Cluster Centers: \n", 289 | "[ 6.31670546 12.37109759 37.39491396 13.91155062 9.748067\n", 290 | " 2.39849968 12.2661748 ]\n", 291 | "[ 4.87257659 10.88120146 37.27692543 12.3410157 8.55443412\n", 292 | " 1.81649011 10.32998598]\n", 293 | "[ 4.06105916 10.13979506 35.80536984 11.82133095 7.50395937\n", 294 | " 3.27184732 10.42126018]\n" 295 | ] 296 | } 297 | ], 298 | "source": [ 299 | "# Shows the result.\n", 300 | "centers = model.clusterCenters()\n", 301 | "print(\"Cluster Centers: \")\n", 302 | "for center in centers:\n", 303 | " print(center)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 80, 309 | "metadata": { 310 | "collapsed": false 311 | }, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "+----------+\n", 318 | "|prediction|\n", 319 | "+----------+\n", 320 | "| 1|\n", 321 | "| 1|\n", 322 | "| 1|\n", 323 | "| 1|\n", 324 | "| 1|\n", 325 | "| 1|\n", 326 | "| 1|\n", 327 | "| 1|\n", 328 | "| 0|\n", 329 | "| 0|\n", 330 | "| 1|\n", 331 | "| 1|\n", 332 | "| 1|\n", 333 | "| 1|\n", 334 | "| 1|\n", 335 | "| 1|\n", 336 | "| 1|\n", 337 | "| 1|\n", 338 | "| 1|\n", 339 | "| 2|\n", 340 | "+----------+\n", 341 | "only showing top 20 rows\n", 342 | "\n" 343 | ] 344 | } 345 | ], 346 | "source": [ 347 | "model.transform(final_data).select('prediction').show()" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "Now you are ready for your Consulting Project!\n", 355 | "# Great Job!" 356 | ] 357 | } 358 | ], 359 | "metadata": { 360 | "anaconda-cloud": {}, 361 | "kernelspec": { 362 | "display_name": "Python [conda root]", 363 | "language": "python", 364 | "name": "conda-root-py" 365 | }, 366 | "language_info": { 367 | "codemirror_mode": { 368 | "name": "ipython", 369 | "version": 3 370 | }, 371 | "file_extension": ".py", 372 | "mimetype": "text/x-python", 373 | "name": "python", 374 | "nbconvert_exporter": "python", 375 | "pygments_lexer": "ipython3", 376 | "version": "3.5.3" 377 | } 378 | }, 379 | "nbformat": 4, 380 | "nbformat_minor": 0 381 | } 382 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Clustering/.ipynb_checkpoints/Clustering_Code_Example-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Clustering Documentation Example\n", 8 | "\n", 9 | "

## K-means\n", 10 | "\n", 11 | "k-means is one of the most commonly used clustering algorithms that clusters the data points into a predefined number of clusters. The MLlib implementation includes a parallelized variant of the k-means++ method called kmeans||.\n", 12 | "\n", 13 | "KMeans is implemented as an Estimator and generates a KMeansModel as the base model.\n", 14 | "\n", 15 | "### Input Columns\n", 16 | "\n", 17 | "| Param name | Type(s) | Default | Description |\n", 18 | "| --- | --- | --- | --- |\n", 19 | "| featuresCol | Vector | \"features\" | Feature vector |\n", 20 | "\n", 21 | "### Output Columns\n", 22 | "\n", 23 | "| Param name | Type(s) | Default | Description |\n", 24 | "| --- | --- | --- | --- |\n", 25 | "| predictionCol | Int | \"prediction\" | Predicted cluster center |" 60 | ] 61 | },
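The example cell below scores the model with `computeCost` (within-set sum of squared errors), matching the Spark 2.x docs this notebook was written against. In Spark 2.3+ the docs moved to `ClusteringEvaluator` (silhouette), and `computeCost` was later deprecated; a sketch of the newer pattern, assuming a fitted `model` and the same `dataset`:

```python
from pyspark.ml.evaluation import ClusteringEvaluator

# ClusteringEvaluator needs the 'prediction' column, so transform first
predictions = model.transform(dataset)
silhouette = ClusteringEvaluator().evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))
```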
" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "metadata": { 66 | "collapsed": true 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "#Cluster methods Example\n", 71 | "from pyspark.sql import SparkSession\n", 72 | "spark = SparkSession.builder.appName('cluster').getOrCreate()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 3, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "Within Set Sum of Squared Errors = 0.11999999999994547\n", 87 | "Cluster Centers: \n", 88 | "[ 9.1 9.1 9.1]\n", 89 | "[ 0.1 0.1 0.1]\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "from pyspark.ml.clustering import KMeans\n", 95 | "\n", 96 | "# Loads data.\n", 97 | "dataset = spark.read.format(\"libsvm\").load(\"sample_kmeans_data.txt\")\n", 98 | "\n", 99 | "# Trains a k-means model.\n", 100 | "kmeans = KMeans().setK(2).setSeed(1)\n", 101 | "model = kmeans.fit(dataset)\n", 102 | "\n", 103 | "# Evaluate clustering by computing Within Set Sum of Squared Errors.\n", 104 | "wssse = model.computeCost(dataset)\n", 105 | "print(\"Within Set Sum of Squared Errors = \" + str(wssse))\n", 106 | "\n", 107 | "# Shows the result.\n", 108 | "centers = model.clusterCenters()\n", 109 | "print(\"Cluster Centers: \")\n", 110 | "for center in centers:\n", 111 | " print(center)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Alright let's code through our own example!" 119 | ] 120 | } 121 | ], 122 | "metadata": { 123 | "anaconda-cloud": {}, 124 | "kernelspec": { 125 | "display_name": "Python [conda root]", 126 | "language": "python", 127 | "name": "conda-root-py" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 3 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython3", 139 | "version": "3.5.3" 140 | } 141 | }, 142 | "nbformat": 4, 143 | "nbformat_minor": 0 144 | } 145 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Clustering/.ipynb_checkpoints/Clustering_Consulting_Project-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Clustering Consulting Project \n", 8 | "\n", 9 | "A large technology firm needs your help, they've been hacked! Luckily their forensic engineers have grabbed valuable data about the hacks, including information like session time,locations, wpm typing speed, etc. The forensic engineer relates to you what she has been able to figure out so far, she has been able to grab meta data of each session that the hackers used to connect to their servers. 
These are the features of the data:\n", 10 | "\n", 11 | "* 'Session_Connection_Time': How long the session lasted in minutes\n", 12 | "* 'Bytes Transferred': Number of MB transferred during session\n", 13 | "* 'Kali_Trace_Used': Indicates if the hacker was using Kali Linux\n", 14 | "* 'Servers_Corrupted': Number of servers corrupted during the attack\n", 15 | "* 'Pages_Corrupted': Number of pages illegally accessed\n", 16 | "* 'Location': Location attack came from (Probably useless because the hackers used VPNs)\n", 17 | "* 'WPM_Typing_Speed': Their estimated typing speed based on session logs.\n", 18 | "\n", 19 | "\n", 20 | "The technology firm has 3 potential hackers that perpetrated the attack. They're certain of the first two hackers, but they aren't very sure if the third hacker was involved or not. They have requested your help! Can you help figure out whether or not the third suspect had anything to do with the attacks, or was it just two hackers? It's probably not possible to know for sure, but maybe what you've just learned about clustering can help!\n", 21 | "\n", 22 | "**One last key fact: the forensic engineer knows that the hackers trade off attacks, meaning they should each have roughly the same number of attacks. For example, if there were 100 total attacks, then in a 2-hacker situation each should have about 50 hacks; in a 3-hacker situation each would have about 33 hacks. The engineer believes this is the key element to solving this, but doesn't know how to distinguish this unlabeled data into groups of hackers.**" 23 | ] 24 | } 25 | ],
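The solutions notebook lives alongside this one, but the key fact above translates almost directly into code: fit K-means for k=2 and k=3 and check whether the cluster sizes come out roughly equal. A sketch under the assumption that `final_data` holds scaled feature vectors built as in the code along (the column names here are assumptions):

```python
from pyspark.ml.clustering import KMeans

for k in [2, 3]:
    model = KMeans(featuresCol='scaledFeatures', k=k).fit(final_data)
    print('k =', k)
    model.transform(final_data).groupBy('prediction').count().show()

# If k=2 splits the attacks roughly 50/50 while k=3 is lopsided,
# the even trade-off fact points to two hackers.
```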
26 | "metadata": { 27 | "anaconda-cloud": {}, 28 | "kernelspec": { 29 | "display_name": "Python [conda root]", 30 | "language": "python", 31 | "name": "conda-root-py" 32 | }, 33 | "language_info": { 34 | "codemirror_mode": { 35 | "name": "ipython", 36 | "version": 3 37 | }, 38 | "file_extension": ".py", 39 | "mimetype": "text/x-python", 40 | "name": "python", 41 | "nbconvert_exporter": "python", 42 | "pygments_lexer": "ipython3", 43 | "version": "3.5.3" 44 | } 45 | }, 46 | "nbformat": 4, 47 | "nbformat_minor": 0 48 | } 49 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Clustering/.ipynb_checkpoints/Random_Forest_Doc_Example-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Random Forest Example\n", 8 | "\n", 9 | "This is just a quick walkthrough of the documentation's Random Forest example:" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "from pyspark.ml import Pipeline\n", 21 | "from pyspark.ml.classification import RandomForestClassifier\n", 22 | "from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer\n", 23 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "# Load and parse the data file, converting it to a DataFrame (assumes an active SparkSession named 'spark').\n", 35 | "data = spark.read.format(\"libsvm\").load(\"data/mllib/sample_libsvm_data.txt\")" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "# Index labels, adding metadata to the label column.\n", 47 | "# Fit on whole dataset to include all labels in index.\n", 48 | "labelIndexer = StringIndexer(inputCol=\"label\", outputCol=\"indexedLabel\").fit(data)\n", 49 | "\n", 50 | "# Automatically identify categorical features, and index them.\n", 51 | "# Set maxCategories so features with > 4 distinct values are treated as continuous.\n", 52 | "featureIndexer = VectorIndexer(inputCol=\"features\", outputCol=\"indexedFeatures\", maxCategories=4).fit(data)\n", 53 | "\n", 54 | "# Split the data into training and test sets (30% held out for testing)\n", 55 | "(trainingData, testData) = data.randomSplit([0.7, 0.3])\n", 56 | "\n", 57 | "# Train a RandomForest model.\n", 58 | "rf = RandomForestClassifier(labelCol=\"indexedLabel\", featuresCol=\"indexedFeatures\", numTrees=10)\n", 59 | "\n", 60 | "# Convert indexed labels back to original labels.\n", 61 | "labelConverter = IndexToString(inputCol=\"prediction\", outputCol=\"predictedLabel\",\n", 62 | " labels=labelIndexer.labels)\n", 63 | "\n", 64 | "# Chain indexers and forest in a Pipeline\n", 65 | "pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])\n", 66 | "\n", 67 | "# Train model. This also runs the indexers.\n", 68 | "model = pipeline.fit(trainingData)\n", 69 | "\n", 70 | "# Make predictions.\n", 71 | "predictions = model.transform(testData)\n", 72 | "\n", 73 | "# Select example rows to display.\n", 74 | "predictions.select(\"predictedLabel\", \"label\", \"features\").show(5)\n", 75 | "\n", 76 | "# Select (prediction, true label) and compute test error\n", 77 | "evaluator = MulticlassClassificationEvaluator(\n", 78 | " labelCol=\"indexedLabel\", predictionCol=\"prediction\", metricName=\"accuracy\")\n", 79 | "accuracy = evaluator.evaluate(predictions)\n", 80 | "print(\"Test Error = %g\" % (1.0 - accuracy))\n", 81 | "\n", 82 | "rfModel = model.stages[2]\n", 83 | "print(rfModel) # summary only" 84 | ] 85 | } 86 | ], 87 | "metadata": { 88 | "anaconda-cloud": {}, 89 | "kernelspec": { 90 | "display_name": "Python [conda root]", 91 | "language": "python", 92 | "name": "conda-root-py" 93 | }, 94 | "language_info": { 95 | "codemirror_mode": { 96 | "name": "ipython", 97 | "version": 3 98 | }, 99 | "file_extension": ".py", 100 | "mimetype": "text/x-python", 101 | "name": "python", 102 | "nbconvert_exporter": "python", 103 | "pygments_lexer": "ipython3", 104 | "version": "3.5.3" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 0 109 | } 110 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Clustering/Clustering Code Along.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Clustering Code Along\n", 8 | "\n", 9 | "We'll be working with a real data set about seeds, from UCI repository: https://archive.ics.uci.edu/ml/datasets/seeds." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "The examined group comprised kernels belonging to three different varieties of wheat: Kama, Rosa and Canadian, 70 elements each, randomly selected for \n", 17 | "the experiment. High quality visualization of the internal kernel structure was detected using a soft X-ray technique. It is non-destructive and considerably cheaper than other more sophisticated imaging techniques like scanning microscopy or laser technology. 
The images were recorded on 13x18 cm X-ray KODAK plates. Studies were conducted using combine harvested wheat grain originating from experimental fields, explored at the Institute of Agrophysics of the Polish Academy of Sciences in Lublin. \n", 18 | "\n", 19 | "The data set can be used for the tasks of classification and cluster analysis.\n", 20 | "\n", 21 | "\n", 22 | "Attribute Information:\n", 23 | "\n", 24 | "To construct the data, seven geometric parameters of wheat kernels were measured: \n", 25 | "1. area A, \n", 26 | "2. perimeter P, \n", 27 | "3. compactness C = 4*pi*A/P^2, \n", 28 | "4. length of kernel, \n", 29 | "5. width of kernel, \n", 30 | "6. asymmetry coefficient \n", 31 | "7. length of kernel groove. \n", 32 | "All of these parameters were real-valued continuous.\n", 33 | "\n", 34 | "Let's see if we can cluster them into 3 groups with K-means!" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 53, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "from pyspark.sql import SparkSession\n", 46 | "spark = SparkSession.builder.appName('cluster').getOrCreate()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 54, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "from pyspark.ml.clustering import KMeans\n", 58 | "\n", 59 | "# Loads data.\n", 60 | "dataset = spark.read.csv(\"seeds_dataset.csv\",header=True,inferSchema=True)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 55, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)" 74 | ] 75 | }, 76 | "execution_count": 55, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "dataset.head()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 56, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n", 97 | "|summary| area| perimeter| compactness| length_of_kernel| width_of_kernel|asymmetry_coefficient| length_of_groove|\n", 98 | "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n", 99 | "| count| 210| 210| 210| 210| 210| 210| 210|\n", 100 | "| mean|14.847523809523816|14.559285714285718| 0.8709985714285714| 5.628533333333335| 3.258604761904762| 3.7001999999999997| 5.408071428571429|\n", 101 | "| stddev|2.9096994306873647|1.3059587265640225|0.023629416583846364|0.44306347772644983|0.3777144449065867| 1.5035589702547392|0.49148049910240543|\n", 102 | "| min| 10.59| 12.41| 0.8081| 4.899| 2.63| 0.765| 4.519|\n", 103 | "| max| 21.18| 17.25| 0.9183| 6.675| 4.033| 8.456| 6.55|\n", 104 | "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n", 105 | "\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "dataset.describe().show()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "## Format the Data" 118 | ] 119 | }, 120 | { 
121 | "cell_type": "code", 122 | "execution_count": 57, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "from pyspark.ml.linalg import Vectors\n", 129 | "from pyspark.ml.feature import VectorAssembler" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 58, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "['area',\n", 143 | " 'perimeter',\n", 144 | " 'compactness',\n", 145 | " 'length_of_kernel',\n", 146 | " 'width_of_kernel',\n", 147 | " 'asymmetry_coefficient',\n", 148 | " 'length_of_groove']" 149 | ] 150 | }, 151 | "execution_count": 58, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "dataset.columns" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 59, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "vec_assembler = VectorAssembler(inputCols = dataset.columns, outputCol='features')" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 60, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "final_data = vec_assembler.transform(dataset)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "## Scale the Data\n", 187 | "It is a good idea to scale our data to deal with the curse of dimensionality: https://en.wikipedia.org/wiki/Curse_of_dimensionality" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 61, 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "from pyspark.ml.feature import StandardScaler" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 62, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "scaler = StandardScaler(inputCol=\"features\", outputCol=\"scaledFeatures\", withStd=True, withMean=False)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 63, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "# Compute summary statistics by fitting the StandardScaler\n", 221 | "scalerModel = scaler.fit(final_data)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 64, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "# Normalize each feature to have unit standard deviation.\n", 233 | "final_data = scalerModel.transform(final_data)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "## Train the Model and Evaluate" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 76, 246 | "metadata": { 247 | "collapsed": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "# Trains a k-means model.\n", 252 | "kmeans = KMeans(featuresCol='scaledFeatures',k=3)\n", 253 | "model = kmeans.fit(final_data)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 77, 259 | "metadata": { 260 | "collapsed": false 261 | }, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "Within Set Sum of Squared Errors = 429.07559671506715\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "# Evaluate clustering by computing Within Set Sum of Squared Errors.\n", 273 | "wssse 
= model.computeCost(final_data)\n", 274 | "print(\"Within Set Sum of Squared Errors = \" + str(wssse))" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 79, 280 | "metadata": { 281 | "collapsed": false 282 | }, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "Cluster Centers: \n", 289 | "[ 6.31670546 12.37109759 37.39491396 13.91155062 9.748067\n", 290 | " 2.39849968 12.2661748 ]\n", 291 | "[ 4.87257659 10.88120146 37.27692543 12.3410157 8.55443412\n", 292 | " 1.81649011 10.32998598]\n", 293 | "[ 4.06105916 10.13979506 35.80536984 11.82133095 7.50395937\n", 294 | " 3.27184732 10.42126018]\n" 295 | ] 296 | } 297 | ], 298 | "source": [ 299 | "# Shows the result.\n", 300 | "centers = model.clusterCenters()\n", 301 | "print(\"Cluster Centers: \")\n", 302 | "for center in centers:\n", 303 | " print(center)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 80, 309 | "metadata": { 310 | "collapsed": false 311 | }, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "+----------+\n", 318 | "|prediction|\n", 319 | "+----------+\n", 320 | "| 1|\n", 321 | "| 1|\n", 322 | "| 1|\n", 323 | "| 1|\n", 324 | "| 1|\n", 325 | "| 1|\n", 326 | "| 1|\n", 327 | "| 1|\n", 328 | "| 0|\n", 329 | "| 0|\n", 330 | "| 1|\n", 331 | "| 1|\n", 332 | "| 1|\n", 333 | "| 1|\n", 334 | "| 1|\n", 335 | "| 1|\n", 336 | "| 1|\n", 337 | "| 1|\n", 338 | "| 1|\n", 339 | "| 2|\n", 340 | "+----------+\n", 341 | "only showing top 20 rows\n", 342 | "\n" 343 | ] 344 | } 345 | ], 346 | "source": [ 347 | "model.transform(final_data).select('prediction').show()" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "Now you are ready for your consulting Project!\n", 355 | "# Great Job!" 356 | ] 357 | } 358 | ], 359 | "metadata": { 360 | "anaconda-cloud": {}, 361 | "kernelspec": { 362 | "display_name": "Python [conda root]", 363 | "language": "python", 364 | "name": "conda-root-py" 365 | }, 366 | "language_info": { 367 | "codemirror_mode": { 368 | "name": "ipython", 369 | "version": 3 370 | }, 371 | "file_extension": ".py", 372 | "mimetype": "text/x-python", 373 | "name": "python", 374 | "nbconvert_exporter": "python", 375 | "pygments_lexer": "ipython3", 376 | "version": "3.5.3" 377 | } 378 | }, 379 | "nbformat": 4, 380 | "nbformat_minor": 0 381 | } 382 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Clustering/Clustering_Code_Example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Clustering Documentation Example\n", 8 | "\n", 9 | "

<h2 id=\"k-means\">K-means</h2>\n",
10 | "\n",
11 | "<p>k-means is one of the\n",
12 | "most commonly used clustering algorithms that clusters the data points into a\n",
13 | "predefined number of clusters. The MLlib implementation includes a parallelized\n",
14 | "variant of the k-means++ method\n",
15 | "called kmeans||.</p>\n",
16 | "\n",
17 | "<p>KMeans is implemented as an Estimator and generates a KMeansModel as the base model.</p>\n",
18 | "\n",
19 | "<h3 id=\"input-columns\">Input Columns</h3>\n",
20 | "\n",
21 | "<table class=\"table\">\n",
22 | "  <thead>\n",
23 | "    <tr>\n",
24 | "      <th>Param name</th>\n",
25 | "      <th>Type(s)</th>\n",
26 | "      <th>Default</th>\n",
27 | "      <th>Description</th>\n",
28 | "    </tr>\n",
29 | "  </thead>\n",
30 | "  <tbody>\n",
31 | "    <tr>\n",
32 | "      <td>featuresCol</td>\n",
33 | "      <td>Vector</td>\n",
34 | "      <td>\"features\"</td>\n",
35 | "      <td>Feature vector</td>\n",
36 | "    </tr>\n",
37 | "  </tbody>\n",
38 | "</table>\n",
39 | "\n",
40 | "<h3 id=\"output-columns\">Output Columns</h3>\n",
41 | "\n",
42 | "<table class=\"table\">\n",
43 | "  <thead>\n",
44 | "    <tr>\n",
45 | "      <th>Param name</th>\n",
46 | "      <th>Type(s)</th>\n",
47 | "      <th>Default</th>\n",
48 | "      <th>Description</th>\n",
49 | "    </tr>\n",
50 | "  </thead>\n",
51 | "  <tbody>\n",
52 | "    <tr>\n",
53 | "      <td>predictionCol</td>\n",
54 | "      <td>Int</td>\n",
55 | "      <td>\"prediction\"</td>\n",
56 | "      <td>Predicted cluster center</td>\n",
57 | "    </tr>\n",
58 | "  </tbody>\n",
59 | "</table>
" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "metadata": { 66 | "collapsed": true 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "#Cluster methods Example\n", 71 | "from pyspark.sql import SparkSession\n", 72 | "spark = SparkSession.builder.appName('cluster').getOrCreate()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 3, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "Within Set Sum of Squared Errors = 0.11999999999994547\n", 87 | "Cluster Centers: \n", 88 | "[ 9.1 9.1 9.1]\n", 89 | "[ 0.1 0.1 0.1]\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "from pyspark.ml.clustering import KMeans\n", 95 | "\n", 96 | "# Loads data.\n", 97 | "dataset = spark.read.format(\"libsvm\").load(\"sample_kmeans_data.txt\")\n", 98 | "\n", 99 | "# Trains a k-means model.\n", 100 | "kmeans = KMeans().setK(2).setSeed(1)\n", 101 | "model = kmeans.fit(dataset)\n", 102 | "\n", 103 | "# Evaluate clustering by computing Within Set Sum of Squared Errors.\n", 104 | "wssse = model.computeCost(dataset)\n", 105 | "print(\"Within Set Sum of Squared Errors = \" + str(wssse))\n", 106 | "\n", 107 | "# Shows the result.\n", 108 | "centers = model.clusterCenters()\n", 109 | "print(\"Cluster Centers: \")\n", 110 | "for center in centers:\n", 111 | " print(center)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Alright let's code through our own example!" 119 | ] 120 | } 121 | ], 122 | "metadata": { 123 | "anaconda-cloud": {}, 124 | "kernelspec": { 125 | "display_name": "Python [conda root]", 126 | "language": "python", 127 | "name": "conda-root-py" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 3 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython3", 139 | "version": "3.5.3" 140 | } 141 | }, 142 | "nbformat": 4, 143 | "nbformat_minor": 0 144 | } 145 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Clustering/Clustering_Consulting_Project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Clustering Consulting Project \n", 8 | "\n", 9 | "A large technology firm needs your help, they've been hacked! Luckily their forensic engineers have grabbed valuable data about the hacks, including information like session time,locations, wpm typing speed, etc. The forensic engineer relates to you what she has been able to figure out so far, she has been able to grab meta data of each session that the hackers used to connect to their servers. 
These are the features of the data:\n", 10 | "\n", 11 | "* 'Session_Connection_Time': How long the session lasted in minutes\n", 12 | "* 'Bytes Transferred': Number of MB transferred during session\n", 13 | "* 'Kali_Trace_Used': Indicates if the hacker was using Kali Linux\n", 14 | "* 'Servers_Corrupted': Number of server corrupted during the attack\n", 15 | "* 'Pages_Corrupted': Number of pages illegally accessed\n", 16 | "* 'Location': Location attack came from (Probably useless because the hackers used VPNs)\n", 17 | "* 'WPM_Typing_Speed': Their estimated typing speed based on session logs.\n", 18 | "\n", 19 | "\n", 20 | "The technology firm has 3 potential hackers that perpetrated the attack. Their certain of the first two hackers but they aren't very sure if the third hacker was involved or not. They have requested your help! Can you help figure out whether or not the third suspect had anything to do with the attacks, or was it just two hackers? It's probably not possible to know for sure, but maybe what you've just learned about Clustering can help!\n", 21 | "\n", 22 | "**One last key fact, the forensic engineer knows that the hackers trade off attacks. Meaning they should each have roughly the same amount of attacks. For example if there were 100 total attacks, then in a 2 hacker situation each should have about 50 hacks, in a three hacker situation each would have about 33 hacks. The engineer believes this is the key element to solving this, but doesn't know how to distinguish this unlabeled data into groups of hackers.**" 23 | ] 24 | } 25 | ], 26 | "metadata": { 27 | "anaconda-cloud": {}, 28 | "kernelspec": { 29 | "display_name": "Python [conda root]", 30 | "language": "python", 31 | "name": "conda-root-py" 32 | }, 33 | "language_info": { 34 | "codemirror_mode": { 35 | "name": "ipython", 36 | "version": 3 37 | }, 38 | "file_extension": ".py", 39 | "mimetype": "text/x-python", 40 | "name": "python", 41 | "nbconvert_exporter": "python", 42 | "pygments_lexer": "ipython3", 43 | "version": "3.5.3" 44 | } 45 | }, 46 | "nbformat": 4, 47 | "nbformat_minor": 0 48 | } 49 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Clustering/sample_kmeans_data.txt: -------------------------------------------------------------------------------- 1 | 0 1:0.0 2:0.0 3:0.0 2 | 1 1:0.1 2:0.1 3:0.1 3 | 2 1:0.2 2:0.2 3:0.2 4 | 3 1:9.0 2:9.0 3:9.0 5 | 4 1:9.1 2:9.1 3:9.1 6 | 5 1:9.2 2:9.2 3:9.2 7 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Clustering/seeds_dataset.csv: -------------------------------------------------------------------------------- 1 | area,perimeter,compactness,length_of_kernel,width_of_kernel,asymmetry_coefficient,length_of_groove 2 | 15.26,14.84,0.871,5.763,3.312,2.221,5.22 3 | 14.88,14.57,0.8811,5.553999999999999,3.333,1.018,4.956 4 | 14.29,14.09,0.905,5.291,3.3369999999999997,2.699,4.825 5 | 13.84,13.94,0.8955,5.324,3.3789999999999996,2.259,4.805 6 | 16.14,14.99,0.9034,5.6579999999999995,3.562,1.355,5.175 7 | 14.38,14.21,0.8951,5.386,3.312,2.4619999999999997,4.956 8 | 14.69,14.49,0.8799,5.563,3.259,3.5860000000000003,5.218999999999999 9 | 14.11,14.1,0.8911,5.42,3.302,2.7,5.0 10 | 16.63,15.46,0.8747,6.053,3.465,2.04,5.877000000000001 11 | 16.44,15.25,0.888,5.8839999999999995,3.505,1.969,5.5329999999999995 12 | 15.26,14.85,0.8696,5.7139999999999995,3.242,4.543,5.314 13 | 14.03,14.16,0.8796,5.438,3.201,1.7169999999999999,5.001 14 | 
13.89,14.02,0.888,5.439,3.199,3.986,4.738 15 | 13.78,14.06,0.8759,5.479,3.156,3.136,4.872 16 | 13.74,14.05,0.8744,5.482,3.114,2.932,4.825 17 | 14.59,14.28,0.8993,5.351,3.333,4.185,4.781000000000001 18 | 13.99,13.83,0.9183,5.119,3.383,5.234,4.781000000000001 19 | 15.69,14.75,0.9058,5.527,3.514,1.599,5.046 20 | 14.7,14.21,0.9153,5.205,3.466,1.767,4.649 21 | 12.72,13.57,0.8686,5.226,3.049,4.102,4.914 22 | 14.16,14.4,0.8584,5.6579999999999995,3.1289999999999996,3.072,5.176 23 | 14.11,14.26,0.8722,5.52,3.168,2.688,5.218999999999999 24 | 15.88,14.9,0.8988,5.617999999999999,3.5069999999999997,0.765,5.091 25 | 12.08,13.23,0.8664,5.099,2.9360000000000004,1.415,4.961 26 | 15.01,14.76,0.8657,5.789,3.245,1.791,5.001 27 | 16.19,15.16,0.8849,5.832999999999999,3.4210000000000003,0.903,5.307 28 | 13.02,13.76,0.8641,5.395,3.0260000000000002,3.373,4.825 29 | 12.74,13.67,0.8564,5.395,2.9560000000000004,2.504,4.869 30 | 14.11,14.18,0.882,5.541,3.221,2.7539999999999996,5.038 31 | 13.45,14.02,0.8604,5.516,3.065,3.531,5.0969999999999995 32 | 13.16,13.82,0.8662,5.454,2.975,0.855,5.056 33 | 15.49,14.94,0.8724,5.757000000000001,3.3710000000000004,3.412,5.228 34 | 14.09,14.41,0.8529,5.7170000000000005,3.1860000000000004,3.92,5.2989999999999995 35 | 13.94,14.17,0.8728,5.585,3.15,2.124,5.012 36 | 15.05,14.68,0.8779,5.712000000000001,3.3280000000000003,2.129,5.36 37 | 16.12,15.0,0.9,5.709,3.485,2.27,5.443 38 | 16.2,15.27,0.8734,5.8260000000000005,3.464,2.823,5.527 39 | 17.08,15.38,0.9079,5.832000000000001,3.6830000000000003,2.9560000000000004,5.484 40 | 14.8,14.52,0.8823,5.656000000000001,3.2880000000000003,3.112,5.309 41 | 14.28,14.17,0.8944,5.397,3.298,6.685,5.001 42 | 13.54,13.85,0.8871,5.348,3.156,2.5869999999999997,5.178 43 | 13.5,13.85,0.8852,5.351,3.158,2.249,5.176 44 | 13.16,13.55,0.9009,5.138,3.201,2.461,4.783 45 | 15.5,14.86,0.882,5.877000000000001,3.3960000000000004,4.711,5.528 46 | 15.11,14.54,0.8986,5.579,3.4619999999999997,3.128,5.18 47 | 13.8,14.04,0.8794,5.376,3.155,1.56,4.961 48 | 15.36,14.76,0.8861,5.7010000000000005,3.3930000000000002,1.367,5.132000000000001 49 | 14.99,14.56,0.8883,5.57,3.377,2.958,5.175 50 | 14.79,14.52,0.8819,5.545,3.2910000000000004,2.7039999999999997,5.111000000000001 51 | 14.86,14.67,0.8676,5.678,3.258,2.129,5.351 52 | 14.43,14.4,0.8751,5.585,3.272,3.975,5.144 53 | 15.78,14.91,0.8923,5.6739999999999995,3.4339999999999997,5.593,5.136 54 | 14.49,14.61,0.8538,5.715,3.113,4.1160000000000005,5.396 55 | 14.33,14.28,0.8831,5.504,3.199,3.3280000000000003,5.224 56 | 14.52,14.6,0.8557,5.7410000000000005,3.113,1.4809999999999999,5.487 57 | 15.03,14.77,0.8658,5.702000000000001,3.2119999999999997,1.933,5.439 58 | 14.46,14.35,0.8818,5.388,3.377,2.802,5.044 59 | 14.92,14.43,0.9006,5.3839999999999995,3.412,1.1420000000000001,5.088 60 | 15.38,14.77,0.8857,5.662000000000001,3.4189999999999996,1.999,5.222 61 | 12.11,13.47,0.8392,5.159,3.032,1.5019999999999998,4.519 62 | 11.42,12.86,0.8683,5.008,2.85,2.7,4.607 63 | 11.23,12.63,0.884,4.902,2.8789999999999996,2.269,4.703 64 | 12.36,13.19,0.8923,5.0760000000000005,3.042,3.22,4.605 65 | 13.22,13.84,0.868,5.395,3.07,4.157,5.088 66 | 12.78,13.57,0.8716,5.2620000000000005,3.0260000000000002,1.176,4.782 67 | 12.88,13.5,0.8879,5.138999999999999,3.1189999999999998,2.352,4.607 68 | 14.34,14.37,0.8726,5.63,3.19,1.3130000000000002,5.15 69 | 14.01,14.29,0.8625,5.609,3.158,2.217,5.132000000000001 70 | 14.37,14.39,0.8726,5.569,3.153,1.464,5.3 71 | 12.73,13.75,0.8458,5.412000000000001,2.8819999999999997,3.533,5.067 72 | 
17.63,15.98,0.8673,6.191,3.5610000000000004,4.0760000000000005,6.06 73 | 16.84,15.67,0.8623,5.997999999999999,3.484,4.675,5.877000000000001 74 | 17.26,15.73,0.8763,5.978,3.594,4.539,5.791 75 | 19.11,16.26,0.9081,6.154,3.93,2.9360000000000004,6.079 76 | 16.82,15.51,0.8786,6.017,3.486,4.004,5.841 77 | 16.77,15.62,0.8638,5.9270000000000005,3.438,4.92,5.795 78 | 17.32,15.91,0.8599,6.064,3.403,3.824,5.922000000000001 79 | 20.71,17.23,0.8763,6.579,3.8139999999999996,4.4510000000000005,6.4510000000000005 80 | 18.94,16.49,0.875,6.445,3.639,5.064,6.362 81 | 17.12,15.55,0.8892,5.85,3.5660000000000003,2.858,5.746 82 | 16.53,15.34,0.8823,5.875,3.467,5.532,5.88 83 | 18.72,16.19,0.8977,6.006,3.8569999999999998,5.324,5.879 84 | 20.2,16.89,0.8894,6.285,3.864,5.172999999999999,6.187 85 | 19.57,16.74,0.8779,6.3839999999999995,3.772,1.472,6.273 86 | 19.51,16.71,0.878,6.3660000000000005,3.801,2.9619999999999997,6.185 87 | 18.27,16.09,0.887,6.172999999999999,3.6510000000000002,2.443,6.197 88 | 18.88,16.26,0.8969,6.084,3.764,1.649,6.109 89 | 18.98,16.66,0.8590000000000001,6.5489999999999995,3.67,3.6910000000000003,6.497999999999999 90 | 21.18,17.21,0.8989,6.5729999999999995,4.033,5.78,6.231 91 | 20.88,17.05,0.9031,6.45,4.032,5.016,6.321000000000001 92 | 20.1,16.99,0.8746,6.581,3.785,1.955,6.449 93 | 18.76,16.2,0.8984,6.172000000000001,3.7960000000000003,3.12,6.053 94 | 18.81,16.29,0.8906,6.272,3.693,3.237,6.053 95 | 18.59,16.05,0.9066,6.037000000000001,3.86,6.001,5.877000000000001 96 | 18.36,16.52,0.8452,6.666,3.485,4.933,6.4479999999999995 97 | 16.87,15.65,0.8648,6.138999999999999,3.463,3.696,5.9670000000000005 98 | 19.31,16.59,0.8815,6.341,3.81,3.477,6.2379999999999995 99 | 18.98,16.57,0.8687,6.449,3.552,2.144,6.452999999999999 100 | 18.17,16.26,0.8637,6.271,3.512,2.853,6.273 101 | 18.72,16.34,0.8809999999999999,6.218999999999999,3.6839999999999997,2.188,6.097 102 | 16.41,15.25,0.8866,5.718,3.525,4.217,5.617999999999999 103 | 17.99,15.86,0.8992,5.89,3.694,2.068,5.837000000000001 104 | 19.46,16.5,0.8985,6.1129999999999995,3.892,4.308,6.0089999999999995 105 | 19.18,16.63,0.8717,6.369,3.681,3.3569999999999998,6.229 106 | 18.95,16.42,0.8829,6.247999999999999,3.755,3.3680000000000003,6.148 107 | 18.83,16.29,0.8917,6.037000000000001,3.786,2.553,5.879 108 | 18.85,16.17,0.9056,6.152,3.806,2.843,6.2 109 | 17.63,15.86,0.88,6.0329999999999995,3.573,3.747,5.928999999999999 110 | 19.94,16.92,0.8752,6.675,3.763,3.252,6.55 111 | 18.55,16.22,0.8865,6.153,3.674,1.7380000000000002,5.894 112 | 18.45,16.12,0.8921,6.107,3.7689999999999997,2.235,5.794 113 | 19.38,16.72,0.8716,6.303,3.7910000000000004,3.678,5.965 114 | 19.13,16.31,0.9035,6.183,3.9019999999999997,2.109,5.9239999999999995 115 | 19.14,16.61,0.8722,6.2589999999999995,3.737,6.682,6.053 116 | 20.97,17.25,0.8859,6.563,3.991,4.677,6.316 117 | 19.06,16.45,0.8854,6.416,3.719,2.248,6.162999999999999 118 | 18.96,16.2,0.9077,6.051,3.897,4.334,5.75 119 | 19.15,16.45,0.889,6.245,3.815,3.0839999999999996,6.185 120 | 18.89,16.23,0.9008,6.227,3.7689999999999997,3.639,5.966 121 | 20.03,16.9,0.8811,6.492999999999999,3.8569999999999998,3.063,6.32 122 | 20.24,16.91,0.8897,6.315,3.9619999999999997,5.901,6.188 123 | 18.14,16.12,0.8772,6.059,3.563,3.6189999999999998,6.011 124 | 16.17,15.38,0.8588,5.7620000000000005,3.387,4.2860000000000005,5.702999999999999 125 | 18.43,15.97,0.9077,5.98,3.7710000000000004,2.984,5.905 126 | 15.99,14.89,0.9064,5.3629999999999995,3.582,3.3360000000000003,5.144 127 | 18.75,16.18,0.8999,6.111000000000001,3.8689999999999998,4.188,5.992000000000001 128 | 
18.65,16.41,0.8698,6.285,3.594,4.391,6.102 129 | 17.98,15.85,0.8993,5.979,3.687,2.2569999999999997,5.919 130 | 20.16,17.03,0.8735,6.513,3.773,1.91,6.185 131 | 17.55,15.66,0.8991,5.791,3.69,5.3660000000000005,5.6610000000000005 132 | 18.3,15.89,0.9108,5.979,3.755,2.8369999999999997,5.962000000000001 133 | 18.94,16.32,0.8942,6.144,3.825,2.908,5.949 134 | 15.38,14.9,0.8706,5.8839999999999995,3.2680000000000002,4.462,5.795 135 | 16.16,15.33,0.8644,5.845,3.395,4.266,5.795 136 | 15.56,14.89,0.8823,5.776,3.408,4.9719999999999995,5.847 137 | 15.38,14.66,0.899,5.477,3.465,3.6,5.439 138 | 17.36,15.76,0.8785,6.145,3.574,3.5260000000000002,5.971 139 | 15.57,15.15,0.8527,5.92,3.2310000000000003,2.64,5.879 140 | 15.6,15.11,0.858,5.832000000000001,3.286,2.725,5.752000000000001 141 | 16.23,15.18,0.885,5.872000000000001,3.472,3.7689999999999997,5.922000000000001 142 | 13.07,13.92,0.848,5.472,2.9939999999999998,5.303999999999999,5.395 143 | 13.32,13.94,0.8613,5.541,3.073,7.035,5.44 144 | 13.34,13.95,0.862,5.388999999999999,3.074,5.995,5.307 145 | 12.22,13.32,0.8652,5.224,2.967,5.468999999999999,5.221 146 | 11.82,13.4,0.8274,5.314,2.7769999999999997,4.471,5.178 147 | 11.21,13.13,0.8167,5.279,2.687,6.169,5.275 148 | 11.43,13.13,0.8335,5.176,2.719,2.221,5.132000000000001 149 | 12.49,13.46,0.8658,5.267,2.967,4.421,5.002 150 | 12.7,13.71,0.8491,5.386,2.911,3.26,5.316 151 | 10.79,12.93,0.8107,5.317,2.648,5.462000000000001,5.194 152 | 11.83,13.23,0.8496,5.263,2.84,5.195,5.307 153 | 12.01,13.52,0.8249,5.405,2.7760000000000002,6.992000000000001,5.27 154 | 12.26,13.6,0.8333,5.4079999999999995,2.833,4.756,5.36 155 | 11.18,13.04,0.8266,5.22,2.693,3.332,5.001 156 | 11.36,13.05,0.8382,5.175,2.755,4.048,5.263 157 | 11.19,13.05,0.8253,5.25,2.675,5.813,5.218999999999999 158 | 11.34,12.87,0.8596,5.053,2.8489999999999998,3.347,5.003 159 | 12.13,13.73,0.8081,5.394,2.745,4.825,5.22 160 | 11.75,13.52,0.8082,5.444,2.678,4.378,5.31 161 | 11.49,13.22,0.8263,5.303999999999999,2.695,5.388,5.31 162 | 12.54,13.67,0.8425,5.4510000000000005,2.8789999999999996,3.082,5.4910000000000005 163 | 12.02,13.33,0.8503,5.35,2.81,4.271,5.308 164 | 12.05,13.41,0.8416,5.267,2.847,4.988,5.046 165 | 12.55,13.57,0.8558,5.332999999999999,2.968,4.419,5.176 166 | 11.14,12.79,0.8558,5.011,2.7939999999999996,6.388,5.0489999999999995 167 | 12.1,13.15,0.8793,5.105,2.9410000000000003,2.201,5.056 168 | 12.44,13.59,0.8462,5.319,2.897,4.9239999999999995,5.27 169 | 12.15,13.45,0.8443,5.417000000000001,2.8369999999999997,3.638,5.337999999999999 170 | 11.35,13.12,0.8291,5.176,2.668,4.337,5.132000000000001 171 | 11.24,13.0,0.8359,5.09,2.715,3.5210000000000004,5.088 172 | 11.02,13.0,0.8189,5.325,2.701,6.735,5.162999999999999 173 | 11.55,13.1,0.8455,5.167000000000001,2.845,6.715,4.956 174 | 11.27,12.97,0.8419,5.088,2.763,4.309,5.0 175 | 11.4,13.08,0.8375,5.136,2.763,5.587999999999999,5.0889999999999995 176 | 10.83,12.96,0.8099,5.278,2.641,5.182,5.185 177 | 10.8,12.57,0.8590000000000001,4.981,2.821,4.773,5.063 178 | 11.26,13.01,0.8355,5.186,2.71,5.335,5.092 179 | 10.74,12.73,0.8329,5.145,2.642,4.702,4.963 180 | 11.48,13.05,0.8473,5.18,2.758,5.876,5.002 181 | 12.21,13.47,0.8453,5.357,2.8930000000000002,1.661,5.178 182 | 11.41,12.95,0.856,5.09,2.775,4.957,4.825 183 | 12.46,13.41,0.8706,5.236000000000001,3.017,4.987,5.147 184 | 12.19,13.36,0.8579,5.24,2.909,4.857,5.1579999999999995 185 | 11.65,13.07,0.8575,5.1080000000000005,2.85,5.209,5.135 186 | 12.89,13.77,0.8541,5.495,3.0260000000000002,6.185,5.316 187 | 
11.56,13.31,0.8198,5.3629999999999995,2.6830000000000003,4.062,5.182 188 | 11.81,13.45,0.8198,5.412999999999999,2.716,4.898,5.352 189 | 10.91,12.8,0.8372,5.088,2.675,4.178999999999999,4.956 190 | 11.23,12.82,0.8594,5.0889999999999995,2.821,7.524,4.957 191 | 10.59,12.41,0.8648,4.899,2.787,4.975,4.794 192 | 10.93,12.8,0.8390000000000001,5.046,2.717,5.398,5.045 193 | 11.27,12.86,0.8563,5.091,2.804,3.985,5.001 194 | 11.87,13.02,0.8795,5.132000000000001,2.9530000000000003,3.597,5.132000000000001 195 | 10.82,12.83,0.8256,5.18,2.63,4.853,5.0889999999999995 196 | 12.11,13.27,0.8639,5.236000000000001,2.975,4.132,5.012 197 | 12.8,13.47,0.8859999999999999,5.16,3.1260000000000003,4.873,4.914 198 | 12.79,13.53,0.8786,5.224,3.054,5.483,4.958 199 | 13.37,13.78,0.8849,5.32,3.128,4.67,5.091 200 | 12.62,13.67,0.8481,5.41,2.911,3.306,5.231 201 | 12.76,13.38,0.8964,5.073,3.155,2.8280000000000003,4.83 202 | 12.38,13.44,0.8609,5.218999999999999,2.989,5.472,5.045 203 | 12.67,13.32,0.8977,4.984,3.135,2.3,4.745 204 | 11.18,12.72,0.868,5.0089999999999995,2.81,4.051,4.828 205 | 12.7,13.41,0.8874,5.183,3.091,8.456,5.0 206 | 12.37,13.47,0.8567,5.204,2.96,3.9189999999999996,5.001 207 | 12.19,13.2,0.8783,5.1370000000000005,2.9810000000000003,3.6310000000000002,4.87 208 | 11.23,12.88,0.8511,5.14,2.795,4.325,5.003 209 | 13.2,13.66,0.8883,5.236000000000001,3.2319999999999998,8.315,5.056 210 | 11.84,13.21,0.8521,5.175,2.8360000000000003,3.5980000000000003,5.044 211 | 12.3,13.34,0.8684,5.242999999999999,2.9739999999999998,5.6370000000000005,5.063 212 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Linear_Regression/.ipynb_checkpoints/Data_Transformations-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Transformations\n", 8 | "\n", 9 | "You won't always get data in a convienent format, often you will have to deal with data that is non-numerical, such as customer names, or zipcodes, country names, etc...\n", 10 | "\n", 11 | "A big part of working with data is using your own domain knowledge to build an intuition of how to deal with the data, sometimes the best course of action is to drop the data, other times feature-engineering is a good way to go, or you could try to transform the data into something the Machine Learning Algorithms will understand.\n", 12 | "\n", 13 | "Spark has several built in methods of dealing with thse transformations, check them all out here: http://spark.apache.org/docs/latest/ml-features.html\n", 14 | "\n", 15 | "Let's see some examples of all of this!" 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "from pyspark.sql import SparkSession" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "spark = SparkSession.builder.appName('data').getOrCreate()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "df = spark.read.csv('fake_customers.csv',inferSchema=True,header=True)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "+-------+----------+-----+\n", 63 | "| Name| Phone|Group|\n", 64 | "+-------+----------+-----+\n", 65 | "| John|4085552424| A|\n", 66 | "| Mike|3105552738| B|\n", 67 | "| Cassie|4085552424| B|\n", 68 | "| Laura|3105552438| B|\n", 69 | "| Sarah|4085551234| A|\n", 70 | "| David|3105557463| C|\n", 71 | "| Zach|4085553987| C|\n", 72 | "| Kiera|3105552938| A|\n", 73 | "| Alexa|4085559467| C|\n", 74 | "|Karissa|3105553475| A|\n", 75 | "+-------+----------+-----+\n", 76 | "\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "df.show()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Data Features\n", 89 | "\n", 90 | "### StringIndexer\n", 91 | "\n", 92 | "We often have to convert string information into numerical information as a categorical feature. This is easily done with the StringIndexer Method:" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "+-------+--------+-------------+\n", 107 | "|user_id|category|categoryIndex|\n", 108 | "+-------+--------+-------------+\n", 109 | "| 0| a| 0.0|\n", 110 | "| 1| b| 2.0|\n", 111 | "| 2| c| 1.0|\n", 112 | "| 3| a| 0.0|\n", 113 | "| 4| a| 0.0|\n", 114 | "| 5| c| 1.0|\n", 115 | "+-------+--------+-------------+\n", 116 | "\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "from pyspark.ml.feature import StringIndexer\n", 122 | "\n", 123 | "df = spark.createDataFrame(\n", 124 | " [(0, \"a\"), (1, \"b\"), (2, \"c\"), (3, \"a\"), (4, \"a\"), (5, \"c\")],\n", 125 | " [\"user_id\", \"category\"])\n", 126 | "\n", 127 | "indexer = StringIndexer(inputCol=\"category\", outputCol=\"categoryIndex\")\n", 128 | "indexed = indexer.fit(df).transform(df)\n", 129 | "indexed.show()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "The next step would be to encode these categories into \"dummy\" variables." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "### VectorIndexer\n", 153 | "\n", 154 | "VectorAssembler is a transformer that combines a given list of columns into a single vector column. It is useful for combining raw features and features generated by different feature transformers into a single feature vector, in order to train ML models like logistic regression and decision trees. 
VectorAssembler accepts the following input column types: all numeric types, boolean type, and vector type. In each row, the values of the input columns will be concatenated into a vector in the specified order. \n", 155 | "\n", 156 | "Assume that we have a DataFrame with the columns id, hour, mobile, userFeatures, and clicked:\n", 157 | "\n", 158 | " id | hour | mobile | userFeatures | clicked\n", 159 | " ----|------|--------|------------------|---------\n", 160 | " 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0\n", 161 | " \n", 162 | "userFeatures is a vector column that contains three user features. We want to combine hour, mobile, and userFeatures into a single feature vector called features and use it to predict clicked or not. If we set VectorAssembler’s input columns to hour, mobile, and userFeatures and output column to features, after transformation we should get the following DataFrame:\n", 163 | "\n", 164 | " id | hour | mobile | userFeatures | clicked | features\n", 165 | " ----|------|--------|------------------|---------|-----------------------------\n", 166 | " 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0 | [18.0, 1.0, 0.0, 10.0, 0.5]" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 14, 172 | "metadata": { 173 | "collapsed": false 174 | }, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "+---+----+------+--------------+-------+\n", 181 | "| id|hour|mobile| userFeatures|clicked|\n", 182 | "+---+----+------+--------------+-------+\n", 183 | "| 0| 18| 1.0|[0.0,10.0,0.5]| 1.0|\n", 184 | "+---+----+------+--------------+-------+\n", 185 | "\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "from pyspark.ml.linalg import Vectors\n", 191 | "from pyspark.ml.feature import VectorAssembler\n", 192 | "\n", 193 | "dataset = spark.createDataFrame(\n", 194 | " [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],\n", 195 | " [\"id\", \"hour\", \"mobile\", \"userFeatures\", \"clicked\"])\n", 196 | "dataset.show()" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 15, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'\n", 211 | "+--------------------+-------+\n", 212 | "| features|clicked|\n", 213 | "+--------------------+-------+\n", 214 | "|[18.0,1.0,0.0,10....| 1.0|\n", 215 | "+--------------------+-------+\n", 216 | "\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "assembler = VectorAssembler(\n", 222 | " inputCols=[\"hour\", \"mobile\", \"userFeatures\"],\n", 223 | " outputCol=\"features\")\n", 224 | "\n", 225 | "output = assembler.transform(dataset)\n", 226 | "print(\"Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'\")\n", 227 | "output.select(\"features\", \"clicked\").show()" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "There ar emany more data transformations available, we will cover them once we encounter a need for them, for now these were the most important ones.\n", 235 | "\n", 236 | "Let's continue on to Linear Regression!" 
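,
 "\n",
 "Before moving on, a minimal sketch of the \"dummy\" variable encoding mentioned earlier. It assumes Spark 2.x, where OneHotEncoder is a plain Transformer (in Spark 3.x it is an Estimator and needs a fit() call first), and it reuses the indexed DataFrame from the StringIndexer cell above:\n",
 "\n",
 "```python\n",
 "from pyspark.ml.feature import OneHotEncoder\n",
 "\n",
 "# 'categoryIndex' is the column produced by the StringIndexer example above\n",
 "encoder = OneHotEncoder(inputCol='categoryIndex', outputCol='categoryVec')\n",
 "encoded = encoder.transform(indexed)  # Spark 3.x: encoder.fit(indexed).transform(indexed)\n",
 "encoded.show()\n",
 "```"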
237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "collapsed": true 244 | }, 245 | "outputs": [], 246 | "source": [] 247 | } 248 | ], 249 | "metadata": { 250 | "anaconda-cloud": {}, 251 | "kernelspec": { 252 | "display_name": "Python [conda root]", 253 | "language": "python", 254 | "name": "conda-root-py" 255 | }, 256 | "language_info": { 257 | "codemirror_mode": { 258 | "name": "ipython", 259 | "version": 3 260 | }, 261 | "file_extension": ".py", 262 | "mimetype": "text/x-python", 263 | "name": "python", 264 | "nbconvert_exporter": "python", 265 | "pygments_lexer": "ipython3", 266 | "version": "3.5.3" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 0 271 | } 272 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Linear_Regression/.ipynb_checkpoints/Linear_Regression_Consulting_Project-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Linear Regression Consulting Project" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "collapsed": true 14 | }, 15 | "source": [ 16 | "Congratulations! You've been contracted by Hyundai Heavy Industries to help them build a predictive model for some ships. [Hyundai Heavy Industries](http://www.hyundai.eu/en) is one of the world's largest ship manufacturing companies and builds cruise liners.\n", 17 | "\n", 18 | "You've been flown to their headquarters in Ulsan, South Korea to help them give accurate estimates of how many crew members a ship will require.\n", 19 | "\n", 20 | "They are currently building new ships for some customers and want you to create a model and use it to predict how many crew members the ships will need.\n", 21 | "\n", 22 | "Here is what the data looks like so far:\n", 23 | "\n", 24 | " Description: Measurements of ship size, capacity, crew, and age for 158 cruise\n", 25 | " ships.\n", 26 | "\n", 27 | "\n", 28 | " Variables/Columns\n", 29 | " Ship Name 1-20\n", 30 | " Cruise Line 21-40\n", 31 | " Age (as of 2013) 46-48\n", 32 | " Tonnage (1000s of tons) 50-56\n", 33 | " passengers (100s) 58-64\n", 34 | " Length (100s of feet) 66-72\n", 35 | " Cabins (100s) 74-80\n", 36 | " Passenger Density 82-88\n", 37 | " Crew (100s) 90-96\n", 38 | " \n", 39 | "It is saved in a csv file for you called \"cruise_ship_info.csv\". Your job is to create a regression model that will help predict how many crew members will be needed for future ships. The client also mentioned that they have found that particular cruise lines will differ in acceptable crew counts, so it is most likely an important feature to include in your analysis! \n", 40 | "\n", 41 | "Once you've created the model and tested it for a quick check on how well you can expect it to perform, make sure you take a look at why it performs so well!" 
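,
 "\n",
 "A minimal starter sketch, not the full solution: the column names below are taken from cruise_ship_info.csv, while the app name and the 70/30 split are just placeholder choices:\n",
 "\n",
 "```python\n",
 "from pyspark.sql import SparkSession\n",
 "from pyspark.ml.feature import StringIndexer, VectorAssembler\n",
 "from pyspark.ml.regression import LinearRegression\n",
 "\n",
 "spark = SparkSession.builder.appName('cruise').getOrCreate()\n",
 "df = spark.read.csv('cruise_ship_info.csv', inferSchema=True, header=True)\n",
 "\n",
 "# Cruise_line is a string, so index it into a numeric category first\n",
 "df = StringIndexer(inputCol='Cruise_line', outputCol='cruise_cat').fit(df).transform(df)\n",
 "\n",
 "assembler = VectorAssembler(\n",
 "    inputCols=['Age', 'Tonnage', 'passengers', 'length',\n",
 "               'cabins', 'passenger_density', 'cruise_cat'],\n",
 "    outputCol='features')\n",
 "final_data = assembler.transform(df).select('features', 'crew')\n",
 "\n",
 "train_data, test_data = final_data.randomSplit([0.7, 0.3])\n",
 "lr_model = LinearRegression(labelCol='crew').fit(train_data)\n",
 "print(lr_model.evaluate(test_data).rootMeanSquaredError)\n",
 "```"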
42 | ] 43 | } 44 | ], 45 | "metadata": { 46 | "anaconda-cloud": {}, 47 | "kernelspec": { 48 | "display_name": "Python [conda root]", 49 | "language": "python", 50 | "name": "conda-root-py" 51 | }, 52 | "language_info": { 53 | "codemirror_mode": { 54 | "name": "ipython", 55 | "version": 3 56 | }, 57 | "file_extension": ".py", 58 | "mimetype": "text/x-python", 59 | "name": "python", 60 | "nbconvert_exporter": "python", 61 | "pygments_lexer": "ipython3", 62 | "version": "3.5.3" 63 | } 64 | }, 65 | "nbformat": 4, 66 | "nbformat_minor": 0 67 | } 68 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Linear_Regression/Data_Transformations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Transformations\n", 8 | "\n", 9 | "You won't always get data in a convienent format, often you will have to deal with data that is non-numerical, such as customer names, or zipcodes, country names, etc...\n", 10 | "\n", 11 | "A big part of working with data is using your own domain knowledge to build an intuition of how to deal with the data, sometimes the best course of action is to drop the data, other times feature-engineering is a good way to go, or you could try to transform the data into something the Machine Learning Algorithms will understand.\n", 12 | "\n", 13 | "Spark has several built in methods of dealing with thse transformations, check them all out here: http://spark.apache.org/docs/latest/ml-features.html\n", 14 | "\n", 15 | "Let's see some examples of all of this!" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "from pyspark.sql import SparkSession" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "spark = SparkSession.builder.appName('data').getOrCreate()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "df = spark.read.csv('fake_customers.csv',inferSchema=True,header=True)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "+-------+----------+-----+\n", 63 | "| Name| Phone|Group|\n", 64 | "+-------+----------+-----+\n", 65 | "| John|4085552424| A|\n", 66 | "| Mike|3105552738| B|\n", 67 | "| Cassie|4085552424| B|\n", 68 | "| Laura|3105552438| B|\n", 69 | "| Sarah|4085551234| A|\n", 70 | "| David|3105557463| C|\n", 71 | "| Zach|4085553987| C|\n", 72 | "| Kiera|3105552938| A|\n", 73 | "| Alexa|4085559467| C|\n", 74 | "|Karissa|3105553475| A|\n", 75 | "+-------+----------+-----+\n", 76 | "\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "df.show()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Data Features\n", 89 | "\n", 90 | "### StringIndexer\n", 91 | "\n", 92 | "We often have to convert string information into numerical information as a categorical feature. 
This is easily done with the StringIndexer Method:" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "+-------+--------+-------------+\n", 107 | "|user_id|category|categoryIndex|\n", 108 | "+-------+--------+-------------+\n", 109 | "| 0| a| 0.0|\n", 110 | "| 1| b| 2.0|\n", 111 | "| 2| c| 1.0|\n", 112 | "| 3| a| 0.0|\n", 113 | "| 4| a| 0.0|\n", 114 | "| 5| c| 1.0|\n", 115 | "+-------+--------+-------------+\n", 116 | "\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "from pyspark.ml.feature import StringIndexer\n", 122 | "\n", 123 | "df = spark.createDataFrame(\n", 124 | " [(0, \"a\"), (1, \"b\"), (2, \"c\"), (3, \"a\"), (4, \"a\"), (5, \"c\")],\n", 125 | " [\"user_id\", \"category\"])\n", 126 | "\n", 127 | "indexer = StringIndexer(inputCol=\"category\", outputCol=\"categoryIndex\")\n", 128 | "indexed = indexer.fit(df).transform(df)\n", 129 | "indexed.show()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "The next step would be to encode these categories into \"dummy\" variables." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "### VectorIndexer\n", 153 | "\n", 154 | "VectorAssembler is a transformer that combines a given list of columns into a single vector column. It is useful for combining raw features and features generated by different feature transformers into a single feature vector, in order to train ML models like logistic regression and decision trees. VectorAssembler accepts the following input column types: all numeric types, boolean type, and vector type. In each row, the values of the input columns will be concatenated into a vector in the specified order. \n", 155 | "\n", 156 | "Assume that we have a DataFrame with the columns id, hour, mobile, userFeatures, and clicked:\n", 157 | "\n", 158 | " id | hour | mobile | userFeatures | clicked\n", 159 | " ----|------|--------|------------------|---------\n", 160 | " 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0\n", 161 | " \n", 162 | "userFeatures is a vector column that contains three user features. We want to combine hour, mobile, and userFeatures into a single feature vector called features and use it to predict clicked or not. 
If we set VectorAssembler’s input columns to hour, mobile, and userFeatures and output column to features, after transformation we should get the following DataFrame:\n", 163 | "\n", 164 | " id | hour | mobile | userFeatures | clicked | features\n", 165 | " ----|------|--------|------------------|---------|-----------------------------\n", 166 | " 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0 | [18.0, 1.0, 0.0, 10.0, 0.5]" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 14, 172 | "metadata": { 173 | "collapsed": false 174 | }, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "+---+----+------+--------------+-------+\n", 181 | "| id|hour|mobile| userFeatures|clicked|\n", 182 | "+---+----+------+--------------+-------+\n", 183 | "| 0| 18| 1.0|[0.0,10.0,0.5]| 1.0|\n", 184 | "+---+----+------+--------------+-------+\n", 185 | "\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "from pyspark.ml.linalg import Vectors\n", 191 | "from pyspark.ml.feature import VectorAssembler\n", 192 | "\n", 193 | "dataset = spark.createDataFrame(\n", 194 | " [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],\n", 195 | " [\"id\", \"hour\", \"mobile\", \"userFeatures\", \"clicked\"])\n", 196 | "dataset.show()" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 15, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'\n", 211 | "+--------------------+-------+\n", 212 | "| features|clicked|\n", 213 | "+--------------------+-------+\n", 214 | "|[18.0,1.0,0.0,10....| 1.0|\n", 215 | "+--------------------+-------+\n", 216 | "\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "assembler = VectorAssembler(\n", 222 | " inputCols=[\"hour\", \"mobile\", \"userFeatures\"],\n", 223 | " outputCol=\"features\")\n", 224 | "\n", 225 | "output = assembler.transform(dataset)\n", 226 | "print(\"Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'\")\n", 227 | "output.select(\"features\", \"clicked\").show()" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "There ar emany more data transformations available, we will cover them once we encounter a need for them, for now these were the most important ones.\n", 235 | "\n", 236 | "Let's continue on to Linear Regression!" 
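,
 "\n",
 "Before moving on, a minimal sketch of the \"dummy\" variable encoding mentioned earlier. It assumes Spark 2.x, where OneHotEncoder is a plain Transformer (in Spark 3.x it is an Estimator and needs a fit() call first), and it reuses the indexed DataFrame from the StringIndexer cell above:\n",
 "\n",
 "```python\n",
 "from pyspark.ml.feature import OneHotEncoder\n",
 "\n",
 "# 'categoryIndex' is the column produced by the StringIndexer example above\n",
 "encoder = OneHotEncoder(inputCol='categoryIndex', outputCol='categoryVec')\n",
 "encoded = encoder.transform(indexed)  # Spark 3.x: encoder.fit(indexed).transform(indexed)\n",
 "encoded.show()\n",
 "```"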
237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "collapsed": true 244 | }, 245 | "outputs": [], 246 | "source": [] 247 | } 248 | ], 249 | "metadata": { 250 | "anaconda-cloud": {}, 251 | "kernelspec": { 252 | "display_name": "Python [conda root]", 253 | "language": "python", 254 | "name": "conda-root-py" 255 | }, 256 | "language_info": { 257 | "codemirror_mode": { 258 | "name": "ipython", 259 | "version": 3 260 | }, 261 | "file_extension": ".py", 262 | "mimetype": "text/x-python", 263 | "name": "python", 264 | "nbconvert_exporter": "python", 265 | "pygments_lexer": "ipython3", 266 | "version": "3.5.3" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 0 271 | } 272 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Linear_Regression/Linear_Regression_Consulting_Project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Linear Regression Consulting Project" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "collapsed": true 14 | }, 15 | "source": [ 16 | "Congratulations! You've been contracted by Hyundai Heavy Industries to help them build a predictive model for some ships. [Hyundai Heavy Industries](http://www.hyundai.eu/en) is one of the world's largest ship manufacturing companies and builds cruise liners.\n", 17 | "\n", 18 | "You've been flown to their headquarters in Ulsan, South Korea to help them give accurate estimates of how many crew members a ship will require.\n", 19 | "\n", 20 | "They are currently building new ships for some customers and want you to create a model and use it to predict how many crew members the ships will need.\n", 21 | "\n", 22 | "Here is what the data looks like so far:\n", 23 | "\n", 24 | " Description: Measurements of ship size, capacity, crew, and age for 158 cruise\n", 25 | " ships.\n", 26 | "\n", 27 | "\n", 28 | " Variables/Columns\n", 29 | " Ship Name 1-20\n", 30 | " Cruise Line 21-40\n", 31 | " Age (as of 2013) 46-48\n", 32 | " Tonnage (1000s of tons) 50-56\n", 33 | " passengers (100s) 58-64\n", 34 | " Length (100s of feet) 66-72\n", 35 | " Cabins (100s) 74-80\n", 36 | " Passenger Density 82-88\n", 37 | " Crew (100s) 90-96\n", 38 | " \n", 39 | "It is saved in a csv file for you called \"cruise_ship_info.csv\". Your job is to create a regression model that will help predict how many crew members will be needed for future ships. The client also mentioned that they have found that particular cruise lines will differ in acceptable crew counts, so it is most likely an important feature to include in your analysis! \n", 40 | "\n", 41 | "Once you've created the model and tested it for a quick check on how well you can expect it to perform, make sure you take a look at why it performs so well!" 
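,
 "\n",
 "A minimal starter sketch, not the full solution: the column names below are taken from cruise_ship_info.csv, while the app name and the 70/30 split are just placeholder choices:\n",
 "\n",
 "```python\n",
 "from pyspark.sql import SparkSession\n",
 "from pyspark.ml.feature import StringIndexer, VectorAssembler\n",
 "from pyspark.ml.regression import LinearRegression\n",
 "\n",
 "spark = SparkSession.builder.appName('cruise').getOrCreate()\n",
 "df = spark.read.csv('cruise_ship_info.csv', inferSchema=True, header=True)\n",
 "\n",
 "# Cruise_line is a string, so index it into a numeric category first\n",
 "df = StringIndexer(inputCol='Cruise_line', outputCol='cruise_cat').fit(df).transform(df)\n",
 "\n",
 "assembler = VectorAssembler(\n",
 "    inputCols=['Age', 'Tonnage', 'passengers', 'length',\n",
 "               'cabins', 'passenger_density', 'cruise_cat'],\n",
 "    outputCol='features')\n",
 "final_data = assembler.transform(df).select('features', 'crew')\n",
 "\n",
 "train_data, test_data = final_data.randomSplit([0.7, 0.3])\n",
 "lr_model = LinearRegression(labelCol='crew').fit(train_data)\n",
 "print(lr_model.evaluate(test_data).rootMeanSquaredError)\n",
 "```"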
42 | ] 43 | } 44 | ], 45 | "metadata": { 46 | "anaconda-cloud": {}, 47 | "kernelspec": { 48 | "display_name": "Python [conda root]", 49 | "language": "python", 50 | "name": "conda-root-py" 51 | }, 52 | "language_info": { 53 | "codemirror_mode": { 54 | "name": "ipython", 55 | "version": 3 56 | }, 57 | "file_extension": ".py", 58 | "mimetype": "text/x-python", 59 | "name": "python", 60 | "nbconvert_exporter": "python", 61 | "pygments_lexer": "ipython3", 62 | "version": "3.5.3" 63 | } 64 | }, 65 | "nbformat": 4, 66 | "nbformat_minor": 0 67 | } 68 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Linear_Regression/cruise_ship_info.csv: -------------------------------------------------------------------------------- 1 | Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew 2 | Journey,Azamara,6,30.276999999999997,6.94,5.94,3.55,42.64,3.55 3 | Quest,Azamara,6,30.276999999999997,6.94,5.94,3.55,42.64,3.55 4 | Celebration,Carnival,26,47.262,14.86,7.22,7.43,31.8,6.7 5 | Conquest,Carnival,11,110.0,29.74,9.53,14.88,36.99,19.1 6 | Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.0 7 | Ecstasy,Carnival,22,70.367,20.52,8.55,10.2,34.29,9.2 8 | Elation,Carnival,15,70.367,20.52,8.55,10.2,34.29,9.2 9 | Fantasy,Carnival,23,70.367,20.56,8.55,10.22,34.23,9.2 10 | Fascination,Carnival,19,70.367,20.52,8.55,10.2,34.29,9.2 11 | Freedom,Carnival,6,110.23899999999999,37.0,9.51,14.87,29.79,11.5 12 | Glory,Carnival,10,110.0,29.74,9.51,14.87,36.99,11.6 13 | Holiday,Carnival,28,46.052,14.52,7.27,7.26,31.72,6.6 14 | Imagination,Carnival,18,70.367,20.52,8.55,10.2,34.29,9.2 15 | Inspiration,Carnival,17,70.367,20.52,8.55,10.2,34.29,9.2 16 | Legend,Carnival,11,86.0,21.24,9.63,10.62,40.49,9.3 17 | Liberty*,Carnival,8,110.0,29.74,9.51,14.87,36.99,11.6 18 | Miracle,Carnival,9,88.5,21.24,9.63,10.62,41.67,10.3 19 | Paradise,Carnival,15,70.367,20.52,8.55,10.2,34.29,9.2 20 | Pride,Carnival,12,88.5,21.24,9.63,11.62,41.67,9.3 21 | Sensation,Carnival,20,70.367,20.52,8.55,10.2,34.29,9.2 22 | Spirit,Carnival,12,88.5,21.24,9.63,10.56,41.67,10.29 23 | Triumph,Carnival,14,101.509,27.58,8.93,13.21,36.81,10.0 24 | Valor,Carnival,9,110.0,29.74,9.52,14.87,36.99,11.6 25 | Victory,Carnival,13,101.509,27.58,8.93,13.79,36.81,11.5 26 | Century,Celebrity,18,70.60600000000001,17.7,8.15,8.75,39.89,8.58 27 | Constellation,Celebrity,11,91.0,20.32,9.65,9.75,44.78,9.99 28 | Galaxy,Celebrity,17,77.71300000000001,18.9,8.66,9.35,41.12,9.09 29 | Infinity,Celebrity,12,91.0,20.32,9.65,9.75,44.78,9.99 30 | Mercury,Celebrity,16,77.71300000000001,18.82,8.66,9.35,41.29,9.09 31 | Millenium,Celebrity,13,91.0,20.32,9.65,9.75,44.78,9.99 32 | Solstice,Celebrity,5,122.0,28.5,10.33,6.87,34.57,6.7 33 | Summit,Celebrity,12,91.0,20.32,9.65,9.75,44.78,9.99 34 | Xpedition,Celebrity,12,2.329,0.94,2.96,0.45,24.78,0.6 35 | Zenith,Celebrity,21,47.225,13.66,6.82,6.87,34.57,6.7 36 | Allegra,Costa,21,28.43,8.08,6.16,4.1,35.19,4.0 37 | Atlantica,Costa,13,85.619,21.14,9.57,10.56,40.5,9.2 38 | Classica,Costa,22,52.926,13.02,7.18,6.54,40.65,6.17 39 | Europa,Costa,27,53.872,14.94,7.98,7.67,36.06,6.36 40 | Fortuna,Costa,10,105.0,27.2,8.9,13.56,38.6,10.68 41 | Magica,Costa,9,105.0,27.2,8.9,13.56,38.6,10.68 42 | Marina,Costa,23,25.0,7.76,6.22,3.86,32.22,3.85 43 | Mediterranea,Costa,10,86.0,21.14,9.6,10.56,40.68,9.2 44 | Romantica,Costa,20,53.049,13.44,7.22,6.78,39.47,6.0 45 | Serena,Costa,6,112.0,38.0,9.51,15.0,29.47,10.9 46 | Victoria,Costa,17,75.166,19.28,8.28,9.64,38.99,7.66 47 | 
Serenity,Crystal,10,68.0,10.8,7.9,5.5,62.96,6.36 48 | Symphony,Crystal,18,51.004,9.4,7.81,4.8,54.26,5.45 49 | QueenElizabethII,Cunard,44,70.327,17.91,9.63,9.5,39.27,9.21 50 | QueenMary2,Cunard,10,151.4,26.2,11.32,11.34,57.79,12.53 51 | QueenVictoria,Cunard,6,90.0,20.0,9.64,10.29,45.0,9.0 52 | Magic,Disney,15,83.338,17.5,9.64,8.75,47.62,9.45 53 | Wonder,Disney,14,83.0,17.5,9.64,8.75,47.43,9.45 54 | Amsterdam,Holland_American,13,61.0,13.8,7.8,6.88,44.2,6.0 55 | Eurodam,Holland_American,5,86.0,21.04,9.36,10.22,40.87,8.0 56 | Maasdam,Holland_American,20,55.451,12.64,7.19,6.32,43.87,5.57 57 | Noordam,Holland_American,29,33.92,12.14,7.04,6.07,27.94,5.3 58 | Oosterdam,Holland_American,10,81.76899999999999,18.48,9.59,9.24,44.25,8.42 59 | Prinsendam,Holland_American,25,38.0,7.49,6.74,3.96,50.73,4.6 60 | Rotterdam,Holland_American,16,59.652,13.2,7.77,6.6,45.19,6.44 61 | Ryndam,Holland_American,19,55.451,12.66,7.19,6.33,43.8,5.88 62 | Statendam,Holland_American,20,55.451,12.66,7.19,6.33,43.8,5.88 63 | Veendam,Holland_American,17,55.451,12.66,7.19,6.33,43.8,5.88 64 | Volendam,Holland_American,14,63.0,14.4,7.77,7.2,43.75,5.61 65 | Westerdam,Holland_American,27,53.872,14.94,7.98,7.47,36.06,6.12 66 | Zaandam,Holland_American,13,63.0,14.4,7.77,7.2,43.75,5.31 67 | Zuiderdam,Holland_American,11,85.0,18.48,9.51,9.24,46.0,8.0 68 | Armonia,MSC,12,58.6,15.66,8.24,7.83,37.42,7.0 69 | Fantasia,MSC,5,133.5,39.59,10.93,16.37,33.72,13.13 70 | Lirica,MSC,10,58.825,15.6,8.23,7.65,37.71,7.0 71 | Melody,MSC,31,35.143,12.5,6.69,5.32,28.11,5.35 72 | Musica,MSC,7,89.6,25.5,9.61,12.75,35.14,9.87 73 | Opera,MSC,9,59.058,17.0,7.63,8.5,34.74,7.4 74 | Rhapsody,MSC,36,16.852,9.52,5.41,3.83,17.7,2.97 75 | Sinfonia,MSC,11,58.6,15.66,8.23,7.83,37.42,7.6 76 | Crown,Norwegian,25,34.25,10.52,6.15,5.26,32.56,4.7 77 | Dawn,Norwegian,11,90.0,22.4,9.65,11.2,40.18,11.0 78 | Dream,Norwegian,21,50.76,17.48,7.54,8.74,29.04,6.14 79 | Gem,Norwegian,6,93.0,23.94,9.65,11.97,38.85,11.09 80 | Jewel,Norwegian,8,91.0,22.44,9.65,11.22,40.55,11.0 81 | Majesty,Norwegian,21,38.0,10.56,5.67,5.28,35.98,4.38 82 | PrideofAloha,Norwegian,14,77.104,20.02,8.53,10.01,38.51,8.0 83 | PrideofAmerica,Norwegian,9,81.0,21.44,9.21,10.72,37.78,10.0 84 | Sea,Norwegian,25,42.0,15.04,7.08,7.52,27.93,6.3 85 | Spirit,Norwegian,15,75.33800000000001,19.56,8.79,9.83,38.52,13.0 86 | Star,Norwegian,40,28.0,11.5,6.74,4.0,24.35,3.8 87 | Sun,Norwegian,12,77.104,20.02,8.53,10.01,38.51,9.59 88 | Wind,Norwegian,20,50.76,17.48,7.54,8.74,29.04,6.14 89 | Insignia,Oceania,15,30.276999999999997,6.84,5.94,3.42,44.26,4.0 90 | Nautica,Oceania,13,30.276999999999997,6.84,5.94,3.42,44.26,4.0 91 | Regatta,Oceania,15,30.276999999999997,6.84,5.94,3.42,44.26,4.0 92 | MarcoPolo,Orient,48,22.08,8.26,5.78,4.25,26.73,3.5 93 | Arcadia,P&O,9,85.0,19.68,9.35,9.84,43.19,8.69 94 | Artemis,P&O,29,45.0,11.78,7.54,5.3,38.2,5.2 95 | Aurora,P&O,13,76.0,18.74,8.86,9.39,40.55,8.5 96 | Oceana,P&O,10,77.0,20.16,8.56,9.75,38.19,9.0 97 | Oriana,P&O,18,69.153,18.82,8.53,9.14,36.74,7.94 98 | Ventura,P&O,5,115.0,35.74,9.0,15.32,32.18,12.2 99 | Caribbean,Princess,9,116.0,26.0,9.51,13.0,44.62,11.0 100 | Coral,Princess,11,91.62700000000001,19.74,9.64,9.87,46.42,9.0 101 | Crown,Princess,7,116.0,31.0,9.51,15.57,37.42,12.0 102 | Dawn,Princess,16,77.499,19.5,8.56,10.5,39.74,9.0 103 | Diamond,Princess,9,113.0,26.74,9.51,13.37,42.26,12.38 104 | Emerald,Princess,6,113.0,37.82,9.51,15.57,29.88,12.0 105 | Golden,Princess,12,108.865,27.58,9.51,13.0,39.47,11.0 106 | Grand,Princess,15,108.806,26.0,9.51,13.0,41.85,11.1 107 | 
Island,Princess,10,91.62700000000001,19.74,9.64,9.87,46.42,9.0 108 | Pacific,Princess,14,30.276999999999997,6.86,5.93,3.44,44.14,3.73 109 | Regal,Princess,22,69.845,15.9,8.03,7.95,43.93,6.96 110 | Royal,Princess,29,44.348,12.0,7.54,6.0,36.96,5.2 111 | Saphire,Princess,9,113.0,26.74,9.51,13.37,42.26,12.38 112 | Sea,Princess,8,77.499,19.5,8.56,9.75,39.74,9.0 113 | Star,Princess,11,108.977,26.02,9.51,13.01,41.88,12.0 114 | Sun,Princess,18,77.499,19.5,8.56,9.75,39.74,9.0 115 | Tahitian,Princess,14,30.276999999999997,6.88,5.93,3.44,44.01,3.73 116 | ExplorerII,Regent_Seven_Seas,27,12.5,3.94,4.36,0.88,31.73,1.46 117 | Mariner,Regent_Seven_Seas,12,50.0,7.0,7.09,3.54,71.43,4.45 118 | Navigator,Regent_Seven_Seas,14,33.0,4.9,5.6,2.45,67.35,3.24 119 | PaulGauguin,Regent_Seven_Seas,16,19.2,3.2,5.13,1.6,60.0,2.11 120 | Voyager,Regent_Seven_Seas,10,46.0,7.0,6.7,1.82,65.71,4.47 121 | Adventure,Royal_Caribbean,12,138.0,31.14,10.2,15.57,44.32,11.85 122 | Brilliance,Royal_Caribbean,11,90.09,25.01,9.62,10.5,36.02,8.48 123 | Empress,Royal_Caribbean,23,48.563,20.2,6.92,8.0,24.04,6.71 124 | Enchantment,Royal_Caribbean,16,74.137,19.5,9.16,9.75,38.02,7.6 125 | Explorer,Royal_Caribbean,13,138.0,31.14,10.2,15.57,44.32,11.76 126 | Freedom,Royal_Caribbean,7,158.0,43.7,11.12,18.0,36.16,13.6 127 | Grandeur,Royal_Caribbean,17,74.137,19.5,9.16,9.75,38.02,7.6 128 | Independence,Royal_Caribbean,5,160.0,36.34,11.12,18.17,44.03,13.6 129 | Jewel,Royal_Caribbean,9,90.09,25.01,9.62,10.94,36.02,8.69 130 | Legend,Royal_Caribbean,18,70.0,18.0,8.67,9.0,38.89,7.2 131 | Liberty,Royal_Caribbean,6,158.0,43.7,11.25,18.0,36.16,13.6 132 | Majesty,Royal_Caribbean,21,73.941,27.44,8.8,11.75,26.95,8.22 133 | Mariner,Royal_Caribbean,10,138.0,31.14,10.2,15.57,44.32,11.85 134 | Monarch,Royal_Caribbean,22,73.941,27.44,8.8,11.77,30.94,8.22 135 | Navigator,Royal_Caribbean,11,138.0,31.14,10.2,15.57,44.32,11.85 136 | Oasis,Royal_Caribbean,4,220.0,54.0,11.82,27.0,40.74,21.0 137 | Radiance,Royal_Caribbean,12,90.09,25.01,9.62,10.5,36.02,8.68 138 | Rhapsody,Royal_Caribbean,16,78.491,24.35,9.15,10.0,32.23,7.65 139 | Serenade,Royal_Caribbean,10,90.09,25.01,9.62,10.5,36.02,8.58 140 | Sovreign,Royal_Caribbean,25,73.192,28.52,8.8,11.38,25.66,8.08 141 | Splendour,Royal_Caribbean,17,70.0,20.76,8.67,9.02,33.72,7.2 142 | Vision,Royal_Caribbean,15,78.491,24.35,9.15,10.0,32.23,6.6 143 | Voyager,Royal_Caribbean,14,138.0,31.14,10.2,15.57,44.32,11.76 144 | Legend,Seabourn,21,10.0,2.08,4.4,1.04,48.08,1.6 145 | Pride,Seabourn,27,10.0,2.08,4.4,1.04,48.08,1.6 146 | Spirit,Seabourn,24,10.0,2.08,4.4,1.04,48.08,1.6 147 | Cloud,Silversea,19,16.8,2.96,5.14,1.48,56.76,2.1 148 | Shadow,Silversea,13,25.0,3.82,5.97,1.94,65.45,2.95 149 | Whisper,Silversea,12,25.0,3.88,5.97,1.94,64.43,2.87 150 | Wind,Silversea,19,16.8,2.96,5.14,1.48,56.76,1.97 151 | Aries,Star,22,3.341,0.66,2.8,0.33,50.62,0.59 152 | Gemini,Star,21,19.093,8.0,5.37,4.0,23.87,4.7 153 | Libra,Star,12,42.0,14.8,7.13,7.4,28.38,6.8 154 | Pisces,Star,24,40.053000000000004,12.87,5.79,7.76,31.12,7.5 155 | Taurus,Star,22,3.341,0.66,2.79,0.33,50.62,0.59 156 | Virgo,Star,14,76.8,19.6,8.79,9.67,39.18,12.0 157 | Spirit,Windstar,25,5.35,1.58,4.4,0.74,33.86,0.88 158 | Star,Windstar,27,5.35,1.67,4.4,0.74,32.04,0.88 159 | Surf,Windstar,23,14.745,3.08,6.17,1.56,47.87,1.8 160 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Linear_Regression/fake_customers.csv: -------------------------------------------------------------------------------- 1 | Name,Phone,Group 2 | 
John,4085552424,A 3 | Mike,3105552738,B 4 | Cassie,4085552424,B 5 | Laura,3105552438,B 6 | Sarah,4085551234,A 7 | David,3105557463,C 8 | Zach,4085553987,C 9 | Kiera,3105552938,A 10 | Alexa,4085559467,C 11 | Karissa,3105553475,A 12 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Logistic_Regression/.ipynb_checkpoints/Logistic_Regression_Consulting_Project-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Logistic Regression Consulting Project" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "collapsed": true 14 | }, 15 | "source": [ 16 | "## Binary Customer Churn\n", 17 | "\n", 18 | "A marketing agency has many customers that use their service to produce ads for the client/customer websites. They've noticed that they have quite a bit of churn in clients. They basically randomly assign account managers right now, but want you to create a machine learning model that will help predict which customers will churn (stop buying their service) so that they can correctly assign the customers most at risk to churn an account manager. Luckily they have some historical data, can you help them out? Create a classification algorithm that will help classify whether or not a customer churned. Then the company can test this against incoming data for future customers to predict which customers will churn and assign them an account manager.\n", 19 | "\n", 20 | "The data is saved as customer_churn.csv. Here are the fields and their definitions:\n", 21 | "\n", 22 | " Name : Name of the latest contact at Company\n", 23 | " Age: Customer Age\n", 24 | " Total_Purchase: Total Ads Purchased\n", 25 | " Account_Manager: Binary 0=No manager, 1= Account manager assigned\n", 26 | " Years: Totaly Years as a customer\n", 27 | " Num_sites: Number of websites that use the service.\n", 28 | " Onboard_date: Date that the name of the latest contact was onboarded\n", 29 | " Location: Client HQ Address\n", 30 | " Company: Name of Client Company\n", 31 | " \n", 32 | "Once you've created the model and evaluated it, test out the model on some new data (you can think of this almost like a hold-out set) that your client has provided, saved under new_customers.csv. The client wants to know which customers are most likely to churn given this data (they don't have the label yet)." 
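,
 "\n",
 "A minimal starter sketch, not the full solution: the feature names come from the field list above, and the label column is assumed to be called 'Churn' in customer_churn.csv:\n",
 "\n",
 "```python\n",
 "from pyspark.sql import SparkSession\n",
 "from pyspark.ml.feature import VectorAssembler\n",
 "from pyspark.ml.classification import LogisticRegression\n",
 "\n",
 "spark = SparkSession.builder.appName('churn').getOrCreate()\n",
 "data = spark.read.csv('customer_churn.csv', inferSchema=True, header=True)\n",
 "\n",
 "# Numeric fields from the description; 'Churn' is an assumed label name\n",
 "assembler = VectorAssembler(\n",
 "    inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_sites'],\n",
 "    outputCol='features')\n",
 "final_data = assembler.transform(data).select('features', 'Churn')\n",
 "\n",
 "train_data, test_data = final_data.randomSplit([0.7, 0.3])\n",
 "log_reg_model = LogisticRegression(labelCol='Churn').fit(train_data)\n",
 "log_reg_model.evaluate(test_data).predictions.show()\n",
 "```"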
33 | ] 34 | } 35 | ], 36 | "metadata": { 37 | "anaconda-cloud": {}, 38 | "kernelspec": { 39 | "display_name": "Python [conda root]", 40 | "language": "python", 41 | "name": "conda-root-py" 42 | }, 43 | "language_info": { 44 | "codemirror_mode": { 45 | "name": "ipython", 46 | "version": 3 47 | }, 48 | "file_extension": ".py", 49 | "mimetype": "text/x-python", 50 | "name": "python", 51 | "nbconvert_exporter": "python", 52 | "pygments_lexer": "ipython3", 53 | "version": "3.5.3" 54 | } 55 | }, 56 | "nbformat": 4, 57 | "nbformat_minor": 0 58 | } 59 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Logistic_Regression/Logistic_Regression_Consulting_Project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Logistic Regression Consulting Project" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "collapsed": true 14 | }, 15 | "source": [ 16 | "## Binary Customer Churn\n", 17 | "\n", 18 | "A marketing agency has many customers that use their service to produce ads for the client/customer websites. They've noticed that they have quite a bit of churn in clients. They basically randomly assign account managers right now, but want you to create a machine learning model that will help predict which customers will churn (stop buying their service) so that they can correctly assign the customers most at risk to churn an account manager. Luckily they have some historical data, can you help them out? Create a classification algorithm that will help classify whether or not a customer churned. Then the company can test this against incoming data for future customers to predict which customers will churn and assign them an account manager.\n", 19 | "\n", 20 | "The data is saved as customer_churn.csv. Here are the fields and their definitions:\n", 21 | "\n", 22 | " Name : Name of the latest contact at Company\n", 23 | " Age: Customer Age\n", 24 | " Total_Purchase: Total Ads Purchased\n", 25 | " Account_Manager: Binary 0=No manager, 1= Account manager assigned\n", 26 | " Years: Totaly Years as a customer\n", 27 | " Num_sites: Number of websites that use the service.\n", 28 | " Onboard_date: Date that the name of the latest contact was onboarded\n", 29 | " Location: Client HQ Address\n", 30 | " Company: Name of Client Company\n", 31 | " \n", 32 | "Once you've created the model and evaluated it, test out the model on some new data (you can think of this almost like a hold-out set) that your client has provided, saved under new_customers.csv. The client wants to know which customers are most likely to churn given this data (they don't have the label yet)." 
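\n\n---\n\n*Editor's sketch (not part of the original project brief): one possible starting point. The feature columns follow the field list above, but the label column name ('Churn') and exact spellings are assumptions, so confirm them with data.printSchema() before running.*\n\n```python\nfrom pyspark.sql import SparkSession\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.ml.evaluation import BinaryClassificationEvaluator\n\nspark = SparkSession.builder.appName('churn').getOrCreate()\ndata = spark.read.csv('customer_churn.csv', inferSchema=True, header=True)\n\n# Numeric columns only for a first pass; 'Churn' is the assumed label name\nassembler = VectorAssembler(\n    inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'],\n    outputCol='features')\nfinal_data = assembler.transform(data).select('features', 'Churn')\n\ntrain, test = final_data.randomSplit([0.7, 0.3])\nmodel = LogisticRegression(labelCol='Churn').fit(train)\nresults = model.transform(test)\n\n# Area under ROC on the held-out split\nprint(BinaryClassificationEvaluator(labelCol='Churn').evaluate(results))\n```"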
33 | ] 34 | } 35 | ], 36 | "metadata": { 37 | "anaconda-cloud": {}, 38 | "kernelspec": { 39 | "display_name": "Python [conda root]", 40 | "language": "python", 41 | "name": "conda-root-py" 42 | }, 43 | "language_info": { 44 | "codemirror_mode": { 45 | "name": "ipython", 46 | "version": 3 47 | }, 48 | "file_extension": ".py", 49 | "mimetype": "text/x-python", 50 | "name": "python", 51 | "nbconvert_exporter": "python", 52 | "pygments_lexer": "ipython3", 53 | "version": "3.5.3" 54 | } 55 | }, 56 | "nbformat": 4, 57 | "nbformat_minor": 0 58 | } 59 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Logistic_Regression/Titanic_Log_Regression_Code_Along.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Logistic Regression Code Along\n", 8 | "This is a code along of the famous Titanic dataset; it's always nice to start off with this dataset because it is an example you will find across pretty much every data analysis language." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": { 15 | "collapsed": true 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "from pyspark.sql import SparkSession" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "spark = SparkSession.builder.appName('myproj').getOrCreate()" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "data = spark.read.csv('titanic.csv',inferSchema=True,header=True)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "root\n", 56 | " |-- PassengerId: integer (nullable = true)\n", 57 | " |-- Survived: integer (nullable = true)\n", 58 | " |-- Pclass: integer (nullable = true)\n", 59 | " |-- Name: string (nullable = true)\n", 60 | " |-- Sex: string (nullable = true)\n", 61 | " |-- Age: double (nullable = true)\n", 62 | " |-- SibSp: integer (nullable = true)\n", 63 | " |-- Parch: integer (nullable = true)\n", 64 | " |-- Ticket: string (nullable = true)\n", 65 | " |-- Fare: double (nullable = true)\n", 66 | " |-- Cabin: string (nullable = true)\n", 67 | " |-- Embarked: string (nullable = true)\n", 68 | "\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "data.printSchema()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 7, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "['PassengerId',\n", 87 | " 'Survived',\n", 88 | " 'Pclass',\n", 89 | " 'Name',\n", 90 | " 'Sex',\n", 91 | " 'Age',\n", 92 | " 'SibSp',\n", 93 | " 'Parch',\n", 94 | " 'Ticket',\n", 95 | " 'Fare',\n", 96 | " 'Cabin',\n", 97 | " 'Embarked']" 98 | ] 99 | }, 100 | "execution_count": 7, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "data.columns" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 8, 112 | "metadata": { 113 | "collapsed": true 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "my_cols = data.select(['Survived',\n", 118 | " 'Pclass',\n", 119 | " 'Sex',\n", 120 | " 'Age',\n", 
121 | " 'SibSp',\n", 122 | " 'Parch',\n", 123 | " 'Fare',\n", 124 | " 'Embarked'])" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 29, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "my_final_data = my_cols.na.drop()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "### Working with Categorical Columns\n", 143 | "\n", 144 | "Let's break this down into multiple steps to make it all clear." 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 12, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "from pyspark.ml.feature import (VectorAssembler,VectorIndexer,\n", 156 | " OneHotEncoder,StringIndexer)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 13, 162 | "metadata": { 163 | "collapsed": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "gender_indexer = StringIndexer(inputCol='Sex',outputCol='SexIndex')\n", 168 | "gender_encoder = OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 14, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "embark_indexer = StringIndexer(inputCol='Embarked',outputCol='EmbarkIndex')\n", 180 | "embark_encoder = OneHotEncoder(inputCol='EmbarkIndex',outputCol='EmbarkVec')" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 15, 186 | "metadata": { 187 | "collapsed": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "assembler = VectorAssembler(inputCols=['Pclass',\n", 192 | " 'SexVec',\n", 193 | " 'Age',\n", 194 | " 'SibSp',\n", 195 | " 'Parch',\n", 196 | " 'Fare',\n", 197 | " 'EmbarkVec'],outputCol='features')" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 30, 203 | "metadata": { 204 | "collapsed": true 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "from pyspark.ml.classification import LogisticRegression" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "## Pipelines \n", 216 | "\n", 217 | "Let's see an example of how to use pipelines (we'll get a lot more practice with these later!)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 17, 223 | "metadata": { 224 | "collapsed": true 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "from pyspark.ml import Pipeline" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 18, 234 | "metadata": { 235 | "collapsed": true 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "log_reg_titanic = LogisticRegression(featuresCol='features',labelCol='Survived')" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 19, 245 | "metadata": { 246 | "collapsed": true 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "pipeline = Pipeline(stages=[gender_indexer,embark_indexer,\n", 251 | " gender_encoder,embark_encoder,\n", 252 | " assembler,log_reg_titanic])" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 20, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "train_titanic_data, test_titanic_data = my_final_data.randomSplit([0.7,.3])" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 21, 269 | "metadata": { 270 | "collapsed": true 271 | }, 272 | "outputs": [], 
273 | "source": [ 274 | "fit_model = pipeline.fit(train_titanic_data)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 22, 280 | "metadata": { 281 | "collapsed": true 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "results = fit_model.transform(test_titanic_data)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 23, 291 | "metadata": { 292 | "collapsed": true 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 24, 302 | "metadata": { 303 | "collapsed": true 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',\n", 308 | " labelCol='Survived')" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 26, 314 | "metadata": { 315 | "collapsed": false 316 | }, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "+--------+----------+\n", 323 | "|Survived|prediction|\n", 324 | "+--------+----------+\n", 325 | "| 0| 1.0|\n", 326 | "| 0| 1.0|\n", 327 | "| 0| 1.0|\n", 328 | "| 0| 1.0|\n", 329 | "| 0| 0.0|\n", 330 | "| 0| 1.0|\n", 331 | "| 0| 1.0|\n", 332 | "| 0| 0.0|\n", 333 | "| 0| 0.0|\n", 334 | "| 0| 0.0|\n", 335 | "| 0| 0.0|\n", 336 | "| 0| 0.0|\n", 337 | "| 0| 0.0|\n", 338 | "| 0| 0.0|\n", 339 | "| 0| 0.0|\n", 340 | "| 0| 0.0|\n", 341 | "| 0| 0.0|\n", 342 | "| 0| 1.0|\n", 343 | "| 0| 1.0|\n", 344 | "| 0| 1.0|\n", 345 | "+--------+----------+\n", 346 | "only showing top 20 rows\n", 347 | "\n" 348 | ] 349 | } 350 | ], 351 | "source": [ 352 | "results.select('Survived','prediction').show()" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 27, 358 | "metadata": { 359 | "collapsed": true 360 | }, 361 | "outputs": [], 362 | "source": [ 363 | "AUC = my_eval.evaluate(results)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 28, 369 | "metadata": { 370 | "collapsed": false 371 | }, 372 | "outputs": [ 373 | { 374 | "data": { 375 | "text/plain": [ 376 | "0.7918269230769232" 377 | ] 378 | }, 379 | "execution_count": 28, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "AUC" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "## Great Job!" 
393 | ] 394 | } 395 | ], 396 | "metadata": { 397 | "anaconda-cloud": {}, 398 | "kernelspec": { 399 | "display_name": "Python [conda root]", 400 | "language": "python", 401 | "name": "conda-root-py" 402 | }, 403 | "language_info": { 404 | "codemirror_mode": { 405 | "name": "ipython", 406 | "version": 3 407 | }, 408 | "file_extension": ".py", 409 | "mimetype": "text/x-python", 410 | "name": "python", 411 | "nbconvert_exporter": "python", 412 | "pygments_lexer": "ipython3", 413 | "version": "3.5.3" 414 | } 415 | }, 416 | "nbformat": 4, 417 | "nbformat_minor": 0 418 | } 419 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Logistic_Regression/new_customers.csv: -------------------------------------------------------------------------------- 1 | Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company 2 | Andrew Mccall,37.0,9935.53,1,7.71,8.0,2011-08-29 18:37:54,"38612 Johnny Stravenue Nataliebury, WI 15717-8316",King Ltd, 3 | Michele Wright,23.0,7526.94,1,9.28,15.0,2013-07-22 18:19:54,"21083 Nicole Junction Suite 332, Youngport, ME 23686-4381",Cannon-Benson 4 | Jeremy Chang,65.0,100.0,1,1.0,15.0,2006-12-11 07:48:13,"085 Austin Views Lake Julialand, WY 63726-4298",Barron-Robertson 5 | Megan Ferguson,32.0,6487.5,0,9.4,14.0,2016-10-28 05:32:13,"922 Wright Branch North Cynthialand, NC 64721",Sexton-Golden 6 | Taylor Young,32.0,13147.71,1,10.0,8.0,2012-03-20 00:36:46,"Unit 0789 Box 0734 DPO AP 39702",Wood LLC, 7 | Jessica Drake,22.0,8445.26,1,3.46,14.0,2011-02-04 19:29:27,"1148 Tina Stravenue Apt. 978 South Carlos TX 21222 9221",Parks-Robbins 8 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Natural_Language_Processing/smsspamcollection/readme: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SuperJohn/spark-and-python-for-big-data-with-pyspark/2571210837c00e6315a9d93f0cd1dc35e2955375/Spark_for_Machine_Learning/Natural_Language_Processing/smsspamcollection/readme -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Tree_Methods/Tree_Methods_Consulting_Project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tree Methods Consulting Project " 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "You've been hired by a dog food company to try to predict why some batches of their dog food are spoiling much more quickly than intended! Unfortunately, this dog food company hasn't upgraded to the latest machinery, which means the amounts of the five preservative chemicals they use can vary a lot. But which chemical has the strongest effect? The dog food company first mixes up a batch of preservative that contains four preservative chemicals (A, B, C, D) and then completes it with a \"filler\" chemical. The food scientists believe one of the A, B, C, or D preservatives is causing the problem, but they need your help to figure out which one!\n", 15 | "Use machine learning with RF (random forests) to find out which feature has the most predictive power, thus finding out which chemical causes the early spoiling! So create a model, and then work out how you can decide which chemical is the problem!\n", 16 | "\n", 17 | "* Pres_A : Percentage of preservative A in the mix\n", 18 | "* Pres_B : Percentage of preservative B in the mix\n", 19 | "* Pres_C : Percentage of preservative C in the mix\n", 20 | "* Pres_D : Percentage of preservative D in the mix\n", 21 | "* Spoiled: Label indicating whether or not the dog food batch was spoiled.\n", 22 | "___\n", 23 | "\n", 24 | "**Think carefully about what this problem is really asking you to solve. While we will use Machine Learning to solve this, it won't be with your typical train/test split workflow. If this confuses you, skip ahead to the solution code along walk-through!**\n", 25 | "____" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "# Good Luck!
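\n\n---\n\n*Editor's starter sketch (not part of the original exercise), in case you want a nudge: the idea is to fit a tree model on all of the data and read off feature importances, rather than run a train/test split. The column names A, B, C, D, and Spoiled match dog_food.csv below.*\n\n```python\nfrom pyspark.sql import SparkSession\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.ml.classification import RandomForestClassifier\n\nspark = SparkSession.builder.appName('dogfood_starter').getOrCreate()\ndata = spark.read.csv('dog_food.csv', inferSchema=True, header=True)\n\n# Bundle the four preservative percentages into one feature vector\nassembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'], outputCol='features')\nfinal_data = assembler.transform(data).select('features', 'Spoiled')\n\n# Fit on everything: the goal is explanation, not held-out accuracy\nrfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features')\nprint(rfc.fit(final_data).featureImportances)\n```"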
33 | ] 34 | } 35 | ], 36 | "metadata": { 37 | "anaconda-cloud": {}, 38 | "kernelspec": { 39 | "display_name": "Python [conda root]", 40 | "language": "python", 41 | "name": "conda-root-py" 42 | }, 43 | "language_info": { 44 | "codemirror_mode": { 45 | "name": "ipython", 46 | "version": 3 47 | }, 48 | "file_extension": ".py", 49 | "mimetype": "text/x-python", 50 | "name": "python", 51 | "nbconvert_exporter": "python", 52 | "pygments_lexer": "ipython3", 53 | "version": "3.5.3" 54 | } 55 | }, 56 | "nbformat": 4, 57 | "nbformat_minor": 0 58 | } 59 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Tree_Methods/Tree_Methods_Consulting_Project_SOLUTION.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tree Methods Consulting Project - SOLUTION" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "You've been hired by a dog food company to try to predict why some batches of their dog food are spoiling much more quickly than intended! Unfortunately, this dog food company hasn't upgraded to the latest machinery, which means the amounts of the five preservative chemicals they use can vary a lot. But which chemical has the strongest effect? The dog food company first mixes up a batch of preservative that contains four preservative chemicals (A, B, C, D) and then completes it with a \"filler\" chemical. The food scientists believe one of the A, B, C, or D preservatives is causing the problem, but they need your help to figure out which one!\n", 15 | "Use machine learning with RF (random forests) to find out which feature has the most predictive power, thus finding out which chemical causes the early spoiling! So create a model, and then work out how you can decide which chemical is the problem!\n", 16 | "\n", 17 | "* Pres_A : Percentage of preservative A in the mix\n", 18 | "* Pres_B : Percentage of preservative B in the mix\n", 19 | "* Pres_C : Percentage of preservative C in the mix\n", 20 | "* Pres_D : Percentage of preservative D in the mix\n", 21 | "* Spoiled: Label indicating whether or not the dog food batch was spoiled.\n", 22 | "___\n", 23 | "\n", 24 | "**Think carefully about what this problem is really asking you to solve. While we will use Machine Learning to solve this, it won't be with your typical train/test split workflow. 
If this confuses you, skip ahead to the solution code along walk-through!**\n", 25 | "____" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 46, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "#Tree methods Example\n", 37 | "from pyspark.sql import SparkSession\n", 38 | "spark = SparkSession.builder.appName('dogfood').getOrCreate()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 47, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "# Load training data\n", 50 | "data = spark.read.csv('dog_food.csv',inferSchema=True,header=True)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 48, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "root\n", 65 | " |-- A: integer (nullable = true)\n", 66 | " |-- B: integer (nullable = true)\n", 67 | " |-- C: double (nullable = true)\n", 68 | " |-- D: integer (nullable = true)\n", 69 | " |-- Spoiled: double (nullable = true)\n", 70 | "\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "data.printSchema()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 49, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)" 89 | ] 90 | }, 91 | "execution_count": 49, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "data.head()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 50, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "+-------+------------------+------------------+------------------+------------------+-------------------+\n", 112 | "|summary| A| B| C| D| Spoiled|\n", 113 | "+-------+------------------+------------------+------------------+------------------+-------------------+\n", 114 | "| count| 490| 490| 490| 490| 490|\n", 115 | "| mean| 5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|\n", 116 | "| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|\n", 117 | "| min| 1| 1| 5.0| 1| 0.0|\n", 118 | "| max| 10| 10| 14.0| 10| 1.0|\n", 119 | "+-------+------------------+------------------+------------------+------------------+-------------------+\n", 120 | "\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "data.describe().show()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 51, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "# Import VectorAssembler and Vectors\n", 137 | "from pyspark.ml.linalg import Vectors\n", 138 | "from pyspark.ml.feature import VectorAssembler" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 52, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "['A', 'B', 'C', 'D', 'Spoiled']" 152 | ] 153 | }, 154 | "execution_count": 52, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "data.columns" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 53, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [], 170 | "source": 
[ 171 | "assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'],outputCol=\"features\")" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 54, 177 | "metadata": { 178 | "collapsed": true 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "output = assembler.transform(data)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 55, 188 | "metadata": { 189 | "collapsed": true 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "from pyspark.ml.classification import RandomForestClassifier,DecisionTreeClassifier" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 56, 199 | "metadata": { 200 | "collapsed": true 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "rfc = DecisionTreeClassifier(labelCol='Spoiled',featuresCol='features')" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 57, 210 | "metadata": { 211 | "collapsed": false 212 | }, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "root\n", 219 | " |-- A: integer (nullable = true)\n", 220 | " |-- B: integer (nullable = true)\n", 221 | " |-- C: double (nullable = true)\n", 222 | " |-- D: integer (nullable = true)\n", 223 | " |-- Spoiled: double (nullable = true)\n", 224 | " |-- features: vector (nullable = true)\n", 225 | "\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "output.printSchema()" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 58, 236 | "metadata": { 237 | "collapsed": false 238 | }, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": [ 243 | "Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)" 244 | ] 245 | }, 246 | "execution_count": 58, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "final_data = output.select('features','Spoiled')\n", 253 | "final_data.head()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 59, 259 | "metadata": { 260 | "collapsed": false 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "rfc_model = rfc.fit(final_data)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 60, 270 | "metadata": { 271 | "collapsed": false 272 | }, 273 | "outputs": [ 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "SparseVector(4, {0: 0.0026, 1: 0.0089, 2: 0.9686, 3: 0.0199})" 278 | ] 279 | }, 280 | "execution_count": 60, 281 | "metadata": {}, 282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "rfc_model.featureImportances" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "Bingo! Feature at index 2 (Chemical C) is by far the most important feature, meaning it is causing the early spoilage! 
This is a pretty interesting use of a machine learning model in an alternative way!\n", 294 | "\n", 295 | "# Great Job" 296 | ] 297 | } 298 | ], 299 | "metadata": { 300 | "anaconda-cloud": {}, 301 | "kernelspec": { 302 | "display_name": "Python [conda root]", 303 | "language": "python", 304 | "name": "conda-root-py" 305 | }, 306 | "language_info": { 307 | "codemirror_mode": { 308 | "name": "ipython", 309 | "version": 3 310 | }, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "nbconvert_exporter": "python", 315 | "pygments_lexer": "ipython3", 316 | "version": "3.5.3" 317 | } 318 | }, 319 | "nbformat": 4, 320 | "nbformat_minor": 0 321 | } 322 | -------------------------------------------------------------------------------- /Spark_for_Machine_Learning/Tree_Methods/dog_food.csv: -------------------------------------------------------------------------------- 1 | A,B,C,D,Spoiled 2 | 4,2,12.0,3,1.0 3 | 5,6,12.0,7,1.0 4 | 6,2,13.0,6,1.0 5 | 4,2,12.0,1,1.0 6 | 4,2,12.0,3,1.0 7 | 10,3,13.0,9,1.0 8 | 8,5,14.0,5,1.0 9 | 5,8,12.0,8,1.0 10 | 6,5,12.0,9,1.0 11 | 3,3,12.0,1,1.0 12 | 9,8,11.0,3,1.0 13 | 1,10,12.0,3,1.0 14 | 1,5,13.0,10,1.0 15 | 2,10,12.0,6,1.0 16 | 1,10,11.0,4,1.0 17 | 5,3,12.0,2,1.0 18 | 4,9,11.0,8,1.0 19 | 5,1,11.0,1,1.0 20 | 4,9,12.0,10,1.0 21 | 5,8,10.0,9,1.0 22 | 5,7,11.0,9,1.0 23 | 4,10,13.0,8,1.0 24 | 10,5,12.0,9,1.0 25 | 2,4,13.0,4,1.0 26 | 1,4,13.0,10,1.0 27 | 1,8,12.0,1,1.0 28 | 2,10,13.0,4,1.0 29 | 6,2,12.0,4,1.0 30 | 8,2,13.0,3,1.0 31 | 6,4,12.0,2,1.0 32 | 3,2,11.0,9,1.0 33 | 10,6,12.0,10,1.0 34 | 9,5,13.0,3,1.0 35 | 9,2,12.0,5,1.0 36 | 2,6,13.0,9,1.0 37 | 4,2,12.0,10,1.0 38 | 4,3,12.0,6,1.0 39 | 7,1,12.0,1,1.0 40 | 1,7,11.0,10,1.0 41 | 9,2,11.0,10,1.0 42 | 2,6,12.0,2,1.0 43 | 9,4,11.0,5,1.0 44 | 6,2,11.0,10,1.0 45 | 3,10,11.0,4,1.0 46 | 6,9,11.0,2,1.0 47 | 10,6,11.0,9,1.0 48 | 6,7,11.0,9,1.0 49 | 7,2,13.0,8,1.0 50 | 9,2,13.0,5,1.0 51 | 8,7,12.0,6,1.0 52 | 9,1,12.0,9,1.0 53 | 3,5,14.0,3,1.0 54 | 7,1,11.0,3,1.0 55 | 5,9,12.0,7,1.0 56 | 3,10,12.0,7,1.0 57 | 9,8,13.0,9,1.0 58 | 10,9,12.0,9,1.0 59 | 10,7,11.0,2,1.0 60 | 10,3,11.0,1,1.0 61 | 2,4,11.0,8,1.0 62 | 10,3,13.0,4,1.0 63 | 5,1,14.0,8,1.0 64 | 8,8,11.0,4,1.0 65 | 4,8,14.0,1,1.0 66 | 5,1,12.0,7,1.0 67 | 6,8,11.0,2,1.0 68 | 1,1,13.0,3,1.0 69 | 9,3,12.0,10,1.0 70 | 6,1,11.0,7,1.0 71 | 7,5,10.0,1,1.0 72 | 10,2,12.0,2,1.0 73 | 2,3,13.0,1,1.0 74 | 5,8,12.0,2,1.0 75 | 10,6,12.0,10,1.0 76 | 9,1,11.0,6,1.0 77 | 10,10,14.0,7,1.0 78 | 1,5,12.0,10,1.0 79 | 10,1,11.0,2,1.0 80 | 1,1,12.0,2,1.0 81 | 10,3,13.0,7,1.0 82 | 1,6,11.0,10,1.0 83 | 9,4,12.0,3,1.0 84 | 10,9,12.0,5,1.0 85 | 10,8,11.0,2,1.0 86 | 5,3,9.0,2,1.0 87 | 3,7,12.0,10,1.0 88 | 4,9,12.0,8,1.0 89 | 5,1,11.0,2,1.0 90 | 10,9,11.0,9,1.0 91 | 10,7,11.0,6,1.0 92 | 8,2,13.0,10,1.0 93 | 7,7,11.0,3,1.0 94 | 9,10,11.0,5,1.0 95 | 5,2,12.0,8,1.0 96 | 1,1,10.0,8,1.0 97 | 5,5,12.0,8,1.0 98 | 9,6,12.0,1,1.0 99 | 4,6,12.0,2,1.0 100 | 1,1,12.0,4,1.0 101 | 9,3,11.0,10,1.0 102 | 3,2,12.0,6,1.0 103 | 2,4,11.0,9,1.0 104 | 8,1,12.0,10,1.0 105 | 10,6,11.0,6,1.0 106 | 8,9,12.0,2,1.0 107 | 2,3,12.0,3,1.0 108 | 4,6,14.0,4,1.0 109 | 3,4,12.0,4,1.0 110 | 9,5,12.0,5,1.0 111 | 10,5,13.0,2,1.0 112 | 8,2,10.0,6,1.0 113 | 10,5,11.0,2,1.0 114 | 10,1,11.0,3,1.0 115 | 7,6,13.0,3,1.0 116 | 8,9,14.0,4,1.0 117 | 8,8,14.0,7,1.0 118 | 1,9,11.0,10,1.0 119 | 2,9,10.0,3,1.0 120 | 4,9,13.0,4,1.0 121 | 10,10,12.0,7,1.0 122 | 8,9,12.0,7,1.0 123 | 9,7,12.0,1,1.0 124 | 3,6,13.0,5,1.0 125 | 4,5,12.0,3,1.0 126 | 1,7,11.0,9,1.0 127 | 4,6,12.0,9,1.0 128 | 8,10,13.0,3,1.0 129 | 
5,4,12.0,5,1.0 130 | 9,4,12.0,6,1.0 131 | 3,4,12.0,5,1.0 132 | 7,7,11.0,4,1.0 133 | 6,2,12.0,6,1.0 134 | 2,8,11.0,1,1.0 135 | 4,4,10.0,3,1.0 136 | 3,7,12.0,9,1.0 137 | 10,3,12.0,7,1.0 138 | 3,1,12.0,7,1.0 139 | 2,4,13.0,10,1.0 140 | 6,3,12.0,2,1.0 141 | 7,2,14.0,4,1.0 142 | 4,2,8.0,9,0.0 143 | 4,8,9.0,1,0.0 144 | 10,8,8.0,6,0.0 145 | 8,6,9.0,4,0.0 146 | 7,2,7.0,8,0.0 147 | 3,3,9.0,5,0.0 148 | 4,10,8.0,9,0.0 149 | 4,7,10.0,7,0.0 150 | 1,7,8.0,2,0.0 151 | 10,7,8.0,5,0.0 152 | 10,5,9.0,1,0.0 153 | 5,7,10.0,10,0.0 154 | 2,8,6.0,9,0.0 155 | 4,1,7.0,5,0.0 156 | 4,6,9.0,7,0.0 157 | 2,2,9.0,8,0.0 158 | 6,7,6.0,9,0.0 159 | 5,7,7.0,2,0.0 160 | 7,1,7.0,5,0.0 161 | 8,1,8.0,3,0.0 162 | 1,6,8.0,1,0.0 163 | 4,5,9.0,8,0.0 164 | 8,10,8.0,3,0.0 165 | 4,9,8.0,2,0.0 166 | 2,9,6.0,4,0.0 167 | 8,10,8.0,9,0.0 168 | 3,6,8.0,1,0.0 169 | 5,6,9.0,8,0.0 170 | 5,2,8.0,10,0.0 171 | 9,7,6.0,7,0.0 172 | 3,8,6.0,10,0.0 173 | 3,3,8.0,9,0.0 174 | 3,4,10.0,2,0.0 175 | 6,8,8.0,9,0.0 176 | 1,4,8.0,7,0.0 177 | 6,9,7.0,10,0.0 178 | 10,6,8.0,6,0.0 179 | 9,4,7.0,10,0.0 180 | 9,2,10.0,3,0.0 181 | 6,8,8.0,6,0.0 182 | 10,5,7.0,4,0.0 183 | 4,8,8.0,7,0.0 184 | 5,6,6.0,9,0.0 185 | 2,1,10.0,7,0.0 186 | 6,4,7.0,4,0.0 187 | 6,8,9.0,4,0.0 188 | 3,3,8.0,3,0.0 189 | 3,5,10.0,6,0.0 190 | 3,3,9.0,9,0.0 191 | 7,7,8.0,9,0.0 192 | 6,8,7.0,10,0.0 193 | 7,3,7.0,7,0.0 194 | 5,7,9.0,2,0.0 195 | 4,9,8.0,10,0.0 196 | 9,9,7.0,4,0.0 197 | 6,9,6.0,1,0.0 198 | 4,2,10.0,10,0.0 199 | 8,10,8.0,3,0.0 200 | 1,7,8.0,4,0.0 201 | 3,2,9.0,1,0.0 202 | 9,9,9.0,6,0.0 203 | 4,10,5.0,4,0.0 204 | 9,3,7.0,5,0.0 205 | 9,1,9.0,3,0.0 206 | 4,6,7.0,2,0.0 207 | 4,5,8.0,5,0.0 208 | 5,7,6.0,6,0.0 209 | 10,6,9.0,3,0.0 210 | 6,6,8.0,10,0.0 211 | 3,7,9.0,7,0.0 212 | 8,10,8.0,2,0.0 213 | 5,2,8.0,3,0.0 214 | 5,7,7.0,5,0.0 215 | 10,9,8.0,2,0.0 216 | 4,4,8.0,7,0.0 217 | 1,4,9.0,6,0.0 218 | 8,2,9.0,10,0.0 219 | 9,6,9.0,5,0.0 220 | 7,6,7.0,7,0.0 221 | 1,2,9.0,4,0.0 222 | 1,8,7.0,10,0.0 223 | 6,2,8.0,9,0.0 224 | 9,5,7.0,8,0.0 225 | 8,7,8.0,6,0.0 226 | 5,7,8.0,9,0.0 227 | 8,4,9.0,1,0.0 228 | 6,1,9.0,3,0.0 229 | 9,7,8.0,9,0.0 230 | 2,9,7.0,10,0.0 231 | 2,4,8.0,5,0.0 232 | 10,3,8.0,8,0.0 233 | 7,9,8.0,8,0.0 234 | 6,6,8.0,2,0.0 235 | 1,5,8.0,10,0.0 236 | 10,1,9.0,9,0.0 237 | 8,1,9.0,2,0.0 238 | 10,9,8.0,6,0.0 239 | 5,10,7.0,1,0.0 240 | 3,6,7.0,8,0.0 241 | 4,10,10.0,5,0.0 242 | 2,1,7.0,9,0.0 243 | 9,2,9.0,9,0.0 244 | 3,9,8.0,9,0.0 245 | 2,3,6.0,9,0.0 246 | 3,9,8.0,6,0.0 247 | 10,7,9.0,1,0.0 248 | 10,10,6.0,4,0.0 249 | 8,5,9.0,5,0.0 250 | 7,2,8.0,1,0.0 251 | 7,2,8.0,9,0.0 252 | 6,9,7.0,2,0.0 253 | 1,4,9.0,3,0.0 254 | 10,9,9.0,10,0.0 255 | 4,3,8.0,8,0.0 256 | 8,7,6.0,6,0.0 257 | 5,7,8.0,3,0.0 258 | 8,6,8.0,3,0.0 259 | 3,2,6.0,10,0.0 260 | 4,2,6.0,5,0.0 261 | 10,6,8.0,7,0.0 262 | 3,6,8.0,3,0.0 263 | 2,2,8.0,1,0.0 264 | 1,9,10.0,6,0.0 265 | 9,6,8.0,7,0.0 266 | 4,5,9.0,5,0.0 267 | 3,5,8.0,6,0.0 268 | 4,5,8.0,10,0.0 269 | 9,4,9.0,4,0.0 270 | 9,4,7.0,6,0.0 271 | 7,6,8.0,10,0.0 272 | 9,10,11.0,2,0.0 273 | 3,4,9.0,5,0.0 274 | 2,10,9.0,2,0.0 275 | 10,9,8.0,2,0.0 276 | 4,6,9.0,4,0.0 277 | 4,10,7.0,10,0.0 278 | 9,1,9.0,8,0.0 279 | 3,10,8.0,6,0.0 280 | 8,5,9.0,3,0.0 281 | 8,5,7.0,5,0.0 282 | 1,8,6.0,6,0.0 283 | 8,8,6.0,8,0.0 284 | 4,8,7.0,3,0.0 285 | 9,3,8.0,7,0.0 286 | 10,8,7.0,3,0.0 287 | 2,10,6.0,4,0.0 288 | 2,5,9.0,5,0.0 289 | 10,7,9.0,4,0.0 290 | 3,10,9.0,8,0.0 291 | 9,2,7.0,3,0.0 292 | 7,4,6.0,4,0.0 293 | 3,4,8.0,7,0.0 294 | 4,7,8.0,3,0.0 295 | 10,9,8.0,10,0.0 296 | 4,6,5.0,6,0.0 297 | 10,2,9.0,7,0.0 298 | 9,8,9.0,10,0.0 299 | 7,10,8.0,2,0.0 300 | 5,5,6.0,1,0.0 301 | 8,4,7.0,6,0.0 302 | 5,5,7.0,9,0.0 
303 | 7,2,9.0,9,0.0 304 | 9,4,9.0,3,0.0 305 | 5,5,7.0,3,0.0 306 | 2,7,7.0,4,0.0 307 | 4,5,9.0,8,0.0 308 | 1,8,8.0,6,0.0 309 | 5,6,9.0,5,0.0 310 | 3,6,8.0,3,0.0 311 | 7,2,9.0,5,0.0 312 | 10,9,10.0,6,0.0 313 | 4,7,10.0,6,0.0 314 | 1,9,9.0,7,0.0 315 | 1,7,7.0,2,0.0 316 | 1,9,7.0,5,0.0 317 | 2,8,9.0,4,0.0 318 | 5,4,8.0,2,0.0 319 | 1,7,7.0,6,0.0 320 | 2,1,8.0,9,0.0 321 | 2,6,9.0,4,0.0 322 | 1,6,8.0,9,0.0 323 | 1,4,8.0,5,0.0 324 | 10,6,8.0,5,0.0 325 | 6,4,6.0,4,0.0 326 | 2,1,9.0,1,0.0 327 | 8,6,9.0,10,0.0 328 | 5,6,7.0,9,0.0 329 | 10,10,7.0,1,0.0 330 | 2,9,10.0,6,0.0 331 | 9,6,10.0,2,0.0 332 | 3,5,9.0,3,0.0 333 | 5,10,8.0,3,0.0 334 | 1,3,9.0,8,0.0 335 | 8,8,8.0,7,0.0 336 | 6,1,8.0,3,0.0 337 | 4,9,9.0,2,0.0 338 | 2,9,10.0,3,0.0 339 | 1,5,8.0,5,0.0 340 | 5,6,8.0,8,0.0 341 | 6,10,9.0,2,0.0 342 | 9,6,8.0,9,0.0 343 | 1,8,8.0,7,0.0 344 | 8,2,8.0,8,0.0 345 | 3,6,8.0,5,0.0 346 | 9,2,9.0,6,0.0 347 | 7,10,5.0,6,0.0 348 | 2,5,8.0,3,0.0 349 | 9,2,10.0,7,0.0 350 | 5,9,8.0,9,0.0 351 | 1,6,8.0,3,0.0 352 | 7,4,8.0,3,0.0 353 | 8,5,8.0,5,0.0 354 | 5,9,7.0,3,0.0 355 | 9,6,8.0,5,0.0 356 | 3,1,8.0,5,0.0 357 | 5,8,9.0,9,0.0 358 | 2,5,8.0,3,0.0 359 | 5,6,8.0,6,0.0 360 | 2,5,8.0,1,0.0 361 | 6,2,11.0,10,0.0 362 | 2,6,6.0,9,0.0 363 | 4,4,6.0,8,0.0 364 | 2,7,8.0,9,0.0 365 | 5,2,7.0,9,0.0 366 | 6,10,8.0,3,0.0 367 | 4,6,7.0,5,0.0 368 | 2,8,8.0,6,0.0 369 | 6,2,8.0,3,0.0 370 | 8,10,9.0,8,0.0 371 | 5,9,8.0,5,0.0 372 | 9,2,9.0,8,0.0 373 | 5,10,8.0,6,0.0 374 | 10,6,8.0,3,0.0 375 | 6,6,9.0,6,0.0 376 | 6,3,10.0,5,0.0 377 | 1,3,8.0,5,0.0 378 | 2,3,9.0,3,0.0 379 | 2,6,8.0,8,0.0 380 | 8,4,9.0,10,0.0 381 | 8,7,6.0,7,0.0 382 | 2,6,8.0,10,0.0 383 | 7,2,9.0,3,0.0 384 | 7,9,6.0,2,0.0 385 | 2,10,8.0,8,0.0 386 | 5,2,9.0,9,0.0 387 | 2,8,9.0,10,0.0 388 | 8,4,6.0,8,0.0 389 | 7,3,10.0,7,0.0 390 | 9,9,8.0,7,0.0 391 | 8,4,8.0,1,0.0 392 | 9,2,6.0,8,0.0 393 | 8,6,8.0,2,0.0 394 | 9,7,8.0,2,0.0 395 | 4,3,9.0,6,0.0 396 | 2,1,8.0,9,0.0 397 | 9,4,7.0,9,0.0 398 | 4,2,9.0,2,0.0 399 | 10,3,8.0,2,0.0 400 | 9,2,10.0,5,0.0 401 | 10,7,7.0,7,0.0 402 | 2,3,7.0,10,0.0 403 | 10,1,7.0,4,0.0 404 | 3,3,7.0,5,0.0 405 | 10,1,7.0,4,0.0 406 | 5,4,8.0,7,0.0 407 | 7,3,7.0,8,0.0 408 | 10,9,7.0,4,0.0 409 | 5,7,8.0,9,0.0 410 | 5,9,7.0,5,0.0 411 | 4,6,7.0,5,0.0 412 | 4,2,8.0,9,0.0 413 | 8,3,7.0,4,0.0 414 | 3,5,9.0,6,0.0 415 | 4,3,8.0,10,0.0 416 | 1,6,7.0,8,0.0 417 | 8,5,8.0,6,0.0 418 | 9,10,7.0,6,0.0 419 | 8,9,8.0,1,0.0 420 | 9,10,8.0,8,0.0 421 | 3,10,8.0,2,0.0 422 | 8,10,10.0,7,0.0 423 | 2,1,10.0,7,0.0 424 | 5,10,8.0,8,0.0 425 | 4,9,7.0,7,0.0 426 | 9,3,7.0,7,0.0 427 | 5,7,8.0,6,0.0 428 | 8,7,9.0,3,0.0 429 | 2,2,7.0,8,0.0 430 | 6,6,9.0,9,0.0 431 | 4,2,8.0,4,0.0 432 | 3,9,7.0,9,0.0 433 | 7,9,6.0,5,0.0 434 | 5,3,7.0,5,0.0 435 | 4,4,9.0,1,0.0 436 | 6,9,8.0,5,0.0 437 | 10,10,8.0,1,0.0 438 | 2,6,8.0,6,0.0 439 | 10,10,9.0,5,0.0 440 | 5,9,9.0,6,0.0 441 | 3,2,8.0,9,0.0 442 | 10,10,9.0,3,0.0 443 | 4,7,9.0,4,0.0 444 | 4,4,7.0,1,0.0 445 | 5,8,8.0,5,0.0 446 | 2,3,8.0,3,0.0 447 | 6,4,9.0,2,0.0 448 | 2,9,9.0,10,0.0 449 | 3,6,8.0,2,0.0 450 | 3,2,10.0,10,0.0 451 | 2,2,8.0,1,0.0 452 | 9,6,9.0,1,0.0 453 | 6,5,6.0,2,0.0 454 | 3,6,8.0,1,0.0 455 | 3,3,8.0,6,0.0 456 | 2,10,9.0,2,0.0 457 | 8,9,8.0,9,0.0 458 | 7,4,10.0,4,0.0 459 | 6,6,7.0,8,0.0 460 | 5,3,7.0,7,0.0 461 | 6,7,7.0,6,0.0 462 | 9,1,9.0,5,0.0 463 | 10,9,9.0,1,0.0 464 | 10,4,8.0,3,0.0 465 | 1,2,9.0,1,0.0 466 | 2,1,9.0,1,0.0 467 | 6,1,7.0,9,0.0 468 | 1,5,8.0,3,0.0 469 | 2,8,8.0,4,0.0 470 | 1,8,8.0,8,0.0 471 | 3,1,9.0,7,0.0 472 | 3,9,7.0,6,0.0 473 | 8,1,7.0,4,0.0 474 | 10,4,9.0,8,0.0 475 | 2,5,7.0,6,0.0 476 | 10,6,8.0,5,0.0 477 | 
6,1,9.0,7,0.0 478 | 6,10,7.0,10,0.0 479 | 2,10,8.0,3,0.0 480 | 1,4,8.0,1,0.0 481 | 8,9,9.0,4,0.0 482 | 10,10,7.0,4,0.0 483 | 8,3,7.0,9,0.0 484 | 2,2,9.0,8,0.0 485 | 9,5,10.0,10,0.0 486 | 2,2,6.0,10,0.0 487 | 8,3,6.0,6,0.0 488 | 6,4,9.0,10,0.0 489 | 1,3,8.0,3,0.0 490 | 6,6,8.0,3,0.0 491 | 1,9,7.0,4,0.0 492 | --------------------------------------------------------------------------------