├── .gitignore ├── LICENSE ├── README.md ├── RF_GBM ├── data │ ├── test.csv │ └── train.csv └── notebook │ ├── Bank Marketing.ipynb │ └── img │ ├── boosting.jpg │ ├── confusion_matrix.jpg │ ├── cv.png │ ├── onehot.png │ ├── random_forest.jpg │ ├── tree_ensemble1.png │ └── tree_ensemble2.png ├── cf_mba ├── data │ ├── groceries.csv │ ├── groceries_mba.csv │ └── lastfm-matrix-germany.csv └── notebook │ ├── 1. Collaborative Filtering.ipynb │ ├── 2. Market Basket Analysis.ipynb │ └── img │ ├── basket.jpg │ └── cosine.png ├── check_env.py ├── img ├── ISLR.jpeg ├── acquire.jpg ├── amit.png ├── approach.jpg ├── art.jpeg ├── bargava.jpg ├── book.png ├── books.jpg ├── break.jpg ├── clay.jpeg ├── craft.jpeg ├── estimating_coefficients.png ├── explore.jpg ├── frame.jpg ├── glass.jpg ├── insight.jpg ├── lens.jpeg ├── model.jpg ├── numbers.jpg ├── onion-image.jpg ├── onion.jpg ├── onion.png ├── overview.jpg ├── pair.jpg ├── postit.jpg ├── r2.gif ├── r_squared.png ├── refine.jpg ├── retail.jpg ├── science.jpeg ├── see.jpeg ├── single.jpeg ├── skills.png ├── slope_intercept.png ├── speak.jpeg ├── sports.jpg ├── stars.jpg ├── think.jpg ├── thinkstats.jpg ├── time.jpg ├── tool.jpg ├── travel.jpg ├── welcome.jpg ├── wesmckinney.jpg └── workshop.jpg ├── installation_instructions.md ├── overview.md ├── overview.pdf ├── python.txt ├── text_mining ├── Acquire.ipynb ├── DataTau.html ├── Explore.ipynb ├── Model.ipynb ├── Refine.ipynb ├── data_tau.csv ├── data_tau_days.csv ├── data_tau_ta.csv ├── flatlands.txt ├── img │ ├── chunk-segmentation.png │ ├── datatau.png │ ├── date.png │ ├── entity_extraction.png │ ├── gutenberg.png │ ├── punkt.png │ └── title.png ├── negative_words.txt ├── nltk_data.zip └── postive_words.txt └── time_series ├── 1-Frame.ipynb ├── 2-Acquire.ipynb ├── 3-Refine.ipynb ├── 4-Explore.ipynb ├── 5-Model.ipynb ├── 6-Insight.ipynb ├── MonthWiseMarketArrivals.csv ├── MonthWiseMarketArrivals.html ├── MonthWiseMarketArrivalsJan2016.html ├── MonthWiseMarketArrivals_Clean.csv ├── city_geocode.csv ├── img ├── Cov_nonstationary.png ├── Mean_nonstationary.png ├── Var_nonstationary.png ├── corr.svg ├── left_merge.png ├── onion_small.png ├── onion_tables.png ├── peeling_the_onion_small.png ├── pivot.png ├── splitapplycombine.png ├── subsetcolumns.png └── subsetrows.png └── state_geocode.csv /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | .Rproj.user 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Amit Kapoor 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning 2 | Workshop material for Machine Learning in Python 3 | by [Amit Kapoor](http://twitter.com/amitkaps) and [Bargava Subramanian](http://twitter.com/bargava) 4 | 5 | 0. [Overview](/overview.pdf) 6 | 7 | 1. [Time Series](/time_series) *(8 hours, Case - Peeling the Onion)* 8 | - Linear Trend Model 9 | - Random Walk 10 | - Moving Average 11 | - Exponential Smoothing 12 | - Decomposition 13 | - ARIMA Models 14 | - Tweaking Model Parameters 15 | 16 | 2. [Association Rule Mining](/cf_mba) *(4 hours, Case - Grocery)* 17 | - Apriori Algorithm 18 | - Market Basket Analysis 19 | 20 | 3. [Random Forest / Gradient Boosting](/RF_GBM) *(4 hours, Case - Bank Marketing)* 21 | - Intro to Ensemble Models, Bagging and Boosting 22 | - Gradient Boosting Classifier & Regressor 23 | - Random Forest Classifier & Regressor 24 | - Tuning Model Parameters 25 | 26 | 4. [Text Mining](/text_mining) *(6 hours, Case - DataTau)* 27 | - Regular Expression 28 | - Stopword Removal, Stemming 29 | - Word Cloud 30 | - Creating features from text 31 | - Term Frequency and Inverse Document Frequency (TF-IDF) 32 | - Topic Modeling - Latent Dirichlet Allocation (LDA) 33 | - Sentiment Analysis 34 | 35 | 36 | ### Script to check if the requisite libraries for the workshop are present 37 | Please execute the following at the command prompt: 38 | 39 | $ python check_env.py 40 | 41 | If any library has a `FAIL` message, please install/upgrade that library. 42 | 43 | Installation instructions can be found [here](https://github.com/amitkaps/machine-learning/blob/master/installation_instructions.md) 44 | 45 | --- 46 | ### Licensing 47 | 48 | Machine Learning using Python by Amit Kapoor and Bargava Subramanian is licensed under an MIT License. 49 | -------------------------------------------------------------------------------- /RF_GBM/notebook/Bank Marketing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Frame\n", 8 | "\n", 9 | "The client bank *XYZ* is running a direct marketing campaign. It wants to identify customers who are likely to buy its new term deposit plan.\n", 10 | "\n", 11 | "# Acquire\n", 12 | "\n", 13 | "Data is obtained from the UCI Machine Learning repository. \n", 14 | "http://mlr.cs.umass.edu/ml/datasets/Bank+Marketing\n", 15 | "\n", 16 | "Data from a direct marketing campaign (phone calls) of a Portuguese bank is provided. \n", 17 | "\n", 18 | "### Attribute Information:\n", 19 | "\n", 20 | "\n", 21 | "#### bank client data:\n", 22 | "\n", 23 | "1. age (numeric)\n", 24 | "2. job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')\n", 25 | "3. marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)\n", 26 | "4. 
education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')\n", 27 | "5. default: has credit in default? (categorical: 'no','yes','unknown')\n", 28 | "6. housing: has housing loan? (categorical: 'no','yes','unknown')\n", 29 | "7. loan: has personal loan? (categorical: 'no','yes','unknown')\n", 30 | "\n", 31 | "#### related with the last contact of the current campaign:\n", 32 | "\n", 33 | "8. contact: contact communication type (categorical: 'cellular','telephone') \n", 34 | "9. month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')\n", 35 | "10. day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')\n", 36 | "11. duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.\n", 37 | "\n", 38 | "#### other attributes:\n", 39 | "\n", 40 | "12. campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)\n", 41 | "13. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)\n", 42 | "14. previous: number of contacts performed before this campaign and for this client (numeric)\n", 43 | "15. poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')\n", 44 | "\n", 45 | "#### social and economic context attributes\n", 46 | "\n", 47 | "16. emp.var.rate: employment variation rate - quarterly indicator (numeric)\n", 48 | "17. cons.price.idx: consumer price index - monthly indicator (numeric) \n", 49 | "18. cons.conf.idx: consumer confidence index - monthly indicator (numeric) \n", 50 | "19. euribor3m: euribor 3 month rate - daily indicator (numeric)\n", 51 | "20. nr.employed: number of employees - quarterly indicator (numeric)\n", 52 | "\n", 53 | "#### Output variable (desired target):\n", 54 | "y - has the client subscribed a term deposit? (binary: 'yes','no')\n", 55 | "\n", 56 | "The given data is randomly divided into train and test for the purpose of this workshop. Build the model for train and use it to predict on test. 
\n", 57 | "\n", 58 | "# Explore" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 1, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "#Import the necessary libraries\n", 70 | "import numpy as np\n", 71 | "import pandas as pd" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 2, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "#Read the train and test data\n", 83 | "train = pd.read_csv(\"../data/train.csv\")\n", 84 | "test = pd.read_csv(\"../data/test.csv\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "**Exercise 1**\n", 92 | "\n", 93 | "print the number of rows and columns of train and test" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 16, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "(35211, 17) (10000, 17)\n" 108 | ] 109 | } 110 | ], 111 | "source": [] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "**Exercise 2**\n", 118 | "\n", 119 | "Print the first 10 rows of train" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 4, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/html": [ 132 | "
\n", 133 | "\n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | "
agejobmaritaleducationdefaultbalancehousingloancontactdaymonthdurationcampaignpdayspreviouspoutcomedeposit
058managementmarriedtertiaryno2143yesnounknown5may2611-10unknownno
144techniciansinglesecondaryno29yesnounknown5may1511-10unknownno
233entrepreneurmarriedsecondaryno2yesyesunknown5may761-10unknownno
347blue-collarmarriedunknownno1506yesnounknown5may921-10unknownno
433unknownsingleunknownno1nonounknown5may1981-10unknownno
\n", 259 | "
" 260 | ], 261 | "text/plain": [ 262 | " age job marital education default balance housing loan \\\n", 263 | "0 58 management married tertiary no 2143 yes no \n", 264 | "1 44 technician single secondary no 29 yes no \n", 265 | "2 33 entrepreneur married secondary no 2 yes yes \n", 266 | "3 47 blue-collar married unknown no 1506 yes no \n", 267 | "4 33 unknown single unknown no 1 no no \n", 268 | "\n", 269 | " contact day month duration campaign pdays previous poutcome deposit \n", 270 | "0 unknown 5 may 261 1 -1 0 unknown no \n", 271 | "1 unknown 5 may 151 1 -1 0 unknown no \n", 272 | "2 unknown 5 may 76 1 -1 0 unknown no \n", 273 | "3 unknown 5 may 92 1 -1 0 unknown no \n", 274 | "4 unknown 5 may 198 1 -1 0 unknown no " 275 | ] 276 | }, 277 | "execution_count": 4, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "**Exercise 3**\n", 289 | "\n", 290 | "Print the column types of train and test. Are they the same in both train and test?" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 5, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [ 300 | { 301 | "data": { 302 | "text/plain": [ 303 | "age int64\n", 304 | "job object\n", 305 | "marital object\n", 306 | "education object\n", 307 | "default object\n", 308 | "balance int64\n", 309 | "housing object\n", 310 | "loan object\n", 311 | "contact object\n", 312 | "day int64\n", 313 | "month object\n", 314 | "duration int64\n", 315 | "campaign int64\n", 316 | "pdays int64\n", 317 | "previous int64\n", 318 | "poutcome object\n", 319 | "deposit object\n", 320 | "dtype: object" 321 | ] 322 | }, 323 | "execution_count": 5, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "#train" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 6, 335 | "metadata": { 336 | "collapsed": false 337 | }, 338 | "outputs": [ 339 | { 340 | "data": { 341 | "text/plain": [ 342 | "age int64\n", 343 | "job object\n", 344 | "marital object\n", 345 | "education object\n", 346 | "default object\n", 347 | "balance int64\n", 348 | "housing object\n", 349 | "loan object\n", 350 | "contact object\n", 351 | "day int64\n", 352 | "month object\n", 353 | "duration int64\n", 354 | "campaign int64\n", 355 | "pdays int64\n", 356 | "previous int64\n", 357 | "poutcome object\n", 358 | "deposit object\n", 359 | "dtype: object" 360 | ] 361 | }, 362 | "execution_count": 6, 363 | "metadata": {}, 364 | "output_type": "execute_result" 365 | } 366 | ], 367 | "source": [ 368 | "#test" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 7, 374 | "metadata": { 375 | "collapsed": true 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "#Are they the same?" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 64, 385 | "metadata": { 386 | "collapsed": true 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "#Combine train and test\n", 391 | "frames = [train, test]\n", 392 | "input = pd.concat(frames)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 9, 398 | "metadata": { 399 | "collapsed": false 400 | }, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/html": [ 405 | "
\n", 406 | "\n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | "
agejobmaritaleducationdefaultbalancehousingloancontactdaymonthdurationcampaignpdayspreviouspoutcomedeposit
058managementmarriedtertiaryno2143yesnounknown5may2611-10unknownno
144techniciansinglesecondaryno29yesnounknown5may1511-10unknownno
233entrepreneurmarriedsecondaryno2yesyesunknown5may761-10unknownno
347blue-collarmarriedunknownno1506yesnounknown5may921-10unknownno
433unknownsingleunknownno1nonounknown5may1981-10unknownno
\n", 532 | "
" 533 | ], 534 | "text/plain": [ 535 | " age job marital education default balance housing loan \\\n", 536 | "0 58 management married tertiary no 2143 yes no \n", 537 | "1 44 technician single secondary no 29 yes no \n", 538 | "2 33 entrepreneur married secondary no 2 yes yes \n", 539 | "3 47 blue-collar married unknown no 1506 yes no \n", 540 | "4 33 unknown single unknown no 1 no no \n", 541 | "\n", 542 | " contact day month duration campaign pdays previous poutcome deposit \n", 543 | "0 unknown 5 may 261 1 -1 0 unknown no \n", 544 | "1 unknown 5 may 151 1 -1 0 unknown no \n", 545 | "2 unknown 5 may 76 1 -1 0 unknown no \n", 546 | "3 unknown 5 may 92 1 -1 0 unknown no \n", 547 | "4 unknown 5 may 198 1 -1 0 unknown no " 548 | ] 549 | }, 550 | "execution_count": 9, 551 | "metadata": {}, 552 | "output_type": "execute_result" 553 | } 554 | ], 555 | "source": [ 556 | "#Print first 10 records of input" 557 | ] 558 | }, 559 | { 560 | "cell_type": "markdown", 561 | "metadata": {}, 562 | "source": [ 563 | "**Exercise 4**\n", 564 | "\n", 565 | "Find if any column has missing value\n", 566 | "There is a `pd.isnull` function. How to use that?" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 12, 572 | "metadata": { 573 | "collapsed": false 574 | }, 575 | "outputs": [ 576 | { 577 | "data": { 578 | "text/plain": [ 579 | "age 0\n", 580 | "job 0\n", 581 | "marital 0\n", 582 | "education 0\n", 583 | "default 0\n", 584 | "balance 0\n", 585 | "housing 0\n", 586 | "loan 0\n", 587 | "contact 0\n", 588 | "day 0\n", 589 | "month 0\n", 590 | "duration 0\n", 591 | "campaign 0\n", 592 | "pdays 0\n", 593 | "previous 0\n", 594 | "poutcome 0\n", 595 | "deposit 0\n", 596 | "dtype: int64" 597 | ] 598 | }, 599 | "execution_count": 12, 600 | "metadata": {}, 601 | "output_type": "execute_result" 602 | } 603 | ], 604 | "source": [] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 65, 609 | "metadata": { 610 | "collapsed": false 611 | }, 612 | "outputs": [], 613 | "source": [ 614 | "#Replace deposit with a numeric column\n", 615 | "#First, set all labels to be 0\n", 616 | "input.at[:, \"depositLabel\"] = 0\n", 617 | "#Now, set depositLabel to 1 whenever deposit is yes\n", 618 | "input.at[input.deposit==\"yes\", \"depositLabel\"] = 1" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": { 625 | "collapsed": true 626 | }, 627 | "outputs": [], 628 | "source": [] 629 | }, 630 | { 631 | "cell_type": "markdown", 632 | "metadata": {}, 633 | "source": [ 634 | "**Exercise 5**\n", 635 | "\n", 636 | "Find % of customers in the input dataset who have purchased the term deposit" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 72, 642 | "metadata": { 643 | "collapsed": false 644 | }, 645 | "outputs": [ 646 | { 647 | "data": { 648 | "text/plain": [ 649 | "11.698480458295547" 650 | ] 651 | }, 652 | "execution_count": 72, 653 | "metadata": {}, 654 | "output_type": "execute_result" 655 | } 656 | ], 657 | "source": [] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 75, 662 | "metadata": { 663 | "collapsed": false 664 | }, 665 | "outputs": [ 666 | { 667 | "data": { 668 | "text/plain": [ 669 | "0 0\n", 670 | "1 0\n", 671 | "2 0\n", 672 | "3 0\n", 673 | "4 0\n", 674 | "5 0\n", 675 | "6 0\n", 676 | "7 0\n", 677 | "8 0\n", 678 | "9 0\n", 679 | "10 0\n", 680 | "11 0\n", 681 | "12 0\n", 682 | "13 0\n", 683 | "14 0\n", 684 | "15 0\n", 685 | "16 0\n", 686 | "17 0\n", 687 | "18 0\n", 688 | "19 0\n", 689 | 
"20 0\n", 690 | "21 0\n", 691 | "22 0\n", 692 | "23 0\n", 693 | "24 0\n", 694 | "25 0\n", 695 | "26 0\n", 696 | "27 0\n", 697 | "28 0\n", 698 | "29 0\n", 699 | " ..\n", 700 | "9970 1\n", 701 | "9971 1\n", 702 | "9972 1\n", 703 | "9973 1\n", 704 | "9974 1\n", 705 | "9975 1\n", 706 | "9976 1\n", 707 | "9977 1\n", 708 | "9978 1\n", 709 | "9979 1\n", 710 | "9980 1\n", 711 | "9981 1\n", 712 | "9982 1\n", 713 | "9983 1\n", 714 | "9984 1\n", 715 | "9985 1\n", 716 | "9986 1\n", 717 | "9987 1\n", 718 | "9988 1\n", 719 | "9989 1\n", 720 | "9990 1\n", 721 | "9991 1\n", 722 | "9992 1\n", 723 | "9993 1\n", 724 | "9994 1\n", 725 | "9995 1\n", 726 | "9996 1\n", 727 | "9997 1\n", 728 | "9998 1\n", 729 | "9999 1\n", 730 | "Name: depositLabel, dtype: int64" 731 | ] 732 | }, 733 | "execution_count": 75, 734 | "metadata": {}, 735 | "output_type": "execute_result" 736 | } 737 | ], 738 | "source": [ 739 | "#Create the labels \n", 740 | "labels = \n", 741 | "labels" 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": 83, 747 | "metadata": { 748 | "collapsed": false 749 | }, 750 | "outputs": [], 751 | "source": [ 752 | "#Drop the deposit column \n", 753 | "input.drop([\"deposit\", \"depositLabel\"], axis=1)" 754 | ] 755 | }, 756 | { 757 | "cell_type": "markdown", 758 | "metadata": {}, 759 | "source": [ 760 | "**Exercise 6**\n", 761 | "\n", 762 | "Did it drop? If not, what has to be done?" 763 | ] 764 | }, 765 | { 766 | "cell_type": "markdown", 767 | "metadata": {}, 768 | "source": [ 769 | "**Exercise 7**\n", 770 | "\n", 771 | "Print columnn names of input" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": null, 777 | "metadata": { 778 | "collapsed": true 779 | }, 780 | "outputs": [], 781 | "source": [] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": 85, 786 | "metadata": { 787 | "collapsed": false 788 | }, 789 | "outputs": [], 790 | "source": [ 791 | "#Get list of columns that are continuous/integer\n", 792 | "continuous_variables = input.dtypes[input.dtypes != \"object\"].index" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": 86, 798 | "metadata": { 799 | "collapsed": false 800 | }, 801 | "outputs": [ 802 | { 803 | "data": { 804 | "text/plain": [ 805 | "Index([u'age', u'balance', u'day', u'duration', u'campaign', u'pdays',\n", 806 | " u'previous'],\n", 807 | " dtype='object')" 808 | ] 809 | }, 810 | "execution_count": 86, 811 | "metadata": {}, 812 | "output_type": "execute_result" 813 | } 814 | ], 815 | "source": [ 816 | "continuous_variables" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": 87, 822 | "metadata": { 823 | "collapsed": false 824 | }, 825 | "outputs": [], 826 | "source": [ 827 | "#Get list of columns that are categorical\n", 828 | "categorical_variables = input.dtypes[input.dtypes==\"object\"].index" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": 88, 834 | "metadata": { 835 | "collapsed": false 836 | }, 837 | "outputs": [ 838 | { 839 | "data": { 840 | "text/plain": [ 841 | "Index([u'job', u'marital', u'education', u'default', u'housing', u'loan',\n", 842 | " u'contact', u'month', u'poutcome'],\n", 843 | " dtype='object')" 844 | ] 845 | }, 846 | "execution_count": 88, 847 | "metadata": {}, 848 | "output_type": "execute_result" 849 | } 850 | ], 851 | "source": [ 852 | "categorical_variables" 853 | ] 854 | }, 855 | { 856 | "cell_type": "markdown", 857 | "metadata": {}, 858 | "source": [ 859 | "**Exercise 8**\n", 860 | "\n", 861 | "Create 
`inputInteger` and `inputCategorical` - two datasets - one having integer variables and another having categorical variables" 862 | ] 863 | }, 864 | { 865 | "cell_type": "code", 866 | "execution_count": 89, 867 | "metadata": { 868 | "collapsed": false 869 | }, 870 | "outputs": [], 871 | "source": [ 872 | "inputInteger = " 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "execution_count": 91, 878 | "metadata": { 879 | "collapsed": false 880 | }, 881 | "outputs": [ 882 | { 883 | "data": { 884 | "text/html": [ 885 | "
\n", 886 | "\n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | "
agebalancedaydurationcampaignpdaysprevious
058214352611-10
1442951511-10
23325761-10
34715065921-10
433151981-10
\n", 952 | "
" 953 | ], 954 | "text/plain": [ 955 | " age balance day duration campaign pdays previous\n", 956 | "0 58 2143 5 261 1 -1 0\n", 957 | "1 44 29 5 151 1 -1 0\n", 958 | "2 33 2 5 76 1 -1 0\n", 959 | "3 47 1506 5 92 1 -1 0\n", 960 | "4 33 1 5 198 1 -1 0" 961 | ] 962 | }, 963 | "execution_count": 91, 964 | "metadata": {}, 965 | "output_type": "execute_result" 966 | } 967 | ], 968 | "source": [ 969 | "#print inputInteger\n", 970 | "inputInteger.head()" 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": 93, 976 | "metadata": { 977 | "collapsed": false 978 | }, 979 | "outputs": [], 980 | "source": [ 981 | "inputCategorical = " 982 | ] 983 | }, 984 | { 985 | "cell_type": "code", 986 | "execution_count": 94, 987 | "metadata": { 988 | "collapsed": false 989 | }, 990 | "outputs": [ 991 | { 992 | "data": { 993 | "text/html": [ 994 | "
\n", 995 | "\n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | "
jobmaritaleducationdefaulthousingloancontactmonthpoutcome
0managementmarriedtertiarynoyesnounknownmayunknown
1techniciansinglesecondarynoyesnounknownmayunknown
2entrepreneurmarriedsecondarynoyesyesunknownmayunknown
3blue-collarmarriedunknownnoyesnounknownmayunknown
4unknownsingleunknownnononounknownmayunknown
\n", 1073 | "
" 1074 | ], 1075 | "text/plain": [ 1076 | " job marital education default housing loan contact month \\\n", 1077 | "0 management married tertiary no yes no unknown may \n", 1078 | "1 technician single secondary no yes no unknown may \n", 1079 | "2 entrepreneur married secondary no yes yes unknown may \n", 1080 | "3 blue-collar married unknown no yes no unknown may \n", 1081 | "4 unknown single unknown no no no unknown may \n", 1082 | "\n", 1083 | " poutcome \n", 1084 | "0 unknown \n", 1085 | "1 unknown \n", 1086 | "2 unknown \n", 1087 | "3 unknown \n", 1088 | "4 unknown " 1089 | ] 1090 | }, 1091 | "execution_count": 94, 1092 | "metadata": {}, 1093 | "output_type": "execute_result" 1094 | } 1095 | ], 1096 | "source": [ 1097 | "#print inputCategorical\n", 1098 | "inputCategorical.head()" 1099 | ] 1100 | }, 1101 | { 1102 | "cell_type": "code", 1103 | "execution_count": 101, 1104 | "metadata": { 1105 | "collapsed": true 1106 | }, 1107 | "outputs": [], 1108 | "source": [ 1109 | "#Convert categorical variables into Labels using labelEncoder\n", 1110 | "\n", 1111 | "inputCategorical = np.array(inputCategorical)\n" 1112 | ] 1113 | }, 1114 | { 1115 | "cell_type": "markdown", 1116 | "metadata": {}, 1117 | "source": [ 1118 | "**Exercise 9**\n", 1119 | "\n", 1120 | "Find length of `categorical_variables`" 1121 | ] 1122 | }, 1123 | { 1124 | "cell_type": "code", 1125 | "execution_count": 102, 1126 | "metadata": { 1127 | "collapsed": false 1128 | }, 1129 | "outputs": [ 1130 | { 1131 | "data": { 1132 | "text/plain": [ 1133 | "9" 1134 | ] 1135 | }, 1136 | "execution_count": 102, 1137 | "metadata": {}, 1138 | "output_type": "execute_result" 1139 | } 1140 | ], 1141 | "source": [] 1142 | }, 1143 | { 1144 | "cell_type": "code", 1145 | "execution_count": 119, 1146 | "metadata": { 1147 | "collapsed": true 1148 | }, 1149 | "outputs": [], 1150 | "source": [ 1151 | "#Load the preprocessing module\n", 1152 | "from sklearn import preprocessing" 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "execution_count": 103, 1158 | "metadata": { 1159 | "collapsed": true 1160 | }, 1161 | "outputs": [], 1162 | "source": [ 1163 | "for i in range(len(categorical_variables)):\n", 1164 | " lbl = preprocessing.LabelEncoder()\n", 1165 | " lbl.fit(list(inputCategorical[:,i]))\n", 1166 | " inputCategorical[:, i] = lbl.transform(inputCategorical[:, i])" 1167 | ] 1168 | }, 1169 | { 1170 | "cell_type": "code", 1171 | "execution_count": 105, 1172 | "metadata": { 1173 | "collapsed": true 1174 | }, 1175 | "outputs": [], 1176 | "source": [ 1177 | "#print inputCategorical" 1178 | ] 1179 | }, 1180 | { 1181 | "cell_type": "markdown", 1182 | "metadata": {}, 1183 | "source": [ 1184 | "**Exercise 10**\n", 1185 | "\n", 1186 | "Convert `inputInteger` to `numpy` array" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "code", 1191 | "execution_count": 107, 1192 | "metadata": { 1193 | "collapsed": false 1194 | }, 1195 | "outputs": [ 1196 | { 1197 | "data": { 1198 | "text/plain": [ 1199 | "array([[ 58, 2143, 5, ..., 1, -1, 0],\n", 1200 | " [ 44, 29, 5, ..., 1, -1, 0],\n", 1201 | " [ 33, 2, 5, ..., 1, -1, 0],\n", 1202 | " ..., \n", 1203 | " [ 69, 247, 22, ..., 2, -1, 0],\n", 1204 | " [ 48, 0, 28, ..., 2, -1, 0],\n", 1205 | " [ 31, 131, 15, ..., 1, -1, 0]])" 1206 | ] 1207 | }, 1208 | "execution_count": 107, 1209 | "metadata": {}, 1210 | "output_type": "execute_result" 1211 | } 1212 | ], 1213 | "source": [ 1214 | "inputInteger = \n", 1215 | "inputInteger" 1216 | ] 1217 | }, 1218 | { 1219 | "cell_type": "markdown", 1220 | "metadata": {}, 1221 | 
"source": [ 1222 | "**Exercise 11**\n", 1223 | "\n", 1224 | "Now, create the `inputUpdated` array that has both `inputInteger` and `inputCategorical` concatenated\n", 1225 | "\n", 1226 | "*Hint* Check function called `vstack` and `hstack`" 1227 | ] 1228 | }, 1229 | { 1230 | "cell_type": "code", 1231 | "execution_count": null, 1232 | "metadata": { 1233 | "collapsed": false 1234 | }, 1235 | "outputs": [], 1236 | "source": [] 1237 | }, 1238 | { 1239 | "cell_type": "code", 1240 | "execution_count": 118, 1241 | "metadata": { 1242 | "collapsed": false 1243 | }, 1244 | "outputs": [ 1245 | { 1246 | "data": { 1247 | "text/plain": [ 1248 | "(45211, 16)" 1249 | ] 1250 | }, 1251 | "execution_count": 118, 1252 | "metadata": {}, 1253 | "output_type": "execute_result" 1254 | } 1255 | ], 1256 | "source": [ 1257 | "inputUpdated.shape" 1258 | ] 1259 | }, 1260 | { 1261 | "cell_type": "markdown", 1262 | "metadata": {}, 1263 | "source": [ 1264 | "## Train the model\n", 1265 | "\n", 1266 | "### Model 1: Decision Tree" 1267 | ] 1268 | }, 1269 | { 1270 | "cell_type": "code", 1271 | "execution_count": 125, 1272 | "metadata": { 1273 | "collapsed": true 1274 | }, 1275 | "outputs": [], 1276 | "source": [ 1277 | "from sklearn import tree\n", 1278 | "from sklearn.externals.six import StringIO\n", 1279 | "import pydot" 1280 | ] 1281 | }, 1282 | { 1283 | "cell_type": "code", 1284 | "execution_count": 126, 1285 | "metadata": { 1286 | "collapsed": true 1287 | }, 1288 | "outputs": [], 1289 | "source": [ 1290 | "bankModelDT = tree.DecisionTreeClassifier(max_depth=2)" 1291 | ] 1292 | }, 1293 | { 1294 | "cell_type": "code", 1295 | "execution_count": 127, 1296 | "metadata": { 1297 | "collapsed": false 1298 | }, 1299 | "outputs": [ 1300 | { 1301 | "data": { 1302 | "text/plain": [ 1303 | "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,\n", 1304 | " max_features=None, max_leaf_nodes=None, min_samples_leaf=1,\n", 1305 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 1306 | " presort=False, random_state=None, splitter='best')" 1307 | ] 1308 | }, 1309 | "execution_count": 127, 1310 | "metadata": {}, 1311 | "output_type": "execute_result" 1312 | } 1313 | ], 1314 | "source": [ 1315 | "bankModelDT.fit(inputUpdated[:train.shape[0],:], labels[:train.shape[0]])" 1316 | ] 1317 | }, 1318 | { 1319 | "cell_type": "code", 1320 | "execution_count": 128, 1321 | "metadata": { 1322 | "collapsed": false 1323 | }, 1324 | "outputs": [ 1325 | { 1326 | "data": { 1327 | "text/plain": [ 1328 | "True" 1329 | ] 1330 | }, 1331 | "execution_count": 128, 1332 | "metadata": {}, 1333 | "output_type": "execute_result" 1334 | } 1335 | ], 1336 | "source": [ 1337 | "dot_data = StringIO() \n", 1338 | "tree.export_graphviz(bankModelDT, out_file=dot_data) \n", 1339 | "graph = pydot.graph_from_dot_data(dot_data.getvalue()) \n", 1340 | "graph.write_pdf(\"bankDT.pdf\") " 1341 | ] 1342 | }, 1343 | { 1344 | "cell_type": "code", 1345 | "execution_count": 129, 1346 | "metadata": { 1347 | "collapsed": true 1348 | }, 1349 | "outputs": [], 1350 | "source": [ 1351 | "#Check the pdf" 1352 | ] 1353 | }, 1354 | { 1355 | "cell_type": "markdown", 1356 | "metadata": {}, 1357 | "source": [ 1358 | "**Exercise 12**\n", 1359 | "\n", 1360 | "Now, change the max_depth = 6 and check the results.\n", 1361 | "\n", 1362 | "Then, change the max_depth= None and check the results" 1363 | ] 1364 | }, 1365 | { 1366 | "cell_type": "code", 1367 | "execution_count": null, 1368 | "metadata": { 1369 | "collapsed": false 1370 | }, 1371 | "outputs": [], 1372 | "source": [] 
1373 | }, 1374 | { 1375 | "cell_type": "code", 1376 | "execution_count": 144, 1377 | "metadata": { 1378 | "collapsed": true 1379 | }, 1380 | "outputs": [], 1381 | "source": [ 1382 | "# Prediction\n", 1383 | "prediction_DT = bankModelDT.predict(inputUpdated[train.shape[0]:,:])" 1384 | ] 1385 | }, 1386 | { 1387 | "cell_type": "code", 1388 | "execution_count": 133, 1389 | "metadata": { 1390 | "collapsed": true 1391 | }, 1392 | "outputs": [], 1393 | "source": [ 1394 | "#Compute the error metrics" 1395 | ] 1396 | }, 1397 | { 1398 | "cell_type": "code", 1399 | "execution_count": 134, 1400 | "metadata": { 1401 | "collapsed": true 1402 | }, 1403 | "outputs": [], 1404 | "source": [ 1405 | "import sklearn.metrics" 1406 | ] 1407 | }, 1408 | { 1409 | "cell_type": "code", 1410 | "execution_count": 135, 1411 | "metadata": { 1412 | "collapsed": false 1413 | }, 1414 | "outputs": [ 1415 | { 1416 | "data": { 1417 | "text/plain": [ 1418 | "0.5" 1419 | ] 1420 | }, 1421 | "execution_count": 135, 1422 | "metadata": {}, 1423 | "output_type": "execute_result" 1424 | } 1425 | ], 1426 | "source": [ 1427 | "sklearn.metrics.auc(labels[train.shape[0]:], prediction_DT)" 1428 | ] 1429 | }, 1430 | { 1431 | "cell_type": "code", 1432 | "execution_count": 136, 1433 | "metadata": { 1434 | "collapsed": true 1435 | }, 1436 | "outputs": [], 1437 | "source": [ 1438 | "#What does that tell?" 1439 | ] 1440 | }, 1441 | { 1442 | "cell_type": "code", 1443 | "execution_count": 137, 1444 | "metadata": { 1445 | "collapsed": true 1446 | }, 1447 | "outputs": [], 1448 | "source": [ 1449 | "#What's the error AUC for the other Decision Tree Models" 1450 | ] 1451 | }, 1452 | { 1453 | "cell_type": "markdown", 1454 | "metadata": {}, 1455 | "source": [ 1456 | "**Exercise 13**\n", 1457 | "\n", 1458 | "Instead of predicting classes directly, predict the probability and check the `auc`" 1459 | ] 1460 | }, 1461 | { 1462 | "cell_type": "code", 1463 | "execution_count": null, 1464 | "metadata": { 1465 | "collapsed": true 1466 | }, 1467 | "outputs": [], 1468 | "source": [] 1469 | }, 1470 | { 1471 | "cell_type": "code", 1472 | "execution_count": 142, 1473 | "metadata": { 1474 | "collapsed": false 1475 | }, 1476 | "outputs": [ 1477 | { 1478 | "data": { 1479 | "text/plain": [ 1480 | "0.54849867669154428" 1481 | ] 1482 | }, 1483 | "execution_count": 142, 1484 | "metadata": {}, 1485 | "output_type": "execute_result" 1486 | } 1487 | ], 1488 | "source": [ 1489 | "sklearn.metrics.auc(labels[train.shape[0]:], prediction_DT[:,0])" 1490 | ] 1491 | }, 1492 | { 1493 | "cell_type": "markdown", 1494 | "metadata": {}, 1495 | "source": [ 1496 | "### Accuracy Metrics\n", 1497 | "\n", 1498 | "* AUC\n", 1499 | "* ROC\n", 1500 | "* Misclassification Rate\n", 1501 | "* Confusion Matrix\n", 1502 | "* Precision & Recall\n", 1503 | "\n", 1504 | "#### Confusion Matrix\n", 1505 | "\n", 1506 | "\n", 1507 | "\n", 1508 | "\n", 1509 | "#### Calculate True Positive Rate\n", 1510 | " TPR = TP / (TP+FN)\n", 1511 | "\n", 1512 | "#### Calculate False Positive Rate\n", 1513 | " FPR = FP / (FP+TN)\n", 1514 | " \n", 1515 | "#### Precision\n", 1516 | "\n", 1517 | "\n", 1518 | "#### Recall\n", 1519 | "\n", 1520 | "\n" 1521 | ] 1522 | }, 1523 | { 1524 | "cell_type": "code", 1525 | "execution_count": 147, 1526 | "metadata": { 1527 | "collapsed": true 1528 | }, 1529 | "outputs": [], 1530 | "source": [ 1531 | "#Precision and Recall" 1532 | ] 1533 | }, 1534 | { 1535 | "cell_type": "code", 1536 | "execution_count": 145, 1537 | "metadata": { 1538 | "collapsed": false 1539 | }, 1540 | "outputs": [ 
1541 | { 1542 | "data": { 1543 | "text/plain": [ 1544 | "0.57177033492822971" 1545 | ] 1546 | }, 1547 | "execution_count": 145, 1548 | "metadata": {}, 1549 | "output_type": "execute_result" 1550 | } 1551 | ], 1552 | "source": [ 1553 | "sklearn.metrics.precision_score(labels[train.shape[0]:], prediction_DT)" 1554 | ] 1555 | }, 1556 | { 1557 | "cell_type": "code", 1558 | "execution_count": 146, 1559 | "metadata": { 1560 | "collapsed": false 1561 | }, 1562 | "outputs": [ 1563 | { 1564 | "data": { 1565 | "text/plain": [ 1566 | "0.20427350427350427" 1567 | ] 1568 | }, 1569 | "execution_count": 146, 1570 | "metadata": {}, 1571 | "output_type": "execute_result" 1572 | } 1573 | ], 1574 | "source": [ 1575 | "sklearn.metrics.recall_score(labels[train.shape[0]:], prediction_DT)" 1576 | ] 1577 | }, 1578 | { 1579 | "cell_type": "markdown", 1580 | "metadata": {}, 1581 | "source": [ 1582 | "# Ensemble Trees\n", 1583 | "\n", 1584 | "\n", 1585 | "\n", 1586 | "\n", 1587 | "
\n", 1588 | "
\n", 1589 | "
\n", 1590 | "
\n", 1591 | "
\n", 1592 | "
\n", 1593 | "\n", 1594 | "\n", 1595 | "\n", 1596 | "\n", 1597 | "\n", 1598 | "*src*: http://www.slideshare.net/hustwj/scaling-up-machine-learning-the-tutorial-kdd-2011-part-iia-tree-ensembles" 1599 | ] 1600 | }, 1601 | { 1602 | "cell_type": "markdown", 1603 | "metadata": {}, 1604 | "source": [ 1605 | "# Random Forest" 1606 | ] 1607 | }, 1608 | { 1609 | "cell_type": "markdown", 1610 | "metadata": {}, 1611 | "source": [ 1612 | "\n", 1613 | "\n", 1614 | "\n", 1615 | "\n", 1616 | "*src*: http://www.slideshare.net/0xdata/jan-vitek-distributedrandomforest522013" 1617 | ] 1618 | }, 1619 | { 1620 | "cell_type": "code", 1621 | "execution_count": 148, 1622 | "metadata": { 1623 | "collapsed": true 1624 | }, 1625 | "outputs": [], 1626 | "source": [ 1627 | "from sklearn.ensemble import RandomForestClassifier" 1628 | ] 1629 | }, 1630 | { 1631 | "cell_type": "code", 1632 | "execution_count": 157, 1633 | "metadata": { 1634 | "collapsed": true 1635 | }, 1636 | "outputs": [], 1637 | "source": [ 1638 | "bankModelRF = RandomForestClassifier(n_jobs=-1, oob_score=True)" 1639 | ] 1640 | }, 1641 | { 1642 | "cell_type": "code", 1643 | "execution_count": 158, 1644 | "metadata": { 1645 | "collapsed": false 1646 | }, 1647 | "outputs": [ 1648 | { 1649 | "data": { 1650 | "text/plain": [ 1651 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 1652 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 1653 | " min_samples_leaf=1, min_samples_split=2,\n", 1654 | " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,\n", 1655 | " oob_score=False, random_state=None, verbose=0,\n", 1656 | " warm_start=False)" 1657 | ] 1658 | }, 1659 | "execution_count": 158, 1660 | "metadata": {}, 1661 | "output_type": "execute_result" 1662 | } 1663 | ], 1664 | "source": [ 1665 | "bankModelRF.fit(inputUpdated[:train.shape[0],:], labels[:train.shape[0]])" 1666 | ] 1667 | }, 1668 | { 1669 | "cell_type": "code", 1670 | "execution_count": 156, 1671 | "metadata": { 1672 | "collapsed": false 1673 | }, 1674 | "outputs": [ 1675 | { 1676 | "data": { 1677 | "text/plain": [ 1678 | "0.89128397375820057" 1679 | ] 1680 | }, 1681 | "execution_count": 156, 1682 | "metadata": {}, 1683 | "output_type": "execute_result" 1684 | } 1685 | ], 1686 | "source": [ 1687 | "bankModelRF.oob_score_" 1688 | ] 1689 | }, 1690 | { 1691 | "cell_type": "markdown", 1692 | "metadata": {}, 1693 | "source": [ 1694 | "**Exercise 14**\n", 1695 | "\n", 1696 | "Do the following\n", 1697 | "\n", 1698 | "1. Predict on test\n", 1699 | "2. Find accuracy metrics: AUC, Precision, Recall \n", 1700 | "3. 
How does it compare against Decision Tree" 1701 | ] 1702 | }, 1703 | { 1704 | "cell_type": "code", 1705 | "execution_count": null, 1706 | "metadata": { 1707 | "collapsed": true 1708 | }, 1709 | "outputs": [], 1710 | "source": [] 1711 | }, 1712 | { 1713 | "cell_type": "markdown", 1714 | "metadata": {}, 1715 | "source": [ 1716 | "# Gradient Boosting Machines" 1717 | ] 1718 | }, 1719 | { 1720 | "cell_type": "markdown", 1721 | "metadata": {}, 1722 | "source": [ 1723 | "\n", 1724 | "\n", 1725 | "\n", 1726 | "\n", 1727 | "*src*: http://www.slideshare.net/hustwj/scaling-up-machine-learning-the-tutorial-kdd-2011-part-iia-tree-ensembles" 1728 | ] 1729 | }, 1730 | { 1731 | "cell_type": "code", 1732 | "execution_count": 160, 1733 | "metadata": { 1734 | "collapsed": true 1735 | }, 1736 | "outputs": [], 1737 | "source": [ 1738 | "import xgboost as xgb" 1739 | ] 1740 | }, 1741 | { 1742 | "cell_type": "code", 1743 | "execution_count": 176, 1744 | "metadata": { 1745 | "collapsed": true 1746 | }, 1747 | "outputs": [], 1748 | "source": [ 1749 | "params = {}\n", 1750 | "params[\"min_child_weight\"] = 3\n", 1751 | "params[\"subsample\"] = 0.7\n", 1752 | "params[\"colsample_bytree\"] = 0.7\n", 1753 | "params[\"scale_pos_weight\"] = 1\n", 1754 | "params[\"silent\"] = 0\n", 1755 | "params[\"max_depth\"] = 4\n", 1756 | "params[\"nthread\"] = 6\n", 1757 | "params[\"gamma\"] = 1\n", 1758 | "params[\"objective\"] = \"binary:logistic\"\n", 1759 | "params[\"eta\"] = 0.005\n", 1760 | "params[\"base_score\"] = 0.1\n", 1761 | "params[\"eval_metric\"] = \"auc\"\n", 1762 | "params[\"seed\"] = 123" 1763 | ] 1764 | }, 1765 | { 1766 | "cell_type": "code", 1767 | "execution_count": 177, 1768 | "metadata": { 1769 | "collapsed": true 1770 | }, 1771 | "outputs": [], 1772 | "source": [ 1773 | "plst = list(params.items())\n", 1774 | "num_rounds = 120" 1775 | ] 1776 | }, 1777 | { 1778 | "cell_type": "code", 1779 | "execution_count": 178, 1780 | "metadata": { 1781 | "collapsed": true 1782 | }, 1783 | "outputs": [], 1784 | "source": [ 1785 | "xgtrain_pv = xgb.DMatrix(inputUpdated[:train.shape[0],:], label=labels[:train.shape[0]])\n", 1786 | "watchlist = [(xgtrain_pv, 'train')]\n", 1787 | "bankModelXGB = xgb.train(plst, xgtrain_pv, num_rounds)" 1788 | ] 1789 | }, 1790 | { 1791 | "cell_type": "code", 1792 | "execution_count": 179, 1793 | "metadata": { 1794 | "collapsed": true 1795 | }, 1796 | "outputs": [], 1797 | "source": [ 1798 | "prediction_XGB = bankModelXGB.predict(xgb.DMatrix(inputUpdated[train.shape[0]:,:]))" 1799 | ] 1800 | }, 1801 | { 1802 | "cell_type": "code", 1803 | "execution_count": 180, 1804 | "metadata": { 1805 | "collapsed": false 1806 | }, 1807 | "outputs": [ 1808 | { 1809 | "data": { 1810 | "text/plain": [ 1811 | "0.19817152619361877" 1812 | ] 1813 | }, 1814 | "execution_count": 180, 1815 | "metadata": {}, 1816 | "output_type": "execute_result" 1817 | } 1818 | ], 1819 | "source": [ 1820 | "sklearn.metrics.auc(labels[train.shape[0]:], prediction_XGB)" 1821 | ] 1822 | }, 1823 | { 1824 | "cell_type": "markdown", 1825 | "metadata": {}, 1826 | "source": [ 1827 | "# Another way of encoding" 1828 | ] 1829 | }, 1830 | { 1831 | "cell_type": "markdown", 1832 | "metadata": {}, 1833 | "source": [ 1834 | "### One Hot Encoding\n", 1835 | "\n", 1836 | "\n", 1837 | "\n", 1838 | "\n", 1839 | "Whiteboard ! 
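\n",
    "\n",
    "A tiny illustration (with hypothetical values) of what one-hot encoding does to a single categorical column: each level becomes its own 0/1 indicator column.\n",
    "\n",
    "```\n",
    "marital      marital_divorced  marital_married  marital_single\n",
    "married             0                 1                0\n",
    "single              0                 0                1\n",
    "divorced            1                 0                0\n",
    "```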
" 1840 | ] 1841 | }, 1842 | { 1843 | "cell_type": "code", 1844 | "execution_count": 175, 1845 | "metadata": { 1846 | "collapsed": true 1847 | }, 1848 | "outputs": [], 1849 | "source": [ 1850 | "inputOneHot = pd.get_dummies(input)" 1851 | ] 1852 | }, 1853 | { 1854 | "cell_type": "markdown", 1855 | "metadata": {}, 1856 | "source": [ 1857 | "**Exercise 15**\n", 1858 | "\n", 1859 | "On the one hot encoded data, train\n", 1860 | "\n", 1861 | "1. Decision Tree\n", 1862 | "2. Random Forest\n", 1863 | "3. xgboost\n", 1864 | "\n", 1865 | "Which one works best on the test dataset?" 1866 | ] 1867 | }, 1868 | { 1869 | "cell_type": "code", 1870 | "execution_count": null, 1871 | "metadata": { 1872 | "collapsed": true 1873 | }, 1874 | "outputs": [], 1875 | "source": [] 1876 | } 1877 | ], 1878 | "metadata": { 1879 | "kernelspec": { 1880 | "display_name": "Python 2", 1881 | "language": "python", 1882 | "name": "python2" 1883 | }, 1884 | "language_info": { 1885 | "codemirror_mode": { 1886 | "name": "ipython", 1887 | "version": 2 1888 | }, 1889 | "file_extension": ".py", 1890 | "mimetype": "text/x-python", 1891 | "name": "python", 1892 | "nbconvert_exporter": "python", 1893 | "pygments_lexer": "ipython2", 1894 | "version": "2.7.11" 1895 | } 1896 | }, 1897 | "nbformat": 4, 1898 | "nbformat_minor": 0 1899 | } 1900 | -------------------------------------------------------------------------------- /RF_GBM/notebook/img/boosting.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/RF_GBM/notebook/img/boosting.jpg -------------------------------------------------------------------------------- /RF_GBM/notebook/img/confusion_matrix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/RF_GBM/notebook/img/confusion_matrix.jpg -------------------------------------------------------------------------------- /RF_GBM/notebook/img/cv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/RF_GBM/notebook/img/cv.png -------------------------------------------------------------------------------- /RF_GBM/notebook/img/onehot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/RF_GBM/notebook/img/onehot.png -------------------------------------------------------------------------------- /RF_GBM/notebook/img/random_forest.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/RF_GBM/notebook/img/random_forest.jpg -------------------------------------------------------------------------------- /RF_GBM/notebook/img/tree_ensemble1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/RF_GBM/notebook/img/tree_ensemble1.png -------------------------------------------------------------------------------- /RF_GBM/notebook/img/tree_ensemble2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/RF_GBM/notebook/img/tree_ensemble2.png -------------------------------------------------------------------------------- /cf_mba/notebook/img/basket.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/cf_mba/notebook/img/basket.jpg -------------------------------------------------------------------------------- /cf_mba/notebook/img/cosine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/cf_mba/notebook/img/cosine.png -------------------------------------------------------------------------------- /check_env.py: -------------------------------------------------------------------------------- 1 | # Authors: Amit Kapoor and Bargava Subramanian 2 | # Copyright (c) 2016 Amit Kapoor 3 | # License: MIT License 4 | 5 | """ 6 | This script will check if the environment setup is correct for the workshop. 7 | 8 | To run, please execute the following command from the command prompt 9 | >>> python check_env.py 10 | 11 | The output will indicate if any of the libraries are missing or need to be updated. 12 | 13 | This script is inspired from https://github.com/fonnesbeck/scipy2015_tutorial/blob/master/check_env.py 14 | """ 15 | 16 | from __future__ import print_function 17 | 18 | try: 19 | import curses 20 | curses.setupterm() 21 | assert curses.tigetnum("colors") > 2 22 | OK = "\x1b[1;%dm[ OK ]\x1b[0m" % (30 + curses.COLOR_GREEN) 23 | FAIL = "\x1b[1;%dm[FAIL]\x1b[0m" % (30 + curses.COLOR_RED) 24 | except: 25 | OK = '[ OK ]' 26 | FAIL = '[FAIL]' 27 | 28 | import sys 29 | try: 30 | import importlib 31 | except ImportError: 32 | print(FAIL, "Python version 2.7 is required, but %s is installed." % sys.version) 33 | from distutils.version import LooseVersion as Version 34 | 35 | def import_version(pkg, min_ver, fail_msg=""): 36 | mod = None 37 | try: 38 | mod = importlib.import_module(pkg) 39 | if((pkg=="spacy" or pkg=="wordcloud") and (mod > 0)): 40 | print(OK, '%s ' % (pkg)) 41 | else: 42 | #else: 43 | version = getattr(mod, "__version__", 0) or getattr(mod, "VERSION", 0) 44 | if Version(version) < min_ver: 45 | print(FAIL, "%s version %s or higher required, but %s installed." 46 | % (lib, min_ver, version)) 47 | else: 48 | print(OK, '%s version %s' % (pkg, version)) 49 | except ImportError: 50 | print(FAIL, '%s not installed. %s' % (pkg, fail_msg)) 51 | return mod 52 | 53 | 54 | # first check the python version 55 | print('Using python in', sys.prefix) 56 | print(sys.version) 57 | pyversion = Version(sys.version) 58 | if pyversion < "3": 59 | print(FAIL, "Python version 3 is required, but %s is installed." % sys.version) 60 | elif pyversion >= "2": 61 | if pyversion == "2.7": 62 | print(FAIL, "Python version 2.7 is installed. Please upgrade to version 3." 
) 63 | else: 64 | print(FAIL, "Unknown Python version: %s" % sys.version) 65 | 66 | print() 67 | requirements = { 68 | 'gensim' :'0.12.4', 69 | 'IPython' : '4.0.3', 70 | 'jupyter' :'1.0.0', 71 | 'lda' : '1.0.3', 72 | 'networkx' : '1.11', 73 | 'nltk' : '3.1', 74 | 'matplotlib' :'1.5.0', 75 | 'nltk' : '3.1', 76 | 'numpy' : '1.10.4', 77 | 'pandas' : '0.17.1', 78 | 'PIL' : '1.1.7', 79 | 'scipy' : '0.17.0', 80 | 'sklearn' : '0.17', 81 | 'seaborn' :'0.6.0', 82 | 'spacy' :'0.100.6', 83 | 'statsmodels':'0.6.1', 84 | 'wordcloud' :'0.1', 85 | 'xgboost' :'0.4' 86 | 87 | } 88 | 89 | # now the dependencies 90 | for lib, required_version in list(requirements.items()): 91 | import_version(lib, required_version) 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /img/ISLR.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/ISLR.jpeg -------------------------------------------------------------------------------- /img/acquire.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/acquire.jpg -------------------------------------------------------------------------------- /img/amit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/amit.png -------------------------------------------------------------------------------- /img/approach.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/approach.jpg -------------------------------------------------------------------------------- /img/art.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/art.jpeg -------------------------------------------------------------------------------- /img/bargava.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/bargava.jpg -------------------------------------------------------------------------------- /img/book.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/book.png -------------------------------------------------------------------------------- /img/books.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/books.jpg -------------------------------------------------------------------------------- /img/break.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/break.jpg -------------------------------------------------------------------------------- /img/clay.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/clay.jpeg -------------------------------------------------------------------------------- /img/craft.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/craft.jpeg -------------------------------------------------------------------------------- /img/estimating_coefficients.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/estimating_coefficients.png -------------------------------------------------------------------------------- /img/explore.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/explore.jpg -------------------------------------------------------------------------------- /img/frame.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/frame.jpg -------------------------------------------------------------------------------- /img/glass.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/glass.jpg -------------------------------------------------------------------------------- /img/insight.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/insight.jpg -------------------------------------------------------------------------------- /img/lens.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/lens.jpeg -------------------------------------------------------------------------------- /img/model.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/model.jpg -------------------------------------------------------------------------------- /img/numbers.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/numbers.jpg -------------------------------------------------------------------------------- /img/onion-image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/onion-image.jpg -------------------------------------------------------------------------------- /img/onion.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/onion.jpg -------------------------------------------------------------------------------- /img/onion.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/onion.png -------------------------------------------------------------------------------- /img/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/overview.jpg -------------------------------------------------------------------------------- /img/pair.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/pair.jpg -------------------------------------------------------------------------------- /img/postit.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/postit.jpg -------------------------------------------------------------------------------- /img/r2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/r2.gif -------------------------------------------------------------------------------- /img/r_squared.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/r_squared.png -------------------------------------------------------------------------------- /img/refine.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/refine.jpg -------------------------------------------------------------------------------- /img/retail.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/retail.jpg -------------------------------------------------------------------------------- /img/science.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/science.jpeg -------------------------------------------------------------------------------- /img/see.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/see.jpeg -------------------------------------------------------------------------------- /img/single.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/single.jpeg -------------------------------------------------------------------------------- /img/skills.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/skills.png -------------------------------------------------------------------------------- /img/slope_intercept.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/slope_intercept.png -------------------------------------------------------------------------------- /img/speak.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/speak.jpeg -------------------------------------------------------------------------------- /img/sports.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/sports.jpg -------------------------------------------------------------------------------- /img/stars.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/stars.jpg -------------------------------------------------------------------------------- /img/think.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/think.jpg -------------------------------------------------------------------------------- /img/thinkstats.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/thinkstats.jpg -------------------------------------------------------------------------------- /img/time.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/time.jpg -------------------------------------------------------------------------------- /img/tool.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/tool.jpg -------------------------------------------------------------------------------- /img/travel.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/travel.jpg -------------------------------------------------------------------------------- /img/welcome.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/welcome.jpg -------------------------------------------------------------------------------- /img/wesmckinney.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/wesmckinney.jpg -------------------------------------------------------------------------------- /img/workshop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/workshop.jpg -------------------------------------------------------------------------------- 
/installation_instructions.md: -------------------------------------------------------------------------------- 1 | # Installation Instructions for the workshop 2 | 3 | 4 | ### Package Manager: Anaconda 5 | 6 | We strongly recommend using Anaconda. It can be downloaded from here: 7 | https://www.continuum.io/downloads 8 | 9 | It comes with `jupyter notebook` which is the IDE we will be using for the workshop 10 | 11 | We recommend using the Python 3.5 version. 12 | 13 | ### Required packages 14 | 15 | Run the following script at the command prompt to check if you have all the requisite packages installed. 16 | To run, please execute the following command from the command prompt 17 | 18 | $ python check_env.py 19 | 20 | The output will indicate if any of the libraries are missing or need to be updated. 21 | 22 | Any package that is missing can be installed by running the command at the command prompt 23 | 24 | $ pip install 25 | 26 | Any package that needs to be upgraded can be upgraded by running the command at the command prompt 27 | 28 | $ pip install --upgrade 29 | 30 | 31 | Replace <*package_name*> with the package that needs to be installed/upgraded. 32 | 33 | After all the packages are installed, please run the following two commands 34 | 35 | 1. Install all the corpora for the `nltk` module. Please be warned that this is a huge file and can take a while. Please refer http://www.nltk.org/data.html for further information on what it downloads 36 | 37 | `$ python -m nltk.downloader all` 38 | 39 | 2. Install all the corpora for the `spacy` module. Please be warned that this is a huge file and can take a while 40 | 41 | `$ python -m spacy.en.download` 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /overview.md: -------------------------------------------------------------------------------- 1 | ![](img/workshop.jpg) 2 | # Intro to Data Science and Machine Learning 3 | ### @amitkaps | @bargava 4 | 5 | --- 6 | 7 | ![](img/welcome.jpg) 8 | # Welcome 9 | 10 | --- 11 | 12 | # Facilitators 13 | ![](img/amit.png) 14 | ![](img/bargava.jpg) 15 | 16 | --- 17 | 18 | # Amit 19 | ## @amitkaps 20 | ![](img/amit.png) 21 | 22 | --- 23 | 24 | # Bargava 25 | ## @bargava 26 | ![](img/bargava.jpg) 27 | 28 | 29 | --- 30 | 31 | ![](img/lens.jpeg) 32 | # See the world through a data lens 33 | 34 | --- 35 | 36 | ![](img/see.jpeg) 37 | # "Data is just a clue to the end truth" 38 | -- Josh Smith 39 | 40 | --- 41 | 42 | ![](img/sports.jpg) 43 | ![](img/travel.jpg) 44 | ![](img/retail.jpg) 45 | # Data Driven Decisions 46 | 47 | --- 48 | 49 | ![](img/science.jpeg) 50 | # "Science is knowledge which we understand so well that we can teach it to a computer. Everything else is art" 51 | -- Donald Knuth 52 | 53 | --- 54 | 55 | ![](img/art.jpeg) 56 | # Data Science is an Art 57 | 58 | --- 59 | 60 | ![](img/glass.jpg) 61 | # Hypothesis Driven Approach 62 | 63 | --- 64 | 65 | ![](img/frame.jpg) 66 | # Frame 67 | ## "An approximate answer to the right problem is worth a good deal" 68 | 69 | --- 70 | 71 | ![](img/acquire.jpg) 72 | # Acquire 73 | ## "80% perspiration, 10% great idea, 10% great output" 74 | 75 | --- 76 | 77 | ![](img/refine.jpg) 78 | # Refine 79 | ## "All data is messy." 80 | 81 | --- 82 | 83 | ![](img/explore.jpg) 84 | # Explore 85 | ## "I don't know, what I don't know." 
86 | 87 | --- 88 | 89 | ![](img/model.jpg) 90 | # Model 91 | ## "All models are wrong, but some are useful" 92 | 93 | --- 94 | 95 | ![](img/insight.jpg) 96 | # Insight 97 | ## "The goal is to turn data into insight" 98 | 99 | --- 100 | 101 | ![](img/approach.jpg) 102 | 103 | 104 | --- 105 | 106 | ![](img/think.jpg) 107 | ## "Doing data analyis requires quite a bit of thinking and we believe that when you’ve completed a good data analysis, you’ve spent more time thinking than doing." 108 | -- Roger Peng 109 | 110 | --- 111 | 112 | ![](img/tool.jpg) 113 | # Python Data Stack 114 | 115 | --- 116 | 117 | ![](img/books.jpg) 118 | # Case Studies 119 | 120 | --- 121 | # Day 1 122 | # Peeling the Onion 123 | ## Time Series Analysis 124 | ![](img/onion.jpg) 125 | 126 | --- 127 | 128 | # Day 2 129 | # Grocery 130 | ## Market Basket Analysis / Collaborative Filter 131 | 132 | --- 133 | 134 | # Day 2 135 | # BanK Marketing 136 | ## Random Forest and Gradient Boosting 137 | 138 | --- 139 | 140 | # Day 3 141 | # DataTau 142 | ## Text Analytics 143 | 144 | --- 145 | 146 | ![](img/clay.jpeg) 147 | # Learning Approach 148 | 149 | --- 150 | 151 | ![](img/single.jpeg) 152 | # Do the Exercises 153 | 154 | --- 155 | 156 | ![](img/pair.jpg) 157 | # Pair up & Learn 158 | 159 | --- 160 | 161 | ![](img/postit.jpg) 162 | # Call for Help 163 | 164 | --- 165 | 166 | ![](img/numbers.jpg) 167 | # Enjoy the workshop 168 | 169 | --- 170 | 171 | ## Workshop Material is available at the Github Repo 172 | ### [https://github.com/amitkaps/machine-learning](https://github.com/amitkaps/machine-learning) 173 | 174 | --- 175 | 176 | # Exercise 177 | 178 | --- 179 | 180 | # 1. Time Series Exercise 181 | 182 | ### "Predict the number of tickets that will be raised in the next week" 183 | 184 | - **Frame**: What to forecast? At what horizon? At what level? 185 | - **Acquire, Refine, Explore**: Do EDA to understand the trend and pattern within the data 186 | - **Models**: Mean Model, Linear Trend, Random Walk, Simple Moving Average, Exp Smoothing, Decomposition, ARIMA 187 | - **Insight**: Share the insight through a datavis of the models 188 | 189 | --- 190 | 191 | # 2. Text Analytics Exercise 192 | 193 | ### "Identify the entity, features & topics in the 'Comments' data or 'Twitter #machine learning' data" 194 | 195 | - **Frame**: What are the comments you are trying to understand? 196 | - **Acquire, Refine, Explore**: Do Wordcloud, Lemmatization, Part of Speech Analysis, and Entity Chunking 197 | - **Models**: TF-IDF, Topic Modelling, Sentiment Analysis 198 | - **Insight**: Share the insight through word cloud and topic visualisation 199 | 200 | --- 201 | 202 | # Feedback 203 | 204 | ### [https://amitkaps.typeform.com/to/i6wl2E](https://amitkaps.typeform.com/to/i6wl2E) 205 | 206 | 207 | --- 208 | 209 | # Recap 210 | 211 | --- 212 | 213 | ![](img/approach.jpg) 214 | 215 | --- 216 | 217 | ![](img/frame.jpg) 218 | # Frame 219 | - **Toy Problems** 220 | - **Simple Problems** 221 | - Complex Problems 222 | - Business Problems 223 | - Research Problems 224 | 225 | --- 226 | 227 | ![](img/acquire.jpg) 228 | # Acquire 229 | - **Scraping** (structured, unstructured) 230 | - **Files** (csv, xls, json, xml, pdf, ...) 231 | - Database (sqlite, ...) 232 | - APIs 233 | - Streaming 234 | 235 | --- 236 | 237 | ![](img/refine.jpg) 238 | # Refine 239 | - Data Cleaning (inconsistent, missing, ...) 240 | - **Data Refining** (derive, parse, merge, filter, convert, ...) 
241 | - **Data Transformations** (group by, pivot, aggregate, sample, summarise, ...) 242 | 243 | 244 | --- 245 | 246 | ![](img/explore.jpg) 247 | # Explore 248 | - **Simple Vis** 249 | - Multi Dimensional Vis 250 | - Geographic Vis 251 | - Large Data Vis (Bin - Summarise - Smooth) 252 | - Interactive Vis 253 | 254 | --- 255 | 256 | ![](img/model.jpg) 257 | # Model - Supervised Learning 258 | - *Continuous*: Regression - **Linear**, Polynomial, Tree Based Methods - CART, **Random Forest**, Gradient Boosting Machines 259 | - *Classification* - **Logistics Regression**, Tree, KNN, SVM, Naive-Bayes, Bayesian Network 260 | 261 | --- 262 | 263 | ![](img/model.jpg) 264 | # Model - UnSupervised Learning 265 | - *Continuous*: Clustering & Dimensionality Reduction like PCA, SVD, MDS, K-means 266 | - *Categorical*: Association Analysis 267 | 268 | --- 269 | 270 | ![](img/model.jpg) 271 | # Model - Advanced / 272 | - **Time Series** 273 | - **Text Analytics** 274 | - Network / Graph Analytics 275 | - Optimization 276 | 277 | --- 278 | ![](img/model.jpg) 279 | # Model - Specialized 280 | - Reinforcement Learning 281 | - Online Learning 282 | - Deep Learning 283 | - Other Applications: Image, Speech 284 | 285 | 286 | --- 287 | 288 | ![](img/insight.jpg) 289 | # Insight 290 | - Narrative Visualisation 291 | - Dashboard Visualisation 292 | - Decision Making Tools 293 | - Automated Decision Tools 294 | 295 | --- 296 | 297 | # PyData Stack 298 | - **Acquire / Refine**: `Pandas, Beautiful Soup, Selenium, Requests, SQL Alchemy, Numpy, Blaze` 299 | - **Explore**: `MatPlotLib, Seaborn, Bokeh, Plotly, Vega, Folium` 300 | - **Model**: `Scikit-Learn, StatsModels, SciPy, Gensim, Keras, Tensor Flow, PySpark` 301 | - **Insight**: `Django, Flask` 302 | 303 | 304 | --- 305 | 306 | # Skills 307 | ![fit](img/skills.png) 308 | 309 | --- 310 | 311 | ![fit](img/skills.png) 312 | 313 | --- 314 | 315 | # Books 316 | 317 | ![fit](img/book.png) 318 | ![fit](img/wesmckinney.jpg) 319 | ![fit](img/thinkstats.jpg) 320 | 321 | 322 | --- 323 | 324 | ![fit](img/book.png) 325 | ![fit](img/wesmckinney.jpg) 326 | ![fit](img/thinkstats.jpg) 327 | 328 | --- 329 | 330 | ![left](img/ISLR.jpeg) 331 | ## Resources - Statistical Learning 332 | - One of the good books on statistical learning is ISLR -> [An Introduction to Statistical Learning with Application in R](http://www-bcf.usc.edu/~gareth/ISL/index.html) 333 | - You can find all the ISLR code in python at this github repo - [https://github.com/JWarmenhoven/ISLR-python](https://github.com/JWarmenhoven/ISLR-python) 334 | 335 | --- 336 | 337 | ## Resources - Time Series 338 | - [Forecasting: Principle and Text](https://www.otexts.org/fpp) 339 | - [Statistical forecasting: Notes on regression and time series analysis Case](http://people.duke.edu/~rnau/411home.htm) 340 | 341 | ## Resources - Text Analytics 342 | - [Natural Language Processing with Python](http://www.nltk.org/book/) 343 | 344 | 345 | --- 346 | ![](img/stars.jpg) 347 | # Online Course 348 | - Harvard Data Science Course - [CS 109 Course](http://cs109.github.io/2015/) (It is structured in similar way to the approach we shared) 349 | - Data Science Specialisation - [JHU Data Science](https://www.coursera.org/specializations/jhu-data-science) (It is a good course, though the material is coded in R) 350 |
351 | - Many more on Coursera & Udacity... 352 | 353 | 354 | --- 355 | ![](img/workshop.jpg) 356 | # We enjoyed the workshop! 357 | 358 | --- 359 | ![](img/speak.jpeg) 360 | # Speak to Us! 361 | 362 | --- 363 | 364 | ![](img/numbers.jpg) 365 | # Thank you 366 | ## @amitkaps | @bargava -------------------------------------------------------------------------------- /overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/overview.pdf -------------------------------------------------------------------------------- /python.txt: -------------------------------------------------------------------------------- 1 | abstract-rendering==0.5.1 2 | alabaster==0.7.7 3 | anaconda-client==1.2.2 4 | appnope==0.1.0 5 | appscript==1.0.1 6 | argcomplete==1.0.0 7 | astropy==1.1.1 8 | Babel==2.2.0 9 | beautifulsoup4==4.4.1 10 | bitarray==0.8.1 11 | blaze==0.9.0 12 | bokeh==0.11.0 13 | boto==2.39.0 14 | Bottleneck==1.0.0 15 | cffi==1.2.1 16 | clyent==1.2.0 17 | colorama==0.3.6 18 | conda==4.0.4 19 | conda-build==1.19.0 20 | conda-env==2.4.5 21 | configobj==5.0.6 22 | cryptography==1.0.2 23 | cycler==0.10.0 24 | Cython==0.23.4 25 | cytoolz==0.7.5 26 | datashape==0.5.0 27 | decorator==4.0.6 28 | docutils==0.12 29 | dynd===f641248 30 | et-xmlfile==1.0.1 31 | fastcache==1.0.2 32 | Flask==0.10.1 33 | futures==3.0.3 34 | greenlet==0.4.9 35 | h5py==2.5.0 36 | html5lib==0.999 37 | idna==2.0 38 | ipykernel==4.2.2 39 | ipython==4.0.3 40 | ipython-genutils==0.1.0 41 | ipywidgets==4.1.1 42 | itsdangerous==0.24 43 | jdcal==1.2 44 | jedi==0.9.0 45 | Jinja2==2.8 46 | jsonschema==2.4.0 47 | jupyter==1.0.0 48 | jupyter-client==4.1.1 49 | jupyter-console==4.1.0 50 | jupyter-core==4.0.6 51 | llvmlite==0.8.0 52 | lxml==3.5.0 53 | MarkupSafe==0.23 54 | matplotlib==1.5.1 55 | mistune==0.7.1 56 | multipledispatch==0.4.8 57 | nbconvert==4.1.0 58 | nbformat==4.0.1 59 | networkx==1.11 60 | nltk==3.2 61 | nose==1.3.7 62 | notebook==4.1.0 63 | numba==0.23.1 64 | numexpr==2.4.6 65 | numpy==1.10.4 66 | odo==0.4.0 67 | openpyxl==2.3.2 68 | pandas==0.17.1 69 | path.py==0.0.0 70 | patsy==0.4.0 71 | pep8==1.7.0 72 | pexpect==3.3 73 | pickleshare==0.5 74 | Pillow==3.1.0 75 | ply==3.8 76 | psutil==3.4.2 77 | ptyprocess==0.5 78 | py==1.4.31 79 | pyasn1==0.1.9 80 | pycosat==0.6.1 81 | pycparser==2.14 82 | pycrypto==2.6.1 83 | pycurl==7.19.5.3 84 | pyflakes==1.0.0 85 | Pygments==2.1 86 | pyOpenSSL==0.15.1 87 | pyparsing==2.0.3 88 | pytest==2.8.5 89 | python-dateutil==2.4.2 90 | pytz==2015.7 91 | PyYAML==3.11 92 | pyzmq==15.2.0 93 | qtconsole==4.1.1 94 | redis==2.10.3 95 | requests==2.9.1 96 | rope-py3k==0.9.4.post1 97 | scikit-image==0.11.3 98 | scikit-learn==0.17 99 | scipy==0.17.0 100 | seaborn==0.7.0 101 | simplegeneric==0.8.1 102 | six==1.10.0 103 | snowballstemmer==1.2.1 104 | sockjs-tornado==1.0.1 105 | Sphinx==1.3.5 106 | sphinx-rtd-theme==0.1.9 107 | spyder==2.3.8 108 | SQLAlchemy==1.0.12 109 | statsmodels==0.6.1 110 | sympy==0.7.6.1 111 | tables==3.2.2 112 | terminado==0.5 113 | toolz==0.7.4 114 | tornado==4.3 115 | traitlets==4.1.0 116 | unicodecsv==0.14.1 117 | Werkzeug==0.11.3 118 | xgboost==0.4a30 119 | xlrd==0.9.4 120 | XlsxWriter==0.8.4 121 | xlwings==0.6.4 122 | xlwt==1.0.0 123 | -------------------------------------------------------------------------------- /text_mining/DataTau.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | DataTau
DataTau   new | comments | leaders | submit   login
1.
An Exploration of R, Yelp, and the Search for Good Indian Food (springboard.com)
5 points by Rogerh91 4 hours ago | discuss
2.
Spark Pipelines: Elegant Yet Powerful (insightdatalabs.com)
3 points by aouyang1 7 hours ago | discuss
3.
Deep Advances in Generative Modeling (youtube.com)
7 points by gwulfs 13 hours ago | 1 comment
4.
Shit VCs Say (buzzfeed.com)
3 points by Argentum01 8 hours ago | discuss
5.
Python, Machine Learning, and Language Wars (sebastianraschka.com)
4 points by pmigdal 15 hours ago | discuss
6.
A Neural Network in 11 lines of Python (github.io)
3 points by dekhtiar 13 hours ago | discuss
7.
Markov Chains Explained Visually (setosa.io)
13 points by zeroviscosity 1 day ago | 1 comment
8.
Dplython: Dplyr for Python (github.com)
13 points by thenaturalist 1 day ago | 3 comments
9.
Inferring causal impact using Bayesian structural time-series models (google.com)
8 points by Homunculiheaded 1 day ago | 1 comment
10.
A Billion Taxi Rides on Amazon EMR running Spark (marksblogg.com)
5 points by marklit 1 day ago | 1 comment
11.
Tutorial: Web scraping and mapping breweries with import.io and R (trendct.org)
4 points by jasdumas 1 day ago | discuss
12.
The rise of greedy robots (yanirseroussi.com)
4 points by yanir 2 days ago | discuss
13.
Python for Data Structures, Algorithms, and Interviews (github.com)
18 points by kokoubaby 4 days ago | discuss
14.
Extracting image metadata at scale (netflix.com)
2 points by zachwill 1 day ago | discuss
15.
Lift charts - A data scientist's secret weapon (datalifebalance.com)
14 points by datenheini 4 days ago | 2 comments
16.
How To Become A Machine Learning Expert In One Simple Step (swanintelligence.com)
4 points by swanint 2 days ago | discuss
17.
Engineers Shouldn’t Write ETL: High Functioning Data Science Departments (stitchfix.com)
10 points by legel 4 days ago | 3 comments
18.
Simple estimation of hierarchical events with petersburg (willmcginnis.com)
3 points by wdm0006 2 days ago | discuss
19.
Data Science Side Project
6 points by yashpatel5400 2 days ago | 8 comments
20.
Unsupervised Computer Vision: The Current State of the Art (stitchfix.com)
6 points by carlosfaham 3 days ago | discuss
21.
Data Engineering at Slack: Twelve Mistakes I've Made In My First Three Months (google.com)
14 points by gwulfs 6 days ago | 2 comments
22.
What data visualization tools do /r/DataIsBeautiful OC creators use? (randalolson.com)
3 points by pmigdal 2 days ago | discuss
23.
Reshaping in Pandas (nikolaygrozev.wordpress.com)
6 points by carlosgg 4 days ago | discuss
24.
An unusual interactive machine learning challenge (blackboxchallenge.com)
4 points by gglumov 3 days ago | discuss
25.
Datumbox Machine Learning Framework 0.7.0 Released (datumbox.com)
4 points by datumbox 3 days ago | discuss
26.
Data science intro for math/phys background (p.migdal.pl)
14 points by pmigdal 7 days ago | discuss
27.
Neural Networks demystified (lumiverse.io)
16 points by elyase 8 days ago | discuss
28.
What machines can learn from Apple Watch: detecting undiagnosed heart condition (insighthealthdata.com)
9 points by koukouhappy 6 days ago | discuss
29.
Data Science Tools: The Biggest Winners and Losers (dominodatalab.com)
12 points by AnnaOnTheWeb 7 days ago | discuss
30.
10 Years of Open Source Machine Learning (medium.com)
9 points by tstonez 6 days ago | 1 comment
More

37 |
RSS 38 | | Announcements 39 |
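The Refine.ipynb notebook that follows works on data_tau.csv, a flat title/date table scraped from a listing page like the DataTau.html saved above. As a rough, illustrative sketch of that acquire step (the CSS selectors below are assumptions about Hacker News-style markup, not taken from this repository's own scraping code), one way to reduce such a page to a two-column table with beautifulsoup4 and pandas, both pinned in python.txt, might look like this:

    # Hedged sketch: assumes each story spans two table rows, the first holding
    # the headline link in a <td class="title"> cell and the second holding the
    # "N points by user M hours/days ago | discuss" text in a <td class="subtext">
    # cell, as on Hacker News-style pages. Not the workshop's own scraping code.
    import pandas as pd
    from bs4 import BeautifulSoup  # beautifulsoup4 is listed in python.txt

    with open('DataTau.html', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    records = []
    for subtext in soup.select('td.subtext'):
        # The title row is assumed to sit immediately above the subtext row.
        title_row = subtext.find_parent('tr').find_previous_sibling('tr')
        if title_row is None:
            continue
        link = title_row.select_one('td.title a')  # headline anchor, if present
        if link is None:
            continue
        records.append({'title': link.get_text(strip=True),
                        'date': subtext.get_text(' ', strip=True)})

    pd.DataFrame(records, columns=['title', 'date']).to_csv('data_tau_sample.csv',
                                                            index=False)

The output filename data_tau_sample.csv is hypothetical; it simply mirrors the title/date layout that the Refine step reads back with pd.read_csv.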
-------------------------------------------------------------------------------- /text_mining/Refine.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Refine the Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 32, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 33, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "df = pd.read_csv('data_tau.csv')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 34, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/html": [ 42 | "
\n", 43 | "\n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | "
titledate
0An Exploration of R, Yelp, and the Search for ...5 points by Rogerh91 6 hours ago | discuss
1Deep Advances in Generative Modeling7 points by gwulfs 15 hours ago | 1 comment
2Spark Pipelines: Elegant Yet Powerful3 points by aouyang1 9 hours ago | discuss
3Shit VCs Say3 points by Argentum01 10 hours ago | discuss
4Python, Machine Learning, and Language Wars4 points by pmigdal 17 hours ago | discuss
\n", 79 | "
" 80 | ], 81 | "text/plain": [ 82 | " title \\\n", 83 | "0 An Exploration of R, Yelp, and the Search for ... \n", 84 | "1 Deep Advances in Generative Modeling \n", 85 | "2 Spark Pipelines: Elegant Yet Powerful \n", 86 | "3 Shit VCs Say \n", 87 | "4 Python, Machine Learning, and Language Wars \n", 88 | "\n", 89 | " date \n", 90 | "0 5 points by Rogerh91 6 hours ago | discuss \n", 91 | "1 7 points by gwulfs 15 hours ago | 1 comment \n", 92 | "2 3 points by aouyang1 9 hours ago | discuss \n", 93 | "3 3 points by Argentum01 10 hours ago | discuss \n", 94 | "4 4 points by pmigdal 17 hours ago | discuss " 95 | ] 96 | }, 97 | "execution_count": 34, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "df.head()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "To get the date of the title - we will need the following algorithm\n", 111 | "- If the string contains **hours** we can consider it **1 day**\n", 112 | "- And if the string has **day**, we pick the number preceding the **day**\n", 113 | "\n", 114 | "To apply this algorithm, we need to be able to pick these words and digits from a string. For that we will use Regular Expression." 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "## Introduction to Regular Expression (Regex)\n", 122 | "\n", 123 | "Regular expression is a way of selecting text using symbols in a string.\n", 124 | "\n", 125 | "Refer to the following links for an interactive playground\n", 126 | "- [http://regexr.com](http://regexr.com/)\n", 127 | "- [http://regex101.com/](http://regex101.com/)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 35, 133 | "metadata": { 134 | "collapsed": true 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "import re" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 36, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "test_string = \"Hello world, welcome to 2016.\"" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 37, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "# We can pass the whole string and re.search will give the first occurence of the value\n", 161 | "# re.search - This function searches for first occurrence of RE pattern within string.\n", 162 | "a = re.search('Hello world, welcome to 2016',test_string)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 38, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "<_sre.SRE_Match object; span=(0, 28), match='Hello world, welcome to 2016'>" 176 | ] 177 | }, 178 | "execution_count": 38, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "a" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 39, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "'Hello world, welcome to 2016'" 198 | ] 199 | }, 200 | "execution_count": 39, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "a.group()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 40, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [ 216 | { 217 | "data": { 
218 | "text/plain": [ 219 | "'H'" 220 | ] 221 | }, 222 | "execution_count": 40, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "# Match the first letters in the string\n", 229 | "a = re.search('.',test_string)\n", 230 | "a.group()" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 41, 236 | "metadata": { 237 | "collapsed": false 238 | }, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": [ 243 | "'Hello world, welcome to 2016.'" 244 | ] 245 | }, 246 | "execution_count": 41, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "# Match all the letters in the string\n", 253 | "a = re.search('.*',test_string)\n", 254 | "a.group()" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 42, 260 | "metadata": { 261 | "collapsed": false 262 | }, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "<_sre.SRE_Match object; span=(0, 5), match='Hello'>\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "a = re.search('Hello',test_string)\n", 274 | "print(a)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "** Some basic symbols**\n", 282 | "\n", 283 | "**`?`** \n", 284 | "\n", 285 | "The question mark indicates zero or one occurrences of the preceding element. For example, colou?r matches both \"color\" and \"colour\".\n", 286 | "\n", 287 | "**`\\*`**\n", 288 | "\n", 289 | "The asterisk indicates zero or more occurrences of the preceding element. For example, ab*c matches \"ac\", \"abc\", \"abbc\", \"abbbc\", and so on.\n", 290 | "\n", 291 | "**`\\+`**\t\n", 292 | "The plus sign indicates one or more occurrences of the preceding element. 
For example, ab+c matches \"abc\", \"abbc\", \"abbbc\", and so on, but not \"ac\".\n" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 43, 298 | "metadata": { 299 | "collapsed": false 300 | }, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "<_sre.SRE_Match object; span=(0, 2), match='He'>\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "a = re.search('\\w.',test_string)\n", 312 | "print(a)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 44, 318 | "metadata": { 319 | "collapsed": false 320 | }, 321 | "outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | "<_sre.SRE_Match object; span=(0, 5), match='Hello'>\n" 327 | ] 328 | } 329 | ], 330 | "source": [ 331 | "a = re.search('\\w*',test_string)\n", 332 | "print(a)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "### Exercises" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 45, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "string = '''In 2016, we are learning Text Analytics in Data Science 101\n", 351 | " by scraping http://datatau.com'''" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 46, 357 | "metadata": { 358 | "collapsed": false 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "string = \"In 2016, we are learning Text Analytics in Data Science 101 by scraping http://datatau.com\"" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "Write a regex to pick the numbers 2016 from string above." 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "collapsed": true 377 | }, 378 | "outputs": [], 379 | "source": [] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "Write a regex to pick the url link (http://xyz.com) from the string above " 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": { 392 | "collapsed": true 393 | }, 394 | "outputs": [], 395 | "source": [] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "## Lets get the date from our string" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 47, 407 | "metadata": { 408 | "collapsed": false 409 | }, 410 | "outputs": [ 411 | { 412 | "data": { 413 | "text/html": [ 414 | "
\n", 415 | "\n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | "
titledate
0An Exploration of R, Yelp, and the Search for ...5 points by Rogerh91 6 hours ago | discuss
1Deep Advances in Generative Modeling7 points by gwulfs 15 hours ago | 1 comment
2Spark Pipelines: Elegant Yet Powerful3 points by aouyang1 9 hours ago | discuss
3Shit VCs Say3 points by Argentum01 10 hours ago | discuss
4Python, Machine Learning, and Language Wars4 points by pmigdal 17 hours ago | discuss
\n", 451 | "
" 452 | ], 453 | "text/plain": [ 454 | " title \\\n", 455 | "0 An Exploration of R, Yelp, and the Search for ... \n", 456 | "1 Deep Advances in Generative Modeling \n", 457 | "2 Spark Pipelines: Elegant Yet Powerful \n", 458 | "3 Shit VCs Say \n", 459 | "4 Python, Machine Learning, and Language Wars \n", 460 | "\n", 461 | " date \n", 462 | "0 5 points by Rogerh91 6 hours ago | discuss \n", 463 | "1 7 points by gwulfs 15 hours ago | 1 comment \n", 464 | "2 3 points by aouyang1 9 hours ago | discuss \n", 465 | "3 3 points by Argentum01 10 hours ago | discuss \n", 466 | "4 4 points by pmigdal 17 hours ago | discuss " 467 | ] 468 | }, 469 | "execution_count": 47, 470 | "metadata": {}, 471 | "output_type": "execute_result" 472 | } 473 | ], 474 | "source": [ 475 | "df.head()" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 48, 481 | "metadata": { 482 | "collapsed": false 483 | }, 484 | "outputs": [ 485 | { 486 | "data": { 487 | "text/html": [ 488 | "
\n", 489 | "\n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | "
titledate
175Getting Started with Statistics for Data Science3 points by nickhould 35 days ago | discuss
176Rodeo 1.3 - Tab-completion for docstrings3 points by glamp 35 days ago | discuss
177Teaching D3.js - links3 points by pmigdal 35 days ago | discuss
178Parallel scikit-learn on YARN5 points by stijntonk 39 days ago | discuss
179Meetup: Free Live Webinar on Prescriptive Anal...2 points by ann928 32 days ago | discuss
\n", 525 | "
" 526 | ], 527 | "text/plain": [ 528 | " title \\\n", 529 | "175 Getting Started with Statistics for Data Science \n", 530 | "176 Rodeo 1.3 - Tab-completion for docstrings \n", 531 | "177 Teaching D3.js - links \n", 532 | "178 Parallel scikit-learn on YARN \n", 533 | "179 Meetup: Free Live Webinar on Prescriptive Anal... \n", 534 | "\n", 535 | " date \n", 536 | "175 3 points by nickhould 35 days ago | discuss \n", 537 | "176 3 points by glamp 35 days ago | discuss \n", 538 | "177 3 points by pmigdal 35 days ago | discuss \n", 539 | "178 5 points by stijntonk 39 days ago | discuss \n", 540 | "179 2 points by ann928 32 days ago | discuss " 541 | ] 542 | }, 543 | "execution_count": 48, 544 | "metadata": {}, 545 | "output_type": "execute_result" 546 | } 547 | ], 548 | "source": [ 549 | "df.tail()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 49, 555 | "metadata": { 556 | "collapsed": true 557 | }, 558 | "outputs": [], 559 | "source": [ 560 | "date_string = df['date'][0]" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 50, 566 | "metadata": { 567 | "collapsed": false 568 | }, 569 | "outputs": [ 570 | { 571 | "name": "stdout", 572 | "output_type": "stream", 573 | "text": [ 574 | "5 points by Rogerh91 6 hours ago | discuss\n" 575 | ] 576 | } 577 | ], 578 | "source": [ 579 | "print(date_string)" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 51, 585 | "metadata": { 586 | "collapsed": false 587 | }, 588 | "outputs": [ 589 | { 590 | "data": { 591 | "text/plain": [ 592 | "<_sre.SRE_Match object; span=(23, 28), match='hours'>" 593 | ] 594 | }, 595 | "execution_count": 51, 596 | "metadata": {}, 597 | "output_type": "execute_result" 598 | } 599 | ], 600 | "source": [ 601 | "re.search('hours',date_string)" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 52, 607 | "metadata": { 608 | "collapsed": true 609 | }, 610 | "outputs": [], 611 | "source": [ 612 | "date_string = df['date'][50]" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": 53, 618 | "metadata": { 619 | "collapsed": false 620 | }, 621 | "outputs": [ 622 | { 623 | "name": "stdout", 624 | "output_type": "stream", 625 | "text": [ 626 | "4 points by lefish 7 days ago | discuss\n" 627 | ] 628 | } 629 | ], 630 | "source": [ 631 | "print(date_string)" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 54, 637 | "metadata": { 638 | "collapsed": true 639 | }, 640 | "outputs": [], 641 | "source": [ 642 | "# If hours is not there, we don't get any match\n", 643 | "re.search('hours',date_string)" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 55, 649 | "metadata": { 650 | "collapsed": false 651 | }, 652 | "outputs": [ 653 | { 654 | "data": { 655 | "text/plain": [ 656 | "<_sre.SRE_Match object; span=(19, 24), match='7 day'>" 657 | ] 658 | }, 659 | "execution_count": 55, 660 | "metadata": {}, 661 | "output_type": "execute_result" 662 | } 663 | ], 664 | "source": [ 665 | "# Let us match the digit preceding the day text\n", 666 | "day_search = re.search('\\d+ day',date_string)\n", 667 | "day_search" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 56, 673 | "metadata": { 674 | "collapsed": false 675 | }, 676 | "outputs": [ 677 | { 678 | "data": { 679 | "text/plain": [ 680 | "'7 day'" 681 | ] 682 | }, 683 | "execution_count": 56, 684 | "metadata": {}, 685 | "output_type": "execute_result" 686 | } 687 | ], 688 | "source": [ 689 | 
"days_string = day_search.group(0)\n", 690 | "days_string" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 57, 696 | "metadata": { 697 | "collapsed": false 698 | }, 699 | "outputs": [ 700 | { 701 | "data": { 702 | "text/plain": [ 703 | "'7'" 704 | ] 705 | }, 706 | "execution_count": 57, 707 | "metadata": {}, 708 | "output_type": "execute_result" 709 | } 710 | ], 711 | "source": [ 712 | "days = days_string.split(' ')[0] \n", 713 | "days" 714 | ] 715 | }, 716 | { 717 | "cell_type": "markdown", 718 | "metadata": {}, 719 | "source": [] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 58, 724 | "metadata": { 725 | "collapsed": true 726 | }, 727 | "outputs": [], 728 | "source": [ 729 | "def return_reg_ex_days(row):\n", 730 | " days = ''\n", 731 | " if re.search('hours',row['date']) is not None:\n", 732 | " # print('hours',row['date'])\n", 733 | " days = 1\n", 734 | " else:\n", 735 | " day_search = re.search('\\d+ day',row['date'])\n", 736 | " # print('day',day_search.group(0))\n", 737 | " days = day_search.group(0).split(' ')[0] \n", 738 | " \n", 739 | " #print(row,days)\n", 740 | " return days\n", 741 | " " 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": 59, 747 | "metadata": { 748 | "collapsed": false 749 | }, 750 | "outputs": [], 751 | "source": [ 752 | "# Now we apply this function to each of the row in the dataframe\n", 753 | "df['days'] = df.apply(return_reg_ex_days,axis=1)" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 60, 759 | "metadata": { 760 | "collapsed": false 761 | }, 762 | "outputs": [ 763 | { 764 | "data": { 765 | "text/html": [ 766 | "
\n", 767 | "\n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | "
titledatedays
0An Exploration of R, Yelp, and the Search for ...5 points by Rogerh91 6 hours ago | discuss1
1Deep Advances in Generative Modeling7 points by gwulfs 15 hours ago | 1 comment1
2Spark Pipelines: Elegant Yet Powerful3 points by aouyang1 9 hours ago | discuss1
3Shit VCs Say3 points by Argentum01 10 hours ago | discuss1
4Python, Machine Learning, and Language Wars4 points by pmigdal 17 hours ago | discuss1
\n", 809 | "
" 810 | ], 811 | "text/plain": [ 812 | " title \\\n", 813 | "0 An Exploration of R, Yelp, and the Search for ... \n", 814 | "1 Deep Advances in Generative Modeling \n", 815 | "2 Spark Pipelines: Elegant Yet Powerful \n", 816 | "3 Shit VCs Say \n", 817 | "4 Python, Machine Learning, and Language Wars \n", 818 | "\n", 819 | " date days \n", 820 | "0 5 points by Rogerh91 6 hours ago | discuss 1 \n", 821 | "1 7 points by gwulfs 15 hours ago | 1 comment 1 \n", 822 | "2 3 points by aouyang1 9 hours ago | discuss 1 \n", 823 | "3 3 points by Argentum01 10 hours ago | discuss 1 \n", 824 | "4 4 points by pmigdal 17 hours ago | discuss 1 " 825 | ] 826 | }, 827 | "execution_count": 60, 828 | "metadata": {}, 829 | "output_type": "execute_result" 830 | } 831 | ], 832 | "source": [ 833 | "df.head()" 834 | ] 835 | }, 836 | { 837 | "cell_type": "code", 838 | "execution_count": 61, 839 | "metadata": { 840 | "collapsed": false 841 | }, 842 | "outputs": [ 843 | { 844 | "data": { 845 | "text/html": [ 846 | "
\n", 847 | "\n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | "
titledatedays
175Getting Started with Statistics for Data Science3 points by nickhould 35 days ago | discuss35
176Rodeo 1.3 - Tab-completion for docstrings3 points by glamp 35 days ago | discuss35
177Teaching D3.js - links3 points by pmigdal 35 days ago | discuss35
178Parallel scikit-learn on YARN5 points by stijntonk 39 days ago | discuss39
179Meetup: Free Live Webinar on Prescriptive Anal...2 points by ann928 32 days ago | discuss32
\n", 889 | "
" 890 | ], 891 | "text/plain": [ 892 | " title \\\n", 893 | "175 Getting Started with Statistics for Data Science \n", 894 | "176 Rodeo 1.3 - Tab-completion for docstrings \n", 895 | "177 Teaching D3.js - links \n", 896 | "178 Parallel scikit-learn on YARN \n", 897 | "179 Meetup: Free Live Webinar on Prescriptive Anal... \n", 898 | "\n", 899 | " date days \n", 900 | "175 3 points by nickhould 35 days ago | discuss 35 \n", 901 | "176 3 points by glamp 35 days ago | discuss 35 \n", 902 | "177 3 points by pmigdal 35 days ago | discuss 35 \n", 903 | "178 5 points by stijntonk 39 days ago | discuss 39 \n", 904 | "179 2 points by ann928 32 days ago | discuss 32 " 905 | ] 906 | }, 907 | "execution_count": 61, 908 | "metadata": {}, 909 | "output_type": "execute_result" 910 | } 911 | ], 912 | "source": [ 913 | "df.tail()" 914 | ] 915 | }, 916 | { 917 | "cell_type": "code", 918 | "execution_count": 62, 919 | "metadata": { 920 | "collapsed": true 921 | }, 922 | "outputs": [], 923 | "source": [ 924 | "# Let us save to a dataframe\n", 925 | "df.to_csv('data_tau_days.csv', index=False)" 926 | ] 927 | } 928 | ], 929 | "metadata": { 930 | "kernelspec": { 931 | "display_name": "Python 3", 932 | "language": "python", 933 | "name": "python3" 934 | }, 935 | "language_info": { 936 | "codemirror_mode": { 937 | "name": "ipython", 938 | "version": 3 939 | }, 940 | "file_extension": ".py", 941 | "mimetype": "text/x-python", 942 | "name": "python", 943 | "nbconvert_exporter": "python", 944 | "pygments_lexer": "ipython3", 945 | "version": "3.5.1" 946 | } 947 | }, 948 | "nbformat": 4, 949 | "nbformat_minor": 0 950 | } 951 | -------------------------------------------------------------------------------- /text_mining/data_tau.csv: -------------------------------------------------------------------------------- 1 | title,date 2 | "An Exploration of R, Yelp, and the Search for Good Indian Food",5 points by Rogerh91 6 hours ago | discuss 3 | Deep Advances in Generative Modeling,7 points by gwulfs 15 hours ago | 1 comment 4 | Spark Pipelines: Elegant Yet Powerful,3 points by aouyang1 9 hours ago | discuss 5 | Shit VCs Say,3 points by Argentum01 10 hours ago | discuss 6 | "Python, Machine Learning, and Language Wars",4 points by pmigdal 17 hours ago | discuss 7 | A Neural Network in 11 lines of Python ,3 points by dekhtiar 14 hours ago | discuss 8 | Markov Chains Explained Visually,13 points by zeroviscosity 1 day ago | 1 comment 9 | Dplython: Dplyr for Python,13 points by thenaturalist 1 day ago | 3 comments 10 | Inferring causal impact using Bayesian structural time-series models,8 points by Homunculiheaded 1 day ago | 1 comment 11 | A Billion Taxi Rides on Amazon EMR running Spark,5 points by marklit 1 day ago | 1 comment 12 | Tutorial: Web scraping and mapping breweries with import.io and R,4 points by jasdumas 1 day ago | discuss 13 | The rise of greedy robots,4 points by yanir 2 days ago | discuss 14 | "Python for Data Structures, Algorithms, and Interviews",18 points by kokoubaby 4 days ago | discuss 15 | Extracting image metadata at scale,2 points by zachwill 1 day ago | discuss 16 | Lift charts - A data scientist's secret weapon,14 points by datenheini 4 days ago | 2 comments 17 | Data Science Side Project,7 points by yashpatel5400 2 days ago | 9 comments 18 | How To Become A Machine Learning Expert In One Simple Step,4 points by swanint 2 days ago | discuss 19 | Engineers Shouldn’t Write ETL: High Functioning Data Science Departments,10 points by legel 4 days ago | 3 comments 20 | Simple estimation of 
hierarchical events with petersburg,3 points by wdm0006 2 days ago | discuss 21 | Unsupervised Computer Vision: The Current State of the Art,6 points by carlosfaham 3 days ago | discuss 22 | Data Engineering at Slack: Twelve Mistakes I've Made In My First Three Months,14 points by gwulfs 6 days ago | 2 comments 23 | What data visualization tools do /r/DataIsBeautiful OC creators use?,3 points by pmigdal 2 days ago | discuss 24 | Reshaping in Pandas,6 points by carlosgg 4 days ago | discuss 25 | An unusual interactive machine learning challenge,4 points by gglumov 3 days ago | discuss 26 | Datumbox Machine Learning Framework 0.7.0 Released,4 points by datumbox 3 days ago | discuss 27 | Data science intro for math/phys background,14 points by pmigdal 7 days ago | discuss 28 | Neural Networks demystified,16 points by elyase 8 days ago | discuss 29 | What machines can learn from Apple Watch: detecting undiagnosed heart condition,9 points by koukouhappy 6 days ago | discuss 30 | Data Science Tools: The Biggest Winners and Losers,12 points by AnnaOnTheWeb 7 days ago | discuss 31 | 10 Years of Open Source Machine Learning,9 points by tstonez 6 days ago | 1 comment 32 | Has your conversion rate changed? Bayesian timeseries analysis with Python,12 points by yummyfajitas 8 days ago | discuss 33 | Do jobs run in families?,5 points by Anon84 5 days ago | 1 comment 34 | Introduction to Scikit Flow - Simplified Interface to TensorFlow,8 points by lefish 7 days ago | discuss 35 | "XGBoost4J: Portable Distributed XGboost in Spark, Flink and Dataflow",8 points by crowwork 8 days ago | discuss 36 | How to learn machine learning?,8 points by kiechu 8 days ago | 1 comment 37 | The Deep Roots of Javascript Fatigue,5 points by nikkielizdemere 6 days ago | 1 comment 38 | How do we make Data Tau work?,27 points by hal8 9 days ago | 18 comments 39 | "Machine Learning: An In-Depth, Non-Technical Guide — Part 4",7 points by innoarchitech 8 days ago | discuss 40 | Data Science Slack channel - Click for invite,7 points by jyotsna 8 days ago | discuss 41 | [Ask DT] What are some rookie mistakes in R?,3 points by HKtemp 3 days ago | discuss 42 | "Playing ""Moneyball"" on EA FIFA 16",16 points by aabb13 13 days ago | 3 comments 43 | Intellexer - Natural Language Processing and Text Mining REST API,16 points by j_downer 13 days ago | discuss 44 | Descriptive Statistics in SQL,5 points by nickhould 7 days ago | discuss 45 | Genomic Data Visualization using Python,2 points by RadhouaneAniba 4 days ago | discuss 46 | How to Use Cohort Data to Analyze User Behavior,2 points by clevertap 4 days ago | discuss 47 | Making transparent how variations in analytical choices affect results,4 points by rahmaniacc 7 days ago | discuss 48 | Show DT: Datasets.co - An easy way to share and discover ml datasets,2 points by mrborgen86 4 days ago | discuss 49 | Is Scala a better choice than Python for Apache Spark?,7 points by srinify 10 days ago | 1 comment 50 | Julia: A Fast Language for Numerical Computing,7 points by srinify 10 days ago | 1 comment 51 | "An Ode To The Rice Cooker, The Smartest Kitchen Appliance I’ve Ever Owned",2 points by tfturing 4 days ago | discuss 52 | Computing Classification Evaluation Metrics in R,4 points by lefish 7 days ago | discuss 53 | Analyzing Golden State Warriors' passing network using GraphFrames in Spark,3 points by yukiegosapporo 6 days ago | discuss 54 | Megaman: Manifold Learning with Millions of points,4 points by dperry 8 days ago | 3 comments 55 | How to Detect Outliers on Parametric and Non 
Parametric Methods,2 points by clevertap 5 days ago | discuss 56 | BallR: Interactive NBA Shot Charts with R and Shiny,12 points by carlosgg 14 days ago | discuss 57 | A Billion Taxi Rides on Amazon EMR Running Presto,4 points by marklit 8 days ago | discuss 58 | Minecraft to run artificial intelligence experiments,4 points by bsadeghi 8 days ago | discuss 59 | Deep Q-Learning (Space Invaders),4 points by pmigdal 8 days ago | discuss 60 | Theano Tutorial,2 points by pmigdal 5 days ago | discuss 61 | The Personality Space of Cartoon Characters,3 points by lefish 7 days ago | discuss 62 | Announcing Apache Flink 1.0.0,11 points by mxm 14 days ago | discuss 63 | "Telemetry with Collectd, Logstash, Elasticsearch and Grafana (ELG)",3 points by helloanand 7 days ago | discuss 64 | Statisticians Agree: It’s Time To Stop Misusing P-Value,10 points by jpiburn 15 days ago | 5 comments 65 | Bayesian Reasoning in The Twilight Zone!,2 points by Homunculiheaded 6 days ago | discuss 66 | Bayesian Estimation of G Train Wait Times,7 points by jamesdreiss 12 days ago | discuss 67 | XGBoost: A Scalable Tree Boosting System article,6 points by tfturing 12 days ago | discuss 68 | Some experiments into explaining complex black box ensemble predictions,2 points by lefish 6 days ago | discuss 69 | Creating a Hadoop Pseudo-Distributed Environment,2 points by lefish 6 days ago | discuss 70 | "Data Science Pop-Up in Austin, TX",2 points by AnnaOnTheWeb 6 days ago | discuss 71 | Train your own image classifier with Inception in TensorFlow,7 points by elyase 13 days ago | discuss 72 | Shiny app for running a Tensorflow demo,3 points by shinyman 9 days ago | discuss 73 | File details and owners with gitnoc and git-pandas,3 points by wdm0006 9 days ago | discuss 74 | 7 Big Data Technologies and When to Use Them that All Data Engineers Should Know,2 points by galvanize 7 days ago | discuss 75 | Topic clusters with TF-IDF vectorization with Spark and Scala,2 points by lefish 7 days ago | discuss 76 | Neural Doodles: Workflows for the Next Generation of Artists,5 points by pmigdal 12 days ago | discuss 77 | Graph Databases 101,5 points by carlosgg 12 days ago | discuss 78 | DataRadar.IO - Data Science RSS Feed - Do you have enough data about your data,2 points by dekhtiar 8 days ago | 3 comments 79 | International Women's Day: What #PledgeForParity Means To Us,5 points by ddrum001 14 days ago | discuss 80 | Top 50 Data Science thought leaders on Twitter,3 points by datawerq 11 days ago | 3 comments 81 | Ask DT: Who Is Hiring? 
(March 2016),27 points by whoishiring 21 days ago | 15 comments 82 | Deriving Better Insights From Time Series Data With Cycle Plots,3 points by clevertap 11 days ago | discuss 83 | Introducing GraphFrames,7 points by falaki 19 days ago | discuss 84 | SQL for Data Analysis,4 points by nickhould 14 days ago | 6 comments 85 | Stream processing and messaging systems for the IoT age,3 points by gradientflow 12 days ago | discuss 86 | Announcing R Tools for Visual Studio,3 points by brakmic 13 days ago | discuss 87 | A simpler way to merge data streams,3 points by apoverton 13 days ago | discuss 88 | Optimizing Notification Timing for One Signal,9 points by megandias 26 days ago | discuss 89 | Skizze - A high throughput probabilistic data structure service and storage,3 points by seiflotfy 14 days ago | discuss 90 | Question: What do you want to say about working with data?,2 points by emiller425 8 days ago | discuss 91 | Genomic Ranges - an Introduction to Working with Genomic Data,3 points by AnnaOnTheWeb 13 days ago | discuss 92 | TensorFlow for Poets,9 points by ebellm 21 days ago | 1 comment 93 | Unsupervised Learning with Even Less Supervision Using Bayesian Optimization,2 points by idewanck 11 days ago | discuss 94 | How to work with large JSON datasets using Python and Pandas,9 points by brian_spiering 21 days ago | discuss 95 | DrivenData Competition: Model/Visualize Fog Patterns in Morocco,4 points by bull 15 days ago | discuss 96 | Deep Learning: Nine Lectures at Collège de France by Yan LeCun,5 points by Anon84 17 days ago | discuss 97 | Optimizing Facebook Campaigns with R,2 points by AnnaOnTheWeb 12 days ago | 1 comment 98 | "Trump Tweets on a Globe (aka Fun with d3, socket.io, and the Twitter API)",8 points by joelgrus 21 days ago | discuss 99 | Why pandas users should be excited about Apache Arrow,17 points by pmigdal 29 days ago | discuss 100 | Histogram intersection for change detection,8 points by datadive 22 days ago | discuss 101 | Distributed TensorFlow just open-sourced,10 points by elyase 25 days ago | discuss 102 | D3.js Screencasts (1 in 3 are free),4 points by Veerle 18 days ago | discuss 103 | Regression and Classification with Examples in R,5 points by soates 20 days ago | discuss 104 | Free online course on statistical shape modelling,8 points by shapemean 25 days ago | discuss 105 | "Don't worry about deep learning, deepen your understanding of causality instead",22 points by yanir 37 days ago | discuss 106 | Work with private repositories and other updates of the FlyElephant platform,2 points by m31 15 days ago | discuss 107 | How to import XML to almost anywhere,4 points by Jammink 20 days ago | discuss 108 | Survival Analysis of Cricket Player Careers,8 points by keshav92 26 days ago | 6 comments 109 | Generate image analogies using neural matching and blending,2 points by pmigdal 15 days ago | discuss 110 | "Analyzing 1.8M tweets from Super Bowl 50 (Twython, Twitter API, AYLIEN)",4 points by mikewally 20 days ago | discuss 111 | Newly released sklearn compatible library of categorical encoders,7 points by wdm0006 25 days ago | discuss 112 | Watch Tiny Neural Nets Learn,4 points by swanint 21 days ago | discuss 113 | Four pitfalls of hill climbing: An animated look,5 points by csaid81 23 days ago | discuss 114 | "Decision Forests, Convolutional Networks and the Models in-Between",2 points by ebellm 16 days ago | discuss 115 | How a Math Genius Hacked OkCupid to Find True Love,15 points by roh_codeur 34 days ago | discuss 116 | No developers for PyLearn2,3 
points by tfturing 19 days ago | discuss 117 | Density Estimation with Dirichlet Process Mixtures using PyMC3,6 points by MidsizeBlowfish 25 days ago | discuss 118 | Using survival analysis and git-pandas to estimate code quality,3 points by wdm0006 20 days ago | discuss 119 | An Analysis of the Flint Michigan Water Crisis: Part 1 Initial Corrosivity,3 points by JHorn 20 days ago | discuss 120 | An Analysis of Republican Twitter Follower Interests,6 points by michelangelo 26 days ago | discuss 121 | Introduction to ML talk,8 points by cjbayesian 29 days ago | discuss 122 | GloVe vs word2vec revisited,3 points by pmigdal 20 days ago | discuss 123 | Overoptimizing: a story about kaggle,4 points by wdm0006 30 days ago | discuss 124 | Undergrad Data Analysis/Science internships SF Bay?,3 points by tctctc 15 days ago | 5 comments 125 | The Role of Statistical Significance in Growth Hacking,6 points by rawls234 27 days ago | discuss 126 | Data Science Course @ Harvard,7 points by rahmaniacc 29 days ago | 2 comments 127 | Principal Component Projection Without Principal Component Analysis,6 points by genofon 27 days ago | discuss 128 | "Machine Learning: An In-Depth, Non-Technical Guide - Part 3",7 points by innoarchitech 29 days ago | discuss 129 | Stochastic Dummy Boosting,2 points by mikeskim 18 days ago | discuss 130 | Interactive Map: Hong-Kong through The Lense of Instagram,2 points by BrianN 19 days ago | discuss 131 | Data Science at Monsanto,3 points by doctorcroc 22 days ago | discuss 132 | Data Science at Instacart,11 points by jeremystan 34 days ago | 3 comments 133 | Building a Streaming Search Platform,6 points by ddrum001 28 days ago | discuss 134 | Kafka Producer Latency with Large Topic Counts,3 points by marklit 26 days ago | discuss 135 | A Sneak Peak of the Cloud: the 2 Minute Intro for Beginners,2 points by andymaheshw 20 days ago | discuss 136 | Win-Vector video courses: price/status changes,2 points by jmount 20 days ago | discuss 137 | 50+ Data Science and Machine Learning Cheat Sheets,20 points by elyase 42 days ago | 1 comment 138 | One More Reason Not To Be Scared of Deep Learning,2 points by amplifier_khan 21 days ago | discuss 139 | Visual Logic Authoring vs Code,2 points by AnnaOnTheWeb 21 days ago | discuss 140 | Data Science in Python online training with hands-on experience,2 points by Puneet 21 days ago | discuss 141 | Viewing the US Presidential Primary Through the Lens of Twitter,8 points by michelangelo 33 days ago | discuss 142 | Caffe on Spark open sourced,4 points by rahmaniacc 27 days ago | discuss 143 | The Ethical Data Scientist,5 points by tfturing 29 days ago | discuss 144 | Answers to Frequently Asked Questions in Machine Learning,3 points by rasbt 21 days ago | discuss 145 | Intro to A/B Testing and P-Values,2 points by randyzwitch 22 days ago | discuss 146 | Visualizing State Level Data With R and Statebins,2 points by usujason 22 days ago | discuss 147 | "Probabilistic Graphical Models slides & video lectures (Eric Xing, CMU)",4 points by ororm 28 days ago | discuss 148 | Sense2vec with spaCy and Gensim,9 points by elyase 36 days ago | 2 comments 149 | A Billion NYC Taxi and Uber Rides in AWS Redshift,3 points by marklit 31 days ago | discuss 150 | How to Code and Understand DeepMind's Neural Stack Machine (in Python),2 points by genofon 23 days ago | discuss 151 | How to make polished Jupyter presentations with optional code visibility,9 points by csaid81 36 days ago | discuss 152 | How to become a Bayesian in eight easy steps,17 points by EtzA 44 
days ago | 1 comment 153 | Optimizing .*: Details of Vectorization and Metaprogramming in Julia,4 points by randyzwitch 29 days ago | discuss 154 | IBM certified Apache Spark Online Training,8 points by divya_jain 36 days ago | discuss 155 | Geographic Data Science course,2 points by rk 25 days ago | discuss 156 | "The Daily Mail Stole My Visualization, Twice",5 points by thehoff 32 days ago | 1 comment 157 | Ensemble Methods: Improved Machine Learning Results,9 points by PyBloggers 38 days ago | discuss 158 | Apache Spark and unsupervised learning in security,2 points by gradientflow 26 days ago | discuss 159 | MachineJS: Automated machine learning- just give it a data file!,2 points by dsernst 26 days ago | discuss 160 | The NSA’s SKYNET program may be killing thousands of innocent people,6 points by zlipp 35 days ago | discuss 161 | "Big Dimensions, and What You Can Do About It",2 points by ramsey 27 days ago | discuss 162 | Automate Your Oscars Pool with R,2 points by jamesdreiss 27 days ago | discuss 163 | Signal Processing with LIGO GW150914 data,9 points by tfturing 39 days ago | discuss 164 | Overview of DeZyre and Coursera Data Science Course,5 points by ann928 34 days ago | discuss 165 | Upcoming Datathon in NYC,2 points by VicTrey 28 days ago | discuss 166 | Summarizing Data in SQL,15 points by elisebreda 46 days ago | discuss 167 | A/B Testing for Scammers,2 points by sameermanek 28 days ago | discuss 168 | Highly interpretable classifiers for scikit learn using Bayesian decision rules,2 points by mcnulty 28 days ago | discuss 169 | Auto-scaling scikit-learn with Spark,11 points by falaki 43 days ago | discuss 170 | Where the f*** can I park?,2 points by manugarri 29 days ago | discuss 171 | "Machine Learning: An In-Depth, Non-Technical Guide - Part 2",5 points by innoarchitech 36 days ago | discuss 172 | Webhose.io now offers a historical data archive,7 points by databuffer 40 days ago | discuss 173 | Meetup: Introduction to Machine Learning Algorithms for Data Science.,4 points by ann928 36 days ago | discuss 174 | Exploring the Limits of Language Modeling,8 points by soates 42 days ago | discuss 175 | Text Mining South Park,7 points by pmigdal 41 days ago | discuss 176 | Finding the K in K-means by Parametric Bootstrap,7 points by jmount 42 days ago | 1 comment 177 | Getting Started with Statistics for Data Science,3 points by nickhould 35 days ago | discuss 178 | Rodeo 1.3 - Tab-completion for docstrings,3 points by glamp 35 days ago | discuss 179 | Teaching D3.js - links,3 points by pmigdal 35 days ago | discuss 180 | Parallel scikit-learn on YARN,5 points by stijntonk 39 days ago | discuss 181 | Meetup: Free Live Webinar on Prescriptive Analytics for Fun and Profit,2 points by ann928 32 days ago | discuss 182 | -------------------------------------------------------------------------------- /text_mining/data_tau_days.csv: -------------------------------------------------------------------------------- 1 | title,date,days 2 | "An Exploration of R, Yelp, and the Search for Good Indian Food",5 points by Rogerh91 6 hours ago | discuss,1 3 | Deep Advances in Generative Modeling,7 points by gwulfs 15 hours ago | 1 comment,1 4 | Spark Pipelines: Elegant Yet Powerful,3 points by aouyang1 9 hours ago | discuss,1 5 | Shit VCs Say,3 points by Argentum01 10 hours ago | discuss,1 6 | "Python, Machine Learning, and Language Wars",4 points by pmigdal 17 hours ago | discuss,1 7 | A Neural Network in 11 lines of Python ,3 points by dekhtiar 14 hours ago | discuss,1 8 | Markov Chains 
Explained Visually,13 points by zeroviscosity 1 day ago | 1 comment,1 9 | Dplython: Dplyr for Python,13 points by thenaturalist 1 day ago | 3 comments,1 10 | Inferring causal impact using Bayesian structural time-series models,8 points by Homunculiheaded 1 day ago | 1 comment,1 11 | A Billion Taxi Rides on Amazon EMR running Spark,5 points by marklit 1 day ago | 1 comment,1 12 | Tutorial: Web scraping and mapping breweries with import.io and R,4 points by jasdumas 1 day ago | discuss,1 13 | The rise of greedy robots,4 points by yanir 2 days ago | discuss,2 14 | "Python for Data Structures, Algorithms, and Interviews",18 points by kokoubaby 4 days ago | discuss,4 15 | Extracting image metadata at scale,2 points by zachwill 1 day ago | discuss,1 16 | Lift charts - A data scientist's secret weapon,14 points by datenheini 4 days ago | 2 comments,4 17 | Data Science Side Project,7 points by yashpatel5400 2 days ago | 9 comments,2 18 | How To Become A Machine Learning Expert In One Simple Step,4 points by swanint 2 days ago | discuss,2 19 | Engineers Shouldn?t Write ETL: High Functioning Data Science Departments,10 points by legel 4 days ago | 3 comments,4 20 | Simple estimation of hierarchical events with petersburg,3 points by wdm0006 2 days ago | discuss,2 21 | Unsupervised Computer Vision: The Current State of the Art,6 points by carlosfaham 3 days ago | discuss,3 22 | Data Engineering at Slack: Twelve Mistakes I've Made In My First Three Months,14 points by gwulfs 6 days ago | 2 comments,6 23 | What data visualization tools do /r/DataIsBeautiful OC creators use?,3 points by pmigdal 2 days ago | discuss,2 24 | Reshaping in Pandas,6 points by carlosgg 4 days ago | discuss,4 25 | An unusual interactive machine learning challenge,4 points by gglumov 3 days ago | discuss,3 26 | Datumbox Machine Learning Framework 0.7.0 Released,4 points by datumbox 3 days ago | discuss,3 27 | Data science intro for math/phys background,14 points by pmigdal 7 days ago | discuss,7 28 | Neural Networks demystified,16 points by elyase 8 days ago | discuss,8 29 | What machines can learn from Apple Watch: detecting undiagnosed heart condition,9 points by koukouhappy 6 days ago | discuss,6 30 | Data Science Tools: The Biggest Winners and Losers,12 points by AnnaOnTheWeb 7 days ago | discuss,7 31 | 10 Years of Open Source Machine Learning,9 points by tstonez 6 days ago | 1 comment,6 32 | Has your conversion rate changed? 
Bayesian timeseries analysis with Python,12 points by yummyfajitas 8 days ago | discuss,8 33 | Do jobs run in families?,5 points by Anon84 5 days ago | 1 comment,5 34 | Introduction to Scikit Flow - Simplified Interface to TensorFlow,8 points by lefish 7 days ago | discuss,7 35 | "XGBoost4J: Portable Distributed XGboost in Spark, Flink and Dataflow",8 points by crowwork 8 days ago | discuss,8 36 | How to learn machine learning?,8 points by kiechu 8 days ago | 1 comment,8 37 | The Deep Roots of Javascript Fatigue,5 points by nikkielizdemere 6 days ago | 1 comment,6 38 | How do we make Data Tau work?,27 points by hal8 9 days ago | 18 comments,9 39 | "Machine Learning: An In-Depth, Non-Technical Guide???Part 4",7 points by innoarchitech 8 days ago | discuss,8 40 | Data Science Slack channel - Click for invite,7 points by jyotsna 8 days ago | discuss,8 41 | [Ask DT] What are some rookie mistakes in R?,3 points by HKtemp 3 days ago | discuss,3 42 | "Playing ""Moneyball"" on EA FIFA 16",16 points by aabb13 13 days ago | 3 comments,13 43 | Intellexer - Natural Language Processing and Text Mining REST API,16 points by j_downer 13 days ago | discuss,13 44 | Descriptive Statistics in SQL,5 points by nickhould 7 days ago | discuss,7 45 | Genomic Data Visualization using Python,2 points by RadhouaneAniba 4 days ago | discuss,4 46 | How to Use Cohort Data to Analyze User Behavior,2 points by clevertap 4 days ago | discuss,4 47 | Making transparent how variations in analytical choices affect results,4 points by rahmaniacc 7 days ago | discuss,7 48 | Show DT: Datasets.co - An easy way to share and discover ml datasets,2 points by mrborgen86 4 days ago | discuss,4 49 | Is Scala a better choice than Python for Apache Spark?,7 points by srinify 10 days ago | 1 comment,10 50 | Julia: A Fast Language for Numerical Computing,7 points by srinify 10 days ago | 1 comment,10 51 | "An Ode To The Rice Cooker, The Smartest Kitchen Appliance I?ve Ever Owned",2 points by tfturing 4 days ago | discuss,4 52 | Computing Classification Evaluation Metrics in R,4 points by lefish 7 days ago | discuss,7 53 | Analyzing Golden State Warriors' passing network using GraphFrames in Spark,3 points by yukiegosapporo 6 days ago | discuss,6 54 | Megaman: Manifold Learning with Millions of points,4 points by dperry 8 days ago | 3 comments,8 55 | How to Detect Outliers on Parametric and Non Parametric Methods,2 points by clevertap 5 days ago | discuss,5 56 | BallR: Interactive NBA Shot Charts with R and Shiny,12 points by carlosgg 14 days ago | discuss,14 57 | A Billion Taxi Rides on Amazon EMR Running Presto,4 points by marklit 8 days ago | discuss,8 58 | Minecraft to run artificial intelligence experiments,4 points by bsadeghi 8 days ago | discuss,8 59 | Deep Q-Learning (Space Invaders),4 points by pmigdal 8 days ago | discuss,8 60 | Theano Tutorial,2 points by pmigdal 5 days ago | discuss,5 61 | The Personality Space of Cartoon Characters,3 points by lefish 7 days ago | discuss,7 62 | Announcing Apache Flink 1.0.0,11 points by mxm 14 days ago | discuss,14 63 | "Telemetry with Collectd, Logstash, Elasticsearch and Grafana (ELG)",3 points by helloanand 7 days ago | discuss,7 64 | Statisticians Agree: It?s Time To Stop Misusing P-Value,10 points by jpiburn 15 days ago | 5 comments,15 65 | Bayesian Reasoning in The Twilight Zone!,2 points by Homunculiheaded 6 days ago | discuss,6 66 | Bayesian Estimation of G Train Wait Times,7 points by jamesdreiss 12 days ago | discuss,12 67 | XGBoost: A Scalable Tree Boosting System article,6 points 
by tfturing 12 days ago | discuss,12 68 | Some experiments into explaining complex black box ensemble predictions,2 points by lefish 6 days ago | discuss,6 69 | Creating a Hadoop Pseudo-Distributed Environment,2 points by lefish 6 days ago | discuss,6 70 | "Data Science Pop-Up in Austin, TX",2 points by AnnaOnTheWeb 6 days ago | discuss,6 71 | Train your own image classifier with Inception in TensorFlow,7 points by elyase 13 days ago | discuss,13 72 | Shiny app for running a Tensorflow demo,3 points by shinyman 9 days ago | discuss,9 73 | File details and owners with gitnoc and git-pandas,3 points by wdm0006 9 days ago | discuss,9 74 | 7 Big Data Technologies and When to Use Them that All Data Engineers Should Know,2 points by galvanize 7 days ago | discuss,7 75 | Topic clusters with TF-IDF vectorization with Spark and Scala,2 points by lefish 7 days ago | discuss,7 76 | Neural Doodles: Workflows for the Next Generation of Artists,5 points by pmigdal 12 days ago | discuss,12 77 | Graph Databases 101,5 points by carlosgg 12 days ago | discuss,12 78 | DataRadar.IO - Data Science RSS Feed - Do you have enough data about your data,2 points by dekhtiar 8 days ago | 3 comments,8 79 | International Women's Day: What #PledgeForParity Means To Us,5 points by ddrum001 14 days ago | discuss,14 80 | Top 50 Data Science thought leaders on Twitter,3 points by datawerq 11 days ago | 3 comments,11 81 | Ask DT: Who Is Hiring? (March 2016),27 points by whoishiring 21 days ago | 15 comments,21 82 | Deriving Better Insights From Time Series Data With Cycle Plots,3 points by clevertap 11 days ago | discuss,11 83 | Introducing GraphFrames,7 points by falaki 19 days ago | discuss,19 84 | SQL for Data Analysis,4 points by nickhould 14 days ago | 6 comments,14 85 | Stream processing and messaging systems for the IoT age,3 points by gradientflow 12 days ago | discuss,12 86 | Announcing R Tools for Visual Studio,3 points by brakmic 13 days ago | discuss,13 87 | A simpler way to merge data streams,3 points by apoverton 13 days ago | discuss,13 88 | Optimizing Notification Timing for One Signal,9 points by megandias 26 days ago | discuss,26 89 | Skizze - A high throughput probabilistic data structure service and storage,3 points by seiflotfy 14 days ago | discuss,14 90 | Question: What do you want to say about working with data?,2 points by emiller425 8 days ago | discuss,8 91 | Genomic Ranges - an Introduction to Working with Genomic Data,3 points by AnnaOnTheWeb 13 days ago | discuss,13 92 | TensorFlow for Poets,9 points by ebellm 21 days ago | 1 comment,21 93 | Unsupervised Learning with Even Less Supervision Using Bayesian Optimization,2 points by idewanck 11 days ago | discuss,11 94 | How to work with large JSON datasets using Python and Pandas,9 points by brian_spiering 21 days ago | discuss,21 95 | DrivenData Competition: Model/Visualize Fog Patterns in Morocco,4 points by bull 15 days ago | discuss,15 96 | Deep Learning: Nine Lectures at Coll?ge de France by Yan LeCun,5 points by Anon84 17 days ago | discuss,17 97 | Optimizing Facebook Campaigns with R,2 points by AnnaOnTheWeb 12 days ago | 1 comment,12 98 | "Trump Tweets on a Globe (aka Fun with d3, socket.io, and the Twitter API)",8 points by joelgrus 21 days ago | discuss,21 99 | Why pandas users should be excited about Apache Arrow,17 points by pmigdal 29 days ago | discuss,29 100 | Histogram intersection for change detection,8 points by datadive 22 days ago | discuss,22 101 | Distributed TensorFlow just open-sourced,10 points by elyase 25 days ago | 
discuss,25 102 | D3.js Screencasts (1 in 3 are free),4 points by Veerle 18 days ago | discuss,18 103 | Regression and Classification with Examples in R,5 points by soates 20 days ago | discuss,20 104 | Free online course on statistical shape modelling,8 points by shapemean 25 days ago | discuss,25 105 | "Don't worry about deep learning, deepen your understanding of causality instead",22 points by yanir 37 days ago | discuss,37 106 | Work with private repositories and other updates of the FlyElephant platform,2 points by m31 15 days ago | discuss,15 107 | How to import XML to almost anywhere,4 points by Jammink 20 days ago | discuss,20 108 | Survival Analysis of Cricket Player Careers,8 points by keshav92 26 days ago | 6 comments,26 109 | Generate image analogies using neural matching and blending,2 points by pmigdal 15 days ago | discuss,15 110 | "Analyzing 1.8M tweets from Super Bowl 50 (Twython, Twitter API, AYLIEN)",4 points by mikewally 20 days ago | discuss,20 111 | Newly released sklearn compatible library of categorical encoders,7 points by wdm0006 25 days ago | discuss,25 112 | Watch Tiny Neural Nets Learn,4 points by swanint 21 days ago | discuss,21 113 | Four pitfalls of hill climbing: An animated look,5 points by csaid81 23 days ago | discuss,23 114 | "Decision Forests, Convolutional Networks and the Models in-Between",2 points by ebellm 16 days ago | discuss,16 115 | How a Math Genius Hacked OkCupid to Find True Love,15 points by roh_codeur 34 days ago | discuss,34 116 | No developers for PyLearn2,3 points by tfturing 19 days ago | discuss,19 117 | Density Estimation with Dirichlet Process Mixtures using PyMC3,6 points by MidsizeBlowfish 25 days ago | discuss,25 118 | Using survival analysis and git-pandas to estimate code quality,3 points by wdm0006 20 days ago | discuss,20 119 | An Analysis of the Flint Michigan Water Crisis: Part 1 Initial Corrosivity,3 points by JHorn 20 days ago | discuss,20 120 | An Analysis of Republican Twitter Follower Interests,6 points by michelangelo 26 days ago | discuss,26 121 | Introduction to ML talk,8 points by cjbayesian 29 days ago | discuss,29 122 | GloVe vs word2vec revisited,3 points by pmigdal 20 days ago | discuss,20 123 | Overoptimizing: a story about kaggle,4 points by wdm0006 30 days ago | discuss,30 124 | Undergrad Data Analysis/Science internships SF Bay?,3 points by tctctc 15 days ago | 5 comments,15 125 | The Role of Statistical Significance in Growth Hacking,6 points by rawls234 27 days ago | discuss,27 126 | Data Science Course @ Harvard,7 points by rahmaniacc 29 days ago | 2 comments,29 127 | Principal Component Projection Without Principal Component Analysis,6 points by genofon 27 days ago | discuss,27 128 | "Machine Learning: An In-Depth, Non-Technical Guide - Part 3",7 points by innoarchitech 29 days ago | discuss,29 129 | Stochastic Dummy Boosting,2 points by mikeskim 18 days ago | discuss,18 130 | Interactive Map: Hong-Kong through The Lense of Instagram,2 points by BrianN 19 days ago | discuss,19 131 | Data Science at Monsanto,3 points by doctorcroc 22 days ago | discuss,22 132 | Data Science at Instacart,11 points by jeremystan 34 days ago | 3 comments,34 133 | Building a Streaming Search Platform,6 points by ddrum001 28 days ago | discuss,28 134 | Kafka Producer Latency with Large Topic Counts,3 points by marklit 26 days ago | discuss,26 135 | A Sneak Peak of the Cloud: the 2 Minute Intro for Beginners,2 points by andymaheshw 20 days ago | discuss,20 136 | Win-Vector video courses: price/status changes,2 points by 
jmount 20 days ago | discuss,20 137 | 50+ Data Science and Machine Learning Cheat Sheets,20 points by elyase 42 days ago | 1 comment,42 138 | One More Reason Not To Be Scared of Deep Learning,2 points by amplifier_khan 21 days ago | discuss,21 139 | Visual Logic Authoring vs Code,2 points by AnnaOnTheWeb 21 days ago | discuss,21 140 | Data Science in Python online training with hands-on experience,2 points by Puneet 21 days ago | discuss,21 141 | Viewing the US Presidential Primary Through the Lens of Twitter,8 points by michelangelo 33 days ago | discuss,33 142 | Caffe on Spark open sourced,4 points by rahmaniacc 27 days ago | discuss,27 143 | The Ethical Data Scientist,5 points by tfturing 29 days ago | discuss,29 144 | Answers to Frequently Asked Questions in Machine Learning,3 points by rasbt 21 days ago | discuss,21 145 | Intro to A/B Testing and P-Values,2 points by randyzwitch 22 days ago | discuss,22 146 | Visualizing State Level Data With R and Statebins,2 points by usujason 22 days ago | discuss,22 147 | "Probabilistic Graphical Models slides & video lectures (Eric Xing, CMU)",4 points by ororm 28 days ago | discuss,28 148 | Sense2vec with spaCy and Gensim,9 points by elyase 36 days ago | 2 comments,36 149 | A Billion NYC Taxi and Uber Rides in AWS Redshift,3 points by marklit 31 days ago | discuss,31 150 | How to Code and Understand DeepMind's Neural Stack Machine (in Python),2 points by genofon 23 days ago | discuss,23 151 | How to make polished Jupyter presentations with optional code visibility,9 points by csaid81 36 days ago | discuss,36 152 | How to become a Bayesian in eight easy steps,17 points by EtzA 44 days ago | 1 comment,44 153 | Optimizing .*: Details of Vectorization and Metaprogramming in Julia,4 points by randyzwitch 29 days ago | discuss,29 154 | IBM certified Apache Spark Online Training,8 points by divya_jain 36 days ago | discuss,36 155 | Geographic Data Science course,2 points by rk 25 days ago | discuss,25 156 | "The Daily Mail Stole My Visualization, Twice",5 points by thehoff 32 days ago | 1 comment,32 157 | Ensemble Methods: Improved Machine Learning Results,9 points by PyBloggers 38 days ago | discuss,38 158 | Apache Spark and unsupervised learning in security,2 points by gradientflow 26 days ago | discuss,26 159 | MachineJS: Automated machine learning- just give it a data file!,2 points by dsernst 26 days ago | discuss,26 160 | The NSA?s SKYNET program may be killing thousands of innocent people,6 points by zlipp 35 days ago | discuss,35 161 | "Big Dimensions, and What You Can Do About It",2 points by ramsey 27 days ago | discuss,27 162 | Automate Your Oscars Pool with R,2 points by jamesdreiss 27 days ago | discuss,27 163 | Signal Processing with LIGO GW150914 data,9 points by tfturing 39 days ago | discuss,39 164 | Overview of DeZyre and Coursera Data Science Course,5 points by ann928 34 days ago | discuss,34 165 | Upcoming Datathon in NYC,2 points by VicTrey 28 days ago | discuss,28 166 | Summarizing Data in SQL,15 points by elisebreda 46 days ago | discuss,46 167 | A/B Testing for Scammers,2 points by sameermanek 28 days ago | discuss,28 168 | Highly interpretable classifiers for scikit learn using Bayesian decision rules,2 points by mcnulty 28 days ago | discuss,28 169 | Auto-scaling scikit-learn with Spark,11 points by falaki 43 days ago | discuss,43 170 | Where the f*** can I park?,2 points by manugarri 29 days ago | discuss,29 171 | "Machine Learning: An In-Depth, Non-Technical Guide - Part 2",5 points by innoarchitech 36 days ago | discuss,36 
172 | Webhose.io now offers a historical data archive,7 points by databuffer 40 days ago | discuss,40 173 | Meetup: Introduction to Machine Learning Algorithms for Data Science.,4 points by ann928 36 days ago | discuss,36 174 | Exploring the Limits of Language Modeling,8 points by soates 42 days ago | discuss,42 175 | Text Mining South Park,7 points by pmigdal 41 days ago | discuss,41 176 | Finding the K in K-means by Parametric Bootstrap,7 points by jmount 42 days ago | 1 comment,42 177 | Getting Started with Statistics for Data Science,3 points by nickhould 35 days ago | discuss,35 178 | Rodeo 1.3 - Tab-completion for docstrings,3 points by glamp 35 days ago | discuss,35 179 | Teaching D3.js - links,3 points by pmigdal 35 days ago | discuss,35 180 | Parallel scikit-learn on YARN,5 points by stijntonk 39 days ago | discuss,39 181 | Meetup: Free Live Webinar on Prescriptive Analytics for Fun and Profit,2 points by ann928 32 days ago | discuss,32 182 | -------------------------------------------------------------------------------- /text_mining/img/chunk-segmentation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/img/chunk-segmentation.png -------------------------------------------------------------------------------- /text_mining/img/datatau.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/img/datatau.png -------------------------------------------------------------------------------- /text_mining/img/date.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/img/date.png -------------------------------------------------------------------------------- /text_mining/img/entity_extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/img/entity_extraction.png -------------------------------------------------------------------------------- /text_mining/img/gutenberg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/img/gutenberg.png -------------------------------------------------------------------------------- /text_mining/img/punkt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/img/punkt.png -------------------------------------------------------------------------------- /text_mining/img/title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/img/title.png -------------------------------------------------------------------------------- /text_mining/nltk_data.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/nltk_data.zip -------------------------------------------------------------------------------- /time_series/1-Frame.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 1. Frame the Problem\n", 8 | "\n", 9 | "In late 2010, onion prices shot through the roof, causing a grave crisis. The crisis was apparently caused by a lack of rainfall in the major onion-producing regions - Maharashtra and Karnataka - and led to large-scale hoarding by traders. The crisis caused political tension in the country and was described as \"a grave concern\" by then Prime Minister Manmohan Singh.\n", 10 | "\n", 11 | "\n", 12 | "- BBC Article in Dec 2010 - [Stink over onion crisis is enough to make you cry](http://www.bbc.co.uk/blogs/thereporters/soutikbiswas/2010/12/indias_onion_crisis.html)\n", 13 | "- The Hindu OpEd in Dec 2010 - [The political price of onions](http://www.thehindu.com/opinion/editorial/article977100.ece)\n", 14 | "\n", 15 | "![](img/peeling_the_onion_small.png)\n", 16 | "\n", 17 | "So what types of questions about onion prices would you like to ask?\n", 18 | "\n", 19 | "\n", 20 | "## Types of Question\n", 21 | "\n", 22 | "> \"Doing data analysis requires quite a bit of thinking and we believe that when you’ve completed a good data analysis, you’ve spent more time thinking than doing.\" - Roger Peng\n", 23 | "\n", 24 | "1. **Descriptive** - \"seeks to summarize a characteristic of a set of data\"\n", 25 | "2. **Exploratory** - \"analyze the data to see if there are patterns, trends, or relationships between variables\" (hypothesis generating) \n", 26 | "3. **Inferential** - \"a restatement of this proposed hypothesis as a question and would be answered by analyzing a different set of data\" (hypothesis testing)\n", 27 | "4. **Predictive** - \"determine the impact on one factor based on another factor in a population - to make a prediction\"\n", 28 | "5. **Causal** - \"asks whether changing one factor will change another factor in a population - to establish a causal link\" \n", 29 | "6. **Mechanistic** - \"establish *how* the change in one factor results in change in another factor in a population - to determine the exact mechanism\"\n", 30 | "\n", 31 | "\n", 32 | "### Descriptive \n", 33 | "- Which states have the highest onion production and sales?\n", 34 | "- Which cities (mandis) have the highest sales?\n", 35 | "- What is the average price of onion across a year in Bangalore?\n", 36 | "- ...\n", 37 | "\n", 38 | "### Exploratory & Inferential \n", 39 | "- Is there a large difference between the high and low prices of onion in a day?\n", 40 | "- What is the trend of onion prices across days or months in Bangalore?\n", 41 | "- How is the price of onion correlated with the volume of onion?\n", 42 | "- How is the export volume of onion correlated with the domestic production volume?\n", 43 | "- ...\n", 44 | "\n", 45 | "### Predictive \n", 46 | "- What is the price of onion likely to be the next day?\n", 47 | "- What is the price of onion likely to be next month?\n", 48 | "- What will be the sales quantity of onion tomorrow in Delhi?\n", 49 | "- ...\n", 50 | "\n", 51 | "### Causal\n", 52 | "- Does the change in production of onion have an impact on the onion prices? 
\n", 53 | "- Does the change in rainfall in monsoon have an impact on onion prices?\n", 54 | "- ...\n", 55 | "\n", 56 | "### Mechanistic\n", 57 | "- How does change in onion production impact the price of onion?\n", 58 | "- How does onion export volumes impact the prices of onion in local markets in India?\n", 59 | "- ...\n", 60 | "\n", 61 | "\n", 62 | "## Questions we will attempt\n", 63 | "\n", 64 | "### 1. Descriptive: How big is the Bangalore onion market compared to other cities in India?\n", 65 | "\n", 66 | "### 2. Exploratory / Inferential: Have the price variation in onion prices in Bangalore really gone up over the years?\n", 67 | "\n", 68 | "### 3. Predictive: Can we predict the price of onion in Bangalore?" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 3", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.5.1" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 0 102 | } 103 | -------------------------------------------------------------------------------- /time_series/city_geocode.csv: -------------------------------------------------------------------------------- 1 | city,lon,lat 2 | GUWAHATI,91.7362365,26.1445169 3 | KOLKATA,88.363895,22.572646 4 | SRIRAMPUR,88.3385053,23.4033393 5 | SHEROAPHULY,88.3215014,22.7690032 6 | BURDWAN,87.8614793,23.2324214 7 | MIDNAPUR,87.3214908,22.4308892 8 | PURULIA,86.365208,23.3320779 9 | DHULIA,86.0618818,22.0347727 10 | BHUBNESWER,85.8245398,20.2960587 11 | BIHARSHARIF,85.5148735,25.1982147 12 | RANCHI,85.309562,23.3440997 13 | PATNA,85.1375645,25.5940947 14 | BALLIA,84.1487319,25.7584381 15 | DEORIA,83.7838214,26.4862373 16 | GORAKHPUR,83.3731675,26.7605545 17 | VARANASI,82.9739144,25.3176452 18 | RAJAHMUNDRY,81.8040345,17.0005383 19 | RAIPUR,81.6296413,21.2513844 20 | DINDORI,81.0768455,22.9417931 21 | LUCKNOW,80.946166,26.8466937 22 | KANPUR,80.3318736,26.449923 23 | CHENNAI,80.2707184,13.0826802 24 | HALDWANI,79.5129767,29.2182644 25 | BAREILLY,79.4304381,28.3670355 26 | NAGPUR,79.0881546,21.1458004 27 | ETAWAH,79.0046898,26.8117116 28 | SAGAR,78.7378068,23.838805 29 | SAIKHEDA,78.5831181,22.962215 30 | HYDERABAD,78.486671,17.385044 31 | KOLAR,78.1325611,13.1357446 32 | MADURAI,78.1197754,9.9252007 33 | ALIGARH,78.0880129,27.8973944 34 | KURNOOL,78.0372792,15.8281257 35 | DEHRADOON,78.0321918,30.3164945 36 | AGRA,78.0080745,27.1766701 37 | DINDIGUL,77.9802906,10.3673123 38 | CHICKBALLAPUR,77.7280396,13.432366 39 | MEERUT,77.7064137,28.9844618 40 | BANGALORE,77.5945627,12.9715987 41 | BHOPAL,77.412615,23.2599333 42 | RAICHUR,77.3439283,16.2120031 43 | DELHI,77.2090212,28.6139391 44 | SHIMLA,77.1734033,31.1048145 45 | KARNAL,76.9904825,29.6856929 46 | COIMBATORE,76.9558321,11.0168445 47 | PALAYAM,76.9513432,8.5027684 48 | TRIVENDRUM,76.9366376,8.5241391 49 | CHANDIGARH,76.7794179,30.7333148 50 | CHALLAKERE,76.6528225,14.313395 51 | ALWAR,76.6345735,27.5529907 52 | PATIALA,76.3868797,30.3397809 53 | DEVALA,76.3820088,11.4725502 54 | KHANNA,76.2112286,30.697852 55 | HASSAN,76.0995519,13.0068142 56 | DEWAS,76.0507949,22.9622672 57 | 
DHAVANGERE,75.9238397,14.4663438 58 | HOSHIARPUR,75.911483,31.5143178 59 | SOLAPUR,75.9063906,17.6599188 60 | KOTA,75.8647527,25.2138156 61 | INDORE,75.8577258,22.7195687 62 | LUDHIANA,75.8572758,30.900965 63 | JAIPUR,75.7872709,26.9124336 64 | UJJAIN,75.7849097,23.1793013 65 | BIJAPUR,75.710031,16.8301708 66 | JALANDHAR,75.5761829,31.3260152 67 | JALGAON,75.5626039,21.0076578 68 | HUBLI,75.1239547,15.3647083 69 | MANDSOUR,75.0692952,24.076836 70 | BHATINDA,74.9454745,30.210994 71 | SRINAGAR,74.9442585,34.1255413 72 | NEWASA,74.9281063,19.5511772 73 | AMRITSAR,74.8722642,31.6339793 74 | NEEMUCH,74.8624092,24.4763852 75 | JAMMU,74.8576539,32.7217819 76 | AHMEDNAGAR,74.7495916,19.0952075 77 | SHRIRAMPUR,74.6576091,19.6222323 78 | RAHURI,74.6488264,19.392678 79 | AJMER,74.6399163,26.4498954 80 | SANGALI,74.5814773,16.8523973 81 | MALEGAON,74.5100291,20.5547497 82 | BELGAUM,74.4976741,15.8496953 83 | RAHATA,74.483335,19.7127021 84 | YEOLA,74.4818698,20.0471229 85 | KOPERGAON,74.4790898,19.8916791 86 | MANMAD,74.4366016,20.2511789 87 | PHALTAN ,74.4360424,17.9844507 88 | CHANDVAD,74.2472779,20.3271277 89 | KOLHAPUR,74.2432527,16.7049873 90 | LASALGAON,74.2326058,20.1491422 91 | SANGAMNER,74.2079648,19.5771387 92 | SATANA,74.2032581,20.598224 93 | ABOHAR,74.1993043,30.1452928 94 | LONAND,74.1861821,18.041706 95 | NIPHAD,74.1093141,20.0799646 96 | SINNAR,74.0006328,19.8530593 97 | PIMPALGAON,73.9873787,20.1699678 98 | SRIGANGANAGAR,73.8771901,29.9038399 99 | JUNNAR,73.87425,19.2031842 100 | CHAKAN,73.8630346,18.7602664 101 | PUNE,73.8567437,18.5204303 102 | NASIK,73.7898023,19.9974533 103 | UDAIPUR,73.712479,24.585445 104 | BIKANER,73.3119159,28.0229348 105 | JODHPUR,73.0243094,26.2389469 106 | NANDGAON,72.9276008,18.3855337 107 | MUMBAI,72.8776559,19.0759837 108 | SURAT,72.8310607,21.1702401 109 | AHMEDABAD,72.5713621,23.022505 110 | DEESA,72.1906721,24.2585031 111 | BHAVNAGAR,72.1519304,21.7644725 112 | MAHUVA,71.7563169,21.0902193 113 | RAJKOT,70.8021599,22.3038945 114 | GONDAL,70.792297,21.9619463 115 | JAMNAGAR,70.05773,22.4707019 116 | KALVAN,73.13054,19.24033 117 | VANI,73.89189,20.33749 118 | BOMBORI,72.87766,19.07598 -------------------------------------------------------------------------------- /time_series/img/Cov_nonstationary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/Cov_nonstationary.png -------------------------------------------------------------------------------- /time_series/img/Mean_nonstationary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/Mean_nonstationary.png -------------------------------------------------------------------------------- /time_series/img/Var_nonstationary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/Var_nonstationary.png -------------------------------------------------------------------------------- /time_series/img/left_merge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/left_merge.png 
-------------------------------------------------------------------------------- /time_series/img/onion_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/onion_small.png -------------------------------------------------------------------------------- /time_series/img/onion_tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/onion_tables.png -------------------------------------------------------------------------------- /time_series/img/peeling_the_onion_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/peeling_the_onion_small.png -------------------------------------------------------------------------------- /time_series/img/pivot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/pivot.png -------------------------------------------------------------------------------- /time_series/img/splitapplycombine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/splitapplycombine.png -------------------------------------------------------------------------------- /time_series/img/subsetcolumns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/subsetcolumns.png -------------------------------------------------------------------------------- /time_series/img/subsetrows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/subsetrows.png -------------------------------------------------------------------------------- /time_series/state_geocode.csv: -------------------------------------------------------------------------------- 1 | "state","name","lon","lat" 2 | "MS","Maharashtra",75.7138884,19.7514798 3 | "GUJ","Gujarat",71.1923805,22.258652 4 | "MP","Madhya pradesh",78.6568942,22.9734229 5 | "TN","Tamil Nadu",78.6568942,11.1271225 6 | "KNT","Karnataka",75.7138884,15.3172775 7 | "DEL","Delhi",77.2090212,28.6139391 8 | "HR","Haryana",76.085601,29.0587757 9 | "RAJ","Rajasthan",74.2179326,27.0238036 10 | "AP","Andhra Pradesh",79.7399875,15.9128998 11 | "UP","Uttar Pradesh",80.9461592,26.8467088 12 | "JK","Jammu & Kashmir",74.8576539,32.7217819 13 | "BHR","Bihar",85.3131194,25.0960742 14 | "WB","West Bengal",87.8549755,22.9867569 15 | "HP","Himachal Pradesh",77.1733901,31.1048294 16 | "ASM","Assam",92.9375739,26.2006043 17 | "KEL","Kerala",76.2710833,10.8505159 18 | "JH","Jharkhand",85.2799354,23.6101808 19 | "OR","Orissa",85.0985236,20.9516658 20 | "PB","Punjab",75.3412179,31.1471305 21 | "KER","Kerala",76.2710833,10.8505159 22 | "CH","Chandigarh",76.7794179,30.7333148 23 | 
--------------------------------------------------------------------------------