├── .gitignore ├── LICENSE ├── README.md ├── RF_GBM ├── data │ ├── test.csv │ └── train.csv └── notebook │ ├── Bank Marketing.ipynb │ └── img │ ├── boosting.jpg │ ├── confusion_matrix.jpg │ ├── cv.png │ ├── onehot.png │ ├── random_forest.jpg │ ├── tree_ensemble1.png │ └── tree_ensemble2.png ├── cf_mba ├── data │ ├── groceries.csv │ ├── groceries_mba.csv │ └── lastfm-matrix-germany.csv └── notebook │ ├── 1. Collaborative Filtering.ipynb │ ├── 2. Market Basket Analysis.ipynb │ └── img │ ├── basket.jpg │ └── cosine.png ├── check_env.py ├── img ├── ISLR.jpeg ├── acquire.jpg ├── amit.png ├── approach.jpg ├── art.jpeg ├── bargava.jpg ├── book.png ├── books.jpg ├── break.jpg ├── clay.jpeg ├── craft.jpeg ├── estimating_coefficients.png ├── explore.jpg ├── frame.jpg ├── glass.jpg ├── insight.jpg ├── lens.jpeg ├── model.jpg ├── numbers.jpg ├── onion-image.jpg ├── onion.jpg ├── onion.png ├── overview.jpg ├── pair.jpg ├── postit.jpg ├── r2.gif ├── r_squared.png ├── refine.jpg ├── retail.jpg ├── science.jpeg ├── see.jpeg ├── single.jpeg ├── skills.png ├── slope_intercept.png ├── speak.jpeg ├── sports.jpg ├── stars.jpg ├── think.jpg ├── thinkstats.jpg ├── time.jpg ├── tool.jpg ├── travel.jpg ├── welcome.jpg ├── wesmckinney.jpg └── workshop.jpg ├── installation_instructions.md ├── overview.md ├── overview.pdf ├── python.txt ├── text_mining ├── Acquire.ipynb ├── DataTau.html ├── Explore.ipynb ├── Model.ipynb ├── Refine.ipynb ├── data_tau.csv ├── data_tau_days.csv ├── data_tau_ta.csv ├── flatlands.txt ├── img │ ├── chunk-segmentation.png │ ├── datatau.png │ ├── date.png │ ├── entity_extraction.png │ ├── gutenberg.png │ ├── punkt.png │ └── title.png ├── negative_words.txt ├── nltk_data.zip └── postive_words.txt └── time_series ├── 1-Frame.ipynb ├── 2-Acquire.ipynb ├── 3-Refine.ipynb ├── 4-Explore.ipynb ├── 5-Model.ipynb ├── 6-Insight.ipynb ├── MonthWiseMarketArrivals.csv ├── MonthWiseMarketArrivals.html ├── MonthWiseMarketArrivalsJan2016.html ├── MonthWiseMarketArrivals_Clean.csv ├── city_geocode.csv ├── img ├── Cov_nonstationary.png ├── Mean_nonstationary.png ├── Var_nonstationary.png ├── corr.svg ├── left_merge.png ├── onion_small.png ├── onion_tables.png ├── peeling_the_onion_small.png ├── pivot.png ├── splitapplycombine.png ├── subsetcolumns.png └── subsetrows.png └── state_geocode.csv /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | .Rproj.user 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Amit Kapoor 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning 2 | Workshop material for Machine Learning in Python 3 | by [Amit Kapoor](http://twitter.com/amitkaps) and [Bargava Subramanian](http://twitter.com/bargava) 4 | 5 | 0. [Overview](/overview.pdf) 6 | 7 | 1. [Time Series](/time_series) *(8 hours, Case - Peeling the Onion)* 8 | - Linear Trend Model 9 | - Random Walk 10 | - Moving Average 11 | - Exponential Smoothing 12 | - Decomposition 13 | - ARIMA Models 14 | - Tweaking Model Parameters 15 | 16 | 2. [Association Rule Mining](/cf_mba) *(4 hours, Case - Grocery)* 17 | - Apriori Algorithm 18 | - Market Basket Analysis 19 | 20 | 3. [Random Forest / Gradient Boosting](/RF_GBM) *(4 hours, Case - Bank Marketing)* 21 | - Intro to Ensemble Models, Bagging and Boosting 22 | - Gradient Boosting Classifier & Regressor 23 | - Random Forest Classifier & Regressor 24 | - Tuning Model Parameters 25 | 26 | 4. [Text Mining](/text_mining) *(6 hours, Case - DataTau)* 27 | - Regular Expression 28 | - Stopword Removal, Stemming 29 | - Word Cloud 30 | - Creating features from text 31 | - Term Frequency and Inverse Document Frequency (TF-IDF) 32 | - Topic Modeling - Latent Dirichlet Allocation (LDA) 33 | - Sentiment Analysis 34 | 35 | 36 | ### Script to check if the requisite libraries for the workshop are present 37 | Please execute the following at the command prompt: 38 | 39 | $ python check_env.py 40 | 41 | If any library has a `FAIL` message, please install/upgrade that library. 42 | 43 | Installation instructions can be found [here](https://github.com/amitkaps/machine-learning/blob/master/installation_instructions.md) 44 | 45 | --- 46 | ### Licensing 47 | 48 | Machine Learning using Python by Amit Kapoor and Bargava Subramanian is licensed under an MIT License. 49 | -------------------------------------------------------------------------------- /RF_GBM/notebook/Bank Marketing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Frame\n", 8 | "\n", 9 | "The client bank *XYZ* is running a direct marketing campaign. It wants to identify customers who are likely to buy its new term deposit plan.\n", 10 | "\n", 11 | "# Acquire\n", 12 | "\n", 13 | "Data is obtained from the UCI Machine Learning repository. \n", 14 | "http://mlr.cs.umass.edu/ml/datasets/Bank+Marketing\n", 15 | "\n", 16 | "Data from a direct marketing campaign (phone calls) of a Portuguese bank is provided. \n", 17 | "\n", 18 | "### Attribute Information:\n", 19 | "\n", 20 | "\n", 21 | "#### bank client data:\n", 22 | "\n", 23 | "1. age (numeric)\n", 24 | "2. job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')\n", 25 | "3. marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)\n", 26 | "4. 
education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')\n", 27 | "5. default: has credit in default? (categorical: 'no','yes','unknown')\n", 28 | "6. housing: has housing loan? (categorical: 'no','yes','unknown')\n", 29 | "7. loan: has personal loan? (categorical: 'no','yes','unknown')\n", 30 | "\n", 31 | "#### related with the last contact of the current campaign:\n", 32 | "\n", 33 | "8. contact: contact communication type (categorical: 'cellular','telephone') \n", 34 | "9. month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')\n", 35 | "10. day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')\n", 36 | "11. duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.\n", 37 | "\n", 38 | "#### other attributes:\n", 39 | "\n", 40 | "12. campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)\n", 41 | "13. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)\n", 42 | "14. previous: number of contacts performed before this campaign and for this client (numeric)\n", 43 | "15. poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')\n", 44 | "\n", 45 | "#### social and economic context attributes\n", 46 | "\n", 47 | "16. emp.var.rate: employment variation rate - quarterly indicator (numeric)\n", 48 | "17. cons.price.idx: consumer price index - monthly indicator (numeric) \n", 49 | "18. cons.conf.idx: consumer confidence index - monthly indicator (numeric) \n", 50 | "19. euribor3m: euribor 3 month rate - daily indicator (numeric)\n", 51 | "20. nr.employed: number of employees - quarterly indicator (numeric)\n", 52 | "\n", 53 | "#### Output variable (desired target):\n", 54 | "y - has the client subscribed a term deposit? (binary: 'yes','no')\n", 55 | "\n", 56 | "The given data is randomly divided into train and test for the purpose of this workshop. Build the model for train and use it to predict on test. 
\n", 57 | "\n", 58 | "# Explore" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 1, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "#Import the necessary libraries\n", 70 | "import numpy as np\n", 71 | "import pandas as pd" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 2, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "#Read the train and test data\n", 83 | "train = pd.read_csv(\"../data/train.csv\")\n", 84 | "test = pd.read_csv(\"../data/test.csv\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "**Exercise 1**\n", 92 | "\n", 93 | "print the number of rows and columns of train and test" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 16, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "(35211, 17) (10000, 17)\n" 108 | ] 109 | } 110 | ], 111 | "source": [] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "**Exercise 2**\n", 118 | "\n", 119 | "Print the first 10 rows of train" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 4, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/html": [ 132 | "
\n", 133 | "\n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | "
agejobmaritaleducationdefaultbalancehousingloancontactdaymonthdurationcampaignpdayspreviouspoutcomedeposit
058managementmarriedtertiaryno2143yesnounknown5may2611-10unknownno
144techniciansinglesecondaryno29yesnounknown5may1511-10unknownno
233entrepreneurmarriedsecondaryno2yesyesunknown5may761-10unknownno
347blue-collarmarriedunknownno1506yesnounknown5may921-10unknownno
433unknownsingleunknownno1nonounknown5may1981-10unknownno
\n", 259 | "
" 260 | ], 261 | "text/plain": [ 262 | " age job marital education default balance housing loan \\\n", 263 | "0 58 management married tertiary no 2143 yes no \n", 264 | "1 44 technician single secondary no 29 yes no \n", 265 | "2 33 entrepreneur married secondary no 2 yes yes \n", 266 | "3 47 blue-collar married unknown no 1506 yes no \n", 267 | "4 33 unknown single unknown no 1 no no \n", 268 | "\n", 269 | " contact day month duration campaign pdays previous poutcome deposit \n", 270 | "0 unknown 5 may 261 1 -1 0 unknown no \n", 271 | "1 unknown 5 may 151 1 -1 0 unknown no \n", 272 | "2 unknown 5 may 76 1 -1 0 unknown no \n", 273 | "3 unknown 5 may 92 1 -1 0 unknown no \n", 274 | "4 unknown 5 may 198 1 -1 0 unknown no " 275 | ] 276 | }, 277 | "execution_count": 4, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "**Exercise 3**\n", 289 | "\n", 290 | "Print the column types of train and test. Are they the same in both train and test?" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 5, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [ 300 | { 301 | "data": { 302 | "text/plain": [ 303 | "age int64\n", 304 | "job object\n", 305 | "marital object\n", 306 | "education object\n", 307 | "default object\n", 308 | "balance int64\n", 309 | "housing object\n", 310 | "loan object\n", 311 | "contact object\n", 312 | "day int64\n", 313 | "month object\n", 314 | "duration int64\n", 315 | "campaign int64\n", 316 | "pdays int64\n", 317 | "previous int64\n", 318 | "poutcome object\n", 319 | "deposit object\n", 320 | "dtype: object" 321 | ] 322 | }, 323 | "execution_count": 5, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "#train" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 6, 335 | "metadata": { 336 | "collapsed": false 337 | }, 338 | "outputs": [ 339 | { 340 | "data": { 341 | "text/plain": [ 342 | "age int64\n", 343 | "job object\n", 344 | "marital object\n", 345 | "education object\n", 346 | "default object\n", 347 | "balance int64\n", 348 | "housing object\n", 349 | "loan object\n", 350 | "contact object\n", 351 | "day int64\n", 352 | "month object\n", 353 | "duration int64\n", 354 | "campaign int64\n", 355 | "pdays int64\n", 356 | "previous int64\n", 357 | "poutcome object\n", 358 | "deposit object\n", 359 | "dtype: object" 360 | ] 361 | }, 362 | "execution_count": 6, 363 | "metadata": {}, 364 | "output_type": "execute_result" 365 | } 366 | ], 367 | "source": [ 368 | "#test" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 7, 374 | "metadata": { 375 | "collapsed": true 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "#Are they the same?" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 64, 385 | "metadata": { 386 | "collapsed": true 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "#Combine train and test\n", 391 | "frames = [train, test]\n", 392 | "input = pd.concat(frames)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 9, 398 | "metadata": { 399 | "collapsed": false 400 | }, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/html": [ 405 | "
\n", 406 | "\n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | "
agejobmaritaleducationdefaultbalancehousingloancontactdaymonthdurationcampaignpdayspreviouspoutcomedeposit
058managementmarriedtertiaryno2143yesnounknown5may2611-10unknownno
144techniciansinglesecondaryno29yesnounknown5may1511-10unknownno
233entrepreneurmarriedsecondaryno2yesyesunknown5may761-10unknownno
347blue-collarmarriedunknownno1506yesnounknown5may921-10unknownno
433unknownsingleunknownno1nonounknown5may1981-10unknownno
\n", 532 | "
" 533 | ], 534 | "text/plain": [ 535 | " age job marital education default balance housing loan \\\n", 536 | "0 58 management married tertiary no 2143 yes no \n", 537 | "1 44 technician single secondary no 29 yes no \n", 538 | "2 33 entrepreneur married secondary no 2 yes yes \n", 539 | "3 47 blue-collar married unknown no 1506 yes no \n", 540 | "4 33 unknown single unknown no 1 no no \n", 541 | "\n", 542 | " contact day month duration campaign pdays previous poutcome deposit \n", 543 | "0 unknown 5 may 261 1 -1 0 unknown no \n", 544 | "1 unknown 5 may 151 1 -1 0 unknown no \n", 545 | "2 unknown 5 may 76 1 -1 0 unknown no \n", 546 | "3 unknown 5 may 92 1 -1 0 unknown no \n", 547 | "4 unknown 5 may 198 1 -1 0 unknown no " 548 | ] 549 | }, 550 | "execution_count": 9, 551 | "metadata": {}, 552 | "output_type": "execute_result" 553 | } 554 | ], 555 | "source": [ 556 | "#Print first 10 records of input" 557 | ] 558 | }, 559 | { 560 | "cell_type": "markdown", 561 | "metadata": {}, 562 | "source": [ 563 | "**Exercise 4**\n", 564 | "\n", 565 | "Find if any column has missing value\n", 566 | "There is a `pd.isnull` function. How to use that?" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 12, 572 | "metadata": { 573 | "collapsed": false 574 | }, 575 | "outputs": [ 576 | { 577 | "data": { 578 | "text/plain": [ 579 | "age 0\n", 580 | "job 0\n", 581 | "marital 0\n", 582 | "education 0\n", 583 | "default 0\n", 584 | "balance 0\n", 585 | "housing 0\n", 586 | "loan 0\n", 587 | "contact 0\n", 588 | "day 0\n", 589 | "month 0\n", 590 | "duration 0\n", 591 | "campaign 0\n", 592 | "pdays 0\n", 593 | "previous 0\n", 594 | "poutcome 0\n", 595 | "deposit 0\n", 596 | "dtype: int64" 597 | ] 598 | }, 599 | "execution_count": 12, 600 | "metadata": {}, 601 | "output_type": "execute_result" 602 | } 603 | ], 604 | "source": [] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 65, 609 | "metadata": { 610 | "collapsed": false 611 | }, 612 | "outputs": [], 613 | "source": [ 614 | "#Replace deposit with a numeric column\n", 615 | "#First, set all labels to be 0\n", 616 | "input.at[:, \"depositLabel\"] = 0\n", 617 | "#Now, set depositLabel to 1 whenever deposit is yes\n", 618 | "input.at[input.deposit==\"yes\", \"depositLabel\"] = 1" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": { 625 | "collapsed": true 626 | }, 627 | "outputs": [], 628 | "source": [] 629 | }, 630 | { 631 | "cell_type": "markdown", 632 | "metadata": {}, 633 | "source": [ 634 | "**Exercise 5**\n", 635 | "\n", 636 | "Find % of customers in the input dataset who have purchased the term deposit" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 72, 642 | "metadata": { 643 | "collapsed": false 644 | }, 645 | "outputs": [ 646 | { 647 | "data": { 648 | "text/plain": [ 649 | "11.698480458295547" 650 | ] 651 | }, 652 | "execution_count": 72, 653 | "metadata": {}, 654 | "output_type": "execute_result" 655 | } 656 | ], 657 | "source": [] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 75, 662 | "metadata": { 663 | "collapsed": false 664 | }, 665 | "outputs": [ 666 | { 667 | "data": { 668 | "text/plain": [ 669 | "0 0\n", 670 | "1 0\n", 671 | "2 0\n", 672 | "3 0\n", 673 | "4 0\n", 674 | "5 0\n", 675 | "6 0\n", 676 | "7 0\n", 677 | "8 0\n", 678 | "9 0\n", 679 | "10 0\n", 680 | "11 0\n", 681 | "12 0\n", 682 | "13 0\n", 683 | "14 0\n", 684 | "15 0\n", 685 | "16 0\n", 686 | "17 0\n", 687 | "18 0\n", 688 | "19 0\n", 689 | 
"20 0\n", 690 | "21 0\n", 691 | "22 0\n", 692 | "23 0\n", 693 | "24 0\n", 694 | "25 0\n", 695 | "26 0\n", 696 | "27 0\n", 697 | "28 0\n", 698 | "29 0\n", 699 | " ..\n", 700 | "9970 1\n", 701 | "9971 1\n", 702 | "9972 1\n", 703 | "9973 1\n", 704 | "9974 1\n", 705 | "9975 1\n", 706 | "9976 1\n", 707 | "9977 1\n", 708 | "9978 1\n", 709 | "9979 1\n", 710 | "9980 1\n", 711 | "9981 1\n", 712 | "9982 1\n", 713 | "9983 1\n", 714 | "9984 1\n", 715 | "9985 1\n", 716 | "9986 1\n", 717 | "9987 1\n", 718 | "9988 1\n", 719 | "9989 1\n", 720 | "9990 1\n", 721 | "9991 1\n", 722 | "9992 1\n", 723 | "9993 1\n", 724 | "9994 1\n", 725 | "9995 1\n", 726 | "9996 1\n", 727 | "9997 1\n", 728 | "9998 1\n", 729 | "9999 1\n", 730 | "Name: depositLabel, dtype: int64" 731 | ] 732 | }, 733 | "execution_count": 75, 734 | "metadata": {}, 735 | "output_type": "execute_result" 736 | } 737 | ], 738 | "source": [ 739 | "#Create the labels \n", 740 | "labels = \n", 741 | "labels" 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": 83, 747 | "metadata": { 748 | "collapsed": false 749 | }, 750 | "outputs": [], 751 | "source": [ 752 | "#Drop the deposit column \n", 753 | "input.drop([\"deposit\", \"depositLabel\"], axis=1)" 754 | ] 755 | }, 756 | { 757 | "cell_type": "markdown", 758 | "metadata": {}, 759 | "source": [ 760 | "**Exercise 6**\n", 761 | "\n", 762 | "Did it drop? If not, what has to be done?" 763 | ] 764 | }, 765 | { 766 | "cell_type": "markdown", 767 | "metadata": {}, 768 | "source": [ 769 | "**Exercise 7**\n", 770 | "\n", 771 | "Print columnn names of input" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": null, 777 | "metadata": { 778 | "collapsed": true 779 | }, 780 | "outputs": [], 781 | "source": [] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": 85, 786 | "metadata": { 787 | "collapsed": false 788 | }, 789 | "outputs": [], 790 | "source": [ 791 | "#Get list of columns that are continuous/integer\n", 792 | "continuous_variables = input.dtypes[input.dtypes != \"object\"].index" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": 86, 798 | "metadata": { 799 | "collapsed": false 800 | }, 801 | "outputs": [ 802 | { 803 | "data": { 804 | "text/plain": [ 805 | "Index([u'age', u'balance', u'day', u'duration', u'campaign', u'pdays',\n", 806 | " u'previous'],\n", 807 | " dtype='object')" 808 | ] 809 | }, 810 | "execution_count": 86, 811 | "metadata": {}, 812 | "output_type": "execute_result" 813 | } 814 | ], 815 | "source": [ 816 | "continuous_variables" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": 87, 822 | "metadata": { 823 | "collapsed": false 824 | }, 825 | "outputs": [], 826 | "source": [ 827 | "#Get list of columns that are categorical\n", 828 | "categorical_variables = input.dtypes[input.dtypes==\"object\"].index" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": 88, 834 | "metadata": { 835 | "collapsed": false 836 | }, 837 | "outputs": [ 838 | { 839 | "data": { 840 | "text/plain": [ 841 | "Index([u'job', u'marital', u'education', u'default', u'housing', u'loan',\n", 842 | " u'contact', u'month', u'poutcome'],\n", 843 | " dtype='object')" 844 | ] 845 | }, 846 | "execution_count": 88, 847 | "metadata": {}, 848 | "output_type": "execute_result" 849 | } 850 | ], 851 | "source": [ 852 | "categorical_variables" 853 | ] 854 | }, 855 | { 856 | "cell_type": "markdown", 857 | "metadata": {}, 858 | "source": [ 859 | "**Exercise 8**\n", 860 | "\n", 861 | "Create 
`inputInteger` and `inputCategorical` - two datasets - one having integer variables and another having categorical variables" 862 | ] 863 | }, 864 | { 865 | "cell_type": "code", 866 | "execution_count": 89, 867 | "metadata": { 868 | "collapsed": false 869 | }, 870 | "outputs": [], 871 | "source": [ 872 | "inputInteger = " 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "execution_count": 91, 878 | "metadata": { 879 | "collapsed": false 880 | }, 881 | "outputs": [ 882 | { 883 | "data": { 884 | "text/html": [ 885 | "
\n", 886 | "\n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | "
agebalancedaydurationcampaignpdaysprevious
058214352611-10
1442951511-10
23325761-10
34715065921-10
433151981-10
\n", 952 | "
" 953 | ], 954 | "text/plain": [ 955 | " age balance day duration campaign pdays previous\n", 956 | "0 58 2143 5 261 1 -1 0\n", 957 | "1 44 29 5 151 1 -1 0\n", 958 | "2 33 2 5 76 1 -1 0\n", 959 | "3 47 1506 5 92 1 -1 0\n", 960 | "4 33 1 5 198 1 -1 0" 961 | ] 962 | }, 963 | "execution_count": 91, 964 | "metadata": {}, 965 | "output_type": "execute_result" 966 | } 967 | ], 968 | "source": [ 969 | "#print inputInteger\n", 970 | "inputInteger.head()" 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": 93, 976 | "metadata": { 977 | "collapsed": false 978 | }, 979 | "outputs": [], 980 | "source": [ 981 | "inputCategorical = " 982 | ] 983 | }, 984 | { 985 | "cell_type": "code", 986 | "execution_count": 94, 987 | "metadata": { 988 | "collapsed": false 989 | }, 990 | "outputs": [ 991 | { 992 | "data": { 993 | "text/html": [ 994 | "
\n", 995 | "\n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | "
jobmaritaleducationdefaulthousingloancontactmonthpoutcome
0managementmarriedtertiarynoyesnounknownmayunknown
1techniciansinglesecondarynoyesnounknownmayunknown
2entrepreneurmarriedsecondarynoyesyesunknownmayunknown
3blue-collarmarriedunknownnoyesnounknownmayunknown
4unknownsingleunknownnononounknownmayunknown
\n", 1073 | "
" 1074 | ], 1075 | "text/plain": [ 1076 | " job marital education default housing loan contact month \\\n", 1077 | "0 management married tertiary no yes no unknown may \n", 1078 | "1 technician single secondary no yes no unknown may \n", 1079 | "2 entrepreneur married secondary no yes yes unknown may \n", 1080 | "3 blue-collar married unknown no yes no unknown may \n", 1081 | "4 unknown single unknown no no no unknown may \n", 1082 | "\n", 1083 | " poutcome \n", 1084 | "0 unknown \n", 1085 | "1 unknown \n", 1086 | "2 unknown \n", 1087 | "3 unknown \n", 1088 | "4 unknown " 1089 | ] 1090 | }, 1091 | "execution_count": 94, 1092 | "metadata": {}, 1093 | "output_type": "execute_result" 1094 | } 1095 | ], 1096 | "source": [ 1097 | "#print inputCategorical\n", 1098 | "inputCategorical.head()" 1099 | ] 1100 | }, 1101 | { 1102 | "cell_type": "code", 1103 | "execution_count": 101, 1104 | "metadata": { 1105 | "collapsed": true 1106 | }, 1107 | "outputs": [], 1108 | "source": [ 1109 | "#Convert categorical variables into Labels using labelEncoder\n", 1110 | "\n", 1111 | "inputCategorical = np.array(inputCategorical)\n" 1112 | ] 1113 | }, 1114 | { 1115 | "cell_type": "markdown", 1116 | "metadata": {}, 1117 | "source": [ 1118 | "**Exercise 9**\n", 1119 | "\n", 1120 | "Find length of `categorical_variables`" 1121 | ] 1122 | }, 1123 | { 1124 | "cell_type": "code", 1125 | "execution_count": 102, 1126 | "metadata": { 1127 | "collapsed": false 1128 | }, 1129 | "outputs": [ 1130 | { 1131 | "data": { 1132 | "text/plain": [ 1133 | "9" 1134 | ] 1135 | }, 1136 | "execution_count": 102, 1137 | "metadata": {}, 1138 | "output_type": "execute_result" 1139 | } 1140 | ], 1141 | "source": [] 1142 | }, 1143 | { 1144 | "cell_type": "code", 1145 | "execution_count": 119, 1146 | "metadata": { 1147 | "collapsed": true 1148 | }, 1149 | "outputs": [], 1150 | "source": [ 1151 | "#Load the preprocessing module\n", 1152 | "from sklearn import preprocessing" 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "execution_count": 103, 1158 | "metadata": { 1159 | "collapsed": true 1160 | }, 1161 | "outputs": [], 1162 | "source": [ 1163 | "for i in range(len(categorical_variables)):\n", 1164 | " lbl = preprocessing.LabelEncoder()\n", 1165 | " lbl.fit(list(inputCategorical[:,i]))\n", 1166 | " inputCategorical[:, i] = lbl.transform(inputCategorical[:, i])" 1167 | ] 1168 | }, 1169 | { 1170 | "cell_type": "code", 1171 | "execution_count": 105, 1172 | "metadata": { 1173 | "collapsed": true 1174 | }, 1175 | "outputs": [], 1176 | "source": [ 1177 | "#print inputCategorical" 1178 | ] 1179 | }, 1180 | { 1181 | "cell_type": "markdown", 1182 | "metadata": {}, 1183 | "source": [ 1184 | "**Exercise 10**\n", 1185 | "\n", 1186 | "Convert `inputInteger` to `numpy` array" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "code", 1191 | "execution_count": 107, 1192 | "metadata": { 1193 | "collapsed": false 1194 | }, 1195 | "outputs": [ 1196 | { 1197 | "data": { 1198 | "text/plain": [ 1199 | "array([[ 58, 2143, 5, ..., 1, -1, 0],\n", 1200 | " [ 44, 29, 5, ..., 1, -1, 0],\n", 1201 | " [ 33, 2, 5, ..., 1, -1, 0],\n", 1202 | " ..., \n", 1203 | " [ 69, 247, 22, ..., 2, -1, 0],\n", 1204 | " [ 48, 0, 28, ..., 2, -1, 0],\n", 1205 | " [ 31, 131, 15, ..., 1, -1, 0]])" 1206 | ] 1207 | }, 1208 | "execution_count": 107, 1209 | "metadata": {}, 1210 | "output_type": "execute_result" 1211 | } 1212 | ], 1213 | "source": [ 1214 | "inputInteger = \n", 1215 | "inputInteger" 1216 | ] 1217 | }, 1218 | { 1219 | "cell_type": "markdown", 1220 | "metadata": {}, 1221 | 
"source": [ 1222 | "**Exercise 11**\n", 1223 | "\n", 1224 | "Now, create the `inputUpdated` array that has both `inputInteger` and `inputCategorical` concatenated\n", 1225 | "\n", 1226 | "*Hint* Check function called `vstack` and `hstack`" 1227 | ] 1228 | }, 1229 | { 1230 | "cell_type": "code", 1231 | "execution_count": null, 1232 | "metadata": { 1233 | "collapsed": false 1234 | }, 1235 | "outputs": [], 1236 | "source": [] 1237 | }, 1238 | { 1239 | "cell_type": "code", 1240 | "execution_count": 118, 1241 | "metadata": { 1242 | "collapsed": false 1243 | }, 1244 | "outputs": [ 1245 | { 1246 | "data": { 1247 | "text/plain": [ 1248 | "(45211, 16)" 1249 | ] 1250 | }, 1251 | "execution_count": 118, 1252 | "metadata": {}, 1253 | "output_type": "execute_result" 1254 | } 1255 | ], 1256 | "source": [ 1257 | "inputUpdated.shape" 1258 | ] 1259 | }, 1260 | { 1261 | "cell_type": "markdown", 1262 | "metadata": {}, 1263 | "source": [ 1264 | "## Train the model\n", 1265 | "\n", 1266 | "### Model 1: Decision Tree" 1267 | ] 1268 | }, 1269 | { 1270 | "cell_type": "code", 1271 | "execution_count": 125, 1272 | "metadata": { 1273 | "collapsed": true 1274 | }, 1275 | "outputs": [], 1276 | "source": [ 1277 | "from sklearn import tree\n", 1278 | "from sklearn.externals.six import StringIO\n", 1279 | "import pydot" 1280 | ] 1281 | }, 1282 | { 1283 | "cell_type": "code", 1284 | "execution_count": 126, 1285 | "metadata": { 1286 | "collapsed": true 1287 | }, 1288 | "outputs": [], 1289 | "source": [ 1290 | "bankModelDT = tree.DecisionTreeClassifier(max_depth=2)" 1291 | ] 1292 | }, 1293 | { 1294 | "cell_type": "code", 1295 | "execution_count": 127, 1296 | "metadata": { 1297 | "collapsed": false 1298 | }, 1299 | "outputs": [ 1300 | { 1301 | "data": { 1302 | "text/plain": [ 1303 | "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,\n", 1304 | " max_features=None, max_leaf_nodes=None, min_samples_leaf=1,\n", 1305 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 1306 | " presort=False, random_state=None, splitter='best')" 1307 | ] 1308 | }, 1309 | "execution_count": 127, 1310 | "metadata": {}, 1311 | "output_type": "execute_result" 1312 | } 1313 | ], 1314 | "source": [ 1315 | "bankModelDT.fit(inputUpdated[:train.shape[0],:], labels[:train.shape[0]])" 1316 | ] 1317 | }, 1318 | { 1319 | "cell_type": "code", 1320 | "execution_count": 128, 1321 | "metadata": { 1322 | "collapsed": false 1323 | }, 1324 | "outputs": [ 1325 | { 1326 | "data": { 1327 | "text/plain": [ 1328 | "True" 1329 | ] 1330 | }, 1331 | "execution_count": 128, 1332 | "metadata": {}, 1333 | "output_type": "execute_result" 1334 | } 1335 | ], 1336 | "source": [ 1337 | "dot_data = StringIO() \n", 1338 | "tree.export_graphviz(bankModelDT, out_file=dot_data) \n", 1339 | "graph = pydot.graph_from_dot_data(dot_data.getvalue()) \n", 1340 | "graph.write_pdf(\"bankDT.pdf\") " 1341 | ] 1342 | }, 1343 | { 1344 | "cell_type": "code", 1345 | "execution_count": 129, 1346 | "metadata": { 1347 | "collapsed": true 1348 | }, 1349 | "outputs": [], 1350 | "source": [ 1351 | "#Check the pdf" 1352 | ] 1353 | }, 1354 | { 1355 | "cell_type": "markdown", 1356 | "metadata": {}, 1357 | "source": [ 1358 | "**Exercise 12**\n", 1359 | "\n", 1360 | "Now, change the max_depth = 6 and check the results.\n", 1361 | "\n", 1362 | "Then, change the max_depth= None and check the results" 1363 | ] 1364 | }, 1365 | { 1366 | "cell_type": "code", 1367 | "execution_count": null, 1368 | "metadata": { 1369 | "collapsed": false 1370 | }, 1371 | "outputs": [], 1372 | "source": [] 
1373 | }, 1374 | { 1375 | "cell_type": "code", 1376 | "execution_count": 144, 1377 | "metadata": { 1378 | "collapsed": true 1379 | }, 1380 | "outputs": [], 1381 | "source": [ 1382 | "# Prediction\n", 1383 | "prediction_DT = bankModelDT.predict(inputUpdated[train.shape[0]:,:])" 1384 | ] 1385 | }, 1386 | { 1387 | "cell_type": "code", 1388 | "execution_count": 133, 1389 | "metadata": { 1390 | "collapsed": true 1391 | }, 1392 | "outputs": [], 1393 | "source": [ 1394 | "#Compute the error metrics" 1395 | ] 1396 | }, 1397 | { 1398 | "cell_type": "code", 1399 | "execution_count": 134, 1400 | "metadata": { 1401 | "collapsed": true 1402 | }, 1403 | "outputs": [], 1404 | "source": [ 1405 | "import sklearn.metrics" 1406 | ] 1407 | }, 1408 | { 1409 | "cell_type": "code", 1410 | "execution_count": 135, 1411 | "metadata": { 1412 | "collapsed": false 1413 | }, 1414 | "outputs": [ 1415 | { 1416 | "data": { 1417 | "text/plain": [ 1418 | "0.5" 1419 | ] 1420 | }, 1421 | "execution_count": 135, 1422 | "metadata": {}, 1423 | "output_type": "execute_result" 1424 | } 1425 | ], 1426 | "source": [ 1427 | "sklearn.metrics.auc(labels[train.shape[0]:], prediction_DT)" 1428 | ] 1429 | }, 1430 | { 1431 | "cell_type": "code", 1432 | "execution_count": 136, 1433 | "metadata": { 1434 | "collapsed": true 1435 | }, 1436 | "outputs": [], 1437 | "source": [ 1438 | "#What does that tell?" 1439 | ] 1440 | }, 1441 | { 1442 | "cell_type": "code", 1443 | "execution_count": 137, 1444 | "metadata": { 1445 | "collapsed": true 1446 | }, 1447 | "outputs": [], 1448 | "source": [ 1449 | "#What's the error AUC for the other Decision Tree Models" 1450 | ] 1451 | }, 1452 | { 1453 | "cell_type": "markdown", 1454 | "metadata": {}, 1455 | "source": [ 1456 | "**Exercise 13**\n", 1457 | "\n", 1458 | "Instead of predicting classes directly, predict the probability and check the `auc`" 1459 | ] 1460 | }, 1461 | { 1462 | "cell_type": "code", 1463 | "execution_count": null, 1464 | "metadata": { 1465 | "collapsed": true 1466 | }, 1467 | "outputs": [], 1468 | "source": [] 1469 | }, 1470 | { 1471 | "cell_type": "code", 1472 | "execution_count": 142, 1473 | "metadata": { 1474 | "collapsed": false 1475 | }, 1476 | "outputs": [ 1477 | { 1478 | "data": { 1479 | "text/plain": [ 1480 | "0.54849867669154428" 1481 | ] 1482 | }, 1483 | "execution_count": 142, 1484 | "metadata": {}, 1485 | "output_type": "execute_result" 1486 | } 1487 | ], 1488 | "source": [ 1489 | "sklearn.metrics.auc(labels[train.shape[0]:], prediction_DT[:,0])" 1490 | ] 1491 | }, 1492 | { 1493 | "cell_type": "markdown", 1494 | "metadata": {}, 1495 | "source": [ 1496 | "### Accuracy Metrics\n", 1497 | "\n", 1498 | "* AUC\n", 1499 | "* ROC\n", 1500 | "* Misclassification Rate\n", 1501 | "* Confusion Matrix\n", 1502 | "* Precision & Recall\n", 1503 | "\n", 1504 | "#### Confusion Matrix\n", 1505 | "\n", 1506 | "\n", 1507 | "\n", 1508 | "\n", 1509 | "#### Calculate True Positive Rate\n", 1510 | " TPR = TP / (TP+FN)\n", 1511 | "\n", 1512 | "#### Calculate False Positive Rate\n", 1513 | " FPR = FP / (FP+TN)\n", 1514 | " \n", 1515 | "#### Precision\n", 1516 | "\n", 1517 | "\n", 1518 | "#### Recall\n", 1519 | "\n", 1520 | "\n" 1521 | ] 1522 | }, 1523 | { 1524 | "cell_type": "code", 1525 | "execution_count": 147, 1526 | "metadata": { 1527 | "collapsed": true 1528 | }, 1529 | "outputs": [], 1530 | "source": [ 1531 | "#Precision and Recall" 1532 | ] 1533 | }, 1534 | { 1535 | "cell_type": "code", 1536 | "execution_count": 145, 1537 | "metadata": { 1538 | "collapsed": false 1539 | }, 1540 | "outputs": [ 
1541 | { 1542 | "data": { 1543 | "text/plain": [ 1544 | "0.57177033492822971" 1545 | ] 1546 | }, 1547 | "execution_count": 145, 1548 | "metadata": {}, 1549 | "output_type": "execute_result" 1550 | } 1551 | ], 1552 | "source": [ 1553 | "sklearn.metrics.precision_score(labels[train.shape[0]:], prediction_DT)" 1554 | ] 1555 | }, 1556 | { 1557 | "cell_type": "code", 1558 | "execution_count": 146, 1559 | "metadata": { 1560 | "collapsed": false 1561 | }, 1562 | "outputs": [ 1563 | { 1564 | "data": { 1565 | "text/plain": [ 1566 | "0.20427350427350427" 1567 | ] 1568 | }, 1569 | "execution_count": 146, 1570 | "metadata": {}, 1571 | "output_type": "execute_result" 1572 | } 1573 | ], 1574 | "source": [ 1575 | "sklearn.metrics.recall_score(labels[train.shape[0]:], prediction_DT)" 1576 | ] 1577 | }, 1578 | { 1579 | "cell_type": "markdown", 1580 | "metadata": {}, 1581 | "source": [ 1582 | "# Ensemble Trees\n", 1583 | "\n", 1584 | "\n", 1585 | "\n", 1586 | "\n", 1587 | "
\n", 1588 | "
\n", 1589 | "
\n", 1590 | "
\n", 1591 | "
\n", 1592 | "
\n", 1593 | "\n", 1594 | "\n", 1595 | "\n", 1596 | "\n", 1597 | "\n", 1598 | "*src*: http://www.slideshare.net/hustwj/scaling-up-machine-learning-the-tutorial-kdd-2011-part-iia-tree-ensembles" 1599 | ] 1600 | }, 1601 | { 1602 | "cell_type": "markdown", 1603 | "metadata": {}, 1604 | "source": [ 1605 | "# Random Forest" 1606 | ] 1607 | }, 1608 | { 1609 | "cell_type": "markdown", 1610 | "metadata": {}, 1611 | "source": [ 1612 | "\n", 1613 | "\n", 1614 | "\n", 1615 | "\n", 1616 | "*src*: http://www.slideshare.net/0xdata/jan-vitek-distributedrandomforest522013" 1617 | ] 1618 | }, 1619 | { 1620 | "cell_type": "code", 1621 | "execution_count": 148, 1622 | "metadata": { 1623 | "collapsed": true 1624 | }, 1625 | "outputs": [], 1626 | "source": [ 1627 | "from sklearn.ensemble import RandomForestClassifier" 1628 | ] 1629 | }, 1630 | { 1631 | "cell_type": "code", 1632 | "execution_count": 157, 1633 | "metadata": { 1634 | "collapsed": true 1635 | }, 1636 | "outputs": [], 1637 | "source": [ 1638 | "bankModelRF = RandomForestClassifier(n_jobs=-1, oob_score=True)" 1639 | ] 1640 | }, 1641 | { 1642 | "cell_type": "code", 1643 | "execution_count": 158, 1644 | "metadata": { 1645 | "collapsed": false 1646 | }, 1647 | "outputs": [ 1648 | { 1649 | "data": { 1650 | "text/plain": [ 1651 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 1652 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 1653 | " min_samples_leaf=1, min_samples_split=2,\n", 1654 | " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,\n", 1655 | " oob_score=False, random_state=None, verbose=0,\n", 1656 | " warm_start=False)" 1657 | ] 1658 | }, 1659 | "execution_count": 158, 1660 | "metadata": {}, 1661 | "output_type": "execute_result" 1662 | } 1663 | ], 1664 | "source": [ 1665 | "bankModelRF.fit(inputUpdated[:train.shape[0],:], labels[:train.shape[0]])" 1666 | ] 1667 | }, 1668 | { 1669 | "cell_type": "code", 1670 | "execution_count": 156, 1671 | "metadata": { 1672 | "collapsed": false 1673 | }, 1674 | "outputs": [ 1675 | { 1676 | "data": { 1677 | "text/plain": [ 1678 | "0.89128397375820057" 1679 | ] 1680 | }, 1681 | "execution_count": 156, 1682 | "metadata": {}, 1683 | "output_type": "execute_result" 1684 | } 1685 | ], 1686 | "source": [ 1687 | "bankModelRF.oob_score_" 1688 | ] 1689 | }, 1690 | { 1691 | "cell_type": "markdown", 1692 | "metadata": {}, 1693 | "source": [ 1694 | "**Exercise 14**\n", 1695 | "\n", 1696 | "Do the following\n", 1697 | "\n", 1698 | "1. Predict on test\n", 1699 | "2. Find accuracy metrics: AUC, Precision, Recall \n", 1700 | "3. 
How does it compare against Decision Tree" 1701 | ] 1702 | }, 1703 | { 1704 | "cell_type": "code", 1705 | "execution_count": null, 1706 | "metadata": { 1707 | "collapsed": true 1708 | }, 1709 | "outputs": [], 1710 | "source": [] 1711 | }, 1712 | { 1713 | "cell_type": "markdown", 1714 | "metadata": {}, 1715 | "source": [ 1716 | "# Gradient Boosting Machines" 1717 | ] 1718 | }, 1719 | { 1720 | "cell_type": "markdown", 1721 | "metadata": {}, 1722 | "source": [ 1723 | "\n", 1724 | "\n", 1725 | "\n", 1726 | "\n", 1727 | "*src*: http://www.slideshare.net/hustwj/scaling-up-machine-learning-the-tutorial-kdd-2011-part-iia-tree-ensembles" 1728 | ] 1729 | }, 1730 | { 1731 | "cell_type": "code", 1732 | "execution_count": 160, 1733 | "metadata": { 1734 | "collapsed": true 1735 | }, 1736 | "outputs": [], 1737 | "source": [ 1738 | "import xgboost as xgb" 1739 | ] 1740 | }, 1741 | { 1742 | "cell_type": "code", 1743 | "execution_count": 176, 1744 | "metadata": { 1745 | "collapsed": true 1746 | }, 1747 | "outputs": [], 1748 | "source": [ 1749 | "params = {}\n", 1750 | "params[\"min_child_weight\"] = 3\n", 1751 | "params[\"subsample\"] = 0.7\n", 1752 | "params[\"colsample_bytree\"] = 0.7\n", 1753 | "params[\"scale_pos_weight\"] = 1\n", 1754 | "params[\"silent\"] = 0\n", 1755 | "params[\"max_depth\"] = 4\n", 1756 | "params[\"nthread\"] = 6\n", 1757 | "params[\"gamma\"] = 1\n", 1758 | "params[\"objective\"] = \"binary:logistic\"\n", 1759 | "params[\"eta\"] = 0.005\n", 1760 | "params[\"base_score\"] = 0.1\n", 1761 | "params[\"eval_metric\"] = \"auc\"\n", 1762 | "params[\"seed\"] = 123" 1763 | ] 1764 | }, 1765 | { 1766 | "cell_type": "code", 1767 | "execution_count": 177, 1768 | "metadata": { 1769 | "collapsed": true 1770 | }, 1771 | "outputs": [], 1772 | "source": [ 1773 | "plst = list(params.items())\n", 1774 | "num_rounds = 120" 1775 | ] 1776 | }, 1777 | { 1778 | "cell_type": "code", 1779 | "execution_count": 178, 1780 | "metadata": { 1781 | "collapsed": true 1782 | }, 1783 | "outputs": [], 1784 | "source": [ 1785 | "xgtrain_pv = xgb.DMatrix(inputUpdated[:train.shape[0],:], label=labels[:train.shape[0]])\n", 1786 | "watchlist = [(xgtrain_pv, 'train')]\n", 1787 | "bankModelXGB = xgb.train(plst, xgtrain_pv, num_rounds)" 1788 | ] 1789 | }, 1790 | { 1791 | "cell_type": "code", 1792 | "execution_count": 179, 1793 | "metadata": { 1794 | "collapsed": true 1795 | }, 1796 | "outputs": [], 1797 | "source": [ 1798 | "prediction_XGB = bankModelXGB.predict(xgb.DMatrix(inputUpdated[train.shape[0]:,:]))" 1799 | ] 1800 | }, 1801 | { 1802 | "cell_type": "code", 1803 | "execution_count": 180, 1804 | "metadata": { 1805 | "collapsed": false 1806 | }, 1807 | "outputs": [ 1808 | { 1809 | "data": { 1810 | "text/plain": [ 1811 | "0.19817152619361877" 1812 | ] 1813 | }, 1814 | "execution_count": 180, 1815 | "metadata": {}, 1816 | "output_type": "execute_result" 1817 | } 1818 | ], 1819 | "source": [ 1820 | "sklearn.metrics.auc(labels[train.shape[0]:], prediction_XGB)" 1821 | ] 1822 | }, 1823 | { 1824 | "cell_type": "markdown", 1825 | "metadata": {}, 1826 | "source": [ 1827 | "# Another way of encoding" 1828 | ] 1829 | }, 1830 | { 1831 | "cell_type": "markdown", 1832 | "metadata": {}, 1833 | "source": [ 1834 | "### One Hot Encoding\n", 1835 | "\n", 1836 | "\n", 1837 | "\n", 1838 | "\n", 1839 | "Whiteboard ! 
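\n",
    "\n",
    "A tiny illustration (with hypothetical values) of what one-hot encoding does to a single categorical column: each level becomes its own 0/1 indicator column.\n",
    "\n",
    "```\n",
    "marital      marital_divorced  marital_married  marital_single\n",
    "married             0                 1                0\n",
    "single              0                 0                1\n",
    "divorced            1                 0                0\n",
    "```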
" 1840 | ] 1841 | }, 1842 | { 1843 | "cell_type": "code", 1844 | "execution_count": 175, 1845 | "metadata": { 1846 | "collapsed": true 1847 | }, 1848 | "outputs": [], 1849 | "source": [ 1850 | "inputOneHot = pd.get_dummies(input)" 1851 | ] 1852 | }, 1853 | { 1854 | "cell_type": "markdown", 1855 | "metadata": {}, 1856 | "source": [ 1857 | "**Exercise 15**\n", 1858 | "\n", 1859 | "On the one hot encoded data, train\n", 1860 | "\n", 1861 | "1. Decision Tree\n", 1862 | "2. Random Forest\n", 1863 | "3. xgboost\n", 1864 | "\n", 1865 | "Which one works best on the test dataset?" 1866 | ] 1867 | }, 1868 | { 1869 | "cell_type": "code", 1870 | "execution_count": null, 1871 | "metadata": { 1872 | "collapsed": true 1873 | }, 1874 | "outputs": [], 1875 | "source": [] 1876 | } 1877 | ], 1878 | "metadata": { 1879 | "kernelspec": { 1880 | "display_name": "Python 2", 1881 | "language": "python", 1882 | "name": "python2" 1883 | }, 1884 | "language_info": { 1885 | "codemirror_mode": { 1886 | "name": "ipython", 1887 | "version": 2 1888 | }, 1889 | "file_extension": ".py", 1890 | "mimetype": "text/x-python", 1891 | "name": "python", 1892 | "nbconvert_exporter": "python", 1893 | "pygments_lexer": "ipython2", 1894 | "version": "2.7.11" 1895 | } 1896 | }, 1897 | "nbformat": 4, 1898 | "nbformat_minor": 0 1899 | } 1900 | -------------------------------------------------------------------------------- /RF_GBM/notebook/img/boosting.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/RF_GBM/notebook/img/boosting.jpg -------------------------------------------------------------------------------- /RF_GBM/notebook/img/confusion_matrix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/RF_GBM/notebook/img/confusion_matrix.jpg -------------------------------------------------------------------------------- /RF_GBM/notebook/img/cv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/RF_GBM/notebook/img/cv.png -------------------------------------------------------------------------------- /RF_GBM/notebook/img/onehot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/RF_GBM/notebook/img/onehot.png -------------------------------------------------------------------------------- /RF_GBM/notebook/img/random_forest.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/RF_GBM/notebook/img/random_forest.jpg -------------------------------------------------------------------------------- /RF_GBM/notebook/img/tree_ensemble1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/RF_GBM/notebook/img/tree_ensemble1.png -------------------------------------------------------------------------------- /RF_GBM/notebook/img/tree_ensemble2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/RF_GBM/notebook/img/tree_ensemble2.png -------------------------------------------------------------------------------- /cf_mba/notebook/img/basket.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/cf_mba/notebook/img/basket.jpg -------------------------------------------------------------------------------- /cf_mba/notebook/img/cosine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/cf_mba/notebook/img/cosine.png -------------------------------------------------------------------------------- /check_env.py: -------------------------------------------------------------------------------- 1 | # Authors: Amit Kapoor and Bargava Subramanian 2 | # Copyright (c) 2016 Amit Kapoor 3 | # License: MIT License 4 | 5 | """ 6 | This script will check if the environment setup is correct for the workshop. 7 | 8 | To run, please execute the following command from the command prompt 9 | >>> python check_env.py 10 | 11 | The output will indicate if any of the libraries are missing or need to be updated. 12 | 13 | This script is inspired from https://github.com/fonnesbeck/scipy2015_tutorial/blob/master/check_env.py 14 | """ 15 | 16 | from __future__ import print_function 17 | 18 | try: 19 | import curses 20 | curses.setupterm() 21 | assert curses.tigetnum("colors") > 2 22 | OK = "\x1b[1;%dm[ OK ]\x1b[0m" % (30 + curses.COLOR_GREEN) 23 | FAIL = "\x1b[1;%dm[FAIL]\x1b[0m" % (30 + curses.COLOR_RED) 24 | except: 25 | OK = '[ OK ]' 26 | FAIL = '[FAIL]' 27 | 28 | import sys 29 | try: 30 | import importlib 31 | except ImportError: 32 | print(FAIL, "Python version 2.7 is required, but %s is installed." % sys.version) 33 | from distutils.version import LooseVersion as Version 34 | 35 | def import_version(pkg, min_ver, fail_msg=""): 36 | mod = None 37 | try: 38 | mod = importlib.import_module(pkg) 39 | if((pkg=="spacy" or pkg=="wordcloud") and (mod > 0)): 40 | print(OK, '%s ' % (pkg)) 41 | else: 42 | #else: 43 | version = getattr(mod, "__version__", 0) or getattr(mod, "VERSION", 0) 44 | if Version(version) < min_ver: 45 | print(FAIL, "%s version %s or higher required, but %s installed." 46 | % (lib, min_ver, version)) 47 | else: 48 | print(OK, '%s version %s' % (pkg, version)) 49 | except ImportError: 50 | print(FAIL, '%s not installed. %s' % (pkg, fail_msg)) 51 | return mod 52 | 53 | 54 | # first check the python version 55 | print('Using python in', sys.prefix) 56 | print(sys.version) 57 | pyversion = Version(sys.version) 58 | if pyversion < "3": 59 | print(FAIL, "Python version 3 is required, but %s is installed." % sys.version) 60 | elif pyversion >= "2": 61 | if pyversion == "2.7": 62 | print(FAIL, "Python version 2.7 is installed. Please upgrade to version 3." 
) 63 | else: 64 | print(FAIL, "Unknown Python version: %s" % sys.version) 65 | 66 | print() 67 | requirements = { 68 | 'gensim' :'0.12.4', 69 | 'IPython' : '4.0.3', 70 | 'jupyter' :'1.0.0', 71 | 'lda' : '1.0.3', 72 | 'networkx' : '1.11', 73 | 'nltk' : '3.1', 74 | 'matplotlib' :'1.5.0', 75 | 'nltk' : '3.1', 76 | 'numpy' : '1.10.4', 77 | 'pandas' : '0.17.1', 78 | 'PIL' : '1.1.7', 79 | 'scipy' : '0.17.0', 80 | 'sklearn' : '0.17', 81 | 'seaborn' :'0.6.0', 82 | 'spacy' :'0.100.6', 83 | 'statsmodels':'0.6.1', 84 | 'wordcloud' :'0.1', 85 | 'xgboost' :'0.4' 86 | 87 | } 88 | 89 | # now the dependencies 90 | for lib, required_version in list(requirements.items()): 91 | import_version(lib, required_version) 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /img/ISLR.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/ISLR.jpeg -------------------------------------------------------------------------------- /img/acquire.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/acquire.jpg -------------------------------------------------------------------------------- /img/amit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/amit.png -------------------------------------------------------------------------------- /img/approach.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/approach.jpg -------------------------------------------------------------------------------- /img/art.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/art.jpeg -------------------------------------------------------------------------------- /img/bargava.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/bargava.jpg -------------------------------------------------------------------------------- /img/book.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/book.png -------------------------------------------------------------------------------- /img/books.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/books.jpg -------------------------------------------------------------------------------- /img/break.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/break.jpg -------------------------------------------------------------------------------- /img/clay.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/clay.jpeg -------------------------------------------------------------------------------- /img/craft.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/craft.jpeg -------------------------------------------------------------------------------- /img/estimating_coefficients.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/estimating_coefficients.png -------------------------------------------------------------------------------- /img/explore.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/explore.jpg -------------------------------------------------------------------------------- /img/frame.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/frame.jpg -------------------------------------------------------------------------------- /img/glass.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/glass.jpg -------------------------------------------------------------------------------- /img/insight.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/insight.jpg -------------------------------------------------------------------------------- /img/lens.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/lens.jpeg -------------------------------------------------------------------------------- /img/model.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/model.jpg -------------------------------------------------------------------------------- /img/numbers.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/numbers.jpg -------------------------------------------------------------------------------- /img/onion-image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/onion-image.jpg -------------------------------------------------------------------------------- /img/onion.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/onion.jpg -------------------------------------------------------------------------------- /img/onion.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/onion.png -------------------------------------------------------------------------------- /img/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/overview.jpg -------------------------------------------------------------------------------- /img/pair.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/pair.jpg -------------------------------------------------------------------------------- /img/postit.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/postit.jpg -------------------------------------------------------------------------------- /img/r2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/r2.gif -------------------------------------------------------------------------------- /img/r_squared.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/r_squared.png -------------------------------------------------------------------------------- /img/refine.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/refine.jpg -------------------------------------------------------------------------------- /img/retail.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/retail.jpg -------------------------------------------------------------------------------- /img/science.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/science.jpeg -------------------------------------------------------------------------------- /img/see.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/see.jpeg -------------------------------------------------------------------------------- /img/single.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/single.jpeg -------------------------------------------------------------------------------- /img/skills.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/skills.png -------------------------------------------------------------------------------- /img/slope_intercept.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/slope_intercept.png -------------------------------------------------------------------------------- /img/speak.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/speak.jpeg -------------------------------------------------------------------------------- /img/sports.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/sports.jpg -------------------------------------------------------------------------------- /img/stars.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/stars.jpg -------------------------------------------------------------------------------- /img/think.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/think.jpg -------------------------------------------------------------------------------- /img/thinkstats.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/thinkstats.jpg -------------------------------------------------------------------------------- /img/time.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/time.jpg -------------------------------------------------------------------------------- /img/tool.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/tool.jpg -------------------------------------------------------------------------------- /img/travel.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/travel.jpg -------------------------------------------------------------------------------- /img/welcome.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/welcome.jpg -------------------------------------------------------------------------------- /img/wesmckinney.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/wesmckinney.jpg -------------------------------------------------------------------------------- /img/workshop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/img/workshop.jpg -------------------------------------------------------------------------------- 
/installation_instructions.md: -------------------------------------------------------------------------------- 1 | # Installation Instructions for the workshop 2 | 3 | 4 | ### Package Manager: Anaconda 5 | 6 | We strongly recommend using Anaconda. It can be downloaded from here: 7 | https://www.continuum.io/downloads 8 | 9 | It comes with `jupyter notebook` which is the IDE we will be using for the workshop 10 | 11 | We recommend using the Python 3.5 version. 12 | 13 | ### Required packages 14 | 15 | Run the following script at the command prompt to check if you have all the requisite packages installed. 16 | To run, please execute the following command from the command prompt 17 | 18 | $ python check_env.py 19 | 20 | The output will indicate if any of the libraries are missing or need to be updated. 21 | 22 | Any package that is missing can be installed by running the command at the command prompt 23 | 24 | $ pip install 25 | 26 | Any package that needs to be upgraded can be upgraded by running the command at the command prompt 27 | 28 | $ pip install --upgrade 29 | 30 | 31 | Replace <*package_name*> with the package that needs to be installed/upgraded. 32 | 33 | After all the packages are installed, please run the following two commands 34 | 35 | 1. Install all the corpora for the `nltk` module. Please be warned that this is a huge file and can take a while. Please refer http://www.nltk.org/data.html for further information on what it downloads 36 | 37 | `$ python -m nltk.downloader all` 38 | 39 | 2. Install all the corpora for the `spacy` module. Please be warned that this is a huge file and can take a while 40 | 41 | `$ python -m spacy.en.download` 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /overview.md: -------------------------------------------------------------------------------- 1 | ![](img/workshop.jpg) 2 | # Intro to Data Science and Machine Learning 3 | ### @amitkaps | @bargava 4 | 5 | --- 6 | 7 | ![](img/welcome.jpg) 8 | # Welcome 9 | 10 | --- 11 | 12 | # Facilitators 13 | ![](img/amit.png) 14 | ![](img/bargava.jpg) 15 | 16 | --- 17 | 18 | # Amit 19 | ## @amitkaps 20 | ![](img/amit.png) 21 | 22 | --- 23 | 24 | # Bargava 25 | ## @bargava 26 | ![](img/bargava.jpg) 27 | 28 | 29 | --- 30 | 31 | ![](img/lens.jpeg) 32 | # See the world through a data lens 33 | 34 | --- 35 | 36 | ![](img/see.jpeg) 37 | # "Data is just a clue to the end truth" 38 | -- Josh Smith 39 | 40 | --- 41 | 42 | ![](img/sports.jpg) 43 | ![](img/travel.jpg) 44 | ![](img/retail.jpg) 45 | # Data Driven Decisions 46 | 47 | --- 48 | 49 | ![](img/science.jpeg) 50 | # "Science is knowledge which we understand so well that we can teach it to a computer. Everything else is art" 51 | -- Donald Knuth 52 | 53 | --- 54 | 55 | ![](img/art.jpeg) 56 | # Data Science is an Art 57 | 58 | --- 59 | 60 | ![](img/glass.jpg) 61 | # Hypothesis Driven Approach 62 | 63 | --- 64 | 65 | ![](img/frame.jpg) 66 | # Frame 67 | ## "An approximate answer to the right problem is worth a good deal" 68 | 69 | --- 70 | 71 | ![](img/acquire.jpg) 72 | # Acquire 73 | ## "80% perspiration, 10% great idea, 10% great output" 74 | 75 | --- 76 | 77 | ![](img/refine.jpg) 78 | # Refine 79 | ## "All data is messy." 80 | 81 | --- 82 | 83 | ![](img/explore.jpg) 84 | # Explore 85 | ## "I don't know, what I don't know." 
86 | 87 | --- 88 | 89 | ![](img/model.jpg) 90 | # Model 91 | ## "All models are wrong, but some are useful" 92 | 93 | --- 94 | 95 | ![](img/insight.jpg) 96 | # Insight 97 | ## "The goal is to turn data into insight" 98 | 99 | --- 100 | 101 | ![](img/approach.jpg) 102 | 103 | 104 | --- 105 | 106 | ![](img/think.jpg) 107 | ## "Doing data analyis requires quite a bit of thinking and we believe that when you’ve completed a good data analysis, you’ve spent more time thinking than doing." 108 | -- Roger Peng 109 | 110 | --- 111 | 112 | ![](img/tool.jpg) 113 | # Python Data Stack 114 | 115 | --- 116 | 117 | ![](img/books.jpg) 118 | # Case Studies 119 | 120 | --- 121 | # Day 1 122 | # Peeling the Onion 123 | ## Time Series Analysis 124 | ![](img/onion.jpg) 125 | 126 | --- 127 | 128 | # Day 2 129 | # Grocery 130 | ## Market Basket Analysis / Collaborative Filter 131 | 132 | --- 133 | 134 | # Day 2 135 | # BanK Marketing 136 | ## Random Forest and Gradient Boosting 137 | 138 | --- 139 | 140 | # Day 3 141 | # DataTau 142 | ## Text Analytics 143 | 144 | --- 145 | 146 | ![](img/clay.jpeg) 147 | # Learning Approach 148 | 149 | --- 150 | 151 | ![](img/single.jpeg) 152 | # Do the Exercises 153 | 154 | --- 155 | 156 | ![](img/pair.jpg) 157 | # Pair up & Learn 158 | 159 | --- 160 | 161 | ![](img/postit.jpg) 162 | # Call for Help 163 | 164 | --- 165 | 166 | ![](img/numbers.jpg) 167 | # Enjoy the workshop 168 | 169 | --- 170 | 171 | ## Workshop Material is available at the Github Repo 172 | ### [https://github.com/amitkaps/machine-learning](https://github.com/amitkaps/machine-learning) 173 | 174 | --- 175 | 176 | # Exercise 177 | 178 | --- 179 | 180 | # 1. Time Series Exercise 181 | 182 | ### "Predict the number of tickets that will be raised in the next week" 183 | 184 | - **Frame**: What to forecast? At what horizon? At what level? 185 | - **Acquire, Refine, Explore**: Do EDA to understand the trend and pattern within the data 186 | - **Models**: Mean Model, Linear Trend, Random Walk, Simple Moving Average, Exp Smoothing, Decomposition, ARIMA 187 | - **Insight**: Share the insight through a datavis of the models 188 | 189 | --- 190 | 191 | # 2. Text Analytics Exercise 192 | 193 | ### "Identify the entity, features & topics in the 'Comments' data or 'Twitter #machine learning' data" 194 | 195 | - **Frame**: What are the comments you are trying to understand? 196 | - **Acquire, Refine, Explore**: Do Wordcloud, Lemmatization, Part of Speech Analysis, and Entity Chunking 197 | - **Models**: TF-IDF, Topic Modelling, Sentiment Analysis 198 | - **Insight**: Share the insight through word cloud and topic visualisation 199 | 200 | --- 201 | 202 | # Feedback 203 | 204 | ### [https://amitkaps.typeform.com/to/i6wl2E](https://amitkaps.typeform.com/to/i6wl2E) 205 | 206 | 207 | --- 208 | 209 | # Recap 210 | 211 | --- 212 | 213 | ![](img/approach.jpg) 214 | 215 | --- 216 | 217 | ![](img/frame.jpg) 218 | # Frame 219 | - **Toy Problems** 220 | - **Simple Problems** 221 | - Complex Problems 222 | - Business Problems 223 | - Research Problems 224 | 225 | --- 226 | 227 | ![](img/acquire.jpg) 228 | # Acquire 229 | - **Scraping** (structured, unstructured) 230 | - **Files** (csv, xls, json, xml, pdf, ...) 231 | - Database (sqlite, ...) 232 | - APIs 233 | - Streaming 234 | 235 | --- 236 | 237 | ![](img/refine.jpg) 238 | # Refine 239 | - Data Cleaning (inconsistent, missing, ...) 240 | - **Data Refining** (derive, parse, merge, filter, convert, ...) 
241 | - **Data Transformations** (group by, pivot, aggregate, sample, summarise, ...) 242 | 243 | 244 | --- 245 | 246 | ![](img/explore.jpg) 247 | # Explore 248 | - **Simple Vis** 249 | - Multi Dimensional Vis 250 | - Geographic Vis 251 | - Large Data Vis (Bin - Summarise - Smooth) 252 | - Interactive Vis 253 | 254 | --- 255 | 256 | ![](img/model.jpg) 257 | # Model - Supervised Learning 258 | - *Continuous*: Regression - **Linear**, Polynomial, Tree Based Methods - CART, **Random Forest**, Gradient Boosting Machines 259 | - *Classification* - **Logistics Regression**, Tree, KNN, SVM, Naive-Bayes, Bayesian Network 260 | 261 | --- 262 | 263 | ![](img/model.jpg) 264 | # Model - UnSupervised Learning 265 | - *Continuous*: Clustering & Dimensionality Reduction like PCA, SVD, MDS, K-means 266 | - *Categorical*: Association Analysis 267 | 268 | --- 269 | 270 | ![](img/model.jpg) 271 | # Model - Advanced / 272 | - **Time Series** 273 | - **Text Analytics** 274 | - Network / Graph Analytics 275 | - Optimization 276 | 277 | --- 278 | ![](img/model.jpg) 279 | # Model - Specialized 280 | - Reinforcement Learning 281 | - Online Learning 282 | - Deep Learning 283 | - Other Applications: Image, Speech 284 | 285 | 286 | --- 287 | 288 | ![](img/insight.jpg) 289 | # Insight 290 | - Narrative Visualisation 291 | - Dashboard Visualisation 292 | - Decision Making Tools 293 | - Automated Decision Tools 294 | 295 | --- 296 | 297 | # PyData Stack 298 | - **Acquire / Refine**: `Pandas, Beautiful Soup, Selenium, Requests, SQL Alchemy, Numpy, Blaze` 299 | - **Explore**: `MatPlotLib, Seaborn, Bokeh, Plotly, Vega, Folium` 300 | - **Model**: `Scikit-Learn, StatsModels, SciPy, Gensim, Keras, Tensor Flow, PySpark` 301 | - **Insight**: `Django, Flask` 302 | 303 | 304 | --- 305 | 306 | # Skills 307 | ![fit](img/skills.png) 308 | 309 | --- 310 | 311 | ![fit](img/skills.png) 312 | 313 | --- 314 | 315 | # Books 316 | 317 | ![fit](img/book.png) 318 | ![fit](img/wesmckinney.jpg) 319 | ![fit](img/thinkstats.jpg) 320 | 321 | 322 | --- 323 | 324 | ![fit](img/book.png) 325 | ![fit](img/wesmckinney.jpg) 326 | ![fit](img/thinkstats.jpg) 327 | 328 | --- 329 | 330 | ![left](img/ISLR.jpeg) 331 | ## Resources - Statistical Learning 332 | - One of the good books on statistical learning is ISLR -> [An Introduction to Statistical Learning with Application in R](http://www-bcf.usc.edu/~gareth/ISL/index.html) 333 | - You can find all the ISLR code in python at this github repo - [https://github.com/JWarmenhoven/ISLR-python](https://github.com/JWarmenhoven/ISLR-python) 334 | 335 | --- 336 | 337 | ## Resources - Time Series 338 | - [Forecasting: Principle and Text](https://www.otexts.org/fpp) 339 | - [Statistical forecasting: Notes on regression and time series analysis Case](http://people.duke.edu/~rnau/411home.htm) 340 | 341 | ## Resources - Text Analytics 342 | - [Natural Language Processing with Python](http://www.nltk.org/book/) 343 | 344 | 345 | --- 346 | ![](img/stars.jpg) 347 | # Online Course 348 | - Harvard Data Science Course - [CS 109 Course](http://cs109.github.io/2015/) (It is structured in similar way to the approach we shared) 349 | - Data Science Specialisation - [JHU Data Science](https://www.coursera.org/specializations/jhu-data-science) (It is a good course, though the material is coded in R) 350 |
351 | - Many more on Coursera & Udacity... 352 | 353 | 354 | --- 355 | ![](img/workshop.jpg) 356 | # We enjoyed the workshop! 357 | 358 | --- 359 | ![](img/speak.jpeg) 360 | # Speak to Us! 361 | 362 | --- 363 | 364 | ![](img/numbers.jpg) 365 | # Thank you 366 | ## @amitkaps | @bargava -------------------------------------------------------------------------------- /overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/overview.pdf -------------------------------------------------------------------------------- /python.txt: -------------------------------------------------------------------------------- 1 | abstract-rendering==0.5.1 2 | alabaster==0.7.7 3 | anaconda-client==1.2.2 4 | appnope==0.1.0 5 | appscript==1.0.1 6 | argcomplete==1.0.0 7 | astropy==1.1.1 8 | Babel==2.2.0 9 | beautifulsoup4==4.4.1 10 | bitarray==0.8.1 11 | blaze==0.9.0 12 | bokeh==0.11.0 13 | boto==2.39.0 14 | Bottleneck==1.0.0 15 | cffi==1.2.1 16 | clyent==1.2.0 17 | colorama==0.3.6 18 | conda==4.0.4 19 | conda-build==1.19.0 20 | conda-env==2.4.5 21 | configobj==5.0.6 22 | cryptography==1.0.2 23 | cycler==0.10.0 24 | Cython==0.23.4 25 | cytoolz==0.7.5 26 | datashape==0.5.0 27 | decorator==4.0.6 28 | docutils==0.12 29 | dynd===f641248 30 | et-xmlfile==1.0.1 31 | fastcache==1.0.2 32 | Flask==0.10.1 33 | futures==3.0.3 34 | greenlet==0.4.9 35 | h5py==2.5.0 36 | html5lib==0.999 37 | idna==2.0 38 | ipykernel==4.2.2 39 | ipython==4.0.3 40 | ipython-genutils==0.1.0 41 | ipywidgets==4.1.1 42 | itsdangerous==0.24 43 | jdcal==1.2 44 | jedi==0.9.0 45 | Jinja2==2.8 46 | jsonschema==2.4.0 47 | jupyter==1.0.0 48 | jupyter-client==4.1.1 49 | jupyter-console==4.1.0 50 | jupyter-core==4.0.6 51 | llvmlite==0.8.0 52 | lxml==3.5.0 53 | MarkupSafe==0.23 54 | matplotlib==1.5.1 55 | mistune==0.7.1 56 | multipledispatch==0.4.8 57 | nbconvert==4.1.0 58 | nbformat==4.0.1 59 | networkx==1.11 60 | nltk==3.2 61 | nose==1.3.7 62 | notebook==4.1.0 63 | numba==0.23.1 64 | numexpr==2.4.6 65 | numpy==1.10.4 66 | odo==0.4.0 67 | openpyxl==2.3.2 68 | pandas==0.17.1 69 | path.py==0.0.0 70 | patsy==0.4.0 71 | pep8==1.7.0 72 | pexpect==3.3 73 | pickleshare==0.5 74 | Pillow==3.1.0 75 | ply==3.8 76 | psutil==3.4.2 77 | ptyprocess==0.5 78 | py==1.4.31 79 | pyasn1==0.1.9 80 | pycosat==0.6.1 81 | pycparser==2.14 82 | pycrypto==2.6.1 83 | pycurl==7.19.5.3 84 | pyflakes==1.0.0 85 | Pygments==2.1 86 | pyOpenSSL==0.15.1 87 | pyparsing==2.0.3 88 | pytest==2.8.5 89 | python-dateutil==2.4.2 90 | pytz==2015.7 91 | PyYAML==3.11 92 | pyzmq==15.2.0 93 | qtconsole==4.1.1 94 | redis==2.10.3 95 | requests==2.9.1 96 | rope-py3k==0.9.4.post1 97 | scikit-image==0.11.3 98 | scikit-learn==0.17 99 | scipy==0.17.0 100 | seaborn==0.7.0 101 | simplegeneric==0.8.1 102 | six==1.10.0 103 | snowballstemmer==1.2.1 104 | sockjs-tornado==1.0.1 105 | Sphinx==1.3.5 106 | sphinx-rtd-theme==0.1.9 107 | spyder==2.3.8 108 | SQLAlchemy==1.0.12 109 | statsmodels==0.6.1 110 | sympy==0.7.6.1 111 | tables==3.2.2 112 | terminado==0.5 113 | toolz==0.7.4 114 | tornado==4.3 115 | traitlets==4.1.0 116 | unicodecsv==0.14.1 117 | Werkzeug==0.11.3 118 | xgboost==0.4a30 119 | xlrd==0.9.4 120 | XlsxWriter==0.8.4 121 | xlwings==0.6.4 122 | xlwt==1.0.0 123 | -------------------------------------------------------------------------------- /text_mining/DataTau.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | DataTau
DataTau   new | comments | leaders | submit   login
1.
An Exploration of R, Yelp, and the Search for Good Indian Food (springboard.com)
5 points by Rogerh91 4 hours ago | discuss
2.
Spark Pipelines: Elegant Yet Powerful (insightdatalabs.com)
3 points by aouyang1 7 hours ago | discuss
3.
Deep Advances in Generative Modeling (youtube.com)
7 points by gwulfs 13 hours ago | 1 comment
4.
Shit VCs Say (buzzfeed.com)
3 points by Argentum01 8 hours ago | discuss
5.
Python, Machine Learning, and Language Wars (sebastianraschka.com)
4 points by pmigdal 15 hours ago | discuss
6.
A Neural Network in 11 lines of Python (github.io)
3 points by dekhtiar 13 hours ago | discuss
7.
Markov Chains Explained Visually (setosa.io)
13 points by zeroviscosity 1 day ago | 1 comment
8.
Dplython: Dplyr for Python (github.com)
13 points by thenaturalist 1 day ago | 3 comments
9.
Inferring causal impact using Bayesian structural time-series models (google.com)
8 points by Homunculiheaded 1 day ago | 1 comment
10.
A Billion Taxi Rides on Amazon EMR running Spark (marksblogg.com)
5 points by marklit 1 day ago | 1 comment
11.
Tutorial: Web scraping and mapping breweries with import.io and R (trendct.org)
4 points by jasdumas 1 day ago | discuss
12.
The rise of greedy robots (yanirseroussi.com)
4 points by yanir 2 days ago | discuss
13.
Python for Data Structures, Algorithms, and Interviews (github.com)
18 points by kokoubaby 4 days ago | discuss
14.
Extracting image metadata at scale (netflix.com)
2 points by zachwill 1 day ago | discuss
15.
Lift charts - A data scientist's secret weapon (datalifebalance.com)
14 points by datenheini 4 days ago | 2 comments
16.
How To Become A Machine Learning Expert In One Simple Step (swanintelligence.com)
4 points by swanint 2 days ago | discuss
17.
Engineers Shouldn’t Write ETL: High Functioning Data Science Departments (stitchfix.com)
10 points by legel 4 days ago | 3 comments
18.
Simple estimation of hierarchical events with petersburg (willmcginnis.com)
3 points by wdm0006 2 days ago | discuss
19.
Data Science Side Project
6 points by yashpatel5400 2 days ago | 8 comments
20.
Unsupervised Computer Vision: The Current State of the Art (stitchfix.com)
6 points by carlosfaham 3 days ago | discuss
21.
Data Engineering at Slack: Twelve Mistakes I've Made In My First Three Months (google.com)
14 points by gwulfs 6 days ago | 2 comments
22.
What data visualization tools do /r/DataIsBeautiful OC creators use? (randalolson.com)
3 points by pmigdal 2 days ago | discuss
23.
Reshaping in Pandas (nikolaygrozev.wordpress.com)
6 points by carlosgg 4 days ago | discuss
24.
An unusual interactive machine learning challenge (blackboxchallenge.com)
4 points by gglumov 3 days ago | discuss
25.
Datumbox Machine Learning Framework 0.7.0 Released (datumbox.com)
4 points by datumbox 3 days ago | discuss
26.
Data science intro for math/phys background (p.migdal.pl)
14 points by pmigdal 7 days ago | discuss
27.
Neural Networks demystified (lumiverse.io)
16 points by elyase 8 days ago | discuss
28.
What machines can learn from Apple Watch: detecting undiagnosed heart condition (insighthealthdata.com)
9 points by koukouhappy 6 days ago | discuss
29.
Data Science Tools: The Biggest Winners and Losers (dominodatalab.com)
12 points by AnnaOnTheWeb 7 days ago | discuss
30.
10 Years of Open Source Machine Learning (medium.com)
9 points by tstonez 6 days ago | 1 comment
More

37 |
RSS 38 | | Announcements 39 |
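The Refine.ipynb notebook that follows works on data_tau.csv, a flat title/date table scraped from a listing page like the DataTau.html saved above. As a rough, illustrative sketch of that acquire step (the CSS selectors below are assumptions about Hacker News-style markup, not taken from this repository's own scraping code), one way to reduce such a page to a two-column table with beautifulsoup4 and pandas, both pinned in python.txt, might look like this:

    # Hedged sketch: assumes each story spans two table rows, the first holding
    # the headline link in a <td class="title"> cell and the second holding the
    # "N points by user M hours/days ago | discuss" text in a <td class="subtext">
    # cell, as on Hacker News-style pages. Not the workshop's own scraping code.
    import pandas as pd
    from bs4 import BeautifulSoup  # beautifulsoup4 is listed in python.txt

    with open('DataTau.html', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    records = []
    for subtext in soup.select('td.subtext'):
        # The title row is assumed to sit immediately above the subtext row.
        title_row = subtext.find_parent('tr').find_previous_sibling('tr')
        if title_row is None:
            continue
        link = title_row.select_one('td.title a')  # headline anchor, if present
        if link is None:
            continue
        records.append({'title': link.get_text(strip=True),
                        'date': subtext.get_text(' ', strip=True)})

    pd.DataFrame(records, columns=['title', 'date']).to_csv('data_tau_sample.csv',
                                                            index=False)

The output filename data_tau_sample.csv is hypothetical; it simply mirrors the title/date layout that the Refine step reads back with pd.read_csv.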
-------------------------------------------------------------------------------- /text_mining/Refine.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Refine the Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 32, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 33, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "df = pd.read_csv('data_tau.csv')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 34, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/html": [ 42 | "
\n", 43 | "\n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | "
titledate
0An Exploration of R, Yelp, and the Search for ...5 points by Rogerh91 6 hours ago | discuss
1Deep Advances in Generative Modeling7 points by gwulfs 15 hours ago | 1 comment
2Spark Pipelines: Elegant Yet Powerful3 points by aouyang1 9 hours ago | discuss
3Shit VCs Say3 points by Argentum01 10 hours ago | discuss
4Python, Machine Learning, and Language Wars4 points by pmigdal 17 hours ago | discuss
\n", 79 | "
" 80 | ], 81 | "text/plain": [ 82 | " title \\\n", 83 | "0 An Exploration of R, Yelp, and the Search for ... \n", 84 | "1 Deep Advances in Generative Modeling \n", 85 | "2 Spark Pipelines: Elegant Yet Powerful \n", 86 | "3 Shit VCs Say \n", 87 | "4 Python, Machine Learning, and Language Wars \n", 88 | "\n", 89 | " date \n", 90 | "0 5 points by Rogerh91 6 hours ago | discuss \n", 91 | "1 7 points by gwulfs 15 hours ago | 1 comment \n", 92 | "2 3 points by aouyang1 9 hours ago | discuss \n", 93 | "3 3 points by Argentum01 10 hours ago | discuss \n", 94 | "4 4 points by pmigdal 17 hours ago | discuss " 95 | ] 96 | }, 97 | "execution_count": 34, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "df.head()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "To get the date of the title - we will need the following algorithm\n", 111 | "- If the string contains **hours** we can consider it **1 day**\n", 112 | "- And if the string has **day**, we pick the number preceding the **day**\n", 113 | "\n", 114 | "To apply this algorithm, we need to be able to pick these words and digits from a string. For that we will use Regular Expression." 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "## Introduction to Regular Expression (Regex)\n", 122 | "\n", 123 | "Regular expression is a way of selecting text using symbols in a string.\n", 124 | "\n", 125 | "Refer to the following links for an interactive playground\n", 126 | "- [http://regexr.com](http://regexr.com/)\n", 127 | "- [http://regex101.com/](http://regex101.com/)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 35, 133 | "metadata": { 134 | "collapsed": true 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "import re" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 36, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "test_string = \"Hello world, welcome to 2016.\"" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 37, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "# We can pass the whole string and re.search will give the first occurence of the value\n", 161 | "# re.search - This function searches for first occurrence of RE pattern within string.\n", 162 | "a = re.search('Hello world, welcome to 2016',test_string)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 38, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "<_sre.SRE_Match object; span=(0, 28), match='Hello world, welcome to 2016'>" 176 | ] 177 | }, 178 | "execution_count": 38, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "a" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 39, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "'Hello world, welcome to 2016'" 198 | ] 199 | }, 200 | "execution_count": 39, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "a.group()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 40, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [ 216 | { 217 | "data": { 
218 | "text/plain": [ 219 | "'H'" 220 | ] 221 | }, 222 | "execution_count": 40, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "# Match the first letters in the string\n", 229 | "a = re.search('.',test_string)\n", 230 | "a.group()" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 41, 236 | "metadata": { 237 | "collapsed": false 238 | }, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": [ 243 | "'Hello world, welcome to 2016.'" 244 | ] 245 | }, 246 | "execution_count": 41, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "# Match all the letters in the string\n", 253 | "a = re.search('.*',test_string)\n", 254 | "a.group()" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 42, 260 | "metadata": { 261 | "collapsed": false 262 | }, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "<_sre.SRE_Match object; span=(0, 5), match='Hello'>\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "a = re.search('Hello',test_string)\n", 274 | "print(a)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "** Some basic symbols**\n", 282 | "\n", 283 | "**`?`** \n", 284 | "\n", 285 | "The question mark indicates zero or one occurrences of the preceding element. For example, colou?r matches both \"color\" and \"colour\".\n", 286 | "\n", 287 | "**`\\*`**\n", 288 | "\n", 289 | "The asterisk indicates zero or more occurrences of the preceding element. For example, ab*c matches \"ac\", \"abc\", \"abbc\", \"abbbc\", and so on.\n", 290 | "\n", 291 | "**`\\+`**\t\n", 292 | "The plus sign indicates one or more occurrences of the preceding element. 
For example, ab+c matches \"abc\", \"abbc\", \"abbbc\", and so on, but not \"ac\".\n" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 43, 298 | "metadata": { 299 | "collapsed": false 300 | }, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "<_sre.SRE_Match object; span=(0, 2), match='He'>\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "a = re.search('\\w.',test_string)\n", 312 | "print(a)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 44, 318 | "metadata": { 319 | "collapsed": false 320 | }, 321 | "outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | "<_sre.SRE_Match object; span=(0, 5), match='Hello'>\n" 327 | ] 328 | } 329 | ], 330 | "source": [ 331 | "a = re.search('\\w*',test_string)\n", 332 | "print(a)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "### Exercises" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 45, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "string = '''In 2016, we are learning Text Analytics in Data Science 101\n", 351 | " by scraping http://datatau.com'''" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 46, 357 | "metadata": { 358 | "collapsed": false 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "string = \"In 2016, we are learning Text Analytics in Data Science 101 by scraping http://datatau.com\"" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "Write a regex to pick the numbers 2016 from string above." 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "collapsed": true 377 | }, 378 | "outputs": [], 379 | "source": [] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "Write a regex to pick the url link (http://xyz.com) from the string above " 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": { 392 | "collapsed": true 393 | }, 394 | "outputs": [], 395 | "source": [] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "## Lets get the date from our string" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 47, 407 | "metadata": { 408 | "collapsed": false 409 | }, 410 | "outputs": [ 411 | { 412 | "data": { 413 | "text/html": [ 414 | "
\n", 415 | "\n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | "
titledate
0An Exploration of R, Yelp, and the Search for ...5 points by Rogerh91 6 hours ago | discuss
1Deep Advances in Generative Modeling7 points by gwulfs 15 hours ago | 1 comment
2Spark Pipelines: Elegant Yet Powerful3 points by aouyang1 9 hours ago | discuss
3Shit VCs Say3 points by Argentum01 10 hours ago | discuss
4Python, Machine Learning, and Language Wars4 points by pmigdal 17 hours ago | discuss
\n", 451 | "
" 452 | ], 453 | "text/plain": [ 454 | " title \\\n", 455 | "0 An Exploration of R, Yelp, and the Search for ... \n", 456 | "1 Deep Advances in Generative Modeling \n", 457 | "2 Spark Pipelines: Elegant Yet Powerful \n", 458 | "3 Shit VCs Say \n", 459 | "4 Python, Machine Learning, and Language Wars \n", 460 | "\n", 461 | " date \n", 462 | "0 5 points by Rogerh91 6 hours ago | discuss \n", 463 | "1 7 points by gwulfs 15 hours ago | 1 comment \n", 464 | "2 3 points by aouyang1 9 hours ago | discuss \n", 465 | "3 3 points by Argentum01 10 hours ago | discuss \n", 466 | "4 4 points by pmigdal 17 hours ago | discuss " 467 | ] 468 | }, 469 | "execution_count": 47, 470 | "metadata": {}, 471 | "output_type": "execute_result" 472 | } 473 | ], 474 | "source": [ 475 | "df.head()" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 48, 481 | "metadata": { 482 | "collapsed": false 483 | }, 484 | "outputs": [ 485 | { 486 | "data": { 487 | "text/html": [ 488 | "
\n", 489 | "\n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | "
titledate
175Getting Started with Statistics for Data Science3 points by nickhould 35 days ago | discuss
176Rodeo 1.3 - Tab-completion for docstrings3 points by glamp 35 days ago | discuss
177Teaching D3.js - links3 points by pmigdal 35 days ago | discuss
178Parallel scikit-learn on YARN5 points by stijntonk 39 days ago | discuss
179Meetup: Free Live Webinar on Prescriptive Anal...2 points by ann928 32 days ago | discuss
\n", 525 | "
" 526 | ], 527 | "text/plain": [ 528 | " title \\\n", 529 | "175 Getting Started with Statistics for Data Science \n", 530 | "176 Rodeo 1.3 - Tab-completion for docstrings \n", 531 | "177 Teaching D3.js - links \n", 532 | "178 Parallel scikit-learn on YARN \n", 533 | "179 Meetup: Free Live Webinar on Prescriptive Anal... \n", 534 | "\n", 535 | " date \n", 536 | "175 3 points by nickhould 35 days ago | discuss \n", 537 | "176 3 points by glamp 35 days ago | discuss \n", 538 | "177 3 points by pmigdal 35 days ago | discuss \n", 539 | "178 5 points by stijntonk 39 days ago | discuss \n", 540 | "179 2 points by ann928 32 days ago | discuss " 541 | ] 542 | }, 543 | "execution_count": 48, 544 | "metadata": {}, 545 | "output_type": "execute_result" 546 | } 547 | ], 548 | "source": [ 549 | "df.tail()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 49, 555 | "metadata": { 556 | "collapsed": true 557 | }, 558 | "outputs": [], 559 | "source": [ 560 | "date_string = df['date'][0]" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 50, 566 | "metadata": { 567 | "collapsed": false 568 | }, 569 | "outputs": [ 570 | { 571 | "name": "stdout", 572 | "output_type": "stream", 573 | "text": [ 574 | "5 points by Rogerh91 6 hours ago | discuss\n" 575 | ] 576 | } 577 | ], 578 | "source": [ 579 | "print(date_string)" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 51, 585 | "metadata": { 586 | "collapsed": false 587 | }, 588 | "outputs": [ 589 | { 590 | "data": { 591 | "text/plain": [ 592 | "<_sre.SRE_Match object; span=(23, 28), match='hours'>" 593 | ] 594 | }, 595 | "execution_count": 51, 596 | "metadata": {}, 597 | "output_type": "execute_result" 598 | } 599 | ], 600 | "source": [ 601 | "re.search('hours',date_string)" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 52, 607 | "metadata": { 608 | "collapsed": true 609 | }, 610 | "outputs": [], 611 | "source": [ 612 | "date_string = df['date'][50]" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": 53, 618 | "metadata": { 619 | "collapsed": false 620 | }, 621 | "outputs": [ 622 | { 623 | "name": "stdout", 624 | "output_type": "stream", 625 | "text": [ 626 | "4 points by lefish 7 days ago | discuss\n" 627 | ] 628 | } 629 | ], 630 | "source": [ 631 | "print(date_string)" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 54, 637 | "metadata": { 638 | "collapsed": true 639 | }, 640 | "outputs": [], 641 | "source": [ 642 | "# If hours is not there, we don't get any match\n", 643 | "re.search('hours',date_string)" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 55, 649 | "metadata": { 650 | "collapsed": false 651 | }, 652 | "outputs": [ 653 | { 654 | "data": { 655 | "text/plain": [ 656 | "<_sre.SRE_Match object; span=(19, 24), match='7 day'>" 657 | ] 658 | }, 659 | "execution_count": 55, 660 | "metadata": {}, 661 | "output_type": "execute_result" 662 | } 663 | ], 664 | "source": [ 665 | "# Let us match the digit preceding the day text\n", 666 | "day_search = re.search('\\d+ day',date_string)\n", 667 | "day_search" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 56, 673 | "metadata": { 674 | "collapsed": false 675 | }, 676 | "outputs": [ 677 | { 678 | "data": { 679 | "text/plain": [ 680 | "'7 day'" 681 | ] 682 | }, 683 | "execution_count": 56, 684 | "metadata": {}, 685 | "output_type": "execute_result" 686 | } 687 | ], 688 | "source": [ 689 | 
"days_string = day_search.group(0)\n", 690 | "days_string" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 57, 696 | "metadata": { 697 | "collapsed": false 698 | }, 699 | "outputs": [ 700 | { 701 | "data": { 702 | "text/plain": [ 703 | "'7'" 704 | ] 705 | }, 706 | "execution_count": 57, 707 | "metadata": {}, 708 | "output_type": "execute_result" 709 | } 710 | ], 711 | "source": [ 712 | "days = days_string.split(' ')[0] \n", 713 | "days" 714 | ] 715 | }, 716 | { 717 | "cell_type": "markdown", 718 | "metadata": {}, 719 | "source": [] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 58, 724 | "metadata": { 725 | "collapsed": true 726 | }, 727 | "outputs": [], 728 | "source": [ 729 | "def return_reg_ex_days(row):\n", 730 | " days = ''\n", 731 | " if re.search('hours',row['date']) is not None:\n", 732 | " # print('hours',row['date'])\n", 733 | " days = 1\n", 734 | " else:\n", 735 | " day_search = re.search('\\d+ day',row['date'])\n", 736 | " # print('day',day_search.group(0))\n", 737 | " days = day_search.group(0).split(' ')[0] \n", 738 | " \n", 739 | " #print(row,days)\n", 740 | " return days\n", 741 | " " 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": 59, 747 | "metadata": { 748 | "collapsed": false 749 | }, 750 | "outputs": [], 751 | "source": [ 752 | "# Now we apply this function to each of the row in the dataframe\n", 753 | "df['days'] = df.apply(return_reg_ex_days,axis=1)" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 60, 759 | "metadata": { 760 | "collapsed": false 761 | }, 762 | "outputs": [ 763 | { 764 | "data": { 765 | "text/html": [ 766 | "
\n", 767 | "\n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | "
titledatedays
0An Exploration of R, Yelp, and the Search for ...5 points by Rogerh91 6 hours ago | discuss1
1Deep Advances in Generative Modeling7 points by gwulfs 15 hours ago | 1 comment1
2Spark Pipelines: Elegant Yet Powerful3 points by aouyang1 9 hours ago | discuss1
3Shit VCs Say3 points by Argentum01 10 hours ago | discuss1
4Python, Machine Learning, and Language Wars4 points by pmigdal 17 hours ago | discuss1
\n", 809 | "
" 810 | ], 811 | "text/plain": [ 812 | " title \\\n", 813 | "0 An Exploration of R, Yelp, and the Search for ... \n", 814 | "1 Deep Advances in Generative Modeling \n", 815 | "2 Spark Pipelines: Elegant Yet Powerful \n", 816 | "3 Shit VCs Say \n", 817 | "4 Python, Machine Learning, and Language Wars \n", 818 | "\n", 819 | " date days \n", 820 | "0 5 points by Rogerh91 6 hours ago | discuss 1 \n", 821 | "1 7 points by gwulfs 15 hours ago | 1 comment 1 \n", 822 | "2 3 points by aouyang1 9 hours ago | discuss 1 \n", 823 | "3 3 points by Argentum01 10 hours ago | discuss 1 \n", 824 | "4 4 points by pmigdal 17 hours ago | discuss 1 " 825 | ] 826 | }, 827 | "execution_count": 60, 828 | "metadata": {}, 829 | "output_type": "execute_result" 830 | } 831 | ], 832 | "source": [ 833 | "df.head()" 834 | ] 835 | }, 836 | { 837 | "cell_type": "code", 838 | "execution_count": 61, 839 | "metadata": { 840 | "collapsed": false 841 | }, 842 | "outputs": [ 843 | { 844 | "data": { 845 | "text/html": [ 846 | "
\n", 847 | "\n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | "
titledatedays
175Getting Started with Statistics for Data Science3 points by nickhould 35 days ago | discuss35
176Rodeo 1.3 - Tab-completion for docstrings3 points by glamp 35 days ago | discuss35
177Teaching D3.js - links3 points by pmigdal 35 days ago | discuss35
178Parallel scikit-learn on YARN5 points by stijntonk 39 days ago | discuss39
179Meetup: Free Live Webinar on Prescriptive Anal...2 points by ann928 32 days ago | discuss32
\n", 889 | "
" 890 | ], 891 | "text/plain": [ 892 | " title \\\n", 893 | "175 Getting Started with Statistics for Data Science \n", 894 | "176 Rodeo 1.3 - Tab-completion for docstrings \n", 895 | "177 Teaching D3.js - links \n", 896 | "178 Parallel scikit-learn on YARN \n", 897 | "179 Meetup: Free Live Webinar on Prescriptive Anal... \n", 898 | "\n", 899 | " date days \n", 900 | "175 3 points by nickhould 35 days ago | discuss 35 \n", 901 | "176 3 points by glamp 35 days ago | discuss 35 \n", 902 | "177 3 points by pmigdal 35 days ago | discuss 35 \n", 903 | "178 5 points by stijntonk 39 days ago | discuss 39 \n", 904 | "179 2 points by ann928 32 days ago | discuss 32 " 905 | ] 906 | }, 907 | "execution_count": 61, 908 | "metadata": {}, 909 | "output_type": "execute_result" 910 | } 911 | ], 912 | "source": [ 913 | "df.tail()" 914 | ] 915 | }, 916 | { 917 | "cell_type": "code", 918 | "execution_count": 62, 919 | "metadata": { 920 | "collapsed": true 921 | }, 922 | "outputs": [], 923 | "source": [ 924 | "# Let us save to a dataframe\n", 925 | "df.to_csv('data_tau_days.csv', index=False)" 926 | ] 927 | } 928 | ], 929 | "metadata": { 930 | "kernelspec": { 931 | "display_name": "Python 3", 932 | "language": "python", 933 | "name": "python3" 934 | }, 935 | "language_info": { 936 | "codemirror_mode": { 937 | "name": "ipython", 938 | "version": 3 939 | }, 940 | "file_extension": ".py", 941 | "mimetype": "text/x-python", 942 | "name": "python", 943 | "nbconvert_exporter": "python", 944 | "pygments_lexer": "ipython3", 945 | "version": "3.5.1" 946 | } 947 | }, 948 | "nbformat": 4, 949 | "nbformat_minor": 0 950 | } 951 | -------------------------------------------------------------------------------- /text_mining/data_tau.csv: -------------------------------------------------------------------------------- 1 | title,date 2 | "An Exploration of R, Yelp, and the Search for Good Indian Food",5 points by Rogerh91 6 hours ago | discuss 3 | Deep Advances in Generative Modeling,7 points by gwulfs 15 hours ago | 1 comment 4 | Spark Pipelines: Elegant Yet Powerful,3 points by aouyang1 9 hours ago | discuss 5 | Shit VCs Say,3 points by Argentum01 10 hours ago | discuss 6 | "Python, Machine Learning, and Language Wars",4 points by pmigdal 17 hours ago | discuss 7 | A Neural Network in 11 lines of Python ,3 points by dekhtiar 14 hours ago | discuss 8 | Markov Chains Explained Visually,13 points by zeroviscosity 1 day ago | 1 comment 9 | Dplython: Dplyr for Python,13 points by thenaturalist 1 day ago | 3 comments 10 | Inferring causal impact using Bayesian structural time-series models,8 points by Homunculiheaded 1 day ago | 1 comment 11 | A Billion Taxi Rides on Amazon EMR running Spark,5 points by marklit 1 day ago | 1 comment 12 | Tutorial: Web scraping and mapping breweries with import.io and R,4 points by jasdumas 1 day ago | discuss 13 | The rise of greedy robots,4 points by yanir 2 days ago | discuss 14 | "Python for Data Structures, Algorithms, and Interviews",18 points by kokoubaby 4 days ago | discuss 15 | Extracting image metadata at scale,2 points by zachwill 1 day ago | discuss 16 | Lift charts - A data scientist's secret weapon,14 points by datenheini 4 days ago | 2 comments 17 | Data Science Side Project,7 points by yashpatel5400 2 days ago | 9 comments 18 | How To Become A Machine Learning Expert In One Simple Step,4 points by swanint 2 days ago | discuss 19 | Engineers Shouldn’t Write ETL: High Functioning Data Science Departments,10 points by legel 4 days ago | 3 comments 20 | Simple estimation of 
hierarchical events with petersburg,3 points by wdm0006 2 days ago | discuss 21 | Unsupervised Computer Vision: The Current State of the Art,6 points by carlosfaham 3 days ago | discuss 22 | Data Engineering at Slack: Twelve Mistakes I've Made In My First Three Months,14 points by gwulfs 6 days ago | 2 comments 23 | What data visualization tools do /r/DataIsBeautiful OC creators use?,3 points by pmigdal 2 days ago | discuss 24 | Reshaping in Pandas,6 points by carlosgg 4 days ago | discuss 25 | An unusual interactive machine learning challenge,4 points by gglumov 3 days ago | discuss 26 | Datumbox Machine Learning Framework 0.7.0 Released,4 points by datumbox 3 days ago | discuss 27 | Data science intro for math/phys background,14 points by pmigdal 7 days ago | discuss 28 | Neural Networks demystified,16 points by elyase 8 days ago | discuss 29 | What machines can learn from Apple Watch: detecting undiagnosed heart condition,9 points by koukouhappy 6 days ago | discuss 30 | Data Science Tools: The Biggest Winners and Losers,12 points by AnnaOnTheWeb 7 days ago | discuss 31 | 10 Years of Open Source Machine Learning,9 points by tstonez 6 days ago | 1 comment 32 | Has your conversion rate changed? Bayesian timeseries analysis with Python,12 points by yummyfajitas 8 days ago | discuss 33 | Do jobs run in families?,5 points by Anon84 5 days ago | 1 comment 34 | Introduction to Scikit Flow - Simplified Interface to TensorFlow,8 points by lefish 7 days ago | discuss 35 | "XGBoost4J: Portable Distributed XGboost in Spark, Flink and Dataflow",8 points by crowwork 8 days ago | discuss 36 | How to learn machine learning?,8 points by kiechu 8 days ago | 1 comment 37 | The Deep Roots of Javascript Fatigue,5 points by nikkielizdemere 6 days ago | 1 comment 38 | How do we make Data Tau work?,27 points by hal8 9 days ago | 18 comments 39 | "Machine Learning: An In-Depth, Non-Technical Guide — Part 4",7 points by innoarchitech 8 days ago | discuss 40 | Data Science Slack channel - Click for invite,7 points by jyotsna 8 days ago | discuss 41 | [Ask DT] What are some rookie mistakes in R?,3 points by HKtemp 3 days ago | discuss 42 | "Playing ""Moneyball"" on EA FIFA 16",16 points by aabb13 13 days ago | 3 comments 43 | Intellexer - Natural Language Processing and Text Mining REST API,16 points by j_downer 13 days ago | discuss 44 | Descriptive Statistics in SQL,5 points by nickhould 7 days ago | discuss 45 | Genomic Data Visualization using Python,2 points by RadhouaneAniba 4 days ago | discuss 46 | How to Use Cohort Data to Analyze User Behavior,2 points by clevertap 4 days ago | discuss 47 | Making transparent how variations in analytical choices affect results,4 points by rahmaniacc 7 days ago | discuss 48 | Show DT: Datasets.co - An easy way to share and discover ml datasets,2 points by mrborgen86 4 days ago | discuss 49 | Is Scala a better choice than Python for Apache Spark?,7 points by srinify 10 days ago | 1 comment 50 | Julia: A Fast Language for Numerical Computing,7 points by srinify 10 days ago | 1 comment 51 | "An Ode To The Rice Cooker, The Smartest Kitchen Appliance I’ve Ever Owned",2 points by tfturing 4 days ago | discuss 52 | Computing Classification Evaluation Metrics in R,4 points by lefish 7 days ago | discuss 53 | Analyzing Golden State Warriors' passing network using GraphFrames in Spark,3 points by yukiegosapporo 6 days ago | discuss 54 | Megaman: Manifold Learning with Millions of points,4 points by dperry 8 days ago | 3 comments 55 | How to Detect Outliers on Parametric and Non 
Parametric Methods,2 points by clevertap 5 days ago | discuss 56 | BallR: Interactive NBA Shot Charts with R and Shiny,12 points by carlosgg 14 days ago | discuss 57 | A Billion Taxi Rides on Amazon EMR Running Presto,4 points by marklit 8 days ago | discuss 58 | Minecraft to run artificial intelligence experiments,4 points by bsadeghi 8 days ago | discuss 59 | Deep Q-Learning (Space Invaders),4 points by pmigdal 8 days ago | discuss 60 | Theano Tutorial,2 points by pmigdal 5 days ago | discuss 61 | The Personality Space of Cartoon Characters,3 points by lefish 7 days ago | discuss 62 | Announcing Apache Flink 1.0.0,11 points by mxm 14 days ago | discuss 63 | "Telemetry with Collectd, Logstash, Elasticsearch and Grafana (ELG)",3 points by helloanand 7 days ago | discuss 64 | Statisticians Agree: It’s Time To Stop Misusing P-Value,10 points by jpiburn 15 days ago | 5 comments 65 | Bayesian Reasoning in The Twilight Zone!,2 points by Homunculiheaded 6 days ago | discuss 66 | Bayesian Estimation of G Train Wait Times,7 points by jamesdreiss 12 days ago | discuss 67 | XGBoost: A Scalable Tree Boosting System article,6 points by tfturing 12 days ago | discuss 68 | Some experiments into explaining complex black box ensemble predictions,2 points by lefish 6 days ago | discuss 69 | Creating a Hadoop Pseudo-Distributed Environment,2 points by lefish 6 days ago | discuss 70 | "Data Science Pop-Up in Austin, TX",2 points by AnnaOnTheWeb 6 days ago | discuss 71 | Train your own image classifier with Inception in TensorFlow,7 points by elyase 13 days ago | discuss 72 | Shiny app for running a Tensorflow demo,3 points by shinyman 9 days ago | discuss 73 | File details and owners with gitnoc and git-pandas,3 points by wdm0006 9 days ago | discuss 74 | 7 Big Data Technologies and When to Use Them that All Data Engineers Should Know,2 points by galvanize 7 days ago | discuss 75 | Topic clusters with TF-IDF vectorization with Spark and Scala,2 points by lefish 7 days ago | discuss 76 | Neural Doodles: Workflows for the Next Generation of Artists,5 points by pmigdal 12 days ago | discuss 77 | Graph Databases 101,5 points by carlosgg 12 days ago | discuss 78 | DataRadar.IO - Data Science RSS Feed - Do you have enough data about your data,2 points by dekhtiar 8 days ago | 3 comments 79 | International Women's Day: What #PledgeForParity Means To Us,5 points by ddrum001 14 days ago | discuss 80 | Top 50 Data Science thought leaders on Twitter,3 points by datawerq 11 days ago | 3 comments 81 | Ask DT: Who Is Hiring? 
(March 2016),27 points by whoishiring 21 days ago | 15 comments 82 | Deriving Better Insights From Time Series Data With Cycle Plots,3 points by clevertap 11 days ago | discuss 83 | Introducing GraphFrames,7 points by falaki 19 days ago | discuss 84 | SQL for Data Analysis,4 points by nickhould 14 days ago | 6 comments 85 | Stream processing and messaging systems for the IoT age,3 points by gradientflow 12 days ago | discuss 86 | Announcing R Tools for Visual Studio,3 points by brakmic 13 days ago | discuss 87 | A simpler way to merge data streams,3 points by apoverton 13 days ago | discuss 88 | Optimizing Notification Timing for One Signal,9 points by megandias 26 days ago | discuss 89 | Skizze - A high throughput probabilistic data structure service and storage,3 points by seiflotfy 14 days ago | discuss 90 | Question: What do you want to say about working with data?,2 points by emiller425 8 days ago | discuss 91 | Genomic Ranges - an Introduction to Working with Genomic Data,3 points by AnnaOnTheWeb 13 days ago | discuss 92 | TensorFlow for Poets,9 points by ebellm 21 days ago | 1 comment 93 | Unsupervised Learning with Even Less Supervision Using Bayesian Optimization,2 points by idewanck 11 days ago | discuss 94 | How to work with large JSON datasets using Python and Pandas,9 points by brian_spiering 21 days ago | discuss 95 | DrivenData Competition: Model/Visualize Fog Patterns in Morocco,4 points by bull 15 days ago | discuss 96 | Deep Learning: Nine Lectures at Collège de France by Yan LeCun,5 points by Anon84 17 days ago | discuss 97 | Optimizing Facebook Campaigns with R,2 points by AnnaOnTheWeb 12 days ago | 1 comment 98 | "Trump Tweets on a Globe (aka Fun with d3, socket.io, and the Twitter API)",8 points by joelgrus 21 days ago | discuss 99 | Why pandas users should be excited about Apache Arrow,17 points by pmigdal 29 days ago | discuss 100 | Histogram intersection for change detection,8 points by datadive 22 days ago | discuss 101 | Distributed TensorFlow just open-sourced,10 points by elyase 25 days ago | discuss 102 | D3.js Screencasts (1 in 3 are free),4 points by Veerle 18 days ago | discuss 103 | Regression and Classification with Examples in R,5 points by soates 20 days ago | discuss 104 | Free online course on statistical shape modelling,8 points by shapemean 25 days ago | discuss 105 | "Don't worry about deep learning, deepen your understanding of causality instead",22 points by yanir 37 days ago | discuss 106 | Work with private repositories and other updates of the FlyElephant platform,2 points by m31 15 days ago | discuss 107 | How to import XML to almost anywhere,4 points by Jammink 20 days ago | discuss 108 | Survival Analysis of Cricket Player Careers,8 points by keshav92 26 days ago | 6 comments 109 | Generate image analogies using neural matching and blending,2 points by pmigdal 15 days ago | discuss 110 | "Analyzing 1.8M tweets from Super Bowl 50 (Twython, Twitter API, AYLIEN)",4 points by mikewally 20 days ago | discuss 111 | Newly released sklearn compatible library of categorical encoders,7 points by wdm0006 25 days ago | discuss 112 | Watch Tiny Neural Nets Learn,4 points by swanint 21 days ago | discuss 113 | Four pitfalls of hill climbing: An animated look,5 points by csaid81 23 days ago | discuss 114 | "Decision Forests, Convolutional Networks and the Models in-Between",2 points by ebellm 16 days ago | discuss 115 | How a Math Genius Hacked OkCupid to Find True Love,15 points by roh_codeur 34 days ago | discuss 116 | No developers for PyLearn2,3 
points by tfturing 19 days ago | discuss 117 | Density Estimation with Dirichlet Process Mixtures using PyMC3,6 points by MidsizeBlowfish 25 days ago | discuss 118 | Using survival analysis and git-pandas to estimate code quality,3 points by wdm0006 20 days ago | discuss 119 | An Analysis of the Flint Michigan Water Crisis: Part 1 Initial Corrosivity,3 points by JHorn 20 days ago | discuss 120 | An Analysis of Republican Twitter Follower Interests,6 points by michelangelo 26 days ago | discuss 121 | Introduction to ML talk,8 points by cjbayesian 29 days ago | discuss 122 | GloVe vs word2vec revisited,3 points by pmigdal 20 days ago | discuss 123 | Overoptimizing: a story about kaggle,4 points by wdm0006 30 days ago | discuss 124 | Undergrad Data Analysis/Science internships SF Bay?,3 points by tctctc 15 days ago | 5 comments 125 | The Role of Statistical Significance in Growth Hacking,6 points by rawls234 27 days ago | discuss 126 | Data Science Course @ Harvard,7 points by rahmaniacc 29 days ago | 2 comments 127 | Principal Component Projection Without Principal Component Analysis,6 points by genofon 27 days ago | discuss 128 | "Machine Learning: An In-Depth, Non-Technical Guide - Part 3",7 points by innoarchitech 29 days ago | discuss 129 | Stochastic Dummy Boosting,2 points by mikeskim 18 days ago | discuss 130 | Interactive Map: Hong-Kong through The Lense of Instagram,2 points by BrianN 19 days ago | discuss 131 | Data Science at Monsanto,3 points by doctorcroc 22 days ago | discuss 132 | Data Science at Instacart,11 points by jeremystan 34 days ago | 3 comments 133 | Building a Streaming Search Platform,6 points by ddrum001 28 days ago | discuss 134 | Kafka Producer Latency with Large Topic Counts,3 points by marklit 26 days ago | discuss 135 | A Sneak Peak of the Cloud: the 2 Minute Intro for Beginners,2 points by andymaheshw 20 days ago | discuss 136 | Win-Vector video courses: price/status changes,2 points by jmount 20 days ago | discuss 137 | 50+ Data Science and Machine Learning Cheat Sheets,20 points by elyase 42 days ago | 1 comment 138 | One More Reason Not To Be Scared of Deep Learning,2 points by amplifier_khan 21 days ago | discuss 139 | Visual Logic Authoring vs Code,2 points by AnnaOnTheWeb 21 days ago | discuss 140 | Data Science in Python online training with hands-on experience,2 points by Puneet 21 days ago | discuss 141 | Viewing the US Presidential Primary Through the Lens of Twitter,8 points by michelangelo 33 days ago | discuss 142 | Caffe on Spark open sourced,4 points by rahmaniacc 27 days ago | discuss 143 | The Ethical Data Scientist,5 points by tfturing 29 days ago | discuss 144 | Answers to Frequently Asked Questions in Machine Learning,3 points by rasbt 21 days ago | discuss 145 | Intro to A/B Testing and P-Values,2 points by randyzwitch 22 days ago | discuss 146 | Visualizing State Level Data With R and Statebins,2 points by usujason 22 days ago | discuss 147 | "Probabilistic Graphical Models slides & video lectures (Eric Xing, CMU)",4 points by ororm 28 days ago | discuss 148 | Sense2vec with spaCy and Gensim,9 points by elyase 36 days ago | 2 comments 149 | A Billion NYC Taxi and Uber Rides in AWS Redshift,3 points by marklit 31 days ago | discuss 150 | How to Code and Understand DeepMind's Neural Stack Machine (in Python),2 points by genofon 23 days ago | discuss 151 | How to make polished Jupyter presentations with optional code visibility,9 points by csaid81 36 days ago | discuss 152 | How to become a Bayesian in eight easy steps,17 points by EtzA 44 
days ago | 1 comment 153 | Optimizing .*: Details of Vectorization and Metaprogramming in Julia,4 points by randyzwitch 29 days ago | discuss 154 | IBM certified Apache Spark Online Training,8 points by divya_jain 36 days ago | discuss 155 | Geographic Data Science course,2 points by rk 25 days ago | discuss 156 | "The Daily Mail Stole My Visualization, Twice",5 points by thehoff 32 days ago | 1 comment 157 | Ensemble Methods: Improved Machine Learning Results,9 points by PyBloggers 38 days ago | discuss 158 | Apache Spark and unsupervised learning in security,2 points by gradientflow 26 days ago | discuss 159 | MachineJS: Automated machine learning- just give it a data file!,2 points by dsernst 26 days ago | discuss 160 | The NSA’s SKYNET program may be killing thousands of innocent people,6 points by zlipp 35 days ago | discuss 161 | "Big Dimensions, and What You Can Do About It",2 points by ramsey 27 days ago | discuss 162 | Automate Your Oscars Pool with R,2 points by jamesdreiss 27 days ago | discuss 163 | Signal Processing with LIGO GW150914 data,9 points by tfturing 39 days ago | discuss 164 | Overview of DeZyre and Coursera Data Science Course,5 points by ann928 34 days ago | discuss 165 | Upcoming Datathon in NYC,2 points by VicTrey 28 days ago | discuss 166 | Summarizing Data in SQL,15 points by elisebreda 46 days ago | discuss 167 | A/B Testing for Scammers,2 points by sameermanek 28 days ago | discuss 168 | Highly interpretable classifiers for scikit learn using Bayesian decision rules,2 points by mcnulty 28 days ago | discuss 169 | Auto-scaling scikit-learn with Spark,11 points by falaki 43 days ago | discuss 170 | Where the f*** can I park?,2 points by manugarri 29 days ago | discuss 171 | "Machine Learning: An In-Depth, Non-Technical Guide - Part 2",5 points by innoarchitech 36 days ago | discuss 172 | Webhose.io now offers a historical data archive,7 points by databuffer 40 days ago | discuss 173 | Meetup: Introduction to Machine Learning Algorithms for Data Science.,4 points by ann928 36 days ago | discuss 174 | Exploring the Limits of Language Modeling,8 points by soates 42 days ago | discuss 175 | Text Mining South Park,7 points by pmigdal 41 days ago | discuss 176 | Finding the K in K-means by Parametric Bootstrap,7 points by jmount 42 days ago | 1 comment 177 | Getting Started with Statistics for Data Science,3 points by nickhould 35 days ago | discuss 178 | Rodeo 1.3 - Tab-completion for docstrings,3 points by glamp 35 days ago | discuss 179 | Teaching D3.js - links,3 points by pmigdal 35 days ago | discuss 180 | Parallel scikit-learn on YARN,5 points by stijntonk 39 days ago | discuss 181 | Meetup: Free Live Webinar on Prescriptive Analytics for Fun and Profit,2 points by ann928 32 days ago | discuss 182 | -------------------------------------------------------------------------------- /text_mining/data_tau_days.csv: -------------------------------------------------------------------------------- 1 | title,date,days 2 | "An Exploration of R, Yelp, and the Search for Good Indian Food",5 points by Rogerh91 6 hours ago | discuss,1 3 | Deep Advances in Generative Modeling,7 points by gwulfs 15 hours ago | 1 comment,1 4 | Spark Pipelines: Elegant Yet Powerful,3 points by aouyang1 9 hours ago | discuss,1 5 | Shit VCs Say,3 points by Argentum01 10 hours ago | discuss,1 6 | "Python, Machine Learning, and Language Wars",4 points by pmigdal 17 hours ago | discuss,1 7 | A Neural Network in 11 lines of Python ,3 points by dekhtiar 14 hours ago | discuss,1 8 | Markov Chains 
Explained Visually,13 points by zeroviscosity 1 day ago | 1 comment,1 9 | Dplython: Dplyr for Python,13 points by thenaturalist 1 day ago | 3 comments,1 10 | Inferring causal impact using Bayesian structural time-series models,8 points by Homunculiheaded 1 day ago | 1 comment,1 11 | A Billion Taxi Rides on Amazon EMR running Spark,5 points by marklit 1 day ago | 1 comment,1 12 | Tutorial: Web scraping and mapping breweries with import.io and R,4 points by jasdumas 1 day ago | discuss,1 13 | The rise of greedy robots,4 points by yanir 2 days ago | discuss,2 14 | "Python for Data Structures, Algorithms, and Interviews",18 points by kokoubaby 4 days ago | discuss,4 15 | Extracting image metadata at scale,2 points by zachwill 1 day ago | discuss,1 16 | Lift charts - A data scientist's secret weapon,14 points by datenheini 4 days ago | 2 comments,4 17 | Data Science Side Project,7 points by yashpatel5400 2 days ago | 9 comments,2 18 | How To Become A Machine Learning Expert In One Simple Step,4 points by swanint 2 days ago | discuss,2 19 | Engineers Shouldn?t Write ETL: High Functioning Data Science Departments,10 points by legel 4 days ago | 3 comments,4 20 | Simple estimation of hierarchical events with petersburg,3 points by wdm0006 2 days ago | discuss,2 21 | Unsupervised Computer Vision: The Current State of the Art,6 points by carlosfaham 3 days ago | discuss,3 22 | Data Engineering at Slack: Twelve Mistakes I've Made In My First Three Months,14 points by gwulfs 6 days ago | 2 comments,6 23 | What data visualization tools do /r/DataIsBeautiful OC creators use?,3 points by pmigdal 2 days ago | discuss,2 24 | Reshaping in Pandas,6 points by carlosgg 4 days ago | discuss,4 25 | An unusual interactive machine learning challenge,4 points by gglumov 3 days ago | discuss,3 26 | Datumbox Machine Learning Framework 0.7.0 Released,4 points by datumbox 3 days ago | discuss,3 27 | Data science intro for math/phys background,14 points by pmigdal 7 days ago | discuss,7 28 | Neural Networks demystified,16 points by elyase 8 days ago | discuss,8 29 | What machines can learn from Apple Watch: detecting undiagnosed heart condition,9 points by koukouhappy 6 days ago | discuss,6 30 | Data Science Tools: The Biggest Winners and Losers,12 points by AnnaOnTheWeb 7 days ago | discuss,7 31 | 10 Years of Open Source Machine Learning,9 points by tstonez 6 days ago | 1 comment,6 32 | Has your conversion rate changed? 
Bayesian timeseries analysis with Python,12 points by yummyfajitas 8 days ago | discuss,8 33 | Do jobs run in families?,5 points by Anon84 5 days ago | 1 comment,5 34 | Introduction to Scikit Flow - Simplified Interface to TensorFlow,8 points by lefish 7 days ago | discuss,7 35 | "XGBoost4J: Portable Distributed XGboost in Spark, Flink and Dataflow",8 points by crowwork 8 days ago | discuss,8 36 | How to learn machine learning?,8 points by kiechu 8 days ago | 1 comment,8 37 | The Deep Roots of Javascript Fatigue,5 points by nikkielizdemere 6 days ago | 1 comment,6 38 | How do we make Data Tau work?,27 points by hal8 9 days ago | 18 comments,9 39 | "Machine Learning: An In-Depth, Non-Technical Guide???Part 4",7 points by innoarchitech 8 days ago | discuss,8 40 | Data Science Slack channel - Click for invite,7 points by jyotsna 8 days ago | discuss,8 41 | [Ask DT] What are some rookie mistakes in R?,3 points by HKtemp 3 days ago | discuss,3 42 | "Playing ""Moneyball"" on EA FIFA 16",16 points by aabb13 13 days ago | 3 comments,13 43 | Intellexer - Natural Language Processing and Text Mining REST API,16 points by j_downer 13 days ago | discuss,13 44 | Descriptive Statistics in SQL,5 points by nickhould 7 days ago | discuss,7 45 | Genomic Data Visualization using Python,2 points by RadhouaneAniba 4 days ago | discuss,4 46 | How to Use Cohort Data to Analyze User Behavior,2 points by clevertap 4 days ago | discuss,4 47 | Making transparent how variations in analytical choices affect results,4 points by rahmaniacc 7 days ago | discuss,7 48 | Show DT: Datasets.co - An easy way to share and discover ml datasets,2 points by mrborgen86 4 days ago | discuss,4 49 | Is Scala a better choice than Python for Apache Spark?,7 points by srinify 10 days ago | 1 comment,10 50 | Julia: A Fast Language for Numerical Computing,7 points by srinify 10 days ago | 1 comment,10 51 | "An Ode To The Rice Cooker, The Smartest Kitchen Appliance I?ve Ever Owned",2 points by tfturing 4 days ago | discuss,4 52 | Computing Classification Evaluation Metrics in R,4 points by lefish 7 days ago | discuss,7 53 | Analyzing Golden State Warriors' passing network using GraphFrames in Spark,3 points by yukiegosapporo 6 days ago | discuss,6 54 | Megaman: Manifold Learning with Millions of points,4 points by dperry 8 days ago | 3 comments,8 55 | How to Detect Outliers on Parametric and Non Parametric Methods,2 points by clevertap 5 days ago | discuss,5 56 | BallR: Interactive NBA Shot Charts with R and Shiny,12 points by carlosgg 14 days ago | discuss,14 57 | A Billion Taxi Rides on Amazon EMR Running Presto,4 points by marklit 8 days ago | discuss,8 58 | Minecraft to run artificial intelligence experiments,4 points by bsadeghi 8 days ago | discuss,8 59 | Deep Q-Learning (Space Invaders),4 points by pmigdal 8 days ago | discuss,8 60 | Theano Tutorial,2 points by pmigdal 5 days ago | discuss,5 61 | The Personality Space of Cartoon Characters,3 points by lefish 7 days ago | discuss,7 62 | Announcing Apache Flink 1.0.0,11 points by mxm 14 days ago | discuss,14 63 | "Telemetry with Collectd, Logstash, Elasticsearch and Grafana (ELG)",3 points by helloanand 7 days ago | discuss,7 64 | Statisticians Agree: It?s Time To Stop Misusing P-Value,10 points by jpiburn 15 days ago | 5 comments,15 65 | Bayesian Reasoning in The Twilight Zone!,2 points by Homunculiheaded 6 days ago | discuss,6 66 | Bayesian Estimation of G Train Wait Times,7 points by jamesdreiss 12 days ago | discuss,12 67 | XGBoost: A Scalable Tree Boosting System article,6 points 
by tfturing 12 days ago | discuss,12 68 | Some experiments into explaining complex black box ensemble predictions,2 points by lefish 6 days ago | discuss,6 69 | Creating a Hadoop Pseudo-Distributed Environment,2 points by lefish 6 days ago | discuss,6 70 | "Data Science Pop-Up in Austin, TX",2 points by AnnaOnTheWeb 6 days ago | discuss,6 71 | Train your own image classifier with Inception in TensorFlow,7 points by elyase 13 days ago | discuss,13 72 | Shiny app for running a Tensorflow demo,3 points by shinyman 9 days ago | discuss,9 73 | File details and owners with gitnoc and git-pandas,3 points by wdm0006 9 days ago | discuss,9 74 | 7 Big Data Technologies and When to Use Them that All Data Engineers Should Know,2 points by galvanize 7 days ago | discuss,7 75 | Topic clusters with TF-IDF vectorization with Spark and Scala,2 points by lefish 7 days ago | discuss,7 76 | Neural Doodles: Workflows for the Next Generation of Artists,5 points by pmigdal 12 days ago | discuss,12 77 | Graph Databases 101,5 points by carlosgg 12 days ago | discuss,12 78 | DataRadar.IO - Data Science RSS Feed - Do you have enough data about your data,2 points by dekhtiar 8 days ago | 3 comments,8 79 | International Women's Day: What #PledgeForParity Means To Us,5 points by ddrum001 14 days ago | discuss,14 80 | Top 50 Data Science thought leaders on Twitter,3 points by datawerq 11 days ago | 3 comments,11 81 | Ask DT: Who Is Hiring? (March 2016),27 points by whoishiring 21 days ago | 15 comments,21 82 | Deriving Better Insights From Time Series Data With Cycle Plots,3 points by clevertap 11 days ago | discuss,11 83 | Introducing GraphFrames,7 points by falaki 19 days ago | discuss,19 84 | SQL for Data Analysis,4 points by nickhould 14 days ago | 6 comments,14 85 | Stream processing and messaging systems for the IoT age,3 points by gradientflow 12 days ago | discuss,12 86 | Announcing R Tools for Visual Studio,3 points by brakmic 13 days ago | discuss,13 87 | A simpler way to merge data streams,3 points by apoverton 13 days ago | discuss,13 88 | Optimizing Notification Timing for One Signal,9 points by megandias 26 days ago | discuss,26 89 | Skizze - A high throughput probabilistic data structure service and storage,3 points by seiflotfy 14 days ago | discuss,14 90 | Question: What do you want to say about working with data?,2 points by emiller425 8 days ago | discuss,8 91 | Genomic Ranges - an Introduction to Working with Genomic Data,3 points by AnnaOnTheWeb 13 days ago | discuss,13 92 | TensorFlow for Poets,9 points by ebellm 21 days ago | 1 comment,21 93 | Unsupervised Learning with Even Less Supervision Using Bayesian Optimization,2 points by idewanck 11 days ago | discuss,11 94 | How to work with large JSON datasets using Python and Pandas,9 points by brian_spiering 21 days ago | discuss,21 95 | DrivenData Competition: Model/Visualize Fog Patterns in Morocco,4 points by bull 15 days ago | discuss,15 96 | Deep Learning: Nine Lectures at Coll?ge de France by Yan LeCun,5 points by Anon84 17 days ago | discuss,17 97 | Optimizing Facebook Campaigns with R,2 points by AnnaOnTheWeb 12 days ago | 1 comment,12 98 | "Trump Tweets on a Globe (aka Fun with d3, socket.io, and the Twitter API)",8 points by joelgrus 21 days ago | discuss,21 99 | Why pandas users should be excited about Apache Arrow,17 points by pmigdal 29 days ago | discuss,29 100 | Histogram intersection for change detection,8 points by datadive 22 days ago | discuss,22 101 | Distributed TensorFlow just open-sourced,10 points by elyase 25 days ago | 
discuss,25 102 | D3.js Screencasts (1 in 3 are free),4 points by Veerle 18 days ago | discuss,18 103 | Regression and Classification with Examples in R,5 points by soates 20 days ago | discuss,20 104 | Free online course on statistical shape modelling,8 points by shapemean 25 days ago | discuss,25 105 | "Don't worry about deep learning, deepen your understanding of causality instead",22 points by yanir 37 days ago | discuss,37 106 | Work with private repositories and other updates of the FlyElephant platform,2 points by m31 15 days ago | discuss,15 107 | How to import XML to almost anywhere,4 points by Jammink 20 days ago | discuss,20 108 | Survival Analysis of Cricket Player Careers,8 points by keshav92 26 days ago | 6 comments,26 109 | Generate image analogies using neural matching and blending,2 points by pmigdal 15 days ago | discuss,15 110 | "Analyzing 1.8M tweets from Super Bowl 50 (Twython, Twitter API, AYLIEN)",4 points by mikewally 20 days ago | discuss,20 111 | Newly released sklearn compatible library of categorical encoders,7 points by wdm0006 25 days ago | discuss,25 112 | Watch Tiny Neural Nets Learn,4 points by swanint 21 days ago | discuss,21 113 | Four pitfalls of hill climbing: An animated look,5 points by csaid81 23 days ago | discuss,23 114 | "Decision Forests, Convolutional Networks and the Models in-Between",2 points by ebellm 16 days ago | discuss,16 115 | How a Math Genius Hacked OkCupid to Find True Love,15 points by roh_codeur 34 days ago | discuss,34 116 | No developers for PyLearn2,3 points by tfturing 19 days ago | discuss,19 117 | Density Estimation with Dirichlet Process Mixtures using PyMC3,6 points by MidsizeBlowfish 25 days ago | discuss,25 118 | Using survival analysis and git-pandas to estimate code quality,3 points by wdm0006 20 days ago | discuss,20 119 | An Analysis of the Flint Michigan Water Crisis: Part 1 Initial Corrosivity,3 points by JHorn 20 days ago | discuss,20 120 | An Analysis of Republican Twitter Follower Interests,6 points by michelangelo 26 days ago | discuss,26 121 | Introduction to ML talk,8 points by cjbayesian 29 days ago | discuss,29 122 | GloVe vs word2vec revisited,3 points by pmigdal 20 days ago | discuss,20 123 | Overoptimizing: a story about kaggle,4 points by wdm0006 30 days ago | discuss,30 124 | Undergrad Data Analysis/Science internships SF Bay?,3 points by tctctc 15 days ago | 5 comments,15 125 | The Role of Statistical Significance in Growth Hacking,6 points by rawls234 27 days ago | discuss,27 126 | Data Science Course @ Harvard,7 points by rahmaniacc 29 days ago | 2 comments,29 127 | Principal Component Projection Without Principal Component Analysis,6 points by genofon 27 days ago | discuss,27 128 | "Machine Learning: An In-Depth, Non-Technical Guide - Part 3",7 points by innoarchitech 29 days ago | discuss,29 129 | Stochastic Dummy Boosting,2 points by mikeskim 18 days ago | discuss,18 130 | Interactive Map: Hong-Kong through The Lense of Instagram,2 points by BrianN 19 days ago | discuss,19 131 | Data Science at Monsanto,3 points by doctorcroc 22 days ago | discuss,22 132 | Data Science at Instacart,11 points by jeremystan 34 days ago | 3 comments,34 133 | Building a Streaming Search Platform,6 points by ddrum001 28 days ago | discuss,28 134 | Kafka Producer Latency with Large Topic Counts,3 points by marklit 26 days ago | discuss,26 135 | A Sneak Peak of the Cloud: the 2 Minute Intro for Beginners,2 points by andymaheshw 20 days ago | discuss,20 136 | Win-Vector video courses: price/status changes,2 points by 
jmount 20 days ago | discuss,20 137 | 50+ Data Science and Machine Learning Cheat Sheets,20 points by elyase 42 days ago | 1 comment,42 138 | One More Reason Not To Be Scared of Deep Learning,2 points by amplifier_khan 21 days ago | discuss,21 139 | Visual Logic Authoring vs Code,2 points by AnnaOnTheWeb 21 days ago | discuss,21 140 | Data Science in Python online training with hands-on experience,2 points by Puneet 21 days ago | discuss,21 141 | Viewing the US Presidential Primary Through the Lens of Twitter,8 points by michelangelo 33 days ago | discuss,33 142 | Caffe on Spark open sourced,4 points by rahmaniacc 27 days ago | discuss,27 143 | The Ethical Data Scientist,5 points by tfturing 29 days ago | discuss,29 144 | Answers to Frequently Asked Questions in Machine Learning,3 points by rasbt 21 days ago | discuss,21 145 | Intro to A/B Testing and P-Values,2 points by randyzwitch 22 days ago | discuss,22 146 | Visualizing State Level Data With R and Statebins,2 points by usujason 22 days ago | discuss,22 147 | "Probabilistic Graphical Models slides & video lectures (Eric Xing, CMU)",4 points by ororm 28 days ago | discuss,28 148 | Sense2vec with spaCy and Gensim,9 points by elyase 36 days ago | 2 comments,36 149 | A Billion NYC Taxi and Uber Rides in AWS Redshift,3 points by marklit 31 days ago | discuss,31 150 | How to Code and Understand DeepMind's Neural Stack Machine (in Python),2 points by genofon 23 days ago | discuss,23 151 | How to make polished Jupyter presentations with optional code visibility,9 points by csaid81 36 days ago | discuss,36 152 | How to become a Bayesian in eight easy steps,17 points by EtzA 44 days ago | 1 comment,44 153 | Optimizing .*: Details of Vectorization and Metaprogramming in Julia,4 points by randyzwitch 29 days ago | discuss,29 154 | IBM certified Apache Spark Online Training,8 points by divya_jain 36 days ago | discuss,36 155 | Geographic Data Science course,2 points by rk 25 days ago | discuss,25 156 | "The Daily Mail Stole My Visualization, Twice",5 points by thehoff 32 days ago | 1 comment,32 157 | Ensemble Methods: Improved Machine Learning Results,9 points by PyBloggers 38 days ago | discuss,38 158 | Apache Spark and unsupervised learning in security,2 points by gradientflow 26 days ago | discuss,26 159 | MachineJS: Automated machine learning- just give it a data file!,2 points by dsernst 26 days ago | discuss,26 160 | The NSA?s SKYNET program may be killing thousands of innocent people,6 points by zlipp 35 days ago | discuss,35 161 | "Big Dimensions, and What You Can Do About It",2 points by ramsey 27 days ago | discuss,27 162 | Automate Your Oscars Pool with R,2 points by jamesdreiss 27 days ago | discuss,27 163 | Signal Processing with LIGO GW150914 data,9 points by tfturing 39 days ago | discuss,39 164 | Overview of DeZyre and Coursera Data Science Course,5 points by ann928 34 days ago | discuss,34 165 | Upcoming Datathon in NYC,2 points by VicTrey 28 days ago | discuss,28 166 | Summarizing Data in SQL,15 points by elisebreda 46 days ago | discuss,46 167 | A/B Testing for Scammers,2 points by sameermanek 28 days ago | discuss,28 168 | Highly interpretable classifiers for scikit learn using Bayesian decision rules,2 points by mcnulty 28 days ago | discuss,28 169 | Auto-scaling scikit-learn with Spark,11 points by falaki 43 days ago | discuss,43 170 | Where the f*** can I park?,2 points by manugarri 29 days ago | discuss,29 171 | "Machine Learning: An In-Depth, Non-Technical Guide - Part 2",5 points by innoarchitech 36 days ago | discuss,36 
172 | Webhose.io now offers a historical data archive,7 points by databuffer 40 days ago | discuss,40 173 | Meetup: Introduction to Machine Learning Algorithms for Data Science.,4 points by ann928 36 days ago | discuss,36 174 | Exploring the Limits of Language Modeling,8 points by soates 42 days ago | discuss,42 175 | Text Mining South Park,7 points by pmigdal 41 days ago | discuss,41 176 | Finding the K in K-means by Parametric Bootstrap,7 points by jmount 42 days ago | 1 comment,42 177 | Getting Started with Statistics for Data Science,3 points by nickhould 35 days ago | discuss,35 178 | Rodeo 1.3 - Tab-completion for docstrings,3 points by glamp 35 days ago | discuss,35 179 | Teaching D3.js - links,3 points by pmigdal 35 days ago | discuss,35 180 | Parallel scikit-learn on YARN,5 points by stijntonk 39 days ago | discuss,39 181 | Meetup: Free Live Webinar on Prescriptive Analytics for Fun and Profit,2 points by ann928 32 days ago | discuss,32 182 | -------------------------------------------------------------------------------- /text_mining/img/chunk-segmentation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/img/chunk-segmentation.png -------------------------------------------------------------------------------- /text_mining/img/datatau.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/img/datatau.png -------------------------------------------------------------------------------- /text_mining/img/date.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/img/date.png -------------------------------------------------------------------------------- /text_mining/img/entity_extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/img/entity_extraction.png -------------------------------------------------------------------------------- /text_mining/img/gutenberg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/img/gutenberg.png -------------------------------------------------------------------------------- /text_mining/img/punkt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/img/punkt.png -------------------------------------------------------------------------------- /text_mining/img/title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/img/title.png -------------------------------------------------------------------------------- /text_mining/nltk_data.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/text_mining/nltk_data.zip -------------------------------------------------------------------------------- /time_series/1-Frame.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 1. Frame the Problem\n", 8 | "\n", 9 | "In late 2010, onion prices shot through the roof, causing a grave crisis. The crisis was apparently caused by a lack of rainfall in the major onion-producing regions - Maharashtra and Karnataka - and led to large-scale hoarding by traders. The crisis caused political tension in the country and was described as \"a grave concern\" by then Prime Minister Manmohan Singh.\n", 10 | "\n", 11 | "\n", 12 | "- BBC Article in Dec 2010 - [Stink over onion crisis is enough to make you cry](http://www.bbc.co.uk/blogs/thereporters/soutikbiswas/2010/12/indias_onion_crisis.html)\n", 13 | "- The Hindu OpEd in Dec 2010 - [The political price of onions](http://www.thehindu.com/opinion/editorial/article977100.ece)\n", 14 | "\n", 15 | "![](img/peeling_the_onion_small.png)\n", 16 | "\n", 17 | "So what types of questions about onion prices would you like to ask?\n", 18 | "\n", 19 | "\n", 20 | "## Types of Question\n", 21 | "\n", 22 | "> \"Doing data analysis requires quite a bit of thinking and we believe that when you’ve completed a good data analysis, you’ve spent more time thinking than doing.\" - Roger Peng\n", 23 | "\n", 24 | "1. **Descriptive** - \"seeks to summarize a characteristic of a set of data\"\n", 25 | "2. **Exploratory** - \"analyze the data to see if there are patterns, trends, or relationships between variables\" (hypothesis generating) \n", 26 | "3. **Inferential** - \"a restatement of this proposed hypothesis as a question and would be answered by analyzing a different set of data\" (hypothesis testing)\n", 27 | "4. **Predictive** - \"determine the impact on one factor based on another factor in a population - to make a prediction\"\n", 28 | "5. **Causal** - \"asks whether changing one factor will change another factor in a population - to establish a causal link\" \n", 29 | "6. **Mechanistic** - \"establish *how* the change in one factor results in change in another factor in a population - to determine the exact mechanism\"\n", 30 | "\n", 31 | "\n", 32 | "### Descriptive \n", 33 | "- Which states have the highest onion production and sales?\n", 34 | "- Which cities (mandis) have the highest sales?\n", 35 | "- What is the average price of onion across a year in Bangalore?\n", 36 | "- ...\n", 37 | "\n", 38 | "### Exploratory & Inferential \n", 39 | "- Is there a large difference between the high and low prices of onion in a day?\n", 40 | "- What is the trend of onion prices across days or months in Bangalore?\n", 41 | "- How is the price of onion correlated with the volume of onion?\n", 42 | "- How is the export volume of onion correlated with the domestic production volume?\n", 43 | "- ...\n", 44 | "\n", 45 | "### Predictive \n", 46 | "- What is the price of onion likely to be the next day?\n", 47 | "- What is the price of onion likely to be next month?\n", 48 | "- What will be the sales quantity of onion tomorrow in Delhi?\n", 49 | "- ...\n", 50 | "\n", 51 | "### Causal\n", 52 | "- Does the change in production of onion have an impact on the onion prices? 
\n", 53 | "- Does the change in rainfall in monsoon have an impact on onion prices?\n", 54 | "- ...\n", 55 | "\n", 56 | "### Mechanistic\n", 57 | "- How does change in onion production impact the price of onion?\n", 58 | "- How does onion export volumes impact the prices of onion in local markets in India?\n", 59 | "- ...\n", 60 | "\n", 61 | "\n", 62 | "## Questions we will attempt\n", 63 | "\n", 64 | "### 1. Descriptive: How big is the Bangalore onion market compared to other cities in India?\n", 65 | "\n", 66 | "### 2. Exploratory / Inferential: Have the price variation in onion prices in Bangalore really gone up over the years?\n", 67 | "\n", 68 | "### 3. Predictive: Can we predict the price of onion in Bangalore?" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 3", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.5.1" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 0 102 | } 103 | -------------------------------------------------------------------------------- /time_series/city_geocode.csv: -------------------------------------------------------------------------------- 1 | city,lon,lat 2 | GUWAHATI,91.7362365,26.1445169 3 | KOLKATA,88.363895,22.572646 4 | SRIRAMPUR,88.3385053,23.4033393 5 | SHEROAPHULY,88.3215014,22.7690032 6 | BURDWAN,87.8614793,23.2324214 7 | MIDNAPUR,87.3214908,22.4308892 8 | PURULIA,86.365208,23.3320779 9 | DHULIA,86.0618818,22.0347727 10 | BHUBNESWER,85.8245398,20.2960587 11 | BIHARSHARIF,85.5148735,25.1982147 12 | RANCHI,85.309562,23.3440997 13 | PATNA,85.1375645,25.5940947 14 | BALLIA,84.1487319,25.7584381 15 | DEORIA,83.7838214,26.4862373 16 | GORAKHPUR,83.3731675,26.7605545 17 | VARANASI,82.9739144,25.3176452 18 | RAJAHMUNDRY,81.8040345,17.0005383 19 | RAIPUR,81.6296413,21.2513844 20 | DINDORI,81.0768455,22.9417931 21 | LUCKNOW,80.946166,26.8466937 22 | KANPUR,80.3318736,26.449923 23 | CHENNAI,80.2707184,13.0826802 24 | HALDWANI,79.5129767,29.2182644 25 | BAREILLY,79.4304381,28.3670355 26 | NAGPUR,79.0881546,21.1458004 27 | ETAWAH,79.0046898,26.8117116 28 | SAGAR,78.7378068,23.838805 29 | SAIKHEDA,78.5831181,22.962215 30 | HYDERABAD,78.486671,17.385044 31 | KOLAR,78.1325611,13.1357446 32 | MADURAI,78.1197754,9.9252007 33 | ALIGARH,78.0880129,27.8973944 34 | KURNOOL,78.0372792,15.8281257 35 | DEHRADOON,78.0321918,30.3164945 36 | AGRA,78.0080745,27.1766701 37 | DINDIGUL,77.9802906,10.3673123 38 | CHICKBALLAPUR,77.7280396,13.432366 39 | MEERUT,77.7064137,28.9844618 40 | BANGALORE,77.5945627,12.9715987 41 | BHOPAL,77.412615,23.2599333 42 | RAICHUR,77.3439283,16.2120031 43 | DELHI,77.2090212,28.6139391 44 | SHIMLA,77.1734033,31.1048145 45 | KARNAL,76.9904825,29.6856929 46 | COIMBATORE,76.9558321,11.0168445 47 | PALAYAM,76.9513432,8.5027684 48 | TRIVENDRUM,76.9366376,8.5241391 49 | CHANDIGARH,76.7794179,30.7333148 50 | CHALLAKERE,76.6528225,14.313395 51 | ALWAR,76.6345735,27.5529907 52 | PATIALA,76.3868797,30.3397809 53 | DEVALA,76.3820088,11.4725502 54 | KHANNA,76.2112286,30.697852 55 | HASSAN,76.0995519,13.0068142 56 | DEWAS,76.0507949,22.9622672 57 | 
DHAVANGERE,75.9238397,14.4663438 58 | HOSHIARPUR,75.911483,31.5143178 59 | SOLAPUR,75.9063906,17.6599188 60 | KOTA,75.8647527,25.2138156 61 | INDORE,75.8577258,22.7195687 62 | LUDHIANA,75.8572758,30.900965 63 | JAIPUR,75.7872709,26.9124336 64 | UJJAIN,75.7849097,23.1793013 65 | BIJAPUR,75.710031,16.8301708 66 | JALANDHAR,75.5761829,31.3260152 67 | JALGAON,75.5626039,21.0076578 68 | HUBLI,75.1239547,15.3647083 69 | MANDSOUR,75.0692952,24.076836 70 | BHATINDA,74.9454745,30.210994 71 | SRINAGAR,74.9442585,34.1255413 72 | NEWASA,74.9281063,19.5511772 73 | AMRITSAR,74.8722642,31.6339793 74 | NEEMUCH,74.8624092,24.4763852 75 | JAMMU,74.8576539,32.7217819 76 | AHMEDNAGAR,74.7495916,19.0952075 77 | SHRIRAMPUR,74.6576091,19.6222323 78 | RAHURI,74.6488264,19.392678 79 | AJMER,74.6399163,26.4498954 80 | SANGALI,74.5814773,16.8523973 81 | MALEGAON,74.5100291,20.5547497 82 | BELGAUM,74.4976741,15.8496953 83 | RAHATA,74.483335,19.7127021 84 | YEOLA,74.4818698,20.0471229 85 | KOPERGAON,74.4790898,19.8916791 86 | MANMAD,74.4366016,20.2511789 87 | PHALTAN ,74.4360424,17.9844507 88 | CHANDVAD,74.2472779,20.3271277 89 | KOLHAPUR,74.2432527,16.7049873 90 | LASALGAON,74.2326058,20.1491422 91 | SANGAMNER,74.2079648,19.5771387 92 | SATANA,74.2032581,20.598224 93 | ABOHAR,74.1993043,30.1452928 94 | LONAND,74.1861821,18.041706 95 | NIPHAD,74.1093141,20.0799646 96 | SINNAR,74.0006328,19.8530593 97 | PIMPALGAON,73.9873787,20.1699678 98 | SRIGANGANAGAR,73.8771901,29.9038399 99 | JUNNAR,73.87425,19.2031842 100 | CHAKAN,73.8630346,18.7602664 101 | PUNE,73.8567437,18.5204303 102 | NASIK,73.7898023,19.9974533 103 | UDAIPUR,73.712479,24.585445 104 | BIKANER,73.3119159,28.0229348 105 | JODHPUR,73.0243094,26.2389469 106 | NANDGAON,72.9276008,18.3855337 107 | MUMBAI,72.8776559,19.0759837 108 | SURAT,72.8310607,21.1702401 109 | AHMEDABAD,72.5713621,23.022505 110 | DEESA,72.1906721,24.2585031 111 | BHAVNAGAR,72.1519304,21.7644725 112 | MAHUVA,71.7563169,21.0902193 113 | RAJKOT,70.8021599,22.3038945 114 | GONDAL,70.792297,21.9619463 115 | JAMNAGAR,70.05773,22.4707019 116 | KALVAN,73.13054,19.24033 117 | VANI,73.89189,20.33749 118 | BOMBORI,72.87766,19.07598 -------------------------------------------------------------------------------- /time_series/img/Cov_nonstationary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/Cov_nonstationary.png -------------------------------------------------------------------------------- /time_series/img/Mean_nonstationary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/Mean_nonstationary.png -------------------------------------------------------------------------------- /time_series/img/Var_nonstationary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/Var_nonstationary.png -------------------------------------------------------------------------------- /time_series/img/left_merge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/left_merge.png 
-------------------------------------------------------------------------------- /time_series/img/onion_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/onion_small.png -------------------------------------------------------------------------------- /time_series/img/onion_tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/onion_tables.png -------------------------------------------------------------------------------- /time_series/img/peeling_the_onion_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/peeling_the_onion_small.png -------------------------------------------------------------------------------- /time_series/img/pivot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/pivot.png -------------------------------------------------------------------------------- /time_series/img/splitapplycombine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/splitapplycombine.png -------------------------------------------------------------------------------- /time_series/img/subsetcolumns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/subsetcolumns.png -------------------------------------------------------------------------------- /time_series/img/subsetrows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitkaps/machine-learning/3b15a08198e3e151719b75f1d58e9f2ff157b324/time_series/img/subsetrows.png -------------------------------------------------------------------------------- /time_series/state_geocode.csv: -------------------------------------------------------------------------------- 1 | "state","name","lon","lat" 2 | "MS","Maharashtra",75.7138884,19.7514798 3 | "GUJ","Gujarat",71.1923805,22.258652 4 | "MP","Madhya pradesh",78.6568942,22.9734229 5 | "TN","Tamil Nadu",78.6568942,11.1271225 6 | "KNT","Karnataka",75.7138884,15.3172775 7 | "DEL","Delhi",77.2090212,28.6139391 8 | "HR","Haryana",76.085601,29.0587757 9 | "RAJ","Rajasthan",74.2179326,27.0238036 10 | "AP","Andhra Pradesh",79.7399875,15.9128998 11 | "UP","Uttar Pradesh",80.9461592,26.8467088 12 | "JK","Jammu & Kashmir",74.8576539,32.7217819 13 | "BHR","Bihar",85.3131194,25.0960742 14 | "WB","West Bengal",87.8549755,22.9867569 15 | "HP","Himachal Pradesh",77.1733901,31.1048294 16 | "ASM","Assam",92.9375739,26.2006043 17 | "KEL","Kerala",76.2710833,10.8505159 18 | "JH","Jharkhand",85.2799354,23.6101808 19 | "OR","Orissa",85.0985236,20.9516658 20 | "PB","Punjab",75.3412179,31.1471305 21 | "KER","Kerala",76.2710833,10.8505159 22 | "CH","Chandigarh",76.7794179,30.7333148 23 | 
--------------------------------------------------------------------------------