├── .gitignore ├── Battlefin-s-big-data └── notebooks │ └── Battlefin_Analysis.ipynb ├── CrowdFlower ├── .ipynb_checkpoints │ ├── Basic SVM Model-checkpoint.ipynb │ └── Initial Analysis-checkpoint.ipynb ├── AdditionalFeatures.ipynb ├── AnalyzingMistakes.ipynb ├── AndreasMullerImplementation.ipynb ├── Basic SVM Model.ipynb ├── Blending.ipynb ├── CountVectorizer.ipynb ├── EDA.ipynb ├── EnsembleAllModels.ipynb ├── EnsembleFiles.ipynb ├── EnsembleSVCandNB.ipynb ├── Ensembling.ipynb ├── FeatureSelection.ipynb ├── GenerateFeatures.ipynb ├── GridSearchOnNaiveBayes.ipynb ├── Initial Analysis.ipynb ├── KNN distance processed.ipynb ├── Knn unprocessed.ipynb ├── Linear unprocessed.ipynb ├── ModelOnRelevanceVariance.ipynb ├── Non Linear Processed.ipynb ├── Non Linear SVM unprocessed.ipynb ├── OptimizeSVC.ipynb ├── Relevance_Scores.ipynb ├── SpellCorrection.ipynb ├── Stacking.ipynb ├── StemmingAndSVC.ipynb ├── TFIDF_Train_Plus_Test.ipynb ├── Vowpal wabbit.ipynb ├── query_features.py └── scripts │ ├── blending_helper.py │ ├── features.py │ ├── helper.py │ ├── model_train_plus_test.py │ └── models.py ├── HIV-Progression ├── .ipynb_checkpoints │ ├── Basic_Analysis-checkpoint.ipynb │ └── ClassBalancedModel-checkpoint.ipynb ├── Basic_Analysis.ipynb ├── ClassBalancedModel.ipynb ├── data │ ├── test_data.csv │ └── training_data.csv ├── helper.py └── initialSubmission.csv ├── Home Insurance ├── Exploratory Analysis.ipynb ├── Home Insurance.ipynb ├── features.py ├── scripts │ └── helper.py └── utils.py ├── Home-Depot ├── notebooks │ ├── Home-Depot-Analysis.ipynb │ └── Home-Depot-Models.ipynb └── scripts │ ├── cross-validation.py │ ├── dataset.py │ ├── eda.py │ ├── numerical_features.py │ ├── search_map.py │ ├── search_map.pyc │ └── text-features.py ├── PAKDD ├── PAKDD.ipynb ├── PAKDD_EDA.ipynb └── PAKDD_Signal_Processing_Approach.ipynb ├── Predict-Bio-Response ├── Predict-Bio-Response-Exploratory-Data-Analysis.ipynb ├── Predict-Bio-Response-Model-Building.ipynb ├── Predict-Bio-Response-Tree-Based-Models.ipynb └── data │ ├── test.csv │ └── train.csv ├── Predicting-Grants ├── .ipynb_checkpoints │ ├── Data Analysis-checkpoint.ipynb │ └── Description-checkpoint.ipynb ├── Data Analysis.ipynb ├── Description.ipynb └── data │ ├── unimelb_example.csv │ ├── unimelb_test.csv │ └── unimelb_training.csv ├── README.md ├── Rossman-Stores-Sales ├── .gitignore ├── .ipynb_checkpoints │ └── rossman_store_sales-checkpoint.ipynb ├── rossman_store_sales.ipynb └── scripts │ ├── helper.py │ └── rossman.py ├── Santander-Customer-Satisfaction ├── .gitignore ├── notebooks │ ├── Santander Customer Satisfaction - Exploratory Data Analysis.ipynb │ └── Santander Customer Satisfaction - Models.ipynb └── scripts │ ├── analysis.py │ ├── blending.py │ ├── cross-validation.py │ ├── feature_analysis.py │ ├── feature_importance.py │ ├── models.py │ ├── vector_quantization.py │ └── xgboost-tune.py ├── Whats-Cooking ├── .gitignore ├── .ipynb_checkpoints │ └── Whats Cooking-checkpoint.ipynb └── Whats Cooking.ipynb └── cars-cancellation ├── .gitignore ├── .ipynb_checkpoints └── cars_cancellation-checkpoint.ipynb └── cars_cancellation.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | CrowdFlower/data 2 | CrowdFlower/papers 3 | CrowdFlower/submissions 4 | CrowdFlower/.ipynb_checkpoints/*.ipynb 5 | Rossman-Stores-Sales/data 6 | Rossman-Stores-Sales/submissions 7 | liberty-group/data 8 | liberty-group/submissions 9 | PAKDD/data 10 | PAKDD/submissions 11 | .ipynb_checkpoints 12 | .DS_Store 13 | 
Standard-Customer-Satisfaction/data 14 | Standard-Customer-Satisfaction/submissions 15 | plots/ 16 | Battlefin-s-big-data/data/ 17 | Battlefin-s-big-data/submissions/ 18 | Home-Depot/data/ 19 | Home-Depot/submissions/ 20 | Predict-Bio-Response/submissions/ 21 | -------------------------------------------------------------------------------- /CrowdFlower/.ipynb_checkpoints/Initial Analysis-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /CrowdFlower/AndreasMullerImplementation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "import seaborn as sns\n", 15 | "%matplotlib inline" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "%run scripts/helper.py\n", 27 | "%run scripts/features.py" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')\n", 39 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "traindata = prepareText(crowd_train)\n", 51 | "testdata = prepareText(crowd_test)\n", 52 | "y = crowd_train.median_relevance.values" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "'bridal shower decorations Accent Pillow with Heart Design - Red/Black Red satin accent pillow embroidered with a heart in black thread. 
8\" x 8\".'" 66 | ] 67 | }, 68 | "execution_count": 5, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "# lets take a look at some sample training data\n", 75 | "traindata[0]" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "'electric griddle Star-Max 48 in Electric Griddle '" 89 | ] 90 | }, 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "# lets take a look at sample test data\n", 98 | "testdata[0]" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 7, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "from sklearn.metrics import make_scorer\n", 110 | "\n", 111 | "# Weighted kappa scorer\n", 112 | "kappa_scorer = make_scorer(quadratic_weighted_kappa, greater_is_better=True)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 23, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "from sklearn.cross_validation import StratifiedShuffleSplit\n", 124 | "\n", 125 | "sss = StratifiedShuffleSplit(y, 3, train_size=7000, random_state=0)\n", 126 | "train_index, test_index = next(iter(sss))" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 24, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "Xt = np.asarray(traindata)[train_index]\n", 138 | "yt = np.asarray(y)[train_index]" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 25, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "from sklearn.pipeline import Pipeline\n", 150 | "from sklearn.feature_selection import SelectPercentile, chi2\n", 151 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 152 | "from sklearn.cross_validation import cross_val_score\n", 153 | "from sklearn.svm import SVC\n", 154 | "from sklearn.preprocessing import StandardScaler\n", 155 | "from sklearn.metrics import confusion_matrix\n", 156 | "from sklearn.decomposition import TruncatedSVD\n", 157 | "from sklearn.ensemble import GradientBoostingClassifier\n", 158 | "from sklearn.naive_bayes import MultinomialNB" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 41, 164 | "metadata": { 165 | "collapsed": true 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "countvect_char = TfidfVectorizer(min_df=3, max_features=None, \n", 170 | " strip_accents='unicode', analyzer='char',\n", 171 | " ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,\n", 172 | " stop_words = 'english')\n", 173 | "\n", 174 | "countvect_word = TfidfVectorizer(min_df=3, max_features=None, \n", 175 | " strip_accents='unicode', analyzer='word',\n", 176 | " ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,\n", 177 | " stop_words = 'english')\n", 178 | "\n", 179 | "clf1 = MultinomialNB(alpha=.01)\n", 180 | "clf2 = SVC(C=10.0)\n", 181 | "\n", 182 | "ft = FeatureStacker([('chars', countvect_char), ('words', countvect_word)])" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 42, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "pipeline = Pipeline([\n", 194 | " ('vect', ft),\n", 195 | " ('classifier', clf1)\n", 196 | " ])" 197 
| ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 22, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "[ 0.30866723 0.29436923]\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "scores = cross_val_score(pipeline, Xt, \n", 216 | " yt, cv=2, scoring=kappa_scorer,\n", 217 | " n_jobs=1)\n", 218 | "print scores" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 43, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "pipeline1 = Pipeline([\n", 230 | " ('vect', ft),\n", 231 | " ('svd', TruncatedSVD(n_components=140)),\n", 232 | " ('scl', StandardScaler()),\n", 233 | " ('classifier', clf2)\n", 234 | " ])" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 51, 240 | "metadata": { 241 | "collapsed": false 242 | }, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "[ 0.35911812 0.38454006]\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "scores = cross_val_score(pipeline1, Xt, \n", 254 | " yt, cv=2, scoring=kappa_scorer,\n", 255 | " n_jobs=1)\n", 256 | "print scores" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 29, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "from sklearn.cross_validation import train_test_split" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 30, 273 | "metadata": { 274 | "collapsed": true 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "Xtrain, Xvalidation, ytrain, yvalidation = train_test_split(traindata, y, test_size=0.2, random_state=0)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 44, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "Pipeline(steps=[('vect', FeatureStacker(transformer_list=[('chars', TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',\n", 292 | " dtype=, encoding=u'utf-8', input=u'content',\n", 293 | " lowercase=True, max_df=1.0, max_features=None, min_df=3,\n", 294 | " ngram_range=(1, 2), norm... 
vocabulary=None))])), ('classifier', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))])" 295 | ] 296 | }, 297 | "execution_count": 44, 298 | "metadata": {}, 299 | "output_type": "execute_result" 300 | } 301 | ], 302 | "source": [ 303 | "pipeline.fit(traindata, y)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 45, 309 | "metadata": { 310 | "collapsed": false 311 | }, 312 | "outputs": [ 313 | { 314 | "data": { 315 | "text/plain": [ 316 | "Pipeline(steps=[('vect', FeatureStacker(transformer_list=[('chars', TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',\n", 317 | " dtype=, encoding=u'utf-8', input=u'content',\n", 318 | " lowercase=True, max_df=1.0, max_features=None, min_df=3,\n", 319 | " ngram_range=(1, 2), norm...f', max_iter=-1, probability=False, random_state=None,\n", 320 | " shrinking=True, tol=0.001, verbose=False))])" 321 | ] 322 | }, 323 | "execution_count": 45, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "pipeline1.fit(traindata, y)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 33, 335 | "metadata": { 336 | "collapsed": true 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "first_model_predict = pipeline.predict(Xvalidation)\n", 341 | "second_model_predict = pipeline1.predict(Xvalidation)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 34, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "First model individual score 0.4457 \n" 356 | ] 357 | } 358 | ], 359 | "source": [ 360 | "print 'First model individual score %0.4f ' %(quadratic_weighted_kappa(yvalidation, first_model_predict))" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 35, 366 | "metadata": { 367 | "collapsed": false 368 | }, 369 | "outputs": [ 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "Second model individual score 0.5342 \n" 375 | ] 376 | } 377 | ], 378 | "source": [ 379 | "print 'Second model individual score %0.4f ' %(quadratic_weighted_kappa(yvalidation, second_model_predict))" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 36, 385 | "metadata": { 386 | "collapsed": false 387 | }, 388 | "outputs": [ 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | "Average of two models score 0.5573 \n" 394 | ] 395 | } 396 | ], 397 | "source": [ 398 | "print 'Average of two models score %0.4f ' %(quadratic_weighted_kappa(yvalidation, (first_model_predict + second_model_predict) / 2))" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 46, 404 | "metadata": { 405 | "collapsed": false 406 | }, 407 | "outputs": [], 408 | "source": [ 409 | "# prediction on test data set\n", 410 | "pred1 = pipeline.predict(testdata)" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 47, 416 | "metadata": { 417 | "collapsed": true 418 | }, 419 | "outputs": [], 420 | "source": [ 421 | "pred2 = pipeline1.predict(testdata)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 48, 427 | "metadata": { 428 | "collapsed": true 429 | }, 430 | "outputs": [], 431 | "source": [ 432 | "avg_pred = (pred1 + pred2) / 2" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 49, 438 | "metadata": { 439 | "collapsed": true 440 | }, 441 | "outputs": 
[], 442 | "source": [ 443 | "# submission\n", 444 | "make_submission(crowd_test.index.values.astype(int), avg_pred, 'ensemble1.csv')" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": { 451 | "collapsed": true 452 | }, 453 | "outputs": [], 454 | "source": [] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": { 460 | "collapsed": true 461 | }, 462 | "outputs": [], 463 | "source": [] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": { 469 | "collapsed": true 470 | }, 471 | "outputs": [], 472 | "source": [] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": { 478 | "collapsed": true 479 | }, 480 | "outputs": [], 481 | "source": [] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": { 487 | "collapsed": true 488 | }, 489 | "outputs": [], 490 | "source": [] 491 | } 492 | ], 493 | "metadata": { 494 | "kernelspec": { 495 | "display_name": "Python 2", 496 | "language": "python", 497 | "name": "python2" 498 | }, 499 | "language_info": { 500 | "codemirror_mode": { 501 | "name": "ipython", 502 | "version": 2 503 | }, 504 | "file_extension": ".py", 505 | "mimetype": "text/x-python", 506 | "name": "python", 507 | "nbconvert_exporter": "python", 508 | "pygments_lexer": "ipython2", 509 | "version": "2.7.6" 510 | } 511 | }, 512 | "nbformat": 4, 513 | "nbformat_minor": 0 514 | } 515 | -------------------------------------------------------------------------------- /CrowdFlower/Blending.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import warnings\n", 13 | "warnings.filterwarnings('ignore')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 41, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "%run scripts/helper.py\n", 25 | "%run scripts/models.py\n", 26 | "%run scripts/blending_helper.py" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 42, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')\n", 38 | "y = y = crowd_train.median_relevance" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 43, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "train_index, test_index = ssSplit(y, train_size=500 ,random_state=1234)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 44, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "Xtrain = crowd_train.iloc[train_index]\n", 61 | "ytrain = y[train_index]\n", 62 | "\n", 63 | "Xtest = crowd_train.iloc[test_index]\n", 64 | "ytest = y.iloc[test_index]" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 45, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "Xtrain_text = tweak_text(Xtrain)\n", 76 | "Xtest_text = tweak_text(Xtest)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 46, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "corpus = []\n", 88 | "\n", 89 | "for x in 
Xtrain_text:\n", 90 | " corpus.append(x)\n", 91 | "\n", 92 | "for x in Xtest_text:\n", 93 | " corpus.append(x)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 47, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "1516" 107 | ] 108 | }, 109 | "execution_count": 47, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "len(corpus)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 60, 121 | "metadata": { 122 | "collapsed": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "from sklearn.cross_validation import StratifiedKFold\n", 127 | "from sklearn.svm import SVC\n", 128 | "from sklearn.linear_model import LogisticRegression" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 49, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "Xtrain_data, tfv, svd, scl = prepareTrainData(Xtrain_text, corpus)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 50, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "Xtest_data = prepareTestData(Xtest_text, tfv, svd, scl) " 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 51, 156 | "metadata": { 157 | "collapsed": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "skf = list(StratifiedKFold(ytrain, 3))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 56, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "clfs = [SVC(C=10.0, kernel='rbf', gamma=.00, probability=True),\n", 173 | " SVC(C=15.0, kernel='linear', probability=True)\n", 174 | " ]" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 57, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "Creating train and test sets for blending.\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "print \"Creating train and test sets for blending.\"\n", 194 | " \n", 195 | "dataset_blend_train = np.zeros((Xtrain_data.shape[0], len(clfs)))\n", 196 | "dataset_blend_test = np.zeros((Xtest_data.shape[0], len(clfs)))" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 58, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "0 SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n", 211 | " kernel='rbf', max_iter=-1, probability=True, random_state=None,\n", 212 | " shrinking=True, tol=0.001, verbose=False)\n", 213 | "Fold 0\n", 214 | "Fold 1\n", 215 | "Fold 2\n", 216 | "1 SVC(C=15.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n", 217 | " kernel='linear', max_iter=-1, probability=True, random_state=None,\n", 218 | " shrinking=True, tol=0.001, verbose=False)\n", 219 | "Fold 0\n", 220 | "Fold 1\n", 221 | "Fold 2\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "for j, clf in enumerate(clfs):\n", 227 | " print j, clf\n", 228 | " dataset_blend_test_j = np.zeros((Xtest_data.shape[0], len(skf)))\n", 229 | " for i, (train, test) in enumerate(skf):\n", 230 | " print \"Fold\", i\n", 231 | " X_train = Xtrain_data[train]\n", 232 | " y_train = ytrain[train]\n", 233 | " X_test = 
Xtrain_data[test]\n", 234 | " y_test = ytrain[test]\n", 235 | " clf.fit(X_train, y_train)\n", 236 | " y_submission = clf.predict_proba(X_test)[:,1]\n", 237 | " dataset_blend_train[test, j] = y_submission\n", 238 | " dataset_blend_test_j[:, i] = clf.predict_proba(Xtest_data)[:,1]\n", 239 | " dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 68, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "Blending.\n" 254 | ] 255 | }, 256 | { 257 | "ename": "ValueError", 258 | "evalue": "Input contains NaN, infinity or a value too large for dtype('float64').", 259 | "output_type": "error", 260 | "traceback": [ 261 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 262 | "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", 263 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[1;34m\"Blending.\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mclf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mLogisticRegression\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mclf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset_blend_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mytrain\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[1;31m# y_submission = clf.predict_proba(dataset_blend_test)[:,1]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 264 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\sklearn\\linear_model\\logistic.pyc\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 1015\u001b[0m % self.C)\n\u001b[0;32m 1016\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1017\u001b[1;33m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'csr'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfloat64\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"C\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1018\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclasses_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1019\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msolver\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'liblinear'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'newton-cg'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'lbfgs'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 265 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\sklearn\\utils\\validation.pyc\u001b[0m in \u001b[0;36mcheck_X_y\u001b[1;34m(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, 
ensure_min_samples, ensure_min_features, y_numeric)\u001b[0m\n\u001b[0;32m 443\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 444\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcolumn_or_1d\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mwarn\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 445\u001b[1;33m \u001b[0m_assert_all_finite\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 446\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0my_numeric\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkind\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'O'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 447\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfloat64\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 266 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\sklearn\\utils\\validation.pyc\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[1;34m(X)\u001b[0m\n\u001b[0;32m 50\u001b[0m and not np.isfinite(X).all()):\n\u001b[0;32m 51\u001b[0m raise ValueError(\"Input contains NaN, infinity\"\n\u001b[1;32m---> 52\u001b[1;33m \" or a value too large for %r.\" % X.dtype)\n\u001b[0m\u001b[0;32m 53\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 54\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 267 | "\u001b[1;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float64')." 
268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "print \"Blending.\"\n", 273 | "clf = LogisticRegression()\n", 274 | "clf.fit(dataset_blend_train, ytrain)\n", 275 | "# y_submission = clf.predict_proba(dataset_blend_test)[:,1]" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 71, 281 | "metadata": { 282 | "collapsed": false 283 | }, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "False" 289 | ] 290 | }, 291 | "execution_count": 71, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "(dataset_blend_train == np.nan).any()" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": { 304 | "collapsed": true 305 | }, 306 | "outputs": [], 307 | "source": [] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [] 317 | } 318 | ], 319 | "metadata": { 320 | "kernelspec": { 321 | "display_name": "Python 2", 322 | "language": "python", 323 | "name": "python2" 324 | }, 325 | "language_info": { 326 | "codemirror_mode": { 327 | "name": "ipython", 328 | "version": 2 329 | }, 330 | "file_extension": ".py", 331 | "mimetype": "text/x-python", 332 | "name": "python", 333 | "nbconvert_exporter": "python", 334 | "pygments_lexer": "ipython2", 335 | "version": "2.7.6" 336 | } 337 | }, 338 | "nbformat": 4, 339 | "nbformat_minor": 0 340 | } 341 | -------------------------------------------------------------------------------- /CrowdFlower/EnsembleAllModels.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy\n", 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 15, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "best_score = pd.read_csv('./submissions/best_score.csv')\n", 24 | "three_ensemble = pd.read_csv('./submissions/3ensemble.csv')" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 16, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "all_preds = pd.DataFrame({'best_score': best_score.prediction,\n", 36 | " 'three_ensemble': three_ensemble.prediction\n", 37 | " })" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 17, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/html": [ 50 | "
\n", 51 | "\n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | "
best_scorethree_ensemble
best_score 1.000000 0.785756
three_ensemble 0.785756 1.000000
\n", 72 | "
" 73 | ], 74 | "text/plain": [ 75 | " best_score three_ensemble\n", 76 | "best_score 1.000000 0.785756\n", 77 | "three_ensemble 0.785756 1.000000" 78 | ] 79 | }, 80 | "execution_count": 17, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "all_preds.corr()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 18, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "preds = (all_preds.best_score + all_preds.three_ensemble) / 2" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 19, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "import math\n", 109 | "\n", 110 | "preds = [int(math.floor(x)) for x in preds]" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 20, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "[4, 3, 3, 2, 4, 4, 4, 3, 4, 2]" 124 | ] 125 | }, 126 | "execution_count": 20, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "preds[:10]" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 21, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "%run scripts/helper.py" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 22, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "make_submission(unprocessed_ensemble.id, preds, 'ensemble_best_three_ensemble.csv')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "collapsed": true 162 | }, 163 | "outputs": [], 164 | "source": [] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 2", 170 | "language": "python", 171 | "name": "python2" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 2 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython2", 183 | "version": "2.7.6" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 0 188 | } 189 | -------------------------------------------------------------------------------- /CrowdFlower/EnsembleFiles.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "from glob import glob" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 4, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "all_submissions = {}\n", 25 | "for i, filename in enumerate(glob('./submissions/*.csv')):\n", 26 | " all_submissions[i] = pd.read_csv(filename).prediction" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 6, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "all_submissions_df = pd.DataFrame(all_submissions)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 24, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "submissions_corr = 
all_submissions_df.corr()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 35, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "uncorrelated_submissions_pair = []\n", 60 | "\n", 61 | "for i in range(len(all_submissions)):\n", 62 | " for j in range(len(all_submissions)):\n", 63 | " if i != j:\n", 64 | " if submissions_corr.ix[i, j] < .75:\n", 65 | " uncorrelated_submissions_pair.append((i, j))" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 16, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "ids = pd.read_csv('./submissions/3ensemble.csv').id" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 18, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "# Averaging all the submissions\n", 88 | "average_of_all_submissions = all_submissions_df.apply(np.mean, axis=1).map(lambda x: int(x))" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 20, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "%run scripts/helper.py" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 22, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "make_submission(ids, average_of_all_submissions, 'average_of_all_submissions.csv')" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "collapsed": true 118 | }, 119 | "outputs": [], 120 | "source": [] 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "display_name": "Python 2", 126 | "language": "python", 127 | "name": "python2" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 2 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython2", 139 | "version": "2.7.6" 140 | } 141 | }, 142 | "nbformat": 4, 143 | "nbformat_minor": 0 144 | } 145 | -------------------------------------------------------------------------------- /CrowdFlower/EnsembleSVCandNB.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "import seaborn as sns\n", 15 | "%matplotlib inline" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "%run scripts/helper.py" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')\n", 38 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "# fill in the missing np.nan values with empty string\n", 50 | "crowd_train.fillna('', inplace=True, axis=1)\n", 51 | "crowd_test.fillna('', inplace=True, axis=1)" 52 | ] 53 | }, 54 | { 55 | "cell_type": 
"code", 56 | "execution_count": 5, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "traindata = list(crowd_train.apply(lambda x: '%s %s %s' %(x['query'], x['product_title'], x['product_description']), axis=1))\n", 63 | "testdata = list(crowd_test.apply(lambda x: '%s %s %s' %(x['query'], x['product_title'], x['product_description']), axis=1))" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 6, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "y = crowd_train.median_relevance.values" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### Train a support vector machine" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 7, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "from sklearn.feature_extraction.text import TfidfVectorizer" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 41, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "tfv = TfidfVectorizer(min_df=1, max_features=None,\n", 104 | " strip_accents='unicode', analyzer='word',token_pattern=r'\\w{1,}',\n", 105 | " ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,\n", 106 | " stop_words = 'english')" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": true 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "tfv.fit(traindata)\n", 118 | "X = tfv.transform(traindata)\n", 119 | "X_test = tfv.transform(testdata)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 10, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "from sklearn.decomposition import TruncatedSVD\n", 131 | "from sklearn.preprocessing import StandardScaler" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 11, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "svd = TruncatedSVD(n_components=140)\n", 143 | "X_svd = svd.fit_transform(X)\n", 144 | "X_test_svd = svd.transform(X_test)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 12, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "scl = StandardScaler()\n", 156 | "X_svd_scl = scl.fit_transform(X_svd)\n", 157 | "X_test_svd_scl = scl.transform(X_test_svd)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 13, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "from sklearn.svm import SVC" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 14, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "svc = SVC(C=10.0, gamma=.01)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 15, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,\n", 193 | " gamma=0.01, kernel='rbf', max_iter=-1, probability=False,\n", 194 | " random_state=None, shrinking=True, tol=0.001, verbose=False)" 195 | ] 196 | }, 197 | "execution_count": 15, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | 
"svc.fit(X_svd_scl, y)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "### Train a Multinomial NB classifier" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 16, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "from sklearn.naive_bayes import MultinomialNB" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 33, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "classifier = MultinomialNB(alpha=0.1).fit(X, y)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 34, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "0.7186444813827411" 246 | ] 247 | }, 248 | "execution_count": 34, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "quadratic_weighted_kappa(y, classifier.predict(X))" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "### Ensemble their predictions" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 19, 267 | "metadata": { 268 | "collapsed": true 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "svc_pred = svc.predict(X_test_svd_scl)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 35, 278 | "metadata": { 279 | "collapsed": true 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "nb_predict = classifier.predict(X_test)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 37, 289 | "metadata": { 290 | "collapsed": false 291 | }, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "(array([4, 3, 3, 2, 4, 4, 4, 4, 4, 2], dtype=int64),\n", 297 | " array([4, 4, 3, 3, 4, 4, 4, 4, 4, 4], dtype=int64))" 298 | ] 299 | }, 300 | "execution_count": 37, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "svc_pred[:10], nb_predict[:10]" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 38, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "ensemble_predict = (svc_pred + nb_predict) / 2" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 39, 323 | "metadata": { 324 | "collapsed": false 325 | }, 326 | "outputs": [ 327 | { 328 | "data": { 329 | "text/plain": [ 330 | "array([4, 3, 3, 2, 4, 4, 4, 4, 4, 3], dtype=int64)" 331 | ] 332 | }, 333 | "execution_count": 39, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "ensemble_predict[:10]" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 40, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "# Create your first submission file\n", 351 | "submission = pd.DataFrame({\"id\": crowd_test.index.values.astype(int), \"prediction\": ensemble_predict})\n", 352 | "submission.to_csv(\"./submissions/ensembleNBAndSVCoptimized.csv\", index=False)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": { 359 | "collapsed": true 360 | }, 361 | "outputs": [], 362 | "source": [] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": { 368 | "collapsed": true 369 | }, 370 | "outputs": [], 371 | "source": 
[] 372 | } 373 | ], 374 | "metadata": { 375 | "kernelspec": { 376 | "display_name": "Python 2", 377 | "language": "python", 378 | "name": "python2" 379 | }, 380 | "language_info": { 381 | "codemirror_mode": { 382 | "name": "ipython", 383 | "version": 2 384 | }, 385 | "file_extension": ".py", 386 | "mimetype": "text/x-python", 387 | "name": "python", 388 | "nbconvert_exporter": "python", 389 | "pygments_lexer": "ipython2", 390 | "version": "2.7.6" 391 | } 392 | }, 393 | "nbformat": 4, 394 | "nbformat_minor": 0 395 | } 396 | -------------------------------------------------------------------------------- /CrowdFlower/FeatureSelection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import warnings\n", 12 | "warnings.filterwarnings('ignore')\n", 13 | "from sklearn.pipeline import Pipeline" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "%run query_features.py\n", 25 | "%run scripts/helper.py\n", 26 | "%run scripts/model_train_plus_test.py" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "crowd_train = load_file('./data/train.csv/train.csv', None)\n", 38 | "crowd_test = load_file('./data/test.csv/test.csv', None)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "crowd_train = crowd_train[crowd_train.relevance_variance < 0.5]" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 5, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "target = crowd_train.median_relevance.values" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 49, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "# train_index, test_index = ssSplit(target, train_size=8000, random_state=44)\n", 72 | "train_index, test_index = ssSplit(target, train_size=1000, random_state=44)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 50, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "Xt = crowd_train.iloc[train_index]\n", 84 | "Xv = crowd_train.iloc[test_index]\n", 85 | "\n", 86 | "# Xt = crowd_train\n", 87 | "# Xv = crowd_test" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 51, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "yt = target[train_index]\n", 99 | "yv = target[test_index]\n", 100 | "\n", 101 | "# yt = target" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 52, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "Xt_tweaked = tweak_text(Xt)\n", 113 | "Xv_tweaked = tweak_text(Xv)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 53, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "Xfitted, tfv = TFIDF(Xt_tweaked, None)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 54, 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | 
"outputs": [], 134 | "source": [ 135 | "svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)\n", 136 | "\n", 137 | "scl = StandardScaler(copy=True, with_mean=True, with_std=True)\n", 138 | "\n", 139 | "clf = SVC(C=10.0, kernel='linear', degree=3, \n", 140 | " gamma=0.0, coef0=0.0, shrinking=True, probability=False, \n", 141 | " tol=0.001, cache_size=200, class_weight=None, \n", 142 | " verbose=False, max_iter=-1, random_state=None)\n", 143 | "\n", 144 | "keywords = keyword_counter(Xt)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 55, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "features = stack([keywords, Xfitted])" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 56, 161 | "metadata": { 162 | "collapsed": false 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "features_svd = svd.fit_transform(features)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 57, 172 | "metadata": { 173 | "collapsed": false 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "features_scl = scl.fit_transform(features_svd)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "### Feature selection" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 58, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "kappa_scorer = get_kappa_scorer()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "selector = feature_selection(features_scl, yt, clf, 1, None, kappa_scorer, None, 0)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 26, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/plain": [ 219 | "array([ 0.09551443, 0.08587044, 0.09263525, 0.10222593, 0.10446155,\n", 220 | " 0.11196421, 0.11980537, 0.1532042 , 0.18316295, 0.18411899,\n", 221 | " 0.19283421, 0.18717257, 0.23314848, 0.203963 , 0.20825379,\n", 222 | " 0.26279836, 0.30424894, 0.32203741, 0.30186082, 0.25278024,\n", 223 | " 0.31742312, 0.32569698, 0.30012057, 0.27447408, 0.31833882,\n", 224 | " 0.31342121, 0.31756955, 0.26548062, 0.22389084, 0.23368437,\n", 225 | " 0.21912735, 0.21356475, 0.21658899, 0.18873339, 0.197842 ,\n", 226 | " 0.21829272, 0.20929388, 0.21696621, 0.21999229, 0.24068457,\n", 227 | " 0.23622419, 0.20397825, 0.21827953, 0.21135912, 0.23446343,\n", 228 | " 0.19750452, 0.20252151, 0.18871767, 0.17076826, 0.10988198,\n", 229 | " 0.13213942, 0.14247887, 0.17133736, 0.16121156, 0.13371489,\n", 230 | " 0.16145429, 0.16127367, 0.17602269, 0.17288821, 0.15778934,\n", 231 | " 0.17682649, 0.15262014, 0.17968618, 0.17732637, 0.17032855,\n", 232 | " 0.17693857, 0.16577611, 0.1756472 , 0.14748799, 0.13333879,\n", 233 | " 0.13531875, 0.14256728, 0.14421636, 0.1297998 , 0.12999505,\n", 234 | " 0.13111776, 0.13347969, 0.13054624, 0.13066804, 0.11786777,\n", 235 | " 0.10748329, 0.12599375, 0.13164103, 0.12126069, 0.13172245,\n", 236 | " 0.14973141, 0.1381632 , 0.14347926, 0.1494687 , 0.11836732,\n", 237 | " 0.14209801, 0.11081288, 0.12052675, 0.13134992, 0.15391132,\n", 238 | " 0.16443798, 0.19489916, 0.18679589, 0.19408882, 0.19987597,\n", 239 | " 0.19325446, 0.20296309, 0.18869364, 0.17369716, 0.17375526,\n", 240 | " 
0.15641049, 0.16361439, 0.15484795, 0.15943795, 0.16955748,\n", 241 | " 0.1614979 , 0.17480293, 0.14640037, 0.147536 , 0.14943605,\n", 242 | " 0.15927827, 0.16448356, 0.16394893, 0.14914186, 0.15109373,\n", 243 | " 0.148167 , 0.12231477, 0.171927 , 0.1467997 , 0.15001508,\n", 244 | " 0.14589149, 0.14230615, 0.15261398, 0.15127757, 0.15941927,\n", 245 | " 0.13826314, 0.12325342, 0.11749912, 0.11941622, 0.1273668 ,\n", 246 | " 0.13661811, 0.11678706, 0.12084 , 0.13081467, 0.13644204,\n", 247 | " 0.14650248, 0.13725575, 0.12880575, 0.13964549, 0.11886544,\n", 248 | " 0.10889425, 0.1252021 , 0.1262044 , 0.13128633, 0.1070466 ,\n", 249 | " 0.10620304, 0.11012552, 0.12194561, 0.1185141 , 0.10210466,\n", 250 | " 0.12612228, 0.13550932, 0.1437801 , 0.13409203, 0.13157652,\n", 251 | " 0.13754568, 0.13349481, 0.1134101 , 0.1084755 , 0.11184213,\n", 252 | " 0.12546086, 0.10496072, 0.10044754, 0.11875523, 0.12692686,\n", 253 | " 0.12648058, 0.11382819, 0.10869305, 0.10664844, 0.09421044,\n", 254 | " 0.11362754, 0.12713682, 0.12993287, 0.12456679, 0.12112542,\n", 255 | " 0.12439039, 0.12986643, 0.13964398, 0.13674094, 0.11873711,\n", 256 | " 0.12049872, 0.13033784, 0.12472941, 0.14054241, 0.13845084,\n", 257 | " 0.14617703, 0.14190381, 0.12295125, 0.12317129, 0.1206635 ,\n", 258 | " 0.10517095, 0.10900798, 0.10670199, 0.11276843, 0.10644653])" 259 | ] 260 | }, 261 | "execution_count": 26, 262 | "metadata": {}, 263 | "output_type": "execute_result" 264 | } 265 | ], 266 | "source": [ 267 | "selector.grid_scores_" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 27, 273 | "metadata": { 274 | "collapsed": true 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "best_estimator = selector.estimator_" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 36, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "22" 292 | ] 293 | }, 294 | "execution_count": 36, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "selector." 
301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "### Predict on test set" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 28, 313 | "metadata": { 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "keywords_test = keyword_counter(Xv)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 29, 324 | "metadata": { 325 | "collapsed": true 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "Xtest_fitted = tfv.transform(Xv_tweaked)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 30, 335 | "metadata": { 336 | "collapsed": true 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "features_test = stack([keywords_test, Xtest_fitted])" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 32, 346 | "metadata": { 347 | "collapsed": true 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "features_svd_test = svd.transform(features_test)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 33, 357 | "metadata": { 358 | "collapsed": true 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "features_scl_test = scl.transform(features_svd_test)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 37, 368 | "metadata": { 369 | "collapsed": true 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "features_selected_test = selector.transform(features_scl_test)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 39, 379 | "metadata": { 380 | "collapsed": false 381 | }, 382 | "outputs": [ 383 | { 384 | "name": "stdout", 385 | "output_type": "stream", 386 | "text": [ 387 | "Weighted kappa score on test set 0.2867 \n" 388 | ] 389 | } 390 | ], 391 | "source": [ 392 | "print 'Weighted kappa score on test set %0.4f ' % quadratic_weighted_kappa(yv, best_estimator.predict(features_selected_test))" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": { 399 | "collapsed": true 400 | }, 401 | "outputs": [], 402 | "source": [] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": { 408 | "collapsed": true 409 | }, 410 | "outputs": [], 411 | "source": [] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": { 417 | "collapsed": true 418 | }, 419 | "outputs": [], 420 | "source": [] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "collapsed": true 427 | }, 428 | "outputs": [], 429 | "source": [] 430 | } 431 | ], 432 | "metadata": { 433 | "kernelspec": { 434 | "display_name": "Python 2", 435 | "language": "python", 436 | "name": "python2" 437 | }, 438 | "language_info": { 439 | "codemirror_mode": { 440 | "name": "ipython", 441 | "version": 2 442 | }, 443 | "file_extension": ".py", 444 | "mimetype": "text/x-python", 445 | "name": "python", 446 | "nbconvert_exporter": "python", 447 | "pygments_lexer": "ipython2", 448 | "version": "2.7.6" 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 0 453 | } 454 | -------------------------------------------------------------------------------- /CrowdFlower/KNN distance processed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run 
scripts/helper.py\n", 12 | "%run scripts/models.py" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "y = crowd_train.median_relevance.values" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 5, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "from sklearn.cross_validation import cross_val_score\n", 46 | "from sklearn.metrics import make_scorer" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 6, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "kappa_scorer = make_scorer(quadratic_weighted_kappa, greater_is_better = True)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 7, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [ 67 | { 68 | "name": "stderr", 69 | "output_type": "stream", 70 | "text": [ 71 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65497012.jpg\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 72 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)\n", 73 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65516012.jpg\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 74 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)\n", 75 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/6552101\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 76 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' 
% markup)\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "X = tweak_text(crowd_train)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 8, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 93 | "from sklearn.decomposition import TruncatedSVD\n", 94 | "from sklearn.preprocessing import StandardScaler" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 9, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "tfv = TfidfVectorizer(min_df=3, max_df=500, max_features=None, \n", 106 | " strip_accents='unicode', analyzer='word',token_pattern=r'\\w{1,}',\n", 107 | " ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,\n", 108 | " stop_words = 'english')\n", 109 | "\n", 110 | "svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)\n", 111 | "scl = StandardScaler(copy=True, with_mean=True, with_std=True)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 10, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "X_tfv = tfv.fit_transform(X)\n", 123 | "X_svd = svd.fit_transform(X_tfv)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 22, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "from sklearn.neighbors import KNeighborsClassifier\n", 135 | "\n", 136 | "knn = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='brute')" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 21, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "min score 0.5001, max score 0.5005 and mean score 0.5003 \n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "scores = cross_val_score(knn, X_svd, y, scoring=kappa_scorer, n_jobs=1, cv=2)\n", 156 | "print 'min score %0.4f, max score %0.4f and mean score %0.4f ' %(scores.min(), scores.max(), scores.mean())" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 17, 162 | "metadata": { 163 | "collapsed": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "tweaked_model, tfv, svd = build_knn_model(X, y, 'distance')" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 29, 173 | "metadata": { 174 | "collapsed": false 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 30, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "Xtest = tweak_text(crowd_test)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 32, 195 | "metadata": { 196 | "collapsed": false 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "predictions = []\n", 201 | "for i in range(0, len(Xtest), 5000):\n", 202 | " preds = knn_model_predictions(tweaked_model, tfv, svd, Xtest[i:i+5000])\n", 203 | " predictions.append(preds)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 33, 209 | "metadata": { 210 | "collapsed": true 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "all_preds = np.hstack(predictions)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | 
"execution_count": 34, 220 | "metadata": { 221 | "collapsed": false 222 | }, 223 | "outputs": [ 224 | { 225 | "data": { 226 | "text/plain": [ 227 | "(22513,)" 228 | ] 229 | }, 230 | "execution_count": 34, 231 | "metadata": {}, 232 | "output_type": "execute_result" 233 | } 234 | ], 235 | "source": [ 236 | "all_preds.shape" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 35, 242 | "metadata": { 243 | "collapsed": true 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "make_submission(crowd_test.index.values.astype(int), all_preds, './model-submissions/knn_distance_processed.csv')" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": true 255 | }, 256 | "outputs": [], 257 | "source": [] 258 | } 259 | ], 260 | "metadata": { 261 | "kernelspec": { 262 | "display_name": "Python 2", 263 | "language": "python", 264 | "name": "python2" 265 | }, 266 | "language_info": { 267 | "codemirror_mode": { 268 | "name": "ipython", 269 | "version": 2 270 | }, 271 | "file_extension": ".py", 272 | "mimetype": "text/x-python", 273 | "name": "python", 274 | "nbconvert_exporter": "python", 275 | "pygments_lexer": "ipython2", 276 | "version": "2.7.6" 277 | } 278 | }, 279 | "nbformat": 4, 280 | "nbformat_minor": 0 281 | } 282 | -------------------------------------------------------------------------------- /CrowdFlower/Knn unprocessed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run scripts/helper.py\n", 12 | "%run scripts/models.py" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')\n", 24 | "y = crowd_train.median_relevance.values" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "X = prepareText(crowd_train)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "knn_model_word, tfv_knn, svd_knn = build_knn_model(X, y, 'distance', 'word')" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 6, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "Xtest = prepareText(crowd_test)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 8, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "predictions = []\n", 80 | "for i in range(0, len(Xtest), 5000):\n", 81 | " preds = knn_model_predictions(knn_model_word, tfv_knn, svd_knn, Xtest[i:i+5000])\n", 82 | " predictions.append(preds)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 9, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "all_preds = np.hstack(predictions)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 11, 
99 | "metadata": { 100 | "collapsed": true 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "make_submission(crowd_test.index.values.astype(int), all_preds, './model-submissions/knn_unprocessed.csv')" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "outputs": [], 132 | "source": [] 133 | } 134 | ], 135 | "metadata": { 136 | "kernelspec": { 137 | "display_name": "Python 2", 138 | "language": "python", 139 | "name": "python2" 140 | }, 141 | "language_info": { 142 | "codemirror_mode": { 143 | "name": "ipython", 144 | "version": 2 145 | }, 146 | "file_extension": ".py", 147 | "mimetype": "text/x-python", 148 | "name": "python", 149 | "nbconvert_exporter": "python", 150 | "pygments_lexer": "ipython2", 151 | "version": "2.7.6" 152 | } 153 | }, 154 | "nbformat": 4, 155 | "nbformat_minor": 0 156 | } 157 | -------------------------------------------------------------------------------- /CrowdFlower/Linear unprocessed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run scripts/helper.py\n", 12 | "%run scripts/models.py" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')\n", 24 | "y = crowd_train.median_relevance.values" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "X = prepareText(crowd_train)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "linear_model, tfv_linear, select_linear = build_linear_model(X, y, 'word')" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 6, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "Xtest = prepareText(crowd_test)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "predictions = linear_model_predictions(linear_model, tfv_linear, select_linear, Xtest)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "make_submission(crowd_test.index.values.astype(int), predictions, './model-submissions/lin_unprocessed.csv')" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": true 98 | }, 99 | "outputs": [], 100 | "source": [] 101 | }, 
102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [] 110 | } 111 | ], 112 | "metadata": { 113 | "kernelspec": { 114 | "display_name": "Python 2", 115 | "language": "python", 116 | "name": "python2" 117 | }, 118 | "language_info": { 119 | "codemirror_mode": { 120 | "name": "ipython", 121 | "version": 2 122 | }, 123 | "file_extension": ".py", 124 | "mimetype": "text/x-python", 125 | "name": "python", 126 | "nbconvert_exporter": "python", 127 | "pygments_lexer": "ipython2", 128 | "version": "2.7.6" 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 0 133 | } 134 | -------------------------------------------------------------------------------- /CrowdFlower/Non Linear Processed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 18, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run scripts/helper.py\n", 12 | "%run scripts/models.py" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "y = crowd_train.median_relevance.values" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 4, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "from sklearn.cross_validation import cross_val_score\n", 46 | "from sklearn.metrics import make_scorer" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "kappa_scorer = make_scorer(quadratic_weighted_kappa, greater_is_better = True)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 6, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [ 67 | { 68 | "name": "stderr", 69 | "output_type": "stream", 70 | "text": [ 71 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65497012.jpg\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 72 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)\n", 73 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65516012.jpg\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 74 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' 
% markup)\n", 75 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/6552101\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 76 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "X = tweak_text(crowd_train)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 7, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 93 | "from sklearn.decomposition import TruncatedSVD\n", 94 | "from sklearn.preprocessing import StandardScaler" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 14, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "tfv = TfidfVectorizer(min_df=3, max_df=500, max_features=None, \n", 106 | " strip_accents='unicode', analyzer='word',token_pattern=r'\\w{1,}',\n", 107 | " ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,\n", 108 | " stop_words = 'english')\n", 109 | "\n", 110 | "svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)\n", 111 | "scl = StandardScaler(copy=True, with_mean=True, with_std=True)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 15, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "X_tfv = tfv.fit_transform(X)\n", 123 | "X_svd = svd.fit_transform(X_tfv)\n", 124 | "X_scl = scl.fit_transform(X_svd)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 16, 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "from sklearn.svm import SVC\n", 136 | "\n", 137 | "clf = SVC(C=10.0, kernel='rbf', degree=3, \n", 138 | " gamma=0.0, coef0=0.0, shrinking=True, probability=False, \n", 139 | " tol=0.001, cache_size=200, class_weight=None, \n", 140 | " verbose=False, max_iter=-1, random_state=None)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 17, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "min score 0.5302, max score 0.5406 and mean score 0.5354 \n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "scores = cross_val_score(clf, X_scl, y, scoring=kappa_scorer, n_jobs=1, cv=2)\n", 160 | "print 'min score %0.4f, max score %0.4f and mean score %0.4f ' %(scores.min(), scores.max(), scores.mean())" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 20, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "tweaked_model, tfv, svd, scl = build_non_linear_model(X, y)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 21, 177 | "metadata": { 178 | "collapsed": true 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 22, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | 
"outputs": [ 192 | { 193 | "name": "stderr", 194 | "output_type": "stream", 195 | "text": [ 196 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65527\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 197 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)\n", 198 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januarya/14146012.jpg\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 199 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "Xtest = tweak_text(crowd_test)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 23, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "preds = non_linear_model_predictions(tweaked_model, tfv, svd, scl, Xtest)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 24, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "make_submission(crowd_test.index.values.astype(int), preds, './model-submissions/non_linear_processed.csv')" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "kernelspec": { 241 | "display_name": "Python 2", 242 | "language": "python", 243 | "name": "python2" 244 | }, 245 | "language_info": { 246 | "codemirror_mode": { 247 | "name": "ipython", 248 | "version": 2 249 | }, 250 | "file_extension": ".py", 251 | "mimetype": "text/x-python", 252 | "name": "python", 253 | "nbconvert_exporter": "python", 254 | "pygments_lexer": "ipython2", 255 | "version": "2.7.6" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 0 260 | } 261 | -------------------------------------------------------------------------------- /CrowdFlower/Non Linear SVM unprocessed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## SVM model on unprocessed text" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%run scripts/helper.py\n", 19 | "%run scripts/models.py" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "y = crowd_train.median_relevance.values" 42 | ] 43 | }, 44 
| { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "from sklearn.cross_validation import cross_val_score\n", 53 | "from sklearn.metrics import make_scorer" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 5, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "kappa_scorer = make_scorer(quadratic_weighted_kappa, greater_is_better = True)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 6, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "X = prepareText(crowd_train)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 7, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "non_linear_model, tfv, svd, scl = build_non_linear_model(X, y, 'word')" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 8, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 9, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "Xtest = prepareText(crowd_test)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 10, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "predictions = non_linear_model_predictions(non_linear_model, tfv, svd, scl, Xtest)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 32, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "make_submission(crowd_test.index.values.astype(int), predictions, './model-submissions/non_lin_unprocessed.csv')" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": true 138 | }, 139 | "outputs": [], 140 | "source": [] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 2", 146 | "language": "python", 147 | "name": "python2" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 2 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython2", 159 | "version": "2.7.6" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 0 164 | } 165 | -------------------------------------------------------------------------------- /CrowdFlower/OptimizeSVC.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "%run scripts/helper.py" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "crowd_train = load_file('./data/train.csv/train.csv', 
index_col='id')\n", 37 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "# fill in the missing np.nan values with empty string\n", 49 | "crowd_train.fillna('', inplace=True, axis=1)\n", 50 | "crowd_test.fillna('', inplace=True, axis=1)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "metadata": { 57 | "collapsed": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "traindata = list(crowd_train.apply(lambda x: '%s %s %s' %(x['query'], x['product_title'], x['product_description']), axis=1))\n", 62 | "testdata = list(crowd_test.apply(lambda x: '%s %s %s' %(x['query'], x['product_title'], x['product_description']), axis=1))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 6, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "y = crowd_train.median_relevance.values" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 7, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "from sklearn.feature_extraction.text import TfidfVectorizer" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 51, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "tfv = TfidfVectorizer(min_df=3, max_df=0.8, max_features=None,\n", 96 | " strip_accents='unicode', analyzer='word',token_pattern=r'\\w{1,}',\n", 97 | " ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,\n", 98 | " stop_words = 'english')" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 52, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "tfv.fit(traindata)\n", 110 | "X = tfv.transform(traindata)\n", 111 | "X_test = tfv.transform(testdata)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 53, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "from sklearn.cross_validation import train_test_split\n", 123 | "from sklearn.cross_validation import cross_val_score" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 54, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "Xt, Xv, yt, yv = train_test_split(X, y, test_size=0.2, random_state=0)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 55, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "(8126, 44460) (2032, 44460) (8126,) (2032,)\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "print Xt.shape, Xv.shape, yt.shape, yv.shape" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 56, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "from sklearn.decomposition import TruncatedSVD\n", 165 | "from sklearn.preprocessing import StandardScaler" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 57, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "svd = TruncatedSVD(n_components=140)\n", 177 | "Xt_svd = svd.fit_transform(Xt)\n", 178 | "Xv_svd = svd.transform(Xv)" 179 | ] 180 | }, 181 | { 182 | 
"cell_type": "code", 183 | "execution_count": 58, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "# scale features\n", 190 | "\n", 191 | "scl = StandardScaler()\n", 192 | "Xt_svd_scl = scl.fit_transform(Xt_svd)\n", 193 | "Xv_svd_scl = scl.transform(Xv_svd)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 59, 199 | "metadata": { 200 | "collapsed": true 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "from sklearn.metrics import make_scorer\n", 205 | "# Weighted kappa scorer\n", 206 | "kappa_scorer = make_scorer(quadratic_weighted_kappa, greater_is_better=True)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 60, 212 | "metadata": { 213 | "collapsed": true 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "# cross validation\n", 218 | "from sklearn.cross_validation import ShuffleSplit\n", 219 | "from sklearn.svm import SVC" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 61, 225 | "metadata": { 226 | "collapsed": true 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "svc = SVC(C=10.0, gamma=.01)\n", 231 | "cv = ShuffleSplit(Xt_svd_scl.shape[0], n_iter=2, test_size=.1, random_state=1724)\n", 232 | "\n", 233 | "test_scores = cross_val_score(svc, Xt_svd_scl, yt, cv=cv, scoring=kappa_scorer, n_jobs=1)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 62, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "min score 0.472, mean score 0.501 and max score 0.529\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "print 'min score %0.3f, mean score %0.3f and max score %0.3f' %(test_scores.min(), test_scores.mean(), test_scores.max())" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 63, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "from sklearn.ensemble import RandomForestClassifier" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 64, 269 | "metadata": { 270 | "collapsed": true 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "rf = RandomForestClassifier(n_estimators=100)\n", 275 | "test_scores = cross_val_score(rf, Xt_svd_scl, yt, cv=cv, scoring=kappa_scorer, n_jobs=1)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 65, 281 | "metadata": { 282 | "collapsed": false 283 | }, 284 | "outputs": [ 285 | { 286 | "name": "stdout", 287 | "output_type": "stream", 288 | "text": [ 289 | "min score 0.218, mean score 0.259 and max score 0.299\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "print 'min score %0.3f, mean score %0.3f and max score %0.3f' %(test_scores.min(), test_scores.mean(), test_scores.max())" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "collapsed": true 302 | }, 303 | "outputs": [], 304 | "source": [] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "collapsed": true 311 | }, 312 | "outputs": [], 313 | "source": [] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": { 328 | "collapsed": true 329 | }, 330 | "outputs": [], 331 | "source": [] 332 | 
}, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": { 337 | "collapsed": true 338 | }, 339 | "outputs": [], 340 | "source": [] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "collapsed": true 356 | }, 357 | "outputs": [], 358 | "source": [] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "collapsed": true 374 | }, 375 | "outputs": [], 376 | "source": [] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": { 391 | "collapsed": true 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "crowd" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "collapsed": true 403 | }, 404 | "outputs": [], 405 | "source": [] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": { 411 | "collapsed": true 412 | }, 413 | "outputs": [], 414 | "source": [] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": { 420 | "collapsed": true 421 | }, 422 | "outputs": [], 423 | "source": [] 424 | } 425 | ], 426 | "metadata": { 427 | "kernelspec": { 428 | "display_name": "Python 2", 429 | "language": "python", 430 | "name": "python2" 431 | }, 432 | "language_info": { 433 | "codemirror_mode": { 434 | "name": "ipython", 435 | "version": 2 436 | }, 437 | "file_extension": ".py", 438 | "mimetype": "text/x-python", 439 | "name": "python", 440 | "nbconvert_exporter": "python", 441 | "pygments_lexer": "ipython2", 442 | "version": "2.7.6" 443 | } 444 | }, 445 | "nbformat": 4, 446 | "nbformat_minor": 0 447 | } 448 | -------------------------------------------------------------------------------- /CrowdFlower/SpellCorrection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 191, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import sys\n", 12 | "reload(sys)\n", 13 | "sys.setdefaultencoding('utf8')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 178, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import warnings\n", 25 | "warnings.filterwarnings('ignore')\n", 26 | "from sklearn.pipeline import Pipeline" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 179, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "%run query_features.py\n", 38 | "%run scripts/helper.py\n", 39 | "%run scripts/model_train_plus_test.py" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 180, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "crowd_train = load_file('./data/train.csv/train.csv', None)\n", 51 | "crowd_test = load_file('./data/test.csv/test.csv', None)" 52 | ] 53 | }, 54 | { 55 | 
"cell_type": "code", 56 | "execution_count": 181, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "# crowd_train = crowd_train[crowd_train.relevance_variance < 0.5]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 158, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "target = crowd_train.median_relevance.values" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 182, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "# train_index, test_index = ssSplit(target, train_size=8000, random_state=44)\n", 85 | "# train_index, test_index = ssSplit(target, train_size=500, random_state=44)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 183, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "Xt = crowd_train.iloc[train_index]\n", 97 | "Xv = crowd_train.iloc[test_index]\n", 98 | "\n", 99 | "# Xt = crowd_train\n", 100 | "# Xv = crowd_test" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 184, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "yt = target[train_index]\n", 112 | "yv = target[test_index]\n", 113 | "\n", 114 | "# yt = target" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 185, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "correct_map = build_query_correction_map(Xt, crowd_test)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 186, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "def spell_correct_query(x):\n", 137 | " if x not in correct_map:\n", 138 | " return x\n", 139 | " else:\n", 140 | " return correct_map[x]\n", 141 | " \n", 142 | "Xt['query'] = Xt['query'].map(spell_correct_query)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 187, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "Xv['query'] = Xv['query'].map(spell_correct_query)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 188, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "Xt_tweaked = tweak_text(Xt)\n", 165 | "Xv_tweaked = tweak_text(Xv)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 189, 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "Xfitted, tfv = TFIDF(Xt_tweaked, None)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 192, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)\n", 188 | "\n", 189 | "scl = StandardScaler(copy=True, with_mean=True, with_std=True)\n", 190 | "\n", 191 | "clf = SVC(C=10.0, kernel='rbf', degree=3, \n", 192 | " gamma=0.0, coef0=0.0, shrinking=True, probability=False, \n", 193 | " tol=0.001, cache_size=200, class_weight=None, \n", 194 | " verbose=False, max_iter=-1, random_state=None)\n", 195 | "\n", 196 | "keywords = keyword_counter(Xt)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 193, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [], 206 | "source": [ 
207 | "features = stack([keywords, Xfitted])" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 194, 213 | "metadata": { 214 | "collapsed": true 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "pipeline = Pipeline([('svd', svd), ('scl', scl), ('clf', clf)])" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 195, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/plain": [ 231 | "Pipeline(steps=[('svd', TruncatedSVD(algorithm='randomized', n_components=200, n_iter=5,\n", 232 | " random_state=None, tol=0.0)), ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n", 233 | " kernel='rbf', max_iter=-1, probability=False, random_state=None,\n", 234 | " shrinking=True, tol=0.001, verbose=False))])" 235 | ] 236 | }, 237 | "execution_count": 195, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "pipeline.fit(features, yt)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 196, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "keywords_test = keyword_counter(Xv)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 197, 260 | "metadata": { 261 | "collapsed": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "Xtest = tfv.transform(Xv_tweaked)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 198, 271 | "metadata": { 272 | "collapsed": true 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "features_test = stack([keywords_test, Xtest])" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 199, 282 | "metadata": { 283 | "collapsed": true 284 | }, 285 | "outputs": [], 286 | "source": [ 287 | "preds_new_model = pipeline.predict(features_test)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 148, 293 | "metadata": { 294 | "collapsed": true 295 | }, 296 | "outputs": [], 297 | "source": [ 298 | "# make_submission(crowd_test.id.values.astype(int), preds_new_model, 'spell_correct_rel.csv')" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 200, 304 | "metadata": { 305 | "collapsed": false 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "print 'Kappa score on validation set ', (quadratic_weighted_kappa(yv, preds_new_model))" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "### Linear model" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 174, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "linear_model, select = build_linear_model(features, yt)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 175, 333 | "metadata": { 334 | "collapsed": false 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "features_test_selected = select.transform(features_test)\n", 339 | "linear_preds = linear_model.predict(features_test_selected)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 152, 345 | "metadata": { 346 | "collapsed": false 347 | }, 348 | "outputs": [ 349 | { 350 | "name": "stdout", 351 | "output_type": "stream", 352 | "text": [ 353 | "Kappa score on validation set 0.57412956621\n" 354 | ] 355 | } 356 | ], 357 | "source": [ 358 | "# print 
'Kappa score on validation set ', (quadratic_weighted_kappa(yv, linear_preds))" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 176, 364 | "metadata": { 365 | "collapsed": true 366 | }, 367 | "outputs": [], 368 | "source": [ 369 | "ensemble_lin_svm = (preds_new_model + linear_preds) / 2" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 155, 375 | "metadata": { 376 | "collapsed": false 377 | }, 378 | "outputs": [ 379 | { 380 | "name": "stdout", 381 | "output_type": "stream", 382 | "text": [ 383 | "Kappa score on validation set 0.659703361754\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "# print 'Kappa score on validation set ', (quadratic_weighted_kappa(yv, ensemble_lin_svm))" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 177, 394 | "metadata": { 395 | "collapsed": true 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "make_submission(crowd_test.id.values.astype(int), ensemble_lin_svm, 'ensemble_lin_svm_title.csv')" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "## Best score" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 48, 412 | "metadata": { 413 | "collapsed": true 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "best_score_df = pd.read_csv('./submissions/spell_correct_final_only_title.csv')" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 49, 423 | "metadata": { 424 | "collapsed": true 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "best_score = best_score_df.prediction" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 50, 434 | "metadata": { 435 | "collapsed": true 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "ensemble = (preds_new_model + best_score) / 2\n", 440 | "ensemble_int = [int(score) for score in ensemble]" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 51, 446 | "metadata": { 447 | "collapsed": true 448 | }, 449 | "outputs": [], 450 | "source": [ 451 | "make_submission(crowd_test.id.values.astype(int), ensemble_int, 'spell_correct_title_relevance.csv')" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": { 458 | "collapsed": true 459 | }, 460 | "outputs": [], 461 | "source": [] 462 | } 463 | ], 464 | "metadata": { 465 | "kernelspec": { 466 | "display_name": "Python 2", 467 | "language": "python", 468 | "name": "python2" 469 | }, 470 | "language_info": { 471 | "codemirror_mode": { 472 | "name": "ipython", 473 | "version": 2 474 | }, 475 | "file_extension": ".py", 476 | "mimetype": "text/x-python", 477 | "name": "python", 478 | "nbconvert_exporter": "python", 479 | "pygments_lexer": "ipython2", 480 | "version": "2.7.6" 481 | } 482 | }, 483 | "nbformat": 4, 484 | "nbformat_minor": 0 485 | } 486 | -------------------------------------------------------------------------------- /CrowdFlower/TFIDF_Train_Plus_Test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import warnings\n", 13 | "warnings.filterwarnings('ignore')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 30, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "%run 
scripts/helper.py\n", 25 | "%run scripts/model_train_plus_test.py" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')\n", 37 | "y = crowd_train.median_relevance.values" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "# Stratified shuffle split\n", 60 | "train_idx, test_idx = ssSplit(y, train_size=500, random_state=1234)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 6, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "# training and test set\n", 72 | "Xtrain = crowd_train.iloc[train_idx]\n", 73 | "ytrain = y[train_idx]\n", 74 | "\n", 75 | "Xtest = crowd_train.iloc[test_idx]\n", 76 | "ytest = y[test_idx]" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 7, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "Xtrain_text = tweak_text(Xtrain)\n", 88 | "#Xtest_text = tweak_text(Xtest)\n", 89 | "Xtest_text = tweak_text(crowd_test)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 8, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "# whole corpus\n", 101 | "corpus = []\n", 102 | "\n", 103 | "for i in range(len(Xtrain_text)):\n", 104 | " corpus.append(Xtrain_text[i])\n", 105 | "\n", 106 | "for j in range(len(Xtest_text)):\n", 107 | " corpus.append(Xtest_text[j])" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 15, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "Xvalidation_text = tweak_text(Xtest)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 16, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "for k in range(len(Xvalidation_text)):\n", 130 | " corpus.append(Xvalidation_text[k])" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 17, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "24029\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "print len(corpus)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 31, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "Xtrain_fitted, tfv = TFIDF(Xtrain_text, corpus)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 32, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "# Non linear svm model on processed text\n", 172 | "svm, svd, scl = build_non_linear_model(Xtrain_fitted, ytrain)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 33, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "options = {\n", 184 | " 'tfv': tfv,\n", 185 | " 'svd': svd,\n", 186 | " 'scl': scl\n", 187 | "}" 188 | ] 189 | }, 190 | { 191 
| "cell_type": "code", 192 | "execution_count": 34, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "svm_pred_non_lin = make_predictions(svm, options, Xvalidation_text)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 35, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "Kappa score on validation set 0.1151 \n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "print 'Kappa score on validation set %0.4f ' %(quadratic_weighted_kappa(ytest, svm_pred_non_lin))" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 59, 223 | "metadata": { 224 | "collapsed": false 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "# Linear svm model on processed text\n", 229 | "svm_lin, select = build_linear_model(Xtrain_fitted, ytrain)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 64, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "options = {\n", 241 | " 'tfv': tfv,\n", 242 | " 'select': select\n", 243 | "}" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 65, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "svm_pred_lin = make_predictions(svm_lin, options, Xtest_text)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 66, 260 | "metadata": { 261 | "collapsed": false 262 | }, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "Validation set accuracy 0.4350 \n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "print 'Validation set accuracy %0.4f ' %(quadratic_weighted_kappa(ytest, svm_pred_lin))" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "### Unprocessed text" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 68, 286 | "metadata": { 287 | "collapsed": true 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "Xtrain_text_u = prepareText(Xtrain)\n", 292 | "Xtest_text_u = prepareText(Xtest)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 69, 298 | "metadata": { 299 | "collapsed": true 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "# whole corpus\n", 304 | "corpus_u = []\n", 305 | "\n", 306 | "for i in range(len(Xtrain_text_u)):\n", 307 | " corpus_u.append(Xtrain_text_u[i])\n", 308 | "\n", 309 | "for j in range(len(Xtest_text_u)):\n", 310 | " corpus_u.append(Xtest_text_u[j])" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 77, 316 | "metadata": { 317 | "collapsed": true 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "Xtrain_fitted_u, tfv_u = TFIDF(Xtrain_text_u, corpus_u)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 78, 327 | "metadata": { 328 | "collapsed": true 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "# Non linear svm model on unprocessed text\n", 333 | "svm_u, svd_u, scl_u = build_non_linear_model(Xtrain_fitted_u, ytrain)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 79, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "options = {\n", 345 | " 'tfv': tfv_u,\n", 346 | " 'svd': svd_u,\n", 347 | " 'scl': scl_u\n", 348 | "}" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 
353 | "execution_count": 80, 354 | "metadata": { 355 | "collapsed": true 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "svm_pred_non_lin_u = make_predictions(svm_u, options, Xtest_text_u)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 81, 365 | "metadata": { 366 | "collapsed": false 367 | }, 368 | "outputs": [ 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "Validation set accuracy 0.5620 \n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "print 'Validation set accuracy %0.4f ' %(quadratic_weighted_kappa(ytest, svm_pred_non_lin_u))" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 88, 384 | "metadata": { 385 | "collapsed": false 386 | }, 387 | "outputs": [ 388 | { 389 | "name": "stdout", 390 | "output_type": "stream", 391 | "text": [ 392 | "Ensemble of unprocessed and processed non linear SVM models 0.605575423197\n" 393 | ] 394 | } 395 | ], 396 | "source": [ 397 | "print 'Ensemble of unprocessed and processed non linear SVM models ', quadratic_weighted_kappa(ytest, (svm_pred_non_lin + svm_pred_non_lin_u) / 2)" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": { 404 | "collapsed": true 405 | }, 406 | "outputs": [], 407 | "source": [] 408 | } 409 | ], 410 | "metadata": { 411 | "kernelspec": { 412 | "display_name": "Python 2", 413 | "language": "python", 414 | "name": "python2" 415 | }, 416 | "language_info": { 417 | "codemirror_mode": { 418 | "name": "ipython", 419 | "version": 2 420 | }, 421 | "file_extension": ".py", 422 | "mimetype": "text/x-python", 423 | "name": "python", 424 | "nbconvert_exporter": "python", 425 | "pygments_lexer": "ipython2", 426 | "version": "2.7.6" 427 | } 428 | }, 429 | "nbformat": 4, 430 | "nbformat_minor": 0 431 | } 432 | -------------------------------------------------------------------------------- /CrowdFlower/query_features.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | from nltk.corpus import stopwords 3 | import numpy as np 4 | from scipy import sparse 5 | from nltk.stem import PorterStemmer 6 | from nltk.corpus import wordnet as wn 7 | 8 | 9 | stop = stopwords.words('english') 10 | 11 | def is_query_in_response(train): 12 | query_terms = train['query'].split(' ') 13 | response = train['product_title'] + ' ' + train['product_description'] 14 | 15 | stemmer = PorterStemmer() 16 | query_terms_stemmed = [stemmer.stem(q) for q in query_terms] 17 | response_stemmed = ''.join([stemmer.stem(r) for r in response]) 18 | stop = stopwords.words('english') 19 | 20 | keyword = False 21 | 22 | for q in query_terms_stemmed: 23 | if q not in stop: 24 | keyword = True 25 | if response_stemmed.lower().find(q) == -1: 26 | return 0 27 | 28 | if keyword == False: 29 | return 0 30 | else: 31 | return 1 32 | 33 | 34 | 35 | def query_in_response(doc): 36 | query_terms = doc['query'].split(' ') 37 | unique_terms = list(set(query_terms)) 38 | response = doc['product_title'] + ' ' + doc['product_description'] 39 | keyword = False 40 | 41 | for q in unique_terms: 42 | if q not in stop: 43 | keyword = True 44 | 45 | if response.lower().find(q) == -1: 46 | return 0 47 | 48 | if keyword == False: 49 | return 0 50 | else: 51 | return 1 52 | 53 | 54 | def num_query_in_response(doc): 55 | query_terms = doc['query'].split(' ') 56 | unique_terms = list(set(query_terms)) 57 | response = doc['product_title'] + ' ' + doc['product_description'] 58 | count = 
0 59 | 60 | for q in unique_terms: 61 | if q not in stop: 62 | if response.lower().find(q) == -1: 63 | count += 1 64 | 65 | return count 66 | 67 | def query_synonymns_check(x): 68 | query = x['query'].lower() 69 | query_terms = list(set(query.split())) 70 | response = x['product_title'].lower() + ' ' + x['product_description'].lower() 71 | query_synonymns = [] 72 | stop = stopwords.words('english') 73 | 74 | for q in query_terms: 75 | for i, j in enumerate(wn.synsets(q)): 76 | query_synonymns.extend(j.lemma_names) 77 | 78 | count = 0 79 | for qsynonym in query_synonymns: 80 | if qsynonym not in stop and response.find(qsynonym) != -1: 81 | count += 1 82 | 83 | return count 84 | 85 | 86 | 87 | def jaccard(x): 88 | query = x['query'].lower() 89 | title = x['product_title'].lower() 90 | description = x['product_description'].lower() 91 | response = title + ' ' + description 92 | 93 | query_set = set(query.split(' ')) 94 | response_set = set(response.split(' ')) 95 | 96 | query_response_intersection_len = len(query_set & response_set) 97 | query_response_union_len = len(query_set | response_set) 98 | 99 | return (query_response_intersection_len * 1.) / (query_response_union_len) 100 | 101 | 102 | def query_length(x): 103 | return len(x['query'].split(' ')) 104 | 105 | 106 | def keyword_counter(document): 107 | query_in_resp_feat = document.apply(query_in_response, axis=1) 108 | num_query_feat = document.apply(num_query_in_response, axis=1) 109 | # query_synonym_count_feat = document.apply(query_synonymns_check, axis=1) 110 | # query_length_feat = document.apply(query_length, axis=1) 111 | # jaccard_dist = document.apply(jaccard, axis=1) 112 | 113 | # return np.array([query_in_resp_feat, num_query_feat, query_synonym_count_feat]).T 114 | 115 | #query_in_resp_feat = document.apply(is_query_in_response, axis=1) 116 | 117 | return np.array([query_in_resp_feat, num_query_feat]).T 118 | 119 | def stack(features): 120 | features = sparse.hstack(features).tocsr() 121 | return features 122 | 123 | def concat_examples(examples): 124 | total = sparse.vstack(examples).tocsr() 125 | return total 126 | -------------------------------------------------------------------------------- /CrowdFlower/scripts/blending_helper.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.feature_selection import SelectPercentile, chi2 3 | from sklearn.decomposition import TruncatedSVD 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.svm import SVC 6 | from sklearn.naive_bayes import MultinomialNB 7 | from sklearn.neighbors import KNeighborsClassifier 8 | 9 | def prepareTrainData(X, Xwhole): 10 | tfv = TfidfVectorizer(min_df=3, max_features=None, 11 | strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', 12 | ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words = 'english') 13 | 14 | if Xwhole == None: 15 | X = tfv.fit_transform(X) 16 | else: 17 | tfv.fit(Xwhole) 18 | X = tfv.transform(X) 19 | 20 | svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0) 21 | scl = StandardScaler(copy=True, with_mean=True, with_std=True) 22 | 23 | X = svd.fit_transform(X) 24 | X = scl.fit_transform(X) 25 | 26 | return (X, tfv, svd, scl) 27 | 28 | def prepareTestData(Xtest, tfv, svd, scl): 29 | Xtest = tfv.transform(Xtest) 30 | Xtest = svd.transform(Xtest) 31 | Xtest = scl.transform(Xtest) 32 | 33 | return Xtest 34 | 
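# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It shows how
# the two helpers above are meant to be paired: the TF-IDF -> TruncatedSVD ->
# StandardScaler chain is fitted on the training text (optionally with the
# vectorizer fitted on train + test text via the `Xwhole` argument), and the
# fitted objects are then reused on unseen text. `train_text`, `test_text`
# and `y` are assumed inputs (lists of documents and matching labels), and
# the SVC settings below are an assumption rather than repository defaults.
# ---------------------------------------------------------------------------
def example_blend_features(train_text, test_text, y):
    # Fit the vectorizer on train + test text so test-only vocabulary is kept,
    # then project to dense, scaled SVD features.
    X, tfv, svd, scl = prepareTrainData(train_text, train_text + test_text)
    Xtest = prepareTestData(test_text, tfv, svd, scl)

    # Any downstream estimator can consume these features; an RBF SVC is used
    # here purely as an example.
    clf = SVC(C=10.0, kernel='rbf')
    clf.fit(X, y)

    return clf.predict(Xtest)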
-------------------------------------------------------------------------------- /CrowdFlower/scripts/features.py: -------------------------------------------------------------------------------- 1 | from scipy import sparse 2 | import numpy as np 3 | from sklearn.base import BaseEstimator 4 | import pandas as pd 5 | 6 | class FeatureStacker(BaseEstimator): 7 | def __init__(self, transformer_list): 8 | self.transformer_list = transformer_list 9 | 10 | def get_feature_names(self): 11 | pass 12 | 13 | def fit(self, X, y=None): 14 | for name, trans in self.transformer_list: 15 | trans.fit(X, y) 16 | return self 17 | 18 | def transform(self, X): 19 | features = [] 20 | for name, trans in self.transformer_list: 21 | features.append(trans.transform(X)) 22 | 23 | issparse = [sparse.issparse(f) for f in features] 24 | 25 | if np.any(issparse): 26 | features = sparse.hstack(features).tocsr() 27 | else: 28 | features = np.hstack(features) 29 | 30 | return features 31 | 32 | def get_params(self, deep=True): 33 | if not deep: 34 | return super(FeatureStacker, self).get_params(deep=False) 35 | 36 | else: 37 | out = dict(self.transformer_list) 38 | 39 | for name, trans in self.transformer_list: 40 | for key, value in trans.get_params(deep=True).iteritems(): 41 | out['%s__%s' % (name, key)] = value 42 | 43 | return out 44 | -------------------------------------------------------------------------------- /CrowdFlower/scripts/helper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from nltk.stem import PorterStemmer 4 | from nltk.stem.wordnet import WordNetLemmatizer 5 | import re 6 | from HTMLParser import HTMLParser 7 | from sklearn.cross_validation import StratifiedShuffleSplit 8 | from bs4 import BeautifulSoup 9 | from collections import Counter 10 | import difflib 11 | from nltk import bigrams 12 | from sklearn.metrics import make_scorer 13 | 14 | 15 | class MLStripper(HTMLParser): 16 | def __init__(self): 17 | self.reset() 18 | self.fed = [] 19 | def handle_data(self, d): 20 | self.fed.append(d) 21 | def get_data(self): 22 | return ''.join(self.fed) 23 | 24 | def strip_tags(html): 25 | s = MLStripper() 26 | s.feed(html) 27 | return s.get_data() 28 | 29 | def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None): 30 | """ 31 | Returns the confusion matrix between rater's ratings 32 | """ 33 | assert(len(rater_a) == len(rater_b)) 34 | if min_rating is None: 35 | min_rating = min(rater_a + rater_b) 36 | if max_rating is None: 37 | max_rating = max(rater_a + rater_b) 38 | num_ratings = int(max_rating - min_rating + 1) 39 | conf_mat = [[0 for i in range(num_ratings)] 40 | for j in range(num_ratings)] 41 | for a, b in zip(rater_a, rater_b): 42 | conf_mat[a - min_rating][b - min_rating] += 1 43 | return conf_mat 44 | 45 | 46 | def histogram(ratings, min_rating=None, max_rating=None): 47 | """ 48 | Returns the counts of each type of rating that a rater made 49 | """ 50 | if min_rating is None: 51 | min_rating = min(ratings) 52 | if max_rating is None: 53 | max_rating = max(ratings) 54 | num_ratings = int(max_rating - min_rating + 1) 55 | hist_ratings = [0 for x in range(num_ratings)] 56 | for r in ratings: 57 | hist_ratings[r - min_rating] += 1 58 | return hist_ratings 59 | 60 | 61 | def quadratic_weighted_kappa(y, y_pred): 62 | """ 63 | Calculates the quadratic weighted kappa 64 | axquadratic_weighted_kappa calculates the quadratic weighted kappa 65 | value, which is a measure of inter-rater agreement 
between two raters 66 | that provide discrete numeric ratings. Potential values range from -1 67 | (representing complete disagreement) to 1 (representing complete 68 | agreement). A kappa value of 0 is expected if all agreement is due to 69 | chance. 70 | quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b 71 | each correspond to a list of integer ratings. These lists must have the 72 | same length. 73 | The ratings should be integers, and it is assumed that they contain 74 | the complete range of possible ratings. 75 | quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating 76 | is the minimum possible rating, and max_rating is the maximum possible 77 | rating 78 | """ 79 | rater_a = y 80 | rater_b = y_pred 81 | min_rating=None 82 | max_rating=None 83 | rater_a = np.array(rater_a, dtype=int) 84 | rater_b = np.array(rater_b, dtype=int) 85 | assert(len(rater_a) == len(rater_b)) 86 | if min_rating is None: 87 | min_rating = min(min(rater_a), min(rater_b)) 88 | if max_rating is None: 89 | max_rating = max(max(rater_a), max(rater_b)) 90 | conf_mat = confusion_matrix(rater_a, rater_b, 91 | min_rating, max_rating) 92 | num_ratings = len(conf_mat) 93 | num_scored_items = float(len(rater_a)) 94 | 95 | hist_rater_a = histogram(rater_a, min_rating, max_rating) 96 | hist_rater_b = histogram(rater_b, min_rating, max_rating) 97 | 98 | numerator = 0.0 99 | denominator = 0.0 100 | 101 | for i in range(num_ratings): 102 | for j in range(num_ratings): 103 | expected_count = (hist_rater_a[i] * hist_rater_b[j] 104 | / num_scored_items) 105 | d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0) 106 | numerator += d * conf_mat[i][j] / num_scored_items 107 | denominator += d * expected_count / num_scored_items 108 | 109 | return (1.0 - numerator / denominator) 110 | 111 | def load_file(filename, index_col): 112 | if index_col: 113 | return pd.read_csv(filename, index_col=index_col).fillna('') 114 | else: 115 | return pd.read_csv(filename).fillna('') 116 | 117 | def prepareText(df): 118 | return list(df.apply(lambda x: '%s %s %s' %(x['query'], x['product_title'], x['product_description']), axis=1)) 119 | 120 | def how_uncorrelated(ytrue, model1pred, model2pred): 121 | count = 0 122 | 123 | for i in range(len(ytrue)): 124 | if ytrue[i] != model1pred[i] and ytrue[i] != model2pred[i]: 125 | if model1pred[i] != model2pred[i]: 126 | count += 1 127 | 128 | return (count * 1. 
/ len(ytrue)) * 100.0 129 | 130 | def strip_html(data): 131 | return [strip_tags(text) for text in data ] 132 | 133 | def parseHTML(data): 134 | return ' '.join([p.get_text() for p in BeautifulSoup(data)]) 135 | 136 | def stem_text(data): 137 | stemmer = PorterStemmer() 138 | stemmed_text = [] 139 | 140 | for text in data: 141 | words = text.split(' ') 142 | stemmed_words = [] 143 | 144 | for word in words: 145 | stemmed_words.append(stemmer.stem(word.lower())) 146 | 147 | stemmed_text.append(' '.join(stemmed_words)) 148 | 149 | return stemmed_text 150 | 151 | def ssSplit(y, train_size=1000, random_state=0): 152 | sss = StratifiedShuffleSplit(y, 3, train_size=train_size, random_state=random_state) 153 | train_index, test_index = next(iter(sss)) 154 | 155 | return (train_index, test_index) 156 | 157 | ''' 158 | Auto correct a query based on the training set 159 | ''' 160 | def build_query_correction_map(train, test): 161 | # get all queries 162 | queries = set(train['query'].values) 163 | correct_map = {} 164 | 165 | for q in queries: 166 | corrected_q = autocorrect_query(q, train, test) 167 | correct_map[q] = corrected_q 168 | 169 | return correct_map 170 | 171 | def autocorrect_query(query, train=None, test=None, cutoff=0.8): 172 | train_data = train.values[train['query'].values == query, :] 173 | test_data = test.values[test['query'].values == query, :] 174 | 175 | s = '' 176 | 177 | for r in train_data: 178 | s = "%s %s %s"%(s,BeautifulSoup(r[2]).get_text(" ",strip=True),BeautifulSoup(r[3]).get_text(" ",strip=True)) 179 | 180 | for r in test_data: 181 | s = "%s %s %s"%(s,BeautifulSoup(r[2]).get_text(" ",strip=True),BeautifulSoup(r[3]).get_text(" ",strip=True)) 182 | 183 | s = re.findall(r'[\'\"\w]+',s.lower()) 184 | s_bigram = [' '.join(i) for i in bigrams(s)] 185 | s.extend(s_bigram) 186 | 187 | corrected_query = [] 188 | for q in query.lower().split(): 189 | if len(q)<=2: 190 | corrected_query.append(q) 191 | continue 192 | corrected_word = difflib.get_close_matches(q, s,n=1,cutoff=cutoff) 193 | if len(corrected_word) >0: 194 | corrected_query.append(corrected_word[0]) 195 | else : 196 | corrected_query.append(q) 197 | return ' '.join(corrected_query) 198 | 199 | ''' 200 | Gets data for a particular relevance score 201 | ''' 202 | def getText(data, y, label): 203 | return [data[i] for i in range(len(y)) if y[i] == label] 204 | 205 | 206 | def lemmatize_text(data): 207 | lmtzr = WordNetLemmatizer() 208 | lemmatized_text = [] 209 | 210 | for text in data: 211 | words = text.split(' ') 212 | lemmatized_words = [] 213 | 214 | for word in words: 215 | lemmatized_words.append(lmtzr.lemmatize(word.lower())) 216 | 217 | lemmatized_text.append(' '.join(lemmatized_words)) 218 | 219 | return lemmatized_text 220 | 221 | 222 | # def tweak_text(train): 223 | # s_data = [] 224 | # stemmer = PorterStemmer() 225 | 226 | # for i in range(train.shape[0]): 227 | # s = (" ").join(["q"+ z for z in BeautifulSoup(train["query"].iloc[i]).get_text(" ").split(" ")]) + " " + (" ").join(["z"+ z for z in BeautifulSoup(train.product_title.iloc[i]).get_text(" ").split(" ")]) + " " + BeautifulSoup(train.product_description.iloc[i]).get_text(" ") 228 | # s = re.sub("[^a-zA-Z0-9]"," ", s) 229 | # s = (" ").join([stemmer.stem(z) for z in s.split(" ")]) 230 | # s_data.append(s) 231 | 232 | # return s_data 233 | 234 | def tweak_text(train): 235 | s_data = [] 236 | stemmer = PorterStemmer() 237 | 238 | for i in range(train.shape[0]): 239 | s = (" ").join(["q"+ z for z in BeautifulSoup(train["query"].iloc[i]).get_text(" 
").split(" ")]) + " " + (" ").join(["z"+ z for z in BeautifulSoup(train.product_title.iloc[i]).get_text(" ").split(" ")]) 240 | s = re.sub("[^a-zA-Z0-9]"," ", s) 241 | s = (" ").join([stemmer.stem(z) for z in s.split(" ")]) 242 | s_data.append(s) 243 | 244 | return s_data 245 | 246 | 247 | def lemmatize_text(train): 248 | s_data = [] 249 | lmtzr = WordNetLemmatizer() 250 | 251 | for i in range(train.shape[0]): 252 | s = (" ").join(["q"+ z for z in BeautifulSoup(train["query"].iloc[i]).get_text(" ").split(" ")]) + " " + (" ").join(["z"+ z for z in BeautifulSoup(train.product_title.iloc[i]).get_text(" ").split(" ")]) + " " + BeautifulSoup(train.product_description.iloc[i]).get_text(" ") 253 | s = re.sub("[^a-zA-Z0-9]"," ", s) 254 | s = (" ").join([lmtzr.lemmatize(z) for z in s.split(" ")]) 255 | s_data.append(s) 256 | 257 | return s_data 258 | 259 | def most_common(arr): 260 | arr = Counter(arr) 261 | return arr.most_common(1)[0][0] 262 | 263 | def get_kappa_scorer(): 264 | return make_scorer(quadratic_weighted_kappa, greater_is_better=True) 265 | 266 | 267 | ''' 268 | Make a submission file in submissions folder in 269 | current working directory that can be uploaded to 270 | Kaggle. 271 | ''' 272 | 273 | def make_submission(idx, preds, filename): 274 | submission = pd.DataFrame({"id": idx, "prediction": preds}) 275 | submission.to_csv("./submissions/" + filename, index=False) 276 | 277 | -------------------------------------------------------------------------------- /CrowdFlower/scripts/model_train_plus_test.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.feature_selection import SelectPercentile, chi2 3 | from sklearn.decomposition import TruncatedSVD 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.svm import SVC 6 | from sklearn.naive_bayes import MultinomialNB 7 | from sklearn.neighbors import KNeighborsClassifier 8 | from sklearn.feature_selection import SelectPercentile, chi2, RFECV 9 | from sklearn.linear_model import LogisticRegression 10 | 11 | 12 | def TFIDF(Xtrain, Xwhole): 13 | tfv = TfidfVectorizer(min_df=3, max_df=700, max_features=None, 14 | strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', 15 | ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words = 'english') 16 | 17 | if Xwhole == None: 18 | return (tfv.fit_transform(Xtrain), tfv) 19 | else: 20 | tfv.fit(Xwhole) 21 | return (tfv.transform(Xtrain), tfv) 22 | 23 | def build_non_linear_model(Xtrain, y): 24 | 25 | svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0) 26 | scl = StandardScaler(copy=True, with_mean=True, with_std=True) 27 | 28 | Xtrain = svd.fit_transform(Xtrain) 29 | Xtrain = scl.fit_transform(Xtrain) 30 | 31 | clf = SVC(C=10.0, kernel='rbf', degree=3, 32 | gamma=0.0, coef0=0.0, shrinking=True, probability=False, 33 | tol=0.001, cache_size=200, class_weight=None, 34 | verbose=False, max_iter=-1, random_state=None) 35 | 36 | 37 | return (clf.fit(Xtrain, y), svd, scl) 38 | 39 | def build_linear_model(X, y): 40 | select = SelectPercentile(score_func=chi2, percentile=20) 41 | clf = SVC(C=10.0, kernel='linear', probability=True) 42 | 43 | X = select.fit_transform(X, y) 44 | return (clf.fit(X, y), select) 45 | 46 | def build_knn_model(Xtrain, y): 47 | svd = TruncatedSVD(n_components=100, algorithm='randomized', n_iter=5, random_state=None, tol=0.0) 48 | scl = StandardScaler(copy=True, 
with_mean=True, with_std=True) 49 | 50 | Xtrain = svd.fit_transform(Xtrain) 51 | Xtrain = scl.fit_transform(Xtrain) 52 | 53 | clf = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='brute') 54 | 55 | return (clf.fit(Xtrain, y), svd, scl) 56 | 57 | def feature_selection(X, y, pipeline, step=1, cv=None, scoring=None, estimator_params=None, verbose=0): 58 | selector = RFECV(pipeline, step=step, cv=cv, scoring=scoring, estimator_params=estimator_params, verbose=verbose) 59 | selector.fit(X, y) 60 | 61 | return selector 62 | 63 | 64 | def make_predictions(model, options, Xtest): 65 | if options.has_key('tfv'): 66 | Xtest = options['tfv'].transform(Xtest) 67 | 68 | if options.has_key('svd'): 69 | Xtest = options['svd'].transform(Xtest) 70 | 71 | if options.has_key('scl'): 72 | Xtest = options['scl'].transform(Xtest) 73 | 74 | if options.has_key('select'): 75 | Xtest = options['select'].transform(Xtest) 76 | 77 | return model.predict(Xtest) -------------------------------------------------------------------------------- /CrowdFlower/scripts/models.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.feature_selection import SelectPercentile, chi2 3 | from sklearn.decomposition import TruncatedSVD 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.svm import SVC 6 | from sklearn.naive_bayes import MultinomialNB 7 | from sklearn.neighbors import KNeighborsClassifier 8 | 9 | 10 | def vectorizer(analyzerType): 11 | if analyzerType == None: 12 | return TfidfVectorizer(min_df=3, max_features=None, 13 | strip_accents='unicode', analyzer='char', token_pattern=r'\w{1,}', 14 | ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words = 'english') 15 | else: return TfidfVectorizer(min_df=3, max_features=None, 16 | strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', 17 | ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words = 'english') 18 | 19 | def build_linear_model(X, y, analyzerType): 20 | tfv = vectorizer(analyzerType) 21 | select = SelectPercentile(score_func=chi2, percentile=15) 22 | clf = SVC(C=12.0, kernel='linear') 23 | 24 | X = tfv.fit_transform(X) 25 | X = select.fit_transform(X, y) 26 | return (clf.fit(X, y), tfv, select) 27 | 28 | def build_non_linear_model(X, y, analyzerType): 29 | tfv = vectorizer(analyzerType) 30 | svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0) 31 | scl = StandardScaler(copy=True, with_mean=True, with_std=True) 32 | clf = SVC(C=10.0, kernel='rbf', degree=3, 33 | gamma=0.0, coef0=0.0, shrinking=True, probability=False, 34 | tol=0.001, cache_size=200, class_weight=None, 35 | verbose=False, max_iter=-1, random_state=None) 36 | 37 | tfv.fit(X) 38 | X = tfv.transform(X) 39 | X = svd.fit_transform(X) 40 | X = scl.fit_transform(X) 41 | 42 | return (clf.fit(X, y), tfv, svd, scl) 43 | 44 | def build_knn_model(X, y, weights, analyzerType): 45 | tfv = vectorizer(analyzerType) 46 | svd = TruncatedSVD(n_components=250) 47 | 48 | if weights == None: 49 | clf = KNeighborsClassifier(n_neighbors=5, weights='uniform') 50 | else: 51 | clf = KNeighborsClassifier(n_neighbors=5, weights=weights, algorithm='brute') 52 | 53 | tfv.fit(X) 54 | X = tfv.transform(X) 55 | X = svd.fit_transform(X) 56 | 57 | return (clf.fit(X, y), tfv, svd) 58 | 59 | def build_naive_bayes(X, y): 60 | tfv = vectorizer(analyzerType) 61 | clf = 
MultinomialNB(alpha=.01) 62 | 63 | X = tfv.fit_transform(X) 64 | 65 | return (clf.fit(X, y), tfv) 66 | 67 | 68 | def build_stopwords_tweak_model(X, y): 69 | tfv = TfidfVectorizer(min_df=3 ,max_features=None, 70 | strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', 71 | ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words = 'english') 72 | 73 | 74 | svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0) 75 | scl = StandardScaler(copy=True, with_mean=True, with_std=True) 76 | clf = SVC(C=10.0, kernel='rbf', degree=3, 77 | gamma=0.0, coef0=0.0, shrinking=True, probability=False, 78 | tol=0.001, cache_size=200, class_weight=None, 79 | verbose=False, max_iter=-1, random_state=None) 80 | 81 | tfv.fit(X) 82 | X = tfv.transform(X) 83 | X = svd.fit_transform(X) 84 | X = scl.fit_transform(X) 85 | 86 | return (clf.fit(X, y), tfv, svd, scl) 87 | 88 | 89 | ''' 90 | This function can be used for both linear kernel and SGDClassifier 91 | ''' 92 | def linear_model_predictions(model, tfv, select, Xtest): 93 | Xtest = tfv.transform(Xtest) 94 | Xtest = select.transform(Xtest) 95 | 96 | return model.predict(Xtest) 97 | 98 | def non_linear_model_predictions(model, tfv, svd, scl, Xtest): 99 | Xtest = tfv.transform(Xtest) 100 | Xtest = svd.transform(Xtest) 101 | Xtest = scl.transform(Xtest) 102 | 103 | return model.predict(Xtest) 104 | 105 | def naive_bayes_predictions(model, tfv, Xtest): 106 | Xtest = tfv.transform(Xtest) 107 | return model.predict(Xtest) 108 | 109 | def knn_model_predictions(model, tfv, svd, Xtest): 110 | Xtest = tfv.transform(Xtest) 111 | Xtest = svd.transform(Xtest) 112 | 113 | return model.predict(Xtest) -------------------------------------------------------------------------------- /HIV-Progression/.ipynb_checkpoints/Basic_Analysis-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /HIV-Progression/.ipynb_checkpoints/ClassBalancedModel-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /HIV-Progression/helper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.cross_validation import ShuffleSplit 5 | from sklearn.cross_validation import cross_val_score 6 | from sklearn.learning_curve import validation_curve 7 | 8 | def load_data(path, index_col): 9 | """ 10 | Loads a csv file as pandas data frame 11 | """ 12 | return pd.read_csv(path, index_col=index_col) 13 | 14 | def misclassification_percentage(y_true, y_pred): 15 | 16 | """ 17 | Returns misclassification percentage ( misclassified_examples / total_examples * 100.0) 18 | """ 19 | 20 | misclassified_examples = list(y_true == y_pred).count(False) * 1. 
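# The `* 1.` above casts the count to a float so that the percentage
# computed below uses true division rather than Python 2 integer division.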
21 | total_examples = y_true.shape[0] 22 | return (misclassified_examples / total_examples) * 100.0 23 | 24 | def validation_scores(model, X, y, n_iter=5, test_size=0.1): 25 | 26 | cv = ShuffleSplit(X.shape[0], n_iter=n_iter, test_size=test_size, random_state=0) 27 | test_scores = cross_val_score(model, X, y, cv=cv) 28 | 29 | return test_scores 30 | 31 | def plot_validation_curves(param_values, train_scores, test_scores): 32 | for i in range(train_scores.shape[1]): 33 | plt.semilogx(param_values, train_scores[:, i], alpha=0.4, lw=2, c='b') 34 | plt.semilogx(param_values, test_scores[:, i], alpha=0.4, lw=2, c='g') 35 | 36 | plt.ylabel("score for LogisticRegression(fit_intercept=True)") 37 | plt.xlabel("C") 38 | plt.title('Validation curves for the C parameter'); 39 | 40 | def validation_curves(model, X, y, n_iter, test_size): 41 | n_Cs = 10 42 | Cs = np.logspace(-5, 5, n_Cs) 43 | cv = ShuffleSplit(X.shape[0], n_iter=n_iter, test_size=test_size, random_state=0) 44 | 45 | train_scores, test_scores = validation_curve( 46 | model, X, y, 'C', Cs, cv=cv) 47 | 48 | return (Cs, train_scores, test_scores) 49 | 50 | 51 | class BaselineModel: 52 | 53 | """ 54 | Takes in the most majority class and number of training examples 55 | and returns its prediction as elements of majority class. 56 | e.g. In our current training set majority class is 0 then it would 57 | return all values as being 0 as our prediction. 58 | 59 | Any model that we develop must be compared with this baseline model. 60 | """ 61 | 62 | def __init__(self, majority_class, num_examples): 63 | self.majority_class = majority_class 64 | self.num_examples = num_examples 65 | 66 | def predict(self): 67 | return np.asarray([self.majority_class] * self.num_examples) 68 | 69 | class Submission: 70 | """ 71 | Creates a submission in Kaggle competition format 72 | Column 1 will contain Patient Id and Column 2 will be 73 | the prediction. 
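The file is written without a header row; each output line is simply
`patient_id,prediction`, matching the format of initialSubmission.csv.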
74 | """ 75 | def __init__(self, prediction): 76 | self.prediction = prediction 77 | 78 | def create_submission(self, path): 79 | with open(path, 'wb') as outfile: 80 | for (patient_id, pred) in enumerate(self.prediction): 81 | outfile.write(str(patient_id) + ',' + str(pred)) 82 | outfile.write('\n') 83 | outfile.close() -------------------------------------------------------------------------------- /HIV-Progression/initialSubmission.csv: -------------------------------------------------------------------------------- 1 | 0,0 2 | 1,0 3 | 2,0 4 | 3,0 5 | 4,0 6 | 5,0 7 | 6,0 8 | 7,0 9 | 8,0 10 | 9,0 11 | 10,0 12 | 11,0 13 | 12,0 14 | 13,0 15 | 14,0 16 | 15,0 17 | 16,0 18 | 17,0 19 | 18,0 20 | 19,0 21 | 20,0 22 | 21,0 23 | 22,0 24 | 23,0 25 | 24,0 26 | 25,0 27 | 26,0 28 | 27,0 29 | 28,0 30 | 29,0 31 | 30,0 32 | 31,0 33 | 32,0 34 | 33,0 35 | 34,0 36 | 35,0 37 | 36,0 38 | 37,0 39 | 38,0 40 | 39,0 41 | 40,0 42 | 41,0 43 | 42,0 44 | 43,0 45 | 44,0 46 | 45,0 47 | 46,0 48 | 47,0 49 | 48,0 50 | 49,0 51 | 50,0 52 | 51,0 53 | 52,0 54 | 53,0 55 | 54,0 56 | 55,0 57 | 56,0 58 | 57,0 59 | 58,0 60 | 59,0 61 | 60,0 62 | 61,0 63 | 62,0 64 | 63,0 65 | 64,0 66 | 65,0 67 | 66,0 68 | 67,0 69 | 68,0 70 | 69,0 71 | 70,0 72 | 71,0 73 | 72,0 74 | 73,0 75 | 74,0 76 | 75,0 77 | 76,0 78 | 77,0 79 | 78,0 80 | 79,0 81 | 80,0 82 | 81,0 83 | 82,0 84 | 83,0 85 | 84,0 86 | 85,0 87 | 86,0 88 | 87,0 89 | 88,0 90 | 89,0 91 | 90,0 92 | 91,0 93 | 92,0 94 | 93,0 95 | 94,0 96 | 95,0 97 | 96,0 98 | 97,0 99 | 98,0 100 | 99,0 101 | 100,0 102 | 101,0 103 | 102,0 104 | 103,0 105 | 104,0 106 | 105,0 107 | 106,0 108 | 107,0 109 | 108,0 110 | 109,0 111 | 110,0 112 | 111,0 113 | 112,0 114 | 113,0 115 | 114,0 116 | 115,0 117 | 116,0 118 | 117,0 119 | 118,0 120 | 119,0 121 | 120,0 122 | 121,0 123 | 122,0 124 | 123,0 125 | 124,0 126 | 125,0 127 | 126,0 128 | 127,0 129 | 128,0 130 | 129,0 131 | 130,0 132 | 131,0 133 | 132,0 134 | 133,0 135 | 134,0 136 | 135,0 137 | 136,0 138 | 137,0 139 | 138,0 140 | 139,0 141 | 140,0 142 | 141,0 143 | 142,0 144 | 143,0 145 | 144,0 146 | 145,0 147 | 146,0 148 | 147,0 149 | 148,0 150 | 149,0 151 | 150,0 152 | 151,0 153 | 152,0 154 | 153,0 155 | 154,0 156 | 155,0 157 | 156,0 158 | 157,0 159 | 158,0 160 | 159,0 161 | 160,0 162 | 161,0 163 | 162,0 164 | 163,0 165 | 164,0 166 | 165,0 167 | 166,0 168 | 167,0 169 | 168,0 170 | 169,0 171 | 170,0 172 | 171,0 173 | 172,0 174 | 173,0 175 | 174,0 176 | 175,0 177 | 176,0 178 | 177,0 179 | 178,0 180 | 179,0 181 | 180,0 182 | 181,0 183 | 182,0 184 | 183,0 185 | 184,0 186 | 185,0 187 | 186,0 188 | 187,0 189 | 188,0 190 | 189,0 191 | 190,0 192 | 191,0 193 | 192,0 194 | 193,0 195 | 194,0 196 | 195,0 197 | 196,0 198 | 197,0 199 | 198,0 200 | 199,0 201 | 200,0 202 | 201,0 203 | 202,0 204 | 203,0 205 | 204,0 206 | 205,0 207 | 206,0 208 | 207,0 209 | 208,0 210 | 209,0 211 | 210,0 212 | 211,0 213 | 212,0 214 | 213,0 215 | 214,0 216 | 215,0 217 | 216,0 218 | 217,0 219 | 218,0 220 | 219,0 221 | 220,0 222 | 221,0 223 | 222,0 224 | 223,0 225 | 224,0 226 | 225,0 227 | 226,0 228 | 227,0 229 | 228,0 230 | 229,0 231 | 230,0 232 | 231,0 233 | 232,0 234 | 233,0 235 | 234,0 236 | 235,0 237 | 236,0 238 | 237,0 239 | 238,0 240 | 239,0 241 | 240,0 242 | 241,0 243 | 242,0 244 | 243,0 245 | 244,0 246 | 245,0 247 | 246,0 248 | 247,0 249 | 248,0 250 | 249,0 251 | 250,0 252 | 251,0 253 | 252,0 254 | 253,0 255 | 254,0 256 | 255,0 257 | 256,0 258 | 257,0 259 | 258,0 260 | 259,0 261 | 260,0 262 | 261,0 263 | 262,0 264 | 263,0 265 | 264,0 266 | 265,0 267 | 266,0 268 | 267,0 269 | 268,0 270 | 269,0 271 | 
270,0 272 | 271,0 273 | 272,0 274 | 273,0 275 | 274,0 276 | 275,0 277 | 276,0 278 | 277,0 279 | 278,0 280 | 279,0 281 | 280,0 282 | 281,0 283 | 282,0 284 | 283,0 285 | 284,0 286 | 285,0 287 | 286,0 288 | 287,0 289 | 288,0 290 | 289,0 291 | 290,0 292 | 291,0 293 | 292,0 294 | 293,0 295 | 294,0 296 | 295,0 297 | 296,0 298 | 297,0 299 | 298,0 300 | 299,0 301 | 300,0 302 | 301,0 303 | 302,0 304 | 303,0 305 | 304,0 306 | 305,0 307 | 306,0 308 | 307,0 309 | 308,0 310 | 309,0 311 | 310,0 312 | 311,0 313 | 312,0 314 | 313,0 315 | 314,0 316 | 315,0 317 | 316,0 318 | 317,0 319 | 318,0 320 | 319,0 321 | 320,0 322 | 321,0 323 | 322,0 324 | 323,0 325 | 324,0 326 | 325,0 327 | 326,0 328 | 327,0 329 | 328,0 330 | 329,0 331 | 330,0 332 | 331,0 333 | 332,0 334 | 333,0 335 | 334,0 336 | 335,0 337 | 336,0 338 | 337,0 339 | 338,0 340 | 339,0 341 | 340,0 342 | 341,0 343 | 342,0 344 | 343,0 345 | 344,0 346 | 345,0 347 | 346,0 348 | 347,0 349 | 348,0 350 | 349,0 351 | 350,0 352 | 351,0 353 | 352,0 354 | 353,0 355 | 354,0 356 | 355,0 357 | 356,0 358 | 357,0 359 | 358,0 360 | 359,0 361 | 360,0 362 | 361,0 363 | 362,0 364 | 363,0 365 | 364,0 366 | 365,0 367 | 366,0 368 | 367,0 369 | 368,0 370 | 369,0 371 | 370,0 372 | 371,0 373 | 372,0 374 | 373,0 375 | 374,0 376 | 375,0 377 | 376,0 378 | 377,0 379 | 378,0 380 | 379,0 381 | 380,0 382 | 381,0 383 | 382,0 384 | 383,0 385 | 384,0 386 | 385,0 387 | 386,0 388 | 387,0 389 | 388,0 390 | 389,0 391 | 390,0 392 | 391,0 393 | 392,0 394 | 393,0 395 | 394,0 396 | 395,0 397 | 396,0 398 | 397,0 399 | 398,0 400 | 399,0 401 | 400,0 402 | 401,0 403 | 402,0 404 | 403,0 405 | 404,0 406 | 405,0 407 | 406,0 408 | 407,0 409 | 408,0 410 | 409,0 411 | 410,0 412 | 411,0 413 | 412,0 414 | 413,0 415 | 414,0 416 | 415,0 417 | 416,0 418 | 417,0 419 | 418,0 420 | 419,0 421 | 420,0 422 | 421,0 423 | 422,0 424 | 423,0 425 | 424,0 426 | 425,0 427 | 426,0 428 | 427,0 429 | 428,0 430 | 429,0 431 | 430,0 432 | 431,0 433 | 432,0 434 | 433,0 435 | 434,0 436 | 435,0 437 | 436,0 438 | 437,0 439 | 438,0 440 | 439,0 441 | 440,0 442 | 441,0 443 | 442,0 444 | 443,0 445 | 444,0 446 | 445,0 447 | 446,0 448 | 447,0 449 | 448,0 450 | 449,0 451 | 450,0 452 | 451,0 453 | 452,0 454 | 453,0 455 | 454,0 456 | 455,0 457 | 456,0 458 | 457,0 459 | 458,0 460 | 459,0 461 | 460,0 462 | 461,0 463 | 462,0 464 | 463,0 465 | 464,0 466 | 465,0 467 | 466,0 468 | 467,0 469 | 468,0 470 | 469,0 471 | 470,0 472 | 471,0 473 | 472,0 474 | 473,0 475 | 474,0 476 | 475,0 477 | 476,0 478 | 477,0 479 | 478,0 480 | 479,0 481 | 480,0 482 | 481,0 483 | 482,0 484 | 483,0 485 | 484,0 486 | 485,0 487 | 486,0 488 | 487,0 489 | 488,0 490 | 489,0 491 | 490,0 492 | 491,0 493 | 492,0 494 | 493,0 495 | 494,0 496 | 495,0 497 | 496,0 498 | 497,0 499 | 498,0 500 | 499,0 501 | 500,0 502 | 501,0 503 | 502,0 504 | 503,0 505 | 504,0 506 | 505,0 507 | 506,0 508 | 507,0 509 | 508,0 510 | 509,0 511 | 510,0 512 | 511,0 513 | 512,0 514 | 513,0 515 | 514,0 516 | 515,0 517 | 516,0 518 | 517,0 519 | 518,0 520 | 519,0 521 | 520,0 522 | 521,0 523 | 522,0 524 | 523,0 525 | 524,0 526 | 525,0 527 | 526,0 528 | 527,0 529 | 528,0 530 | 529,0 531 | 530,0 532 | 531,0 533 | 532,0 534 | 533,0 535 | 534,0 536 | 535,0 537 | 536,0 538 | 537,0 539 | 538,0 540 | 539,0 541 | 540,0 542 | 541,0 543 | 542,0 544 | 543,0 545 | 544,0 546 | 545,0 547 | 546,0 548 | 547,0 549 | 548,0 550 | 549,0 551 | 550,0 552 | 551,0 553 | 552,0 554 | 553,0 555 | 554,0 556 | 555,0 557 | 556,0 558 | 557,0 559 | 558,0 560 | 559,0 561 | 560,0 562 | 561,0 563 | 562,0 564 | 563,0 565 | 564,0 566 | 565,0 567 | 
566,0 568 | 567,0 569 | 568,0 570 | 569,0 571 | 570,0 572 | 571,0 573 | 572,0 574 | 573,0 575 | 574,0 576 | 575,0 577 | 576,0 578 | 577,0 579 | 578,0 580 | 579,0 581 | 580,0 582 | 581,0 583 | 582,0 584 | 583,0 585 | 584,0 586 | 585,0 587 | 586,0 588 | 587,0 589 | 588,0 590 | 589,0 591 | 590,0 592 | 591,0 593 | 592,0 594 | 593,0 595 | 594,0 596 | 595,0 597 | 596,0 598 | 597,0 599 | 598,0 600 | 599,0 601 | 600,0 602 | 601,0 603 | 602,0 604 | 603,0 605 | 604,0 606 | 605,0 607 | 606,0 608 | 607,0 609 | 608,0 610 | 609,0 611 | 610,0 612 | 611,0 613 | 612,0 614 | 613,0 615 | 614,0 616 | 615,0 617 | 616,0 618 | 617,0 619 | 618,0 620 | 619,0 621 | 620,0 622 | 621,0 623 | 622,0 624 | 623,0 625 | 624,0 626 | 625,0 627 | 626,0 628 | 627,0 629 | 628,0 630 | 629,0 631 | 630,0 632 | 631,0 633 | 632,0 634 | 633,0 635 | 634,0 636 | 635,0 637 | 636,0 638 | 637,0 639 | 638,0 640 | 639,0 641 | 640,0 642 | 641,0 643 | 642,0 644 | 643,0 645 | 644,0 646 | 645,0 647 | 646,0 648 | 647,0 649 | 648,0 650 | 649,0 651 | 650,0 652 | 651,0 653 | 652,0 654 | 653,0 655 | 654,0 656 | 655,0 657 | 656,0 658 | 657,0 659 | 658,0 660 | 659,0 661 | 660,0 662 | 661,0 663 | 662,0 664 | 663,0 665 | 664,0 666 | 665,0 667 | 666,0 668 | 667,0 669 | 668,0 670 | 669,0 671 | 670,0 672 | 671,0 673 | 672,0 674 | 673,0 675 | 674,0 676 | 675,0 677 | 676,0 678 | 677,0 679 | 678,0 680 | 679,0 681 | 680,0 682 | 681,0 683 | 682,0 684 | 683,0 685 | 684,0 686 | 685,0 687 | 686,0 688 | 687,0 689 | 688,0 690 | 689,0 691 | 690,0 692 | 691,0 693 | -------------------------------------------------------------------------------- /Home Insurance/features.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | from sklearn.preprocessing import LabelEncoder 3 | from sklearn.feature_extraction import DictVectorizer 4 | 5 | import pandas as pd 6 | import numpy as np 7 | 8 | class FeatureTransformer(BaseEstimator): 9 | """ 10 | Generate features 11 | """ 12 | 13 | def __init__(self, train, test): 14 | self.X = train 15 | self.X_test = test 16 | # self.X = pd.read_csv('./data/train.csv', parse_dates=['Original_Quote_Date'], index_col='QuoteNumber') 17 | # self.X_test = pd.read_csv('./data/test.csv', parse_dates=['Original_Quote_Date'], index_col='QuoteNumber') 18 | 19 | # self.X = self.X.fillna(-1) 20 | # self.X_test = self.X_test.fillna(-1) 21 | pass 22 | 23 | 24 | def get_feature_names(self): 25 | feature_names = [] 26 | 27 | feature_names.extend(['year_original_quote', 'month_original_quote', 'weekday_original_quote']) 28 | feature_names.extend(self.categorical_features_columns) 29 | feature_names.extend(self.numerical_features_columns) 30 | 31 | return np.array(feature_names) 32 | 33 | def fit(self, X, y=None): 34 | self.fit_transform(X, y) 35 | 36 | return self 37 | 38 | def fit_transform(self, X, y=None): 39 | 40 | date_features = self._process_dates(X) 41 | is_nan_features = self._is_nan(X) 42 | count_nan_features = self._count_nans(X) 43 | count_undecodable = self._count_undecodable(X) 44 | categorical_features = self._process_categorical_features(X) 45 | numerical_features = self._process_numerical_features(X) 46 | 47 | features = [] 48 | 49 | features.append(date_features) 50 | features.append(is_nan_features) 51 | features.append(count_nan_features) 52 | features.append(categorical_features) 53 | features.append(numerical_features) 54 | 55 | features = np.hstack(features) 56 | 57 | return features 58 | 59 | def _process_dates(self, X): 60 | 'Extract year, month and weekday 
of original quote' 61 | 62 | year_original_quote = X.Original_Quote_Date.dt.year 63 | month_original_quote = X.Original_Quote_Date.dt.month 64 | weekday_original_quote = X.Original_Quote_Date.dt.weekday 65 | 66 | return np.array([year_original_quote, month_original_quote, weekday_original_quote]).T 67 | 68 | def _is_nan(self, X): 69 | 'Check to see whether record has any nan value or not' 70 | null_check = X.apply(lambda x: -1 in x.values, axis=1) * 1. 71 | 72 | return np.array(null_check).reshape(-1, 1) 73 | 74 | def _count_nans(self, X): 75 | 'Count number of missing values in a quote' 76 | 77 | count_nans = X.apply(lambda x: list(x.values).count(-1), axis=1) 78 | 79 | return np.array([count_nans]).T 80 | 81 | def _count_undecodable(self, X): 82 | 'Count number of undecodable values (0)' 83 | 84 | count_undecodable = X.apply(lambda x: list(x.values).count(0), axis=1) 85 | 86 | return np.array([count_undecodable]).T 87 | 88 | def _process_categorical_features(self, X): 89 | 'Encode categorical features into numerical features' 90 | 91 | self.categorical_features_columns = X.select_dtypes(['object']).columns 92 | categorical_features = [] 93 | 94 | for cat in self.categorical_features_columns: 95 | lbl = LabelEncoder() 96 | 97 | lbl.fit(pd.concat([self.X[cat], self.X_test[cat]], axis=0)) 98 | 99 | categorical_features.append(lbl.transform(X[cat])) 100 | 101 | return np.array(categorical_features).T 102 | 103 | def _process_numerical_features(self, X): 104 | 'Return numerical features as it is' 105 | 106 | self.numerical_features_columns = X.select_dtypes(['int32', 'int64', 'float32', 'float64']) 107 | 108 | numerical_features = [] 109 | 110 | for col in self.numerical_features_columns: 111 | numerical_features.append(X[col]) 112 | 113 | return np.array(numerical_features).T 114 | 115 | 116 | def transform(self, X): 117 | date_features = self._process_dates(X) 118 | is_nan_features = self._is_nan(X) 119 | count_nan_features = self._count_nans(X) 120 | count_undecodable = self._count_undecodable(X) 121 | categorical_features = self._process_categorical_features(X) 122 | numerical_features = self._process_numerical_features(X) 123 | 124 | features = [] 125 | 126 | features.append(date_features) 127 | features.append(is_nan_features) 128 | features.append(count_nan_features) 129 | features.append(categorical_features) 130 | features.append(numerical_features) 131 | 132 | features = np.hstack(features) 133 | 134 | return features 135 | 136 | -------------------------------------------------------------------------------- /Home Insurance/scripts/helper.py: -------------------------------------------------------------------------------- 1 | from sklearn.cross_validation import StratifiedShuffleSplit 2 | from sklearn.preprocessing import LabelEncoder 3 | from sklearn.grid_search import RandomizedSearchCV 4 | 5 | from collections import defaultdict 6 | 7 | 8 | import pandas as pd 9 | 10 | def encode_labels(train, test): 11 | """ 12 | Encodes the categorical features into numerical features 13 | for both train and test dataframes 14 | """ 15 | 16 | categorical_features = train.select_dtypes(['object']).columns 17 | 18 | for col in categorical_features: 19 | total_values = pd.concat([train[col], test[col]], axis=0) 20 | 21 | lbl = LabelEncoder() 22 | 23 | lbl.fit(total_values) 24 | train[col] = lbl.transform(train[col]) 25 | test[col] = lbl.transform(test[col]) 26 | 27 | return train, test 28 | 29 | def cv_optimize(X, y, cv, clf, parameters): 30 | """ 31 | Randomized Grid search on the parameter 
space to find out the best 32 | parameter settings to produce an accurate model 33 | """ 34 | 35 | rs = RandomizedSearchCV(clf, param_distributions=parameters, cv=cv, scoring='roc_auc') 36 | rs.fit(X, y) 37 | 38 | return rs 39 | 40 | def transform_for_ranked(preds, index): 41 | ranks = [] 42 | 43 | for i, pred in enumerate(preds): 44 | ranks.append((index[i], pred)) 45 | 46 | return ranks 47 | 48 | 49 | def ranked_averaging(predictions): 50 | all_ranks = defaultdict(list) 51 | 52 | for i, preds in enumerate(predictions): 53 | individual_ranks = [] 54 | 55 | for e, pred in enumerate(preds): 56 | individual_ranks.append( (float(pred[1]), e, pred[0]) ) 57 | 58 | for rank, item in enumerate( sorted(individual_ranks) ) : 59 | all_ranks[(item[1], item[2])].append(rank) 60 | 61 | average_ranks = [] 62 | 63 | for k in sorted(all_ranks): 64 | average_ranks.append((sum(all_ranks[k])/len(all_ranks[k]),k)) 65 | 66 | ranked_ranks = [] 67 | 68 | for rank, k in enumerate(sorted(average_ranks)): 69 | ranked_ranks.append((k[1][0],k[1][1],(rank * 1.)/(len(average_ranks)-1))) 70 | 71 | return sorted(ranked_ranks) 72 | -------------------------------------------------------------------------------- /Home Insurance/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def load_data(train_filename='./data/train.csv', test_filename='./data/test.csv'): 6 | 7 | print 'Loading datasets' 8 | 9 | train = pd.read_csv(train_filename, parse_dates=['Original_Quote_Date'], index_col='QuoteNumber') 10 | test = pd.read_csv(test_filename, parse_dates=['Original_Quote_Date'], index_col='QuoteNumber') 11 | 12 | print 'Setting Quote Number as index' 13 | 14 | return train, test 15 | 16 | 17 | def prepare_sample(train, n=1000): 18 | features = train.columns.drop('QuoteConversion_Flag') 19 | 20 | train_2013 = train[train.Original_Quote_Date.dt.year==2013].sample(n=n) 21 | train_2014 = train[train.Original_Quote_Date.dt.year==2014].sample(n=n) 22 | train_2015 = train[train.Original_Quote_Date.dt.year==2015].sample(n=n) 23 | 24 | train_merged = pd.concat([train_2013, train_2014, train_2015], axis=0) 25 | train_merged_shuffle = train_merged.iloc[np.random.permutation(len(train_merged))] 26 | 27 | X = train_merged_shuffle[features] 28 | y = train_merged_shuffle['QuoteConversion_Flag'] 29 | 30 | return X, y 31 | 32 | def random_sample(train, n): 33 | features = train.columns.drop('QuoteConversion_Flag') 34 | 35 | train = train.take(np.random.permutation(len(train))[:n]) 36 | 37 | X = train[features] 38 | y = train['QuoteConversion_Flag'] 39 | 40 | return X, y 41 | -------------------------------------------------------------------------------- /Home-Depot/scripts/cross-validation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Mar 30 23:33:52 2016 4 | 5 | @author: abhishek 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | import cPickle 10 | 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.preprocessing import StandardScaler 13 | from sklearn.linear_model import LinearRegression 14 | from sklearn.pipeline import Pipeline 15 | from sklearn.svm import SVR 16 | from sklearn.metrics import mean_squared_error 17 | 18 | np.random.seed(1729) 19 | 20 | with open('./data/synthesized/train_processed.pkl', 'r') as infile: 21 | train = cPickle.load(infile) 22 | infile.close() 23 | 24 | with 
open('./data/synthesized/test_processed.pkl', 'r') as infile: 25 | test = cPickle.load(infile) 26 | infile.close() 27 | 28 | 29 | 30 | 31 | X_train, X_test, y_train, y_test = train_test_split(corpus_svd, train.relevance, test_size=0.3, 32 | random_state=44) 33 | 34 | 35 | scaler = StandardScaler() 36 | svr = SVR() 37 | 38 | pipeline = Pipeline([('scaler', scaler), ('svr', svr)]) 39 | pipeline.fit(X_train, y_train) 40 | 41 | predsTrain = pipeline.predict(X_train) 42 | predsTest = pipeline.predict(X_test) 43 | 44 | print 'RMSE on training examples %f ' %(np.sqrt(mean_squared_error(y_train, predsTrain))) 45 | print 'RMSE on test examples %f ' %(np.sqrt(mean_squared_error(y_test, predsTest))) 46 | -------------------------------------------------------------------------------- /Home-Depot/scripts/dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 10 14:30:44 2016 4 | 5 | @author: abhishek 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | import re 10 | from search_map import spell_check_dict 11 | from nltk import word_tokenize 12 | from sklearn.metrics.pairwise import cosine_similarity 13 | from sklearn.metrics import jaccard_similarity_score 14 | from sklearn.feature_extraction.text import TfidfVectorizer 15 | from sklearn.decomposition import TruncatedSVD 16 | from scipy import sparse as sps 17 | 18 | class Dataset(object): 19 | def __init__(self, train, test): 20 | self.train = train.copy() 21 | self.test = test.copy() 22 | 23 | self.y = train.relevance 24 | 25 | self.tfidf_vectorizer = TfidfVectorizer() 26 | 27 | def correct_search_terms(self, train, test): 28 | def correct_term(q): 29 | if q in spell_check_dict: 30 | return spell_check_dict[q] 31 | else: 32 | return q 33 | 34 | train_search_terms = train.search_term 35 | test_search_terms = test.search_term 36 | 37 | return train_search_terms, test_search_terms 38 | 39 | def stem_word(self, word): 40 | suffixes = ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment'] 41 | 42 | for suffix in suffixes: 43 | if word.endswith(suffix): 44 | return word[:-len(suffix)] 45 | 46 | return word 47 | 48 | def tokenize(self, sentence): 49 | return word_tokenize(sentence) 50 | 51 | def stemming(self, sentence): 52 | tokens = self.tokenize(sentence) 53 | stemmed = ' '.join([self.stem_word(token) for token in tokens]) 54 | 55 | return stemmed 56 | 57 | def filter_characters(self, char): 58 | return char == '\n' or 32 <= ord(char) <= 126 59 | 60 | def sanitize_title(self, sentence): 61 | return filter(self.filter_characters, sentence) 62 | 63 | def preprocessing(self, to_stem=False): 64 | corrected_q_train, corrected_q_test = self.correct_search_terms(self.train, self.test) 65 | 66 | self.train['search_term'] = corrected_q_train 67 | self.test['search_term'] = corrected_q_test 68 | 69 | self.train['search_term'] = self.train.search_term.map(lambda x: x.lower()) 70 | self.test['search_term'] = self.test.search_term.map(lambda x: x.lower()) 71 | 72 | 73 | self.train['product_title'] = self.train.product_title.map(self.sanitize_title) 74 | self.test['product_title'] = self.test.product_title.map(self.sanitize_title) 75 | 76 | if to_stem: 77 | self.train['search_term'] = self.train.search_term.map(self.stemming) 78 | self.test['search_term'] = self.test.search_term.map(self.stemming) 79 | 80 | def num_tokens_query(self, query): 81 | return len(word_tokenize(query)) 82 | 83 | def num_tokens_title(self, title): 84 | return 
len(word_tokenize(title)) 85 | 86 | def cosine_similarity_score(self, row): 87 | query = row['search_term'] 88 | title = row['product_title'] 89 | 90 | corpus = np.array([query, title]) 91 | tfidf_matrix = self.tfidf_vectorizer.fit_transform(corpus) 92 | 93 | normal_array = tfidf_matrix.toarray() 94 | 95 | query_repr = normal_array[0].reshape(-1, 1) 96 | title_repr = normal_array[1].reshape(-1, 1) 97 | 98 | return cosine_similarity(query_repr, title_repr)[0][0] 99 | 100 | def jaccard_score(self, row): 101 | query = row['search_term'] 102 | title = row['product_title'] 103 | 104 | corpus = np.array([query, title]) 105 | tfidf_matrix = self.tfidf_vectorizer.fit_transform(corpus) 106 | 107 | return jaccard_similarity_score(tfidf_matrix[0], tfidf_matrix[1]) 108 | 109 | 110 | def numerical_features(self): 111 | """ 112 | 1. Number of tokens in the query 113 | 2. Number of tokens in the title 114 | 3. Cosine similarity between title and query 115 | """ 116 | 117 | self.train['num_query_tokens'] = self.train.search_term.map(self.num_tokens_query) 118 | self.test['num_query_tokens'] = self.test.search_term.map(self.num_tokens_query) 119 | 120 | self.train['num_title_tokens'] = self.train.product_title.map(self.num_tokens_title) 121 | self.test['num_title_tokens'] = self.test.product_title.map(self.num_tokens_title) 122 | 123 | self.train['cosine_score'] = self.train.apply(self.cosine_similarity_score, axis=1) 124 | self.test['cosine_score'] = self.test.apply(self.cosine_similarity_score, axis=1) 125 | 126 | def text_features(self): 127 | corpus_train = self.train.apply(lambda x: '%s %s' %(x['product_title'], x['search_term']), axis=1) 128 | corpus_test = self.test.apply(lambda x: '%s %s' %(x['product_title'], x['search_term']), axis=1) 129 | 130 | tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=3) 131 | corpus = tfidf.fit_transform(corpus_train.values) 132 | corpus_test = tfidf.transform(corpus_test.values) 133 | 134 | svd = TruncatedSVD(n_components=200) 135 | 136 | self.corpus_svd = svd.fit_transform(corpus) 137 | self.corpus_test_svd = svd.transform(corpus_test) 138 | 139 | def combine_features(self): 140 | features = ['num_query_tokens', 'num_title_tokens', 'cosine_score'] 141 | 142 | numerical_features = self.train[features] 143 | numerical_features_test = self.test[features] 144 | 145 | self.processed_features_train = sps.hstack([numerical_features, self.corpus_svd]) 146 | self.processesd_features_test = sps.hstack([numerical_features_test, self.corpus_test_svd]) 147 | 148 | 149 | train = pd.read_csv('./data/train.csv') 150 | test = pd.read_csv('./data/test.csv') 151 | 152 | dataset = Dataset(train, test) 153 | dataset.preprocessing() 154 | dataset.text_features() 155 | dataset.numerical_features() 156 | dataset.combine_features() 157 | 158 | -------------------------------------------------------------------------------- /Home-Depot/scripts/eda.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 5 08:58:07 2016 4 | 5 | @author: abhishek 6 | """ 7 | import pandas as pd 8 | import re 9 | 10 | # import libraries used for nlp 11 | from __future__ import division 12 | from nltk import word_tokenize 13 | from nltk import FreqDist 14 | from search_map import spell_check_dict 15 | 16 | # load train, test, description and attributes files 17 | train = pd.read_csv('./data/train.csv', index_col='id') 18 | test = pd.read_csv('./data/test.csv', index_col='id') 19 | 20 | description = 
pd.read_csv('./data/product_descriptions.csv') 21 | attributes = pd.read_csv('./data/attributes.csv') 22 | 23 | 24 | ## Frequency Analysis 25 | def default_tokenizer(sentence): 26 | return sentence.split(' ') 27 | 28 | def tokenize(sentence, tokenizer_type='word'): 29 | if tokenizer_type == 'word': 30 | return word_tokenize(sentence) 31 | else: 32 | return default_tokenizer(sentence) 33 | 34 | def tokenize_sentences(sentences, n): 35 | tokens = [] 36 | 37 | for i in range(0, n): 38 | tokens.extend(tokenize(sentences[i])) 39 | 40 | return tokens 41 | 42 | def frequency_analysis(search_terms, n=50, num_terms=5): 43 | tokens_list = tokenize_sentences(search_terms, n=n) 44 | fdist = FreqDist(tokens_list) 45 | 46 | return fdist.most_common(n=num_terms) 47 | 48 | 49 | ## Relevance scores based on different patterns on training corpus 50 | def relevance_scores_by_pattern(train, pattern): 51 | query_list = [(idx, w) for (idx, w) in enumerate(train.search_term.values) if re.search(pattern, w)] 52 | relevance_scores = [train.iloc[idx]['relevance'] for (idx, w) in query_list] 53 | 54 | return relevance_scores 55 | 56 | 57 | # Do spelling mistakes have an effect on relevance scores? 58 | def spelling_mistakes_effect(train): 59 | train = train.copy() 60 | boolean_indicator = [1 if q in spell_check_dict else 0 for q in train.search_term] 61 | train['is_incorrect'] = boolean_indicator 62 | 63 | mean_relevance_score_correct = train[train.is_incorrect == 0].relevance.mean() 64 | mean_relevance_score_incorrect = train[train.is_incorrect == 1].relevance.mean() 65 | 66 | return mean_relevance_score_correct, mean_relevance_score_incorrect 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /Home-Depot/scripts/numerical_features.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Kaggle Home Depot Challenge 4 | 5 | Evaluation Metric : RMSE 6 | """ 7 | from __future__ import division 8 | 9 | import pandas as pd 10 | import re 11 | from sklearn.feature_extraction import text 12 | from difflib import SequenceMatcher as seq_matcher 13 | import cPickle 14 | 15 | 16 | pattern = re.compile(r'\b(' + r'|'.join(text.ENGLISH_STOP_WORDS) + r')\b\s*') 17 | 18 | 19 | # load train and test set 20 | train = pd.read_csv('../data/train.csv') 21 | test = pd.read_csv('../data/test.csv') 22 | 23 | # load product description and atttributes data 24 | description = pd.read_csv('../data/product_descriptions.csv') 25 | attributes = pd.read_csv('../data/attributes.csv') 26 | 27 | brand = attributes[attributes.name == 'MFG Brand Name'][['product_uid', 'value']].rename(columns={'value': 'brand'}) 28 | color = attributes[attributes.name == 'Color Family'][['product_uid', 'value']].rename(columns={'value': 'color'}) 29 | 30 | # most of the queries are relevant 31 | search_term_frequency_train = train.groupby('product_uid').size().reset_index() 32 | search_term_frequency_train.columns = ['product_uid', 'query_frequency'] 33 | 34 | search_term_frequency_test = test.groupby('product_uid').size().reset_index() 35 | search_term_frequency_test.columns = ['product_uid', 'query_frequency'] 36 | 37 | 38 | # merge this with train and test set 39 | train = pd.merge(train, search_term_frequency_train, on='product_uid', how='left') 40 | test = pd.merge(test, search_term_frequency_test, on='product_uid', how='left') 41 | 42 | # merge with description, brand and color dataset 43 | train = pd.merge(train, description, 
on='product_uid', how='left') 44 | test = pd.merge(test, description, on='product_uid', how='left') 45 | 46 | train = pd.merge(train, brand, on='product_uid', how='left') 47 | test = pd.merge(test, brand, on='product_uid', how='left') 48 | 49 | train = pd.merge(train, color, on='product_uid', how='left') 50 | test = pd.merge(test, color, on='product_uid', how='left') 51 | 52 | # missing values 53 | train = train.fillna('') 54 | test = test.fillna('') 55 | 56 | 57 | # some preprocessing functions 58 | def filter_characters(char): 59 | return char == '\n' or 32 <= ord(char) <= 126 60 | 61 | def sanitize(s): 62 | s = s.replace('ft.', 'feet') 63 | s = s.replace('cu.', 'cubic') 64 | s = s.replace('mm', 'milimeters') 65 | s = s.replace('oz.', 'ounces') 66 | s = s.replace('btu', 'british thermal unit') 67 | s = s.replace('otr', 'over the range') 68 | s = s.replace('lb.', 'pounds') 69 | s = s.replace('in.', 'inches') 70 | s = s.replace('&', 'and') 71 | s = s.replace('sq.', 'square') 72 | s = s.replace('gal.', 'gallon') 73 | 74 | return s 75 | 76 | def preprocess(s): 77 | s = filter(filter_characters, s) 78 | s = s.lower() 79 | s = sanitize(s) 80 | 81 | return pattern.sub('', s) 82 | 83 | # sanitize training and test 84 | train.loc[:, 'product_title'] = train.product_title.map(preprocess) 85 | train.loc[:, 'search_term'] = train.search_term.map(preprocess) 86 | 87 | train.loc[:, 'product_description'] = train.product_description.map(preprocess) 88 | train.loc[:, 'brand'] = train.brand.map(preprocess) 89 | train.loc[:, 'color'] = train.color.map(preprocess) 90 | 91 | 92 | test.loc[:, 'product_title'] = test.product_title.map(preprocess) 93 | test.loc[:, 'search_term'] = test.search_term.map(preprocess) 94 | 95 | test.loc[:, 'product_description'] = test.product_description.map(preprocess) 96 | test.loc[:, 'brand'] = test.brand.map(preprocess) 97 | test.loc[:, 'color'] = test.color.map(preprocess) 98 | 99 | 100 | 101 | # feature engineering 102 | def query_title_overlap(row): 103 | query = row['search_term'] 104 | title = row['product_title'] 105 | query_words = query.split() 106 | 107 | count_overlap = 0 108 | for word in query_words: 109 | if query in title: 110 | count_overlap += 1 111 | 112 | return count_overlap / (len(query_words) + 1) 113 | 114 | def query_description_overlap(row): 115 | query = row['search_term'] 116 | description = row['product_description'] 117 | query_words = query.split() 118 | 119 | count_overlap = 0 120 | for word in query_words: 121 | if query in description: 122 | count_overlap += 1 123 | 124 | return count_overlap / (len(query_words) + 1) 125 | 126 | def brand_matches(row): 127 | query = row['search_term'] 128 | brand = row['brand'] 129 | query_words = query.split() 130 | 131 | count_overlap = 0 132 | for word in query_words: 133 | if query in brand: 134 | count_overlap += 1 135 | 136 | return count_overlap 137 | 138 | def compute_one_edit_distance(row): 139 | query = row['search_term'] 140 | title = row['product_title'] 141 | 142 | return 1 - seq_matcher(None, query, title).ratio() 143 | 144 | train.loc[:, 'num_words_in_query'] = train.search_term.map(lambda x: len(x.split())) 145 | test.loc[:, 'num_words_in_query'] = test.search_term.map(lambda x: len(x.split())) 146 | 147 | train.loc[:, 'query_title_overlap'] = train.apply(query_title_overlap, axis=1) 148 | test.loc[:, 'query_title_overlap'] = test.apply(query_title_overlap, axis=1) 149 | 150 | train.loc[:, 'one_edit_distance'] = train.apply(compute_one_edit_distance, axis=1) 151 | test.loc[:, 'one_edit_distance'] 
= test.apply(compute_one_edit_distance, axis=1) 152 | 153 | train.loc[:, 'query_description_overlap'] = train.apply(query_description_overlap, axis=1) 154 | test.loc[:, 'query_description_overlap'] = test.apply(query_description_overlap, axis=1) 155 | 156 | train.loc[:, 'brand_match'] = train.apply(brand_matches, axis=1) 157 | test.loc[:, 'brand_match'] = test.apply(brand_matches, axis=1) 158 | 159 | # serialize the object 160 | with open('../data/train_processed.pkl', 'w') as outfile: 161 | cPickle.dump(train, outfile) 162 | outfile.close() 163 | 164 | with open('../data/test_processed.pkl', 'w') as outfile: 165 | cPickle.dump(test, outfile) 166 | outfile.close() 167 | 168 | -------------------------------------------------------------------------------- /Home-Depot/scripts/search_map.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numb3r33/Kaggle-Competitions/6c4062dbcbd80869a2e0f5b93723ad217963d35b/Home-Depot/scripts/search_map.pyc -------------------------------------------------------------------------------- /Home-Depot/scripts/text-features.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Mar 27 13:50:57 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | 9 | import pandas as pd 10 | import numpy as np 11 | import re 12 | from nltk.stem import PorterStemmer 13 | from sklearn.feature_extraction.text import TfidfVectorizer 14 | from sklearn.decomposition import TruncatedSVD 15 | from search_map import spell_check_dict 16 | 17 | 18 | stemmer = PorterStemmer() 19 | 20 | # load train and test set 21 | train = pd.read_csv('../data/train.csv') 22 | test = pd.read_csv('../data/test.csv') 23 | 24 | # load product description and atttributes data 25 | description = pd.read_csv('../data/product_descriptions.csv') 26 | attributes = pd.read_csv('../data/attributes.csv') 27 | 28 | def stem_words(sentence): 29 | return ' '.join([stemmer.stem(word) for word in sentence.split(' ')]) 30 | 31 | 32 | ## NOTE: 33 | ## graders were shown images instead of product descriptions 34 | ## and other attributes 35 | 36 | train = pd.merge(train, description, on='product_uid', how='left') 37 | test = pd.merge(test, description, on='product_uid', how='left') 38 | 39 | def correct_term(q): 40 | if q in spell_check_dict: 41 | return spell_check_dict[q] 42 | else: 43 | return q 44 | 45 | # correct search queries 46 | train.loc[:, 'search_term'] = train.search_term.map(correct_term) 47 | test.loc[:, 'search_term'] = test.search_term.map(correct_term) 48 | 49 | 50 | ## remove non-alphanumeric characters 51 | train.loc[:, 'product_description'] = train.product_description.map(lambda x: re.sub(r'[^A-Za-z0-9 ]', 52 | ' ', x)) 53 | 54 | train.loc[:, 'search_term'] = train.search_term.map(lambda x: re.sub(r'[^A-Za-z0-9 ]', 55 | ' ', x)) 56 | 57 | test.loc[:, 'product_description'] = test.product_description.map(lambda x: re.sub(r'[^A-Za-z0-9 ]', 58 | ' ', x)) 59 | 60 | test.loc[:, 'search_term'] = test.search_term.map(lambda x: re.sub(r'[^A-Za-z0-9 ]', 61 | ' ', x)) 62 | 63 | train.loc[:, 'product_description'] = train.product_description.map(stem_words) 64 | train.loc[:, 'search_term'] = train.search_term.map(stem_words) 65 | 66 | test.loc[:, 'product_description'] = test.product_description.map(stem_words) 67 | test.loc[:, 'search_term'] = test.product_description.map(stem_words) 68 | 69 | 70 | # corpus 71 | corpus = train.apply(lambda x: '%s %s' 
%(x['product_title'].lower(), x['search_term'].lower()), axis=1) 72 | corpus_test = test.apply(lambda x: '%s %s' %(x['product_title'].lower(), x['search_term'].lower()), axis=1) 73 | 74 | tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=3) 75 | corpus = tfidf.fit_transform(corpus.values) 76 | corpus_test = tfidf.transform(corpus_test.values) 77 | 78 | svd = TruncatedSVD(n_components=200) 79 | corpus_svd = svd.fit_transform(corpus) 80 | corpus_test_svd = svd.transform(corpus_test) 81 | 82 | -------------------------------------------------------------------------------- /Predicting-Grants/.ipynb_checkpoints/Data Analysis-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /Predicting-Grants/.ipynb_checkpoints/Description-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /Predicting-Grants/Description.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Task Description" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "** This task requires participants to predict the outcome of grant applications for the University of Melbourne. **\n", 15 | "\n", 16 | "### Why should this problem be solved ?\n", 17 | "* Pool of funds available for research grants is steadily shrinking (in a relative sense). \n", 18 | "* In Australia, success rates have fallen to 20-25 per cent, meaning that most academics are spending valuable time making applications that end up being rejected." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Descrption about the dataset." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "* Dataset containing 249 features, including variables that represent the size of the grant, the general area of study and de-identified information on the investigators who are applying for the grant. \n", 33 | "* There 8,707 grant applications made between 2004 and 2008 which constitute **training examples**. Then there are 2,176 applications made in 2009 and the first half of 2010 which can be used as **test set**." 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Evaluation Metric" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### AUC - (Area Under Curve)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "

A metric measures the quality of a model's predictions. There are many different metrics; Area Under the Curve (AUC) is one of them. For further reading, follow the description below.
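For example, AUC can be computed with scikit-learn's `roc_auc_score`; the labels and scores below are made up purely for illustration:

```python
from sklearn.metrics import roc_auc_score

y_true = [0, 0, 1, 1]            # hypothetical grant outcomes (0 = unsuccessful, 1 = successful)
y_score = [0.1, 0.4, 0.35, 0.8]  # hypothetical predicted probabilities
print(roc_auc_score(y_true, y_score))  # 0.75
```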

" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 8, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/html": [ 67 | "\n", 68 | " \n", 75 | " " 76 | ], 77 | "text/plain": [ 78 | "" 79 | ] 80 | }, 81 | "execution_count": 8, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "from IPython.display import IFrame\n", 88 | "\n", 89 | "IFrame('http://fastml.com/what-you-wanted-to-know-about-auc/', width=700, height=350)" 90 | ] 91 | } 92 | ], 93 | "metadata": { 94 | "kernelspec": { 95 | "display_name": "Python 2", 96 | "language": "python", 97 | "name": "python2" 98 | }, 99 | "language_info": { 100 | "codemirror_mode": { 101 | "name": "ipython", 102 | "version": 2 103 | }, 104 | "file_extension": ".py", 105 | "mimetype": "text/x-python", 106 | "name": "python", 107 | "nbconvert_exporter": "python", 108 | "pygments_lexer": "ipython2", 109 | "version": "2.7.10" 110 | } 111 | }, 112 | "nbformat": 4, 113 | "nbformat_minor": 0 114 | } 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kaggle-Competitions 2 | All Kaggle competitions 3 | -------------------------------------------------------------------------------- /Rossman-Stores-Sales/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | submissions/ 3 | -------------------------------------------------------------------------------- /Rossman-Stores-Sales/.ipynb_checkpoints/rossman_store_sales-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /Rossman-Stores-Sales/rossman_store_sales.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "C:\\Users\\Abhishek\\Anaconda2\\lib\\site-packages\\matplotlib\\__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.\n", 15 | " warnings.warn(self.msg_depr % (key, alt_key))\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "# special IPython command to prepare the notebook for matplotlib\n", 21 | "%matplotlib inline \n", 22 | "\n", 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "import scipy.stats as stats\n", 26 | "import matplotlib.pyplot as plt\n", 27 | "import sklearn\n", 28 | "import statsmodels.api as sm\n", 29 | "\n", 30 | "import seaborn as sns\n", 31 | "sns.set_style(\"whitegrid\")\n", 32 | "sns.set_context(\"poster\")\n", 33 | "\n", 34 | "from math import sqrt\n", 35 | "\n", 36 | "from sklearn.preprocessing import LabelEncoder\n", 37 | "from sklearn.linear_model import LinearRegression\n", 38 | "from sklearn.ensemble import RandomForestRegressor\n", 39 | "\n", 40 | "# special matplotlib argument for improved plots\n", 41 | "from matplotlib import rcParams\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 179, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "%run 
scripts/rossman.py\n", 53 | "%run scripts/helper.py" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 180, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "rossman = Rossman('./data/train.csv', './data/test.csv', './data/store.csv')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 181, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "# merge with stores data\n", 76 | "train_df_merged = rossman.merge_stores_data()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 182, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "# consider only those entries with non-zero sales value\n", 88 | "train_df_with_non_zero_sales = rossman.non_zero_sales_data()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 183, 94 | "metadata": { 95 | "collapsed": true 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "# test dataset\n", 100 | "test_df = rossman.test_df.copy()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 184, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [ 110 | { 111 | "name": "stderr", 112 | "output_type": "stream", 113 | "text": [ 114 | ":48: SettingWithCopyWarning: \n", 115 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 116 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 117 | "\n", 118 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "# preprocessing - converting all categorical variables into numerical values\n", 124 | "train_df_processed, test_df_processed = preprocessing(train_df_with_non_zero_sales, test_df)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 185, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "# create three separate training examples for three years\n", 136 | "\n", 137 | "# train_df_2013 = get_data(train_df, '2013-01-01', '2013-12-31')\n", 138 | "train_df_2014_2015 = get_data(train_df, '2014-01-01', '2015-12-31')\n", 139 | "# train_df_2015 = get_data(train_df, '2015-01-01', '2015-12-31')" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 186, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "features = train_df_2013.columns.drop(['Date', 'Sales'])" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 187, 156 | "metadata": { 157 | "collapsed": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "# X_train_2013 = train_df_2013[features]\n", 162 | "X_train_2014_2015 = train_df_2014[features]\n", 163 | "# X_train_2015 = train_df_2015[features]" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 123, 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "# y_train_2013 = np.log1p(train_df_2013.Sales)\n", 175 | "y_train_2014_2015 = np.log1p(train_df_2014.Sales)\n", 176 | "# y_train_2015 = np.log1p(train_df_2015.Sales)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 61, 182 | "metadata": { 183 | "collapsed": true 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "# Extreme Gradient Boosting\n", 188 | "## Creating models on dataset from three different 
years\n", 189 | "## and testing it out on the final 6 weeks of year 2015\n", 190 | "\n", 191 | "import xgboost as xgb" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 124, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "# training a model on data from year 2013\n", 203 | "# dtrain_2013 = xgb.DMatrix(X_train_2013, y_train_2013, missing=-999.0)\n", 204 | "dtrain_2014 = xgb.DMatrix(X_train_2014, y_train_2014, missing=-999.0)\n", 205 | "# dtrain_2015 = xgb.DMatrix(X_train_2015, y_train_2015, missing=-999.0)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 128, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "name": "stderr", 217 | "output_type": "stream", 218 | "text": [ 219 | "C:\\Users\\Abhishek\\Anaconda2\\lib\\site-packages\\pandas\\core\\generic.py:2862: SettingWithCopyWarning: \n", 220 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 221 | "\n", 222 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 223 | " self._update_inplace(new_data)\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "Xtest = test_df_processed[features]\n", 229 | "\n", 230 | "Xtest.Open.fillna(1, inplace=True)\n", 231 | "\n", 232 | "Xtest_open_stores = Xtest[Xtest.Open == 1]\n", 233 | "Xtest_closed_stores = Xtest[Xtest.Open == 0]\n", 234 | "\n", 235 | "dtest = xgb.DMatrix(Xtest_open_stores, missing=-999.0)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 129, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "params_2014 = dict((('silent', 1), ('nthread', 8), ('objective', 'reg:linear'),('eta', 0.05), \n", 247 | " ('subsample', 0.8), ('colsample_bytree', 0.7), ('min_child_weight', 5), ('max_depth', 8)))\n", 248 | "num_round = 1500\n", 249 | "\n", 250 | "model_2014 = xgb.train(params_2014, dtrain_2014, num_round, feval=rmspe_xg)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 130, 256 | "metadata": { 257 | "collapsed": false 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "# predictions for the year 2014\n", 262 | "\n", 263 | "predictions_test = np.expm1(model_2014.predict(dtest))" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 175, 269 | "metadata": { 270 | "collapsed": true 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "# predictions for open and closed stores and then stack them together\n", 275 | "open_stores_test_ids = Xtest_open_stores.index.values + 1\n", 276 | "closed_stores_test_ids = Xtest_closed_stores.index.values + 1\n", 277 | "\n", 278 | "open_stores_preds = predictions_test\n", 279 | "closed_stores_preds = [0.] 
* len(closed_stores_test_ids)\n", 280 | "\n", 281 | "final_ids = np.hstack([open_stores_test_ids, closed_stores_test_ids])\n", 282 | "final_preds = np.hstack([open_stores_preds, closed_stores_preds])" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 178, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "create_submission(final_ids, final_preds, 'xgb_only_2014.csv')" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 97, 299 | "metadata": { 300 | "collapsed": true 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "params_2015 = dict((('silent', 1), ('nthread', 8), ('objective', 'reg:linear'),('eta', 0.05), \n", 305 | " ('subsample', 0.8), ('colsample_bytree', 0.7), ('min_child_weight', 5), ('max_depth', 8)))\n", 306 | "num_round = 1000\n", 307 | "\n", 308 | "model_2015 = xgb.train(params_2015, dtrain_2015, num_round, feval=rmspe_xg)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 98, 314 | "metadata": { 315 | "collapsed": false 316 | }, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "RMSPE error for model based on examples from the year 2015 0.224573258686\n" 323 | ] 324 | } 325 | ], 326 | "source": [ 327 | "# predictions for the year 2014\n", 328 | "predictions_2015 = np.expm1(model_2015.predict(dtest))\n", 329 | "\n", 330 | "print 'RMSPE error for model based on examples from the year 2015 ', rmspe(ytest, predictions_2015)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 99, 336 | "metadata": { 337 | "collapsed": false 338 | }, 339 | "outputs": [ 340 | { 341 | "data": { 342 | "text/html": [ 343 | "
\n", 344 | "\n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | "
          2013      2014      2015
2013  1.000000  0.940708  0.918543
2014  0.940708  1.000000  0.933550
2015  0.918543  0.933550  1.000000
\n", 374 | "
" 375 | ], 376 | "text/plain": [ 377 | " 2013 2014 2015\n", 378 | "2013 1.000000 0.940708 0.918543\n", 379 | "2014 0.940708 1.000000 0.933550\n", 380 | "2015 0.918543 0.933550 1.000000" 381 | ] 382 | }, 383 | "execution_count": 99, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "# find the correlations between three predictions\n", 390 | "prediction_df = pd.DataFrame({'2013': predictions_2013, '2014': predictions_2014, '2015': predictions_2015})\n", 391 | "prediction_df.corr()" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 108, 397 | "metadata": { 398 | "collapsed": true 399 | }, 400 | "outputs": [], 401 | "source": [ 402 | "prediction_avg = .1 * predictions_2013 + 0.8 * predictions_2014 + .1 * predictions_2015" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 109, 408 | "metadata": { 409 | "collapsed": false 410 | }, 411 | "outputs": [ 412 | { 413 | "name": "stdout", 414 | "output_type": "stream", 415 | "text": [ 416 | "RMSPE error for average of the predictions of three models 0.171631453195\n" 417 | ] 418 | } 419 | ], 420 | "source": [ 421 | "print 'RMSPE error for average of the predictions of three models ', rmspe(ytest, prediction_avg)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 111, 427 | "metadata": { 428 | "collapsed": false 429 | }, 430 | "outputs": [ 431 | { 432 | "data": { 433 | "text/plain": [ 434 | "1 35093\n", 435 | "0 5984\n", 436 | "Name: Open, dtype: int64" 437 | ] 438 | }, 439 | "execution_count": 111, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "rossman.test_df.Open.value_counts()" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": { 452 | "collapsed": true 453 | }, 454 | "outputs": [], 455 | "source": [] 456 | } 457 | ], 458 | "metadata": { 459 | "kernelspec": { 460 | "display_name": "Python 2", 461 | "language": "python", 462 | "name": "python2" 463 | }, 464 | "language_info": { 465 | "codemirror_mode": { 466 | "name": "ipython", 467 | "version": 2 468 | }, 469 | "file_extension": ".py", 470 | "mimetype": "text/x-python", 471 | "name": "python", 472 | "nbconvert_exporter": "python", 473 | "pygments_lexer": "ipython2", 474 | "version": "2.7.10" 475 | } 476 | }, 477 | "nbformat": 4, 478 | "nbformat_minor": 0 479 | } 480 | -------------------------------------------------------------------------------- /Rossman-Stores-Sales/scripts/helper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sklearn.preprocessing import LabelEncoder 5 | 6 | 7 | def ToWeight(y): 8 | w = np.zeros(y.shape, dtype=float) 9 | ind = y != 0 10 | w[ind] = 1./(y[ind]**2) 11 | return w 12 | 13 | 14 | def rmspe(yhat, y): 15 | w = ToWeight(y) 16 | rmspe = np.sqrt(np.mean( w * (y - yhat)**2 )) 17 | return rmspe 18 | 19 | 20 | 21 | def rmspe_xg(yhat, y): 22 | 23 | """ 24 | This implementation of Root Mean Square Percentage error 25 | for XGBoost. 
26 | """ 27 | 28 | y = y.get_label() 29 | y = np.exp(y) - 1 30 | yhat = np.exp(yhat) - 1 31 | w = ToWeight(y) 32 | rmspe = np.sqrt(np.mean(w * (y - yhat)**2)) 33 | return "rmspe", rmspe 34 | 35 | 36 | def get_object_cols(train_df): 37 | return [col for col in train_df.columns if train_df[col].dtype == 'O'] 38 | 39 | def preprocessing(train_df, test_df): 40 | cols = get_object_cols(train_df) 41 | 42 | for col in cols: 43 | lbl = LabelEncoder() 44 | data = pd.concat([train_df[col], test_df[col]]) 45 | 46 | lbl.fit(data) 47 | 48 | train_df[col] = lbl.transform(train_df[col]) 49 | test_df[col] = lbl.transform(test_df[col]) 50 | 51 | return train_df, test_df 52 | 53 | def get_data(train_df, start_date, end_date): 54 | """ 55 | Gets data between date range 56 | """ 57 | mask = ((train_df.Date >= start_date) & (train_df.Date <= end_date)) 58 | return train_df[mask] 59 | 60 | 61 | def create_submission(ids, preds, filename): 62 | submission_df = pd.DataFrame({'Id': ids, 'Sales': preds}) 63 | submission_df.to_csv('./submissions/' + filename, index=False) -------------------------------------------------------------------------------- /Rossman-Stores-Sales/scripts/rossman.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from math import sqrt 3 | import numpy as np 4 | from sklearn.cross_validation import train_test_split 5 | 6 | class Rossman(): 7 | 8 | def __init__(self, train_file_path, test_file_path, stores_file_path): 9 | """ 10 | Sets in the file path for training, test and store 11 | info csv's 12 | """ 13 | 14 | self.train_df = self.load_dataset(train_file_path, date_col='Date') 15 | self.test_df = self.load_dataset(test_file_path, date_col='Date') 16 | self.stores_df = self.load_dataset(stores_file_path) 17 | 18 | 19 | def load_dataset(self, file_path, date_col=None): 20 | """ 21 | Loads dataset based on file path 22 | """ 23 | 24 | if date_col: 25 | return pd.read_csv(file_path, parse_dates=[date_col]) 26 | else: 27 | return pd.read_csv(file_path) 28 | 29 | 30 | def non_zero_sales_data(self): 31 | mask = self.train_df.Sales > 0 32 | return self.train_df[mask] 33 | 34 | def split_train_test_mask(self, train_df, threshold_date, random_state=0): 35 | 36 | """ 37 | Splits the train_df into training and testing set 38 | training data will have all the examples except for last 6 weeks 39 | test data will examples for last 6 weeks 40 | """ 41 | features = train_df.columns.drop(['Customers', 'PromoInterval']) 42 | 43 | train_df_before_threshold = train_df[train_df.Date <= threshold_date][features] 44 | train_df_afer_threhold = train_df[train_df.Date > threshold_date][features] 45 | 46 | return train_df_before_threshold, train_df_afer_threhold 47 | 48 | 49 | 50 | def merge_stores_data(self): 51 | """ 52 | Merge store information with training data and test data 53 | """ 54 | 55 | self.train_df = pd.merge(self.train_df, self.stores_df, on='Store', how='left') 56 | self.test_df = pd.merge(self.test_df, self.stores_df, on='Store', how='left') 57 | 58 | 59 | -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/.gitignore: -------------------------------------------------------------------------------- 1 | ../.DS_Store 2 | data/ 3 | submissions/ 4 | -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: 
utf-8 -*- 2 | """ 3 | Created on Mon Mar 28 07:52:35 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | import pandas as pd 9 | 10 | ## Evaluation metric is AUC 11 | 12 | # load train and test files 13 | train = pd.read_csv('data/train.csv', index_col='ID') 14 | test = pd.read_csv('data/test.csv', index_col='ID') 15 | 16 | 17 | ## NOTES 18 | ## 19 | ## 1. 9999999999 to mark missing values 20 | ## 2. -999999 to mark missing values 21 | 22 | ## need to remove some features because they are either constant or 23 | ## identical to other column 24 | 25 | def get_constant_features(df, columns): 26 | constant_features = [] 27 | 28 | for col in columns: 29 | if df[col].std() == 0.0: 30 | constant_features.append(col) 31 | 32 | return constant_features 33 | 34 | def get_identical_features(df, columns): 35 | identical_features = [] 36 | 37 | for i in range(len(columns)): 38 | for j in range(i + 1, len(columns)): 39 | if (df[columns[i]] == df[columns[j]]).all(): 40 | identical_features.append(columns[i]) 41 | 42 | identical_features = set(identical_features) 43 | identical_features = list(identical_features) 44 | 45 | return identical_features 46 | 47 | def concat_features(constant_features, identical_features): 48 | features_to_remove = [] 49 | 50 | for col in constant_features: 51 | features_to_remove.append(col) 52 | 53 | for col in identical_features: 54 | features_to_remove.append(col) 55 | 56 | return features_to_remove 57 | 58 | columns = train.columns 59 | 60 | constant_features = get_constant_features(train, columns) 61 | columns = columns.drop(constant_features) 62 | 63 | identical_features = get_identical_features(train, columns) 64 | features_to_remove = concat_features(constant_features, identical_features) 65 | 66 | ## var 3 has missing value ( -999999 ) 67 | ## 26 more features with missing values 68 | ## Here is the list 69 | 70 | some_more_features_with_constant_value = ['delta_num_trasp_var33_out_1y3', 'delta_num_reemb_var33_1y3', 71 | 'delta_imp_trasp_var33_out_1y3', 'delta_imp_reemb_var33_1y3', 72 | 'delta_imp_amort_var34_1y3', 'delta_imp_amort_var18_1y3'] 73 | 74 | features_with_9999999999 = ['delta_imp_amort_var18_1y3', 'delta_imp_amort_var34_1y3', 75 | 'delta_imp_aport_var13_1y3', 'delta_imp_aport_var17_1y3', 76 | 'delta_imp_aport_var33_1y3', 'delta_imp_compra_var44_1y3', 77 | 'delta_imp_venta_var44_1y3', 'delta_num_aport_var13_1y3', 78 | 'delta_num_aport_var17_1y3', 'delta_num_aport_var33_1y3', 79 | 'delta_num_compra_var44_1y3', 'delta_num_reemb_var13_1y3', 80 | 'delta_num_reemb_var17_1y3', 'delta_num_reemb_var33_1y3', 81 | 'delta_num_trasp_var17_in_1y3', 'delta_num_trasp_var17_out_1y3', 82 | 'delta_num_trasp_var33_in_1y3', 'delta_num_trasp_var33_out_1y3', 83 | 'delta_num_venta_var44_1y3' 84 | ] 85 | 86 | for feat_name in features_with_9999999999: 87 | train.loc[:, 'missing_%s' %(feat_name)] = (train[feat_name] == train[feat_name].max()).astype(int) 88 | train.loc[:, feat_name] = train[feat_name].fillna(train[feat_name].mode()) 89 | 90 | test.loc[:, 'missing_%s' %(feat_name)] = (test[feat_name] == test[feat_name].max()).astype(int) 91 | test.loc[:, feat_name] = test[feat_name].fillna(test[feat_name].mode()) 92 | 93 | 94 | for feat_name in some_more_features_with_constant_value: 95 | features_to_remove.append(feat_name) 96 | 97 | # treat var3 differently 98 | train.loc[:, 'missing_value_var3'] = (train.var3 == -999999).astype(int) 99 | train.loc[:, 'var3'] = train.var3.fillna(train.var3.mode()) 100 | 101 | test.loc[:, 'missing_value_var3'] = (test.var3 == -999999).astype(int) 102 | 
test.loc[:, 'var3'] = test.var3.fillna(train.var3.mode()) 103 | 104 | # remove features 105 | features = train.columns.drop(features_to_remove) 106 | 107 | train_subset = train[features] 108 | 109 | features = features.drop('TARGET') 110 | test_subset = test[features] 111 | 112 | train_subset.to_csv('./data/train_processed_handle_na.csv', index=False) 113 | test_subset.to_csv('./data/test_processed_handle_na.csv', index=False) 114 | -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/blending.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 4 08:38:24 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | from __future__ import division 9 | from sklearn.cross_validation import train_test_split, StratifiedKFold 10 | from sklearn.metrics import roc_auc_score 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.linear_model import LogisticRegression 13 | from xgboost import XGBClassifier 14 | 15 | import pandas as pd 16 | import numpy as np 17 | 18 | # load train and test files 19 | 20 | train = pd.read_csv('./data/train.csv', index_col='ID') 21 | test = pd.read_csv('./data/test.csv', index_col='ID') 22 | 23 | 24 | # set random seed 25 | np.random.seed(10) 26 | 27 | X = train[train.columns.drop('TARGET')] 28 | y = train.TARGET 29 | 30 | Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=44) 31 | 32 | n_folds = 10 33 | 34 | skf = list(StratifiedKFold(ytrain, n_folds)) 35 | 36 | clfs = [XGBClassifier(n_estimators=147, learning_rate=0.1, min_child_weight=2, colsample_bytree=0.9, subsample=0.95, seed=1279), 37 | XGBClassifier(n_estimators=264, learning_rate=0.05, min_child_weight=2, colsample_bytree=0.9, subsample=0.9, seed=1729)] 38 | 39 | print 'Creating train and test sets for blending' 40 | dataset_blend_train = np.zeros((X.shape[0], len(clfs))) 41 | dataset_blend_test = np.zeros((test.shape[0], len(clfs))) 42 | 43 | for j, clf in enumerate(clfs): 44 | print j, clf 45 | dataset_blend_test_j = np.zeros((test.shape[0], len(skf))) 46 | for i, (train, test_) in enumerate(skf): 47 | print "Fold", i 48 | X_train = X.values[train] 49 | y_train = y.values[train] 50 | X_test = X.values[test_] 51 | y_test = y.values[test_] 52 | clf.fit(X_train, y_train) 53 | y_submission = clf.predict_proba(X_test)[:,1] 54 | dataset_blend_train[test_, j] = y_submission 55 | dataset_blend_test_j[:, i] = clf.predict_proba(test)[:,1] 56 | dataset_blend_test[:,j] = dataset_blend_test_j.mean(1) 57 | 58 | print 59 | print "Blending." 
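# Second-level (meta) model: fit a logistic regression on the out-of-fold predictions of the
# base XGBoost classifiers, then apply it to their fold-averaged predictions on the test set
# to obtain the blended submission probabilities.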
60 | clf = LogisticRegression() 61 | clf.fit(dataset_blend_train, y) 62 | y_submission = clf.predict_proba(dataset_blend_test)[:,1] 63 | 64 | print "Linear stretch of predictions to [0,1]" 65 | y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min()) 66 | 67 | 68 | #print 'ROC AUC Score on test set %f ' %(roc_auc_score(ytest, y_submission)) 69 | 70 | submission_df = pd.read_csv('./data/sample_submission.csv') 71 | submission_df['TARGET'] = y_submission 72 | submission_df.to_csv('./submissions/blend_two_xgboost.csv', index=False) -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/cross-validation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 29 09:24:09 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | from sklearn.cross_validation import train_test_split 12 | import xgboost as xgb 13 | 14 | np.random.seed(44) 15 | 16 | train = pd.read_csv('./data/train_processed_handle_na.csv') 17 | test = pd.read_csv('./data/test_processed_handle_na.csv') 18 | 19 | X = train[train.columns.drop('TARGET')] 20 | y = train.TARGET 21 | 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1279) 23 | 24 | # evaluate xgboost model 25 | param = dict([('max_depth', 3), ('learning_rate', 0.05), ('objective', 'binary:logistic'), 26 | ('eval_metric', 'auc'), ('seed', 1729), ('min_child_weight', 2), 27 | ('colsample_bytree', 0.95), ('subsample', 0.8)]) 28 | 29 | dtrain = xgb.DMatrix(X_train.values, label=y_train.values) 30 | dtest = xgb.DMatrix(X_test.values, label=y_test.values) 31 | watchlist = [(dtest, 'eval'), (dtrain, 'train')] 32 | 33 | num_round = 1000000 34 | 35 | xgb.train(param, dtrain, num_round, watchlist, early_stopping_rounds=10) 36 | -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/feature_analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 31 22:30:05 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | import seaborn as sns 12 | 13 | # load train and test set 14 | 15 | train = pd.read_csv('./data/train.csv', index_col='ID') 16 | test = pd.read_csv('./data/test.csv', index_col='ID') 17 | 18 | ## Class that would represent different synthesized datasets 19 | 20 | class Dataset(): 21 | def __init__(self, train, test): 22 | self.train = train.copy() 23 | self.test = test.copy() 24 | self.features = train.columns[:-1] 25 | 26 | def impute_missing_values(self, strategy): 27 | missing_values = [-999999.0, 9999999999.0] 28 | 29 | for col in self.features: 30 | if (self.train[col] == missing_values[0]).any(): 31 | self.train['is_missing_%s' %(col)] = (self.train[col] == missing_values[0]).astype(int) 32 | 33 | if strategy == 'mean': 34 | strategy_applied_value = self.train[self.train[col] != missing_values[0]][col].mean() 35 | elif strategy == 'median': 36 | strategy_applied_value = self.train[self.train[col] != missing_values[0]][col].median() 37 | else: 38 | strategy_applied_value = self.train[self.train[col] != missing_values[0]][col].mode() 39 | 40 | self.train[col] = self.train[col].replace(missing_values[0], strategy_applied_value) 41 | 42 | 43 | 
self.test['is_missing_%s' %(col)] = (self.test[col] == missing_values[0]).astype(int) 44 | 45 | if strategy == 'mean': 46 | strategy_applied_value = self.test[self.test[col] != missing_values[0]][col].mean() 47 | elif strategy == 'median': 48 | strategy_applied_value = self.test[self.test[col] != missing_values[0]][col].median() 49 | else: 50 | strategy_applied_value = self.test[self.test[col] != missing_values[0]][col].mode() 51 | 52 | self.test[col] = self.test[col].replace(missing_values[0], strategy_applied_value) 53 | 54 | elif (self.train[col] == missing_values[1]).any(): 55 | self.train['is_missing_%s' %(col)] = (self.train[col] == missing_values[1]).astype(int) 56 | 57 | if strategy == 'mean': 58 | strategy_applied_value = self.train[self.train[col] != missing_values[1]][col].mean() 59 | elif strategy == 'median': 60 | strategy_applied_value = self.train[self.train[col] != missing_values[1]][col].median() 61 | else: 62 | strategy_applied_value = self.train[self.train[col] != missing_values[1]][col].mode() 63 | 64 | self.train[col] = self.train[col].replace(missing_values[1], strategy_applied_value) 65 | 66 | 67 | self.test['is_missing_%s' %(col)] = (self.test[col] == missing_values[1]).astype(int) 68 | 69 | if strategy == 'mean': 70 | strategy_applied_value = self.test[self.test[col] != missing_values[1]][col] 71 | elif strategy == 'median': 72 | strategy_applied_value = self.test[self.test[col] != missing_values[1]][col] 73 | else: 74 | strategy_applied_value = self.test[self.test[col] != missing_values[1]][col] 75 | 76 | self.test[col] = self.test[col].replace(missing_values[1], strategy_applied_value) 77 | 78 | def get_positive_valued_features(self): 79 | feature_status = (self.train < 0).any() 80 | neg_valued_features = feature_status[feature_status == True].index 81 | 82 | return self.features.drop(neg_valued_features) 83 | 84 | def log_transformation(self): 85 | self.non_neg_features = self.get_positive_valued_features() 86 | 87 | self.train[self.non_neg_features] = self.train[self.non_neg_features].applymap(np.log1p) 88 | self.test[self.non_neg_features] = self.test[self.non_neg_features].applymap(np.log1p) 89 | 90 | def discretize(self): 91 | self.train = self.train.astype(np.int) 92 | self.test = self.test.astype(np.int) 93 | 94 | def preprocess(self, impute_strategy): 95 | self.impute_missing_values(impute_strategy) 96 | self.log_transformation() 97 | # self.discretize() 98 | 99 | 100 | dataset_mean = Dataset(train, test) 101 | dataset_mean.preprocess('mean') 102 | 103 | dataset_median = Dataset(train, test) 104 | dataset_median.preprocess('median') 105 | 106 | dataset_mode = Dataset(train, test) 107 | dataset_mode.preprocess('mode') 108 | 109 | 110 | -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/feature_importance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 31 08:57:02 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | from sklearn.ensemble import RandomForestClassifier 12 | 13 | train = pd.read_csv('./data/train.csv') 14 | test = pd.read_csv('./data/test.csv') 15 | 16 | X = train[train.columns.drop('TARGET')] 17 | y = train.TARGET 18 | 19 | forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1) 20 | 21 | forest.fit(X, y) 22 | importances = forest.feature_importances_ 23 | std = 
np.std([tree.feature_importances_ for tree in forest.estimators_], 24 | axis=0) 25 | indices = np.argsort(importances)[::-1] 26 | 27 | # Print the feature ranking 28 | print("Feature ranking:") 29 | 30 | for f in range(X.shape[1]): 31 | print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) 32 | 33 | # Plot the feature importances of the forest 34 | #plt.figure() 35 | plt.title("Feature importances (RF)") 36 | plt.bar(range(10), importances[indices][:10], 37 | color="r", yerr=std[indices][:10], align="center") 38 | plt.xticks(range(10), train.columns[indices[:10]], rotation=90) 39 | plt.xlim([-1, 10]) 40 | plt.show() -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 28 22:33:15 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.preprocessing import MinMaxScaler, StandardScaler 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.feature_selection import SelectKBest, chi2 15 | from sklearn.pipeline import Pipeline 16 | from sklearn.metrics import roc_auc_score, confusion_matrix 17 | 18 | import xgboost as xgb 19 | 20 | np.random.seed(44) 21 | 22 | train = pd.read_csv('./data/train_processed_handle_na.csv') 23 | test = pd.read_csv('./data/test_processed_handle_na.csv') 24 | 25 | X = train[train.columns.drop('TARGET')] 26 | y = train.TARGET 27 | 28 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1279) 29 | 30 | # create model pipeline 31 | clf = xgb.XGBClassifier(n_estimators=250, learning_rate=0.05, max_depth=3, min_child_weight=2, 32 | colsample_bytree=0.95, subsample=0.8, seed=1729) 33 | 34 | xgb_pipeline = Pipeline([('clf', clf)]) 35 | 36 | scaler = MinMaxScaler() 37 | select = SelectKBest(chi2, k=200) 38 | 39 | clf = LogisticRegression() 40 | log_pipeline = Pipeline([('scaler', scaler), ('select', select), ('clf', clf)]) 41 | 42 | xgb_pipeline.fit(X_train, y_train) 43 | log_pipeline.fit(X_train, y_train) 44 | 45 | predsTrain_xgb = xgb_pipeline.predict_proba(X_train)[:, 1] 46 | predsTest_xgb = xgb_pipeline.predict_proba(X_test)[:, 1] 47 | 48 | predsTrain_log = log_pipeline.predict_proba(X_train)[:, 1] 49 | predsTest_log = log_pipeline.predict_proba(X_test)[:, 1] 50 | 51 | finalPredsTrain = 0.9 * predsTrain_xgb + 0.1 * predsTrain_log 52 | finalPredsTest = 0.9 * predsTest_xgb + 0.1 * predsTest_log 53 | 54 | print 'predictions on the training set %f ' %(roc_auc_score(y_train, finalPredsTrain)) 55 | print 'predictions on the test set %f ' %(roc_auc_score(y_test, finalPredsTest)) 56 | 57 | ### Train on full dataset 58 | xgb_pipeline.fit(X, y) 59 | log_pipeline.fit(X,y) 60 | 61 | preds_xgb = xgb_pipeline.predict_proba(test)[:, 1] 62 | preds_log = log_pipeline.predict_proba(test)[:, 1] 63 | 64 | predictions = 0.9 * preds_xgb + 0.1 * preds_log 65 | 66 | submission = pd.read_csv('./data/sample_submission.csv') 67 | submission.loc[:, 'TARGET'] = predictions 68 | submission.to_csv('./submissions/ensemble_xgb_log.csv', index=False) -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/vector_quantization.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 4 21:27:37 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from scipy.cluster import vq 12 | 13 | 14 | # load train and test set 15 | train = pd.read_csv('./data/train.csv', index_col='ID') 16 | test = pd.read_csv('./data/test.csv', index_col='ID') 17 | 18 | # columns with high frequency values 19 | high_frequency = [col for col in train.columns if len(train[col].unique()) > 10] 20 | 21 | for col in high_frequency: 22 | codebook = vq.kmeans(train[col].values.astype(float), 5) 23 | train_values = [] 24 | test_values = [] 25 | 26 | for val in train[col]: 27 | train_values.append(vq.vq(val, codebook[0])[0][0]) 28 | 29 | for val in test[col]: 30 | test_values.append(vq.vq(val, codebook[0])[0][0]) 31 | 32 | train[col] = np.array(train_values) 33 | test[col] = np.array(test_values) 34 | 35 | train.to_csv('./data/synthesized/train_vq.csv', index=False) 36 | test.to_csv('./data/synthesized/test_vq.csv', index=False) -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/xgboost-tune.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 28 22:33:15 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | from sklearn.cross_validation import train_test_split 12 | import xgboost as xgb 13 | 14 | np.random.seed(44) 15 | 16 | train = pd.read_csv('./data/synthesized/train_vq.csv') 17 | test = pd.read_csv('./data/synthesized/test_vq.csv') 18 | 19 | X = train[train.columns.drop('TARGET')] 20 | y = train.TARGET 21 | 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1279) 23 | 24 | # evaluate xgboost model 25 | param = dict([('max_depth', 3), ('learning_rate', 0.1), ('min_child_weight', 2), 26 | ('colsample_bytree', 0.9), ('subsample', 0.8), 27 | ('objective', 'binary:logistic'), 28 | ('eval_metric', 'auc'), ('seed', 1729)]) 29 | 30 | dtrain = xgb.DMatrix(X_train.values, label=y_train.values) 31 | dtest = xgb.DMatrix(X_test.values, label=y_test.values) 32 | 33 | watchlist = [(dtest, 'eval', (dtrain, 'train'))] 34 | 35 | num_round = 100000 36 | 37 | bst = xgb.train(param, dtrain, num_round, watchlist) -------------------------------------------------------------------------------- /Whats-Cooking/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | scripts/ 3 | submissions/ 4 | *.zip 5 | *.csv 6 | .ipynb_checkpoints/ 7 | -------------------------------------------------------------------------------- /cars-cancellation/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | submissions/ 3 | --------------------------------------------------------------------------------