├── .gitignore ├── Battlefin-s-big-data └── notebooks │ └── Battlefin_Analysis.ipynb ├── CrowdFlower ├── .ipynb_checkpoints │ ├── Basic SVM Model-checkpoint.ipynb │ └── Initial Analysis-checkpoint.ipynb ├── AdditionalFeatures.ipynb ├── AnalyzingMistakes.ipynb ├── AndreasMullerImplementation.ipynb ├── Basic SVM Model.ipynb ├── Blending.ipynb ├── CountVectorizer.ipynb ├── EDA.ipynb ├── EnsembleAllModels.ipynb ├── EnsembleFiles.ipynb ├── EnsembleSVCandNB.ipynb ├── Ensembling.ipynb ├── FeatureSelection.ipynb ├── GenerateFeatures.ipynb ├── GridSearchOnNaiveBayes.ipynb ├── Initial Analysis.ipynb ├── KNN distance processed.ipynb ├── Knn unprocessed.ipynb ├── Linear unprocessed.ipynb ├── ModelOnRelevanceVariance.ipynb ├── Non Linear Processed.ipynb ├── Non Linear SVM unprocessed.ipynb ├── OptimizeSVC.ipynb ├── Relevance_Scores.ipynb ├── SpellCorrection.ipynb ├── Stacking.ipynb ├── StemmingAndSVC.ipynb ├── TFIDF_Train_Plus_Test.ipynb ├── Vowpal wabbit.ipynb ├── query_features.py └── scripts │ ├── blending_helper.py │ ├── features.py │ ├── helper.py │ ├── model_train_plus_test.py │ └── models.py ├── HIV-Progression ├── .ipynb_checkpoints │ ├── Basic_Analysis-checkpoint.ipynb │ └── ClassBalancedModel-checkpoint.ipynb ├── Basic_Analysis.ipynb ├── ClassBalancedModel.ipynb ├── data │ ├── test_data.csv │ └── training_data.csv ├── helper.py └── initialSubmission.csv ├── Home Insurance ├── Exploratory Analysis.ipynb ├── Home Insurance.ipynb ├── features.py ├── scripts │ └── helper.py └── utils.py ├── Home-Depot ├── notebooks │ ├── Home-Depot-Analysis.ipynb │ └── Home-Depot-Models.ipynb └── scripts │ ├── cross-validation.py │ ├── dataset.py │ ├── eda.py │ ├── numerical_features.py │ ├── search_map.py │ ├── search_map.pyc │ └── text-features.py ├── PAKDD ├── PAKDD.ipynb ├── PAKDD_EDA.ipynb └── PAKDD_Signal_Processing_Approach.ipynb ├── Predict-Bio-Response ├── Predict-Bio-Response-Exploratory-Data-Analysis.ipynb ├── Predict-Bio-Response-Model-Building.ipynb ├── Predict-Bio-Response-Tree-Based-Models.ipynb └── data │ ├── test.csv │ └── train.csv ├── Predicting-Grants ├── .ipynb_checkpoints │ ├── Data Analysis-checkpoint.ipynb │ └── Description-checkpoint.ipynb ├── Data Analysis.ipynb ├── Description.ipynb └── data │ ├── unimelb_example.csv │ ├── unimelb_test.csv │ └── unimelb_training.csv ├── README.md ├── Rossman-Stores-Sales ├── .gitignore ├── .ipynb_checkpoints │ └── rossman_store_sales-checkpoint.ipynb ├── rossman_store_sales.ipynb └── scripts │ ├── helper.py │ └── rossman.py ├── Santander-Customer-Satisfaction ├── .gitignore ├── notebooks │ ├── Santander Customer Satisfaction - Exploratory Data Analysis.ipynb │ └── Santander Customer Satisfaction - Models.ipynb └── scripts │ ├── analysis.py │ ├── blending.py │ ├── cross-validation.py │ ├── feature_analysis.py │ ├── feature_importance.py │ ├── models.py │ ├── vector_quantization.py │ └── xgboost-tune.py ├── Whats-Cooking ├── .gitignore ├── .ipynb_checkpoints │ └── Whats Cooking-checkpoint.ipynb └── Whats Cooking.ipynb └── cars-cancellation ├── .gitignore ├── .ipynb_checkpoints └── cars_cancellation-checkpoint.ipynb └── cars_cancellation.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | CrowdFlower/data 2 | CrowdFlower/papers 3 | CrowdFlower/submissions 4 | CrowdFlower/.ipynb_checkpoints/*.ipynb 5 | Rossman-Stores-Sales/data 6 | Rossman-Stores-Sales/submissions 7 | liberty-group/data 8 | liberty-group/submissions 9 | PAKDD/data 10 | PAKDD/submissions 11 | .ipynb_checkpoints 12 | .DS_Store 13 | 
Standard-Customer-Satisfaction/data 14 | Standard-Customer-Satisfaction/submissions 15 | plots/ 16 | Battlefin-s-big-data/data/ 17 | Battlefin-s-big-data/submissions/ 18 | Home-Depot/data/ 19 | Home-Depot/submissions/ 20 | Predict-Bio-Response/submissions/ 21 | -------------------------------------------------------------------------------- /CrowdFlower/.ipynb_checkpoints/Initial Analysis-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /CrowdFlower/AndreasMullerImplementation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "import seaborn as sns\n", 15 | "%matplotlib inline" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "%run scripts/helper.py\n", 27 | "%run scripts/features.py" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')\n", 39 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "traindata = prepareText(crowd_train)\n", 51 | "testdata = prepareText(crowd_test)\n", 52 | "y = crowd_train.median_relevance.values" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "'bridal shower decorations Accent Pillow with Heart Design - Red/Black Red satin accent pillow embroidered with a heart in black thread. 
8\" x 8\".'" 66 | ] 67 | }, 68 | "execution_count": 5, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "# lets take a look at some sample training data\n", 75 | "traindata[0]" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "'electric griddle Star-Max 48 in Electric Griddle '" 89 | ] 90 | }, 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "# lets take a look at sample test data\n", 98 | "testdata[0]" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 7, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "from sklearn.metrics import make_scorer\n", 110 | "\n", 111 | "# Weighted kappa scorer\n", 112 | "kappa_scorer = make_scorer(quadratic_weighted_kappa, greater_is_better=True)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 23, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "from sklearn.cross_validation import StratifiedShuffleSplit\n", 124 | "\n", 125 | "sss = StratifiedShuffleSplit(y, 3, train_size=7000, random_state=0)\n", 126 | "train_index, test_index = next(iter(sss))" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 24, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "Xt = np.asarray(traindata)[train_index]\n", 138 | "yt = np.asarray(y)[train_index]" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 25, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "from sklearn.pipeline import Pipeline\n", 150 | "from sklearn.feature_selection import SelectPercentile, chi2\n", 151 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 152 | "from sklearn.cross_validation import cross_val_score\n", 153 | "from sklearn.svm import SVC\n", 154 | "from sklearn.preprocessing import StandardScaler\n", 155 | "from sklearn.metrics import confusion_matrix\n", 156 | "from sklearn.decomposition import TruncatedSVD\n", 157 | "from sklearn.ensemble import GradientBoostingClassifier\n", 158 | "from sklearn.naive_bayes import MultinomialNB" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 41, 164 | "metadata": { 165 | "collapsed": true 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "countvect_char = TfidfVectorizer(min_df=3, max_features=None, \n", 170 | " strip_accents='unicode', analyzer='char',\n", 171 | " ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,\n", 172 | " stop_words = 'english')\n", 173 | "\n", 174 | "countvect_word = TfidfVectorizer(min_df=3, max_features=None, \n", 175 | " strip_accents='unicode', analyzer='word',\n", 176 | " ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,\n", 177 | " stop_words = 'english')\n", 178 | "\n", 179 | "clf1 = MultinomialNB(alpha=.01)\n", 180 | "clf2 = SVC(C=10.0)\n", 181 | "\n", 182 | "ft = FeatureStacker([('chars', countvect_char), ('words', countvect_word)])" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 42, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "pipeline = Pipeline([\n", 194 | " ('vect', ft),\n", 195 | " ('classifier', clf1)\n", 196 | " ])" 197 
| ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 22, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "[ 0.30866723 0.29436923]\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "scores = cross_val_score(pipeline, Xt, \n", 216 | " yt, cv=2, scoring=kappa_scorer,\n", 217 | " n_jobs=1)\n", 218 | "print scores" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 43, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "pipeline1 = Pipeline([\n", 230 | " ('vect', ft),\n", 231 | " ('svd', TruncatedSVD(n_components=140)),\n", 232 | " ('scl', StandardScaler()),\n", 233 | " ('classifier', clf2)\n", 234 | " ])" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 51, 240 | "metadata": { 241 | "collapsed": false 242 | }, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "[ 0.35911812 0.38454006]\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "scores = cross_val_score(pipeline1, Xt, \n", 254 | " yt, cv=2, scoring=kappa_scorer,\n", 255 | " n_jobs=1)\n", 256 | "print scores" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 29, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "from sklearn.cross_validation import train_test_split" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 30, 273 | "metadata": { 274 | "collapsed": true 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "Xtrain, Xvalidation, ytrain, yvalidation = train_test_split(traindata, y, test_size=0.2, random_state=0)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 44, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "Pipeline(steps=[('vect', FeatureStacker(transformer_list=[('chars', TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',\n", 292 | " dtype=, encoding=u'utf-8', input=u'content',\n", 293 | " lowercase=True, max_df=1.0, max_features=None, min_df=3,\n", 294 | " ngram_range=(1, 2), norm... 
vocabulary=None))])), ('classifier', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))])" 295 | ] 296 | }, 297 | "execution_count": 44, 298 | "metadata": {}, 299 | "output_type": "execute_result" 300 | } 301 | ], 302 | "source": [ 303 | "pipeline.fit(traindata, y)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 45, 309 | "metadata": { 310 | "collapsed": false 311 | }, 312 | "outputs": [ 313 | { 314 | "data": { 315 | "text/plain": [ 316 | "Pipeline(steps=[('vect', FeatureStacker(transformer_list=[('chars', TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',\n", 317 | " dtype=, encoding=u'utf-8', input=u'content',\n", 318 | " lowercase=True, max_df=1.0, max_features=None, min_df=3,\n", 319 | " ngram_range=(1, 2), norm...f', max_iter=-1, probability=False, random_state=None,\n", 320 | " shrinking=True, tol=0.001, verbose=False))])" 321 | ] 322 | }, 323 | "execution_count": 45, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "pipeline1.fit(traindata, y)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 33, 335 | "metadata": { 336 | "collapsed": true 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "first_model_predict = pipeline.predict(Xvalidation)\n", 341 | "second_model_predict = pipeline1.predict(Xvalidation)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 34, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "First model individual score 0.4457 \n" 356 | ] 357 | } 358 | ], 359 | "source": [ 360 | "print 'First model individual score %0.4f ' %(quadratic_weighted_kappa(yvalidation, first_model_predict))" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 35, 366 | "metadata": { 367 | "collapsed": false 368 | }, 369 | "outputs": [ 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "Second model individual score 0.5342 \n" 375 | ] 376 | } 377 | ], 378 | "source": [ 379 | "print 'Second model individual score %0.4f ' %(quadratic_weighted_kappa(yvalidation, second_model_predict))" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 36, 385 | "metadata": { 386 | "collapsed": false 387 | }, 388 | "outputs": [ 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | "Average of two models score 0.5573 \n" 394 | ] 395 | } 396 | ], 397 | "source": [ 398 | "print 'Average of two models score %0.4f ' %(quadratic_weighted_kappa(yvalidation, (first_model_predict + second_model_predict) / 2))" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 46, 404 | "metadata": { 405 | "collapsed": false 406 | }, 407 | "outputs": [], 408 | "source": [ 409 | "# prediction on test data set\n", 410 | "pred1 = pipeline.predict(testdata)" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 47, 416 | "metadata": { 417 | "collapsed": true 418 | }, 419 | "outputs": [], 420 | "source": [ 421 | "pred2 = pipeline1.predict(testdata)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 48, 427 | "metadata": { 428 | "collapsed": true 429 | }, 430 | "outputs": [], 431 | "source": [ 432 | "avg_pred = (pred1 + pred2) / 2" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 49, 438 | "metadata": { 439 | "collapsed": true 440 | }, 441 | "outputs": 
[], 442 | "source": [ 443 | "# submission\n", 444 | "make_submission(crowd_test.index.values.astype(int), avg_pred, 'ensemble1.csv')" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": { 451 | "collapsed": true 452 | }, 453 | "outputs": [], 454 | "source": [] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": { 460 | "collapsed": true 461 | }, 462 | "outputs": [], 463 | "source": [] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": { 469 | "collapsed": true 470 | }, 471 | "outputs": [], 472 | "source": [] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": { 478 | "collapsed": true 479 | }, 480 | "outputs": [], 481 | "source": [] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": { 487 | "collapsed": true 488 | }, 489 | "outputs": [], 490 | "source": [] 491 | } 492 | ], 493 | "metadata": { 494 | "kernelspec": { 495 | "display_name": "Python 2", 496 | "language": "python", 497 | "name": "python2" 498 | }, 499 | "language_info": { 500 | "codemirror_mode": { 501 | "name": "ipython", 502 | "version": 2 503 | }, 504 | "file_extension": ".py", 505 | "mimetype": "text/x-python", 506 | "name": "python", 507 | "nbconvert_exporter": "python", 508 | "pygments_lexer": "ipython2", 509 | "version": "2.7.6" 510 | } 511 | }, 512 | "nbformat": 4, 513 | "nbformat_minor": 0 514 | } 515 | -------------------------------------------------------------------------------- /CrowdFlower/Blending.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import warnings\n", 13 | "warnings.filterwarnings('ignore')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 41, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "%run scripts/helper.py\n", 25 | "%run scripts/models.py\n", 26 | "%run scripts/blending_helper.py" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 42, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')\n", 38 | "y = y = crowd_train.median_relevance" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 43, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "train_index, test_index = ssSplit(y, train_size=500 ,random_state=1234)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 44, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "Xtrain = crowd_train.iloc[train_index]\n", 61 | "ytrain = y[train_index]\n", 62 | "\n", 63 | "Xtest = crowd_train.iloc[test_index]\n", 64 | "ytest = y.iloc[test_index]" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 45, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "Xtrain_text = tweak_text(Xtrain)\n", 76 | "Xtest_text = tweak_text(Xtest)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 46, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "corpus = []\n", 88 | "\n", 89 | "for x in 
Xtrain_text:\n", 90 | " corpus.append(x)\n", 91 | "\n", 92 | "for x in Xtest_text:\n", 93 | " corpus.append(x)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 47, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "1516" 107 | ] 108 | }, 109 | "execution_count": 47, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "len(corpus)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 60, 121 | "metadata": { 122 | "collapsed": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "from sklearn.cross_validation import StratifiedKFold\n", 127 | "from sklearn.svm import SVC\n", 128 | "from sklearn.linear_model import LogisticRegression" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 49, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "Xtrain_data, tfv, svd, scl = prepareTrainData(Xtrain_text, corpus)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 50, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "Xtest_data = prepareTestData(Xtest_text, tfv, svd, scl) " 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 51, 156 | "metadata": { 157 | "collapsed": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "skf = list(StratifiedKFold(ytrain, 3))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 56, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "clfs = [SVC(C=10.0, kernel='rbf', gamma=.00, probability=True),\n", 173 | " SVC(C=15.0, kernel='linear', probability=True)\n", 174 | " ]" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 57, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "Creating train and test sets for blending.\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "print \"Creating train and test sets for blending.\"\n", 194 | " \n", 195 | "dataset_blend_train = np.zeros((Xtrain_data.shape[0], len(clfs)))\n", 196 | "dataset_blend_test = np.zeros((Xtest_data.shape[0], len(clfs)))" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 58, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "0 SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n", 211 | " kernel='rbf', max_iter=-1, probability=True, random_state=None,\n", 212 | " shrinking=True, tol=0.001, verbose=False)\n", 213 | "Fold 0\n", 214 | "Fold 1\n", 215 | "Fold 2\n", 216 | "1 SVC(C=15.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n", 217 | " kernel='linear', max_iter=-1, probability=True, random_state=None,\n", 218 | " shrinking=True, tol=0.001, verbose=False)\n", 219 | "Fold 0\n", 220 | "Fold 1\n", 221 | "Fold 2\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "for j, clf in enumerate(clfs):\n", 227 | " print j, clf\n", 228 | " dataset_blend_test_j = np.zeros((Xtest_data.shape[0], len(skf)))\n", 229 | " for i, (train, test) in enumerate(skf):\n", 230 | " print \"Fold\", i\n", 231 | " X_train = Xtrain_data[train]\n", 232 | " y_train = ytrain[train]\n", 233 | " X_test = 
Xtrain_data[test]\n", 234 | " y_test = ytrain[test]\n", 235 | " clf.fit(X_train, y_train)\n", 236 | " y_submission = clf.predict_proba(X_test)[:,1]\n", 237 | " dataset_blend_train[test, j] = y_submission\n", 238 | " dataset_blend_test_j[:, i] = clf.predict_proba(Xtest_data)[:,1]\n", 239 | " dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 68, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "Blending.\n" 254 | ] 255 | }, 256 | { 257 | "ename": "ValueError", 258 | "evalue": "Input contains NaN, infinity or a value too large for dtype('float64').", 259 | "output_type": "error", 260 | "traceback": [ 261 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 262 | "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", 263 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[1;34m\"Blending.\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mclf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mLogisticRegression\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mclf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset_blend_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mytrain\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[1;31m# y_submission = clf.predict_proba(dataset_blend_test)[:,1]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 264 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\sklearn\\linear_model\\logistic.pyc\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 1015\u001b[0m % self.C)\n\u001b[0;32m 1016\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1017\u001b[1;33m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'csr'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfloat64\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"C\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1018\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclasses_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1019\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msolver\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'liblinear'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'newton-cg'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'lbfgs'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 265 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\sklearn\\utils\\validation.pyc\u001b[0m in \u001b[0;36mcheck_X_y\u001b[1;34m(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, 
ensure_min_samples, ensure_min_features, y_numeric)\u001b[0m\n\u001b[0;32m 443\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 444\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcolumn_or_1d\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mwarn\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 445\u001b[1;33m \u001b[0m_assert_all_finite\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 446\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0my_numeric\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkind\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'O'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 447\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfloat64\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 266 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\sklearn\\utils\\validation.pyc\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[1;34m(X)\u001b[0m\n\u001b[0;32m 50\u001b[0m and not np.isfinite(X).all()):\n\u001b[0;32m 51\u001b[0m raise ValueError(\"Input contains NaN, infinity\"\n\u001b[1;32m---> 52\u001b[1;33m \" or a value too large for %r.\" % X.dtype)\n\u001b[0m\u001b[0;32m 53\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 54\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 267 | "\u001b[1;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float64')." 
268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "print \"Blending.\"\n", 273 | "clf = LogisticRegression()\n", 274 | "clf.fit(dataset_blend_train, ytrain)\n", 275 | "# y_submission = clf.predict_proba(dataset_blend_test)[:,1]" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 71, 281 | "metadata": { 282 | "collapsed": false 283 | }, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "False" 289 | ] 290 | }, 291 | "execution_count": 71, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "(dataset_blend_train == np.nan).any()" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": { 304 | "collapsed": true 305 | }, 306 | "outputs": [], 307 | "source": [] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [] 317 | } 318 | ], 319 | "metadata": { 320 | "kernelspec": { 321 | "display_name": "Python 2", 322 | "language": "python", 323 | "name": "python2" 324 | }, 325 | "language_info": { 326 | "codemirror_mode": { 327 | "name": "ipython", 328 | "version": 2 329 | }, 330 | "file_extension": ".py", 331 | "mimetype": "text/x-python", 332 | "name": "python", 333 | "nbconvert_exporter": "python", 334 | "pygments_lexer": "ipython2", 335 | "version": "2.7.6" 336 | } 337 | }, 338 | "nbformat": 4, 339 | "nbformat_minor": 0 340 | } 341 | -------------------------------------------------------------------------------- /CrowdFlower/EnsembleAllModels.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy\n", 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 15, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "best_score = pd.read_csv('./submissions/best_score.csv')\n", 24 | "three_ensemble = pd.read_csv('./submissions/3ensemble.csv')" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 16, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "all_preds = pd.DataFrame({'best_score': best_score.prediction,\n", 36 | " 'three_ensemble': three_ensemble.prediction\n", 37 | " })" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 17, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/html": [ 50 | "
\n", 51 | "\n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | "
best_scorethree_ensemble
best_score 1.000000 0.785756
three_ensemble 0.785756 1.000000
\n", 72 | "
" 73 | ], 74 | "text/plain": [ 75 | " best_score three_ensemble\n", 76 | "best_score 1.000000 0.785756\n", 77 | "three_ensemble 0.785756 1.000000" 78 | ] 79 | }, 80 | "execution_count": 17, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "all_preds.corr()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 18, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "preds = (all_preds.best_score + all_preds.three_ensemble) / 2" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 19, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "import math\n", 109 | "\n", 110 | "preds = [int(math.floor(x)) for x in preds]" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 20, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "[4, 3, 3, 2, 4, 4, 4, 3, 4, 2]" 124 | ] 125 | }, 126 | "execution_count": 20, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "preds[:10]" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 21, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "%run scripts/helper.py" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 22, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "make_submission(unprocessed_ensemble.id, preds, 'ensemble_best_three_ensemble.csv')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "collapsed": true 162 | }, 163 | "outputs": [], 164 | "source": [] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 2", 170 | "language": "python", 171 | "name": "python2" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 2 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython2", 183 | "version": "2.7.6" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 0 188 | } 189 | -------------------------------------------------------------------------------- /CrowdFlower/EnsembleFiles.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "from glob import glob" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 4, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "all_submissions = {}\n", 25 | "for i, filename in enumerate(glob('./submissions/*.csv')):\n", 26 | " all_submissions[i] = pd.read_csv(filename).prediction" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 6, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "all_submissions_df = pd.DataFrame(all_submissions)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 24, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "submissions_corr = 
all_submissions_df.corr()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 35, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "uncorrelated_submissions_pair = []\n", 60 | "\n", 61 | "for i in range(len(all_submissions)):\n", 62 | " for j in range(len(all_submissions)):\n", 63 | " if i != j:\n", 64 | " if submissions_corr.ix[i, j] < .75:\n", 65 | " uncorrelated_submissions_pair.append((i, j))" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 16, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "ids = pd.read_csv('./submissions/3ensemble.csv').id" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 18, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "# Averaging all the submissions\n", 88 | "average_of_all_submissions = all_submissions_df.apply(np.mean, axis=1).map(lambda x: int(x))" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 20, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "%run scripts/helper.py" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 22, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "make_submission(ids, average_of_all_submissions, 'average_of_all_submissions.csv')" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "collapsed": true 118 | }, 119 | "outputs": [], 120 | "source": [] 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "display_name": "Python 2", 126 | "language": "python", 127 | "name": "python2" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 2 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython2", 139 | "version": "2.7.6" 140 | } 141 | }, 142 | "nbformat": 4, 143 | "nbformat_minor": 0 144 | } 145 | -------------------------------------------------------------------------------- /CrowdFlower/EnsembleSVCandNB.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "import seaborn as sns\n", 15 | "%matplotlib inline" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "%run scripts/helper.py" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')\n", 38 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "# fill in the missing np.nan values with empty string\n", 50 | "crowd_train.fillna('', inplace=True, axis=1)\n", 51 | "crowd_test.fillna('', inplace=True, axis=1)" 52 | ] 53 | }, 54 | { 55 | "cell_type": 
"code", 56 | "execution_count": 5, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "traindata = list(crowd_train.apply(lambda x: '%s %s %s' %(x['query'], x['product_title'], x['product_description']), axis=1))\n", 63 | "testdata = list(crowd_test.apply(lambda x: '%s %s %s' %(x['query'], x['product_title'], x['product_description']), axis=1))" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 6, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "y = crowd_train.median_relevance.values" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### Train a support vector machine" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 7, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "from sklearn.feature_extraction.text import TfidfVectorizer" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 41, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "tfv = TfidfVectorizer(min_df=1, max_features=None,\n", 104 | " strip_accents='unicode', analyzer='word',token_pattern=r'\\w{1,}',\n", 105 | " ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,\n", 106 | " stop_words = 'english')" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": true 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "tfv.fit(traindata)\n", 118 | "X = tfv.transform(traindata)\n", 119 | "X_test = tfv.transform(testdata)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 10, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "from sklearn.decomposition import TruncatedSVD\n", 131 | "from sklearn.preprocessing import StandardScaler" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 11, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "svd = TruncatedSVD(n_components=140)\n", 143 | "X_svd = svd.fit_transform(X)\n", 144 | "X_test_svd = svd.transform(X_test)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 12, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "scl = StandardScaler()\n", 156 | "X_svd_scl = scl.fit_transform(X_svd)\n", 157 | "X_test_svd_scl = scl.transform(X_test_svd)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 13, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "from sklearn.svm import SVC" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 14, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "svc = SVC(C=10.0, gamma=.01)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 15, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,\n", 193 | " gamma=0.01, kernel='rbf', max_iter=-1, probability=False,\n", 194 | " random_state=None, shrinking=True, tol=0.001, verbose=False)" 195 | ] 196 | }, 197 | "execution_count": 15, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | 
"svc.fit(X_svd_scl, y)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "### Train a Multinomial NB classifier" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 16, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "from sklearn.naive_bayes import MultinomialNB" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 33, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "classifier = MultinomialNB(alpha=0.1).fit(X, y)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 34, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "0.7186444813827411" 246 | ] 247 | }, 248 | "execution_count": 34, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "quadratic_weighted_kappa(y, classifier.predict(X))" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "### Ensemble their predictions" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 19, 267 | "metadata": { 268 | "collapsed": true 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "svc_pred = svc.predict(X_test_svd_scl)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 35, 278 | "metadata": { 279 | "collapsed": true 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "nb_predict = classifier.predict(X_test)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 37, 289 | "metadata": { 290 | "collapsed": false 291 | }, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "(array([4, 3, 3, 2, 4, 4, 4, 4, 4, 2], dtype=int64),\n", 297 | " array([4, 4, 3, 3, 4, 4, 4, 4, 4, 4], dtype=int64))" 298 | ] 299 | }, 300 | "execution_count": 37, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "svc_pred[:10], nb_predict[:10]" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 38, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "ensemble_predict = (svc_pred + nb_predict) / 2" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 39, 323 | "metadata": { 324 | "collapsed": false 325 | }, 326 | "outputs": [ 327 | { 328 | "data": { 329 | "text/plain": [ 330 | "array([4, 3, 3, 2, 4, 4, 4, 4, 4, 3], dtype=int64)" 331 | ] 332 | }, 333 | "execution_count": 39, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "ensemble_predict[:10]" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 40, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "# Create your first submission file\n", 351 | "submission = pd.DataFrame({\"id\": crowd_test.index.values.astype(int), \"prediction\": ensemble_predict})\n", 352 | "submission.to_csv(\"./submissions/ensembleNBAndSVCoptimized.csv\", index=False)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": { 359 | "collapsed": true 360 | }, 361 | "outputs": [], 362 | "source": [] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": { 368 | "collapsed": true 369 | }, 370 | "outputs": [], 371 | "source": 
[] 372 | } 373 | ], 374 | "metadata": { 375 | "kernelspec": { 376 | "display_name": "Python 2", 377 | "language": "python", 378 | "name": "python2" 379 | }, 380 | "language_info": { 381 | "codemirror_mode": { 382 | "name": "ipython", 383 | "version": 2 384 | }, 385 | "file_extension": ".py", 386 | "mimetype": "text/x-python", 387 | "name": "python", 388 | "nbconvert_exporter": "python", 389 | "pygments_lexer": "ipython2", 390 | "version": "2.7.6" 391 | } 392 | }, 393 | "nbformat": 4, 394 | "nbformat_minor": 0 395 | } 396 | -------------------------------------------------------------------------------- /CrowdFlower/FeatureSelection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import warnings\n", 12 | "warnings.filterwarnings('ignore')\n", 13 | "from sklearn.pipeline import Pipeline" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "%run query_features.py\n", 25 | "%run scripts/helper.py\n", 26 | "%run scripts/model_train_plus_test.py" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "crowd_train = load_file('./data/train.csv/train.csv', None)\n", 38 | "crowd_test = load_file('./data/test.csv/test.csv', None)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "crowd_train = crowd_train[crowd_train.relevance_variance < 0.5]" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 5, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "target = crowd_train.median_relevance.values" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 49, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "# train_index, test_index = ssSplit(target, train_size=8000, random_state=44)\n", 72 | "train_index, test_index = ssSplit(target, train_size=1000, random_state=44)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 50, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "Xt = crowd_train.iloc[train_index]\n", 84 | "Xv = crowd_train.iloc[test_index]\n", 85 | "\n", 86 | "# Xt = crowd_train\n", 87 | "# Xv = crowd_test" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 51, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "yt = target[train_index]\n", 99 | "yv = target[test_index]\n", 100 | "\n", 101 | "# yt = target" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 52, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "Xt_tweaked = tweak_text(Xt)\n", 113 | "Xv_tweaked = tweak_text(Xv)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 53, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "Xfitted, tfv = TFIDF(Xt_tweaked, None)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 54, 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | 
"outputs": [], 134 | "source": [ 135 | "svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)\n", 136 | "\n", 137 | "scl = StandardScaler(copy=True, with_mean=True, with_std=True)\n", 138 | "\n", 139 | "clf = SVC(C=10.0, kernel='linear', degree=3, \n", 140 | " gamma=0.0, coef0=0.0, shrinking=True, probability=False, \n", 141 | " tol=0.001, cache_size=200, class_weight=None, \n", 142 | " verbose=False, max_iter=-1, random_state=None)\n", 143 | "\n", 144 | "keywords = keyword_counter(Xt)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 55, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "features = stack([keywords, Xfitted])" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 56, 161 | "metadata": { 162 | "collapsed": false 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "features_svd = svd.fit_transform(features)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 57, 172 | "metadata": { 173 | "collapsed": false 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "features_scl = scl.fit_transform(features_svd)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "### Feature selection" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 58, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "kappa_scorer = get_kappa_scorer()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "selector = feature_selection(features_scl, yt, clf, 1, None, kappa_scorer, None, 0)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 26, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/plain": [ 219 | "array([ 0.09551443, 0.08587044, 0.09263525, 0.10222593, 0.10446155,\n", 220 | " 0.11196421, 0.11980537, 0.1532042 , 0.18316295, 0.18411899,\n", 221 | " 0.19283421, 0.18717257, 0.23314848, 0.203963 , 0.20825379,\n", 222 | " 0.26279836, 0.30424894, 0.32203741, 0.30186082, 0.25278024,\n", 223 | " 0.31742312, 0.32569698, 0.30012057, 0.27447408, 0.31833882,\n", 224 | " 0.31342121, 0.31756955, 0.26548062, 0.22389084, 0.23368437,\n", 225 | " 0.21912735, 0.21356475, 0.21658899, 0.18873339, 0.197842 ,\n", 226 | " 0.21829272, 0.20929388, 0.21696621, 0.21999229, 0.24068457,\n", 227 | " 0.23622419, 0.20397825, 0.21827953, 0.21135912, 0.23446343,\n", 228 | " 0.19750452, 0.20252151, 0.18871767, 0.17076826, 0.10988198,\n", 229 | " 0.13213942, 0.14247887, 0.17133736, 0.16121156, 0.13371489,\n", 230 | " 0.16145429, 0.16127367, 0.17602269, 0.17288821, 0.15778934,\n", 231 | " 0.17682649, 0.15262014, 0.17968618, 0.17732637, 0.17032855,\n", 232 | " 0.17693857, 0.16577611, 0.1756472 , 0.14748799, 0.13333879,\n", 233 | " 0.13531875, 0.14256728, 0.14421636, 0.1297998 , 0.12999505,\n", 234 | " 0.13111776, 0.13347969, 0.13054624, 0.13066804, 0.11786777,\n", 235 | " 0.10748329, 0.12599375, 0.13164103, 0.12126069, 0.13172245,\n", 236 | " 0.14973141, 0.1381632 , 0.14347926, 0.1494687 , 0.11836732,\n", 237 | " 0.14209801, 0.11081288, 0.12052675, 0.13134992, 0.15391132,\n", 238 | " 0.16443798, 0.19489916, 0.18679589, 0.19408882, 0.19987597,\n", 239 | " 0.19325446, 0.20296309, 0.18869364, 0.17369716, 0.17375526,\n", 240 | " 
0.15641049, 0.16361439, 0.15484795, 0.15943795, 0.16955748,\n", 241 | " 0.1614979 , 0.17480293, 0.14640037, 0.147536 , 0.14943605,\n", 242 | " 0.15927827, 0.16448356, 0.16394893, 0.14914186, 0.15109373,\n", 243 | " 0.148167 , 0.12231477, 0.171927 , 0.1467997 , 0.15001508,\n", 244 | " 0.14589149, 0.14230615, 0.15261398, 0.15127757, 0.15941927,\n", 245 | " 0.13826314, 0.12325342, 0.11749912, 0.11941622, 0.1273668 ,\n", 246 | " 0.13661811, 0.11678706, 0.12084 , 0.13081467, 0.13644204,\n", 247 | " 0.14650248, 0.13725575, 0.12880575, 0.13964549, 0.11886544,\n", 248 | " 0.10889425, 0.1252021 , 0.1262044 , 0.13128633, 0.1070466 ,\n", 249 | " 0.10620304, 0.11012552, 0.12194561, 0.1185141 , 0.10210466,\n", 250 | " 0.12612228, 0.13550932, 0.1437801 , 0.13409203, 0.13157652,\n", 251 | " 0.13754568, 0.13349481, 0.1134101 , 0.1084755 , 0.11184213,\n", 252 | " 0.12546086, 0.10496072, 0.10044754, 0.11875523, 0.12692686,\n", 253 | " 0.12648058, 0.11382819, 0.10869305, 0.10664844, 0.09421044,\n", 254 | " 0.11362754, 0.12713682, 0.12993287, 0.12456679, 0.12112542,\n", 255 | " 0.12439039, 0.12986643, 0.13964398, 0.13674094, 0.11873711,\n", 256 | " 0.12049872, 0.13033784, 0.12472941, 0.14054241, 0.13845084,\n", 257 | " 0.14617703, 0.14190381, 0.12295125, 0.12317129, 0.1206635 ,\n", 258 | " 0.10517095, 0.10900798, 0.10670199, 0.11276843, 0.10644653])" 259 | ] 260 | }, 261 | "execution_count": 26, 262 | "metadata": {}, 263 | "output_type": "execute_result" 264 | } 265 | ], 266 | "source": [ 267 | "selector.grid_scores_" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 27, 273 | "metadata": { 274 | "collapsed": true 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "best_estimator = selector.estimator_" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 36, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "22" 292 | ] 293 | }, 294 | "execution_count": 36, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "selector." 
301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "### Predict on test set" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 28, 313 | "metadata": { 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "keywords_test = keyword_counter(Xv)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 29, 324 | "metadata": { 325 | "collapsed": true 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "Xtest_fitted = tfv.transform(Xv_tweaked)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 30, 335 | "metadata": { 336 | "collapsed": true 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "features_test = stack([keywords_test, Xtest_fitted])" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 32, 346 | "metadata": { 347 | "collapsed": true 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "features_svd_test = svd.transform(features_test)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 33, 357 | "metadata": { 358 | "collapsed": true 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "features_scl_test = scl.transform(features_svd_test)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 37, 368 | "metadata": { 369 | "collapsed": true 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "features_selected_test = selector.transform(features_scl_test)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 39, 379 | "metadata": { 380 | "collapsed": false 381 | }, 382 | "outputs": [ 383 | { 384 | "name": "stdout", 385 | "output_type": "stream", 386 | "text": [ 387 | "Weighted kappa score on test set 0.2867 \n" 388 | ] 389 | } 390 | ], 391 | "source": [ 392 | "print 'Weighted kappa score on test set %0.4f ' % quadratic_weighted_kappa(yv, best_estimator.predict(features_selected_test))" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": { 399 | "collapsed": true 400 | }, 401 | "outputs": [], 402 | "source": [] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": { 408 | "collapsed": true 409 | }, 410 | "outputs": [], 411 | "source": [] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": { 417 | "collapsed": true 418 | }, 419 | "outputs": [], 420 | "source": [] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "collapsed": true 427 | }, 428 | "outputs": [], 429 | "source": [] 430 | } 431 | ], 432 | "metadata": { 433 | "kernelspec": { 434 | "display_name": "Python 2", 435 | "language": "python", 436 | "name": "python2" 437 | }, 438 | "language_info": { 439 | "codemirror_mode": { 440 | "name": "ipython", 441 | "version": 2 442 | }, 443 | "file_extension": ".py", 444 | "mimetype": "text/x-python", 445 | "name": "python", 446 | "nbconvert_exporter": "python", 447 | "pygments_lexer": "ipython2", 448 | "version": "2.7.6" 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 0 453 | } 454 | -------------------------------------------------------------------------------- /CrowdFlower/KNN distance processed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run 
scripts/helper.py\n", 12 | "%run scripts/models.py" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "y = crowd_train.median_relevance.values" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 5, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "from sklearn.cross_validation import cross_val_score\n", 46 | "from sklearn.metrics import make_scorer" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 6, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "kappa_scorer = make_scorer(quadratic_weighted_kappa, greater_is_better = True)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 7, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [ 67 | { 68 | "name": "stderr", 69 | "output_type": "stream", 70 | "text": [ 71 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65497012.jpg\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 72 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)\n", 73 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65516012.jpg\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 74 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)\n", 75 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/6552101\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 76 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' 
% markup)\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "X = tweak_text(crowd_train)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 8, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 93 | "from sklearn.decomposition import TruncatedSVD\n", 94 | "from sklearn.preprocessing import StandardScaler" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 9, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "tfv = TfidfVectorizer(min_df=3, max_df=500, max_features=None, \n", 106 | " strip_accents='unicode', analyzer='word',token_pattern=r'\\w{1,}',\n", 107 | " ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,\n", 108 | " stop_words = 'english')\n", 109 | "\n", 110 | "svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)\n", 111 | "scl = StandardScaler(copy=True, with_mean=True, with_std=True)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 10, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "X_tfv = tfv.fit_transform(X)\n", 123 | "X_svd = svd.fit_transform(X_tfv)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 22, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "from sklearn.neighbors import KNeighborsClassifier\n", 135 | "\n", 136 | "knn = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='brute')" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 21, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "min score 0.5001, max score 0.5005 and mean score 0.5003 \n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "scores = cross_val_score(knn, X_svd, y, scoring=kappa_scorer, n_jobs=1, cv=2)\n", 156 | "print 'min score %0.4f, max score %0.4f and mean score %0.4f ' %(scores.min(), scores.max(), scores.mean())" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 17, 162 | "metadata": { 163 | "collapsed": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "tweaked_model, tfv, svd = build_knn_model(X, y, 'distance')" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 29, 173 | "metadata": { 174 | "collapsed": false 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 30, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "Xtest = tweak_text(crowd_test)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 32, 195 | "metadata": { 196 | "collapsed": false 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "predictions = []\n", 201 | "for i in range(0, len(Xtest), 5000):\n", 202 | " preds = knn_model_predictions(tweaked_model, tfv, svd, Xtest[i:i+5000])\n", 203 | " predictions.append(preds)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 33, 209 | "metadata": { 210 | "collapsed": true 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "all_preds = np.hstack(predictions)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | 
"execution_count": 34, 220 | "metadata": { 221 | "collapsed": false 222 | }, 223 | "outputs": [ 224 | { 225 | "data": { 226 | "text/plain": [ 227 | "(22513,)" 228 | ] 229 | }, 230 | "execution_count": 34, 231 | "metadata": {}, 232 | "output_type": "execute_result" 233 | } 234 | ], 235 | "source": [ 236 | "all_preds.shape" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 35, 242 | "metadata": { 243 | "collapsed": true 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "make_submission(crowd_test.index.values.astype(int), all_preds, './model-submissions/knn_distance_processed.csv')" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": true 255 | }, 256 | "outputs": [], 257 | "source": [] 258 | } 259 | ], 260 | "metadata": { 261 | "kernelspec": { 262 | "display_name": "Python 2", 263 | "language": "python", 264 | "name": "python2" 265 | }, 266 | "language_info": { 267 | "codemirror_mode": { 268 | "name": "ipython", 269 | "version": 2 270 | }, 271 | "file_extension": ".py", 272 | "mimetype": "text/x-python", 273 | "name": "python", 274 | "nbconvert_exporter": "python", 275 | "pygments_lexer": "ipython2", 276 | "version": "2.7.6" 277 | } 278 | }, 279 | "nbformat": 4, 280 | "nbformat_minor": 0 281 | } 282 | -------------------------------------------------------------------------------- /CrowdFlower/Knn unprocessed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run scripts/helper.py\n", 12 | "%run scripts/models.py" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')\n", 24 | "y = crowd_train.median_relevance.values" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "X = prepareText(crowd_train)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "knn_model_word, tfv_knn, svd_knn = build_knn_model(X, y, 'distance', 'word')" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 6, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "Xtest = prepareText(crowd_test)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 8, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "predictions = []\n", 80 | "for i in range(0, len(Xtest), 5000):\n", 81 | " preds = knn_model_predictions(knn_model_word, tfv_knn, svd_knn, Xtest[i:i+5000])\n", 82 | " predictions.append(preds)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 9, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "all_preds = np.hstack(predictions)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 11, 
99 | "metadata": { 100 | "collapsed": true 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "make_submission(crowd_test.index.values.astype(int), all_preds, './model-submissions/knn_unprocessed.csv')" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "outputs": [], 132 | "source": [] 133 | } 134 | ], 135 | "metadata": { 136 | "kernelspec": { 137 | "display_name": "Python 2", 138 | "language": "python", 139 | "name": "python2" 140 | }, 141 | "language_info": { 142 | "codemirror_mode": { 143 | "name": "ipython", 144 | "version": 2 145 | }, 146 | "file_extension": ".py", 147 | "mimetype": "text/x-python", 148 | "name": "python", 149 | "nbconvert_exporter": "python", 150 | "pygments_lexer": "ipython2", 151 | "version": "2.7.6" 152 | } 153 | }, 154 | "nbformat": 4, 155 | "nbformat_minor": 0 156 | } 157 | -------------------------------------------------------------------------------- /CrowdFlower/Linear unprocessed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run scripts/helper.py\n", 12 | "%run scripts/models.py" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')\n", 24 | "y = crowd_train.median_relevance.values" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "X = prepareText(crowd_train)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "linear_model, tfv_linear, select_linear = build_linear_model(X, y, 'word')" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 6, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "Xtest = prepareText(crowd_test)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "predictions = linear_model_predictions(linear_model, tfv_linear, select_linear, Xtest)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "make_submission(crowd_test.index.values.astype(int), predictions, './model-submissions/lin_unprocessed.csv')" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": true 98 | }, 99 | "outputs": [], 100 | "source": [] 101 | }, 
102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [] 110 | } 111 | ], 112 | "metadata": { 113 | "kernelspec": { 114 | "display_name": "Python 2", 115 | "language": "python", 116 | "name": "python2" 117 | }, 118 | "language_info": { 119 | "codemirror_mode": { 120 | "name": "ipython", 121 | "version": 2 122 | }, 123 | "file_extension": ".py", 124 | "mimetype": "text/x-python", 125 | "name": "python", 126 | "nbconvert_exporter": "python", 127 | "pygments_lexer": "ipython2", 128 | "version": "2.7.6" 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 0 133 | } 134 | -------------------------------------------------------------------------------- /CrowdFlower/Non Linear Processed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 18, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run scripts/helper.py\n", 12 | "%run scripts/models.py" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "y = crowd_train.median_relevance.values" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 4, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "from sklearn.cross_validation import cross_val_score\n", 46 | "from sklearn.metrics import make_scorer" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "kappa_scorer = make_scorer(quadratic_weighted_kappa, greater_is_better = True)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 6, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [ 67 | { 68 | "name": "stderr", 69 | "output_type": "stream", 70 | "text": [ 71 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65497012.jpg\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 72 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)\n", 73 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65516012.jpg\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 74 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' 
% markup)\n", 75 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/6552101\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 76 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "X = tweak_text(crowd_train)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 7, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 93 | "from sklearn.decomposition import TruncatedSVD\n", 94 | "from sklearn.preprocessing import StandardScaler" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 14, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "tfv = TfidfVectorizer(min_df=3, max_df=500, max_features=None, \n", 106 | " strip_accents='unicode', analyzer='word',token_pattern=r'\\w{1,}',\n", 107 | " ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,\n", 108 | " stop_words = 'english')\n", 109 | "\n", 110 | "svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)\n", 111 | "scl = StandardScaler(copy=True, with_mean=True, with_std=True)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 15, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "X_tfv = tfv.fit_transform(X)\n", 123 | "X_svd = svd.fit_transform(X_tfv)\n", 124 | "X_scl = scl.fit_transform(X_svd)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 16, 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "from sklearn.svm import SVC\n", 136 | "\n", 137 | "clf = SVC(C=10.0, kernel='rbf', degree=3, \n", 138 | " gamma=0.0, coef0=0.0, shrinking=True, probability=False, \n", 139 | " tol=0.001, cache_size=200, class_weight=None, \n", 140 | " verbose=False, max_iter=-1, random_state=None)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 17, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "min score 0.5302, max score 0.5406 and mean score 0.5354 \n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "scores = cross_val_score(clf, X_scl, y, scoring=kappa_scorer, n_jobs=1, cv=2)\n", 160 | "print 'min score %0.4f, max score %0.4f and mean score %0.4f ' %(scores.min(), scores.max(), scores.mean())" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 20, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "tweaked_model, tfv, svd, scl = build_non_linear_model(X, y)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 21, 177 | "metadata": { 178 | "collapsed": true 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 22, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | 
"outputs": [ 192 | { 193 | "name": "stderr", 194 | "output_type": "stream", 195 | "text": [ 196 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65527\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 197 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)\n", 198 | "C:\\Anaconda\\lib\\site-packages\\bs4\\__init__.py:176: UserWarning: \"http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januarya/14146012.jpg\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.\n", 199 | " '\"%s\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "Xtest = tweak_text(crowd_test)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 23, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "preds = non_linear_model_predictions(tweaked_model, tfv, svd, scl, Xtest)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 24, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "make_submission(crowd_test.index.values.astype(int), preds, './model-submissions/non_linear_processed.csv')" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "kernelspec": { 241 | "display_name": "Python 2", 242 | "language": "python", 243 | "name": "python2" 244 | }, 245 | "language_info": { 246 | "codemirror_mode": { 247 | "name": "ipython", 248 | "version": 2 249 | }, 250 | "file_extension": ".py", 251 | "mimetype": "text/x-python", 252 | "name": "python", 253 | "nbconvert_exporter": "python", 254 | "pygments_lexer": "ipython2", 255 | "version": "2.7.6" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 0 260 | } 261 | -------------------------------------------------------------------------------- /CrowdFlower/Non Linear SVM unprocessed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## SVM model on unprocessed text" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%run scripts/helper.py\n", 19 | "%run scripts/models.py" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "y = crowd_train.median_relevance.values" 42 | ] 43 | }, 44 
| { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "from sklearn.cross_validation import cross_val_score\n", 53 | "from sklearn.metrics import make_scorer" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 5, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "kappa_scorer = make_scorer(quadratic_weighted_kappa, greater_is_better = True)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 6, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "X = prepareText(crowd_train)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 7, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "non_linear_model, tfv, svd, scl = build_non_linear_model(X, y, 'word')" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 8, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 9, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "Xtest = prepareText(crowd_test)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 10, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "predictions = non_linear_model_predictions(non_linear_model, tfv, svd, scl, Xtest)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 32, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "make_submission(crowd_test.index.values.astype(int), predictions, './model-submissions/non_lin_unprocessed.csv')" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": true 138 | }, 139 | "outputs": [], 140 | "source": [] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 2", 146 | "language": "python", 147 | "name": "python2" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 2 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython2", 159 | "version": "2.7.6" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 0 164 | } 165 | -------------------------------------------------------------------------------- /CrowdFlower/OptimizeSVC.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "%run scripts/helper.py" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "crowd_train = load_file('./data/train.csv/train.csv', 
index_col='id')\n", 37 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "# fill in the missing np.nan values with empty string\n", 49 | "crowd_train.fillna('', inplace=True, axis=1)\n", 50 | "crowd_test.fillna('', inplace=True, axis=1)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "metadata": { 57 | "collapsed": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "traindata = list(crowd_train.apply(lambda x: '%s %s %s' %(x['query'], x['product_title'], x['product_description']), axis=1))\n", 62 | "testdata = list(crowd_test.apply(lambda x: '%s %s %s' %(x['query'], x['product_title'], x['product_description']), axis=1))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 6, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "y = crowd_train.median_relevance.values" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 7, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "from sklearn.feature_extraction.text import TfidfVectorizer" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 51, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "tfv = TfidfVectorizer(min_df=3, max_df=0.8, max_features=None,\n", 96 | " strip_accents='unicode', analyzer='word',token_pattern=r'\\w{1,}',\n", 97 | " ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,\n", 98 | " stop_words = 'english')" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 52, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "tfv.fit(traindata)\n", 110 | "X = tfv.transform(traindata)\n", 111 | "X_test = tfv.transform(testdata)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 53, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "from sklearn.cross_validation import train_test_split\n", 123 | "from sklearn.cross_validation import cross_val_score" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 54, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "Xt, Xv, yt, yv = train_test_split(X, y, test_size=0.2, random_state=0)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 55, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "(8126, 44460) (2032, 44460) (8126,) (2032,)\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "print Xt.shape, Xv.shape, yt.shape, yv.shape" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 56, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "from sklearn.decomposition import TruncatedSVD\n", 165 | "from sklearn.preprocessing import StandardScaler" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 57, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "svd = TruncatedSVD(n_components=140)\n", 177 | "Xt_svd = svd.fit_transform(Xt)\n", 178 | "Xv_svd = svd.transform(Xv)" 179 | ] 180 | }, 181 | { 182 | 
"cell_type": "code", 183 | "execution_count": 58, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "# scale features\n", 190 | "\n", 191 | "scl = StandardScaler()\n", 192 | "Xt_svd_scl = scl.fit_transform(Xt_svd)\n", 193 | "Xv_svd_scl = scl.transform(Xv_svd)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 59, 199 | "metadata": { 200 | "collapsed": true 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "from sklearn.metrics import make_scorer\n", 205 | "# Weighted kappa scorer\n", 206 | "kappa_scorer = make_scorer(quadratic_weighted_kappa, greater_is_better=True)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 60, 212 | "metadata": { 213 | "collapsed": true 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "# cross validation\n", 218 | "from sklearn.cross_validation import ShuffleSplit\n", 219 | "from sklearn.svm import SVC" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 61, 225 | "metadata": { 226 | "collapsed": true 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "svc = SVC(C=10.0, gamma=.01)\n", 231 | "cv = ShuffleSplit(Xt_svd_scl.shape[0], n_iter=2, test_size=.1, random_state=1724)\n", 232 | "\n", 233 | "test_scores = cross_val_score(svc, Xt_svd_scl, yt, cv=cv, scoring=kappa_scorer, n_jobs=1)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 62, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "min score 0.472, mean score 0.501 and max score 0.529\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "print 'min score %0.3f, mean score %0.3f and max score %0.3f' %(test_scores.min(), test_scores.mean(), test_scores.max())" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 63, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "from sklearn.ensemble import RandomForestClassifier" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 64, 269 | "metadata": { 270 | "collapsed": true 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "rf = RandomForestClassifier(n_estimators=100)\n", 275 | "test_scores = cross_val_score(rf, Xt_svd_scl, yt, cv=cv, scoring=kappa_scorer, n_jobs=1)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 65, 281 | "metadata": { 282 | "collapsed": false 283 | }, 284 | "outputs": [ 285 | { 286 | "name": "stdout", 287 | "output_type": "stream", 288 | "text": [ 289 | "min score 0.218, mean score 0.259 and max score 0.299\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "print 'min score %0.3f, mean score %0.3f and max score %0.3f' %(test_scores.min(), test_scores.mean(), test_scores.max())" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "collapsed": true 302 | }, 303 | "outputs": [], 304 | "source": [] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "collapsed": true 311 | }, 312 | "outputs": [], 313 | "source": [] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": { 328 | "collapsed": true 329 | }, 330 | "outputs": [], 331 | "source": [] 332 | 
}, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": { 337 | "collapsed": true 338 | }, 339 | "outputs": [], 340 | "source": [] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "collapsed": true 356 | }, 357 | "outputs": [], 358 | "source": [] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "collapsed": true 374 | }, 375 | "outputs": [], 376 | "source": [] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": { 391 | "collapsed": true 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "crowd" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "collapsed": true 403 | }, 404 | "outputs": [], 405 | "source": [] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": { 411 | "collapsed": true 412 | }, 413 | "outputs": [], 414 | "source": [] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": { 420 | "collapsed": true 421 | }, 422 | "outputs": [], 423 | "source": [] 424 | } 425 | ], 426 | "metadata": { 427 | "kernelspec": { 428 | "display_name": "Python 2", 429 | "language": "python", 430 | "name": "python2" 431 | }, 432 | "language_info": { 433 | "codemirror_mode": { 434 | "name": "ipython", 435 | "version": 2 436 | }, 437 | "file_extension": ".py", 438 | "mimetype": "text/x-python", 439 | "name": "python", 440 | "nbconvert_exporter": "python", 441 | "pygments_lexer": "ipython2", 442 | "version": "2.7.6" 443 | } 444 | }, 445 | "nbformat": 4, 446 | "nbformat_minor": 0 447 | } 448 | -------------------------------------------------------------------------------- /CrowdFlower/SpellCorrection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 191, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import sys\n", 12 | "reload(sys)\n", 13 | "sys.setdefaultencoding('utf8')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 178, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import warnings\n", 25 | "warnings.filterwarnings('ignore')\n", 26 | "from sklearn.pipeline import Pipeline" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 179, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "%run query_features.py\n", 38 | "%run scripts/helper.py\n", 39 | "%run scripts/model_train_plus_test.py" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 180, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "crowd_train = load_file('./data/train.csv/train.csv', None)\n", 51 | "crowd_test = load_file('./data/test.csv/test.csv', None)" 52 | ] 53 | }, 54 | { 55 | 
"cell_type": "code", 56 | "execution_count": 181, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "# crowd_train = crowd_train[crowd_train.relevance_variance < 0.5]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 158, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "target = crowd_train.median_relevance.values" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 182, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "# train_index, test_index = ssSplit(target, train_size=8000, random_state=44)\n", 85 | "# train_index, test_index = ssSplit(target, train_size=500, random_state=44)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 183, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "Xt = crowd_train.iloc[train_index]\n", 97 | "Xv = crowd_train.iloc[test_index]\n", 98 | "\n", 99 | "# Xt = crowd_train\n", 100 | "# Xv = crowd_test" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 184, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "yt = target[train_index]\n", 112 | "yv = target[test_index]\n", 113 | "\n", 114 | "# yt = target" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 185, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "correct_map = build_query_correction_map(Xt, crowd_test)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 186, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "def spell_correct_query(x):\n", 137 | " if x not in correct_map:\n", 138 | " return x\n", 139 | " else:\n", 140 | " return correct_map[x]\n", 141 | " \n", 142 | "Xt['query'] = Xt['query'].map(spell_correct_query)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 187, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "Xv['query'] = Xv['query'].map(spell_correct_query)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 188, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "Xt_tweaked = tweak_text(Xt)\n", 165 | "Xv_tweaked = tweak_text(Xv)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 189, 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "Xfitted, tfv = TFIDF(Xt_tweaked, None)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 192, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)\n", 188 | "\n", 189 | "scl = StandardScaler(copy=True, with_mean=True, with_std=True)\n", 190 | "\n", 191 | "clf = SVC(C=10.0, kernel='rbf', degree=3, \n", 192 | " gamma=0.0, coef0=0.0, shrinking=True, probability=False, \n", 193 | " tol=0.001, cache_size=200, class_weight=None, \n", 194 | " verbose=False, max_iter=-1, random_state=None)\n", 195 | "\n", 196 | "keywords = keyword_counter(Xt)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 193, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [], 206 | "source": [ 
207 | "features = stack([keywords, Xfitted])" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 194, 213 | "metadata": { 214 | "collapsed": true 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "pipeline = Pipeline([('svd', svd), ('scl', scl), ('clf', clf)])" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 195, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/plain": [ 231 | "Pipeline(steps=[('svd', TruncatedSVD(algorithm='randomized', n_components=200, n_iter=5,\n", 232 | " random_state=None, tol=0.0)), ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n", 233 | " kernel='rbf', max_iter=-1, probability=False, random_state=None,\n", 234 | " shrinking=True, tol=0.001, verbose=False))])" 235 | ] 236 | }, 237 | "execution_count": 195, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "pipeline.fit(features, yt)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 196, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "keywords_test = keyword_counter(Xv)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 197, 260 | "metadata": { 261 | "collapsed": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "Xtest = tfv.transform(Xv_tweaked)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 198, 271 | "metadata": { 272 | "collapsed": true 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "features_test = stack([keywords_test, Xtest])" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 199, 282 | "metadata": { 283 | "collapsed": true 284 | }, 285 | "outputs": [], 286 | "source": [ 287 | "preds_new_model = pipeline.predict(features_test)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 148, 293 | "metadata": { 294 | "collapsed": true 295 | }, 296 | "outputs": [], 297 | "source": [ 298 | "# make_submission(crowd_test.id.values.astype(int), preds_new_model, 'spell_correct_rel.csv')" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 200, 304 | "metadata": { 305 | "collapsed": false 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "print 'Kappa score on validation set ', (quadratic_weighted_kappa(yv, preds_new_model))" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "### Linear model" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 174, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "linear_model, select = build_linear_model(features, yt)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 175, 333 | "metadata": { 334 | "collapsed": false 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "features_test_selected = select.transform(features_test)\n", 339 | "linear_preds = linear_model.predict(features_test_selected)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 152, 345 | "metadata": { 346 | "collapsed": false 347 | }, 348 | "outputs": [ 349 | { 350 | "name": "stdout", 351 | "output_type": "stream", 352 | "text": [ 353 | "Kappa score on validation set 0.57412956621\n" 354 | ] 355 | } 356 | ], 357 | "source": [ 358 | "# print 
'Kappa score on validation set ', (quadratic_weighted_kappa(yv, linear_preds))" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 176, 364 | "metadata": { 365 | "collapsed": true 366 | }, 367 | "outputs": [], 368 | "source": [ 369 | "ensemble_lin_svm = (preds_new_model + linear_preds) / 2" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 155, 375 | "metadata": { 376 | "collapsed": false 377 | }, 378 | "outputs": [ 379 | { 380 | "name": "stdout", 381 | "output_type": "stream", 382 | "text": [ 383 | "Kappa score on validation set 0.659703361754\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "# print 'Kappa score on validation set ', (quadratic_weighted_kappa(yv, ensemble_lin_svm))" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 177, 394 | "metadata": { 395 | "collapsed": true 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "make_submission(crowd_test.id.values.astype(int), ensemble_lin_svm, 'ensemble_lin_svm_title.csv')" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "## Best score" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 48, 412 | "metadata": { 413 | "collapsed": true 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "best_score_df = pd.read_csv('./submissions/spell_correct_final_only_title.csv')" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 49, 423 | "metadata": { 424 | "collapsed": true 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "best_score = best_score_df.prediction" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 50, 434 | "metadata": { 435 | "collapsed": true 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "ensemble = (preds_new_model + best_score) / 2\n", 440 | "ensemble_int = [int(score) for score in ensemble]" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 51, 446 | "metadata": { 447 | "collapsed": true 448 | }, 449 | "outputs": [], 450 | "source": [ 451 | "make_submission(crowd_test.id.values.astype(int), ensemble_int, 'spell_correct_title_relevance.csv')" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": { 458 | "collapsed": true 459 | }, 460 | "outputs": [], 461 | "source": [] 462 | } 463 | ], 464 | "metadata": { 465 | "kernelspec": { 466 | "display_name": "Python 2", 467 | "language": "python", 468 | "name": "python2" 469 | }, 470 | "language_info": { 471 | "codemirror_mode": { 472 | "name": "ipython", 473 | "version": 2 474 | }, 475 | "file_extension": ".py", 476 | "mimetype": "text/x-python", 477 | "name": "python", 478 | "nbconvert_exporter": "python", 479 | "pygments_lexer": "ipython2", 480 | "version": "2.7.6" 481 | } 482 | }, 483 | "nbformat": 4, 484 | "nbformat_minor": 0 485 | } 486 | -------------------------------------------------------------------------------- /CrowdFlower/TFIDF_Train_Plus_Test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import warnings\n", 13 | "warnings.filterwarnings('ignore')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 30, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "%run 
scripts/helper.py\n", 25 | "%run scripts/model_train_plus_test.py" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "crowd_train = load_file('./data/train.csv/train.csv', index_col='id')\n", 37 | "y = crowd_train.median_relevance.values" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "crowd_test = load_file('./data/test.csv/test.csv', index_col='id')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "# Stratified shuffle split\n", 60 | "train_idx, test_idx = ssSplit(y, train_size=500, random_state=1234)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 6, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "# training and test set\n", 72 | "Xtrain = crowd_train.iloc[train_idx]\n", 73 | "ytrain = y[train_idx]\n", 74 | "\n", 75 | "Xtest = crowd_train.iloc[test_idx]\n", 76 | "ytest = y[test_idx]" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 7, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "Xtrain_text = tweak_text(Xtrain)\n", 88 | "#Xtest_text = tweak_text(Xtest)\n", 89 | "Xtest_text = tweak_text(crowd_test)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 8, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "# whole corpus\n", 101 | "corpus = []\n", 102 | "\n", 103 | "for i in range(len(Xtrain_text)):\n", 104 | " corpus.append(Xtrain_text[i])\n", 105 | "\n", 106 | "for j in range(len(Xtest_text)):\n", 107 | " corpus.append(Xtest_text[j])" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 15, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "Xvalidation_text = tweak_text(Xtest)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 16, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "for k in range(len(Xvalidation_text)):\n", 130 | " corpus.append(Xvalidation_text[k])" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 17, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "24029\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "print len(corpus)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 31, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "Xtrain_fitted, tfv = TFIDF(Xtrain_text, corpus)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 32, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "# Non linear svm model on processed text\n", 172 | "svm, svd, scl = build_non_linear_model(Xtrain_fitted, ytrain)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 33, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "options = {\n", 184 | " 'tfv': tfv,\n", 185 | " 'svd': svd,\n", 186 | " 'scl': scl\n", 187 | "}" 188 | ] 189 | }, 190 | { 191 
| "cell_type": "code", 192 | "execution_count": 34, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "svm_pred_non_lin = make_predictions(svm, options, Xvalidation_text)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 35, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "Kappa score on validation set 0.1151 \n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "print 'Kappa score on validation set %0.4f ' %(quadratic_weighted_kappa(ytest, svm_pred_non_lin))" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 59, 223 | "metadata": { 224 | "collapsed": false 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "# Linear svm model on processed text\n", 229 | "svm_lin, select = build_linear_model(Xtrain_fitted, ytrain)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 64, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "options = {\n", 241 | " 'tfv': tfv,\n", 242 | " 'select': select\n", 243 | "}" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 65, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "svm_pred_lin = make_predictions(svm_lin, options, Xtest_text)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 66, 260 | "metadata": { 261 | "collapsed": false 262 | }, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "Validation set accuracy 0.4350 \n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "print 'Validation set accuracy %0.4f ' %(quadratic_weighted_kappa(ytest, svm_pred_lin))" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "### Unprocessed text" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 68, 286 | "metadata": { 287 | "collapsed": true 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "Xtrain_text_u = prepareText(Xtrain)\n", 292 | "Xtest_text_u = prepareText(Xtest)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 69, 298 | "metadata": { 299 | "collapsed": true 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "# whole corpus\n", 304 | "corpus_u = []\n", 305 | "\n", 306 | "for i in range(len(Xtrain_text_u)):\n", 307 | " corpus_u.append(Xtrain_text_u[i])\n", 308 | "\n", 309 | "for j in range(len(Xtest_text_u)):\n", 310 | " corpus_u.append(Xtest_text_u[j])" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 77, 316 | "metadata": { 317 | "collapsed": true 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "Xtrain_fitted_u, tfv_u = TFIDF(Xtrain_text_u, corpus_u)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 78, 327 | "metadata": { 328 | "collapsed": true 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "# Non linear svm model on unprocessed text\n", 333 | "svm_u, svd_u, scl_u = build_non_linear_model(Xtrain_fitted_u, ytrain)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 79, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "options = {\n", 345 | " 'tfv': tfv_u,\n", 346 | " 'svd': svd_u,\n", 347 | " 'scl': scl_u\n", 348 | "}" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 
353 | "execution_count": 80, 354 | "metadata": { 355 | "collapsed": true 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "svm_pred_non_lin_u = make_predictions(svm_u, options, Xtest_text_u)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 81, 365 | "metadata": { 366 | "collapsed": false 367 | }, 368 | "outputs": [ 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "Validation set accuracy 0.5620 \n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "print 'Validation set accuracy %0.4f ' %(quadratic_weighted_kappa(ytest, svm_pred_non_lin_u))" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 88, 384 | "metadata": { 385 | "collapsed": false 386 | }, 387 | "outputs": [ 388 | { 389 | "name": "stdout", 390 | "output_type": "stream", 391 | "text": [ 392 | "Ensemble of unprocessed and processed non linear SVM models 0.605575423197\n" 393 | ] 394 | } 395 | ], 396 | "source": [ 397 | "print 'Ensemble of unprocessed and processed non linear SVM models ', quadratic_weighted_kappa(ytest, (svm_pred_non_lin + svm_pred_non_lin_u) / 2)" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": { 404 | "collapsed": true 405 | }, 406 | "outputs": [], 407 | "source": [] 408 | } 409 | ], 410 | "metadata": { 411 | "kernelspec": { 412 | "display_name": "Python 2", 413 | "language": "python", 414 | "name": "python2" 415 | }, 416 | "language_info": { 417 | "codemirror_mode": { 418 | "name": "ipython", 419 | "version": 2 420 | }, 421 | "file_extension": ".py", 422 | "mimetype": "text/x-python", 423 | "name": "python", 424 | "nbconvert_exporter": "python", 425 | "pygments_lexer": "ipython2", 426 | "version": "2.7.6" 427 | } 428 | }, 429 | "nbformat": 4, 430 | "nbformat_minor": 0 431 | } 432 | -------------------------------------------------------------------------------- /CrowdFlower/query_features.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | from nltk.corpus import stopwords 3 | import numpy as np 4 | from scipy import sparse 5 | from nltk.stem import PorterStemmer 6 | from nltk.corpus import wordnet as wn 7 | 8 | 9 | stop = stopwords.words('english') 10 | 11 | def is_query_in_response(train): 12 | query_terms = train['query'].split(' ') 13 | response = train['product_title'] + ' ' + train['product_description'] 14 | 15 | stemmer = PorterStemmer() 16 | query_terms_stemmed = [stemmer.stem(q) for q in query_terms] 17 | response_stemmed = ''.join([stemmer.stem(r) for r in response]) 18 | stop = stopwords.words('english') 19 | 20 | keyword = False 21 | 22 | for q in query_terms_stemmed: 23 | if q not in stop: 24 | keyword = True 25 | if response_stemmed.lower().find(q) == -1: 26 | return 0 27 | 28 | if keyword == False: 29 | return 0 30 | else: 31 | return 1 32 | 33 | 34 | 35 | def query_in_response(doc): 36 | query_terms = doc['query'].split(' ') 37 | unique_terms = list(set(query_terms)) 38 | response = doc['product_title'] + ' ' + doc['product_description'] 39 | keyword = False 40 | 41 | for q in unique_terms: 42 | if q not in stop: 43 | keyword = True 44 | 45 | if response.lower().find(q) == -1: 46 | return 0 47 | 48 | if keyword == False: 49 | return 0 50 | else: 51 | return 1 52 | 53 | 54 | def num_query_in_response(doc): 55 | query_terms = doc['query'].split(' ') 56 | unique_terms = list(set(query_terms)) 57 | response = doc['product_title'] + ' ' + doc['product_description'] 58 | count = 
0 59 | 60 | for q in unique_terms: 61 | if q not in stop: 62 | if response.lower().find(q) == -1: 63 | count += 1 64 | 65 | return count 66 | 67 | def query_synonymns_check(x): 68 | query = x['query'].lower() 69 | query_terms = list(set(query.split())) 70 | response = x['product_title'].lower() + ' ' + x['product_description'].lower() 71 | query_synonymns = [] 72 | stop = stopwords.words('english') 73 | 74 | for q in query_terms: 75 | for i, j in enumerate(wn.synsets(q)): 76 | query_synonymns.extend(j.lemma_names) 77 | 78 | count = 0 79 | for qsynonym in query_synonymns: 80 | if qsynonym not in stop and response.find(qsynonym) != -1: 81 | count += 1 82 | 83 | return count 84 | 85 | 86 | 87 | def jaccard(x): 88 | query = x['query'].lower() 89 | title = x['product_title'].lower() 90 | description = x['product_description'].lower() 91 | response = title + ' ' + description 92 | 93 | query_set = set(query.split(' ')) 94 | response_set = set(response.split(' ')) 95 | 96 | query_response_intersection_len = len(query_set & response_set) 97 | query_response_union_len = len(query_set | response_set) 98 | 99 | return (query_response_intersection_len * 1.) / (query_response_union_len) 100 | 101 | 102 | def query_length(x): 103 | return len(x['query'].split(' ')) 104 | 105 | 106 | def keyword_counter(document): 107 | query_in_resp_feat = document.apply(query_in_response, axis=1) 108 | num_query_feat = document.apply(num_query_in_response, axis=1) 109 | # query_synonym_count_feat = document.apply(query_synonymns_check, axis=1) 110 | # query_length_feat = document.apply(query_length, axis=1) 111 | # jaccard_dist = document.apply(jaccard, axis=1) 112 | 113 | # return np.array([query_in_resp_feat, num_query_feat, query_synonym_count_feat]).T 114 | 115 | #query_in_resp_feat = document.apply(is_query_in_response, axis=1) 116 | 117 | return np.array([query_in_resp_feat, num_query_feat]).T 118 | 119 | def stack(features): 120 | features = sparse.hstack(features).tocsr() 121 | return features 122 | 123 | def concat_examples(examples): 124 | total = sparse.vstack(examples).tocsr() 125 | return total 126 | -------------------------------------------------------------------------------- /CrowdFlower/scripts/blending_helper.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.feature_selection import SelectPercentile, chi2 3 | from sklearn.decomposition import TruncatedSVD 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.svm import SVC 6 | from sklearn.naive_bayes import MultinomialNB 7 | from sklearn.neighbors import KNeighborsClassifier 8 | 9 | def prepareTrainData(X, Xwhole): 10 | tfv = TfidfVectorizer(min_df=3, max_features=None, 11 | strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', 12 | ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words = 'english') 13 | 14 | if Xwhole == None: 15 | X = tfv.fit_transform(X) 16 | else: 17 | tfv.fit(Xwhole) 18 | X = tfv.transform(X) 19 | 20 | svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0) 21 | scl = StandardScaler(copy=True, with_mean=True, with_std=True) 22 | 23 | X = svd.fit_transform(X) 24 | X = scl.fit_transform(X) 25 | 26 | return (X, tfv, svd, scl) 27 | 28 | def prepareTestData(Xtest, tfv, svd, scl): 29 | Xtest = tfv.transform(Xtest) 30 | Xtest = svd.transform(Xtest) 31 | Xtest = scl.transform(Xtest) 32 | 33 | return Xtest 34 | 
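# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It shows how
# the two helpers above are meant to be paired: the TF-IDF -> TruncatedSVD ->
# StandardScaler chain is fitted on the training text (optionally with the
# vectorizer fitted on train + test text via the `Xwhole` argument), and the
# fitted objects are then reused on unseen text. `train_text`, `test_text`
# and `y` are assumed inputs (lists of documents and matching labels), and
# the SVC settings below are an assumption rather than repository defaults.
# ---------------------------------------------------------------------------
def example_blend_features(train_text, test_text, y):
    # Fit the vectorizer on train + test text so test-only vocabulary is kept,
    # then project to dense, scaled SVD features.
    X, tfv, svd, scl = prepareTrainData(train_text, train_text + test_text)
    Xtest = prepareTestData(test_text, tfv, svd, scl)

    # Any downstream estimator can consume these features; an RBF SVC is used
    # here purely as an example.
    clf = SVC(C=10.0, kernel='rbf')
    clf.fit(X, y)

    return clf.predict(Xtest)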
-------------------------------------------------------------------------------- /CrowdFlower/scripts/features.py: -------------------------------------------------------------------------------- 1 | from scipy import sparse 2 | import numpy as np 3 | from sklearn.base import BaseEstimator 4 | import pandas as pd 5 | 6 | class FeatureStacker(BaseEstimator): 7 | def __init__(self, transformer_list): 8 | self.transformer_list = transformer_list 9 | 10 | def get_feature_names(self): 11 | pass 12 | 13 | def fit(self, X, y=None): 14 | for name, trans in self.transformer_list: 15 | trans.fit(X, y) 16 | return self 17 | 18 | def transform(self, X): 19 | features = [] 20 | for name, trans in self.transformer_list: 21 | features.append(trans.transform(X)) 22 | 23 | issparse = [sparse.issparse(f) for f in features] 24 | 25 | if np.any(issparse): 26 | features = sparse.hstack(features).tocsr() 27 | else: 28 | features = np.hstack(features) 29 | 30 | return features 31 | 32 | def get_params(self, deep=True): 33 | if not deep: 34 | return super(FeatureStacker, self).get_params(deep=False) 35 | 36 | else: 37 | out = dict(self.transformer_list) 38 | 39 | for name, trans in self.transformer_list: 40 | for key, value in trans.get_params(deep=True).iteritems(): 41 | out['%s__%s' % (name, key)] = value 42 | 43 | return out 44 | -------------------------------------------------------------------------------- /CrowdFlower/scripts/helper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from nltk.stem import PorterStemmer 4 | from nltk.stem.wordnet import WordNetLemmatizer 5 | import re 6 | from HTMLParser import HTMLParser 7 | from sklearn.cross_validation import StratifiedShuffleSplit 8 | from bs4 import BeautifulSoup 9 | from collections import Counter 10 | import difflib 11 | from nltk import bigrams 12 | from sklearn.metrics import make_scorer 13 | 14 | 15 | class MLStripper(HTMLParser): 16 | def __init__(self): 17 | self.reset() 18 | self.fed = [] 19 | def handle_data(self, d): 20 | self.fed.append(d) 21 | def get_data(self): 22 | return ''.join(self.fed) 23 | 24 | def strip_tags(html): 25 | s = MLStripper() 26 | s.feed(html) 27 | return s.get_data() 28 | 29 | def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None): 30 | """ 31 | Returns the confusion matrix between rater's ratings 32 | """ 33 | assert(len(rater_a) == len(rater_b)) 34 | if min_rating is None: 35 | min_rating = min(rater_a + rater_b) 36 | if max_rating is None: 37 | max_rating = max(rater_a + rater_b) 38 | num_ratings = int(max_rating - min_rating + 1) 39 | conf_mat = [[0 for i in range(num_ratings)] 40 | for j in range(num_ratings)] 41 | for a, b in zip(rater_a, rater_b): 42 | conf_mat[a - min_rating][b - min_rating] += 1 43 | return conf_mat 44 | 45 | 46 | def histogram(ratings, min_rating=None, max_rating=None): 47 | """ 48 | Returns the counts of each type of rating that a rater made 49 | """ 50 | if min_rating is None: 51 | min_rating = min(ratings) 52 | if max_rating is None: 53 | max_rating = max(ratings) 54 | num_ratings = int(max_rating - min_rating + 1) 55 | hist_ratings = [0 for x in range(num_ratings)] 56 | for r in ratings: 57 | hist_ratings[r - min_rating] += 1 58 | return hist_ratings 59 | 60 | 61 | def quadratic_weighted_kappa(y, y_pred): 62 | """ 63 | Calculates the quadratic weighted kappa 64 | axquadratic_weighted_kappa calculates the quadratic weighted kappa 65 | value, which is a measure of inter-rater agreement 
between two raters 66 | that provide discrete numeric ratings. Potential values range from -1 67 | (representing complete disagreement) to 1 (representing complete 68 | agreement). A kappa value of 0 is expected if all agreement is due to 69 | chance. 70 | quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b 71 | each correspond to a list of integer ratings. These lists must have the 72 | same length. 73 | The ratings should be integers, and it is assumed that they contain 74 | the complete range of possible ratings. 75 | quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating 76 | is the minimum possible rating, and max_rating is the maximum possible 77 | rating 78 | """ 79 | rater_a = y 80 | rater_b = y_pred 81 | min_rating=None 82 | max_rating=None 83 | rater_a = np.array(rater_a, dtype=int) 84 | rater_b = np.array(rater_b, dtype=int) 85 | assert(len(rater_a) == len(rater_b)) 86 | if min_rating is None: 87 | min_rating = min(min(rater_a), min(rater_b)) 88 | if max_rating is None: 89 | max_rating = max(max(rater_a), max(rater_b)) 90 | conf_mat = confusion_matrix(rater_a, rater_b, 91 | min_rating, max_rating) 92 | num_ratings = len(conf_mat) 93 | num_scored_items = float(len(rater_a)) 94 | 95 | hist_rater_a = histogram(rater_a, min_rating, max_rating) 96 | hist_rater_b = histogram(rater_b, min_rating, max_rating) 97 | 98 | numerator = 0.0 99 | denominator = 0.0 100 | 101 | for i in range(num_ratings): 102 | for j in range(num_ratings): 103 | expected_count = (hist_rater_a[i] * hist_rater_b[j] 104 | / num_scored_items) 105 | d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0) 106 | numerator += d * conf_mat[i][j] / num_scored_items 107 | denominator += d * expected_count / num_scored_items 108 | 109 | return (1.0 - numerator / denominator) 110 | 111 | def load_file(filename, index_col): 112 | if index_col: 113 | return pd.read_csv(filename, index_col=index_col).fillna('') 114 | else: 115 | return pd.read_csv(filename).fillna('') 116 | 117 | def prepareText(df): 118 | return list(df.apply(lambda x: '%s %s %s' %(x['query'], x['product_title'], x['product_description']), axis=1)) 119 | 120 | def how_uncorrelated(ytrue, model1pred, model2pred): 121 | count = 0 122 | 123 | for i in range(len(ytrue)): 124 | if ytrue[i] != model1pred[i] and ytrue[i] != model2pred[i]: 125 | if model1pred[i] != model2pred[i]: 126 | count += 1 127 | 128 | return (count * 1. 
/ len(ytrue)) * 100.0 129 | 130 | def strip_html(data): 131 | return [strip_tags(text) for text in data ] 132 | 133 | def parseHTML(data): 134 | return ' '.join([p.get_text() for p in BeautifulSoup(data)]) 135 | 136 | def stem_text(data): 137 | stemmer = PorterStemmer() 138 | stemmed_text = [] 139 | 140 | for text in data: 141 | words = text.split(' ') 142 | stemmed_words = [] 143 | 144 | for word in words: 145 | stemmed_words.append(stemmer.stem(word.lower())) 146 | 147 | stemmed_text.append(' '.join(stemmed_words)) 148 | 149 | return stemmed_text 150 | 151 | def ssSplit(y, train_size=1000, random_state=0): 152 | sss = StratifiedShuffleSplit(y, 3, train_size=train_size, random_state=random_state) 153 | train_index, test_index = next(iter(sss)) 154 | 155 | return (train_index, test_index) 156 | 157 | ''' 158 | Auto correct a query based on the training set 159 | ''' 160 | def build_query_correction_map(train, test): 161 | # get all queries 162 | queries = set(train['query'].values) 163 | correct_map = {} 164 | 165 | for q in queries: 166 | corrected_q = autocorrect_query(q, train, test) 167 | correct_map[q] = corrected_q 168 | 169 | return correct_map 170 | 171 | def autocorrect_query(query, train=None, test=None, cutoff=0.8): 172 | train_data = train.values[train['query'].values == query, :] 173 | test_data = test.values[test['query'].values == query, :] 174 | 175 | s = '' 176 | 177 | for r in train_data: 178 | s = "%s %s %s"%(s,BeautifulSoup(r[2]).get_text(" ",strip=True),BeautifulSoup(r[3]).get_text(" ",strip=True)) 179 | 180 | for r in test_data: 181 | s = "%s %s %s"%(s,BeautifulSoup(r[2]).get_text(" ",strip=True),BeautifulSoup(r[3]).get_text(" ",strip=True)) 182 | 183 | s = re.findall(r'[\'\"\w]+',s.lower()) 184 | s_bigram = [' '.join(i) for i in bigrams(s)] 185 | s.extend(s_bigram) 186 | 187 | corrected_query = [] 188 | for q in query.lower().split(): 189 | if len(q)<=2: 190 | corrected_query.append(q) 191 | continue 192 | corrected_word = difflib.get_close_matches(q, s,n=1,cutoff=cutoff) 193 | if len(corrected_word) >0: 194 | corrected_query.append(corrected_word[0]) 195 | else : 196 | corrected_query.append(q) 197 | return ' '.join(corrected_query) 198 | 199 | ''' 200 | Gets data for a particular relevance score 201 | ''' 202 | def getText(data, y, label): 203 | return [data[i] for i in range(len(y)) if y[i] == label] 204 | 205 | 206 | def lemmatize_text(data): 207 | lmtzr = WordNetLemmatizer() 208 | lemmatized_text = [] 209 | 210 | for text in data: 211 | words = text.split(' ') 212 | lemmatized_words = [] 213 | 214 | for word in words: 215 | lemmatized_words.append(lmtzr.lemmatize(word.lower())) 216 | 217 | lemmatized_text.append(' '.join(lemmatized_words)) 218 | 219 | return lemmatized_text 220 | 221 | 222 | # def tweak_text(train): 223 | # s_data = [] 224 | # stemmer = PorterStemmer() 225 | 226 | # for i in range(train.shape[0]): 227 | # s = (" ").join(["q"+ z for z in BeautifulSoup(train["query"].iloc[i]).get_text(" ").split(" ")]) + " " + (" ").join(["z"+ z for z in BeautifulSoup(train.product_title.iloc[i]).get_text(" ").split(" ")]) + " " + BeautifulSoup(train.product_description.iloc[i]).get_text(" ") 228 | # s = re.sub("[^a-zA-Z0-9]"," ", s) 229 | # s = (" ").join([stemmer.stem(z) for z in s.split(" ")]) 230 | # s_data.append(s) 231 | 232 | # return s_data 233 | 234 | def tweak_text(train): 235 | s_data = [] 236 | stemmer = PorterStemmer() 237 | 238 | for i in range(train.shape[0]): 239 | s = (" ").join(["q"+ z for z in BeautifulSoup(train["query"].iloc[i]).get_text(" 
").split(" ")]) + " " + (" ").join(["z"+ z for z in BeautifulSoup(train.product_title.iloc[i]).get_text(" ").split(" ")]) 240 | s = re.sub("[^a-zA-Z0-9]"," ", s) 241 | s = (" ").join([stemmer.stem(z) for z in s.split(" ")]) 242 | s_data.append(s) 243 | 244 | return s_data 245 | 246 | 247 | def lemmatize_text(train): 248 | s_data = [] 249 | lmtzr = WordNetLemmatizer() 250 | 251 | for i in range(train.shape[0]): 252 | s = (" ").join(["q"+ z for z in BeautifulSoup(train["query"].iloc[i]).get_text(" ").split(" ")]) + " " + (" ").join(["z"+ z for z in BeautifulSoup(train.product_title.iloc[i]).get_text(" ").split(" ")]) + " " + BeautifulSoup(train.product_description.iloc[i]).get_text(" ") 253 | s = re.sub("[^a-zA-Z0-9]"," ", s) 254 | s = (" ").join([lmtzr.lemmatize(z) for z in s.split(" ")]) 255 | s_data.append(s) 256 | 257 | return s_data 258 | 259 | def most_common(arr): 260 | arr = Counter(arr) 261 | return arr.most_common(1)[0][0] 262 | 263 | def get_kappa_scorer(): 264 | return make_scorer(quadratic_weighted_kappa, greater_is_better=True) 265 | 266 | 267 | ''' 268 | Make a submission file in submissions folder in 269 | current working directory that can be uploaded to 270 | Kaggle. 271 | ''' 272 | 273 | def make_submission(idx, preds, filename): 274 | submission = pd.DataFrame({"id": idx, "prediction": preds}) 275 | submission.to_csv("./submissions/" + filename, index=False) 276 | 277 | -------------------------------------------------------------------------------- /CrowdFlower/scripts/model_train_plus_test.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.feature_selection import SelectPercentile, chi2 3 | from sklearn.decomposition import TruncatedSVD 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.svm import SVC 6 | from sklearn.naive_bayes import MultinomialNB 7 | from sklearn.neighbors import KNeighborsClassifier 8 | from sklearn.feature_selection import SelectPercentile, chi2, RFECV 9 | from sklearn.linear_model import LogisticRegression 10 | 11 | 12 | def TFIDF(Xtrain, Xwhole): 13 | tfv = TfidfVectorizer(min_df=3, max_df=700, max_features=None, 14 | strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', 15 | ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words = 'english') 16 | 17 | if Xwhole == None: 18 | return (tfv.fit_transform(Xtrain), tfv) 19 | else: 20 | tfv.fit(Xwhole) 21 | return (tfv.transform(Xtrain), tfv) 22 | 23 | def build_non_linear_model(Xtrain, y): 24 | 25 | svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0) 26 | scl = StandardScaler(copy=True, with_mean=True, with_std=True) 27 | 28 | Xtrain = svd.fit_transform(Xtrain) 29 | Xtrain = scl.fit_transform(Xtrain) 30 | 31 | clf = SVC(C=10.0, kernel='rbf', degree=3, 32 | gamma=0.0, coef0=0.0, shrinking=True, probability=False, 33 | tol=0.001, cache_size=200, class_weight=None, 34 | verbose=False, max_iter=-1, random_state=None) 35 | 36 | 37 | return (clf.fit(Xtrain, y), svd, scl) 38 | 39 | def build_linear_model(X, y): 40 | select = SelectPercentile(score_func=chi2, percentile=20) 41 | clf = SVC(C=10.0, kernel='linear', probability=True) 42 | 43 | X = select.fit_transform(X, y) 44 | return (clf.fit(X, y), select) 45 | 46 | def build_knn_model(Xtrain, y): 47 | svd = TruncatedSVD(n_components=100, algorithm='randomized', n_iter=5, random_state=None, tol=0.0) 48 | scl = StandardScaler(copy=True, 
with_mean=True, with_std=True) 49 | 50 | Xtrain = svd.fit_transform(Xtrain) 51 | Xtrain = scl.fit_transform(Xtrain) 52 | 53 | clf = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='brute') 54 | 55 | return (clf.fit(Xtrain, y), svd, scl) 56 | 57 | def feature_selection(X, y, pipeline, step=1, cv=None, scoring=None, estimator_params=None, verbose=0): 58 | selector = RFECV(pipeline, step=step, cv=cv, scoring=scoring, estimator_params=estimator_params, verbose=verbose) 59 | selector.fit(X, y) 60 | 61 | return selector 62 | 63 | 64 | def make_predictions(model, options, Xtest): 65 | if options.has_key('tfv'): 66 | Xtest = options['tfv'].transform(Xtest) 67 | 68 | if options.has_key('svd'): 69 | Xtest = options['svd'].transform(Xtest) 70 | 71 | if options.has_key('scl'): 72 | Xtest = options['scl'].transform(Xtest) 73 | 74 | if options.has_key('select'): 75 | Xtest = options['select'].transform(Xtest) 76 | 77 | return model.predict(Xtest) -------------------------------------------------------------------------------- /CrowdFlower/scripts/models.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.feature_selection import SelectPercentile, chi2 3 | from sklearn.decomposition import TruncatedSVD 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.svm import SVC 6 | from sklearn.naive_bayes import MultinomialNB 7 | from sklearn.neighbors import KNeighborsClassifier 8 | 9 | 10 | def vectorizer(analyzerType): 11 | if analyzerType == None: 12 | return TfidfVectorizer(min_df=3, max_features=None, 13 | strip_accents='unicode', analyzer='char', token_pattern=r'\w{1,}', 14 | ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words = 'english') 15 | else: return TfidfVectorizer(min_df=3, max_features=None, 16 | strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', 17 | ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words = 'english') 18 | 19 | def build_linear_model(X, y, analyzerType): 20 | tfv = vectorizer(analyzerType) 21 | select = SelectPercentile(score_func=chi2, percentile=15) 22 | clf = SVC(C=12.0, kernel='linear') 23 | 24 | X = tfv.fit_transform(X) 25 | X = select.fit_transform(X, y) 26 | return (clf.fit(X, y), tfv, select) 27 | 28 | def build_non_linear_model(X, y, analyzerType): 29 | tfv = vectorizer(analyzerType) 30 | svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0) 31 | scl = StandardScaler(copy=True, with_mean=True, with_std=True) 32 | clf = SVC(C=10.0, kernel='rbf', degree=3, 33 | gamma=0.0, coef0=0.0, shrinking=True, probability=False, 34 | tol=0.001, cache_size=200, class_weight=None, 35 | verbose=False, max_iter=-1, random_state=None) 36 | 37 | tfv.fit(X) 38 | X = tfv.transform(X) 39 | X = svd.fit_transform(X) 40 | X = scl.fit_transform(X) 41 | 42 | return (clf.fit(X, y), tfv, svd, scl) 43 | 44 | def build_knn_model(X, y, weights, analyzerType): 45 | tfv = vectorizer(analyzerType) 46 | svd = TruncatedSVD(n_components=250) 47 | 48 | if weights == None: 49 | clf = KNeighborsClassifier(n_neighbors=5, weights='uniform') 50 | else: 51 | clf = KNeighborsClassifier(n_neighbors=5, weights=weights, algorithm='brute') 52 | 53 | tfv.fit(X) 54 | X = tfv.transform(X) 55 | X = svd.fit_transform(X) 56 | 57 | return (clf.fit(X, y), tfv, svd) 58 | 59 | def build_naive_bayes(X, y): 60 | tfv = vectorizer(analyzerType) 61 | clf = 
MultinomialNB(alpha=.01) 62 | 63 | X = tfv.fit_transform(X) 64 | 65 | return (clf.fit(X, y), tfv) 66 | 67 | 68 | def build_stopwords_tweak_model(X, y): 69 | tfv = TfidfVectorizer(min_df=3 ,max_features=None, 70 | strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', 71 | ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words = 'english') 72 | 73 | 74 | svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0) 75 | scl = StandardScaler(copy=True, with_mean=True, with_std=True) 76 | clf = SVC(C=10.0, kernel='rbf', degree=3, 77 | gamma=0.0, coef0=0.0, shrinking=True, probability=False, 78 | tol=0.001, cache_size=200, class_weight=None, 79 | verbose=False, max_iter=-1, random_state=None) 80 | 81 | tfv.fit(X) 82 | X = tfv.transform(X) 83 | X = svd.fit_transform(X) 84 | X = scl.fit_transform(X) 85 | 86 | return (clf.fit(X, y), tfv, svd, scl) 87 | 88 | 89 | ''' 90 | This function can be used for both linear kernel and SGDClassifier 91 | ''' 92 | def linear_model_predictions(model, tfv, select, Xtest): 93 | Xtest = tfv.transform(Xtest) 94 | Xtest = select.transform(Xtest) 95 | 96 | return model.predict(Xtest) 97 | 98 | def non_linear_model_predictions(model, tfv, svd, scl, Xtest): 99 | Xtest = tfv.transform(Xtest) 100 | Xtest = svd.transform(Xtest) 101 | Xtest = scl.transform(Xtest) 102 | 103 | return model.predict(Xtest) 104 | 105 | def naive_bayes_predictions(model, tfv, Xtest): 106 | Xtest = tfv.transform(Xtest) 107 | return model.predict(Xtest) 108 | 109 | def knn_model_predictions(model, tfv, svd, Xtest): 110 | Xtest = tfv.transform(Xtest) 111 | Xtest = svd.transform(Xtest) 112 | 113 | return model.predict(Xtest) -------------------------------------------------------------------------------- /HIV-Progression/.ipynb_checkpoints/Basic_Analysis-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /HIV-Progression/.ipynb_checkpoints/ClassBalancedModel-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /HIV-Progression/helper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.cross_validation import ShuffleSplit 5 | from sklearn.cross_validation import cross_val_score 6 | from sklearn.learning_curve import validation_curve 7 | 8 | def load_data(path, index_col): 9 | """ 10 | Loads a csv file as pandas data frame 11 | """ 12 | return pd.read_csv(path, index_col=index_col) 13 | 14 | def misclassification_percentage(y_true, y_pred): 15 | 16 | """ 17 | Returns misclassification percentage ( misclassified_examples / total_examples * 100.0) 18 | """ 19 | 20 | misclassified_examples = list(y_true == y_pred).count(False) * 1. 
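# The `* 1.` above casts the count to a float so that the percentage
# computed below uses true division rather than Python 2 integer division.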
21 | total_examples = y_true.shape[0] 22 | return (misclassified_examples / total_examples) * 100.0 23 | 24 | def validation_scores(model, X, y, n_iter=5, test_size=0.1): 25 | 26 | cv = ShuffleSplit(X.shape[0], n_iter=n_iter, test_size=test_size, random_state=0) 27 | test_scores = cross_val_score(model, X, y, cv=cv) 28 | 29 | return test_scores 30 | 31 | def plot_validation_curves(param_values, train_scores, test_scores): 32 | for i in range(train_scores.shape[1]): 33 | plt.semilogx(param_values, train_scores[:, i], alpha=0.4, lw=2, c='b') 34 | plt.semilogx(param_values, test_scores[:, i], alpha=0.4, lw=2, c='g') 35 | 36 | plt.ylabel("score for LogisticRegression(fit_intercept=True)") 37 | plt.xlabel("C") 38 | plt.title('Validation curves for the C parameter'); 39 | 40 | def validation_curves(model, X, y, n_iter, test_size): 41 | n_Cs = 10 42 | Cs = np.logspace(-5, 5, n_Cs) 43 | cv = ShuffleSplit(X.shape[0], n_iter=n_iter, test_size=test_size, random_state=0) 44 | 45 | train_scores, test_scores = validation_curve( 46 | model, X, y, 'C', Cs, cv=cv) 47 | 48 | return (Cs, train_scores, test_scores) 49 | 50 | 51 | class BaselineModel: 52 | 53 | """ 54 | Takes in the most majority class and number of training examples 55 | and returns its prediction as elements of majority class. 56 | e.g. In our current training set majority class is 0 then it would 57 | return all values as being 0 as our prediction. 58 | 59 | Any model that we develop must be compared with this baseline model. 60 | """ 61 | 62 | def __init__(self, majority_class, num_examples): 63 | self.majority_class = majority_class 64 | self.num_examples = num_examples 65 | 66 | def predict(self): 67 | return np.asarray([self.majority_class] * self.num_examples) 68 | 69 | class Submission: 70 | """ 71 | Creates a submission in Kaggle competition format 72 | Column 1 will contain Patient Id and Column 2 will be 73 | the prediction. 
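The file is written without a header row; each output line is simply
`patient_id,prediction`, matching the format of initialSubmission.csv.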
74 | """ 75 | def __init__(self, prediction): 76 | self.prediction = prediction 77 | 78 | def create_submission(self, path): 79 | with open(path, 'wb') as outfile: 80 | for (patient_id, pred) in enumerate(self.prediction): 81 | outfile.write(str(patient_id) + ',' + str(pred)) 82 | outfile.write('\n') 83 | outfile.close() -------------------------------------------------------------------------------- /HIV-Progression/initialSubmission.csv: -------------------------------------------------------------------------------- 1 | 0,0 2 | 1,0 3 | 2,0 4 | 3,0 5 | 4,0 6 | 5,0 7 | 6,0 8 | 7,0 9 | 8,0 10 | 9,0 11 | 10,0 12 | 11,0 13 | 12,0 14 | 13,0 15 | 14,0 16 | 15,0 17 | 16,0 18 | 17,0 19 | 18,0 20 | 19,0 21 | 20,0 22 | 21,0 23 | 22,0 24 | 23,0 25 | 24,0 26 | 25,0 27 | 26,0 28 | 27,0 29 | 28,0 30 | 29,0 31 | 30,0 32 | 31,0 33 | 32,0 34 | 33,0 35 | 34,0 36 | 35,0 37 | 36,0 38 | 37,0 39 | 38,0 40 | 39,0 41 | 40,0 42 | 41,0 43 | 42,0 44 | 43,0 45 | 44,0 46 | 45,0 47 | 46,0 48 | 47,0 49 | 48,0 50 | 49,0 51 | 50,0 52 | 51,0 53 | 52,0 54 | 53,0 55 | 54,0 56 | 55,0 57 | 56,0 58 | 57,0 59 | 58,0 60 | 59,0 61 | 60,0 62 | 61,0 63 | 62,0 64 | 63,0 65 | 64,0 66 | 65,0 67 | 66,0 68 | 67,0 69 | 68,0 70 | 69,0 71 | 70,0 72 | 71,0 73 | 72,0 74 | 73,0 75 | 74,0 76 | 75,0 77 | 76,0 78 | 77,0 79 | 78,0 80 | 79,0 81 | 80,0 82 | 81,0 83 | 82,0 84 | 83,0 85 | 84,0 86 | 85,0 87 | 86,0 88 | 87,0 89 | 88,0 90 | 89,0 91 | 90,0 92 | 91,0 93 | 92,0 94 | 93,0 95 | 94,0 96 | 95,0 97 | 96,0 98 | 97,0 99 | 98,0 100 | 99,0 101 | 100,0 102 | 101,0 103 | 102,0 104 | 103,0 105 | 104,0 106 | 105,0 107 | 106,0 108 | 107,0 109 | 108,0 110 | 109,0 111 | 110,0 112 | 111,0 113 | 112,0 114 | 113,0 115 | 114,0 116 | 115,0 117 | 116,0 118 | 117,0 119 | 118,0 120 | 119,0 121 | 120,0 122 | 121,0 123 | 122,0 124 | 123,0 125 | 124,0 126 | 125,0 127 | 126,0 128 | 127,0 129 | 128,0 130 | 129,0 131 | 130,0 132 | 131,0 133 | 132,0 134 | 133,0 135 | 134,0 136 | 135,0 137 | 136,0 138 | 137,0 139 | 138,0 140 | 139,0 141 | 140,0 142 | 141,0 143 | 142,0 144 | 143,0 145 | 144,0 146 | 145,0 147 | 146,0 148 | 147,0 149 | 148,0 150 | 149,0 151 | 150,0 152 | 151,0 153 | 152,0 154 | 153,0 155 | 154,0 156 | 155,0 157 | 156,0 158 | 157,0 159 | 158,0 160 | 159,0 161 | 160,0 162 | 161,0 163 | 162,0 164 | 163,0 165 | 164,0 166 | 165,0 167 | 166,0 168 | 167,0 169 | 168,0 170 | 169,0 171 | 170,0 172 | 171,0 173 | 172,0 174 | 173,0 175 | 174,0 176 | 175,0 177 | 176,0 178 | 177,0 179 | 178,0 180 | 179,0 181 | 180,0 182 | 181,0 183 | 182,0 184 | 183,0 185 | 184,0 186 | 185,0 187 | 186,0 188 | 187,0 189 | 188,0 190 | 189,0 191 | 190,0 192 | 191,0 193 | 192,0 194 | 193,0 195 | 194,0 196 | 195,0 197 | 196,0 198 | 197,0 199 | 198,0 200 | 199,0 201 | 200,0 202 | 201,0 203 | 202,0 204 | 203,0 205 | 204,0 206 | 205,0 207 | 206,0 208 | 207,0 209 | 208,0 210 | 209,0 211 | 210,0 212 | 211,0 213 | 212,0 214 | 213,0 215 | 214,0 216 | 215,0 217 | 216,0 218 | 217,0 219 | 218,0 220 | 219,0 221 | 220,0 222 | 221,0 223 | 222,0 224 | 223,0 225 | 224,0 226 | 225,0 227 | 226,0 228 | 227,0 229 | 228,0 230 | 229,0 231 | 230,0 232 | 231,0 233 | 232,0 234 | 233,0 235 | 234,0 236 | 235,0 237 | 236,0 238 | 237,0 239 | 238,0 240 | 239,0 241 | 240,0 242 | 241,0 243 | 242,0 244 | 243,0 245 | 244,0 246 | 245,0 247 | 246,0 248 | 247,0 249 | 248,0 250 | 249,0 251 | 250,0 252 | 251,0 253 | 252,0 254 | 253,0 255 | 254,0 256 | 255,0 257 | 256,0 258 | 257,0 259 | 258,0 260 | 259,0 261 | 260,0 262 | 261,0 263 | 262,0 264 | 263,0 265 | 264,0 266 | 265,0 267 | 266,0 268 | 267,0 269 | 268,0 270 | 269,0 271 | 
270,0 272 | 271,0 273 | 272,0 274 | 273,0 275 | 274,0 276 | 275,0 277 | 276,0 278 | 277,0 279 | 278,0 280 | 279,0 281 | 280,0 282 | 281,0 283 | 282,0 284 | 283,0 285 | 284,0 286 | 285,0 287 | 286,0 288 | 287,0 289 | 288,0 290 | 289,0 291 | 290,0 292 | 291,0 293 | 292,0 294 | 293,0 295 | 294,0 296 | 295,0 297 | 296,0 298 | 297,0 299 | 298,0 300 | 299,0 301 | 300,0 302 | 301,0 303 | 302,0 304 | 303,0 305 | 304,0 306 | 305,0 307 | 306,0 308 | 307,0 309 | 308,0 310 | 309,0 311 | 310,0 312 | 311,0 313 | 312,0 314 | 313,0 315 | 314,0 316 | 315,0 317 | 316,0 318 | 317,0 319 | 318,0 320 | 319,0 321 | 320,0 322 | 321,0 323 | 322,0 324 | 323,0 325 | 324,0 326 | 325,0 327 | 326,0 328 | 327,0 329 | 328,0 330 | 329,0 331 | 330,0 332 | 331,0 333 | 332,0 334 | 333,0 335 | 334,0 336 | 335,0 337 | 336,0 338 | 337,0 339 | 338,0 340 | 339,0 341 | 340,0 342 | 341,0 343 | 342,0 344 | 343,0 345 | 344,0 346 | 345,0 347 | 346,0 348 | 347,0 349 | 348,0 350 | 349,0 351 | 350,0 352 | 351,0 353 | 352,0 354 | 353,0 355 | 354,0 356 | 355,0 357 | 356,0 358 | 357,0 359 | 358,0 360 | 359,0 361 | 360,0 362 | 361,0 363 | 362,0 364 | 363,0 365 | 364,0 366 | 365,0 367 | 366,0 368 | 367,0 369 | 368,0 370 | 369,0 371 | 370,0 372 | 371,0 373 | 372,0 374 | 373,0 375 | 374,0 376 | 375,0 377 | 376,0 378 | 377,0 379 | 378,0 380 | 379,0 381 | 380,0 382 | 381,0 383 | 382,0 384 | 383,0 385 | 384,0 386 | 385,0 387 | 386,0 388 | 387,0 389 | 388,0 390 | 389,0 391 | 390,0 392 | 391,0 393 | 392,0 394 | 393,0 395 | 394,0 396 | 395,0 397 | 396,0 398 | 397,0 399 | 398,0 400 | 399,0 401 | 400,0 402 | 401,0 403 | 402,0 404 | 403,0 405 | 404,0 406 | 405,0 407 | 406,0 408 | 407,0 409 | 408,0 410 | 409,0 411 | 410,0 412 | 411,0 413 | 412,0 414 | 413,0 415 | 414,0 416 | 415,0 417 | 416,0 418 | 417,0 419 | 418,0 420 | 419,0 421 | 420,0 422 | 421,0 423 | 422,0 424 | 423,0 425 | 424,0 426 | 425,0 427 | 426,0 428 | 427,0 429 | 428,0 430 | 429,0 431 | 430,0 432 | 431,0 433 | 432,0 434 | 433,0 435 | 434,0 436 | 435,0 437 | 436,0 438 | 437,0 439 | 438,0 440 | 439,0 441 | 440,0 442 | 441,0 443 | 442,0 444 | 443,0 445 | 444,0 446 | 445,0 447 | 446,0 448 | 447,0 449 | 448,0 450 | 449,0 451 | 450,0 452 | 451,0 453 | 452,0 454 | 453,0 455 | 454,0 456 | 455,0 457 | 456,0 458 | 457,0 459 | 458,0 460 | 459,0 461 | 460,0 462 | 461,0 463 | 462,0 464 | 463,0 465 | 464,0 466 | 465,0 467 | 466,0 468 | 467,0 469 | 468,0 470 | 469,0 471 | 470,0 472 | 471,0 473 | 472,0 474 | 473,0 475 | 474,0 476 | 475,0 477 | 476,0 478 | 477,0 479 | 478,0 480 | 479,0 481 | 480,0 482 | 481,0 483 | 482,0 484 | 483,0 485 | 484,0 486 | 485,0 487 | 486,0 488 | 487,0 489 | 488,0 490 | 489,0 491 | 490,0 492 | 491,0 493 | 492,0 494 | 493,0 495 | 494,0 496 | 495,0 497 | 496,0 498 | 497,0 499 | 498,0 500 | 499,0 501 | 500,0 502 | 501,0 503 | 502,0 504 | 503,0 505 | 504,0 506 | 505,0 507 | 506,0 508 | 507,0 509 | 508,0 510 | 509,0 511 | 510,0 512 | 511,0 513 | 512,0 514 | 513,0 515 | 514,0 516 | 515,0 517 | 516,0 518 | 517,0 519 | 518,0 520 | 519,0 521 | 520,0 522 | 521,0 523 | 522,0 524 | 523,0 525 | 524,0 526 | 525,0 527 | 526,0 528 | 527,0 529 | 528,0 530 | 529,0 531 | 530,0 532 | 531,0 533 | 532,0 534 | 533,0 535 | 534,0 536 | 535,0 537 | 536,0 538 | 537,0 539 | 538,0 540 | 539,0 541 | 540,0 542 | 541,0 543 | 542,0 544 | 543,0 545 | 544,0 546 | 545,0 547 | 546,0 548 | 547,0 549 | 548,0 550 | 549,0 551 | 550,0 552 | 551,0 553 | 552,0 554 | 553,0 555 | 554,0 556 | 555,0 557 | 556,0 558 | 557,0 559 | 558,0 560 | 559,0 561 | 560,0 562 | 561,0 563 | 562,0 564 | 563,0 565 | 564,0 566 | 565,0 567 | 
566,0 568 | 567,0 569 | 568,0 570 | 569,0 571 | 570,0 572 | 571,0 573 | 572,0 574 | 573,0 575 | 574,0 576 | 575,0 577 | 576,0 578 | 577,0 579 | 578,0 580 | 579,0 581 | 580,0 582 | 581,0 583 | 582,0 584 | 583,0 585 | 584,0 586 | 585,0 587 | 586,0 588 | 587,0 589 | 588,0 590 | 589,0 591 | 590,0 592 | 591,0 593 | 592,0 594 | 593,0 595 | 594,0 596 | 595,0 597 | 596,0 598 | 597,0 599 | 598,0 600 | 599,0 601 | 600,0 602 | 601,0 603 | 602,0 604 | 603,0 605 | 604,0 606 | 605,0 607 | 606,0 608 | 607,0 609 | 608,0 610 | 609,0 611 | 610,0 612 | 611,0 613 | 612,0 614 | 613,0 615 | 614,0 616 | 615,0 617 | 616,0 618 | 617,0 619 | 618,0 620 | 619,0 621 | 620,0 622 | 621,0 623 | 622,0 624 | 623,0 625 | 624,0 626 | 625,0 627 | 626,0 628 | 627,0 629 | 628,0 630 | 629,0 631 | 630,0 632 | 631,0 633 | 632,0 634 | 633,0 635 | 634,0 636 | 635,0 637 | 636,0 638 | 637,0 639 | 638,0 640 | 639,0 641 | 640,0 642 | 641,0 643 | 642,0 644 | 643,0 645 | 644,0 646 | 645,0 647 | 646,0 648 | 647,0 649 | 648,0 650 | 649,0 651 | 650,0 652 | 651,0 653 | 652,0 654 | 653,0 655 | 654,0 656 | 655,0 657 | 656,0 658 | 657,0 659 | 658,0 660 | 659,0 661 | 660,0 662 | 661,0 663 | 662,0 664 | 663,0 665 | 664,0 666 | 665,0 667 | 666,0 668 | 667,0 669 | 668,0 670 | 669,0 671 | 670,0 672 | 671,0 673 | 672,0 674 | 673,0 675 | 674,0 676 | 675,0 677 | 676,0 678 | 677,0 679 | 678,0 680 | 679,0 681 | 680,0 682 | 681,0 683 | 682,0 684 | 683,0 685 | 684,0 686 | 685,0 687 | 686,0 688 | 687,0 689 | 688,0 690 | 689,0 691 | 690,0 692 | 691,0 693 | -------------------------------------------------------------------------------- /Home Insurance/features.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | from sklearn.preprocessing import LabelEncoder 3 | from sklearn.feature_extraction import DictVectorizer 4 | 5 | import pandas as pd 6 | import numpy as np 7 | 8 | class FeatureTransformer(BaseEstimator): 9 | """ 10 | Generate features 11 | """ 12 | 13 | def __init__(self, train, test): 14 | self.X = train 15 | self.X_test = test 16 | # self.X = pd.read_csv('./data/train.csv', parse_dates=['Original_Quote_Date'], index_col='QuoteNumber') 17 | # self.X_test = pd.read_csv('./data/test.csv', parse_dates=['Original_Quote_Date'], index_col='QuoteNumber') 18 | 19 | # self.X = self.X.fillna(-1) 20 | # self.X_test = self.X_test.fillna(-1) 21 | pass 22 | 23 | 24 | def get_feature_names(self): 25 | feature_names = [] 26 | 27 | feature_names.extend(['year_original_quote', 'month_original_quote', 'weekday_original_quote']) 28 | feature_names.extend(self.categorical_features_columns) 29 | feature_names.extend(self.numerical_features_columns) 30 | 31 | return np.array(feature_names) 32 | 33 | def fit(self, X, y=None): 34 | self.fit_transform(X, y) 35 | 36 | return self 37 | 38 | def fit_transform(self, X, y=None): 39 | 40 | date_features = self._process_dates(X) 41 | is_nan_features = self._is_nan(X) 42 | count_nan_features = self._count_nans(X) 43 | count_undecodable = self._count_undecodable(X) 44 | categorical_features = self._process_categorical_features(X) 45 | numerical_features = self._process_numerical_features(X) 46 | 47 | features = [] 48 | 49 | features.append(date_features) 50 | features.append(is_nan_features) 51 | features.append(count_nan_features) 52 | features.append(categorical_features) 53 | features.append(numerical_features) 54 | 55 | features = np.hstack(features) 56 | 57 | return features 58 | 59 | def _process_dates(self, X): 60 | 'Extract year, month and weekday 
of original quote' 61 | 62 | year_original_quote = X.Original_Quote_Date.dt.year 63 | month_original_quote = X.Original_Quote_Date.dt.month 64 | weekday_original_quote = X.Original_Quote_Date.dt.weekday 65 | 66 | return np.array([year_original_quote, month_original_quote, weekday_original_quote]).T 67 | 68 | def _is_nan(self, X): 69 | 'Check to see whether record has any nan value or not' 70 | null_check = X.apply(lambda x: -1 in x.values, axis=1) * 1. 71 | 72 | return np.array(null_check).reshape(-1, 1) 73 | 74 | def _count_nans(self, X): 75 | 'Count number of missing values in a quote' 76 | 77 | count_nans = X.apply(lambda x: list(x.values).count(-1), axis=1) 78 | 79 | return np.array([count_nans]).T 80 | 81 | def _count_undecodable(self, X): 82 | 'Count number of undecodable values (0)' 83 | 84 | count_undecodable = X.apply(lambda x: list(x.values).count(0), axis=1) 85 | 86 | return np.array([count_undecodable]).T 87 | 88 | def _process_categorical_features(self, X): 89 | 'Encode categorical features into numerical features' 90 | 91 | self.categorical_features_columns = X.select_dtypes(['object']).columns 92 | categorical_features = [] 93 | 94 | for cat in self.categorical_features_columns: 95 | lbl = LabelEncoder() 96 | 97 | lbl.fit(pd.concat([self.X[cat], self.X_test[cat]], axis=0)) 98 | 99 | categorical_features.append(lbl.transform(X[cat])) 100 | 101 | return np.array(categorical_features).T 102 | 103 | def _process_numerical_features(self, X): 104 | 'Return numerical features as it is' 105 | 106 | self.numerical_features_columns = X.select_dtypes(['int32', 'int64', 'float32', 'float64']) 107 | 108 | numerical_features = [] 109 | 110 | for col in self.numerical_features_columns: 111 | numerical_features.append(X[col]) 112 | 113 | return np.array(numerical_features).T 114 | 115 | 116 | def transform(self, X): 117 | date_features = self._process_dates(X) 118 | is_nan_features = self._is_nan(X) 119 | count_nan_features = self._count_nans(X) 120 | count_undecodable = self._count_undecodable(X) 121 | categorical_features = self._process_categorical_features(X) 122 | numerical_features = self._process_numerical_features(X) 123 | 124 | features = [] 125 | 126 | features.append(date_features) 127 | features.append(is_nan_features) 128 | features.append(count_nan_features) 129 | features.append(categorical_features) 130 | features.append(numerical_features) 131 | 132 | features = np.hstack(features) 133 | 134 | return features 135 | 136 | -------------------------------------------------------------------------------- /Home Insurance/scripts/helper.py: -------------------------------------------------------------------------------- 1 | from sklearn.cross_validation import StratifiedShuffleSplit 2 | from sklearn.preprocessing import LabelEncoder 3 | from sklearn.grid_search import RandomizedSearchCV 4 | 5 | from collections import defaultdict 6 | 7 | 8 | import pandas as pd 9 | 10 | def encode_labels(train, test): 11 | """ 12 | Encodes the categorical features into numerical features 13 | for both train and test dataframes 14 | """ 15 | 16 | categorical_features = train.select_dtypes(['object']).columns 17 | 18 | for col in categorical_features: 19 | total_values = pd.concat([train[col], test[col]], axis=0) 20 | 21 | lbl = LabelEncoder() 22 | 23 | lbl.fit(total_values) 24 | train[col] = lbl.transform(train[col]) 25 | test[col] = lbl.transform(test[col]) 26 | 27 | return train, test 28 | 29 | def cv_optimize(X, y, cv, clf, parameters): 30 | """ 31 | Randomized Grid search on the parameter 
space to find out the best 32 | parameter settings to produce an accurate model 33 | """ 34 | 35 | rs = RandomizedSearchCV(clf, param_distributions=parameters, cv=cv, scoring='roc_auc') 36 | rs.fit(X, y) 37 | 38 | return rs 39 | 40 | def transform_for_ranked(preds, index): 41 | ranks = [] 42 | 43 | for i, pred in enumerate(preds): 44 | ranks.append((index[i], pred)) 45 | 46 | return ranks 47 | 48 | 49 | def ranked_averaging(predictions): 50 | all_ranks = defaultdict(list) 51 | 52 | for i, preds in enumerate(predictions): 53 | individual_ranks = [] 54 | 55 | for e, pred in enumerate(preds): 56 | individual_ranks.append( (float(pred[1]), e, pred[0]) ) 57 | 58 | for rank, item in enumerate( sorted(individual_ranks) ) : 59 | all_ranks[(item[1], item[2])].append(rank) 60 | 61 | average_ranks = [] 62 | 63 | for k in sorted(all_ranks): 64 | average_ranks.append((sum(all_ranks[k])/len(all_ranks[k]),k)) 65 | 66 | ranked_ranks = [] 67 | 68 | for rank, k in enumerate(sorted(average_ranks)): 69 | ranked_ranks.append((k[1][0],k[1][1],(rank * 1.)/(len(average_ranks)-1))) 70 | 71 | return sorted(ranked_ranks) 72 | -------------------------------------------------------------------------------- /Home Insurance/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def load_data(train_filename='./data/train.csv', test_filename='./data/test.csv'): 6 | 7 | print 'Loading datasets' 8 | 9 | train = pd.read_csv(train_filename, parse_dates=['Original_Quote_Date'], index_col='QuoteNumber') 10 | test = pd.read_csv(test_filename, parse_dates=['Original_Quote_Date'], index_col='QuoteNumber') 11 | 12 | print 'Setting Quote Number as index' 13 | 14 | return train, test 15 | 16 | 17 | def prepare_sample(train, n=1000): 18 | features = train.columns.drop('QuoteConversion_Flag') 19 | 20 | train_2013 = train[train.Original_Quote_Date.dt.year==2013].sample(n=n) 21 | train_2014 = train[train.Original_Quote_Date.dt.year==2014].sample(n=n) 22 | train_2015 = train[train.Original_Quote_Date.dt.year==2015].sample(n=n) 23 | 24 | train_merged = pd.concat([train_2013, train_2014, train_2015], axis=0) 25 | train_merged_shuffle = train_merged.iloc[np.random.permutation(len(train_merged))] 26 | 27 | X = train_merged_shuffle[features] 28 | y = train_merged_shuffle['QuoteConversion_Flag'] 29 | 30 | return X, y 31 | 32 | def random_sample(train, n): 33 | features = train.columns.drop('QuoteConversion_Flag') 34 | 35 | train = train.take(np.random.permutation(len(train))[:n]) 36 | 37 | X = train[features] 38 | y = train['QuoteConversion_Flag'] 39 | 40 | return X, y 41 | -------------------------------------------------------------------------------- /Home-Depot/scripts/cross-validation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Mar 30 23:33:52 2016 4 | 5 | @author: abhishek 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | import cPickle 10 | 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.preprocessing import StandardScaler 13 | from sklearn.linear_model import LinearRegression 14 | from sklearn.pipeline import Pipeline 15 | from sklearn.svm import SVR 16 | from sklearn.metrics import mean_squared_error 17 | 18 | np.random.seed(1729) 19 | 20 | with open('./data/synthesized/train_processed.pkl', 'r') as infile: 21 | train = cPickle.load(infile) 22 | infile.close() 23 | 24 | with 
open('./data/synthesized/test_processed.pkl', 'r') as infile: 25 | test = cPickle.load(infile) 26 | infile.close() 27 | 28 | 29 | 30 | 31 | X_train, X_test, y_train, y_test = train_test_split(corpus_svd, train.relevance, test_size=0.3, 32 | random_state=44) 33 | 34 | 35 | scaler = StandardScaler() 36 | svr = SVR() 37 | 38 | pipeline = Pipeline([('scaler', scaler), ('svr', svr)]) 39 | pipeline.fit(X_train, y_train) 40 | 41 | predsTrain = pipeline.predict(X_train) 42 | predsTest = pipeline.predict(X_test) 43 | 44 | print 'RMSE on training examples %f ' %(np.sqrt(mean_squared_error(y_train, predsTrain))) 45 | print 'RMSE on test examples %f ' %(np.sqrt(mean_squared_error(y_test, predsTest))) 46 | -------------------------------------------------------------------------------- /Home-Depot/scripts/dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 10 14:30:44 2016 4 | 5 | @author: abhishek 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | import re 10 | from search_map import spell_check_dict 11 | from nltk import word_tokenize 12 | from sklearn.metrics.pairwise import cosine_similarity 13 | from sklearn.metrics import jaccard_similarity_score 14 | from sklearn.feature_extraction.text import TfidfVectorizer 15 | from sklearn.decomposition import TruncatedSVD 16 | from scipy import sparse as sps 17 | 18 | class Dataset(object): 19 | def __init__(self, train, test): 20 | self.train = train.copy() 21 | self.test = test.copy() 22 | 23 | self.y = train.relevance 24 | 25 | self.tfidf_vectorizer = TfidfVectorizer() 26 | 27 | def correct_search_terms(self, train, test): 28 | def correct_term(q): 29 | if q in spell_check_dict: 30 | return spell_check_dict[q] 31 | else: 32 | return q 33 | 34 | train_search_terms = train.search_term 35 | test_search_terms = test.search_term 36 | 37 | return train_search_terms, test_search_terms 38 | 39 | def stem_word(self, word): 40 | suffixes = ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment'] 41 | 42 | for suffix in suffixes: 43 | if word.endswith(suffix): 44 | return word[:-len(suffix)] 45 | 46 | return word 47 | 48 | def tokenize(self, sentence): 49 | return word_tokenize(sentence) 50 | 51 | def stemming(self, sentence): 52 | tokens = self.tokenize(sentence) 53 | stemmed = ' '.join([self.stem_word(token) for token in tokens]) 54 | 55 | return stemmed 56 | 57 | def filter_characters(self, char): 58 | return char == '\n' or 32 <= ord(char) <= 126 59 | 60 | def sanitize_title(self, sentence): 61 | return filter(self.filter_characters, sentence) 62 | 63 | def preprocessing(self, to_stem=False): 64 | corrected_q_train, corrected_q_test = self.correct_search_terms(self.train, self.test) 65 | 66 | self.train['search_term'] = corrected_q_train 67 | self.test['search_term'] = corrected_q_test 68 | 69 | self.train['search_term'] = self.train.search_term.map(lambda x: x.lower()) 70 | self.test['search_term'] = self.test.search_term.map(lambda x: x.lower()) 71 | 72 | 73 | self.train['product_title'] = self.train.product_title.map(self.sanitize_title) 74 | self.test['product_title'] = self.test.product_title.map(self.sanitize_title) 75 | 76 | if to_stem: 77 | self.train['search_term'] = self.train.search_term.map(self.stemming) 78 | self.test['search_term'] = self.test.search_term.map(self.stemming) 79 | 80 | def num_tokens_query(self, query): 81 | return len(word_tokenize(query)) 82 | 83 | def num_tokens_title(self, title): 84 | return 
len(word_tokenize(title)) 85 | 86 | def cosine_similarity_score(self, row): 87 | query = row['search_term'] 88 | title = row['product_title'] 89 | 90 | corpus = np.array([query, title]) 91 | tfidf_matrix = self.tfidf_vectorizer.fit_transform(corpus) 92 | 93 | normal_array = tfidf_matrix.toarray() 94 | 95 | query_repr = normal_array[0].reshape(-1, 1) 96 | title_repr = normal_array[1].reshape(-1, 1) 97 | 98 | return cosine_similarity(query_repr, title_repr)[0][0] 99 | 100 | def jaccard_score(self, row): 101 | query = row['search_term'] 102 | title = row['product_title'] 103 | 104 | corpus = np.array([query, title]) 105 | tfidf_matrix = self.tfidf_vectorizer.fit_transform(corpus) 106 | 107 | return jaccard_similarity_score(tfidf_matrix[0], tfidf_matrix[1]) 108 | 109 | 110 | def numerical_features(self): 111 | """ 112 | 1. Number of tokens in the query 113 | 2. Number of tokens in the title 114 | 3. Cosine similarity between title and query 115 | """ 116 | 117 | self.train['num_query_tokens'] = self.train.search_term.map(self.num_tokens_query) 118 | self.test['num_query_tokens'] = self.test.search_term.map(self.num_tokens_query) 119 | 120 | self.train['num_title_tokens'] = self.train.product_title.map(self.num_tokens_title) 121 | self.test['num_title_tokens'] = self.test.product_title.map(self.num_tokens_title) 122 | 123 | self.train['cosine_score'] = self.train.apply(self.cosine_similarity_score, axis=1) 124 | self.test['cosine_score'] = self.test.apply(self.cosine_similarity_score, axis=1) 125 | 126 | def text_features(self): 127 | corpus_train = self.train.apply(lambda x: '%s %s' %(x['product_title'], x['search_term']), axis=1) 128 | corpus_test = self.test.apply(lambda x: '%s %s' %(x['product_title'], x['search_term']), axis=1) 129 | 130 | tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=3) 131 | corpus = tfidf.fit_transform(corpus_train.values) 132 | corpus_test = tfidf.transform(corpus_test.values) 133 | 134 | svd = TruncatedSVD(n_components=200) 135 | 136 | self.corpus_svd = svd.fit_transform(corpus) 137 | self.corpus_test_svd = svd.transform(corpus_test) 138 | 139 | def combine_features(self): 140 | features = ['num_query_tokens', 'num_title_tokens', 'cosine_score'] 141 | 142 | numerical_features = self.train[features] 143 | numerical_features_test = self.test[features] 144 | 145 | self.processed_features_train = sps.hstack([numerical_features, self.corpus_svd]) 146 | self.processesd_features_test = sps.hstack([numerical_features_test, self.corpus_test_svd]) 147 | 148 | 149 | train = pd.read_csv('./data/train.csv') 150 | test = pd.read_csv('./data/test.csv') 151 | 152 | dataset = Dataset(train, test) 153 | dataset.preprocessing() 154 | dataset.text_features() 155 | dataset.numerical_features() 156 | dataset.combine_features() 157 | 158 | -------------------------------------------------------------------------------- /Home-Depot/scripts/eda.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 5 08:58:07 2016 4 | 5 | @author: abhishek 6 | """ 7 | import pandas as pd 8 | import re 9 | 10 | # import libraries used for nlp 11 | from __future__ import division 12 | from nltk import word_tokenize 13 | from nltk import FreqDist 14 | from search_map import spell_check_dict 15 | 16 | # load train, test, description and attributes files 17 | train = pd.read_csv('./data/train.csv', index_col='id') 18 | test = pd.read_csv('./data/test.csv', index_col='id') 19 | 20 | description = 
pd.read_csv('./data/product_descriptions.csv') 21 | attributes = pd.read_csv('./data/attributes.csv') 22 | 23 | 24 | ## Frequency Analysis 25 | def default_tokenizer(sentence): 26 | return sentence.split(' ') 27 | 28 | def tokenize(sentence, tokenizer_type='word'): 29 | if tokenizer_type == 'word': 30 | return word_tokenize(sentence) 31 | else: 32 | return default_tokenizer(sentence) 33 | 34 | def tokenize_sentences(sentences, n): 35 | tokens = [] 36 | 37 | for i in range(0, n): 38 | tokens.extend(tokenize(sentences[i])) 39 | 40 | return tokens 41 | 42 | def frequency_analysis(search_terms, n=50, num_terms=5): 43 | tokens_list = tokenize_sentences(search_terms, n=n) 44 | fdist = FreqDist(tokens_list) 45 | 46 | return fdist.most_common(n=num_terms) 47 | 48 | 49 | ## Relevance scores based on different patterns on training corpus 50 | def relevance_scores_by_pattern(train, pattern): 51 | query_list = [(idx, w) for (idx, w) in enumerate(train.search_term.values) if re.search(pattern, w)] 52 | relevance_scores = [train.iloc[idx]['relevance'] for (idx, w) in query_list] 53 | 54 | return relevance_scores 55 | 56 | 57 | # Do spelling mistakes have an effect on relevance scores? 58 | def spelling_mistakes_effect(train): 59 | train = train.copy() 60 | boolean_indicator = [1 if q in spell_check_dict else 0 for q in train.search_term] 61 | train['is_incorrect'] = boolean_indicator 62 | 63 | mean_relevance_score_correct = train[train.is_incorrect == 0].relevance.mean() 64 | mean_relevance_score_incorrect = train[train.is_incorrect == 1].relevance.mean() 65 | 66 | return mean_relevance_score_correct, mean_relevance_score_incorrect 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /Home-Depot/scripts/numerical_features.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Kaggle Home Depot Challenge 4 | 5 | Evaluation Metric : RMSE 6 | """ 7 | from __future__ import division 8 | 9 | import pandas as pd 10 | import re 11 | from sklearn.feature_extraction import text 12 | from difflib import SequenceMatcher as seq_matcher 13 | import cPickle 14 | 15 | 16 | pattern = re.compile(r'\b(' + r'|'.join(text.ENGLISH_STOP_WORDS) + r')\b\s*') 17 | 18 | 19 | # load train and test set 20 | train = pd.read_csv('../data/train.csv') 21 | test = pd.read_csv('../data/test.csv') 22 | 23 | # load product description and atttributes data 24 | description = pd.read_csv('../data/product_descriptions.csv') 25 | attributes = pd.read_csv('../data/attributes.csv') 26 | 27 | brand = attributes[attributes.name == 'MFG Brand Name'][['product_uid', 'value']].rename(columns={'value': 'brand'}) 28 | color = attributes[attributes.name == 'Color Family'][['product_uid', 'value']].rename(columns={'value': 'color'}) 29 | 30 | # most of the queries are relevant 31 | search_term_frequency_train = train.groupby('product_uid').size().reset_index() 32 | search_term_frequency_train.columns = ['product_uid', 'query_frequency'] 33 | 34 | search_term_frequency_test = test.groupby('product_uid').size().reset_index() 35 | search_term_frequency_test.columns = ['product_uid', 'query_frequency'] 36 | 37 | 38 | # merge this with train and test set 39 | train = pd.merge(train, search_term_frequency_train, on='product_uid', how='left') 40 | test = pd.merge(test, search_term_frequency_test, on='product_uid', how='left') 41 | 42 | # merge with description, brand and color dataset 43 | train = pd.merge(train, description, 
on='product_uid', how='left') 44 | test = pd.merge(test, description, on='product_uid', how='left') 45 | 46 | train = pd.merge(train, brand, on='product_uid', how='left') 47 | test = pd.merge(test, brand, on='product_uid', how='left') 48 | 49 | train = pd.merge(train, color, on='product_uid', how='left') 50 | test = pd.merge(test, color, on='product_uid', how='left') 51 | 52 | # missing values 53 | train = train.fillna('') 54 | test = test.fillna('') 55 | 56 | 57 | # some preprocessing functions 58 | def filter_characters(char): 59 | return char == '\n' or 32 <= ord(char) <= 126 60 | 61 | def sanitize(s): 62 | s = s.replace('ft.', 'feet') 63 | s = s.replace('cu.', 'cubic') 64 | s = s.replace('mm', 'milimeters') 65 | s = s.replace('oz.', 'ounces') 66 | s = s.replace('btu', 'british thermal unit') 67 | s = s.replace('otr', 'over the range') 68 | s = s.replace('lb.', 'pounds') 69 | s = s.replace('in.', 'inches') 70 | s = s.replace('&', 'and') 71 | s = s.replace('sq.', 'square') 72 | s = s.replace('gal.', 'gallon') 73 | 74 | return s 75 | 76 | def preprocess(s): 77 | s = filter(filter_characters, s) 78 | s = s.lower() 79 | s = sanitize(s) 80 | 81 | return pattern.sub('', s) 82 | 83 | # sanitize training and test 84 | train.loc[:, 'product_title'] = train.product_title.map(preprocess) 85 | train.loc[:, 'search_term'] = train.search_term.map(preprocess) 86 | 87 | train.loc[:, 'product_description'] = train.product_description.map(preprocess) 88 | train.loc[:, 'brand'] = train.brand.map(preprocess) 89 | train.loc[:, 'color'] = train.color.map(preprocess) 90 | 91 | 92 | test.loc[:, 'product_title'] = test.product_title.map(preprocess) 93 | test.loc[:, 'search_term'] = test.search_term.map(preprocess) 94 | 95 | test.loc[:, 'product_description'] = test.product_description.map(preprocess) 96 | test.loc[:, 'brand'] = test.brand.map(preprocess) 97 | test.loc[:, 'color'] = test.color.map(preprocess) 98 | 99 | 100 | 101 | # feature engineering 102 | def query_title_overlap(row): 103 | query = row['search_term'] 104 | title = row['product_title'] 105 | query_words = query.split() 106 | 107 | count_overlap = 0 108 | for word in query_words: 109 | if query in title: 110 | count_overlap += 1 111 | 112 | return count_overlap / (len(query_words) + 1) 113 | 114 | def query_description_overlap(row): 115 | query = row['search_term'] 116 | description = row['product_description'] 117 | query_words = query.split() 118 | 119 | count_overlap = 0 120 | for word in query_words: 121 | if query in description: 122 | count_overlap += 1 123 | 124 | return count_overlap / (len(query_words) + 1) 125 | 126 | def brand_matches(row): 127 | query = row['search_term'] 128 | brand = row['brand'] 129 | query_words = query.split() 130 | 131 | count_overlap = 0 132 | for word in query_words: 133 | if query in brand: 134 | count_overlap += 1 135 | 136 | return count_overlap 137 | 138 | def compute_one_edit_distance(row): 139 | query = row['search_term'] 140 | title = row['product_title'] 141 | 142 | return 1 - seq_matcher(None, query, title).ratio() 143 | 144 | train.loc[:, 'num_words_in_query'] = train.search_term.map(lambda x: len(x.split())) 145 | test.loc[:, 'num_words_in_query'] = test.search_term.map(lambda x: len(x.split())) 146 | 147 | train.loc[:, 'query_title_overlap'] = train.apply(query_title_overlap, axis=1) 148 | test.loc[:, 'query_title_overlap'] = test.apply(query_title_overlap, axis=1) 149 | 150 | train.loc[:, 'one_edit_distance'] = train.apply(compute_one_edit_distance, axis=1) 151 | test.loc[:, 'one_edit_distance'] 
= test.apply(compute_one_edit_distance, axis=1) 152 | 153 | train.loc[:, 'query_description_overlap'] = train.apply(query_description_overlap, axis=1) 154 | test.loc[:, 'query_description_overlap'] = test.apply(query_description_overlap, axis=1) 155 | 156 | train.loc[:, 'brand_match'] = train.apply(brand_matches, axis=1) 157 | test.loc[:, 'brand_match'] = test.apply(brand_matches, axis=1) 158 | 159 | # serialize the object 160 | with open('../data/train_processed.pkl', 'w') as outfile: 161 | cPickle.dump(train, outfile) 162 | outfile.close() 163 | 164 | with open('../data/test_processed.pkl', 'w') as outfile: 165 | cPickle.dump(test, outfile) 166 | outfile.close() 167 | 168 | -------------------------------------------------------------------------------- /Home-Depot/scripts/search_map.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numb3r33/Kaggle-Competitions/6c4062dbcbd80869a2e0f5b93723ad217963d35b/Home-Depot/scripts/search_map.pyc -------------------------------------------------------------------------------- /Home-Depot/scripts/text-features.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Mar 27 13:50:57 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | 9 | import pandas as pd 10 | import numpy as np 11 | import re 12 | from nltk.stem import PorterStemmer 13 | from sklearn.feature_extraction.text import TfidfVectorizer 14 | from sklearn.decomposition import TruncatedSVD 15 | from search_map import spell_check_dict 16 | 17 | 18 | stemmer = PorterStemmer() 19 | 20 | # load train and test set 21 | train = pd.read_csv('../data/train.csv') 22 | test = pd.read_csv('../data/test.csv') 23 | 24 | # load product description and atttributes data 25 | description = pd.read_csv('../data/product_descriptions.csv') 26 | attributes = pd.read_csv('../data/attributes.csv') 27 | 28 | def stem_words(sentence): 29 | return ' '.join([stemmer.stem(word) for word in sentence.split(' ')]) 30 | 31 | 32 | ## NOTE: 33 | ## graders were shown images instead of product descriptions 34 | ## and other attributes 35 | 36 | train = pd.merge(train, description, on='product_uid', how='left') 37 | test = pd.merge(test, description, on='product_uid', how='left') 38 | 39 | def correct_term(q): 40 | if q in spell_check_dict: 41 | return spell_check_dict[q] 42 | else: 43 | return q 44 | 45 | # correct search queries 46 | train.loc[:, 'search_term'] = train.search_term.map(correct_term) 47 | test.loc[:, 'search_term'] = test.search_term.map(correct_term) 48 | 49 | 50 | ## remove non-alphanumeric characters 51 | train.loc[:, 'product_description'] = train.product_description.map(lambda x: re.sub(r'[^A-Za-z0-9 ]', 52 | ' ', x)) 53 | 54 | train.loc[:, 'search_term'] = train.search_term.map(lambda x: re.sub(r'[^A-Za-z0-9 ]', 55 | ' ', x)) 56 | 57 | test.loc[:, 'product_description'] = test.product_description.map(lambda x: re.sub(r'[^A-Za-z0-9 ]', 58 | ' ', x)) 59 | 60 | test.loc[:, 'search_term'] = test.search_term.map(lambda x: re.sub(r'[^A-Za-z0-9 ]', 61 | ' ', x)) 62 | 63 | train.loc[:, 'product_description'] = train.product_description.map(stem_words) 64 | train.loc[:, 'search_term'] = train.search_term.map(stem_words) 65 | 66 | test.loc[:, 'product_description'] = test.product_description.map(stem_words) 67 | test.loc[:, 'search_term'] = test.product_description.map(stem_words) 68 | 69 | 70 | # corpus 71 | corpus = train.apply(lambda x: '%s %s' 
%(x['product_title'].lower(), x['search_term'].lower()), axis=1) 72 | corpus_test = test.apply(lambda x: '%s %s' %(x['product_title'].lower(), x['search_term'].lower()), axis=1) 73 | 74 | tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=3) 75 | corpus = tfidf.fit_transform(corpus.values) 76 | corpus_test = tfidf.transform(corpus_test.values) 77 | 78 | svd = TruncatedSVD(n_components=200) 79 | corpus_svd = svd.fit_transform(corpus) 80 | corpus_test_svd = svd.transform(corpus_test) 81 | 82 | -------------------------------------------------------------------------------- /Predicting-Grants/.ipynb_checkpoints/Data Analysis-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /Predicting-Grants/.ipynb_checkpoints/Description-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /Predicting-Grants/Description.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Task Description" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "** This task requires participants to predict the outcome of grant applications for the University of Melbourne. **\n", 15 | "\n", 16 | "### Why should this problem be solved ?\n", 17 | "* Pool of funds available for research grants is steadily shrinking (in a relative sense). \n", 18 | "* In Australia, success rates have fallen to 20-25 per cent, meaning that most academics are spending valuable time making applications that end up being rejected." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Descrption about the dataset." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "* Dataset containing 249 features, including variables that represent the size of the grant, the general area of study and de-identified information on the investigators who are applying for the grant. \n", 33 | "* There 8,707 grant applications made between 2004 and 2008 which constitute **training examples**. Then there are 2,176 applications made in 2009 and the first half of 2010 which can be used as **test set**." 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Evaluation Metric" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### AUC - (Area Under Curve)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "

A metric measures the quality of a model's predictions. There are many different metrics; Area Under the Curve (AUC) is one of them. For further reading, follow the description below.
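For example, AUC can be computed with scikit-learn's `roc_auc_score`; the labels and scores below are made up purely for illustration:

```python
from sklearn.metrics import roc_auc_score

y_true = [0, 0, 1, 1]            # hypothetical grant outcomes (0 = unsuccessful, 1 = successful)
y_score = [0.1, 0.4, 0.35, 0.8]  # hypothetical predicted probabilities
print(roc_auc_score(y_true, y_score))  # 0.75
```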

" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 8, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/html": [ 67 | "\n", 68 | " \n", 75 | " " 76 | ], 77 | "text/plain": [ 78 | "" 79 | ] 80 | }, 81 | "execution_count": 8, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "from IPython.display import IFrame\n", 88 | "\n", 89 | "IFrame('http://fastml.com/what-you-wanted-to-know-about-auc/', width=700, height=350)" 90 | ] 91 | } 92 | ], 93 | "metadata": { 94 | "kernelspec": { 95 | "display_name": "Python 2", 96 | "language": "python", 97 | "name": "python2" 98 | }, 99 | "language_info": { 100 | "codemirror_mode": { 101 | "name": "ipython", 102 | "version": 2 103 | }, 104 | "file_extension": ".py", 105 | "mimetype": "text/x-python", 106 | "name": "python", 107 | "nbconvert_exporter": "python", 108 | "pygments_lexer": "ipython2", 109 | "version": "2.7.10" 110 | } 111 | }, 112 | "nbformat": 4, 113 | "nbformat_minor": 0 114 | } 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kaggle-Competitions 2 | All Kaggle competitions 3 | -------------------------------------------------------------------------------- /Rossman-Stores-Sales/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | submissions/ 3 | -------------------------------------------------------------------------------- /Rossman-Stores-Sales/.ipynb_checkpoints/rossman_store_sales-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /Rossman-Stores-Sales/rossman_store_sales.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "C:\\Users\\Abhishek\\Anaconda2\\lib\\site-packages\\matplotlib\\__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.\n", 15 | " warnings.warn(self.msg_depr % (key, alt_key))\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "# special IPython command to prepare the notebook for matplotlib\n", 21 | "%matplotlib inline \n", 22 | "\n", 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "import scipy.stats as stats\n", 26 | "import matplotlib.pyplot as plt\n", 27 | "import sklearn\n", 28 | "import statsmodels.api as sm\n", 29 | "\n", 30 | "import seaborn as sns\n", 31 | "sns.set_style(\"whitegrid\")\n", 32 | "sns.set_context(\"poster\")\n", 33 | "\n", 34 | "from math import sqrt\n", 35 | "\n", 36 | "from sklearn.preprocessing import LabelEncoder\n", 37 | "from sklearn.linear_model import LinearRegression\n", 38 | "from sklearn.ensemble import RandomForestRegressor\n", 39 | "\n", 40 | "# special matplotlib argument for improved plots\n", 41 | "from matplotlib import rcParams\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 179, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "%run 
scripts/rossman.py\n", 53 | "%run scripts/helper.py" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 180, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "rossman = Rossman('./data/train.csv', './data/test.csv', './data/store.csv')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 181, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "# merge with stores data\n", 76 | "train_df_merged = rossman.merge_stores_data()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 182, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "# consider only those entries with non-zero sales value\n", 88 | "train_df_with_non_zero_sales = rossman.non_zero_sales_data()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 183, 94 | "metadata": { 95 | "collapsed": true 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "# test dataset\n", 100 | "test_df = rossman.test_df.copy()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 184, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [ 110 | { 111 | "name": "stderr", 112 | "output_type": "stream", 113 | "text": [ 114 | ":48: SettingWithCopyWarning: \n", 115 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 116 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 117 | "\n", 118 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "# preprocessing - converting all categorical variables into numerical values\n", 124 | "train_df_processed, test_df_processed = preprocessing(train_df_with_non_zero_sales, test_df)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 185, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "# create three separate training examples for three years\n", 136 | "\n", 137 | "# train_df_2013 = get_data(train_df, '2013-01-01', '2013-12-31')\n", 138 | "train_df_2014_2015 = get_data(train_df, '2014-01-01', '2015-12-31')\n", 139 | "# train_df_2015 = get_data(train_df, '2015-01-01', '2015-12-31')" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 186, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "features = train_df_2013.columns.drop(['Date', 'Sales'])" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 187, 156 | "metadata": { 157 | "collapsed": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "# X_train_2013 = train_df_2013[features]\n", 162 | "X_train_2014_2015 = train_df_2014[features]\n", 163 | "# X_train_2015 = train_df_2015[features]" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 123, 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "# y_train_2013 = np.log1p(train_df_2013.Sales)\n", 175 | "y_train_2014_2015 = np.log1p(train_df_2014.Sales)\n", 176 | "# y_train_2015 = np.log1p(train_df_2015.Sales)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 61, 182 | "metadata": { 183 | "collapsed": true 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "# Extreme Gradient Boosting\n", 188 | "## Creating models on dataset from three different 
years\n", 189 | "## and testing it out on the final 6 weeks of year 2015\n", 190 | "\n", 191 | "import xgboost as xgb" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 124, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "# training a model on data from year 2013\n", 203 | "# dtrain_2013 = xgb.DMatrix(X_train_2013, y_train_2013, missing=-999.0)\n", 204 | "dtrain_2014 = xgb.DMatrix(X_train_2014, y_train_2014, missing=-999.0)\n", 205 | "# dtrain_2015 = xgb.DMatrix(X_train_2015, y_train_2015, missing=-999.0)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 128, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "name": "stderr", 217 | "output_type": "stream", 218 | "text": [ 219 | "C:\\Users\\Abhishek\\Anaconda2\\lib\\site-packages\\pandas\\core\\generic.py:2862: SettingWithCopyWarning: \n", 220 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 221 | "\n", 222 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 223 | " self._update_inplace(new_data)\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "Xtest = test_df_processed[features]\n", 229 | "\n", 230 | "Xtest.Open.fillna(1, inplace=True)\n", 231 | "\n", 232 | "Xtest_open_stores = Xtest[Xtest.Open == 1]\n", 233 | "Xtest_closed_stores = Xtest[Xtest.Open == 0]\n", 234 | "\n", 235 | "dtest = xgb.DMatrix(Xtest_open_stores, missing=-999.0)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 129, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "params_2014 = dict((('silent', 1), ('nthread', 8), ('objective', 'reg:linear'),('eta', 0.05), \n", 247 | " ('subsample', 0.8), ('colsample_bytree', 0.7), ('min_child_weight', 5), ('max_depth', 8)))\n", 248 | "num_round = 1500\n", 249 | "\n", 250 | "model_2014 = xgb.train(params_2014, dtrain_2014, num_round, feval=rmspe_xg)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 130, 256 | "metadata": { 257 | "collapsed": false 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "# predictions for the year 2014\n", 262 | "\n", 263 | "predictions_test = np.expm1(model_2014.predict(dtest))" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 175, 269 | "metadata": { 270 | "collapsed": true 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "# predictions for open and closed stores and then stack them together\n", 275 | "open_stores_test_ids = Xtest_open_stores.index.values + 1\n", 276 | "closed_stores_test_ids = Xtest_closed_stores.index.values + 1\n", 277 | "\n", 278 | "open_stores_preds = predictions_test\n", 279 | "closed_stores_preds = [0.] 
* len(closed_stores_test_ids)\n", 280 | "\n", 281 | "final_ids = np.hstack([open_stores_test_ids, closed_stores_test_ids])\n", 282 | "final_preds = np.hstack([open_stores_preds, closed_stores_preds])" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 178, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "create_submission(final_ids, final_preds, 'xgb_only_2014.csv')" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 97, 299 | "metadata": { 300 | "collapsed": true 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "params_2015 = dict((('silent', 1), ('nthread', 8), ('objective', 'reg:linear'),('eta', 0.05), \n", 305 | " ('subsample', 0.8), ('colsample_bytree', 0.7), ('min_child_weight', 5), ('max_depth', 8)))\n", 306 | "num_round = 1000\n", 307 | "\n", 308 | "model_2015 = xgb.train(params_2015, dtrain_2015, num_round, feval=rmspe_xg)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 98, 314 | "metadata": { 315 | "collapsed": false 316 | }, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "RMSPE error for model based on examples from the year 2015 0.224573258686\n" 323 | ] 324 | } 325 | ], 326 | "source": [ 327 | "# predictions for the year 2014\n", 328 | "predictions_2015 = np.expm1(model_2015.predict(dtest))\n", 329 | "\n", 330 | "print 'RMSPE error for model based on examples from the year 2015 ', rmspe(ytest, predictions_2015)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 99, 336 | "metadata": { 337 | "collapsed": false 338 | }, 339 | "outputs": [ 340 | { 341 | "data": { 342 | "text/html": [ 343 | "
\n", 344 | "\n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | "
          2013      2014      2015
2013  1.000000  0.940708  0.918543
2014  0.940708  1.000000  0.933550
2015  0.918543  0.933550  1.000000
\n", 374 | "
" 375 | ], 376 | "text/plain": [ 377 | " 2013 2014 2015\n", 378 | "2013 1.000000 0.940708 0.918543\n", 379 | "2014 0.940708 1.000000 0.933550\n", 380 | "2015 0.918543 0.933550 1.000000" 381 | ] 382 | }, 383 | "execution_count": 99, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "# find the correlations between three predictions\n", 390 | "prediction_df = pd.DataFrame({'2013': predictions_2013, '2014': predictions_2014, '2015': predictions_2015})\n", 391 | "prediction_df.corr()" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 108, 397 | "metadata": { 398 | "collapsed": true 399 | }, 400 | "outputs": [], 401 | "source": [ 402 | "prediction_avg = .1 * predictions_2013 + 0.8 * predictions_2014 + .1 * predictions_2015" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 109, 408 | "metadata": { 409 | "collapsed": false 410 | }, 411 | "outputs": [ 412 | { 413 | "name": "stdout", 414 | "output_type": "stream", 415 | "text": [ 416 | "RMSPE error for average of the predictions of three models 0.171631453195\n" 417 | ] 418 | } 419 | ], 420 | "source": [ 421 | "print 'RMSPE error for average of the predictions of three models ', rmspe(ytest, prediction_avg)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 111, 427 | "metadata": { 428 | "collapsed": false 429 | }, 430 | "outputs": [ 431 | { 432 | "data": { 433 | "text/plain": [ 434 | "1 35093\n", 435 | "0 5984\n", 436 | "Name: Open, dtype: int64" 437 | ] 438 | }, 439 | "execution_count": 111, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "rossman.test_df.Open.value_counts()" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": { 452 | "collapsed": true 453 | }, 454 | "outputs": [], 455 | "source": [] 456 | } 457 | ], 458 | "metadata": { 459 | "kernelspec": { 460 | "display_name": "Python 2", 461 | "language": "python", 462 | "name": "python2" 463 | }, 464 | "language_info": { 465 | "codemirror_mode": { 466 | "name": "ipython", 467 | "version": 2 468 | }, 469 | "file_extension": ".py", 470 | "mimetype": "text/x-python", 471 | "name": "python", 472 | "nbconvert_exporter": "python", 473 | "pygments_lexer": "ipython2", 474 | "version": "2.7.10" 475 | } 476 | }, 477 | "nbformat": 4, 478 | "nbformat_minor": 0 479 | } 480 | -------------------------------------------------------------------------------- /Rossman-Stores-Sales/scripts/helper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sklearn.preprocessing import LabelEncoder 5 | 6 | 7 | def ToWeight(y): 8 | w = np.zeros(y.shape, dtype=float) 9 | ind = y != 0 10 | w[ind] = 1./(y[ind]**2) 11 | return w 12 | 13 | 14 | def rmspe(yhat, y): 15 | w = ToWeight(y) 16 | rmspe = np.sqrt(np.mean( w * (y - yhat)**2 )) 17 | return rmspe 18 | 19 | 20 | 21 | def rmspe_xg(yhat, y): 22 | 23 | """ 24 | This implementation of Root Mean Square Percentage error 25 | for XGBoost. 
26 | """ 27 | 28 | y = y.get_label() 29 | y = np.exp(y) - 1 30 | yhat = np.exp(yhat) - 1 31 | w = ToWeight(y) 32 | rmspe = np.sqrt(np.mean(w * (y - yhat)**2)) 33 | return "rmspe", rmspe 34 | 35 | 36 | def get_object_cols(train_df): 37 | return [col for col in train_df.columns if train_df[col].dtype == 'O'] 38 | 39 | def preprocessing(train_df, test_df): 40 | cols = get_object_cols(train_df) 41 | 42 | for col in cols: 43 | lbl = LabelEncoder() 44 | data = pd.concat([train_df[col], test_df[col]]) 45 | 46 | lbl.fit(data) 47 | 48 | train_df[col] = lbl.transform(train_df[col]) 49 | test_df[col] = lbl.transform(test_df[col]) 50 | 51 | return train_df, test_df 52 | 53 | def get_data(train_df, start_date, end_date): 54 | """ 55 | Gets data between date range 56 | """ 57 | mask = ((train_df.Date >= start_date) & (train_df.Date <= end_date)) 58 | return train_df[mask] 59 | 60 | 61 | def create_submission(ids, preds, filename): 62 | submission_df = pd.DataFrame({'Id': ids, 'Sales': preds}) 63 | submission_df.to_csv('./submissions/' + filename, index=False) -------------------------------------------------------------------------------- /Rossman-Stores-Sales/scripts/rossman.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from math import sqrt 3 | import numpy as np 4 | from sklearn.cross_validation import train_test_split 5 | 6 | class Rossman(): 7 | 8 | def __init__(self, train_file_path, test_file_path, stores_file_path): 9 | """ 10 | Sets in the file path for training, test and store 11 | info csv's 12 | """ 13 | 14 | self.train_df = self.load_dataset(train_file_path, date_col='Date') 15 | self.test_df = self.load_dataset(test_file_path, date_col='Date') 16 | self.stores_df = self.load_dataset(stores_file_path) 17 | 18 | 19 | def load_dataset(self, file_path, date_col=None): 20 | """ 21 | Loads dataset based on file path 22 | """ 23 | 24 | if date_col: 25 | return pd.read_csv(file_path, parse_dates=[date_col]) 26 | else: 27 | return pd.read_csv(file_path) 28 | 29 | 30 | def non_zero_sales_data(self): 31 | mask = self.train_df.Sales > 0 32 | return self.train_df[mask] 33 | 34 | def split_train_test_mask(self, train_df, threshold_date, random_state=0): 35 | 36 | """ 37 | Splits the train_df into training and testing set 38 | training data will have all the examples except for last 6 weeks 39 | test data will examples for last 6 weeks 40 | """ 41 | features = train_df.columns.drop(['Customers', 'PromoInterval']) 42 | 43 | train_df_before_threshold = train_df[train_df.Date <= threshold_date][features] 44 | train_df_afer_threhold = train_df[train_df.Date > threshold_date][features] 45 | 46 | return train_df_before_threshold, train_df_afer_threhold 47 | 48 | 49 | 50 | def merge_stores_data(self): 51 | """ 52 | Merge store information with training data and test data 53 | """ 54 | 55 | self.train_df = pd.merge(self.train_df, self.stores_df, on='Store', how='left') 56 | self.test_df = pd.merge(self.test_df, self.stores_df, on='Store', how='left') 57 | 58 | 59 | -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/.gitignore: -------------------------------------------------------------------------------- 1 | ../.DS_Store 2 | data/ 3 | submissions/ 4 | -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: 
utf-8 -*- 2 | """ 3 | Created on Mon Mar 28 07:52:35 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | import pandas as pd 9 | 10 | ## Evaluation metric is AUC 11 | 12 | # load train and test files 13 | train = pd.read_csv('data/train.csv', index_col='ID') 14 | test = pd.read_csv('data/test.csv', index_col='ID') 15 | 16 | 17 | ## NOTES 18 | ## 19 | ## 1. 9999999999 to mark missing values 20 | ## 2. -999999 to mark missing values 21 | 22 | ## need to remove some features because they are either constant or 23 | ## identical to other column 24 | 25 | def get_constant_features(df, columns): 26 | constant_features = [] 27 | 28 | for col in columns: 29 | if df[col].std() == 0.0: 30 | constant_features.append(col) 31 | 32 | return constant_features 33 | 34 | def get_identical_features(df, columns): 35 | identical_features = [] 36 | 37 | for i in range(len(columns)): 38 | for j in range(i + 1, len(columns)): 39 | if (df[columns[i]] == df[columns[j]]).all(): 40 | identical_features.append(columns[i]) 41 | 42 | identical_features = set(identical_features) 43 | identical_features = list(identical_features) 44 | 45 | return identical_features 46 | 47 | def concat_features(constant_features, identical_features): 48 | features_to_remove = [] 49 | 50 | for col in constant_features: 51 | features_to_remove.append(col) 52 | 53 | for col in identical_features: 54 | features_to_remove.append(col) 55 | 56 | return features_to_remove 57 | 58 | columns = train.columns 59 | 60 | constant_features = get_constant_features(train, columns) 61 | columns = columns.drop(constant_features) 62 | 63 | identical_features = get_identical_features(train, columns) 64 | features_to_remove = concat_features(constant_features, identical_features) 65 | 66 | ## var 3 has missing value ( -999999 ) 67 | ## 26 more features with missing values 68 | ## Here is the list 69 | 70 | some_more_features_with_constant_value = ['delta_num_trasp_var33_out_1y3', 'delta_num_reemb_var33_1y3', 71 | 'delta_imp_trasp_var33_out_1y3', 'delta_imp_reemb_var33_1y3', 72 | 'delta_imp_amort_var34_1y3', 'delta_imp_amort_var18_1y3'] 73 | 74 | features_with_9999999999 = ['delta_imp_amort_var18_1y3', 'delta_imp_amort_var34_1y3', 75 | 'delta_imp_aport_var13_1y3', 'delta_imp_aport_var17_1y3', 76 | 'delta_imp_aport_var33_1y3', 'delta_imp_compra_var44_1y3', 77 | 'delta_imp_venta_var44_1y3', 'delta_num_aport_var13_1y3', 78 | 'delta_num_aport_var17_1y3', 'delta_num_aport_var33_1y3', 79 | 'delta_num_compra_var44_1y3', 'delta_num_reemb_var13_1y3', 80 | 'delta_num_reemb_var17_1y3', 'delta_num_reemb_var33_1y3', 81 | 'delta_num_trasp_var17_in_1y3', 'delta_num_trasp_var17_out_1y3', 82 | 'delta_num_trasp_var33_in_1y3', 'delta_num_trasp_var33_out_1y3', 83 | 'delta_num_venta_var44_1y3' 84 | ] 85 | 86 | for feat_name in features_with_9999999999: 87 | train.loc[:, 'missing_%s' %(feat_name)] = (train[feat_name] == train[feat_name].max()).astype(int) 88 | train.loc[:, feat_name] = train[feat_name].fillna(train[feat_name].mode()) 89 | 90 | test.loc[:, 'missing_%s' %(feat_name)] = (test[feat_name] == test[feat_name].max()).astype(int) 91 | test.loc[:, feat_name] = test[feat_name].fillna(test[feat_name].mode()) 92 | 93 | 94 | for feat_name in some_more_features_with_constant_value: 95 | features_to_remove.append(feat_name) 96 | 97 | # treat var3 differently 98 | train.loc[:, 'missing_value_var3'] = (train.var3 == -999999).astype(int) 99 | train.loc[:, 'var3'] = train.var3.fillna(train.var3.mode()) 100 | 101 | test.loc[:, 'missing_value_var3'] = (test.var3 == -999999).astype(int) 102 | 
test.loc[:, 'var3'] = test.var3.fillna(train.var3.mode()) 103 | 104 | # remove features 105 | features = train.columns.drop(features_to_remove) 106 | 107 | train_subset = train[features] 108 | 109 | features = features.drop('TARGET') 110 | test_subset = test[features] 111 | 112 | train_subset.to_csv('./data/train_processed_handle_na.csv', index=False) 113 | test_subset.to_csv('./data/test_processed_handle_na.csv', index=False) 114 | -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/blending.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 4 08:38:24 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | from __future__ import division 9 | from sklearn.cross_validation import train_test_split, StratifiedKFold 10 | from sklearn.metrics import roc_auc_score 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.linear_model import LogisticRegression 13 | from xgboost import XGBClassifier 14 | 15 | import pandas as pd 16 | import numpy as np 17 | 18 | # load train and test files 19 | 20 | train = pd.read_csv('./data/train.csv', index_col='ID') 21 | test = pd.read_csv('./data/test.csv', index_col='ID') 22 | 23 | 24 | # set random seed 25 | np.random.seed(10) 26 | 27 | X = train[train.columns.drop('TARGET')] 28 | y = train.TARGET 29 | 30 | Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=44) 31 | 32 | n_folds = 10 33 | 34 | skf = list(StratifiedKFold(ytrain, n_folds)) 35 | 36 | clfs = [XGBClassifier(n_estimators=147, learning_rate=0.1, min_child_weight=2, colsample_bytree=0.9, subsample=0.95, seed=1279), 37 | XGBClassifier(n_estimators=264, learning_rate=0.05, min_child_weight=2, colsample_bytree=0.9, subsample=0.9, seed=1729)] 38 | 39 | print 'Creating train and test sets for blending' 40 | dataset_blend_train = np.zeros((X.shape[0], len(clfs))) 41 | dataset_blend_test = np.zeros((test.shape[0], len(clfs))) 42 | 43 | for j, clf in enumerate(clfs): 44 | print j, clf 45 | dataset_blend_test_j = np.zeros((test.shape[0], len(skf))) 46 | for i, (train, test_) in enumerate(skf): 47 | print "Fold", i 48 | X_train = X.values[train] 49 | y_train = y.values[train] 50 | X_test = X.values[test_] 51 | y_test = y.values[test_] 52 | clf.fit(X_train, y_train) 53 | y_submission = clf.predict_proba(X_test)[:,1] 54 | dataset_blend_train[test_, j] = y_submission 55 | dataset_blend_test_j[:, i] = clf.predict_proba(test)[:,1] 56 | dataset_blend_test[:,j] = dataset_blend_test_j.mean(1) 57 | 58 | print 59 | print "Blending." 
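# Second-level (meta) model: fit a logistic regression on the out-of-fold predictions of the
# base XGBoost classifiers, then apply it to their fold-averaged predictions on the test set
# to obtain the blended submission probabilities.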
60 | clf = LogisticRegression() 61 | clf.fit(dataset_blend_train, y) 62 | y_submission = clf.predict_proba(dataset_blend_test)[:,1] 63 | 64 | print "Linear stretch of predictions to [0,1]" 65 | y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min()) 66 | 67 | 68 | #print 'ROC AUC Score on test set %f ' %(roc_auc_score(ytest, y_submission)) 69 | 70 | submission_df = pd.read_csv('./data/sample_submission.csv') 71 | submission_df['TARGET'] = y_submission 72 | submission_df.to_csv('./submissions/blend_two_xgboost.csv', index=False) -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/cross-validation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 29 09:24:09 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | from sklearn.cross_validation import train_test_split 12 | import xgboost as xgb 13 | 14 | np.random.seed(44) 15 | 16 | train = pd.read_csv('./data/train_processed_handle_na.csv') 17 | test = pd.read_csv('./data/test_processed_handle_na.csv') 18 | 19 | X = train[train.columns.drop('TARGET')] 20 | y = train.TARGET 21 | 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1279) 23 | 24 | # evaluate xgboost model 25 | param = dict([('max_depth', 3), ('learning_rate', 0.05), ('objective', 'binary:logistic'), 26 | ('eval_metric', 'auc'), ('seed', 1729), ('min_child_weight', 2), 27 | ('colsample_bytree', 0.95), ('subsample', 0.8)]) 28 | 29 | dtrain = xgb.DMatrix(X_train.values, label=y_train.values) 30 | dtest = xgb.DMatrix(X_test.values, label=y_test.values) 31 | watchlist = [(dtest, 'eval'), (dtrain, 'train')] 32 | 33 | num_round = 1000000 34 | 35 | xgb.train(param, dtrain, num_round, watchlist, early_stopping_rounds=10) 36 | -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/feature_analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 31 22:30:05 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | import seaborn as sns 12 | 13 | # load train and test set 14 | 15 | train = pd.read_csv('./data/train.csv', index_col='ID') 16 | test = pd.read_csv('./data/test.csv', index_col='ID') 17 | 18 | ## Class that would represent different synthesized datasets 19 | 20 | class Dataset(): 21 | def __init__(self, train, test): 22 | self.train = train.copy() 23 | self.test = test.copy() 24 | self.features = train.columns[:-1] 25 | 26 | def impute_missing_values(self, strategy): 27 | missing_values = [-999999.0, 9999999999.0] 28 | 29 | for col in self.features: 30 | if (self.train[col] == missing_values[0]).any(): 31 | self.train['is_missing_%s' %(col)] = (self.train[col] == missing_values[0]).astype(int) 32 | 33 | if strategy == 'mean': 34 | strategy_applied_value = self.train[self.train[col] != missing_values[0]][col].mean() 35 | elif strategy == 'median': 36 | strategy_applied_value = self.train[self.train[col] != missing_values[0]][col].median() 37 | else: 38 | strategy_applied_value = self.train[self.train[col] != missing_values[0]][col].mode() 39 | 40 | self.train[col] = self.train[col].replace(missing_values[0], strategy_applied_value) 41 | 42 | 43 | 
self.test['is_missing_%s' %(col)] = (self.test[col] == missing_values[0]).astype(int) 44 | 45 | if strategy == 'mean': 46 | strategy_applied_value = self.test[self.test[col] != missing_values[0]][col].mean() 47 | elif strategy == 'median': 48 | strategy_applied_value = self.test[self.test[col] != missing_values[0]][col].median() 49 | else: 50 | strategy_applied_value = self.test[self.test[col] != missing_values[0]][col].mode() 51 | 52 | self.test[col] = self.test[col].replace(missing_values[0], strategy_applied_value) 53 | 54 | elif (self.train[col] == missing_values[1]).any(): 55 | self.train['is_missing_%s' %(col)] = (self.train[col] == missing_values[1]).astype(int) 56 | 57 | if strategy == 'mean': 58 | strategy_applied_value = self.train[self.train[col] != missing_values[1]][col].mean() 59 | elif strategy == 'median': 60 | strategy_applied_value = self.train[self.train[col] != missing_values[1]][col].median() 61 | else: 62 | strategy_applied_value = self.train[self.train[col] != missing_values[1]][col].mode() 63 | 64 | self.train[col] = self.train[col].replace(missing_values[1], strategy_applied_value) 65 | 66 | 67 | self.test['is_missing_%s' %(col)] = (self.test[col] == missing_values[1]).astype(int) 68 | 69 | if strategy == 'mean': 70 | strategy_applied_value = self.test[self.test[col] != missing_values[1]][col] 71 | elif strategy == 'median': 72 | strategy_applied_value = self.test[self.test[col] != missing_values[1]][col] 73 | else: 74 | strategy_applied_value = self.test[self.test[col] != missing_values[1]][col] 75 | 76 | self.test[col] = self.test[col].replace(missing_values[1], strategy_applied_value) 77 | 78 | def get_positive_valued_features(self): 79 | feature_status = (self.train < 0).any() 80 | neg_valued_features = feature_status[feature_status == True].index 81 | 82 | return self.features.drop(neg_valued_features) 83 | 84 | def log_transformation(self): 85 | self.non_neg_features = self.get_positive_valued_features() 86 | 87 | self.train[self.non_neg_features] = self.train[self.non_neg_features].applymap(np.log1p) 88 | self.test[self.non_neg_features] = self.test[self.non_neg_features].applymap(np.log1p) 89 | 90 | def discretize(self): 91 | self.train = self.train.astype(np.int) 92 | self.test = self.test.astype(np.int) 93 | 94 | def preprocess(self, impute_strategy): 95 | self.impute_missing_values(impute_strategy) 96 | self.log_transformation() 97 | # self.discretize() 98 | 99 | 100 | dataset_mean = Dataset(train, test) 101 | dataset_mean.preprocess('mean') 102 | 103 | dataset_median = Dataset(train, test) 104 | dataset_median.preprocess('median') 105 | 106 | dataset_mode = Dataset(train, test) 107 | dataset_mode.preprocess('mode') 108 | 109 | 110 | -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/feature_importance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 31 08:57:02 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | from sklearn.ensemble import RandomForestClassifier 12 | 13 | train = pd.read_csv('./data/train.csv') 14 | test = pd.read_csv('./data/test.csv') 15 | 16 | X = train[train.columns.drop('TARGET')] 17 | y = train.TARGET 18 | 19 | forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1) 20 | 21 | forest.fit(X, y) 22 | importances = forest.feature_importances_ 23 | std = 
np.std([tree.feature_importances_ for tree in forest.estimators_], 24 | axis=0) 25 | indices = np.argsort(importances)[::-1] 26 | 27 | # Print the feature ranking 28 | print("Feature ranking:") 29 | 30 | for f in range(X.shape[1]): 31 | print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) 32 | 33 | # Plot the feature importances of the forest 34 | #plt.figure() 35 | plt.title("Feature importances (RF)") 36 | plt.bar(range(10), importances[indices][:10], 37 | color="r", yerr=std[indices][:10], align="center") 38 | plt.xticks(range(10), train.columns[indices[:10]], rotation=90) 39 | plt.xlim([-1, 10]) 40 | plt.show() -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 28 22:33:15 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.preprocessing import MinMaxScaler, StandardScaler 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.feature_selection import SelectKBest, chi2 15 | from sklearn.pipeline import Pipeline 16 | from sklearn.metrics import roc_auc_score, confusion_matrix 17 | 18 | import xgboost as xgb 19 | 20 | np.random.seed(44) 21 | 22 | train = pd.read_csv('./data/train_processed_handle_na.csv') 23 | test = pd.read_csv('./data/test_processed_handle_na.csv') 24 | 25 | X = train[train.columns.drop('TARGET')] 26 | y = train.TARGET 27 | 28 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1279) 29 | 30 | # create model pipeline 31 | clf = xgb.XGBClassifier(n_estimators=250, learning_rate=0.05, max_depth=3, min_child_weight=2, 32 | colsample_bytree=0.95, subsample=0.8, seed=1729) 33 | 34 | xgb_pipeline = Pipeline([('clf', clf)]) 35 | 36 | scaler = MinMaxScaler() 37 | select = SelectKBest(chi2, k=200) 38 | 39 | clf = LogisticRegression() 40 | log_pipeline = Pipeline([('scaler', scaler), ('select', select), ('clf', clf)]) 41 | 42 | xgb_pipeline.fit(X_train, y_train) 43 | log_pipeline.fit(X_train, y_train) 44 | 45 | predsTrain_xgb = xgb_pipeline.predict_proba(X_train)[:, 1] 46 | predsTest_xgb = xgb_pipeline.predict_proba(X_test)[:, 1] 47 | 48 | predsTrain_log = log_pipeline.predict_proba(X_train)[:, 1] 49 | predsTest_log = log_pipeline.predict_proba(X_test)[:, 1] 50 | 51 | finalPredsTrain = 0.9 * predsTrain_xgb + 0.1 * predsTrain_log 52 | finalPredsTest = 0.9 * predsTest_xgb + 0.1 * predsTest_log 53 | 54 | print 'predictions on the training set %f ' %(roc_auc_score(y_train, finalPredsTrain)) 55 | print 'predictions on the test set %f ' %(roc_auc_score(y_test, finalPredsTest)) 56 | 57 | ### Train on full dataset 58 | xgb_pipeline.fit(X, y) 59 | log_pipeline.fit(X,y) 60 | 61 | preds_xgb = xgb_pipeline.predict_proba(test)[:, 1] 62 | preds_log = log_pipeline.predict_proba(test)[:, 1] 63 | 64 | predictions = 0.9 * preds_xgb + 0.1 * preds_log 65 | 66 | submission = pd.read_csv('./data/sample_submission.csv') 67 | submission.loc[:, 'TARGET'] = predictions 68 | submission.to_csv('./submissions/ensemble_xgb_log.csv', index=False) -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/vector_quantization.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 4 21:27:37 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from scipy.cluster import vq 12 | 13 | 14 | # load train and test set 15 | train = pd.read_csv('./data/train.csv', index_col='ID') 16 | test = pd.read_csv('./data/test.csv', index_col='ID') 17 | 18 | # columns with high frequency values 19 | high_frequency = [col for col in train.columns if len(train[col].unique()) > 10] 20 | 21 | for col in high_frequency: 22 | codebook = vq.kmeans(train[col].values.astype(float), 5) 23 | train_values = [] 24 | test_values = [] 25 | 26 | for val in train[col]: 27 | train_values.append(vq.vq(val, codebook[0])[0][0]) 28 | 29 | for val in test[col]: 30 | test_values.append(vq.vq(val, codebook[0])[0][0]) 31 | 32 | train[col] = np.array(train_values) 33 | test[col] = np.array(test_values) 34 | 35 | train.to_csv('./data/synthesized/train_vq.csv', index=False) 36 | test.to_csv('./data/synthesized/test_vq.csv', index=False) -------------------------------------------------------------------------------- /Santander-Customer-Satisfaction/scripts/xgboost-tune.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 28 22:33:15 2016 4 | 5 | @author: abhishek 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | from sklearn.cross_validation import train_test_split 12 | import xgboost as xgb 13 | 14 | np.random.seed(44) 15 | 16 | train = pd.read_csv('./data/synthesized/train_vq.csv') 17 | test = pd.read_csv('./data/synthesized/test_vq.csv') 18 | 19 | X = train[train.columns.drop('TARGET')] 20 | y = train.TARGET 21 | 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1279) 23 | 24 | # evaluate xgboost model 25 | param = dict([('max_depth', 3), ('learning_rate', 0.1), ('min_child_weight', 2), 26 | ('colsample_bytree', 0.9), ('subsample', 0.8), 27 | ('objective', 'binary:logistic'), 28 | ('eval_metric', 'auc'), ('seed', 1729)]) 29 | 30 | dtrain = xgb.DMatrix(X_train.values, label=y_train.values) 31 | dtest = xgb.DMatrix(X_test.values, label=y_test.values) 32 | 33 | watchlist = [(dtest, 'eval', (dtrain, 'train'))] 34 | 35 | num_round = 100000 36 | 37 | bst = xgb.train(param, dtrain, num_round, watchlist) -------------------------------------------------------------------------------- /Whats-Cooking/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | scripts/ 3 | submissions/ 4 | *.zip 5 | *.csv 6 | .ipynb_checkpoints/ 7 | -------------------------------------------------------------------------------- /cars-cancellation/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | submissions/ 3 | --------------------------------------------------------------------------------