├── LGB_CB_Python.ipynb ├── README.md └── xgb_nb.R /LGB_CB_Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Introduction\n", 8 | "\n", 9 | "Here you'll learn to build models using the CatBoost, LightGBM and Naive Bayes algorithms in Python. Given a text classification problem, you'll also learn to clean the data and create bag-of-words and tf-idf matrices. \n", 10 | "\n", 11 | "Building on what's done here, you can next create a simple voting ensemble from the predictions generated by these models." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "# Load Libraries\n", 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "from nltk.corpus import stopwords\n", 26 | "from nltk.stem import PorterStemmer\n", 27 | "from sklearn.ensemble import GradientBoostingClassifier\n", 28 | "from sklearn.naive_bayes import GaussianNB\n", 29 | "from sklearn.preprocessing import LabelEncoder\n", 30 | "import re\n", 31 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", 32 | "from sklearn.model_selection import cross_val_score\n", 33 | "from sklearn.metrics import accuracy_score, make_scorer" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "# load data\n", 45 | "train = pd.read_csv(\"train.csv\")\n", 46 | "test = pd.read_csv(\"test.csv\")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/html": [ 59 | "

\n", 60 | "\n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | "

	User_ID	Description	Browser_Used	Device_Used	Is_Response
0	id10326	The room was kind of clean but had a VERY stro...	Edge	Mobile	not happy
1	id10327	I stayed at the Crown Plaza April -- - April -...	Internet Explorer	Mobile	not happy
2	id10328	I booked this hotel through Hotwire at the low...	Mozilla	Tablet	not happy
3	id10329	Stayed here with husband and sons on the way t...	InternetExplorer	Desktop	happy
4	id10330	My girlfriends and I stayed here to celebrate ...	Edge	Tablet	not happy

\n", 114 | "

" 115 | ], 116 | "text/plain": [ 117 | " User_ID Description \\\n", 118 | "0 id10326 The room was kind of clean but had a VERY stro... \n", 119 | "1 id10327 I stayed at the Crown Plaza April -- - April -... \n", 120 | "2 id10328 I booked this hotel through Hotwire at the low... \n", 121 | "3 id10329 Stayed here with husband and sons on the way t... \n", 122 | "4 id10330 My girlfriends and I stayed here to celebrate ... \n", 123 | "\n", 124 | " Browser_Used Device_Used Is_Response \n", 125 | "0 Edge Mobile not happy \n", 126 | "1 Internet Explorer Mobile not happy \n", 127 | "2 Mozilla Tablet not happy \n", 128 | "3 InternetExplorer Desktop happy \n", 129 | "4 Edge Tablet not happy " 130 | ] 131 | }, 132 | "execution_count": 4, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "train.head()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 7, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "# function to clean data\n", 150 | "\n", 151 | "stops = set(stopwords.words(\"english\"))\n", 152 | "def cleanData(text, lowercase = False, remove_stops = False, stemming = False):\n", 153 | " txt = str(text)\n", 154 | " txt = re.sub(r'[^A-Za-z0-9\\s]',r'',txt)\n", 155 | " txt = re.sub(r'\\n',r' ',txt)\n", 156 | " \n", 157 | " if lowercase:\n", 158 | " txt = \" \".join([w.lower() for w in txt.split()])\n", 159 | " \n", 160 | " if remove_stops:\n", 161 | " txt = \" \".join([w for w in txt.split() if w not in stops])\n", 162 | " \n", 163 | " if stemming:\n", 164 | " st = PorterStemmer()\n", 165 | " txt = \" \".join([st.stem(w) for w in txt.split()])\n", 166 | "\n", 167 | " return txt" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 9, 173 | "metadata": { 174 | "collapsed": false 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "## join data\n", 179 | "test['Is_Response'] = np.nan\n", 180 | "alldata = pd.concat([train, 
test]).reset_index(drop=True)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 11, 186 | "metadata": { 187 | "collapsed": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "# clean description\n", 192 | "alldata['Description'] = alldata['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 13, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "# initialise the functions - we'll create separate models for each type.\n", 204 | "countvec = CountVectorizer(analyzer='word', ngram_range = (1,1), min_df=150, max_features=500)\n", 205 | "tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,1), min_df = 150, max_features=500)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 14, 211 | "metadata": { 212 | "collapsed": true 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "# create features\n", 217 | "bagofwords = countvec.fit_transform(alldata['Description'])\n", 218 | "tfidfdata = tfidfvec.fit_transform(alldata['Description'])" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 15, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "# label encode categorical features in data given\n", 230 | "cols = ['Browser_Used','Device_Used']\n", 231 | "\n", 232 | "for x in cols:\n", 233 | " lbl = LabelEncoder()\n", 234 | " alldata[x] = lbl.fit_transform(alldata[x])" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 16, 240 | "metadata": { 241 | "collapsed": false 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "# create dataframe for features\n", 246 | "bow_df = pd.DataFrame(bagofwords.todense())\n", 247 | "tfidf_df = pd.DataFrame(tfidfdata.todense())" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 17, 253 | 
"metadata": { 254 | "collapsed": true 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "# set column names\n", 259 | "bow_df.columns = ['col'+ str(x) for x in bow_df.columns]\n", 260 | "tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 18, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "# create separate data frame for bag of words and tf-idf\n", 272 | "\n", 273 | "bow_df_train = bow_df[:len(train)]\n", 274 | "bow_df_test = bow_df[len(train):]\n", 275 | "\n", 276 | "tfid_df_train = tfidf_df[:len(train)]\n", 277 | "tfid_df_test = tfidf_df[len(train):]" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 219, 283 | "metadata": { 284 | "collapsed": true 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "# split the merged data file into train and test respectively\n", 289 | "train_feats = alldata[~pd.isnull(alldata.Is_Response)]\n", 290 | "test_feats = alldata[pd.isnull(alldata.Is_Response)]" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 220, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [ 300 | { 301 | "name": "stderr", 302 | "output_type": "stream", 303 | "text": [ 304 | "/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", 305 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 306 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 307 | "\n", 308 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 309 | " app.launch_new_instance()\n" 310 | ] 311 | } 312 | ], 313 | "source": [ 314 | "### set target variable\n", 315 | "\n", 316 | "train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]" 317 | ] 318 | }, 319 | { 320 | "cell_type": 
"code", 321 | "execution_count": 24, 322 | "metadata": { 323 | "collapsed": false 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "# merge count (bag of word) features into train\n", 328 | "train_feats1 = pd.concat([train_feats[cols], bow_df_train], axis = 1)\n", 329 | "test_feats1 = pd.concat([test_feats[cols], bow_df_test], axis=1)\n", 330 | "\n", 331 | "test_feats1.reset_index(drop=True, inplace=True)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 51, 337 | "metadata": { 338 | "collapsed": false 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "# merge into a new data frame with tf-idf features\n", 343 | "train_feats2 = pd.concat([train_feats[cols], tfid_df_train], axis=1)\n", 344 | "test_feats2 = pd.concat([test_feats[cols], tfid_df_test], axis=1)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "### NaiveBayes" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 28, 357 | "metadata": { 358 | "collapsed": true 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "# let's check cross validation score of the model\n", 363 | "# cv score acts a unbiased estimate of models accuracy on unseen data\n", 364 | "\n", 365 | "mod1 = GaussianNB()\n", 366 | "target = train_feats['Is_Response']" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 31, 372 | "metadata": { 373 | "collapsed": false 374 | }, 375 | "outputs": [ 376 | { 377 | "name": "stdout", 378 | "output_type": "stream", 379 | "text": [ 380 | "[ 0.76311844 0.7745 0.7515 0.765 0.75837919]\n" 381 | ] 382 | } 383 | ], 384 | "source": [ 385 | "## Naive Bayes 1\n", 386 | "print(cross_val_score(mod1, train_feats1, target, cv=5, scoring=make_scorer(accuracy_score)))" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 32, 392 | "metadata": { 393 | "collapsed": false 394 | }, 395 | "outputs": [ 396 | { 397 | "name": "stdout", 398 | "output_type": 
"stream", 399 | "text": [ 400 | "[ 0.79310345 0.811 0.8035 0.815 0.79789895]\n" 401 | ] 402 | } 403 | ], 404 | "source": [ 405 | "## Naive Bayes 2 - tfidf is giving higher CV score\n", 406 | "print(cross_val_score(mod1, train_feats2, target, cv=5, scoring=make_scorer(accuracy_score)))" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 36, 412 | "metadata": { 413 | "collapsed": false 414 | }, 415 | "outputs": [ 416 | { 417 | "data": { 418 | "text/plain": [ 419 | "GaussianNB(priors=None)" 420 | ] 421 | }, 422 | "execution_count": 36, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "# make our first set of predictions\n", 429 | "\n", 430 | "clf1 = GaussianNB()\n", 431 | "clf1.fit(train_feats1, target)\n", 432 | "\n", 433 | "clf2 = GaussianNB()\n", 434 | "clf2.fit(train_feats2, target)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 41, 440 | "metadata": { 441 | "collapsed": false 442 | }, 443 | "outputs": [], 444 | "source": [ 445 | "preds1 = clf1.predict(test_feats1)\n", 446 | "preds2 = clf2.predict(test_feats2)" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 54, 452 | "metadata": { 453 | "collapsed": false 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "def to_labels(x):\n", 458 | " if x == 1:\n", 459 | " return \"happy\"\n", 460 | " return \"not_happy\"" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 68, 466 | "metadata": { 467 | "collapsed": false 468 | }, 469 | "outputs": [], 470 | "source": [ 471 | "sub1 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds1})\n", 472 | "sub1['Is_Response'] = sub1['Is_Response'].map(lambda x: to_labels(x))" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 72, 478 | "metadata": { 479 | "collapsed": false 480 | }, 481 | "outputs": [], 482 | "source": [ 483 | "sub2 = pd.DataFrame({'User_ID':test.User_ID, 
'Is_Response':preds2})\n", 484 | "sub2['Is_Response'] = sub2['Is_Response'].map(lambda x: to_labels(x))" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 83, 490 | "metadata": { 491 | "collapsed": false 492 | }, 493 | "outputs": [], 494 | "source": [ 495 | "sub1 = sub1[['User_ID', 'Is_Response']]\n", 496 | "sub2 = sub2[['User_ID', 'Is_Response']]" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 85, 502 | "metadata": { 503 | "collapsed": false 504 | }, 505 | "outputs": [], 506 | "source": [ 507 | "## write submission files\n", 508 | "sub1.to_csv('submissions/sub1_cv.csv', index=False)\n", 509 | "sub2.to_csv('submissions/sub2_tf.csv', index=False)" 510 | ] 511 | }, 512 | { 513 | "cell_type": "markdown", 514 | "metadata": {}, 515 | "source": [ 516 | "### LightGBM - 1\n", 517 | "\n", 518 | "We prefer LightGBM over XGBoost because of its speed.
\n", 519 | "In this model, we'll use count features for model training." 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 87, 525 | "metadata": { 526 | "collapsed": true 527 | }, 528 | "outputs": [], 529 | "source": [ 530 | "import lightgbm as lgb" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 88, 536 | "metadata": { 537 | "collapsed": false 538 | }, 539 | "outputs": [], 540 | "source": [ 541 | "# set the data in format lgb accepts\n", 542 | "d_train = lgb.Dataset(train_feats1, label = target)" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 89, 548 | "metadata": { 549 | "collapsed": true 550 | }, 551 | "outputs": [], 552 | "source": [ 553 | "## set parameters\n", 554 | "## you can tune the parameters to try to get a better score\n", 555 | "\n", 556 | "params = {'task': 'train',\n", 557 | " 'boosting_type': 'gbdt',\n", 558 | " 'objective': 'binary',\n", 559 | " 'metric': 'binary_error',\n", 560 | " 'learning_rate': 0.05, \n", 561 | " 'max_depth': 7, \n", 562 | " 'num_leaves': 21, \n", 563 | " 'feature_fraction': 0.3, \n", 564 | " 'bagging_fraction': 0.8, \n", 565 | " 'bagging_freq': 5}" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": 98, 571 | "metadata": { 572 | "collapsed": false, 573 | "scrolled": true 574 | }, 575 | "outputs": [ 576 | { 577 | "name": "stdout", 578 | "output_type": "stream", 579 | "text": [ 580 | "[20]\tcv_agg's binary_error: 0.2132 + 0.00456488\n", 581 | "[40]\tcv_agg's binary_error: 0.195401 + 0.00625882\n", 582 | "[60]\tcv_agg's binary_error: 0.175601 + 0.00580722\n", 583 | "[80]\tcv_agg's binary_error: 0.1652 + 0.00589807\n", 584 | "[100]\tcv_agg's binary_error: 0.1568 + 0.00628195\n", 585 | "[120]\tcv_agg's binary_error: 0.1505 + 0.00328588\n", 586 | "[140]\tcv_agg's binary_error: 0.1487 + 0.00399728\n", 587 | "[160]\tcv_agg's binary_error: 0.147301 + 0.00497347\n", 588 | "[180]\tcv_agg's binary_error: 0.1445 + 0.00362296\n", 589
| "[200]\tcv_agg's binary_error: 0.1439 + 0.00429358\n", 590 | "[220]\tcv_agg's binary_error: 0.1417 + 0.00200147\n", 591 | "[240]\tcv_agg's binary_error: 0.1418 + 0.0040771\n", 592 | "[260]\tcv_agg's binary_error: 0.1401 + 0.00373791\n", 593 | "[280]\tcv_agg's binary_error: 0.1389 + 0.00517039\n", 594 | "[300]\tcv_agg's binary_error: 0.1376 + 0.00466764\n", 595 | "[320]\tcv_agg's binary_error: 0.136901 + 0.00507148\n", 596 | "[340]\tcv_agg's binary_error: 0.1357 + 0.00529898\n", 597 | "[360]\tcv_agg's binary_error: 0.1363 + 0.00505334\n", 598 | "[380]\tcv_agg's binary_error: 0.1353 + 0.0044035\n", 599 | "[400]\tcv_agg's binary_error: 0.1356 + 0.00428458\n", 600 | "[420]\tcv_agg's binary_error: 0.134501 + 0.00445613\n" 601 | ] 602 | } 603 | ], 604 | "source": [ 605 | "lgb_cv = lgb.cv(params, d_train, num_boost_round=500, nfold= 5, shuffle=True, stratified=True, verbose_eval=20, early_stopping_rounds=40)" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": 126, 611 | "metadata": { 612 | "collapsed": false 613 | }, 614 | "outputs": [], 615 | "source": [ 616 | "## get nround value which had the lowest error\n", 617 | "nround = lgb_cv['binary_error-mean'].index(np.min(lgb_cv['binary_error-mean']))" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 129, 623 | "metadata": { 624 | "collapsed": false 625 | }, 626 | "outputs": [], 627 | "source": [ 628 | "## train the model\n", 629 | "model = lgb.train(params, d_train, num_boost_round=nround)" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 130, 635 | "metadata": { 636 | "collapsed": true 637 | }, 638 | "outputs": [], 639 | "source": [ 640 | "## make predictions\n", 641 | "preds = model.predict(test_feats1)" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 157, 647 | "metadata": { 648 | "collapsed": true 649 | }, 650 | "outputs": [], 651 | "source": [ 652 | "# make submission\n", 653 | "\n", 654 | "def to_labels(x):\n", 
655 | " if x > 0.66: # cutoff - you can change it and see if accuracy improves or plot AUC curve. \n", 656 | " return \"happy\"\n", 657 | " return \"not_happy\"\n", 658 | "\n", 659 | "sub3 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds})\n", 660 | "sub3['Is_Response'] = sub3['Is_Response'].map(lambda x: to_labels(x))\n", 661 | "sub3 = sub3[['User_ID','Is_Response']]\n", 662 | "sub3.to_csv('submissions/sub3_lgb.csv', index=False) # 0.85518" 663 | ] 664 | }, 665 | { 666 | "cell_type": "markdown", 667 | "metadata": {}, 668 | "source": [ 669 | "### LightGBM - 2\n", 670 | "\n", 671 | "In this model, we'll use tf-idf features for model training." 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": 140, 677 | "metadata": { 678 | "collapsed": false 679 | }, 680 | "outputs": [], 681 | "source": [ 682 | "# set data format\n", 683 | "d_train = lgb.Dataset(train_feats2, label = target)" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": 144, 689 | "metadata": { 690 | "collapsed": true 691 | }, 692 | "outputs": [], 693 | "source": [ 694 | "# same parameters as above\n", 695 | "params = {'task': 'train',\n", 696 | " 'boosting_type': 'gbdt',\n", 697 | " 'objective': 'binary',\n", 698 | " 'metric': 'binary_error',\n", 699 | " 'learning_rate': 0.05, \n", 700 | " 'max_depth': 5, \n", 701 | " 'num_leaves': 11,\n", 702 | " 'feature_fraction': 0.3, \n", 703 | " 'bagging_fraction': 0.8, \n", 704 | " 'bagging_freq': 5}" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": 145, 710 | "metadata": { 711 | "collapsed": false, 712 | "scrolled": true 713 | }, 714 | "outputs": [ 715 | { 716 | "name": "stdout", 717 | "output_type": "stream", 718 | "text": [ 719 | "[20]\tcv_agg's binary_error: 0.226401 + 0.00518217\n", 720 | "[40]\tcv_agg's binary_error: 0.206602 + 0.00687761\n", 721 | "[60]\tcv_agg's binary_error: 0.183302 + 0.00791949\n", 722 | "[80]\tcv_agg's binary_error: 0.169801 + 0.00512283\n", 723 | 
"[100]\tcv_agg's binary_error: 0.164301 + 0.00632119\n", 724 | "[120]\tcv_agg's binary_error: 0.1578 + 0.00507081\n", 725 | "[140]\tcv_agg's binary_error: 0.1542 + 0.00524522\n", 726 | "[160]\tcv_agg's binary_error: 0.1516 + 0.00441669\n", 727 | "[180]\tcv_agg's binary_error: 0.148701 + 0.00512212\n", 728 | "[200]\tcv_agg's binary_error: 0.1461 + 0.00366096\n", 729 | "[220]\tcv_agg's binary_error: 0.1443 + 0.00362658\n", 730 | "[240]\tcv_agg's binary_error: 0.1437 + 0.00471092\n", 731 | "[260]\tcv_agg's binary_error: 0.143501 + 0.00450031\n", 732 | "[280]\tcv_agg's binary_error: 0.1405 + 0.00506581\n", 733 | "[300]\tcv_agg's binary_error: 0.1396 + 0.00578611\n", 734 | "[320]\tcv_agg's binary_error: 0.137801 + 0.00687571\n", 735 | "[340]\tcv_agg's binary_error: 0.138701 + 0.00679821\n", 736 | "[360]\tcv_agg's binary_error: 0.137201 + 0.00839438\n", 737 | "[380]\tcv_agg's binary_error: 0.137501 + 0.00738158\n", 738 | "[400]\tcv_agg's binary_error: 0.136401 + 0.00735946\n", 739 | "[420]\tcv_agg's binary_error: 0.136101 + 0.00702239\n", 740 | "[440]\tcv_agg's binary_error: 0.136901 + 0.00739423\n" 741 | ] 742 | } 743 | ], 744 | "source": [ 745 | "## do cross validation to find nround i.e. 
at this round (iteration) we can expect the lowest error\n", 746 | "lgb_cv = lgb.cv(params, d_train, num_boost_round=500, nfold= 5, shuffle=True, stratified=True, verbose_eval=20, early_stopping_rounds=40)" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": 146, 752 | "metadata": { 753 | "collapsed": false 754 | }, 755 | "outputs": [], 756 | "source": [ 757 | "# get nround value\n", 758 | "nround = lgb_cv['binary_error-mean'].index(np.min(lgb_cv['binary_error-mean']))" 759 | ] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": 159, 764 | "metadata": { 765 | "collapsed": false 766 | }, 767 | "outputs": [], 768 | "source": [ 769 | "# train model\n", 770 | "model = lgb.train(params, d_train, num_boost_round=nround)" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": 160, 776 | "metadata": { 777 | "collapsed": true 778 | }, 779 | "outputs": [], 780 | "source": [ 781 | "# make prediction\n", 782 | "preds = model.predict(test_feats2)" 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": 162, 788 | "metadata": { 789 | "collapsed": true 790 | }, 791 | "outputs": [], 792 | "source": [ 793 | "# make submission\n", 794 | "\n", 795 | "def to_labels(x):\n", 796 | " if x > 0.66:\n", 797 | " return \"happy\"\n", 798 | " return \"not_happy\"\n", 799 | "\n", 800 | "sub4 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds})\n", 801 | "sub4['Is_Response'] = sub4['Is_Response'].map(lambda x: to_labels(x))\n", 802 | "sub4 = sub4[['User_ID','Is_Response']]\n", 803 | "sub4.to_csv('submissions/sub4_lgb.csv', index=False) # 0.84925" 804 | ] 805 | }, 806 | { 807 | "cell_type": "markdown", 808 | "metadata": {}, 809 | "source": [ 810 | "### CatBoost\n", 811 | "\n", 812 | "Catboost is a new package recently launched by Yandex. It is said that it works well when the data has many categorical features. We'll use it on count data and see if our model improves." 
813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": { 819 | "collapsed": true 820 | }, 821 | "outputs": [], 822 | "source": [ 823 | "## import library\n", 824 | "from catboost import CatBoostClassifier,cv, Pool" 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": 193, 830 | "metadata": { 831 | "collapsed": true 832 | }, 833 | "outputs": [], 834 | "source": [ 835 | "## catboost accepts categorical columns as a list of column numbers. In this data, all columns are categorical\n", 836 | "cat_cols = [x for x in range(502)] ## 502 == train_feats1.shape[1] " 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": 196, 842 | "metadata": { 843 | "collapsed": true 844 | }, 845 | "outputs": [], 846 | "source": [ 847 | "## set parameters\n", 848 | "## you can refer to the parameters here: https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/#python-reference_parameters-list\n", 849 | "param = {\n", 850 | " 'use_best_model':True,\n", 851 | " 'loss_function':'CrossEntropy',\n", 852 | " 'eval_metric':'Accuracy',\n", 853 | " 'iterations':1000,\n", 854 | " 'depth':6,\n", 855 | " 'learning_rate':0.03,\n", 856 | " 'rsm':0.3,\n", 857 | " 'random_seed':2017,\n", 858 | " \n", 859 | " \n", 860 | "}" 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": 197, 866 | "metadata": { 867 | "collapsed": false 868 | }, 869 | "outputs": [], 870 | "source": [ 871 | "## for doing cross validation, set data in Pool format\n", 872 | "my_dt = Pool(train_feats1, \n", 873 | " label=target,\n", 874 | " cat_features=cat_cols,\n", 875 | " column_description=None,\n", 876 | " delimiter='\\t',\n", 877 | " has_header=None,\n", 878 | " weight=None, \n", 879 | " baseline=None,\n", 880 | " feature_names=None,\n", 881 | " thread_count=1)" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": 198, 887 | "metadata": { 888 | "collapsed": false, 
889 | "scrolled": true 890 | }, 891 | "outputs": [ 892 | { 893 | "name": "stdout", 894 | "output_type": "stream", 895 | "text": [ 896 | "Borders generated\n", 897 | "0:\t0:\t0:\t0:\t0:\t1:\t1:\t1:\t1:\t1:\t2:\t2:\t2:\t2:\t2:\t3:\t3:\t3:\t3:\t3:\t4:\t4:\t4:\t4:\t4:\t5:\t5:\t5:\t5:\t5:\t6:\t6:\t6:\t6:\t6:\t7:\t7:\t7:\t7:\t7:\t8:\t8:\t8:\t8:\t8:\t9:\t9:\t9:\t9:\t9:\t10:\t10:\t10:\t10:\t10:\t11:\t11:\t11:\t11:\t11:\t12:\t12:\t12:\t12:\t12:\t13:\t13:\t13:\t13:\t13:\t14:\t14:\t14:\t14:\t14:\t15:\t15:\t15:\t15:\t15:\t16:\t16:\t16:\t16:\t16:\t17:\t17:\t17:\t17:\t17:\t18:\t18:\t18:\t18:\t18:\t19:\t19:\t19:\t19:\t19:\t20:\t20:\t20:\t20:\t20:\t21:\t21:\t21:\t21:\t21:\t22:\t22:\t22:\t22:\t22:\t23:\t23:\t23:\t23:\t23:\t24:\t24:\t24:\t24:\t24:\t25:\t25:\t25:\t25:\t25:\t26:\t26:\t26:\t26:\t26:\t27:\t27:\t27:\t27:\t27:\t28:\t28:\t28:\t28:\t28:\t29:\t29:\t29:\t29:\t29:\t30:\t30:\t30:\t30:\t30:\t31:\t31:\t31:\t31:\t31:\t32:\t32:\t32:\t32:\t32:\t33:\t33:\t33:\t33:\t33:\t34:\t34:\t34:\t34:\t34:\t35:\t35:\t35:\t35:\t35:\t36:\t36:\t36:\t36:\t36:\t37:\t37:\t37:\t37:\t37:\t38:\t38:\t38:\t38:\t38:\t39:\t39:\t39:\t39:\t39:\t40:\t40:\t40:\t40:\t40:\t41:\t41:\t41:\t41:\t41:\t42:\t42:\t42:\t42:\t42:\t43:\t43:\t43:\t43:\t43:\t44:\t44:\t44:\t44:\t44:\t45:\t45:\t45:\t45:\t45:\t46:\t46:\t46:\t46:\t46:\t47:\t47:\t47:\t47:\t47:\t48:\t48:\t48:\t48:\t48:\t49:\t49:\t49:\t49:\t49:\t50:\t50:\t50:\t50:\t50:\t51:\t51:\t51:\t51:\t51:\t52:\t52:\t52:\t52:\t52:\t53:\t53:\t53:\t53:\t53:\t54:\t54:\t54:\t54:\t54:\t55:\t55:\t55:\t55:\t55:\t56:\t56:\t56:\t56:\t56:\t57:\t57:\t57:\t57:\t57:\t58:\t58:\t58:\t58:\t58:\t59:\t59:\t59:\t59:\t59:\t60:\t60:\t60:\t60:\t60:\t61:\t61:\t61:\t61:\t61:\t62:\t62:\t62:\t62:\t62:\t63:\t63:\t63:\t63:\t63:\t64:\t64:\t64:\t64:\t64:\t65:\t65:\t65:\t65:\t65:\t66:\t66:\t66:\t66:\t66:\t67:\t67:\t67:\t67:\t67:\t68:\t68:\t68:\t68:\t68:\t69:\t69:\t69:\t69:\t69:\t70:\t70:\t70:\t70:\t70:\t71:\t71:\t71:\t71:\t71:\t72:\t72:\t72:\t72:\t72:\t73:\t73:\t73:\t73:\t73:\t74:\t74:\t74:\t74:\t74:\t75:\t75:\
t75:\t75:\t75:\t76:\t76:\t76:\t76:\t76:\t77:\t77:\t77:\t77:\t77:\t78:\t78:\t78:\t78:\t78:\t79:\t79:\t79:\t79:\t79:\t80:\t80:\t80:\t80:\t80:\t81:\t81:\t81:\t81:\t81:\t82:\t82:\t82:\t82:\t82:\t83:\t83:\t83:\t83:\t83:\t84:\t84:\t84:\t84:\t84:\t85:\t85:\t85:\t85:\t85:\t86:\t86:\t86:\t86:\t86:\t87:\t87:\t87:\t87:\t87:\t88:\t88:\t88:\t88:\t88:\t89:\t89:\t89:\t89:\t89:\t90:\t90:\t90:\t90:\t90:\t91:\t91:\t91:\t91:\t91:\t92:\t92:\t92:\t92:\t92:\t93:\t93:\t93:\t93:\t93:\t94:\t94:\t94:\t94:\t94:\t95:\t95:\t95:\t95:\t95:\t96:\t96:\t96:\t96:\t96:\t97:\t97:\t97:\t97:\t97:\t98:\t98:\t98:\t98:\t98:\t99:\t99:\t99:\t99:\t99:\t100:\t100:\t100:\t100:\t100:\t101:\t101:\t101:\t101:\t101:\t102:\t102:\t102:\t102:\t102:\t103:\t103:\t103:\t103:\t103:\t104:\t104:\t104:\t104:\t104:\t105:\t105:\t105:\t105:\t105:\t106:\t106:\t106:\t106:\t106:\t107:\t107:\t107:\t107:\t107:\t108:\t108:\t108:\t108:\t108:\t109:\t109:\t109:\t109:\t109:\t110:\t110:\t110:\t110:\t110:\t111:\t111:\t111:\t111:\t111:\t112:\t112:\t112:\t112:\t112:\t113:\t113:\t113:\t113:\t113:\t114:\t114:\t114:\t114:\t114:\t115:\t115:\t115:\t115:\t115:\t116:\t116:\t116:\t116:\t116:\t117:\t117:\t117:\t117:\t117:\t118:\t118:\t118:\t118:\t118:\t119:\t119:\t119:\t119:\t119:\t120:\t120:\t120:\t120:\t120:\t121:\t121:\t121:\t121:\t121:\t122:\t122:\t122:\t122:\t122:\t123:\t123:\t123:\t123:\t123:\t124:\t124:\t124:\t124:\t124:\t125:\t125:\t125:\t125:\t125:\t126:\t126:\t126:\t126:\t126:\t127:\t127:\t127:\t127:\t127:\t128:\t128:\t128:\t128:\t128:\t129:\t129:\t129:\t129:\t129:\t130:\t130:\t130:\t130:\t130:\t131:\t131:\t131:\t131:\t131:\t132:\t132:\t132:\t132:\t132:\t133:\t133:\t133:\t133:\t133:\t134:\t134:\t134:\t134:\t134:\t135:\t135:\t135:\t135:\t135:\t136:\t136:\t136:\t136:\t136:\t137:\t137:\t137:\t137:\t137:\t138:\t138:\t138:\t138:\t138:\t139:\t139:\t139:\t139:\t139:\t140:\t140:\t140:\t140:\t140:\t141:\t141:\t141:\t141:\t141:\t142:\t142:\t142:\t142:\t142:\t143:\t143:\t143:\t143:\t143:\t144:\t144:\t144:\t144:\t144:\t145:\t145:\t145:\t145:\t145:\t146:
\t146:\t146:\t146:\t146:\t147:\t147:\t147:\t147:\t147:\t148:\t148:\t148:\t148:\t148:\t149:\t149:\t149:\t149:\t149:\t150:\t150:\t150:\t150:\t150:\t151:\t151:\t151:\t151:\t151:\t152:\t152:\t152:\t152:\t152:\t153:\t153:\t153:\t153:\t153:\t154:\t154:\t154:\t154:\t154:\t155:\t155:\t155:\t155:\t155:\t156:\t156:\t156:\t156:\t156:\t157:\t157:\t157:\t157:\t157:\t158:\t158:\t158:\t158:\t158:\t159:\t159:\t159:\t159:\t159:\t160:\t160:\t160:\t160:\t160:\t161:\t161:\t161:\t161:\t161:\t162:\t162:\t162:\t162:\t162:\t163:\t163:\t163:\t163:\t163:\t164:\t164:\t164:\t164:\t164:\t165:\t165:\t165:\t165:\t165:\t166:\t166:\t166:\t166:\t166:\t167:\t167:\t167:\t167:\t167:\t168:\t168:\t168:\t168:\t168:\t169:\t169:\t169:\t169:\t169:\t170:\t170:\t170:\t170:\t170:\t171:\t171:\t171:\t171:\t171:\t172:\t172:\t172:\t172:\t172:\t173:\t173:\t173:\t173:\t173:\t174:\t174:\t174:\t174:\t174:\t175:\t175:\t175:\t175:\t175:\t176:\t176:\t176:\t176:\t176:\t177:\t177:\t177:\t177:\t177:\t178:\t178:\t178:\t178:\t178:\t179:\t179:\t179:\t179:\t179:\t180:\t180:\t180:\t180:\t180:\t181:\t181:\t181:\t181:\t181:\t182:\t182:\t182:\t182:\t182:\t183:\t183:\t183:\t183:\t183:\t184:\t184:\t184:\t184:\t184:\t185:\t185:\t185:\t185:\t185:\t186:\t186:\t186:\t186:\t186:\t187:\t187:\t187:\t187:\t187:\t188:\t188:\t188:\t188:\t188:\t189:\t189:\t189:\t189:\t189:\t190:\t190:\t190:\t190:\t190:\t191:\t191:\t191:\t191:\t191:\t192:\t192:\t192:\t192:\t192:\t193:\t193:\t193:\t193:\t193:\t194:\t194:\t194:\t194:\t194:\t195:\t195:\t195:\t195:\t195:\t196:\t196:\t196:\t196:\t196:\t197:\t197:\t197:\t197:\t197:\t198:\t198:\t198:\t198:\t198:\t199:\t199:\t199:\t199:\t199:\t200:\t200:\t200:\t200:\t200:\t201:\t201:\t201:\t201:\t201:\t202:\t202:\t202:\t202:\t202:\t203:\t203:\t203:\t203:\t203:\t204:\t204:\t204:\t204:\t204:\t205:\t205:\t205:\t205:\t205:\t206:\t206:\t206:\t206:\t206:\t207:\t207:\t207:\t207:\t207:\t208:\t208:\t208:\t208:\t208:\t209:\t209:\t209:\t209:\t209:\t210:\t210:\t210:\t210:\t210:\t211:\t211:\t211:\t211:\t211:\t212:\t212:\t212:\t212:\t
212:\t213:\t213:\t213:\t213:\t213:\t214:\t214:\t214:\t214:\t214:\t215:\t215:\t215:\t215:\t215:\t216:\t216:\t216:\t216:\t216:\t217:\t217:\t217:\t217:\t217:\t218:\t218:\t218:\t218:\t218:\t219:\t219:\t219:\t219:\t219:\t220:\t220:\t220:\t220:\t220:\t221:\t221:\t221:\t221:\t221:\t222:\t222:\t222:\t222:\t222:\t223:\t223:\t223:\t223:\t223:\t224:\t224:\t224:\t224:\t224:\t225:\t225:\t225:\t225:\t225:\t226:\t226:\t226:\t226:\t226:\t227:\t227:\t227:\t227:\t227:\t228:\t228:\t228:\t228:\t228:\t229:\t229:\t229:\t229:\t229:\t230:\t230:\t230:\t230:\t230:\t231:\t231:\t231:\t231:\t231:\t232:\t232:\t232:\t232:\t232:\t233:\t233:\t233:\t233:\t233:\t234:\t234:\t234:\t234:\t234:\t235:\t235:\t235:\t235:\t235:\t236:\t236:\t236:\t236:\t236:\t237:\t237:\t237:\t237:\t237:\t238:\t238:\t238:\t238:\t238:\t239:\t239:\t239:\t239:\t239:\t240:\t240:\t240:\t240:\t240:\t241:\t241:\t241:\t241:\t241:\t242:\t242:\t242:\t242:\t242:\t243:\t243:\t243:\t243:\t243:\t244:\t244:\t244:\t244:\t244:\t245:\t245:\t245:\t245:\t245:\t246:\t246:\t246:\t246:\t246:\t247:\t247:\t247:\t247:\t247:\t248:\t248:\t248:\t248:\t248:\t249:\t249:\t249:\t249:\t249:\t250:\t250:\t250:\t250:\t250:\t251:\t251:\t251:\t251:\t251:\t252:\t252:\t252:\t252:\t252:\t253:\t253:\t253:\t253:\t253:\t254:\t254:\t254:\t254:\t254:\t255:\t255:\t255:\t255:\t255:\t256:\t256:\t256:\t256:\t256:\t257:\t257:\t257:\t257:\t257:\t258:\t258:\t258:\t258:\t258:\t259:\t259:\t259:\t259:\t259:\t260:\t260:\t260:\t260:\t260:\t261:\t261:\t261:\t261:\t261:\t262:\t262:\t262:\t262:\t262:\t263:\t263:\t263:\t263:\t263:\t264:\t264:\t264:\t264:\t264:\t265:\t265:\t265:\t265:\t265:\t266:\t266:\t266:\t266:\t266:\t267:\t267:\t267:\t267:\t267:\t268:\t268:\t268:\t268:\t268:\t269:\t269:\t269:\t269:\t269:\t270:\t270:\t270:\t270:\t270:\t271:\t271:\t271:\t271:\t271:\t272:\t272:\t272:\t272:\t272:\t273:\t273:\t273:\t273:\t273:\t274:\t274:\t274:\t274:\t274:\t275:\t275:\t275:\t275:\t275:\t276:\t276:\t276:\t276:\t276:\t277:\t277:\t277:\t277:\t277:\t278:\t278:\t278:\t278:\t278:\t279:\t279:\t27
9:\t279:\t279:\t280:\t280:\t280:\t280:\t280:\t281:\t281:\t281:\t281:\t281:\t282:\t282:\t282:\t282:\t282:\t283:\t283:\t283:\t283:\t283:\t284:\t284:\t284:\t284:\t284:\t285:\t285:\t285:\t285:\t285:\t286:\t286:\t286:\t286:\t286:\t287:\t287:\t287:\t287:\t287:\t288:\t288:\t288:\t288:\t288:\t289:\t289:\t289:\t289:\t289:\t290:\t290:\t290:\t290:\t290:\t291:\t291:\t291:\t291:\t291:\t292:\t292:\t292:\t292:\t292:\t293:\t293:\t293:\t293:\t293:\t294:\t294:\t294:\t294:\t294:\t295:\t295:\t295:\t295:\t295:\t296:\t296:\t296:\t296:\t296:\t297:\t297:\t297:\t297:\t297:\t298:\t298:\t298:\t298:\t298:\t299:\t299:\t299:\t299:\t299:\t300:\t300:\t300:\t300:\t300:\t301:\t301:\t301:\t301:\t301:\t302:\t302:\t302:\t302:\t302:\t303:\t303:\t303:\t303:\t303:\t304:\t304:\t304:\t304:\t304:\t305:\t305:\t305:\t305:\t305:\t306:\t306:\t306:\t306:\t306:\t307:\t307:\t307:\t307:\t307:\t308:\t308:\t308:\t308:\t308:\t309:\t309:\t309:\t309:\t309:\t310:\t310:\t310:\t310:\t310:\t311:\t311:\t311:\t311:\t311:\t312:\t312:\t312:\t312:\t312:\t313:\t313:\t313:\t313:\t313:\t314:\t314:\t314:\t314:\t314:\t315:\t315:\t315:\t315:\t315:\t316:\t316:\t316:\t316:\t316:\t317:\t317:\t317:\t317:\t317:\t318:\t318:\t318:\t318:\t318:\t319:\t319:\t319:\t319:\t319:\t320:\t320:\t320:\t320:\t320:\t321:\t321:\t321:\t321:\t321:\t322:\t322:\t322:\t322:\t322:\t323:\t323:\t323:\t323:\t323:\t324:\t324:\t324:\t324:\t324:\t325:\t325:\t325:\t325:\t325:\t326:\t326:\t326:\t326:\t326:\t327:\t327:\t327:\t327:\t327:\t328:\t328:\t328:\t328:\t328:\t329:\t329:\t329:\t329:\t329:\t330:\t330:\t330:\t330:\t330:\t331:\t331:\t331:\t331:\t331:\t332:\t332:\t332:\t332:\t332:\t333:\t333:\t333:\t333:\t333:\t334:\t334:\t334:\t334:\t334:\t335:\t335:\t335:\t335:\t335:\t336:\t336:\t336:\t336:\t336:\t337:\t337:\t337:\t337:\t337:\t338:\t338:\t338:\t338:\t338:\t339:\t339:\t339:\t339:\t339:\t340:\t340:\t340:\t340:\t340:\t341:\t341:\t341:\t341:\t341:\t342:\t342:\t342:\t342:\t342:\t343:\t343:\t343:\t343:\t343:\t344:\t344:\t344:\t344:\t344:\t345:\t345:\t345:\t345:\t345:\t346:
\t346:\t346:\t346:\t346:\t347:\t347:\t347:\t347:\t347:\t348:\t348:\t348:\t348:\t348:\t349:\t349:\t349:\t349:\t349:\t350:\t350:\t350:\t350:\t350:\t351:\t351:\t351:\t351:\t351:\t352:\t352:\t352:\t352:\t352:\t353:\t353:\t353:\t353:\t353:\t354:\t354:\t354:\t354:\t354:\t355:\t355:\t355:\t355:\t355:\t356:\t356:\t356:\t356:\t356:\t357:\t357:\t357:\t357:\t357:\t358:\t358:\t358:\t358:\t358:\t359:\t359:\t359:\t359:\t359:\t360:\t360:\t360:\t360:\t360:\t361:\t361:\t361:\t361:\t361:\t362:\t362:\t362:\t362:\t362:\t363:\t363:\t363:\t363:\t363:\t364:\t364:\t364:\t364:\t364:\t365:\t365:\t365:\t365:\t365:\t366:\t366:\t366:\t366:\t366:\t367:\t367:\t367:\t367:\t367:\t368:\t368:\t368:\t368:\t368:\t369:\t369:\t369:\t369:\t369:\t370:\t370:\t370:\t370:\t370:\t371:\t371:\t371:\t371:\t371:\t372:\t372:\t372:\t372:\t372:\t373:\t373:\t373:\t373:\t373:\t374:\t374:\t374:\t374:\t374:\t375:\t375:\t375:\t375:\t375:\t376:\t376:\t376:\t376:\t376:\t377:\t377:\t377:\t377:\t377:\t378:\t378:\t378:\t378:\t378:\t379:\t379:\t379:\t379:\t379:\t380:\t380:\t380:\t380:\t380:\t381:\t381:\t381:\t381:\t381:\t382:\t382:\t382:\t382:\t382:\t383:\t383:\t383:\t383:\t383:\t384:\t384:\t384:\t384:\t384:\t385:\t385:\t385:\t385:\t385:\t386:\t386:\t386:\t386:\t386:\t387:\t387:\t387:\t387:\t387:\t388:\t388:\t388:\t388:\t388:\t389:\t389:\t389:\t389:\t389:\t390:\t390:\t390:\t390:\t390:\t391:\t391:\t391:\t391:\t391:\t392:\t392:\t392:\t392:\t392:\t393:\t393:\t393:\t393:\t393:\t394:\t394:\t394:\t394:\t394:\t395:\t395:\t395:\t395:\t395:\t396:\t396:\t396:\t396:\t396:\t397:\t397:\t397:\t397:\t397:\t398:\t398:\t398:\t398:\t398:\t399:\t399:\t399:\t399:\t399:\t400:\t400:\t400:\t400:\t400:\t401:\t401:\t401:\t401:\t401:\t402:\t402:\t402:\t402:\t402:\t403:\t403:\t403:\t403:\t403:\t404:\t404:\t404:\t404:\t404:\t405:\t405:\t405:\t405:\t405:\t406:\t406:\t406:\t406:\t406:\t407:\t407:\t407:\t407:\t407:\t408:\t408:\t408:\t408:\t408:\t409:\t409:\t409:\t409:\t409:\t410:\t410:\t410:\t410:\t410:\t411:\t411:\t411:\t411:\t411:\t412:\t412:\t412:\t412:\t
412:\t413:\t413:\t413:\t413:\t413:\t414:\t414:\t414:\t414:\t414:\t415:\t415:\t415:\t415:\t415:\t416:\t416:\t416:\t416:\t416:\t417:\t417:\t417:\t417:\t417:\t418:\t418:\t418:\t418:\t418:\t419:\t419:\t419:\t419:\t419:\t420:\t420:\t420:\t420:\t420:\t421:\t421:\t421:\t421:\t421:\t422:\t422:\t422:\t422:\t422:\t423:\t423:\t423:\t423:\t423:\t424:\t424:\t424:\t424:\t424:\t425:\t425:\t425:\t425:\t425:\t426:\t426:\t426:\t426:\t426:\t427:\t427:\t427:\t427:\t427:\t428:\t428:\t428:\t428:\t428:\t429:\t429:\t429:\t429:\t429:\t430:\t430:\t430:\t430:\t430:\t431:\t431:\t431:\t431:\t431:\t432:\t432:\t432:\t432:\t432:\t433:\t433:\t433:\t433:\t433:\t434:\t434:\t434:\t434:\t434:\t435:\t435:\t435:\t435:\t435:\t436:\t436:\t436:\t436:\t436:\t437:\t437:\t437:\t437:\t437:\t438:\t438:\t438:\t438:\t438:\t439:\t439:\t439:\t439:\t439:\t440:\t440:\t440:\t440:\t440:\t441:\t441:\t441:\t441:\t441:\t442:\t442:\t442:\t442:\t442:\t443:\t443:\t443:\t443:\t443:\t444:\t444:\t444:\t444:\t444:\t445:\t445:\t445:\t445:\t445:\t446:\t446:\t446:\t446:\t446:\t447:\t447:\t447:\t447:\t447:\t448:\t448:\t448:\t448:\t448:\t449:\t449:\t449:\t449:\t449:\t450:\t450:\t450:\t450:\t450:\t451:\t451:\t451:\t451:\t451:\t452:\t452:\t452:\t452:\t452:\t453:\t453:\t453:\t453:\t453:\t454:\t454:\t454:\t454:\t454:\t455:\t455:\t455:\t455:\t455:\t456:\t456:\t456:\t456:\t456:\t457:\t457:\t457:\t457:\t457:\t458:\t458:\t458:\t458:\t458:\t459:\t459:\t459:\t459:\t459:\t460:\t460:\t460:\t460:\t460:\t461:\t461:\t461:\t461:\t461:\t462:\t462:\t462:\t462:\t462:\t463:\t463:\t463:\t463:\t463:\t464:\t464:\t464:\t464:\t464:\t465:\t465:\t465:\t465:\t465:\t466:\t466:\t466:\t466:\t466:\t467:\t467:\t467:\t467:\t467:\t468:\t468:\t468:\t468:\t468:\t469:\t469:\t469:\t469:\t469:\t470:\t470:\t470:\t470:\t470:\t471:\t471:\t471:\t471:\t471:\t472:\t472:\t472:\t472:\t472:\t473:\t473:\t473:\t473:\t473:\t474:\t474:\t474:\t474:\t474:\t475:\t475:\t475:\t475:\t475:\t476:\t476:\t476:\t476:\t476:\t477:\t477:\t477:\t477:\t477:\t478:\t478:\t478:\t478:\t478:\t479:\t479:\t47
9:\t479:\t479:\t480:\t480:\t480:\t480:\t480:\t481:\t481:\t481:\t481:\t481:\t482:\t482:\t482:\t482:\t482:\t483:\t483:\t483:\t483:\t483:\t484:\t484:\t484:\t484:\t484:\t485:\t485:\t485:\t485:\t485:\t486:\t486:\t486:\t486:\t486:\t487:\t487:\t487:\t487:\t487:\t488:\t488:\t488:\t488:\t488:\t489:\t489:\t489:\t489:\t489:\t490:\t490:\t490:\t490:\t490:\t491:\t491:\t491:\t491:\t491:\t492:\t492:\t492:\t492:\t492:\t493:\t493:\t493:\t493:\t493:\t494:\t494:\t494:\t494:\t494:\t495:\t495:\t495:\t495:\t495:\t496:\t496:\t496:\t496:\t496:\t497:\t497:\t497:\t497:\t497:\t498:\t498:\t498:\t498:\t498:\t499:\t499:\t499:\t499:\t499:\t500:\t500:\t500:\t500:\t500:\t501:\t501:\t501:\t501:\t501:\t502:\t502:\t502:\t502:\t502:\t503:\t503:\t503:\t503:\t503:\t504:\t504:\t504:\t504:\t504:\t505:\t505:\t505:\t505:\t505:\t506:\t506:\t506:\t506:\t506:\t507:\t507:\t507:\t507:\t507:\t508:\t508:\t508:\t508:\t508:\t509:\t509:\t509:\t509:\t509:\t510:\t510:\t510:\t510:\t510:\t511:\t511:\t511:\t511:\t511:\t512:\t512:\t512:\t512:\t512:\t513:\t513:\t513:\t513:\t513:\t514:\t514:\t514:\t514:\t514:\t515:\t515:\t515:\t515:\t515:\t516:\t516:\t516:\t516:\t516:\t517:\t517:\t517:\t517:\t517:\t518:\t518:\t518:\t518:\t518:\t519:\t519:\t519:\t519:\t519:\t520:\t520:\t520:\t520:\t520:\t521:\t521:\t521:\t521:\t521:\t522:\t522:\t522:\t522:\t522:\t523:\t523:\t523:\t523:\t523:\t524:\t524:\t524:\t524:\t524:\t525:\t525:\t525:\t525:\t525:\t526:\t526:\t526:\t526:\t526:\t527:\t527:\t527:\t527:\t527:\t528:\t528:\t528:\t528:\t528:\t529:\t529:\t529:\t529:\t529:\t530:\t530:\t530:\t530:\t530:\t531:\t531:\t531:\t531:\t531:\t532:\t532:\t532:\t532:\t532:\t533:\t533:\t533:\t533:\t533:\t534:\t534:\t534:\t534:\t534:\t535:\t535:\t535:\t535:\t535:\t536:\t536:\t536:\t536:\t536:\t537:\t537:\t537:\t537:\t537:\t538:\t538:\t538:\t538:\t538:\t539:\t539:\t539:\t539:\t539:\t540:\t540:\t540:\t540:\t540:\t541:\t541:\t541:\t541:\t541:\t542:\t542:\t542:\t542:\t542:\t543:\t543:\t543:\t543:\t543:\t544:\t544:\t544:\t544:\t544:\t545:\t545:\t545:\t545:\t545:\t546:
\t546:\t546:\t546:\t546:\t547:\t547:\t547:\t547:\t547:\t548:\t548:\t548:\t548:\t548:\t549:\t549:\t549:\t549:\t549:\t550:\t550:\t550:\t550:\t550:\t551:\t551:\t551:\t551:\t551:\t552:\t552:\t552:\t552:\t552:\t553:\t553:\t553:\t553:\t553:\t554:\t554:\t554:\t554:\t554:\t555:\t555:\t555:\t555:\t555:\t556:\t556:\t556:\t556:\t556:\t557:\t557:\t557:\t557:\t557:\t558:\t558:\t558:\t558:\t558:\t559:\t559:\t559:\t559:\t559:\t560:\t560:\t560:\t560:\t560:\t561:\t561:\t561:\t561:\t561:\t562:\t562:\t562:\t562:\t562:\t563:\t563:\t563:\t563:\t563:\t564:\t564:\t564:\t564:\t564:\t565:\t565:\t565:\t565:\t565:\t566:\t566:\t566:\t566:\t566:\t567:\t567:\t567:\t567:\t567:\t568:\t568:\t568:\t568:\t568:\t569:\t569:\t569:\t569:\t569:\t570:\t570:\t570:\t570:\t570:\t571:\t571:\t571:\t571:\t571:\t572:\t572:\t572:\t572:\t572:\t573:\t573:\t573:\t573:\t573:\t574:\t574:\t574:\t574:\t574:\t575:\t575:\t575:\t575:\t575:\t576:\t576:\t576:\t576:\t576:\t577:\t577:\t577:\t577:\t577:\t578:\t578:\t578:\t578:\t578:\t579:\t579:\t579:\t579:\t579:\t580:\t580:\t580:\t580:\t580:\t581:\t581:\t581:\t581:\t581:\t582:\t582:\t582:\t582:\t582:\t583:\t583:\t583:\t583:\t583:\t584:\t584:\t584:\t584:\t584:\t585:\t585:\t585:\t585:\t585:\t586:\t586:\t586:\t586:\t586:\t587:\t587:\t587:\t587:\t587:\t588:\t588:\t588:\t588:\t588:\t589:\t589:\t589:\t589:\t589:\t590:\t590:\t590:\t590:\t590:\t591:\t591:\t591:\t591:\t591:\t592:\t592:\t592:\t592:\t592:\t593:\t593:\t593:\t593:\t593:\t594:\t594:\t594:\t594:\t594:\t595:\t595:\t595:\t595:\t595:\t596:\t596:\t596:\t596:\t596:\t597:\t597:\t597:\t597:\t597:\t598:\t598:\t598:\t598:\t598:\t599:\t599:\t599:\t599:\t599:\t600:\t600:\t600:\t600:\t600:\t601:\t601:\t601:\t601:\t601:\t602:\t602:\t602:\t602:\t602:\t603:\t603:\t603:\t603:\t603:\t604:\t604:\t604:\t604:\t604:\t605:\t605:\t605:\t605:\t605:\t606:\t606:\t606:\t606:\t606:\t607:\t607:\t607:\t607:\t607:\t608:\t608:\t608:\t608:\t608:\t609:\t609:\t609:\t609:\t609:\t610:\t610:\t610:\t610:\t610:\t611:\t611:\t611:\t611:\t611:\t612:\t612:\t612:\t612:\t
612:\t613:\t613:\t613:\t613:\t613:\t614:\t614:\t614:\t614:\t614:\t615:\t615:\t615:\t615:\t615:\t616:\t616:\t616:\t616:\t616:\t617:\t617:\t617:\t617:\t617:\t618:\t618:\t618:\t618:\t618:\t619:\t619:\t619:\t619:\t619:\t620:\t620:\t620:\t620:\t620:\t621:\t621:\t621:\t621:\t621:\t622:\t622:\t622:\t622:\t622:\t623:\t623:\t623:\t623:\t623:\t624:\t624:\t624:\t624:\t624:\t625:\t625:\t625:\t625:\t625:\t626:\t626:\t626:\t626:\t626:\t627:\t627:\t627:\t627:\t627:\t628:\t628:\t628:\t628:\t628:\t629:\t629:\t629:\t629:\t629:\t630:\t630:\t630:\t630:\t630:\t631:\t631:\t631:\t631:\t631:\t632:\t632:\t632:\t632:\t632:\t633:\t633:\t633:\t633:\t633:\t634:\t634:\t634:\t634:\t634:\t635:\t635:\t635:\t635:\t635:\t636:\t636:\t636:\t636:\t636:\t637:\t637:\t637:\t637:\t637:\t638:\t638:\t638:\t638:\t638:\t639:\t639:\t639:\t639:\t639:\t640:\t640:\t640:\t640:\t640:\t641:\t641:\t641:\t641:\t641:\t642:\t642:\t642:\t642:\t642:\t643:\t643:\t643:\t643:\t643:\t644:\t644:\t644:\t644:\t644:\t645:\t645:\t645:\t645:\t645:\t646:\t646:\t646:\t646:\t646:\t647:\t647:\t647:\t647:\t647:\t648:\t648:\t648:\t648:\t648:\t649:\t649:\t649:\t649:\t649:\t650:\t650:\t650:\t650:\t650:\t651:\t651:\t651:\t651:\t651:\t652:\t652:\t652:\t652:\t652:\t653:\t653:\t653:\t653:\t653:\t654:\t654:\t654:\t654:\t654:\t655:\t655:\t655:\t655:\t655:\t656:\t656:\t656:\t656:\t656:\t657:\t657:\t657:\t657:\t657:\t658:\t658:\t658:\t658:\t658:\t659:\t659:\t659:\t659:\t659:\t660:\t660:\t660:\t660:\t660:\t661:\t661:\t661:\t661:\t661:\t662:\t662:\t662:\t662:\t662:\t663:\t663:\t663:\t663:\t663:\t664:\t664:\t664:\t664:\t664:\t665:\t665:\t665:\t665:\t665:\t666:\t666:\t666:\t666:\t666:\t667:\t667:\t667:\t667:\t667:\t668:\t668:\t668:\t668:\t668:\t669:\t669:\t669:\t669:\t669:\t670:\t670:\t670:\t670:\t670:\t671:\t671:\t671:\t671:\t671:\t672:\t672:\t672:\t672:\t672:\t673:\t673:\t673:\t673:\t673:\t674:\t674:\t674:\t674:\t674:\t675:\t675:\t675:\t675:\t675:\t676:\t676:\t676:\t676:\t676:\t677:\t677:\t677:\t677:\t677:\t678:\t678:\t678:\t678:\t678:\t679:\t679:\t67
9:\t679:\t679:\t680:\t680:\t680:\t680:\t680:\t681:\t681:\t681:\t681:\t681:\t682:\t682:\t682:\t682:\t682:\t683:\t683:\t683:\t683:\t683:\t684:\t684:\t684:\t684:\t684:\t685:\t685:\t685:\t685:\t685:\t686:\t686:\t686:\t686:\t686:\t687:\t687:\t687:\t687:\t687:\t688:\t688:\t688:\t688:\t688:\t689:\t689:\t689:\t689:\t689:\t690:\t690:\t690:\t690:\t690:\t691:\t691:\t691:\t691:\t691:\t692:\t692:\t692:\t692:\t692:\t693:\t693:\t693:\t693:\t693:\t694:\t694:\t694:\t694:\t694:\t695:\t695:\t695:\t695:\t695:\t696:\t696:\t696:\t696:\t696:\t697:\t697:\t697:\t697:\t697:\t698:\t698:\t698:\t698:\t698:\t699:\t699:\t699:\t699:\t699:\t700:\t700:\t700:\t700:\t700:\t701:\t701:\t701:\t701:\t701:\t702:\t702:\t702:\t702:\t702:\t703:\t703:\t703:\t703:\t703:\t704:\t704:\t704:\t704:\t704:\t705:\t705:\t705:\t705:\t705:\t706:\t706:\t706:\t706:\t706:\t707:\t707:\t707:\t707:\t707:\t708:\t708:\t708:\t708:\t708:\t709:\t709:\t709:\t709:\t709:\t710:\t710:\t710:\t710:\t710:\t711:\t711:\t711:\t711:\t711:\t712:\t712:\t712:\t712:\t712:\t713:\t713:\t713:\t713:\t713:\t714:\t714:\t714:\t714:\t714:\t715:\t715:\t715:\t715:\t715:\t716:\t716:\t716:\t716:\t716:\t717:\t717:\t717:\t717:\t717:\t718:\t718:\t718:\t718:\t718:\t719:\t719:\t719:\t719:\t719:\t720:\t720:\t720:\t720:\t720:\t721:\t721:\t721:\t721:\t721:\t722:\t722:\t722:\t722:\t722:\t723:\t723:\t723:\t723:\t723:\t724:\t724:\t724:\t724:\t724:\t725:\t725:\t725:\t725:\t725:\t726:\t726:\t726:\t726:\t726:\t727:\t727:\t727:\t727:\t727:\t728:\t728:\t728:\t728:\t728:\t729:\t729:\t729:\t729:\t729:\t730:\t730:\t730:\t730:\t730:\t731:\t731:\t731:\t731:\t731:\t732:\t732:\t732:\t732:\t732:\t733:\t733:\t733:\t733:\t733:\t734:\t734:\t734:\t734:\t734:\t735:\t735:\t735:\t735:\t735:\t736:\t736:\t736:\t736:\t736:\t737:\t737:\t737:\t737:\t737:\t738:\t738:\t738:\t738:\t738:\t739:\t739:\t739:\t739:\t739:\t740:\t740:\t740:\t740:\t740:\t741:\t741:\t741:\t741:\t741:\t742:\t742:\t742:\t742:\t742:\t743:\t743:\t743:\t743:\t743:\t744:\t744:\t744:\t744:\t744:\t745:\t745:\t745:\t745:\t745:\t746:
\t746:\t746:\t746:\t746:\t747:\t747:\t747:\t747:\t747:\t748:\t748:\t748:\t748:\t748:\t749:\t749:\t749:\t749:\t749:\t750:\t750:\t750:\t750:\t750:\t751:\t751:\t751:\t751:\t751:\t752:\t752:\t752:\t752:\t752:\t753:\t753:\t753:\t753:\t753:\t754:\t754:\t754:\t754:\t754:\t755:\t755:\t755:\t755:\t755:\t756:\t756:\t756:\t756:\t756:\t757:\t757:\t757:\t757:\t757:\t758:\t758:\t758:\t758:\t758:\t759:\t759:\t759:\t759:\t759:\t760:\t760:\t760:\t760:\t760:\t761:\t761:\t761:\t761:\t761:\t762:\t762:\t762:\t762:\t762:\t763:\t763:\t763:\t763:\t763:\t764:\t764:\t764:\t764:\t764:\t765:\t765:\t765:\t765:\t765:\t766:\t766:\t766:\t766:\t766:\t767:\t767:\t767:\t767:\t767:\t768:\t768:\t768:\t768:\t768:\t769:\t769:\t769:\t769:\t769:\t770:\t770:\t770:\t770:\t770:\t771:\t771:\t771:\t771:\t771:\t772:\t772:\t772:\t772:\t772:\t773:\t773:\t773:\t773:\t773:\t774:\t774:\t774:\t774:\t774:\t775:\t775:\t775:\t775:\t775:\t776:\t776:\t776:\t776:\t776:\t777:\t777:\t777:\t777:\t777:\t778:\t778:\t778:\t778:\t778:\t779:\t779:\t779:\t779:\t779:\t780:\t780:\t780:\t780:\t780:\t781:\t781:\t781:\t781:\t781:\t782:\t782:\t782:\t782:\t782:\t783:\t783:\t783:\t783:\t783:\t784:\t784:\t784:\t784:\t784:\t785:\t785:\t785:\t785:\t785:\t786:\t786:\t786:\t786:\t786:\t787:\t787:\t787:\t787:\t787:\t788:\t788:\t788:\t788:\t788:\t789:\t789:\t789:\t789:\t789:\t790:\t790:\t790:\t790:\t790:\t791:\t791:\t791:\t791:\t791:\t792:\t792:\t792:\t792:\t792:\t793:\t793:\t793:\t793:\t793:\t794:\t794:\t794:\t794:\t794:\t795:\t795:\t795:\t795:\t795:\t796:\t796:\t796:\t796:\t796:\t797:\t797:\t797:\t797:\t797:\t798:\t798:\t798:\t798:\t798:\t799:\t799:\t799:\t799:\t799:\t800:\t800:\t800:\t800:\t800:\t801:\t801:\t801:\t801:\t801:\t802:\t802:\t802:\t802:\t802:\t803:\t803:\t803:\t803:\t803:\t804:\t804:\t804:\t804:\t804:\t805:\t805:\t805:\t805:\t805:\t806:\t806:\t806:\t806:\t806:\t807:\t807:\t807:\t807:\t807:\t808:\t808:\t808:\t808:\t808:\t809:\t809:\t809:\t809:\t809:\t810:\t810:\t810:\t810:\t810:\t811:\t811:\t811:\t811:\t811:\t812:\t812:\t812:\t812:\t
812:\t813:\t813:\t813:\t813:\t813:\t814:\t814:\t814:\t814:\t814:\t815:\t815:\t815:\t815:\t815:\t816:\t816:\t816:\t816:\t816:\t817:\t817:\t817:\t817:\t817:\t818:\t818:\t818:\t818:\t818:\t819:\t819:\t819:\t819:\t819:\t820:\t820:\t820:\t820:\t820:\t821:\t821:\t821:\t821:\t821:\t822:\t822:\t822:\t822:\t822:\t823:\t823:\t823:\t823:\t823:\t824:\t824:\t824:\t824:\t824:\t825:\t825:\t825:\t825:\t825:\t826:\t826:\t826:\t826:\t826:\t827:\t827:\t827:\t827:\t827:\t828:\t828:\t828:\t828:\t828:\t829:\t829:\t829:\t829:\t829:\t830:\t830:\t830:\t830:\t830:\t831:\t831:\t831:\t831:\t831:\t832:\t832:\t832:\t832:\t832:\t833:\t833:\t833:\t833:\t833:\t834:\t834:\t834:\t834:\t834:\t835:\t835:\t835:\t835:\t835:\t836:\t836:\t836:\t836:\t836:\t837:\t837:\t837:\t837:\t837:\t838:\t838:\t838:\t838:\t838:\t839:\t839:\t839:\t839:\t839:\t840:\t840:\t840:\t840:\t840:\t841:\t841:\t841:\t841:\t841:\t842:\t842:\t842:\t842:\t842:\t843:\t843:\t843:\t843:\t843:\t844:\t844:\t844:\t844:\t844:\t845:\t845:\t845:\t845:\t845:\t846:\t846:\t846:\t846:\t846:\t847:\t847:\t847:\t847:\t847:\t848:\t848:\t848:\t848:\t848:\t849:\t849:\t849:\t849:\t849:\t850:\t850:\t850:\t850:\t850:\t851:\t851:\t851:\t851:\t851:\t852:\t852:\t852:\t852:\t852:\t853:\t853:\t853:\t853:\t853:\t854:\t854:\t854:\t854:\t854:\t855:\t855:\t855:\t855:\t855:\t856:\t856:\t856:\t856:\t856:\t857:\t857:\t857:\t857:\t857:\t858:\t858:\t858:\t858:\t858:\t859:\t859:\t859:\t859:\t859:\t860:\t860:\t860:\t860:\t860:\t861:\t861:\t861:\t861:\t861:\t862:\t862:\t862:\t862:\t862:\t863:\t863:\t863:\t863:\t863:\t864:\t864:\t864:\t864:\t864:\t865:\t865:\t865:\t865:\t865:\t866:\t866:\t866:\t866:\t866:\t867:\t867:\t867:\t867:\t867:\t868:\t868:\t868:\t868:\t868:\t869:\t869:\t869:\t869:\t869:\t870:\t870:\t870:\t870:\t870:\t871:\t871:\t871:\t871:\t871:\t872:\t872:\t872:\t872:\t872:\t873:\t873:\t873:\t873:\t873:\t874:\t874:\t874:\t874:\t874:\t875:\t875:\t875:\t875:\t875:\t876:\t876:\t876:\t876:\t876:\t877:\t877:\t877:\t877:\t877:\t878:\t878:\t878:\t878:\t878:\t879:\t879:\t87
9:\t879:\t879:\t880:\t880:\t880:\t880:\t880:\t881:\t881:\t881:\t881:\t881:\t882:\t882:\t882:\t882:\t882:\t883:\t883:\t883:\t883:\t883:\t884:\t884:\t884:\t884:\t884:\t885:\t885:\t885:\t885:\t885:\t886:\t886:\t886:\t886:\t886:\t887:\t887:\t887:\t887:\t887:\t888:\t888:\t888:\t888:\t888:\t889:\t889:\t889:\t889:\t889:\t890:\t890:\t890:\t890:\t890:\t891:\t891:\t891:\t891:\t891:\t892:\t892:\t892:\t892:\t892:\t893:\t893:\t893:\t893:\t893:\t894:\t894:\t894:\t894:\t894:\t895:\t895:\t895:\t895:\t895:\t896:\t896:\t896:\t896:\t896:\t897:\t897:\t897:\t897:\t897:\t898:\t898:\t898:\t898:\t898:\t899:\t899:\t899:\t899:\t899:\t900:\t900:\t900:\t900:\t900:\t901:\t901:\t901:\t901:\t901:\t902:\t902:\t902:\t902:\t902:\t903:\t903:\t903:\t903:\t903:\t904:\t904:\t904:\t904:\t904:\t905:\t905:\t905:\t905:\t905:\t906:\t906:\t906:\t906:\t906:\t907:\t907:\t907:\t907:\t907:\t908:\t908:\t908:\t908:\t908:\t909:\t909:\t909:\t909:\t909:\t910:\t910:\t910:\t910:\t910:\t911:\t911:\t911:\t911:\t911:\t912:\t912:\t912:\t912:\t912:\t913:\t913:\t913:\t913:\t913:\t914:\t914:\t914:\t914:\t914:\t915:\t915:\t915:\t915:\t915:\t916:\t916:\t916:\t916:\t916:\t917:\t917:\t917:\t917:\t917:\t918:\t918:\t918:\t918:\t918:\t919:\t919:\t919:\t919:\t919:\t920:\t920:\t920:\t920:\t920:\t921:\t921:\t921:\t921:\t921:\t922:\t922:\t922:\t922:\t922:\t923:\t923:\t923:\t923:\t923:\t924:\t924:\t924:\t924:\t924:\t925:\t925:\t925:\t925:\t925:\t926:\t926:\t926:\t926:\t926:\t927:\t927:\t927:\t927:\t927:\t928:\t928:\t928:\t928:\t928:\t929:\t929:\t929:\t929:\t929:\t930:\t930:\t930:\t930:\t930:\t931:\t931:\t931:\t931:\t931:\t932:\t932:\t932:\t932:\t932:\t933:\t933:\t933:\t933:\t933:\t934:\t934:\t934:\t934:\t934:\t935:\t935:\t935:\t935:\t935:\t936:\t936:\t936:\t936:\t936:\t937:\t937:\t937:\t937:\t937:\t938:\t938:\t938:\t938:\t938:\t939:\t939:\t939:\t939:\t939:\t940:\t940:\t940:\t940:\t940:\t941:\t941:\t941:\t941:\t941:\t942:\t942:\t942:\t942:\t942:\t943:\t943:\t943:\t943:\t943:\t944:\t944:\t944:\t944:\t944:\t945:\t945:\t945:\t945:\t945:\t946:
\t946:\t946:\t946:\t946:\t947:\t947:\t947:\t947:\t947:\t948:\t948:\t948:\t948:\t948:\t949:\t949:\t949:\t949:\t949:\t950:\t950:\t950:\t950:\t950:\t951:\t951:\t951:\t951:\t951:\t952:\t952:\t952:\t952:\t952:\t953:\t953:\t953:\t953:\t953:\t954:\t954:\t954:\t954:\t954:\t955:\t955:\t955:\t955:\t955:\t956:\t956:\t956:\t956:\t956:\t957:\t957:\t957:\t957:\t957:\t958:\t958:\t958:\t958:\t958:\t959:\t959:\t959:\t959:\t959:\t960:\t960:\t960:\t960:\t960:\t961:\t961:\t961:\t961:\t961:\t962:\t962:\t962:\t962:\t962:\t963:\t963:\t963:\t963:\t963:\t964:\t964:\t964:\t964:\t964:\t965:\t965:\t965:\t965:\t965:\t966:\t966:\t966:\t966:\t966:\t967:\t967:\t967:\t967:\t967:\t968:\t968:\t968:\t968:\t968:\t969:\t969:\t969:\t969:\t969:\t970:\t970:\t970:\t970:\t970:\t971:\t971:\t971:\t971:\t971:\t972:\t972:\t972:\t972:\t972:\t973:\t973:\t973:\t973:\t973:\t974:\t974:\t974:\t974:\t974:\t975:\t975:\t975:\t975:\t975:\t976:\t976:\t976:\t976:\t976:\t977:\t977:\t977:\t977:\t977:\t978:\t978:\t978:\t978:\t978:\t979:\t979:\t979:\t979:\t979:\t980:\t980:\t980:\t980:\t980:\t981:\t981:\t981:\t981:\t981:\t982:\t982:\t982:\t982:\t982:\t983:\t983:\t983:\t983:\t983:\t984:\t984:\t984:\t984:\t984:\t985:\t985:\t985:\t985:\t985:\t986:\t986:\t986:\t986:\t986:\t987:\t987:\t987:\t987:\t987:\t988:\t988:\t988:\t988:\t988:\t989:\t989:\t989:\t989:\t989:\t990:\t990:\t990:\t990:\t990:\t991:\t991:\t991:\t991:\t991:\t992:\t992:\t992:\t992:\t992:\t993:\t993:\t993:\t993:\t993:\t994:\t994:\t994:\t994:\t994:\t995:\t995:\t995:\t995:\t995:\t996:\t996:\t996:\t996:\t996:\t997:\t997:\t997:\t997:\t997:\t998:\t998:\t998:\t998:\t998:\t999:\t999:\t999:\t999:\t999:\t" 898 | ] 899 | } 900 | ], 901 | "source": [ 902 | "## run cv to get best iteration\n", 903 | "ctb_cv = cv(param, my_dt, fold_count=5, random_seed=2017)" 904 | ] 905 | }, 906 | { 907 | "cell_type": "code", 908 | "execution_count": 205, 909 | "metadata": { 910 | "collapsed": false 911 | }, 912 | "outputs": [], 913 | "source": [ 914 | "# fetch best round\n", 915 | "best_round = 
ctb_cv['b\\'Accuracy\\'_test_avg'].index(np.max(ctb_cv['b\\'Accuracy\\'_test_avg']))" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": 206, 921 | "metadata": { 922 | "collapsed": true 923 | }, 924 | "outputs": [], 925 | "source": [ 926 | "## define the classifer model\n", 927 | "model = CatBoostClassifier(iterations=best_round, learning_rate=0.03,rsm = 0.3 ,depth=6, eval_metric='Accuracy', random_seed=2017)" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": 207, 933 | "metadata": { 934 | "collapsed": false 935 | }, 936 | "outputs": [ 937 | { 938 | "data": { 939 | "text/plain": [ 940 | "" 941 | ] 942 | }, 943 | "execution_count": 207, 944 | "metadata": {}, 945 | "output_type": "execute_result" 946 | } 947 | ], 948 | "source": [ 949 | "## train model\n", 950 | "model.fit(my_dt)" 951 | ] 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": 208, 956 | "metadata": { 957 | "collapsed": true 958 | }, 959 | "outputs": [], 960 | "source": [ 961 | "## make predictions\n", 962 | "preds = model.predict(test_feats1)" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": 222, 968 | "metadata": { 969 | "collapsed": false 970 | }, 971 | "outputs": [], 972 | "source": [ 973 | "## make submission\n", 974 | "sub5 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds})\n", 975 | "sub5['Is_Response'] = ['happy' if x == 1 else 'not_happy' for x in sub5['Is_Response']]\n", 976 | "sub5 = sub5[['User_ID','Is_Response']]\n", 977 | "sub5.to_csv('submissions/sub5_cb.csv', index=False)" 978 | ] 979 | } 980 | ], 981 | "metadata": { 982 | "kernelspec": { 983 | "display_name": "Python 3", 984 | "language": "python", 985 | "name": "python3" 986 | }, 987 | "language_info": { 988 | "codemirror_mode": { 989 | "name": "ipython", 990 | "version": 3 991 | }, 992 | "file_extension": ".py", 993 | "mimetype": "text/x-python", 994 | "name": "python", 995 | "nbconvert_exporter": "python", 996 | 
"pygments_lexer": "ipython3", 997 | "version": "3.5.2" 998 | } 999 | }, 1000 | "nbformat": 4, 1001 | "nbformat_minor": 2 1002 | } 1003 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Happiness-ML-Challenge 2 | 3 | 4 | This repository contains scripts shared during the machine learning challenge for beginners. In this challenge, participants have to predict the happinessfor customers. The data used in the scripts can be downloaded from the link given below.
5 | 6 | The aim of this challenge is to encourage beginners to gain more hands-on experience in solving ML problems. 7 | 8 | **Challenge Name:** Predict the Happiness
9 | **Duration:** 30th August 2017 to 31st November 2017
10 | **Type:** Binary Classification
11 | **Metrics:** Accuracy 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /xgb_nb.R: --------------------------------------------------------------------------------
# NOTE: hard-coded data directory. Point `path` at the folder that holds
# train.csv and test.csv before running; every relative read/write in this
# script resolves against it.
path <- "/happy_data/"
setwd(path)

# data manipulation
library(data.table)

# NLP
library(tm)
library(qdap)
library(SnowballC)
library(purrr)
library(text2vec)

# modeling
library(e1071) # for naive bayes
library(xgboost)

# load data
train <- fread("train.csv")
test <- fread("test.csv")

## Clean Data -----------------------------------------------------------

# Clean the free-text `Description` column of a data.table.
#
# Steps, in order:
#   1. expand contractions (isn't -> is not) and abbreviations (Sr. -> Senior)
#   2. lower-case
#   3. strip punctuation and digits
#   4. drop stopwords (qdap's Top200Words list)
#   5. stem (played, plays -> play)
#   6. drop tokens of one or two characters (to, ok, po)
#
# Contractions and abbreviations are expanded FIRST because both are
# recognised by their punctuation; once the apostrophes and periods have been
# stripped there is nothing left for those steps to match.
#
# `data` is modified by reference (data.table `:=`): the returned object is the
# very same table, not a copy, so the caller's input is cleaned as well. After
# step 4 `Description` is a list column holding one character vector of
# tokens per row.
cleanData <- function(data) {
  stopifnot(is.data.table(data), "Description" %in% names(data))

  data[, Description := map_chr(Description, function(k) replace_contraction(k))]
  data[, Description := map_chr(Description, function(k) replace_abbreviation(k))]

  # Base string functions are already vectorised; no per-row map is needed.
  data[, Description := tolower(Description)]
  data[, Description := gsub(pattern = "[[:punct:]]", replacement = "", x = Description)]
  data[, Description := gsub(pattern = "\\d+", replacement = "", x = Description)]

  # From here on each row holds a character vector of tokens (list column).
  data[, Description := map(Description, function(k) rm_stopwords(k, Top200Words, unlist = TRUE))]
  data[, Description := map(Description, function(k) stemmer(k))]
  data[, Description := map(Description, function(k) k[nchar(k) > 2])]

  data
}

# Because cleanData() works by reference, `train_clean` / `test_clean` are the
# same objects as `train` / `test` (now cleaned), not independent copies.
train_clean <- cleanData(train)
test_clean <- cleanData(test)


# Bag of Words ------------------------------------------------------------

## Bag of words technique converts the list of tokens (words) into a separate column with binary values in it.
## Lets understand it.

# Build a classic tm corpus from the cleaned training text, only to look at
# what a document-term matrix is; the models below use text2vec instead.
ctext <- Corpus(VectorSource(train_clean$Description))

tdm <- DocumentTermMatrix(ctext)
print(tdm)

# let's see how BOW looks like - every column becomes one feature
inspect(tdm[1:10, 1:5])

## From here, we'll use text2vec package which provides immense potential for feature engineering
## we'll build two models
# a) On Bag of Words Corpus
# b) On TF-IDF Corpus
# c) 2 Gram Model - Your to-do Task
# You can read more about TF-IDF here: http://www.tfidf.com/


## Bag of Words Model

# Stack train and test so both share one vocabulary. The cleaned tables are
# named explicitly rather than relying on cleanData() having modified
# `train` / `test` in place.
trte_data <- rbind(
  train_clean[, .(User_ID, Description)],
  test_clean[, .(User_ID, Description)]
)

# Each Description is a vector of tokens; collapse it back to one string per
# row so the text2vec tokenizer can split it again.
trte_data[, Description := map_chr(Description, paste, collapse = ",")]

bow <- itoken(
  trte_data$Description,
  preprocessor = tolower,
  tokenizer = word_tokenizer,
  ids = trte_data$User_ID
)
bow_vocab <- create_vocabulary(bow)
bow_vocab # now we have converted the text into tokens. woah! every word can be converted into a feature

## But not all words will be important, Are they ?
## let's remove words which occur fewer than 100 times in the whole data
pruned_bow <- prune_vocabulary(bow_vocab, term_count_min = 100)
pruned_bow

# get these vocabulary in a data frame for model training
vovec <- vocab_vectorizer(pruned_bow)
dtm_text <- create_dtm(bow, vovec)

feats <- as.data.table(as.matrix(dtm_text))
feats[1:10, 1:5] # see 1st 10 rows and 1st 5 columns

# The document-term matrix was built on train rows followed by test rows, so
# the first nrow(train) rows are the training documents and the rest are test.
n_train <- nrow(train)
test_rows <- seq(n_train + 1L, length.out = nrow(feats) - n_train)

# first feature set
train_feats <- feats[seq_len(n_train)]
test_feats <- feats[test_rows]

# Label-encode the remaining character columns. Levels are taken from train
# and test together so the same string maps to the same integer in both.
cols <- setdiff(colnames(train), c("User_ID", "Is_Response", "Description"))
for (x in cols) {
  if (is.character(train[[x]])) {
    lvls <- unique(c(train[[x]], test[[x]]))
    # set() updates the column by reference.
    set(train, j = x, value = as.numeric(factor(train[[x]], levels = lvls)))
    set(test, j = x, value = as.numeric(factor(test[[x]], levels = lvls)))
  }
}

## preparing data for training
train_feats <- cbind(train_feats, train[, .(Browser_Used, Device_Used, Is_Response)])
test_feats <- cbind(test_feats, test[, .(Browser_Used, Device_Used)])

# Target: 1 = happy, 0 = anything else, stored as a factor for naiveBayes().
train_feats[, Is_Response := ifelse(Is_Response == "happy", 1, 0)]
train_feats[, Is_Response := as.factor(Is_Response)]

## naive Bayes is known to perform quite well in text classification problems

model <- naiveBayes(Is_Response ~ ., data = train_feats, laplace = 1)
preds <- predict(model, test_feats)

# make your submission
sub <- data.table(
  User_ID = test$User_ID,
  Is_Response = ifelse(preds == 1, "happy", "not_happy")
)
fwrite(sub, "sub1.csv")


# TF-IDF Model ------------------------------------------------------------

TIDF <- TfIdf$new()
dtm_text_tfidf <- fit_transform(dtm_text, TIDF)

feats <- as.data.table(as.matrix(dtm_text_tfidf))

# second feature set (same train/test row split as above)
train_feats <- feats[seq_len(n_train)]
test_feats <- feats[test_rows]

## preparing data for training
train_feats <- cbind(train_feats, train[, .(Browser_Used, Device_Used, Is_Response)])
test_feats <- cbind(test_feats, test[, .(Browser_Used, Device_Used)])

# Numeric 0/1 target (1 = happy), as required by xgboost's binary:logistic.
train_feats[, Is_Response := ifelse(Is_Response == "happy", 1, 0)]

## You can use naiveBayes Model here and compare the accuracy.
## let's try xgboost model here.

# set parameters for xgboost
param <- list(
  booster = "gbtree",
  objective = "binary:logistic",
  eval_metric = "error",
  # num_class = 9,
  eta = .2,
  # gamma = 1,
  max_depth = 6,
  min_child_weight = 0,
  subsample = .8,
  colsample_bytree = .3
)


## function to return predictions using best CV score

# Cross-validate an xgboost model, refit it at the best CV iteration, and
# return its predictions for `test`.
#
# Arguments:
#   train  - data.table of features plus a numeric 0/1 `Is_Response` column.
#   test   - data.table with the same feature columns as `train`, in the same
#            order, and no `Is_Response` column.
#   params - list of xgboost parameters, used for both the CV run and the
#            final fit.
#   iters  - maximum number of boosting rounds for the CV run.
#
# Returns: the numeric vector produced by predict() on `test`, one value per
#   row.
#
# Everything is taken from the arguments; no script-level globals are read.
give_predictions <- function(train, test, params, iters) {
  feature_cols <- setdiff(names(train), "Is_Response")

  dtrain <- xgb.DMatrix(
    data = as.matrix(train[, feature_cols, with = FALSE]),
    label = train[["Is_Response"]]
  )
  dtest <- xgb.DMatrix(data = as.matrix(test))

  # 5-fold stratified CV; stops once the test error has not improved for
  # 40 rounds (maximize = FALSE: lower error is better).
  cv.model <- xgb.cv(
    params = params,
    data = dtrain,
    nrounds = iters,
    nfold = 5L,
    stratified = TRUE,
    early_stopping_rounds = 40,
    print_every_n = 20,
    maximize = FALSE
  )

  best_it <- cv.model$best_iteration
  best_score <- min(cv.model$evaluation_log$test_error_mean, na.rm = TRUE)

  cat("CV model returned", best_score, "error score\n")

  # Refit on the full training set for exactly the best number of rounds.
  tr.model <- xgb.train(
    params = params,
    data = dtrain,
    nrounds = best_it,
    watchlist = list(train = dtrain),
    print_every_n = 20
  )

  predict(tr.model, dtest)
}

# get predictions
my_preds <- give_predictions(train_feats, test_feats, param, 1000)

## create submission file
preds <- ifelse(my_preds > 0.66, 1, 0) # cutoff threshold
# Write the same "happy" / "not_happy" labels as sub1.csv rather than raw 0/1.
sub2 <- data.table(
  User_ID = test$User_ID,
  Is_Response = ifelse(preds == 1, "happy", "not_happy")
)
fwrite(sub2, "sub2.csv")


## What's Next ?

## Till now, we made 1-gram model i.e. one word per column. We can extend it to 2-3-n gram

## create another model with 2-gram features

# ngram = c(1L, 2L) keeps single words and adds adjacent word pairs.
gr_vocab <- create_vocabulary(bow, ngram = c(1L, 2L))
gr_vocab <- prune_vocabulary(gr_vocab, term_count_min = 150)
gr_vocab

bigram_vec <- vocab_vectorizer(gr_vocab)
dtm_text <- create_dtm(bow, bigram_vec)

# now you can repeat the steps from `feats <- as.data.table(as.matrix(dtm_text))`
# onwards (in the Bag of Words section) to create another model.
# in case you face difficulties, feel free to raise "Issues" above.


--------------------------------------------------------------------------------