├── LGB_CB_Python.ipynb
├── README.md
└── xgb_nb.R
/LGB_CB_Python.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Introduction\n",
8 | "\n",
9 | "Here you'll learn to build models using the CatBoost, LightGBM and Naive Bayes algorithms in Python. Given a text classification problem, you'll also learn to clean the data and create bag-of-words and tf-idf matrices. \n",
10 | "\n",
11 | "As a next step, you can build a simple voting ensemble from the predictions generated by the models in this notebook."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {
18 | "collapsed": true
19 | },
20 | "outputs": [],
21 | "source": [
22 | "# Load Libraries\n",
23 | "import numpy as np\n",
24 | "import pandas as pd\n",
25 | "from nltk.corpus import stopwords\n",
26 | "from nltk.stem import PorterStemmer\n",
27 | "from sklearn.ensemble import GradientBoostingClassifier\n",
28 | "from sklearn.naive_bayes import GaussianNB\n",
29 | "from sklearn.preprocessing import LabelEncoder\n",
30 | "import re\n",
31 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
32 | "from sklearn.model_selection import cross_val_score\n",
33 | "from sklearn.metrics import accuracy_score, make_scorer"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 3,
39 | "metadata": {
40 | "collapsed": false
41 | },
42 | "outputs": [],
43 | "source": [
44 | "# load data\n",
45 | "train = pd.read_csv(\"train.csv\")\n",
46 | "test = pd.read_csv(\"test.csv\")"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 4,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [
56 | {
57 | "data": {
58 | "text/html": [
59 | "
\n",
60 | "
\n",
61 | " \n",
62 | " \n",
63 | " | \n",
64 | " User_ID | \n",
65 | " Description | \n",
66 | " Browser_Used | \n",
67 | " Device_Used | \n",
68 | " Is_Response | \n",
69 | "
\n",
70 | " \n",
71 | " \n",
72 | " \n",
73 | " 0 | \n",
74 | " id10326 | \n",
75 | " The room was kind of clean but had a VERY stro... | \n",
76 | " Edge | \n",
77 | " Mobile | \n",
78 | " not happy | \n",
79 | "
\n",
80 | " \n",
81 | " 1 | \n",
82 | " id10327 | \n",
83 | " I stayed at the Crown Plaza April -- - April -... | \n",
84 | " Internet Explorer | \n",
85 | " Mobile | \n",
86 | " not happy | \n",
87 | "
\n",
88 | " \n",
89 | " 2 | \n",
90 | " id10328 | \n",
91 | " I booked this hotel through Hotwire at the low... | \n",
92 | " Mozilla | \n",
93 | " Tablet | \n",
94 | " not happy | \n",
95 | "
\n",
96 | " \n",
97 | " 3 | \n",
98 | " id10329 | \n",
99 | " Stayed here with husband and sons on the way t... | \n",
100 | " InternetExplorer | \n",
101 | " Desktop | \n",
102 | " happy | \n",
103 | "
\n",
104 | " \n",
105 | " 4 | \n",
106 | " id10330 | \n",
107 | " My girlfriends and I stayed here to celebrate ... | \n",
108 | " Edge | \n",
109 | " Tablet | \n",
110 | " not happy | \n",
111 | "
\n",
112 | " \n",
113 | "
\n",
114 | "
"
115 | ],
116 | "text/plain": [
117 | " User_ID Description \\\n",
118 | "0 id10326 The room was kind of clean but had a VERY stro... \n",
119 | "1 id10327 I stayed at the Crown Plaza April -- - April -... \n",
120 | "2 id10328 I booked this hotel through Hotwire at the low... \n",
121 | "3 id10329 Stayed here with husband and sons on the way t... \n",
122 | "4 id10330 My girlfriends and I stayed here to celebrate ... \n",
123 | "\n",
124 | " Browser_Used Device_Used Is_Response \n",
125 | "0 Edge Mobile not happy \n",
126 | "1 Internet Explorer Mobile not happy \n",
127 | "2 Mozilla Tablet not happy \n",
128 | "3 InternetExplorer Desktop happy \n",
129 | "4 Edge Tablet not happy "
130 | ]
131 | },
132 | "execution_count": 4,
133 | "metadata": {},
134 | "output_type": "execute_result"
135 | }
136 | ],
137 | "source": [
138 | "train.head()"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 7,
144 | "metadata": {
145 | "collapsed": true
146 | },
147 | "outputs": [],
148 | "source": [
149 | "# function to clean data\n",
150 | "\n",
151 | "stops = set(stopwords.words(\"english\"))\n",
152 | "def cleanData(text, lowercase = False, remove_stops = False, stemming = False):\n",
153 | " txt = str(text)\n",
154 | " txt = re.sub(r'[^A-Za-z0-9\\s]',r'',txt)\n",
155 | " txt = re.sub(r'\\n',r' ',txt)\n",
156 | " \n",
157 | " if lowercase:\n",
158 | " txt = \" \".join([w.lower() for w in txt.split()])\n",
159 | " \n",
160 | " if remove_stops:\n",
161 | " txt = \" \".join([w for w in txt.split() if w not in stops])\n",
162 | " \n",
163 | " if stemming:\n",
164 | " st = PorterStemmer()\n",
165 | " txt = \" \".join([st.stem(w) for w in txt.split()])\n",
166 | "\n",
167 | " return txt"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 9,
173 | "metadata": {
174 | "collapsed": false
175 | },
176 | "outputs": [],
177 | "source": [
178 | "## join data\n",
179 | "test['Is_Response'] = np.nan\n",
180 | "alldata = pd.concat([train, test]).reset_index(drop=True)"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 11,
186 | "metadata": {
187 | "collapsed": true
188 | },
189 | "outputs": [],
190 | "source": [
191 | "# clean description\n",
192 | "alldata['Description'] = alldata['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 13,
198 | "metadata": {
199 | "collapsed": false
200 | },
201 | "outputs": [],
202 | "source": [
203 | "# initialise the functions - we'll create separate models for each type.\n",
204 | "countvec = CountVectorizer(analyzer='word', ngram_range = (1,1), min_df=150, max_features=500)\n",
205 | "tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,1), min_df = 150, max_features=500)"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 14,
211 | "metadata": {
212 | "collapsed": true
213 | },
214 | "outputs": [],
215 | "source": [
216 | "# create features\n",
217 | "bagofwords = countvec.fit_transform(alldata['Description'])\n",
218 | "tfidfdata = tfidfvec.fit_transform(alldata['Description'])"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 15,
224 | "metadata": {
225 | "collapsed": false
226 | },
227 | "outputs": [],
228 | "source": [
229 | "# label encode categorical features in data given\n",
230 | "cols = ['Browser_Used','Device_Used']\n",
231 | "\n",
232 | "for x in cols:\n",
233 | " lbl = LabelEncoder()\n",
234 | " alldata[x] = lbl.fit_transform(alldata[x])"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 16,
240 | "metadata": {
241 | "collapsed": false
242 | },
243 | "outputs": [],
244 | "source": [
245 | "# create dataframe for features\n",
246 | "bow_df = pd.DataFrame(bagofwords.todense())\n",
247 | "tfidf_df = pd.DataFrame(tfidfdata.todense())"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 17,
253 | "metadata": {
254 | "collapsed": true
255 | },
256 | "outputs": [],
257 | "source": [
258 | "# set column names\n",
259 | "bow_df.columns = ['col'+ str(x) for x in bow_df.columns]\n",
260 | "tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 18,
266 | "metadata": {
267 | "collapsed": false
268 | },
269 | "outputs": [],
270 | "source": [
271 | "# create separate data frame for bag of words and tf-idf\n",
272 | "\n",
273 | "bow_df_train = bow_df[:len(train)]\n",
274 | "bow_df_test = bow_df[len(train):]\n",
275 | "\n",
276 | "tfid_df_train = tfidf_df[:len(train)]\n",
277 | "tfid_df_test = tfidf_df[len(train):]"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 219,
283 | "metadata": {
284 | "collapsed": true
285 | },
286 | "outputs": [],
287 | "source": [
288 | "# split the merged data file into train and test respectively\n",
289 | "train_feats = alldata[~pd.isnull(alldata.Is_Response)]\n",
290 | "test_feats = alldata[pd.isnull(alldata.Is_Response)]"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 220,
296 | "metadata": {
297 | "collapsed": false
298 | },
299 | "outputs": [
300 | {
301 | "name": "stderr",
302 | "output_type": "stream",
303 | "text": [
304 | "/home/manish/anaconda2/envs/py35/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n",
305 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
306 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
307 | "\n",
308 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
309 | " app.launch_new_instance()\n"
310 | ]
311 | }
312 | ],
313 | "source": [
314 | "### set target variable\n",
315 | "\n",
316 | "train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 24,
322 | "metadata": {
323 | "collapsed": false
324 | },
325 | "outputs": [],
326 | "source": [
327 | "# merge count (bag of word) features into train\n",
328 | "train_feats1 = pd.concat([train_feats[cols], bow_df_train], axis = 1)\n",
329 | "test_feats1 = pd.concat([test_feats[cols], bow_df_test], axis=1)\n",
330 | "\n",
331 | "test_feats1.reset_index(drop=True, inplace=True)"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": 51,
337 | "metadata": {
338 | "collapsed": false
339 | },
340 | "outputs": [],
341 | "source": [
342 | "# merge into a new data frame with tf-idf features\n",
343 | "train_feats2 = pd.concat([train_feats[cols], tfid_df_train], axis=1)\n",
344 | "test_feats2 = pd.concat([test_feats[cols], tfid_df_test], axis=1)"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {},
350 | "source": [
351 | "### NaiveBayes"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 28,
357 | "metadata": {
358 | "collapsed": true
359 | },
360 | "outputs": [],
361 | "source": [
362 | "# let's check cross validation score of the model\n",
363 | "# the cv score acts as an unbiased estimate of the model's accuracy on unseen data\n",
364 | "\n",
365 | "mod1 = GaussianNB()\n",
366 | "target = train_feats['Is_Response']"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": 31,
372 | "metadata": {
373 | "collapsed": false
374 | },
375 | "outputs": [
376 | {
377 | "name": "stdout",
378 | "output_type": "stream",
379 | "text": [
380 | "[ 0.76311844 0.7745 0.7515 0.765 0.75837919]\n"
381 | ]
382 | }
383 | ],
384 | "source": [
385 | "## Naive Bayes 1\n",
386 | "print(cross_val_score(mod1, train_feats1, target, cv=5, scoring=make_scorer(accuracy_score)))"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 32,
392 | "metadata": {
393 | "collapsed": false
394 | },
395 | "outputs": [
396 | {
397 | "name": "stdout",
398 | "output_type": "stream",
399 | "text": [
400 | "[ 0.79310345 0.811 0.8035 0.815 0.79789895]\n"
401 | ]
402 | }
403 | ],
404 | "source": [
405 | "## Naive Bayes 2 - tfidf is giving higher CV score\n",
406 | "print(cross_val_score(mod1, train_feats2, target, cv=5, scoring=make_scorer(accuracy_score)))"
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": 36,
412 | "metadata": {
413 | "collapsed": false
414 | },
415 | "outputs": [
416 | {
417 | "data": {
418 | "text/plain": [
419 | "GaussianNB(priors=None)"
420 | ]
421 | },
422 | "execution_count": 36,
423 | "metadata": {},
424 | "output_type": "execute_result"
425 | }
426 | ],
427 | "source": [
428 | "# make our first set of predictions\n",
429 | "\n",
430 | "clf1 = GaussianNB()\n",
431 | "clf1.fit(train_feats1, target)\n",
432 | "\n",
433 | "clf2 = GaussianNB()\n",
434 | "clf2.fit(train_feats2, target)"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 41,
440 | "metadata": {
441 | "collapsed": false
442 | },
443 | "outputs": [],
444 | "source": [
445 | "preds1 = clf1.predict(test_feats1)\n",
446 | "preds2 = clf2.predict(test_feats2)"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": 54,
452 | "metadata": {
453 | "collapsed": false
454 | },
455 | "outputs": [],
456 | "source": [
457 | "def to_labels(x):\n",
458 | " if x == 1:\n",
459 | " return \"happy\"\n",
460 | " return \"not_happy\""
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 68,
466 | "metadata": {
467 | "collapsed": false
468 | },
469 | "outputs": [],
470 | "source": [
471 | "sub1 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds1})\n",
472 | "sub1['Is_Response'] = sub1['Is_Response'].map(lambda x: to_labels(x))"
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": 72,
478 | "metadata": {
479 | "collapsed": false
480 | },
481 | "outputs": [],
482 | "source": [
483 | "sub2 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds2})\n",
484 | "sub2['Is_Response'] = sub2['Is_Response'].map(lambda x: to_labels(x))"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": 83,
490 | "metadata": {
491 | "collapsed": false
492 | },
493 | "outputs": [],
494 | "source": [
495 | "sub1 = sub1[['User_ID', 'Is_Response']]\n",
496 | "sub2 = sub2[['User_ID', 'Is_Response']]"
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": 85,
502 | "metadata": {
503 | "collapsed": false
504 | },
505 | "outputs": [],
506 | "source": [
507 | "## write submission files\n",
508 | "sub1.to_csv('submissions/sub1_cv.csv', index=False)\n",
509 | "sub2.to_csv('submissions/sub2_tf.csv', index=False)"
510 | ]
511 | },
512 | {
513 | "cell_type": "markdown",
514 | "metadata": {},
515 | "source": [
516 | "### LightGBM - 1\n",
517 | "\n",
518 | "We prefer LightGBM over XGBoost because of its speed.
\n",
519 | "In this model, we'll use count features for model training."
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": 87,
525 | "metadata": {
526 | "collapsed": true
527 | },
528 | "outputs": [],
529 | "source": [
530 | "import lightgbm as lgb"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 88,
536 | "metadata": {
537 | "collapsed": false
538 | },
539 | "outputs": [],
540 | "source": [
541 | "# set the data in format lgb accepts\n",
542 | "d_train = lgb.Dataset(train_feats1, label = target)"
543 | ]
544 | },
545 | {
546 | "cell_type": "code",
547 | "execution_count": 89,
548 | "metadata": {
549 | "collapsed": true
550 | },
551 | "outputs": [],
552 | "source": [
553 | "## set parameters\n",
554 | "## you can tune the parameters to try to get a better score\n",
555 | "\n",
556 | "params = {'task': 'train',\n",
557 | " 'boosting_type': 'gbdt',\n",
558 | " 'objective': 'binary',\n",
559 | " 'metric': 'binary_error',\n",
560 | " 'learning_rate': 0.05, \n",
561 | " 'max_depth': 7, \n",
562 | " 'num_leaves': 21, \n",
563 | " 'feature_fraction': 0.3, \n",
564 | " 'bagging_fraction': 0.8, \n",
565 | " 'bagging_freq': 5}"
566 | ]
567 | },
568 | {
569 | "cell_type": "code",
570 | "execution_count": 98,
571 | "metadata": {
572 | "collapsed": false,
573 | "scrolled": true
574 | },
575 | "outputs": [
576 | {
577 | "name": "stdout",
578 | "output_type": "stream",
579 | "text": [
580 | "[20]\tcv_agg's binary_error: 0.2132 + 0.00456488\n",
581 | "[40]\tcv_agg's binary_error: 0.195401 + 0.00625882\n",
582 | "[60]\tcv_agg's binary_error: 0.175601 + 0.00580722\n",
583 | "[80]\tcv_agg's binary_error: 0.1652 + 0.00589807\n",
584 | "[100]\tcv_agg's binary_error: 0.1568 + 0.00628195\n",
585 | "[120]\tcv_agg's binary_error: 0.1505 + 0.00328588\n",
586 | "[140]\tcv_agg's binary_error: 0.1487 + 0.00399728\n",
587 | "[160]\tcv_agg's binary_error: 0.147301 + 0.00497347\n",
588 | "[180]\tcv_agg's binary_error: 0.1445 + 0.00362296\n",
589 | "[200]\tcv_agg's binary_error: 0.1439 + 0.00429358\n",
590 | "[220]\tcv_agg's binary_error: 0.1417 + 0.00200147\n",
591 | "[240]\tcv_agg's binary_error: 0.1418 + 0.0040771\n",
592 | "[260]\tcv_agg's binary_error: 0.1401 + 0.00373791\n",
593 | "[280]\tcv_agg's binary_error: 0.1389 + 0.00517039\n",
594 | "[300]\tcv_agg's binary_error: 0.1376 + 0.00466764\n",
595 | "[320]\tcv_agg's binary_error: 0.136901 + 0.00507148\n",
596 | "[340]\tcv_agg's binary_error: 0.1357 + 0.00529898\n",
597 | "[360]\tcv_agg's binary_error: 0.1363 + 0.00505334\n",
598 | "[380]\tcv_agg's binary_error: 0.1353 + 0.0044035\n",
599 | "[400]\tcv_agg's binary_error: 0.1356 + 0.00428458\n",
600 | "[420]\tcv_agg's binary_error: 0.134501 + 0.00445613\n"
601 | ]
602 | }
603 | ],
604 | "source": [
605 | "lgb_cv = lgb.cv(params, d_train, num_boost_round=500, nfold= 5, shuffle=True, stratified=True, verbose_eval=20, early_stopping_rounds=40)"
606 | ]
607 | },
608 | {
609 | "cell_type": "code",
610 | "execution_count": 126,
611 | "metadata": {
612 | "collapsed": false
613 | },
614 | "outputs": [],
615 | "source": [
616 | "## get the nround value which had the lowest error\n",
617 | "nround = lgb_cv['binary_error-mean'].index(np.min(lgb_cv['binary_error-mean']))"
618 | ]
619 | },
620 | {
621 | "cell_type": "code",
622 | "execution_count": 129,
623 | "metadata": {
624 | "collapsed": false
625 | },
626 | "outputs": [],
627 | "source": [
628 | "## train the model\n",
629 | "model = lgb.train(params, d_train, num_boost_round=nround)"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": 130,
635 | "metadata": {
636 | "collapsed": true
637 | },
638 | "outputs": [],
639 | "source": [
640 | "## make predictions\n",
641 | "preds = model.predict(test_feats1)"
642 | ]
643 | },
644 | {
645 | "cell_type": "code",
646 | "execution_count": 157,
647 | "metadata": {
648 | "collapsed": true
649 | },
650 | "outputs": [],
651 | "source": [
652 | "# make submission\n",
653 | "\n",
654 | "def to_labels(x):\n",
655 | " if x > 0.66: # cutoff - you can change it and see if accuracy improves or plot AUC curve. \n",
656 | " return \"happy\"\n",
657 | " return \"not_happy\"\n",
658 | "\n",
659 | "sub3 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds})\n",
660 | "sub3['Is_Response'] = sub3['Is_Response'].map(lambda x: to_labels(x))\n",
661 | "sub3 = sub3[['User_ID','Is_Response']]\n",
662 | "sub3.to_csv('submissions/sub3_lgb.csv', index=False) # 0.85518"
663 | ]
664 | },
665 | {
666 | "cell_type": "markdown",
667 | "metadata": {},
668 | "source": [
669 | "### LightGBM - 2\n",
670 | "\n",
671 | "In this model, we'll use tf-idf features for model training."
672 | ]
673 | },
674 | {
675 | "cell_type": "code",
676 | "execution_count": 140,
677 | "metadata": {
678 | "collapsed": false
679 | },
680 | "outputs": [],
681 | "source": [
682 | "# set data format\n",
683 | "d_train = lgb.Dataset(train_feats2, label = target)"
684 | ]
685 | },
686 | {
687 | "cell_type": "code",
688 | "execution_count": 144,
689 | "metadata": {
690 | "collapsed": true
691 | },
692 | "outputs": [],
693 | "source": [
694 | "# same parameters as above, except for smaller trees (max_depth 5, num_leaves 11)\n",
695 | "params = {'task': 'train',\n",
696 | " 'boosting_type': 'gbdt',\n",
697 | " 'objective': 'binary',\n",
698 | " 'metric': 'binary_error',\n",
699 | " 'learning_rate': 0.05, \n",
700 | " 'max_depth': 5, \n",
701 | " 'num_leaves': 11,\n",
702 | " 'feature_fraction': 0.3, \n",
703 | " 'bagging_fraction': 0.8, \n",
704 | " 'bagging_freq': 5}"
705 | ]
706 | },
707 | {
708 | "cell_type": "code",
709 | "execution_count": 145,
710 | "metadata": {
711 | "collapsed": false,
712 | "scrolled": true
713 | },
714 | "outputs": [
715 | {
716 | "name": "stdout",
717 | "output_type": "stream",
718 | "text": [
719 | "[20]\tcv_agg's binary_error: 0.226401 + 0.00518217\n",
720 | "[40]\tcv_agg's binary_error: 0.206602 + 0.00687761\n",
721 | "[60]\tcv_agg's binary_error: 0.183302 + 0.00791949\n",
722 | "[80]\tcv_agg's binary_error: 0.169801 + 0.00512283\n",
723 | "[100]\tcv_agg's binary_error: 0.164301 + 0.00632119\n",
724 | "[120]\tcv_agg's binary_error: 0.1578 + 0.00507081\n",
725 | "[140]\tcv_agg's binary_error: 0.1542 + 0.00524522\n",
726 | "[160]\tcv_agg's binary_error: 0.1516 + 0.00441669\n",
727 | "[180]\tcv_agg's binary_error: 0.148701 + 0.00512212\n",
728 | "[200]\tcv_agg's binary_error: 0.1461 + 0.00366096\n",
729 | "[220]\tcv_agg's binary_error: 0.1443 + 0.00362658\n",
730 | "[240]\tcv_agg's binary_error: 0.1437 + 0.00471092\n",
731 | "[260]\tcv_agg's binary_error: 0.143501 + 0.00450031\n",
732 | "[280]\tcv_agg's binary_error: 0.1405 + 0.00506581\n",
733 | "[300]\tcv_agg's binary_error: 0.1396 + 0.00578611\n",
734 | "[320]\tcv_agg's binary_error: 0.137801 + 0.00687571\n",
735 | "[340]\tcv_agg's binary_error: 0.138701 + 0.00679821\n",
736 | "[360]\tcv_agg's binary_error: 0.137201 + 0.00839438\n",
737 | "[380]\tcv_agg's binary_error: 0.137501 + 0.00738158\n",
738 | "[400]\tcv_agg's binary_error: 0.136401 + 0.00735946\n",
739 | "[420]\tcv_agg's binary_error: 0.136101 + 0.00702239\n",
740 | "[440]\tcv_agg's binary_error: 0.136901 + 0.00739423\n"
741 | ]
742 | }
743 | ],
744 | "source": [
745 | "## do cross validation to find nround i.e. at this round (iteration) we can expect lowest error\n",
746 | "lgb_cv = lgb.cv(params, d_train, num_boost_round=500, nfold= 5, shuffle=True, stratified=True, verbose_eval=20, early_stopping_rounds=40)"
747 | ]
748 | },
749 | {
750 | "cell_type": "code",
751 | "execution_count": 146,
752 | "metadata": {
753 | "collapsed": false
754 | },
755 | "outputs": [],
756 | "source": [
757 | "# get nround value\n",
758 | "nround = lgb_cv['binary_error-mean'].index(np.min(lgb_cv['binary_error-mean']))"
759 | ]
760 | },
761 | {
762 | "cell_type": "code",
763 | "execution_count": 159,
764 | "metadata": {
765 | "collapsed": false
766 | },
767 | "outputs": [],
768 | "source": [
769 | "# train model\n",
770 | "model = lgb.train(params, d_train, num_boost_round=nround)"
771 | ]
772 | },
773 | {
774 | "cell_type": "code",
775 | "execution_count": 160,
776 | "metadata": {
777 | "collapsed": true
778 | },
779 | "outputs": [],
780 | "source": [
781 | "# make prediction\n",
782 | "preds = model.predict(test_feats2)"
783 | ]
784 | },
785 | {
786 | "cell_type": "code",
787 | "execution_count": 162,
788 | "metadata": {
789 | "collapsed": true
790 | },
791 | "outputs": [],
792 | "source": [
793 | "# make submission\n",
794 | "\n",
795 | "def to_labels(x):\n",
796 | " if x > 0.66:\n",
797 | " return \"happy\"\n",
798 | " return \"not_happy\"\n",
799 | "\n",
800 | "sub4 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds})\n",
801 | "sub4['Is_Response'] = sub4['Is_Response'].map(lambda x: to_labels(x))\n",
802 | "sub4 = sub4[['User_ID','Is_Response']]\n",
803 | "sub4.to_csv('submissions/sub4_lgb.csv', index=False) # 0.84925"
804 | ]
805 | },
806 | {
807 | "cell_type": "markdown",
808 | "metadata": {},
809 | "source": [
810 | "### CatBoost\n",
811 | "\n",
812 | "CatBoost is a new package recently launched by Yandex. It is said to work well when the data has many categorical features. We'll use it on the count data and see if our model improves."
813 | ]
814 | },
815 | {
816 | "cell_type": "code",
817 | "execution_count": null,
818 | "metadata": {
819 | "collapsed": true
820 | },
821 | "outputs": [],
822 | "source": [
823 | "## import library\n",
824 | "from catboost import CatBoostClassifier,cv, Pool"
825 | ]
826 | },
827 | {
828 | "cell_type": "code",
829 | "execution_count": 193,
830 | "metadata": {
831 | "collapsed": true
832 | },
833 | "outputs": [],
834 | "source": [
835 | "## catboost accepts categorical columns as a list of column numbers. In this data, all columns are categorical\n",
836 | "cat_cols = [x for x in range(502)] ## 502 == train_feats1.shape[1] "
837 | ]
838 | },
839 | {
840 | "cell_type": "code",
841 | "execution_count": 196,
842 | "metadata": {
843 | "collapsed": true
844 | },
845 | "outputs": [],
846 | "source": [
847 | "## set parameters\n",
848 | "## you can refer to the parameters here: https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/#python-reference_parameters-list\n",
849 | "param = {\n",
850 | " 'use_best_model':True,\n",
851 | " 'loss_function':'CrossEntropy',\n",
852 | " 'eval_metric':'Accuracy',\n",
853 | " 'iterations':1000,\n",
854 | " 'depth':6,\n",
855 | " 'learning_rate':0.03,\n",
856 | " 'rsm':0.3,\n",
857 | " 'random_seed':2017,\n",
858 | " \n",
859 | " \n",
860 | "}"
861 | ]
862 | },
863 | {
864 | "cell_type": "code",
865 | "execution_count": 197,
866 | "metadata": {
867 | "collapsed": false
868 | },
869 | "outputs": [],
870 | "source": [
871 | "## for doing cross validation, set data in Pool format\n",
872 | "my_dt = Pool(train_feats1, \n",
873 | " label=target,\n",
874 | " cat_features=cat_cols,\n",
875 | " column_description=None,\n",
876 | " delimiter='\\t',\n",
877 | " has_header=None,\n",
878 | " weight=None, \n",
879 | " baseline=None,\n",
880 | " feature_names=None,\n",
881 | " thread_count=1)"
882 | ]
883 | },
884 | {
885 | "cell_type": "code",
886 | "execution_count": 198,
887 | "metadata": {
888 | "collapsed": false,
889 | "scrolled": true
890 | },
891 | "outputs": [
892 | {
893 | "name": "stdout",
894 | "output_type": "stream",
895 | "text": [
896 | "Borders generated\n",
897 | "0:\t0:\t0:\t0:\t0:\t1:\t1:\t1:\t1:\t1:\t2:\t2:\t2:\t2:\t2:\t3:\t3:\t3:\t3:\t3:\t4:\t4:\t4:\t4:\t4:\t5:\t5:\t5:\t5:\t5:\t6:\t6:\t6:\t6:\t6:\t7:\t7:\t7:\t7:\t7:\t8:\t8:\t8:\t8:\t8:\t9:\t9:\t9:\t9:\t9:\t10:\t10:\t10:\t10:\t10:\t11:\t11:\t11:\t11:\t11:\t12:\t12:\t12:\t12:\t12:\t13:\t13:\t13:\t13:\t13:\t14:\t14:\t14:\t14:\t14:\t15:\t15:\t15:\t15:\t15:\t16:\t16:\t16:\t16:\t16:\t17:\t17:\t17:\t17:\t17:\t18:\t18:\t18:\t18:\t18:\t19:\t19:\t19:\t19:\t19:\t20:\t20:\t20:\t20:\t20:\t21:\t21:\t21:\t21:\t21:\t22:\t22:\t22:\t22:\t22:\t23:\t23:\t23:\t23:\t23:\t24:\t24:\t24:\t24:\t24:\t25:\t25:\t25:\t25:\t25:\t26:\t26:\t26:\t26:\t26:\t27:\t27:\t27:\t27:\t27:\t28:\t28:\t28:\t28:\t28:\t29:\t29:\t29:\t29:\t29:\t30:\t30:\t30:\t30:\t30:\t31:\t31:\t31:\t31:\t31:\t32:\t32:\t32:\t32:\t32:\t33:\t33:\t33:\t33:\t33:\t34:\t34:\t34:\t34:\t34:\t35:\t35:\t35:\t35:\t35:\t36:\t36:\t36:\t36:\t36:\t37:\t37:\t37:\t37:\t37:\t38:\t38:\t38:\t38:\t38:\t39:\t39:\t39:\t39:\t39:\t40:\t40:\t40:\t40:\t40:\t41:\t41:\t41:\t41:\t41:\t42:\t42:\t42:\t42:\t42:\t43:\t43:\t43:\t43:\t43:\t44:\t44:\t44:\t44:\t44:\t45:\t45:\t45:\t45:\t45:\t46:\t46:\t46:\t46:\t46:\t47:\t47:\t47:\t47:\t47:\t48:\t48:\t48:\t48:\t48:\t49:\t49:\t49:\t49:\t49:\t50:\t50:\t50:\t50:\t50:\t51:\t51:\t51:\t51:\t51:\t52:\t52:\t52:\t52:\t52:\t53:\t53:\t53:\t53:\t53:\t54:\t54:\t54:\t54:\t54:\t55:\t55:\t55:\t55:\t55:\t56:\t56:\t56:\t56:\t56:\t57:\t57:\t57:\t57:\t57:\t58:\t58:\t58:\t58:\t58:\t59:\t59:\t59:\t59:\t59:\t60:\t60:\t60:\t60:\t60:\t61:\t61:\t61:\t61:\t61:\t62:\t62:\t62:\t62:\t62:\t63:\t63:\t63:\t63:\t63:\t64:\t64:\t64:\t64:\t64:\t65:\t65:\t65:\t65:\t65:\t66:\t66:\t66:\t66:\t66:\t67:\t67:\t67:\t67:\t67:\t68:\t68:\t68:\t68:\t68:\t69:\t69:\t69:\t69:\t69:\t70:\t70:\t70:\t70:\t70:\t71:\t71:\t71:\t71:\t71:\t72:\t72:\t72:\t72:\t72:\t73:\t73:\t73:\t73:\t73:\t74:\t74:\t74:\t74:\t74:\t75:\t75:\t75:\t75:\t75:\t76:\t76:\t76:\t76:\t76:\t77:\t77:\t77:\t77:\t77:\t78:\t78:\t78:\t78:\t78:\t79:\t79:\t79:\t79:\t79:\t80:\t80:\t80:\t80:\t80:\t81:\t81:\t81:\t81:
\t81:\t82:\t82:\t82:\t82:\t82:\t83:\t83:\t83:\t83:\t83:\t84:\t84:\t84:\t84:\t84:\t85:\t85:\t85:\t85:\t85:\t86:\t86:\t86:\t86:\t86:\t87:\t87:\t87:\t87:\t87:\t88:\t88:\t88:\t88:\t88:\t89:\t89:\t89:\t89:\t89:\t90:\t90:\t90:\t90:\t90:\t91:\t91:\t91:\t91:\t91:\t92:\t92:\t92:\t92:\t92:\t93:\t93:\t93:\t93:\t93:\t94:\t94:\t94:\t94:\t94:\t95:\t95:\t95:\t95:\t95:\t96:\t96:\t96:\t96:\t96:\t97:\t97:\t97:\t97:\t97:\t98:\t98:\t98:\t98:\t98:\t99:\t99:\t99:\t99:\t99:\t100:\t100:\t100:\t100:\t100:\t101:\t101:\t101:\t101:\t101:\t102:\t102:\t102:\t102:\t102:\t103:\t103:\t103:\t103:\t103:\t104:\t104:\t104:\t104:\t104:\t105:\t105:\t105:\t105:\t105:\t106:\t106:\t106:\t106:\t106:\t107:\t107:\t107:\t107:\t107:\t108:\t108:\t108:\t108:\t108:\t109:\t109:\t109:\t109:\t109:\t110:\t110:\t110:\t110:\t110:\t111:\t111:\t111:\t111:\t111:\t112:\t112:\t112:\t112:\t112:\t113:\t113:\t113:\t113:\t113:\t114:\t114:\t114:\t114:\t114:\t115:\t115:\t115:\t115:\t115:\t116:\t116:\t116:\t116:\t116:\t117:\t117:\t117:\t117:\t117:\t118:\t118:\t118:\t118:\t118:\t119:\t119:\t119:\t119:\t119:\t120:\t120:\t120:\t120:\t120:\t121:\t121:\t121:\t121:\t121:\t122:\t122:\t122:\t122:\t122:\t123:\t123:\t123:\t123:\t123:\t124:\t124:\t124:\t124:\t124:\t125:\t125:\t125:\t125:\t125:\t126:\t126:\t126:\t126:\t126:\t127:\t127:\t127:\t127:\t127:\t128:\t128:\t128:\t128:\t128:\t129:\t129:\t129:\t129:\t129:\t130:\t130:\t130:\t130:\t130:\t131:\t131:\t131:\t131:\t131:\t132:\t132:\t132:\t132:\t132:\t133:\t133:\t133:\t133:\t133:\t134:\t134:\t134:\t134:\t134:\t135:\t135:\t135:\t135:\t135:\t136:\t136:\t136:\t136:\t136:\t137:\t137:\t137:\t137:\t137:\t138:\t138:\t138:\t138:\t138:\t139:\t139:\t139:\t139:\t139:\t140:\t140:\t140:\t140:\t140:\t141:\t141:\t141:\t141:\t141:\t142:\t142:\t142:\t142:\t142:\t143:\t143:\t143:\t143:\t143:\t144:\t144:\t144:\t144:\t144:\t145:\t145:\t145:\t145:\t145:\t146:\t146:\t146:\t146:\t146:\t147:\t147:\t147:\t147:\t147:\t148:\t148:\t148:\t148:\t148:\t149:\t149:\t149:\t149:\t149:\t150:\t150:\t150:\t150:\t150:\t151:\t151:\t1
51:\t151:\t151:\t152:\t152:\t152:\t152:\t152:\t153:\t153:\t153:\t153:\t153:\t154:\t154:\t154:\t154:\t154:\t155:\t155:\t155:\t155:\t155:\t156:\t156:\t156:\t156:\t156:\t157:\t157:\t157:\t157:\t157:\t158:\t158:\t158:\t158:\t158:\t159:\t159:\t159:\t159:\t159:\t160:\t160:\t160:\t160:\t160:\t161:\t161:\t161:\t161:\t161:\t162:\t162:\t162:\t162:\t162:\t163:\t163:\t163:\t163:\t163:\t164:\t164:\t164:\t164:\t164:\t165:\t165:\t165:\t165:\t165:\t166:\t166:\t166:\t166:\t166:\t167:\t167:\t167:\t167:\t167:\t168:\t168:\t168:\t168:\t168:\t169:\t169:\t169:\t169:\t169:\t170:\t170:\t170:\t170:\t170:\t171:\t171:\t171:\t171:\t171:\t172:\t172:\t172:\t172:\t172:\t173:\t173:\t173:\t173:\t173:\t174:\t174:\t174:\t174:\t174:\t175:\t175:\t175:\t175:\t175:\t176:\t176:\t176:\t176:\t176:\t177:\t177:\t177:\t177:\t177:\t178:\t178:\t178:\t178:\t178:\t179:\t179:\t179:\t179:\t179:\t180:\t180:\t180:\t180:\t180:\t181:\t181:\t181:\t181:\t181:\t182:\t182:\t182:\t182:\t182:\t183:\t183:\t183:\t183:\t183:\t184:\t184:\t184:\t184:\t184:\t185:\t185:\t185:\t185:\t185:\t186:\t186:\t186:\t186:\t186:\t187:\t187:\t187:\t187:\t187:\t188:\t188:\t188:\t188:\t188:\t189:\t189:\t189:\t189:\t189:\t190:\t190:\t190:\t190:\t190:\t191:\t191:\t191:\t191:\t191:\t192:\t192:\t192:\t192:\t192:\t193:\t193:\t193:\t193:\t193:\t194:\t194:\t194:\t194:\t194:\t195:\t195:\t195:\t195:\t195:\t196:\t196:\t196:\t196:\t196:\t197:\t197:\t197:\t197:\t197:\t198:\t198:\t198:\t198:\t198:\t199:\t199:\t199:\t199:\t199:\t200:\t200:\t200:\t200:\t200:\t201:\t201:\t201:\t201:\t201:\t202:\t202:\t202:\t202:\t202:\t203:\t203:\t203:\t203:\t203:\t204:\t204:\t204:\t204:\t204:\t205:\t205:\t205:\t205:\t205:\t206:\t206:\t206:\t206:\t206:\t207:\t207:\t207:\t207:\t207:\t208:\t208:\t208:\t208:\t208:\t209:\t209:\t209:\t209:\t209:\t210:\t210:\t210:\t210:\t210:\t211:\t211:\t211:\t211:\t211:\t212:\t212:\t212:\t212:\t212:\t213:\t213:\t213:\t213:\t213:\t214:\t214:\t214:\t214:\t214:\t215:\t215:\t215:\t215:\t215:\t216:\t216:\t216:\t216:\t216:\t217:\t217:\t217:\t217:\t217:\t218
:\t218:\t218:\t218:\t218:\t219:\t219:\t219:\t219:\t219:\t220:\t220:\t220:\t220:\t220:\t221:\t221:\t221:\t221:\t221:\t222:\t222:\t222:\t222:\t222:\t223:\t223:\t223:\t223:\t223:\t224:\t224:\t224:\t224:\t224:\t225:\t225:\t225:\t225:\t225:\t226:\t226:\t226:\t226:\t226:\t227:\t227:\t227:\t227:\t227:\t228:\t228:\t228:\t228:\t228:\t229:\t229:\t229:\t229:\t229:\t230:\t230:\t230:\t230:\t230:\t231:\t231:\t231:\t231:\t231:\t232:\t232:\t232:\t232:\t232:\t233:\t233:\t233:\t233:\t233:\t234:\t234:\t234:\t234:\t234:\t235:\t235:\t235:\t235:\t235:\t236:\t236:\t236:\t236:\t236:\t237:\t237:\t237:\t237:\t237:\t238:\t238:\t238:\t238:\t238:\t239:\t239:\t239:\t239:\t239:\t240:\t240:\t240:\t240:\t240:\t241:\t241:\t241:\t241:\t241:\t242:\t242:\t242:\t242:\t242:\t243:\t243:\t243:\t243:\t243:\t244:\t244:\t244:\t244:\t244:\t245:\t245:\t245:\t245:\t245:\t246:\t246:\t246:\t246:\t246:\t247:\t247:\t247:\t247:\t247:\t248:\t248:\t248:\t248:\t248:\t249:\t249:\t249:\t249:\t249:\t250:\t250:\t250:\t250:\t250:\t251:\t251:\t251:\t251:\t251:\t252:\t252:\t252:\t252:\t252:\t253:\t253:\t253:\t253:\t253:\t254:\t254:\t254:\t254:\t254:\t255:\t255:\t255:\t255:\t255:\t256:\t256:\t256:\t256:\t256:\t257:\t257:\t257:\t257:\t257:\t258:\t258:\t258:\t258:\t258:\t259:\t259:\t259:\t259:\t259:\t260:\t260:\t260:\t260:\t260:\t261:\t261:\t261:\t261:\t261:\t262:\t262:\t262:\t262:\t262:\t263:\t263:\t263:\t263:\t263:\t264:\t264:\t264:\t264:\t264:\t265:\t265:\t265:\t265:\t265:\t266:\t266:\t266:\t266:\t266:\t267:\t267:\t267:\t267:\t267:\t268:\t268:\t268:\t268:\t268:\t269:\t269:\t269:\t269:\t269:\t270:\t270:\t270:\t270:\t270:\t271:\t271:\t271:\t271:\t271:\t272:\t272:\t272:\t272:\t272:\t273:\t273:\t273:\t273:\t273:\t274:\t274:\t274:\t274:\t274:\t275:\t275:\t275:\t275:\t275:\t276:\t276:\t276:\t276:\t276:\t277:\t277:\t277:\t277:\t277:\t278:\t278:\t278:\t278:\t278:\t279:\t279:\t279:\t279:\t279:\t280:\t280:\t280:\t280:\t280:\t281:\t281:\t281:\t281:\t281:\t282:\t282:\t282:\t282:\t282:\t283:\t283:\t283:\t283:\t283:\t284:\t284:\t284:\t284:\
t284:\t285:\t285:\t285:\t285:\t285:\t286:\t286:\t286:\t286:\t286:\t287:\t287:\t287:\t287:\t287:\t288:\t288:\t288:\t288:\t288:\t289:\t289:\t289:\t289:\t289:\t290:\t290:\t290:\t290:\t290:\t291:\t291:\t291:\t291:\t291:\t292:\t292:\t292:\t292:\t292:\t293:\t293:\t293:\t293:\t293:\t294:\t294:\t294:\t294:\t294:\t295:\t295:\t295:\t295:\t295:\t296:\t296:\t296:\t296:\t296:\t297:\t297:\t297:\t297:\t297:\t298:\t298:\t298:\t298:\t298:\t299:\t299:\t299:\t299:\t299:\t300:\t300:\t300:\t300:\t300:\t301:\t301:\t301:\t301:\t301:\t302:\t302:\t302:\t302:\t302:\t303:\t303:\t303:\t303:\t303:\t304:\t304:\t304:\t304:\t304:\t305:\t305:\t305:\t305:\t305:\t306:\t306:\t306:\t306:\t306:\t307:\t307:\t307:\t307:\t307:\t308:\t308:\t308:\t308:\t308:\t309:\t309:\t309:\t309:\t309:\t310:\t310:\t310:\t310:\t310:\t311:\t311:\t311:\t311:\t311:\t312:\t312:\t312:\t312:\t312:\t313:\t313:\t313:\t313:\t313:\t314:\t314:\t314:\t314:\t314:\t315:\t315:\t315:\t315:\t315:\t316:\t316:\t316:\t316:\t316:\t317:\t317:\t317:\t317:\t317:\t318:\t318:\t318:\t318:\t318:\t319:\t319:\t319:\t319:\t319:\t320:\t320:\t320:\t320:\t320:\t321:\t321:\t321:\t321:\t321:\t322:\t322:\t322:\t322:\t322:\t323:\t323:\t323:\t323:\t323:\t324:\t324:\t324:\t324:\t324:\t325:\t325:\t325:\t325:\t325:\t326:\t326:\t326:\t326:\t326:\t327:\t327:\t327:\t327:\t327:\t328:\t328:\t328:\t328:\t328:\t329:\t329:\t329:\t329:\t329:\t330:\t330:\t330:\t330:\t330:\t331:\t331:\t331:\t331:\t331:\t332:\t332:\t332:\t332:\t332:\t333:\t333:\t333:\t333:\t333:\t334:\t334:\t334:\t334:\t334:\t335:\t335:\t335:\t335:\t335:\t336:\t336:\t336:\t336:\t336:\t337:\t337:\t337:\t337:\t337:\t338:\t338:\t338:\t338:\t338:\t339:\t339:\t339:\t339:\t339:\t340:\t340:\t340:\t340:\t340:\t341:\t341:\t341:\t341:\t341:\t342:\t342:\t342:\t342:\t342:\t343:\t343:\t343:\t343:\t343:\t344:\t344:\t344:\t344:\t344:\t345:\t345:\t345:\t345:\t345:\t346:\t346:\t346:\t346:\t346:\t347:\t347:\t347:\t347:\t347:\t348:\t348:\t348:\t348:\t348:\t349:\t349:\t349:\t349:\t349:\t350:\t350:\t350:\t350:\t350:\t351:\t351:\t3
51:\t351:\t351:\t352:\t352:\t352:\t352:\t352:\t353:\t353:\t353:\t353:\t353:\t354:\t354:\t354:\t354:\t354:\t355:\t355:\t355:\t355:\t355:\t356:\t356:\t356:\t356:\t356:\t357:\t357:\t357:\t357:\t357:\t358:\t358:\t358:\t358:\t358:\t359:\t359:\t359:\t359:\t359:\t360:\t360:\t360:\t360:\t360:\t361:\t361:\t361:\t361:\t361:\t362:\t362:\t362:\t362:\t362:\t363:\t363:\t363:\t363:\t363:\t364:\t364:\t364:\t364:\t364:\t365:\t365:\t365:\t365:\t365:\t366:\t366:\t366:\t366:\t366:\t367:\t367:\t367:\t367:\t367:\t368:\t368:\t368:\t368:\t368:\t369:\t369:\t369:\t369:\t369:\t370:\t370:\t370:\t370:\t370:\t371:\t371:\t371:\t371:\t371:\t372:\t372:\t372:\t372:\t372:\t373:\t373:\t373:\t373:\t373:\t374:\t374:\t374:\t374:\t374:\t375:\t375:\t375:\t375:\t375:\t376:\t376:\t376:\t376:\t376:\t377:\t377:\t377:\t377:\t377:\t378:\t378:\t378:\t378:\t378:\t379:\t379:\t379:\t379:\t379:\t380:\t380:\t380:\t380:\t380:\t381:\t381:\t381:\t381:\t381:\t382:\t382:\t382:\t382:\t382:\t383:\t383:\t383:\t383:\t383:\t384:\t384:\t384:\t384:\t384:\t385:\t385:\t385:\t385:\t385:\t386:\t386:\t386:\t386:\t386:\t387:\t387:\t387:\t387:\t387:\t388:\t388:\t388:\t388:\t388:\t389:\t389:\t389:\t389:\t389:\t390:\t390:\t390:\t390:\t390:\t391:\t391:\t391:\t391:\t391:\t392:\t392:\t392:\t392:\t392:\t393:\t393:\t393:\t393:\t393:\t394:\t394:\t394:\t394:\t394:\t395:\t395:\t395:\t395:\t395:\t396:\t396:\t396:\t396:\t396:\t397:\t397:\t397:\t397:\t397:\t398:\t398:\t398:\t398:\t398:\t399:\t399:\t399:\t399:\t399:\t400:\t400:\t400:\t400:\t400:\t401:\t401:\t401:\t401:\t401:\t402:\t402:\t402:\t402:\t402:\t403:\t403:\t403:\t403:\t403:\t404:\t404:\t404:\t404:\t404:\t405:\t405:\t405:\t405:\t405:\t406:\t406:\t406:\t406:\t406:\t407:\t407:\t407:\t407:\t407:\t408:\t408:\t408:\t408:\t408:\t409:\t409:\t409:\t409:\t409:\t410:\t410:\t410:\t410:\t410:\t411:\t411:\t411:\t411:\t411:\t412:\t412:\t412:\t412:\t412:\t413:\t413:\t413:\t413:\t413:\t414:\t414:\t414:\t414:\t414:\t415:\t415:\t415:\t415:\t415:\t416:\t416:\t416:\t416:\t416:\t417:\t417:\t417:\t417:\t417:\t418
:\t418:\t418:\t418:\t418:\t419:\t419:\t419:\t419:\t419:\t420:\t420:\t420:\t420:\t420:\t421:\t421:\t421:\t421:\t421:\t422:\t422:\t422:\t422:\t422:\t423:\t423:\t423:\t423:\t423:\t424:\t424:\t424:\t424:\t424:\t425:\t425:\t425:\t425:\t425:\t426:\t426:\t426:\t426:\t426:\t427:\t427:\t427:\t427:\t427:\t428:\t428:\t428:\t428:\t428:\t429:\t429:\t429:\t429:\t429:\t430:\t430:\t430:\t430:\t430:\t431:\t431:\t431:\t431:\t431:\t432:\t432:\t432:\t432:\t432:\t433:\t433:\t433:\t433:\t433:\t434:\t434:\t434:\t434:\t434:\t435:\t435:\t435:\t435:\t435:\t436:\t436:\t436:\t436:\t436:\t437:\t437:\t437:\t437:\t437:\t438:\t438:\t438:\t438:\t438:\t439:\t439:\t439:\t439:\t439:\t440:\t440:\t440:\t440:\t440:\t441:\t441:\t441:\t441:\t441:\t442:\t442:\t442:\t442:\t442:\t443:\t443:\t443:\t443:\t443:\t444:\t444:\t444:\t444:\t444:\t445:\t445:\t445:\t445:\t445:\t446:\t446:\t446:\t446:\t446:\t447:\t447:\t447:\t447:\t447:\t448:\t448:\t448:\t448:\t448:\t449:\t449:\t449:\t449:\t449:\t450:\t450:\t450:\t450:\t450:\t451:\t451:\t451:\t451:\t451:\t452:\t452:\t452:\t452:\t452:\t453:\t453:\t453:\t453:\t453:\t454:\t454:\t454:\t454:\t454:\t455:\t455:\t455:\t455:\t455:\t456:\t456:\t456:\t456:\t456:\t457:\t457:\t457:\t457:\t457:\t458:\t458:\t458:\t458:\t458:\t459:\t459:\t459:\t459:\t459:\t460:\t460:\t460:\t460:\t460:\t461:\t461:\t461:\t461:\t461:\t462:\t462:\t462:\t462:\t462:\t463:\t463:\t463:\t463:\t463:\t464:\t464:\t464:\t464:\t464:\t465:\t465:\t465:\t465:\t465:\t466:\t466:\t466:\t466:\t466:\t467:\t467:\t467:\t467:\t467:\t468:\t468:\t468:\t468:\t468:\t469:\t469:\t469:\t469:\t469:\t470:\t470:\t470:\t470:\t470:\t471:\t471:\t471:\t471:\t471:\t472:\t472:\t472:\t472:\t472:\t473:\t473:\t473:\t473:\t473:\t474:\t474:\t474:\t474:\t474:\t475:\t475:\t475:\t475:\t475:\t476:\t476:\t476:\t476:\t476:\t477:\t477:\t477:\t477:\t477:\t478:\t478:\t478:\t478:\t478:\t479:\t479:\t479:\t479:\t479:\t480:\t480:\t480:\t480:\t480:\t481:\t481:\t481:\t481:\t481:\t482:\t482:\t482:\t482:\t482:\t483:\t483:\t483:\t483:\t483:\t484:\t484:\t484:\t484:\
t484:\t485:\t485:\t485:\t485:\t485:\t486:\t486:\t486:\t486:\t486:\t487:\t487:\t487:\t487:\t487:\t488:\t488:\t488:\t488:\t488:\t489:\t489:\t489:\t489:\t489:\t490:\t490:\t490:\t490:\t490:\t491:\t491:\t491:\t491:\t491:\t492:\t492:\t492:\t492:\t492:\t493:\t493:\t493:\t493:\t493:\t494:\t494:\t494:\t494:\t494:\t495:\t495:\t495:\t495:\t495:\t496:\t496:\t496:\t496:\t496:\t497:\t497:\t497:\t497:\t497:\t498:\t498:\t498:\t498:\t498:\t499:\t499:\t499:\t499:\t499:\t500:\t500:\t500:\t500:\t500:\t501:\t501:\t501:\t501:\t501:\t502:\t502:\t502:\t502:\t502:\t503:\t503:\t503:\t503:\t503:\t504:\t504:\t504:\t504:\t504:\t505:\t505:\t505:\t505:\t505:\t506:\t506:\t506:\t506:\t506:\t507:\t507:\t507:\t507:\t507:\t508:\t508:\t508:\t508:\t508:\t509:\t509:\t509:\t509:\t509:\t510:\t510:\t510:\t510:\t510:\t511:\t511:\t511:\t511:\t511:\t512:\t512:\t512:\t512:\t512:\t513:\t513:\t513:\t513:\t513:\t514:\t514:\t514:\t514:\t514:\t515:\t515:\t515:\t515:\t515:\t516:\t516:\t516:\t516:\t516:\t517:\t517:\t517:\t517:\t517:\t518:\t518:\t518:\t518:\t518:\t519:\t519:\t519:\t519:\t519:\t520:\t520:\t520:\t520:\t520:\t521:\t521:\t521:\t521:\t521:\t522:\t522:\t522:\t522:\t522:\t523:\t523:\t523:\t523:\t523:\t524:\t524:\t524:\t524:\t524:\t525:\t525:\t525:\t525:\t525:\t526:\t526:\t526:\t526:\t526:\t527:\t527:\t527:\t527:\t527:\t528:\t528:\t528:\t528:\t528:\t529:\t529:\t529:\t529:\t529:\t530:\t530:\t530:\t530:\t530:\t531:\t531:\t531:\t531:\t531:\t532:\t532:\t532:\t532:\t532:\t533:\t533:\t533:\t533:\t533:\t534:\t534:\t534:\t534:\t534:\t535:\t535:\t535:\t535:\t535:\t536:\t536:\t536:\t536:\t536:\t537:\t537:\t537:\t537:\t537:\t538:\t538:\t538:\t538:\t538:\t539:\t539:\t539:\t539:\t539:\t540:\t540:\t540:\t540:\t540:\t541:\t541:\t541:\t541:\t541:\t542:\t542:\t542:\t542:\t542:\t543:\t543:\t543:\t543:\t543:\t544:\t544:\t544:\t544:\t544:\t545:\t545:\t545:\t545:\t545:\t546:\t546:\t546:\t546:\t546:\t547:\t547:\t547:\t547:\t547:\t548:\t548:\t548:\t548:\t548:\t549:\t549:\t549:\t549:\t549:\t550:\t550:\t550:\t550:\t550:\t551:\t551:\t5
51:\t551:\t551:\t552:\t552:\t552:\t552:\t552:\t553:\t553:\t553:\t553:\t553:\t554:\t554:\t554:\t554:\t554:\t555:\t555:\t555:\t555:\t555:\t556:\t556:\t556:\t556:\t556:\t557:\t557:\t557:\t557:\t557:\t558:\t558:\t558:\t558:\t558:\t559:\t559:\t559:\t559:\t559:\t560:\t560:\t560:\t560:\t560:\t561:\t561:\t561:\t561:\t561:\t562:\t562:\t562:\t562:\t562:\t563:\t563:\t563:\t563:\t563:\t564:\t564:\t564:\t564:\t564:\t565:\t565:\t565:\t565:\t565:\t566:\t566:\t566:\t566:\t566:\t567:\t567:\t567:\t567:\t567:\t568:\t568:\t568:\t568:\t568:\t569:\t569:\t569:\t569:\t569:\t570:\t570:\t570:\t570:\t570:\t571:\t571:\t571:\t571:\t571:\t572:\t572:\t572:\t572:\t572:\t573:\t573:\t573:\t573:\t573:\t574:\t574:\t574:\t574:\t574:\t575:\t575:\t575:\t575:\t575:\t576:\t576:\t576:\t576:\t576:\t577:\t577:\t577:\t577:\t577:\t578:\t578:\t578:\t578:\t578:\t579:\t579:\t579:\t579:\t579:\t580:\t580:\t580:\t580:\t580:\t581:\t581:\t581:\t581:\t581:\t582:\t582:\t582:\t582:\t582:\t583:\t583:\t583:\t583:\t583:\t584:\t584:\t584:\t584:\t584:\t585:\t585:\t585:\t585:\t585:\t586:\t586:\t586:\t586:\t586:\t587:\t587:\t587:\t587:\t587:\t588:\t588:\t588:\t588:\t588:\t589:\t589:\t589:\t589:\t589:\t590:\t590:\t590:\t590:\t590:\t591:\t591:\t591:\t591:\t591:\t592:\t592:\t592:\t592:\t592:\t593:\t593:\t593:\t593:\t593:\t594:\t594:\t594:\t594:\t594:\t595:\t595:\t595:\t595:\t595:\t596:\t596:\t596:\t596:\t596:\t597:\t597:\t597:\t597:\t597:\t598:\t598:\t598:\t598:\t598:\t599:\t599:\t599:\t599:\t599:\t600:\t600:\t600:\t600:\t600:\t601:\t601:\t601:\t601:\t601:\t602:\t602:\t602:\t602:\t602:\t603:\t603:\t603:\t603:\t603:\t604:\t604:\t604:\t604:\t604:\t605:\t605:\t605:\t605:\t605:\t606:\t606:\t606:\t606:\t606:\t607:\t607:\t607:\t607:\t607:\t608:\t608:\t608:\t608:\t608:\t609:\t609:\t609:\t609:\t609:\t610:\t610:\t610:\t610:\t610:\t611:\t611:\t611:\t611:\t611:\t612:\t612:\t612:\t612:\t612:\t613:\t613:\t613:\t613:\t613:\t614:\t614:\t614:\t614:\t614:\t615:\t615:\t615:\t615:\t615:\t616:\t616:\t616:\t616:\t616:\t617:\t617:\t617:\t617:\t617:\t618
:\t618:\t618:\t618:\t618:\t619:\t619:\t619:\t619:\t619:\t620:\t620:\t620:\t620:\t620:\t621:\t621:\t621:\t621:\t621:\t622:\t622:\t622:\t622:\t622:\t623:\t623:\t623:\t623:\t623:\t624:\t624:\t624:\t624:\t624:\t625:\t625:\t625:\t625:\t625:\t626:\t626:\t626:\t626:\t626:\t627:\t627:\t627:\t627:\t627:\t628:\t628:\t628:\t628:\t628:\t629:\t629:\t629:\t629:\t629:\t630:\t630:\t630:\t630:\t630:\t631:\t631:\t631:\t631:\t631:\t632:\t632:\t632:\t632:\t632:\t633:\t633:\t633:\t633:\t633:\t634:\t634:\t634:\t634:\t634:\t635:\t635:\t635:\t635:\t635:\t636:\t636:\t636:\t636:\t636:\t637:\t637:\t637:\t637:\t637:\t638:\t638:\t638:\t638:\t638:\t639:\t639:\t639:\t639:\t639:\t640:\t640:\t640:\t640:\t640:\t641:\t641:\t641:\t641:\t641:\t642:\t642:\t642:\t642:\t642:\t643:\t643:\t643:\t643:\t643:\t644:\t644:\t644:\t644:\t644:\t645:\t645:\t645:\t645:\t645:\t646:\t646:\t646:\t646:\t646:\t647:\t647:\t647:\t647:\t647:\t648:\t648:\t648:\t648:\t648:\t649:\t649:\t649:\t649:\t649:\t650:\t650:\t650:\t650:\t650:\t651:\t651:\t651:\t651:\t651:\t652:\t652:\t652:\t652:\t652:\t653:\t653:\t653:\t653:\t653:\t654:\t654:\t654:\t654:\t654:\t655:\t655:\t655:\t655:\t655:\t656:\t656:\t656:\t656:\t656:\t657:\t657:\t657:\t657:\t657:\t658:\t658:\t658:\t658:\t658:\t659:\t659:\t659:\t659:\t659:\t660:\t660:\t660:\t660:\t660:\t661:\t661:\t661:\t661:\t661:\t662:\t662:\t662:\t662:\t662:\t663:\t663:\t663:\t663:\t663:\t664:\t664:\t664:\t664:\t664:\t665:\t665:\t665:\t665:\t665:\t666:\t666:\t666:\t666:\t666:\t667:\t667:\t667:\t667:\t667:\t668:\t668:\t668:\t668:\t668:\t669:\t669:\t669:\t669:\t669:\t670:\t670:\t670:\t670:\t670:\t671:\t671:\t671:\t671:\t671:\t672:\t672:\t672:\t672:\t672:\t673:\t673:\t673:\t673:\t673:\t674:\t674:\t674:\t674:\t674:\t675:\t675:\t675:\t675:\t675:\t676:\t676:\t676:\t676:\t676:\t677:\t677:\t677:\t677:\t677:\t678:\t678:\t678:\t678:\t678:\t679:\t679:\t679:\t679:\t679:\t680:\t680:\t680:\t680:\t680:\t681:\t681:\t681:\t681:\t681:\t682:\t682:\t682:\t682:\t682:\t683:\t683:\t683:\t683:\t683:\t684:\t684:\t684:\t684:\
t684:\t685:\t685:\t685:\t685:\t685:\t686:\t686:\t686:\t686:\t686:\t687:\t687:\t687:\t687:\t687:\t688:\t688:\t688:\t688:\t688:\t689:\t689:\t689:\t689:\t689:\t690:\t690:\t690:\t690:\t690:\t691:\t691:\t691:\t691:\t691:\t692:\t692:\t692:\t692:\t692:\t693:\t693:\t693:\t693:\t693:\t694:\t694:\t694:\t694:\t694:\t695:\t695:\t695:\t695:\t695:\t696:\t696:\t696:\t696:\t696:\t697:\t697:\t697:\t697:\t697:\t698:\t698:\t698:\t698:\t698:\t699:\t699:\t699:\t699:\t699:\t700:\t700:\t700:\t700:\t700:\t701:\t701:\t701:\t701:\t701:\t702:\t702:\t702:\t702:\t702:\t703:\t703:\t703:\t703:\t703:\t704:\t704:\t704:\t704:\t704:\t705:\t705:\t705:\t705:\t705:\t706:\t706:\t706:\t706:\t706:\t707:\t707:\t707:\t707:\t707:\t708:\t708:\t708:\t708:\t708:\t709:\t709:\t709:\t709:\t709:\t710:\t710:\t710:\t710:\t710:\t711:\t711:\t711:\t711:\t711:\t712:\t712:\t712:\t712:\t712:\t713:\t713:\t713:\t713:\t713:\t714:\t714:\t714:\t714:\t714:\t715:\t715:\t715:\t715:\t715:\t716:\t716:\t716:\t716:\t716:\t717:\t717:\t717:\t717:\t717:\t718:\t718:\t718:\t718:\t718:\t719:\t719:\t719:\t719:\t719:\t720:\t720:\t720:\t720:\t720:\t721:\t721:\t721:\t721:\t721:\t722:\t722:\t722:\t722:\t722:\t723:\t723:\t723:\t723:\t723:\t724:\t724:\t724:\t724:\t724:\t725:\t725:\t725:\t725:\t725:\t726:\t726:\t726:\t726:\t726:\t727:\t727:\t727:\t727:\t727:\t728:\t728:\t728:\t728:\t728:\t729:\t729:\t729:\t729:\t729:\t730:\t730:\t730:\t730:\t730:\t731:\t731:\t731:\t731:\t731:\t732:\t732:\t732:\t732:\t732:\t733:\t733:\t733:\t733:\t733:\t734:\t734:\t734:\t734:\t734:\t735:\t735:\t735:\t735:\t735:\t736:\t736:\t736:\t736:\t736:\t737:\t737:\t737:\t737:\t737:\t738:\t738:\t738:\t738:\t738:\t739:\t739:\t739:\t739:\t739:\t740:\t740:\t740:\t740:\t740:\t741:\t741:\t741:\t741:\t741:\t742:\t742:\t742:\t742:\t742:\t743:\t743:\t743:\t743:\t743:\t744:\t744:\t744:\t744:\t744:\t745:\t745:\t745:\t745:\t745:\t746:\t746:\t746:\t746:\t746:\t747:\t747:\t747:\t747:\t747:\t748:\t748:\t748:\t748:\t748:\t749:\t749:\t749:\t749:\t749:\t750:\t750:\t750:\t750:\t750:\t751:\t751:\t7
51:\t751:\t751:\t752:\t752:\t752:\t752:\t752:\t753:\t753:\t753:\t753:\t753:\t754:\t754:\t754:\t754:\t754:\t755:\t755:\t755:\t755:\t755:\t756:\t756:\t756:\t756:\t756:\t757:\t757:\t757:\t757:\t757:\t758:\t758:\t758:\t758:\t758:\t759:\t759:\t759:\t759:\t759:\t760:\t760:\t760:\t760:\t760:\t761:\t761:\t761:\t761:\t761:\t762:\t762:\t762:\t762:\t762:\t763:\t763:\t763:\t763:\t763:\t764:\t764:\t764:\t764:\t764:\t765:\t765:\t765:\t765:\t765:\t766:\t766:\t766:\t766:\t766:\t767:\t767:\t767:\t767:\t767:\t768:\t768:\t768:\t768:\t768:\t769:\t769:\t769:\t769:\t769:\t770:\t770:\t770:\t770:\t770:\t771:\t771:\t771:\t771:\t771:\t772:\t772:\t772:\t772:\t772:\t773:\t773:\t773:\t773:\t773:\t774:\t774:\t774:\t774:\t774:\t775:\t775:\t775:\t775:\t775:\t776:\t776:\t776:\t776:\t776:\t777:\t777:\t777:\t777:\t777:\t778:\t778:\t778:\t778:\t778:\t779:\t779:\t779:\t779:\t779:\t780:\t780:\t780:\t780:\t780:\t781:\t781:\t781:\t781:\t781:\t782:\t782:\t782:\t782:\t782:\t783:\t783:\t783:\t783:\t783:\t784:\t784:\t784:\t784:\t784:\t785:\t785:\t785:\t785:\t785:\t786:\t786:\t786:\t786:\t786:\t787:\t787:\t787:\t787:\t787:\t788:\t788:\t788:\t788:\t788:\t789:\t789:\t789:\t789:\t789:\t790:\t790:\t790:\t790:\t790:\t791:\t791:\t791:\t791:\t791:\t792:\t792:\t792:\t792:\t792:\t793:\t793:\t793:\t793:\t793:\t794:\t794:\t794:\t794:\t794:\t795:\t795:\t795:\t795:\t795:\t796:\t796:\t796:\t796:\t796:\t797:\t797:\t797:\t797:\t797:\t798:\t798:\t798:\t798:\t798:\t799:\t799:\t799:\t799:\t799:\t800:\t800:\t800:\t800:\t800:\t801:\t801:\t801:\t801:\t801:\t802:\t802:\t802:\t802:\t802:\t803:\t803:\t803:\t803:\t803:\t804:\t804:\t804:\t804:\t804:\t805:\t805:\t805:\t805:\t805:\t806:\t806:\t806:\t806:\t806:\t807:\t807:\t807:\t807:\t807:\t808:\t808:\t808:\t808:\t808:\t809:\t809:\t809:\t809:\t809:\t810:\t810:\t810:\t810:\t810:\t811:\t811:\t811:\t811:\t811:\t812:\t812:\t812:\t812:\t812:\t813:\t813:\t813:\t813:\t813:\t814:\t814:\t814:\t814:\t814:\t815:\t815:\t815:\t815:\t815:\t816:\t816:\t816:\t816:\t816:\t817:\t817:\t817:\t817:\t817:\t818
:\t818:\t818:\t818:\t818:\t819:\t819:\t819:\t819:\t819:\t820:\t820:\t820:\t820:\t820:\t821:\t821:\t821:\t821:\t821:\t822:\t822:\t822:\t822:\t822:\t823:\t823:\t823:\t823:\t823:\t824:\t824:\t824:\t824:\t824:\t825:\t825:\t825:\t825:\t825:\t826:\t826:\t826:\t826:\t826:\t827:\t827:\t827:\t827:\t827:\t828:\t828:\t828:\t828:\t828:\t829:\t829:\t829:\t829:\t829:\t830:\t830:\t830:\t830:\t830:\t831:\t831:\t831:\t831:\t831:\t832:\t832:\t832:\t832:\t832:\t833:\t833:\t833:\t833:\t833:\t834:\t834:\t834:\t834:\t834:\t835:\t835:\t835:\t835:\t835:\t836:\t836:\t836:\t836:\t836:\t837:\t837:\t837:\t837:\t837:\t838:\t838:\t838:\t838:\t838:\t839:\t839:\t839:\t839:\t839:\t840:\t840:\t840:\t840:\t840:\t841:\t841:\t841:\t841:\t841:\t842:\t842:\t842:\t842:\t842:\t843:\t843:\t843:\t843:\t843:\t844:\t844:\t844:\t844:\t844:\t845:\t845:\t845:\t845:\t845:\t846:\t846:\t846:\t846:\t846:\t847:\t847:\t847:\t847:\t847:\t848:\t848:\t848:\t848:\t848:\t849:\t849:\t849:\t849:\t849:\t850:\t850:\t850:\t850:\t850:\t851:\t851:\t851:\t851:\t851:\t852:\t852:\t852:\t852:\t852:\t853:\t853:\t853:\t853:\t853:\t854:\t854:\t854:\t854:\t854:\t855:\t855:\t855:\t855:\t855:\t856:\t856:\t856:\t856:\t856:\t857:\t857:\t857:\t857:\t857:\t858:\t858:\t858:\t858:\t858:\t859:\t859:\t859:\t859:\t859:\t860:\t860:\t860:\t860:\t860:\t861:\t861:\t861:\t861:\t861:\t862:\t862:\t862:\t862:\t862:\t863:\t863:\t863:\t863:\t863:\t864:\t864:\t864:\t864:\t864:\t865:\t865:\t865:\t865:\t865:\t866:\t866:\t866:\t866:\t866:\t867:\t867:\t867:\t867:\t867:\t868:\t868:\t868:\t868:\t868:\t869:\t869:\t869:\t869:\t869:\t870:\t870:\t870:\t870:\t870:\t871:\t871:\t871:\t871:\t871:\t872:\t872:\t872:\t872:\t872:\t873:\t873:\t873:\t873:\t873:\t874:\t874:\t874:\t874:\t874:\t875:\t875:\t875:\t875:\t875:\t876:\t876:\t876:\t876:\t876:\t877:\t877:\t877:\t877:\t877:\t878:\t878:\t878:\t878:\t878:\t879:\t879:\t879:\t879:\t879:\t880:\t880:\t880:\t880:\t880:\t881:\t881:\t881:\t881:\t881:\t882:\t882:\t882:\t882:\t882:\t883:\t883:\t883:\t883:\t883:\t884:\t884:\t884:\t884:\
t884:\t885:\t885:\t885:\t885:\t885:\t886:\t886:\t886:\t886:\t886:\t887:\t887:\t887:\t887:\t887:\t888:\t888:\t888:\t888:\t888:\t889:\t889:\t889:\t889:\t889:\t890:\t890:\t890:\t890:\t890:\t891:\t891:\t891:\t891:\t891:\t892:\t892:\t892:\t892:\t892:\t893:\t893:\t893:\t893:\t893:\t894:\t894:\t894:\t894:\t894:\t895:\t895:\t895:\t895:\t895:\t896:\t896:\t896:\t896:\t896:\t897:\t897:\t897:\t897:\t897:\t898:\t898:\t898:\t898:\t898:\t899:\t899:\t899:\t899:\t899:\t900:\t900:\t900:\t900:\t900:\t901:\t901:\t901:\t901:\t901:\t902:\t902:\t902:\t902:\t902:\t903:\t903:\t903:\t903:\t903:\t904:\t904:\t904:\t904:\t904:\t905:\t905:\t905:\t905:\t905:\t906:\t906:\t906:\t906:\t906:\t907:\t907:\t907:\t907:\t907:\t908:\t908:\t908:\t908:\t908:\t909:\t909:\t909:\t909:\t909:\t910:\t910:\t910:\t910:\t910:\t911:\t911:\t911:\t911:\t911:\t912:\t912:\t912:\t912:\t912:\t913:\t913:\t913:\t913:\t913:\t914:\t914:\t914:\t914:\t914:\t915:\t915:\t915:\t915:\t915:\t916:\t916:\t916:\t916:\t916:\t917:\t917:\t917:\t917:\t917:\t918:\t918:\t918:\t918:\t918:\t919:\t919:\t919:\t919:\t919:\t920:\t920:\t920:\t920:\t920:\t921:\t921:\t921:\t921:\t921:\t922:\t922:\t922:\t922:\t922:\t923:\t923:\t923:\t923:\t923:\t924:\t924:\t924:\t924:\t924:\t925:\t925:\t925:\t925:\t925:\t926:\t926:\t926:\t926:\t926:\t927:\t927:\t927:\t927:\t927:\t928:\t928:\t928:\t928:\t928:\t929:\t929:\t929:\t929:\t929:\t930:\t930:\t930:\t930:\t930:\t931:\t931:\t931:\t931:\t931:\t932:\t932:\t932:\t932:\t932:\t933:\t933:\t933:\t933:\t933:\t934:\t934:\t934:\t934:\t934:\t935:\t935:\t935:\t935:\t935:\t936:\t936:\t936:\t936:\t936:\t937:\t937:\t937:\t937:\t937:\t938:\t938:\t938:\t938:\t938:\t939:\t939:\t939:\t939:\t939:\t940:\t940:\t940:\t940:\t940:\t941:\t941:\t941:\t941:\t941:\t942:\t942:\t942:\t942:\t942:\t943:\t943:\t943:\t943:\t943:\t944:\t944:\t944:\t944:\t944:\t945:\t945:\t945:\t945:\t945:\t946:\t946:\t946:\t946:\t946:\t947:\t947:\t947:\t947:\t947:\t948:\t948:\t948:\t948:\t948:\t949:\t949:\t949:\t949:\t949:\t950:\t950:\t950:\t950:\t950:\t951:\t951:\t9
51:\t951:\t951:\t952:\t952:\t952:\t952:\t952:\t953:\t953:\t953:\t953:\t953:\t954:\t954:\t954:\t954:\t954:\t955:\t955:\t955:\t955:\t955:\t956:\t956:\t956:\t956:\t956:\t957:\t957:\t957:\t957:\t957:\t958:\t958:\t958:\t958:\t958:\t959:\t959:\t959:\t959:\t959:\t960:\t960:\t960:\t960:\t960:\t961:\t961:\t961:\t961:\t961:\t962:\t962:\t962:\t962:\t962:\t963:\t963:\t963:\t963:\t963:\t964:\t964:\t964:\t964:\t964:\t965:\t965:\t965:\t965:\t965:\t966:\t966:\t966:\t966:\t966:\t967:\t967:\t967:\t967:\t967:\t968:\t968:\t968:\t968:\t968:\t969:\t969:\t969:\t969:\t969:\t970:\t970:\t970:\t970:\t970:\t971:\t971:\t971:\t971:\t971:\t972:\t972:\t972:\t972:\t972:\t973:\t973:\t973:\t973:\t973:\t974:\t974:\t974:\t974:\t974:\t975:\t975:\t975:\t975:\t975:\t976:\t976:\t976:\t976:\t976:\t977:\t977:\t977:\t977:\t977:\t978:\t978:\t978:\t978:\t978:\t979:\t979:\t979:\t979:\t979:\t980:\t980:\t980:\t980:\t980:\t981:\t981:\t981:\t981:\t981:\t982:\t982:\t982:\t982:\t982:\t983:\t983:\t983:\t983:\t983:\t984:\t984:\t984:\t984:\t984:\t985:\t985:\t985:\t985:\t985:\t986:\t986:\t986:\t986:\t986:\t987:\t987:\t987:\t987:\t987:\t988:\t988:\t988:\t988:\t988:\t989:\t989:\t989:\t989:\t989:\t990:\t990:\t990:\t990:\t990:\t991:\t991:\t991:\t991:\t991:\t992:\t992:\t992:\t992:\t992:\t993:\t993:\t993:\t993:\t993:\t994:\t994:\t994:\t994:\t994:\t995:\t995:\t995:\t995:\t995:\t996:\t996:\t996:\t996:\t996:\t997:\t997:\t997:\t997:\t997:\t998:\t998:\t998:\t998:\t998:\t999:\t999:\t999:\t999:\t999:\t"
898 | ]
899 | }
900 | ],
901 | "source": [
902 | "## run cv to get best iteration\n",
903 | "ctb_cv = cv(param, my_dt, fold_count=5, random_seed=2017)"
904 | ]
905 | },
906 | {
907 | "cell_type": "code",
908 | "execution_count": 205,
909 | "metadata": {
910 | "collapsed": false
911 | },
912 | "outputs": [],
913 | "source": [
914 | "# fetch best round\n",
915 | "best_round = ctb_cv['b\\'Accuracy\\'_test_avg'].index(np.max(ctb_cv['b\\'Accuracy\\'_test_avg']))"
916 | ]
917 | },
918 | {
919 | "cell_type": "code",
920 | "execution_count": 206,
921 | "metadata": {
922 | "collapsed": true
923 | },
924 | "outputs": [],
925 | "source": [
926 | "## define the classifier model\n",
927 | "model = CatBoostClassifier(iterations=best_round, learning_rate=0.03,rsm = 0.3 ,depth=6, eval_metric='Accuracy', random_seed=2017)"
928 | ]
929 | },
930 | {
931 | "cell_type": "code",
932 | "execution_count": 207,
933 | "metadata": {
934 | "collapsed": false
935 | },
936 | "outputs": [
937 | {
938 | "data": {
939 | "text/plain": [
940 | ""
941 | ]
942 | },
943 | "execution_count": 207,
944 | "metadata": {},
945 | "output_type": "execute_result"
946 | }
947 | ],
948 | "source": [
949 | "## train model\n",
950 | "model.fit(my_dt)"
951 | ]
952 | },
953 | {
954 | "cell_type": "code",
955 | "execution_count": 208,
956 | "metadata": {
957 | "collapsed": true
958 | },
959 | "outputs": [],
960 | "source": [
961 | "## make predictions\n",
962 | "preds = model.predict(test_feats1)"
963 | ]
964 | },
965 | {
966 | "cell_type": "code",
967 | "execution_count": 222,
968 | "metadata": {
969 | "collapsed": false
970 | },
971 | "outputs": [],
972 | "source": [
973 | "## make submission\n",
974 | "sub5 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds})\n",
975 | "sub5['Is_Response'] = ['happy' if x == 1 else 'not_happy' for x in sub5['Is_Response']]\n",
976 | "sub5 = sub5[['User_ID','Is_Response']]\n",
977 | "sub5.to_csv('submissions/sub5_cb.csv', index=False)"
978 | ]
979 | }
980 | ],
981 | "metadata": {
982 | "kernelspec": {
983 | "display_name": "Python 3",
984 | "language": "python",
985 | "name": "python3"
986 | },
987 | "language_info": {
988 | "codemirror_mode": {
989 | "name": "ipython",
990 | "version": 3
991 | },
992 | "file_extension": ".py",
993 | "mimetype": "text/x-python",
994 | "name": "python",
995 | "nbconvert_exporter": "python",
996 | "pygments_lexer": "ipython3",
997 | "version": "3.5.2"
998 | }
999 | },
1000 | "nbformat": 4,
1001 | "nbformat_minor": 2
1002 | }
1003 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Happiness-ML-Challenge
2 |
3 |
4 | This repository contains scripts shared during the machine learning challenge for beginners. In this challenge, participants have to predict the happiness of customers. The data used in the scripts can be downloaded from the link given below.
5 |
6 | The aim of this challenge is to encourage beginners to gain more hands on experience in solving ML problems.
7 |
8 | **Challenge Name:** Predict the Happiness
9 | **Duration:** 30th August 2017 to 31st November 2017
10 | **Type:** Binary Classification
11 | **Metrics:** Accuracy
12 |
13 |
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/xgb_nb.R:
--------------------------------------------------------------------------------
1 | path <- "/happy_data/"
2 | setwd(path)
3 |
4 | # data manipulation
5 | library(data.table)
6 |
7 | # NLP
8 | library(tm)
9 | library(qdap)
10 | library(SnowballC)
11 | library(purrr)
12 | library(text2vec)
13 |
14 | # modeling
15 | library(e1071) # for naive bayes
16 | library(xgboost)
17 |
18 | # load data
19 | train <- fread("train.csv")
20 | test <- fread("test.csv")
21 |
22 | ## Clean Data -----------------------------------------------------------
23 |
24 | cleanData <- function(data) {
25 |   # Clean the free-text `Description` column of a data.table and return the table (Description ends up as a list of token vectors).
26 |   # NOTE: `:=` updates `data` by reference, so the caller's table is modified too; the script below relies on that.
27 |   data[, Description := map_chr(Description, tolower)] # to lower
28 |   data[, Description := map_chr(Description, function(k) replace_abbreviation(k))] # Sr. to Senior; must run while the "." is still present
29 |   data[, Description := map_chr(Description, function(k) replace_contraction(k))] # isn't to is not; must run while the "'" is still present
30 |   data[, Description := map_chr(Description, function(k) gsub(pattern = "[[:punct:]]", replacement = "", x = k))] # remove punctuation
31 |   data[, Description := map_chr(Description, function(k) gsub(pattern = "\\d+", replacement = "", x = k))] # remove digits
32 |   data[, Description := map(Description, function(k) rm_stopwords(k, Top200Words, unlist = TRUE))] # remove stopwords
33 |   data[, Description := map(Description, function(k) stemmer(k))] # played, plays to play
34 |   data[, Description := map(Description, function(k) k[nchar(k) > 2])] # remove two alphabet words like to, ok, po
35 |   return(data)
36 | 
37 | 
38 | }
39 |
40 | train_clean <- cleanData(train)
41 | test_clean <- cleanData(test)
42 |
43 |
44 | # Bag of Words ------------------------------------------------------------
45 |
46 | ## Bag of words technique converts the list of tokens (words) into a separate column with binary values in it.
47 | ## Let's understand it.
48 |
49 | ctext <- Corpus(VectorSource(train_clean$Description))
50 |
51 | tdm = DocumentTermMatrix(ctext)
52 | print(tdm)
53 |
54 | # let's see what BOW looks like - every column becomes one feature
55 | inspect(tdm[1:10,1:5])
56 |
57 | ## From here, we'll use text2vec package which provides immense potential for feature engineering
58 | ## we'll build two models
59 | # a) On Bag of Words Corpus
60 | # b) On TF-IDF Corpus
61 | # c) 2 Gram Model - Your to-do Task
62 | # You can read more about TF-IDF here: http://www.tfidf.com/
63 |
64 |
65 | ## Bag of Words Model
66 |
67 | trte_data <- rbind(train[,.(User_ID, Description)], test[,.(User_ID, Description)])
68 | trte_data$Description <- unlist(map(trte_data$Description, paste, collapse = ","))
69 |
70 | bow <- itoken(trte_data$Description, preprocessor = tolower ,tokenizer = word_tokenizer, ids = trte_data$User_ID)
71 | bow_vocab <- create_vocabulary(bow)
72 | bow_vocab # now we have converted the text into tokens. woah! every word can be converted into a feature
73 |
74 | ## But not all words will be important, will they? Let's remove words which occur fewer than 100 times in the whole data
75 | pruned_bow <- prune_vocabulary(bow_vocab, term_count_min = 100)
76 | pruned_bow
77 |
78 | # get these vocabulary in a data frame for model training
79 | vovec <- vocab_vectorizer(pruned_bow)
80 | dtm_text <- create_dtm(bow, vovec)
81 |
82 | feats <- as.data.table(as.matrix(dtm_text))
83 | feats[1:10,1:5] # see 1st 10 rows and 1st 5 columns
84 |
85 | # first feature set
86 | train_feats <- feats[1:nrow(train)]
87 | test_feats <- feats[(nrow(train)+1):nrow(feats)]
88 |
89 | cols <- setdiff(colnames(train), c('User_ID','Is_Response','Description'))
90 | for(x in cols)
91 | {
92 | if (class(train[[x]]) == 'character')
93 | {
94 | levels <- unique(c(train[[x]], test[[x]]))
95 | train[[x]] <- as.numeric(factor(train[[x]], levels = levels))
96 | test[[x]] <- as.numeric(factor(test[[x]], levels = levels))
97 | }
98 | }
99 |
100 | ## preparing data for training
101 | train_feats <- cbind(train_feats, train[,.(Browser_Used, Device_Used, Is_Response)])
102 | test_feats <- cbind(test_feats, test[,.(Browser_Used, Device_Used)])
103 |
104 | train_feats[, Is_Response := ifelse(Is_Response == 'happy',1,0)]
105 | train_feats[, Is_Response := as.factor(Is_Response)]
106 |
107 | ## naive Bayes is known to perform quite well in text classification problems
108 |
109 | model <- naiveBayes(Is_Response ~ ., data = train_feats, laplace = 1)
110 | preds <- predict(model, test_feats)
111 |
112 | # make your submission
113 | sub <- data.table(User_ID = test$User_ID, Is_Response = ifelse(preds == 1, "happy", "not_happy"))
114 | fwrite(sub, "sub1.csv")
115 |
116 |
117 | # TF-IDF Model ------------------------------------------------------------
118 |
119 | TIDF <- TfIdf$new()
120 | dtm_text_tfidf <- fit_transform(dtm_text, TIDF)
121 |
122 | feats <- as.data.table(as.matrix(dtm_text_tfidf))
123 |
124 | # second feature set
125 | train_feats <- feats[1:nrow(train)]
126 | test_feats <- feats[(nrow(train)+1):nrow(feats)]
127 |
128 | ## preparing data for training
129 | train_feats <- cbind(train_feats, train[,.(Browser_Used, Device_Used, Is_Response)])
130 | test_feats <- cbind(test_feats, test[,.(Browser_Used, Device_Used)])
131 |
132 | train_feats[, Is_Response := ifelse(Is_Response == "happy",1,0)]
133 |
134 | ## You can use naiveBayes Model here and compare the accuracy.
135 | ## let's try xgboost model here.
136 |
137 | # set parameters for xgboost
138 | param <- list(booster = "gbtree",
139 | objective = "binary:logistic",
140 | eval_metric = "error",
141 | #num_class = 9,
142 | eta = .2,
143 | # gamma = 1,
144 | max_depth = 6,
145 | min_child_weight = 0,
146 | subsample = .8,
147 | colsample_bytree = .3
148 | )
149 |
150 |
151 | ## function to return predictions using best CV score
152 |
153 | predictions <- c()
154 |
155 | give_predictions <- function(train, test, params, iters) {
156 |   # Choose the number of boosting rounds by 5-fold CV (at most `iters`, early stopping after 40 rounds), refit on all of `train`, and return predictions for `test`.
157 |   # `train` must contain the label column `Is_Response`; every other column of `train`, and every column of `test`, is used as a feature.
158 |   dtrain <- xgb.DMatrix(data = as.matrix(train[, -c("Is_Response"), with = FALSE]), label = train$Is_Response) # label comes from the `train` argument, not a global
159 |   dtest <- xgb.DMatrix(data = as.matrix(test))
160 | 
161 |   cv.model <- xgb.cv(params = params
162 |                      ,data = dtrain
163 |                      ,nrounds = iters
164 |                      ,nfold = 5L
165 |                      ,stratified = TRUE
166 |                      ,early_stopping_rounds = 40
167 |                      ,print_every_n = 20
168 |                      ,maximize = FALSE)
169 | 
170 |   best_it <- cv.model$best_iteration
171 |   best_score <- min(cv.model$evaluation_log$test_error_mean) # lowest mean CV test error (expects the "error" eval metric column)
172 | 
173 |   cat('CV model returned',best_score,'error score')
174 | 
175 |   tr.model <- xgb.train(params = params # use the `params` argument (same settings as the CV run), not the global `param`
176 |                         ,data = dtrain
177 |                         ,nrounds = best_it
178 |                         ,watchlist = list(train = dtrain)
179 |                         ,print_every_n = 20
180 |   )
181 | 
182 |   # Return this model's predictions directly instead of appending them to a global `predictions` vector.
183 |   preds <- predict(tr.model, dtest)
184 | 
185 |   return(preds)
186 | 
187 | }
188 |
189 | # get predictions
190 | my_preds <- give_predictions(train_feats, test_feats, param, 1000)
191 |
192 | ## create submission file
193 | preds <- ifelse(my_preds > 0.66,1,0) #cutoff threshold
194 | sub2 <- data.table(User_ID = test$User_ID, Is_Response = preds)
195 | fwrite(sub2, "sub2.csv")
196 |
197 |
198 | ## What's Next ?
199 |
200 | ## Till now, we made 1-gram model i.e. one word per column. We can extend it to 2-3-n gram
201 |
202 | ## create another model with 2-gram features
203 |
204 | gr_vocab <- create_vocabulary(bow, ngram = c(1L,2L))
205 | gr_vocab <- prune_vocabulary(gr_vocab, term_count_min = 150)
206 | gr_vocab
207 |
208 | bigram_vec <- vocab_vectorizer(gr_vocab)
209 | dtm_text <- create_dtm(bow, bigram_vec)
210 |
211 | # now you can follow step from Line 79 onwards to create another model.
212 | # in case you face difficulties, feel free to raise "Issues" above.
213 |
214 |
215 |
--------------------------------------------------------------------------------