├── .gitignore ├── 1st_Place_Kunal ├── Features based on len of text.ipynb ├── LOM final.ipynb ├── LOM2.ipynb ├── LOM_1_model.ipynb ├── LOM_model_2.ipynb ├── LOM_text_features.ipynb └── README.md ├── 2nd_Place_Mark_SRK ├── Explorations.ipynb ├── build_model.py ├── build_model_xgb.py ├── ensemble.py └── readme.md ├── 3rd_Place_Aditya_Akash ├── 3rd_Place_Solution_Approach.docx ├── final_ensemble-simple_avg.ipynb ├── lgb_5fold-5_bag_nt45_rank_average_AND_lgb_5fold-5_bag_nt45_rank_average_4f.ipynb ├── lgb_5fold-5_bag_nt55_rank_average_5f_AND_lgb_5fold-5_bag_nt55_rank_average_4f.ipynb ├── lgb_new_features-v6-5fold_5bag_cv_retry_lb_692_ens6941-submitted.ipynb ├── lstm.ipynb ├── lstm_cnn.ipynb ├── readme.txt ├── user_cluster-kmeans.ipynb └── xgb_2fold-cv3_bag3_nt70_scalepos1_best_tree.ipynb └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | misc/ 2 | *~ 3 | -------------------------------------------------------------------------------- /1st_Place_Kunal/Features based on len of text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "path = '/home/kunal/Downloads/lord_of_machines/'\n", 26 | "train = pd.read_csv(path + 'train.csv')\n", 27 | "test = pd.read_csv(path + 'test.csv')\n", 28 | "campaign = pd.read_csv(path +'campaign_data.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 39, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "train_input = pd.read_csv(path + 'impact_encoded_train.csv')\n", 40 | "test_input = pd.read_csv(path + 'impact_encoded_test.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### CREATE EXTRA FEATURES USING GROUPBY STATISTICS" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "train = train.merge(campaign, on ='campaign_id',how = 'left')\n", 59 | "test = test.merge(campaign, on ='campaign_id',how = 'left')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "collapsed": true 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "#x1 = train.groupby('campaign_id')['is_click'].mean().sort_values(ascending = False).values" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "#x3 = train.groupby('campaign_id')['is_open'].mean().sort_values(ascending = False).values" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "#x2 = train['campaign_id'].value_counts().values" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 4, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "#def get_time(cell):\n", 104 | "# cell = cell.split(' ')[-1]\n", 105 | "# cell 
= cell.split(':')[0]\n", 106 | "# return cell\n", 107 | "#train['hour'] = train['send_date'].apply(get_time)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "#train[train['is_click'] == 1].hour.value_counts()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "#x5 = train['hour'].value_counts().values" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "#test['hour'] = test['send_date'].apply(get_time)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "#test['hour'].value_counts()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": true 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "train_input = pd.read_csv(path + 'impact_encoded_train.csv')\n", 163 | "test_input = pd.read_csv(path + 'impact_encoded_test.csv')" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "#x3 = train.groupby('hour')['is_open'].mean().sort_values(ascending = False).values" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 5, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "## Day of the week\n", 186 | "#def get_date(cell):\n", 187 | "# return cell.split(' ')[0]\n", 188 | "#train['date'] = train['send_date'].apply(get_date)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "collapsed": true 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "#import datetime\n", 200 | "#exp = train['date'][0]" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 6, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "#from datetime import datetime" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 7, 217 | "metadata": { 218 | "collapsed": true 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "#def get_weekday(cell):\n", 223 | "# return datetime.strptime(cell,'%d-%m-%Y').weekday()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 8, 229 | "metadata": { 230 | "collapsed": true 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "#train['weekday'] = train['date'].apply(get_weekday)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": { 241 | "collapsed": true 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "#train['weekday'].value_counts()" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 9, 251 | "metadata": { 252 | "collapsed": true 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "#test['date'] = test['send_date'].apply(get_date)\n", 257 | "#test['weekday'] = test['date'].apply(get_weekday)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | 
"source": [ 268 | "tr.groupby('weekday')['is_open'].mean()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": true 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "train.groupby('hour')['is_open'].mean()" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 30, 285 | "metadata": { 286 | "collapsed": true 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "campaign['len_sub'] = campaign['subject'].str.split(' ').apply(len)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 20, 296 | "metadata": { 297 | "collapsed": true 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "train['len_sub'] = train['campaign_id'].map(pd.Series(campaign['len_sub'],index = campaign['campaign_id']))\n", 302 | "test['len_sub'] = test['campaign_id'].map(pd.Series(campaign['len_sub'],index = campaign['campaign_id']))" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 31, 308 | "metadata": { 309 | "collapsed": true 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "campaign['len_sub_email'] = campaign['email_body'].str.split(' ').apply(len)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 24, 319 | "metadata": { 320 | "collapsed": true 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "train['len_sub_email'] = train['campaign_id'].map(pd.Series(campaign['len_sub_email'],index = campaign['campaign_id']))\n", 325 | "test['len_sub_email'] = test['campaign_id'].map(pd.Series(campaign['len_sub_email'],index = campaign['campaign_id']))" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 33, 331 | "metadata": { 332 | "collapsed": true 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "train.drop(['len_sub','len_sub_email'],axis = 1,inplace = True)\n", 337 | "test.drop(['len_sub','len_sub_email'],axis = 1,inplace = True)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 34, 343 | "metadata": { 344 | "collapsed": true 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "train = train.merge(campaign[['campaign_id','len_sub','len_sub_email']],on = 'campaign_id',how = 'left')\n", 349 | "test = test.merge(campaign[['campaign_id','len_sub','len_sub_email']],on = 'campaign_id',how = 'left')" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 41, 355 | "metadata": { 356 | "collapsed": true 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "train_input['len_sub'] = train['len_sub']\n", 361 | "train_input['len_sub_email'] = train['len_sub_email']\n", 362 | "\n", 363 | "test_input['len_sub'] = test['len_sub']\n", 364 | "test_input['len_sub_email'] = test['len_sub_email']" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 43, 370 | "metadata": { 371 | "collapsed": true 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "train_input.to_csv(path + 'impact_encoded_train.csv',index = False)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 44, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "test_input.to_csv(path + 'impact_encoded_test.csv',index = False)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "collapsed": true 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "# This way we have randomness and are able to reproduce the behaviour within this cell.\n", 398 | 
"np.random.seed(13)\n", 399 | "from sklearn.model_selection import KFold\n", 400 | "\n", 401 | "def impact_coding(data, feature, target='y'):\n", 402 | " '''\n", 403 | " In this implementation we get the values and the dictionary as two different steps.\n", 404 | " This is just because initially we were ignoring the dictionary as a result variable.\n", 405 | " \n", 406 | " In this implementation the KFolds use shuffling. If you want reproducibility the cv \n", 407 | " could be moved to a parameter.\n", 408 | " '''\n", 409 | " n_folds = 10\n", 410 | " n_inner_folds = 5\n", 411 | " impact_coded = pd.Series()\n", 412 | " \n", 413 | " oof_default_mean = data[target].mean() # Gobal mean to use by default (you could further tune this)\n", 414 | " kf = KFold(n_splits=n_folds, shuffle=True)\n", 415 | " oof_mean_cv = pd.DataFrame()\n", 416 | " split = 0\n", 417 | " for infold, oof in kf.split(data[feature]):\n", 418 | " impact_coded_cv = pd.Series()\n", 419 | " kf_inner = KFold(n_splits=n_inner_folds, shuffle=True)\n", 420 | " inner_split = 0\n", 421 | " inner_oof_mean_cv = pd.DataFrame()\n", 422 | " oof_default_inner_mean = data.iloc[infold][target].mean()\n", 423 | " for infold_inner, oof_inner in kf_inner.split(data.iloc[infold]):\n", 424 | " # The mean to apply to the inner oof split (a 1/n_folds % based on the rest)\n", 425 | " oof_mean = data.iloc[infold_inner].groupby(by=feature)[target].mean()\n", 426 | " impact_coded_cv = impact_coded_cv.append(data.iloc[infold].apply(\n", 427 | " lambda x: oof_mean[x[feature]]\n", 428 | " if x[feature] in oof_mean.index\n", 429 | " else oof_default_inner_mean\n", 430 | " , axis=1))\n", 431 | "\n", 432 | " # Also populate mapping (this has all group -> mean for all inner CV folds)\n", 433 | " inner_oof_mean_cv = inner_oof_mean_cv.join(pd.DataFrame(oof_mean), rsuffix=inner_split, how='outer')\n", 434 | " inner_oof_mean_cv.fillna(value=oof_default_inner_mean, inplace=True)\n", 435 | " inner_split += 1\n", 436 | "\n", 437 | " # Also populate mapping\n", 438 | " oof_mean_cv = oof_mean_cv.join(pd.DataFrame(inner_oof_mean_cv), rsuffix=split, how='outer')\n", 439 | " oof_mean_cv.fillna(value=oof_default_mean, inplace=True)\n", 440 | " split += 1\n", 441 | " \n", 442 | " impact_coded = impact_coded.append(data.iloc[oof].apply(\n", 443 | " lambda x: inner_oof_mean_cv.loc[x[feature]].mean()\n", 444 | " if x[feature] in inner_oof_mean_cv.index\n", 445 | " else oof_default_mean\n", 446 | " , axis=1))\n", 447 | "\n", 448 | " return impact_coded, oof_mean_cv.mean(axis=1), oof_default_mean" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": { 455 | "collapsed": true 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "f = 'weekday'" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": { 466 | "collapsed": true 467 | }, 468 | "outputs": [], 469 | "source": [ 470 | "# Apply the encoding to training and test data, and preserve the mapping\n", 471 | "impact_coding_map = {}\n", 472 | "print(\"Impact coding for {}\".format(f))\n", 473 | "train[\"impact_encoded_{}\".format(f)], impact_coding_mapping, default_coding = impact_coding(train, f,'is_click')\n", 474 | "impact_coding_map[f] = (impact_coding_mapping, default_coding)\n", 475 | "mapping, default_mean = impact_coding_map[f]\n", 476 | "test[\"impact_encoded_{}\".format(f)] = test.apply(lambda x: mapping[x[f]]\n", 477 | " if x[f] in mapping\n", 478 | " else default_mean\n", 479 | " , axis=1)" 480 | ] 481 | }, 482 | { 483 | 
"cell_type": "code", 484 | "execution_count": null, 485 | "metadata": { 486 | "collapsed": true 487 | }, 488 | "outputs": [], 489 | "source": [ 490 | "impact_coding_map = {}\n", 491 | "print(\"Impact coding for {}\".format(f))\n", 492 | "train[\"impact_encoded_open_{}\".format(f)], impact_coding_mapping, default_coding = impact_coding(train, f,'is_open')\n", 493 | "impact_coding_map[f] = (impact_coding_mapping, default_coding)\n", 494 | "mapping, default_mean = impact_coding_map[f]\n", 495 | "test[\"impact_encoded_open_{}\".format(f)] = test.apply(lambda x: mapping[x[f]]\n", 496 | " if x[f] in mapping\n", 497 | " else default_mean\n", 498 | " , axis=1)" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": { 505 | "collapsed": true 506 | }, 507 | "outputs": [], 508 | "source": [] 509 | } 510 | ], 511 | "metadata": { 512 | "kernelspec": { 513 | "display_name": "Python 3", 514 | "language": "python", 515 | "name": "python3" 516 | }, 517 | "language_info": { 518 | "codemirror_mode": { 519 | "name": "ipython", 520 | "version": 3 521 | }, 522 | "file_extension": ".py", 523 | "mimetype": "text/x-python", 524 | "name": "python", 525 | "nbconvert_exporter": "python", 526 | "pygments_lexer": "ipython3", 527 | "version": "3.6.0" 528 | } 529 | }, 530 | "nbformat": 4, 531 | "nbformat_minor": 2 532 | } 533 | -------------------------------------------------------------------------------- /1st_Place_Kunal/LOM final.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 5, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "path = '/home/kunal/Downloads/lord_of_machines/ensemble/'" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Take mean of the best solutions" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 85, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "df1 = pd.read_csv(path + 'f1.csv')\n", 44 | "#df2 = pd.read_csv(path + 'f2.csv')\n", 45 | "df3 = pd.read_csv(path + 'f3.csv')\n", 46 | "df4 = pd.read_csv(path + 'f4.csv')\n", 47 | "#df5 = pd.read_csv(path + 'f5.csv')\n", 48 | "df6 = pd.read_csv(path + 'f6.csv')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 100, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "w1 = 0.25\n", 60 | "#w2 = 0\n", 61 | "w3 =0.25*0.5\n", 62 | "w4 = 0.5\n", 63 | "#w5 = 0\n", 64 | "w6 = 0.25*0.5" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 36, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "sample = pd.read_csv('/home/kunal/Downloads/lord_of_machines/sample.csv')" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 101, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "sample['is_click'] = (df1['is_click']*w1 + \n", 87 | " df3['is_click']*w3 + df4['is_click']*w4 +\n", 88 | " df6['is_click']*w6)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 102, 94 | 
"metadata": { 95 | "collapsed": true 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "sample.to_csv(path + 'The_best_solution.csv',index = False)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [] 110 | } 111 | ], 112 | "metadata": { 113 | "kernelspec": { 114 | "display_name": "Python 3", 115 | "language": "python", 116 | "name": "python3" 117 | }, 118 | "language_info": { 119 | "codemirror_mode": { 120 | "name": "ipython", 121 | "version": 3 122 | }, 123 | "file_extension": ".py", 124 | "mimetype": "text/x-python", 125 | "name": "python", 126 | "nbconvert_exporter": "python", 127 | "pygments_lexer": "ipython3", 128 | "version": "3.6.0" 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 2 133 | } 134 | -------------------------------------------------------------------------------- /1st_Place_Kunal/LOM2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "path = '/home/kunal/Downloads/lord_of_machines/'\n", 26 | "train = pd.read_csv(path + 'train.csv')\n", 27 | "test = pd.read_csv(path + 'test.csv')\n", 28 | "campaign = pd.read_csv(path +'campaign_data.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Objective is to create aggregate features and encodings" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "#### Technique picked up from a kaggle forum. The code can be found here:https://www.kaggle.com/tnarik/likelihood-encoding-of-categorical-features/notebook" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "# This way we have randomness and are able to reproduce the behaviour within this cell.\n", 54 | "np.random.seed(13)\n", 55 | "from sklearn.model_selection import KFold\n", 56 | "\n", 57 | "def impact_coding(data, feature, target='y'):\n", 58 | " '''\n", 59 | " In this implementation we get the values and the dictionary as two different steps.\n", 60 | " This is just because initially we were ignoring the dictionary as a result variable.\n", 61 | " \n", 62 | " In this implementation the KFolds use shuffling. 
If you want reproducibility the cv \n", 63 | " could be moved to a parameter.\n", 64 | " '''\n", 65 | " n_folds = 10\n", 66 | " n_inner_folds = 5\n", 67 | " impact_coded = pd.Series()\n", 68 | " \n", 69 | " oof_default_mean = data[target].mean() # Global mean to use by default (you could further tune this)\n", 70 | " kf = KFold(n_splits=n_folds, shuffle=True)\n", 71 | " oof_mean_cv = pd.DataFrame()\n", 72 | " split = 0\n", 73 | " for infold, oof in kf.split(data[feature]):\n", 74 | " impact_coded_cv = pd.Series()\n", 75 | " kf_inner = KFold(n_splits=n_inner_folds, shuffle=True)\n", 76 | " inner_split = 0\n", 77 | " inner_oof_mean_cv = pd.DataFrame()\n", 78 | " oof_default_inner_mean = data.iloc[infold][target].mean()\n", 79 | " for infold_inner, oof_inner in kf_inner.split(data.iloc[infold]):\n", 80 | " # The mean to apply to the inner oof split (a 1/n_folds % based on the rest)\n", 81 | " oof_mean = data.iloc[infold_inner].groupby(by=feature)[target].mean()\n", 82 | " impact_coded_cv = impact_coded_cv.append(data.iloc[infold].apply(\n", 83 | " lambda x: oof_mean[x[feature]]\n", 84 | " if x[feature] in oof_mean.index\n", 85 | " else oof_default_inner_mean\n", 86 | " , axis=1))\n", 87 | "\n", 88 | " # Also populate mapping (this has all group -> mean for all inner CV folds)\n", 89 | " inner_oof_mean_cv = inner_oof_mean_cv.join(pd.DataFrame(oof_mean), rsuffix=inner_split, how='outer')\n", 90 | " inner_oof_mean_cv.fillna(value=oof_default_inner_mean, inplace=True)\n", 91 | " inner_split += 1\n", 92 | "\n", 93 | " # Also populate mapping\n", 94 | " oof_mean_cv = oof_mean_cv.join(pd.DataFrame(inner_oof_mean_cv), rsuffix=split, how='outer')\n", 95 | " oof_mean_cv.fillna(value=oof_default_mean, inplace=True)\n", 96 | " split += 1\n", 97 | " \n", 98 | " impact_coded = impact_coded.append(data.iloc[oof].apply(\n", 99 | " lambda x: inner_oof_mean_cv.loc[x[feature]].mean()\n", 100 | " if x[feature] in inner_oof_mean_cv.index\n", 101 | " else oof_default_mean\n", 102 | " , axis=1))\n", 103 | "\n", 104 | " return impact_coded, oof_mean_cv.mean(axis=1), oof_default_mean" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "### Prepare dataset for encoding\n", 116 | "train = train.merge(campaign,on = 'campaign_id',how = 'left')\n", 117 | "test = test.merge(campaign, on = 'campaign_id',how = 'left')\n", 118 | "train['user_id'] = train['user_id'].apply(str)\n", 119 | "train['campaign_id'] = train['campaign_id'].apply(str)\n", 120 | "test['user_id'] = test['user_id'].apply(str)\n", 121 | "test['campaign_id'] = test['campaign_id'].apply(str)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 5, 127 | "metadata": { 128 | "collapsed": true 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "train = train.drop(['email_body', 'subject', 'email_url','send_date'],axis =1 )\n", 133 | "test = test.drop(['email_body', 'subject', 'email_url','send_date'],axis =1 )" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 6, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "['id', 'user_id', 'campaign_id', 'communication_type']" 145 | ] 146 | }, 147 | "execution_count": 6, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "features = train.columns\n", 154 | "numeric_features = []\n", 155 | "categorical_features = []\n", 156 | "\n", 157 | "for dtype,
feature in zip(train.dtypes, train.columns):\n", 158 | " if dtype == object:\n", 159 | " #print(column)\n", 160 | " #print(train_data[column].describe())\n", 161 | " categorical_features.append(feature)\n", 162 | " else:\n", 163 | " numeric_features.append(feature)\n", 164 | "categorical_features" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 7, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "Impact coding for user_id\n", 177 | "Impact coding for campaign_id\n", 178 | "Impact coding for communication_type\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "# Apply the encoding to training and test data, and preserve the mapping\n", 184 | "impact_coding_map = {}\n", 185 | "for f in categorical_features[1:]:\n", 186 | " print(\"Impact coding for {}\".format(f))\n", 187 | " train[\"impact_encoded_{}\".format(f)], impact_coding_mapping, default_coding = impact_coding(train, f,'is_click')\n", 188 | " impact_coding_map[f] = (impact_coding_mapping, default_coding)\n", 189 | " mapping, default_mean = impact_coding_map[f]\n", 190 | " test[\"impact_encoded_{}\".format(f)] = test.apply(lambda x: mapping[x[f]]\n", 191 | " if x[f] in mapping\n", 192 | " else default_mean\n", 193 | " , axis=1)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 8, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "Impact coding for user_id\n", 206 | "Impact coding for campaign_id\n", 207 | "Impact coding for communication_type\n" 208 | ] 209 | } 210 | ], 211 | "source": [ 212 | "impact_coding_map = {}\n", 213 | "for f in categorical_features[1:]:\n", 214 | " print(\"Impact coding for {}\".format(f))\n", 215 | " train[\"impact_encoded_open_{}\".format(f)], impact_coding_mapping, default_coding = impact_coding(train, f,'is_open')\n", 216 | " impact_coding_map[f] = (impact_coding_mapping, default_coding)\n", 217 | " mapping, default_mean = impact_coding_map[f]\n", 218 | " test[\"impact_encoded_open_{}\".format(f)] = test.apply(lambda x: mapping[x[f]]\n", 219 | " if x[f] in mapping\n", 220 | " else default_mean\n", 221 | " , axis=1)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 9, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "train.to_csv(path + 'impact_encoded_train.csv',index = False)\n", 233 | "test.to_csv(path + 'impact_encoded_test.csv',index = False)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [] 244 | } 245 | ], 246 | "metadata": { 247 | "kernelspec": { 248 | "display_name": "Python 3", 249 | "language": "python", 250 | "name": "python3" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 3 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython3", 262 | "version": "3.6.0" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 2 267 | } 268 | -------------------------------------------------------------------------------- /1st_Place_Kunal/LOM_1_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | 
"metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "path = '/home/kunal/Downloads/lord_of_machines/'\n", 26 | "train = pd.read_csv(path + 'train.csv')\n", 27 | "test = pd.read_csv(path + 'test.csv')\n", 28 | "#campaign = pd.read_csv(path +'campaign_data.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "train_input = pd.read_csv(path + 'impact_encoded_train.csv')\n", 40 | "test_input = pd.read_csv(path + 'impact_encoded_test.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "user_features = pd.read_csv(path + 'user_features.csv')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Modelling part" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "from sklearn.decomposition import PCA\n", 70 | "pca = PCA(40,random_state = 10)\n", 71 | "user_features_matrix = pca.fit_transform(user_features.iloc[:,1:])" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "user_features_matrix = pd.DataFrame(user_features_matrix)\n", 83 | "user_features_matrix['user_id'] = user_features['user_id']\n", 84 | "train_input = train_input.merge(user_features_matrix,on = 'user_id',how = 'left')\n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "#pca2 = PCA(30,random_state = 10)\n", 96 | "#cluster_feature = pca2.fit_transform(campaign_features.iloc[:,1:])" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "#cluster_feature = pd.DataFrame(cluster_feature)\n", 108 | "#cluster_feature['campaign_id'] = campaign['campaign_id']" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "#train_input['campaign_id'] = train['campaign_id']\n", 120 | "#train_input = train_input.merge(cluster_feature,on = 'campaign_id',how = 'left')" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "test_input = test_input.merge(user_features_matrix,on = 'user_id',how = 'left')\n", 132 | "#test_input['campaign_id'] = test['campaign_id']\n", 133 | "#test_input = test_input.merge(cluster_feature,on = 'campaign_id',how = 'left')" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "#x = encoded_features(train_input,train_input)\n", 145 | "#y = encoded_features(df = 
train_input,df_new=test_input)\n", 146 | "x = train_input.copy()\n", 147 | "y = test_input.copy()\n", 148 | "count_feature = pd.concat([train['user_id'],test['user_id']])\n", 149 | "count_feature = count_feature.value_counts()\n", 150 | "x['counts'] = x['user_id'].map(count_feature)\n", 151 | "y['counts'] = y['user_id'].map(count_feature)\n", 152 | "x = x.drop(['user_id','is_click','is_open','id','campaign_id','communication_type'],axis = 1)\n", 153 | "y = y.drop(['user_id','id','campaign_id','communication_type'],axis = 1)\n", 154 | "#x['total_open_percentage'] = x['total_open']/x['counts']\n", 155 | "#y['total_open_percentage'] = y['total_open']/y['counts']" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "collapsed": true 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "#x.drop('Unnamed: 0',axis = 1,inplace=True)\n", 167 | "#y.drop('Unnamed: 0',axis = 1,inplace=True)\n", 168 | "print(x.shape)\n", 169 | "print(y.shape)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "print(train_input.shape)\n", 181 | "print(test_input.shape)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "### Check the columns in X and Y\n", 193 | "### They should be same and id variables should not be present\n", 194 | "print(x.columns)\n", 195 | "print('*-'*50)\n", 196 | "print(y.columns)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "from xgboost import XGBClassifier\n", 208 | "from catboost import CatBoostClassifier\n", 209 | "from sklearn.neighbors import KNeighborsClassifier\n", 210 | "from sklearn.ensemble import RandomForestClassifier\n", 211 | "from sklearn.ensemble import ExtraTreesClassifier\n", 212 | "from sklearn.linear_model import LogisticRegression\n", 213 | "\n", 214 | "xg = XGBClassifier(n_estimators = 600,max_depth = 6,gamma = 10)\n", 215 | "#cb = CatBoostClassifier()\n", 216 | "#knn = KNeighborsClassifier()\n", 217 | "#rf = RandomForestClassifier()\n", 218 | "#et = ExtraTreesClassifier()\n", 219 | "#lr = LogisticRegression()" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "collapsed": true 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "### Choose your algo and use fit method (replace classifier by the name of your algo constructor)\n", 231 | "xg.fit(x,train['is_click'])\n" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "predictions = xg.predict_proba(y)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "sample = pd.read_csv(path + 'sample.csv')" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "collapsed": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "name_of_submission_file = 'final_sub' # select name of file\n", 265 | "sample['is_click'] = predictions[:,1]\n", 266 | "sample.to_csv(path + '{}.csv'.format(name_of_submission_file), 
index=False)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "collapsed": true 274 | }, 275 | "outputs": [], 276 | "source": [] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": { 282 | "collapsed": true 283 | }, 284 | "outputs": [], 285 | "source": [] 286 | } 287 | ], 288 | "metadata": { 289 | "kernelspec": { 290 | "display_name": "Python 3", 291 | "language": "python", 292 | "name": "python3" 293 | }, 294 | "language_info": { 295 | "codemirror_mode": { 296 | "name": "ipython", 297 | "version": 3 298 | }, 299 | "file_extension": ".py", 300 | "mimetype": "text/x-python", 301 | "name": "python", 302 | "nbconvert_exporter": "python", 303 | "pygments_lexer": "ipython3", 304 | "version": "3.6.0" 305 | } 306 | }, 307 | "nbformat": 4, 308 | "nbformat_minor": 2 309 | } 310 | -------------------------------------------------------------------------------- /1st_Place_Kunal/LOM_model_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "path = '/home/kunal/Downloads/lord_of_machines/'\n", 26 | "train = pd.read_csv(path + 'train.csv')\n", 27 | "test = pd.read_csv(path + 'test.csv')\n", 28 | "campaign = pd.read_csv(path +'campaign_data.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "#weekday_train = pd.read_csv(path + 'weekday_train.csv')\n", 40 | "#weekday_test = pd.read_csv(path + 'weekday_test.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "train_input = pd.read_csv(path + 'impact_encoded_train.csv')\n", 52 | "test_input = pd.read_csv(path + 'impact_encoded_test.csv')" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "#train_input1 = pd.read_csv(path + 'train_new.csv')\n", 64 | "#test_input1 = pd.read_csv(path + 'test_new.csv')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "### Generate user profile based on their interests in communication type" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "train = train.merge(campaign, on = 'campaign_id',how = 'left')\n", 83 | "test = test.merge(campaign, on = 'campaign_id',how = 'left')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "data = pd.concat([train[['user_id','communication_type']],test[['user_id','communication_type']]])" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 
| "user_profile = pd.crosstab(data['user_id'],data['communication_type'])" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "collapsed": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "user_profile.reset_index(inplace = True)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": true 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "train_input = train_input.merge(user_profile,on = 'user_id',how = 'left')\n", 128 | "test_input = test_input.merge(user_profile,on = 'user_id',how = 'left')" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "#train_input = pd.concat([train_input,weekday_train],axis = 1)\n", 140 | "#test_input = pd.concat([test_input,weekday_test],axis = 1)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "#train_input['impact_encoded_weekday'] = train['impact_encoded_weekday']\n", 152 | "#train_input['impact_encoded_open_weekday'] = train['impact_encoded_open_weekday']\n", 153 | "#test_input['impact_encoded_weekday'] = test['impact_encoded_weekday']\n", 154 | "#test_input['impact_encoded_open_weekday'] = test['impact_encoded_open_weekday']" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "collapsed": true 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "#weekday_train.to_csv(path + 'weekday_train.csv',index = False)\n", 166 | "#weekday_test.to_csv(path + 'weekday_test.csv',index = False)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": true 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "train_input.columns" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "user_features = pd.read_csv(path + 'user_features.csv')" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "collapsed": true 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "#train_input['user_id'] = train['user_id']\n", 200 | "#test_input['user_id'] = test['user_id']\n", 201 | "#train_input['is_open'] = train['is_open']\n", 202 | "#train_input['is_click'] = train['is_click']" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "#campaign_features = pd.read_csv(path + '2_gram_campaign_features.csv')" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "collapsed": true 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "#campaign_features.head()" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "## Modelling part" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "from sklearn.decomposition import PCA\n", 243 | "pca = PCA(50,random_state = 10)\n", 244 | "user_features_matrix = 
pca.fit_transform(user_features.iloc[:,1:])" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": true 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "user_features_matrix = pd.DataFrame(user_features_matrix)\n", 256 | "user_features_matrix['user_id'] = user_features['user_id']\n", 257 | "train_input = train_input.merge(user_features_matrix,on = 'user_id',how = 'left')" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "#pca2 = PCA(30,random_state = 10)\n", 269 | "#cluster_feature = pca2.fit_transform(campaign_features.iloc[:,1:])" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": { 276 | "collapsed": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "#cluster_feature = pd.DataFrame(cluster_feature)\n", 281 | "#cluster_feature['campaign_id'] = campaign['campaign_id']" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": { 288 | "collapsed": true 289 | }, 290 | "outputs": [], 291 | "source": [ 292 | "#train_input['campaign_id'] = train['campaign_id']\n", 293 | "#train_input = train_input.merge(campaign_features,on = 'campaign_id',how = 'left')" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": true 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "test_input = test_input.merge(user_features_matrix,on = 'user_id',how = 'left')\n", 305 | "#test_input['campaign_id'] = test['campaign_id']\n", 306 | "#test_input = test_input.merge(campaign_features,on = 'campaign_id',how = 'left')" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "#x = encoded_features(train_input,train_input)\n", 318 | "#y = encoded_features(df = train_input,df_new=test_input)\n", 319 | "x = train_input.copy()\n", 320 | "y = test_input.copy()\n", 321 | "count_feature = pd.concat([train['user_id'],test['user_id']])\n", 322 | "count_feature = count_feature.value_counts()\n", 323 | "x['counts'] = x['user_id'].map(count_feature)\n", 324 | "y['counts'] = y['user_id'].map(count_feature)\n", 325 | "x = x.drop(['user_id','is_click','is_open','campaign_id','id','communication_type'],axis = 1)\n", 326 | "y = y.drop(['user_id','campaign_id','id','communication_type'],axis = 1)\n", 327 | "#x['total_open_percentage'] = x['total_open']/x['counts']\n", 328 | "#y['total_open_percentage'] = y['total_open']/y['counts']" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": { 335 | "collapsed": true 336 | }, 337 | "outputs": [], 338 | "source": [ 339 | "### Check the columns in X and Y\n", 340 | "### They should be same and id variables should not be present\n", 341 | "print(x.columns)\n", 342 | "#print('*-'*50)\n", 343 | "print(y.columns)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "collapsed": true 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "#x.drop(['id','communication_type'],axis = 1,inplace = True)\n", 355 | "#y.drop(['id','communication_type'],axis = 1,inplace = True)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "collapsed": true 363 | }, 
364 | "outputs": [], 365 | "source": [ 366 | "#x.drop(['impact_encoded_weekday','impact_encoded_open_weekday'],axis = 1,inplace = True)\n", 367 | "#y.drop(['impact_encoded_weekday','impact_encoded_open_weekday'],axis = 1,inplace = True)" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": { 374 | "collapsed": true 375 | }, 376 | "outputs": [], 377 | "source": [ 378 | "#from sklearn.model_selection import train_test_split\n", 379 | "#Xtrain,Xtest,ytrain,ytest = train_test_split(x,train['is_click'],test_size = 0.7)" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": { 386 | "collapsed": true 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "from xgboost import XGBClassifier\n", 391 | "#from catboost import CatBoostClassifier\n", 392 | "#from sklearn.neighbors import KNeighborsClassifier\n", 393 | "#from sklearn.ensemble import RandomForestClassifier\n", 394 | "#from sklearn.ensemble import ExtraTreesClassifier\n", 395 | "#from sklearn.linear_model import LogisticRegression\n", 396 | "\n", 397 | "xg = XGBClassifier(n_estimators = 500,max_depth= 7,gamma = 20,colsample_bylevel=0.9,colsample_bytree=0.9)\n", 398 | "#cb = CatBoostClassifier()\n", 399 | "#knn = KNeighborsClassifier()\n", 400 | "#rf = RandomForestClassifier()\n", 401 | "#et = ExtraTreesClassifier()\n", 402 | "#lr = LogisticRegression()" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": { 409 | "collapsed": true 410 | }, 411 | "outputs": [], 412 | "source": [ 413 | "### Choose your algo and use fit method (replace classifier by the name of your algo constructor)\n", 414 | "xg.fit(x,train['is_click'])\n" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": { 421 | "collapsed": true 422 | }, 423 | "outputs": [], 424 | "source": [ 425 | "predictions = xg.predict_proba(y)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": { 432 | "collapsed": true 433 | }, 434 | "outputs": [], 435 | "source": [ 436 | "sample = pd.read_csv(path + 'sample.csv')" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": { 443 | "collapsed": true 444 | }, 445 | "outputs": [], 446 | "source": [ 447 | "\n", 448 | "name_of_submission_file = 'final_sub2' # select name of file\n", 449 | "sample['is_click'] = predictions[:,1]\n", 450 | "sample.to_csv(path + '{}.csv'.format(name_of_submission_file),index = False)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": { 457 | "collapsed": true 458 | }, 459 | "outputs": [], 460 | "source": [ 461 | "from xgboost import plot_importance" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": { 468 | "collapsed": true 469 | }, 470 | "outputs": [], 471 | "source": [ 472 | "from sklearn.metrics import roc_auc_score\n", 473 | "print(roc_auc_score(train['is_click'],xg.predict_proba(x)[:,1]))" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": { 480 | "collapsed": true 481 | }, 482 | "outputs": [], 483 | "source": [ 484 | "plot_importance(xg)" 485 | ] 486 | } 487 | ], 488 | "metadata": { 489 | "kernelspec": { 490 | "display_name": "Python 3", 491 | "language": "python", 492 | "name": "python3" 493 | }, 494 | "language_info": { 495 | "codemirror_mode": { 496 | "name": "ipython", 497 | "version": 
3 498 | }, 499 | "file_extension": ".py", 500 | "mimetype": "text/x-python", 501 | "name": "python", 502 | "nbconvert_exporter": "python", 503 | "pygments_lexer": "ipython3", 504 | "version": "3.6.0" 505 | } 506 | }, 507 | "nbformat": 4, 508 | "nbformat_minor": 2 509 | } 510 | -------------------------------------------------------------------------------- /1st_Place_Kunal/LOM_text_features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "path = '/home/kunal/Downloads/lord_of_machines/'\n", 26 | "train = pd.read_csv(path + 'train.csv')\n", 27 | "test = pd.read_csv(path + 'test.csv')\n", 28 | "campaign = pd.read_csv(path +'campaign_data.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Create new user and campaign features based on text" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer\n", 47 | "from sklearn.decomposition import TruncatedSVD\n", 48 | "from sklearn.pipeline import make_pipeline\n", 49 | "from sklearn.preprocessing import Normalizer" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "bow_vectorizer_uni = CountVectorizer(ngram_range=(1,1),stop_words='english')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "import re\n", 72 | "from nltk.stem import PorterStemmer\n", 73 | "ps = PorterStemmer()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "def review_to_words( cell ):\n", 85 | " \n", 86 | " letters_only = re.sub(\"[^a-zA-Z]\", \" \", cell) \n", 87 | " words = letters_only.lower().split() \n", 88 | " stemmed_words = [ps.stem(w) for w in words]\n", 89 | " return( \" \".join( stemmed_words )) " 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "campaign['cleaned_subject'] = campaign['subject'].apply(review_to_words)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "X_train_bow_uni = bow_vectorizer_uni.fit_transform(campaign['cleaned_subject'])" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "X_train_bow_uni = pd.DataFrame(X_train_bow_uni.toarray())\n", 123 | "X_train_bow_uni['campaign_id'] = campaign['campaign_id']" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null,
129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "user_data = pd.concat([train[['user_id','campaign_id']],test[['user_id','campaign_id']]])" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "user_data_sum = user_data.merge(X_train_bow_uni,on = 'campaign_id',how = 'left').drop('campaign_id',axis = 1).groupby('user_id').sum()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "user_data_sum.reset_index(inplace = True)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "collapsed": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "user_data_sum.to_csv(path + 'user_features.csv',index= False)" 168 | ] 169 | } 170 | ], 171 | "metadata": { 172 | "kernelspec": { 173 | "display_name": "Python 3", 174 | "language": "python", 175 | "name": "python3" 176 | }, 177 | "language_info": { 178 | "codemirror_mode": { 179 | "name": "ipython", 180 | "version": 3 181 | }, 182 | "file_extension": ".py", 183 | "mimetype": "text/x-python", 184 | "name": "python", 185 | "nbconvert_exporter": "python", 186 | "pygments_lexer": "ipython3", 187 | "version": "3.6.0" 188 | } 189 | }, 190 | "nbformat": 4, 191 | "nbformat_minor": 2 192 | } 193 | -------------------------------------------------------------------------------- /1st_Place_Kunal/README.md: -------------------------------------------------------------------------------- 1 | ## Approach 2 | 3 | The competition was based on an imbalanced binary classification problem with AUCROC metric. 4 | I created several features based on textual information and user behaviour to arrive at my final solution 5 | The features created were: 6 | 1) Target encoding of user_id with respect to is_open and is_click 7 | 2) Target encoding of campaign_id with respect to is_open and is_click 8 | 3) Target encoding of communication_type with respect to is_open and is_click 9 | 4) Length of email body (word wise) 10 | 5) Length of subject 11 | 6) Key feature : I pre-processed the text in the subject by removing stop words, lemmatizing them, removing punctuations etc. After that I used a bag of words (unigram) representation of different 12 | campaign_ids based on their subject. This was followed by merging this dataset with campaing_ids present in the train and test data. After this merge operation. I used groupby sum based on user_id to obtain a unique representation for every user. This was followed by PCA to reduce the dimensions to 50. This operation added the biggest jump to my score. 13 | 7) Number of mails received by different users 14 | 8) Cross tab of user_id vs communication type 15 | 9) Numerical features present in the campaign_data 16 | 17 | This became my general frame work for data preparation before feeding it into any model. An xgboost model with these set of features gave me score of 0.695+ on the public leaderboard. What followed after this was sheer pragmatism. I created several models based on approximately the same frame work and differentiated them by adding variability. 
17 | This became my general framework for data preparation before feeding it into any model. An XGBoost model with this set of features gave me a score of 0.695+ on the public leaderboard. What followed after this was sheer pragmatism. I created several models based on approximately the same framework and differentiated them by adding variability. Some of the important variations were: 18 | 1) Using bi-grams for the BOW representation 19 | 2) Using tri-grams for the BOW representation 20 | 3) Using all three of them 21 | 4) Using tf-idf with the same (unigram, bi-gram, tri-gram) ranges 22 | 5) Using LightGBM, XGBoost and CatBoost on each of the three representations above 23 | 6) Using truncated SVD instead of PCA for dimension reduction 24 | 7) I even dropped the best performing feature and tuned the hyper-parameters in such a way as to arrive at similar scores using the remaining features 25 | 8) Target encoding of the weekday of the sent mail 26 | 9) Cosine distance among the GloVe vector representations of different campaign_ids 27 | 28 | These are just some of them. I created many notebooks, added/dropped/modified many features, and performed many experiments, most of which gave me a public leaderboard score in the vicinity of 0.685 - 0.69. Even though the performance of all the models was similar, their predictions were not highly correlated. This gave me the opportunity to take advantage of weighted ensembles to arrive at a higher score. I took the most similar-scoring prediction files with the least correlation and took their weighted average (a sketch of this blending step follows at the end of this README). I continued this process in an uphill fashion and ended up with the four best performing predictions, with scores of 0.699 - 0.7011. I again followed the same heuristic to arrive at my final score, which gave me a public leaderboard score of 0.704. This entire process is very similar to model stacking, where the predictions of diverse base classifiers are fed to a meta-classifier to arrive at better predictions. Only in my case, it was me manually adjusting the weights assigned to the different models by validating them against the public leaderboard. 29 | 30 | ## How to run the code? 31 | The order of running the code files is: 32 | 33 | 1) LOM2 - (for generating encodings of user_id and campaign_id using target encoding) 34 | 2) LOM_text_features - (for generating features based on text) 35 | 3) Features based on len of text 36 | 4) LOM_1_model & LOM_model_2 - (both are used for generating the final submissions) 37 | 5) LOM final 38 | 39 | Please take note that I created many solutions using different features, sometimes different hyper-parameters, and even different algorithms. For the final solution, I took their weighted ensemble (weights decided against the public leaderboard). 40 | 41 | The LOM_1_model and LOM_model_2 notebooks are my top two single performing models. However, the best solution is a combination of different predictions. The files in the ensemble folder contain these different predictions (kindly change the path accordingly in the LOM final notebook). 42 | 43 | Some variations that I used for arriving at these predictions are: 44 | 1) Using bi-gram text features 45 | 2) Increasing the dimension parameter in PCA 46 | 3) Using a mixture of bi-gram & tri-gram 47 | 4) Using a mixture of CatBoost and LightGBM on uni-gram features 48 | 5) Averaging predictions over different XGBoost depths 49 | 6) Using tf-idf vectors instead of bag of words 50 | 51 | Finally, it was a great competition with lots of learning and excitement. 52 | Thank you team AV for organizing this contest.
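For reference, the blending step described above (the LOM final notebook) reduces to a weighted average of the individual prediction files; a minimal sketch using the weights from that notebook (f1.csv, f3.csv, f4.csv and f6.csv are the prediction files in the ensemble folder):

```python
import pandas as pd

path = "ensemble/"  # folder holding the individual prediction files

# Weights were tuned by hand against the public leaderboard;
# w3 and w6 are 0.25 * 0.5 in the notebook, i.e. 0.125 each.
weights = {"f1.csv": 0.25, "f3.csv": 0.125, "f4.csv": 0.5, "f6.csv": 0.125}

sample = pd.read_csv("sample.csv")  # sample submission file
sample["is_click"] = sum(pd.read_csv(path + name)["is_click"] * w
                         for name, w in weights.items())
sample.to_csv(path + "The_best_solution.csv", index=False)
```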
53 | 54 | -------------------------------------------------------------------------------- /2nd_Place_Mark_SRK/build_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import metrics, model_selection, ensemble, preprocessing, linear_model 4 | import lightgbm as lgb 5 | 6 | def getCountVar(compute_df, count_df, var_name, count_var="v1"): 7 | grouped_df = count_df.groupby(var_name)[count_var].agg('count').reset_index() 8 | grouped_df.columns = var_name + ["var_count"] 9 | 10 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 11 | merged_df.fillna(np.mean(grouped_df["var_count"].values), inplace=True) 12 | return list(merged_df["var_count"]) 13 | 14 | def getDVEncodeVar(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1): 15 | if type(var_name) != type([]): 16 | var_name = [var_name] 17 | grouped_df = target_df.groupby(var_name)[target_var].agg(["mean"]).reset_index() 18 | grouped_df.columns = var_name + ["mean_value"] 19 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 20 | merged_df.fillna(np.mean(target_df[target_var].values), inplace=True) 21 | return list(merged_df["mean_value"]) 22 | 23 | def getDVEncodeVar2(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1): 24 | if type(var_name) != type([]): 25 | var_name = [var_name] 26 | grouped_df = target_df.groupby(var_name)[target_var].agg(["sum"]).reset_index() 27 | grouped_df.columns = var_name + ["sum_value"] 28 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 29 | merged_df.fillna(np.mean(grouped_df["sum_value"].values), inplace=True) 30 | return list(merged_df["sum_value"]) 31 | 32 | 33 | def runLR(train_X, train_y, test_X, test_y=None, test_X2=None): 34 | model = linear_model.LogisticRegression(fit_intercept=True, C=0.3) 35 | model.fit(train_X, train_y) 36 | print model.coef_, model.intercept_ 37 | train_preds = model.predict_proba(train_X)[:,1] 38 | test_preds = model.predict_proba(test_X)[:,1] 39 | test_preds2 = model.predict_proba(test_X2)[:,1] 40 | test_loss = 0 41 | if test_y is not None: 42 | train_loss = metrics.roc_auc_score(train_y, train_preds) 43 | test_loss = metrics.roc_auc_score(test_y, test_preds) 44 | print "Train and Test loss : ", train_loss, test_loss 45 | return test_preds, test_loss, test_preds2 46 | 47 | def runET(train_X, train_y, test_X, test_y=None, test_X2=None, depth=10, leaf=5, feat=0.3): 48 | model = ensemble.ExtraTreesClassifier( 49 | n_estimators = 300, 50 | max_depth = depth, 51 | min_samples_split = 10, 52 | min_samples_leaf = leaf, 53 | max_features = feat, 54 | n_jobs = 6, 55 | random_state = 0) 56 | model.fit(train_X, train_y) 57 | train_preds = model.predict_proba(train_X)[:,1] 58 | test_preds = model.predict_proba(test_X)[:,1] 59 | test_preds2 = model.predict_proba(test_X2)[:,1] 60 | test_loss = 0 61 | if test_y is not None: 62 | train_loss = metrics.roc_auc_score(train_y, train_preds) 63 | test_loss = metrics.roc_auc_score(test_y, test_preds) 64 | print "Depth, leaf, feat : ", depth, leaf, feat 65 | print "Train and Test loss : ", train_loss, test_loss 66 | return test_preds, test_loss, test_preds2 67 | 68 | def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=3, eta=0.001): 69 | params = {} 70 | params["objective"] = "binary" 71 | params['metric'] = 'auc' 72 | params["max_depth"] = dep 73 | params["min_data_in_leaf"] = 100 74 | 
params["learning_rate"] = eta 75 | params["bagging_fraction"] = 0.7 76 | params["feature_fraction"] = 0.7 77 | params["bagging_freq"] = 5 78 | params["bagging_seed"] = seed_val 79 | params["verbosity"] = -1 80 | num_rounds = rounds 81 | 82 | plst = list(params.items()) 83 | lgtrain = lgb.Dataset(train_X, label=train_y) 84 | 85 | if test_y is not None: 86 | lgtest = lgb.Dataset(test_X, label=test_y) 87 | model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=100, verbose_eval=20) 88 | else: 89 | lgtest = lgb.DMatrix(test_X) 90 | model = lgb.train(params, lgtrain, num_rounds) 91 | 92 | pred_test_y = model.predict(test_X, num_iteration=model.best_iteration) 93 | pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration) 94 | 95 | loss = 0 96 | if test_y is not None: 97 | loss = metrics.roc_auc_score(test_y, pred_test_y) 98 | print loss 99 | return pred_test_y, loss, pred_test_y2 100 | else: 101 | return pred_test_y, loss, pred_test_y2 102 | 103 | if __name__ == "__main__": 104 | print "Reading input files..." 105 | train_df = pd.read_csv("../input/train_feat.csv") 106 | test_df = pd.read_csv("../input/test_feat.csv") 107 | campaign_df = pd.read_csv("../input/campaign_data.csv") 108 | train_df["is_open_alone"] = train_df["is_click"].astype('float') / np.maximum(train_df["is_open"],1) 109 | print train_df.shape, test_df.shape 110 | print train_df.head() 111 | 112 | 113 | print np.sort(train_df["campaign_id"].unique()) 114 | #camp_indices = [[range(29, 47), range(47,56)], [range(47,56), range(29, 47)]] 115 | 116 | print "Merging with campaign data.." 117 | train_df = pd.merge(train_df, campaign_df, on="campaign_id") 118 | test_df = pd.merge(test_df, campaign_df, on="campaign_id") 119 | print train_df.shape, test_df.shape 120 | kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017) 121 | 122 | train_y_open = train_df["is_open"].values 123 | train_y = train_df["is_click"].values 124 | test_id = test_df["id"].values 125 | train_unique_campaigns = np.array(train_df["campaign_id"].unique()) 126 | cols_to_use = ["user_cum_count", "user_count", "user_date_diff", "user_camp_diff", "hour"] #, "total_links","no_of_internal_links","no_of_images","no_of_sections"] 127 | #cols_to_use = ["user_cum_count", "user_count", "user_camp_diff"] 128 | #cols_to_use = [] 129 | #cols_to_use = cols_to_use + ["first_open", "first_click", "second_open", "second_click", "third_open", "third_click"] 130 | cols_to_use = cols_to_use + ["user_min_date", "user_mean_date", "user_max_date", "user_std_date"] 131 | cols_to_use = cols_to_use + ["camp_"+str(i) for i in range(29,81)] + ["camps_sent"] 132 | #cols_to_use = cols_to_use + ["user_std_date_click", "user_std_date_open"] 133 | 134 | #print "Label encoding.." 135 | #for c in ["communication_type"]: 136 | # cols_to_use.append(c) 137 | # lbl = preprocessing.LabelEncoder() 138 | # lbl.fit(list(train_df[c].values.astype('str')) + list(test_df[c].values.astype('str'))) 139 | # train_df[c] = lbl.transform(list(train_df[c].values.astype('str'))) 140 | # test_df[c] = lbl.transform(list(test_df[c].values.astype('str'))) 141 | 142 | 143 | #print "Full Count encoding.." 
144 | #full_df = train_df.append(test_df) 145 | #print full_df.shape 146 | #for col in [["user_id"]]: 147 | # if isinstance(col, list): 148 | # col_name = "_".join(col) 149 | # train_df[col_name + "_full_count"] = np.array( getCountVar(train_df, full_df, col, 'id')) 150 | # test_df[col_name + "_full_count"] = np.array( getCountVar(test_df, full_df, col, 'id')) 151 | # cols_to_use.append(col_name + "_full_count") 152 | 153 | 154 | print "Count encoding.." 155 | for col in [["user_id"], ["user_id", "communication_type"]]: 156 | #for col in [["user_id"]]: 157 | train_enc_values = np.zeros(train_df.shape[0]) 158 | test_enc_values = 0 159 | for dev_index, val_index in kf.split(train_unique_campaigns): 160 | #for [dev_camp, val_camp] in camp_indices: 161 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 162 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 163 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getCountVar(val_X[col], dev_X, col, 'is_click')) 164 | test_enc_values += np.array( getCountVar(test_df[col], dev_X, col, 'is_click')) 165 | test_enc_values /= 5. 166 | if isinstance(col, list): 167 | col = "_".join(col) 168 | train_df[col + "_count"] = train_enc_values 169 | test_df[col + "_count"] = test_enc_values 170 | cols_to_use.append(col + "_count") 171 | 172 | 173 | 174 | print "Target encoding.." 175 | for col in [["user_id"], ["user_id", "communication_type"]]: 176 | #for col in [["user_id"]]: 177 | train_enc_values = np.zeros(train_df.shape[0]) 178 | test_enc_values = 0 179 | for dev_index, val_index in kf.split(train_unique_campaigns): 180 | #for [dev_camp, val_camp] in camp_indices: 181 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 182 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 183 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_click')) 184 | test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_click')) 185 | test_enc_values /= 5. 186 | if isinstance(col, list): 187 | col = "_".join(col) 188 | train_df[col + "_enc"] = train_enc_values 189 | test_df[col + "_enc"] = test_enc_values 190 | cols_to_use.append(col + "_enc") 191 | 192 | 193 | print "Open Target encoding.." 194 | for col in [["user_id"], ["user_id", "communication_type"]]: 195 | #for col in [["user_id"]]: 196 | train_enc_values = np.zeros(train_df.shape[0]) 197 | test_enc_values = 0 198 | for dev_index, val_index in kf.split(train_unique_campaigns): 199 | #for [dev_camp, val_camp] in camp_indices: 200 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 201 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 202 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_open')) 203 | test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_open')) 204 | test_enc_values /= 5. 205 | if isinstance(col, list): 206 | col = "_".join(col) 207 | train_df[col + "_open_enc"] = train_enc_values 208 | test_df[col + "_open_enc"] = test_enc_values 209 | cols_to_use.append(col + "_open_enc") 210 | 211 | 212 | 213 | 214 | """ 215 | print "Open Alone Target encoding.." 
216 | #for col in [["user_id"], ["user_id", "communication_type"], ["user_id", "no_of_sections"]]: 217 | for col in [["user_id"]]: 218 | train_enc_values = np.zeros(train_df.shape[0]) 219 | test_enc_values = 0 220 | for dev_index, val_index in kf.split(train_unique_campaigns): 221 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 222 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 223 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar2(val_X[col], dev_X, col, 'is_open')) 224 | test_enc_values += np.array( getDVEncodeVar2(test_df[col], dev_X, col, 'is_open')) 225 | test_enc_values /= 5. 226 | if isinstance(col, list): 227 | col = "_".join(col) 228 | train_df[col + "_open_sum_enc"] = train_enc_values 229 | test_df[col + "_open_sum_enc"] = test_enc_values 230 | cols_to_use.append(col + "_open_sum_enc") 231 | """ 232 | 233 | 234 | print cols_to_use 235 | train_X = train_df[cols_to_use] 236 | test_X = test_df[cols_to_use] 237 | print train_X.describe() 238 | print test_X.describe() 239 | 240 | #train_X.fillna(-1, inplace=True) 241 | #test_X.fillna(-1, inplace=True) 242 | 243 | print "Model building.." 244 | model_name = "LGB" 245 | cv_scores = [] 246 | pred_test_full = 0 247 | pred_val_full = np.zeros(train_df.shape[0]) 248 | for dev_index, val_index in kf.split(train_unique_campaigns): 249 | #for [dev_camp, val_camp] in camp_indices: 250 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 251 | dev_X, val_X = train_X[train_df['campaign_id'].isin(dev_camp)], train_X[train_df['campaign_id'].isin(val_camp)] 252 | dev_y, val_y = train_y[train_df['campaign_id'].isin(dev_camp)], train_y[train_df['campaign_id'].isin(val_camp)] 253 | print dev_X.shape, val_X.shape 254 | 255 | if model_name == "LGB": 256 | pred_val1, loss1, pred_test1 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4) 257 | pred_val2, loss2, pred_test2 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=2018) 258 | pred_val3, loss3, pred_test3 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=9876) 259 | pred_val = (pred_val1 + pred_val2 + pred_val3)/3. 260 | pred_test = (pred_test1 + pred_test2 + pred_test3)/3. 261 | loss = (loss1 + loss2 + loss3)/3. 262 | elif model_name == "ET": 263 | pred_val, loss, pred_test = runET(dev_X, dev_y, val_X, val_y, test_X, depth=20, leaf=20, feat=0.3) 264 | elif model_name == "LR": 265 | pred_val, loss, pred_test = runLR(dev_X, dev_y, val_X, val_y, test_X) 266 | 267 | pred_test_full += pred_test 268 | pred_val_full[train_df['campaign_id'].isin(val_camp)] = pred_val 269 | loss = metrics.roc_auc_score(train_y[train_df['campaign_id'].isin(val_camp)], pred_val) 270 | cv_scores.append(loss) 271 | print cv_scores 272 | pred_test_full /= 5. 
273 | print np.mean(cv_scores), metrics.roc_auc_score(train_y, pred_val_full) 274 | 275 | sub_df = pd.DataFrame({"id":test_id}) 276 | sub_df["is_click"] = pred_test_full 277 | sub_df.to_csv("srk_sub47.csv", index=False) 278 | 279 | 280 | 281 | -------------------------------------------------------------------------------- /2nd_Place_Mark_SRK/build_model_xgb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import metrics, model_selection, ensemble, preprocessing, linear_model 4 | import lightgbm as lgb 5 | import xgboost as xgb 6 | 7 | def getCountVar(compute_df, count_df, var_name, count_var="v1"): 8 | grouped_df = count_df.groupby(var_name)[count_var].agg('count').reset_index() 9 | grouped_df.columns = var_name + ["var_count"] 10 | 11 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 12 | merged_df.fillna(np.mean(grouped_df["var_count"].values), inplace=True) 13 | return list(merged_df["var_count"]) 14 | 15 | def getDVEncodeVar(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1): 16 | if type(var_name) != type([]): 17 | var_name = [var_name] 18 | grouped_df = target_df.groupby(var_name)[target_var].agg(["mean"]).reset_index() 19 | grouped_df.columns = var_name + ["mean_value"] 20 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 21 | merged_df.fillna(np.mean(target_df[target_var].values), inplace=True) 22 | return list(merged_df["mean_value"]) 23 | 24 | def getDVEncodeVar2(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1): 25 | if type(var_name) != type([]): 26 | var_name = [var_name] 27 | grouped_df = target_df.groupby(var_name)[target_var].agg(["sum"]).reset_index() 28 | grouped_df.columns = var_name + ["sum_value"] 29 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 30 | merged_df.fillna(np.mean(grouped_df["sum_value"].values), inplace=True) 31 | return list(merged_df["sum_value"]) 32 | 33 | 34 | def runLR(train_X, train_y, test_X, test_y=None, test_X2=None): 35 | model = linear_model.LogisticRegression(fit_intercept=True, C=0.3) 36 | model.fit(train_X, train_y) 37 | print model.coef_, model.intercept_ 38 | train_preds = model.predict_proba(train_X)[:,1] 39 | test_preds = model.predict_proba(test_X)[:,1] 40 | test_preds2 = model.predict_proba(test_X2)[:,1] 41 | test_loss = 0 42 | if test_y is not None: 43 | train_loss = metrics.roc_auc_score(train_y, train_preds) 44 | test_loss = metrics.roc_auc_score(test_y, test_preds) 45 | print "Train and Test loss : ", train_loss, test_loss 46 | return test_preds, test_loss, test_preds2 47 | 48 | def runET(train_X, train_y, test_X, test_y=None, test_X2=None, depth=10, leaf=5, feat=0.3): 49 | model = ensemble.ExtraTreesClassifier( 50 | n_estimators = 300, 51 | max_depth = depth, 52 | min_samples_split = 10, 53 | min_samples_leaf = leaf, 54 | max_features = feat, 55 | n_jobs = 6, 56 | random_state = 0) 57 | model.fit(train_X, train_y) 58 | train_preds = model.predict_proba(train_X)[:,1] 59 | test_preds = model.predict_proba(test_X)[:,1] 60 | test_preds2 = model.predict_proba(test_X2)[:,1] 61 | test_loss = 0 62 | if test_y is not None: 63 | train_loss = metrics.roc_auc_score(train_y, train_preds) 64 | test_loss = metrics.roc_auc_score(test_y, test_preds) 65 | print "Depth, leaf, feat : ", depth, leaf, feat 66 | print "Train and Test loss : ", train_loss, test_loss 67 | return test_preds, test_loss, test_preds2 68 | 69 | def runLGB(train_X, train_y, 
test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=3, eta=0.001): 70 | params = {} 71 | params["objective"] = "binary" 72 | params['metric'] = 'auc' 73 | params["max_depth"] = dep 74 | params["min_data_in_leaf"] = 100 75 | params["learning_rate"] = eta 76 | params["bagging_fraction"] = 0.7 77 | params["feature_fraction"] = 0.7 78 | params["bagging_freq"] = 5 79 | params["bagging_seed"] = seed_val 80 | params["verbosity"] = -1 81 | num_rounds = rounds 82 | 83 | plst = list(params.items()) 84 | lgtrain = lgb.Dataset(train_X, label=train_y) 85 | 86 | if test_y is not None: 87 | lgtest = lgb.Dataset(test_X, label=test_y) 88 | model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=100, verbose_eval=20) 89 | else: 90 | lgtest = lgb.Dataset(test_X) 91 | model = lgb.train(params, lgtrain, num_rounds) 92 | 93 | pred_test_y = model.predict(test_X, num_iteration=model.best_iteration) 94 | pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration) 95 | 96 | loss = 0 97 | if test_y is not None: 98 | loss = metrics.roc_auc_score(test_y, pred_test_y) 99 | print loss 100 | return pred_test_y, loss, pred_test_y2 101 | else: 102 | return pred_test_y, loss, pred_test_y2 103 | 104 | def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.001): 105 | params = {} 106 | params["objective"] = "binary:logistic" 107 | params['eval_metric'] = 'auc' 108 | params["eta"] = eta 109 | params["subsample"] = 0.7 110 | params["min_child_weight"] = 10 111 | params["colsample_bytree"] = 0.7 112 | params["max_depth"] = dep 113 | params["silent"] = 1 114 | params["seed"] = seed_val 115 | #params["max_delta_step"] = 2 116 | #params["gamma"] = 0.5 117 | num_rounds = rounds 118 | 119 | plst = list(params.items()) 120 | xgtrain = xgb.DMatrix(train_X, label=train_y) 121 | 122 | if test_y is not None: 123 | xgtest = xgb.DMatrix(test_X, label=test_y) 124 | watchlist = [ (xgtrain,'train'), (xgtest, 'test') ] 125 | model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100, verbose_eval=20) 126 | else: 127 | xgtest = xgb.DMatrix(test_X) 128 | model = xgb.train(plst, xgtrain, num_rounds) 129 | 130 | pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit) 131 | pred_test_y2 = model.predict(xgb.DMatrix(test_X2), ntree_limit=model.best_ntree_limit) 132 | 133 | loss = 0 134 | if test_y is not None: 135 | loss = metrics.log_loss(test_y, pred_test_y) 136 | print loss 137 | return pred_test_y, loss, pred_test_y2 138 | else: 139 | return pred_test_y, loss, pred_test_y2 140 | 141 | 142 | if __name__ == "__main__": 143 | print "Reading input files..." 144 | train_df = pd.read_csv("../input/train_feat.csv") 145 | test_df = pd.read_csv("../input/test_feat.csv") 146 | campaign_df = pd.read_csv("../input/campaign_data.csv") 147 | train_df["is_open_alone"] = train_df["is_click"].astype('float') / np.maximum(train_df["is_open"],1) 148 | print train_df.shape, test_df.shape 149 | print train_df.head() 150 | 151 | 152 | print np.sort(train_df["campaign_id"].unique()) 153 | #camp_indices = [[range(29, 47), range(47,56)], [range(47,56), range(29, 47)]] 154 | 155 | print "Merging with campaign data.."
156 | train_df = pd.merge(train_df, campaign_df, on="campaign_id") 157 | test_df = pd.merge(test_df, campaign_df, on="campaign_id") 158 | print train_df.shape, test_df.shape 159 | kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=98765) 160 | 161 | train_y_open = train_df["is_open"].values 162 | train_y = train_df["is_click"].values 163 | test_id = test_df["id"].values 164 | train_unique_campaigns = np.array(train_df["campaign_id"].unique()) 165 | cols_to_use = ["user_cum_count", "user_count", "user_date_diff", "user_camp_diff", "hour"] #, "total_links","no_of_internal_links","no_of_images","no_of_sections"] 166 | #cols_to_use = ["user_cum_count", "user_count", "user_camp_diff"] 167 | #cols_to_use = [] 168 | #cols_to_use = cols_to_use + ["first_open", "first_click", "second_open", "second_click", "third_open", "third_click"] 169 | cols_to_use = cols_to_use + ["user_min_date", "user_mean_date", "user_max_date", "user_std_date"] 170 | cols_to_use = cols_to_use + ["camp_"+str(i) for i in range(29,81)] + ["camps_sent"] 171 | #cols_to_use = cols_to_use + ["user_std_date_click", "user_std_date_open"] 172 | 173 | #print "Label encoding.." 174 | #for c in ["communication_type"]: 175 | # cols_to_use.append(c) 176 | # lbl = preprocessing.LabelEncoder() 177 | # lbl.fit(list(train_df[c].values.astype('str')) + list(test_df[c].values.astype('str'))) 178 | # train_df[c] = lbl.transform(list(train_df[c].values.astype('str'))) 179 | # test_df[c] = lbl.transform(list(test_df[c].values.astype('str'))) 180 | 181 | 182 | #print "Full Count encoding.." 183 | #full_df = train_df.append(test_df) 184 | #print full_df.shape 185 | #for col in [["user_id"]]: 186 | # if isinstance(col, list): 187 | # col_name = "_".join(col) 188 | # train_df[col_name + "_full_count"] = np.array( getCountVar(train_df, full_df, col, 'id')) 189 | # test_df[col_name + "_full_count"] = np.array( getCountVar(test_df, full_df, col, 'id')) 190 | # cols_to_use.append(col_name + "_full_count") 191 | 192 | 193 | print "Count encoding.." 194 | for col in [["user_id"], ["user_id", "communication_type"]]: 195 | #for col in [["user_id"]]: 196 | train_enc_values = np.zeros(train_df.shape[0]) 197 | test_enc_values = 0 198 | for dev_index, val_index in kf.split(train_unique_campaigns): 199 | #for [dev_camp, val_camp] in camp_indices: 200 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 201 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 202 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getCountVar(val_X[col], dev_X, col, 'is_click')) 203 | test_enc_values += np.array( getCountVar(test_df[col], dev_X, col, 'is_click')) 204 | test_enc_values /= 5. 205 | if isinstance(col, list): 206 | col = "_".join(col) 207 | train_df[col + "_count"] = train_enc_values 208 | test_df[col + "_count"] = test_enc_values 209 | cols_to_use.append(col + "_count") 210 | 211 | 212 | 213 | print "Target encoding.." 
214 | for col in [["user_id"], ["user_id", "communication_type"]]: 215 | #for col in [["user_id"]]: 216 | train_enc_values = np.zeros(train_df.shape[0]) 217 | test_enc_values = 0 218 | for dev_index, val_index in kf.split(train_unique_campaigns): 219 | #for [dev_camp, val_camp] in camp_indices: 220 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 221 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 222 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_click')) 223 | test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_click')) 224 | test_enc_values /= 5. 225 | if isinstance(col, list): 226 | col = "_".join(col) 227 | train_df[col + "_enc"] = train_enc_values 228 | test_df[col + "_enc"] = test_enc_values 229 | cols_to_use.append(col + "_enc") 230 | 231 | 232 | print "Open Target encoding.." 233 | for col in [["user_id"], ["user_id", "communication_type"]]: 234 | #for col in [["user_id"]]: 235 | train_enc_values = np.zeros(train_df.shape[0]) 236 | test_enc_values = 0 237 | for dev_index, val_index in kf.split(train_unique_campaigns): 238 | #for [dev_camp, val_camp] in camp_indices: 239 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 240 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 241 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_open')) 242 | test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_open')) 243 | test_enc_values /= 5. 244 | if isinstance(col, list): 245 | col = "_".join(col) 246 | train_df[col + "_open_enc"] = train_enc_values 247 | test_df[col + "_open_enc"] = test_enc_values 248 | cols_to_use.append(col + "_open_enc") 249 | 250 | 251 | 252 | 253 | """ 254 | print "Open Alone Target encoding.." 255 | #for col in [["user_id"], ["user_id", "communication_type"], ["user_id", "no_of_sections"]]: 256 | for col in [["user_id"]]: 257 | train_enc_values = np.zeros(train_df.shape[0]) 258 | test_enc_values = 0 259 | for dev_index, val_index in kf.split(train_unique_campaigns): 260 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 261 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 262 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar2(val_X[col], dev_X, col, 'is_open')) 263 | test_enc_values += np.array( getDVEncodeVar2(test_df[col], dev_X, col, 'is_open')) 264 | test_enc_values /= 5. 265 | if isinstance(col, list): 266 | col = "_".join(col) 267 | train_df[col + "_open_sum_enc"] = train_enc_values 268 | test_df[col + "_open_sum_enc"] = test_enc_values 269 | cols_to_use.append(col + "_open_sum_enc") 270 | """ 271 | 272 | 273 | print cols_to_use 274 | train_X = train_df[cols_to_use] 275 | test_X = test_df[cols_to_use] 276 | print train_X.describe() 277 | print test_X.describe() 278 | 279 | #train_X.fillna(-1, inplace=True) 280 | #test_X.fillna(-1, inplace=True) 281 | 282 | print "Model building.." 
283 | model_name = "XGB" 284 | cv_scores = [] 285 | pred_test_full = 0 286 | pred_val_full = np.zeros(train_df.shape[0]) 287 | for dev_index, val_index in kf.split(train_unique_campaigns): 288 | #for [dev_camp, val_camp] in camp_indices: 289 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 290 | dev_X, val_X = train_X[train_df['campaign_id'].isin(dev_camp)], train_X[train_df['campaign_id'].isin(val_camp)] 291 | dev_y, val_y = train_y[train_df['campaign_id'].isin(dev_camp)], train_y[train_df['campaign_id'].isin(val_camp)] 292 | print dev_X.shape, val_X.shape 293 | 294 | if model_name == "LGB": 295 | pred_val1, loss1, pred_test1 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4) 296 | pred_val2, loss2, pred_test2 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=2018) 297 | pred_val3, loss3, pred_test3 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=9876) 298 | pred_val = (pred_val1 + pred_val2 + pred_val3)/3. 299 | pred_test = (pred_test1 + pred_test2 + pred_test3)/3. 300 | loss = (loss1 + loss2 + loss3)/3. 301 | elif model_name == "XGB": 302 | pred_val1, loss1, pred_test1 = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4) 303 | pred_val2, loss2, pred_test2 = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=2018) 304 | pred_val3, loss3, pred_test3 = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=9876) 305 | pred_val = (pred_val1 + pred_val2 + pred_val3)/3. 306 | pred_test = (pred_test1 + pred_test2 + pred_test3)/3. 307 | loss = (loss1 + loss2 + loss3)/3. 308 | elif model_name == "ET": 309 | pred_val, loss, pred_test = runET(dev_X, dev_y, val_X, val_y, test_X, depth=20, leaf=20, feat=0.3) 310 | elif model_name == "LR": 311 | pred_val, loss, pred_test = runLR(dev_X, dev_y, val_X, val_y, test_X) 312 | 313 | pred_test_full += pred_test 314 | pred_val_full[train_df['campaign_id'].isin(val_camp)] = pred_val 315 | loss = metrics.roc_auc_score(train_y[train_df['campaign_id'].isin(val_camp)], pred_val) 316 | cv_scores.append(loss) 317 | print cv_scores 318 | pred_test_full /= 5. 319 | print np.mean(cv_scores), metrics.roc_auc_score(train_y, pred_val_full) 320 | 321 | sub_df = pd.DataFrame({"id":test_id}) 322 | sub_df["is_click"] = pred_test_full 323 | sub_df.to_csv("srk_sub48.csv", index=False) 324 | 325 | 326 | 327 | -------------------------------------------------------------------------------- /2nd_Place_Mark_SRK/ensemble.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | s1 = pd.read_csv("../Submissions/srk_sub47.csv") 5 | s2 = pd.read_csv("../Submissions/srk_sub48.csv") 6 | #s3 = pd.read_csv("../Submissions/srk_sub23.csv") 7 | #s4 = pd.read_csv("../Submissions/srk_sub24.csv") 8 | 9 | #s1["is_click"] = 0.35*(0.5*s1["is_click"] + 0.5*s2["is_click"]) + 0.65*(0.65*(s3["is_click"])+0.35*(s4["is_click"])) 10 | s1["is_click"] = 0.5*s1["is_click"] + 0.5*s2["is_click"] 11 | s1.to_csv("srk_sub49.csv", index=False) 12 | -------------------------------------------------------------------------------- /2nd_Place_Mark_SRK/readme.md: -------------------------------------------------------------------------------- 1 | ## Approach 2 | Most of our time was spent on creating new features. We used a validation split based on campaign ids. Our best single model is a LightGBM that scored 0.7051 on the LB.
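Before the feature list, here is a minimal sketch of the campaign-wise out-of-fold target encoding that build_model.py implements. This is a simplified reconstruction, not the exact code; it assumes the `train_feat.csv` intermediate created by the feature script, with the user_id / campaign_id / is_click columns used in this repo.

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

train = pd.read_csv('train_feat.csv')
camps = train['campaign_id'].unique()
enc = np.zeros(len(train))

# Fold by campaign, not by row, so a user's encoding for one campaign is
# computed only from other campaigns, mirroring the validation split.
kf = KFold(n_splits=5, shuffle=True, random_state=2017)
for dev_idx, val_idx in kf.split(camps):
    dev = train[train['campaign_id'].isin(camps[dev_idx])]
    val_mask = train['campaign_id'].isin(camps[val_idx]).to_numpy()
    user_mean = dev.groupby('user_id')['is_click'].mean()
    enc[val_mask] = (train.loc[val_mask, 'user_id']
                     .map(user_mean)
                     .fillna(dev['is_click'].mean())
                     .to_numpy())

train['user_id_enc'] = enc  # unseen users fall back to the fold's global mean
```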
The important features we used are: 3 | 4 | 1. Target encoding on the user ID, and on the user ID - communication type pair 5 | 2. Min, max, mean and standard deviation of the mail sent time. 6 | 3. One-hot encoding of the campaigns. 7 | 4. Time between the current mail and the previous mail 8 | 5. Number of campaigns in between the current mail and the previous mail 9 | 6. Total number of mail campaigns per user ID 10 | 7. Cumulative count of the mails at the user level 11 | 8. Hour of the mail 12 | 13 | ## How to run the code? 14 | Order of files to run: 15 | 1. Explorations.ipynb - Code file to create the features. 16 | 2. build_model.py - Code file to build the LightGBM model. 17 | 3. build_model_xgb.py - Code file to build the XGB model. 18 | 4. ensemble.py - Code file to blend the two sets of predictions. 19 | -------------------------------------------------------------------------------- /3rd_Place_Aditya_Akash/3rd_Place_Solution_Approach.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/analyticsvidhya/LordOftheMachines/5f8450eca5b941418f74a3d9934b1145f5d34d06/3rd_Place_Aditya_Akash/3rd_Place_Solution_Approach.docx -------------------------------------------------------------------------------- /3rd_Place_Aditya_Akash/final_ensemble-simple_avg.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "cnnlstm = pd.read_csv('./submission_cnnlstm.csv')\n", 20 | "cnn = pd.read_csv('./submission_cnn.csv')\n", 21 | "adamax = pd.read_csv('./submission_lstm.csv')\n", 22 | "\n", 23 | "lgbm_nt_45_5f = pd.read_csv('./lgb_5fold-5_bag_nt45_rank_average.csv')\n", 24 | "lgbm_nt_45_4f = pd.read_csv('./lgb_5fold-5_bag_nt45_rank_average_4f.csv')\n", 25 | "\n", 26 | "lgbm_nt_55_5f = pd.read_csv('./lgb_5fold-5_bag_nt55_rank_average_5f.csv')\n", 27 | "lgbm_nt_55_4f = pd.read_csv('./lgb_5fold-5_bag_nt45_rank_average_4f.csv')\n", 28 | "lgbm_nt_55 = pd.read_csv('./lgb_5fold-5_bag_nt55_rank_average.csv')\n", 29 | "\n", 30 | "xgb_2f = pd.read_csv('./xgb_2fold-cv2_bag3_nt70_scalepos1_nt70.csv')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "subm = xgb_2f.copy()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "#First ensemble: simple rank average of all\n", 49 | "\n", 50 | "test_shape0 = xgb_2f.shape[0]\n", 51 | "subm.loc[:,'is_click'] = (cnn.is_click.rank()/test_shape0 + cnnlstm.is_click.rank()/test_shape0 + adamax.is_click.rank()/test_shape0 +\\\n", 52 | "lgbm_nt_45_5f.is_click.rank()/test_shape0 + lgbm_nt_45_4f.is_click.rank()/test_shape0 + \\\n", 53 | "lgbm_nt_55_5f.is_click.rank()/test_shape0 + \\\n", 54 | "lgbm_nt_55_4f.is_click.rank()/test_shape0 + lgbm_nt_55.is_click.rank()/test_shape0 + \\\n", 55 | "xgb_2f.is_click.rank()/test_shape0)/9\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 5, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "subm.to_csv('./fin_ens_rank_average_all.csv',index=False)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 |
"outputs": [], 72 | "source": [] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.6.4" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 2 96 | } 97 | -------------------------------------------------------------------------------- /3rd_Place_Aditya_Akash/lgb_new_features-v6-5fold_5bag_cv_retry_lb_692_ens6941-submitted.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import lightgbm as lgb\n", 12 | "from sklearn.feature_extraction.text import CountVectorizer\n", 13 | "from sklearn.decomposition import TruncatedSVD\n", 14 | "import gc\n", 15 | "from sklearn.preprocessing import LabelEncoder\n", 16 | "from sklearn.model_selection import KFold" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 8, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "1961" 28 | ] 29 | }, 30 | "execution_count": 8, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "campaign = pd.read_csv('input/campaign_data.csv')\n", 37 | "'''\n", 38 | "vectorizer = CountVectorizer(ngram_range=(1,3))\n", 39 | "n_grams = vectorizer.fit_transform(campaign.subject)\n", 40 | "tsvd = TruncatedSVD(2,n_iter=250)\n", 41 | "tsvd_subject_feats = tsvd.fit_transform(n_grams)\n", 42 | "campaign['email_body'] = campaign.email_body.apply(lambda x: x.replace(\"\\r\\n\",\"\"))\n", 43 | "vectorizer = CountVectorizer(ngram_range=(1,4))\n", 44 | "n_grams = vectorizer.fit_transform(campaign.email_body)\n", 45 | "tsvd = TruncatedSVD(4,n_iter=250)\n", 46 | "tsvd_email_body_feats = tsvd.fit_transform(n_grams)\n", 47 | "for i in range(tsvd_subject_feats.shape[1]):\n", 48 | " campaign.loc[:,'sub_'+str(i)] = tsvd_subject_feats[:,i]\n", 49 | "for i in range(tsvd_email_body_feats.shape[1]):\n", 50 | " campaign.loc[:,'eb_'+str(i)] = tsvd_email_body_feats[:,i]\n", 51 | "'''\n", 52 | "campaign1 = campaign.drop(['subject','email_url','email_body'],axis=1)\n", 53 | "gc.collect()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 9, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "np.random.seed(123)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 10, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "def add_noise(series, noise_level):\n", 72 | " return series * (1 + noise_level * np.random.randn(len(series)))\n", 73 | "def target_encode(trn_series=None,val_series=None,\n", 74 | " tst_series=None,\n", 75 | " target=None,\n", 76 | " min_samples_leaf=1,\n", 77 | " smoothing=1,\n", 78 | " noise_level=0):\n", 79 | " \"\"\"\n", 80 | " Smoothing is computed like in the following paper by Daniele Micci-Barreca\n", 81 | " https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf\n", 82 | " trn_series : training categorical feature as a pd.Series\n", 83 | " tst_series : test categorical feature as a pd.Series\n", 84 | " target : 
target data as a pd.Series\n", 85 | " min_samples_leaf (int) : minimum samples to take category average into account\n", 86 | " smoothing (int) : smoothing effect to balance categorical average vs prior\n", 87 | " \"\"\"\n", 88 | " assert len(trn_series) == len(target)\n", 89 | " #assert trn_series.name == tst_series.name\n", 90 | " temp = pd.concat([trn_series, target], axis=1)\n", 91 | " # Compute target mean\n", 92 | " averages = temp.groupby(by=trn_series.name)[target.name].agg([\"mean\", \"count\"])\n", 93 | " # Compute smoothing\n", 94 | " smoothing = 1 / (1 + np.exp(-(averages[\"count\"] - min_samples_leaf) / smoothing))\n", 95 | " # Apply average function to all target data\n", 96 | " prior = target.mean()\n", 97 | " # The bigger the count the less full_avg is taken into account\n", 98 | " averages[target.name] = prior * (1 - smoothing) + averages[\"mean\"] * smoothing\n", 99 | " averages.drop([\"mean\", \"count\"], axis=1, inplace=True)\n", 100 | " # Apply averages to trn and tst series\n", 101 | " ft_trn_series = pd.merge(\n", 102 | " trn_series.to_frame(trn_series.name),\n", 103 | " averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),\n", 104 | " on=trn_series.name,\n", 105 | " how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)\n", 106 | " # pd.merge does not keep the index so restore it\n", 107 | " ft_trn_series.index = trn_series.index\n", 108 | " ft_val_series = pd.merge(\n", 109 | " val_series.to_frame(val_series.name),\n", 110 | " averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),\n", 111 | " on=val_series.name,\n", 112 | " how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)\n", 113 | " ft_val_series.index = val_series.index\n", 114 | " \n", 115 | " ft_tst_series = pd.merge(\n", 116 | " tst_series.to_frame(tst_series.name),\n", 117 | " averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),\n", 118 | " on=tst_series.name,\n", 119 | " how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)\n", 120 | " # pd.merge does not keep the index so restore it\n", 121 | " ft_tst_series.index = tst_series.index\n", 122 | " return add_noise(ft_trn_series, noise_level), ft_val_series,ft_tst_series" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 11, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "0" 134 | ] 135 | }, 136 | "execution_count": 11, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "train = pd.read_csv('input/train.csv')\n", 143 | "test = pd.read_csv('input/test.csv')\n", 144 | "gc.collect()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 12, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "all_data = pd.concat([train,test])" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 13, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "user_clust = pd.read_csv('./input/user_cluster1.csv')\n", 163 | "all_data = all_data.merge(user_clust,on='user_id',how='left')" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 14, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "all_data['send_date'] = all_data.send_date.apply(lambda x: pd.datetime.strptime(x,'%d-%m-%Y %H:%M'))" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 15, 178 | "metadata": {}, 179 | 
"outputs": [], 180 | "source": [ 181 | "all_data['send_dayofweek'] = all_data.send_date.dt.dayofweek" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 16, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "Index(['campaign_id', 'id', 'is_click', 'is_open', 'send_date', 'user_id',\n", 193 | " 'clust_id', 'send_dayofweek'],\n", 194 | " dtype='object')" 195 | ] 196 | }, 197 | "execution_count": 16, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "all_data.columns" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 17, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "#count features\n", 213 | "all_data['cnt_sd'] = all_data.groupby('send_date')['user_id'].transform('count')" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 18, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "all_data = all_data.merge(campaign1,on='campaign_id',how='left')" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 19, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "Index(['campaign_id', 'id', 'is_click', 'is_open', 'send_date', 'user_id',\n", 234 | " 'clust_id', 'send_dayofweek', 'cnt_sd', 'communication_type',\n", 235 | " 'total_links', 'no_of_internal_links', 'no_of_images',\n", 236 | " 'no_of_sections'],\n", 237 | " dtype='object')" 238 | ] 239 | }, 240 | "execution_count": 19, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | "source": [ 246 | "all_data.columns" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 20, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "le1 = LabelEncoder()\n", 256 | "all_data.loc[:,'communication_type'] = le1.fit_transform(all_data.communication_type) \n", 257 | "all_data['usr_cnt'] = all_data.groupby('user_id')['user_id'].transform('count')\n", 258 | "all_data['cm_cnt'] = np.log(all_data.groupby('communication_type')['communication_type'].transform('count'))\n", 259 | "#all_data['camp_cnt'] = all_data.groupby('campaign_id')['campaign_id'].transform('count')" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 21, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "test = all_data[len(train):]\n", 269 | "train = all_data[:len(train)]" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 22, 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/plain": [ 280 | "35" 281 | ] 282 | }, 283 | "execution_count": 22, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "#del all_data\n", 290 | "gc.collect()" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 23, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "lgb_params = {}\n", 300 | "lgb_params['learning_rate'] = 0.01\n", 301 | "lgb_params['num_leaves'] = 31\n", 302 | "lgb_params['max_depth'] = 5\n", 303 | "lgb_params['max_bin'] = 10\n", 304 | "lgb_params['min_data_in_leaf'] = 50\n", 305 | "lgb_params['subsample'] = 0.6\n", 306 | "lgb_params['colsample_bytree'] = 0.7\n", 307 | "lgb_params['feature_fraction'] = 0.7,\n", 308 | "lgb_params['bagging_fraction'] = 0.77,\n", 309 | "lgb_params['objective'] = 'binary'\n", 310 | "lgb_params['metric'] = {'auc'}\n", 311 | "lgb_params['verbose'] = 1\n", 312 | 
"lgb_params['scale_pos_weight'] = 1.\n", 313 | "lgb_params['boosting_type'] = 'gbdt'\n", 314 | "lgb_params['min_split_gain'] = 0.0001\n", 315 | "#lgb_params['bagging_fraction'] = 0.7\n", 316 | "lgb_params['bagging_freq'] = 100000\n" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 26, 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "name": "stderr", 326 | "output_type": "stream", 327 | "text": [ 328 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:6: SettingWithCopyWarning: \n", 329 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 330 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 331 | "\n", 332 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 333 | " \n", 334 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:8: SettingWithCopyWarning: \n", 335 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 336 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 337 | "\n", 338 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 339 | " \n" 340 | ] 341 | }, 342 | { 343 | "name": "stdout", 344 | "output_type": "stream", 345 | "text": [ 346 | "Fold: 1\n", 347 | "val_cid [29 30 31 32 33 34]\n", 348 | "(331628, 16) (691563, 16)\n" 349 | ] 350 | }, 351 | { 352 | "name": "stderr", 353 | "output_type": "stream", 354 | "text": [ 355 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:357: SettingWithCopyWarning: \n", 356 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 357 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 358 | "\n", 359 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 360 | " self.obj[key] = _infer_fill_value(value)\n", 361 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:537: SettingWithCopyWarning: \n", 362 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 363 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 364 | "\n", 365 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 366 | " self.obj[item] = s\n", 367 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:61: SettingWithCopyWarning: \n", 368 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 369 | "\n", 370 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 371 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:63: SettingWithCopyWarning: \n", 372 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 373 | "\n", 374 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 375 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:69: SettingWithCopyWarning: \n", 376 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 377 | "\n", 378 | "See the caveats in the documentation: 
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 379 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:70: SettingWithCopyWarning: \n", 380 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 381 | "\n", 382 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 383 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:82: SettingWithCopyWarning: \n", 384 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 385 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 386 | "\n", 387 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 388 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\lightgbm\\basic.py:1036: UserWarning: Using categorical_feature in Dataset.\n", 389 | " warnings.warn('Using categorical_feature in Dataset.')\n", 390 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\lightgbm\\basic.py:681: UserWarning: categorical_feature in param dict is overrided.\n", 391 | " warnings.warn('categorical_feature in param dict is overrided.')\n" 392 | ] 393 | }, 394 | { 395 | "name": "stdout", 396 | "output_type": "stream", 397 | "text": [ 398 | "Training until validation scores don't improve for 1000 rounds.\n", 399 | "[10]\ttrain's auc: 0.959499\tvalid's auc: 0.64165\n", 400 | "[20]\ttrain's auc: 0.960904\tvalid's auc: 0.682885\n", 401 | "[30]\ttrain's auc: 0.961472\tvalid's auc: 0.685754\n", 402 | "[40]\ttrain's auc: 0.961442\tvalid's auc: 0.685165\n", 403 | "[50]\ttrain's auc: 0.962194\tvalid's auc: 0.683764\n", 404 | "Did not meet early stopping. Best iteration is:\n", 405 | "[55]\ttrain's auc: 0.96278\tvalid's auc: 0.686568\n" 406 | ] 407 | }, 408 | { 409 | "name": "stderr", 410 | "output_type": "stream", 411 | "text": [ 412 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:95: SettingWithCopyWarning: \n", 413 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 414 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 415 | "\n", 416 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 417 | ] 418 | }, 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "Training until validation scores don't improve for 1000 rounds.\n", 424 | "[10]\ttrain's auc: 0.959771\tvalid's auc: 0.681249\n", 425 | "[20]\ttrain's auc: 0.958168\tvalid's auc: 0.688639\n", 426 | "[30]\ttrain's auc: 0.958047\tvalid's auc: 0.688835\n", 427 | "[40]\ttrain's auc: 0.960492\tvalid's auc: 0.688832\n", 428 | "[50]\ttrain's auc: 0.961439\tvalid's auc: 0.688582\n", 429 | "Did not meet early stopping. 
Best iteration is:\n", 430 | "[46]\ttrain's auc: 0.961925\tvalid's auc: 0.688432\n" 431 | ] 432 | }, 433 | { 434 | "name": "stderr", 435 | "output_type": "stream", 436 | "text": [ 437 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:194: SettingWithCopyWarning: \n", 438 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 439 | "\n", 440 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 441 | " self._setitem_with_indexer(indexer, value)\n", 442 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:94: SettingWithCopyWarning: \n", 443 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 444 | "\n", 445 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 446 | ] 447 | }, 448 | { 449 | "name": "stdout", 450 | "output_type": "stream", 451 | "text": [ 452 | "Training until validation scores don't improve for 1000 rounds.\n", 453 | "[10]\ttrain's auc: 0.962265\tvalid's auc: 0.679662\n", 454 | "[20]\ttrain's auc: 0.963529\tvalid's auc: 0.682266\n", 455 | "[30]\ttrain's auc: 0.963445\tvalid's auc: 0.685344\n", 456 | "[40]\ttrain's auc: 0.963438\tvalid's auc: 0.690044\n", 457 | "[50]\ttrain's auc: 0.964154\tvalid's auc: 0.689997\n", 458 | "Did not meet early stopping. Best iteration is:\n", 459 | "[55]\ttrain's auc: 0.964456\tvalid's auc: 0.687732\n", 460 | "Training until validation scores don't improve for 1000 rounds.\n", 461 | "[10]\ttrain's auc: 0.962724\tvalid's auc: 0.684908\n", 462 | "[20]\ttrain's auc: 0.961615\tvalid's auc: 0.685348\n", 463 | "[30]\ttrain's auc: 0.962596\tvalid's auc: 0.68143\n", 464 | "[40]\ttrain's auc: 0.962821\tvalid's auc: 0.682635\n", 465 | "[50]\ttrain's auc: 0.963031\tvalid's auc: 0.683734\n", 466 | "Did not meet early stopping. Best iteration is:\n", 467 | "[51]\ttrain's auc: 0.963166\tvalid's auc: 0.683732\n", 468 | "Training until validation scores don't improve for 1000 rounds.\n", 469 | "[10]\ttrain's auc: 0.956924\tvalid's auc: 0.684525\n", 470 | "[20]\ttrain's auc: 0.958931\tvalid's auc: 0.687749\n", 471 | "[30]\ttrain's auc: 0.961068\tvalid's auc: 0.687752\n", 472 | "[40]\ttrain's auc: 0.96252\tvalid's auc: 0.687513\n", 473 | "[50]\ttrain's auc: 0.962575\tvalid's auc: 0.687599\n", 474 | "Did not meet early stopping. Best iteration is:\n", 475 | "[37]\ttrain's auc: 0.962846\tvalid's auc: 0.687325\n", 476 | "Fold: 2\n", 477 | "val_cid [35 36 37 38 39]\n", 478 | "(95814, 16) (927377, 16)\n", 479 | "Training until validation scores don't improve for 1000 rounds.\n", 480 | "[10]\ttrain's auc: 0.957137\tvalid's auc: 0.744188\n", 481 | "[20]\ttrain's auc: 0.956535\tvalid's auc: 0.754693\n", 482 | "[30]\ttrain's auc: 0.957489\tvalid's auc: 0.758433\n", 483 | "[40]\ttrain's auc: 0.95723\tvalid's auc: 0.761839\n", 484 | "[50]\ttrain's auc: 0.957015\tvalid's auc: 0.761785\n", 485 | "Did not meet early stopping. 
Best iteration is:\n", 486 | "[34]\ttrain's auc: 0.957649\tvalid's auc: 0.757474\n", 487 | "Training until validation scores don't improve for 1000 rounds.\n", 488 | "[10]\ttrain's auc: 0.952665\tvalid's auc: 0.759722\n", 489 | "[20]\ttrain's auc: 0.955711\tvalid's auc: 0.755574\n", 490 | "[30]\ttrain's auc: 0.956282\tvalid's auc: 0.75638\n", 491 | "[40]\ttrain's auc: 0.956652\tvalid's auc: 0.76035\n", 492 | "[50]\ttrain's auc: 0.956908\tvalid's auc: 0.762324\n", 493 | "Did not meet early stopping. Best iteration is:\n", 494 | "[54]\ttrain's auc: 0.957156\tvalid's auc: 0.761822\n", 495 | "Training until validation scores don't improve for 1000 rounds.\n", 496 | "[10]\ttrain's auc: 0.949622\tvalid's auc: 0.745809\n", 497 | "[20]\ttrain's auc: 0.954399\tvalid's auc: 0.755183\n", 498 | "[30]\ttrain's auc: 0.95521\tvalid's auc: 0.763086\n", 499 | "[40]\ttrain's auc: 0.955115\tvalid's auc: 0.763426\n", 500 | "[50]\ttrain's auc: 0.956063\tvalid's auc: 0.762984\n", 501 | "Did not meet early stopping. Best iteration is:\n", 502 | "[54]\ttrain's auc: 0.956642\tvalid's auc: 0.762852\n", 503 | "Training until validation scores don't improve for 1000 rounds.\n", 504 | "[10]\ttrain's auc: 0.956036\tvalid's auc: 0.740168\n", 505 | "[20]\ttrain's auc: 0.956537\tvalid's auc: 0.759695\n", 506 | "[30]\ttrain's auc: 0.957196\tvalid's auc: 0.761407\n", 507 | "[40]\ttrain's auc: 0.957056\tvalid's auc: 0.75995\n", 508 | "[50]\ttrain's auc: 0.957319\tvalid's auc: 0.7601\n", 509 | "Did not meet early stopping. Best iteration is:\n", 510 | "[54]\ttrain's auc: 0.957449\tvalid's auc: 0.76011\n", 511 | "Training until validation scores don't improve for 1000 rounds.\n", 512 | "[10]\ttrain's auc: 0.957147\tvalid's auc: 0.740874\n", 513 | "[20]\ttrain's auc: 0.957653\tvalid's auc: 0.741151\n", 514 | "[30]\ttrain's auc: 0.957485\tvalid's auc: 0.74016\n", 515 | "[40]\ttrain's auc: 0.957554\tvalid's auc: 0.753127\n", 516 | "[50]\ttrain's auc: 0.957298\tvalid's auc: 0.752856\n", 517 | "Did not meet early stopping. Best iteration is:\n", 518 | "[18]\ttrain's auc: 0.957767\tvalid's auc: 0.741149\n", 519 | "Fold: 3\n", 520 | "val_cid [40 41 42 43 44]\n", 521 | "(128426, 16) (894765, 16)\n", 522 | "Training until validation scores don't improve for 1000 rounds.\n", 523 | "[10]\ttrain's auc: 0.959043\tvalid's auc: 0.724628\n", 524 | "[20]\ttrain's auc: 0.958536\tvalid's auc: 0.725294\n", 525 | "[30]\ttrain's auc: 0.958994\tvalid's auc: 0.724168\n", 526 | "[40]\ttrain's auc: 0.958844\tvalid's auc: 0.723607\n", 527 | "[50]\ttrain's auc: 0.959404\tvalid's auc: 0.724542\n", 528 | "Did not meet early stopping. Best iteration is:\n", 529 | "[13]\ttrain's auc: 0.95958\tvalid's auc: 0.725282\n", 530 | "Training until validation scores don't improve for 1000 rounds.\n", 531 | "[10]\ttrain's auc: 0.957196\tvalid's auc: 0.723267\n", 532 | "[20]\ttrain's auc: 0.957568\tvalid's auc: 0.726999\n", 533 | "[30]\ttrain's auc: 0.957381\tvalid's auc: 0.727038\n", 534 | "[40]\ttrain's auc: 0.958127\tvalid's auc: 0.728767\n", 535 | "[50]\ttrain's auc: 0.958506\tvalid's auc: 0.728943\n", 536 | "Did not meet early stopping. 
Best iteration is:\n", 537 | "[51]\ttrain's auc: 0.958598\tvalid's auc: 0.728967\n", 538 | "Training until validation scores don't improve for 1000 rounds.\n", 539 | "[10]\ttrain's auc: 0.957409\tvalid's auc: 0.724771\n", 540 | "[20]\ttrain's auc: 0.957839\tvalid's auc: 0.726644\n", 541 | "[30]\ttrain's auc: 0.957793\tvalid's auc: 0.726089\n", 542 | "[40]\ttrain's auc: 0.958122\tvalid's auc: 0.724817\n", 543 | "[50]\ttrain's auc: 0.958601\tvalid's auc: 0.724415\n", 544 | "Did not meet early stopping. Best iteration is:\n", 545 | "[55]\ttrain's auc: 0.959014\tvalid's auc: 0.724543\n", 546 | "Training until validation scores don't improve for 1000 rounds.\n", 547 | "[10]\ttrain's auc: 0.955877\tvalid's auc: 0.722508\n", 548 | "[20]\ttrain's auc: 0.957748\tvalid's auc: 0.72571\n", 549 | "[30]\ttrain's auc: 0.958169\tvalid's auc: 0.725367\n", 550 | "[40]\ttrain's auc: 0.958399\tvalid's auc: 0.725168\n", 551 | "[50]\ttrain's auc: 0.95847\tvalid's auc: 0.725506\n", 552 | "Did not meet early stopping. Best iteration is:\n", 553 | "[55]\ttrain's auc: 0.958574\tvalid's auc: 0.726132\n", 554 | "Training until validation scores don't improve for 1000 rounds.\n", 555 | "[10]\ttrain's auc: 0.957648\tvalid's auc: 0.719033\n", 556 | "[20]\ttrain's auc: 0.958564\tvalid's auc: 0.720191\n", 557 | "[30]\ttrain's auc: 0.958778\tvalid's auc: 0.722423\n", 558 | "[40]\ttrain's auc: 0.958336\tvalid's auc: 0.72308\n", 559 | "[50]\ttrain's auc: 0.958835\tvalid's auc: 0.723032\n", 560 | "Did not meet early stopping. Best iteration is:\n", 561 | "[52]\ttrain's auc: 0.958914\tvalid's auc: 0.723603\n", 562 | "Fold: 4\n", 563 | "val_cid [45 46 47 48 49]\n", 564 | "(162197, 16) (860994, 16)\n", 565 | "Training until validation scores don't improve for 1000 rounds.\n", 566 | "[10]\ttrain's auc: 0.956501\tvalid's auc: 0.699103\n", 567 | "[20]\ttrain's auc: 0.956945\tvalid's auc: 0.699137\n", 568 | "[30]\ttrain's auc: 0.957919\tvalid's auc: 0.699498\n", 569 | "[40]\ttrain's auc: 0.958046\tvalid's auc: 0.703426\n", 570 | "[50]\ttrain's auc: 0.959006\tvalid's auc: 0.711483\n", 571 | "Did not meet early stopping. Best iteration is:\n", 572 | "[55]\ttrain's auc: 0.959152\tvalid's auc: 0.712056\n", 573 | "Training until validation scores don't improve for 1000 rounds.\n", 574 | "[10]\ttrain's auc: 0.958598\tvalid's auc: 0.71064\n", 575 | "[20]\ttrain's auc: 0.959706\tvalid's auc: 0.715877\n", 576 | "[30]\ttrain's auc: 0.959755\tvalid's auc: 0.714485\n", 577 | "[40]\ttrain's auc: 0.95969\tvalid's auc: 0.711536\n", 578 | "[50]\ttrain's auc: 0.96003\tvalid's auc: 0.710615\n", 579 | "Did not meet early stopping. Best iteration is:\n", 580 | "[48]\ttrain's auc: 0.960204\tvalid's auc: 0.710831\n", 581 | "Training until validation scores don't improve for 1000 rounds.\n", 582 | "[10]\ttrain's auc: 0.956184\tvalid's auc: 0.714624\n", 583 | "[20]\ttrain's auc: 0.95912\tvalid's auc: 0.711213\n", 584 | "[30]\ttrain's auc: 0.959379\tvalid's auc: 0.710565\n", 585 | "[40]\ttrain's auc: 0.959882\tvalid's auc: 0.710499\n", 586 | "[50]\ttrain's auc: 0.959987\tvalid's auc: 0.711409\n", 587 | "Did not meet early stopping. 
Best iteration is:\n", 588 | "[49]\ttrain's auc: 0.960066\tvalid's auc: 0.71145\n", 589 | "Training until validation scores don't improve for 1000 rounds.\n", 590 | "[10]\ttrain's auc: 0.957558\tvalid's auc: 0.694027\n", 591 | "[20]\ttrain's auc: 0.959607\tvalid's auc: 0.711264\n", 592 | "[30]\ttrain's auc: 0.959002\tvalid's auc: 0.710682\n", 593 | "[40]\ttrain's auc: 0.95812\tvalid's auc: 0.702361\n", 594 | "[50]\ttrain's auc: 0.9582\tvalid's auc: 0.702199\n", 595 | "Did not meet early stopping. Best iteration is:\n", 596 | "[20]\ttrain's auc: 0.959607\tvalid's auc: 0.711264\n", 597 | "Training until validation scores don't improve for 1000 rounds.\n", 598 | "[10]\ttrain's auc: 0.956362\tvalid's auc: 0.715147\n", 599 | "[20]\ttrain's auc: 0.956859\tvalid's auc: 0.714264\n", 600 | "[30]\ttrain's auc: 0.958672\tvalid's auc: 0.71371\n", 601 | "[40]\ttrain's auc: 0.95855\tvalid's auc: 0.704427\n", 602 | "[50]\ttrain's auc: 0.959516\tvalid's auc: 0.704593\n", 603 | "Did not meet early stopping. Best iteration is:\n", 604 | "[52]\ttrain's auc: 0.959685\tvalid's auc: 0.704602\n", 605 | "Fold: 5\n", 606 | "val_cid [50 51 52 53 54]\n", 607 | "(305126, 16) (718065, 16)\n", 608 | "Training until validation scores don't improve for 1000 rounds.\n", 609 | "[10]\ttrain's auc: 0.959101\tvalid's auc: 0.648002\n", 610 | "[20]\ttrain's auc: 0.960381\tvalid's auc: 0.650036\n", 611 | "[30]\ttrain's auc: 0.960476\tvalid's auc: 0.654387\n", 612 | "[40]\ttrain's auc: 0.96103\tvalid's auc: 0.654446\n", 613 | "[50]\ttrain's auc: 0.961455\tvalid's auc: 0.654559\n", 614 | "Did not meet early stopping. Best iteration is:\n", 615 | "[53]\ttrain's auc: 0.9617\tvalid's auc: 0.654462\n", 616 | "Training until validation scores don't improve for 1000 rounds.\n", 617 | "[10]\ttrain's auc: 0.959621\tvalid's auc: 0.654011\n", 618 | "[20]\ttrain's auc: 0.960735\tvalid's auc: 0.653463\n", 619 | "[30]\ttrain's auc: 0.960836\tvalid's auc: 0.6549\n", 620 | "[40]\ttrain's auc: 0.961172\tvalid's auc: 0.654521\n", 621 | "[50]\ttrain's auc: 0.961256\tvalid's auc: 0.653726\n" 622 | ] 623 | }, 624 | { 625 | "name": "stdout", 626 | "output_type": "stream", 627 | "text": [ 628 | "Did not meet early stopping. Best iteration is:\n", 629 | "[17]\ttrain's auc: 0.961457\tvalid's auc: 0.654111\n", 630 | "Training until validation scores don't improve for 1000 rounds.\n", 631 | "[10]\ttrain's auc: 0.95697\tvalid's auc: 0.65449\n", 632 | "[20]\ttrain's auc: 0.960161\tvalid's auc: 0.655808\n", 633 | "[30]\ttrain's auc: 0.960082\tvalid's auc: 0.655498\n", 634 | "[40]\ttrain's auc: 0.960535\tvalid's auc: 0.655341\n", 635 | "[50]\ttrain's auc: 0.960987\tvalid's auc: 0.654711\n", 636 | "Did not meet early stopping. Best iteration is:\n", 637 | "[55]\ttrain's auc: 0.961063\tvalid's auc: 0.654381\n", 638 | "Training until validation scores don't improve for 1000 rounds.\n", 639 | "[10]\ttrain's auc: 0.958435\tvalid's auc: 0.650125\n", 640 | "[20]\ttrain's auc: 0.960372\tvalid's auc: 0.650232\n", 641 | "[30]\ttrain's auc: 0.961111\tvalid's auc: 0.652803\n", 642 | "[40]\ttrain's auc: 0.960978\tvalid's auc: 0.654544\n", 643 | "[50]\ttrain's auc: 0.961009\tvalid's auc: 0.65462\n", 644 | "Did not meet early stopping. 
Best iteration is:\n", 645 | "[45]\ttrain's auc: 0.961195\tvalid's auc: 0.655012\n", 646 | "Training until validation scores don't improve for 1000 rounds.\n", 647 | "[10]\ttrain's auc: 0.960162\tvalid's auc: 0.654207\n", 648 | "[20]\ttrain's auc: 0.961086\tvalid's auc: 0.653852\n", 649 | "[30]\ttrain's auc: 0.961252\tvalid's auc: 0.654194\n", 650 | "[40]\ttrain's auc: 0.961249\tvalid's auc: 0.654152\n", 651 | "[50]\ttrain's auc: 0.961266\tvalid's auc: 0.654177\n", 652 | "Did not meet early stopping. Best iteration is:\n", 653 | "[55]\ttrain's auc: 0.961397\tvalid's auc: 0.654149\n" 654 | ] 655 | } 656 | ], 657 | "source": [ 658 | "nfold =5\n", 659 | "kf = KFold(n_splits=nfold,random_state=123,shuffle=False)\n", 660 | "unq_campaign_id = np.sort(train.campaign_id.unique())\n", 661 | "\n", 662 | "test_subm = test[['id']]\n", 663 | "test_subm['is_click'] = 0\n", 664 | "train_score = train[['is_click']]\n", 665 | "train_score['pred'] = 0\n", 666 | "nbag = 5\n", 667 | "cf =0\n", 668 | "for train_index, test_index in kf.split(unq_campaign_id):\n", 669 | " cf+=1\n", 670 | " print('Fold:',cf)\n", 671 | " \n", 672 | " test1 = test.copy()\n", 673 | " tr_cid = unq_campaign_id[train_index]\n", 674 | " val_cid = unq_campaign_id[test_index]\n", 675 | " print('val_cid',val_cid)\n", 676 | "\n", 677 | " val = train[train.campaign_id.isin(val_cid)]\n", 678 | " train1 = train[train.campaign_id.isin(tr_cid)]\n", 679 | " print(val.shape,train1.shape)\n", 680 | "\n", 681 | " a1,a2,a3 = target_encode(train1['user_id'],val['user_id'],\n", 682 | " test1['user_id'],train1.is_click,noise_level=.9,smoothing=5)\n", 683 | " train1.loc[:,'mean_is_click'] = a1\n", 684 | " val.loc[:,'mean_is_click'] = a2\n", 685 | " test1.loc[:,'mean_is_click'] = a3\n", 686 | "\n", 687 | "\n", 688 | " a1,a2,a3 = target_encode(train1['user_id'],val['user_id'],\n", 689 | " test1['user_id'],train1.is_open,noise_level=.9,smoothing=1.)\n", 690 | " train1.loc[:,'mean_is_open'] = a1\n", 691 | " val.loc[:,'mean_is_open'] = a2\n", 692 | " test1.loc[:,'mean_is_open'] = a3\n", 693 | "\n", 694 | "\n", 695 | " a1,a2,a3 = target_encode(train1['communication_type'],val['communication_type'],\n", 696 | " test1['communication_type'],train1.is_open,noise_level=0)\n", 697 | " train1.loc[:,'mean_ct'] = a1\n", 698 | " val.loc[:,'mean_ct'] = a2\n", 699 | " test1.loc[:,'mean_ct'] = a3\n", 700 | "\n", 701 | " a1,a2,a3 = target_encode(train1['communication_type'],val['communication_type'],\n", 702 | " test1['communication_type'],train1.is_click,noise_level=0)\n", 703 | " train1.loc[:,'mean_clk_ct'] = a1\n", 704 | " val.loc[:,'mean_clk_ct'] = a2\n", 705 | " test1.loc[:,'mean_clk_ct'] = a3\n", 706 | "\n", 707 | "\n", 708 | " a1,a2,a3 = target_encode(train1['clust_id'],val['clust_id'],\n", 709 | " test1['clust_id'],train1.is_click,noise_level=0)\n", 710 | " train1.loc[:,'mean_clk_clust_id'] = a1\n", 711 | " val.loc[:,'mean_clk_clust_id'] = a2\n", 712 | " test1.loc[:,'mean_clk_clust_id'] = a3\n", 713 | "\n", 714 | "\n", 715 | "\n", 716 | " gc.collect()\n", 717 | " val.drop(['id','campaign_id','is_open','send_date',\n", 718 | " 'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)\n", 719 | " train1.drop(['id','campaign_id','is_open','send_date',\n", 720 | " 'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)\n", 721 | " test1.drop(['id','campaign_id','is_open','send_date',\n", 722 | " 'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)\n", 723 | " gc.collect()\n", 
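" # pull out the is_click labels, then drop the target column from the train/val/test feature frames so it cannot leak into the model inputs\n",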
724 | " train_y = train1.is_click.values\n", 725 | " val_y = val.is_click.values\n", 726 | " val.drop(['is_click'],axis=1,inplace=True)\n", 727 | " train1.drop(['is_click'],axis=1,inplace=True)\n", 728 | " test1.drop(['is_click'],axis=1,inplace=True)\n", 729 | " \n", 730 | " lgtrain = lgb.Dataset(train1, label=train_y,categorical_feature=['communication_type','send_dayofweek','clust_id'],\n", 731 | " free_raw_data=False)\n", 732 | " lgvalid = lgb.Dataset(val, label=val_y,categorical_feature=['communication_type','send_dayofweek','clust_id'],\n", 733 | " free_raw_data=False)\n", 734 | " gc.collect()\n", 735 | " \n", 736 | " evals_results = {}\n", 737 | " np.random.seed(0)\n", 738 | " \n", 739 | " test_subm['is_click'+str(cf)]=0\n", 740 | " \n", 741 | " for bg in range(nbag):\n", 742 | " lgb_params['feature_fraction_seed'] = 100*cf + bg\n", 743 | " bst1 = lgb.train(lgb_params, \n", 744 | " lgtrain, \n", 745 | " valid_sets=[lgtrain, lgvalid], \n", 746 | " valid_names=['train','valid'], \n", 747 | " evals_result=evals_results, \n", 748 | " num_boost_round=55,\n", 749 | " early_stopping_rounds=1000,\n", 750 | " verbose_eval=10)\n", 751 | " train_score.loc[val.index,'pred'] += bst1.predict(val[train1.columns],num_iteration=51)\n", 752 | " test_subm['is_click'+str(cf)] += bst1.predict(test1[train1.columns],num_iteration=51)\n", 753 | " " 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 27, 759 | "metadata": {}, 760 | "outputs": [], 761 | "source": [ 762 | "from sklearn.metrics import roc_auc_score" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": 28, 768 | "metadata": {}, 769 | "outputs": [ 770 | { 771 | "name": "stderr", 772 | "output_type": "stream", 773 | "text": [ 774 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 775 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 776 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 777 | "\n", 778 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 779 | " \"\"\"Entry point for launching an IPython kernel.\n" 780 | ] 781 | } 782 | ], 783 | "source": [ 784 | "train_score['pred']/=nbag" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": 29, 790 | "metadata": {}, 791 | "outputs": [ 792 | { 793 | "data": { 794 | "text/plain": [ 795 | "0.6198733764612789" 796 | ] 797 | }, 798 | "execution_count": 29, 799 | "metadata": {}, 800 | "output_type": "execute_result" 801 | } 802 | ], 803 | "source": [ 804 | "roc_auc_score(train_score.is_click,train_score.pred)" 805 | ] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": 30, 810 | "metadata": {}, 811 | "outputs": [ 812 | { 813 | "name": "stderr", 814 | "output_type": "stream", 815 | "text": [ 816 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:537: SettingWithCopyWarning: \n", 817 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 818 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 819 | "\n", 820 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 821 | " self.obj[item] = s\n" 822 | ] 823 | } 824 | ], 825 | "source": [ 826 | "test_subm.loc[:,'is_click'] = (test_subm['is_click1'].rank()/test_subm.shape[0] +\\\n", 827 | 
"test_subm['is_click2'].rank()/test_subm.shape[0] + test_subm['is_click3'].rank()/test_subm.shape[0])/nfold" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": 32, 833 | "metadata": {}, 834 | "outputs": [], 835 | "source": [ 836 | "test_subm[['id','is_click']].to_csv('./lgb_5fold-5_bag_nt55_rank_average.csv',index=False)" 837 | ] 838 | } 839 | ], 840 | "metadata": { 841 | "kernelspec": { 842 | "display_name": "Python 3", 843 | "language": "python", 844 | "name": "python3" 845 | }, 846 | "language_info": { 847 | "codemirror_mode": { 848 | "name": "ipython", 849 | "version": 3 850 | }, 851 | "file_extension": ".py", 852 | "mimetype": "text/x-python", 853 | "name": "python", 854 | "nbconvert_exporter": "python", 855 | "pygments_lexer": "ipython3", 856 | "version": "3.6.4" 857 | } 858 | }, 859 | "nbformat": 4, 860 | "nbformat_minor": 2 861 | } 862 | -------------------------------------------------------------------------------- /3rd_Place_Aditya_Akash/lstm_cnn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "colab": { 8 | "autoexec": { 9 | "startup": false, 10 | "wait_interval": 0 11 | } 12 | }, 13 | "colab_type": "code", 14 | "id": "1wB8EOLGKyPE" 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from keras.models import Sequential\n", 19 | "from keras.layers.core import Dense, Dropout, Activation\n", 20 | "from keras.layers import Merge, TimeDistributed, concatenate, Bidirectional, Masking, RepeatVector\n", 21 | "from keras.layers.embeddings import Embedding\n", 22 | "from keras.layers.recurrent import LSTM, GRU, SimpleRNN\n", 23 | "from keras.preprocessing.sequence import pad_sequences\n", 24 | "from keras.layers.convolutional import Convolution1D, MaxPooling1D\n", 25 | "from keras.callbacks import Callback\n", 26 | "from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU, Conv2D\n", 27 | "from keras.callbacks import Callback, History\n", 28 | "from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten\n", 29 | "from keras.preprocessing import text, sequence\n", 30 | "from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D, Masking\n", 31 | "from keras import initializers, regularizers, constraints, optimizers, layers, callbacks\n", 32 | "from keras.callbacks import EarlyStopping,ModelCheckpoint\n", 33 | "from keras.models import Model\n", 34 | "from keras.optimizers import Adam\n", 35 | "from sklearn.model_selection import train_test_split\n", 36 | "from sklearn.metrics import accuracy_score\n", 37 | "from sklearn.metrics import roc_auc_score" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 0, 43 | "metadata": { 44 | "colab": { 45 | "autoexec": { 46 | "startup": false, 47 | "wait_interval": 0 48 | } 49 | }, 50 | "colab_type": "code", 51 | "id": "a7YM8iOuKyCU" 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "%matplotlib inline\n", 56 | "import pandas as pd\n", 57 | "import numpy as np\n", 58 | "# import lightgbm as lgb\n", 59 | "from sklearn.linear_model import LogisticRegression\n", 60 | "from sklearn.model_selection import train_test_split\n", 61 | "from sklearn.preprocessing import MaxAbsScaler,MinMaxScaler\n", 62 | "from sklearn.pipeline import Pipeline\n", 63 | "import matplotlib.pyplot as plt" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 0, 69 | "metadata": { 70 | "colab": { 
71 | "autoexec": { 72 | "startup": false, 73 | "wait_interval": 0 74 | } 75 | }, 76 | "colab_type": "code", 77 | "id": "oa8zhndYKyCa" 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "train= pd.read_csv('train.csv')\n", 82 | "test= pd.read_csv('test_BDIfz5B.csv')\n", 83 | "campaign_df= pd.read_csv('campaign_data.csv')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 0, 89 | "metadata": { 90 | "colab": { 91 | "autoexec": { 92 | "startup": false, 93 | "wait_interval": 0 94 | } 95 | }, 96 | "colab_type": "code", 97 | "id": "BM5dBdV_KyEY" 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "train_df= train.merge(campaign_df,how='left')\n", 102 | "test_df= test.merge(campaign_df,how='left')" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 0, 108 | "metadata": { 109 | "colab": { 110 | "autoexec": { 111 | "startup": false, 112 | "wait_interval": 0 113 | } 114 | }, 115 | "colab_type": "code", 116 | "id": "IzhZ-LitKyEf" 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "train_df.send_date=pd.to_datetime(train_df.send_date,format=\"%d-%m-%Y %H:%M\")" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 0, 126 | "metadata": { 127 | "colab": { 128 | "autoexec": { 129 | "startup": false, 130 | "wait_interval": 0 131 | } 132 | }, 133 | "colab_type": "code", 134 | "id": "FEYTJLOrKyMb" 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "# Form sentences for clicks per user, open per user, etc\n", 139 | "sentences=train_df.groupby('user_id').is_click.apply(lambda x: list(x))" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 0, 145 | "metadata": { 146 | "colab": { 147 | "autoexec": { 148 | "startup": false, 149 | "wait_interval": 0 150 | } 151 | }, 152 | "colab_type": "code", 153 | "id": "FjfydUWPKyMl" 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "sentences_open = train_df.groupby('user_id').is_open.apply(lambda x: list(x))" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 0, 163 | "metadata": { 164 | "colab": { 165 | "autoexec": { 166 | "startup": false, 167 | "wait_interval": 0 168 | } 169 | }, 170 | "colab_type": "code", 171 | "id": "onK2KcK1KyNP" 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "train_df['communication_id'],invercom= pd.factorize(train_df.communication_type)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 0, 181 | "metadata": { 182 | "colab": { 183 | "autoexec": { 184 | "startup": false, 185 | "wait_interval": 0 186 | } 187 | }, 188 | "colab_type": "code", 189 | "id": "m8G_5wGJKyNd" 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "sentences_sec = train_df.groupby('user_id').no_of_sections.apply(lambda x: list(x))" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 0, 199 | "metadata": { 200 | "colab": { 201 | "autoexec": { 202 | "startup": false, 203 | "wait_interval": 0 204 | } 205 | }, 206 | "colab_type": "code", 207 | "id": "1-vjG9j1KyNg" 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "sentences_com = train_df.groupby('user_id').no_of_images.apply(lambda x: list(x))" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 0, 217 | "metadata": { 218 | "colab": { 219 | "autoexec": { 220 | "startup": false, 221 | "wait_interval": 0 222 | } 223 | }, 224 | "colab_type": "code", 225 | "id": "V19wlcxowGs1" 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "sentences_links = train_df.groupby('user_id').total_links.apply(lambda x: list(x))" 230 | ] 
231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 0, 235 | "metadata": { 236 | "colab": { 237 | "autoexec": { 238 | "startup": false, 239 | "wait_interval": 0 240 | } 241 | }, 242 | "colab_type": "code", 243 | "id": "TJRWvekdKyNk" 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "sennew=list(zip(sentences, sentences_open,sentences_sec,sentences_com,sentences_links))" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 0, 253 | "metadata": { 254 | "colab": { 255 | "autoexec": { 256 | "startup": false, 257 | "wait_interval": 0 258 | } 259 | }, 260 | "colab_type": "code", 261 | "id": "oLH1TRCcKyNl" 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "# make an n_rows * n_timesteps * n_features array for all sequences\n", 266 | "sequences_ori=pad_sequences(pd.Series(sennew).apply(lambda x: list(zip(x[0],x[1],x[2],x[3]))).tolist(),value=-1,padding='pre')" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 0, 272 | "metadata": { 273 | "colab": { 274 | "autoexec": { 275 | "startup": false, 276 | "wait_interval": 0 277 | } 278 | }, 279 | "colab_type": "code", 280 | "id": "jPCXglb8KyOA" 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "# keep only users with more than one email in their history\n", 285 | "sentences2=sentences[sentences.apply(lambda x: len(x))>1]" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 0, 291 | "metadata": { 292 | "colab": { 293 | "autoexec": { 294 | "startup": false, 295 | "wait_interval": 0 296 | } 297 | }, 298 | "colab_type": "code", 299 | "id": "CJCBvAfIKyOX" 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "sentences_open2 = sentences_open[sentences_open.apply(lambda x: len(x))>1]" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 0, 309 | "metadata": { 310 | "colab": { 311 | "autoexec": { 312 | "startup": false, 313 | "wait_interval": 0 314 | } 315 | }, 316 | "colab_type": "code", 317 | "id": "uNKsZ5LaKyOc" 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "sentences_sec2= sentences_sec[sentences_sec.apply(lambda x: len(x)>1)]" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 0, 327 | "metadata": { 328 | "colab": { 329 | "autoexec": { 330 | "startup": false, 331 | "wait_interval": 0 332 | } 333 | }, 334 | "colab_type": "code", 335 | "id": "k3FAD5qUKyOg" 336 | }, 337 | "outputs": [], 338 | "source": [ 339 | "sentences_com2= sentences_com[sentences_com.apply(lambda x: len(x)>1)]" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 0, 345 | "metadata": { 346 | "colab": { 347 | "autoexec": { 348 | "startup": false, 349 | "wait_interval": 0 350 | } 351 | }, 352 | "colab_type": "code", 353 | "id": "8aFn0ROiwTE7" 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "sentences_links2= sentences_links[sentences_links.apply(lambda x: len(x)>1)]" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 0, 363 | "metadata": { 364 | "colab": { 365 | "autoexec": { 366 | "startup": false, 367 | "wait_interval": 0 368 | } 369 | }, 370 | "colab_type": "code", 371 | "id": "MYxADjpvKyOj" 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "sennew=list(zip(sentences2, sentences_open2,sentences_sec2,sentences_com2,sentences_links2))\n", 376 | "# seqser=pd.concat((pd.Series(sennew).apply(lambda x: zip(x[0],x[1])).apply(lambda x: random.sample(x,len(x))),\n", 377 | "# pd.Series(sennew).apply(lambda x: zip(x[0],x[1]))))\n", 378 | "seqser=pd.Series(sennew).apply(lambda x: list(zip(x[0],x[1],x[2],x[3])))\n", 379 | 
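"# left-pad ('pre') shorter user histories with -1 so every row becomes a max_len x 4 timestep matrix\n",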
"sequences=pad_sequences(seqser.tolist(),value=-1,padding='pre')" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 149, 385 | "metadata": { 386 | "colab": { 387 | "autoexec": { 388 | "startup": false, 389 | "wait_interval": 0 390 | }, 391 | "base_uri": "https://localhost:8080/", 392 | "height": 34, 393 | "output_extras": [ 394 | { 395 | "item_id": 1 396 | } 397 | ] 398 | }, 399 | "colab_type": "code", 400 | "executionInfo": { 401 | "elapsed": 1128, 402 | "status": "ok", 403 | "timestamp": 1522577949845, 404 | "user": { 405 | "displayName": "Akash Gupta", 406 | "photoUrl": "//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 407 | "userId": "111901583339877553911" 408 | }, 409 | "user_tz": -330 410 | }, 411 | "id": "76wQldP6KyO1", 412 | "outputId": "cc802a92-9e43-46ac-db15-d62e91c6f276" 413 | }, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/plain": [ 418 | "(151470, 20, 2)" 419 | ] 420 | }, 421 | "execution_count": 149, 422 | "metadata": { 423 | "tags": [] 424 | }, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "sequences.shape" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 150, 435 | "metadata": { 436 | "colab": { 437 | "autoexec": { 438 | "startup": false, 439 | "wait_interval": 0 440 | }, 441 | "base_uri": "https://localhost:8080/", 442 | "height": 34, 443 | "output_extras": [ 444 | { 445 | "item_id": 1 446 | } 447 | ] 448 | }, 449 | "colab_type": "code", 450 | "executionInfo": { 451 | "elapsed": 1231, 452 | "status": "ok", 453 | "timestamp": 1522577951279, 454 | "user": { 455 | "displayName": "Akash Gupta", 456 | "photoUrl": "//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 457 | "userId": "111901583339877553911" 458 | }, 459 | "user_tz": -330 460 | }, 461 | "id": "rMXWBN8qKyPB", 462 | "outputId": "e282b7ee-dc48-40a7-ca92-f407b7677033" 463 | }, 464 | "outputs": [ 465 | { 466 | "data": { 467 | "text/plain": [ 468 | "(168236, 20, 2)" 469 | ] 470 | }, 471 | "execution_count": 150, 472 | "metadata": { 473 | "tags": [] 474 | }, 475 | "output_type": "execute_result" 476 | } 477 | ], 478 | "source": [ 479 | "sequences_ori.shape" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 0, 485 | "metadata": { 486 | "colab": { 487 | "autoexec": { 488 | "startup": false, 489 | "wait_interval": 0 490 | } 491 | }, 492 | "colab_type": "code", 493 | "id": "4Ha3DVMEKyPa" 494 | }, 495 | "outputs": [], 496 | "source": [ 497 | "train_X= sequences[:,:-1]\n", 498 | "train_Y = sequences[:,-1]" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 0, 504 | "metadata": { 505 | "colab": { 506 | "autoexec": { 507 | "startup": false, 508 | "wait_interval": 0 509 | } 510 | }, 511 | "colab_type": "code", 512 | "id": "H1y6WnbOKyPf" 513 | }, 514 | "outputs": [], 515 | "source": [ 516 | "train_Y= train_Y[:,0]" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 178, 522 | "metadata": { 523 | "colab": { 524 | "autoexec": { 525 | "startup": false, 526 | "wait_interval": 0 527 | }, 528 | "base_uri": "https://localhost:8080/", 529 | "height": 34, 530 | "output_extras": [ 531 | { 532 | "item_id": 1 533 | } 534 | ] 535 | }, 536 | "colab_type": "code", 537 | "executionInfo": { 538 | "elapsed": 942, 539 | "status": "ok", 540 | "timestamp": 1522578849746, 541 | "user": { 542 | "displayName": "Akash Gupta", 543 | "photoUrl": 
"//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 544 | "userId": "111901583339877553911" 545 | }, 546 | "user_tz": -330 547 | }, 548 | "id": "FHdF-ZMzKyPp", 549 | "outputId": "95c51c2a-0278-48c9-ab55-d9fabe1ec1fa" 550 | }, 551 | "outputs": [ 552 | { 553 | "data": { 554 | "text/plain": [ 555 | "(151470, 19, 4)" 556 | ] 557 | }, 558 | "execution_count": 178, 559 | "metadata": { 560 | "tags": [] 561 | }, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "train_X.shape" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 0, 572 | "metadata": { 573 | "colab": { 574 | "autoexec": { 575 | "startup": false, 576 | "wait_interval": 0 577 | } 578 | }, 579 | "colab_type": "code", 580 | "id": "y5dWrSNTKyQF" 581 | }, 582 | "outputs": [], 583 | "source": [ 584 | "class RocAucEvaluation(Callback):\n", 585 | " def __init__(self, validation_data=(), interval=1):\n", 586 | " super(Callback, self).__init__()\n", 587 | "\n", 588 | " self.interval = interval\n", 589 | " self.X_val, self.y_val = validation_data\n", 590 | "\n", 591 | " def on_epoch_end(self, epoch, logs={}):\n", 592 | " if epoch % self.interval == 0:\n", 593 | " y_pred = self.model.predict(self.X_val, verbose=0)\n", 594 | " score = roc_auc_score(self.y_val, y_pred)\n", 595 | " print(\"\\n ROC-AUC - epoch: {:d} - score: {:.6f}\".format(epoch+1, score))" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 179, 601 | "metadata": { 602 | "colab": { 603 | "autoexec": { 604 | "startup": false, 605 | "wait_interval": 0 606 | }, 607 | "base_uri": "https://localhost:8080/", 608 | "height": 68, 609 | "output_extras": [ 610 | { 611 | "item_id": 1 612 | } 613 | ] 614 | }, 615 | "colab_type": "code", 616 | "executionInfo": { 617 | "elapsed": 1548, 618 | "status": "ok", 619 | "timestamp": 1522578889874, 620 | "user": { 621 | "displayName": "Akash Gupta", 622 | "photoUrl": "//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 623 | "userId": "111901583339877553911" 624 | }, 625 | "user_tz": -330 626 | }, 627 | "id": "fI7EvwzDKyQJ", 628 | "outputId": "454141ff-322d-4772-a400-f695edd49542" 629 | }, 630 | "outputs": [ 631 | { 632 | "name": "stdout", 633 | "output_type": "stream", 634 | "text": [ 635 | "(None, 250)\n", 636 | "(None, 250)\n", 637 | "(None, 1)\n" 638 | ] 639 | } 640 | ], 641 | "source": [ 642 | "\n", 643 | "# define nn model\n", 644 | "# emdedding_size=500\n", 645 | "# vocab_size=5502\n", 646 | "keras_model2 = Sequential()\n", 647 | "# keras_model2.add(Masking(mask_value=-1. 
,input_shape=(None,2)))\n", 648 | "# keras_model2.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, \n", 649 | "# weights=[pretrained_weights]))\n", 650 | "# keras_model2.add(LSTM(units=100,return_sequences=True))\n", 651 | "# keras_model2.add(Conv1D(64, kernel_size = 3, padding = \"valid\", kernel_initializer = \"glorot_uniform\",input_shape=(None,4)))\n", 652 | "# keras_model2.add(MaxPooling1D())\n", 653 | "# print(keras_model2.output_shape)\n", 654 | "# keras_model2.add(Conv1D(300, 3, padding='valid',activation='relu',strides=1 ,input_shape=(None,4)))\n", 655 | "# keras_model2.add(Conv1D(150, 3, padding='valid',activation='relu',strides=2))\n", 656 | "# keras_model2.add(Conv1D(75, 3, padding='valid',activation='relu',strides=2))\n", 657 | "# keras_model2.add(Flatten())\n", 658 | "# keras_model2.add(Dropout(0.2))\n", 659 | "# model.add(Dense(150,activation='sigmoid'))\n", 660 | "# model.add(Dropout(0.2))\n", 661 | "keras_model2.add(Conv1D(250,\n", 662 | " 3,\n", 663 | " padding='valid',\n", 664 | " activation='relu',\n", 665 | " strides=1,input_shape=(None,4)))\n", 666 | "# we use max pooling:\n", 667 | "# keras_model2.add(GlobalMaxPooling1D())\n", 668 | "keras_model2.add(LSTM(units=100,return_sequences=False,recurrent_dropout=0.2))\n", 669 | "# keras_model2.add(GRU(units=100,return_sequences=False,recurrent_dropout=0.2))\n", 670 | "# keras_model2.add(Conv1D(300, 3, padding='valid',activation='relu',strides=2 ,input_shape=(None,4)))\n", 671 | "# keras_model2.add(Conv1D(150, 3, padding='valid',activation='relu',strides=2))\n", 672 | "# keras_model2.add(Conv1D(75, 3, padding='valid',activation='relu',strides=2))\n", 673 | "# keras_model2.add(Flatten())\n", 674 | "print(keras_model2.output_shape)\n", 675 | "# keras_model2.add(Conv1D(32, kernel_size = 3, padding = \"valid\", kernel_initializer = \"glorot_uniform\"))\n", 676 | "# print(keras_model2.output_shape)\n", 677 | "# keras_model2.add(GlobalMaxPooling1D())\n", 678 | "print(keras_model2.output_shape)\n", 679 | "\n", 680 | "keras_model2.add(Dropout(0.2))\n", 681 | "# keras_model2.add(LSTM(units=500,return_sequences=False))\n", 682 | "keras_model2.add(Dense(units=100))\n", 683 | "keras_model2.add(Dropout(0.2))\n", 684 | "keras_model2.add(Dense(units=10))\n", 685 | "keras_model2.add(Dropout(0.2))\n", 686 | "# keras_model2.add(Dense(units=5))\n", 687 | "# keras_model2.add(Dropout(0.2))\n", 688 | "\n", 689 | "keras_model2.add(Dense(units=1))\n", 690 | "print(keras_model2.output_shape)\n", 691 | "keras_model2.add(Activation('sigmoid'))\n", 692 | "keras_model2.compile(optimizer='Adamax', loss='binary_crossentropy',metrics=['accuracy','mse'])" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": 0, 698 | "metadata": { 699 | "colab": { 700 | "autoexec": { 701 | "startup": false, 702 | "wait_interval": 0 703 | } 704 | }, 705 | "colab_type": "code", 706 | "id": "WRgCCoQ3fohC" 707 | }, 708 | "outputs": [], 709 | "source": [ 710 | "# train_X=np.concatenate((np.ones_like(train_X[:,0]).reshape(train_X.shape[0],-1,4)*-1,train_X),axis=1)" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 0, 716 | "metadata": { 717 | "colab": { 718 | "autoexec": { 719 | "startup": false, 720 | "wait_interval": 0 721 | } 722 | }, 723 | "colab_type": "code", 724 | "id": "mVEm8KAfKyQR" 725 | }, 726 | "outputs": [], 727 | "source": [ 728 | "xtrain, xval, ytrain, yval = train_test_split(train_X, train_Y, test_size=0.1, random_state=7)" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": 
182, 734 | "metadata": { 735 | "colab": { 736 | "autoexec": { 737 | "startup": false, 738 | "wait_interval": 0 739 | }, 740 | "base_uri": "https://localhost:8080/", 741 | "height": 34, 742 | "output_extras": [ 743 | { 744 | "item_id": 1 745 | } 746 | ] 747 | }, 748 | "colab_type": "code", 749 | "executionInfo": { 750 | "elapsed": 1575, 751 | "status": "ok", 752 | "timestamp": 1522578897238, 753 | "user": { 754 | "displayName": "Akash Gupta", 755 | "photoUrl": "//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 756 | "userId": "111901583339877553911" 757 | }, 758 | "user_tz": -330 759 | }, 760 | "id": "NbQi0fReKyQc", 761 | "outputId": "0b85153a-b365-4b5f-f093-f63a2077cc91" 762 | }, 763 | "outputs": [ 764 | { 765 | "data": { 766 | "text/plain": [ 767 | "((151470, 19, 4), (136323, 19, 4), (15147, 19, 4))" 768 | ] 769 | }, 770 | "execution_count": 182, 771 | "metadata": { 772 | "tags": [] 773 | }, 774 | "output_type": "execute_result" 775 | } 776 | ], 777 | "source": [ 778 | "train_X.shape, xtrain.shape, xval.shape" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": 0, 784 | "metadata": { 785 | "colab": { 786 | "autoexec": { 787 | "startup": false, 788 | "wait_interval": 0 789 | } 790 | }, 791 | "colab_type": "code", 792 | "id": "7efnrfCLKyQi" 793 | }, 794 | "outputs": [], 795 | "source": [ 796 | "# filepath=\"../input/best-model/best.hdf5\"\n", 797 | "import pickle\n", 798 | "filename_m= 'cnnlstm_'\n", 799 | "pickle.dump(keras_model2.to_json(),open(filename_m+'.pkl','w'))\n", 800 | "filepath=filename_m+\"weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5\"\n", 801 | "checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=False, mode='max')\n", 802 | "ra_val = RocAucEvaluation(validation_data=(xval, yval), interval = 1)\n", 803 | "callbacks_list = [checkpoint,ra_val]" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": 0, 809 | "metadata": { 810 | "colab": { 811 | "autoexec": { 812 | "startup": false, 813 | "wait_interval": 0 814 | } 815 | }, 816 | "colab_type": "code", 817 | "id": "7a75OYPhKyQk" 818 | }, 819 | "outputs": [], 820 | "source": [ 821 | "# piecewise sample weights: first third 3x, middle third 2x, last third 1x; // keeps the block size an int\n", 822 | "sample_weight=np.concatenate((np.ones(train_X.shape[0]//3)*3,np.ones(train_X.shape[0]//3)*2,\n", 823 | " np.ones(train_X.shape[0]//3)))" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": 185, 829 | "metadata": { 830 | "colab": { 831 | "autoexec": { 832 | "startup": false, 833 | "wait_interval": 0 834 | }, 835 | "base_uri": "https://localhost:8080/", 836 | "height": 34, 837 | "output_extras": [ 838 | { 839 | "item_id": 1 840 | } 841 | ] 842 | }, 843 | "colab_type": "code", 844 | "executionInfo": { 845 | "elapsed": 950, 846 | "status": "ok", 847 | "timestamp": 1522578912424, 848 | "user": { 849 | "displayName": "Akash Gupta", 850 | "photoUrl": "//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 851 | "userId": "111901583339877553911" 852 | }, 853 | "user_tz": -330 854 | }, 855 | "id": "NE51h98Q1Unz", 856 | "outputId": "59ed1c69-b698-4d2c-d5d1-0d0f8c8ed41c" 857 | }, 858 | "outputs": [ 859 | { 860 | "data": { 861 | "text/plain": [ 862 | "(136323, 19, 4)" 863 | ] 864 | }, 865 | "execution_count": 185, 866 | "metadata": { 867 | "tags": [] 868 | }, 869 | "output_type": "execute_result" 870 | } 871 | ], 872 | "source": [ 873 | "xtrain.shape" 874 | ] 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": null, 879 | "metadata": { 880 | "colab": 
{ 880 | "autoexec": { 881 | "startup": false, 882 | "wait_interval": 0 883 | }, 884 | "base_uri": "https://localhost:8080/", 885 | "height": 10271, 886 | "output_extras": [ 887 | { 888 | "item_id": 21 889 | }, 890 | { 891 | "item_id": 40 892 | }, 893 | { 894 | "item_id": 57 895 | }, 896 | { 897 | "item_id": 76 898 | }, 899 | { 900 | "item_id": 94 901 | }, 902 | { 903 | "item_id": 112 904 | }, 905 | { 906 | "item_id": 130 907 | }, 908 | { 909 | "item_id": 146 910 | }, 911 | { 912 | "item_id": 164 913 | }, 914 | { 915 | "item_id": 182 916 | }, 917 | { 918 | "item_id": 201 919 | }, 920 | { 921 | "item_id": 220 922 | }, 923 | { 924 | "item_id": 239 925 | }, 926 | { 927 | "item_id": 258 928 | }, 929 | { 930 | "item_id": 274 931 | }, 932 | { 933 | "item_id": 292 934 | }, 935 | { 936 | "item_id": 311 937 | }, 938 | { 939 | "item_id": 331 940 | }, 941 | { 942 | "item_id": 349 943 | }, 944 | { 945 | "item_id": 365 946 | }, 947 | { 948 | "item_id": 366 949 | } 950 | ] 951 | }, 952 | "colab_type": "code", 953 | "executionInfo": { 954 | "elapsed": 174954, 955 | "status": "ok", 956 | "timestamp": 1522579089258, 957 | "user": { 958 | "displayName": "Akash Gupta", 959 | "photoUrl": "//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 960 | "userId": "111901583339877553911" 961 | }, 962 | "user_tz": -330 963 | }, 964 | "id": "xThDJdWpKyQm", 965 | "outputId": "9fba9908-8f87-416c-b456-834cd6046ccb" 966 | }, 967 | "outputs": [], 968 | "source": [ 969 | "keras_model2.fit(xtrain,ytrain, epochs=19, validation_data=(xval,yval),\n", 970 | " batch_size=1024, callbacks = callbacks_list,verbose=1)" 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": 0, 976 | "metadata": { 977 | "colab": { 978 | "autoexec": { 979 | "startup": false, 980 | "wait_interval": 0 981 | } 982 | }, 983 | "colab_type": "code", 984 | "id": "e4z-mBLPKyQq" 985 | }, 986 | "outputs": [], 987 | "source": [ 988 | " from keras.models import model_from_json\n", 989 | "# keras_model2= model_from_json(pickle.load(open('lstm2fea100.pkl')))\n", 990 | "# keras_model2.load_weights('lstmcorrectorderweights-improvement-24-0.99.hdf5')" 991 | ] 992 | }, 993 | { 994 | "cell_type": "code", 995 | "execution_count": 0, 996 | "metadata": { 997 | "colab": { 998 | "autoexec": { 999 | "startup": false, 1000 | "wait_interval": 0 1001 | } 1002 | }, 1003 | "colab_type": "code", 1004 | "id": "xHgiafvmKyRY" 1005 | }, 1006 | "outputs": [], 1007 | "source": [ 1008 | "nnpred=keras_model2.predict(sequences_ori)" 1009 | ] 1010 | }, 1011 | { 1012 | "cell_type": "code", 1013 | "execution_count": 0, 1014 | "metadata": { 1015 | "colab": { 1016 | "autoexec": { 1017 | "startup": false, 1018 | "wait_interval": 0 1019 | } 1020 | }, 1021 | "colab_type": "code", 1022 | "id": "hLG63_X1KyRf" 1023 | }, 1024 | "outputs": [], 1025 | "source": [ 1026 | "snn= sentences.reset_index()" 1027 | ] 1028 | }, 1029 | { 1030 | "cell_type": "code", 1031 | "execution_count": 164, 1032 | "metadata": { 1033 | "colab": { 1034 | "autoexec": { 1035 | "startup": false, 1036 | "wait_interval": 0 1037 | }, 1038 | "base_uri": "https://localhost:8080/", 1039 | "height": 34, 1040 | "output_extras": [ 1041 | { 1042 | "item_id": 1 1043 | } 1044 | ] 1045 | }, 1046 | "colab_type": "code", 1047 | "executionInfo": { 1048 | "elapsed": 830, 1049 | "status": "ok", 1050 | "timestamp": 1522578652745, 1051 | "user": { 1052 | "displayName": "Akash Gupta", 1053 | "photoUrl": 
"//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 1054 | "userId": "111901583339877553911" 1055 | }, 1056 | "user_tz": -330 1057 | }, 1058 | "id": "ic8PLfrgKyRi", 1059 | "outputId": "2d09df8f-dccd-42d4-db7a-3313b30108b0" 1060 | }, 1061 | "outputs": [ 1062 | { 1063 | "data": { 1064 | "text/plain": [ 1065 | "((168236, 2), (168236, 1), (168236,))" 1066 | ] 1067 | }, 1068 | "execution_count": 164, 1069 | "metadata": { 1070 | "tags": [] 1071 | }, 1072 | "output_type": "execute_result" 1073 | } 1074 | ], 1075 | "source": [ 1076 | "snn.shape, nnpred.shape , sentences.shape" 1077 | ] 1078 | }, 1079 | { 1080 | "cell_type": "code", 1081 | "execution_count": 0, 1082 | "metadata": { 1083 | "colab": { 1084 | "autoexec": { 1085 | "startup": false, 1086 | "wait_interval": 0 1087 | } 1088 | }, 1089 | "colab_type": "code", 1090 | "id": "yEUZfFu2KyRp" 1091 | }, 1092 | "outputs": [], 1093 | "source": [ 1094 | "snn['pred']= nnpred" 1095 | ] 1096 | }, 1097 | { 1098 | "cell_type": "code", 1099 | "execution_count": 0, 1100 | "metadata": { 1101 | "colab": { 1102 | "autoexec": { 1103 | "startup": false, 1104 | "wait_interval": 0 1105 | } 1106 | }, 1107 | "colab_type": "code", 1108 | "id": "1Z4rSIt6KyT3" 1109 | }, 1110 | "outputs": [], 1111 | "source": [ 1112 | "test_df['send_date']= pd.to_datetime(test_df.send_date,format=\"%d-%m-%Y %H:%M\")" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": 0, 1118 | "metadata": { 1119 | "colab": { 1120 | "autoexec": { 1121 | "startup": false, 1122 | "wait_interval": 0 1123 | } 1124 | }, 1125 | "colab_type": "code", 1126 | "id": "W9dTls-JKyT-" 1127 | }, 1128 | "outputs": [], 1129 | "source": [ 1130 | "test_df['send_week']=test_df.send_date.dt.week\n", 1131 | "test_df['send_day']= test_df.send_date.dt.day\n", 1132 | "test_df['send_hour']= test_df.send_date.dt.hour\n", 1133 | "test_df['send_hour']=(test_df.send_hour/6).astype('int')\n", 1134 | "test_df['send_weekday']=test_df.send_date.dt.weekday\n", 1135 | "# pred_nn=test_df.merge(snn,how='left').groupby(['campaign_id','send_weekday']).pred.apply(lambda x: x.fillna(x.mean()))" 1136 | ] 1137 | }, 1138 | { 1139 | "cell_type": "code", 1140 | "execution_count": 0, 1141 | "metadata": { 1142 | "colab": { 1143 | "autoexec": { 1144 | "startup": false, 1145 | "wait_interval": 0 1146 | } 1147 | }, 1148 | "colab_type": "code", 1149 | "id": "0jz0ZX4FKyRw" 1150 | }, 1151 | "outputs": [], 1152 | "source": [ 1153 | "pred_nn=test_df.merge(snn,how='left').groupby(['campaign_id','send_weekday',\n", 1154 | " 'send_hour']).pred.apply(lambda x: x.fillna((x.quantile(0.9))))" 1155 | ] 1156 | }, 1157 | { 1158 | "cell_type": "code", 1159 | "execution_count": 0, 1160 | "metadata": { 1161 | "colab": { 1162 | "autoexec": { 1163 | "startup": false, 1164 | "wait_interval": 0 1165 | } 1166 | }, 1167 | "colab_type": "code", 1168 | "id": "3_b70e7aKyR2" 1169 | }, 1170 | "outputs": [], 1171 | "source": [ 1172 | "submission= pd.read_csv('sample_submission_4fcZwvQ.csv')\n", 1173 | "submission.is_click = pred_nn\n", 1174 | "submission.to_csv('submission_lstmcnn.csv',index=False)" 1175 | ] 1176 | }, 1177 | { 1178 | "cell_type": "code", 1179 | "execution_count": 6, 1180 | "metadata": {}, 1181 | "outputs": [ 1182 | { 1183 | "data": { 1184 | "text/plain": [ 1185 | "'2.2.2'" 1186 | ] 1187 | }, 1188 | "execution_count": 6, 1189 | "metadata": {}, 1190 | "output_type": "execute_result" 1191 | } 1192 | ], 1193 | "source": [ 1194 | "import matplotlib as pd\n", 1195 | "pd.__version__" 1196 | ] 
1197 | }, 1198 | { 1199 | "cell_type": "code", 1200 | "execution_count": null, 1201 | "metadata": {}, 1202 | "outputs": [], 1203 | "source": [] 1204 | } 1205 | ], 1206 | "metadata": { 1207 | "accelerator": "GPU", 1208 | "colab": { 1209 | "collapsed_sections": [], 1210 | "default_view": {}, 1211 | "name": "Copy of Untitled-Copy1.ipynb", 1212 | "provenance": [ 1213 | { 1214 | "file_id": "1hznci-bKqi_hiGI3cTFlhZkB4s6fWc1G", 1215 | "timestamp": 1522527298306 1216 | } 1217 | ], 1218 | "version": "0.3.2", 1219 | "views": {} 1220 | }, 1221 | "kernelspec": { 1222 | "display_name": "Python 3", 1223 | "language": "python", 1224 | "name": "python3" 1225 | }, 1226 | "language_info": { 1227 | "codemirror_mode": { 1228 | "name": "ipython", 1229 | "version": 3 1230 | }, 1231 | "file_extension": ".py", 1232 | "mimetype": "text/x-python", 1233 | "name": "python", 1234 | "nbconvert_exporter": "python", 1235 | "pygments_lexer": "ipython3", 1236 | "version": "3.6.4" 1237 | } 1238 | }, 1239 | "nbformat": 4, 1240 | "nbformat_minor": 1 1241 | } 1242 | -------------------------------------------------------------------------------- /3rd_Place_Aditya_Akash/readme.txt: -------------------------------------------------------------------------------- 1 | Packages used for executing Aditya's model 2 | ========================================== 3 | Python 3.6.4, Dependencies 4 | =========================================== 5 | lightgbm 2.1.0 6 | scikit-learn 0.19.1 7 | pandas 0.22.0 8 | numpy 1.14.0 9 | xgboost 0.7 10 | 11 | Execution of Aditya's model 12 | ============================ 13 | Execute the following notebooks. 14 | Note: train.csv and test.csv should be present in a folder named "input". Output files will be generated in the current path. 15 | 1. user_cluster-kmeans.ipynb, this will generate the file user_cluster1.csv in the input folder. 16 | 2. xgb_2fold-cv3_bag3_nt70_scalepos1_best_tree.ipynb, this will generate the test prediction file named "xgb_2fold-cv2_bag3_nt70_scalepos1_nt70.csv". 17 | 3. lgb_5fold-5_bag_nt45_rank_average_AND_lgb_5fold-5_bag_nt45_rank_average_4f.ipynb, this will generate the test prediction files named "lgb_5fold-5_bag_nt45_rank_average.csv" and "lgb_5fold-5_bag_nt45_rank_average_4f.csv". 18 | 4. lgb_5fold-5_bag_nt55_rank_average_5f_AND_lgb_5fold-5_bag_nt55_rank_average_4f.ipynb, this will generate the test prediction files named "lgb_5fold-5_bag_nt55_rank_average_5f.csv" and "lgb_5fold-5_bag_nt55_rank_average_4f.csv". 19 | 5. lgb_new_features-v6-5fold_5bag_cv_retry_lb_692_ens6941-submitted.ipynb, this will generate lgb_5fold-5_bag_nt55_rank_average.csv. 20 | 21 | 22 | Packages used for executing Akash's model 23 | ========================================== 24 | Python Package Dependencies 25 | =========================================== 26 | 27 | Keras 2.0.8 28 | sklearn 0.19.0 29 | pandas 0.20.3 30 | numpy 1.14.2 31 | matplotlib 2.2.2 32 | 33 | Instructions: 34 | 35 | 36 | Execution of Akash's model 37 | ============================ 38 | Execute each cell in the following notebooks. 39 | 1. cnn.ipynb, it will generate the test prediction file submission_cnn.csv 40 | 2. lstm.ipynb, it will generate the test prediction file submission_lstm.csv 41 | 3. lstm_cnn.ipynb, it will generate the test prediction file submission_lstmcnn.csv 42 | 43 | All train, test, and submission files should be kept in the current path of the notebook. 44 | 45 | Ensemble Model 46 | ========================= 47 | Execute the final_ensemble-simple_avg.ipynb notebook, which will rank-average all the model outputs into the final submission. 
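48 | 
49 | Ensemble reference
50 | ==================
51 | For reference, a minimal sketch of the rank-average step in plain pandas. The file list below
52 | (the five Aditya prediction files plus the three Akash submission files), the equal weighting,
53 | and the output name "final_submission.csv" are assumptions for illustration; the exact blend
54 | lives in final_ensemble-simple_avg.ipynb. The sketch also assumes every file shares the same
55 | row order, which holds because they are all written from the same test set.
56 | 
57 |     import pandas as pd
58 | 
59 |     files = ["xgb_2fold-cv2_bag3_nt70_scalepos1_nt70.csv",
60 |              "lgb_5fold-5_bag_nt45_rank_average.csv",
61 |              "lgb_5fold-5_bag_nt45_rank_average_4f.csv",
62 |              "lgb_5fold-5_bag_nt55_rank_average_5f.csv",
63 |              "lgb_5fold-5_bag_nt55_rank_average_4f.csv",
64 |              "submission_cnn.csv",
65 |              "submission_lstm.csv",
66 |              "submission_lstmcnn.csv"]
67 |     sub = pd.read_csv(files[0])[["id"]]
68 |     sub["is_click"] = 0.0
69 |     for f in files:
70 |         preds = pd.read_csv(f)
71 |         # ranking makes each model's scores scale-free; dividing by the
72 |         # row count normalises the ranks to (0, 1] before averaging
73 |         sub["is_click"] += preds["is_click"].rank() / len(preds)
74 |     sub["is_click"] /= len(files)
75 |     sub.to_csv("final_submission.csv", index=False)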
-------------------------------------------------------------------------------- /3rd_Place_Aditya_Akash/user_cluster-kmeans.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "from sklearn.cluster import KMeans,DBSCAN" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "train = pd.read_csv('./input/train.csv',usecols=['campaign_id','user_id'])\n", 21 | "test = pd.read_csv('./input/test.csv',usecols=['campaign_id','user_id'])" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "all_data = pd.concat([train,test])" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 4, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "temp = pd.get_dummies(pd.Series(all_data.campaign_id), prefix='campaign_id')" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 5, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "all_data = pd.concat([all_data,temp],axis=1)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 6, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "all_data.drop('campaign_id',axis=1,inplace=True)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 7, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "ucamp_grp = all_data.groupby('user_id').sum()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 8, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "ucamp_df = pd.DataFrame(ucamp_grp)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 9, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,\n", 87 | " n_clusters=5, n_init=6, n_jobs=1, precompute_distances='auto',\n", 88 | " random_state=None, tol=0.0001, verbose=0)" 89 | ] 90 | }, 91 | "execution_count": 9, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "kmeans = KMeans(init='k-means++', n_clusters=5, n_init=6,max_iter=300)\n", 98 | "kmeans.fit(ucamp_df.values)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 10, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "ucamp_df['clust_id'] = kmeans.labels_" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 11, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "user_cluster = ucamp_df[['clust_id']].reset_index(drop=False)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 12, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "4 57030\n", 128 | "3 52649\n", 129 | "0 51266\n", 130 | "2 36807\n", 131 | "1 22966\n", 132 | "Name: clust_id, dtype: int64" 133 | ] 134 | }, 135 | "execution_count": 12, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "user_cluster.clust_id.value_counts()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 13, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | 
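"# persist the user_id -> clust_id mapping; the model notebooks merge it back on user_id\n",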
"user_cluster.to_csv('./input/user_cluster1.csv',index=False)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Python 3", 164 | "language": "python", 165 | "name": "python3" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 3 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython3", 177 | "version": "3.6.4" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 2 182 | } 183 | -------------------------------------------------------------------------------- /3rd_Place_Aditya_Akash/xgb_2fold-cv3_bag3_nt70_scalepos1_best_tree.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import xgboost as xgb\n", 12 | "from sklearn.feature_extraction.text import CountVectorizer\n", 13 | "from sklearn.decomposition import TruncatedSVD\n", 14 | "import gc\n", 15 | "from sklearn.preprocessing import LabelEncoder\n", 16 | "from sklearn.model_selection import KFold" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 3, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "0" 28 | ] 29 | }, 30 | "execution_count": 3, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "campaign = pd.read_csv('input/campaign_data.csv')\n", 37 | "campaign1 = campaign.drop(['subject','email_url','email_body'],axis=1)\n", 38 | "gc.collect()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "np.random.seed(123)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "def add_noise(series, noise_level):\n", 57 | " return series * (1 + noise_level * np.random.randn(len(series)))\n", 58 | "def target_encode(trn_series=None,val_series=None,\n", 59 | " tst_series=None,\n", 60 | " target=None,\n", 61 | " min_samples_leaf=1,\n", 62 | " smoothing=1,\n", 63 | " noise_level=0):\n", 64 | " \"\"\"\n", 65 | " Smoothing is computed like in the following paper by Daniele Micci-Barreca\n", 66 | " https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf\n", 67 | " trn_series : training categorical feature as a pd.Series\n", 68 | " tst_series : test categorical feature as a pd.Series\n", 69 | " target : target data as a pd.Series\n", 70 | " min_samples_leaf (int) : minimum samples to take category average into account\n", 71 | " smoothing (int) : smoothing effect to balance categorical average vs prior\n", 72 | " \"\"\"\n", 73 | " assert len(trn_series) == len(target)\n", 74 | " #assert trn_series.name == tst_series.name\n", 75 | " temp = pd.concat([trn_series, target], axis=1)\n", 76 | " # Compute target mean\n", 77 | " averages = temp.groupby(by=trn_series.name)[target.name].agg([\"mean\", \"count\"])\n", 78 | " # Compute smoothing\n", 79 | " smoothing = 1 / (1 + np.exp(-(averages[\"count\"] - min_samples_leaf) / smoothing))\n", 80 | " # Apply 
average function to all target data\n", 81 | " prior = target.mean()\n", 82 | " # The bigger the count the less full_avg is taken into account\n", 83 | " averages[target.name] = prior * (1 - smoothing) + averages[\"mean\"] * smoothing\n", 84 | " averages.drop([\"mean\", \"count\"], axis=1, inplace=True)\n", 85 | " # Apply averages to trn and tst series\n", 86 | " ft_trn_series = pd.merge(\n", 87 | " trn_series.to_frame(trn_series.name),\n", 88 | " averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),\n", 89 | " on=trn_series.name,\n", 90 | " how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)\n", 91 | " # pd.merge does not keep the index so restore it\n", 92 | " ft_trn_series.index = trn_series.index\n", 93 | " ft_val_series = pd.merge(\n", 94 | " val_series.to_frame(val_series.name),\n", 95 | " averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),\n", 96 | " on=val_series.name,\n", 97 | " how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)\n", 98 | " ft_val_series.index = val_series.index\n", 99 | " \n", 100 | " ft_tst_series = pd.merge(\n", 101 | " tst_series.to_frame(tst_series.name),\n", 102 | " averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),\n", 103 | " on=tst_series.name,\n", 104 | " how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)\n", 105 | " # pd.merge does not keep the index so restore it\n", 106 | " ft_tst_series.index = tst_series.index\n", 107 | " return add_noise(ft_trn_series, noise_level), ft_val_series,ft_tst_series" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "0" 119 | ] 120 | }, 121 | "execution_count": 6, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "train = pd.read_csv('input/train.csv')\n", 128 | "test = pd.read_csv('input/test.csv')\n", 129 | "gc.collect()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 7, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "all_data = pd.concat([train,test])" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "user_clust = pd.read_csv('./input/user_cluster1.csv')\n", 148 | "all_data = all_data.merge(user_clust,on='user_id',how='left')" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 10, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "all_data['send_date'] = all_data.send_date.apply(lambda x: pd.datetime.strptime(x,'%d-%m-%Y %H:%M'))" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 11, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "all_data['send_dayofweek'] = all_data.send_date.dt.dayofweek" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 12, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "data": { 176 | "text/plain": [ 177 | "Index(['campaign_id', 'id', 'is_click', 'is_open', 'send_date', 'user_id',\n", 178 | " 'clust_id', 'send_dayofweek'],\n", 179 | " dtype='object')" 180 | ] 181 | }, 182 | "execution_count": 12, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "all_data.columns" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 13, 194 | 
"metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "#count features\n", 198 | "all_data['cnt_sd'] = all_data.groupby('send_date')['user_id'].transform('count')" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 14, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "all_data = all_data.merge(campaign1,on='campaign_id',how='left')" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 15, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "Index(['campaign_id', 'id', 'is_click', 'is_open', 'send_date', 'user_id',\n", 219 | " 'clust_id', 'send_dayofweek', 'cnt_sd', 'communication_type',\n", 220 | " 'total_links', 'no_of_internal_links', 'no_of_images',\n", 221 | " 'no_of_sections'],\n", 222 | " dtype='object')" 223 | ] 224 | }, 225 | "execution_count": 15, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "all_data.columns" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 16, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "le1 = LabelEncoder()\n", 241 | "all_data.loc[:,'communication_type'] = le1.fit_transform(all_data.communication_type) \n", 242 | "all_data['usr_cnt'] = all_data.groupby('user_id')['user_id'].transform('count')\n", 243 | "all_data['cm_cnt'] = np.log(all_data.groupby('communication_type')['communication_type'].transform('count'))\n", 244 | "#all_data['camp_cnt'] = all_data.groupby('campaign_id')['campaign_id'].transform('count')" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 17, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "test = all_data[len(train):]\n", 254 | "train = all_data[:len(train)]" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 18, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "35" 266 | ] 267 | }, 268 | "execution_count": 18, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "#del all_data\n", 275 | "gc.collect()" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 73, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "xgb_params = {}\n", 285 | "xgb_params['eta'] = 0.07\n", 286 | "xgb_params['max_depth'] = 5\n", 287 | "xgb_params['max_leaves'] = 31\n", 288 | "xgb_params['max_bin'] = 10\n", 289 | "xgb_params['min_child_weight '] = 100\n", 290 | "xgb_params['subsample'] = 0.6\n", 291 | "xgb_params['colsample_bytree'] = 0.77\n", 292 | "xgb_params['objective'] = 'binary:logistic'\n", 293 | "xgb_params['eval_metric'] = 'auc'\n", 294 | "xgb_params['verbose'] = 1\n", 295 | "xgb_params['scale_pos_weight'] = 1.\n", 296 | "\n", 297 | "xgb_params['max_bin']=10\n", 298 | "xgb_params['max_delta_step']=1\n", 299 | "xgb_params['nthread']=7\n", 300 | "xgb_params['booster']='gbtree'" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 91, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "name": "stderr", 310 | "output_type": "stream", 311 | "text": [ 312 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:6: SettingWithCopyWarning: \n", 313 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 314 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 315 | "\n", 316 | "See the caveats in the documentation: 
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 317 | " \n", 318 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:8: SettingWithCopyWarning: \n", 319 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 320 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 321 | "\n", 322 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 323 | " \n" 324 | ] 325 | }, 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "Fold: 1\n", 331 | "val_cid [29 30 31 32 33 34 35 36 37 38 39 40 41]\n", 332 | "(588141, 16) (435050, 16)\n" 333 | ] 334 | }, 335 | { 336 | "name": "stderr", 337 | "output_type": "stream", 338 | "text": [ 339 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:357: SettingWithCopyWarning: \n", 340 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 341 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 342 | "\n", 343 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 344 | " self.obj[key] = _infer_fill_value(value)\n", 345 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:537: SettingWithCopyWarning: \n", 346 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 347 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 348 | "\n", 349 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 350 | " self.obj[item] = s\n", 351 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:61: SettingWithCopyWarning: \n", 352 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 353 | "\n", 354 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 355 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:63: SettingWithCopyWarning: \n", 356 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 357 | "\n", 358 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 359 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:69: SettingWithCopyWarning: \n", 360 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 361 | "\n", 362 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 363 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:70: SettingWithCopyWarning: \n", 364 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 365 | "\n", 366 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 367 | ] 368 | }, 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "[0]\teval-auc:0.507239\ttrain-auc:0.630868\n", 374 | "Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.\n", 375 | "\n", 376 | "Will train until train-auc hasn't 
improved in 150 rounds.\n", 377 | "[10]\teval-auc:0.549008\ttrain-auc:0.941565\n", 378 | "[20]\teval-auc:0.54935\ttrain-auc:0.970986\n", 379 | "[30]\teval-auc:0.621005\ttrain-auc:0.985933\n", 380 | "[40]\teval-auc:0.623506\ttrain-auc:0.986568\n", 381 | "[50]\teval-auc:0.63699\ttrain-auc:0.987577\n", 382 | "[60]\teval-auc:0.643214\ttrain-auc:0.988096\n", 383 | "[69]\teval-auc:0.640067\ttrain-auc:0.98841\n" 384 | ] 385 | }, 386 | { 387 | "name": "stderr", 388 | "output_type": "stream", 389 | "text": [ 390 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:89: SettingWithCopyWarning: \n", 391 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 392 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 393 | "\n", 394 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 395 | ] 396 | }, 397 | { 398 | "name": "stdout", 399 | "output_type": "stream", 400 | "text": [ 401 | "[0]\teval-auc:0.513118\ttrain-auc:0.699374\n", 402 | "Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.\n", 403 | "\n", 404 | "Will train until train-auc hasn't improved in 150 rounds.\n", 405 | "[10]\teval-auc:0.549338\ttrain-auc:0.963648\n", 406 | "[20]\teval-auc:0.549279\ttrain-auc:0.972213\n", 407 | "[30]\teval-auc:0.585382\ttrain-auc:0.981735\n", 408 | "[40]\teval-auc:0.628757\ttrain-auc:0.986893\n", 409 | "[50]\teval-auc:0.648687\ttrain-auc:0.987708\n", 410 | "[60]\teval-auc:0.644673\ttrain-auc:0.988209\n", 411 | "[69]\teval-auc:0.643383\ttrain-auc:0.988605\n" 412 | ] 413 | }, 414 | { 415 | "name": "stderr", 416 | "output_type": "stream", 417 | "text": [ 418 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:194: SettingWithCopyWarning: \n", 419 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 420 | "\n", 421 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 422 | " self._setitem_with_indexer(indexer, value)\n", 423 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:88: SettingWithCopyWarning: \n", 424 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 425 | "\n", 426 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 427 | ] 428 | }, 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "[0]\teval-auc:0.508098\ttrain-auc:0.654833\n", 434 | "Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.\n", 435 | "\n", 436 | "Will train until train-auc hasn't improved in 150 rounds.\n", 437 | "[10]\teval-auc:0.549292\ttrain-auc:0.96816\n", 438 | "[20]\teval-auc:0.549248\ttrain-auc:0.971653\n", 439 | "[30]\teval-auc:0.556878\ttrain-auc:0.977378\n", 440 | "[40]\teval-auc:0.630023\ttrain-auc:0.986505\n", 441 | "[50]\teval-auc:0.637828\ttrain-auc:0.986918\n", 442 | "[60]\teval-auc:0.647163\ttrain-auc:0.988098\n", 443 | "[69]\teval-auc:0.644905\ttrain-auc:0.98856\n", 444 | "Fold: 2\n", 445 | "val_cid [42 43 44 45 46 47 48 49 50 51 52 53 54]\n", 446 | "(435050, 16) (588141, 16)\n", 447 | "[0]\teval-auc:0.521143\ttrain-auc:0.660784\n", 448 | "Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.\n", 449 | "\n", 450 | "Will train until train-auc hasn't 
improved in 150 rounds.\n", 451 | "[10]\teval-auc:0.574698\ttrain-auc:0.959211\n", 452 | "[20]\teval-auc:0.582151\ttrain-auc:0.966154\n", 453 | "[30]\teval-auc:0.64668\ttrain-auc:0.981158\n", 454 | "[40]\teval-auc:0.646689\ttrain-auc:0.982213\n", 455 | "[50]\teval-auc:0.646649\ttrain-auc:0.982668\n", 456 | "[60]\teval-auc:0.669827\ttrain-auc:0.98389\n", 457 | "[69]\teval-auc:0.679231\ttrain-auc:0.984133\n", 458 | "[0]\teval-auc:0.513133\ttrain-auc:0.657905\n", 459 | "Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.\n", 460 | "\n", 461 | "Will train until train-auc hasn't improved in 150 rounds.\n", 462 | "[10]\teval-auc:0.574706\ttrain-auc:0.958659\n", 463 | "[20]\teval-auc:0.583715\ttrain-auc:0.96735\n", 464 | "[30]\teval-auc:0.650561\ttrain-auc:0.980521\n", 465 | "[40]\teval-auc:0.653512\ttrain-auc:0.982558\n", 466 | "[50]\teval-auc:0.661876\ttrain-auc:0.983443\n", 467 | "[60]\teval-auc:0.66456\ttrain-auc:0.983772\n", 468 | "[69]\teval-auc:0.663491\ttrain-auc:0.983995\n", 469 | "[0]\teval-auc:0.52116\ttrain-auc:0.693952\n", 470 | "Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.\n", 471 | "\n", 472 | "Will train until train-auc hasn't improved in 150 rounds.\n", 473 | "[10]\teval-auc:0.574725\ttrain-auc:0.960942\n", 474 | "[20]\teval-auc:0.602752\ttrain-auc:0.9715\n", 475 | "[30]\teval-auc:0.646011\ttrain-auc:0.980437\n", 476 | "[40]\teval-auc:0.650082\ttrain-auc:0.982535\n", 477 | "[50]\teval-auc:0.669828\ttrain-auc:0.983306\n", 478 | "[60]\teval-auc:0.673233\ttrain-auc:0.983707\n", 479 | "[69]\teval-auc:0.676993\ttrain-auc:0.984073\n" 480 | ] 481 | } 482 | ], 483 | "source": [ 484 | "nfold = 2\n", 485 | "kf = KFold(n_splits=nfold, shuffle=False)  # unshuffled, so the campaign split is deterministic\n", 486 | "unq_campaign_id = np.sort(train.campaign_id.unique())\n", 487 | "\n", 488 | "test_subm = test[['id']].copy()\n", 489 | "test_subm['is_click'] = 0\n", 490 | "train_score = train[['is_click']].copy()\n", 491 | "train_score['pred'] = 0\n", 492 | "nbag = 3\n", 493 | "cf = 0\n", 494 | "for train_index, test_index in kf.split(unq_campaign_id):\n", 495 | " cf += 1\n", 496 | " print('Fold:', cf)\n", 497 | " \n", 498 | " test1 = test.copy()\n", 499 | " tr_cid = unq_campaign_id[train_index]\n", 500 | " val_cid = unq_campaign_id[test_index]\n", 501 | " print('val_cid', val_cid)\n", 502 | "\n", 503 | " val = train[train.campaign_id.isin(tr_cid)]  # note the swap: KFold's 'train' campaigns are held out for evaluation\n", 504 | " train1 = train[train.campaign_id.isin(val_cid)]  # ...while its 'test' campaigns are used for fitting\n", 505 | " print(val.shape, train1.shape)\n", 506 | "\n", 507 | " a1, a2, a3 = target_encode(train1['user_id'], val['user_id'],\n", 508 | " test1['user_id'], train1.is_click, noise_level=.9, smoothing=5)\n", 509 | " train1.loc[:,'mean_is_click'] = a1\n", 510 | " val.loc[:,'mean_is_click'] = a2\n", 511 | " test1.loc[:,'mean_is_click'] = a3\n", 512 | "\n", 513 | "\n", 514 | " a1, a2, a3 = target_encode(train1['user_id'], val['user_id'],\n", 515 | " test1['user_id'], train1.is_open, noise_level=.9, smoothing=1.)\n", 516 | " train1.loc[:,'mean_is_open'] = a1\n", 517 | " val.loc[:,'mean_is_open'] = a2\n", 518 | " test1.loc[:,'mean_is_open'] = a3\n", 519 | "\n", 520 | "\n", 521 | " a1, a2, a3 = target_encode(train1['communication_type'], val['communication_type'],\n", 522 | " test1['communication_type'], train1.is_open, noise_level=0)\n", 523 | " train1.loc[:,'mean_ct'] = a1\n", 524 | " val.loc[:,'mean_ct'] = a2\n", 525 | " test1.loc[:,'mean_ct'] = a3\n", 526 | "\n", 527 | " #a1,a2,a3 = target_encode(train1['communication_type'],val['communication_type'],\n", 528 | " # test1['communication_type'],train1.is_click,noise_level=0)\n", 529 | " #train1.loc[:,'mean_clk_ct'] = a1\n", 530 | " #val.loc[:,'mean_clk_ct'] = a2\n", 531 | " #test1.loc[:,'mean_clk_ct'] = a3\n", 532 | "\n", 533 | "\n", 534 | " a1, a2, a3 = target_encode(train1['clust_id'], val['clust_id'],\n", 535 | " test1['clust_id'], train1.is_click, noise_level=0)\n", 536 | " train1.loc[:,'mean_clk_clust_id'] = a1\n", 537 | " val.loc[:,'mean_clk_clust_id'] = a2\n", 538 | " test1.loc[:,'mean_clk_clust_id'] = a3\n", 539 | "\n", 540 | "\n", 541 | "\n", 542 | " gc.collect()\n", 543 | " val.drop(['id', 'campaign_id', 'is_open', 'send_date',\n", 544 | " 'user_id', 'no_of_images', 'no_of_sections', 'no_of_internal_links'], axis=1, inplace=True)\n", 545 | " train1.drop(['id', 'campaign_id', 'is_open', 'send_date',\n", 546 | " 'user_id', 'no_of_images', 'no_of_sections', 'no_of_internal_links'], axis=1, inplace=True)\n", 547 | " test1.drop(['id', 'campaign_id', 'is_open', 'send_date',\n", 548 | " 'user_id', 'no_of_images', 'no_of_sections', 'no_of_internal_links'], axis=1, inplace=True)\n", 549 | " gc.collect()\n", 550 | " train_y = train1.is_click.values\n", 551 | " val_y = val.is_click.values\n", 552 | " val.drop(['is_click'], axis=1, inplace=True)\n", 553 | " train1.drop(['is_click'], axis=1, inplace=True)\n", 554 | " test1.drop(['is_click'], axis=1, inplace=True)\n", 555 | " \n", 556 | " dtrain = xgb.DMatrix(train1, label=train_y)\n", 557 | " dval = xgb.DMatrix(val[train1.columns], label=val_y)\n", 558 | " dtest = xgb.DMatrix(test1[train1.columns])\n", 559 | " gc.collect()\n", 560 | " \n", 561 | " evals_results = {}\n", 562 | " np.random.seed(0)\n", 563 | " \n", 564 | " for bg in range(nbag):\n", 565 | " xgb_params['seed'] = 100*cf + bg  # a distinct seed per fold/bag for ensemble diversity\n", 566 | " watchlist = [(dval, 'eval'), (dtrain, 'train')]\n", 567 | "\n", 568 | " bst = xgb.train(xgb_params, dtrain, 70, watchlist, early_stopping_rounds=150,\n", 569 | " verbose_eval=10, maximize=True)  # a fixed 70 rounds, so early stopping never triggers\n", 570 | " \n", 571 | " train_score.loc[val.index, 'pred'] += bst.predict(dval)\n", 572 | " test_subm['is_click'] += bst.predict(dtest)\n", 573 | " " 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 93, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "from sklearn.metrics import roc_auc_score" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 94, 588 | "metadata": {}, 589 | "outputs": [ 590 | { 591 | "name": "stderr", 592 | "output_type": "stream", 593 | "text": [ 594 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 595 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 596 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 597 | "\n", 598 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 599 | " \"\"\"Entry point for launching an IPython kernel.\n" 600 | ] 601 | } 602 | ], 603 | "source": [ 604 | "train_score['pred'] /= nbag  # each train row was scored by nbag bagged models" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 95, 610 | "metadata": {}, 611 | "outputs": [ 612 | { 613 | "data": { 614 | "text/plain": [ 615 | "( is_click pred\n", 616 | " 0 0.0 0.007969\n", 617 | " 1 0.0 0.010713\n", 618 | " 2 0.0 0.008167\n", 619 | " 3 0.0 0.008279\n", 620 | " 4 0.0 0.007871, is_click pred\n", 621 | " 1023186 0.0 0.008056\n", 622 | " 1023187 0.0 0.009954\n", 623 | " 1023188 1.0 0.007931\n", 624 | " 1023189 0.0 0.009988\n", 625 | " 1023190 0.0 0.007871)" 626 | ] 627 | }, 628
| "execution_count": 95, 629 | "metadata": {}, 630 | "output_type": "execute_result" 631 | } 632 | ], 633 | "source": [ 634 | "train_score.head(5),train_score.tail(5)" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 96, 640 | "metadata": {}, 641 | "outputs": [ 642 | { 643 | "data": { 644 | "text/plain": [ 645 | "0.6603355706439776" 646 | ] 647 | }, 648 | "execution_count": 96, 649 | "metadata": {}, 650 | "output_type": "execute_result" 651 | } 652 | ], 653 | "source": [ 654 | "roc_auc_score(train_score.is_click,train_score.pred)" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 97, 660 | "metadata": {}, 661 | "outputs": [ 662 | { 663 | "name": "stderr", 664 | "output_type": "stream", 665 | "text": [ 666 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 667 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 668 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 669 | "\n", 670 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 671 | " \"\"\"Entry point for launching an IPython kernel.\n" 672 | ] 673 | } 674 | ], 675 | "source": [ 676 | "test_subm['is_click'] /= nfold*nbag" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": 98, 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [ 685 | "test_subm.to_csv('./xgb_2fold-cv2_bag3_nt70_scalepos1_nt70.csv',index=False)" 686 | ] 687 | } 688 | ], 689 | "metadata": { 690 | "kernelspec": { 691 | "display_name": "Python 3", 692 | "language": "python", 693 | "name": "python3" 694 | }, 695 | "language_info": { 696 | "codemirror_mode": { 697 | "name": "ipython", 698 | "version": 3 699 | }, 700 | "file_extension": ".py", 701 | "mimetype": "text/x-python", 702 | "name": "python", 703 | "nbconvert_exporter": "python", 704 | "pygments_lexer": "ipython3", 705 | "version": "3.6.4" 706 | } 707 | }, 708 | "nbformat": 4, 709 | "nbformat_minor": 2 710 | } 711 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Codes related to Lord of the Machines 2 | 3 | The repository contains Top-3 Winning solutions of "Lord of the Machines" (Competition Link : [https://datahack.analyticsvidhya.com/contest/lord-of-the-machines/](https://datahack.analyticsvidhya.com/contest/lord-of-the-machines/) ) 4 | 5 | Note: Although winning solutions are provided for use, datasets are not provided as the datasets are proprietary and do not comply with the License. 6 | --------------------------------------------------------------------------------