├── .gitignore ├── 1st_Place_Kunal ├── Features based on len of text.ipynb ├── LOM final.ipynb ├── LOM2.ipynb ├── LOM_1_model.ipynb ├── LOM_model_2.ipynb ├── LOM_text_features.ipynb └── README.md ├── 2nd_Place_Mark_SRK ├── Explorations.ipynb ├── build_model.py ├── build_model_xgb.py ├── ensemble.py └── readme.md ├── 3rd_Place_Aditya_Akash ├── 3rd_Place_Solution_Approach.docx ├── final_ensemble-simple_avg.ipynb ├── lgb_5fold-5_bag_nt45_rank_average_AND_lgb_5fold-5_bag_nt45_rank_average_4f.ipynb ├── lgb_5fold-5_bag_nt55_rank_average_5f_AND_lgb_5fold-5_bag_nt55_rank_average_4f.ipynb ├── lgb_new_features-v6-5fold_5bag_cv_retry_lb_692_ens6941-submitted.ipynb ├── lstm.ipynb ├── lstm_cnn.ipynb ├── readme.txt ├── user_cluster-kmeans.ipynb └── xgb_2fold-cv3_bag3_nt70_scalepos1_best_tree.ipynb └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | misc/ 2 | *~ 3 | -------------------------------------------------------------------------------- /1st_Place_Kunal/Features based on len of text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "path = '/home/kunal/Downloads/lord_of_machines/'\n", 26 | "train = pd.read_csv(path + 'train.csv')\n", 27 | "test = pd.read_csv(path + 'test.csv')\n", 28 | "campaign = pd.read_csv(path +'campaign_data.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 39, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "train_input = pd.read_csv(path + 'impact_encoded_train.csv')\n", 40 | "test_input = pd.read_csv(path + 'impact_encoded_test.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### CREATE EXTRA FEATURES USING GROUPBY STATISTICS" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "train = train.merge(campaign, on ='campaign_id',how = 'left')\n", 59 | "test = test.merge(campaign, on ='campaign_id',how = 'left')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "collapsed": true 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "#x1 = train.groupby('campaign_id')['is_click'].mean().sort_values(ascending = False).values" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "#x3 = train.groupby('campaign_id')['is_open'].mean().sort_values(ascending = False).values" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "#x2 = train['campaign_id'].value_counts().values" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 4, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "#def get_time(cell):\n", 104 | "# cell = cell.split(' ')[-1]\n", 105 | "# cell 
= cell.split(':')[0]\n", 106 | "# return cell\n", 107 | "#train['hour'] = train['send_date'].apply(get_time)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "#train[train['is_click'] == 1].hour.value_counts()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "#x5 = train['hour'].value_counts().values" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "#test['hour'] = test['send_date'].apply(get_time)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "#test['hour'].value_counts()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": true 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "train_input = pd.read_csv(path + 'impact_encoded_train.csv')\n", 163 | "test_input = pd.read_csv(path + 'impact_encoded_test.csv')" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "#x3 = train.groupby('hour')['is_open'].mean().sort_values(ascending = False).values" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 5, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "## Day of the week\n", 186 | "#def get_date(cell):\n", 187 | "# return cell.split(' ')[0]\n", 188 | "#train['date'] = train['send_date'].apply(get_date)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "collapsed": true 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "#import datetime\n", 200 | "#exp = train['date'][0]" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 6, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "#from datetime import datetime" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 7, 217 | "metadata": { 218 | "collapsed": true 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "#def get_weekday(cell):\n", 223 | "# return datetime.strptime(cell,'%d-%m-%Y').weekday()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 8, 229 | "metadata": { 230 | "collapsed": true 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "#train['weekday'] = train['date'].apply(get_weekday)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": { 241 | "collapsed": true 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "#train['weekday'].value_counts()" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 9, 251 | "metadata": { 252 | "collapsed": true 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "#test['date'] = test['send_date'].apply(get_date)\n", 257 | "#test['weekday'] = test['date'].apply(get_weekday)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | 
"source": [ 268 | "tr.groupby('weekday')['is_open'].mean()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": true 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "train.groupby('hour')['is_open'].mean()" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 30, 285 | "metadata": { 286 | "collapsed": true 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "campaign['len_sub'] = campaign['subject'].str.split(' ').apply(len)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 20, 296 | "metadata": { 297 | "collapsed": true 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "train['len_sub'] = train['campaign_id'].map(pd.Series(campaign['len_sub'],index = campaign['campaign_id']))\n", 302 | "test['len_sub'] = test['campaign_id'].map(pd.Series(campaign['len_sub'],index = campaign['campaign_id']))" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 31, 308 | "metadata": { 309 | "collapsed": true 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "campaign['len_sub_email'] = campaign['email_body'].str.split(' ').apply(len)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 24, 319 | "metadata": { 320 | "collapsed": true 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "train['len_sub_email'] = train['campaign_id'].map(pd.Series(campaign['len_sub_email'],index = campaign['campaign_id']))\n", 325 | "test['len_sub_email'] = test['campaign_id'].map(pd.Series(campaign['len_sub_email'],index = campaign['campaign_id']))" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 33, 331 | "metadata": { 332 | "collapsed": true 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "train.drop(['len_sub','len_sub_email'],axis = 1,inplace = True)\n", 337 | "test.drop(['len_sub','len_sub_email'],axis = 1,inplace = True)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 34, 343 | "metadata": { 344 | "collapsed": true 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "train = train.merge(campaign[['campaign_id','len_sub','len_sub_email']],on = 'campaign_id',how = 'left')\n", 349 | "test = test.merge(campaign[['campaign_id','len_sub','len_sub_email']],on = 'campaign_id',how = 'left')" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 41, 355 | "metadata": { 356 | "collapsed": true 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "train_input['len_sub'] = train['len_sub']\n", 361 | "train_input['len_sub_email'] = train['len_sub_email']\n", 362 | "\n", 363 | "test_input['len_sub'] = test['len_sub']\n", 364 | "test_input['len_sub_email'] = test['len_sub_email']" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 43, 370 | "metadata": { 371 | "collapsed": true 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "train_input.to_csv(path + 'impact_encoded_train.csv',index = False)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 44, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "test_input.to_csv(path + 'impact_encoded_test.csv',index = False)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "collapsed": true 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "# This way we have randomness and are able to reproduce the behaviour within this cell.\n", 398 | 
"np.random.seed(13)\n", 399 | "from sklearn.model_selection import KFold\n", 400 | "\n", 401 | "def impact_coding(data, feature, target='y'):\n", 402 | " '''\n", 403 | " In this implementation we get the values and the dictionary as two different steps.\n", 404 | " This is just because initially we were ignoring the dictionary as a result variable.\n", 405 | " \n", 406 | " In this implementation the KFolds use shuffling. If you want reproducibility the cv \n", 407 | " could be moved to a parameter.\n", 408 | " '''\n", 409 | " n_folds = 10\n", 410 | " n_inner_folds = 5\n", 411 | " impact_coded = pd.Series()\n", 412 | " \n", 413 | " oof_default_mean = data[target].mean() # Gobal mean to use by default (you could further tune this)\n", 414 | " kf = KFold(n_splits=n_folds, shuffle=True)\n", 415 | " oof_mean_cv = pd.DataFrame()\n", 416 | " split = 0\n", 417 | " for infold, oof in kf.split(data[feature]):\n", 418 | " impact_coded_cv = pd.Series()\n", 419 | " kf_inner = KFold(n_splits=n_inner_folds, shuffle=True)\n", 420 | " inner_split = 0\n", 421 | " inner_oof_mean_cv = pd.DataFrame()\n", 422 | " oof_default_inner_mean = data.iloc[infold][target].mean()\n", 423 | " for infold_inner, oof_inner in kf_inner.split(data.iloc[infold]):\n", 424 | " # The mean to apply to the inner oof split (a 1/n_folds % based on the rest)\n", 425 | " oof_mean = data.iloc[infold_inner].groupby(by=feature)[target].mean()\n", 426 | " impact_coded_cv = impact_coded_cv.append(data.iloc[infold].apply(\n", 427 | " lambda x: oof_mean[x[feature]]\n", 428 | " if x[feature] in oof_mean.index\n", 429 | " else oof_default_inner_mean\n", 430 | " , axis=1))\n", 431 | "\n", 432 | " # Also populate mapping (this has all group -> mean for all inner CV folds)\n", 433 | " inner_oof_mean_cv = inner_oof_mean_cv.join(pd.DataFrame(oof_mean), rsuffix=inner_split, how='outer')\n", 434 | " inner_oof_mean_cv.fillna(value=oof_default_inner_mean, inplace=True)\n", 435 | " inner_split += 1\n", 436 | "\n", 437 | " # Also populate mapping\n", 438 | " oof_mean_cv = oof_mean_cv.join(pd.DataFrame(inner_oof_mean_cv), rsuffix=split, how='outer')\n", 439 | " oof_mean_cv.fillna(value=oof_default_mean, inplace=True)\n", 440 | " split += 1\n", 441 | " \n", 442 | " impact_coded = impact_coded.append(data.iloc[oof].apply(\n", 443 | " lambda x: inner_oof_mean_cv.loc[x[feature]].mean()\n", 444 | " if x[feature] in inner_oof_mean_cv.index\n", 445 | " else oof_default_mean\n", 446 | " , axis=1))\n", 447 | "\n", 448 | " return impact_coded, oof_mean_cv.mean(axis=1), oof_default_mean" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": { 455 | "collapsed": true 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "f = 'weekday'" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": { 466 | "collapsed": true 467 | }, 468 | "outputs": [], 469 | "source": [ 470 | "# Apply the encoding to training and test data, and preserve the mapping\n", 471 | "impact_coding_map = {}\n", 472 | "print(\"Impact coding for {}\".format(f))\n", 473 | "train[\"impact_encoded_{}\".format(f)], impact_coding_mapping, default_coding = impact_coding(train, f,'is_click')\n", 474 | "impact_coding_map[f] = (impact_coding_mapping, default_coding)\n", 475 | "mapping, default_mean = impact_coding_map[f]\n", 476 | "test[\"impact_encoded_{}\".format(f)] = test.apply(lambda x: mapping[x[f]]\n", 477 | " if x[f] in mapping\n", 478 | " else default_mean\n", 479 | " , axis=1)" 480 | ] 481 | }, 482 | { 483 | 
"cell_type": "code", 484 | "execution_count": null, 485 | "metadata": { 486 | "collapsed": true 487 | }, 488 | "outputs": [], 489 | "source": [ 490 | "impact_coding_map = {}\n", 491 | "print(\"Impact coding for {}\".format(f))\n", 492 | "train[\"impact_encoded_open_{}\".format(f)], impact_coding_mapping, default_coding = impact_coding(train, f,'is_open')\n", 493 | "impact_coding_map[f] = (impact_coding_mapping, default_coding)\n", 494 | "mapping, default_mean = impact_coding_map[f]\n", 495 | "test[\"impact_encoded_open_{}\".format(f)] = test.apply(lambda x: mapping[x[f]]\n", 496 | " if x[f] in mapping\n", 497 | " else default_mean\n", 498 | " , axis=1)" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": { 505 | "collapsed": true 506 | }, 507 | "outputs": [], 508 | "source": [] 509 | } 510 | ], 511 | "metadata": { 512 | "kernelspec": { 513 | "display_name": "Python 3", 514 | "language": "python", 515 | "name": "python3" 516 | }, 517 | "language_info": { 518 | "codemirror_mode": { 519 | "name": "ipython", 520 | "version": 3 521 | }, 522 | "file_extension": ".py", 523 | "mimetype": "text/x-python", 524 | "name": "python", 525 | "nbconvert_exporter": "python", 526 | "pygments_lexer": "ipython3", 527 | "version": "3.6.0" 528 | } 529 | }, 530 | "nbformat": 4, 531 | "nbformat_minor": 2 532 | } 533 | -------------------------------------------------------------------------------- /1st_Place_Kunal/LOM final.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 5, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "path = '/home/kunal/Downloads/lord_of_machines/ensemble/'" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Take mean of the best solutions" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 85, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "df1 = pd.read_csv(path + 'f1.csv')\n", 44 | "#df2 = pd.read_csv(path + 'f2.csv')\n", 45 | "df3 = pd.read_csv(path + 'f3.csv')\n", 46 | "df4 = pd.read_csv(path + 'f4.csv')\n", 47 | "#df5 = pd.read_csv(path + 'f5.csv')\n", 48 | "df6 = pd.read_csv(path + 'f6.csv')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 100, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "w1 = 0.25\n", 60 | "#w2 = 0\n", 61 | "w3 =0.25*0.5\n", 62 | "w4 = 0.5\n", 63 | "#w5 = 0\n", 64 | "w6 = 0.25*0.5" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 36, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "sample = pd.read_csv('/home/kunal/Downloads/lord_of_machines/sample.csv')" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 101, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "sample['is_click'] = (df1['is_click']*w1 + \n", 87 | " df3['is_click']*w3 + df4['is_click']*w4 +\n", 88 | " df6['is_click']*w6)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 102, 94 | 
"metadata": { 95 | "collapsed": true 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "sample.to_csv(path + 'The_best_solution.csv',index = False)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [] 110 | } 111 | ], 112 | "metadata": { 113 | "kernelspec": { 114 | "display_name": "Python 3", 115 | "language": "python", 116 | "name": "python3" 117 | }, 118 | "language_info": { 119 | "codemirror_mode": { 120 | "name": "ipython", 121 | "version": 3 122 | }, 123 | "file_extension": ".py", 124 | "mimetype": "text/x-python", 125 | "name": "python", 126 | "nbconvert_exporter": "python", 127 | "pygments_lexer": "ipython3", 128 | "version": "3.6.0" 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 2 133 | } 134 | -------------------------------------------------------------------------------- /1st_Place_Kunal/LOM2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "path = '/home/kunal/Downloads/lord_of_machines/'\n", 26 | "train = pd.read_csv(path + 'train.csv')\n", 27 | "test = pd.read_csv(path + 'test.csv')\n", 28 | "campaign = pd.read_csv(path +'campaign_data.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Objective is to create aggregate features and encodings" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "#### Technique picked up from a kaggle forum. The code can be found here:https://www.kaggle.com/tnarik/likelihood-encoding-of-categorical-features/notebook" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "# This way we have randomness and are able to reproduce the behaviour within this cell.\n", 54 | "np.random.seed(13)\n", 55 | "from sklearn.model_selection import KFold\n", 56 | "\n", 57 | "def impact_coding(data, feature, target='y'):\n", 58 | " '''\n", 59 | " In this implementation we get the values and the dictionary as two different steps.\n", 60 | " This is just because initially we were ignoring the dictionary as a result variable.\n", 61 | " \n", 62 | " In this implementation the KFolds use shuffling. 
If you want reproducibility the cv \n", 63 | " could be moved to a parameter.\n", 64 | " '''\n", 65 | " n_folds = 10\n", 66 | " n_inner_folds = 5\n", 67 | " impact_coded = pd.Series()\n", 68 | " \n", 69 | " oof_default_mean = data[target].mean() # Global mean to use by default (you could further tune this)\n", 70 | " kf = KFold(n_splits=n_folds, shuffle=True)\n", 71 | " oof_mean_cv = pd.DataFrame()\n", 72 | " split = 0\n", 73 | " for infold, oof in kf.split(data[feature]):\n", 74 | " impact_coded_cv = pd.Series()\n", 75 | " kf_inner = KFold(n_splits=n_inner_folds, shuffle=True)\n", 76 | " inner_split = 0\n", 77 | " inner_oof_mean_cv = pd.DataFrame()\n", 78 | " oof_default_inner_mean = data.iloc[infold][target].mean()\n", 79 | " for infold_inner, oof_inner in kf_inner.split(data.iloc[infold]):\n", 80 | " # The mean to apply to the inner oof split (a 1/n_folds % based on the rest)\n", 81 | " oof_mean = data.iloc[infold_inner].groupby(by=feature)[target].mean()\n", 82 | " impact_coded_cv = impact_coded_cv.append(data.iloc[infold].apply(\n", 83 | " lambda x: oof_mean[x[feature]]\n", 84 | " if x[feature] in oof_mean.index\n", 85 | " else oof_default_inner_mean\n", 86 | " , axis=1))\n", 87 | "\n", 88 | " # Also populate mapping (this has all group -> mean for all inner CV folds)\n", 89 | " inner_oof_mean_cv = inner_oof_mean_cv.join(pd.DataFrame(oof_mean), rsuffix=inner_split, how='outer')\n", 90 | " inner_oof_mean_cv.fillna(value=oof_default_inner_mean, inplace=True)\n", 91 | " inner_split += 1\n", 92 | "\n", 93 | " # Also populate mapping\n", 94 | " oof_mean_cv = oof_mean_cv.join(pd.DataFrame(inner_oof_mean_cv), rsuffix=split, how='outer')\n", 95 | " oof_mean_cv.fillna(value=oof_default_mean, inplace=True)\n", 96 | " split += 1\n", 97 | " \n", 98 | " impact_coded = impact_coded.append(data.iloc[oof].apply(\n", 99 | " lambda x: inner_oof_mean_cv.loc[x[feature]].mean()\n", 100 | " if x[feature] in inner_oof_mean_cv.index\n", 101 | " else oof_default_mean\n", 102 | " , axis=1))\n", 103 | "\n", 104 | " return impact_coded, oof_mean_cv.mean(axis=1), oof_default_mean" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "### Prepare dataset for encoding\n", 116 | "train = train.merge(campaign,on = 'campaign_id',how = 'left')\n", 117 | "test = test.merge(campaign, on = 'campaign_id',how = 'left')\n", 118 | "train['user_id'] = train['user_id'].apply(str)\n", 119 | "train['campaign_id'] = train['campaign_id'].apply(str)\n", 120 | "test['user_id'] = test['user_id'].apply(str)\n", 121 | "test['campaign_id'] = test['campaign_id'].apply(str)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 5, 127 | "metadata": { 128 | "collapsed": true 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "train = train.drop(['email_body', 'subject', 'email_url','send_date'],axis =1 )\n", 133 | "test = test.drop(['email_body', 'subject', 'email_url','send_date'],axis =1 )" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 6, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "['id', 'user_id', 'campaign_id', 'communication_type']" 145 | ] 146 | }, 147 | "execution_count": 6, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "features = train.columns\n", 154 | "numeric_features = []\n", 155 | "categorical_features = []\n", 156 | "\n", 157 | "for dtype,
feature in zip(train.dtypes, train.columns):\n", 158 | " if dtype == object:\n", 159 | " #print(column)\n", 160 | " #print(train_data[column].describe())\n", 161 | " categorical_features.append(feature)\n", 162 | " else:\n", 163 | " numeric_features.append(feature)\n", 164 | "categorical_features" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 7, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "Impact coding for user_id\n", 177 | "Impact coding for campaign_id\n", 178 | "Impact coding for communication_type\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "# Apply the encoding to training and test data, and preserve the mapping\n", 184 | "impact_coding_map = {}\n", 185 | "for f in categorical_features[1:]:\n", 186 | " print(\"Impact coding for {}\".format(f))\n", 187 | " train[\"impact_encoded_{}\".format(f)], impact_coding_mapping, default_coding = impact_coding(train, f,'is_click')\n", 188 | " impact_coding_map[f] = (impact_coding_mapping, default_coding)\n", 189 | " mapping, default_mean = impact_coding_map[f]\n", 190 | " test[\"impact_encoded_{}\".format(f)] = test.apply(lambda x: mapping[x[f]]\n", 191 | " if x[f] in mapping\n", 192 | " else default_mean\n", 193 | " , axis=1)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 8, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "Impact coding for user_id\n", 206 | "Impact coding for campaign_id\n", 207 | "Impact coding for communication_type\n" 208 | ] 209 | } 210 | ], 211 | "source": [ 212 | "impact_coding_map = {}\n", 213 | "for f in categorical_features[1:]:\n", 214 | " print(\"Impact coding for {}\".format(f))\n", 215 | " train[\"impact_encoded_open_{}\".format(f)], impact_coding_mapping, default_coding = impact_coding(train, f,'is_open')\n", 216 | " impact_coding_map[f] = (impact_coding_mapping, default_coding)\n", 217 | " mapping, default_mean = impact_coding_map[f]\n", 218 | " test[\"impact_encoded_open_{}\".format(f)] = test.apply(lambda x: mapping[x[f]]\n", 219 | " if x[f] in mapping\n", 220 | " else default_mean\n", 221 | " , axis=1)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 9, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "train.to_csv(path + 'impact_encoded_train.csv',index = False)\n", 233 | "test.to_csv(path + 'impact_encoded_test.csv',index = False)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [] 244 | } 245 | ], 246 | "metadata": { 247 | "kernelspec": { 248 | "display_name": "Python 3", 249 | "language": "python", 250 | "name": "python3" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 3 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython3", 262 | "version": "3.6.0" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 2 267 | } 268 | -------------------------------------------------------------------------------- /1st_Place_Kunal/LOM_1_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | 
"metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "path = '/home/kunal/Downloads/lord_of_machines/'\n", 26 | "train = pd.read_csv(path + 'train.csv')\n", 27 | "test = pd.read_csv(path + 'test.csv')\n", 28 | "#campaign = pd.read_csv(path +'campaign_data.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "train_input = pd.read_csv(path + 'impact_encoded_train.csv')\n", 40 | "test_input = pd.read_csv(path + 'impact_encoded_test.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "user_features = pd.read_csv(path + 'user_features.csv')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Modelling part" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "from sklearn.decomposition import PCA\n", 70 | "pca = PCA(40,random_state = 10)\n", 71 | "user_features_matrix = pca.fit_transform(user_features.iloc[:,1:])" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "user_features_matrix = pd.DataFrame(user_features_matrix)\n", 83 | "user_features_matrix['user_id'] = user_features['user_id']\n", 84 | "train_input = train_input.merge(user_features_matrix,on = 'user_id',how = 'left')\n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "#pca2 = PCA(30,random_state = 10)\n", 96 | "#cluster_feature = pca2.fit_transform(campaign_features.iloc[:,1:])" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "#cluster_feature = pd.DataFrame(cluster_feature)\n", 108 | "#cluster_feature['campaign_id'] = campaign['campaign_id']" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "#train_input['campaign_id'] = train['campaign_id']\n", 120 | "#train_input = train_input.merge(cluster_feature,on = 'campaign_id',how = 'left')" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "test_input = test_input.merge(user_features_matrix,on = 'user_id',how = 'left')\n", 132 | "#test_input['campaign_id'] = test['campaign_id']\n", 133 | "#test_input = test_input.merge(cluster_feature,on = 'campaign_id',how = 'left')" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "#x = encoded_features(train_input,train_input)\n", 145 | "#y = encoded_features(df = 
train_input,df_new=test_input)\n", 146 | "x = train_input.copy()\n", 147 | "y = test_input.copy()\n", 148 | "count_feature = pd.concat([train['user_id'],test['user_id']])\n", 149 | "count_feature = count_feature.value_counts()\n", 150 | "x['counts'] = x['user_id'].map(count_feature)\n", 151 | "y['counts'] = y['user_id'].map(count_feature)\n", 152 | "x = x.drop(['user_id','is_click','is_open','id','campaign_id','communication_type'],axis = 1)\n", 153 | "y = y.drop(['user_id','id','campaign_id','communication_type'],axis = 1)\n", 154 | "#x['total_open_percentage'] = x['total_open']/x['counts']\n", 155 | "#y['total_open_percentage'] = y['total_open']/y['counts']" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "collapsed": true 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "#x.drop('Unnamed: 0',axis = 1,inplace=True)\n", 167 | "#y.drop('Unnamed: 0',axis = 1,inplace=True)\n", 168 | "print(x.shape)\n", 169 | "print(y.shape)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "print(train_input.shape)\n", 181 | "print(test_input.shape)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "### Check the columns in X and Y\n", 193 | "### They should be same and id variables should not be present\n", 194 | "print(x.columns)\n", 195 | "print('*-'*50)\n", 196 | "print(y.columns)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "from xgboost import XGBClassifier\n", 208 | "from catboost import CatBoostClassifier\n", 209 | "from sklearn.neighbors import KNeighborsClassifier\n", 210 | "from sklearn.ensemble import RandomForestClassifier\n", 211 | "from sklearn.ensemble import ExtraTreesClassifier\n", 212 | "from sklearn.linear_model import LogisticRegression\n", 213 | "\n", 214 | "xg = XGBClassifier(n_estimators = 600,max_depth = 6,gamma = 10)\n", 215 | "#cb = CatBoostClassifier()\n", 216 | "#knn = KNeighborsClassifier()\n", 217 | "#rf = RandomForestClassifier()\n", 218 | "#et = ExtraTreesClassifier()\n", 219 | "#lr = LogisticRegression()" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "collapsed": true 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "### Choose your algo and use fit method (replace classifier by the name of your algo constructor)\n", 231 | "xg.fit(x,train['is_click'])\n" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "predictions = xg.predict_proba(y)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "sample = pd.read_csv(path + 'sample.csv')" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "collapsed": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "name_of_submission_file = 'final_sub' # select name of file\n", 265 | "sample['is_click'] = predictions[:,1]\n", 266 | "sample.to_csv(path + '{}.csv'.format(name_of_submission_file), 
index=False)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "collapsed": true 274 | }, 275 | "outputs": [], 276 | "source": [] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": { 282 | "collapsed": true 283 | }, 284 | "outputs": [], 285 | "source": [] 286 | } 287 | ], 288 | "metadata": { 289 | "kernelspec": { 290 | "display_name": "Python 3", 291 | "language": "python", 292 | "name": "python3" 293 | }, 294 | "language_info": { 295 | "codemirror_mode": { 296 | "name": "ipython", 297 | "version": 3 298 | }, 299 | "file_extension": ".py", 300 | "mimetype": "text/x-python", 301 | "name": "python", 302 | "nbconvert_exporter": "python", 303 | "pygments_lexer": "ipython3", 304 | "version": "3.6.0" 305 | } 306 | }, 307 | "nbformat": 4, 308 | "nbformat_minor": 2 309 | } 310 | -------------------------------------------------------------------------------- /1st_Place_Kunal/LOM_model_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "path = '/home/kunal/Downloads/lord_of_machines/'\n", 26 | "train = pd.read_csv(path + 'train.csv')\n", 27 | "test = pd.read_csv(path + 'test.csv')\n", 28 | "campaign = pd.read_csv(path +'campaign_data.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "#weekday_train = pd.read_csv(path + 'weekday_train.csv')\n", 40 | "#weekday_test = pd.read_csv(path + 'weekday_test.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "train_input = pd.read_csv(path + 'impact_encoded_train.csv')\n", 52 | "test_input = pd.read_csv(path + 'impact_encoded_test.csv')" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "#train_input1 = pd.read_csv(path + 'train_new.csv')\n", 64 | "#test_input1 = pd.read_csv(path + 'test_new.csv')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "### Generate user profile based on their interests in communication type" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "train = train.merge(campaign, on = 'campaign_id',how = 'left')\n", 83 | "test = test.merge(campaign, on = 'campaign_id',how = 'left')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "data = pd.concat([train[['user_id','communication_type']],test[['user_id','communication_type']]])" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 
| "user_profile = pd.crosstab(data['user_id'],data['communication_type'])" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "collapsed": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "user_profile.reset_index(inplace = True)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": true 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "train_input = train_input.merge(user_profile,on = 'user_id',how = 'left')\n", 128 | "test_input = test_input.merge(user_profile,on = 'user_id',how = 'left')" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "#train_input = pd.concat([train_input,weekday_train],axis = 1)\n", 140 | "#test_input = pd.concat([test_input,weekday_test],axis = 1)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "#train_input['impact_encoded_weekday'] = train['impact_encoded_weekday']\n", 152 | "#train_input['impact_encoded_open_weekday'] = train['impact_encoded_open_weekday']\n", 153 | "#test_input['impact_encoded_weekday'] = test['impact_encoded_weekday']\n", 154 | "#test_input['impact_encoded_open_weekday'] = test['impact_encoded_open_weekday']" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "collapsed": true 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "#weekday_train.to_csv(path + 'weekday_train.csv',index = False)\n", 166 | "#weekday_test.to_csv(path + 'weekday_test.csv',index = False)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": true 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "train_input.columns" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "user_features = pd.read_csv(path + 'user_features.csv')" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "collapsed": true 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "#train_input['user_id'] = train['user_id']\n", 200 | "#test_input['user_id'] = test['user_id']\n", 201 | "#train_input['is_open'] = train['is_open']\n", 202 | "#train_input['is_click'] = train['is_click']" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "#campaign_features = pd.read_csv(path + '2_gram_campaign_features.csv')" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "collapsed": true 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "#campaign_features.head()" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "## Modelling part" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "from sklearn.decomposition import PCA\n", 243 | "pca = PCA(50,random_state = 10)\n", 244 | "user_features_matrix = 
pca.fit_transform(user_features.iloc[:,1:])" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": true 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "user_features_matrix = pd.DataFrame(user_features_matrix)\n", 256 | "user_features_matrix['user_id'] = user_features['user_id']\n", 257 | "train_input = train_input.merge(user_features_matrix,on = 'user_id',how = 'left')" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "#pca2 = PCA(30,random_state = 10)\n", 269 | "#cluster_feature = pca2.fit_transform(campaign_features.iloc[:,1:])" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": { 276 | "collapsed": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "#cluster_feature = pd.DataFrame(cluster_feature)\n", 281 | "#cluster_feature['campaign_id'] = campaign['campaign_id']" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": { 288 | "collapsed": true 289 | }, 290 | "outputs": [], 291 | "source": [ 292 | "#train_input['campaign_id'] = train['campaign_id']\n", 293 | "#train_input = train_input.merge(campaign_features,on = 'campaign_id',how = 'left')" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": true 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "test_input = test_input.merge(user_features_matrix,on = 'user_id',how = 'left')\n", 305 | "#test_input['campaign_id'] = test['campaign_id']\n", 306 | "#test_input = test_input.merge(campaign_features,on = 'campaign_id',how = 'left')" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "#x = encoded_features(train_input,train_input)\n", 318 | "#y = encoded_features(df = train_input,df_new=test_input)\n", 319 | "x = train_input.copy()\n", 320 | "y = test_input.copy()\n", 321 | "count_feature = pd.concat([train['user_id'],test['user_id']])\n", 322 | "count_feature = count_feature.value_counts()\n", 323 | "x['counts'] = x['user_id'].map(count_feature)\n", 324 | "y['counts'] = y['user_id'].map(count_feature)\n", 325 | "x = x.drop(['user_id','is_click','is_open','campaign_id','id','communication_type'],axis = 1)\n", 326 | "y = y.drop(['user_id','campaign_id','id','communication_type'],axis = 1)\n", 327 | "#x['total_open_percentage'] = x['total_open']/x['counts']\n", 328 | "#y['total_open_percentage'] = y['total_open']/y['counts']" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": { 335 | "collapsed": true 336 | }, 337 | "outputs": [], 338 | "source": [ 339 | "### Check the columns in X and Y\n", 340 | "### They should be same and id variables should not be present\n", 341 | "print(x.columns)\n", 342 | "#print('*-'*50)\n", 343 | "print(y.columns)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "collapsed": true 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "#x.drop(['id','communication_type'],axis = 1,inplace = True)\n", 355 | "#y.drop(['id','communication_type'],axis = 1,inplace = True)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "collapsed": true 363 | }, 
364 | "outputs": [], 365 | "source": [ 366 | "#x.drop(['impact_encoded_weekday','impact_encoded_open_weekday'],axis = 1,inplace = True)\n", 367 | "#y.drop(['impact_encoded_weekday','impact_encoded_open_weekday'],axis = 1,inplace = True)" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": { 374 | "collapsed": true 375 | }, 376 | "outputs": [], 377 | "source": [ 378 | "#from sklearn.model_selection import train_test_split\n", 379 | "#Xtrain,Xtest,ytrain,ytest = train_test_split(x,train['is_click'],test_size = 0.7)" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": { 386 | "collapsed": true 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "from xgboost import XGBClassifier\n", 391 | "#from catboost import CatBoostClassifier\n", 392 | "#from sklearn.neighbors import KNeighborsClassifier\n", 393 | "#from sklearn.ensemble import RandomForestClassifier\n", 394 | "#from sklearn.ensemble import ExtraTreesClassifier\n", 395 | "#from sklearn.linear_model import LogisticRegression\n", 396 | "\n", 397 | "xg = XGBClassifier(n_estimators = 500,max_depth= 7,gamma = 20,colsample_bylevel=0.9,colsample_bytree=0.9)\n", 398 | "#cb = CatBoostClassifier()\n", 399 | "#knn = KNeighborsClassifier()\n", 400 | "#rf = RandomForestClassifier()\n", 401 | "#et = ExtraTreesClassifier()\n", 402 | "#lr = LogisticRegression()" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": { 409 | "collapsed": true 410 | }, 411 | "outputs": [], 412 | "source": [ 413 | "### Choose your algo and use fit method (replace classifier by the name of your algo constructor)\n", 414 | "xg.fit(x,train['is_click'])\n" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": { 421 | "collapsed": true 422 | }, 423 | "outputs": [], 424 | "source": [ 425 | "predictions = xg.predict_proba(y)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": { 432 | "collapsed": true 433 | }, 434 | "outputs": [], 435 | "source": [ 436 | "sample = pd.read_csv(path + 'sample.csv')" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": { 443 | "collapsed": true 444 | }, 445 | "outputs": [], 446 | "source": [ 447 | "\n", 448 | "name_of_submission_file = 'final_sub2' # select name of file\n", 449 | "sample['is_click'] = predictions[:,1]\n", 450 | "sample.to_csv(path + '{}.csv'.format(name_of_submission_file),index = False)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": { 457 | "collapsed": true 458 | }, 459 | "outputs": [], 460 | "source": [ 461 | "from xgboost import plot_importance" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": { 468 | "collapsed": true 469 | }, 470 | "outputs": [], 471 | "source": [ 472 | "from sklearn.metrics import roc_auc_score\n", 473 | "print(roc_auc_score(train['is_click'],xg.predict_proba(x)[:,1]))" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": { 480 | "collapsed": true 481 | }, 482 | "outputs": [], 483 | "source": [ 484 | "plot_importance(xg)" 485 | ] 486 | } 487 | ], 488 | "metadata": { 489 | "kernelspec": { 490 | "display_name": "Python 3", 491 | "language": "python", 492 | "name": "python3" 493 | }, 494 | "language_info": { 495 | "codemirror_mode": { 496 | "name": "ipython", 497 | "version": 
3 498 | }, 499 | "file_extension": ".py", 500 | "mimetype": "text/x-python", 501 | "name": "python", 502 | "nbconvert_exporter": "python", 503 | "pygments_lexer": "ipython3", 504 | "version": "3.6.0" 505 | } 506 | }, 507 | "nbformat": 4, 508 | "nbformat_minor": 2 509 | } 510 | -------------------------------------------------------------------------------- /1st_Place_Kunal/LOM_text_features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "path = '/home/kunal/Downloads/lord_of_machines/'\n", 26 | "train = pd.read_csv(path + 'train.csv')\n", 27 | "test = pd.read_csv(path + 'test.csv')\n", 28 | "campaign = pd.read_csv(path +'campaign_data.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Create new user and campaign features based on text" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer\n", 47 | "from sklearn.decomposition import TruncatedSVD\n", 48 | "from sklearn.pipeline import make_pipeline\n", 49 | "from sklearn.preprocessing import Normalizer" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "bow_vectorizer_uni = CountVectorizer(ngram_range=(1,1),stop_words='english')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "import re\n", 72 | "from nltk.stem import PorterStemmer\n", 73 | "ps = PorterStemmer()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "def review_to_words( cell ):\n", 85 | " \n", 86 | " letters_only = re.sub(\"[^a-zA-Z]\", \" \", cell) \n", 87 | " words = letters_only.lower().split() \n", 88 | " stemmed_words = [ps.stem(w) for w in words]\n", 89 | " return( \" \".join( stemmed_words )) " 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "campaign['cleaned_subject'] = campaign['subject'].apply(review_to_words)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "X_train_bow_uni = bow_vectorizer_uni.fit_transform(campaign['cleaned_subject'])" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "X_train_bow_uni = pd.DataFrame(X_train_bow_uni.toarray())\n", 123 | "X_train_bow_uni['campaign_id'] = campaign['campaign_id']" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null,
129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "user_data = pd.concat([train[['user_id','campaign_id']],test[['user_id','campaign_id']]])" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "user_data_sum = user_data.merge(X_train_bow_uni,on = 'campaign_id',how = 'left').drop('campaign_id',axis = 1).groupby('user_id').sum()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "user_data_sum.reset_index(inplace = True)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "collapsed": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "user_data_sum.to_csv(path + 'user_features.csv',index= False)" 168 | ] 169 | } 170 | ], 171 | "metadata": { 172 | "kernelspec": { 173 | "display_name": "Python 3", 174 | "language": "python", 175 | "name": "python3" 176 | }, 177 | "language_info": { 178 | "codemirror_mode": { 179 | "name": "ipython", 180 | "version": 3 181 | }, 182 | "file_extension": ".py", 183 | "mimetype": "text/x-python", 184 | "name": "python", 185 | "nbconvert_exporter": "python", 186 | "pygments_lexer": "ipython3", 187 | "version": "3.6.0" 188 | } 189 | }, 190 | "nbformat": 4, 191 | "nbformat_minor": 2 192 | } 193 | -------------------------------------------------------------------------------- /1st_Place_Kunal/README.md: -------------------------------------------------------------------------------- 1 | ## Approach 2 | 3 | The competition was based on an imbalanced binary classification problem with AUCROC metric. 4 | I created several features based on textual information and user behaviour to arrive at my final solution 5 | The features created were: 6 | 1) Target encoding of user_id with respect to is_open and is_click 7 | 2) Target encoding of campaign_id with respect to is_open and is_click 8 | 3) Target encoding of communication_type with respect to is_open and is_click 9 | 4) Length of email body (word wise) 10 | 5) Length of subject 11 | 6) Key feature : I pre-processed the text in the subject by removing stop words, lemmatizing them, removing punctuations etc. After that I used a bag of words (unigram) representation of different 12 | campaign_ids based on their subject. This was followed by merging this dataset with campaing_ids present in the train and test data. After this merge operation. I used groupby sum based on user_id to obtain a unique representation for every user. This was followed by PCA to reduce the dimensions to 50. This operation added the biggest jump to my score. 13 | 7) Number of mails received by different users 14 | 8) Cross tab of user_id vs communication type 15 | 9) Numerical features present in the campaign_data 16 | 17 | This became my general frame work for data preparation before feeding it into any model. An xgboost model with these set of features gave me score of 0.695+ on the public leaderboard. What followed after this was sheer pragmatism. I created several models based on approximately the same frame work and differentiated them by adding variability. 
17 | This became my general framework for data preparation before feeding it into any model. An XGBoost model with this set of features gave me a score of 0.695+ on the public leaderboard. What followed after this was sheer pragmatism. I created several models based on approximately the same framework and differentiated them by adding variability. Some of the important variations were: 18 | 1) Using bi-grams for the BOW representation 19 | 2) Using tri-grams for the BOW representation 20 | 3) Using all three of them 21 | 4) Using tf-idf with the same (unigram, bi-gram, tri-gram) ranges 22 | 5) Using LightGBM, XGBoost and CatBoost on each of the three representations above 23 | 6) Using truncated SVD instead of PCA for dimension reduction 24 | 7) I even dropped the best performing feature and tuned the hyper-parameters in such a way as to arrive at similar scores using the remaining features 25 | 8) Target encoding of the weekday of the sent mail 26 | 9) Cosine distance among the GloVe vector representations of different campaign_ids 27 | 28 | These are just some of them. I created many notebooks, added/dropped/modified many features, and performed many experiments, most of which gave me a public leaderboard score in the vicinity of 0.685 - 0.69. Even though the performance of all the models was similar, their predictions were not highly correlated. This gave me the opportunity to take advantage of weighted ensembles to arrive at a higher score. I took the most similar-scoring prediction files with the least correlation and took their weighted average (a sketch of this blending step follows at the end of this README). I continued this process in an uphill fashion and ended up with the four best performing predictions, with scores of 0.699 - 0.7011. I again followed the same heuristic to arrive at my final score, which gave me a public leaderboard score of 0.704. This entire process is very similar to model stacking, where the predictions of diverse base classifiers are fed to a meta-classifier to arrive at better predictions. Only in my case, it was me manually adjusting the weights assigned to the different models by validating them against the public leaderboard. 29 | 30 | ## How to run the code? 31 | The order of running the code files is: 32 | 33 | 1) LOM2 - (for generating encodings of user_id and campaign_id using target encoding) 34 | 2) LOM_text_features - (for generating features based on text) 35 | 3) Features based on len of text 36 | 4) LOM_1_model & LOM_model_2 - (both are used for generating the final submissions) 37 | 5) LOM final 38 | 39 | Please take note that I created many solutions using different features, sometimes different hyper-parameters, and even different algorithms. For the final solution, I took their weighted ensemble (weights decided against the public leaderboard). 40 | 41 | The LOM_1_model and LOM_model_2 notebooks are my top two single performing models. However, the best solution is a combination of different predictions. The files in the ensemble folder contain these different predictions (kindly change the path accordingly in the LOM final notebook). 42 | 43 | Some variations that I used for arriving at these predictions are: 44 | 1) Using bi-gram text features 45 | 2) Increasing the dimension parameter in PCA 46 | 3) Using a mixture of bi-gram & tri-gram 47 | 4) Using a mixture of CatBoost and LightGBM on uni-gram features 48 | 5) Averaging predictions over different XGBoost depths 49 | 6) Using tf-idf vectors instead of bag of words 50 | 51 | Finally, it was a great competition with lots of learning and excitement. 52 | Thank you team AV for organizing this contest.
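For reference, the blending step described above (the LOM final notebook) reduces to a weighted average of the individual prediction files; a minimal sketch using the weights from that notebook (f1.csv, f3.csv, f4.csv and f6.csv are the prediction files in the ensemble folder):

```python
import pandas as pd

path = "ensemble/"  # folder holding the individual prediction files

# Weights were tuned by hand against the public leaderboard;
# w3 and w6 are 0.25 * 0.5 in the notebook, i.e. 0.125 each.
weights = {"f1.csv": 0.25, "f3.csv": 0.125, "f4.csv": 0.5, "f6.csv": 0.125}

sample = pd.read_csv("sample.csv")  # sample submission file
sample["is_click"] = sum(pd.read_csv(path + name)["is_click"] * w
                         for name, w in weights.items())
sample.to_csv(path + "The_best_solution.csv", index=False)
```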
53 | 54 | -------------------------------------------------------------------------------- /2nd_Place_Mark_SRK/build_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import metrics, model_selection, ensemble, preprocessing, linear_model 4 | import lightgbm as lgb 5 | 6 | def getCountVar(compute_df, count_df, var_name, count_var="v1"): 7 | grouped_df = count_df.groupby(var_name)[count_var].agg('count').reset_index() 8 | grouped_df.columns = var_name + ["var_count"] 9 | 10 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 11 | merged_df.fillna(np.mean(grouped_df["var_count"].values), inplace=True) 12 | return list(merged_df["var_count"]) 13 | 14 | def getDVEncodeVar(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1): 15 | if type(var_name) != type([]): 16 | var_name = [var_name] 17 | grouped_df = target_df.groupby(var_name)[target_var].agg(["mean"]).reset_index() 18 | grouped_df.columns = var_name + ["mean_value"] 19 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 20 | merged_df.fillna(np.mean(target_df[target_var].values), inplace=True) 21 | return list(merged_df["mean_value"]) 22 | 23 | def getDVEncodeVar2(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1): 24 | if type(var_name) != type([]): 25 | var_name = [var_name] 26 | grouped_df = target_df.groupby(var_name)[target_var].agg(["sum"]).reset_index() 27 | grouped_df.columns = var_name + ["sum_value"] 28 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 29 | merged_df.fillna(np.mean(grouped_df["sum_value"].values), inplace=True) 30 | return list(merged_df["sum_value"]) 31 | 32 | 33 | def runLR(train_X, train_y, test_X, test_y=None, test_X2=None): 34 | model = linear_model.LogisticRegression(fit_intercept=True, C=0.3) 35 | model.fit(train_X, train_y) 36 | print model.coef_, model.intercept_ 37 | train_preds = model.predict_proba(train_X)[:,1] 38 | test_preds = model.predict_proba(test_X)[:,1] 39 | test_preds2 = model.predict_proba(test_X2)[:,1] 40 | test_loss = 0 41 | if test_y is not None: 42 | train_loss = metrics.roc_auc_score(train_y, train_preds) 43 | test_loss = metrics.roc_auc_score(test_y, test_preds) 44 | print "Train and Test loss : ", train_loss, test_loss 45 | return test_preds, test_loss, test_preds2 46 | 47 | def runET(train_X, train_y, test_X, test_y=None, test_X2=None, depth=10, leaf=5, feat=0.3): 48 | model = ensemble.ExtraTreesClassifier( 49 | n_estimators = 300, 50 | max_depth = depth, 51 | min_samples_split = 10, 52 | min_samples_leaf = leaf, 53 | max_features = feat, 54 | n_jobs = 6, 55 | random_state = 0) 56 | model.fit(train_X, train_y) 57 | train_preds = model.predict_proba(train_X)[:,1] 58 | test_preds = model.predict_proba(test_X)[:,1] 59 | test_preds2 = model.predict_proba(test_X2)[:,1] 60 | test_loss = 0 61 | if test_y is not None: 62 | train_loss = metrics.roc_auc_score(train_y, train_preds) 63 | test_loss = metrics.roc_auc_score(test_y, test_preds) 64 | print "Depth, leaf, feat : ", depth, leaf, feat 65 | print "Train and Test loss : ", train_loss, test_loss 66 | return test_preds, test_loss, test_preds2 67 | 68 | def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=3, eta=0.001): 69 | params = {} 70 | params["objective"] = "binary" 71 | params['metric'] = 'auc' 72 | params["max_depth"] = dep 73 | params["min_data_in_leaf"] = 100 74 | 
params["learning_rate"] = eta 75 | params["bagging_fraction"] = 0.7 76 | params["feature_fraction"] = 0.7 77 | params["bagging_freq"] = 5 78 | params["bagging_seed"] = seed_val 79 | params["verbosity"] = -1 80 | num_rounds = rounds 81 | 82 | plst = list(params.items()) 83 | lgtrain = lgb.Dataset(train_X, label=train_y) 84 | 85 | if test_y is not None: 86 | lgtest = lgb.Dataset(test_X, label=test_y) 87 | model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=100, verbose_eval=20) 88 | else: 89 | lgtest = lgb.DMatrix(test_X) 90 | model = lgb.train(params, lgtrain, num_rounds) 91 | 92 | pred_test_y = model.predict(test_X, num_iteration=model.best_iteration) 93 | pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration) 94 | 95 | loss = 0 96 | if test_y is not None: 97 | loss = metrics.roc_auc_score(test_y, pred_test_y) 98 | print loss 99 | return pred_test_y, loss, pred_test_y2 100 | else: 101 | return pred_test_y, loss, pred_test_y2 102 | 103 | if __name__ == "__main__": 104 | print "Reading input files..." 105 | train_df = pd.read_csv("../input/train_feat.csv") 106 | test_df = pd.read_csv("../input/test_feat.csv") 107 | campaign_df = pd.read_csv("../input/campaign_data.csv") 108 | train_df["is_open_alone"] = train_df["is_click"].astype('float') / np.maximum(train_df["is_open"],1) 109 | print train_df.shape, test_df.shape 110 | print train_df.head() 111 | 112 | 113 | print np.sort(train_df["campaign_id"].unique()) 114 | #camp_indices = [[range(29, 47), range(47,56)], [range(47,56), range(29, 47)]] 115 | 116 | print "Merging with campaign data.." 117 | train_df = pd.merge(train_df, campaign_df, on="campaign_id") 118 | test_df = pd.merge(test_df, campaign_df, on="campaign_id") 119 | print train_df.shape, test_df.shape 120 | kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017) 121 | 122 | train_y_open = train_df["is_open"].values 123 | train_y = train_df["is_click"].values 124 | test_id = test_df["id"].values 125 | train_unique_campaigns = np.array(train_df["campaign_id"].unique()) 126 | cols_to_use = ["user_cum_count", "user_count", "user_date_diff", "user_camp_diff", "hour"] #, "total_links","no_of_internal_links","no_of_images","no_of_sections"] 127 | #cols_to_use = ["user_cum_count", "user_count", "user_camp_diff"] 128 | #cols_to_use = [] 129 | #cols_to_use = cols_to_use + ["first_open", "first_click", "second_open", "second_click", "third_open", "third_click"] 130 | cols_to_use = cols_to_use + ["user_min_date", "user_mean_date", "user_max_date", "user_std_date"] 131 | cols_to_use = cols_to_use + ["camp_"+str(i) for i in range(29,81)] + ["camps_sent"] 132 | #cols_to_use = cols_to_use + ["user_std_date_click", "user_std_date_open"] 133 | 134 | #print "Label encoding.." 135 | #for c in ["communication_type"]: 136 | # cols_to_use.append(c) 137 | # lbl = preprocessing.LabelEncoder() 138 | # lbl.fit(list(train_df[c].values.astype('str')) + list(test_df[c].values.astype('str'))) 139 | # train_df[c] = lbl.transform(list(train_df[c].values.astype('str'))) 140 | # test_df[c] = lbl.transform(list(test_df[c].values.astype('str'))) 141 | 142 | 143 | #print "Full Count encoding.." 
144 | #full_df = train_df.append(test_df) 145 | #print full_df.shape 146 | #for col in [["user_id"]]: 147 | # if isinstance(col, list): 148 | # col_name = "_".join(col) 149 | # train_df[col_name + "_full_count"] = np.array( getCountVar(train_df, full_df, col, 'id')) 150 | # test_df[col_name + "_full_count"] = np.array( getCountVar(test_df, full_df, col, 'id')) 151 | # cols_to_use.append(col_name + "_full_count") 152 | 153 | 154 | print "Count encoding.." 155 | for col in [["user_id"], ["user_id", "communication_type"]]: 156 | #for col in [["user_id"]]: 157 | train_enc_values = np.zeros(train_df.shape[0]) 158 | test_enc_values = 0 159 | for dev_index, val_index in kf.split(train_unique_campaigns): 160 | #for [dev_camp, val_camp] in camp_indices: 161 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 162 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 163 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getCountVar(val_X[col], dev_X, col, 'is_click')) 164 | test_enc_values += np.array( getCountVar(test_df[col], dev_X, col, 'is_click')) 165 | test_enc_values /= 5. 166 | if isinstance(col, list): 167 | col = "_".join(col) 168 | train_df[col + "_count"] = train_enc_values 169 | test_df[col + "_count"] = test_enc_values 170 | cols_to_use.append(col + "_count") 171 | 172 | 173 | 174 | print "Target encoding.." 175 | for col in [["user_id"], ["user_id", "communication_type"]]: 176 | #for col in [["user_id"]]: 177 | train_enc_values = np.zeros(train_df.shape[0]) 178 | test_enc_values = 0 179 | for dev_index, val_index in kf.split(train_unique_campaigns): 180 | #for [dev_camp, val_camp] in camp_indices: 181 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 182 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 183 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_click')) 184 | test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_click')) 185 | test_enc_values /= 5. 186 | if isinstance(col, list): 187 | col = "_".join(col) 188 | train_df[col + "_enc"] = train_enc_values 189 | test_df[col + "_enc"] = test_enc_values 190 | cols_to_use.append(col + "_enc") 191 | 192 | 193 | print "Open Target encoding.." 194 | for col in [["user_id"], ["user_id", "communication_type"]]: 195 | #for col in [["user_id"]]: 196 | train_enc_values = np.zeros(train_df.shape[0]) 197 | test_enc_values = 0 198 | for dev_index, val_index in kf.split(train_unique_campaigns): 199 | #for [dev_camp, val_camp] in camp_indices: 200 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 201 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 202 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_open')) 203 | test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_open')) 204 | test_enc_values /= 5. 205 | if isinstance(col, list): 206 | col = "_".join(col) 207 | train_df[col + "_open_enc"] = train_enc_values 208 | test_df[col + "_open_enc"] = test_enc_values 209 | cols_to_use.append(col + "_open_enc") 210 | 211 | 212 | 213 | 214 | """ 215 | print "Open Alone Target encoding.." 
216 | #for col in [["user_id"], ["user_id", "communication_type"], ["user_id", "no_of_sections"]]: 217 | for col in [["user_id"]]: 218 | train_enc_values = np.zeros(train_df.shape[0]) 219 | test_enc_values = 0 220 | for dev_index, val_index in kf.split(train_unique_campaigns): 221 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 222 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 223 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar2(val_X[col], dev_X, col, 'is_open')) 224 | test_enc_values += np.array( getDVEncodeVar2(test_df[col], dev_X, col, 'is_open')) 225 | test_enc_values /= 5. 226 | if isinstance(col, list): 227 | col = "_".join(col) 228 | train_df[col + "_open_sum_enc"] = train_enc_values 229 | test_df[col + "_open_sum_enc"] = test_enc_values 230 | cols_to_use.append(col + "_open_sum_enc") 231 | """ 232 | 233 | 234 | print cols_to_use 235 | train_X = train_df[cols_to_use] 236 | test_X = test_df[cols_to_use] 237 | print train_X.describe() 238 | print test_X.describe() 239 | 240 | #train_X.fillna(-1, inplace=True) 241 | #test_X.fillna(-1, inplace=True) 242 | 243 | print "Model building.." 244 | model_name = "LGB" 245 | cv_scores = [] 246 | pred_test_full = 0 247 | pred_val_full = np.zeros(train_df.shape[0]) 248 | for dev_index, val_index in kf.split(train_unique_campaigns): 249 | #for [dev_camp, val_camp] in camp_indices: 250 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 251 | dev_X, val_X = train_X[train_df['campaign_id'].isin(dev_camp)], train_X[train_df['campaign_id'].isin(val_camp)] 252 | dev_y, val_y = train_y[train_df['campaign_id'].isin(dev_camp)], train_y[train_df['campaign_id'].isin(val_camp)] 253 | print dev_X.shape, val_X.shape 254 | 255 | if model_name == "LGB": 256 | pred_val1, loss1, pred_test1 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4) 257 | pred_val2, loss2, pred_test2 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=2018) 258 | pred_val3, loss3, pred_test3 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=9876) 259 | pred_val = (pred_val1 + pred_val2 + pred_val3)/3. 260 | pred_test = (pred_test1 + pred_test2 + pred_test3)/3. 261 | loss = (loss1 + loss2 + loss3)/3. 262 | elif model_name == "ET": 263 | pred_val, loss, pred_test = runET(dev_X, dev_y, val_X, val_y, test_X, depth=20, leaf=20, feat=0.3) 264 | elif model_name == "LR": 265 | pred_val, loss, pred_test = runLR(dev_X, dev_y, val_X, val_y, test_X) 266 | 267 | pred_test_full += pred_test 268 | pred_val_full[train_df['campaign_id'].isin(val_camp)] = pred_val 269 | loss = metrics.roc_auc_score(train_y[train_df['campaign_id'].isin(val_camp)], pred_val) 270 | cv_scores.append(loss) 271 | print cv_scores 272 | pred_test_full /= 5. 
273 | print np.mean(cv_scores), metrics.roc_auc_score(train_y, pred_val_full) 274 | 275 | sub_df = pd.DataFrame({"id":test_id}) 276 | sub_df["is_click"] = pred_test_full 277 | sub_df.to_csv("srk_sub47.csv", index=False) 278 | 279 | 280 | 281 | -------------------------------------------------------------------------------- /2nd_Place_Mark_SRK/build_model_xgb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import metrics, model_selection, ensemble, preprocessing, linear_model 4 | import lightgbm as lgb 5 | import xgboost as xgb 6 | 7 | def getCountVar(compute_df, count_df, var_name, count_var="v1"): 8 | grouped_df = count_df.groupby(var_name)[count_var].agg('count').reset_index() 9 | grouped_df.columns = var_name + ["var_count"] 10 | 11 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 12 | merged_df.fillna(np.mean(grouped_df["var_count"].values), inplace=True) 13 | return list(merged_df["var_count"]) 14 | 15 | def getDVEncodeVar(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1): 16 | if type(var_name) != type([]): 17 | var_name = [var_name] 18 | grouped_df = target_df.groupby(var_name)[target_var].agg(["mean"]).reset_index() 19 | grouped_df.columns = var_name + ["mean_value"] 20 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 21 | merged_df.fillna(np.mean(target_df[target_var].values), inplace=True) 22 | return list(merged_df["mean_value"]) 23 | 24 | def getDVEncodeVar2(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1): 25 | if type(var_name) != type([]): 26 | var_name = [var_name] 27 | grouped_df = target_df.groupby(var_name)[target_var].agg(["sum"]).reset_index() 28 | grouped_df.columns = var_name + ["sum_value"] 29 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 30 | merged_df.fillna(np.mean(grouped_df["sum_value"].values), inplace=True) 31 | return list(merged_df["sum_value"]) 32 | 33 | 34 | def runLR(train_X, train_y, test_X, test_y=None, test_X2=None): 35 | model = linear_model.LogisticRegression(fit_intercept=True, C=0.3) 36 | model.fit(train_X, train_y) 37 | print model.coef_, model.intercept_ 38 | train_preds = model.predict_proba(train_X)[:,1] 39 | test_preds = model.predict_proba(test_X)[:,1] 40 | test_preds2 = model.predict_proba(test_X2)[:,1] 41 | test_loss = 0 42 | if test_y is not None: 43 | train_loss = metrics.roc_auc_score(train_y, train_preds) 44 | test_loss = metrics.roc_auc_score(test_y, test_preds) 45 | print "Train and Test loss : ", train_loss, test_loss 46 | return test_preds, test_loss, test_preds2 47 | 48 | def runET(train_X, train_y, test_X, test_y=None, test_X2=None, depth=10, leaf=5, feat=0.3): 49 | model = ensemble.ExtraTreesClassifier( 50 | n_estimators = 300, 51 | max_depth = depth, 52 | min_samples_split = 10, 53 | min_samples_leaf = leaf, 54 | max_features = feat, 55 | n_jobs = 6, 56 | random_state = 0) 57 | model.fit(train_X, train_y) 58 | train_preds = model.predict_proba(train_X)[:,1] 59 | test_preds = model.predict_proba(test_X)[:,1] 60 | test_preds2 = model.predict_proba(test_X2)[:,1] 61 | test_loss = 0 62 | if test_y is not None: 63 | train_loss = metrics.roc_auc_score(train_y, train_preds) 64 | test_loss = metrics.roc_auc_score(test_y, test_preds) 65 | print "Depth, leaf, feat : ", depth, leaf, feat 66 | print "Train and Test loss : ", train_loss, test_loss 67 | return test_preds, test_loss, test_preds2 68 | 69 | def runLGB(train_X, train_y, 
test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=3, eta=0.001): 70 | params = {} 71 | params["objective"] = "binary" 72 | params['metric'] = 'auc' 73 | params["max_depth"] = dep 74 | params["min_data_in_leaf"] = 100 75 | params["learning_rate"] = eta 76 | params["bagging_fraction"] = 0.7 77 | params["feature_fraction"] = 0.7 78 | params["bagging_freq"] = 5 79 | params["bagging_seed"] = seed_val 80 | params["verbosity"] = -1 81 | num_rounds = rounds 82 | 83 | plst = list(params.items()) 84 | lgtrain = lgb.Dataset(train_X, label=train_y) 85 | 86 | if test_y is not None: 87 | lgtest = lgb.Dataset(test_X, label=test_y) 88 | model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=100, verbose_eval=20) 89 | else: 90 | lgtest = lgb.Dataset(test_X) 91 | model = lgb.train(params, lgtrain, num_rounds) 92 | 93 | pred_test_y = model.predict(test_X, num_iteration=model.best_iteration) 94 | pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration) 95 | 96 | loss = 0 97 | if test_y is not None: 98 | loss = metrics.roc_auc_score(test_y, pred_test_y) 99 | print loss 100 | return pred_test_y, loss, pred_test_y2 101 | else: 102 | return pred_test_y, loss, pred_test_y2 103 | 104 | def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.001): 105 | params = {} 106 | params["objective"] = "binary:logistic" 107 | params['eval_metric'] = 'auc' 108 | params["eta"] = eta 109 | params["subsample"] = 0.7 110 | params["min_child_weight"] = 10 111 | params["colsample_bytree"] = 0.7 112 | params["max_depth"] = dep 113 | params["silent"] = 1 114 | params["seed"] = seed_val 115 | #params["max_delta_step"] = 2 116 | #params["gamma"] = 0.5 117 | num_rounds = rounds 118 | 119 | plst = list(params.items()) 120 | xgtrain = xgb.DMatrix(train_X, label=train_y) 121 | 122 | if test_y is not None: 123 | xgtest = xgb.DMatrix(test_X, label=test_y) 124 | watchlist = [ (xgtrain,'train'), (xgtest, 'test') ] 125 | model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100, verbose_eval=20) 126 | else: 127 | xgtest = xgb.DMatrix(test_X) 128 | model = xgb.train(plst, xgtrain, num_rounds) 129 | 130 | pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit) 131 | pred_test_y2 = model.predict(xgb.DMatrix(test_X2), ntree_limit=model.best_ntree_limit) 132 | 133 | loss = 0 134 | if test_y is not None: 135 | loss = metrics.log_loss(test_y, pred_test_y) 136 | print loss 137 | return pred_test_y, loss, pred_test_y2 138 | else: 139 | return pred_test_y, loss, pred_test_y2 140 | 141 | 142 | if __name__ == "__main__": 143 | print "Reading input files..." 144 | train_df = pd.read_csv("../input/train_feat.csv") 145 | test_df = pd.read_csv("../input/test_feat.csv") 146 | campaign_df = pd.read_csv("../input/campaign_data.csv") 147 | train_df["is_open_alone"] = train_df["is_click"].astype('float') / np.maximum(train_df["is_open"],1) 148 | print train_df.shape, test_df.shape 149 | print train_df.head() 150 | 151 | 152 | print np.sort(train_df["campaign_id"].unique()) 153 | #camp_indices = [[range(29, 47), range(47,56)], [range(47,56), range(29, 47)]] 154 | 155 | print "Merging with campaign data.."
156 | train_df = pd.merge(train_df, campaign_df, on="campaign_id") 157 | test_df = pd.merge(test_df, campaign_df, on="campaign_id") 158 | print train_df.shape, test_df.shape 159 | kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=98765) 160 | 161 | train_y_open = train_df["is_open"].values 162 | train_y = train_df["is_click"].values 163 | test_id = test_df["id"].values 164 | train_unique_campaigns = np.array(train_df["campaign_id"].unique()) 165 | cols_to_use = ["user_cum_count", "user_count", "user_date_diff", "user_camp_diff", "hour"] #, "total_links","no_of_internal_links","no_of_images","no_of_sections"] 166 | #cols_to_use = ["user_cum_count", "user_count", "user_camp_diff"] 167 | #cols_to_use = [] 168 | #cols_to_use = cols_to_use + ["first_open", "first_click", "second_open", "second_click", "third_open", "third_click"] 169 | cols_to_use = cols_to_use + ["user_min_date", "user_mean_date", "user_max_date", "user_std_date"] 170 | cols_to_use = cols_to_use + ["camp_"+str(i) for i in range(29,81)] + ["camps_sent"] 171 | #cols_to_use = cols_to_use + ["user_std_date_click", "user_std_date_open"] 172 | 173 | #print "Label encoding.." 174 | #for c in ["communication_type"]: 175 | # cols_to_use.append(c) 176 | # lbl = preprocessing.LabelEncoder() 177 | # lbl.fit(list(train_df[c].values.astype('str')) + list(test_df[c].values.astype('str'))) 178 | # train_df[c] = lbl.transform(list(train_df[c].values.astype('str'))) 179 | # test_df[c] = lbl.transform(list(test_df[c].values.astype('str'))) 180 | 181 | 182 | #print "Full Count encoding.." 183 | #full_df = train_df.append(test_df) 184 | #print full_df.shape 185 | #for col in [["user_id"]]: 186 | # if isinstance(col, list): 187 | # col_name = "_".join(col) 188 | # train_df[col_name + "_full_count"] = np.array( getCountVar(train_df, full_df, col, 'id')) 189 | # test_df[col_name + "_full_count"] = np.array( getCountVar(test_df, full_df, col, 'id')) 190 | # cols_to_use.append(col_name + "_full_count") 191 | 192 | 193 | print "Count encoding.." 194 | for col in [["user_id"], ["user_id", "communication_type"]]: 195 | #for col in [["user_id"]]: 196 | train_enc_values = np.zeros(train_df.shape[0]) 197 | test_enc_values = 0 198 | for dev_index, val_index in kf.split(train_unique_campaigns): 199 | #for [dev_camp, val_camp] in camp_indices: 200 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 201 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 202 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getCountVar(val_X[col], dev_X, col, 'is_click')) 203 | test_enc_values += np.array( getCountVar(test_df[col], dev_X, col, 'is_click')) 204 | test_enc_values /= 5. 205 | if isinstance(col, list): 206 | col = "_".join(col) 207 | train_df[col + "_count"] = train_enc_values 208 | test_df[col + "_count"] = test_enc_values 209 | cols_to_use.append(col + "_count") 210 | 211 | 212 | 213 | print "Target encoding.." 
214 | for col in [["user_id"], ["user_id", "communication_type"]]: 215 | #for col in [["user_id"]]: 216 | train_enc_values = np.zeros(train_df.shape[0]) 217 | test_enc_values = 0 218 | for dev_index, val_index in kf.split(train_unique_campaigns): 219 | #for [dev_camp, val_camp] in camp_indices: 220 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 221 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 222 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_click')) 223 | test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_click')) 224 | test_enc_values /= 5. 225 | if isinstance(col, list): 226 | col = "_".join(col) 227 | train_df[col + "_enc"] = train_enc_values 228 | test_df[col + "_enc"] = test_enc_values 229 | cols_to_use.append(col + "_enc") 230 | 231 | 232 | print "Open Target encoding.." 233 | for col in [["user_id"], ["user_id", "communication_type"]]: 234 | #for col in [["user_id"]]: 235 | train_enc_values = np.zeros(train_df.shape[0]) 236 | test_enc_values = 0 237 | for dev_index, val_index in kf.split(train_unique_campaigns): 238 | #for [dev_camp, val_camp] in camp_indices: 239 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 240 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 241 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_open')) 242 | test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_open')) 243 | test_enc_values /= 5. 244 | if isinstance(col, list): 245 | col = "_".join(col) 246 | train_df[col + "_open_enc"] = train_enc_values 247 | test_df[col + "_open_enc"] = test_enc_values 248 | cols_to_use.append(col + "_open_enc") 249 | 250 | 251 | 252 | 253 | """ 254 | print "Open Alone Target encoding.." 255 | #for col in [["user_id"], ["user_id", "communication_type"], ["user_id", "no_of_sections"]]: 256 | for col in [["user_id"]]: 257 | train_enc_values = np.zeros(train_df.shape[0]) 258 | test_enc_values = 0 259 | for dev_index, val_index in kf.split(train_unique_campaigns): 260 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 261 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 262 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar2(val_X[col], dev_X, col, 'is_open')) 263 | test_enc_values += np.array( getDVEncodeVar2(test_df[col], dev_X, col, 'is_open')) 264 | test_enc_values /= 5. 265 | if isinstance(col, list): 266 | col = "_".join(col) 267 | train_df[col + "_open_sum_enc"] = train_enc_values 268 | test_df[col + "_open_sum_enc"] = test_enc_values 269 | cols_to_use.append(col + "_open_sum_enc") 270 | """ 271 | 272 | 273 | print cols_to_use 274 | train_X = train_df[cols_to_use] 275 | test_X = test_df[cols_to_use] 276 | print train_X.describe() 277 | print test_X.describe() 278 | 279 | #train_X.fillna(-1, inplace=True) 280 | #test_X.fillna(-1, inplace=True) 281 | 282 | print "Model building.." 
283 | model_name = "XGB" 284 | cv_scores = [] 285 | pred_test_full = 0 286 | pred_val_full = np.zeros(train_df.shape[0]) 287 | for dev_index, val_index in kf.split(train_unique_campaigns): 288 | #for [dev_camp, val_camp] in camp_indices: 289 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 290 | dev_X, val_X = train_X[train_df['campaign_id'].isin(dev_camp)], train_X[train_df['campaign_id'].isin(val_camp)] 291 | dev_y, val_y = train_y[train_df['campaign_id'].isin(dev_camp)], train_y[train_df['campaign_id'].isin(val_camp)] 292 | print dev_X.shape, val_X.shape 293 | 294 | if model_name == "LGB": 295 | pred_val1, loss1, pred_test1 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4) 296 | pred_val2, loss2, pred_test2 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=2018) 297 | pred_val3, loss3, pred_test3 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=9876) 298 | pred_val = (pred_val1 + pred_val2 + pred_val3)/3. 299 | pred_test = (pred_test1 + pred_test2 + pred_test3)/3. 300 | loss = (loss1 + loss2 + loss3)/3. 301 | elif model_name == "XGB": 302 | pred_val1, loss1, pred_test1 = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4) 303 | pred_val2, loss2, pred_test2 = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=2018) 304 | pred_val3, loss3, pred_test3 = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=9876) 305 | pred_val = (pred_val1 + pred_val2 + pred_val3)/3. 306 | pred_test = (pred_test1 + pred_test2 + pred_test3)/3. 307 | loss = (loss1 + loss2 + loss3)/3. 308 | elif model_name == "ET": 309 | pred_val, loss, pred_test = runET(dev_X, dev_y, val_X, val_y, test_X, depth=20, leaf=20, feat=0.3) 310 | elif model_name == "LR": 311 | pred_val, loss, pred_test = runLR(dev_X, dev_y, val_X, val_y, test_X) 312 | 313 | pred_test_full += pred_test 314 | pred_val_full[train_df['campaign_id'].isin(val_camp)] = pred_val 315 | loss = metrics.roc_auc_score(train_y[train_df['campaign_id'].isin(val_camp)], pred_val) 316 | cv_scores.append(loss) 317 | print cv_scores 318 | pred_test_full /= 5. 319 | print np.mean(cv_scores), metrics.roc_auc_score(train_y, pred_val_full) 320 | 321 | sub_df = pd.DataFrame({"id":test_id}) 322 | sub_df["is_click"] = pred_test_full 323 | sub_df.to_csv("srk_sub48.csv", index=False) 324 | 325 | 326 | 327 | -------------------------------------------------------------------------------- /2nd_Place_Mark_SRK/ensemble.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | s1 = pd.read_csv("../Submissions/srk_sub47.csv") 5 | s2 = pd.read_csv("../Submissions/srk_sub48.csv") 6 | #s3 = pd.read_csv("../Submissions/srk_sub23.csv") 7 | #s4 = pd.read_csv("../Submissions/srk_sub24.csv") 8 | 9 | #s1["is_click"] = 0.35*(0.5*s1["is_click"] + 0.5*s2["is_click"]) + 0.65*(0.65*(s3["is_click"])+0.35*(s4["is_click"])) 10 | s1["is_click"] = 0.5*s1["is_click"] + 0.5*s2["is_click"] 11 | s1.to_csv("srk_sub49.csv", index=False) 12 | -------------------------------------------------------------------------------- /2nd_Place_Mark_SRK/readme.md: -------------------------------------------------------------------------------- 1 | ## Approach 2 | Most of our time was spent on creating new features. We used a validation split based on campaign ids. Our best single model is a LightGBM that scored 0.7051 on the LB.
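Before the feature list, here is a minimal sketch of the campaign-wise out-of-fold target encoding that build_model.py implements. This is a simplified reconstruction, not the exact code; it assumes the `train_feat.csv` intermediate created by the feature script, with the user_id / campaign_id / is_click columns used in this repo.

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

train = pd.read_csv('train_feat.csv')
camps = train['campaign_id'].unique()
enc = np.zeros(len(train))

# Fold by campaign, not by row, so a user's encoding for one campaign is
# computed only from other campaigns, mirroring the validation split.
kf = KFold(n_splits=5, shuffle=True, random_state=2017)
for dev_idx, val_idx in kf.split(camps):
    dev = train[train['campaign_id'].isin(camps[dev_idx])]
    val_mask = train['campaign_id'].isin(camps[val_idx]).to_numpy()
    user_mean = dev.groupby('user_id')['is_click'].mean()
    enc[val_mask] = (train.loc[val_mask, 'user_id']
                     .map(user_mean)
                     .fillna(dev['is_click'].mean())
                     .to_numpy())

train['user_id_enc'] = enc  # unseen users fall back to the fold's global mean
```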
The important features we used are: 3 | 4 | 1. Target encoding on the user ID, and on the user ID - communication type pair 5 | 2. Min, max, mean and standard deviation of the mail sent time. 6 | 3. One-hot encoding of the campaigns. 7 | 4. Time between the current mail and the previous mail 8 | 5. Number of campaigns in between the current mail and the previous mail 9 | 6. Total number of mail campaigns per user ID 10 | 7. Cumulative count of the mails at the user level 11 | 8. Hour of the mail 12 | 13 | ## How to run the code? 14 | Order of files to run: 15 | 1. Explorations.ipynb - Code file to create the features. 16 | 2. build_model.py - Code file to build the LightGBM model. 17 | 3. build_model_xgb.py - Code file to build the XGB model. 18 | 4. ensemble.py - Code file to blend the two sets of predictions. 19 | -------------------------------------------------------------------------------- /3rd_Place_Aditya_Akash/3rd_Place_Solution_Approach.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/analyticsvidhya/LordOftheMachines/5f8450eca5b941418f74a3d9934b1145f5d34d06/3rd_Place_Aditya_Akash/3rd_Place_Solution_Approach.docx -------------------------------------------------------------------------------- /3rd_Place_Aditya_Akash/final_ensemble-simple_avg.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "cnnlstm = pd.read_csv('./submission_cnnlstm.csv')\n", 20 | "cnn = pd.read_csv('./submission_cnn.csv')\n", 21 | "adamax = pd.read_csv('./submission_lstm.csv')\n", 22 | "\n", 23 | "lgbm_nt_45_5f = pd.read_csv('./lgb_5fold-5_bag_nt45_rank_average.csv')\n", 24 | "lgbm_nt_45_4f = pd.read_csv('./lgb_5fold-5_bag_nt45_rank_average_4f.csv')\n", 25 | "\n", 26 | "lgbm_nt_55_5f = pd.read_csv('./lgb_5fold-5_bag_nt55_rank_average_5f.csv')\n", 27 | "lgbm_nt_55_4f = pd.read_csv('./lgb_5fold-5_bag_nt45_rank_average_4f.csv')\n", 28 | "lgbm_nt_55 = pd.read_csv('./lgb_5fold-5_bag_nt55_rank_average.csv')\n", 29 | "\n", 30 | "xgb_2f = pd.read_csv('./xgb_2fold-cv2_bag3_nt70_scalepos1_nt70.csv')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "subm = xgb_2f.copy()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "#First ensemble: simple rank average of all\n", 49 | "\n", 50 | "test_shape0 = xgb_2f.shape[0]\n", 51 | "subm.loc[:,'is_click'] = (cnn.is_click.rank()/test_shape0 + cnnlstm.is_click.rank()/test_shape0 + adamax.is_click.rank()/test_shape0 +\\\n", 52 | "lgbm_nt_45_5f.is_click.rank()/test_shape0 + lgbm_nt_45_4f.is_click.rank()/test_shape0 + \\\n", 53 | "lgbm_nt_55_5f.is_click.rank()/test_shape0 + \\\n", 54 | "lgbm_nt_55_4f.is_click.rank()/test_shape0 + lgbm_nt_55.is_click.rank()/test_shape0 + \\\n", 55 | "xgb_2f.is_click.rank()/test_shape0)/9\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 5, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "subm.to_csv('./fin_ens_rank_average_all.csv',index=False)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 |
"outputs": [], 72 | "source": [] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.6.4" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 2 96 | } 97 | -------------------------------------------------------------------------------- /3rd_Place_Aditya_Akash/lgb_new_features-v6-5fold_5bag_cv_retry_lb_692_ens6941-submitted.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import lightgbm as lgb\n", 12 | "from sklearn.feature_extraction.text import CountVectorizer\n", 13 | "from sklearn.decomposition import TruncatedSVD\n", 14 | "import gc\n", 15 | "from sklearn.preprocessing import LabelEncoder\n", 16 | "from sklearn.model_selection import KFold" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 8, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "1961" 28 | ] 29 | }, 30 | "execution_count": 8, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "campaign = pd.read_csv('input/campaign_data.csv')\n", 37 | "'''\n", 38 | "vectorizer = CountVectorizer(ngram_range=(1,3))\n", 39 | "n_grams = vectorizer.fit_transform(campaign.subject)\n", 40 | "tsvd = TruncatedSVD(2,n_iter=250)\n", 41 | "tsvd_subject_feats = tsvd.fit_transform(n_grams)\n", 42 | "campaign['email_body'] = campaign.email_body.apply(lambda x: x.replace(\"\\r\\n\",\"\"))\n", 43 | "vectorizer = CountVectorizer(ngram_range=(1,4))\n", 44 | "n_grams = vectorizer.fit_transform(campaign.email_body)\n", 45 | "tsvd = TruncatedSVD(4,n_iter=250)\n", 46 | "tsvd_email_body_feats = tsvd.fit_transform(n_grams)\n", 47 | "for i in range(tsvd_subject_feats.shape[1]):\n", 48 | " campaign.loc[:,'sub_'+str(i)] = tsvd_subject_feats[:,i]\n", 49 | "for i in range(tsvd_email_body_feats.shape[1]):\n", 50 | " campaign.loc[:,'eb_'+str(i)] = tsvd_email_body_feats[:,i]\n", 51 | "'''\n", 52 | "campaign1 = campaign.drop(['subject','email_url','email_body'],axis=1)\n", 53 | "gc.collect()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 9, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "np.random.seed(123)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 10, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "def add_noise(series, noise_level):\n", 72 | " return series * (1 + noise_level * np.random.randn(len(series)))\n", 73 | "def target_encode(trn_series=None,val_series=None,\n", 74 | " tst_series=None,\n", 75 | " target=None,\n", 76 | " min_samples_leaf=1,\n", 77 | " smoothing=1,\n", 78 | " noise_level=0):\n", 79 | " \"\"\"\n", 80 | " Smoothing is computed like in the following paper by Daniele Micci-Barreca\n", 81 | " https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf\n", 82 | " trn_series : training categorical feature as a pd.Series\n", 83 | " tst_series : test categorical feature as a pd.Series\n", 84 | " target : 
target data as a pd.Series\n", 85 | " min_samples_leaf (int) : minimum samples to take category average into account\n", 86 | " smoothing (int) : smoothing effect to balance categorical average vs prior\n", 87 | " \"\"\"\n", 88 | " assert len(trn_series) == len(target)\n", 89 | " #assert trn_series.name == tst_series.name\n", 90 | " temp = pd.concat([trn_series, target], axis=1)\n", 91 | " # Compute target mean\n", 92 | " averages = temp.groupby(by=trn_series.name)[target.name].agg([\"mean\", \"count\"])\n", 93 | " # Compute smoothing\n", 94 | " smoothing = 1 / (1 + np.exp(-(averages[\"count\"] - min_samples_leaf) / smoothing))\n", 95 | " # Apply average function to all target data\n", 96 | " prior = target.mean()\n", 97 | " # The bigger the count the less full_avg is taken into account\n", 98 | " averages[target.name] = prior * (1 - smoothing) + averages[\"mean\"] * smoothing\n", 99 | " averages.drop([\"mean\", \"count\"], axis=1, inplace=True)\n", 100 | " # Apply averages to trn and tst series\n", 101 | " ft_trn_series = pd.merge(\n", 102 | " trn_series.to_frame(trn_series.name),\n", 103 | " averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),\n", 104 | " on=trn_series.name,\n", 105 | " how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)\n", 106 | " # pd.merge does not keep the index so restore it\n", 107 | " ft_trn_series.index = trn_series.index\n", 108 | " ft_val_series = pd.merge(\n", 109 | " val_series.to_frame(val_series.name),\n", 110 | " averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),\n", 111 | " on=val_series.name,\n", 112 | " how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)\n", 113 | " ft_val_series.index = val_series.index\n", 114 | " \n", 115 | " ft_tst_series = pd.merge(\n", 116 | " tst_series.to_frame(tst_series.name),\n", 117 | " averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),\n", 118 | " on=tst_series.name,\n", 119 | " how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)\n", 120 | " # pd.merge does not keep the index so restore it\n", 121 | " ft_tst_series.index = tst_series.index\n", 122 | " return add_noise(ft_trn_series, noise_level), ft_val_series,ft_tst_series" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 11, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "0" 134 | ] 135 | }, 136 | "execution_count": 11, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "train = pd.read_csv('input/train.csv')\n", 143 | "test = pd.read_csv('input/test.csv')\n", 144 | "gc.collect()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 12, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "all_data = pd.concat([train,test])" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 13, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "user_clust = pd.read_csv('./input/user_cluster1.csv')\n", 163 | "all_data = all_data.merge(user_clust,on='user_id',how='left')" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 14, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "all_data['send_date'] = all_data.send_date.apply(lambda x: pd.datetime.strptime(x,'%d-%m-%Y %H:%M'))" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 15, 178 | "metadata": {}, 179 | 
"outputs": [], 180 | "source": [ 181 | "all_data['send_dayofweek'] = all_data.send_date.dt.dayofweek" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 16, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "Index(['campaign_id', 'id', 'is_click', 'is_open', 'send_date', 'user_id',\n", 193 | " 'clust_id', 'send_dayofweek'],\n", 194 | " dtype='object')" 195 | ] 196 | }, 197 | "execution_count": 16, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "all_data.columns" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 17, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "#count features\n", 213 | "all_data['cnt_sd'] = all_data.groupby('send_date')['user_id'].transform('count')" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 18, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "all_data = all_data.merge(campaign1,on='campaign_id',how='left')" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 19, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "Index(['campaign_id', 'id', 'is_click', 'is_open', 'send_date', 'user_id',\n", 234 | " 'clust_id', 'send_dayofweek', 'cnt_sd', 'communication_type',\n", 235 | " 'total_links', 'no_of_internal_links', 'no_of_images',\n", 236 | " 'no_of_sections'],\n", 237 | " dtype='object')" 238 | ] 239 | }, 240 | "execution_count": 19, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | "source": [ 246 | "all_data.columns" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 20, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "le1 = LabelEncoder()\n", 256 | "all_data.loc[:,'communication_type'] = le1.fit_transform(all_data.communication_type) \n", 257 | "all_data['usr_cnt'] = all_data.groupby('user_id')['user_id'].transform('count')\n", 258 | "all_data['cm_cnt'] = np.log(all_data.groupby('communication_type')['communication_type'].transform('count'))\n", 259 | "#all_data['camp_cnt'] = all_data.groupby('campaign_id')['campaign_id'].transform('count')" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 21, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "test = all_data[len(train):]\n", 269 | "train = all_data[:len(train)]" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 22, 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/plain": [ 280 | "35" 281 | ] 282 | }, 283 | "execution_count": 22, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "#del all_data\n", 290 | "gc.collect()" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 23, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "lgb_params = {}\n", 300 | "lgb_params['learning_rate'] = 0.01\n", 301 | "lgb_params['num_leaves'] = 31\n", 302 | "lgb_params['max_depth'] = 5\n", 303 | "lgb_params['max_bin'] = 10\n", 304 | "lgb_params['min_data_in_leaf'] = 50\n", 305 | "lgb_params['subsample'] = 0.6\n", 306 | "lgb_params['colsample_bytree'] = 0.7\n", 307 | "lgb_params['feature_fraction'] = 0.7,\n", 308 | "lgb_params['bagging_fraction'] = 0.77,\n", 309 | "lgb_params['objective'] = 'binary'\n", 310 | "lgb_params['metric'] = {'auc'}\n", 311 | "lgb_params['verbose'] = 1\n", 312 | 
"lgb_params['scale_pos_weight'] = 1.\n", 313 | "lgb_params['boosting_type'] = 'gbdt'\n", 314 | "lgb_params['min_split_gain'] = 0.0001\n", 315 | "#lgb_params['bagging_fraction'] = 0.7\n", 316 | "lgb_params['bagging_freq'] = 100000\n" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 26, 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "name": "stderr", 326 | "output_type": "stream", 327 | "text": [ 328 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:6: SettingWithCopyWarning: \n", 329 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 330 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 331 | "\n", 332 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 333 | " \n", 334 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:8: SettingWithCopyWarning: \n", 335 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 336 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 337 | "\n", 338 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 339 | " \n" 340 | ] 341 | }, 342 | { 343 | "name": "stdout", 344 | "output_type": "stream", 345 | "text": [ 346 | "Fold: 1\n", 347 | "val_cid [29 30 31 32 33 34]\n", 348 | "(331628, 16) (691563, 16)\n" 349 | ] 350 | }, 351 | { 352 | "name": "stderr", 353 | "output_type": "stream", 354 | "text": [ 355 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:357: SettingWithCopyWarning: \n", 356 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 357 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 358 | "\n", 359 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 360 | " self.obj[key] = _infer_fill_value(value)\n", 361 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:537: SettingWithCopyWarning: \n", 362 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 363 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 364 | "\n", 365 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 366 | " self.obj[item] = s\n", 367 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:61: SettingWithCopyWarning: \n", 368 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 369 | "\n", 370 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 371 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:63: SettingWithCopyWarning: \n", 372 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 373 | "\n", 374 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 375 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:69: SettingWithCopyWarning: \n", 376 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 377 | "\n", 378 | "See the caveats in the documentation: 
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 379 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:70: SettingWithCopyWarning: \n", 380 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 381 | "\n", 382 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 383 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:82: SettingWithCopyWarning: \n", 384 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 385 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 386 | "\n", 387 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 388 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\lightgbm\\basic.py:1036: UserWarning: Using categorical_feature in Dataset.\n", 389 | " warnings.warn('Using categorical_feature in Dataset.')\n", 390 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\lightgbm\\basic.py:681: UserWarning: categorical_feature in param dict is overrided.\n", 391 | " warnings.warn('categorical_feature in param dict is overrided.')\n" 392 | ] 393 | }, 394 | { 395 | "name": "stdout", 396 | "output_type": "stream", 397 | "text": [ 398 | "Training until validation scores don't improve for 1000 rounds.\n", 399 | "[10]\ttrain's auc: 0.959499\tvalid's auc: 0.64165\n", 400 | "[20]\ttrain's auc: 0.960904\tvalid's auc: 0.682885\n", 401 | "[30]\ttrain's auc: 0.961472\tvalid's auc: 0.685754\n", 402 | "[40]\ttrain's auc: 0.961442\tvalid's auc: 0.685165\n", 403 | "[50]\ttrain's auc: 0.962194\tvalid's auc: 0.683764\n", 404 | "Did not meet early stopping. Best iteration is:\n", 405 | "[55]\ttrain's auc: 0.96278\tvalid's auc: 0.686568\n" 406 | ] 407 | }, 408 | { 409 | "name": "stderr", 410 | "output_type": "stream", 411 | "text": [ 412 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:95: SettingWithCopyWarning: \n", 413 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 414 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 415 | "\n", 416 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 417 | ] 418 | }, 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "Training until validation scores don't improve for 1000 rounds.\n", 424 | "[10]\ttrain's auc: 0.959771\tvalid's auc: 0.681249\n", 425 | "[20]\ttrain's auc: 0.958168\tvalid's auc: 0.688639\n", 426 | "[30]\ttrain's auc: 0.958047\tvalid's auc: 0.688835\n", 427 | "[40]\ttrain's auc: 0.960492\tvalid's auc: 0.688832\n", 428 | "[50]\ttrain's auc: 0.961439\tvalid's auc: 0.688582\n", 429 | "Did not meet early stopping. 
Best iteration is:\n", 430 | "[46]\ttrain's auc: 0.961925\tvalid's auc: 0.688432\n" 431 | ] 432 | }, 433 | { 434 | "name": "stderr", 435 | "output_type": "stream", 436 | "text": [ 437 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:194: SettingWithCopyWarning: \n", 438 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 439 | "\n", 440 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 441 | " self._setitem_with_indexer(indexer, value)\n", 442 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:94: SettingWithCopyWarning: \n", 443 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 444 | "\n", 445 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 446 | ] 447 | }, 448 | { 449 | "name": "stdout", 450 | "output_type": "stream", 451 | "text": [ 452 | "Training until validation scores don't improve for 1000 rounds.\n", 453 | "[10]\ttrain's auc: 0.962265\tvalid's auc: 0.679662\n", 454 | "[20]\ttrain's auc: 0.963529\tvalid's auc: 0.682266\n", 455 | "[30]\ttrain's auc: 0.963445\tvalid's auc: 0.685344\n", 456 | "[40]\ttrain's auc: 0.963438\tvalid's auc: 0.690044\n", 457 | "[50]\ttrain's auc: 0.964154\tvalid's auc: 0.689997\n", 458 | "Did not meet early stopping. Best iteration is:\n", 459 | "[55]\ttrain's auc: 0.964456\tvalid's auc: 0.687732\n", 460 | "Training until validation scores don't improve for 1000 rounds.\n", 461 | "[10]\ttrain's auc: 0.962724\tvalid's auc: 0.684908\n", 462 | "[20]\ttrain's auc: 0.961615\tvalid's auc: 0.685348\n", 463 | "[30]\ttrain's auc: 0.962596\tvalid's auc: 0.68143\n", 464 | "[40]\ttrain's auc: 0.962821\tvalid's auc: 0.682635\n", 465 | "[50]\ttrain's auc: 0.963031\tvalid's auc: 0.683734\n", 466 | "Did not meet early stopping. Best iteration is:\n", 467 | "[51]\ttrain's auc: 0.963166\tvalid's auc: 0.683732\n", 468 | "Training until validation scores don't improve for 1000 rounds.\n", 469 | "[10]\ttrain's auc: 0.956924\tvalid's auc: 0.684525\n", 470 | "[20]\ttrain's auc: 0.958931\tvalid's auc: 0.687749\n", 471 | "[30]\ttrain's auc: 0.961068\tvalid's auc: 0.687752\n", 472 | "[40]\ttrain's auc: 0.96252\tvalid's auc: 0.687513\n", 473 | "[50]\ttrain's auc: 0.962575\tvalid's auc: 0.687599\n", 474 | "Did not meet early stopping. Best iteration is:\n", 475 | "[37]\ttrain's auc: 0.962846\tvalid's auc: 0.687325\n", 476 | "Fold: 2\n", 477 | "val_cid [35 36 37 38 39]\n", 478 | "(95814, 16) (927377, 16)\n", 479 | "Training until validation scores don't improve for 1000 rounds.\n", 480 | "[10]\ttrain's auc: 0.957137\tvalid's auc: 0.744188\n", 481 | "[20]\ttrain's auc: 0.956535\tvalid's auc: 0.754693\n", 482 | "[30]\ttrain's auc: 0.957489\tvalid's auc: 0.758433\n", 483 | "[40]\ttrain's auc: 0.95723\tvalid's auc: 0.761839\n", 484 | "[50]\ttrain's auc: 0.957015\tvalid's auc: 0.761785\n", 485 | "Did not meet early stopping. 
Best iteration is:\n", 486 | "[34]\ttrain's auc: 0.957649\tvalid's auc: 0.757474\n", 487 | "Training until validation scores don't improve for 1000 rounds.\n", 488 | "[10]\ttrain's auc: 0.952665\tvalid's auc: 0.759722\n", 489 | "[20]\ttrain's auc: 0.955711\tvalid's auc: 0.755574\n", 490 | "[30]\ttrain's auc: 0.956282\tvalid's auc: 0.75638\n", 491 | "[40]\ttrain's auc: 0.956652\tvalid's auc: 0.76035\n", 492 | "[50]\ttrain's auc: 0.956908\tvalid's auc: 0.762324\n", 493 | "Did not meet early stopping. Best iteration is:\n", 494 | "[54]\ttrain's auc: 0.957156\tvalid's auc: 0.761822\n", 495 | "Training until validation scores don't improve for 1000 rounds.\n", 496 | "[10]\ttrain's auc: 0.949622\tvalid's auc: 0.745809\n", 497 | "[20]\ttrain's auc: 0.954399\tvalid's auc: 0.755183\n", 498 | "[30]\ttrain's auc: 0.95521\tvalid's auc: 0.763086\n", 499 | "[40]\ttrain's auc: 0.955115\tvalid's auc: 0.763426\n", 500 | "[50]\ttrain's auc: 0.956063\tvalid's auc: 0.762984\n", 501 | "Did not meet early stopping. Best iteration is:\n", 502 | "[54]\ttrain's auc: 0.956642\tvalid's auc: 0.762852\n", 503 | "Training until validation scores don't improve for 1000 rounds.\n", 504 | "[10]\ttrain's auc: 0.956036\tvalid's auc: 0.740168\n", 505 | "[20]\ttrain's auc: 0.956537\tvalid's auc: 0.759695\n", 506 | "[30]\ttrain's auc: 0.957196\tvalid's auc: 0.761407\n", 507 | "[40]\ttrain's auc: 0.957056\tvalid's auc: 0.75995\n", 508 | "[50]\ttrain's auc: 0.957319\tvalid's auc: 0.7601\n", 509 | "Did not meet early stopping. Best iteration is:\n", 510 | "[54]\ttrain's auc: 0.957449\tvalid's auc: 0.76011\n", 511 | "Training until validation scores don't improve for 1000 rounds.\n", 512 | "[10]\ttrain's auc: 0.957147\tvalid's auc: 0.740874\n", 513 | "[20]\ttrain's auc: 0.957653\tvalid's auc: 0.741151\n", 514 | "[30]\ttrain's auc: 0.957485\tvalid's auc: 0.74016\n", 515 | "[40]\ttrain's auc: 0.957554\tvalid's auc: 0.753127\n", 516 | "[50]\ttrain's auc: 0.957298\tvalid's auc: 0.752856\n", 517 | "Did not meet early stopping. Best iteration is:\n", 518 | "[18]\ttrain's auc: 0.957767\tvalid's auc: 0.741149\n", 519 | "Fold: 3\n", 520 | "val_cid [40 41 42 43 44]\n", 521 | "(128426, 16) (894765, 16)\n", 522 | "Training until validation scores don't improve for 1000 rounds.\n", 523 | "[10]\ttrain's auc: 0.959043\tvalid's auc: 0.724628\n", 524 | "[20]\ttrain's auc: 0.958536\tvalid's auc: 0.725294\n", 525 | "[30]\ttrain's auc: 0.958994\tvalid's auc: 0.724168\n", 526 | "[40]\ttrain's auc: 0.958844\tvalid's auc: 0.723607\n", 527 | "[50]\ttrain's auc: 0.959404\tvalid's auc: 0.724542\n", 528 | "Did not meet early stopping. Best iteration is:\n", 529 | "[13]\ttrain's auc: 0.95958\tvalid's auc: 0.725282\n", 530 | "Training until validation scores don't improve for 1000 rounds.\n", 531 | "[10]\ttrain's auc: 0.957196\tvalid's auc: 0.723267\n", 532 | "[20]\ttrain's auc: 0.957568\tvalid's auc: 0.726999\n", 533 | "[30]\ttrain's auc: 0.957381\tvalid's auc: 0.727038\n", 534 | "[40]\ttrain's auc: 0.958127\tvalid's auc: 0.728767\n", 535 | "[50]\ttrain's auc: 0.958506\tvalid's auc: 0.728943\n", 536 | "Did not meet early stopping. 
Best iteration is:\n", 537 | "[51]\ttrain's auc: 0.958598\tvalid's auc: 0.728967\n", 538 | "Training until validation scores don't improve for 1000 rounds.\n", 539 | "[10]\ttrain's auc: 0.957409\tvalid's auc: 0.724771\n", 540 | "[20]\ttrain's auc: 0.957839\tvalid's auc: 0.726644\n", 541 | "[30]\ttrain's auc: 0.957793\tvalid's auc: 0.726089\n", 542 | "[40]\ttrain's auc: 0.958122\tvalid's auc: 0.724817\n", 543 | "[50]\ttrain's auc: 0.958601\tvalid's auc: 0.724415\n", 544 | "Did not meet early stopping. Best iteration is:\n", 545 | "[55]\ttrain's auc: 0.959014\tvalid's auc: 0.724543\n", 546 | "Training until validation scores don't improve for 1000 rounds.\n", 547 | "[10]\ttrain's auc: 0.955877\tvalid's auc: 0.722508\n", 548 | "[20]\ttrain's auc: 0.957748\tvalid's auc: 0.72571\n", 549 | "[30]\ttrain's auc: 0.958169\tvalid's auc: 0.725367\n", 550 | "[40]\ttrain's auc: 0.958399\tvalid's auc: 0.725168\n", 551 | "[50]\ttrain's auc: 0.95847\tvalid's auc: 0.725506\n", 552 | "Did not meet early stopping. Best iteration is:\n", 553 | "[55]\ttrain's auc: 0.958574\tvalid's auc: 0.726132\n", 554 | "Training until validation scores don't improve for 1000 rounds.\n", 555 | "[10]\ttrain's auc: 0.957648\tvalid's auc: 0.719033\n", 556 | "[20]\ttrain's auc: 0.958564\tvalid's auc: 0.720191\n", 557 | "[30]\ttrain's auc: 0.958778\tvalid's auc: 0.722423\n", 558 | "[40]\ttrain's auc: 0.958336\tvalid's auc: 0.72308\n", 559 | "[50]\ttrain's auc: 0.958835\tvalid's auc: 0.723032\n", 560 | "Did not meet early stopping. Best iteration is:\n", 561 | "[52]\ttrain's auc: 0.958914\tvalid's auc: 0.723603\n", 562 | "Fold: 4\n", 563 | "val_cid [45 46 47 48 49]\n", 564 | "(162197, 16) (860994, 16)\n", 565 | "Training until validation scores don't improve for 1000 rounds.\n", 566 | "[10]\ttrain's auc: 0.956501\tvalid's auc: 0.699103\n", 567 | "[20]\ttrain's auc: 0.956945\tvalid's auc: 0.699137\n", 568 | "[30]\ttrain's auc: 0.957919\tvalid's auc: 0.699498\n", 569 | "[40]\ttrain's auc: 0.958046\tvalid's auc: 0.703426\n", 570 | "[50]\ttrain's auc: 0.959006\tvalid's auc: 0.711483\n", 571 | "Did not meet early stopping. Best iteration is:\n", 572 | "[55]\ttrain's auc: 0.959152\tvalid's auc: 0.712056\n", 573 | "Training until validation scores don't improve for 1000 rounds.\n", 574 | "[10]\ttrain's auc: 0.958598\tvalid's auc: 0.71064\n", 575 | "[20]\ttrain's auc: 0.959706\tvalid's auc: 0.715877\n", 576 | "[30]\ttrain's auc: 0.959755\tvalid's auc: 0.714485\n", 577 | "[40]\ttrain's auc: 0.95969\tvalid's auc: 0.711536\n", 578 | "[50]\ttrain's auc: 0.96003\tvalid's auc: 0.710615\n", 579 | "Did not meet early stopping. Best iteration is:\n", 580 | "[48]\ttrain's auc: 0.960204\tvalid's auc: 0.710831\n", 581 | "Training until validation scores don't improve for 1000 rounds.\n", 582 | "[10]\ttrain's auc: 0.956184\tvalid's auc: 0.714624\n", 583 | "[20]\ttrain's auc: 0.95912\tvalid's auc: 0.711213\n", 584 | "[30]\ttrain's auc: 0.959379\tvalid's auc: 0.710565\n", 585 | "[40]\ttrain's auc: 0.959882\tvalid's auc: 0.710499\n", 586 | "[50]\ttrain's auc: 0.959987\tvalid's auc: 0.711409\n", 587 | "Did not meet early stopping. 
Best iteration is:\n", 588 | "[49]\ttrain's auc: 0.960066\tvalid's auc: 0.71145\n", 589 | "Training until validation scores don't improve for 1000 rounds.\n", 590 | "[10]\ttrain's auc: 0.957558\tvalid's auc: 0.694027\n", 591 | "[20]\ttrain's auc: 0.959607\tvalid's auc: 0.711264\n", 592 | "[30]\ttrain's auc: 0.959002\tvalid's auc: 0.710682\n", 593 | "[40]\ttrain's auc: 0.95812\tvalid's auc: 0.702361\n", 594 | "[50]\ttrain's auc: 0.9582\tvalid's auc: 0.702199\n", 595 | "Did not meet early stopping. Best iteration is:\n", 596 | "[20]\ttrain's auc: 0.959607\tvalid's auc: 0.711264\n", 597 | "Training until validation scores don't improve for 1000 rounds.\n", 598 | "[10]\ttrain's auc: 0.956362\tvalid's auc: 0.715147\n", 599 | "[20]\ttrain's auc: 0.956859\tvalid's auc: 0.714264\n", 600 | "[30]\ttrain's auc: 0.958672\tvalid's auc: 0.71371\n", 601 | "[40]\ttrain's auc: 0.95855\tvalid's auc: 0.704427\n", 602 | "[50]\ttrain's auc: 0.959516\tvalid's auc: 0.704593\n", 603 | "Did not meet early stopping. Best iteration is:\n", 604 | "[52]\ttrain's auc: 0.959685\tvalid's auc: 0.704602\n", 605 | "Fold: 5\n", 606 | "val_cid [50 51 52 53 54]\n", 607 | "(305126, 16) (718065, 16)\n", 608 | "Training until validation scores don't improve for 1000 rounds.\n", 609 | "[10]\ttrain's auc: 0.959101\tvalid's auc: 0.648002\n", 610 | "[20]\ttrain's auc: 0.960381\tvalid's auc: 0.650036\n", 611 | "[30]\ttrain's auc: 0.960476\tvalid's auc: 0.654387\n", 612 | "[40]\ttrain's auc: 0.96103\tvalid's auc: 0.654446\n", 613 | "[50]\ttrain's auc: 0.961455\tvalid's auc: 0.654559\n", 614 | "Did not meet early stopping. Best iteration is:\n", 615 | "[53]\ttrain's auc: 0.9617\tvalid's auc: 0.654462\n", 616 | "Training until validation scores don't improve for 1000 rounds.\n", 617 | "[10]\ttrain's auc: 0.959621\tvalid's auc: 0.654011\n", 618 | "[20]\ttrain's auc: 0.960735\tvalid's auc: 0.653463\n", 619 | "[30]\ttrain's auc: 0.960836\tvalid's auc: 0.6549\n", 620 | "[40]\ttrain's auc: 0.961172\tvalid's auc: 0.654521\n", 621 | "[50]\ttrain's auc: 0.961256\tvalid's auc: 0.653726\n" 622 | ] 623 | }, 624 | { 625 | "name": "stdout", 626 | "output_type": "stream", 627 | "text": [ 628 | "Did not meet early stopping. Best iteration is:\n", 629 | "[17]\ttrain's auc: 0.961457\tvalid's auc: 0.654111\n", 630 | "Training until validation scores don't improve for 1000 rounds.\n", 631 | "[10]\ttrain's auc: 0.95697\tvalid's auc: 0.65449\n", 632 | "[20]\ttrain's auc: 0.960161\tvalid's auc: 0.655808\n", 633 | "[30]\ttrain's auc: 0.960082\tvalid's auc: 0.655498\n", 634 | "[40]\ttrain's auc: 0.960535\tvalid's auc: 0.655341\n", 635 | "[50]\ttrain's auc: 0.960987\tvalid's auc: 0.654711\n", 636 | "Did not meet early stopping. Best iteration is:\n", 637 | "[55]\ttrain's auc: 0.961063\tvalid's auc: 0.654381\n", 638 | "Training until validation scores don't improve for 1000 rounds.\n", 639 | "[10]\ttrain's auc: 0.958435\tvalid's auc: 0.650125\n", 640 | "[20]\ttrain's auc: 0.960372\tvalid's auc: 0.650232\n", 641 | "[30]\ttrain's auc: 0.961111\tvalid's auc: 0.652803\n", 642 | "[40]\ttrain's auc: 0.960978\tvalid's auc: 0.654544\n", 643 | "[50]\ttrain's auc: 0.961009\tvalid's auc: 0.65462\n", 644 | "Did not meet early stopping. 
Best iteration is:\n", 645 | "[45]\ttrain's auc: 0.961195\tvalid's auc: 0.655012\n", 646 | "Training until validation scores don't improve for 1000 rounds.\n", 647 | "[10]\ttrain's auc: 0.960162\tvalid's auc: 0.654207\n", 648 | "[20]\ttrain's auc: 0.961086\tvalid's auc: 0.653852\n", 649 | "[30]\ttrain's auc: 0.961252\tvalid's auc: 0.654194\n", 650 | "[40]\ttrain's auc: 0.961249\tvalid's auc: 0.654152\n", 651 | "[50]\ttrain's auc: 0.961266\tvalid's auc: 0.654177\n", 652 | "Did not meet early stopping. Best iteration is:\n", 653 | "[55]\ttrain's auc: 0.961397\tvalid's auc: 0.654149\n" 654 | ] 655 | } 656 | ], 657 | "source": [ 658 | "nfold =5\n", 659 | "kf = KFold(n_splits=nfold,random_state=123,shuffle=False)\n", 660 | "unq_campaign_id = np.sort(train.campaign_id.unique())\n", 661 | "\n", 662 | "test_subm = test[['id']]\n", 663 | "test_subm['is_click'] = 0\n", 664 | "train_score = train[['is_click']]\n", 665 | "train_score['pred'] = 0\n", 666 | "nbag = 5\n", 667 | "cf =0\n", 668 | "for train_index, test_index in kf.split(unq_campaign_id):\n", 669 | " cf+=1\n", 670 | " print('Fold:',cf)\n", 671 | " \n", 672 | " test1 = test.copy()\n", 673 | " tr_cid = unq_campaign_id[train_index]\n", 674 | " val_cid = unq_campaign_id[test_index]\n", 675 | " print('val_cid',val_cid)\n", 676 | "\n", 677 | " val = train[train.campaign_id.isin(val_cid)]\n", 678 | " train1 = train[train.campaign_id.isin(tr_cid)]\n", 679 | " print(val.shape,train1.shape)\n", 680 | "\n", 681 | " a1,a2,a3 = target_encode(train1['user_id'],val['user_id'],\n", 682 | " test1['user_id'],train1.is_click,noise_level=.9,smoothing=5)\n", 683 | " train1.loc[:,'mean_is_click'] = a1\n", 684 | " val.loc[:,'mean_is_click'] = a2\n", 685 | " test1.loc[:,'mean_is_click'] = a3\n", 686 | "\n", 687 | "\n", 688 | " a1,a2,a3 = target_encode(train1['user_id'],val['user_id'],\n", 689 | " test1['user_id'],train1.is_open,noise_level=.9,smoothing=1.)\n", 690 | " train1.loc[:,'mean_is_open'] = a1\n", 691 | " val.loc[:,'mean_is_open'] = a2\n", 692 | " test1.loc[:,'mean_is_open'] = a3\n", 693 | "\n", 694 | "\n", 695 | " a1,a2,a3 = target_encode(train1['communication_type'],val['communication_type'],\n", 696 | " test1['communication_type'],train1.is_open,noise_level=0)\n", 697 | " train1.loc[:,'mean_ct'] = a1\n", 698 | " val.loc[:,'mean_ct'] = a2\n", 699 | " test1.loc[:,'mean_ct'] = a3\n", 700 | "\n", 701 | " a1,a2,a3 = target_encode(train1['communication_type'],val['communication_type'],\n", 702 | " test1['communication_type'],train1.is_click,noise_level=0)\n", 703 | " train1.loc[:,'mean_clk_ct'] = a1\n", 704 | " val.loc[:,'mean_clk_ct'] = a2\n", 705 | " test1.loc[:,'mean_clk_ct'] = a3\n", 706 | "\n", 707 | "\n", 708 | " a1,a2,a3 = target_encode(train1['clust_id'],val['clust_id'],\n", 709 | " test1['clust_id'],train1.is_click,noise_level=0)\n", 710 | " train1.loc[:,'mean_clk_clust_id'] = a1\n", 711 | " val.loc[:,'mean_clk_clust_id'] = a2\n", 712 | " test1.loc[:,'mean_clk_clust_id'] = a3\n", 713 | "\n", 714 | "\n", 715 | "\n", 716 | " gc.collect()\n", 717 | " val.drop(['id','campaign_id','is_open','send_date',\n", 718 | " 'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)\n", 719 | " train1.drop(['id','campaign_id','is_open','send_date',\n", 720 | " 'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)\n", 721 | " test1.drop(['id','campaign_id','is_open','send_date',\n", 722 | " 'user_id','no_of_images','no_of_sections','no_of_internal_links'],axis=1,inplace=True)\n", 723 | " gc.collect()\n", 
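" # pull out the is_click labels, then drop the target column from the train/val/test feature frames so it cannot leak into the model inputs\n",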
724 | " train_y = train1.is_click.values\n", 725 | " val_y = val.is_click.values\n", 726 | " val.drop(['is_click'],axis=1,inplace=True)\n", 727 | " train1.drop(['is_click'],axis=1,inplace=True)\n", 728 | " test1.drop(['is_click'],axis=1,inplace=True)\n", 729 | " \n", 730 | " lgtrain = lgb.Dataset(train1, label=train_y,categorical_feature=['communication_type','send_dayofweek','clust_id'],\n", 731 | " free_raw_data=False)\n", 732 | " lgvalid = lgb.Dataset(val, label=val_y,categorical_feature=['communication_type','send_dayofweek','clust_id'],\n", 733 | " free_raw_data=False)\n", 734 | " gc.collect()\n", 735 | " \n", 736 | " evals_results = {}\n", 737 | " np.random.seed(0)\n", 738 | " \n", 739 | " test_subm['is_click'+str(cf)]=0\n", 740 | " \n", 741 | " for bg in range(nbag):\n", 742 | " lgb_params['feature_fraction_seed'] = 100*cf + bg\n", 743 | " bst1 = lgb.train(lgb_params, \n", 744 | " lgtrain, \n", 745 | " valid_sets=[lgtrain, lgvalid], \n", 746 | " valid_names=['train','valid'], \n", 747 | " evals_result=evals_results, \n", 748 | " num_boost_round=55,\n", 749 | " early_stopping_rounds=1000,\n", 750 | " verbose_eval=10)\n", 751 | " train_score.loc[val.index,'pred'] += bst1.predict(val[train1.columns],num_iteration=51)\n", 752 | " test_subm['is_click'+str(cf)] += bst1.predict(test1[train1.columns],num_iteration=51)\n", 753 | " " 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 27, 759 | "metadata": {}, 760 | "outputs": [], 761 | "source": [ 762 | "from sklearn.metrics import roc_auc_score" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": 28, 768 | "metadata": {}, 769 | "outputs": [ 770 | { 771 | "name": "stderr", 772 | "output_type": "stream", 773 | "text": [ 774 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 775 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 776 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 777 | "\n", 778 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 779 | " \"\"\"Entry point for launching an IPython kernel.\n" 780 | ] 781 | } 782 | ], 783 | "source": [ 784 | "train_score['pred']/=nbag" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": 29, 790 | "metadata": {}, 791 | "outputs": [ 792 | { 793 | "data": { 794 | "text/plain": [ 795 | "0.6198733764612789" 796 | ] 797 | }, 798 | "execution_count": 29, 799 | "metadata": {}, 800 | "output_type": "execute_result" 801 | } 802 | ], 803 | "source": [ 804 | "roc_auc_score(train_score.is_click,train_score.pred)" 805 | ] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": 30, 810 | "metadata": {}, 811 | "outputs": [ 812 | { 813 | "name": "stderr", 814 | "output_type": "stream", 815 | "text": [ 816 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:537: SettingWithCopyWarning: \n", 817 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 818 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 819 | "\n", 820 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 821 | " self.obj[item] = s\n" 822 | ] 823 | } 824 | ], 825 | "source": [ 826 | "test_subm.loc[:,'is_click'] = (test_subm['is_click1'].rank()/test_subm.shape[0] +\\\n", 827 | 
"test_subm['is_click2'].rank()/test_subm.shape[0] + test_subm['is_click3'].rank()/test_subm.shape[0])/nfold" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": 32, 833 | "metadata": {}, 834 | "outputs": [], 835 | "source": [ 836 | "test_subm[['id','is_click']].to_csv('./lgb_5fold-5_bag_nt55_rank_average.csv',index=False)" 837 | ] 838 | } 839 | ], 840 | "metadata": { 841 | "kernelspec": { 842 | "display_name": "Python 3", 843 | "language": "python", 844 | "name": "python3" 845 | }, 846 | "language_info": { 847 | "codemirror_mode": { 848 | "name": "ipython", 849 | "version": 3 850 | }, 851 | "file_extension": ".py", 852 | "mimetype": "text/x-python", 853 | "name": "python", 854 | "nbconvert_exporter": "python", 855 | "pygments_lexer": "ipython3", 856 | "version": "3.6.4" 857 | } 858 | }, 859 | "nbformat": 4, 860 | "nbformat_minor": 2 861 | } 862 | -------------------------------------------------------------------------------- /3rd_Place_Aditya_Akash/lstm_cnn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "colab": { 8 | "autoexec": { 9 | "startup": false, 10 | "wait_interval": 0 11 | } 12 | }, 13 | "colab_type": "code", 14 | "id": "1wB8EOLGKyPE" 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from keras.models import Sequential\n", 19 | "from keras.layers.core import Dense, Dropout, Activation\n", 20 | "from keras.layers import Merge, TimeDistributed, concatenate, Bidirectional, Masking, RepeatVector\n", 21 | "from keras.layers.embeddings import Embedding\n", 22 | "from keras.layers.recurrent import LSTM, GRU, SimpleRNN\n", 23 | "from keras.preprocessing.sequence import pad_sequences\n", 24 | "from keras.layers.convolutional import Convolution1D, MaxPooling1D\n", 25 | "from keras.callbacks import Callback\n", 26 | "from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU, Conv2D\n", 27 | "from keras.callbacks import Callback, History\n", 28 | "from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten\n", 29 | "from keras.preprocessing import text, sequence\n", 30 | "from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D, Masking\n", 31 | "from keras import initializers, regularizers, constraints, optimizers, layers, callbacks\n", 32 | "from keras.callbacks import EarlyStopping,ModelCheckpoint\n", 33 | "from keras.models import Model\n", 34 | "from keras.optimizers import Adam\n", 35 | "from sklearn.model_selection import train_test_split\n", 36 | "from sklearn.metrics import accuracy_score\n", 37 | "from sklearn.metrics import roc_auc_score" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 0, 43 | "metadata": { 44 | "colab": { 45 | "autoexec": { 46 | "startup": false, 47 | "wait_interval": 0 48 | } 49 | }, 50 | "colab_type": "code", 51 | "id": "a7YM8iOuKyCU" 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "%matplotlib inline\n", 56 | "import pandas as pd\n", 57 | "import numpy as np\n", 58 | "# import lightgbm as lgb\n", 59 | "from sklearn.linear_model import LogisticRegression\n", 60 | "from sklearn.model_selection import train_test_split\n", 61 | "from sklearn.preprocessing import MaxAbsScaler,MinMaxScaler\n", 62 | "from sklearn.pipeline import Pipeline\n", 63 | "import matplotlib.pyplot as plt" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 0, 69 | "metadata": { 70 | "colab": { 
71 | "autoexec": { 72 | "startup": false, 73 | "wait_interval": 0 74 | } 75 | }, 76 | "colab_type": "code", 77 | "id": "oa8zhndYKyCa" 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "train= pd.read_csv('train.csv')\n", 82 | "test= pd.read_csv('test_BDIfz5B.csv')\n", 83 | "campaign_df= pd.read_csv('campaign_data.csv')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 0, 89 | "metadata": { 90 | "colab": { 91 | "autoexec": { 92 | "startup": false, 93 | "wait_interval": 0 94 | } 95 | }, 96 | "colab_type": "code", 97 | "id": "BM5dBdV_KyEY" 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "train_df= train.merge(campaign_df,how='left')\n", 102 | "test_df= test.merge(campaign_df,how='left')" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 0, 108 | "metadata": { 109 | "colab": { 110 | "autoexec": { 111 | "startup": false, 112 | "wait_interval": 0 113 | } 114 | }, 115 | "colab_type": "code", 116 | "id": "IzhZ-LitKyEf" 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "train_df.send_date=pd.to_datetime(train_df.send_date,format=\"%d-%m-%Y %H:%M\")" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 0, 126 | "metadata": { 127 | "colab": { 128 | "autoexec": { 129 | "startup": false, 130 | "wait_interval": 0 131 | } 132 | }, 133 | "colab_type": "code", 134 | "id": "FEYTJLOrKyMb" 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "# Form sentences for clicks per user, open per user, etc\n", 139 | "sentences=train_df.groupby('user_id').is_click.apply(lambda x: list(x))" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 0, 145 | "metadata": { 146 | "colab": { 147 | "autoexec": { 148 | "startup": false, 149 | "wait_interval": 0 150 | } 151 | }, 152 | "colab_type": "code", 153 | "id": "FjfydUWPKyMl" 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "sentences_open = train_df.groupby('user_id').is_open.apply(lambda x: list(x))" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 0, 163 | "metadata": { 164 | "colab": { 165 | "autoexec": { 166 | "startup": false, 167 | "wait_interval": 0 168 | } 169 | }, 170 | "colab_type": "code", 171 | "id": "onK2KcK1KyNP" 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "train_df['communication_id'],invercom= pd.factorize(train_df.communication_type)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 0, 181 | "metadata": { 182 | "colab": { 183 | "autoexec": { 184 | "startup": false, 185 | "wait_interval": 0 186 | } 187 | }, 188 | "colab_type": "code", 189 | "id": "m8G_5wGJKyNd" 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "sentences_sec = train_df.groupby('user_id').no_of_sections.apply(lambda x: list(x))" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 0, 199 | "metadata": { 200 | "colab": { 201 | "autoexec": { 202 | "startup": false, 203 | "wait_interval": 0 204 | } 205 | }, 206 | "colab_type": "code", 207 | "id": "1-vjG9j1KyNg" 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "sentences_com = train_df.groupby('user_id').no_of_images.apply(lambda x: list(x))" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 0, 217 | "metadata": { 218 | "colab": { 219 | "autoexec": { 220 | "startup": false, 221 | "wait_interval": 0 222 | } 223 | }, 224 | "colab_type": "code", 225 | "id": "V19wlcxowGs1" 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "sentences_links = train_df.groupby('user_id').total_links.apply(lambda x: list(x))" 230 | ] 
231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 0, 235 | "metadata": { 236 | "colab": { 237 | "autoexec": { 238 | "startup": false, 239 | "wait_interval": 0 240 | } 241 | }, 242 | "colab_type": "code", 243 | "id": "TJRWvekdKyNk" 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "sennew=list(zip(sentences, sentences_open,sentences_sec,sentences_com,sentences_links))" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 0, 253 | "metadata": { 254 | "colab": { 255 | "autoexec": { 256 | "startup": false, 257 | "wait_interval": 0 258 | } 259 | }, 260 | "colab_type": "code", 261 | "id": "oLH1TRCcKyNl" 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "# make an n_rows * n_timesteps * n_features array for all sequences\n", 266 | "sequences_ori=pad_sequences(pd.Series(sennew).apply(lambda x: list(zip(x[0],x[1],x[2],x[3]))).tolist(),value=-1,padding='pre')" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 0, 272 | "metadata": { 273 | "colab": { 274 | "autoexec": { 275 | "startup": false, 276 | "wait_interval": 0 277 | } 278 | }, 279 | "colab_type": "code", 280 | "id": "jPCXglb8KyOA" 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "# keep only users with more than one email in their history\n", 285 | "sentences2=sentences[sentences.apply(lambda x: len(x))>1]" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 0, 291 | "metadata": { 292 | "colab": { 293 | "autoexec": { 294 | "startup": false, 295 | "wait_interval": 0 296 | } 297 | }, 298 | "colab_type": "code", 299 | "id": "CJCBvAfIKyOX" 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "sentences_open2 = sentences_open[sentences_open.apply(lambda x: len(x))>1]" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 0, 309 | "metadata": { 310 | "colab": { 311 | "autoexec": { 312 | "startup": false, 313 | "wait_interval": 0 314 | } 315 | }, 316 | "colab_type": "code", 317 | "id": "uNKsZ5LaKyOc" 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "sentences_sec2= sentences_sec[sentences_sec.apply(lambda x: len(x)>1)]" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 0, 327 | "metadata": { 328 | "colab": { 329 | "autoexec": { 330 | "startup": false, 331 | "wait_interval": 0 332 | } 333 | }, 334 | "colab_type": "code", 335 | "id": "k3FAD5qUKyOg" 336 | }, 337 | "outputs": [], 338 | "source": [ 339 | "sentences_com2= sentences_com[sentences_com.apply(lambda x: len(x)>1)]" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 0, 345 | "metadata": { 346 | "colab": { 347 | "autoexec": { 348 | "startup": false, 349 | "wait_interval": 0 350 | } 351 | }, 352 | "colab_type": "code", 353 | "id": "8aFn0ROiwTE7" 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "sentences_links2= sentences_links[sentences_links.apply(lambda x: len(x)>1)]" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 0, 363 | "metadata": { 364 | "colab": { 365 | "autoexec": { 366 | "startup": false, 367 | "wait_interval": 0 368 | } 369 | }, 370 | "colab_type": "code", 371 | "id": "MYxADjpvKyOj" 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "sennew=list(zip(sentences2, sentences_open2,sentences_sec2,sentences_com2,sentences_links2))\n", 376 | "# seqser=pd.concat((pd.Series(sennew).apply(lambda x: zip(x[0],x[1])).apply(lambda x: random.sample(x,len(x))),\n", 377 | "# pd.Series(sennew).apply(lambda x: zip(x[0],x[1]))))\n", 378 | "seqser=pd.Series(sennew).apply(lambda x: list(zip(x[0],x[1],x[2],x[3])))\n", 379 | 
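"# left-pad ('pre') shorter user histories with -1 so every row becomes a max_len x 4 timestep matrix\n",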
"sequences=pad_sequences(seqser.tolist(),value=-1,padding='pre')" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 149, 385 | "metadata": { 386 | "colab": { 387 | "autoexec": { 388 | "startup": false, 389 | "wait_interval": 0 390 | }, 391 | "base_uri": "https://localhost:8080/", 392 | "height": 34, 393 | "output_extras": [ 394 | { 395 | "item_id": 1 396 | } 397 | ] 398 | }, 399 | "colab_type": "code", 400 | "executionInfo": { 401 | "elapsed": 1128, 402 | "status": "ok", 403 | "timestamp": 1522577949845, 404 | "user": { 405 | "displayName": "Akash Gupta", 406 | "photoUrl": "//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 407 | "userId": "111901583339877553911" 408 | }, 409 | "user_tz": -330 410 | }, 411 | "id": "76wQldP6KyO1", 412 | "outputId": "cc802a92-9e43-46ac-db15-d62e91c6f276" 413 | }, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/plain": [ 418 | "(151470, 20, 2)" 419 | ] 420 | }, 421 | "execution_count": 149, 422 | "metadata": { 423 | "tags": [] 424 | }, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "sequences.shape" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 150, 435 | "metadata": { 436 | "colab": { 437 | "autoexec": { 438 | "startup": false, 439 | "wait_interval": 0 440 | }, 441 | "base_uri": "https://localhost:8080/", 442 | "height": 34, 443 | "output_extras": [ 444 | { 445 | "item_id": 1 446 | } 447 | ] 448 | }, 449 | "colab_type": "code", 450 | "executionInfo": { 451 | "elapsed": 1231, 452 | "status": "ok", 453 | "timestamp": 1522577951279, 454 | "user": { 455 | "displayName": "Akash Gupta", 456 | "photoUrl": "//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 457 | "userId": "111901583339877553911" 458 | }, 459 | "user_tz": -330 460 | }, 461 | "id": "rMXWBN8qKyPB", 462 | "outputId": "e282b7ee-dc48-40a7-ca92-f407b7677033" 463 | }, 464 | "outputs": [ 465 | { 466 | "data": { 467 | "text/plain": [ 468 | "(168236, 20, 2)" 469 | ] 470 | }, 471 | "execution_count": 150, 472 | "metadata": { 473 | "tags": [] 474 | }, 475 | "output_type": "execute_result" 476 | } 477 | ], 478 | "source": [ 479 | "sequences_ori.shape" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 0, 485 | "metadata": { 486 | "colab": { 487 | "autoexec": { 488 | "startup": false, 489 | "wait_interval": 0 490 | } 491 | }, 492 | "colab_type": "code", 493 | "id": "4Ha3DVMEKyPa" 494 | }, 495 | "outputs": [], 496 | "source": [ 497 | "train_X= sequences[:,:-1]\n", 498 | "train_Y = sequences[:,-1]" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 0, 504 | "metadata": { 505 | "colab": { 506 | "autoexec": { 507 | "startup": false, 508 | "wait_interval": 0 509 | } 510 | }, 511 | "colab_type": "code", 512 | "id": "H1y6WnbOKyPf" 513 | }, 514 | "outputs": [], 515 | "source": [ 516 | "train_Y= train_Y[:,0]" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 178, 522 | "metadata": { 523 | "colab": { 524 | "autoexec": { 525 | "startup": false, 526 | "wait_interval": 0 527 | }, 528 | "base_uri": "https://localhost:8080/", 529 | "height": 34, 530 | "output_extras": [ 531 | { 532 | "item_id": 1 533 | } 534 | ] 535 | }, 536 | "colab_type": "code", 537 | "executionInfo": { 538 | "elapsed": 942, 539 | "status": "ok", 540 | "timestamp": 1522578849746, 541 | "user": { 542 | "displayName": "Akash Gupta", 543 | "photoUrl": 
"//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 544 | "userId": "111901583339877553911" 545 | }, 546 | "user_tz": -330 547 | }, 548 | "id": "FHdF-ZMzKyPp", 549 | "outputId": "95c51c2a-0278-48c9-ab55-d9fabe1ec1fa" 550 | }, 551 | "outputs": [ 552 | { 553 | "data": { 554 | "text/plain": [ 555 | "(151470, 19, 4)" 556 | ] 557 | }, 558 | "execution_count": 178, 559 | "metadata": { 560 | "tags": [] 561 | }, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "train_X.shape" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 0, 572 | "metadata": { 573 | "colab": { 574 | "autoexec": { 575 | "startup": false, 576 | "wait_interval": 0 577 | } 578 | }, 579 | "colab_type": "code", 580 | "id": "y5dWrSNTKyQF" 581 | }, 582 | "outputs": [], 583 | "source": [ 584 | "class RocAucEvaluation(Callback):\n", 585 | " def __init__(self, validation_data=(), interval=1):\n", 586 | " super(Callback, self).__init__()\n", 587 | "\n", 588 | " self.interval = interval\n", 589 | " self.X_val, self.y_val = validation_data\n", 590 | "\n", 591 | " def on_epoch_end(self, epoch, logs={}):\n", 592 | " if epoch % self.interval == 0:\n", 593 | " y_pred = self.model.predict(self.X_val, verbose=0)\n", 594 | " score = roc_auc_score(self.y_val, y_pred)\n", 595 | " print(\"\\n ROC-AUC - epoch: {:d} - score: {:.6f}\".format(epoch+1, score))" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 179, 601 | "metadata": { 602 | "colab": { 603 | "autoexec": { 604 | "startup": false, 605 | "wait_interval": 0 606 | }, 607 | "base_uri": "https://localhost:8080/", 608 | "height": 68, 609 | "output_extras": [ 610 | { 611 | "item_id": 1 612 | } 613 | ] 614 | }, 615 | "colab_type": "code", 616 | "executionInfo": { 617 | "elapsed": 1548, 618 | "status": "ok", 619 | "timestamp": 1522578889874, 620 | "user": { 621 | "displayName": "Akash Gupta", 622 | "photoUrl": "//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 623 | "userId": "111901583339877553911" 624 | }, 625 | "user_tz": -330 626 | }, 627 | "id": "fI7EvwzDKyQJ", 628 | "outputId": "454141ff-322d-4772-a400-f695edd49542" 629 | }, 630 | "outputs": [ 631 | { 632 | "name": "stdout", 633 | "output_type": "stream", 634 | "text": [ 635 | "(None, 250)\n", 636 | "(None, 250)\n", 637 | "(None, 1)\n" 638 | ] 639 | } 640 | ], 641 | "source": [ 642 | "\n", 643 | "# define nn model\n", 644 | "# emdedding_size=500\n", 645 | "# vocab_size=5502\n", 646 | "keras_model2 = Sequential()\n", 647 | "# keras_model2.add(Masking(mask_value=-1. 
,input_shape=(None,2)))\n", 648 | "# keras_model2.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, \n", 649 | "# weights=[pretrained_weights]))\n", 650 | "# keras_model2.add(LSTM(units=100,return_sequences=True))\n", 651 | "# keras_model2.add(Conv1D(64, kernel_size = 3, padding = \"valid\", kernel_initializer = \"glorot_uniform\",input_shape=(None,4)))\n", 652 | "# keras_model2.add(MaxPooling1D())\n", 653 | "# print(keras_model2.output_shape)\n", 654 | "# keras_model2.add(Conv1D(300, 3, padding='valid',activation='relu',strides=1 ,input_shape=(None,4)))\n", 655 | "# keras_model2.add(Conv1D(150, 3, padding='valid',activation='relu',strides=2))\n", 656 | "# keras_model2.add(Conv1D(75, 3, padding='valid',activation='relu',strides=2))\n", 657 | "# keras_model2.add(Flatten())\n", 658 | "# keras_model2.add(Dropout(0.2))\n", 659 | "# model.add(Dense(150,activation='sigmoid'))\n", 660 | "# model.add(Dropout(0.2))\n", 661 | "keras_model2.add(Conv1D(250,\n", 662 | " 3,\n", 663 | " padding='valid',\n", 664 | " activation='relu',\n", 665 | " strides=1,input_shape=(None,4)))\n", 666 | "# we use max pooling:\n", 667 | "# keras_model2.add(GlobalMaxPooling1D())\n", 668 | "keras_model2.add(LSTM(units=100,return_sequences=False,recurrent_dropout=0.2))\n", 669 | "# keras_model2.add(GRU(units=100,return_sequences=False,recurrent_dropout=0.2))\n", 670 | "# keras_model2.add(Conv1D(300, 3, padding='valid',activation='relu',strides=2 ,input_shape=(None,4)))\n", 671 | "# keras_model2.add(Conv1D(150, 3, padding='valid',activation='relu',strides=2))\n", 672 | "# keras_model2.add(Conv1D(75, 3, padding='valid',activation='relu',strides=2))\n", 673 | "# keras_model2.add(Flatten())\n", 674 | "print(keras_model2.output_shape)\n", 675 | "# keras_model2.add(Conv1D(32, kernel_size = 3, padding = \"valid\", kernel_initializer = \"glorot_uniform\"))\n", 676 | "# print(keras_model2.output_shape)\n", 677 | "# keras_model2.add(GlobalMaxPooling1D())\n", 678 | "print(keras_model2.output_shape)\n", 679 | "\n", 680 | "keras_model2.add(Dropout(0.2))\n", 681 | "# keras_model2.add(LSTM(units=500,return_sequences=False))\n", 682 | "keras_model2.add(Dense(units=100))\n", 683 | "keras_model2.add(Dropout(0.2))\n", 684 | "keras_model2.add(Dense(units=10))\n", 685 | "keras_model2.add(Dropout(0.2))\n", 686 | "# keras_model2.add(Dense(units=5))\n", 687 | "# keras_model2.add(Dropout(0.2))\n", 688 | "\n", 689 | "keras_model2.add(Dense(units=1))\n", 690 | "print(keras_model2.output_shape)\n", 691 | "keras_model2.add(Activation('sigmoid'))\n", 692 | "keras_model2.compile(optimizer='Adamax', loss='binary_crossentropy',metrics=['accuracy','mse'])" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": 0, 698 | "metadata": { 699 | "colab": { 700 | "autoexec": { 701 | "startup": false, 702 | "wait_interval": 0 703 | } 704 | }, 705 | "colab_type": "code", 706 | "id": "WRgCCoQ3fohC" 707 | }, 708 | "outputs": [], 709 | "source": [ 710 | "# train_X=np.concatenate((np.ones_like(train_X[:,0]).reshape(train_X.shape[0],-1,4)*-1,train_X),axis=1)" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 0, 716 | "metadata": { 717 | "colab": { 718 | "autoexec": { 719 | "startup": false, 720 | "wait_interval": 0 721 | } 722 | }, 723 | "colab_type": "code", 724 | "id": "mVEm8KAfKyQR" 725 | }, 726 | "outputs": [], 727 | "source": [ 728 | "xtrain, xval, ytrain, yval = train_test_split(train_X, train_Y, test_size=0.1, random_state=7)" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": 
182, 734 | "metadata": { 735 | "colab": { 736 | "autoexec": { 737 | "startup": false, 738 | "wait_interval": 0 739 | }, 740 | "base_uri": "https://localhost:8080/", 741 | "height": 34, 742 | "output_extras": [ 743 | { 744 | "item_id": 1 745 | } 746 | ] 747 | }, 748 | "colab_type": "code", 749 | "executionInfo": { 750 | "elapsed": 1575, 751 | "status": "ok", 752 | "timestamp": 1522578897238, 753 | "user": { 754 | "displayName": "Akash Gupta", 755 | "photoUrl": "//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 756 | "userId": "111901583339877553911" 757 | }, 758 | "user_tz": -330 759 | }, 760 | "id": "NbQi0fReKyQc", 761 | "outputId": "0b85153a-b365-4b5f-f093-f63a2077cc91" 762 | }, 763 | "outputs": [ 764 | { 765 | "data": { 766 | "text/plain": [ 767 | "((151470, 19, 4), (136323, 19, 4), (15147, 19, 4))" 768 | ] 769 | }, 770 | "execution_count": 182, 771 | "metadata": { 772 | "tags": [] 773 | }, 774 | "output_type": "execute_result" 775 | } 776 | ], 777 | "source": [ 778 | "train_X.shape, xtrain.shape, xval.shape" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": 0, 784 | "metadata": { 785 | "colab": { 786 | "autoexec": { 787 | "startup": false, 788 | "wait_interval": 0 789 | } 790 | }, 791 | "colab_type": "code", 792 | "id": "7efnrfCLKyQi" 793 | }, 794 | "outputs": [], 795 | "source": [ 796 | "# filepath=\"../input/best-model/best.hdf5\"\n", 797 | "import pickle\n", 798 | "filename_m= 'cnnlstm_'\n", 799 | "pickle.dump(keras_model2.to_json(),open(filename_m+'.pkl','w'))\n", 800 | "filepath=filename_m+\"weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5\"\n", 801 | "checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=False, mode='max')\n", 802 | "ra_val = RocAucEvaluation(validation_data=(xval, yval), interval = 1)\n", 803 | "callbacks_list = [checkpoint,ra_val]" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": 0, 809 | "metadata": { 810 | "colab": { 811 | "autoexec": { 812 | "startup": false, 813 | "wait_interval": 0 814 | } 815 | }, 816 | "colab_type": "code", 817 | "id": "7a75OYPhKyQk" 818 | }, 819 | "outputs": [], 820 | "source": [ 821 | "# piecewise sample weights: first third 3x, middle third 2x, last third 1x; // keeps the block size an int\n", 822 | "sample_weight=np.concatenate((np.ones(train_X.shape[0]//3)*3,np.ones(train_X.shape[0]//3)*2,\n", 823 | " np.ones(train_X.shape[0]//3)))" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": 185, 829 | "metadata": { 830 | "colab": { 831 | "autoexec": { 832 | "startup": false, 833 | "wait_interval": 0 834 | }, 835 | "base_uri": "https://localhost:8080/", 836 | "height": 34, 837 | "output_extras": [ 838 | { 839 | "item_id": 1 840 | } 841 | ] 842 | }, 843 | "colab_type": "code", 844 | "executionInfo": { 845 | "elapsed": 950, 846 | "status": "ok", 847 | "timestamp": 1522578912424, 848 | "user": { 849 | "displayName": "Akash Gupta", 850 | "photoUrl": "//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 851 | "userId": "111901583339877553911" 852 | }, 853 | "user_tz": -330 854 | }, 855 | "id": "NE51h98Q1Unz", 856 | "outputId": "59ed1c69-b698-4d2c-d5d1-0d0f8c8ed41c" 857 | }, 858 | "outputs": [ 859 | { 860 | "data": { 861 | "text/plain": [ 862 | "(136323, 19, 4)" 863 | ] 864 | }, 865 | "execution_count": 185, 866 | "metadata": { 867 | "tags": [] 868 | }, 869 | "output_type": "execute_result" 870 | } 871 | ], 872 | "source": [ 873 | "xtrain.shape" 874 | ] 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": null, 879 | "metadata": { 880 | "colab": 
{ 880 | "autoexec": { 881 | "startup": false, 882 | "wait_interval": 0 883 | }, 884 | "base_uri": "https://localhost:8080/", 885 | "height": 10271, 886 | "output_extras": [ 887 | { 888 | "item_id": 21 889 | }, 890 | { 891 | "item_id": 40 892 | }, 893 | { 894 | "item_id": 57 895 | }, 896 | { 897 | "item_id": 76 898 | }, 899 | { 900 | "item_id": 94 901 | }, 902 | { 903 | "item_id": 112 904 | }, 905 | { 906 | "item_id": 130 907 | }, 908 | { 909 | "item_id": 146 910 | }, 911 | { 912 | "item_id": 164 913 | }, 914 | { 915 | "item_id": 182 916 | }, 917 | { 918 | "item_id": 201 919 | }, 920 | { 921 | "item_id": 220 922 | }, 923 | { 924 | "item_id": 239 925 | }, 926 | { 927 | "item_id": 258 928 | }, 929 | { 930 | "item_id": 274 931 | }, 932 | { 933 | "item_id": 292 934 | }, 935 | { 936 | "item_id": 311 937 | }, 938 | { 939 | "item_id": 331 940 | }, 941 | { 942 | "item_id": 349 943 | }, 944 | { 945 | "item_id": 365 946 | }, 947 | { 948 | "item_id": 366 949 | } 950 | ] 951 | }, 952 | "colab_type": "code", 953 | "executionInfo": { 954 | "elapsed": 174954, 955 | "status": "ok", 956 | "timestamp": 1522579089258, 957 | "user": { 958 | "displayName": "Akash Gupta", 959 | "photoUrl": "//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 960 | "userId": "111901583339877553911" 961 | }, 962 | "user_tz": -330 963 | }, 964 | "id": "xThDJdWpKyQm", 965 | "outputId": "9fba9908-8f87-416c-b456-834cd6046ccb" 966 | }, 967 | "outputs": [], 968 | "source": [ 969 | "keras_model2.fit(xtrain,ytrain, epochs=19, validation_data=(xval,yval),\n", 970 | " batch_size=1024, callbacks = callbacks_list,verbose=1)" 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": 0, 976 | "metadata": { 977 | "colab": { 978 | "autoexec": { 979 | "startup": false, 980 | "wait_interval": 0 981 | } 982 | }, 983 | "colab_type": "code", 984 | "id": "e4z-mBLPKyQq" 985 | }, 986 | "outputs": [], 987 | "source": [ 988 | " from keras.models import model_from_json\n", 989 | "# keras_model2= model_from_json(pickle.load(open('lstm2fea100.pkl')))\n", 990 | "# keras_model2.load_weights('lstmcorrectorderweights-improvement-24-0.99.hdf5')" 991 | ] 992 | }, 993 | { 994 | "cell_type": "code", 995 | "execution_count": 0, 996 | "metadata": { 997 | "colab": { 998 | "autoexec": { 999 | "startup": false, 1000 | "wait_interval": 0 1001 | } 1002 | }, 1003 | "colab_type": "code", 1004 | "id": "xHgiafvmKyRY" 1005 | }, 1006 | "outputs": [], 1007 | "source": [ 1008 | "nnpred=keras_model2.predict(sequences_ori)" 1009 | ] 1010 | }, 1011 | { 1012 | "cell_type": "code", 1013 | "execution_count": 0, 1014 | "metadata": { 1015 | "colab": { 1016 | "autoexec": { 1017 | "startup": false, 1018 | "wait_interval": 0 1019 | } 1020 | }, 1021 | "colab_type": "code", 1022 | "id": "hLG63_X1KyRf" 1023 | }, 1024 | "outputs": [], 1025 | "source": [ 1026 | "snn= sentences.reset_index()" 1027 | ] 1028 | }, 1029 | { 1030 | "cell_type": "code", 1031 | "execution_count": 164, 1032 | "metadata": { 1033 | "colab": { 1034 | "autoexec": { 1035 | "startup": false, 1036 | "wait_interval": 0 1037 | }, 1038 | "base_uri": "https://localhost:8080/", 1039 | "height": 34, 1040 | "output_extras": [ 1041 | { 1042 | "item_id": 1 1043 | } 1044 | ] 1045 | }, 1046 | "colab_type": "code", 1047 | "executionInfo": { 1048 | "elapsed": 830, 1049 | "status": "ok", 1050 | "timestamp": 1522578652745, 1051 | "user": { 1052 | "displayName": "Akash Gupta", 1053 | "photoUrl": 
"//lh4.googleusercontent.com/-lbaoCqdKDCg/AAAAAAAAAAI/AAAAAAAAMhQ/YGRqD6U9Se8/s50-c-k-no/photo.jpg", 1054 | "userId": "111901583339877553911" 1055 | }, 1056 | "user_tz": -330 1057 | }, 1058 | "id": "ic8PLfrgKyRi", 1059 | "outputId": "2d09df8f-dccd-42d4-db7a-3313b30108b0" 1060 | }, 1061 | "outputs": [ 1062 | { 1063 | "data": { 1064 | "text/plain": [ 1065 | "((168236, 2), (168236, 1), (168236,))" 1066 | ] 1067 | }, 1068 | "execution_count": 164, 1069 | "metadata": { 1070 | "tags": [] 1071 | }, 1072 | "output_type": "execute_result" 1073 | } 1074 | ], 1075 | "source": [ 1076 | "snn.shape, nnpred.shape , sentences.shape" 1077 | ] 1078 | }, 1079 | { 1080 | "cell_type": "code", 1081 | "execution_count": 0, 1082 | "metadata": { 1083 | "colab": { 1084 | "autoexec": { 1085 | "startup": false, 1086 | "wait_interval": 0 1087 | } 1088 | }, 1089 | "colab_type": "code", 1090 | "id": "yEUZfFu2KyRp" 1091 | }, 1092 | "outputs": [], 1093 | "source": [ 1094 | "snn['pred']= nnpred" 1095 | ] 1096 | }, 1097 | { 1098 | "cell_type": "code", 1099 | "execution_count": 0, 1100 | "metadata": { 1101 | "colab": { 1102 | "autoexec": { 1103 | "startup": false, 1104 | "wait_interval": 0 1105 | } 1106 | }, 1107 | "colab_type": "code", 1108 | "id": "1Z4rSIt6KyT3" 1109 | }, 1110 | "outputs": [], 1111 | "source": [ 1112 | "test_df['send_date']= pd.to_datetime(test_df.send_date,format=\"%d-%m-%Y %H:%M\")" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": 0, 1118 | "metadata": { 1119 | "colab": { 1120 | "autoexec": { 1121 | "startup": false, 1122 | "wait_interval": 0 1123 | } 1124 | }, 1125 | "colab_type": "code", 1126 | "id": "W9dTls-JKyT-" 1127 | }, 1128 | "outputs": [], 1129 | "source": [ 1130 | "test_df['send_week']=test_df.send_date.dt.week\n", 1131 | "test_df['send_day']= test_df.send_date.dt.day\n", 1132 | "test_df['send_hour']= test_df.send_date.dt.hour\n", 1133 | "test_df['send_hour']=(test_df.send_hour/6).astype('int')\n", 1134 | "test_df['send_weekday']=test_df.send_date.dt.weekday\n", 1135 | "# pred_nn=test_df.merge(snn,how='left').groupby(['campaign_id','send_weekday']).pred.apply(lambda x: x.fillna(x.mean()))" 1136 | ] 1137 | }, 1138 | { 1139 | "cell_type": "code", 1140 | "execution_count": 0, 1141 | "metadata": { 1142 | "colab": { 1143 | "autoexec": { 1144 | "startup": false, 1145 | "wait_interval": 0 1146 | } 1147 | }, 1148 | "colab_type": "code", 1149 | "id": "0jz0ZX4FKyRw" 1150 | }, 1151 | "outputs": [], 1152 | "source": [ 1153 | "pred_nn=test_df.merge(snn,how='left').groupby(['campaign_id','send_weekday',\n", 1154 | " 'send_hour']).pred.apply(lambda x: x.fillna((x.quantile(0.9))))" 1155 | ] 1156 | }, 1157 | { 1158 | "cell_type": "code", 1159 | "execution_count": 0, 1160 | "metadata": { 1161 | "colab": { 1162 | "autoexec": { 1163 | "startup": false, 1164 | "wait_interval": 0 1165 | } 1166 | }, 1167 | "colab_type": "code", 1168 | "id": "3_b70e7aKyR2" 1169 | }, 1170 | "outputs": [], 1171 | "source": [ 1172 | "submission= pd.read_csv('sample_submission_4fcZwvQ.csv')\n", 1173 | "submission.is_click = pred_nn\n", 1174 | "submission.to_csv('submission_lstmcnn.csv',index=False)" 1175 | ] 1176 | }, 1177 | { 1178 | "cell_type": "code", 1179 | "execution_count": 6, 1180 | "metadata": {}, 1181 | "outputs": [ 1182 | { 1183 | "data": { 1184 | "text/plain": [ 1185 | "'2.2.2'" 1186 | ] 1187 | }, 1188 | "execution_count": 6, 1189 | "metadata": {}, 1190 | "output_type": "execute_result" 1191 | } 1192 | ], 1193 | "source": [ 1194 | "import matplotlib as pd\n", 1195 | "pd.__version__" 1196 | ] 
1197 | }, 1198 | { 1199 | "cell_type": "code", 1200 | "execution_count": null, 1201 | "metadata": {}, 1202 | "outputs": [], 1203 | "source": [] 1204 | } 1205 | ], 1206 | "metadata": { 1207 | "accelerator": "GPU", 1208 | "colab": { 1209 | "collapsed_sections": [], 1210 | "default_view": {}, 1211 | "name": "Copy of Untitled-Copy1.ipynb", 1212 | "provenance": [ 1213 | { 1214 | "file_id": "1hznci-bKqi_hiGI3cTFlhZkB4s6fWc1G", 1215 | "timestamp": 1522527298306 1216 | } 1217 | ], 1218 | "version": "0.3.2", 1219 | "views": {} 1220 | }, 1221 | "kernelspec": { 1222 | "display_name": "Python 3", 1223 | "language": "python", 1224 | "name": "python3" 1225 | }, 1226 | "language_info": { 1227 | "codemirror_mode": { 1228 | "name": "ipython", 1229 | "version": 3 1230 | }, 1231 | "file_extension": ".py", 1232 | "mimetype": "text/x-python", 1233 | "name": "python", 1234 | "nbconvert_exporter": "python", 1235 | "pygments_lexer": "ipython3", 1236 | "version": "3.6.4" 1237 | } 1238 | }, 1239 | "nbformat": 4, 1240 | "nbformat_minor": 1 1241 | } 1242 | -------------------------------------------------------------------------------- /3rd_Place_Aditya_Akash/readme.txt: -------------------------------------------------------------------------------- 1 | Packages used for executing Aditya's model 2 | ========================================== 3 | Python 3.6.4, Dependencies 4 | =========================================== 5 | lightgbm 2.1.0 6 | scikit-learn 0.19.1 7 | pandas 0.22.0 8 | numpy 1.14.0 9 | xgboost 0.7 10 | 11 | Execution of Aditya's model 12 | ============================ 13 | Execute the following notebooks. 14 | Note: train.csv and test.csv should be present in a folder named "input". Output files will be generated in the current path. 15 | 1. user_cluster-kmeans.ipynb, this will generate the file user_cluster1.csv in the input folder. 16 | 2. xgb_2fold-cv3_bag3_nt70_scalepos1_best_tree.ipynb, this will generate the test prediction file named "xgb_2fold-cv2_bag3_nt70_scalepos1_nt70.csv". 17 | 3. lgb_5fold-5_bag_nt45_rank_average_AND_lgb_5fold-5_bag_nt45_rank_average_4f.ipynb, this will generate the test prediction files named "lgb_5fold-5_bag_nt45_rank_average.csv" and "lgb_5fold-5_bag_nt45_rank_average_4f.csv". 18 | 4. lgb_5fold-5_bag_nt55_rank_average_5f_AND_lgb_5fold-5_bag_nt55_rank_average_4f.ipynb, this will generate the test prediction files named "lgb_5fold-5_bag_nt55_rank_average_5f.csv" and "lgb_5fold-5_bag_nt55_rank_average_4f.csv". 19 | 5. lgb_new_features-v6-5fold_5bag_cv_retry_lb_692_ens6941-submitted.ipynb, this will generate lgb_5fold-5_bag_nt55_rank_average.csv. 20 | 21 | 22 | Packages used for executing Akash's model 23 | ========================================== 24 | Python Package Dependencies 25 | =========================================== 26 | 27 | Keras 2.0.8 28 | sklearn 0.19.0 29 | pandas 0.20.3 30 | numpy 1.14.2 31 | matplotlib 2.2.2 32 | 33 | Instructions: 34 | 35 | 36 | Execution of Akash's model 37 | ============================ 38 | Execute each cell in the following notebooks. 39 | 1. cnn.ipynb, it will generate the test prediction file submission_cnn.csv 40 | 2. lstm.ipynb, it will generate the test prediction file submission_lstm.csv 41 | 3. lstm_cnn.ipynb, it will generate the test prediction file submission_lstmcnn.csv 42 | 43 | All train, test, and submission files should be kept in the current path of the notebook. 44 | 45 | Ensemble Model 46 | ========================= 47 | Execute the final_ensemble-simple_avg.ipynb notebook, which will rank-average all the model outputs into the final submission. 
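48 | 
49 | Ensemble reference
50 | ==================
51 | For reference, a minimal sketch of the rank-average step in plain pandas. The file list below
52 | (the five Aditya prediction files plus the three Akash submission files), the equal weighting,
53 | and the output name "final_submission.csv" are assumptions for illustration; the exact blend
54 | lives in final_ensemble-simple_avg.ipynb. The sketch also assumes every file shares the same
55 | row order, which holds because they are all written from the same test set.
56 | 
57 |     import pandas as pd
58 | 
59 |     files = ["xgb_2fold-cv2_bag3_nt70_scalepos1_nt70.csv",
60 |              "lgb_5fold-5_bag_nt45_rank_average.csv",
61 |              "lgb_5fold-5_bag_nt45_rank_average_4f.csv",
62 |              "lgb_5fold-5_bag_nt55_rank_average_5f.csv",
63 |              "lgb_5fold-5_bag_nt55_rank_average_4f.csv",
64 |              "submission_cnn.csv",
65 |              "submission_lstm.csv",
66 |              "submission_lstmcnn.csv"]
67 |     sub = pd.read_csv(files[0])[["id"]]
68 |     sub["is_click"] = 0.0
69 |     for f in files:
70 |         preds = pd.read_csv(f)
71 |         # ranking makes each model's scores scale-free; dividing by the
72 |         # row count normalises the ranks to (0, 1] before averaging
73 |         sub["is_click"] += preds["is_click"].rank() / len(preds)
74 |     sub["is_click"] /= len(files)
75 |     sub.to_csv("final_submission.csv", index=False)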
-------------------------------------------------------------------------------- /3rd_Place_Aditya_Akash/user_cluster-kmeans.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "from sklearn.cluster import KMeans,DBSCAN" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "train = pd.read_csv('./input/train.csv',usecols=['campaign_id','user_id'])\n", 21 | "test = pd.read_csv('./input/test.csv',usecols=['campaign_id','user_id'])" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "all_data = pd.concat([train,test])" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 4, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "temp = pd.get_dummies(pd.Series(all_data.campaign_id), prefix='campaign_id')" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 5, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "all_data = pd.concat([all_data,temp],axis=1)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 6, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "all_data.drop('campaign_id',axis=1,inplace=True)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 7, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "ucamp_grp = all_data.groupby('user_id').sum()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 8, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "ucamp_df = pd.DataFrame(ucamp_grp)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 9, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,\n", 87 | " n_clusters=5, n_init=6, n_jobs=1, precompute_distances='auto',\n", 88 | " random_state=None, tol=0.0001, verbose=0)" 89 | ] 90 | }, 91 | "execution_count": 9, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "kmeans = KMeans(init='k-means++', n_clusters=5, n_init=6,max_iter=300)\n", 98 | "kmeans.fit(ucamp_df.values)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 10, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "ucamp_df['clust_id'] = kmeans.labels_" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 11, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "user_cluster = ucamp_df[['clust_id']].reset_index(drop=False)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 12, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "4 57030\n", 128 | "3 52649\n", 129 | "0 51266\n", 130 | "2 36807\n", 131 | "1 22966\n", 132 | "Name: clust_id, dtype: int64" 133 | ] 134 | }, 135 | "execution_count": 12, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "user_cluster.clust_id.value_counts()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 13, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | 
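"# persist the user_id -> clust_id mapping; the model notebooks merge it back on user_id\n",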
"user_cluster.to_csv('./input/user_cluster1.csv',index=False)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Python 3", 164 | "language": "python", 165 | "name": "python3" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 3 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython3", 177 | "version": "3.6.4" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 2 182 | } 183 | -------------------------------------------------------------------------------- /3rd_Place_Aditya_Akash/xgb_2fold-cv3_bag3_nt70_scalepos1_best_tree.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import xgboost as xgb\n", 12 | "from sklearn.feature_extraction.text import CountVectorizer\n", 13 | "from sklearn.decomposition import TruncatedSVD\n", 14 | "import gc\n", 15 | "from sklearn.preprocessing import LabelEncoder\n", 16 | "from sklearn.model_selection import KFold" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 3, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "0" 28 | ] 29 | }, 30 | "execution_count": 3, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "campaign = pd.read_csv('input/campaign_data.csv')\n", 37 | "campaign1 = campaign.drop(['subject','email_url','email_body'],axis=1)\n", 38 | "gc.collect()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "np.random.seed(123)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "def add_noise(series, noise_level):\n", 57 | " return series * (1 + noise_level * np.random.randn(len(series)))\n", 58 | "def target_encode(trn_series=None,val_series=None,\n", 59 | " tst_series=None,\n", 60 | " target=None,\n", 61 | " min_samples_leaf=1,\n", 62 | " smoothing=1,\n", 63 | " noise_level=0):\n", 64 | " \"\"\"\n", 65 | " Smoothing is computed like in the following paper by Daniele Micci-Barreca\n", 66 | " https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf\n", 67 | " trn_series : training categorical feature as a pd.Series\n", 68 | " tst_series : test categorical feature as a pd.Series\n", 69 | " target : target data as a pd.Series\n", 70 | " min_samples_leaf (int) : minimum samples to take category average into account\n", 71 | " smoothing (int) : smoothing effect to balance categorical average vs prior\n", 72 | " \"\"\"\n", 73 | " assert len(trn_series) == len(target)\n", 74 | " #assert trn_series.name == tst_series.name\n", 75 | " temp = pd.concat([trn_series, target], axis=1)\n", 76 | " # Compute target mean\n", 77 | " averages = temp.groupby(by=trn_series.name)[target.name].agg([\"mean\", \"count\"])\n", 78 | " # Compute smoothing\n", 79 | " smoothing = 1 / (1 + np.exp(-(averages[\"count\"] - min_samples_leaf) / smoothing))\n", 80 | " # Apply 
average function to all target data\n", 81 | " prior = target.mean()\n", 82 | " # The bigger the count the less full_avg is taken into account\n", 83 | " averages[target.name] = prior * (1 - smoothing) + averages[\"mean\"] * smoothing\n", 84 | " averages.drop([\"mean\", \"count\"], axis=1, inplace=True)\n", 85 | " # Apply averages to trn and tst series\n", 86 | " ft_trn_series = pd.merge(\n", 87 | " trn_series.to_frame(trn_series.name),\n", 88 | " averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),\n", 89 | " on=trn_series.name,\n", 90 | " how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)\n", 91 | " # pd.merge does not keep the index so restore it\n", 92 | " ft_trn_series.index = trn_series.index\n", 93 | " ft_val_series = pd.merge(\n", 94 | " val_series.to_frame(val_series.name),\n", 95 | " averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),\n", 96 | " on=val_series.name,\n", 97 | " how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)\n", 98 | " ft_val_series.index = val_series.index\n", 99 | " \n", 100 | " ft_tst_series = pd.merge(\n", 101 | " tst_series.to_frame(tst_series.name),\n", 102 | " averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),\n", 103 | " on=tst_series.name,\n", 104 | " how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)\n", 105 | " # pd.merge does not keep the index so restore it\n", 106 | " ft_tst_series.index = tst_series.index\n", 107 | " return add_noise(ft_trn_series, noise_level), ft_val_series,ft_tst_series" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "0" 119 | ] 120 | }, 121 | "execution_count": 6, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "train = pd.read_csv('input/train.csv')\n", 128 | "test = pd.read_csv('input/test.csv')\n", 129 | "gc.collect()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 7, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "all_data = pd.concat([train,test])" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "user_clust = pd.read_csv('./input/user_cluster1.csv')\n", 148 | "all_data = all_data.merge(user_clust,on='user_id',how='left')" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 10, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "all_data['send_date'] = all_data.send_date.apply(lambda x: pd.datetime.strptime(x,'%d-%m-%Y %H:%M'))" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 11, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "all_data['send_dayofweek'] = all_data.send_date.dt.dayofweek" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 12, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "data": { 176 | "text/plain": [ 177 | "Index(['campaign_id', 'id', 'is_click', 'is_open', 'send_date', 'user_id',\n", 178 | " 'clust_id', 'send_dayofweek'],\n", 179 | " dtype='object')" 180 | ] 181 | }, 182 | "execution_count": 12, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "all_data.columns" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 13, 194 | 
"metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "#count features\n", 198 | "all_data['cnt_sd'] = all_data.groupby('send_date')['user_id'].transform('count')" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 14, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "all_data = all_data.merge(campaign1,on='campaign_id',how='left')" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 15, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "Index(['campaign_id', 'id', 'is_click', 'is_open', 'send_date', 'user_id',\n", 219 | " 'clust_id', 'send_dayofweek', 'cnt_sd', 'communication_type',\n", 220 | " 'total_links', 'no_of_internal_links', 'no_of_images',\n", 221 | " 'no_of_sections'],\n", 222 | " dtype='object')" 223 | ] 224 | }, 225 | "execution_count": 15, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "all_data.columns" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 16, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "le1 = LabelEncoder()\n", 241 | "all_data.loc[:,'communication_type'] = le1.fit_transform(all_data.communication_type) \n", 242 | "all_data['usr_cnt'] = all_data.groupby('user_id')['user_id'].transform('count')\n", 243 | "all_data['cm_cnt'] = np.log(all_data.groupby('communication_type')['communication_type'].transform('count'))\n", 244 | "#all_data['camp_cnt'] = all_data.groupby('campaign_id')['campaign_id'].transform('count')" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 17, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "test = all_data[len(train):]\n", 254 | "train = all_data[:len(train)]" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 18, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "35" 266 | ] 267 | }, 268 | "execution_count": 18, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "#del all_data\n", 275 | "gc.collect()" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 73, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "xgb_params = {}\n", 285 | "xgb_params['eta'] = 0.07\n", 286 | "xgb_params['max_depth'] = 5\n", 287 | "xgb_params['max_leaves'] = 31\n", 288 | "xgb_params['max_bin'] = 10\n", 289 | "xgb_params['min_child_weight '] = 100\n", 290 | "xgb_params['subsample'] = 0.6\n", 291 | "xgb_params['colsample_bytree'] = 0.77\n", 292 | "xgb_params['objective'] = 'binary:logistic'\n", 293 | "xgb_params['eval_metric'] = 'auc'\n", 294 | "xgb_params['verbose'] = 1\n", 295 | "xgb_params['scale_pos_weight'] = 1.\n", 296 | "\n", 297 | "xgb_params['max_bin']=10\n", 298 | "xgb_params['max_delta_step']=1\n", 299 | "xgb_params['nthread']=7\n", 300 | "xgb_params['booster']='gbtree'" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 91, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "name": "stderr", 310 | "output_type": "stream", 311 | "text": [ 312 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:6: SettingWithCopyWarning: \n", 313 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 314 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 315 | "\n", 316 | "See the caveats in the documentation: 
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 317 | " \n", 318 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:8: SettingWithCopyWarning: \n", 319 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 320 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 321 | "\n", 322 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 323 | " \n" 324 | ] 325 | }, 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "Fold: 1\n", 331 | "val_cid [29 30 31 32 33 34 35 36 37 38 39 40 41]\n", 332 | "(588141, 16) (435050, 16)\n" 333 | ] 334 | }, 335 | { 336 | "name": "stderr", 337 | "output_type": "stream", 338 | "text": [ 339 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:357: SettingWithCopyWarning: \n", 340 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 341 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 342 | "\n", 343 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 344 | " self.obj[key] = _infer_fill_value(value)\n", 345 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:537: SettingWithCopyWarning: \n", 346 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 347 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 348 | "\n", 349 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 350 | " self.obj[item] = s\n", 351 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:61: SettingWithCopyWarning: \n", 352 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 353 | "\n", 354 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 355 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:63: SettingWithCopyWarning: \n", 356 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 357 | "\n", 358 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 359 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:69: SettingWithCopyWarning: \n", 360 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 361 | "\n", 362 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 363 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:70: SettingWithCopyWarning: \n", 364 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 365 | "\n", 366 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 367 | ] 368 | }, 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "[0]\teval-auc:0.507239\ttrain-auc:0.630868\n", 374 | "Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.\n", 375 | "\n", 376 | "Will train until train-auc hasn't 
improved in 150 rounds.\n", 377 | "[10]\teval-auc:0.549008\ttrain-auc:0.941565\n", 378 | "[20]\teval-auc:0.54935\ttrain-auc:0.970986\n", 379 | "[30]\teval-auc:0.621005\ttrain-auc:0.985933\n", 380 | "[40]\teval-auc:0.623506\ttrain-auc:0.986568\n", 381 | "[50]\teval-auc:0.63699\ttrain-auc:0.987577\n", 382 | "[60]\teval-auc:0.643214\ttrain-auc:0.988096\n", 383 | "[69]\teval-auc:0.640067\ttrain-auc:0.98841\n" 384 | ] 385 | }, 386 | { 387 | "name": "stderr", 388 | "output_type": "stream", 389 | "text": [ 390 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:89: SettingWithCopyWarning: \n", 391 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 392 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 393 | "\n", 394 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 395 | ] 396 | }, 397 | { 398 | "name": "stdout", 399 | "output_type": "stream", 400 | "text": [ 401 | "[0]\teval-auc:0.513118\ttrain-auc:0.699374\n", 402 | "Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.\n", 403 | "\n", 404 | "Will train until train-auc hasn't improved in 150 rounds.\n", 405 | "[10]\teval-auc:0.549338\ttrain-auc:0.963648\n", 406 | "[20]\teval-auc:0.549279\ttrain-auc:0.972213\n", 407 | "[30]\teval-auc:0.585382\ttrain-auc:0.981735\n", 408 | "[40]\teval-auc:0.628757\ttrain-auc:0.986893\n", 409 | "[50]\teval-auc:0.648687\ttrain-auc:0.987708\n", 410 | "[60]\teval-auc:0.644673\ttrain-auc:0.988209\n", 411 | "[69]\teval-auc:0.643383\ttrain-auc:0.988605\n" 412 | ] 413 | }, 414 | { 415 | "name": "stderr", 416 | "output_type": "stream", 417 | "text": [ 418 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:194: SettingWithCopyWarning: \n", 419 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 420 | "\n", 421 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 422 | " self._setitem_with_indexer(indexer, value)\n", 423 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:88: SettingWithCopyWarning: \n", 424 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 425 | "\n", 426 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 427 | ] 428 | }, 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "[0]\teval-auc:0.508098\ttrain-auc:0.654833\n", 434 | "Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.\n", 435 | "\n", 436 | "Will train until train-auc hasn't improved in 150 rounds.\n", 437 | "[10]\teval-auc:0.549292\ttrain-auc:0.96816\n", 438 | "[20]\teval-auc:0.549248\ttrain-auc:0.971653\n", 439 | "[30]\teval-auc:0.556878\ttrain-auc:0.977378\n", 440 | "[40]\teval-auc:0.630023\ttrain-auc:0.986505\n", 441 | "[50]\teval-auc:0.637828\ttrain-auc:0.986918\n", 442 | "[60]\teval-auc:0.647163\ttrain-auc:0.988098\n", 443 | "[69]\teval-auc:0.644905\ttrain-auc:0.98856\n", 444 | "Fold: 2\n", 445 | "val_cid [42 43 44 45 46 47 48 49 50 51 52 53 54]\n", 446 | "(435050, 16) (588141, 16)\n", 447 | "[0]\teval-auc:0.521143\ttrain-auc:0.660784\n", 448 | "Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.\n", 449 | "\n", 450 | "Will train until train-auc hasn't 
improved in 150 rounds.\n", 451 | "[10]\teval-auc:0.574698\ttrain-auc:0.959211\n", 452 | "[20]\teval-auc:0.582151\ttrain-auc:0.966154\n", 453 | "[30]\teval-auc:0.64668\ttrain-auc:0.981158\n", 454 | "[40]\teval-auc:0.646689\ttrain-auc:0.982213\n", 455 | "[50]\teval-auc:0.646649\ttrain-auc:0.982668\n", 456 | "[60]\teval-auc:0.669827\ttrain-auc:0.98389\n", 457 | "[69]\teval-auc:0.679231\ttrain-auc:0.984133\n", 458 | "[0]\teval-auc:0.513133\ttrain-auc:0.657905\n", 459 | "Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.\n", 460 | "\n", 461 | "Will train until train-auc hasn't improved in 150 rounds.\n", 462 | "[10]\teval-auc:0.574706\ttrain-auc:0.958659\n", 463 | "[20]\teval-auc:0.583715\ttrain-auc:0.96735\n", 464 | "[30]\teval-auc:0.650561\ttrain-auc:0.980521\n", 465 | "[40]\teval-auc:0.653512\ttrain-auc:0.982558\n", 466 | "[50]\teval-auc:0.661876\ttrain-auc:0.983443\n", 467 | "[60]\teval-auc:0.66456\ttrain-auc:0.983772\n", 468 | "[69]\teval-auc:0.663491\ttrain-auc:0.983995\n", 469 | "[0]\teval-auc:0.52116\ttrain-auc:0.693952\n", 470 | "Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.\n", 471 | "\n", 472 | "Will train until train-auc hasn't improved in 150 rounds.\n", 473 | "[10]\teval-auc:0.574725\ttrain-auc:0.960942\n", 474 | "[20]\teval-auc:0.602752\ttrain-auc:0.9715\n", 475 | "[30]\teval-auc:0.646011\ttrain-auc:0.980437\n", 476 | "[40]\teval-auc:0.650082\ttrain-auc:0.982535\n", 477 | "[50]\teval-auc:0.669828\ttrain-auc:0.983306\n", 478 | "[60]\teval-auc:0.673233\ttrain-auc:0.983707\n", 479 | "[69]\teval-auc:0.676993\ttrain-auc:0.984073\n" 480 | ] 481 | } 482 | ], 483 | "source": [ 484 | "nfold = 2\n", 485 | "kf = KFold(n_splits=nfold, shuffle=False)  # unshuffled, so the campaign split is deterministic\n", 486 | "unq_campaign_id = np.sort(train.campaign_id.unique())\n", 487 | "\n", 488 | "test_subm = test[['id']].copy()\n", 489 | "test_subm['is_click'] = 0\n", 490 | "train_score = train[['is_click']].copy()\n", 491 | "train_score['pred'] = 0\n", 492 | "nbag = 3\n", 493 | "cf = 0\n", 494 | "for train_index, test_index in kf.split(unq_campaign_id):\n", 495 | " cf += 1\n", 496 | " print('Fold:', cf)\n", 497 | " \n", 498 | " test1 = test.copy()\n", 499 | " tr_cid = unq_campaign_id[train_index]\n", 500 | " val_cid = unq_campaign_id[test_index]\n", 501 | " print('val_cid', val_cid)\n", 502 | "\n", 503 | " val = train[train.campaign_id.isin(tr_cid)]  # note the swap: KFold's 'train' campaigns are held out for evaluation\n", 504 | " train1 = train[train.campaign_id.isin(val_cid)]  # ...while its 'test' campaigns are used for fitting\n", 505 | " print(val.shape, train1.shape)\n", 506 | "\n", 507 | " a1, a2, a3 = target_encode(train1['user_id'], val['user_id'],\n", 508 | " test1['user_id'], train1.is_click, noise_level=.9, smoothing=5)\n", 509 | " train1.loc[:,'mean_is_click'] = a1\n", 510 | " val.loc[:,'mean_is_click'] = a2\n", 511 | " test1.loc[:,'mean_is_click'] = a3\n", 512 | "\n", 513 | "\n", 514 | " a1, a2, a3 = target_encode(train1['user_id'], val['user_id'],\n", 515 | " test1['user_id'], train1.is_open, noise_level=.9, smoothing=1.)\n", 516 | " train1.loc[:,'mean_is_open'] = a1\n", 517 | " val.loc[:,'mean_is_open'] = a2\n", 518 | " test1.loc[:,'mean_is_open'] = a3\n", 519 | "\n", 520 | "\n", 521 | " a1, a2, a3 = target_encode(train1['communication_type'], val['communication_type'],\n", 522 | " test1['communication_type'], train1.is_open, noise_level=0)\n", 523 | " train1.loc[:,'mean_ct'] = a1\n", 524 | " val.loc[:,'mean_ct'] = a2\n", 525 | " test1.loc[:,'mean_ct'] = a3\n", 526 | "\n", 527 | " #a1,a2,a3 = target_encode(train1['communication_type'],val['communication_type'],\n", 528 | " # test1['communication_type'],train1.is_click,noise_level=0)\n", 529 | " #train1.loc[:,'mean_clk_ct'] = a1\n", 530 | " #val.loc[:,'mean_clk_ct'] = a2\n", 531 | " #test1.loc[:,'mean_clk_ct'] = a3\n", 532 | "\n", 533 | "\n", 534 | " a1, a2, a3 = target_encode(train1['clust_id'], val['clust_id'],\n", 535 | " test1['clust_id'], train1.is_click, noise_level=0)\n", 536 | " train1.loc[:,'mean_clk_clust_id'] = a1\n", 537 | " val.loc[:,'mean_clk_clust_id'] = a2\n", 538 | " test1.loc[:,'mean_clk_clust_id'] = a3\n", 539 | "\n", 540 | "\n", 541 | "\n", 542 | " gc.collect()\n", 543 | " val.drop(['id', 'campaign_id', 'is_open', 'send_date',\n", 544 | " 'user_id', 'no_of_images', 'no_of_sections', 'no_of_internal_links'], axis=1, inplace=True)\n", 545 | " train1.drop(['id', 'campaign_id', 'is_open', 'send_date',\n", 546 | " 'user_id', 'no_of_images', 'no_of_sections', 'no_of_internal_links'], axis=1, inplace=True)\n", 547 | " test1.drop(['id', 'campaign_id', 'is_open', 'send_date',\n", 548 | " 'user_id', 'no_of_images', 'no_of_sections', 'no_of_internal_links'], axis=1, inplace=True)\n", 549 | " gc.collect()\n", 550 | " train_y = train1.is_click.values\n", 551 | " val_y = val.is_click.values\n", 552 | " val.drop(['is_click'], axis=1, inplace=True)\n", 553 | " train1.drop(['is_click'], axis=1, inplace=True)\n", 554 | " test1.drop(['is_click'], axis=1, inplace=True)\n", 555 | " \n", 556 | " dtrain = xgb.DMatrix(train1, label=train_y)\n", 557 | " dval = xgb.DMatrix(val[train1.columns], label=val_y)\n", 558 | " dtest = xgb.DMatrix(test1[train1.columns])\n", 559 | " gc.collect()\n", 560 | " \n", 561 | " evals_results = {}\n", 562 | " np.random.seed(0)\n", 563 | " \n", 564 | " for bg in range(nbag):\n", 565 | " xgb_params['seed'] = 100*cf + bg  # a distinct seed per fold/bag for ensemble diversity\n", 566 | " watchlist = [(dval, 'eval'), (dtrain, 'train')]\n", 567 | "\n", 568 | " bst = xgb.train(xgb_params, dtrain, 70, watchlist, early_stopping_rounds=150,\n", 569 | " verbose_eval=10, maximize=True)  # a fixed 70 rounds, so early stopping never triggers\n", 570 | " \n", 571 | " train_score.loc[val.index, 'pred'] += bst.predict(dval)\n", 572 | " test_subm['is_click'] += bst.predict(dtest)\n", 573 | " " 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 93, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "from sklearn.metrics import roc_auc_score" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 94, 588 | "metadata": {}, 589 | "outputs": [ 590 | { 591 | "name": "stderr", 592 | "output_type": "stream", 593 | "text": [ 594 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 595 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 596 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 597 | "\n", 598 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 599 | " \"\"\"Entry point for launching an IPython kernel.\n" 600 | ] 601 | } 602 | ], 603 | "source": [ 604 | "train_score['pred'] /= nbag  # each train row was scored by nbag bagged models" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 95, 610 | "metadata": {}, 611 | "outputs": [ 612 | { 613 | "data": { 614 | "text/plain": [ 615 | "( is_click pred\n", 616 | " 0 0.0 0.007969\n", 617 | " 1 0.0 0.010713\n", 618 | " 2 0.0 0.008167\n", 619 | " 3 0.0 0.008279\n", 620 | " 4 0.0 0.007871, is_click pred\n", 621 | " 1023186 0.0 0.008056\n", 622 | " 1023187 0.0 0.009954\n", 623 | " 1023188 1.0 0.007931\n", 624 | " 1023189 0.0 0.009988\n", 625 | " 1023190 0.0 0.007871)" 626 | ] 627 | }, 628
| "execution_count": 95, 629 | "metadata": {}, 630 | "output_type": "execute_result" 631 | } 632 | ], 633 | "source": [ 634 | "train_score.head(5),train_score.tail(5)" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 96, 640 | "metadata": {}, 641 | "outputs": [ 642 | { 643 | "data": { 644 | "text/plain": [ 645 | "0.6603355706439776" 646 | ] 647 | }, 648 | "execution_count": 96, 649 | "metadata": {}, 650 | "output_type": "execute_result" 651 | } 652 | ], 653 | "source": [ 654 | "roc_auc_score(train_score.is_click,train_score.pred)" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 97, 660 | "metadata": {}, 661 | "outputs": [ 662 | { 663 | "name": "stderr", 664 | "output_type": "stream", 665 | "text": [ 666 | "C:\\Users\\esinadi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 667 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 668 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 669 | "\n", 670 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 671 | " \"\"\"Entry point for launching an IPython kernel.\n" 672 | ] 673 | } 674 | ], 675 | "source": [ 676 | "test_subm['is_click'] /= nfold*nbag" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": 98, 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [ 685 | "test_subm.to_csv('./xgb_2fold-cv2_bag3_nt70_scalepos1_nt70.csv',index=False)" 686 | ] 687 | } 688 | ], 689 | "metadata": { 690 | "kernelspec": { 691 | "display_name": "Python 3", 692 | "language": "python", 693 | "name": "python3" 694 | }, 695 | "language_info": { 696 | "codemirror_mode": { 697 | "name": "ipython", 698 | "version": 3 699 | }, 700 | "file_extension": ".py", 701 | "mimetype": "text/x-python", 702 | "name": "python", 703 | "nbconvert_exporter": "python", 704 | "pygments_lexer": "ipython3", 705 | "version": "3.6.4" 706 | } 707 | }, 708 | "nbformat": 4, 709 | "nbformat_minor": 2 710 | } 711 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Codes related to Lord of the Machines 2 | 3 | The repository contains Top-3 Winning solutions of "Lord of the Machines" (Competition Link : [https://datahack.analyticsvidhya.com/contest/lord-of-the-machines/](https://datahack.analyticsvidhya.com/contest/lord-of-the-machines/) ) 4 | 5 | Note: Although winning solutions are provided for use, datasets are not provided as the datasets are proprietary and do not comply with the License. 6 | --------------------------------------------------------------------------------