├── README.md ├── Final_blend.ipynb ├── XGBOOST_notebook.ipynb ├── ANOTHERLGBMODEL.ipynb ├── new_data_creation.ipynb └── indianda.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Please read this before you run the kernels 2 | 3 | 1- RUN ANOTHERLGBMODEL.ipynb to get ANOTHERLGB.csv 4 | 5 | 2- RUN new_data_creation.ipynb to create the new dataset. Here is a link to the data (it is a private dataset): www.kaggle.com/dataset/198175af5d025f8e7e9d6f28f34b6d7ecfc58f242d4049bdc6b50462e5821b44 6 | Note: of course, I will not share this link when I share the solution. 7 | 8 | 3- RUN submission1 to get submission1.csv 9 | 10 | 4- RUN XGBOOST_notebook.ipynb to get subxgb.csv 11 | 12 | 5- RUN _submission2.ipynb to get submission2.csv 13 | 14 | 6- RUN EKHERMOU7AWLA.ipynb to get ekher_mou7awla.csv 15 | 16 | 7- RUN indianda.ipynb to get indiana.csv 17 | 18 | 19 | 8- After running those kernels, you can run Final_blend.ipynb to get sub_finale.csv 20 | 21 | 22 | 23 | ## [On the Leaderboard](https://zindi.africa/competitions/airqo-ugandan-air-quality-forecast-challenge/leaderboard) 24 | 25 | Look for the team named: **Forecasters**
26 | Rank : 12/501 27 | 28 | ## Authors 29 | 30 |

31 | 32 | | Name | Zindi ID | Github ID | 33 | |----------------|--------------------------------------------------|------------------------------------------| 34 | |Azer KSOURI |[@plndz](https://zindi.africa/users/plndz) |[@Az-Ks](https://github.com/Az-Ks) | 35 | |Helmi Klai |[@Klai](https://zindi.africa/users/Klai) |[@Klaimohelmi](https://github.com/Klaimohelmi) | 36 | |Muhamed TUO |[@Muhamed_Tuo](https://zindi.africa/users/Muhamed_Tuo) |[@NazarioR9](https://github.com/NazarioR9)| 37 | |Saurabh Kumar |[@Saurabh502](https://zindi.africa/users/Saurabh502) | | 38 | 39 |

40 | -------------------------------------------------------------------------------- /Final_blend.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Final_blend.ipynb", 7 | "provenance": [], 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "view-in-github", 20 | "colab_type": "text" 21 | }, 22 | "source": [ 23 | " $\"Open$ " 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": { 29 | "id": "24e_LkQNwtip", 30 | "colab_type": "text" 31 | }, 32 | "source": [ 33 | "FIRST BLEND" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "m1VpWdXKvMih", 40 | "colab_type": "code", 41 | "colab": {} 42 | }, 43 | "source": [ 44 | "import pandas as pd \n", 45 | "\n", 46 | "\n", 47 | "submission1 = pd.read_csv('submission1.csv')\n", 48 | "sub_xgb = pd.read_csv('subxgb.csv')\n", 49 | "\n", 50 | "submission_ = submission1.copy()\n", 51 | "\n", 52 | "submission_['target'] = sub_xgb['target']*0.3 + submission1['target']*0.7" 53 | ], 54 | "execution_count": 1, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "id": "lT0xCG7pwxcW", 61 | "colab_type": "text" 62 | }, 63 | "source": [ 64 | "**SECOND BLEND**" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "metadata": { 70 | "id": "dNTJwhfbw3ZH", 71 | "colab_type": "code", 72 | "colab": {} 73 | }, 74 | "source": [ 75 | "submission2 =pd.read_csv('submission2.csv')\n", 76 | "comb1 = submission2.copy()  # comb1 was never defined in this notebook (NameError on a clean run); start from submission2 so IDs and row order are kept\n", 77 | "comb1['target'] = submission_['target']*0.5 + submission2['target']*0.5" 78 | ], 79 | "execution_count": 7, 80 | "outputs": [] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": { 85 | "id": "zhJJ5yA6xbsW", 86 | "colab_type": "text" 87 | }, 88 | "source": [ 89 | "**FINAL BLEND 
BLEND**" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "metadata": { 95 | "id": "UDXHuUPexfV4", 96 | "colab_type": "code", 97 | "colab": {} 98 | }, 99 | "source": [ 100 | "#comb1 * 0.4288 + indianda * 0.1072 + liyjib37.07lgb * 0.264 + 0.2 * ekher mouhawla\n", 101 | "\n", 102 | "\n", 103 | "indiana =pd.read_csv('indiana.csv')\n", 104 | "another_lgb = pd.read_csv('ANOTHERLGB.csv')\n", 105 | "ekher_mou7awla =pd.read_csv('ekher_mou7awla.csv')\n", 106 | "\n", 107 | "\n", 108 | "\n", 109 | "sub_finale= indiana.copy()\n", 110 | "\n", 111 | "sub_finale['target'] = comb1['target']*0.4288 + indiana['target']*0.1072 + another_lgb['target']* 0.264 + ekher_mou7awla['target']*0.2" 112 | ], 113 | "execution_count": 9, 114 | "outputs": [] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "metadata": { 119 | "id": "e0vbG9Ez0bmI", 120 | "colab_type": "code", 121 | "colab": {} 122 | }, 123 | "source": [ 124 | "sub_finale.to_csv('sub_finale.csv',index=False)" 125 | ], 126 | "execution_count": 10, 127 | "outputs": [] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "metadata": { 132 | "id": "O48UkdYhYWA3", 133 | "colab_type": "code", 134 | "colab": {} 135 | }, 136 | "source": [ 137 | "" 138 | ], 139 | "execution_count": null, 140 | "outputs": [] 141 | } 142 | ] 143 | } -------------------------------------------------------------------------------- /XGBOOST_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.6.8" 21 | }, 22 | "colab": { 23 | "name": "XGBOOST_.ipynb", 24 | "provenance": [], 
25 | "collapsed_sections": [], 26 | "include_colab_link": true 27 | }, 28 | "accelerator": "GPU" 29 | }, 30 | "cells": [ 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "view-in-github", 35 | "colab_type": "text" 36 | }, 37 | "source": [ 38 | " $\"Open$ " 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "ZbVtKR2DolJZ", 45 | "colab_type": "text" 46 | }, 47 | "source": [ 48 | "## **PLEASE CHANGE YOUR RUNTIME TYPE TO GPU**" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "metadata": { 54 | "id": "lygrd_4THhsP", 55 | "colab_type": "code", 56 | "colab": {} 57 | }, 58 | "source": [ 59 | "import pandas as pd \n", 60 | "import numpy as np \n", 61 | "from tqdm import tqdm\n", 62 | "import math\n", 63 | "import gc" 64 | ], 65 | "execution_count": 0, 66 | "outputs": [] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "metadata": { 71 | "id": "QDbr-4b3nuGb", 72 | "colab_type": "code", 73 | "colab": {} 74 | }, 75 | "source": [ 76 | "from google.colab import drive\n", 77 | "drive.mount('/content/drive')" 78 | ], 79 | "execution_count": 0, 80 | "outputs": [] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "metadata": { 85 | "id": "ZZF7jjBgnuJa", 86 | "colab_type": "code", 87 | "colab": {} 88 | }, 89 | "source": [ 90 | "path = '/content/drive/My Drive/'" 91 | ], 92 | "execution_count": 0, 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "metadata": { 98 | "id": "tRthNH90Hhsa", 99 | "colab_type": "code", 100 | "colab": {} 101 | }, 102 | "source": [ 103 | "train=pd.read_csv(\"/content/drive/My Drive/Train (14).csv\")\n", 104 | "test=pd.read_csv(\"/content/drive/My Drive/Test (8).csv\")" 105 | ], 106 | "execution_count": 0, 107 | "outputs": [] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "id": "mApRDmFGHhsz", 113 | "colab_type": "text" 114 | }, 115 | "source": [ 116 | "### Feature engineering part" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "metadata": { 122 | "id": 
"7lskkR2rHhss", 123 | "colab_type": "code", 124 | "colab": {} 125 | }, 126 | "source": [ 127 | "# covert features fron string to List of values \n", 128 | "def replace_nan(x):\n", 129 | " if x==\" \":\n", 130 | " return np.nan\n", 131 | " else :\n", 132 | " return float(x)\n", 133 | "features=[\"temp\",\"precip\",\"rel_humidity\",\"wind_dir\",\"wind_spd\",\"atmos_press\"]\n", 134 | "for feature in features : \n", 135 | " train[feature]=train[feature].apply(lambda x: [ replace_nan(X) for X in x.replace(\"nan\",\" \").split(\",\")])\n", 136 | " test[feature]=test[feature].apply(lambda x: [ replace_nan(X) for X in x.replace(\"nan\",\" \").split(\",\")]) " 137 | ], 138 | "execution_count": 0, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "metadata": { 144 | "id": "sg8-zujFHhs1", 145 | "colab_type": "code", 146 | "colab": {} 147 | }, 148 | "source": [ 149 | "def aggregate_features(x,col_name):\n", 150 | " x[\"max_\"+col_name]=x[col_name].apply(np.max)\n", 151 | " x[\"min_\"+col_name]=x[col_name].apply(np.min)\n", 152 | " x[\"mean_\"+col_name]=x[col_name].apply(np.mean)\n", 153 | " x[\"std_\"+col_name]=x[col_name].apply(np.std)\n", 154 | " #x[\"var_\"+col_name]=x[col_name].apply(np.var)\n", 155 | " x[\"sum_\"+col_name]=x[col_name].apply(np.sum)\n", 156 | " x[\"ptp_\"+col_name]=x[col_name].apply(np.ptp)\n", 157 | " return x \n", 158 | "def remove_nan_values(x):\n", 159 | " return [e for e in x if not math.isnan(e)]" 160 | ], 161 | "execution_count": 0, 162 | "outputs": [] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "metadata": { 167 | "id": "jSHXJxduHhs6", 168 | "colab_type": "code", 169 | "outputId": "6acc98a4-5bdd-480f-fa0a-19bb8d5d406c", 170 | "colab": { 171 | "base_uri": "https://localhost:8080/", 172 | "height": 170 173 | } 174 | }, 175 | "source": [ 176 | "data=pd.concat([train,test],sort=False).reset_index(drop=True)\n", 177 | "data.columns.tolist()" 178 | ], 179 | "execution_count": 0, 180 | "outputs": [ 181 | { 182 | "output_type": 
"execute_result", 183 | "data": { 184 | "text/plain": [ 185 | "['ID',\n", 186 | " 'location',\n", 187 | " 'temp',\n", 188 | " 'precip',\n", 189 | " 'rel_humidity',\n", 190 | " 'wind_dir',\n", 191 | " 'wind_spd',\n", 192 | " 'atmos_press',\n", 193 | " 'target']" 194 | ] 195 | }, 196 | "metadata": { 197 | "tags": [] 198 | }, 199 | "execution_count": 60 200 | } 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "metadata": { 206 | "id": "K4xhuB8H3Wwb", 207 | "colab_type": "code", 208 | "colab": {} 209 | }, 210 | "source": [ 211 | "for x in range(121):\n", 212 | " data[\"newtemp\"+ str(x)] = data.temp.str[x]\n", 213 | " data[\"newprecip\"+ str(x)] = data.precip.str[x]\n", 214 | " data[\"newrel_humidity\"+ str(x)] = data.rel_humidity.str[x]\n", 215 | " data[\"newwind_dir\"+ str(x)] = data.wind_dir.str[x]\n", 216 | " data[\"windspeed\"+ str(x)] = data.wind_spd.str[x]\n", 217 | " data[\"atmospherepressure\"+ str(x)] = data.atmos_press.str[x]" 218 | ], 219 | "execution_count": 0, 220 | "outputs": [] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "metadata": { 225 | "id": "1dYviSGeHhs_", 226 | "colab_type": "code", 227 | "outputId": "410c532d-f019-4c35-fe11-191f48aa8641", 228 | "colab": { 229 | "base_uri": "https://localhost:8080/", 230 | "height": 34 231 | } 232 | }, 233 | "source": [ 234 | "data.shape" 235 | ], 236 | "execution_count": 0, 237 | "outputs": [ 238 | { 239 | "output_type": "execute_result", 240 | "data": { 241 | "text/plain": [ 242 | "(20574, 735)" 243 | ] 244 | }, 245 | "metadata": { 246 | "tags": [] 247 | }, 248 | "execution_count": 62 249 | } 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "metadata": { 255 | "id": "tROUcB5p6esB", 256 | "colab_type": "code", 257 | "outputId": "497fd88b-da88-4aa0-e363-ee3fb0b97277", 258 | "colab": { 259 | "base_uri": "https://localhost:8080/", 260 | "height": 34 261 | } 262 | }, 263 | "source": [ 264 | "len(data.precip[1])" 265 | ], 266 | "execution_count": 0, 267 | "outputs": [ 268 | { 269 | 
"output_type": "execute_result", 270 | "data": { 271 | "text/plain": [ 272 | "121" 273 | ] 274 | }, 275 | "metadata": { 276 | "tags": [] 277 | }, 278 | "execution_count": 63 279 | } 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "metadata": { 285 | "id": "cNcORQK-HhtG", 286 | "colab_type": "code", 287 | "outputId": "6407285c-9708-4cb5-f226-46c83fefc96f", 288 | "colab": { 289 | "base_uri": "https://localhost:8080/", 290 | "height": 34 291 | } 292 | }, 293 | "source": [ 294 | "for col_name in tqdm(features):\n", 295 | " data[col_name]=data[col_name].apply(remove_nan_values)" 296 | ], 297 | "execution_count": 0, 298 | "outputs": [ 299 | { 300 | "output_type": "stream", 301 | "text": [ 302 | "100%|██████████| 6/6 [00:02<00:00, 2.45it/s]\n" 303 | ], 304 | "name": "stderr" 305 | } 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "metadata": { 311 | "id": "K9iStUBEHhtO", 312 | "colab_type": "code", 313 | "outputId": "d81d1df4-a973-4d50-933b-50197a69cc6c", 314 | "colab": { 315 | "base_uri": "https://localhost:8080/", 316 | "height": 34 317 | } 318 | }, 319 | "source": [ 320 | "for col_name in tqdm(features):\n", 321 | " data=aggregate_features(data,col_name)" 322 | ], 323 | "execution_count": 0, 324 | "outputs": [ 325 | { 326 | "output_type": "stream", 327 | "text": [ 328 | "100%|██████████| 6/6 [00:14<00:00, 2.45s/it]\n" 329 | ], 330 | "name": "stderr" 331 | } 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "metadata": { 337 | "id": "aiAM7WA3HhtR", 338 | "colab_type": "code", 339 | "colab": {} 340 | }, 341 | "source": [ 342 | "data.drop(features,1,inplace=True)" 343 | ], 344 | "execution_count": 0, 345 | "outputs": [] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "metadata": { 350 | "id": "aG-kLSttrM41", 351 | "colab_type": "code", 352 | "colab": {} 353 | }, 354 | "source": [ 355 | "hum_features = list(data.filter(regex='rel_humidity.*').columns)\n", 356 | "temp_features = list( data.filter(regex='newtemp.*').columns) \n", 357 | 
"precip_features = list(data.filter(regex='newprecip*').columns)\n", 358 | "winddir_features = list( data.filter(regex='newwind_dir.*').columns)\n", 359 | "windspead_features = list( data.filter(regex='windspeed.*').columns)\n", 360 | "atm_features = list(data.filter(regex='atmos.*').columns)\n", 361 | "\n", 362 | "\n", 363 | "\n", 364 | "\n", 365 | "hum_features= hum_features[36:]\n", 366 | "temp_features=temp_features[36:] \n", 367 | "precip_features=precip_features[31:]\n", 368 | "winddir_features=winddir_features[36:]\n", 369 | "windspead_features=windspead_features[36:]\n", 370 | "atm_features=atm_features[36:]\n", 371 | "\n", 372 | "\n", 373 | "\n", 374 | "\n", 375 | "\n", 376 | "data[hum_features]= data[hum_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n", 377 | "\n", 378 | "\n", 379 | "\n", 380 | "data[temp_features]= data[temp_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n", 381 | "\n", 382 | "\n", 383 | "data[precip_features]= data[precip_features].apply(lambda x: x.fillna(float(0.0)),axis=1)\n", 384 | "\n", 385 | "\n", 386 | "\n", 387 | "data[winddir_features]= data[winddir_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n", 388 | "\n", 389 | "\n", 390 | "\n", 391 | "data[windspead_features]= data[windspead_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n", 392 | "\n", 393 | "\n", 394 | "\n", 395 | "\n", 396 | "data[atm_features]= data[atm_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n" 397 | ], 398 | "execution_count": 0, 399 | "outputs": [] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "metadata": { 404 | "id": "b-inscoEHhtU", 405 | "colab_type": "code", 406 | "colab": {} 407 | }, 408 | "source": [ 409 | "train=data[data.target.notnull()].reset_index(drop=True)\n", 410 | "test=data[data.target.isna()].reset_index(drop=True)" 411 | ], 412 | "execution_count": 0, 413 | "outputs": [] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "metadata": { 418 | "id": "fS2h1hJcaASK", 419 | "colab_type": "code", 420 | 
"outputId": "63cf9ce6-6cf1-4b74-b36d-e0769a2df411", 421 | "colab": { 422 | "base_uri": "https://localhost:8080/", 423 | "height": 34 424 | } 425 | }, 426 | "source": [ 427 | "train.shape,test.shape" 428 | ], 429 | "execution_count": 0, 430 | "outputs": [ 431 | { 432 | "output_type": "execute_result", 433 | "data": { 434 | "text/plain": [ 435 | "((15539, 765), (5035, 765))" 436 | ] 437 | }, 438 | "metadata": { 439 | "tags": [] 440 | }, 441 | "execution_count": 69 442 | } 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": { 448 | "id": "mWcrHrV5CSPx", 449 | "colab_type": "text" 450 | }, 451 | "source": [ 452 | "###end of la3b" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "metadata": { 458 | "id": "EjI-PlKfvJno", 459 | "colab_type": "code", 460 | "colab": {} 461 | }, 462 | "source": [ 463 | "import pandas as pd\n", 464 | "import numpy as np\n", 465 | "import matplotlib.pyplot as plt\n", 466 | "import seaborn as sns\n", 467 | "import warnings\n", 468 | "warnings.filterwarnings('ignore')\n", 469 | "\n", 470 | "from math import sqrt \n", 471 | "import lightgbm as lgb\n", 472 | "from sklearn.metrics import mean_squared_error \n", 473 | "from sklearn.model_selection import KFold, train_test_split\n", 474 | "import requests\n", 475 | "from io import StringIO \n", 476 | "%matplotlib inline\n", 477 | "pd.set_option(\"display.max_rows\", 100)\n", 478 | "pd.set_option(\"display.max_columns\", 100)\n", 479 | "import numpy as np\n", 480 | "import matplotlib.pyplot as plt\n", 481 | "import seaborn as sns\n", 482 | "import warnings\n", 483 | "warnings.filterwarnings('ignore')\n", 484 | "\n", 485 | "from math import sqrt \n", 486 | "import lightgbm as lgb\n", 487 | "from sklearn.metrics import mean_squared_error \n", 488 | "from sklearn.model_selection import KFold, train_test_split\n", 489 | "import requests\n", 490 | "from io import StringIO \n", 491 | "%matplotlib inline\n", 492 | "pd.set_option(\"display.max_rows\", 100)\n", 493 | 
"pd.set_option(\"display.max_columns\", 100)" 494 | ], 495 | "execution_count": 0, 496 | "outputs": [] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "metadata": { 501 | "id": "hs2TN42eqkR6", 502 | "colab_type": "code", 503 | "outputId": "28ad2dd0-e4d3-4c36-e19a-f7de3d8c3db1", 504 | "colab": { 505 | "base_uri": "https://localhost:8080/", 506 | "height": 34 507 | } 508 | }, 509 | "source": [ 510 | "test.drop(\"target\",axis = 1,inplace = True)\n", 511 | "train.shape,test.shape" 512 | ], 513 | "execution_count": 0, 514 | "outputs": [ 515 | { 516 | "output_type": "execute_result", 517 | "data": { 518 | "text/plain": [ 519 | "((15539, 765), (5035, 764))" 520 | ] 521 | }, 522 | "metadata": { 523 | "tags": [] 524 | }, 525 | "execution_count": 71 526 | } 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "metadata": { 532 | "id": "TyT7Uu02rEDg", 533 | "colab_type": "code", 534 | "outputId": "4a027e7a-25ca-4afd-ab36-25077b690a38", 535 | "colab": { 536 | "base_uri": "https://localhost:8080/", 537 | "height": 34 538 | } 539 | }, 540 | "source": [ 541 | "from sklearn.preprocessing import LabelEncoder\n", 542 | "lab = LabelEncoder()\n", 543 | "lab.fit(train[\"location\"])\n", 544 | "train.location = lab.transform(train.location)\n", 545 | "test.location = lab.transform(test.location)\n", 546 | "train.target.min(),train.target.max()" 547 | ], 548 | "execution_count": 0, 549 | "outputs": [ 550 | { 551 | "output_type": "execute_result", 552 | "data": { 553 | "text/plain": [ 554 | "(1.4526190476190477, 475.82)" 555 | ] 556 | }, 557 | "metadata": { 558 | "tags": [] 559 | }, 560 | "execution_count": 72 561 | } 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "metadata": { 567 | "id": "xEBTcrsTrD_s", 568 | "colab_type": "code", 569 | "colab": {} 570 | }, 571 | "source": [ 572 | "X=train.drop([\"ID\",'target'],axis =1)\n", 573 | "y = train.target\n", 574 | "test_id = test['ID']" 575 | ], 576 | "execution_count": 0, 577 | "outputs": [] 578 | }, 579 | { 580 | 
"cell_type": "code", 581 | "metadata": { 582 | "id": "KKciRChkkgGR", 583 | "colab_type": "code", 584 | "colab": {} 585 | }, 586 | "source": [ 587 | "from sklearn.cluster import KMeans\n", 588 | "X=X.fillna(-9999)\n", 589 | "\n", 590 | "kmeans = KMeans(n_clusters=6).fit(X)\n", 591 | "X['cluster']=kmeans.predict(X)\n", 592 | "test=test.fillna(-9999)\n", 593 | "test['cluster']=kmeans.predict(test.drop([\"ID\"],axis = 1))\n", 594 | "\n", 595 | "\n" 596 | ], 597 | "execution_count": 0, 598 | "outputs": [] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "metadata": { 603 | "id": "OS2hPCB3kgN6", 604 | "colab_type": "code", 605 | "colab": {} 606 | }, 607 | "source": [ 608 | "air_temperature_filler = pd.DataFrame(X.groupby(['location','mean_wind_dir'])['mean_temp'].mean())\n", 609 | "X['mean temp per month']=0\n", 610 | "for i in range (len(X)):\n", 611 | " \n", 612 | " X['mean temp per month'][i]=air_temperature_filler.loc[(X['location'][i],X['mean_wind_dir'][i]), :]\n", 613 | "\n", 614 | "air_temperature_filler = pd.DataFrame(test.groupby(['location','mean_wind_dir'])['mean_temp'].mean())\n", 615 | "\n", 616 | "test['mean temp per month']=0\n", 617 | "for i in range (len(test)):\n", 618 | " \n", 619 | " test['mean temp per month'][i]=air_temperature_filler.loc[(test['location'][i],test['mean_wind_dir'][i]), :]" 620 | ], 621 | "execution_count": 0, 622 | "outputs": [] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "metadata": { 627 | "id": "N2ciDCovkgVu", 628 | "colab_type": "code", 629 | "colab": {} 630 | }, 631 | "source": [ 632 | "p0=['newtemp0', 'newprecip0', 'newrel_humidity0', 'newwind_dir0',\n", 633 | " 'windspeed0', 'atmospherepressure0']\n", 634 | "p4=['newtemp30', 'newprecip30', 'newrel_humidity30', 'newwind_dir30',\n", 635 | " 'windspeed30', 'atmospherepressure30'] \n", 636 | "p1=['newtemp60', 'newprecip60', 'newrel_humidity60', 'newwind_dir60',\n", 637 | " 'windspeed60', 'atmospherepressure60'] \n", 638 | "p2=['newtemp90', 'newprecip90', 
'newrel_humidity90', 'newwind_dir90',\n", 639 | " 'windspeed90', 'atmospherepressure90'] \n", 640 | "p3=['newtemp120', 'newprecip120', 'newrel_humidity120', 'newwind_dir120',\n", 641 | " 'windspeed120', 'atmospherepressure120'] \n", 642 | "from sklearn.decomposition import PCA, FastICA\n", 643 | "pca = PCA(random_state=42,n_components=1)\n", 644 | "train_pca = pca.fit_transform(X[p0])\n", 645 | "X['pca_0'] = train_pca[:,0]\n", 646 | "test_pca = pca.fit_transform(test[p0])\n", 647 | "test['pca_0'] = test_pca[:,0]\n", 648 | "train_pca = pca.fit_transform(X[p1])\n", 649 | "X['pca_1'] = train_pca[:,0]\n", 650 | "test_pca = pca.fit_transform(test[p1])\n", 651 | "test['pca_1'] = test_pca[:,0]\n", 652 | "train_pca = pca.fit_transform(X[p2])\n", 653 | "X['pca_2'] = train_pca[:,0]\n", 654 | "test_pca = pca.fit_transform(test[p2])\n", 655 | "test['pca_2'] = test_pca[:,0]\n", 656 | "\n", 657 | "train_pca = pca.fit_transform(X[p3])\n", 658 | "X['pca_3'] = train_pca[:,0]\n", 659 | "test_pca = pca.fit_transform(test[p3])\n", 660 | "test['pca_3'] = test_pca[:,0]\n", 661 | "\n", 662 | "train_pca = pca.fit_transform(X[p4])\n", 663 | "X['pca_4'] = train_pca[:,0]\n", 664 | "test_pca = pca.fit_transform(test[p4])\n", 665 | "test['pca_4'] = test_pca[:,0]\n", 666 | "\n", 667 | "test=test.drop(columns=['pca_2','pca_4'])\n", 668 | "X=X.drop(columns=['pca_4','pca_2'])" 669 | ], 670 | "execution_count": 0, 671 | "outputs": [] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "metadata": { 676 | "id": "9Ju_Y7wQkgTJ", 677 | "colab_type": "code", 678 | "colab": {} 679 | }, 680 | "source": [ 681 | "X=X.drop(columns=['newtemp105', 'newprecip105', 'newrel_humidity105', 'newwind_dir105',\n", 682 | " 'windspeed105', 'atmospherepressure105','newtemp9', 'newprecip9', 'newrel_humidity9', 'newwind_dir9',\n", 683 | " 'windspeed9', 'atmospherepressure9'])\n", 684 | "test=test.drop(columns=['newtemp105', 'newprecip105', 'newrel_humidity105', 'newwind_dir105',\n", 685 | " 'windspeed105', 
'atmospherepressure105','newtemp9', 'newprecip9', 'newrel_humidity9', 'newwind_dir9',\n", 686 | " 'windspeed9', 'atmospherepressure9']) " 687 | ], 688 | "execution_count": 0, 689 | "outputs": [] 690 | }, 691 | { 692 | "cell_type": "markdown", 693 | "metadata": { 694 | "id": "gSK7Goien71V", 695 | "colab_type": "text" 696 | }, 697 | "source": [ 698 | "## **hana and hanat are the data generated by ARIMA**\n", 699 | "## PLEASE CHECK THAT YOU HAVE UPLOADED THEM" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "metadata": { 705 | "id": "9PxHJ_n7eSiI", 706 | "colab_type": "code", 707 | "colab": {} 708 | }, 709 | "source": [ 710 | "hana=pd.read_csv('/content/winddirforecasttrain ADD THOSE COLUMNS TO TRAIN.csv')" 711 | ], 712 | "execution_count": 0, 713 | "outputs": [] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "metadata": { 718 | "id": "oSm55maceSu6", 719 | "colab_type": "code", 720 | "colab": {} 721 | }, 722 | "source": [ 723 | "hanat=pd.read_csv('/content/winddirforecasttest ADD THOSE FEATURES TO TEST.csv')" 724 | ], 725 | "execution_count": 0, 726 | "outputs": [] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "metadata": { 731 | "id": "ApLZ3O8lfLHE", 732 | "colab_type": "code", 733 | "outputId": "ef927c34-7793-4f7d-917a-c97463d50a0b", 734 | "colab": { 735 | "base_uri": "https://localhost:8080/", 736 | "height": 224 737 | } 738 | }, 739 | "source": [ 740 | "hanat.head()" 741 | ], 742 | "execution_count": 0, 743 | "outputs": [ 744 | { 745 | "output_type": "execute_result", 746 | "data": { 747 | "text/html": [ 748 | "

\n", 749 | "\n", 762 | "\n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | "

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17
0	197.250658	212.394441	219.223270	222.302612	223.691189	224.317343	224.599697	224.727019	224.784433	224.810323	224.821998	224.827262	224.829636	224.830707	224.831189	224.831407	224.831505	224.831549
1	146.588427	165.851197	176.443787	182.268648	185.471737	187.233115	188.201695	188.734318	189.027206	189.188266	189.276832	189.325535	189.352316	189.367044	189.375142	189.379595	189.382044	189.383391
2	165.153288	165.153288	165.153288	165.153288	165.153288	165.153288	165.153288	165.153288	165.153288	165.153288	165.153288	165.153288	165.153288	165.153288	165.153288	165.153288	165.153288	165.153288
3	268.745697	236.704786	220.128369	211.552532	207.115808	204.820462	203.632961	203.018605	202.700768	202.536334	202.451263	202.407252	202.384483	202.372703	202.366609	202.363456	202.361825	202.360981
4	185.847258	185.847258	185.847258	185.847258	185.847258	185.847258	185.847258	185.847258	185.847258	185.847258	185.847258	185.847258	185.847258	185.847258	185.847258	185.847258	185.847258	185.847258

\n", 894 | "

" 895 | ], 896 | "text/plain": [ 897 | " 0 1 2 3 4 5 \\\n", 898 | "0 197.250658 212.394441 219.223270 222.302612 223.691189 224.317343 \n", 899 | "1 146.588427 165.851197 176.443787 182.268648 185.471737 187.233115 \n", 900 | "2 165.153288 165.153288 165.153288 165.153288 165.153288 165.153288 \n", 901 | "3 268.745697 236.704786 220.128369 211.552532 207.115808 204.820462 \n", 902 | "4 185.847258 185.847258 185.847258 185.847258 185.847258 185.847258 \n", 903 | "\n", 904 | " 6 7 8 9 10 11 \\\n", 905 | "0 224.599697 224.727019 224.784433 224.810323 224.821998 224.827262 \n", 906 | "1 188.201695 188.734318 189.027206 189.188266 189.276832 189.325535 \n", 907 | "2 165.153288 165.153288 165.153288 165.153288 165.153288 165.153288 \n", 908 | "3 203.632961 203.018605 202.700768 202.536334 202.451263 202.407252 \n", 909 | "4 185.847258 185.847258 185.847258 185.847258 185.847258 185.847258 \n", 910 | "\n", 911 | " 12 13 14 15 16 17 \n", 912 | "0 224.829636 224.830707 224.831189 224.831407 224.831505 224.831549 \n", 913 | "1 189.352316 189.367044 189.375142 189.379595 189.382044 189.383391 \n", 914 | "2 165.153288 165.153288 165.153288 165.153288 165.153288 165.153288 \n", 915 | "3 202.384483 202.372703 202.366609 202.363456 202.361825 202.360981 \n", 916 | "4 185.847258 185.847258 185.847258 185.847258 185.847258 185.847258 " 917 | ] 918 | }, 919 | "metadata": { 920 | "tags": [] 921 | }, 922 | "execution_count": 80 923 | } 924 | ] 925 | }, 926 | { 927 | "cell_type": "code", 928 | "metadata": { 929 | "id": "YjqMLxIBiXc7", 930 | "colab_type": "code", 931 | "colab": {} 932 | }, 933 | "source": [ 934 | "for i in hana.columns :\n", 935 | " X[i]=hana[i]\n", 936 | "for i in hanat :\n", 937 | " test[i]=hanat[i]" 938 | ], 939 | "execution_count": 0, 940 | "outputs": [] 941 | }, 942 | { 943 | "cell_type": "code", 944 | "metadata": { 945 | "id": "bEZT2dUUinEj", 946 | "colab_type": "code", 947 | "outputId": "9f596bb0-b364-4cd9-8327-82185537eb14", 948 | "colab": { 949 | "base_uri": 
"https://localhost:8080/", 950 | "height": 34 951 | } 952 | }, 953 | "source": [ 954 | "X.shape,test.shape" 955 | ], 956 | "execution_count": 0, 957 | "outputs": [ 958 | { 959 | "output_type": "execute_result", 960 | "data": { 961 | "text/plain": [ 962 | "((15539, 774), (5035, 775))" 963 | ] 964 | }, 965 | "metadata": { 966 | "tags": [] 967 | }, 968 | "execution_count": 82 969 | } 970 | ] 971 | }, 972 | { 973 | "cell_type": "code", 974 | "metadata": { 975 | "id": "Vuv40f3DJOes", 976 | "colab_type": "code", 977 | "colab": {} 978 | }, 979 | "source": [ 980 | "import xgboost as xgb\n", 981 | "\n", 982 | "params = {\n", 983 | " 'gpu_id': 0, \n", 984 | " #'n_gpus': 2, \n", 985 | " 'objective': 'reg:squarederror', \n", 986 | " 'eval_metric': 'rmse', \n", 987 | " \n", 988 | " 'booster': 'gbtree', \n", 989 | " \n", 990 | " 'n_estimators': 10000, \n", 991 | " 'tree_method': 'gpu_hist', \n", 992 | " 'grow_policy': 'lossguide', \n", 993 | " 'max_depth': 8, \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " 'learning_rate': 0.01, \n", 998 | " 'max_bin': 200, \n", 999 | " 'max_leaves': 200, \n", 1000 | " \n", 1001 | " 'reg_alpha': 10, \n", 1002 | " 'reg_lambda': 50, \n", 1003 | " 'subsample': 0.9 ,\n", 1004 | " }\n", 1005 | "model = xgb.XGBRegressor(**params)" 1006 | ], 1007 | "execution_count": 0, 1008 | "outputs": [] 1009 | }, 1010 | { 1011 | "cell_type": "code", 1012 | "metadata": { 1013 | "id": "MBcQ1gQXzA_c", 1014 | "colab_type": "code", 1015 | "outputId": "de37b93e-f15e-4bda-8e69-7ad74ef11e78", 1016 | "colab": { 1017 | "base_uri": "https://localhost:8080/", 1018 | "height": 170 1019 | } 1020 | }, 1021 | "source": [ 1022 | "model.fit(X,y)" 1023 | ], 1024 | "execution_count": 0, 1025 | "outputs": [ 1026 | { 1027 | "output_type": "execute_result", 1028 | "data": { 1029 | "text/plain": [ 1030 | "XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", 1031 | " colsample_bynode=1, colsample_bytree=1, eval_metric='rmse',\n", 1032 | " gamma=0, gpu_id=0, 
grow_policy='lossguide', importance_type='gain',\n", 1033 | " learning_rate=0.01, max_bin=200, max_delta_step=0, max_depth=8,\n", 1034 | " max_leaves=200, min_child_weight=1, missing=None,\n", 1035 | " n_estimators=10000, n_jobs=1, nthread=None,\n", 1036 | " objective='reg:squarederror', random_state=0, reg_alpha=10,\n", 1037 | " reg_lambda=50, scale_pos_weight=1, seed=None, silent=None,\n", 1038 | " subsample=0.9, tree_method='gpu_hist', verbosity=1)" 1039 | ] 1040 | }, 1041 | "metadata": { 1042 | "tags": [] 1043 | }, 1044 | "execution_count": 84 1045 | } 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "code", 1050 | "metadata": { 1051 | "id": "dnNV6t7azZ81", 1052 | "colab_type": "code", 1053 | "colab": {} 1054 | }, 1055 | "source": [ 1056 | "pp=model.predict(test.drop([\"ID\"],axis = 1))" 1057 | ], 1058 | "execution_count": 0, 1059 | "outputs": [] 1060 | }, 1061 | { 1062 | "cell_type": "code", 1063 | "metadata": { 1064 | "id": "DU4h2FgiLaOB", 1065 | "colab_type": "code", 1066 | "outputId": "0cafd63b-638a-422d-a9d7-0aa9615437f6", 1067 | "colab": { 1068 | "base_uri": "https://localhost:8080/", 1069 | "height": 34 1070 | } 1071 | }, 1072 | "source": [ 1073 | "d = {'ID': test_id, 'target': pp}\n", 1074 | "subxgb = pd.DataFrame(data=d)\n", 1075 | "subxgb = subxgb[['ID', 'target']]\n", 1076 | "subxgb.target.min(),subxgb.target.max()" 1077 | ], 1078 | "execution_count": 0, 1079 | "outputs": [ 1080 | { 1081 | "output_type": "execute_result", 1082 | "data": { 1083 | "text/plain": [ 1084 | "(15.86525821685791, 238.6833038330078)" 1085 | ] 1086 | }, 1087 | "metadata": { 1088 | "tags": [] 1089 | }, 1090 | "execution_count": 86 1091 | } 1092 | ] 1093 | }, 1094 | { 1095 | "cell_type": "code", 1096 | "metadata": { 1097 | "id": "TOCcTSqULg4_", 1098 | "colab_type": "code", 1099 | "colab": {} 1100 | }, 1101 | "source": [ 1102 | "subxgb.to_csv('subxgb.csv',index=False)\n", 1103 | "from google.colab import files\n", 1104 | "files.download(\"subxgb.csv\")" 1105 | ], 1106 | 
"execution_count": 0, 1107 | "outputs": [] 1108 | } 1109 | ] 1110 | } -------------------------------------------------------------------------------- /ANOTHERLGBMODEL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "language": "python", 7 | "display_name": "Python 3", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "pygments_lexer": "ipython3", 12 | "nbconvert_exporter": "python", 13 | "version": "3.6.4", 14 | "file_extension": ".py", 15 | "codemirror_mode": { 16 | "name": "ipython", 17 | "version": 3 18 | }, 19 | "name": "python", 20 | "mimetype": "text/x-python" 21 | }, 22 | "colab": { 23 | "name": "kernel37.07LGBAZER.ipynb", 24 | "provenance": [], 25 | "include_colab_link": true 26 | } 27 | }, 28 | "cells": [ 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "view-in-github", 33 | "colab_type": "text" 34 | }, 35 | "source": [ 36 | " $\"Open$ " 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "MDcv3QxAxxUb", 43 | "colab_type": "text" 44 | }, 45 | "source": [ 46 | "# PLEASE MAKE SURE TO RUN THIS ON KAGGLE TO GET THE SAME SCORE " 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "metadata": { 52 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 53 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 54 | "trusted": true, 55 | "id": "HN3OBq3gxu5J", 56 | "colab_type": "code", 57 | "colab": {} 58 | }, 59 | "source": [ 60 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 61 | "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n", 62 | "# For example, here's several helpful packages to load\n", 63 | "\n", 64 | "import numpy as np # linear algebra\n", 65 | "import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n", 66 | "\n", 67 | "# Input data files are available in the read-only \"../input/\" directory\n", 68 | "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", 69 | "\n", 70 | "import os\n", 71 | "for dirname, _, filenames in os.walk('/kaggle/input'):\n", 72 | " for filename in filenames:\n", 73 | " print(os.path.join(dirname, filename))\n", 74 | "\n", 75 | "# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n", 76 | "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session" 77 | ], 78 | "execution_count": 0, 79 | "outputs": [] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "metadata": { 84 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", 85 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 86 | "trusted": true, 87 | "id": "M_fa5gkqxu5N", 88 | "colab_type": "code", 89 | "colab": {} 90 | }, 91 | "source": [ 92 | "import os\n", 93 | "import sys\n", 94 | "import gc\n", 95 | "import math\n", 96 | "import random\n", 97 | "import pickle\n", 98 | "import pandas as pd\n", 99 | "import numpy as np\n", 100 | "import seaborn as sns\n", 101 | "from tqdm.notebook import tqdm\n", 102 | "import category_encoders as ce\n", 103 | "import matplotlib.pyplot as plt" 104 | ], 105 | "execution_count": 0, 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "metadata": { 111 | "trusted": true, 112 | "id": "1W0PjzQixu5P", 113 | "colab_type": "code", 114 | "colab": {} 115 | }, 116 | "source": [ 117 | "from sklearn.model_selection import KFold, StratifiedKFold, train_test_split\n", 118 | "from sklearn.metrics import f1_score, confusion_matrix, classification_report\n", 119 | "from sklearn.metrics import mean_squared_error\n", 120 | "from sklearn.feature_selection import SelectFromModel\n", 121 | "from sklearn.linear_model import Lasso, 
LassoCV\n", 122 | "from sklearn.ensemble import RandomForestRegressor" 123 | ], 124 | "execution_count": 0, 125 | "outputs": [] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "metadata": { 130 | "trusted": true, 131 | "id": "juUsndZAxu5S", 132 | "colab_type": "code", 133 | "colab": {} 134 | }, 135 | "source": [ 136 | "import lightgbm as lgbm \n", 137 | "import xgboost as xgb\n", 138 | "import catboost as cat\n", 139 | "from catboost import CatBoostRegressor, Pool, CatBoostClassifier" 140 | ], 141 | "execution_count": 0, 142 | "outputs": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "metadata": { 147 | "trusted": true, 148 | "id": "PlwDxh2Axu5U", 149 | "colab_type": "code", 150 | "colab": {} 151 | }, 152 | "source": [ 153 | "seed = 2020\n", 154 | "random.seed(seed)\n", 155 | "np.random.seed(seed)" 156 | ], 157 | "execution_count": 0, 158 | "outputs": [] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "metadata": { 163 | "trusted": true, 164 | "id": "nyo1ql9pxu5W", 165 | "colab_type": "code", 166 | "colab": {} 167 | }, 168 | "source": [ 169 | "os.makedirs('MODELS/', exist_ok=True)\n", 170 | "os.makedirs('/DATASET/CSV/', exist_ok=True)\n", 171 | "os.makedirs('/DATASET/ZIP/', exist_ok=True)\n", 172 | "os.makedirs('/DATASET/DOWNLOAD/', exist_ok=True)" 173 | ], 174 | "execution_count": 0, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "trusted": true, 181 | "id": "s7MPyZf-xu5Y", 182 | "colab_type": "code", 183 | "colab": {} 184 | }, 185 | "source": [ 186 | "mapper = {\n", 187 | " \"GOOD\": 0,\n", 188 | " \"MODERATE\": 1,\n", 189 | " \"SENSITIVE\": 2,\n", 190 | " \"UNHEALTHY\": 3,\n", 191 | " \"V_UNHEALTHY\": 4,\n", 192 | " \"HAZARDOUS\": 5\n", 193 | "}" 194 | ], 195 | "execution_count": 0, 196 | "outputs": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "metadata": { 201 | "trusted": true, 202 | "id": "bcQN-kcrxu5a", 203 | "colab_type": "code", 204 | "colab": {} 205 | }, 206 | "source": [ 207 | "def 
categorize(target):\n", 208 | " if target <= 12:\n", 209 | " return \"GOOD\"\n", 210 | " elif target <=35:\n", 211 | " return \"MODERATE\"\n", 212 | " elif target <= 55:\n", 213 | " return \"SENSITIVE\"\n", 214 | " elif target <= 150:\n", 215 | " return \"UNHEALTHY\"\n", 216 | " elif target <= 250:\n", 217 | " return \"V_UNHEALTHY\"\n", 218 | " else:\n", 219 | " return \"HAZARDOUS\"" 220 | ], 221 | "execution_count": 0, 222 | "outputs": [] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "metadata": { 227 | "trusted": true, 228 | "id": "bP_lms_Cxu5d", 229 | "colab_type": "code", 230 | "colab": {} 231 | }, 232 | "source": [ 233 | "def split_into_days(df, features, days=5):\n", 234 | " width = 24\n", 235 | " for feature in features:\n", 236 | " for day in range(days):\n", 237 | " df[feature+'_day_'+str(day)] = df[feature].apply(lambda x: x[day*width:(day+1)*width])\n", 238 | " df[feature+'_target_reading_day'] = df[feature].apply(lambda x: x[-1])" 239 | ], 240 | "execution_count": 0, 241 | "outputs": [] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "metadata": { 246 | "trusted": true, 247 | "id": "izS1G2WKxu5f", 248 | "colab_type": "code", 249 | "colab": {} 250 | }, 251 | "source": [ 252 | "# convert features from string to list of values \n", 253 | "def replace_nan(x):\n", 254 | " if x==\" \":\n", 255 | " return np.nan\n", 256 | " else :\n", 257 | " return float(x) " 258 | ], 259 | "execution_count": 0, 260 | "outputs": [] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "metadata": { 265 | "trusted": true, 266 | "id": "_Mhiv4vXxu5h", 267 | "colab_type": "code", 268 | "colab": {} 269 | }, 270 | "source": [ 271 | "def aggregate_features(x,col_name):\n", 272 | " x[\"max_\"+col_name]=x[col_name].apply(np.max)\n", 273 | " x[\"min_\"+col_name]=x[col_name].apply(np.min)\n", 274 | " x[\"mean_\"+col_name]=x[col_name].apply(np.mean)\n", 275 | " x[\"std_\"+col_name]=x[col_name].apply(np.std)\n", 276 | " x[\"var_\"+col_name]=x[col_name].apply(np.var)\n", 277 | " 
x[\"median_\"+col_name]=x[col_name].apply(np.median)\n", 278 | " x[\"ptp_\"+col_name]=x[col_name].apply(np.ptp)\n", 279 | " return x " 280 | ], 281 | "execution_count": 0, 282 | "outputs": [] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "metadata": { 287 | "trusted": true, 288 | "id": "XFoIoSlvxu5i", 289 | "colab_type": "code", 290 | "colab": {} 291 | }, 292 | "source": [ 293 | "def remove_nan_values(x):\n", 294 | " strict = [e for e in x if not math.isnan(e)]\n", 295 | " if len(strict) == 0:\n", 296 | " strict = [np.nan]\n", 297 | " return strict" 298 | ], 299 | "execution_count": 0, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "metadata": { 305 | "trusted": true, 306 | "id": "FM4Vhiz2xu5l", 307 | "colab_type": "code", 308 | "colab": {} 309 | }, 310 | "source": [ 311 | "def metric(y,x):\n", 312 | " return np.sqrt(mean_squared_error(x,y))" 313 | ], 314 | "execution_count": 0, 315 | "outputs": [] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "metadata": { 320 | "trusted": true, 321 | "id": "xY1VTpUwxu5m", 322 | "colab_type": "code", 323 | "colab": {} 324 | }, 325 | "source": [ 326 | "def train_function(model,train,test,params,other_params,target_name,features,metric, model_name):\n", 327 | " folds_num=train.fold.nunique()\n", 328 | " validation=train[[id_name,\"fold\",target_name]].copy()\n", 329 | " validation[\"pred_\"+target_name]=0\n", 330 | " sub=test[[id_name]].copy()\n", 331 | " feat_imps = pd.DataFrame()\n", 332 | " feat_imps['Features'] = features\n", 333 | " \n", 334 | " for fold in np.sort(train.fold.unique()):\n", 335 | " print(\"#\"*50+\" {} \".format(fold)+\"#\"*50)\n", 336 | " os.makedirs(\"model_save/{}/{}/{}\".format(model_name,Experiment_name,str(int(fold))), exist_ok=True)\n", 337 | " X_train=train[train.fold!=fold]\n", 338 | " X_val=train[train.fold==fold]\n", 339 | " \n", 340 | " train_pred,validation_pred,test_pred,feat_imp=model(X_train,X_val,test,params,other_params)\n", 341 | "\n", 342 | " 
validation.loc[validation.fold==fold,\"pred_\"+target_name]=validation_pred\n", 343 | " sub[target_name]=test_pred/folds_num\n", 344 | " train_score=metric(X_train[target_name],train_pred)\n", 345 | " val_score=metric(X_val[target_name],validation_pred)\n", 346 | " feat_imps[fold] = feat_imp\n", 347 | " print(\"train score : {} validation score : {}\".format(round(train_score,4),round(val_score,4)))\n", 348 | " \n", 349 | " final_validation_score=metric(validation[target_name],validation[\"pred_\"+target_name])\n", 350 | " print(\"final validation score : {}\".format(final_validation_score))\n", 351 | " \n", 352 | " return sub,validation,final_validation_score,feat_imps\n", 353 | "\n", 354 | "def lgbm_model(X_train,X_val,X_test,params,other_params):\n", 355 | " dtrain = lgbm.Dataset(data=X_train[features], label=X_train[target_name], feature_name=features)\n", 356 | " dval = lgbm.Dataset(data=X_val[features], label=X_val[target_name], feature_name=features)\n", 357 | "\n", 358 | " model = lgbm.train(\n", 359 | " params=params,\n", 360 | " train_set=dtrain,\n", 361 | " num_boost_round=other_params[\"num_boost_round\"],\n", 362 | " valid_sets=(dtrain, dval),\n", 363 | " early_stopping_rounds=other_params[\"early_stopping_rounds\"],\n", 364 | " verbose_eval=other_params[\"verbose_eval\"],\n", 365 | " )\n", 366 | " best_iteration = model.best_iteration\n", 367 | " train_pred=model.predict(X_train[features], num_iteration=best_iteration)\n", 368 | " validation_pred=model.predict(X_val[features], num_iteration=best_iteration)\n", 369 | " test_pred=model.predict(test[features], num_iteration=best_iteration)\n", 370 | " feat_imp = model.feature_importance(iteration=best_iteration)\n", 371 | " \n", 372 | " return train_pred,validation_pred,test_pred, feat_imp\n", 373 | "\n", 374 | "def cat_model(X_train,X_val,X_test,params,other_params):\n", 375 | " dtrain = Pool(data=X_train[features], label=X_train[target_name], feature_names=features)\n", 376 | " dval = 
Pool(data=X_val[features], label=X_val[target_name], feature_names=features)\n", 377 | "\n", 378 | " model = CatBoostRegressor(**params)\n", 379 | " model.fit(dtrain,\n", 380 | " eval_set=[dval],\n", 381 | " use_best_model=True,\n", 382 | " verbose_eval=other_params[\"verbose_eval\"],\n", 383 | " )\n", 384 | "\n", 385 | " best_iteration = model.best_iteration_\n", 386 | " train_pred = model.predict(X_train[features])\n", 387 | " validation_pred = model.predict(X_val[features])\n", 388 | " test_pred = model.predict(test[features])\n", 389 | " feat_imp = model.feature_importances_\n", 390 | " \n", 391 | " return train_pred,validation_pred,test_pred, feat_imp" 392 | ], 393 | "execution_count": 0, 394 | "outputs": [] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "metadata": { 399 | "trusted": true, 400 | "id": "6zM5oaqPxu5o", 401 | "colab_type": "code", 402 | "colab": {} 403 | }, 404 | "source": [ 405 | "cols = ['location', 'loc_altitude', 'km2', 'aspect',\n", 406 | " 'dist_trunk', 'dist_primary', 'dist_secondary',\n", 407 | " 'dist_tertiary', 'dist_unclassified', 'dist_residential', 'popn', 'hh',\n", 408 | " 'hh_cook_charcoal', 'hh_cook_firewood', 'hh_burn_waste']" 409 | ], 410 | "execution_count": 0, 411 | "outputs": [] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "metadata": { 416 | "trusted": true, 417 | "id": "FAo46IrOxu5q", 418 | "colab_type": "code", 419 | "colab": {} 420 | }, 421 | "source": [ 422 | "train = pd.read_csv('../input/airqo-ugandan-air-quality-forecast-challenge-zindi/Train (1).csv')\n", 423 | "test = pd.read_csv('../input/airqo-ugandan-air-quality-forecast-challenge-zindi/Test (1).csv')\n", 424 | "meta = pd.read_csv('../input/airqo-ugandan-air-quality-forecast-challenge-zindi/airqo_metadata.csv', usecols=cols)" 425 | ], 426 | "execution_count": 0, 427 | "outputs": [] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "metadata": { 432 | "trusted": true, 433 | "id": "EWn__niTxu5t", 434 | "colab_type": "code", 435 | "colab": {} 436 | }, 437 | 
"source": [ 438 | "sns.barplot(x='location', y='target', data=train)" 439 | ], 440 | "execution_count": 0, 441 | "outputs": [] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "metadata": { 446 | "trusted": true, 447 | "id": "6j6g4kIBxu5v", 448 | "colab_type": "code", 449 | "colab": {} 450 | }, 451 | "source": [ 452 | "features = [\"temp\", \"precip\", \"rel_humidity\", \"wind_dir\", \"wind_spd\", \"atmos_press\"]\n", 453 | "\n", 454 | "days_features = [\n", 455 | "'temp_day_0', 'temp_day_1', 'temp_day_2', 'temp_day_3', 'temp_day_4', \n", 456 | "'precip_day_0', 'precip_day_1', 'precip_day_2', 'precip_day_3','precip_day_4',\n", 457 | "'rel_humidity_day_0','rel_humidity_day_1', 'rel_humidity_day_2', 'rel_humidity_day_3','rel_humidity_day_4',\n", 458 | "'wind_dir_day_0', 'wind_dir_day_1', 'wind_dir_day_2', 'wind_dir_day_3','wind_dir_day_4',\n", 459 | "'wind_spd_day_0','wind_spd_day_1', 'wind_spd_day_2', 'wind_spd_day_3', 'wind_spd_day_4',\n", 460 | "'atmos_press_day_0', 'atmos_press_day_1','atmos_press_day_2', 'atmos_press_day_3', 'atmos_press_day_4']" 461 | ], 462 | "execution_count": 0, 463 | "outputs": [] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "metadata": { 468 | "trusted": true, 469 | "id": "eGv82_T5xu5y", 470 | "colab_type": "code", 471 | "colab": {} 472 | }, 473 | "source": [ 474 | "for feature in features :\n", 475 | " train[feature] = train[feature].apply(lambda x: [ replace_nan(X) for X in x.replace(\"nan\",\" \").split(\",\")])\n", 476 | " test[feature] = test[feature].apply(lambda x: [ replace_nan(X) for X in x.replace(\"nan\",\" \").split(\",\")])" 477 | ], 478 | "execution_count": 0, 479 | "outputs": [] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "metadata": { 484 | "trusted": true, 485 | "id": "it6idKZ4xu5z", 486 | "colab_type": "code", 487 | "colab": {} 488 | }, 489 | "source": [ 490 | "datav1 = pd.concat([train, test],sort=False).reset_index(drop=True)\n", 491 | "datav2 = datav1.copy()" 492 | ], 493 | "execution_count": 0, 494 | 
"outputs": [] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "metadata": { 499 | "trusted": true, 500 | "id": "6cYe2pCDxu51", 501 | "colab_type": "code", 502 | "colab": {} 503 | }, 504 | "source": [ 505 | "for col_name in tqdm(features):\n", 506 | " split_into_days(datav1, features)" 507 | ], 508 | "execution_count": 0, 509 | "outputs": [] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "metadata": { 514 | "trusted": true, 515 | "id": "9fVk3hRyxu53", 516 | "colab_type": "code", 517 | "colab": {} 518 | }, 519 | "source": [ 520 | "for col_name in tqdm(days_features):\n", 521 | " datav1[col_name] = datav1[col_name].apply(remove_nan_values)\n", 522 | "\n", 523 | "for col_name in tqdm(days_features):\n", 524 | " datav1 = aggregate_features(datav1,col_name)" 525 | ], 526 | "execution_count": 0, 527 | "outputs": [] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "metadata": { 532 | "trusted": true, 533 | "id": "PlC9yiZ1xu54", 534 | "colab_type": "code", 535 | "colab": {} 536 | }, 537 | "source": [ 538 | "for col_name in tqdm(features):\n", 539 | " datav1[col_name] = datav1[col_name].apply(remove_nan_values)\n", 540 | "\n", 541 | "for col_name in tqdm(features):\n", 542 | " datav1 = aggregate_features(datav1,col_name)" 543 | ], 544 | "execution_count": 0, 545 | "outputs": [] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "metadata": { 550 | "trusted": true, 551 | "id": "2SoIHewUxu56", 552 | "colab_type": "code", 553 | "colab": {} 554 | }, 555 | "source": [ 556 | "for feat in features:\n", 557 | " for i in range(len(datav2.loc[1, features[0]])):\n", 558 | " datav2[feat+f'_{i}'] = datav2[feat].apply(lambda x: x[i])" 559 | ], 560 | "execution_count": 0, 561 | "outputs": [] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "metadata": { 566 | "trusted": true, 567 | "id": "5TVE9KpIxu57", 568 | "colab_type": "code", 569 | "colab": {} 570 | }, 571 | "source": [ 572 | "datav1.drop(features+days_features, axis=1, inplace=True)\n", 573 | "datav2.drop(features+['target', 
'ID','location'], axis=1, inplace=True)" 574 | ], 575 | "execution_count": 0, 576 | "outputs": [] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "metadata": { 581 | "trusted": true, 582 | "id": "S65anRC0xu59", 583 | "colab_type": "code", 584 | "colab": {} 585 | }, 586 | "source": [ 587 | "data = pd.concat([datav1, datav2], axis=1)\n", 588 | "#data = datav1.copy()" 589 | ], 590 | "execution_count": 0, 591 | "outputs": [] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "metadata": { 596 | "trusted": true, 597 | "id": "Xo0WMKN2xu5_", 598 | "colab_type": "code", 599 | "colab": {} 600 | }, 601 | "source": [ 602 | "oe = ce.OrdinalEncoder(cols=['location'])\n", 603 | "data['binned_location'] = oe.fit_transform(data['location'])" 604 | ], 605 | "execution_count": 0, 606 | "outputs": [] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "metadata": { 611 | "trusted": true, 612 | "id": "PxY9NNO4xu6B", 613 | "colab_type": "code", 614 | "colab": {} 615 | }, 616 | "source": [ 617 | "meta.fillna(-9999, inplace=True)" 618 | ], 619 | "execution_count": 0, 620 | "outputs": [] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "metadata": { 625 | "trusted": true, 626 | "id": "48jydDiRxu6D", 627 | "colab_type": "code", 628 | "colab": {} 629 | }, 630 | "source": [ 631 | "aggs = {\n", 632 | " 'binned_location': ['count'],\n", 633 | " 'target': ['mean', 'min', 'max', 'std', 'quantile', 'sum'],\n", 634 | "\n", 635 | "}" 636 | ], 637 | "execution_count": 0, 638 | "outputs": [] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "metadata": { 643 | "trusted": true, 644 | "id": "oPKGrSXyxu6E", 645 | "colab_type": "code", 646 | "colab": {} 647 | }, 648 | "source": [ 649 | "meta_stats = data.groupby('location').agg(aggs)" 650 | ], 651 | "execution_count": 0, 652 | "outputs": [] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "metadata": { 657 | "trusted": true, 658 | "id": "ZEdUf6frxu6G", 659 | "colab_type": "code", 660 | "colab": {} 661 | }, 662 | "source": [ 663 | "meta_stats = 
meta_stats.merge(meta, how='inner', on='location')" 664 | ], 665 | "execution_count": 0, 666 | "outputs": [] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "metadata": { 671 | "trusted": true, 672 | "id": "4IrEOEUmxu6I", 673 | "colab_type": "code", 674 | "colab": {} 675 | }, 676 | "source": [ 677 | "meta_stats.rename(\n", 678 | " columns = {\n", 679 | " ('binned_location', 'count') : 'count',\n", 680 | " ('target', 'mean') : 'mean_target', \n", 681 | " ('target', 'min') : 'min_target',\n", 682 | " ('target', 'max') : 'max_target',\n", 683 | " ('target', 'std') : 'std_target',\n", 684 | " ('target', 'quantile') : 'quantile_target',\n", 685 | " ('target', 'sum') : 'sum_target'\n", 686 | " },\n", 687 | " inplace=True\n", 688 | ")" 689 | ], 690 | "execution_count": 0, 691 | "outputs": [] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "metadata": { 696 | "trusted": true, 697 | "id": "pFljrdYgxu6J", 698 | "colab_type": "code", 699 | "colab": {} 700 | }, 701 | "source": [ 702 | "meta_stats['mean_pm2.5_per_km2'] = meta_stats['mean_target']/meta_stats['km2']\n", 703 | "meta_stats['sum_pm2.5_per_km2'] = meta_stats['sum_target']/meta_stats['km2']\n", 704 | "meta_stats['device_per_km2'] = meta_stats['count']/meta_stats['km2']" 705 | ], 706 | "execution_count": 0, 707 | "outputs": [] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "metadata": { 712 | "trusted": true, 713 | "id": "Nvdl8aXDxu6L", 714 | "colab_type": "code", 715 | "colab": {} 716 | }, 717 | "source": [ 718 | "meta_stats['sum_target'] = meta_stats['sum_target'].apply(np.log2)\n", 719 | "meta_stats['sum_pm2.5_per_km2'] = meta_stats['sum_pm2.5_per_km2'].apply(np.log2)" 720 | ], 721 | "execution_count": 0, 722 | "outputs": [] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "metadata": { 727 | "trusted": true, 728 | "id": "zmF8CGw0xu6M", 729 | "colab_type": "code", 730 | "colab": {} 731 | }, 732 | "source": [ 733 | "data = data.merge(meta_stats, how='left', on=['location'])" 734 | ], 735 | "execution_count": 
0, 736 | "outputs": [] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "metadata": { 741 | "trusted": true, 742 | "id": "BvciyhPUxu6O", 743 | "colab_type": "code", 744 | "colab": {} 745 | }, 746 | "source": [ 747 | "data['mean_temp_day_3']" 748 | ], 749 | "execution_count": 0, 750 | "outputs": [] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "metadata": { 755 | "trusted": true, 756 | "id": "NVKgBcCBxu6Q", 757 | "colab_type": "code", 758 | "colab": {} 759 | }, 760 | "source": [ 761 | "hum_features = list(data.filter(regex='rel_humidity_.*').columns)\n", 762 | "temp_features = list( data.filter(regex='temp_.*').columns) \n", 763 | "precip_features = list(data.filter(regex='precip.*').columns)\n", 764 | "winddir_features = list( data.filter(regex='wind_dir_.*').columns)\n", 765 | "windspead_features = list( data.filter(regex='wind_spd_.*').columns)\n", 766 | "atm_features = list(data.filter(regex='atmos_press_.*').columns)\n", 767 | "\n", 768 | "\n", 769 | "\n", 770 | "\n", 771 | "hum_features= hum_features[36:]\n", 772 | "temp_features=temp_features[36:] \n", 773 | "precip_features=precip_features[31:]\n", 774 | "winddir_features=winddir_features[36:]\n", 775 | "windspead_features=windspead_features[36:]\n", 776 | "atm_features=atm_features[36:]\n", 777 | "\n", 778 | "\n", 779 | "\n", 780 | "\n", 781 | "\n", 782 | "data[hum_features]= data[hum_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n", 783 | "\n", 784 | "\n", 785 | "\n", 786 | "data[temp_features]= data[temp_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n", 787 | "\n", 788 | "\n", 789 | "data[precip_features]= data[precip_features].apply(lambda x: x.fillna(float(0.0)),axis=1)\n", 790 | "\n", 791 | "\n", 792 | "\n", 793 | "data[winddir_features]= data[winddir_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n", 794 | "\n", 795 | "\n", 796 | "\n", 797 | "data[windspead_features]= data[windspead_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n", 798 | "\n", 799 | "\n", 800 | "\n", 
801 | "\n", 802 | "data[atm_features]= data[atm_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n" 803 | ], 804 | "execution_count": 0, 805 | "outputs": [] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "metadata": { 810 | "trusted": true, 811 | "id": "AafXo5ESxu6T", 812 | "colab_type": "code", 813 | "colab": {} 814 | }, 815 | "source": [ 816 | "data['relation1'] = data['wind_spd_118'] +data['wind_spd_119'] +data['wind_spd_120']\n", 817 | "data['relation2'] = data['temp_89'] + data['temp_95'] + data['temp_48'] +data['temp_70'] + data['temp_88'] + data['temp_72']\n", 818 | "data['relation3'] = data['rel_humidity_112'] + data['rel_humidity_113'] + data['rel_humidity_102'] + data['rel_humidity_42'] + data['rel_humidity_3'] \n", 819 | "data['relation4'] = data['atmos_press_103'] + data['atmos_press_7'] +data['atmos_press_10'] +data['atmos_press_109'] +data['atmos_press_116']" 820 | ], 821 | "execution_count": 0, 822 | "outputs": [] 823 | }, 824 | { 825 | "cell_type": "code", 826 | "metadata": { 827 | "trusted": true, 828 | "id": "uN7EC45bxu6U", 829 | "colab_type": "code", 830 | "colab": {} 831 | }, 832 | "source": [ 833 | "wind_dir_feature_ = list(data.filter(regex='wind_dir.*').columns)\n", 834 | "len([x for x in data.columns if 'wind_dir' in x]) , len(wind_dir_feature_)" 835 | ], 836 | "execution_count": 0, 837 | "outputs": [] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "metadata": { 842 | "trusted": true, 843 | "id": "xRQys-BFxu6W", 844 | "colab_type": "code", 845 | "colab": {} 846 | }, 847 | "source": [ 848 | "def radian_conv(degree):\n", 849 | " \"\"\"\n", 850 | " Return radian.\n", 851 | " \"\"\"\n", 852 | " return np.radians(degree) \n", 853 | "\n", 854 | "\n", 855 | "\n", 856 | "for col in wind_dir_feature_ :\n", 857 | " \n", 858 | " data[col] = data[col].apply(radian_conv)\n" 859 | ], 860 | "execution_count": 0, 861 | "outputs": [] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "metadata": { 866 | "trusted": true, 867 | "id": "Ko3yQM7bxu6Y", 
868 | "colab_type": "code", 869 | "colab": {} 870 | }, 871 | "source": [ 872 | "train = data[data.target.notnull()].reset_index(drop=True)\n", 873 | "test = data[data.target.isna()].reset_index(drop=True)" 874 | ], 875 | "execution_count": 0, 876 | "outputs": [] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "metadata": { 881 | "trusted": true, 882 | "id": "5IZ9qeRnxu6d", 883 | "colab_type": "code", 884 | "colab": {} 885 | }, 886 | "source": [ 887 | "to_drop = ['min_precip_day_0', 'min_precip_day_1', 'min_precip_day_2',\n", 888 | " 'min_precip_day_3', 'min_precip_day_4', 'min_precip', \n", 889 | " 'median_precip_day_0', 'median_precip_day_1', 'median_precip_day_2',\n", 890 | " 'median_precip_day_3', 'median_precip_day_4', 'median_precip',\n", 891 | " ]" 892 | ], 893 | "execution_count": 0, 894 | "outputs": [] 895 | }, 896 | { 897 | "cell_type": "code", 898 | "metadata": { 899 | "trusted": true, 900 | "id": "fS0fK_LUxu6f", 901 | "colab_type": "code", 902 | "colab": {} 903 | }, 904 | "source": [ 905 | "train.drop(labels=to_drop, axis=1, inplace=True)\n", 906 | "test.drop(labels=to_drop, axis=1, inplace=True)" 907 | ], 908 | "execution_count": 0, 909 | "outputs": [] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "metadata": { 914 | "trusted": true, 915 | "id": "01RV_1ynxu6i", 916 | "colab_type": "code", 917 | "colab": {} 918 | }, 919 | "source": [ 920 | "features = train.columns.difference(['ID', 'target', 'binned_location'])\n", 921 | "select_features = train.columns.difference(['ID', 'target', 'location'])\n", 922 | "target = 'target'" 923 | ], 924 | "execution_count": 0, 925 | "outputs": [] 926 | }, 927 | { 928 | "cell_type": "code", 929 | "metadata": { 930 | "trusted": true, 931 | "id": "DG4ZV1EZxu6k", 932 | "colab_type": "code", 933 | "colab": {} 934 | }, 935 | "source": [ 936 | "best_features = ['atmos_press_1', 'atmos_press_10', 'atmos_press_103',\n", 937 | " 'atmos_press_104', 'atmos_press_105', 'atmos_press_109',\n", 938 | " 'atmos_press_110', 
'atmos_press_115', 'atmos_press_116',\n", 939 | " 'atmos_press_14', 'atmos_press_19', 'atmos_press_2',\n", 940 | " 'atmos_press_20', 'atmos_press_21', 'atmos_press_25',\n", 941 | " 'atmos_press_26', 'atmos_press_27', 'atmos_press_28',\n", 942 | " 'atmos_press_3', 'atmos_press_31', 'atmos_press_32',\n", 943 | " 'atmos_press_33', 'atmos_press_37', 'atmos_press_38',\n", 944 | " 'atmos_press_39', 'atmos_press_43', 'atmos_press_44',\n", 945 | " 'atmos_press_50', 'atmos_press_51', 'atmos_press_52',\n", 946 | " 'atmos_press_56', 'atmos_press_57', 'atmos_press_61',\n", 947 | " 'atmos_press_62', 'atmos_press_67', 'atmos_press_68',\n", 948 | " 'atmos_press_7', 'atmos_press_75', 'atmos_press_8',\n", 949 | " 'atmos_press_80', 'atmos_press_81', 'atmos_press_86',\n", 950 | " 'atmos_press_9', 'atmos_press_91', 'atmos_press_92',\n", 951 | " 'atmos_press_93', 'atmos_press_98', 'atmos_press_99',\n", 952 | " 'hh_burn_waste', 'max_atmos_press', 'max_precip',\n", 953 | " 'max_rel_humidity', \n", 954 | " 'max_rel_humidity_day_2',\n", 955 | " 'max_temp', 'max_wind_dir',\n", 956 | " 'max_wind_dir_day_2',\n", 957 | " 'max_wind_spd',\n", 958 | " 'mean_atmos_press', 'mean_precip', 'mean_rel_humidity',\n", 959 | " \n", 960 | " \n", 961 | " 'mean_target', 'mean_temp_day_2',\n", 962 | " 'mean_wind_dir',\n", 963 | " 'mean_wind_dir_day_2',\n", 964 | " 'mean_wind_spd',\n", 965 | " 'median_atmos_press',\n", 966 | " 'median_rel_humidity', \n", 967 | " 'median_rel_humidity_day_2', \n", 968 | " 'median_temp',\n", 969 | " 'median_temp_day_2', \n", 970 | " 'median_wind_dir',\n", 971 | " 'median_wind_dir_day_2',\n", 972 | " \n", 973 | " 'median_wind_spd',\n", 974 | " 'median_wind_spd_day_2', \n", 975 | " 'min_atmos_press_day_2',\n", 976 | " 'min_rel_humidity',\n", 977 | " 'min_temp',\n", 978 | " 'min_temp_day_2',\n", 979 | " 'min_wind_dir',\n", 980 | " \n", 981 | " 'min_wind_spd', 'min_wind_spd_day_2',\n", 982 | " 'ptp_atmos_press',\n", 983 | " \n", 984 | " 'ptp_precip', 'ptp_rel_humidity',\n", 985 | " 
'ptp_wind_dir',\n", 986 | " 'ptp_wind_spd',\n", 987 | " 'rel_humidity_102', 'rel_humidity_112', 'rel_humidity_113',\n", 988 | " 'rel_humidity_114', 'rel_humidity_3', 'rel_humidity_42',\n", 989 | " 'rel_humidity_63', 'rel_humidity_78', 'std_atmos_press',\n", 990 | " 'std_precip', 'std_rel_humidity', 'std_wind_dir',\n", 991 | " 'std_wind_spd', 'temp_0', 'temp_1',\n", 992 | " 'temp_102', 'temp_113', 'temp_114', 'temp_118', 'temp_120',\n", 993 | " 'temp_16', 'temp_17', 'temp_2', 'temp_20', 'temp_22', 'temp_23',\n", 994 | " 'temp_24', 'temp_25', 'temp_30', 'temp_40', 'temp_41', 'temp_48',\n", 995 | " 'temp_49', 'temp_5', 'temp_50', 'temp_54', 'temp_64', 'temp_70',\n", 996 | " 'temp_71', 'temp_72', 'temp_78', 'temp_88', 'temp_89', 'temp_92',\n", 997 | " 'temp_94', 'temp_95', 'temp_97', 'temp_target_reading_day',\n", 998 | " 'var_atmos_press', 'var_precip', 'var_rel_humidity', 'var_temp',\n", 999 | " 'var_wind_dir', 'var_wind_spd',\n", 1000 | " 'wind_spd_108', 'wind_spd_118', 'wind_spd_119', 'wind_spd_120',\n", 1001 | " 'wind_spd_29', 'wind_spd_target_reading_day' ,\n", 1002 | " 'relation1','relation2','relation3',\n", 1003 | " ]" 1004 | ], 1005 | "execution_count": 0, 1006 | "outputs": [] 1007 | }, 1008 | { 1009 | "cell_type": "code", 1010 | "metadata": { 1011 | "trusted": true, 1012 | "id": "Se050WX6xu6l", 1013 | "colab_type": "code", 1014 | "colab": {} 1015 | }, 1016 | "source": [ 1017 | "fold = KFold(n_splits=20, random_state=seed)" 1018 | ], 1019 | "execution_count": 0, 1020 | "outputs": [] 1021 | }, 1022 | { 1023 | "cell_type": "code", 1024 | "metadata": { 1025 | "trusted": true, 1026 | "id": "bGOgFqJexu6n", 1027 | "colab_type": "code", 1028 | "colab": {} 1029 | }, 1030 | "source": [ 1031 | "import lightgbm as lgb\n", 1032 | "params = {\n", 1033 | " 'objective' :'regression',\n", 1034 | " 'learning_rate' : 0.1,\n", 1035 | " 'num_iterations': 1500,\n", 1036 | " 'max_bins': 150, \n", 1037 | " 'max_depth' :7 ,\n", 1038 | " 'num_leaves' : 200,\n", 1039 | " 
'feature_fraction': 0.64, \n", 1040 | " 'bagging_fraction': 0.8, \n", 1041 | " 'bagging_freq':1,\n", 1042 | " 'boosting_type' : 'gbdt',\n", 1043 | " 'metric': 'rmse' ,\n", 1044 | " 'min_data_in_leaf':15,\n", 1045 | " 'reg_lambda' :150\n", 1046 | "}\n", 1047 | "\n" 1048 | ], 1049 | "execution_count": 0, 1050 | "outputs": [] 1051 | }, 1052 | { 1053 | "cell_type": "code", 1054 | "metadata": { 1055 | "trusted": true, 1056 | "id": "ic2XJvDoxu6p", 1057 | "colab_type": "code", 1058 | "colab": {} 1059 | }, 1060 | "source": [ 1061 | "test_preds = []\n", 1062 | "score_oofs = []\n", 1063 | "feats = pd.DataFrame({'features': best_features}) #You can change \n", 1064 | "\n", 1065 | "for i, (tr, vr) in enumerate(fold.split(train)):\n", 1066 | " X, Y = train.loc[tr, best_features], train.loc[tr, target]\n", 1067 | " x, y = train.loc[vr, best_features], train.loc[vr, target]\n", 1068 | "\n", 1069 | " model = lgb.LGBMRegressor(**params)\n", 1070 | " model.fit(X, Y, eval_set=[(x,y)], verbose=100, early_stopping_rounds=200)\n", 1071 | " pred = model.predict(x)\n", 1072 | " test_pred = model.predict(test[best_features])\n", 1073 | " sc = metric(y, pred)\n", 1074 | " score_oofs.append(sc)\n", 1075 | " test_preds.append(test_pred)\n", 1076 | " feats[f'Fold {i}'] = model.feature_importances_\n", 1077 | " # print('RMSE : {}'.format(sc))\n", 1078 | "feats['Importances'] = feats.mean(axis=1)\n", 1079 | "print()\n", 1080 | "print('CV RMSE : {}'.format(np.mean(score_oofs, axis=0)))" 1081 | ], 1082 | "execution_count": 0, 1083 | "outputs": [] 1084 | }, 1085 | { 1086 | "cell_type": "code", 1087 | "metadata": { 1088 | "trusted": true, 1089 | "id": "H0TQsVrBxu6q", 1090 | "colab_type": "code", 1091 | "colab": {} 1092 | }, 1093 | "source": [ 1094 | "print('CV RMSE : {}'.format(np.mean(score_oofs, axis=0)))" 1095 | ], 1096 | "execution_count": 0, 1097 | "outputs": [] 1098 | }, 1099 | { 1100 | "cell_type": "markdown", 1101 | "metadata": { 1102 | "id": "3sGBtFpTyXfT", 1103 | "colab_type": "text" 1104 
| }, 1105 | "source": [ 1106 | "## On Kaggle, CV RMSE = 22.69329469880266" 1107 | ] 1108 | }, 1109 | { 1110 | "cell_type": "code", 1111 | "metadata": { 1112 | "trusted": true, 1113 | "id": "zsZqw6p_xu6r", 1114 | "colab_type": "code", 1115 | "colab": {} 1116 | }, 1117 | "source": [ 1118 | "test[target] = np.mean(test_preds, axis=0)\n", 1119 | "test[['ID', target]].to_csv('ANOTHERLGB.csv', index=False)" 1120 | ], 1121 | "execution_count": 0, 1122 | "outputs": [] 1123 | }, 1124 | { 1125 | "cell_type": "code", 1126 | "metadata": { 1127 | "trusted": true, 1128 | "id": "0vjmYYbqxu6t", 1129 | "colab_type": "code", 1130 | "colab": {} 1131 | }, 1132 | "source": [ 1133 | "feats.head()" 1134 | ], 1135 | "execution_count": 0, 1136 | "outputs": [] 1137 | }, 1138 | { 1139 | "cell_type": "code", 1140 | "metadata": { 1141 | "trusted": true, 1142 | "id": "xeyGQnUIxu6u", 1143 | "colab_type": "code", 1144 | "colab": {} 1145 | }, 1146 | "source": [ 1147 | "fig = plt.figure(figsize=(20,40))\n", 1148 | "sns.barplot(x='Importances', y='features', data=feats.sort_values(by='Importances', ascending=False));" 1149 | ], 1150 | "execution_count": 0, 1151 | "outputs": [] 1152 | } 1153 | ] 1154 | } -------------------------------------------------------------------------------- /new_data_creation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "language": "python", 7 | "display_name": "Python 3", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "pygments_lexer": "ipython3", 12 | "nbconvert_exporter": "python", 13 | "version": "3.6.4", 14 | "file_extension": ".py", 15 | "codemirror_mode": { 16 | "name": "ipython", 17 | "version": 3 18 | }, 19 | "name": "python", 20 | "mimetype": "text/x-python" 21 | }, 22 | "colab": { 23 | "name": "new_data_creation.ipynb", 24 | "provenance": [], 25 | "include_colab_link": true 26 | } 27 | }, 28 | "cells": [ 29 | { 30 | 
"cell_type": "markdown", 31 | "metadata": { 32 | "id": "view-in-github", 33 | "colab_type": "text" 34 | }, 35 | "source": [ 36 | " $\"Open$ " 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "metadata": { 42 | "id": "lygrd_4THhsP", 43 | "trusted": true, 44 | "colab_type": "code", 45 | "colab": {} 46 | }, 47 | "source": [ 48 | "import pandas as pd \n", 49 | "import numpy as np \n", 50 | "from tqdm import tqdm\n", 51 | "import math\n", 52 | "import gc" 53 | ], 54 | "execution_count": 0, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "metadata": { 60 | "id": "laCuoZYCHhsW", 61 | "trusted": true, 62 | "colab_type": "code", 63 | "colab": {} 64 | }, 65 | "source": [ 66 | "\n", 67 | "pd.set_option('display.max_rows', 500)\n", 68 | "pd.set_option('display.max_columns', 500)\n", 69 | "pd.set_option('display.width', 1000)" 70 | ], 71 | "execution_count": 0, 72 | "outputs": [] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "metadata": { 77 | "id": "tRthNH90Hhsa", 78 | "trusted": true, 79 | "colab_type": "code", 80 | "colab": {} 81 | }, 82 | "source": [ 83 | "train=pd.read_csv(\"../input/airqo-ugandan-air-quality-forecast-challenge-zindi/Train (1).csv\")\n", 84 | "test=pd.read_csv(\"../input/airqo-ugandan-air-quality-forecast-challenge-zindi/Test (1).csv\")" 85 | ], 86 | "execution_count": 0, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "metadata": { 92 | "id": "j_1ClVwsHhsf", 93 | "trusted": true, 94 | "colab_type": "code", 95 | "colab": {} 96 | }, 97 | "source": [ 98 | "train.head()" 99 | ], 100 | "execution_count": 0, 101 | "outputs": [] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "metadata": { 106 | "id": "7lskkR2rHhss", 107 | "trusted": true, 108 | "colab_type": "code", 109 | "colab": {} 110 | }, 111 | "source": [ 112 | "# covert features fron string to List of values \n", 113 | "def replace_nan(x):\n", 114 | " if x==\"nan\":\n", 115 | " return np.nan\n", 116 | " else :\n", 117 | " return float(x)\n", 118 | 
"features=[\"temp\",\"precip\",\"rel_humidity\",\"wind_dir\",\"wind_spd\",\"atmos_press\"]\n" 119 | ], 120 | "execution_count": 0, 121 | "outputs": [] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "metadata": { 126 | "trusted": true, 127 | "id": "zBgcfd_2nDdL", 128 | "colab_type": "code", 129 | "colab": {} 130 | }, 131 | "source": [ 132 | "train1 = train.copy()\n", 133 | "test1 = test.copy()" 134 | ], 135 | "execution_count": 0, 136 | "outputs": [] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "metadata": { 141 | "trusted": true, 142 | "id": "cr3UXTc3nDdO", 143 | "colab_type": "code", 144 | "colab": {} 145 | }, 146 | "source": [ 147 | "# covert features fron string to List of values \n", 148 | "def replace_nan1(x):\n", 149 | " if x==\" \":\n", 150 | " return np.nan\n", 151 | " else :\n", 152 | " return float(x)\n", 153 | "features=[\"temp\",\"precip\",\"rel_humidity\",\"wind_dir\",\"wind_spd\",\"atmos_press\"]\n", 154 | "for feature in features : \n", 155 | " train1[feature]=train1[feature].apply(lambda x: [ replace_nan1(X) for X in x.replace(\"nan\",\" \").split(\",\")])\n", 156 | " test1[feature]=test1[feature].apply(lambda x: [ replace_nan1(X) for X in x.replace(\"nan\",\" \").split(\",\")]) " 157 | ], 158 | "execution_count": 0, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": { 164 | "id": "mApRDmFGHhsz", 165 | "colab_type": "text" 166 | }, 167 | "source": [ 168 | "### Features engineering part" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "metadata": { 174 | "trusted": true, 175 | "id": "ZD2E9Np_nDdR", 176 | "colab_type": "code", 177 | "colab": {} 178 | }, 179 | "source": [ 180 | "def percentile(n):\n", 181 | " def percentile_(x):\n", 182 | " return np.percentile(x, n)\n", 183 | " percentile_.__name__ = 'percentile_%s' % n\n", 184 | " return percentile_" 185 | ], 186 | "execution_count": 0, 187 | "outputs": [] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "metadata": { 192 | "id": "sg8-zujFHhs1", 
193 | "trusted": true, 194 | "colab_type": "code", 195 | "colab": {} 196 | }, 197 | "source": [ 198 | "def aggregate_features(x,col_name):\n", 199 | " x[\"max_\"+col_name]=x[col_name].apply(np.max)\n", 200 | " x[\"min_\"+col_name]=x[col_name].apply(np.min)\n", 201 | " x[\"mean_\"+col_name]=x[col_name].apply(np.mean)\n", 202 | " x[\"std_\"+col_name]=x[col_name].apply(np.std)\n", 203 | " x[\"var_\"+col_name]=x[col_name].apply(np.var)\n", 204 | " x[\"median_\"+col_name]=x[col_name].apply(np.median)\n", 205 | " x[\"ptp_\"+col_name]=x[col_name].apply(np.ptp)\n", 206 | " x[\"p1_\"+col_name]=x[col_name].apply(percentile(1))\n", 207 | " x[\"p5_\"+col_name]=x[col_name].apply(percentile(5))\n", 208 | " x[\"p10_\"+col_name]=x[col_name].apply(percentile(10))\n", 209 | " x[\"p20_\"+col_name]=x[col_name].apply(percentile(20))\n", 210 | " x[\"p30_\"+col_name]=x[col_name].apply(percentile(30))\n", 211 | " x[\"p40_\"+col_name]=x[col_name].apply(percentile(40))\n", 212 | " x[\"p60_\"+col_name]=x[col_name].apply(percentile(60))\n", 213 | " x[\"p70_\"+col_name]=x[col_name].apply(percentile(70))\n", 214 | " x[\"p80_\"+col_name]=x[col_name].apply(percentile(80))\n", 215 | " x[\"p90_\"+col_name]=x[col_name].apply(percentile(90))\n", 216 | " x[\"p95_\"+col_name]=x[col_name].apply(percentile(95))\n", 217 | " x[\"p99_\"+col_name]=x[col_name].apply(percentile(99))\n", 218 | " return x \n", 219 | "def remove_nan_values(x):\n", 220 | " return [e for e in x if not math.isnan(e)]\n" 221 | ], 222 | "execution_count": 0, 223 | "outputs": [] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "metadata": { 228 | "id": "jSHXJxduHhs6", 229 | "trusted": true, 230 | "colab_type": "code", 231 | "colab": {} 232 | }, 233 | "source": [ 234 | "data=pd.concat([train,test],sort=False).reset_index(drop=True)\n", 235 | "data2=pd.concat([train1,test1],sort=False).reset_index(drop=True)\n", 236 | "data2.columns.tolist()" 237 | ], 238 | "execution_count": 0, 239 | "outputs": [] 240 | }, 241 | { 242 | "cell_type": 
"code", 243 | "metadata": { 244 | "trusted": true, 245 | "id": "CG8sz-usnDdZ", 246 | "colab_type": "code", 247 | "colab": {} 248 | }, 249 | "source": [ 250 | "for col_name in tqdm(features):\n", 251 | " data2[col_name]=data2[col_name].apply(remove_nan_values)" 252 | ], 253 | "execution_count": 0, 254 | "outputs": [] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "metadata": { 259 | "trusted": true, 260 | "id": "hw9abEfknDda", 261 | "colab_type": "code", 262 | "colab": {} 263 | }, 264 | "source": [ 265 | "for col_name in tqdm(features):\n", 266 | " data2=aggregate_features(data2,col_name)" 267 | ], 268 | "execution_count": 0, 269 | "outputs": [] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "metadata": { 274 | "trusted": true, 275 | "id": "Rq5JJrUWnDdd", 276 | "colab_type": "code", 277 | "colab": {} 278 | }, 279 | "source": [ 280 | "data2.head(2)" 281 | ], 282 | "execution_count": 0, 283 | "outputs": [] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "metadata": { 288 | "id": "1dYviSGeHhs_", 289 | "trusted": true, 290 | "colab_type": "code", 291 | "colab": {} 292 | }, 293 | "source": [ 294 | "data.shape,data2.shape" 295 | ], 296 | "execution_count": 0, 297 | "outputs": [] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "metadata": { 302 | "trusted": true, 303 | "id": "n77Rz_oCnDdg", 304 | "colab_type": "code", 305 | "colab": {} 306 | }, 307 | "source": [ 308 | "#['temp_'+str(i) for i in temp_df.columns]" 309 | ], 310 | "execution_count": 0, 311 | "outputs": [] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "metadata": { 316 | "trusted": true, 317 | "id": "BBWMTSZ3nDdi", 318 | "colab_type": "code", 319 | "colab": {} 320 | }, 321 | "source": [ 322 | "data1 = pd.DataFrame()\n", 323 | "for col_name in tqdm(features):\n", 324 | " temp_df = data[col_name].apply(lambda x: pd.Series(x.split(',')))\n", 325 | " temp_df.columns = [col_name +str(i) for i in temp_df.columns]\n", 326 | " data1 = pd.concat([data1,temp_df],axis=1)" 327 | ], 328 | "execution_count": 0, 
329 | "outputs": [] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "metadata": { 334 | "trusted": true, 335 | "id": "sA4jCX2mnDdk", 336 | "colab_type": "code", 337 | "colab": {} 338 | }, 339 | "source": [ 340 | "for col_name in tqdm(data1.columns):\n", 341 | " data1[col_name] = data1[col_name].apply(replace_nan)" 342 | ], 343 | "execution_count": 0, 344 | "outputs": [] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "metadata": { 349 | "trusted": true, 350 | "id": "9GmmluWunDdm", 351 | "colab_type": "code", 352 | "colab": {} 353 | }, 354 | "source": [ 355 | "data1" 356 | ], 357 | "execution_count": 0, 358 | "outputs": [] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "metadata": { 363 | "trusted": true, 364 | "id": "bQokampdnDdo", 365 | "colab_type": "code", 366 | "colab": {} 367 | }, 368 | "source": [ 369 | "for col_name in tqdm(data1.columns):\n", 370 | " data1[col_name] = data1[col_name].fillna(0)" 371 | ], 372 | "execution_count": 0, 373 | "outputs": [] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "metadata": { 378 | "trusted": true, 379 | "id": "072VFzctnDdq", 380 | "colab_type": "code", 381 | "colab": {} 382 | }, 383 | "source": [ 384 | "data1.head()" 385 | ], 386 | "execution_count": 0, 387 | "outputs": [] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "metadata": { 392 | "trusted": true, 393 | "id": "zgHkBgQ2nDds", 394 | "colab_type": "code", 395 | "colab": {} 396 | }, 397 | "source": [ 398 | "data11 = data1.copy()\n", 399 | "data11['ID'] = data['ID']" 400 | ], 401 | "execution_count": 0, 402 | "outputs": [] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "metadata": { 407 | "trusted": true, 408 | "id": "ddnjMZ-7nDdt", 409 | "colab_type": "code", 410 | "colab": {} 411 | }, 412 | "source": [ 413 | "data11.head()" 414 | ], 415 | "execution_count": 0, 416 | "outputs": [] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "metadata": { 421 | "trusted": true, 422 | "id": "9onPi6UWnDdv", 423 | "colab_type": "code", 424 | "colab": {} 425 | }, 426 
| "source": [ 427 | "def fun(x):\n", 428 | " if x.split('_')[0] == 'atmos':\n", 429 | " return x" 430 | ], 431 | "execution_count": 0, 432 | "outputs": [] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "metadata": { 437 | "trusted": true, 438 | "id": "O5-PQIwcnDdx", 439 | "colab_type": "code", 440 | "colab": {} 441 | }, 442 | "source": [ 443 | "atmos = list(filter(fun,data11.columns))" 444 | ], 445 | "execution_count": 0, 446 | "outputs": [] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "metadata": { 451 | "trusted": true, 452 | "id": "lBVOMbj-nDdz", 453 | "colab_type": "code", 454 | "colab": {} 455 | }, 456 | "source": [ 457 | "data11 = data11[['ID']+atmos]" 458 | ], 459 | "execution_count": 0, 460 | "outputs": [] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "metadata": { 465 | "trusted": true, 466 | "id": "mWuRl8mKnDd1", 467 | "colab_type": "code", 468 | "colab": {} 469 | }, 470 | "source": [ 471 | "data11_atmos = pd.melt(data11,id_vars=[\"ID\"],var_name='atmos', value_name='value_atmos')\n" 472 | ], 473 | "execution_count": 0, 474 | "outputs": [] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "metadata": { 479 | "trusted": true, 480 | "id": "fS87ZClWnDd4", 481 | "colab_type": "code", 482 | "colab": {} 483 | }, 484 | "source": [ 485 | "for i in tqdm(range(1, 15)):\n", 486 | " data11_atmos[f'magic_{i}'] = data11_atmos.sort_values(by='ID')['value_atmos'].shift(i).expanding().mean().fillna(method='ffill').sort_index()\n", 487 | " data11_atmos[f'magic2_{i}'] = data11_atmos.sort_values(by='ID')['value_atmos'].shift(-i).expanding().mean().fillna(method='bfill').sort_index()" 488 | ], 489 | "execution_count": 0, 490 | "outputs": [] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "metadata": { 495 | "trusted": true, 496 | "id": "xmVcBhHlnDd5", 497 | "colab_type": "code", 498 | "colab": {} 499 | }, 500 | "source": [ 501 | "data11_atmos[data11_atmos.ID == 'ID_train_1'].head(3)" 502 | ], 503 | "execution_count": 0, 504 | "outputs": [] 505 | }, 506 | { 507 | 
"cell_type": "code", 508 | "metadata": { 509 | "trusted": true, 510 | "id": "cpWEAZ_9nDd7", 511 | "colab_type": "code", 512 | "colab": {} 513 | }, 514 | "source": [ 515 | "data11_piv = pd.pivot_table(data11_atmos,index='ID',columns = 'atmos',values = 'magic_1')\n", 516 | "data11_piv.columns = ['atmos_magic_1'+i for i in data11_piv.columns]\n", 517 | "for i in tqdm(range(2,15)):\n", 518 | " temp = pd.pivot_table(data11_atmos,index='ID',columns = 'atmos',values = 'magic_'+str(i))\n", 519 | " temp.columns = ['atmos_magic_'+str(i)+j for j in temp.columns]\n", 520 | " data11_piv = pd.concat([data11_piv,temp],axis=1)" 521 | ], 522 | "execution_count": 0, 523 | "outputs": [] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "metadata": { 528 | "trusted": true, 529 | "id": "2Kt3ZD8unDd-", 530 | "colab_type": "code", 531 | "colab": {} 532 | }, 533 | "source": [ 534 | "data11_piv.reset_index(inplace=True)" 535 | ], 536 | "execution_count": 0, 537 | "outputs": [] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "metadata": { 542 | "trusted": true, 543 | "id": "SXV38vqCnDeA", 544 | "colab_type": "code", 545 | "colab": {} 546 | }, 547 | "source": [ 548 | "data11 = pd.merge(data11,data11_piv,on='ID',how='left')" 549 | ], 550 | "execution_count": 0, 551 | "outputs": [] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "metadata": { 556 | "trusted": true, 557 | "id": "Du4dLdyfnDeC", 558 | "colab_type": "code", 559 | "colab": {} 560 | }, 561 | "source": [ 562 | "m = data1.isnull().sum()\n", 563 | "sum(m[m>0])" 564 | ], 565 | "execution_count": 0, 566 | "outputs": [] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "metadata": { 571 | "trusted": true, 572 | "id": "D65TOOg8nDeE", 573 | "colab_type": "code", 574 | "colab": {} 575 | }, 576 | "source": [ 577 | "data11.head()" 578 | ], 579 | "execution_count": 0, 580 | "outputs": [] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "metadata": { 585 | "id": "aiAM7WA3HhtR", 586 | "trusted": true, 587 | "colab_type": "code", 588 | 
"colab": {} 589 | }, 590 | "source": [ 591 | "data2.drop(features,1,inplace=True)" 592 | ], 593 | "execution_count": 0, 594 | "outputs": [] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "metadata": { 599 | "trusted": true, 600 | "id": "jFlOZI75nDeH", 601 | "colab_type": "code", 602 | "colab": {} 603 | }, 604 | "source": [ 605 | "data2" 606 | ], 607 | "execution_count": 0, 608 | "outputs": [] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "metadata": { 613 | "trusted": true, 614 | "id": "TzKMLGREnDeJ", 615 | "colab_type": "code", 616 | "colab": {} 617 | }, 618 | "source": [ 619 | "data1['ID'] = data['ID']\n", 620 | "data1['target'] = data['target']\n", 621 | "data1['location'] = data['location']" 622 | ], 623 | "execution_count": 0, 624 | "outputs": [] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "metadata": { 629 | "trusted": true, 630 | "id": "4hlTnqkwnDeK", 631 | "colab_type": "code", 632 | "colab": {} 633 | }, 634 | "source": [ 635 | "data1.head()" 636 | ], 637 | "execution_count": 0, 638 | "outputs": [] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "metadata": { 643 | "trusted": true, 644 | "id": "53nZZvVfnDeM", 645 | "colab_type": "code", 646 | "colab": {} 647 | }, 648 | "source": [ 649 | "data1.shape" 650 | ], 651 | "execution_count": 0, 652 | "outputs": [] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "metadata": { 657 | "trusted": true, 658 | "id": "t-3S4WYRnDeO", 659 | "colab_type": "code", 660 | "colab": {} 661 | }, 662 | "source": [ 663 | "# data1 = pd.merge(data1,data11,on='ID',how='inner')\n", 664 | "# data1.shape" 665 | ], 666 | "execution_count": 0, 667 | "outputs": [] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "metadata": { 672 | "id": "b-inscoEHhtU", 673 | "trusted": true, 674 | "colab_type": "code", 675 | "colab": {} 676 | }, 677 | "source": [ 678 | "train=data1[data1.target.notnull()].reset_index(drop=True)\n", 679 | "test=data1[data1.target.isna()].reset_index(drop=True)" 680 | ], 681 | "execution_count": 0, 682 | 
"outputs": [] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "metadata": { 687 | "trusted": true, 688 | "id": "cSDLwzdBnDeR", 689 | "colab_type": "code", 690 | "colab": {} 691 | }, 692 | "source": [ 693 | "train1=data2[data2.target.notnull()].reset_index(drop=True)\n", 694 | "test1=data2[data2.target.isna()].reset_index(drop=True)" 695 | ], 696 | "execution_count": 0, 697 | "outputs": [] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "metadata": { 702 | "id": "j5JqDT3SHhtY", 703 | "trusted": true, 704 | "colab_type": "code", 705 | "colab": {} 706 | }, 707 | "source": [ 708 | "del data,data1,data2\n", 709 | "gc.collect()" 710 | ], 711 | "execution_count": 0, 712 | "outputs": [] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "metadata": { 717 | "id": "h_m0qW-3Hhtb", 718 | "trusted": true, 719 | "colab_type": "code", 720 | "colab": {} 721 | }, 722 | "source": [ 723 | "train.head()" 724 | ], 725 | "execution_count": 0, 726 | "outputs": [] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "metadata": { 731 | "id": "Ycw-LMZvHhtf", 732 | "trusted": true, 733 | "colab_type": "code", 734 | "colab": {} 735 | }, 736 | "source": [ 737 | "train1.drop(['ID','location','target'],axis=1,inplace=True)\n", 738 | "test1.drop(['ID','location','target'],axis=1,inplace=True)" 739 | ], 740 | "execution_count": 0, 741 | "outputs": [] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "metadata": { 746 | "trusted": true, 747 | "id": "3Wp03kuLnDea", 748 | "colab_type": "code", 749 | "colab": {} 750 | }, 751 | "source": [ 752 | "train.shape,test.shape" 753 | ], 754 | "execution_count": 0, 755 | "outputs": [] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "metadata": { 760 | "trusted": true, 761 | "id": "e2fukqsAnDec", 762 | "colab_type": "code", 763 | "colab": {} 764 | }, 765 | "source": [ 766 | "train = pd.concat([train,train1],axis=1)\n", 767 | "test = pd.concat([test,test1],axis=1)\n", 768 | "train.shape,test.shape" 769 | ], 770 | "execution_count": 0, 771 | "outputs": [] 772 | 
}, 773 | { 774 | "cell_type": "code", 775 | "metadata": { 776 | "id": "rXICDSaMHhtk", 777 | "trusted": true, 778 | "colab_type": "code", 779 | "colab": {} 780 | }, 781 | "source": [ 782 | "Experiment_name=\"simple_model\"" 783 | ], 784 | "execution_count": 0, 785 | "outputs": [] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "metadata": { 790 | "trusted": true, 791 | "id": "HIM-oxjJnDef", 792 | "colab_type": "code", 793 | "colab": {} 794 | }, 795 | "source": [ 796 | "train_id = train['ID']\n", 797 | "test_id = test['ID']\n", 798 | "y = train['target']" 799 | ], 800 | "execution_count": 0, 801 | "outputs": [] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "metadata": { 806 | "trusted": true, 807 | "id": "DewPn3Y0nDeh", 808 | "colab_type": "code", 809 | "colab": {} 810 | }, 811 | "source": [ 812 | "#train.drop(['ID','target'],axis=1,inplace=True)\n", 813 | "#test.drop(['ID','target'],axis=1,inplace=True)" 814 | ], 815 | "execution_count": 0, 816 | "outputs": [] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "metadata": { 821 | "trusted": false, 822 | "id": "BTt2Fet-nDei", 823 | "colab_type": "code", 824 | "colab": {} 825 | }, 826 | "source": [ 827 | "# from sklearn.preprocessing import LabelEncoder\n", 828 | "# lab = LabelEncoder()\n", 829 | "# lab.fit(train[\"location\"])\n", 830 | "\n", 831 | "# train.location = lab.transform(train.location)\n", 832 | "# test.location = lab.transform(test.location)" 833 | ], 834 | "execution_count": 0, 835 | "outputs": [] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "metadata": { 840 | "trusted": true, 841 | "id": "Zw0z2yR1nDej", 842 | "colab_type": "code", 843 | "colab": {} 844 | }, 845 | "source": [ 846 | "import category_encoders as ce\n", 847 | "encoder = ce.CatBoostEncoder(cols=[\"location\"])\n", 848 | "encoder.fit(train, y)\n", 849 | "train = encoder.transform(train)\n", 850 | "test = encoder.transform(test)" 851 | ], 852 | "execution_count": 0, 853 | "outputs": [] 854 | }, 855 | { 856 | "cell_type": "code", 
857 | "metadata": { 858 | "trusted": true, 859 | "id": "kF4AcdjknDel", 860 | "colab_type": "code", 861 | "colab": {} 862 | }, 863 | "source": [ 864 | "train.shape,test.shape" 865 | ], 866 | "execution_count": 0, 867 | "outputs": [] 868 | }, 869 | { 870 | "cell_type": "code", 871 | "metadata": { 872 | "trusted": true, 873 | "id": "V1guD8gFnDeo", 874 | "colab_type": "code", 875 | "colab": {} 876 | }, 877 | "source": [ 878 | "X = train.copy()" 879 | ], 880 | "execution_count": 0, 881 | "outputs": [] 882 | }, 883 | { 884 | "cell_type": "code", 885 | "metadata": { 886 | "trusted": true, 887 | "id": "kNMjT4HpnDer", 888 | "colab_type": "code", 889 | "colab": {} 890 | }, 891 | "source": [ 892 | "X['ID'] = train_id\n", 893 | "test['ID'] = test_id\n", 894 | "X['target'] = y" 895 | ], 896 | "execution_count": 0, 897 | "outputs": [] 898 | }, 899 | { 900 | "cell_type": "code", 901 | "metadata": { 902 | "trusted": true, 903 | "id": "Zi6-JGMfnDes", 904 | "colab_type": "code", 905 | "colab": {} 906 | }, 907 | "source": [ 908 | "X.to_csv('train_df.csv',index=False)\n", 909 | "test.to_csv('test_df.csv',index=False)" 910 | ], 911 | "execution_count": 0, 912 | "outputs": [] 913 | }, 914 | { 915 | "cell_type": "markdown", 916 | "metadata": { 917 | "id": "ra9AQc3gnDeu", 918 | "colab_type": "text" 919 | }, 920 | "source": [ 921 | "## **PART 2 // THIS PART TAKES MORE THAN 2 HOURS TO RUN**" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "metadata": { 927 | "trusted": true, 928 | "id": "dfEXW7S9nDeu", 929 | "colab_type": "code", 930 | "colab": {} 931 | }, 932 | "source": [ 933 | "import pandas as pd\n", 934 | "import numpy as np\n", 935 | "import matplotlib.pyplot as plt\n", 936 | "import seaborn as sns\n", 937 | "import warnings\n", 938 | "warnings.filterwarnings('ignore')\n", 939 | "\n", 940 | "from math import sqrt \n", 941 | "import lightgbm as lgb\n", 942 | "from sklearn.metrics import mean_squared_error \n", 943 | "from sklearn.model_selection import KFold, train_test_split\n", 944 
| "\n", 945 | "\n", 946 | "from scipy.sparse import csr_matrix\n", 947 | "import gc\n", 948 | "\n", 949 | "import pandas as pd\n", 950 | "import numpy as np\n", 951 | "import matplotlib.pyplot as plt\n", 952 | "import seaborn as sns\n", 953 | "from xgboost import XGBRegressor\n", 954 | "from catboost import CatBoostRegressor\n", 955 | "from lightgbm import LGBMRegressor\n", 956 | "from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,BaggingRegressor,AdaBoostRegressor,ExtraTreesRegressor\n", 957 | "from sklearn.tree import DecisionTreeRegressor\n", 958 | "from sklearn.linear_model import LinearRegression\n", 959 | "from sklearn.ensemble import StackingRegressor\n", 960 | "#from ngboost import NGBRegressor\n", 961 | "from sklearn.metrics import mean_squared_log_error\n", 962 | "from sklearn.preprocessing import LabelEncoder\n", 963 | "from sklearn.model_selection import KFold,StratifiedKFold, GroupKFold\n", 964 | "import datetime\n", 965 | "from statsmodels.graphics.tsaplots import plot_acf, plot_pacf\n", 966 | "# from fastai.tabular import *\n", 967 | "import warnings\n", 968 | "from tqdm import *\n", 969 | "warnings.filterwarnings(\"ignore\")\n", 970 | "np.random.seed(0)\n", 971 | "pd.set_option('display.max_rows', 500)\n", 972 | "pd.set_option('display.max_columns', 500)\n", 973 | "pd.set_option('display.width', 1000)" 974 | ], 975 | "execution_count": 0, 976 | "outputs": [] 977 | }, 978 | { 979 | "cell_type": "code", 980 | "metadata": { 981 | "trusted": true, 982 | "id": "HmZwAO-9nDex", 983 | "colab_type": "code", 984 | "colab": {} 985 | }, 986 | "source": [ 987 | "# Memory reduction helper function:\n", 988 | "def reduce_mem_usage(df, verbose=True):\n", 989 | " numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", 990 | " start_mem = df.memory_usage().sum() / 1024**2 \n", 991 | " for col in df.columns: #columns\n", 992 | " col_type = df[col].dtypes\n", 993 | " if col_type in numerics: #numerics\n", 994 | " c_min = 
df[col].min()\n", 995 | " c_max = df[col].max()\n", 996 | " if str(col_type)[:3] == 'int':\n", 997 | " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", 998 | " df[col] = df[col].astype(np.int8)\n", 999 | " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", 1000 | " df[col] = df[col].astype(np.int16)\n", 1001 | " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", 1002 | " df[col] = df[col].astype(np.int32)\n", 1003 | " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n", 1004 | " df[col] = df[col].astype(np.int64) \n", 1005 | " else:\n", 1006 | " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", 1007 | " df[col] = df[col].astype(np.float16)\n", 1008 | " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", 1009 | " df[col] = df[col].astype(np.float32)\n", 1010 | " else:\n", 1011 | " df[col] = df[col].astype(np.float64) \n", 1012 | " end_mem = df.memory_usage().sum() / 1024**2\n", 1013 | " if verbose: print('Mem. 
usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))\n", 1014 | " return df" 1015 | ], 1016 | "execution_count": 0, 1017 | "outputs": [] 1018 | }, 1019 | { 1020 | "cell_type": "code", 1021 | "metadata": { 1022 | "trusted": true, 1023 | "id": "XAJLlciBnDey", 1024 | "colab_type": "code", 1025 | "colab": {} 1026 | }, 1027 | "source": [ 1028 | "train =X\n", 1029 | "test=test.drop('target',1)\n", 1030 | "train.shape,test.shape" 1031 | ], 1032 | "execution_count": 0, 1033 | "outputs": [] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "metadata": { 1038 | "trusted": true, 1039 | "id": "4XpTRUoQnDez", 1040 | "colab_type": "code", 1041 | "colab": {} 1042 | }, 1043 | "source": [ 1044 | "train = reduce_mem_usage(train)\n", 1045 | "test = reduce_mem_usage(test)" 1046 | ], 1047 | "execution_count": 0, 1048 | "outputs": [] 1049 | }, 1050 | { 1051 | "cell_type": "code", 1052 | "metadata": { 1053 | "trusted": true, 1054 | "id": "HVykT3F9nDe2", 1055 | "colab_type": "code", 1056 | "colab": {} 1057 | }, 1058 | "source": [ 1059 | "temp = [\"temp{}\".format(i) for i in range(121)]\n", 1060 | "precip = [\"precip{}\".format(i) for i in range(121)]\n", 1061 | "rel_humidity = [\"rel_humidity{}\".format(i) for i in range(121)]\n", 1062 | "wind_dir = [\"wind_dir{}\".format(i) for i in range(121)]\n", 1063 | "wind_spd = [\"wind_spd{}\".format(i) for i in range(121)]\n", 1064 | "atmos_press = [\"atmos_press{}\".format(i) for i in range(121)]" 1065 | ], 1066 | "execution_count": 0, 1067 | "outputs": [] 1068 | }, 1069 | { 1070 | "cell_type": "code", 1071 | "metadata": { 1072 | "trusted": true, 1073 | "id": "U08Kp65bnDe3", 1074 | "colab_type": "code", 1075 | "colab": {} 1076 | }, 1077 | "source": [ 1078 | "rem_org = list(set(test.columns) - set(temp+precip+rel_humidity+wind_dir+wind_spd+atmos_press))\n", 1079 | "len(rem_org)" 1080 | ], 1081 | "execution_count": 0, 1082 | "outputs": [] 1083 | }, 1084 | { 1085 | "cell_type": "code", 
# %% [cell Nk4bL20MnDe5]
import re


def fun(x):
    """Return the first run of digits found in the column name ``x`` as an int.

    Used below to turn melted column names built as ``"temp{}".format(i)``
    (and the analogous names for the other features) into the integer
    offset ``i``.  Raises ``IndexError`` if ``x`` contains no digit.
    """
    return int(re.findall(r'\d+', x)[0])


# %% [cell hzy9rVzCnDe6]
# Value-column names for the non-temperature features, in the same order as
# the column-name lists iterated over in _melt_hourly.
l = ['precip', 'rel_humidity', 'wind_dir', 'wind_spd', 'atmos_press']


# %% [cell P0sg2E8xnDe7]
def _melt_hourly(frame):
    """Reshape one wide frame into long form, one row per (ID, offset ``d``).

    ``frame`` must contain ``ID`` plus every column named in the global lists
    ``temp``, ``precip``, ``rel_humidity``, ``wind_dir``, ``wind_spd`` and
    ``atmos_press``.  Each feature family is melted separately and the pieces
    are inner-joined on ``['ID', 'd']``, so the result has the columns
    ``ID, d, temp, precip, rel_humidity, wind_dir, wind_spd, atmos_press``.

    NOTE(review): the inner join assumes ``ID`` is unique per row of
    ``frame``; duplicated IDs would multiply rows -- confirm against the data.
    """
    long_df = frame[['ID'] + temp].melt(id_vars=["ID"], var_name="d", value_name='temp')
    long_df['d'] = long_df['d'].apply(fun)
    feature_columns = [precip, rel_humidity, wind_dir, wind_spd, atmos_press]
    for n, (value_name, columns) in enumerate(zip(l, feature_columns)):
        part = frame[['ID'] + columns].melt(id_vars=["ID"], var_name="d", value_name=value_name)
        part['d'] = part['d'].apply(fun)
        long_df = pd.merge(long_df, part, on=['ID', 'd'], how='inner')
        print('*'*8, n)  # progress marker, one line per merged feature
    return long_df


data = _melt_hourly(train)

# %% [cell npYQ54HpnDe8]
data_t = _melt_hourly(test)

# %% [cell phNrAka9nDe-]
# Stack train and test so every engineered feature is computed identically
# for both; the boolean 'train' column remembers where each row came from.
data['train'] = True
data_t['train'] = False

data = pd.concat([data, data_t], ignore_index=True)
data['train'].value_counts()

# %% [cell 75YPN5stnDe_]
data.head()


# %% [cell XiPKBJUWnDfA]
def _hour_of_day(h):
    """Fold an offset ``h`` onto 1..24 (offsets <= 0 are returned unchanged).

    Offsets 1..24 map to themselves, 25..48 map to 1..24 again, and so on.
    Non-positive offsets stay as they are so that, exactly like the original
    ``h <= limit`` tests, they always land in the first bucket.
    """
    return h if h <= 0 else (h - 1) % 24 + 1


def _bucket(h, limits, labels):
    """Return the label of the first ``limit`` with ``_hour_of_day(h) <= limit``.

    ``labels`` has one more entry than ``limits``; the extra, last label is
    the fall-through bucket.
    """
    hour = _hour_of_day(h)
    for limit, label in zip(limits, labels):
        if hour <= limit:
            return label
    return labels[-1]


def roll1(h):
    """Day label for offset ``h``: 0..24 -> 'day1', 25..48 -> 'day2', ...

    Results for offsets up to 120 match the original if/elif chain.  The
    original returned ``None`` for larger offsets; this version keeps
    counting ('day6', ...) instead of producing a missing label.
    """
    return 'day{}'.format(max(h - 1, 0) // 24 + 1)


def roll2(h):
    """Four 6-hour buckets per day: 'Morning', 'Noon', 'Evening', 'Night'."""
    return _bucket(h, (6, 12, 18), ('Morning', 'Noon', 'Evening', 'Night'))


def roll3(h):
    """Eight 3-hour buckets per day: 'Mor1', 'Mor2', ..., 'Nig1', 'Nig2'."""
    return _bucket(h, (3, 6, 9, 12, 15, 18, 21),
                   ('Mor1', 'Mor2', 'Noo1', 'Noo2', 'Eve1', 'Eve2', 'Nig1', 'Nig2'))


def roll4(h):
    """Three 8-hour buckets per day: 'First', 'Second', 'Third'."""
    return _bucket(h, (8, 16), ('First', 'Second', 'Third'))


def roll5(h):
    """Two 12-hour buckets per day: 'F_half', 'S_half'."""
    return _bucket(h, (12,), ('F_half', 'S_half'))


# %% [cell HUtseFywnDfB]
data['type_of_day3'] = data['d'].apply(roll5)
data['type_of_day2'] = data['d'].apply(roll4)
data['type_of_day1'] = data['d'].apply(roll3)
data['type_of_day'] = data['d'].apply(roll2)
data['day'] = data['d'].apply(roll1)

# %% [cell W5n5GQWMnDfC]
# Sanity check: row counts per source, day and half-day.
data.groupby(['train', 'day', 'type_of_day3']).count()['temp']

# %% [cell 2mX5kNPvnDfG]
# Sanity check: which offsets fall into the morning bucket of day 5 (train rows).
data[data['train'] & (data['day'] == 'day5') & (data['type_of_day'] == 'Morning')].groupby('d').count()['temp']

# %% [cell U3fivT72nDfH]
data = reduce_mem_usage(data)
# %% [cell h2QQZFk7nDfI]
def percentile(n):
    """Build an aggregation function returning the ``n``-th percentile.

    The returned callable is named ``'percentile_<n>'`` so that
    ``DataFrame.agg`` produces a distinct, readable column label for every
    percentile requested.
    """
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_


# %% [cells beNOLTNVnDfK .. Ox43UwZmnDfW]
# The notebook ran each aggregation in its own cell under the %%time magic;
# the magic is dropped here so the block is plain Python.

# Numeric feature columns of the long frame.  Selecting them explicitly keeps
# the string label columns ('day', 'type_of_day', ...) out of .agg(): 'mean'
# is undefined for them, which old pandas hid by silently dropping such
# columns and which current pandas reports as an error.
WEATHER_COLS = ['temp', 'precip', 'rel_humidity', 'wind_dir', 'wind_spd', 'atmos_press']
PERCENTILES = (1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99)


def _aggregate_by(frame, key):
    """Summarise every weather column per (ID, ``key``) group.

    Returns a flat frame with columns ``ID``, ``key`` and one
    ``<feature>_<stat>`` column per feature and statistic (mean, max, min,
    std and the percentiles listed in PERCENTILES).
    """
    stats = ['mean', 'max', 'min', 'std'] + [percentile(q) for q in PERCENTILES]
    grouped = frame.groupby(['ID', key])[WEATHER_COLS].agg(stats)
    grouped.columns = ['_'.join(col).strip() for col in grouped.columns.values]
    return grouped.reset_index()


grp_data1 = _aggregate_by(data, 'day')
grp_data1.head()

grp_data2 = _aggregate_by(data, 'type_of_day')
grp_data2.head()

#data['day_type_of_day'] = data['day']+ '_' + data['type_of_day']
grp_data3 = _aggregate_by(data, 'type_of_day1')
grp_data3.shape

grp_data4 = _aggregate_by(data, 'type_of_day2')
grp_data4.shape

#data['day_type_of_day1'] = data['day']+ '_' + data['type_of_day1']
grp_data5 = _aggregate_by(data, 'type_of_day3')
grp_data5.shape

# %% [cell muiCFc8enDfZ]
grp_data5


# %% [cells mpICUUQenDfb .. Yf3BFI8bnDfm]
def _pivot_wide(grouped, key):
    """Spread the per-(ID, ``key``) statistics into one row per ID.

    Every statistic column ``s`` of ``grouped`` becomes one column per
    category ``c`` of ``key``, named ``s + c`` (e.g. ``'temp_meanday1'``).
    As in the original cell, the ``temp_mean`` pivot defines which IDs are
    kept and all other statistics are left-aligned onto it.  The pieces are
    joined with a single ``concat`` instead of one ``merge`` per statistic,
    and the loop no longer rebinds the global ``temp`` column-name list.
    """
    others = [c for c in grouped.columns if c not in ('ID', key, 'temp_mean')]
    pieces = []
    for stat in ['temp_mean'] + list(tqdm(others)):
        piece = pd.pivot_table(data=grouped, index='ID', columns=key, values=stat)
        piece.columns = [stat + category for category in piece.columns]
        pieces.append(piece)
    base_index = pieces[0].index
    # reindex == left join of each piece onto the IDs of the temp_mean pivot
    wide = pd.concat([piece.reindex(base_index) for piece in pieces], axis=1)
    return wide.reset_index()


grp_data11 = _pivot_wide(grp_data1, 'day')
grp_data11_col = list(grp_data11.columns)

grp_data21 = _pivot_wide(grp_data2, 'type_of_day')
grp_data21_col = list(grp_data21.columns)

grp_data31 = _pivot_wide(grp_data3, 'type_of_day1')
grp_data31_col = list(grp_data31.columns)

grp_data41 = _pivot_wide(grp_data4, 'type_of_day2')
grp_data41_col = list(grp_data41.columns)

grp_data51 = _pivot_wide(grp_data5, 'type_of_day3')
grp_data51_col = list(grp_data51.columns)

# %% [cell p6hV54JvnDfo]
# Inner-join the five wide frames on ID into one engineered-feature table.
grp_data_all = grp_data11
for wide_frame in (grp_data21, grp_data31, grp_data41, grp_data51):
    grp_data_all = pd.merge(grp_data_all, wide_frame, on='ID')
grp_data_all.shape

# %% [cell eYYTPQE7nDfp]
# Left joins keep every original train/test row even if it has no
# engineered features.
train_df = pd.merge(train, grp_data_all, on='ID', how='left')
test_df = pd.merge(test, grp_data_all, on='ID', how='left')
train_df.shape, test_df.shape, train.shape, test.shape

# %% [cell Vo3f1vLunDfq]
# NOTE(review): indianda.ipynb loads 'train_df.csv' / 'test_df.csv'; confirm
# whether these two files are renamed before that notebook is run.
train_df.to_csv('final_train_df.csv', index=False)
test_df.to_csv('final_test_df.csv', index=False)

# %% [cell oVRYGGGjnDfq]
# Each name list is written as a one-column frame, so the CSV header is '0'.
pd.DataFrame(rem_org).to_csv('rem_org.csv', index=False)
pd.DataFrame(grp_data11_col).to_csv('grp_data11_col.csv', index=False)
pd.DataFrame(grp_data21_col).to_csv('grp_data21_col.csv', index=False)
pd.DataFrame(grp_data31_col).to_csv('grp_data31_col.csv', index=False)
pd.DataFrame(grp_data41_col).to_csv('grp_data41_col.csv', index=False)
"pd.DataFrame(grp_data51_col).to_csv('grp_data51_col.csv',index=False)\n" 1767 | ], 1768 | "execution_count": 0, 1769 | "outputs": [] 1770 | }, 1771 | { 1772 | "cell_type": "code", 1773 | "metadata": { 1774 | "trusted": true, 1775 | "id": "adpmLxbSnDfu", 1776 | "colab_type": "code", 1777 | "outputId": "bfeceb2f-8f95-4b86-8ba9-6f355a888f95", 1778 | "colab": {} 1779 | }, 1780 | "source": [ 1781 | "train_df.shape,test_df.shape,train.shape,test.shape" 1782 | ], 1783 | "execution_count": 0, 1784 | "outputs": [ 1785 | { 1786 | "output_type": "execute_result", 1787 | "data": { 1788 | "text/plain": [ 1789 | "((15539, 3087), (5035, 3086), (15539, 843), (5035, 842))" 1790 | ] 1791 | }, 1792 | "metadata": { 1793 | "tags": [] 1794 | }, 1795 | "execution_count": 102 1796 | } 1797 | ] 1798 | } 1799 | ] 1800 | } -------------------------------------------------------------------------------- /indianda.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.7.6" 21 | }, 22 | "colab": { 23 | "name": "F7_33_29.ipynb", 24 | "provenance": [], 25 | "collapsed_sections": [], 26 | "include_colab_link": true 27 | } 28 | }, 29 | "cells": [ 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "id": "view-in-github", 34 | "colab_type": "text" 35 | }, 36 | "source": [ 37 | " $\"Open$ " 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "metadata": { 43 | "id": "28tisEbR66ru", 44 | "colab_type": "code", 45 | "outputId": "95b52c1b-9542-455c-bc29-d52b6a7a5c64", 46 | "colab": { 47 | "base_uri": 
"https://localhost:8080/", 48 | "height": 343 49 | } 50 | }, 51 | "source": [ 52 | "pip install catboost" 53 | ], 54 | "execution_count": 0, 55 | "outputs": [ 56 | { 57 | "output_type": "stream", 58 | "text": [ 59 | "Collecting catboost\n", 60 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/b2/aa/e61819d04ef2bbee778bf4b3a748db1f3ad23512377e43ecfdc3211437a0/catboost-0.23.2-cp36-none-manylinux1_x86_64.whl (64.8MB)\n", 61 | "\u001b[K |████████████████████████████████| 64.8MB 122kB/s \n", 62 | "\u001b[?25hRequirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.6/dist-packages (from catboost) (1.0.3)\n", 63 | "Requirement already satisfied: plotly in /usr/local/lib/python3.6/dist-packages (from catboost) (4.4.1)\n", 64 | "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from catboost) (1.12.0)\n", 65 | "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from catboost) (1.4.1)\n", 66 | "Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from catboost) (1.18.4)\n", 67 | "Requirement already satisfied: graphviz in /usr/local/lib/python3.6/dist-packages (from catboost) (0.10.1)\n", 68 | "Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from catboost) (3.2.1)\n", 69 | "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.24.0->catboost) (2018.9)\n", 70 | "Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.24.0->catboost) (2.8.1)\n", 71 | "Requirement already satisfied: retrying>=1.3.3 in /usr/local/lib/python3.6/dist-packages (from plotly->catboost) (1.3.3)\n", 72 | "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (0.10.0)\n", 73 | "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from 
# %% [cell Bt7cbLC6m9NS]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from math import sqrt
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split


from scipy.sparse import csr_matrix
import gc

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,BaggingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
#from ngboost import NGBRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold,StratifiedKFold, GroupKFold
import datetime
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# from fastai.tabular import *
import warnings
warnings.filterwarnings("ignore")
np.random.seed(0)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


# %% [cell iIXDQzmam9NZ]
# Memory reduction helper function:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    ``df`` is modified in place and also returned.  Integer columns are cast
    to the first of int8/int16/int32/int64 whose range strictly contains the
    column's min and max; float columns to float16, then float32, otherwise
    float64.  Non-numeric columns are left untouched.  With ``verbose`` the
    resulting size and the percentage saved are printed.

    NOTE(review): float16 keeps only about three significant digits, so the
    downcast is lossy for feature values -- confirm this is acceptable.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # nothing narrower fits (or min/max are NaN): use float64
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


# %% [cell o0q54GGtm9Nc]
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

# download it (will only work for files < 2MB or so)
def create_download_link(df, title = "Download CSV file", filename = "rf.csv"):
    """Return an ``HTML`` object showing a link that downloads ``df`` as CSV.

    The frame is serialised without its index, base64-encoded and embedded in
    a ``data:`` URI, so no file is written.  ``title`` is the link text and
    ``filename`` the name suggested to the browser.

    The template previously was just ``'{title}'``: ``payload`` and
    ``filename`` were passed to ``format`` but never used, so only plain text
    was rendered.  The anchor element is restored here.
    """
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)


# %% [cell 16tEwM7Zy2LT]
# Colab only: asks for an authorization code, then mounts Google Drive.
from google.colab import drive
drive.mount('/content/drive')

# %% [cell Ijyk-OtEm9Nf]
path = '/content/drive/My Drive/'

# %% [cell 3KlNJQizm9Ni]
# NOTE(review): new_data_creation.ipynb writes 'final_train_df.csv' and
# 'final_test_df.csv'; confirm these are the same files under another name.
train_df = pd.read_csv(path+'train_df.csv')
test_df = pd.read_csv(path+'test_df.csv')
train_df.shape,test_df.shape
# recorded output: ((15539, 3087), (5035, 3086))

# %% [cell UcJaZ8Gqm9Nq]
# Names of the raw per-offset feature columns (offsets 0..120).
temp = ["temp{}".format(i) for i in range(121)]
precip = ["precip{}".format(i) for i in range(121)]
rel_humidity = ["rel_humidity{}".format(i) for i in range(121)]
wind_dir = ["wind_dir{}".format(i) for i in range(121)]
wind_spd = ["wind_spd{}".format(i) for i in range(121)]
atmos_press = ["atmos_press{}".format(i) for i in range(121)]


# %% [cell 5e37zAVpm9Nu]
def _read_name_list(filename):
    """Load a saved list of column names from the CSV's single '0' column."""
    return list(pd.read_csv(path + filename)['0'].values)


grp_data11_col = _read_name_list('grp_data11_col.csv')
grp_data21_col = _read_name_list('grp_data21_col.csv')
grp_data31_col = _read_name_list('grp_data31_col.csv')
grp_data41_col = _read_name_list('grp_data41_col.csv')
grp_data51_col = _read_name_list('grp_data51_col.csv')
rem_org = _read_name_list('rem_org.csv')

# %% [cell kwukjWmwm9Nz]
len(rem_org),len(grp_data11_col),len(grp_data21_col),len(grp_data31_col),len(grp_data41_col),len(grp_data51_col)
# recorded output: (116, 511, 409, 817, 307, 205)


# %% [cell LEvsmujUm9N4]
def _unique(*name_lists):
    """Concatenate the lists, dropping repeats but keeping first-seen order.

    Replaces ``list(set(...))``: the members are the same, but the order no
    longer depends on string hashing, so the feature-column order is the same
    on every run.
    """
    ordered = {}
    for names in name_lists:
        for name in names:
            ordered.setdefault(name, None)
    return list(ordered)


f1 = _unique(temp, precip, rel_humidity, wind_dir, wind_spd, atmos_press, rem_org)  # original columns
f2 = _unique(f1, grp_data11_col)   # + day
f3 = _unique(f1, grp_data21_col)   # + type_of_day
f4 = _unique(f1, grp_data31_col)   # + type_of_day1
f5 = _unique(f1, grp_data41_col)   # + type_of_day2
f6 = _unique(f1, grp_data51_col)   # + type_of_day3
f7 = _unique(f1, f3, f5)           # + type_of_day, type_of_day2
f8 = _unique(f1, f3, f6)           # + type_of_day, type_of_day3
f9 = _unique(f1, f4, f5)           # + type_of_day1, type_of_day2
f10 = _unique(f1, f4, f6)          # + type_of_day1, type_of_day3
f11 = _unique(f1, f5, f6)          # + type_of_day2, type_of_day3
f12 = _unique(f1, f3, f5, f6)      # + type_of_day, type_of_day2, type_of_day3
f13 = _unique(f1, f4, f5, f6)      # + type_of_day1, type_of_day2, type_of_day3
f14 = _unique(f1, f2, f4, f5, f3, f6)  # + every engineered group

# %% [cell 5Hb8Jukxm9N6]
train = train_df[f7+['target']]
test = test_df[f7]
train.shape,test.shape
# recorded output: ((15539, 1557), (5035, 1556))

# %% [cell jfT0g2KSm9OA]
train_id = train['ID']
test_id = test['ID']
y = train['target']

# Rebind instead of drop(inplace=True): train/test are selections taken from
# train_df/test_df, and in-place edits on such selections trigger pandas'
# chained-assignment warning (hidden above by warnings.filterwarnings).
train = train.drop(['ID','target'],axis=1)
test = test.drop('ID',axis=1)
train.shape,test.shape
# recorded output: ((15539, 1555), (5035, 1555))

# %% [cell 6FPdKg3d5xsq]
forecastwindte=pd.read_csv('/content/winddirforecasttest ADD THOSE FEATURES TO TEST.csv')
forecastwindtr=pd.read_csv('/content/winddirforecasttrain ADD THOSE COLUMNS TO TRAIN.csv')

# %% [cell liV-ze6x1EZe]
X = train.copy()
Xtest = test.copy()

# %% [cell W9Lwdhd3pv4M]
# Append the forecast columns; rows are matched by index label.
# NOTE(review): presumably both forecast files are in the same row order as
# train_df/test_df -- verify, since nothing here joins on ID.
for i in forecastwindte.columns :
    X[i]=forecastwindtr[i]
    Xtest[i]=forecastwindte[i]
"base_uri": "https://localhost:8080/", 500 | "height": 51 501 | } 502 | }, 503 | "source": [ 504 | "%%time\n", 505 | "from lightgbm import LGBMRegressor\n", 506 | "lgb = LGBMRegressor()\n", 507 | "lgb.fit(X,y)\n", 508 | "\n", 509 | "imp = pd.DataFrame(lgb.feature_importances_,index=X.columns)\n", 510 | "l = list(imp[imp[0]>3].index)\n", 511 | "len(l)" 512 | ], 513 | "execution_count": 0, 514 | "outputs": [ 515 | { 516 | "output_type": "stream", 517 | "text": [ 518 | "CPU times: user 1min 10s, sys: 119 ms, total: 1min 11s\n", 519 | "Wall time: 36.8 s\n" 520 | ], 521 | "name": "stdout" 522 | } 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "metadata": { 528 | "id": "o4AbdDPW7-62", 529 | "colab_type": "code", 530 | "outputId": "f2eb1295-08a0-4018-b962-f92967e6d81f", 531 | "colab": { 532 | "base_uri": "https://localhost:8080/", 533 | "height": 34 534 | } 535 | }, 536 | "source": [ 537 | "len(l),X.shape" 538 | ], 539 | "execution_count": 0, 540 | "outputs": [ 541 | { 542 | "output_type": "execute_result", 543 | "data": { 544 | "text/plain": [ 545 | "(251, (15539, 1573))" 546 | ] 547 | }, 548 | "metadata": { 549 | "tags": [] 550 | }, 551 | "execution_count": 115 552 | } 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "metadata": { 558 | "id": "XFezR9QtJr03", 559 | "colab_type": "code", 560 | "outputId": "1c23c1af-cbe0-4a21-92a1-1070abd88fd6", 561 | "colab": { 562 | "base_uri": "https://localhost:8080/", 563 | "height": 34 564 | } 565 | }, 566 | "source": [ 567 | "X[l].shape" 568 | ], 569 | "execution_count": 0, 570 | "outputs": [ 571 | { 572 | "output_type": "execute_result", 573 | "data": { 574 | "text/plain": [ 575 | "(15539, 251)" 576 | ] 577 | }, 578 | "metadata": { 579 | "tags": [] 580 | }, 581 | "execution_count": 116 582 | } 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "metadata": { 588 | "id": "cJBxSH73m9OL", 589 | "colab_type": "code", 590 | "outputId": "47da5b3d-1904-47c4-a9a8-d9f13ad871d7", 591 | "colab": { 592 | "base_uri": 
"https://localhost:8080/", 593 | "height": 1000 594 | } 595 | }, 596 | "source": [ 597 | "from catboost import CatBoostRegressor\n", 598 | "errcb2=[]\n", 599 | "y_pred_totcb2=[]\n", 600 | "from sklearn.model_selection import KFold,StratifiedKFold, TimeSeriesSplit\n", 601 | "from sklearn.metrics import mean_squared_error\n", 602 | "fold=KFold(n_splits=20)#15#5#10\n", 603 | "i=1\n", 604 | "for train_index, test_index in fold.split(X,y):\n", 605 | " X_train, X_test = X[l].values[train_index], X[l].values[test_index]\n", 606 | " y_train, y_test = y.values[train_index], y.values[test_index]\n", 607 | " m2 = CatBoostRegressor(n_estimators=5000,eval_metric='RMSE',learning_rate=0.175, random_seed= 42, use_best_model=True )\n", 608 | " m2.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=200,verbose=200)\n", 609 | " preds=m2.predict(X_test)\n", 610 | " print(\"err: \",np.sqrt(mean_squared_error(y_test,preds)))\n", 611 | " errcb2.append(np.sqrt(mean_squared_error(y_test,preds)))\n", 612 | " p2 = m2.predict(Xtest[l])\n", 613 | " y_pred_totcb2.append(p2)\n", 614 | "np.mean(errcb2)" 615 | ], 616 | "execution_count": 0, 617 | "outputs": [ 618 | { 619 | "output_type": "stream", 620 | "text": [ 621 | "0:\tlearn: 40.0876723\ttest: 40.0876723\ttest1: 40.8671914\tbest: 40.8671914 (0)\ttotal: 82.9ms\tremaining: 6m 54s\n", 622 | "200:\tlearn: 18.7261668\ttest: 18.7261668\ttest1: 24.3206390\tbest: 24.2671957 (195)\ttotal: 15.7s\tremaining: 6m 14s\n", 623 | "400:\tlearn: 14.0273746\ttest: 14.0273746\ttest1: 23.2844559\tbest: 23.2426862 (384)\ttotal: 31.2s\tremaining: 5m 58s\n", 624 | "600:\tlearn: 11.1481095\ttest: 11.1481095\ttest1: 22.9916124\tbest: 22.9214612 (544)\ttotal: 46.8s\tremaining: 5m 42s\n", 625 | "800:\tlearn: 9.1067324\ttest: 9.1067324\ttest1: 22.9154239\tbest: 22.8916184 (796)\ttotal: 1m 2s\tremaining: 5m 26s\n", 626 | "1000:\tlearn: 7.5375543\ttest: 7.5375543\ttest1: 22.8136649\tbest: 22.8033906 (983)\ttotal: 1m 17s\tremaining: 5m 
10s\n", 627 | "1200:\tlearn: 6.2964674\ttest: 6.2964674\ttest1: 22.7658360\tbest: 22.7564960 (1192)\ttotal: 1m 33s\tremaining: 4m 54s\n", 628 | "1400:\tlearn: 5.3083733\ttest: 5.3083733\ttest1: 22.7173545\tbest: 22.6998592 (1353)\ttotal: 1m 49s\tremaining: 4m 41s\n", 629 | "1600:\tlearn: 4.4913219\ttest: 4.4913219\ttest1: 22.6589477\tbest: 22.6589477 (1600)\ttotal: 2m 5s\tremaining: 4m 27s\n", 630 | "1800:\tlearn: 3.8271519\ttest: 3.8271519\ttest1: 22.6417856\tbest: 22.6403727 (1793)\ttotal: 2m 21s\tremaining: 4m 10s\n", 631 | "2000:\tlearn: 3.2675556\ttest: 3.2675556\ttest1: 22.6056521\tbest: 22.6013030 (1996)\ttotal: 2m 36s\tremaining: 3m 55s\n", 632 | "2200:\tlearn: 2.7916963\ttest: 2.7916963\ttest1: 22.5825413\tbest: 22.5472933 (2089)\ttotal: 2m 52s\tremaining: 3m 39s\n", 633 | "Stopped by overfitting detector (200 iterations wait)\n", 634 | "\n", 635 | "bestTest = 22.54729334\n", 636 | "bestIteration = 2089\n", 637 | "\n", 638 | "Shrink model to first 2090 iterations.\n", 639 | "err: 22.54729355867787\n", 640 | "0:\tlearn: 40.2335396\ttest: 40.2335396\ttest1: 37.5305630\tbest: 37.5305630 (0)\ttotal: 79.2ms\tremaining: 6m 35s\n", 641 | "200:\tlearn: 18.7921257\ttest: 18.7921257\ttest1: 22.5270938\tbest: 22.5066513 (193)\ttotal: 15.5s\tremaining: 6m 10s\n", 642 | "400:\tlearn: 14.1558333\ttest: 14.1558333\ttest1: 21.6030513\tbest: 21.5935892 (389)\ttotal: 30.9s\tremaining: 5m 54s\n", 643 | "600:\tlearn: 11.2241101\ttest: 11.2241101\ttest1: 21.2615330\tbest: 21.2415566 (587)\ttotal: 46.4s\tremaining: 5m 39s\n", 644 | "800:\tlearn: 9.1447559\ttest: 9.1447559\ttest1: 21.1092259\tbest: 21.0980380 (788)\ttotal: 1m 1s\tremaining: 5m 24s\n", 645 | "1000:\tlearn: 7.5405212\ttest: 7.5405212\ttest1: 20.9789300\tbest: 20.9686746 (996)\ttotal: 1m 17s\tremaining: 5m 9s\n", 646 | "1200:\tlearn: 6.2914202\ttest: 6.2914202\ttest1: 20.9586354\tbest: 20.9396207 (1183)\ttotal: 1m 32s\tremaining: 4m 53s\n", 647 | "1400:\tlearn: 5.2881884\ttest: 5.2881884\ttest1: 20.9648652\tbest: 
20.9364999 (1228)\ttotal: 1m 48s\tremaining: 4m 38s\n", 648 | "Stopped by overfitting detector (200 iterations wait)\n", 649 | "\n", 650 | "bestTest = 20.93649987\n", 651 | "bestIteration = 1228\n", 652 | "\n", 653 | "Shrink model to first 1229 iterations.\n", 654 | "err: 20.936499717501444\n", 655 | "0:\tlearn: 40.2699643\ttest: 40.2699643\ttest1: 40.8415667\tbest: 40.8415667 (0)\ttotal: 76.5ms\tremaining: 6m 22s\n", 656 | "200:\tlearn: 18.6753351\ttest: 18.6753351\ttest1: 27.4187183\tbest: 27.4088922 (193)\ttotal: 15.5s\tremaining: 6m 10s\n", 657 | "400:\tlearn: 13.9656204\ttest: 13.9656204\ttest1: 26.2749454\tbest: 26.2749454 (400)\ttotal: 31s\tremaining: 5m 55s\n", 658 | "600:\tlearn: 11.1377151\ttest: 11.1377151\ttest1: 25.7939350\tbest: 25.7758090 (597)\ttotal: 46.6s\tremaining: 5m 40s\n", 659 | "800:\tlearn: 9.0698954\ttest: 9.0698954\ttest1: 25.6473063\tbest: 25.6330944 (796)\ttotal: 1m 2s\tremaining: 5m 26s\n", 660 | "1000:\tlearn: 7.4735173\ttest: 7.4735173\ttest1: 25.6020918\tbest: 25.5582752 (956)\ttotal: 1m 17s\tremaining: 5m 10s\n", 661 | "1200:\tlearn: 6.2397293\ttest: 6.2397293\ttest1: 25.4641451\tbest: 25.4639263 (1199)\ttotal: 1m 33s\tremaining: 4m 55s\n", 662 | "1400:\tlearn: 5.2201605\ttest: 5.2201605\ttest1: 25.4301693\tbest: 25.4189535 (1337)\ttotal: 1m 48s\tremaining: 4m 39s\n", 663 | "1600:\tlearn: 4.4155659\ttest: 4.4155659\ttest1: 25.4454091\tbest: 25.3828358 (1479)\ttotal: 2m 4s\tremaining: 4m 24s\n", 664 | "Stopped by overfitting detector (200 iterations wait)\n", 665 | "\n", 666 | "bestTest = 25.38283585\n", 667 | "bestIteration = 1479\n", 668 | "\n", 669 | "Shrink model to first 1480 iterations.\n", 670 | "err: 25.382835740606435\n", 671 | "0:\tlearn: 40.0010233\ttest: 40.0010233\ttest1: 42.8422200\tbest: 42.8422200 (0)\ttotal: 77.1ms\tremaining: 6m 25s\n", 672 | "200:\tlearn: 18.5276182\ttest: 18.5276182\ttest1: 26.6084511\tbest: 26.5904701 (199)\ttotal: 15.5s\tremaining: 6m 11s\n", 673 | "400:\tlearn: 13.9511841\ttest: 
13.9511841\ttest1: 25.2719486\tbest: 25.2719486 (400)\ttotal: 31s\tremaining: 5m 55s\n", 674 | "600:\tlearn: 11.0352126\ttest: 11.0352126\ttest1: 24.5883537\tbest: 24.5883537 (600)\ttotal: 46.5s\tremaining: 5m 40s\n", 675 | "800:\tlearn: 8.9755095\ttest: 8.9755095\ttest1: 24.3519888\tbest: 24.3087064 (740)\ttotal: 1m 1s\tremaining: 5m 24s\n", 676 | "1000:\tlearn: 7.4336900\ttest: 7.4336900\ttest1: 24.0587788\tbest: 24.0576351 (999)\ttotal: 1m 17s\tremaining: 5m 9s\n", 677 | "1200:\tlearn: 6.1865076\ttest: 6.1865076\ttest1: 23.8666247\tbest: 23.8666247 (1200)\ttotal: 1m 33s\tremaining: 4m 54s\n", 678 | "1400:\tlearn: 5.2120808\ttest: 5.2120808\ttest1: 23.8334914\tbest: 23.8274649 (1387)\ttotal: 1m 48s\tremaining: 4m 38s\n", 679 | "1600:\tlearn: 4.4131917\ttest: 4.4131917\ttest1: 23.7687539\tbest: 23.7601979 (1578)\ttotal: 2m 4s\tremaining: 4m 23s\n", 680 | "1800:\tlearn: 3.7598475\ttest: 3.7598475\ttest1: 23.7438139\tbest: 23.7426417 (1687)\ttotal: 2m 19s\tremaining: 4m 8s\n", 681 | "2000:\tlearn: 3.2096719\ttest: 3.2096719\ttest1: 23.6954691\tbest: 23.6877001 (1940)\ttotal: 2m 35s\tremaining: 3m 53s\n", 682 | "2200:\tlearn: 2.7333636\ttest: 2.7333636\ttest1: 23.6486949\tbest: 23.6438895 (2193)\ttotal: 2m 51s\tremaining: 3m 38s\n", 683 | "2400:\tlearn: 2.3576374\ttest: 2.3576374\ttest1: 23.6143687\tbest: 23.6135243 (2398)\ttotal: 3m 7s\tremaining: 3m 22s\n", 684 | "2600:\tlearn: 2.0288923\ttest: 2.0288923\ttest1: 23.6012689\tbest: 23.6000219 (2593)\ttotal: 3m 22s\tremaining: 3m 7s\n", 685 | "2800:\tlearn: 1.7491326\ttest: 1.7491326\ttest1: 23.5908282\tbest: 23.5875003 (2782)\ttotal: 3m 38s\tremaining: 2m 51s\n", 686 | "3000:\tlearn: 1.5159515\ttest: 1.5159515\ttest1: 23.5843994\tbest: 23.5841580 (2971)\ttotal: 3m 54s\tremaining: 2m 36s\n", 687 | "3200:\tlearn: 1.3248261\ttest: 1.3248261\ttest1: 23.5876218\tbest: 23.5837679 (3127)\ttotal: 4m 9s\tremaining: 2m 20s\n", 688 | "3400:\tlearn: 1.1520374\ttest: 1.1520374\ttest1: 23.5751287\tbest: 23.5742021 (3396)\ttotal: 
4m 25s\tremaining: 2m 4s\n", 689 | "3600:\tlearn: 1.0056445\ttest: 1.0056445\ttest1: 23.5690721\tbest: 23.5683305 (3566)\ttotal: 4m 41s\tremaining: 1m 49s\n", 690 | "3800:\tlearn: 0.8768814\ttest: 0.8768814\ttest1: 23.5550729\tbest: 23.5541394 (3796)\ttotal: 4m 58s\tremaining: 1m 34s\n", 691 | "4000:\tlearn: 0.7579140\ttest: 0.7579140\ttest1: 23.5492676\tbest: 23.5476309 (3974)\ttotal: 5m 14s\tremaining: 1m 18s\n", 692 | "4200:\tlearn: 0.6576364\ttest: 0.6576364\ttest1: 23.5353984\tbest: 23.5342956 (4181)\ttotal: 5m 31s\tremaining: 1m 2s\n", 693 | "4400:\tlearn: 0.5711441\ttest: 0.5711441\ttest1: 23.5296140\tbest: 23.5290636 (4375)\ttotal: 5m 48s\tremaining: 47.4s\n", 694 | "4600:\tlearn: 0.5002357\ttest: 0.5002357\ttest1: 23.5301061\tbest: 23.5279730 (4528)\ttotal: 6m 3s\tremaining: 31.6s\n", 695 | "Stopped by overfitting detector (200 iterations wait)\n", 696 | "\n", 697 | "bestTest = 23.52797302\n", 698 | "bestIteration = 4528\n", 699 | "\n", 700 | "Shrink model to first 4529 iterations.\n", 701 | "err: 23.527972862069507\n", 702 | "0:\tlearn: 40.1718664\ttest: 40.1718664\ttest1: 41.1290550\tbest: 41.1290550 (0)\ttotal: 77.2ms\tremaining: 6m 26s\n", 703 | "200:\tlearn: 18.4836731\ttest: 18.4836731\ttest1: 25.4569672\tbest: 25.4569672 (200)\ttotal: 15.5s\tremaining: 6m 10s\n", 704 | "400:\tlearn: 13.9355300\ttest: 13.9355300\ttest1: 24.4773893\tbest: 24.4763157 (392)\ttotal: 31.2s\tremaining: 5m 57s\n", 705 | "600:\tlearn: 11.0812480\ttest: 11.0812480\ttest1: 23.9179170\tbest: 23.9087132 (596)\ttotal: 46.7s\tremaining: 5m 41s\n", 706 | "800:\tlearn: 9.0671201\ttest: 9.0671201\ttest1: 23.4813391\tbest: 23.4790011 (796)\ttotal: 1m 2s\tremaining: 5m 26s\n", 707 | "1000:\tlearn: 7.4996077\ttest: 7.4996077\ttest1: 23.3210801\tbest: 23.2980099 (978)\ttotal: 1m 17s\tremaining: 5m 10s\n", 708 | "1200:\tlearn: 6.2619113\ttest: 6.2619113\ttest1: 23.0835750\tbest: 23.0835750 (1200)\ttotal: 1m 33s\tremaining: 4m 55s\n", 709 | "1400:\tlearn: 5.2647254\ttest: 5.2647254\ttest1: 
23.0231194\tbest: 23.0185763 (1397)\ttotal: 1m 49s\tremaining: 4m 40s\n", 710 | "1600:\tlearn: 4.4365986\ttest: 4.4365986\ttest1: 22.8960975\tbest: 22.8952064 (1599)\ttotal: 2m 4s\tremaining: 4m 24s\n", 711 | "1800:\tlearn: 3.7980562\ttest: 3.7980562\ttest1: 22.8099496\tbest: 22.8013149 (1793)\ttotal: 2m 20s\tremaining: 4m 8s\n", 712 | "2000:\tlearn: 3.2304707\ttest: 3.2304707\ttest1: 22.7436463\tbest: 22.7402474 (1986)\ttotal: 2m 35s\tremaining: 3m 53s\n", 713 | "2200:\tlearn: 2.7697644\ttest: 2.7697644\ttest1: 22.7045545\tbest: 22.7005346 (2180)\ttotal: 2m 51s\tremaining: 3m 38s\n", 714 | "2400:\tlearn: 2.3801329\ttest: 2.3801329\ttest1: 22.6786484\tbest: 22.6739218 (2396)\ttotal: 3m 7s\tremaining: 3m 22s\n", 715 | "2600:\tlearn: 2.0552460\ttest: 2.0552460\ttest1: 22.6356006\tbest: 22.6291453 (2583)\ttotal: 3m 22s\tremaining: 3m 6s\n", 716 | "2800:\tlearn: 1.7799421\ttest: 1.7799421\ttest1: 22.5960161\tbest: 22.5948527 (2787)\ttotal: 3m 38s\tremaining: 2m 51s\n", 717 | "3000:\tlearn: 1.5323343\ttest: 1.5323343\ttest1: 22.5770603\tbest: 22.5754886 (2946)\ttotal: 3m 53s\tremaining: 2m 35s\n", 718 | "3200:\tlearn: 1.3334044\ttest: 1.3334044\ttest1: 22.5594997\tbest: 22.5587520 (3148)\ttotal: 4m 9s\tremaining: 2m 20s\n", 719 | "3400:\tlearn: 1.1560733\ttest: 1.1560733\ttest1: 22.5518747\tbest: 22.5495954 (3345)\ttotal: 4m 25s\tremaining: 2m 4s\n", 720 | "3600:\tlearn: 1.0070794\ttest: 1.0070794\ttest1: 22.5210004\tbest: 22.5210004 (3600)\ttotal: 4m 40s\tremaining: 1m 49s\n", 721 | "3800:\tlearn: 0.8806402\ttest: 0.8806402\ttest1: 22.5110279\tbest: 22.5082813 (3784)\ttotal: 4m 56s\tremaining: 1m 33s\n", 722 | "4000:\tlearn: 0.7648709\ttest: 0.7648709\ttest1: 22.5080329\tbest: 22.5075276 (3993)\ttotal: 5m 12s\tremaining: 1m 17s\n", 723 | "4200:\tlearn: 0.6667834\ttest: 0.6667834\ttest1: 22.5034791\tbest: 22.5006112 (4108)\ttotal: 5m 29s\tremaining: 1m 2s\n", 724 | "Stopped by overfitting detector (200 iterations wait)\n", 725 | "\n", 726 | "bestTest = 22.50061121\n", 
727 | "bestIteration = 4108\n", 728 | "\n", 729 | "Shrink model to first 4109 iterations.\n", 730 | "err: 22.500611177040856\n", 731 | "0:\tlearn: 40.3333876\ttest: 40.3333876\ttest1: 37.7283832\tbest: 37.7283832 (0)\ttotal: 77.2ms\tremaining: 6m 25s\n", 732 | "200:\tlearn: 18.5427660\ttest: 18.5427660\ttest1: 24.1868932\tbest: 24.1868932 (200)\ttotal: 15.6s\tremaining: 6m 12s\n", 733 | "400:\tlearn: 14.0666296\ttest: 14.0666296\ttest1: 23.2303523\tbest: 23.2043217 (382)\ttotal: 31s\tremaining: 5m 55s\n", 734 | "600:\tlearn: 11.2061832\ttest: 11.2061832\ttest1: 22.7633444\tbest: 22.7618894 (589)\ttotal: 46.5s\tremaining: 5m 40s\n", 735 | "800:\tlearn: 9.0756666\ttest: 9.0756666\ttest1: 22.5403978\tbest: 22.5362058 (799)\ttotal: 1m 2s\tremaining: 5m 25s\n", 736 | "1000:\tlearn: 7.4452019\ttest: 7.4452019\ttest1: 22.4006724\tbest: 22.3917867 (972)\ttotal: 1m 17s\tremaining: 5m 10s\n", 737 | "1200:\tlearn: 6.2142752\ttest: 6.2142752\ttest1: 22.3990171\tbest: 22.3441165 (1105)\ttotal: 1m 33s\tremaining: 4m 55s\n", 738 | "1400:\tlearn: 5.2024268\ttest: 5.2024268\ttest1: 22.3002244\tbest: 22.2999298 (1392)\ttotal: 1m 49s\tremaining: 4m 40s\n", 739 | "1600:\tlearn: 4.3919601\ttest: 4.3919601\ttest1: 22.2671572\tbest: 22.2620270 (1555)\ttotal: 2m 4s\tremaining: 4m 24s\n", 740 | "1800:\tlearn: 3.7456035\ttest: 3.7456035\ttest1: 22.2398679\tbest: 22.2380782 (1792)\ttotal: 2m 20s\tremaining: 4m 9s\n", 741 | "2000:\tlearn: 3.1848984\ttest: 3.1848984\ttest1: 22.2351871\tbest: 22.2207260 (1898)\ttotal: 2m 36s\tremaining: 3m 53s\n", 742 | "2200:\tlearn: 2.7256742\ttest: 2.7256742\ttest1: 22.2073729\tbest: 22.2030415 (2179)\ttotal: 2m 51s\tremaining: 3m 38s\n", 743 | "2400:\tlearn: 2.3578896\ttest: 2.3578896\ttest1: 22.1812838\tbest: 22.1704177 (2369)\ttotal: 3m 8s\tremaining: 3m 24s\n", 744 | "Stopped by overfitting detector (200 iterations wait)\n", 745 | "\n", 746 | "bestTest = 22.17041767\n", 747 | "bestIteration = 2369\n", 748 | "\n", 749 | "Shrink model to first 2370 
iterations.\n", 750 | "err: 22.170417520833542\n", 751 | "0:\tlearn: 40.1457357\ttest: 40.1457357\ttest1: 41.3746201\tbest: 41.3746201 (0)\ttotal: 76ms\tremaining: 6m 19s\n", 752 | "200:\tlearn: 18.8420693\ttest: 18.8420693\ttest1: 25.4619658\tbest: 25.4619658 (200)\ttotal: 15.5s\tremaining: 6m 9s\n", 753 | "400:\tlearn: 14.2407549\ttest: 14.2407549\ttest1: 24.6336516\tbest: 24.6336516 (400)\ttotal: 31s\tremaining: 5m 55s\n", 754 | "600:\tlearn: 11.2038467\ttest: 11.2038467\ttest1: 24.2377383\tbest: 24.2286849 (599)\ttotal: 46.5s\tremaining: 5m 40s\n", 755 | "800:\tlearn: 9.2022496\ttest: 9.2022496\ttest1: 24.1784844\tbest: 24.1441411 (779)\ttotal: 1m 2s\tremaining: 5m 25s\n", 756 | "1000:\tlearn: 7.5905481\ttest: 7.5905481\ttest1: 24.0048152\tbest: 24.0048152 (1000)\ttotal: 1m 17s\tremaining: 5m 9s\n", 757 | "1200:\tlearn: 6.2990399\ttest: 6.2990399\ttest1: 23.8977774\tbest: 23.8908460 (1198)\ttotal: 1m 33s\tremaining: 4m 54s\n", 758 | "1400:\tlearn: 5.2766369\ttest: 5.2766369\ttest1: 23.7992180\tbest: 23.7963697 (1398)\ttotal: 1m 48s\tremaining: 4m 39s\n", 759 | "1600:\tlearn: 4.4485887\ttest: 4.4485887\ttest1: 23.7374916\tbest: 23.7374916 (1600)\ttotal: 2m 4s\tremaining: 4m 23s\n", 760 | "1800:\tlearn: 3.7643137\ttest: 3.7643137\ttest1: 23.6571016\tbest: 23.6542180 (1786)\ttotal: 2m 19s\tremaining: 4m 8s\n", 761 | "2000:\tlearn: 3.2122577\ttest: 3.2122577\ttest1: 23.6518104\tbest: 23.6267640 (1845)\ttotal: 2m 35s\tremaining: 3m 53s\n", 762 | "Stopped by overfitting detector (200 iterations wait)\n", 763 | "\n", 764 | "bestTest = 23.62676395\n", 765 | "bestIteration = 1845\n", 766 | "\n", 767 | "Shrink model to first 1846 iterations.\n", 768 | "err: 23.626763973224094\n", 769 | "0:\tlearn: 40.0831508\ttest: 40.0831508\ttest1: 40.2759953\tbest: 40.2759953 (0)\ttotal: 76.9ms\tremaining: 6m 24s\n", 770 | "200:\tlearn: 18.5545627\ttest: 18.5545627\ttest1: 25.7563384\tbest: 25.7297570 (199)\ttotal: 15.7s\tremaining: 6m 14s\n", 771 | "400:\tlearn: 14.0818511\ttest: 
14.0818511\ttest1: 24.6033475\tbest: 24.5966082 (399)\ttotal: 31.1s\tremaining: 5m 57s\n", 772 | "600:\tlearn: 11.1993655\ttest: 11.1993655\ttest1: 24.0628108\tbest: 24.0190008 (591)\ttotal: 46.7s\tremaining: 5m 41s\n", 773 | "800:\tlearn: 9.1069544\ttest: 9.1069544\ttest1: 23.8031567\tbest: 23.8022204 (758)\ttotal: 1m 2s\tremaining: 5m 26s\n", 774 | "1000:\tlearn: 7.5044513\ttest: 7.5044513\ttest1: 23.6677070\tbest: 23.6677070 (1000)\ttotal: 1m 17s\tremaining: 5m 11s\n", 775 | "1200:\tlearn: 6.2820095\ttest: 6.2820095\ttest1: 23.6479612\tbest: 23.5872076 (1140)\ttotal: 1m 33s\tremaining: 4m 56s\n", 776 | "Stopped by overfitting detector (200 iterations wait)\n", 777 | "\n", 778 | "bestTest = 23.58720765\n", 779 | "bestIteration = 1140\n", 780 | "\n", 781 | "Shrink model to first 1141 iterations.\n", 782 | "err: 23.587207766724326\n", 783 | "0:\tlearn: 40.1447603\ttest: 40.1447603\ttest1: 40.0493867\tbest: 40.0493867 (0)\ttotal: 80.4ms\tremaining: 6m 41s\n", 784 | "200:\tlearn: 18.5434333\ttest: 18.5434333\ttest1: 26.1527334\tbest: 26.1379897 (198)\ttotal: 15.6s\tremaining: 6m 11s\n", 785 | "400:\tlearn: 13.9166313\ttest: 13.9166313\ttest1: 24.9135312\tbest: 24.9033703 (390)\ttotal: 31.1s\tremaining: 5m 56s\n", 786 | "600:\tlearn: 11.0496011\ttest: 11.0496011\ttest1: 24.5134111\tbest: 24.5005027 (595)\ttotal: 46.6s\tremaining: 5m 41s\n", 787 | "800:\tlearn: 8.9726530\ttest: 8.9726530\ttest1: 24.1596452\tbest: 24.1402399 (776)\ttotal: 1m 2s\tremaining: 5m 26s\n", 788 | "1000:\tlearn: 7.4108886\ttest: 7.4108886\ttest1: 23.9606031\tbest: 23.9555570 (991)\ttotal: 1m 17s\tremaining: 5m 10s\n", 789 | "1200:\tlearn: 6.1814401\ttest: 6.1814401\ttest1: 23.8160093\tbest: 23.8134094 (1184)\ttotal: 1m 33s\tremaining: 4m 55s\n", 790 | "1400:\tlearn: 5.2224432\ttest: 5.2224432\ttest1: 23.7993644\tbest: 23.7636155 (1331)\ttotal: 1m 49s\tremaining: 4m 40s\n", 791 | "1600:\tlearn: 4.4005124\ttest: 4.4005124\ttest1: 23.7306468\tbest: 23.7220672 (1584)\ttotal: 2m 4s\tremaining: 4m 
25s\n", 792 | "1800:\tlearn: 3.7469906\ttest: 3.7469906\ttest1: 23.7112497\tbest: 23.6937853 (1660)\ttotal: 2m 20s\tremaining: 4m 9s\n", 793 | "2000:\tlearn: 3.2017034\ttest: 3.2017034\ttest1: 23.6699849\tbest: 23.6622811 (1989)\ttotal: 2m 36s\tremaining: 3m 54s\n", 794 | "2200:\tlearn: 2.7308572\ttest: 2.7308572\ttest1: 23.6678084\tbest: 23.6493446 (2068)\ttotal: 2m 52s\tremaining: 3m 38s\n", 795 | "2400:\tlearn: 2.3556090\ttest: 2.3556090\ttest1: 23.6214624\tbest: 23.6193422 (2398)\ttotal: 3m 7s\tremaining: 3m 23s\n", 796 | "2600:\tlearn: 2.0327111\ttest: 2.0327111\ttest1: 23.6190175\tbest: 23.6180589 (2423)\ttotal: 3m 23s\tremaining: 3m 7s\n", 797 | "2800:\tlearn: 1.7515269\ttest: 1.7515269\ttest1: 23.6220019\tbest: 23.6106827 (2615)\ttotal: 3m 39s\tremaining: 2m 52s\n", 798 | "Stopped by overfitting detector (200 iterations wait)\n", 799 | "\n", 800 | "bestTest = 23.6106827\n", 801 | "bestIteration = 2615\n", 802 | "\n", 803 | "Shrink model to first 2616 iterations.\n", 804 | "err: 23.610682571289285\n", 805 | "0:\tlearn: 40.2442856\ttest: 40.2442856\ttest1: 40.2821978\tbest: 40.2821978 (0)\ttotal: 82.8ms\tremaining: 6m 54s\n", 806 | "200:\tlearn: 18.5585820\ttest: 18.5585820\ttest1: 25.1762684\tbest: 25.1686103 (199)\ttotal: 15.6s\tremaining: 6m 13s\n", 807 | "400:\tlearn: 14.0746033\ttest: 14.0746033\ttest1: 24.1080371\tbest: 24.1080371 (400)\ttotal: 31.2s\tremaining: 5m 57s\n", 808 | "600:\tlearn: 11.1698903\ttest: 11.1698903\ttest1: 23.6591978\tbest: 23.6344024 (597)\ttotal: 46.7s\tremaining: 5m 41s\n", 809 | "800:\tlearn: 9.0968928\ttest: 9.0968928\ttest1: 23.4967794\tbest: 23.4480566 (731)\ttotal: 1m 2s\tremaining: 5m 27s\n", 810 | "1000:\tlearn: 7.4999226\ttest: 7.4999226\ttest1: 23.3029310\tbest: 23.3029310 (1000)\ttotal: 1m 18s\tremaining: 5m 12s\n", 811 | "1200:\tlearn: 6.2268303\ttest: 6.2268303\ttest1: 23.2306710\tbest: 23.2032763 (1158)\ttotal: 1m 33s\tremaining: 4m 56s\n", 812 | "1400:\tlearn: 5.2526449\ttest: 5.2526449\ttest1: 23.1735124\tbest: 
23.1647173 (1382)\ttotal: 1m 50s\tremaining: 4m 44s\n", 813 | "1600:\tlearn: 4.4453553\ttest: 4.4453553\ttest1: 23.0764276\tbest: 23.0760354 (1599)\ttotal: 2m 6s\tremaining: 4m 28s\n", 814 | "1800:\tlearn: 3.7782981\ttest: 3.7782981\ttest1: 23.0401895\tbest: 23.0269641 (1790)\ttotal: 2m 22s\tremaining: 4m 12s\n", 815 | "2000:\tlearn: 3.2257521\ttest: 3.2257521\ttest1: 23.0495526\tbest: 23.0257095 (1863)\ttotal: 2m 37s\tremaining: 3m 56s\n", 816 | "Stopped by overfitting detector (200 iterations wait)\n", 817 | "\n", 818 | "bestTest = 23.02570946\n", 819 | "bestIteration = 1863\n", 820 | "\n", 821 | "Shrink model to first 1864 iterations.\n", 822 | "err: 23.025709845930972\n", 823 | "0:\tlearn: 40.3255443\ttest: 40.3255443\ttest1: 38.6459117\tbest: 38.6459117 (0)\ttotal: 78.7ms\tremaining: 6m 33s\n", 824 | "200:\tlearn: 18.5589204\ttest: 18.5589204\ttest1: 24.6792365\tbest: 24.6682080 (198)\ttotal: 15.6s\tremaining: 6m 11s\n", 825 | "400:\tlearn: 13.9842886\ttest: 13.9842886\ttest1: 23.3331800\tbest: 23.3331800 (400)\ttotal: 31.1s\tremaining: 5m 56s\n", 826 | "600:\tlearn: 11.1243029\ttest: 11.1243029\ttest1: 22.7763060\tbest: 22.7763060 (600)\ttotal: 46.6s\tremaining: 5m 41s\n", 827 | "800:\tlearn: 9.0916540\ttest: 9.0916540\ttest1: 22.5603758\tbest: 22.5405960 (745)\ttotal: 1m 2s\tremaining: 5m 25s\n", 828 | "1000:\tlearn: 7.4839058\ttest: 7.4839058\ttest1: 22.3351773\tbest: 22.3351773 (1000)\ttotal: 1m 17s\tremaining: 5m 10s\n", 829 | "1200:\tlearn: 6.2634279\ttest: 6.2634279\ttest1: 22.1963710\tbest: 22.1869120 (1197)\ttotal: 1m 33s\tremaining: 4m 54s\n", 830 | "1400:\tlearn: 5.2682685\ttest: 5.2682685\ttest1: 22.1296374\tbest: 22.1175826 (1371)\ttotal: 1m 48s\tremaining: 4m 39s\n", 831 | "1600:\tlearn: 4.4702299\ttest: 4.4702299\ttest1: 22.0735495\tbest: 22.0672424 (1571)\ttotal: 2m 4s\tremaining: 4m 23s\n", 832 | "Stopped by overfitting detector (200 iterations wait)\n", 833 | "\n", 834 | "bestTest = 22.06724241\n", 835 | "bestIteration = 1571\n", 836 | "\n", 
837 | "Shrink model to first 1572 iterations.\n", 838 | "err: 22.067242551903032\n", 839 | "0:\tlearn: 40.3360136\ttest: 40.3360136\ttest1: 39.0164885\tbest: 39.0164885 (0)\ttotal: 77.5ms\tremaining: 6m 27s\n", 840 | "200:\tlearn: 18.6665069\ttest: 18.6665069\ttest1: 24.0657402\tbest: 24.0657402 (200)\ttotal: 15.5s\tremaining: 6m 11s\n", 841 | "400:\tlearn: 14.0490032\ttest: 14.0490032\ttest1: 23.1715470\tbest: 23.1636967 (393)\ttotal: 31.1s\tremaining: 5m 56s\n", 842 | "600:\tlearn: 11.1426550\ttest: 11.1426550\ttest1: 22.6796131\tbest: 22.6796131 (600)\ttotal: 46.5s\tremaining: 5m 40s\n", 843 | "800:\tlearn: 9.1014829\ttest: 9.1014829\ttest1: 22.4250826\tbest: 22.4250826 (800)\ttotal: 1m 1s\tremaining: 5m 24s\n", 844 | "1000:\tlearn: 7.5011249\ttest: 7.5011249\ttest1: 22.3179993\tbest: 22.3145331 (994)\ttotal: 1m 17s\tremaining: 5m 10s\n", 845 | "1200:\tlearn: 6.2489588\ttest: 6.2489588\ttest1: 22.2562969\tbest: 22.2363510 (1178)\ttotal: 1m 33s\tremaining: 4m 54s\n", 846 | "1400:\tlearn: 5.2631956\ttest: 5.2631956\ttest1: 22.2078329\tbest: 22.1983166 (1371)\ttotal: 1m 48s\tremaining: 4m 39s\n", 847 | "1600:\tlearn: 4.4421702\ttest: 4.4421702\ttest1: 22.1234305\tbest: 22.1234305 (1600)\ttotal: 2m 4s\tremaining: 4m 23s\n", 848 | "1800:\tlearn: 3.7661850\ttest: 3.7661850\ttest1: 22.0167844\tbest: 22.0161885 (1798)\ttotal: 2m 20s\tremaining: 4m 8s\n", 849 | "2000:\tlearn: 3.2289727\ttest: 3.2289727\ttest1: 21.9929626\tbest: 21.9918824 (1982)\ttotal: 2m 35s\tremaining: 3m 53s\n", 850 | "2200:\tlearn: 2.7692197\ttest: 2.7692197\ttest1: 21.9787016\tbest: 21.9774238 (2196)\ttotal: 2m 51s\tremaining: 3m 37s\n", 851 | "2400:\tlearn: 2.3751767\ttest: 2.3751767\ttest1: 21.9644917\tbest: 21.9644917 (2400)\ttotal: 3m 6s\tremaining: 3m 22s\n", 852 | "2600:\tlearn: 2.0464960\ttest: 2.0464960\ttest1: 21.9390869\tbest: 21.9305328 (2526)\ttotal: 3m 22s\tremaining: 3m 6s\n", 853 | "2800:\tlearn: 1.7718744\ttest: 1.7718744\ttest1: 21.9219081\tbest: 21.9219081 (2800)\ttotal: 3m 
37s\tremaining: 2m 51s\n", 854 | "3000:\tlearn: 1.5257647\ttest: 1.5257647\ttest1: 21.9343371\tbest: 21.9192063 (2817)\ttotal: 3m 53s\tremaining: 2m 35s\n", 855 | "Stopped by overfitting detector (200 iterations wait)\n", 856 | "\n", 857 | "bestTest = 21.91920631\n", 858 | "bestIteration = 2817\n", 859 | "\n", 860 | "Shrink model to first 2818 iterations.\n", 861 | "err: 21.91920650501892\n", 862 | "0:\tlearn: 40.2495261\ttest: 40.2495261\ttest1: 40.2830735\tbest: 40.2830735 (0)\ttotal: 76.6ms\tremaining: 6m 22s\n", 863 | "200:\tlearn: 18.7101338\ttest: 18.7101338\ttest1: 22.9273356\tbest: 22.9273356 (200)\ttotal: 15.5s\tremaining: 6m 10s\n", 864 | "400:\tlearn: 13.9303330\ttest: 13.9303330\ttest1: 21.7517536\tbest: 21.7506623 (399)\ttotal: 31s\tremaining: 5m 55s\n", 865 | "600:\tlearn: 11.1010006\ttest: 11.1010006\ttest1: 21.2733640\tbest: 21.2726085 (598)\ttotal: 46.4s\tremaining: 5m 39s\n", 866 | "800:\tlearn: 9.0670233\ttest: 9.0670233\ttest1: 21.0638435\tbest: 21.0273690 (761)\ttotal: 1m 2s\tremaining: 5m 25s\n", 867 | "Stopped by overfitting detector (200 iterations wait)\n", 868 | "\n", 869 | "bestTest = 21.02736896\n", 870 | "bestIteration = 761\n", 871 | "\n", 872 | "Shrink model to first 762 iterations.\n", 873 | "err: 21.02736888901848\n", 874 | "0:\tlearn: 40.2029701\ttest: 40.2029701\ttest1: 41.6243888\tbest: 41.6243888 (0)\ttotal: 77.3ms\tremaining: 6m 26s\n", 875 | "200:\tlearn: 18.6679265\ttest: 18.6679265\ttest1: 23.4785938\tbest: 23.4755171 (195)\ttotal: 15.6s\tremaining: 6m 12s\n", 876 | "400:\tlearn: 13.9525682\ttest: 13.9525682\ttest1: 22.4108766\tbest: 22.4050359 (399)\ttotal: 31.1s\tremaining: 5m 56s\n", 877 | "600:\tlearn: 11.0323003\ttest: 11.0323003\ttest1: 22.1950919\tbest: 22.1655481 (591)\ttotal: 46.6s\tremaining: 5m 40s\n", 878 | "800:\tlearn: 8.9495728\ttest: 8.9495728\ttest1: 21.8626894\tbest: 21.8481166 (787)\ttotal: 1m 2s\tremaining: 5m 25s\n", 879 | "1000:\tlearn: 7.3931876\ttest: 7.3931876\ttest1: 21.7407306\tbest: 21.7274289 
(979)\ttotal: 1m 17s\tremaining: 5m 10s\n", 880 | "1200:\tlearn: 6.1955991\ttest: 6.1955991\ttest1: 21.6722801\tbest: 21.6695920 (1198)\ttotal: 1m 34s\tremaining: 4m 59s\n", 881 | "1400:\tlearn: 5.2090663\ttest: 5.2090663\ttest1: 21.5923481\tbest: 21.5894501 (1396)\ttotal: 1m 50s\tremaining: 4m 43s\n", 882 | "1600:\tlearn: 4.4209683\ttest: 4.4209683\ttest1: 21.5497588\tbest: 21.5369012 (1475)\ttotal: 2m 5s\tremaining: 4m 27s\n", 883 | "1800:\tlearn: 3.7330390\ttest: 3.7330390\ttest1: 21.4947114\tbest: 21.4829271 (1734)\ttotal: 2m 21s\tremaining: 4m 11s\n", 884 | "2000:\tlearn: 3.1849787\ttest: 3.1849787\ttest1: 21.4697018\tbest: 21.4556831 (1926)\ttotal: 2m 37s\tremaining: 3m 55s\n", 885 | "Stopped by overfitting detector (200 iterations wait)\n", 886 | "\n", 887 | "bestTest = 21.45568305\n", 888 | "bestIteration = 1926\n", 889 | "\n", 890 | "Shrink model to first 1927 iterations.\n", 891 | "err: 21.45568292565179\n", 892 | "0:\tlearn: 39.9752611\ttest: 39.9752611\ttest1: 42.7216821\tbest: 42.7216821 (0)\ttotal: 76.1ms\tremaining: 6m 20s\n", 893 | "200:\tlearn: 18.6873486\ttest: 18.6873486\ttest1: 25.3251105\tbest: 25.3226036 (198)\ttotal: 15.6s\tremaining: 6m 11s\n", 894 | "400:\tlearn: 14.1755577\ttest: 14.1755577\ttest1: 24.2777607\tbest: 24.2654741 (394)\ttotal: 31s\tremaining: 5m 55s\n", 895 | "600:\tlearn: 11.2402725\ttest: 11.2402725\ttest1: 23.7855228\tbest: 23.7843534 (599)\ttotal: 46.5s\tremaining: 5m 40s\n", 896 | "800:\tlearn: 9.1229660\ttest: 9.1229660\ttest1: 23.4983693\tbest: 23.4983693 (800)\ttotal: 1m 1s\tremaining: 5m 24s\n", 897 | "1000:\tlearn: 7.5522934\ttest: 7.5522934\ttest1: 23.3819595\tbest: 23.3819595 (1000)\ttotal: 1m 17s\tremaining: 5m 9s\n", 898 | "1200:\tlearn: 6.2628687\ttest: 6.2628687\ttest1: 23.3963507\tbest: 23.3748201 (1129)\ttotal: 1m 33s\tremaining: 4m 54s\n", 899 | "1400:\tlearn: 5.2614718\ttest: 5.2614718\ttest1: 23.2582987\tbest: 23.2582987 (1400)\ttotal: 1m 48s\tremaining: 4m 39s\n", 900 | "1600:\tlearn: 4.4622843\ttest: 
4.4622843\ttest1: 23.1452926\tbest: 23.1452926 (1600)\ttotal: 2m 4s\tremaining: 4m 23s\n", 901 | "1800:\tlearn: 3.7903927\ttest: 3.7903927\ttest1: 23.1218691\tbest: 23.1180035 (1641)\ttotal: 2m 19s\tremaining: 4m 8s\n", 902 | "2000:\tlearn: 3.2200017\ttest: 3.2200017\ttest1: 23.0990783\tbest: 23.0876150 (1955)\ttotal: 2m 35s\tremaining: 3m 53s\n", 903 | "2200:\tlearn: 2.7512936\ttest: 2.7512936\ttest1: 23.0878790\tbest: 23.0819325 (2192)\ttotal: 2m 51s\tremaining: 3m 37s\n", 904 | "Stopped by overfitting detector (200 iterations wait)\n", 905 | "\n", 906 | "bestTest = 23.08193251\n", 907 | "bestIteration = 2192\n", 908 | "\n", 909 | "Shrink model to first 2193 iterations.\n", 910 | "err: 23.081932696315334\n", 911 | "0:\tlearn: 40.3101864\ttest: 40.3101864\ttest1: 38.3758215\tbest: 38.3758215 (0)\ttotal: 76.3ms\tremaining: 6m 21s\n", 912 | "200:\tlearn: 18.5300099\ttest: 18.5300099\ttest1: 24.4798489\tbest: 24.4798489 (200)\ttotal: 15.6s\tremaining: 6m 12s\n", 913 | "400:\tlearn: 13.9776861\ttest: 13.9776861\ttest1: 23.5322861\tbest: 23.5322861 (400)\ttotal: 31.2s\tremaining: 5m 57s\n", 914 | "600:\tlearn: 11.1768234\ttest: 11.1768234\ttest1: 22.9532575\tbest: 22.9493678 (599)\ttotal: 46.7s\tremaining: 5m 41s\n", 915 | "800:\tlearn: 9.0597343\ttest: 9.0597343\ttest1: 22.6141236\tbest: 22.6009812 (796)\ttotal: 1m 2s\tremaining: 5m 25s\n", 916 | "1000:\tlearn: 7.5045138\ttest: 7.5045138\ttest1: 22.5008777\tbest: 22.4964678 (995)\ttotal: 1m 17s\tremaining: 5m 10s\n", 917 | "1200:\tlearn: 6.2556477\ttest: 6.2556477\ttest1: 22.3478979\tbest: 22.3394701 (1193)\ttotal: 1m 33s\tremaining: 4m 54s\n", 918 | "1400:\tlearn: 5.2750707\ttest: 5.2750707\ttest1: 22.2989177\tbest: 22.2841951 (1361)\ttotal: 1m 48s\tremaining: 4m 39s\n", 919 | "1600:\tlearn: 4.4615398\ttest: 4.4615398\ttest1: 22.2866401\tbest: 22.2627218 (1571)\ttotal: 2m 4s\tremaining: 4m 23s\n", 920 | "1800:\tlearn: 3.7752326\ttest: 3.7752326\ttest1: 22.2431147\tbest: 22.2390858 (1792)\ttotal: 2m 20s\tremaining: 4m 
8s\n", 921 | "2000:\tlearn: 3.2245801\ttest: 3.2245801\ttest1: 22.2071219\tbest: 22.2071219 (2000)\ttotal: 2m 35s\tremaining: 3m 53s\n", 922 | "2200:\tlearn: 2.7548921\ttest: 2.7548921\ttest1: 22.2134860\tbest: 22.1926314 (2082)\ttotal: 2m 51s\tremaining: 3m 37s\n", 923 | "Stopped by overfitting detector (200 iterations wait)\n", 924 | "\n", 925 | "bestTest = 22.19263139\n", 926 | "bestIteration = 2082\n", 927 | "\n", 928 | "Shrink model to first 2083 iterations.\n", 929 | "err: 22.19263108025873\n", 930 | "0:\tlearn: 40.2372751\ttest: 40.2372751\ttest1: 39.5062699\tbest: 39.5062699 (0)\ttotal: 77.6ms\tremaining: 6m 28s\n", 931 | "200:\tlearn: 18.6177315\ttest: 18.6177315\ttest1: 25.8164432\tbest: 25.7954700 (199)\ttotal: 15.6s\tremaining: 6m 11s\n", 932 | "400:\tlearn: 13.9433224\ttest: 13.9433224\ttest1: 24.5489698\tbest: 24.5489698 (400)\ttotal: 31.1s\tremaining: 5m 56s\n", 933 | "600:\tlearn: 11.0678258\ttest: 11.0678258\ttest1: 23.9059608\tbest: 23.8966854 (597)\ttotal: 46.6s\tremaining: 5m 41s\n", 934 | "800:\tlearn: 8.9866349\ttest: 8.9866349\ttest1: 23.5694571\tbest: 23.5635749 (796)\ttotal: 1m 2s\tremaining: 5m 25s\n", 935 | "1000:\tlearn: 7.3946596\ttest: 7.3946596\ttest1: 23.3933467\tbest: 23.3920246 (999)\ttotal: 1m 17s\tremaining: 5m 10s\n", 936 | "1200:\tlearn: 6.1721142\ttest: 6.1721142\ttest1: 23.3044636\tbest: 23.3044636 (1200)\ttotal: 1m 33s\tremaining: 4m 55s\n", 937 | "1400:\tlearn: 5.1788409\ttest: 5.1788409\ttest1: 23.2013175\tbest: 23.2013175 (1400)\ttotal: 1m 48s\tremaining: 4m 39s\n", 938 | "1600:\tlearn: 4.3836433\ttest: 4.3836433\ttest1: 23.1140350\tbest: 23.1140350 (1600)\ttotal: 2m 4s\tremaining: 4m 24s\n", 939 | "1800:\tlearn: 3.7432574\ttest: 3.7432574\ttest1: 23.0750452\tbest: 23.0733526 (1796)\ttotal: 2m 20s\tremaining: 4m 8s\n", 940 | "2000:\tlearn: 3.1976351\ttest: 3.1976351\ttest1: 23.0241154\tbest: 23.0220076 (1994)\ttotal: 2m 35s\tremaining: 3m 53s\n", 941 | "2200:\tlearn: 2.7444357\ttest: 2.7444357\ttest1: 23.0101091\tbest: 
23.0057872 (2129)\ttotal: 2m 53s\tremaining: 3m 40s\n", 942 | "2400:\tlearn: 2.3606729\ttest: 2.3606729\ttest1: 22.9854733\tbest: 22.9845754 (2395)\ttotal: 3m 8s\tremaining: 3m 24s\n", 943 | "2600:\tlearn: 2.0322732\ttest: 2.0322732\ttest1: 22.9570772\tbest: 22.9544701 (2583)\ttotal: 3m 24s\tremaining: 3m 8s\n", 944 | "2800:\tlearn: 1.7603679\ttest: 1.7603679\ttest1: 22.9709213\tbest: 22.9529765 (2658)\ttotal: 3m 39s\tremaining: 2m 52s\n", 945 | "Stopped by overfitting detector (200 iterations wait)\n", 946 | "\n", 947 | "bestTest = 22.95297654\n", 948 | "bestIteration = 2658\n", 949 | "\n", 950 | "Shrink model to first 2659 iterations.\n", 951 | "err: 22.952976358290744\n", 952 | "0:\tlearn: 40.3027167\ttest: 40.3027167\ttest1: 39.9886165\tbest: 39.9886165 (0)\ttotal: 78.2ms\tremaining: 6m 31s\n", 953 | "200:\tlearn: 18.5956369\ttest: 18.5956369\ttest1: 25.3158600\tbest: 25.3158600 (200)\ttotal: 15.5s\tremaining: 6m 10s\n", 954 | "400:\tlearn: 14.0106798\ttest: 14.0106798\ttest1: 24.8249253\tbest: 24.8005189 (384)\ttotal: 31s\tremaining: 5m 55s\n", 955 | "600:\tlearn: 11.1547859\ttest: 11.1547859\ttest1: 24.3765404\tbest: 24.3721730 (598)\ttotal: 46.4s\tremaining: 5m 39s\n", 956 | "800:\tlearn: 9.1013138\ttest: 9.1013138\ttest1: 24.0427740\tbest: 24.0427740 (800)\ttotal: 1m 1s\tremaining: 5m 24s\n", 957 | "1000:\tlearn: 7.4668754\ttest: 7.4668754\ttest1: 23.8872275\tbest: 23.8788433 (991)\ttotal: 1m 17s\tremaining: 5m 9s\n", 958 | "1200:\tlearn: 6.2548365\ttest: 6.2548365\ttest1: 23.7607930\tbest: 23.7607930 (1200)\ttotal: 1m 32s\tremaining: 4m 53s\n", 959 | "1400:\tlearn: 5.2538671\ttest: 5.2538671\ttest1: 23.7367607\tbest: 23.7348041 (1366)\ttotal: 1m 48s\tremaining: 4m 38s\n", 960 | "1600:\tlearn: 4.4275497\ttest: 4.4275497\ttest1: 23.7398416\tbest: 23.7182286 (1578)\ttotal: 2m 3s\tremaining: 4m 23s\n", 961 | "1800:\tlearn: 3.7709991\ttest: 3.7709991\ttest1: 23.7015974\tbest: 23.6992448 (1769)\ttotal: 2m 19s\tremaining: 4m 7s\n", 962 | "2000:\tlearn: 
3.2059532\ttest: 3.2059532\ttest1: 23.6612279\tbest: 23.6556684 (1946)\ttotal: 2m 35s\tremaining: 3m 52s\n", 963 | "2200:\tlearn: 2.7559935\ttest: 2.7559935\ttest1: 23.6300102\tbest: 23.6285448 (2183)\ttotal: 2m 50s\tremaining: 3m 37s\n", 964 | "2400:\tlearn: 2.3588065\ttest: 2.3588065\ttest1: 23.6128305\tbest: 23.6101937 (2336)\ttotal: 3m 6s\tremaining: 3m 21s\n", 965 | "2600:\tlearn: 2.0310939\ttest: 2.0310939\ttest1: 23.5988171\tbest: 23.5938532 (2478)\ttotal: 3m 21s\tremaining: 3m 6s\n", 966 | "Stopped by overfitting detector (200 iterations wait)\n", 967 | "\n", 968 | "bestTest = 23.59385323\n", 969 | "bestIteration = 2478\n", 970 | "\n", 971 | "Shrink model to first 2479 iterations.\n", 972 | "err: 23.593853180670912\n", 973 | "0:\tlearn: 40.3133567\ttest: 40.3133567\ttest1: 38.4216752\tbest: 38.4216752 (0)\ttotal: 77.9ms\tremaining: 6m 29s\n", 974 | "200:\tlearn: 18.5351295\ttest: 18.5351295\ttest1: 25.0794284\tbest: 25.0794284 (200)\ttotal: 15.6s\tremaining: 6m 11s\n", 975 | "400:\tlearn: 14.0712090\ttest: 14.0712090\ttest1: 24.1763458\tbest: 24.1742213 (398)\ttotal: 31.1s\tremaining: 5m 56s\n", 976 | "600:\tlearn: 11.2283119\ttest: 11.2283119\ttest1: 23.3785283\tbest: 23.3785283 (600)\ttotal: 46.6s\tremaining: 5m 41s\n", 977 | "800:\tlearn: 9.1316227\ttest: 9.1316227\ttest1: 23.0947359\tbest: 23.0947359 (800)\ttotal: 1m 2s\tremaining: 5m 25s\n", 978 | "1000:\tlearn: 7.4893484\ttest: 7.4893484\ttest1: 22.7073351\tbest: 22.7073351 (1000)\ttotal: 1m 17s\tremaining: 5m 10s\n", 979 | "1200:\tlearn: 6.2431172\ttest: 6.2431172\ttest1: 22.6216763\tbest: 22.6155442 (1198)\ttotal: 1m 33s\tremaining: 4m 54s\n", 980 | "1400:\tlearn: 5.2307179\ttest: 5.2307179\ttest1: 22.4836826\tbest: 22.4834448 (1396)\ttotal: 1m 48s\tremaining: 4m 39s\n", 981 | "1600:\tlearn: 4.4057232\ttest: 4.4057232\ttest1: 22.4289766\tbest: 22.4213644 (1596)\ttotal: 2m 4s\tremaining: 4m 24s\n", 982 | "1800:\tlearn: 3.7532065\ttest: 3.7532065\ttest1: 22.3976587\tbest: 22.3788089 (1705)\ttotal: 2m 
19s\tremaining: 4m 8s\n", 983 | "Stopped by overfitting detector (200 iterations wait)\n", 984 | "\n", 985 | "bestTest = 22.37880893\n", 986 | "bestIteration = 1705\n", 987 | "\n", 988 | "Shrink model to first 1706 iterations.\n", 989 | "err: 22.37880911443636\n", 990 | "0:\tlearn: 40.1291292\ttest: 40.1291292\ttest1: 42.9847112\tbest: 42.9847112 (0)\ttotal: 77.3ms\tremaining: 6m 26s\n", 991 | "200:\tlearn: 18.5334325\ttest: 18.5334325\ttest1: 26.8958084\tbest: 26.8958084 (200)\ttotal: 15.4s\tremaining: 6m 8s\n", 992 | "400:\tlearn: 14.0019331\ttest: 14.0019331\ttest1: 25.7488425\tbest: 25.7488425 (400)\ttotal: 30.8s\tremaining: 5m 53s\n", 993 | "600:\tlearn: 11.1403528\ttest: 11.1403528\ttest1: 25.1925107\tbest: 25.1925107 (600)\ttotal: 46.3s\tremaining: 5m 38s\n", 994 | "800:\tlearn: 9.0911354\ttest: 9.0911354\ttest1: 24.9427465\tbest: 24.9427465 (800)\ttotal: 1m 1s\tremaining: 5m 23s\n", 995 | "1000:\tlearn: 7.4959946\ttest: 7.4959946\ttest1: 24.6866179\tbest: 24.6866179 (1000)\ttotal: 1m 17s\tremaining: 5m 8s\n", 996 | "1200:\tlearn: 6.2674539\ttest: 6.2674539\ttest1: 24.5538038\tbest: 24.5230785 (1170)\ttotal: 1m 32s\tremaining: 4m 53s\n", 997 | "1400:\tlearn: 5.2540337\ttest: 5.2540337\ttest1: 24.4766775\tbest: 24.4731437 (1393)\ttotal: 1m 48s\tremaining: 4m 38s\n", 998 | "1600:\tlearn: 4.4462344\ttest: 4.4462344\ttest1: 24.4213427\tbest: 24.4165306 (1554)\ttotal: 2m 3s\tremaining: 4m 23s\n", 999 | "1800:\tlearn: 3.7754953\ttest: 3.7754953\ttest1: 24.3620337\tbest: 24.3616876 (1799)\ttotal: 2m 19s\tremaining: 4m 7s\n", 1000 | "2000:\tlearn: 3.2344382\ttest: 3.2344382\ttest1: 24.3428412\tbest: 24.3394612 (1989)\ttotal: 2m 35s\tremaining: 3m 52s\n", 1001 | "2200:\tlearn: 2.7685777\ttest: 2.7685777\ttest1: 24.3236314\tbest: 24.3221403 (2180)\ttotal: 2m 50s\tremaining: 3m 36s\n", 1002 | "2400:\tlearn: 2.3949253\ttest: 2.3949253\ttest1: 24.2920106\tbest: 24.2873041 (2381)\ttotal: 3m 7s\tremaining: 3m 23s\n", 1003 | "2600:\tlearn: 2.0633171\ttest: 2.0633171\ttest1: 
24.2726954\tbest: 24.2704865 (2591)\ttotal: 3m 23s\tremaining: 3m 7s\n", 1004 | "2800:\tlearn: 1.7723446\ttest: 1.7723446\ttest1: 24.2615666\tbest: 24.2615666 (2800)\ttotal: 3m 38s\tremaining: 2m 51s\n", 1005 | "3000:\tlearn: 1.5351622\ttest: 1.5351622\ttest1: 24.2568203\tbest: 24.2562337 (2999)\ttotal: 3m 54s\tremaining: 2m 36s\n", 1006 | "3200:\tlearn: 1.3320583\ttest: 1.3320583\ttest1: 24.2472974\tbest: 24.2448250 (3073)\ttotal: 4m 9s\tremaining: 2m 20s\n", 1007 | "3400:\tlearn: 1.1575297\ttest: 1.1575297\ttest1: 24.2437464\tbest: 24.2397267 (3377)\ttotal: 4m 25s\tremaining: 2m 4s\n", 1008 | "3600:\tlearn: 1.0090671\ttest: 1.0090671\ttest1: 24.2403057\tbest: 24.2383831 (3579)\ttotal: 4m 41s\tremaining: 1m 49s\n", 1009 | "3800:\tlearn: 0.8775672\ttest: 0.8775672\ttest1: 24.2342476\tbest: 24.2341390 (3798)\ttotal: 4m 56s\tremaining: 1m 33s\n", 1010 | "4000:\tlearn: 0.7692448\ttest: 0.7692448\ttest1: 24.2295141\tbest: 24.2291590 (3999)\ttotal: 5m 12s\tremaining: 1m 17s\n", 1011 | "4200:\tlearn: 0.6720400\ttest: 0.6720400\ttest1: 24.2257068\tbest: 24.2257068 (4200)\ttotal: 5m 29s\tremaining: 1m 2s\n", 1012 | "4400:\tlearn: 0.5881158\ttest: 0.5881158\ttest1: 24.2220132\tbest: 24.2199265 (4352)\ttotal: 5m 46s\tremaining: 47.1s\n", 1013 | "4600:\tlearn: 0.5165693\ttest: 0.5165693\ttest1: 24.2179745\tbest: 24.2179409 (4599)\ttotal: 6m 1s\tremaining: 31.4s\n", 1014 | "4800:\tlearn: 0.4526341\ttest: 0.4526341\ttest1: 24.2183527\tbest: 24.2169859 (4780)\ttotal: 6m 17s\tremaining: 15.7s\n", 1015 | "4999:\tlearn: 0.3940131\ttest: 0.3940131\ttest1: 24.2156999\tbest: 24.2153069 (4902)\ttotal: 6m 33s\tremaining: 0us\n", 1016 | "\n", 1017 | "bestTest = 24.21530691\n", 1018 | "bestIteration = 4902\n", 1019 | "\n", 1020 | "Shrink model to first 4903 iterations.\n", 1021 | "err: 24.215306777079714\n" 1022 | ], 1023 | "name": "stdout" 1024 | }, 1025 | { 1026 | "output_type": "execute_result", 1027 | "data": { 1028 | "text/plain": [ 1029 | "22.790050240627117" 1030 | ] 1031 | }, 1032 
| "metadata": { 1033 | "tags": [] 1034 | }, 1035 | "execution_count": 117 1036 | } 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "code", 1041 | "metadata": { 1042 | "id": "3Oy1kH8gm9OO", 1043 | "colab_type": "code", 1044 | "outputId": "27b74a6b-2e32-4f91-b23b-9a82e4a0a4c5", 1045 | "colab": { 1046 | "base_uri": "https://localhost:8080/", 1047 | "height": 34 1048 | } 1049 | }, 1050 | "source": [ 1051 | "np.mean(errcb2)" 1052 | ], 1053 | "execution_count": 0, 1054 | "outputs": [ 1055 | { 1056 | "output_type": "execute_result", 1057 | "data": { 1058 | "text/plain": [ 1059 | "22.790050240627117" 1060 | ] 1061 | }, 1062 | "metadata": { 1063 | "tags": [] 1064 | }, 1065 | "execution_count": 118 1066 | } 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "code", 1071 | "metadata": { 1072 | "id": "SnMxF4rWm9OR", 1073 | "colab_type": "code", 1074 | "colab": {} 1075 | }, 1076 | "source": [ 1077 | "d = {'ID': test_id, 'target': np.mean(y_pred_totcb2, 0)}\n", 1078 | "sub = pd.DataFrame(data=d)\n", 1079 | "sub = sub[['ID', 'target']]" 1080 | ], 1081 | "execution_count": 0, 1082 | "outputs": [] 1083 | }, 1084 | { 1085 | "cell_type": "code", 1086 | "metadata": { 1087 | "id": "_azuEaJpICfS", 1088 | "colab_type": "code", 1089 | "colab": {} 1090 | }, 1091 | "source": [ 1092 | "sub.to_csv('indianda.csv',index=False)\n", 1093 | "from google.colab import files\n", 1094 | "files.download(\"indianda.csv\")" 1095 | ], 1096 | "execution_count": 0, 1097 | "outputs": [] 1098 | } 1099 | ] 1100 | } --------------------------------------------------------------------------------