├── README.md
├── Final_blend.ipynb
├── XGBOOST_notebook.ipynb
├── ANOTHERLGBMODEL.ipynb
├── new_data_creation.ipynb
└── indianda.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # Please read this before you run the kernels
2 |
3 | 1- RUN ANOTHERLGBMODEL.ipynb to get ANOTHERLGB.csv
4 |
5 | 2- RUN new_data_creation.ipynb to get the new data. Here is a link to the input data (it is a PRIVATE dataset): www.kaggle.com/dataset/198175af5d025f8e7e9d6f28f34b6d7ecfc58f242d4049bdc6b50462e5821b44
6 | (Of course, I will not share this link when I share the solution.)
7 |
8 | 3-RUN submission1 to get submission1.csv
9 |
10 | 4-RUN XGBOOST_notebook.ipynb to get subxgb.csv
11 |
12 | 5-RUN _submission2.ipynb to get submission2.csv
13 |
14 | 6-RUN EKHERMOU7AWLA.ipynb to get ekher_mou7awla.csv
15 |
16 | 7-RUN indianda.ipynb TO get indiana.csv
17 |
18 |
19 | 8- After running those kernels, you can run Final_blend.ipynb to get sub_finale.csv
20 |
21 |
22 |
23 | ## [On the Leaderboard](https://zindi.africa/competitions/airqo-ugandan-air-quality-forecast-challenge/leaderboard)
24 |
25 | Look for the team named : **Forecasters**
26 | Rank : 12/501
27 |
28 | ## Authors
29 |
30 |
31 |
32 | | Name | Zindi ID | Github ID |
33 | |----------------|--------------------------------------------------|------------------------------------------|
34 | |Azer KSOURI |[@plndz](https://zindi.africa/users/plndz) |[@Az-Ks](https://github.com/Az-Ks) |
35 | |Helmi Klai |[@Klai](https://zindi.africa/users/Klai) |[@Klaimohelmi](https://github.com/Klaimohelmi) |
36 | |Muhamed TUO |[@Muhamed_Tuo](https://zindi.africa/users/Muhamed_Tuo) |[@NazarioR9](https://github.com/NazarioR9)|
37 | |Saurabh Kumar |[@Saurabh502](https://zindi.africa/users/Saurabh502) |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/Final_blend.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "Final_blend.ipynb",
7 | "provenance": [],
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | }
14 | },
15 | "cells": [
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {
19 | "id": "view-in-github",
20 | "colab_type": "text"
21 | },
22 | "source": [
23 | "
"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {
29 | "id": "24e_LkQNwtip",
30 | "colab_type": "text"
31 | },
32 | "source": [
33 | "FIRST BLEND"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "metadata": {
39 | "id": "m1VpWdXKvMih",
40 | "colab_type": "code",
41 | "colab": {}
42 | },
43 | "source": [
44 | "import pandas as pd \n",
45 | "\n",
46 | "\n",
47 | "submission1 = pd.read_csv('submission1.csv')\n",
48 | "sub_xgb = pd.read_csv('subxgb.csv')\n",
49 | "\n",
50 | "submission_ = submission1.copy()\n",
51 | "\n",
52 | "submission_['target'] = sub_xgb['target']*0.3 + submission1['target']*0.7"
53 | ],
54 | "execution_count": 1,
55 | "outputs": []
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {
60 | "id": "lT0xCG7pwxcW",
61 | "colab_type": "text"
62 | },
63 | "source": [
64 | "**SECOND BLEND**"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "metadata": {
70 | "id": "dNTJwhfbw3ZH",
71 | "colab_type": "code",
72 | "colab": {}
73 | },
74 | "source": [
75 | "submission2 =pd.read_csv('submission2.csv')\n",
76 | "comb1 = submission_.copy()  # comb1 was never defined in this notebook; start from a copy of the first blend\n",
77 | "comb1['target'] = submission_['target']*0.5 + submission2['target']*0.5"
78 | ],
79 | "execution_count": 7,
80 | "outputs": []
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "metadata": {
85 | "id": "zhJJ5yA6xbsW",
86 | "colab_type": "text"
87 | },
88 | "source": [
89 | "**FINAL BLEND**"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "metadata": {
95 | "id": "UDXHuUPexfV4",
96 | "colab_type": "code",
97 | "colab": {}
98 | },
99 | "source": [
100 | "#comb1 * 0.4288 + indianda * 0.1072 + liyjib37.07lgb * 0.264 + 0.2 * ekher mouhawla\n",
101 | "\n",
102 | "\n",
103 | "indiana =pd.read_csv('indiana.csv')\n",
104 | "another_lgb = pd.read_csv('ANOTHERLGB.csv')\n",
105 | "ekher_mou7awla =pd.read_csv('ekher_mou7awla.csv')\n",
106 | "\n",
107 | "\n",
108 | "\n",
109 | "sub_finale= indiana.copy()\n",
110 | "\n",
111 | "sub_finale['target'] = comb1['target']*0.4288 + indiana['target']*0.1072 + another_lgb['target']* 0.264 + ekher_mou7awla['target']*0.2"
112 | ],
113 | "execution_count": 9,
114 | "outputs": []
115 | },
116 | {
117 | "cell_type": "code",
118 | "metadata": {
119 | "id": "e0vbG9Ez0bmI",
120 | "colab_type": "code",
121 | "colab": {}
122 | },
123 | "source": [
124 | "sub_finale.to_csv('sub_finale.csv',index=False)"
125 | ],
126 | "execution_count": 10,
127 | "outputs": []
128 | },
129 | {
130 | "cell_type": "code",
131 | "metadata": {
132 | "id": "O48UkdYhYWA3",
133 | "colab_type": "code",
134 | "colab": {}
135 | },
136 | "source": [
137 | ""
138 | ],
139 | "execution_count": null,
140 | "outputs": []
141 | }
142 | ]
143 | }
--------------------------------------------------------------------------------
/XGBOOST_notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "kernelspec": {
6 | "display_name": "Python 3",
7 | "language": "python",
8 | "name": "python3"
9 | },
10 | "language_info": {
11 | "codemirror_mode": {
12 | "name": "ipython",
13 | "version": 3
14 | },
15 | "file_extension": ".py",
16 | "mimetype": "text/x-python",
17 | "name": "python",
18 | "nbconvert_exporter": "python",
19 | "pygments_lexer": "ipython3",
20 | "version": "3.6.8"
21 | },
22 | "colab": {
23 | "name": "XGBOOST_.ipynb",
24 | "provenance": [],
25 | "collapsed_sections": [],
26 | "include_colab_link": true
27 | },
28 | "accelerator": "GPU"
29 | },
30 | "cells": [
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {
34 | "id": "view-in-github",
35 | "colab_type": "text"
36 | },
37 | "source": [
38 | "
"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "id": "ZbVtKR2DolJZ",
45 | "colab_type": "text"
46 | },
47 | "source": [
48 | "## **PLEASE CHANGE YOUR RUNTIME TYPE TO GPU**"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "metadata": {
54 | "id": "lygrd_4THhsP",
55 | "colab_type": "code",
56 | "colab": {}
57 | },
58 | "source": [
59 | "import pandas as pd \n",
60 | "import numpy as np \n",
61 | "from tqdm import tqdm\n",
62 | "import math\n",
63 | "import gc"
64 | ],
65 | "execution_count": 0,
66 | "outputs": []
67 | },
68 | {
69 | "cell_type": "code",
70 | "metadata": {
71 | "id": "QDbr-4b3nuGb",
72 | "colab_type": "code",
73 | "colab": {}
74 | },
75 | "source": [
76 | "from google.colab import drive\n",
77 | "drive.mount('/content/drive')"
78 | ],
79 | "execution_count": 0,
80 | "outputs": []
81 | },
82 | {
83 | "cell_type": "code",
84 | "metadata": {
85 | "id": "ZZF7jjBgnuJa",
86 | "colab_type": "code",
87 | "colab": {}
88 | },
89 | "source": [
90 | "path = '/content/drive/My Drive/'"
91 | ],
92 | "execution_count": 0,
93 | "outputs": []
94 | },
95 | {
96 | "cell_type": "code",
97 | "metadata": {
98 | "id": "tRthNH90Hhsa",
99 | "colab_type": "code",
100 | "colab": {}
101 | },
102 | "source": [
103 | "train=pd.read_csv(\"/content/drive/My Drive/Train (14).csv\")\n",
104 | "test=pd.read_csv(\"/content/drive/My Drive/Test (8).csv\")"
105 | ],
106 | "execution_count": 0,
107 | "outputs": []
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {
112 | "id": "mApRDmFGHhsz",
113 | "colab_type": "text"
114 | },
115 | "source": [
116 | "### Features engineering part"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "metadata": {
122 | "id": "7lskkR2rHhss",
123 | "colab_type": "code",
124 | "colab": {}
125 | },
126 | "source": [
127 | "# convert each feature from a comma-separated string to a list of values\n",
128 | "def replace_nan(x):\n",
129 | " if x==\" \":\n",
130 | " return np.nan\n",
131 | " else :\n",
132 | " return float(x)\n",
133 | "features=[\"temp\",\"precip\",\"rel_humidity\",\"wind_dir\",\"wind_spd\",\"atmos_press\"]\n",
134 | "for feature in features : \n",
135 | " train[feature]=train[feature].apply(lambda x: [ replace_nan(X) for X in x.replace(\"nan\",\" \").split(\",\")])\n",
136 | " test[feature]=test[feature].apply(lambda x: [ replace_nan(X) for X in x.replace(\"nan\",\" \").split(\",\")]) "
137 | ],
138 | "execution_count": 0,
139 | "outputs": []
140 | },
141 | {
142 | "cell_type": "code",
143 | "metadata": {
144 | "id": "sg8-zujFHhs1",
145 | "colab_type": "code",
146 | "colab": {}
147 | },
148 | "source": [
149 | "def aggregate_features(x,col_name):\n",
150 | " x[\"max_\"+col_name]=x[col_name].apply(np.max)\n",
151 | " x[\"min_\"+col_name]=x[col_name].apply(np.min)\n",
152 | " x[\"mean_\"+col_name]=x[col_name].apply(np.mean)\n",
153 | " x[\"std_\"+col_name]=x[col_name].apply(np.std)\n",
154 | " #x[\"var_\"+col_name]=x[col_name].apply(np.var)\n",
155 | " x[\"sum_\"+col_name]=x[col_name].apply(np.sum)\n",
156 | " x[\"ptp_\"+col_name]=x[col_name].apply(np.ptp)\n",
157 | " return x \n",
158 | "def remove_nan_values(x):\n",
159 | " return [e for e in x if not math.isnan(e)]"
160 | ],
161 | "execution_count": 0,
162 | "outputs": []
163 | },
164 | {
165 | "cell_type": "code",
166 | "metadata": {
167 | "id": "jSHXJxduHhs6",
168 | "colab_type": "code",
169 | "outputId": "6acc98a4-5bdd-480f-fa0a-19bb8d5d406c",
170 | "colab": {
171 | "base_uri": "https://localhost:8080/",
172 | "height": 170
173 | }
174 | },
175 | "source": [
176 | "data=pd.concat([train,test],sort=False).reset_index(drop=True)\n",
177 | "data.columns.tolist()"
178 | ],
179 | "execution_count": 0,
180 | "outputs": [
181 | {
182 | "output_type": "execute_result",
183 | "data": {
184 | "text/plain": [
185 | "['ID',\n",
186 | " 'location',\n",
187 | " 'temp',\n",
188 | " 'precip',\n",
189 | " 'rel_humidity',\n",
190 | " 'wind_dir',\n",
191 | " 'wind_spd',\n",
192 | " 'atmos_press',\n",
193 | " 'target']"
194 | ]
195 | },
196 | "metadata": {
197 | "tags": []
198 | },
199 | "execution_count": 60
200 | }
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "metadata": {
206 | "id": "K4xhuB8H3Wwb",
207 | "colab_type": "code",
208 | "colab": {}
209 | },
210 | "source": [
211 | "for x in range(121):\n",
212 | " data[\"newtemp\"+ str(x)] = data.temp.str[x]\n",
213 | " data[\"newprecip\"+ str(x)] = data.precip.str[x]\n",
214 | " data[\"newrel_humidity\"+ str(x)] = data.rel_humidity.str[x]\n",
215 | " data[\"newwind_dir\"+ str(x)] = data.wind_dir.str[x]\n",
216 | " data[\"windspeed\"+ str(x)] = data.wind_spd.str[x]\n",
217 | " data[\"atmospherepressure\"+ str(x)] = data.atmos_press.str[x]"
218 | ],
219 | "execution_count": 0,
220 | "outputs": []
221 | },
222 | {
223 | "cell_type": "code",
224 | "metadata": {
225 | "id": "1dYviSGeHhs_",
226 | "colab_type": "code",
227 | "outputId": "410c532d-f019-4c35-fe11-191f48aa8641",
228 | "colab": {
229 | "base_uri": "https://localhost:8080/",
230 | "height": 34
231 | }
232 | },
233 | "source": [
234 | "data.shape"
235 | ],
236 | "execution_count": 0,
237 | "outputs": [
238 | {
239 | "output_type": "execute_result",
240 | "data": {
241 | "text/plain": [
242 | "(20574, 735)"
243 | ]
244 | },
245 | "metadata": {
246 | "tags": []
247 | },
248 | "execution_count": 62
249 | }
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "metadata": {
255 | "id": "tROUcB5p6esB",
256 | "colab_type": "code",
257 | "outputId": "497fd88b-da88-4aa0-e363-ee3fb0b97277",
258 | "colab": {
259 | "base_uri": "https://localhost:8080/",
260 | "height": 34
261 | }
262 | },
263 | "source": [
264 | "len(data.precip[1])"
265 | ],
266 | "execution_count": 0,
267 | "outputs": [
268 | {
269 | "output_type": "execute_result",
270 | "data": {
271 | "text/plain": [
272 | "121"
273 | ]
274 | },
275 | "metadata": {
276 | "tags": []
277 | },
278 | "execution_count": 63
279 | }
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "metadata": {
285 | "id": "cNcORQK-HhtG",
286 | "colab_type": "code",
287 | "outputId": "6407285c-9708-4cb5-f226-46c83fefc96f",
288 | "colab": {
289 | "base_uri": "https://localhost:8080/",
290 | "height": 34
291 | }
292 | },
293 | "source": [
294 | "for col_name in tqdm(features):\n",
295 | " data[col_name]=data[col_name].apply(remove_nan_values)"
296 | ],
297 | "execution_count": 0,
298 | "outputs": [
299 | {
300 | "output_type": "stream",
301 | "text": [
302 | "100%|██████████| 6/6 [00:02<00:00, 2.45it/s]\n"
303 | ],
304 | "name": "stderr"
305 | }
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "metadata": {
311 | "id": "K9iStUBEHhtO",
312 | "colab_type": "code",
313 | "outputId": "d81d1df4-a973-4d50-933b-50197a69cc6c",
314 | "colab": {
315 | "base_uri": "https://localhost:8080/",
316 | "height": 34
317 | }
318 | },
319 | "source": [
320 | "for col_name in tqdm(features):\n",
321 | " data=aggregate_features(data,col_name)"
322 | ],
323 | "execution_count": 0,
324 | "outputs": [
325 | {
326 | "output_type": "stream",
327 | "text": [
328 | "100%|██████████| 6/6 [00:14<00:00, 2.45s/it]\n"
329 | ],
330 | "name": "stderr"
331 | }
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "metadata": {
337 | "id": "aiAM7WA3HhtR",
338 | "colab_type": "code",
339 | "colab": {}
340 | },
341 | "source": [
342 | "data.drop(features,axis=1,inplace=True)  # axis must be a keyword: positional axis was removed in pandas 2.0"
343 | ],
344 | "execution_count": 0,
345 | "outputs": []
346 | },
347 | {
348 | "cell_type": "code",
349 | "metadata": {
350 | "id": "aG-kLSttrM41",
351 | "colab_type": "code",
352 | "colab": {}
353 | },
354 | "source": [
355 | "hum_features = list(data.filter(regex='rel_humidity.*').columns)\n",
356 | "temp_features = list( data.filter(regex='newtemp.*').columns) \n",
357 | "precip_features = list(data.filter(regex='newprecip*').columns)\n",
358 | "winddir_features = list( data.filter(regex='newwind_dir.*').columns)\n",
359 | "windspead_features = list( data.filter(regex='windspeed.*').columns)\n",
360 | "atm_features = list(data.filter(regex='atmos.*').columns)\n",
361 | "\n",
362 | "\n",
363 | "\n",
364 | "\n",
365 | "hum_features= hum_features[36:]\n",
366 | "temp_features=temp_features[36:] \n",
367 | "precip_features=precip_features[31:]\n",
368 | "winddir_features=winddir_features[36:]\n",
369 | "windspead_features=windspead_features[36:]\n",
370 | "atm_features=atm_features[36:]\n",
371 | "\n",
372 | "\n",
373 | "\n",
374 | "\n",
375 | "\n",
376 | "data[hum_features]= data[hum_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n",
377 | "\n",
378 | "\n",
379 | "\n",
380 | "data[temp_features]= data[temp_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n",
381 | "\n",
382 | "\n",
383 | "data[precip_features]= data[precip_features].apply(lambda x: x.fillna(float(0.0)),axis=1)\n",
384 | "\n",
385 | "\n",
386 | "\n",
387 | "data[winddir_features]= data[winddir_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n",
388 | "\n",
389 | "\n",
390 | "\n",
391 | "data[windspead_features]= data[windspead_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n",
392 | "\n",
393 | "\n",
394 | "\n",
395 | "\n",
396 | "data[atm_features]= data[atm_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n"
397 | ],
398 | "execution_count": 0,
399 | "outputs": []
400 | },
401 | {
402 | "cell_type": "code",
403 | "metadata": {
404 | "id": "b-inscoEHhtU",
405 | "colab_type": "code",
406 | "colab": {}
407 | },
408 | "source": [
409 | "train=data[data.target.notnull()].reset_index(drop=True)\n",
410 | "test=data[data.target.isna()].reset_index(drop=True)"
411 | ],
412 | "execution_count": 0,
413 | "outputs": []
414 | },
415 | {
416 | "cell_type": "code",
417 | "metadata": {
418 | "id": "fS2h1hJcaASK",
419 | "colab_type": "code",
420 | "outputId": "63cf9ce6-6cf1-4b74-b36d-e0769a2df411",
421 | "colab": {
422 | "base_uri": "https://localhost:8080/",
423 | "height": 34
424 | }
425 | },
426 | "source": [
427 | "train.shape,test.shape"
428 | ],
429 | "execution_count": 0,
430 | "outputs": [
431 | {
432 | "output_type": "execute_result",
433 | "data": {
434 | "text/plain": [
435 | "((15539, 765), (5035, 765))"
436 | ]
437 | },
438 | "metadata": {
439 | "tags": []
440 | },
441 | "execution_count": 69
442 | }
443 | ]
444 | },
445 | {
446 | "cell_type": "markdown",
447 | "metadata": {
448 | "id": "mWcrHrV5CSPx",
449 | "colab_type": "text"
450 | },
451 | "source": [
452 | "### end of la3b"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "metadata": {
458 | "id": "EjI-PlKfvJno",
459 | "colab_type": "code",
460 | "colab": {}
461 | },
462 | "source": [
463 | "import pandas as pd\n",
464 | "import numpy as np\n",
465 | "import matplotlib.pyplot as plt\n",
466 | "import seaborn as sns\n",
467 | "import warnings\n",
468 | "warnings.filterwarnings('ignore')\n",
469 | "\n",
470 | "from math import sqrt \n",
471 | "import lightgbm as lgb\n",
472 | "from sklearn.metrics import mean_squared_error \n",
473 | "from sklearn.model_selection import KFold, train_test_split\n",
474 | "import requests\n",
475 | "from io import StringIO \n",
476 | "%matplotlib inline\n",
477 | "pd.set_option(\"display.max_rows\", 100)\n",
478 | "pd.set_option(\"display.max_columns\", 100)\n",
479 | "import numpy as np\n",
480 | "import matplotlib.pyplot as plt\n",
481 | "import seaborn as sns\n",
482 | "import warnings\n",
483 | "warnings.filterwarnings('ignore')\n",
484 | "\n",
485 | "from math import sqrt \n",
486 | "import lightgbm as lgb\n",
487 | "from sklearn.metrics import mean_squared_error \n",
488 | "from sklearn.model_selection import KFold, train_test_split\n",
489 | "import requests\n",
490 | "from io import StringIO \n",
491 | "%matplotlib inline\n",
492 | "pd.set_option(\"display.max_rows\", 100)\n",
493 | "pd.set_option(\"display.max_columns\", 100)"
494 | ],
495 | "execution_count": 0,
496 | "outputs": []
497 | },
498 | {
499 | "cell_type": "code",
500 | "metadata": {
501 | "id": "hs2TN42eqkR6",
502 | "colab_type": "code",
503 | "outputId": "28ad2dd0-e4d3-4c36-e19a-f7de3d8c3db1",
504 | "colab": {
505 | "base_uri": "https://localhost:8080/",
506 | "height": 34
507 | }
508 | },
509 | "source": [
510 | "test.drop(\"target\",axis = 1,inplace = True)\n",
511 | "train.shape,test.shape"
512 | ],
513 | "execution_count": 0,
514 | "outputs": [
515 | {
516 | "output_type": "execute_result",
517 | "data": {
518 | "text/plain": [
519 | "((15539, 765), (5035, 764))"
520 | ]
521 | },
522 | "metadata": {
523 | "tags": []
524 | },
525 | "execution_count": 71
526 | }
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "metadata": {
532 | "id": "TyT7Uu02rEDg",
533 | "colab_type": "code",
534 | "outputId": "4a027e7a-25ca-4afd-ab36-25077b690a38",
535 | "colab": {
536 | "base_uri": "https://localhost:8080/",
537 | "height": 34
538 | }
539 | },
540 | "source": [
541 | "from sklearn.preprocessing import LabelEncoder\n",
542 | "lab = LabelEncoder()\n",
543 | "lab.fit(train[\"location\"])\n",
544 | "train.location = lab.transform(train.location)\n",
545 | "test.location = lab.transform(test.location)\n",
546 | "train.target.min(),train.target.max()"
547 | ],
548 | "execution_count": 0,
549 | "outputs": [
550 | {
551 | "output_type": "execute_result",
552 | "data": {
553 | "text/plain": [
554 | "(1.4526190476190477, 475.82)"
555 | ]
556 | },
557 | "metadata": {
558 | "tags": []
559 | },
560 | "execution_count": 72
561 | }
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "metadata": {
567 | "id": "xEBTcrsTrD_s",
568 | "colab_type": "code",
569 | "colab": {}
570 | },
571 | "source": [
572 | "X=train.drop([\"ID\",'target'],axis =1)\n",
573 | "y = train.target\n",
574 | "test_id = test['ID']"
575 | ],
576 | "execution_count": 0,
577 | "outputs": []
578 | },
579 | {
580 | "cell_type": "code",
581 | "metadata": {
582 | "id": "KKciRChkkgGR",
583 | "colab_type": "code",
584 | "colab": {}
585 | },
586 | "source": [
587 | "from sklearn.cluster import KMeans\n",
588 | "X=X.fillna(-9999)\n",
589 | "\n",
590 | "kmeans = KMeans(n_clusters=6).fit(X)\n",
591 | "X['cluster']=kmeans.predict(X)\n",
592 | "test=test.fillna(-9999)\n",
593 | "test['cluster']=kmeans.predict(test.drop([\"ID\"],axis = 1))\n",
594 | "\n",
595 | "\n"
596 | ],
597 | "execution_count": 0,
598 | "outputs": []
599 | },
600 | {
601 | "cell_type": "code",
602 | "metadata": {
603 | "id": "OS2hPCB3kgN6",
604 | "colab_type": "code",
605 | "colab": {}
606 | },
607 | "source": [
608 | "air_temperature_filler = pd.DataFrame(X.groupby(['location','mean_wind_dir'])['mean_temp'].mean())\n",
609 | "X['mean temp per month']=0\n",
610 | "for i in range (len(X)):\n",
611 | " \n",
612 | " X['mean temp per month'][i]=air_temperature_filler.loc[(X['location'][i],X['mean_wind_dir'][i]), :]\n",
613 | "\n",
614 | "air_temperature_filler = pd.DataFrame(test.groupby(['location','mean_wind_dir'])['mean_temp'].mean())\n",
615 | "\n",
616 | "test['mean temp per month']=0\n",
617 | "for i in range (len(test)):\n",
618 | " \n",
619 | " test['mean temp per month'][i]=air_temperature_filler.loc[(test['location'][i],test['mean_wind_dir'][i]), :]"
620 | ],
621 | "execution_count": 0,
622 | "outputs": []
623 | },
624 | {
625 | "cell_type": "code",
626 | "metadata": {
627 | "id": "N2ciDCovkgVu",
628 | "colab_type": "code",
629 | "colab": {}
630 | },
631 | "source": [
632 | "p0=['newtemp0', 'newprecip0', 'newrel_humidity0', 'newwind_dir0',\n",
633 | " 'windspeed0', 'atmospherepressure0']\n",
634 | "p4=['newtemp30', 'newprecip30', 'newrel_humidity30', 'newwind_dir30',\n",
635 | " 'windspeed30', 'atmospherepressure30'] \n",
636 | "p1=['newtemp60', 'newprecip60', 'newrel_humidity60', 'newwind_dir60',\n",
637 | " 'windspeed60', 'atmospherepressure60'] \n",
638 | "p2=['newtemp90', 'newprecip90', 'newrel_humidity90', 'newwind_dir90',\n",
639 | " 'windspeed90', 'atmospherepressure90'] \n",
640 | "p3=['newtemp120', 'newprecip120', 'newrel_humidity120', 'newwind_dir120',\n",
641 | " 'windspeed120', 'atmospherepressure120'] \n",
642 | "from sklearn.decomposition import PCA, FastICA\n",
643 | "pca = PCA(random_state=42,n_components=1)\n",
644 | "train_pca = pca.fit_transform(X[p0])\n",
645 | "X['pca_0'] = train_pca[:,0]\n",
646 | "test_pca = pca.fit_transform(test[p0])\n",
647 | "test['pca_0'] = test_pca[:,0]\n",
648 | "train_pca = pca.fit_transform(X[p1])\n",
649 | "X['pca_1'] = train_pca[:,0]\n",
650 | "test_pca = pca.fit_transform(test[p1])\n",
651 | "test['pca_1'] = test_pca[:,0]\n",
652 | "train_pca = pca.fit_transform(X[p2])\n",
653 | "X['pca_2'] = train_pca[:,0]\n",
654 | "test_pca = pca.fit_transform(test[p2])\n",
655 | "test['pca_2'] = test_pca[:,0]\n",
656 | "\n",
657 | "train_pca = pca.fit_transform(X[p3])\n",
658 | "X['pca_3'] = train_pca[:,0]\n",
659 | "test_pca = pca.fit_transform(test[p3])\n",
660 | "test['pca_3'] = test_pca[:,0]\n",
661 | "\n",
662 | "train_pca = pca.fit_transform(X[p4])\n",
663 | "X['pca_4'] = train_pca[:,0]\n",
664 | "test_pca = pca.fit_transform(test[p4])\n",
665 | "test['pca_4'] = test_pca[:,0]\n",
666 | "\n",
667 | "test=test.drop(columns=['pca_2','pca_4'])\n",
668 | "X=X.drop(columns=['pca_4','pca_2'])"
669 | ],
670 | "execution_count": 0,
671 | "outputs": []
672 | },
673 | {
674 | "cell_type": "code",
675 | "metadata": {
676 | "id": "9Ju_Y7wQkgTJ",
677 | "colab_type": "code",
678 | "colab": {}
679 | },
680 | "source": [
681 | "X=X.drop(columns=['newtemp105', 'newprecip105', 'newrel_humidity105', 'newwind_dir105',\n",
682 | " 'windspeed105', 'atmospherepressure105','newtemp9', 'newprecip9', 'newrel_humidity9', 'newwind_dir9',\n",
683 | " 'windspeed9', 'atmospherepressure9'])\n",
684 | "test=test.drop(columns=['newtemp105', 'newprecip105', 'newrel_humidity105', 'newwind_dir105',\n",
685 | " 'windspeed105', 'atmospherepressure105','newtemp9', 'newprecip9', 'newrel_humidity9', 'newwind_dir9',\n",
686 | " 'windspeed9', 'atmospherepressure9']) "
687 | ],
688 | "execution_count": 0,
689 | "outputs": []
690 | },
691 | {
692 | "cell_type": "markdown",
693 | "metadata": {
694 | "id": "gSK7Goien71V",
695 | "colab_type": "text"
696 | },
697 | "source": [
698 | "## **hana and hanat are the data generated by ARIMA**\n",
699 | "## PLEASE CHECK THAT YOU HAVE UPLOADED THEM"
700 | ]
701 | },
702 | {
703 | "cell_type": "code",
704 | "metadata": {
705 | "id": "9PxHJ_n7eSiI",
706 | "colab_type": "code",
707 | "colab": {}
708 | },
709 | "source": [
710 | "hana=pd.read_csv('/content/winddirforecasttrain ADD THOSE COLUMNS TO TRAIN.csv')"
711 | ],
712 | "execution_count": 0,
713 | "outputs": []
714 | },
715 | {
716 | "cell_type": "code",
717 | "metadata": {
718 | "id": "oSm55maceSu6",
719 | "colab_type": "code",
720 | "colab": {}
721 | },
722 | "source": [
723 | "hanat=pd.read_csv('/content/winddirforecasttest ADD THOSE FEATURES TO TEST.csv')"
724 | ],
725 | "execution_count": 0,
726 | "outputs": []
727 | },
728 | {
729 | "cell_type": "code",
730 | "metadata": {
731 | "id": "ApLZ3O8lfLHE",
732 | "colab_type": "code",
733 | "outputId": "ef927c34-7793-4f7d-917a-c97463d50a0b",
734 | "colab": {
735 | "base_uri": "https://localhost:8080/",
736 | "height": 224
737 | }
738 | },
739 | "source": [
740 | "hanat.head()"
741 | ],
742 | "execution_count": 0,
743 | "outputs": [
744 | {
745 | "output_type": "execute_result",
746 | "data": {
747 | "text/html": [
748 | "\n",
749 | "\n",
762 | "
\n",
763 | " \n",
764 | " \n",
765 | " | \n",
766 | " 0 | \n",
767 | " 1 | \n",
768 | " 2 | \n",
769 | " 3 | \n",
770 | " 4 | \n",
771 | " 5 | \n",
772 | " 6 | \n",
773 | " 7 | \n",
774 | " 8 | \n",
775 | " 9 | \n",
776 | " 10 | \n",
777 | " 11 | \n",
778 | " 12 | \n",
779 | " 13 | \n",
780 | " 14 | \n",
781 | " 15 | \n",
782 | " 16 | \n",
783 | " 17 | \n",
784 | "
\n",
785 | " \n",
786 | " \n",
787 | " \n",
788 | " | 0 | \n",
789 | " 197.250658 | \n",
790 | " 212.394441 | \n",
791 | " 219.223270 | \n",
792 | " 222.302612 | \n",
793 | " 223.691189 | \n",
794 | " 224.317343 | \n",
795 | " 224.599697 | \n",
796 | " 224.727019 | \n",
797 | " 224.784433 | \n",
798 | " 224.810323 | \n",
799 | " 224.821998 | \n",
800 | " 224.827262 | \n",
801 | " 224.829636 | \n",
802 | " 224.830707 | \n",
803 | " 224.831189 | \n",
804 | " 224.831407 | \n",
805 | " 224.831505 | \n",
806 | " 224.831549 | \n",
807 | "
\n",
808 | " \n",
809 | " | 1 | \n",
810 | " 146.588427 | \n",
811 | " 165.851197 | \n",
812 | " 176.443787 | \n",
813 | " 182.268648 | \n",
814 | " 185.471737 | \n",
815 | " 187.233115 | \n",
816 | " 188.201695 | \n",
817 | " 188.734318 | \n",
818 | " 189.027206 | \n",
819 | " 189.188266 | \n",
820 | " 189.276832 | \n",
821 | " 189.325535 | \n",
822 | " 189.352316 | \n",
823 | " 189.367044 | \n",
824 | " 189.375142 | \n",
825 | " 189.379595 | \n",
826 | " 189.382044 | \n",
827 | " 189.383391 | \n",
828 | "
\n",
829 | " \n",
830 | " | 2 | \n",
831 | " 165.153288 | \n",
832 | " 165.153288 | \n",
833 | " 165.153288 | \n",
834 | " 165.153288 | \n",
835 | " 165.153288 | \n",
836 | " 165.153288 | \n",
837 | " 165.153288 | \n",
838 | " 165.153288 | \n",
839 | " 165.153288 | \n",
840 | " 165.153288 | \n",
841 | " 165.153288 | \n",
842 | " 165.153288 | \n",
843 | " 165.153288 | \n",
844 | " 165.153288 | \n",
845 | " 165.153288 | \n",
846 | " 165.153288 | \n",
847 | " 165.153288 | \n",
848 | " 165.153288 | \n",
849 | "
\n",
850 | " \n",
851 | " | 3 | \n",
852 | " 268.745697 | \n",
853 | " 236.704786 | \n",
854 | " 220.128369 | \n",
855 | " 211.552532 | \n",
856 | " 207.115808 | \n",
857 | " 204.820462 | \n",
858 | " 203.632961 | \n",
859 | " 203.018605 | \n",
860 | " 202.700768 | \n",
861 | " 202.536334 | \n",
862 | " 202.451263 | \n",
863 | " 202.407252 | \n",
864 | " 202.384483 | \n",
865 | " 202.372703 | \n",
866 | " 202.366609 | \n",
867 | " 202.363456 | \n",
868 | " 202.361825 | \n",
869 | " 202.360981 | \n",
870 | "
\n",
871 | " \n",
872 | " | 4 | \n",
873 | " 185.847258 | \n",
874 | " 185.847258 | \n",
875 | " 185.847258 | \n",
876 | " 185.847258 | \n",
877 | " 185.847258 | \n",
878 | " 185.847258 | \n",
879 | " 185.847258 | \n",
880 | " 185.847258 | \n",
881 | " 185.847258 | \n",
882 | " 185.847258 | \n",
883 | " 185.847258 | \n",
884 | " 185.847258 | \n",
885 | " 185.847258 | \n",
886 | " 185.847258 | \n",
887 | " 185.847258 | \n",
888 | " 185.847258 | \n",
889 | " 185.847258 | \n",
890 | " 185.847258 | \n",
891 | "
\n",
892 | " \n",
893 | "
\n",
894 | "
"
895 | ],
896 | "text/plain": [
897 | " 0 1 2 3 4 5 \\\n",
898 | "0 197.250658 212.394441 219.223270 222.302612 223.691189 224.317343 \n",
899 | "1 146.588427 165.851197 176.443787 182.268648 185.471737 187.233115 \n",
900 | "2 165.153288 165.153288 165.153288 165.153288 165.153288 165.153288 \n",
901 | "3 268.745697 236.704786 220.128369 211.552532 207.115808 204.820462 \n",
902 | "4 185.847258 185.847258 185.847258 185.847258 185.847258 185.847258 \n",
903 | "\n",
904 | " 6 7 8 9 10 11 \\\n",
905 | "0 224.599697 224.727019 224.784433 224.810323 224.821998 224.827262 \n",
906 | "1 188.201695 188.734318 189.027206 189.188266 189.276832 189.325535 \n",
907 | "2 165.153288 165.153288 165.153288 165.153288 165.153288 165.153288 \n",
908 | "3 203.632961 203.018605 202.700768 202.536334 202.451263 202.407252 \n",
909 | "4 185.847258 185.847258 185.847258 185.847258 185.847258 185.847258 \n",
910 | "\n",
911 | " 12 13 14 15 16 17 \n",
912 | "0 224.829636 224.830707 224.831189 224.831407 224.831505 224.831549 \n",
913 | "1 189.352316 189.367044 189.375142 189.379595 189.382044 189.383391 \n",
914 | "2 165.153288 165.153288 165.153288 165.153288 165.153288 165.153288 \n",
915 | "3 202.384483 202.372703 202.366609 202.363456 202.361825 202.360981 \n",
916 | "4 185.847258 185.847258 185.847258 185.847258 185.847258 185.847258 "
917 | ]
918 | },
919 | "metadata": {
920 | "tags": []
921 | },
922 | "execution_count": 80
923 | }
924 | ]
925 | },
926 | {
927 | "cell_type": "code",
928 | "metadata": {
929 | "id": "YjqMLxIBiXc7",
930 | "colab_type": "code",
931 | "colab": {}
932 | },
933 | "source": [
934 | "for i in hana.columns :\n",
935 | " X[i]=hana[i]\n",
936 | "for i in hanat :\n",
937 | " test[i]=hanat[i]"
938 | ],
939 | "execution_count": 0,
940 | "outputs": []
941 | },
942 | {
943 | "cell_type": "code",
944 | "metadata": {
945 | "id": "bEZT2dUUinEj",
946 | "colab_type": "code",
947 | "outputId": "9f596bb0-b364-4cd9-8327-82185537eb14",
948 | "colab": {
949 | "base_uri": "https://localhost:8080/",
950 | "height": 34
951 | }
952 | },
953 | "source": [
954 | "X.shape,test.shape"
955 | ],
956 | "execution_count": 0,
957 | "outputs": [
958 | {
959 | "output_type": "execute_result",
960 | "data": {
961 | "text/plain": [
962 | "((15539, 774), (5035, 775))"
963 | ]
964 | },
965 | "metadata": {
966 | "tags": []
967 | },
968 | "execution_count": 82
969 | }
970 | ]
971 | },
972 | {
973 | "cell_type": "code",
974 | "metadata": {
975 | "id": "Vuv40f3DJOes",
976 | "colab_type": "code",
977 | "colab": {}
978 | },
979 | "source": [
980 | "import xgboost as xgb\n",
981 | "\n",
982 | "params = {\n",
983 | " 'gpu_id': 0, \n",
984 | " #'n_gpus': 2, \n",
985 | " 'objective': 'reg:squarederror', \n",
986 | " 'eval_metric': 'rmse', \n",
987 | " \n",
988 | " 'booster': 'gbtree', \n",
989 | " \n",
990 | " 'n_estimators': 10000, \n",
991 | " 'tree_method': 'gpu_hist', \n",
992 | " 'grow_policy': 'lossguide', \n",
993 | " 'max_depth': 8, \n",
994 | " \n",
995 | " \n",
996 | " \n",
997 | " 'learning_rate': 0.01, \n",
998 | " 'max_bin': 200, \n",
999 | " 'max_leaves': 200, \n",
1000 | " \n",
1001 | " 'reg_alpha': 10, \n",
1002 | " 'reg_lambda': 50, \n",
1003 | " 'subsample': 0.9 ,\n",
1004 | " }\n",
1005 | "model = xgb.XGBRegressor(**params)"
1006 | ],
1007 | "execution_count": 0,
1008 | "outputs": []
1009 | },
1010 | {
1011 | "cell_type": "code",
1012 | "metadata": {
1013 | "id": "MBcQ1gQXzA_c",
1014 | "colab_type": "code",
1015 | "outputId": "de37b93e-f15e-4bda-8e69-7ad74ef11e78",
1016 | "colab": {
1017 | "base_uri": "https://localhost:8080/",
1018 | "height": 170
1019 | }
1020 | },
1021 | "source": [
1022 | "model.fit(X,y)"
1023 | ],
1024 | "execution_count": 0,
1025 | "outputs": [
1026 | {
1027 | "output_type": "execute_result",
1028 | "data": {
1029 | "text/plain": [
1030 | "XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
1031 | " colsample_bynode=1, colsample_bytree=1, eval_metric='rmse',\n",
1032 | " gamma=0, gpu_id=0, grow_policy='lossguide', importance_type='gain',\n",
1033 | " learning_rate=0.01, max_bin=200, max_delta_step=0, max_depth=8,\n",
1034 | " max_leaves=200, min_child_weight=1, missing=None,\n",
1035 | " n_estimators=10000, n_jobs=1, nthread=None,\n",
1036 | " objective='reg:squarederror', random_state=0, reg_alpha=10,\n",
1037 | " reg_lambda=50, scale_pos_weight=1, seed=None, silent=None,\n",
1038 | " subsample=0.9, tree_method='gpu_hist', verbosity=1)"
1039 | ]
1040 | },
1041 | "metadata": {
1042 | "tags": []
1043 | },
1044 | "execution_count": 84
1045 | }
1046 | ]
1047 | },
1048 | {
1049 | "cell_type": "code",
1050 | "metadata": {
1051 | "id": "dnNV6t7azZ81",
1052 | "colab_type": "code",
1053 | "colab": {}
1054 | },
1055 | "source": [
1056 | "pp=model.predict(test.drop([\"ID\"],axis = 1))"
1057 | ],
1058 | "execution_count": 0,
1059 | "outputs": []
1060 | },
1061 | {
1062 | "cell_type": "code",
1063 | "metadata": {
1064 | "id": "DU4h2FgiLaOB",
1065 | "colab_type": "code",
1066 | "outputId": "0cafd63b-638a-422d-a9d7-0aa9615437f6",
1067 | "colab": {
1068 | "base_uri": "https://localhost:8080/",
1069 | "height": 34
1070 | }
1071 | },
1072 | "source": [
1073 | "d = {'ID': test_id, 'target': pp}\n",
1074 | "subxgb = pd.DataFrame(data=d)\n",
1075 | "subxgb = subxgb[['ID', 'target']]\n",
1076 | "subxgb.target.min(),subxgb.target.max()"
1077 | ],
1078 | "execution_count": 0,
1079 | "outputs": [
1080 | {
1081 | "output_type": "execute_result",
1082 | "data": {
1083 | "text/plain": [
1084 | "(15.86525821685791, 238.6833038330078)"
1085 | ]
1086 | },
1087 | "metadata": {
1088 | "tags": []
1089 | },
1090 | "execution_count": 86
1091 | }
1092 | ]
1093 | },
1094 | {
1095 | "cell_type": "code",
1096 | "metadata": {
1097 | "id": "TOCcTSqULg4_",
1098 | "colab_type": "code",
1099 | "colab": {}
1100 | },
1101 | "source": [
1102 | "subxgb.to_csv('subxgb.csv',index=False)\n",
1103 | "from google.colab import files\n",
1104 | "files.download(\"subxgb.csv\")"
1105 | ],
1106 | "execution_count": 0,
1107 | "outputs": []
1108 | }
1109 | ]
1110 | }
--------------------------------------------------------------------------------
/ANOTHERLGBMODEL.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "kernelspec": {
6 | "language": "python",
7 | "display_name": "Python 3",
8 | "name": "python3"
9 | },
10 | "language_info": {
11 | "pygments_lexer": "ipython3",
12 | "nbconvert_exporter": "python",
13 | "version": "3.6.4",
14 | "file_extension": ".py",
15 | "codemirror_mode": {
16 | "name": "ipython",
17 | "version": 3
18 | },
19 | "name": "python",
20 | "mimetype": "text/x-python"
21 | },
22 | "colab": {
23 | "name": "kernel37.07LGBAZER.ipynb",
24 | "provenance": [],
25 | "include_colab_link": true
26 | }
27 | },
28 | "cells": [
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {
32 | "id": "view-in-github",
33 | "colab_type": "text"
34 | },
35 | "source": [
36 |         "\n"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {
42 | "id": "MDcv3QxAxxUb",
43 | "colab_type": "text"
44 | },
45 | "source": [
46 |         "# Please run this notebook on Kaggle to reproduce the same score"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "metadata": {
52 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
53 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
54 | "trusted": true,
55 | "id": "HN3OBq3gxu5J",
56 | "colab_type": "code",
57 | "colab": {}
58 | },
59 | "source": [
60 | "# This Python 3 environment comes with many helpful analytics libraries installed\n",
61 | "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n",
62 | "# For example, here's several helpful packages to load\n",
63 | "\n",
64 | "import numpy as np # linear algebra\n",
65 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
66 | "\n",
67 | "# Input data files are available in the read-only \"../input/\" directory\n",
68 | "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
69 | "\n",
70 | "import os\n",
71 | "for dirname, _, filenames in os.walk('/kaggle/input'):\n",
72 | " for filename in filenames:\n",
73 | " print(os.path.join(dirname, filename))\n",
74 | "\n",
75 | "# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n",
76 | "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"
77 | ],
78 | "execution_count": 0,
79 | "outputs": []
80 | },
81 | {
82 | "cell_type": "code",
83 | "metadata": {
84 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
85 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
86 | "trusted": true,
87 | "id": "M_fa5gkqxu5N",
88 | "colab_type": "code",
89 | "colab": {}
90 | },
91 | "source": [
92 | "import os\n",
93 | "import sys\n",
94 | "import gc\n",
95 | "import math\n",
96 | "import random\n",
97 | "import pickle\n",
98 | "import pandas as pd\n",
99 | "import numpy as np\n",
100 | "import seaborn as sns\n",
101 | "from tqdm.notebook import tqdm\n",
102 | "import category_encoders as ce\n",
103 | "import matplotlib.pyplot as plt"
104 | ],
105 | "execution_count": 0,
106 | "outputs": []
107 | },
108 | {
109 | "cell_type": "code",
110 | "metadata": {
111 | "trusted": true,
112 | "id": "1W0PjzQixu5P",
113 | "colab_type": "code",
114 | "colab": {}
115 | },
116 | "source": [
117 | "from sklearn.model_selection import KFold, StratifiedKFold, train_test_split\n",
118 | "from sklearn.metrics import f1_score, confusion_matrix, classification_report\n",
119 | "from sklearn.metrics import mean_squared_error\n",
120 | "from sklearn.feature_selection import SelectFromModel\n",
121 | "from sklearn.linear_model import Lasso, LassoCV\n",
122 | "from sklearn.ensemble import RandomForestRegressor"
123 | ],
124 | "execution_count": 0,
125 | "outputs": []
126 | },
127 | {
128 | "cell_type": "code",
129 | "metadata": {
130 | "trusted": true,
131 | "id": "juUsndZAxu5S",
132 | "colab_type": "code",
133 | "colab": {}
134 | },
135 | "source": [
136 | "import lightgbm as lgbm \n",
137 | "import xgboost as xgb\n",
138 | "import catboost as cat\n",
139 | "from catboost import CatBoostRegressor, Pool, CatBoostClassifier"
140 | ],
141 | "execution_count": 0,
142 | "outputs": []
143 | },
144 | {
145 | "cell_type": "code",
146 | "metadata": {
147 | "trusted": true,
148 | "id": "PlwDxh2Axu5U",
149 | "colab_type": "code",
150 | "colab": {}
151 | },
152 | "source": [
153 | "seed = 2020\n",
154 | "random.seed(seed)\n",
155 | "np.random.seed(seed)"
156 | ],
157 | "execution_count": 0,
158 | "outputs": []
159 | },
160 | {
161 | "cell_type": "code",
162 | "metadata": {
163 | "trusted": true,
164 | "id": "nyo1ql9pxu5W",
165 | "colab_type": "code",
166 | "colab": {}
167 | },
168 | "source": [
169 | "os.makedirs('MODELS/', exist_ok=True)\n",
170 | "os.makedirs('/DATASET/CSV/', exist_ok=True)\n",
171 | "os.makedirs('/DATASET/ZIP/', exist_ok=True)\n",
172 | "os.makedirs('/DATASET/DOWNLOAD/', exist_ok=True)"
173 | ],
174 | "execution_count": 0,
175 | "outputs": []
176 | },
177 | {
178 | "cell_type": "code",
179 | "metadata": {
180 | "trusted": true,
181 | "id": "s7MPyZf-xu5Y",
182 | "colab_type": "code",
183 | "colab": {}
184 | },
185 | "source": [
186 | "mapper = {\n",
187 | " \"GOOD\": 0,\n",
188 | " \"MODERATE\": 1,\n",
189 | " \"SENSITIVE\": 2,\n",
190 | " \"UNHEALTHY\": 3,\n",
191 | " \"V_UNHEALTHY\": 4,\n",
192 | " \"HAZARDOUS\": 5\n",
193 | "}"
194 | ],
195 | "execution_count": 0,
196 | "outputs": []
197 | },
198 | {
199 | "cell_type": "code",
200 | "metadata": {
201 | "trusted": true,
202 | "id": "bcQN-kcrxu5a",
203 | "colab_type": "code",
204 | "colab": {}
205 | },
206 | "source": [
207 | "def categorize(target):\n",
208 | " if target <= 12:\n",
209 | " return \"GOOD\"\n",
210 | " elif target <=35:\n",
211 | " return \"MODERATE\"\n",
212 | " elif target <= 55:\n",
213 | " return \"SENSITIVE\"\n",
214 | " elif target <= 150:\n",
215 | " return \"UNHEALTHY\"\n",
216 | " elif target <= 250:\n",
217 | " return \"V_UNHEALTHY\"\n",
218 | " else:\n",
219 | " return \"HAZARDOUS\""
220 | ],
221 | "execution_count": 0,
222 | "outputs": []
223 | },
224 | {
225 | "cell_type": "code",
226 | "metadata": {
227 | "trusted": true,
228 | "id": "bP_lms_Cxu5d",
229 | "colab_type": "code",
230 | "colab": {}
231 | },
232 | "source": [
233 | "def split_into_days(df, features, days=5):\n",
234 | " width = 24\n",
235 | " for feature in features:\n",
236 | " for day in range(days):\n",
237 | " df[feature+'_day_'+str(day)] = df[feature].apply(lambda x: x[day*width:(day+1)*width])\n",
238 | " df[feature+'_target_reading_day'] = df[feature].apply(lambda x: x[-1])"
239 | ],
240 | "execution_count": 0,
241 | "outputs": []
242 | },
243 | {
244 | "cell_type": "code",
245 | "metadata": {
246 | "trusted": true,
247 | "id": "izS1G2WKxu5f",
248 | "colab_type": "code",
249 | "colab": {}
250 | },
251 | "source": [
252 |         "# Convert features from strings to lists of values (a blank token becomes NaN)\n",
253 | "def replace_nan(x):\n",
254 | " if x==\" \":\n",
255 | " return np.nan\n",
256 | " else :\n",
257 | " return float(x) "
258 | ],
259 | "execution_count": 0,
260 | "outputs": []
261 | },
262 | {
263 | "cell_type": "code",
264 | "metadata": {
265 | "trusted": true,
266 | "id": "_Mhiv4vXxu5h",
267 | "colab_type": "code",
268 | "colab": {}
269 | },
270 | "source": [
271 | "def aggregate_features(x,col_name):\n",
272 | " x[\"max_\"+col_name]=x[col_name].apply(np.max)\n",
273 | " x[\"min_\"+col_name]=x[col_name].apply(np.min)\n",
274 | " x[\"mean_\"+col_name]=x[col_name].apply(np.mean)\n",
275 | " x[\"std_\"+col_name]=x[col_name].apply(np.std)\n",
276 | " x[\"var_\"+col_name]=x[col_name].apply(np.var)\n",
277 | " x[\"median_\"+col_name]=x[col_name].apply(np.median)\n",
278 | " x[\"ptp_\"+col_name]=x[col_name].apply(np.ptp)\n",
279 | " return x "
280 | ],
281 | "execution_count": 0,
282 | "outputs": []
283 | },
284 | {
285 | "cell_type": "code",
286 | "metadata": {
287 | "trusted": true,
288 | "id": "XFoIoSlvxu5i",
289 | "colab_type": "code",
290 | "colab": {}
291 | },
292 | "source": [
293 | "def remove_nan_values(x):\n",
294 | " strict = [e for e in x if not math.isnan(e)]\n",
295 | " if len(strict) == 0:\n",
296 | " strict = [np.nan]\n",
297 | " return strict"
298 | ],
299 | "execution_count": 0,
300 | "outputs": []
301 | },
302 | {
303 | "cell_type": "code",
304 | "metadata": {
305 | "trusted": true,
306 | "id": "FM4Vhiz2xu5l",
307 | "colab_type": "code",
308 | "colab": {}
309 | },
310 | "source": [
311 | "def metric(y,x):\n",
312 | " return np.sqrt(mean_squared_error(x,y))"
313 | ],
314 | "execution_count": 0,
315 | "outputs": []
316 | },
317 | {
318 | "cell_type": "code",
319 | "metadata": {
320 | "trusted": true,
321 | "id": "xY1VTpUwxu5m",
322 | "colab_type": "code",
323 | "colab": {}
324 | },
325 | "source": [
326 | "def train_function(model,train,test,params,other_params,target_name,features,metric, model_name):\n",
327 | " folds_num=train.fold.nunique()\n",
328 | " validation=train[[id_name,\"fold\",target_name]].copy()\n",
329 | " validation[\"pred_\"+target_name]=0\n",
330 | " sub=test[[id_name]].copy()\n",
331 | " feat_imps = pd.DataFrame()\n",
332 | " feat_imps['Features'] = features\n",
333 | " \n",
334 | " for fold in np.sort(train.fold.unique()):\n",
335 | " print(\"#\"*50+\" {} \".format(fold)+\"#\"*50)\n",
336 | " os.makedirs(\"model_save/{}/{}/{}\".format(model_name,Experiment_name,str(int(fold))), exist_ok=True)\n",
337 | " X_train=train[train.fold!=fold]\n",
338 | " X_val=train[train.fold==fold]\n",
339 | " \n",
340 | " train_pred,validation_pred,test_pred,feat_imp=model(X_train,X_val,test,params,other_params)\n",
341 | "\n",
342 | " validation.loc[validation.fold==fold,\"pred_\"+target_name]=validation_pred\n",
343 | " sub[target_name]=test_pred/folds_num\n",
344 | " train_score=metric(X_train[target_name],train_pred)\n",
345 | " val_score=metric(X_val[target_name],validation_pred)\n",
346 | " feat_imps[fold] = feat_imp\n",
347 | " print(\"train score : {} validation score : {}\".format(round(train_score,4),round(val_score,4)))\n",
348 | " \n",
349 | " final_validation_score=metric(validation[target_name],validation[\"pred_\"+target_name])\n",
350 | " print(\"final validation score : {}\".format(final_validation_score))\n",
351 | " \n",
352 | " return sub,validation,final_validation_score,feat_imps\n",
353 | "\n",
354 | "def lgbm_model(X_train,X_val,X_test,params,other_params):\n",
355 | " dtrain = lgbm.Dataset(data=X_train[features], label=X_train[target_name], feature_name=features)\n",
356 | " dval = lgbm.Dataset(data=X_val[features], label=X_val[target_name], feature_name=features)\n",
357 | "\n",
358 | " model = lgbm.train(\n",
359 | " params=params,\n",
360 | " train_set=dtrain,\n",
361 | " num_boost_round=other_params[\"num_boost_round\"],\n",
362 | " valid_sets=(dtrain, dval),\n",
363 | " early_stopping_rounds=other_params[\"early_stopping_rounds\"],\n",
364 | " verbose_eval=other_params[\"verbose_eval\"],\n",
365 | " )\n",
366 | " best_iteration = model.best_iteration\n",
367 | " train_pred=model.predict(X_train[features], num_iteration=best_iteration)\n",
368 | " validation_pred=model.predict(X_val[features], num_iteration=best_iteration)\n",
369 | " test_pred=model.predict(test[features], num_iteration=best_iteration)\n",
370 | " feat_imp = model.feature_importance(iteration=best_iteration)\n",
371 | " \n",
372 | " return train_pred,validation_pred,test_pred, feat_imp\n",
373 | "\n",
374 | "def cat_model(X_train,X_val,X_test,params,other_params):\n",
375 | " dtrain = Pool(data=X_train[features], label=X_train[target_name], feature_names=features)\n",
376 | " dval = Pool(data=X_val[features], label=X_val[target_name], feature_names=features)\n",
377 | "\n",
378 | " model = CatBoostRegressor(**params)\n",
379 | " model.fit(dtrain,\n",
380 | " eval_set=[dval],\n",
381 | " use_best_model=True,\n",
382 | " verbose_eval=other_params[\"verbose_eval\"],\n",
383 | " )\n",
384 | "\n",
385 | " best_iteration = model.best_iteration_\n",
386 | " train_pred = model.predict(X_train[features])\n",
387 | " validation_pred = model.predict(X_val[features])\n",
388 | " test_pred = model.predict(test[features])\n",
389 | " feat_imp = model.feature_importances_\n",
390 | " \n",
391 | " return train_pred,validation_pred,test_pred, feat_imp"
392 | ],
393 | "execution_count": 0,
394 | "outputs": []
395 | },
396 | {
397 | "cell_type": "code",
398 | "metadata": {
399 | "trusted": true,
400 | "id": "6zM5oaqPxu5o",
401 | "colab_type": "code",
402 | "colab": {}
403 | },
404 | "source": [
405 | "cols = ['location', 'loc_altitude', 'km2', 'aspect',\n",
406 | " 'dist_trunk', 'dist_primary', 'dist_secondary',\n",
407 | " 'dist_tertiary', 'dist_unclassified', 'dist_residential', 'popn', 'hh',\n",
408 | " 'hh_cook_charcoal', 'hh_cook_firewood', 'hh_burn_waste']"
409 | ],
410 | "execution_count": 0,
411 | "outputs": []
412 | },
413 | {
414 | "cell_type": "code",
415 | "metadata": {
416 | "trusted": true,
417 | "id": "FAo46IrOxu5q",
418 | "colab_type": "code",
419 | "colab": {}
420 | },
421 | "source": [
422 | "train = pd.read_csv('../input/airqo-ugandan-air-quality-forecast-challenge-zindi/Train (1).csv')\n",
423 | "test = pd.read_csv('../input/airqo-ugandan-air-quality-forecast-challenge-zindi/Test (1).csv')\n",
424 | "meta = pd.read_csv('../input/airqo-ugandan-air-quality-forecast-challenge-zindi/airqo_metadata.csv', usecols=cols)"
425 | ],
426 | "execution_count": 0,
427 | "outputs": []
428 | },
429 | {
430 | "cell_type": "code",
431 | "metadata": {
432 | "trusted": true,
433 | "id": "EWn__niTxu5t",
434 | "colab_type": "code",
435 | "colab": {}
436 | },
437 | "source": [
438 | "sns.barplot(x='location', y='target', data=train)"
439 | ],
440 | "execution_count": 0,
441 | "outputs": []
442 | },
443 | {
444 | "cell_type": "code",
445 | "metadata": {
446 | "trusted": true,
447 | "id": "6j6g4kIBxu5v",
448 | "colab_type": "code",
449 | "colab": {}
450 | },
451 | "source": [
452 | "features = [\"temp\", \"precip\", \"rel_humidity\", \"wind_dir\", \"wind_spd\", \"atmos_press\"]\n",
453 | "\n",
454 | "days_features = [\n",
455 | "'temp_day_0', 'temp_day_1', 'temp_day_2', 'temp_day_3', 'temp_day_4', \n",
456 | "'precip_day_0', 'precip_day_1', 'precip_day_2', 'precip_day_3','precip_day_4',\n",
457 | "'rel_humidity_day_0','rel_humidity_day_1', 'rel_humidity_day_2', 'rel_humidity_day_3','rel_humidity_day_4',\n",
458 | "'wind_dir_day_0', 'wind_dir_day_1', 'wind_dir_day_2', 'wind_dir_day_3','wind_dir_day_4',\n",
459 | "'wind_spd_day_0','wind_spd_day_1', 'wind_spd_day_2', 'wind_spd_day_3', 'wind_spd_day_4',\n",
460 | "'atmos_press_day_0', 'atmos_press_day_1','atmos_press_day_2', 'atmos_press_day_3', 'atmos_press_day_4']"
461 | ],
462 | "execution_count": 0,
463 | "outputs": []
464 | },
465 | {
466 | "cell_type": "code",
467 | "metadata": {
468 | "trusted": true,
469 | "id": "eGv82_T5xu5y",
470 | "colab_type": "code",
471 | "colab": {}
472 | },
473 | "source": [
474 | "for feature in features :\n",
475 | " train[feature] = train[feature].apply(lambda x: [ replace_nan(X) for X in x.replace(\"nan\",\" \").split(\",\")])\n",
476 | " test[feature] = test[feature].apply(lambda x: [ replace_nan(X) for X in x.replace(\"nan\",\" \").split(\",\")])"
477 | ],
478 | "execution_count": 0,
479 | "outputs": []
480 | },
481 | {
482 | "cell_type": "code",
483 | "metadata": {
484 | "trusted": true,
485 | "id": "it6idKZ4xu5z",
486 | "colab_type": "code",
487 | "colab": {}
488 | },
489 | "source": [
490 | "datav1 = pd.concat([train, test],sort=False).reset_index(drop=True)\n",
491 | "datav2 = datav1.copy()"
492 | ],
493 | "execution_count": 0,
494 | "outputs": []
495 | },
496 | {
497 | "cell_type": "code",
498 | "metadata": {
499 | "trusted": true,
500 | "id": "6cYe2pCDxu51",
501 | "colab_type": "code",
502 | "colab": {}
503 | },
504 | "source": [
505 | "for col_name in tqdm(features):\n",
506 | " split_into_days(datav1, features)"
507 | ],
508 | "execution_count": 0,
509 | "outputs": []
510 | },
511 | {
512 | "cell_type": "code",
513 | "metadata": {
514 | "trusted": true,
515 | "id": "9fVk3hRyxu53",
516 | "colab_type": "code",
517 | "colab": {}
518 | },
519 | "source": [
520 | "for col_name in tqdm(days_features):\n",
521 | " datav1[col_name] = datav1[col_name].apply(remove_nan_values)\n",
522 | "\n",
523 | "for col_name in tqdm(days_features):\n",
524 | " datav1 = aggregate_features(datav1,col_name)"
525 | ],
526 | "execution_count": 0,
527 | "outputs": []
528 | },
529 | {
530 | "cell_type": "code",
531 | "metadata": {
532 | "trusted": true,
533 | "id": "PlC9yiZ1xu54",
534 | "colab_type": "code",
535 | "colab": {}
536 | },
537 | "source": [
538 | "for col_name in tqdm(features):\n",
539 | " datav1[col_name] = datav1[col_name].apply(remove_nan_values)\n",
540 | "\n",
541 | "for col_name in tqdm(features):\n",
542 | " datav1 = aggregate_features(datav1,col_name)"
543 | ],
544 | "execution_count": 0,
545 | "outputs": []
546 | },
547 | {
548 | "cell_type": "code",
549 | "metadata": {
550 | "trusted": true,
551 | "id": "2SoIHewUxu56",
552 | "colab_type": "code",
553 | "colab": {}
554 | },
555 | "source": [
556 | "for feat in features:\n",
557 | " for i in range(len(datav2.loc[1, features[0]])):\n",
558 | " datav2[feat+f'_{i}'] = datav2[feat].apply(lambda x: x[i])"
559 | ],
560 | "execution_count": 0,
561 | "outputs": []
562 | },
563 | {
564 | "cell_type": "code",
565 | "metadata": {
566 | "trusted": true,
567 | "id": "5TVE9KpIxu57",
568 | "colab_type": "code",
569 | "colab": {}
570 | },
571 | "source": [
572 | "datav1.drop(features+days_features, axis=1, inplace=True)\n",
573 | "datav2.drop(features+['target', 'ID','location'], axis=1, inplace=True)"
574 | ],
575 | "execution_count": 0,
576 | "outputs": []
577 | },
578 | {
579 | "cell_type": "code",
580 | "metadata": {
581 | "trusted": true,
582 | "id": "S65anRC0xu59",
583 | "colab_type": "code",
584 | "colab": {}
585 | },
586 | "source": [
587 | "data = pd.concat([datav1, datav2], axis=1)\n",
588 | "#data = datav1.copy()"
589 | ],
590 | "execution_count": 0,
591 | "outputs": []
592 | },
593 | {
594 | "cell_type": "code",
595 | "metadata": {
596 | "trusted": true,
597 | "id": "Xo0WMKN2xu5_",
598 | "colab_type": "code",
599 | "colab": {}
600 | },
601 | "source": [
602 | "oe = ce.OrdinalEncoder(cols=['location'])\n",
603 | "data['binned_location'] = oe.fit_transform(data['location'])"
604 | ],
605 | "execution_count": 0,
606 | "outputs": []
607 | },
608 | {
609 | "cell_type": "code",
610 | "metadata": {
611 | "trusted": true,
612 | "id": "PxY9NNO4xu6B",
613 | "colab_type": "code",
614 | "colab": {}
615 | },
616 | "source": [
617 | "meta.fillna(-9999, inplace=True)"
618 | ],
619 | "execution_count": 0,
620 | "outputs": []
621 | },
622 | {
623 | "cell_type": "code",
624 | "metadata": {
625 | "trusted": true,
626 | "id": "48jydDiRxu6D",
627 | "colab_type": "code",
628 | "colab": {}
629 | },
630 | "source": [
631 | "aggs = {\n",
632 | " 'binned_location': ['count'],\n",
633 | " 'target': ['mean', 'min', 'max', 'std', 'quantile', 'sum'],\n",
634 | "\n",
635 | "}"
636 | ],
637 | "execution_count": 0,
638 | "outputs": []
639 | },
640 | {
641 | "cell_type": "code",
642 | "metadata": {
643 | "trusted": true,
644 | "id": "oPKGrSXyxu6E",
645 | "colab_type": "code",
646 | "colab": {}
647 | },
648 | "source": [
649 | "meta_stats = data.groupby('location').agg(aggs)"
650 | ],
651 | "execution_count": 0,
652 | "outputs": []
653 | },
654 | {
655 | "cell_type": "code",
656 | "metadata": {
657 | "trusted": true,
658 | "id": "ZEdUf6frxu6G",
659 | "colab_type": "code",
660 | "colab": {}
661 | },
662 | "source": [
663 | "meta_stats = meta_stats.merge(meta, how='inner', on='location')"
664 | ],
665 | "execution_count": 0,
666 | "outputs": []
667 | },
668 | {
669 | "cell_type": "code",
670 | "metadata": {
671 | "trusted": true,
672 | "id": "4IrEOEUmxu6I",
673 | "colab_type": "code",
674 | "colab": {}
675 | },
676 | "source": [
677 | "meta_stats.rename(\n",
678 | " columns = {\n",
679 | " ('binned_location', 'count') : 'count',\n",
680 | " ('target', 'mean') : 'mean_target', \n",
681 | " ('target', 'min') : 'min_target',\n",
682 | " ('target', 'max') : 'max_target',\n",
683 | " ('target', 'std') : 'std_target',\n",
684 | " ('target', 'quantile') : 'quantile_target',\n",
685 | " ('target', 'sum') : 'sum_target'\n",
686 | " },\n",
687 | " inplace=True\n",
688 | ")"
689 | ],
690 | "execution_count": 0,
691 | "outputs": []
692 | },
693 | {
694 | "cell_type": "code",
695 | "metadata": {
696 | "trusted": true,
697 | "id": "pFljrdYgxu6J",
698 | "colab_type": "code",
699 | "colab": {}
700 | },
701 | "source": [
702 | "meta_stats['mean_pm2.5_per_km2'] = meta_stats['mean_target']/meta_stats['km2']\n",
703 | "meta_stats['sum_pm2.5_per_km2'] = meta_stats['sum_target']/meta_stats['km2']\n",
704 | "meta_stats['device_per_km2'] = meta_stats['count']/meta_stats['km2']"
705 | ],
706 | "execution_count": 0,
707 | "outputs": []
708 | },
709 | {
710 | "cell_type": "code",
711 | "metadata": {
712 | "trusted": true,
713 | "id": "Nvdl8aXDxu6L",
714 | "colab_type": "code",
715 | "colab": {}
716 | },
717 | "source": [
718 | "meta_stats['sum_target'] = meta_stats['sum_target'].apply(np.log2)\n",
719 | "meta_stats['sum_pm2.5_per_km2'] = meta_stats['sum_pm2.5_per_km2'].apply(np.log2)"
720 | ],
721 | "execution_count": 0,
722 | "outputs": []
723 | },
724 | {
725 | "cell_type": "code",
726 | "metadata": {
727 | "trusted": true,
728 | "id": "zmF8CGw0xu6M",
729 | "colab_type": "code",
730 | "colab": {}
731 | },
732 | "source": [
733 | "data = data.merge(meta_stats, how='left', on=['location'])"
734 | ],
735 | "execution_count": 0,
736 | "outputs": []
737 | },
738 | {
739 | "cell_type": "code",
740 | "metadata": {
741 | "trusted": true,
742 | "id": "BvciyhPUxu6O",
743 | "colab_type": "code",
744 | "colab": {}
745 | },
746 | "source": [
747 | "data['mean_temp_day_3']"
748 | ],
749 | "execution_count": 0,
750 | "outputs": []
751 | },
752 | {
753 | "cell_type": "code",
754 | "metadata": {
755 | "trusted": true,
756 | "id": "NVKgBcCBxu6Q",
757 | "colab_type": "code",
758 | "colab": {}
759 | },
760 | "source": [
761 | "hum_features = list(data.filter(regex='rel_humidity_.*').columns)\n",
762 | "temp_features = list( data.filter(regex='temp_.*').columns) \n",
763 | "precip_features = list(data.filter(regex='precip.*').columns)\n",
764 | "winddir_features = list( data.filter(regex='wind_dir_.*').columns)\n",
765 | "windspead_features = list( data.filter(regex='wind_spd_.*').columns)\n",
766 | "atm_features = list(data.filter(regex='atmos_press_.*').columns)\n",
767 | "\n",
768 | "\n",
769 | "\n",
770 | "\n",
771 | "hum_features= hum_features[36:]\n",
772 | "temp_features=temp_features[36:] \n",
773 | "precip_features=precip_features[31:]\n",
774 | "winddir_features=winddir_features[36:]\n",
775 | "windspead_features=windspead_features[36:]\n",
776 | "atm_features=atm_features[36:]\n",
777 | "\n",
778 | "\n",
779 | "\n",
780 | "\n",
781 | "\n",
782 | "data[hum_features]= data[hum_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n",
783 | "\n",
784 | "\n",
785 | "\n",
786 | "data[temp_features]= data[temp_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n",
787 | "\n",
788 | "\n",
789 | "data[precip_features]= data[precip_features].apply(lambda x: x.fillna(float(0.0)),axis=1)\n",
790 | "\n",
791 | "\n",
792 | "\n",
793 | "data[winddir_features]= data[winddir_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n",
794 | "\n",
795 | "\n",
796 | "\n",
797 | "data[windspead_features]= data[windspead_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n",
798 | "\n",
799 | "\n",
800 | "\n",
801 | "\n",
802 | "data[atm_features]= data[atm_features].apply(lambda x: x.fillna(x.mean()),axis=1)\n"
803 | ],
804 | "execution_count": 0,
805 | "outputs": []
806 | },
807 | {
808 | "cell_type": "code",
809 | "metadata": {
810 | "trusted": true,
811 | "id": "AafXo5ESxu6T",
812 | "colab_type": "code",
813 | "colab": {}
814 | },
815 | "source": [
816 | "data['relation1'] = data['wind_spd_118'] +data['wind_spd_119'] +data['wind_spd_120']\n",
817 | "data['relation2'] = data['temp_89'] + data['temp_95'] + data['temp_48'] +data['temp_70'] + data['temp_88'] + data['temp_72']\n",
818 | "data['relation3'] = data['rel_humidity_112'] + data['rel_humidity_113'] + data['rel_humidity_102'] + data['rel_humidity_42'] + data['rel_humidity_3'] \n",
819 | "data['relation4'] = data['atmos_press_103'] + data['atmos_press_7'] +data['atmos_press_10'] +data['atmos_press_109'] +data['atmos_press_116']"
820 | ],
821 | "execution_count": 0,
822 | "outputs": []
823 | },
824 | {
825 | "cell_type": "code",
826 | "metadata": {
827 | "trusted": true,
828 | "id": "uN7EC45bxu6U",
829 | "colab_type": "code",
830 | "colab": {}
831 | },
832 | "source": [
833 | "wind_dir_feature_ = list(data.filter(regex='wind_dir.*').columns)\n",
834 | "len([x for x in data.columns if 'wind_dir' in x]) , len(wind_dir_feature_)"
835 | ],
836 | "execution_count": 0,
837 | "outputs": []
838 | },
839 | {
840 | "cell_type": "code",
841 | "metadata": {
842 | "trusted": true,
843 | "id": "xRQys-BFxu6W",
844 | "colab_type": "code",
845 | "colab": {}
846 | },
847 | "source": [
848 | "def radian_conv(degree):\n",
849 | " \"\"\"\n",
850 |         "    Convert an angle given in degrees to radians.\n",
851 | " \"\"\"\n",
852 | " return np.radians(degree) \n",
853 | "\n",
854 | "\n",
855 | "\n",
856 | "for col in wind_dir_feature_ :\n",
857 | " \n",
858 | " data[col] = data[col].apply(radian_conv)\n"
859 | ],
860 | "execution_count": 0,
861 | "outputs": []
862 | },
863 | {
864 | "cell_type": "code",
865 | "metadata": {
866 | "trusted": true,
867 | "id": "Ko3yQM7bxu6Y",
868 | "colab_type": "code",
869 | "colab": {}
870 | },
871 | "source": [
872 | "train = data[data.target.notnull()].reset_index(drop=True)\n",
873 | "test = data[data.target.isna()].reset_index(drop=True)"
874 | ],
875 | "execution_count": 0,
876 | "outputs": []
877 | },
878 | {
879 | "cell_type": "code",
880 | "metadata": {
881 | "trusted": true,
882 | "id": "5IZ9qeRnxu6d",
883 | "colab_type": "code",
884 | "colab": {}
885 | },
886 | "source": [
887 | "to_drop = ['min_precip_day_0', 'min_precip_day_1', 'min_precip_day_2',\n",
888 | " 'min_precip_day_3', 'min_precip_day_4', 'min_precip', \n",
889 | " 'median_precip_day_0', 'median_precip_day_1', 'median_precip_day_2',\n",
890 | " 'median_precip_day_3', 'median_precip_day_4', 'median_precip',\n",
891 | " ]"
892 | ],
893 | "execution_count": 0,
894 | "outputs": []
895 | },
896 | {
897 | "cell_type": "code",
898 | "metadata": {
899 | "trusted": true,
900 | "id": "fS0fK_LUxu6f",
901 | "colab_type": "code",
902 | "colab": {}
903 | },
904 | "source": [
905 | "train.drop(labels=to_drop, axis=1, inplace=True)\n",
906 | "test.drop(labels=to_drop, axis=1, inplace=True)"
907 | ],
908 | "execution_count": 0,
909 | "outputs": []
910 | },
911 | {
912 | "cell_type": "code",
913 | "metadata": {
914 | "trusted": true,
915 | "id": "01RV_1ynxu6i",
916 | "colab_type": "code",
917 | "colab": {}
918 | },
919 | "source": [
920 | "features = train.columns.difference(['ID', 'target', 'binned_location'])\n",
921 | "select_features = train.columns.difference(['ID', 'target', 'location'])\n",
922 | "target = 'target'"
923 | ],
924 | "execution_count": 0,
925 | "outputs": []
926 | },
927 | {
928 | "cell_type": "code",
929 | "metadata": {
930 | "trusted": true,
931 | "id": "DG4ZV1EZxu6k",
932 | "colab_type": "code",
933 | "colab": {}
934 | },
935 | "source": [
936 | "best_features = ['atmos_press_1', 'atmos_press_10', 'atmos_press_103',\n",
937 | " 'atmos_press_104', 'atmos_press_105', 'atmos_press_109',\n",
938 | " 'atmos_press_110', 'atmos_press_115', 'atmos_press_116',\n",
939 | " 'atmos_press_14', 'atmos_press_19', 'atmos_press_2',\n",
940 | " 'atmos_press_20', 'atmos_press_21', 'atmos_press_25',\n",
941 | " 'atmos_press_26', 'atmos_press_27', 'atmos_press_28',\n",
942 | " 'atmos_press_3', 'atmos_press_31', 'atmos_press_32',\n",
943 | " 'atmos_press_33', 'atmos_press_37', 'atmos_press_38',\n",
944 | " 'atmos_press_39', 'atmos_press_43', 'atmos_press_44',\n",
945 | " 'atmos_press_50', 'atmos_press_51', 'atmos_press_52',\n",
946 | " 'atmos_press_56', 'atmos_press_57', 'atmos_press_61',\n",
947 | " 'atmos_press_62', 'atmos_press_67', 'atmos_press_68',\n",
948 | " 'atmos_press_7', 'atmos_press_75', 'atmos_press_8',\n",
949 | " 'atmos_press_80', 'atmos_press_81', 'atmos_press_86',\n",
950 | " 'atmos_press_9', 'atmos_press_91', 'atmos_press_92',\n",
951 | " 'atmos_press_93', 'atmos_press_98', 'atmos_press_99',\n",
952 | " 'hh_burn_waste', 'max_atmos_press', 'max_precip',\n",
953 | " 'max_rel_humidity', \n",
954 | " 'max_rel_humidity_day_2',\n",
955 | " 'max_temp', 'max_wind_dir',\n",
956 | " 'max_wind_dir_day_2',\n",
957 | " 'max_wind_spd',\n",
958 | " 'mean_atmos_press', 'mean_precip', 'mean_rel_humidity',\n",
959 | " \n",
960 | " \n",
961 | " 'mean_target', 'mean_temp_day_2',\n",
962 | " 'mean_wind_dir',\n",
963 | " 'mean_wind_dir_day_2',\n",
964 | " 'mean_wind_spd',\n",
965 | " 'median_atmos_press',\n",
966 | " 'median_rel_humidity', \n",
967 | " 'median_rel_humidity_day_2', \n",
968 | " 'median_temp',\n",
969 | " 'median_temp_day_2', \n",
970 | " 'median_wind_dir',\n",
971 | " 'median_wind_dir_day_2',\n",
972 | " \n",
973 | " 'median_wind_spd',\n",
974 | " 'median_wind_spd_day_2', \n",
975 | " 'min_atmos_press_day_2',\n",
976 | " 'min_rel_humidity',\n",
977 | " 'min_temp',\n",
978 | " 'min_temp_day_2',\n",
979 | " 'min_wind_dir',\n",
980 | " \n",
981 | " 'min_wind_spd', 'min_wind_spd_day_2',\n",
982 | " 'ptp_atmos_press',\n",
983 | " \n",
984 | " 'ptp_precip', 'ptp_rel_humidity',\n",
985 | " 'ptp_wind_dir',\n",
986 | " 'ptp_wind_spd',\n",
987 | " 'rel_humidity_102', 'rel_humidity_112', 'rel_humidity_113',\n",
988 | " 'rel_humidity_114', 'rel_humidity_3', 'rel_humidity_42',\n",
989 | " 'rel_humidity_63', 'rel_humidity_78', 'std_atmos_press',\n",
990 | " 'std_precip', 'std_rel_humidity', 'std_wind_dir',\n",
991 | " 'std_wind_spd', 'temp_0', 'temp_1',\n",
992 | " 'temp_102', 'temp_113', 'temp_114', 'temp_118', 'temp_120',\n",
993 | " 'temp_16', 'temp_17', 'temp_2', 'temp_20', 'temp_22', 'temp_23',\n",
994 | " 'temp_24', 'temp_25', 'temp_30', 'temp_40', 'temp_41', 'temp_48',\n",
995 | " 'temp_49', 'temp_5', 'temp_50', 'temp_54', 'temp_64', 'temp_70',\n",
996 | " 'temp_71', 'temp_72', 'temp_78', 'temp_88', 'temp_89', 'temp_92',\n",
997 | " 'temp_94', 'temp_95', 'temp_97', 'temp_target_reading_day',\n",
998 | " 'var_atmos_press', 'var_precip', 'var_rel_humidity', 'var_temp',\n",
999 | " 'var_wind_dir', 'var_wind_spd',\n",
1000 | " 'wind_spd_108', 'wind_spd_118', 'wind_spd_119', 'wind_spd_120',\n",
1001 | " 'wind_spd_29', 'wind_spd_target_reading_day' ,\n",
1002 | " 'relation1','relation2','relation3',\n",
1003 | " ]"
1004 | ],
1005 | "execution_count": 0,
1006 | "outputs": []
1007 | },
1008 | {
1009 | "cell_type": "code",
1010 | "metadata": {
1011 | "trusted": true,
1012 | "id": "Se050WX6xu6l",
1013 | "colab_type": "code",
1014 | "colab": {}
1015 | },
1016 | "source": [
1017 |         "fold = KFold(n_splits=20, shuffle=False)  # no random_state: scikit-learn>=0.24 rejects it unless shuffle=True"
1018 | ],
1019 | "execution_count": 0,
1020 | "outputs": []
1021 | },
1022 | {
1023 | "cell_type": "code",
1024 | "metadata": {
1025 | "trusted": true,
1026 | "id": "bGOgFqJexu6n",
1027 | "colab_type": "code",
1028 | "colab": {}
1029 | },
1030 | "source": [
1031 | "import lightgbm as lgb\n",
1032 | "params = {\n",
1033 | " 'objective' :'regression',\n",
1034 | " 'learning_rate' : 0.1,\n",
1035 | " 'num_iterations': 1500,\n",
1036 | " 'max_bins': 150, \n",
1037 | " 'max_depth' :7 ,\n",
1038 | " 'num_leaves' : 200,\n",
1039 | " 'feature_fraction': 0.64, \n",
1040 | " 'bagging_fraction': 0.8, \n",
1041 | " 'bagging_freq':1,\n",
1042 | " 'boosting_type' : 'gbdt',\n",
1043 | " 'metric': 'rmse' ,\n",
1044 | " 'min_data_in_leaf':15,\n",
1045 | " 'reg_lambda' :150\n",
1046 | "}\n",
1047 | "\n"
1048 | ],
1049 | "execution_count": 0,
1050 | "outputs": []
1051 | },
1052 | {
1053 | "cell_type": "code",
1054 | "metadata": {
1055 | "trusted": true,
1056 | "id": "ic2XJvDoxu6p",
1057 | "colab_type": "code",
1058 | "colab": {}
1059 | },
1060 | "source": [
1061 | "test_preds = []\n",
1062 | "score_oofs = []\n",
1063 | "feats = pd.DataFrame({'features': best_features}) #You can change \n",
1064 | "\n",
1065 | "for i, (tr, vr) in enumerate(fold.split(train)):\n",
1066 | " X, Y = train.loc[tr, best_features], train.loc[tr, target]\n",
1067 | " x, y = train.loc[vr, best_features], train.loc[vr, target]\n",
1068 | "\n",
1069 | " model = lgb.LGBMRegressor(**params)\n",
1070 | " model.fit(X, Y, eval_set=[(x,y)], verbose=100, early_stopping_rounds=200)\n",
1071 | " pred = model.predict(x)\n",
1072 | " test_pred = model.predict(test[best_features])\n",
1073 | " sc = metric(y, pred)\n",
1074 | " score_oofs.append(sc)\n",
1075 | " test_preds.append(test_pred)\n",
1076 | " feats[f'Fold {i}'] = model.feature_importances_\n",
1077 | " # print('RMSE : {}'.format(sc))\n",
1078 |         "feats['Importances'] = feats.mean(axis=1, numeric_only=True)  # average the per-fold columns; skip the string 'features' column\n",
1079 | "print()\n",
1080 | "print('CV RMSE : {}'.format(np.mean(score_oofs, axis=0)))"
1081 | ],
1082 | "execution_count": 0,
1083 | "outputs": []
1084 | },
1085 | {
1086 | "cell_type": "code",
1087 | "metadata": {
1088 | "trusted": true,
1089 | "id": "H0TQsVrBxu6q",
1090 | "colab_type": "code",
1091 | "colab": {}
1092 | },
1093 | "source": [
1094 | "print('CV RMSE : {}'.format(np.mean(score_oofs, axis=0)))"
1095 | ],
1096 | "execution_count": 0,
1097 | "outputs": []
1098 | },
1099 | {
1100 | "cell_type": "markdown",
1101 | "metadata": {
1102 | "id": "3sGBtFpTyXfT",
1103 | "colab_type": "text"
1104 | },
1105 | "source": [
1106 | "## ON KAGGLE CV RMSE ==22.69329469880266"
1107 | ]
1108 | },
1109 | {
1110 | "cell_type": "code",
1111 | "metadata": {
1112 | "trusted": true,
1113 | "id": "zsZqw6p_xu6r",
1114 | "colab_type": "code",
1115 | "colab": {}
1116 | },
1117 | "source": [
1118 | "test[target] = np.mean(test_preds, axis=0)\n",
1119 | "test[['ID', target]].to_csv('ANOTHERLGB.csv', index=False)"
1120 | ],
1121 | "execution_count": 0,
1122 | "outputs": []
1123 | },
1124 | {
1125 | "cell_type": "code",
1126 | "metadata": {
1127 | "trusted": true,
1128 | "id": "0vjmYYbqxu6t",
1129 | "colab_type": "code",
1130 | "colab": {}
1131 | },
1132 | "source": [
1133 | "feats.head()"
1134 | ],
1135 | "execution_count": 0,
1136 | "outputs": []
1137 | },
1138 | {
1139 | "cell_type": "code",
1140 | "metadata": {
1141 | "trusted": true,
1142 | "id": "xeyGQnUIxu6u",
1143 | "colab_type": "code",
1144 | "colab": {}
1145 | },
1146 | "source": [
1147 | "fig = plt.figure(figsize=(20,40))\n",
1148 | "sns.barplot(x='Importances', y='features', data=feats.sort_values(by='Importances', ascending=False));"
1149 | ],
1150 | "execution_count": 0,
1151 | "outputs": []
1152 | }
1153 | ]
1154 | }
--------------------------------------------------------------------------------
/new_data_creation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "kernelspec": {
6 | "language": "python",
7 | "display_name": "Python 3",
8 | "name": "python3"
9 | },
10 | "language_info": {
11 | "pygments_lexer": "ipython3",
12 | "nbconvert_exporter": "python",
13 | "version": "3.6.4",
14 | "file_extension": ".py",
15 | "codemirror_mode": {
16 | "name": "ipython",
17 | "version": 3
18 | },
19 | "name": "python",
20 | "mimetype": "text/x-python"
21 | },
22 | "colab": {
23 | "name": "new_data_creation.ipynb",
24 | "provenance": [],
25 | "include_colab_link": true
26 | }
27 | },
28 | "cells": [
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {
32 | "id": "view-in-github",
33 | "colab_type": "text"
34 | },
35 | "source": [
36 | "
"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "metadata": {
42 | "id": "lygrd_4THhsP",
43 | "trusted": true,
44 | "colab_type": "code",
45 | "colab": {}
46 | },
47 | "source": [
48 | "import pandas as pd \n",
49 | "import numpy as np \n",
50 | "from tqdm import tqdm\n",
51 | "import math\n",
52 | "import gc"
53 | ],
54 | "execution_count": 0,
55 | "outputs": []
56 | },
57 | {
58 | "cell_type": "code",
59 | "metadata": {
60 | "id": "laCuoZYCHhsW",
61 | "trusted": true,
62 | "colab_type": "code",
63 | "colab": {}
64 | },
65 | "source": [
66 | "\n",
67 | "pd.set_option('display.max_rows', 500)\n",
68 | "pd.set_option('display.max_columns', 500)\n",
69 | "pd.set_option('display.width', 1000)"
70 | ],
71 | "execution_count": 0,
72 | "outputs": []
73 | },
74 | {
75 | "cell_type": "code",
76 | "metadata": {
77 | "id": "tRthNH90Hhsa",
78 | "trusted": true,
79 | "colab_type": "code",
80 | "colab": {}
81 | },
82 | "source": [
83 | "train=pd.read_csv(\"../input/airqo-ugandan-air-quality-forecast-challenge-zindi/Train (1).csv\")\n",
84 | "test=pd.read_csv(\"../input/airqo-ugandan-air-quality-forecast-challenge-zindi/Test (1).csv\")"
85 | ],
86 | "execution_count": 0,
87 | "outputs": []
88 | },
89 | {
90 | "cell_type": "code",
91 | "metadata": {
92 | "id": "j_1ClVwsHhsf",
93 | "trusted": true,
94 | "colab_type": "code",
95 | "colab": {}
96 | },
97 | "source": [
98 | "train.head()"
99 | ],
100 | "execution_count": 0,
101 | "outputs": []
102 | },
103 | {
104 | "cell_type": "code",
105 | "metadata": {
106 | "id": "7lskkR2rHhss",
107 | "trusted": true,
108 | "colab_type": "code",
109 | "colab": {}
110 | },
111 | "source": [
112 |         "# convert features from string to list of values\n",
113 | "def replace_nan(x):\n",
114 | " if x==\"nan\":\n",
115 | " return np.nan\n",
116 | " else :\n",
117 | " return float(x)\n",
118 | "features=[\"temp\",\"precip\",\"rel_humidity\",\"wind_dir\",\"wind_spd\",\"atmos_press\"]\n"
119 | ],
120 | "execution_count": 0,
121 | "outputs": []
122 | },
123 | {
124 | "cell_type": "code",
125 | "metadata": {
126 | "trusted": true,
127 | "id": "zBgcfd_2nDdL",
128 | "colab_type": "code",
129 | "colab": {}
130 | },
131 | "source": [
132 | "train1 = train.copy()\n",
133 | "test1 = test.copy()"
134 | ],
135 | "execution_count": 0,
136 | "outputs": []
137 | },
138 | {
139 | "cell_type": "code",
140 | "metadata": {
141 | "trusted": true,
142 | "id": "cr3UXTc3nDdO",
143 | "colab_type": "code",
144 | "colab": {}
145 | },
146 | "source": [
147 |         "# convert features from string to list of values\n",
148 | "def replace_nan1(x):\n",
149 | " if x==\" \":\n",
150 | " return np.nan\n",
151 | " else :\n",
152 | " return float(x)\n",
153 | "features=[\"temp\",\"precip\",\"rel_humidity\",\"wind_dir\",\"wind_spd\",\"atmos_press\"]\n",
154 | "for feature in features : \n",
155 | " train1[feature]=train1[feature].apply(lambda x: [ replace_nan1(X) for X in x.replace(\"nan\",\" \").split(\",\")])\n",
156 | " test1[feature]=test1[feature].apply(lambda x: [ replace_nan1(X) for X in x.replace(\"nan\",\" \").split(\",\")]) "
157 | ],
158 | "execution_count": 0,
159 | "outputs": []
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {
164 | "id": "mApRDmFGHhsz",
165 | "colab_type": "text"
166 | },
167 | "source": [
168 | "### Features engineering part"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "metadata": {
174 | "trusted": true,
175 | "id": "ZD2E9Np_nDdR",
176 | "colab_type": "code",
177 | "colab": {}
178 | },
179 | "source": [
180 | "def percentile(n):\n",
181 | " def percentile_(x):\n",
182 | " return np.percentile(x, n)\n",
183 | " percentile_.__name__ = 'percentile_%s' % n\n",
184 | " return percentile_"
185 | ],
186 | "execution_count": 0,
187 | "outputs": []
188 | },
189 | {
190 | "cell_type": "code",
191 | "metadata": {
192 | "id": "sg8-zujFHhs1",
193 | "trusted": true,
194 | "colab_type": "code",
195 | "colab": {}
196 | },
197 | "source": [
198 | "def aggregate_features(x,col_name):\n",
199 | " x[\"max_\"+col_name]=x[col_name].apply(np.max)\n",
200 | " x[\"min_\"+col_name]=x[col_name].apply(np.min)\n",
201 | " x[\"mean_\"+col_name]=x[col_name].apply(np.mean)\n",
202 | " x[\"std_\"+col_name]=x[col_name].apply(np.std)\n",
203 | " x[\"var_\"+col_name]=x[col_name].apply(np.var)\n",
204 | " x[\"median_\"+col_name]=x[col_name].apply(np.median)\n",
205 | " x[\"ptp_\"+col_name]=x[col_name].apply(np.ptp)\n",
206 | " x[\"p1_\"+col_name]=x[col_name].apply(percentile(1))\n",
207 | " x[\"p5_\"+col_name]=x[col_name].apply(percentile(5))\n",
208 | " x[\"p10_\"+col_name]=x[col_name].apply(percentile(10))\n",
209 | " x[\"p20_\"+col_name]=x[col_name].apply(percentile(20))\n",
210 | " x[\"p30_\"+col_name]=x[col_name].apply(percentile(30))\n",
211 | " x[\"p40_\"+col_name]=x[col_name].apply(percentile(40))\n",
212 | " x[\"p60_\"+col_name]=x[col_name].apply(percentile(60))\n",
213 | " x[\"p70_\"+col_name]=x[col_name].apply(percentile(70))\n",
214 | " x[\"p80_\"+col_name]=x[col_name].apply(percentile(80))\n",
215 | " x[\"p90_\"+col_name]=x[col_name].apply(percentile(90))\n",
216 | " x[\"p95_\"+col_name]=x[col_name].apply(percentile(95))\n",
217 | " x[\"p99_\"+col_name]=x[col_name].apply(percentile(99))\n",
218 | " return x \n",
219 | "def remove_nan_values(x):\n",
220 | " return [e for e in x if not math.isnan(e)]\n"
221 | ],
222 | "execution_count": 0,
223 | "outputs": []
224 | },
225 | {
226 | "cell_type": "code",
227 | "metadata": {
228 | "id": "jSHXJxduHhs6",
229 | "trusted": true,
230 | "colab_type": "code",
231 | "colab": {}
232 | },
233 | "source": [
234 | "data=pd.concat([train,test],sort=False).reset_index(drop=True)\n",
235 | "data2=pd.concat([train1,test1],sort=False).reset_index(drop=True)\n",
236 | "data2.columns.tolist()"
237 | ],
238 | "execution_count": 0,
239 | "outputs": []
240 | },
241 | {
242 | "cell_type": "code",
243 | "metadata": {
244 | "trusted": true,
245 | "id": "CG8sz-usnDdZ",
246 | "colab_type": "code",
247 | "colab": {}
248 | },
249 | "source": [
250 | "for col_name in tqdm(features):\n",
251 | " data2[col_name]=data2[col_name].apply(remove_nan_values)"
252 | ],
253 | "execution_count": 0,
254 | "outputs": []
255 | },
256 | {
257 | "cell_type": "code",
258 | "metadata": {
259 | "trusted": true,
260 | "id": "hw9abEfknDda",
261 | "colab_type": "code",
262 | "colab": {}
263 | },
264 | "source": [
265 | "for col_name in tqdm(features):\n",
266 | " data2=aggregate_features(data2,col_name)"
267 | ],
268 | "execution_count": 0,
269 | "outputs": []
270 | },
271 | {
272 | "cell_type": "code",
273 | "metadata": {
274 | "trusted": true,
275 | "id": "Rq5JJrUWnDdd",
276 | "colab_type": "code",
277 | "colab": {}
278 | },
279 | "source": [
280 | "data2.head(2)"
281 | ],
282 | "execution_count": 0,
283 | "outputs": []
284 | },
285 | {
286 | "cell_type": "code",
287 | "metadata": {
288 | "id": "1dYviSGeHhs_",
289 | "trusted": true,
290 | "colab_type": "code",
291 | "colab": {}
292 | },
293 | "source": [
294 | "data.shape,data2.shape"
295 | ],
296 | "execution_count": 0,
297 | "outputs": []
298 | },
299 | {
300 | "cell_type": "code",
301 | "metadata": {
302 | "trusted": true,
303 | "id": "n77Rz_oCnDdg",
304 | "colab_type": "code",
305 | "colab": {}
306 | },
307 | "source": [
308 | "#['temp_'+str(i) for i in temp_df.columns]"
309 | ],
310 | "execution_count": 0,
311 | "outputs": []
312 | },
313 | {
314 | "cell_type": "code",
315 | "metadata": {
316 | "trusted": true,
317 | "id": "BBWMTSZ3nDdi",
318 | "colab_type": "code",
319 | "colab": {}
320 | },
321 | "source": [
322 | "data1 = pd.DataFrame()\n",
323 | "for col_name in tqdm(features):\n",
324 | " temp_df = data[col_name].apply(lambda x: pd.Series(x.split(',')))\n",
325 | " temp_df.columns = [col_name +str(i) for i in temp_df.columns]\n",
326 | " data1 = pd.concat([data1,temp_df],axis=1)"
327 | ],
328 | "execution_count": 0,
329 | "outputs": []
330 | },
331 | {
332 | "cell_type": "code",
333 | "metadata": {
334 | "trusted": true,
335 | "id": "sA4jCX2mnDdk",
336 | "colab_type": "code",
337 | "colab": {}
338 | },
339 | "source": [
340 | "for col_name in tqdm(data1.columns):\n",
341 | " data1[col_name] = data1[col_name].apply(replace_nan)"
342 | ],
343 | "execution_count": 0,
344 | "outputs": []
345 | },
346 | {
347 | "cell_type": "code",
348 | "metadata": {
349 | "trusted": true,
350 | "id": "9GmmluWunDdm",
351 | "colab_type": "code",
352 | "colab": {}
353 | },
354 | "source": [
355 | "data1"
356 | ],
357 | "execution_count": 0,
358 | "outputs": []
359 | },
360 | {
361 | "cell_type": "code",
362 | "metadata": {
363 | "trusted": true,
364 | "id": "bQokampdnDdo",
365 | "colab_type": "code",
366 | "colab": {}
367 | },
368 | "source": [
369 | "for col_name in tqdm(data1.columns):\n",
370 | " data1[col_name] = data1[col_name].fillna(0)"
371 | ],
372 | "execution_count": 0,
373 | "outputs": []
374 | },
375 | {
376 | "cell_type": "code",
377 | "metadata": {
378 | "trusted": true,
379 | "id": "072VFzctnDdq",
380 | "colab_type": "code",
381 | "colab": {}
382 | },
383 | "source": [
384 | "data1.head()"
385 | ],
386 | "execution_count": 0,
387 | "outputs": []
388 | },
389 | {
390 | "cell_type": "code",
391 | "metadata": {
392 | "trusted": true,
393 | "id": "zgHkBgQ2nDds",
394 | "colab_type": "code",
395 | "colab": {}
396 | },
397 | "source": [
398 | "data11 = data1.copy()\n",
399 | "data11['ID'] = data['ID']"
400 | ],
401 | "execution_count": 0,
402 | "outputs": []
403 | },
404 | {
405 | "cell_type": "code",
406 | "metadata": {
407 | "trusted": true,
408 | "id": "ddnjMZ-7nDdt",
409 | "colab_type": "code",
410 | "colab": {}
411 | },
412 | "source": [
413 | "data11.head()"
414 | ],
415 | "execution_count": 0,
416 | "outputs": []
417 | },
418 | {
419 | "cell_type": "code",
420 | "metadata": {
421 | "trusted": true,
422 | "id": "9onPi6UWnDdv",
423 | "colab_type": "code",
424 | "colab": {}
425 | },
426 | "source": [
427 | "def fun(x):\n",
428 | " if x.split('_')[0] == 'atmos':\n",
429 | " return x"
430 | ],
431 | "execution_count": 0,
432 | "outputs": []
433 | },
434 | {
435 | "cell_type": "code",
436 | "metadata": {
437 | "trusted": true,
438 | "id": "O5-PQIwcnDdx",
439 | "colab_type": "code",
440 | "colab": {}
441 | },
442 | "source": [
443 | "atmos = list(filter(fun,data11.columns))"
444 | ],
445 | "execution_count": 0,
446 | "outputs": []
447 | },
448 | {
449 | "cell_type": "code",
450 | "metadata": {
451 | "trusted": true,
452 | "id": "lBVOMbj-nDdz",
453 | "colab_type": "code",
454 | "colab": {}
455 | },
456 | "source": [
457 | "data11 = data11[['ID']+atmos]"
458 | ],
459 | "execution_count": 0,
460 | "outputs": []
461 | },
462 | {
463 | "cell_type": "code",
464 | "metadata": {
465 | "trusted": true,
466 | "id": "mWuRl8mKnDd1",
467 | "colab_type": "code",
468 | "colab": {}
469 | },
470 | "source": [
471 | "data11_atmos = pd.melt(data11,id_vars=[\"ID\"],var_name='atmos', value_name='value_atmos')\n"
472 | ],
473 | "execution_count": 0,
474 | "outputs": []
475 | },
476 | {
477 | "cell_type": "code",
478 | "metadata": {
479 | "trusted": true,
480 | "id": "fS87ZClWnDd4",
481 | "colab_type": "code",
482 | "colab": {}
483 | },
484 | "source": [
485 |         "for i in tqdm(range(1, 15)):  # expanding mean of value_atmos shifted by +/-i rows in ID-sorted order\n",
486 |         "    data11_atmos[f'magic_{i}'] = data11_atmos.sort_values(by='ID')['value_atmos'].shift(i).expanding().mean().ffill().sort_index()\n",
487 |         "    data11_atmos[f'magic2_{i}'] = data11_atmos.sort_values(by='ID')['value_atmos'].shift(-i).expanding().mean().bfill().sort_index()"
488 | ],
489 | "execution_count": 0,
490 | "outputs": []
491 | },
492 | {
493 | "cell_type": "code",
494 | "metadata": {
495 | "trusted": true,
496 | "id": "xmVcBhHlnDd5",
497 | "colab_type": "code",
498 | "colab": {}
499 | },
500 | "source": [
501 | "data11_atmos[data11_atmos.ID == 'ID_train_1'].head(3)"
502 | ],
503 | "execution_count": 0,
504 | "outputs": []
505 | },
506 | {
507 | "cell_type": "code",
508 | "metadata": {
509 | "trusted": true,
510 | "id": "cpWEAZ_9nDd7",
511 | "colab_type": "code",
512 | "colab": {}
513 | },
514 | "source": [
515 | "data11_piv = pd.pivot_table(data11_atmos,index='ID',columns = 'atmos',values = 'magic_1')\n",
516 | "data11_piv.columns = ['atmos_magic_1'+i for i in data11_piv.columns]\n",
517 | "for i in tqdm(range(2,15)):\n",
518 | " temp = pd.pivot_table(data11_atmos,index='ID',columns = 'atmos',values = 'magic_'+str(i))\n",
519 | " temp.columns = ['atmos_magic_'+str(i)+j for j in temp.columns]\n",
520 | " data11_piv = pd.concat([data11_piv,temp],axis=1)"
521 | ],
522 | "execution_count": 0,
523 | "outputs": []
524 | },
525 | {
526 | "cell_type": "code",
527 | "metadata": {
528 | "trusted": true,
529 | "id": "2Kt3ZD8unDd-",
530 | "colab_type": "code",
531 | "colab": {}
532 | },
533 | "source": [
534 | "data11_piv.reset_index(inplace=True)"
535 | ],
536 | "execution_count": 0,
537 | "outputs": []
538 | },
539 | {
540 | "cell_type": "code",
541 | "metadata": {
542 | "trusted": true,
543 | "id": "SXV38vqCnDeA",
544 | "colab_type": "code",
545 | "colab": {}
546 | },
547 | "source": [
548 | "data11 = pd.merge(data11,data11_piv,on='ID',how='left')"
549 | ],
550 | "execution_count": 0,
551 | "outputs": []
552 | },
553 | {
554 | "cell_type": "code",
555 | "metadata": {
556 | "trusted": true,
557 | "id": "Du4dLdyfnDeC",
558 | "colab_type": "code",
559 | "colab": {}
560 | },
561 | "source": [
562 | "m = data1.isnull().sum()\n",
563 | "sum(m[m>0])"
564 | ],
565 | "execution_count": 0,
566 | "outputs": []
567 | },
568 | {
569 | "cell_type": "code",
570 | "metadata": {
571 | "trusted": true,
572 | "id": "D65TOOg8nDeE",
573 | "colab_type": "code",
574 | "colab": {}
575 | },
576 | "source": [
577 | "data11.head()"
578 | ],
579 | "execution_count": 0,
580 | "outputs": []
581 | },
582 | {
583 | "cell_type": "code",
584 | "metadata": {
585 | "id": "aiAM7WA3HhtR",
586 | "trusted": true,
587 | "colab_type": "code",
588 | "colab": {}
589 | },
590 | "source": [
591 |         "data2.drop(features,axis=1,inplace=True)  # remove the raw list-valued columns; only their aggregates are kept"
592 | ],
593 | "execution_count": 0,
594 | "outputs": []
595 | },
596 | {
597 | "cell_type": "code",
598 | "metadata": {
599 | "trusted": true,
600 | "id": "jFlOZI75nDeH",
601 | "colab_type": "code",
602 | "colab": {}
603 | },
604 | "source": [
605 | "data2"
606 | ],
607 | "execution_count": 0,
608 | "outputs": []
609 | },
610 | {
611 | "cell_type": "code",
612 | "metadata": {
613 | "trusted": true,
614 | "id": "TzKMLGREnDeJ",
615 | "colab_type": "code",
616 | "colab": {}
617 | },
618 | "source": [
619 | "data1['ID'] = data['ID']\n",
620 | "data1['target'] = data['target']\n",
621 | "data1['location'] = data['location']"
622 | ],
623 | "execution_count": 0,
624 | "outputs": []
625 | },
626 | {
627 | "cell_type": "code",
628 | "metadata": {
629 | "trusted": true,
630 | "id": "4hlTnqkwnDeK",
631 | "colab_type": "code",
632 | "colab": {}
633 | },
634 | "source": [
635 | "data1.head()"
636 | ],
637 | "execution_count": 0,
638 | "outputs": []
639 | },
640 | {
641 | "cell_type": "code",
642 | "metadata": {
643 | "trusted": true,
644 | "id": "53nZZvVfnDeM",
645 | "colab_type": "code",
646 | "colab": {}
647 | },
648 | "source": [
649 | "data1.shape"
650 | ],
651 | "execution_count": 0,
652 | "outputs": []
653 | },
654 | {
655 | "cell_type": "code",
656 | "metadata": {
657 | "trusted": true,
658 | "id": "t-3S4WYRnDeO",
659 | "colab_type": "code",
660 | "colab": {}
661 | },
662 | "source": [
663 | "# data1 = pd.merge(data1,data11,on='ID',how='inner')\n",
664 | "# data1.shape"
665 | ],
666 | "execution_count": 0,
667 | "outputs": []
668 | },
669 | {
670 | "cell_type": "code",
671 | "metadata": {
672 | "id": "b-inscoEHhtU",
673 | "trusted": true,
674 | "colab_type": "code",
675 | "colab": {}
676 | },
677 | "source": [
678 | "train=data1[data1.target.notnull()].reset_index(drop=True)\n",
679 | "test=data1[data1.target.isna()].reset_index(drop=True)"
680 | ],
681 | "execution_count": 0,
682 | "outputs": []
683 | },
684 | {
685 | "cell_type": "code",
686 | "metadata": {
687 | "trusted": true,
688 | "id": "cSDLwzdBnDeR",
689 | "colab_type": "code",
690 | "colab": {}
691 | },
692 | "source": [
693 | "train1=data2[data2.target.notnull()].reset_index(drop=True)\n",
694 | "test1=data2[data2.target.isna()].reset_index(drop=True)"
695 | ],
696 | "execution_count": 0,
697 | "outputs": []
698 | },
699 | {
700 | "cell_type": "code",
701 | "metadata": {
702 | "id": "j5JqDT3SHhtY",
703 | "trusted": true,
704 | "colab_type": "code",
705 | "colab": {}
706 | },
707 | "source": [
708 | "del data,data1,data2\n",
709 | "gc.collect()"
710 | ],
711 | "execution_count": 0,
712 | "outputs": []
713 | },
714 | {
715 | "cell_type": "code",
716 | "metadata": {
717 | "id": "h_m0qW-3Hhtb",
718 | "trusted": true,
719 | "colab_type": "code",
720 | "colab": {}
721 | },
722 | "source": [
723 | "train.head()"
724 | ],
725 | "execution_count": 0,
726 | "outputs": []
727 | },
728 | {
729 | "cell_type": "code",
730 | "metadata": {
731 | "id": "Ycw-LMZvHhtf",
732 | "trusted": true,
733 | "colab_type": "code",
734 | "colab": {}
735 | },
736 | "source": [
737 | "train1.drop(['ID','location','target'],axis=1,inplace=True)\n",
738 | "test1.drop(['ID','location','target'],axis=1,inplace=True)"
739 | ],
740 | "execution_count": 0,
741 | "outputs": []
742 | },
743 | {
744 | "cell_type": "code",
745 | "metadata": {
746 | "trusted": true,
747 | "id": "3Wp03kuLnDea",
748 | "colab_type": "code",
749 | "colab": {}
750 | },
751 | "source": [
752 | "train.shape,test.shape"
753 | ],
754 | "execution_count": 0,
755 | "outputs": []
756 | },
757 | {
758 | "cell_type": "code",
759 | "metadata": {
760 | "trusted": true,
761 | "id": "e2fukqsAnDec",
762 | "colab_type": "code",
763 | "colab": {}
764 | },
765 | "source": [
766 | "train = pd.concat([train,train1],axis=1)\n",
767 | "test = pd.concat([test,test1],axis=1)\n",
768 | "train.shape,test.shape"
769 | ],
770 | "execution_count": 0,
771 | "outputs": []
772 | },
773 | {
774 | "cell_type": "code",
775 | "metadata": {
776 | "id": "rXICDSaMHhtk",
777 | "trusted": true,
778 | "colab_type": "code",
779 | "colab": {}
780 | },
781 | "source": [
782 | "Experiment_name=\"simple_model\""
783 | ],
784 | "execution_count": 0,
785 | "outputs": []
786 | },
787 | {
788 | "cell_type": "code",
789 | "metadata": {
790 | "trusted": true,
791 | "id": "HIM-oxjJnDef",
792 | "colab_type": "code",
793 | "colab": {}
794 | },
795 | "source": [
796 | "train_id = train['ID']\n",
797 | "test_id = test['ID']\n",
798 | "y = train['target']"
799 | ],
800 | "execution_count": 0,
801 | "outputs": []
802 | },
803 | {
804 | "cell_type": "code",
805 | "metadata": {
806 | "trusted": true,
807 | "id": "DewPn3Y0nDeh",
808 | "colab_type": "code",
809 | "colab": {}
810 | },
811 | "source": [
812 | "#train.drop(['ID','target'],axis=1,inplace=True)\n",
813 | "#test.drop(['ID','target'],axis=1,inplace=True)"
814 | ],
815 | "execution_count": 0,
816 | "outputs": []
817 | },
818 | {
819 | "cell_type": "code",
820 | "metadata": {
821 | "trusted": false,
822 | "id": "BTt2Fet-nDei",
823 | "colab_type": "code",
824 | "colab": {}
825 | },
826 | "source": [
827 | "# from sklearn.preprocessing import LabelEncoder\n",
828 | "# lab = LabelEncoder()\n",
829 | "# lab.fit(train[\"location\"])\n",
830 | "\n",
831 | "# train.location = lab.transform(train.location)\n",
832 | "# test.location = lab.transform(test.location)"
833 | ],
834 | "execution_count": 0,
835 | "outputs": []
836 | },
837 | {
838 | "cell_type": "code",
839 | "metadata": {
840 | "trusted": true,
841 | "id": "Zw0z2yR1nDej",
842 | "colab_type": "code",
843 | "colab": {}
844 | },
845 | "source": [
846 | "import category_encoders as ce\n",
847 | "encoder = ce.CatBoostEncoder(cols=[\"location\"])\n",
848 | "encoder.fit(train, y)\n",
849 | "train = encoder.transform(train)\n",
850 | "test = encoder.transform(test)"
851 | ],
852 | "execution_count": 0,
853 | "outputs": []
854 | },
855 | {
856 | "cell_type": "code",
857 | "metadata": {
858 | "trusted": true,
859 | "id": "kF4AcdjknDel",
860 | "colab_type": "code",
861 | "colab": {}
862 | },
863 | "source": [
864 | "train.shape,test.shape"
865 | ],
866 | "execution_count": 0,
867 | "outputs": []
868 | },
869 | {
870 | "cell_type": "code",
871 | "metadata": {
872 | "trusted": true,
873 | "id": "V1guD8gFnDeo",
874 | "colab_type": "code",
875 | "colab": {}
876 | },
877 | "source": [
878 | "X = train.copy()"
879 | ],
880 | "execution_count": 0,
881 | "outputs": []
882 | },
883 | {
884 | "cell_type": "code",
885 | "metadata": {
886 | "trusted": true,
887 | "id": "kNMjT4HpnDer",
888 | "colab_type": "code",
889 | "colab": {}
890 | },
891 | "source": [
892 | "X['ID'] = train_id\n",
893 | "test['ID'] = test_id\n",
894 | "X['target'] = y"
895 | ],
896 | "execution_count": 0,
897 | "outputs": []
898 | },
899 | {
900 | "cell_type": "code",
901 | "metadata": {
902 | "trusted": true,
903 | "id": "Zi6-JGMfnDes",
904 | "colab_type": "code",
905 | "colab": {}
906 | },
907 | "source": [
908 | "X.to_csv('train_df.csv',index=False)\n",
909 | "test.to_csv('test_df.csv',index=False)"
910 | ],
911 | "execution_count": 0,
912 | "outputs": []
913 | },
914 | {
915 | "cell_type": "markdown",
916 | "metadata": {
917 | "id": "ra9AQc3gnDeu",
918 | "colab_type": "text"
919 | },
920 | "source": [
921 | "## **PART 2 // THIS PART WILL TAKE AROUND 2H++**"
922 | ]
923 | },
924 | {
925 | "cell_type": "code",
926 | "metadata": {
927 | "trusted": true,
928 | "id": "dfEXW7S9nDeu",
929 | "colab_type": "code",
930 | "colab": {}
931 | },
932 | "source": [
933 | "import pandas as pd\n",
934 | "import numpy as np\n",
935 | "import matplotlib.pyplot as plt\n",
936 | "import seaborn as sns\n",
937 | "import warnings\n",
938 | "warnings.filterwarnings('ignore')\n",
939 | "\n",
940 | "from math import sqrt \n",
941 | "import lightgbm as lgb\n",
942 | "from sklearn.metrics import mean_squared_error \n",
943 | "from sklearn.model_selection import KFold, train_test_split\n",
944 | "\n",
945 | "\n",
946 | "from scipy.sparse import csr_matrix\n",
947 | "import gc\n",
948 | "\n",
949 | "import pandas as pd\n",
950 | "import numpy as np\n",
951 | "import matplotlib.pyplot as plt\n",
952 | "import seaborn as sns\n",
953 | "from xgboost import XGBRegressor\n",
954 | "from catboost import CatBoostRegressor\n",
955 | "from lightgbm import LGBMRegressor\n",
956 | "from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,BaggingRegressor,AdaBoostRegressor,ExtraTreesRegressor\n",
957 | "from sklearn.tree import DecisionTreeRegressor\n",
958 | "from sklearn.linear_model import LinearRegression\n",
959 | "from sklearn.ensemble import StackingRegressor\n",
960 | "#from ngboost import NGBRegressor\n",
961 | "from sklearn.metrics import mean_squared_log_error\n",
962 | "from sklearn.preprocessing import LabelEncoder\n",
963 | "from sklearn.model_selection import KFold,StratifiedKFold, GroupKFold\n",
964 | "import datetime\n",
965 | "from statsmodels.graphics.tsaplots import plot_acf, plot_pacf\n",
966 | "# from fastai.tabular import *\n",
967 | "import warnings\n",
968 | "from tqdm import *\n",
969 | "warnings.filterwarnings(\"ignore\")\n",
970 | "np.random.seed(0)\n",
971 | "pd.set_option('display.max_rows', 500)\n",
972 | "pd.set_option('display.max_columns', 500)\n",
973 | "pd.set_option('display.width', 1000)"
974 | ],
975 | "execution_count": 0,
976 | "outputs": []
977 | },
978 | {
979 | "cell_type": "code",
980 | "metadata": {
981 | "trusted": true,
982 | "id": "HmZwAO-9nDex",
983 | "colab_type": "code",
984 | "colab": {}
985 | },
986 | "source": [
987 | "# Memory reduction helper function:\n",
988 | "def reduce_mem_usage(df, verbose=True):\n",
989 | " numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n",
990 | " start_mem = df.memory_usage().sum() / 1024**2 \n",
991 | " for col in df.columns: #columns\n",
992 | " col_type = df[col].dtypes\n",
993 | " if col_type in numerics: #numerics\n",
994 | " c_min = df[col].min()\n",
995 | " c_max = df[col].max()\n",
996 | " if str(col_type)[:3] == 'int':\n",
997 | " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
998 | " df[col] = df[col].astype(np.int8)\n",
999 | " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
1000 | " df[col] = df[col].astype(np.int16)\n",
1001 | " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
1002 | " df[col] = df[col].astype(np.int32)\n",
1003 | " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
1004 | " df[col] = df[col].astype(np.int64) \n",
1005 | " else:\n",
1006 | " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
1007 | " df[col] = df[col].astype(np.float16)\n",
1008 | " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
1009 | " df[col] = df[col].astype(np.float32)\n",
1010 | " else:\n",
1011 | " df[col] = df[col].astype(np.float64) \n",
1012 | " end_mem = df.memory_usage().sum() / 1024**2\n",
1013 | " if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))\n",
1014 | " return df"
1015 | ],
1016 | "execution_count": 0,
1017 | "outputs": []
1018 | },
1019 | {
1020 | "cell_type": "code",
1021 | "metadata": {
1022 | "trusted": true,
1023 | "id": "XAJLlciBnDey",
1024 | "colab_type": "code",
1025 | "colab": {}
1026 | },
1027 | "source": [
1028 | "train =X\n",
1029 |         "test=test.drop('target',axis=1)\n",
1030 | "train.shape,test.shape"
1031 | ],
1032 | "execution_count": 0,
1033 | "outputs": []
1034 | },
1035 | {
1036 | "cell_type": "code",
1037 | "metadata": {
1038 | "trusted": true,
1039 | "id": "4XpTRUoQnDez",
1040 | "colab_type": "code",
1041 | "colab": {}
1042 | },
1043 | "source": [
1044 | "train = reduce_mem_usage(train)\n",
1045 | "test = reduce_mem_usage(test)"
1046 | ],
1047 | "execution_count": 0,
1048 | "outputs": []
1049 | },
1050 | {
1051 | "cell_type": "code",
1052 | "metadata": {
1053 | "trusted": true,
1054 | "id": "HVykT3F9nDe2",
1055 | "colab_type": "code",
1056 | "colab": {}
1057 | },
1058 | "source": [
1059 | "temp = [\"temp{}\".format(i) for i in range(121)]\n",
1060 | "precip = [\"precip{}\".format(i) for i in range(121)]\n",
1061 | "rel_humidity = [\"rel_humidity{}\".format(i) for i in range(121)]\n",
1062 | "wind_dir = [\"wind_dir{}\".format(i) for i in range(121)]\n",
1063 | "wind_spd = [\"wind_spd{}\".format(i) for i in range(121)]\n",
1064 | "atmos_press = [\"atmos_press{}\".format(i) for i in range(121)]"
1065 | ],
1066 | "execution_count": 0,
1067 | "outputs": []
1068 | },
1069 | {
1070 | "cell_type": "code",
1071 | "metadata": {
1072 | "trusted": true,
1073 | "id": "U08Kp65bnDe3",
1074 | "colab_type": "code",
1075 | "colab": {}
1076 | },
1077 | "source": [
1078 | "rem_org = list(set(test.columns) - set(temp+precip+rel_humidity+wind_dir+wind_spd+atmos_press))\n",
1079 | "len(rem_org)"
1080 | ],
1081 | "execution_count": 0,
1082 | "outputs": []
1083 | },
1084 | {
1085 | "cell_type": "code",
1086 | "metadata": {
1087 | "trusted": true,
1088 | "id": "Nk4bL20MnDe5",
1089 | "colab_type": "code",
1090 | "colab": {}
1091 | },
1092 | "source": [
1093 | "import re\n",
1094 | "def fun(x):\n",
1095 | " return int(re.findall(r'\\d+', x)[0])"
1096 | ],
1097 | "execution_count": 0,
1098 | "outputs": []
1099 | },
1100 | {
1101 | "cell_type": "code",
1102 | "metadata": {
1103 | "trusted": true,
1104 | "id": "hzy9rVzCnDe6",
1105 | "colab_type": "code",
1106 | "colab": {}
1107 | },
1108 | "source": [
1109 | "l = ['precip','rel_humidity','wind_dir','wind_spd','atmos_press']"
1110 | ],
1111 | "execution_count": 0,
1112 | "outputs": []
1113 | },
1114 | {
1115 | "cell_type": "code",
1116 | "metadata": {
1117 | "trusted": true,
1118 | "id": "P0sg2E8xnDe7",
1119 | "colab_type": "code",
1120 | "colab": {}
1121 | },
1122 | "source": [
1123 | "data = train[['ID']+temp]\n",
1124 | "data = data.melt(id_vars=[\"ID\"],var_name=\"d\", value_name='temp')\n",
1125 | "data['d'] = data['d'].apply(fun)\n",
1126 | "for n,i in enumerate([precip,rel_humidity,wind_dir,wind_spd,atmos_press]):\n",
1127 | " data1 = train[['ID']+i]\n",
1128 | " data1 = data1.melt(id_vars=[\"ID\"],var_name=\"d\", value_name=l[n])\n",
1129 | " data1['d'] = data1['d'].apply(fun)\n",
1130 | " data = pd.merge(data,data1,on=['ID','d'],how='inner')\n",
1131 | " print('*'*8,n)"
1132 | ],
1133 | "execution_count": 0,
1134 | "outputs": []
1135 | },
1136 | {
1137 | "cell_type": "code",
1138 | "metadata": {
1139 | "trusted": true,
1140 | "id": "npYQ54HpnDe8",
1141 | "colab_type": "code",
1142 | "colab": {}
1143 | },
1144 | "source": [
1145 | "data_t = test[['ID']+temp]\n",
1146 | "data_t = data_t.melt(id_vars=[\"ID\"],var_name=\"d\", value_name='temp')\n",
1147 | "data_t['d'] = data_t['d'].apply(fun)\n",
1148 | "for n,i in enumerate([precip,rel_humidity,wind_dir,wind_spd,atmos_press]):\n",
1149 | " data1 = test[['ID']+i]\n",
1150 | " data1 = data1.melt(id_vars=[\"ID\"],var_name=\"d\", value_name=l[n])\n",
1151 | " data1['d'] = data1['d'].apply(fun)\n",
1152 | " data_t = pd.merge(data_t,data1,on=['ID','d'],how='inner')\n",
1153 | " print('*'*8,n)"
1154 | ],
1155 | "execution_count": 0,
1156 | "outputs": []
1157 | },
1158 | {
1159 | "cell_type": "code",
1160 | "metadata": {
1161 | "trusted": true,
1162 | "id": "phNrAka9nDe-",
1163 | "colab_type": "code",
1164 | "colab": {}
1165 | },
1166 | "source": [
1167 | "data['train'] = True\n",
1168 | "data_t['train'] = False\n",
1169 | "\n",
1170 | "data = pd.concat([data,data_t])\n",
1171 | "data.reset_index(drop=True,inplace=True)\n",
1172 | "data['train'].value_counts()"
1173 | ],
1174 | "execution_count": 0,
1175 | "outputs": []
1176 | },
1177 | {
1178 | "cell_type": "code",
1179 | "metadata": {
1180 | "trusted": true,
1181 | "id": "75YPN5stnDe_",
1182 | "colab_type": "code",
1183 | "colab": {}
1184 | },
1185 | "source": [
1186 | "data.head()"
1187 | ],
1188 | "execution_count": 0,
1189 | "outputs": []
1190 | },
1191 | {
1192 | "cell_type": "code",
1193 | "metadata": {
1194 | "trusted": true,
1195 | "id": "XiPKBJUWnDfA",
1196 | "colab_type": "code",
1197 | "colab": {}
1198 | },
1199 | "source": [
1200 | "def roll1(h):\n",
1201 | " if h <= 24:\n",
1202 | " return('day1')\n",
1203 | " elif (h>24) and (h<=48):\n",
1204 | " return('day2')\n",
1205 | " elif (h>48) and (h<=72):\n",
1206 | " return('day3')\n",
1207 | " elif (h>72) and (h<=96):\n",
1208 | " return('day4')\n",
1209 | " elif (h>96) and (h<=120):\n",
1210 | " return('day5')\n",
1211 | " \n",
1212 | " \n",
1213 | "def roll2(h):\n",
1214 | " if (h <= 6) or ((h-24)>0 and (h-24)<=6)or ((h-24*2)>0 and (h-24*2)<=6)or ((h-24*3)>0 and (h-24*3)<=6)or ((h-24*4)>0 and (h-24*4)<=6):\n",
1215 | " return('Morning')\n",
1216 | " elif (h <= 12) or ((h-24)>0 and (h-24)<=12)or ((h-24*2)>0 and (h-24*2)<=12)or ((h-24*3)>0 and (h-24*3)<=12)or ((h-24*4)>0 and (h-24*4)<=12):\n",
1217 | " return('Noon')\n",
1218 | " elif (h <= 18) or ((h-24)>0 and (h-24)<=18)or ((h-24*2)>0 and (h-24*2)<=18)or ((h-24*3)>0 and (h-24*3)<=18)or ((h-24*4)>0 and (h-24*4)<=18):\n",
1219 | " return('Evening')\n",
1220 | " else:\n",
1221 | " return('Night')\n",
1222 | " \n",
1223 | "def roll3(h):\n",
1224 | " if (h <= 3) or ((h-24)>0 and (h-24)<=3)or ((h-24*2)>0 and (h-24*2)<=3)or ((h-24*3)>0 and (h-24*3)<=3)or ((h-24*4)>0 and (h-24*4)<=3):\n",
1225 | " return('Mor1')\n",
1226 | " elif (h <= 6) or ((h-24)>0 and (h-24)<=6)or ((h-24*2)>0 and (h-24*2)<=6)or ((h-24*3)>0 and (h-24*3)<=6)or ((h-24*4)>0 and (h-24*4)<=6):\n",
1227 | " return('Mor2')\n",
1228 | " elif (h <= 9) or ((h-24)>0 and (h-24)<=9)or ((h-24*2)>0 and (h-24*2)<=9)or ((h-24*3)>0 and (h-24*3)<=9)or ((h-24*4)>0 and (h-24*4)<=9):\n",
1229 | " return('Noo1')\n",
1230 | " elif (h <= 12) or ((h-24)>0 and (h-24)<=12)or ((h-24*2)>0 and (h-24*2)<=12)or ((h-24*3)>0 and (h-24*3)<=12)or ((h-24*4)>0 and (h-24*4)<=12):\n",
1231 | " return('Noo2')\n",
1232 | " elif (h <= 15) or ((h-24)>0 and (h-24)<=15)or ((h-24*2)>0 and (h-24*2)<=15)or ((h-24*3)>0 and (h-24*3)<=15)or ((h-24*4)>0 and (h-24*4)<=15):\n",
1233 | " return('Eve1')\n",
1234 | " elif (h <= 18) or ((h-24)>0 and (h-24)<=18)or ((h-24*2)>0 and (h-24*2)<=18)or ((h-24*3)>0 and (h-24*3)<=18)or ((h-24*4)>0 and (h-24*4)<=18):\n",
1235 | " return('Eve2')\n",
1236 | " elif (h <= 21) or ((h-24)>0 and (h-24)<=21)or ((h-24*2)>0 and (h-24*2)<=21)or ((h-24*3)>0 and (h-24*3)<=21)or ((h-24*4)>0 and (h-24*4)<=21):\n",
1237 | " return('Nig1')\n",
1238 | " else:\n",
1239 | " return('Nig2')\n",
1240 | "\n",
1241 | "def roll4(h):\n",
1242 | " if (h <= 8) or ((h-24)>0 and (h-24)<=8)or ((h-24*2)>0 and (h-24*2)<=8)or ((h-24*3)>0 and (h-24*3)<=8)or ((h-24*4)>0 and (h-24*4)<=8):\n",
1243 | " return('First')\n",
1244 | " elif (h <= 16) or ((h-24)>0 and (h-24)<=16)or ((h-24*2)>0 and (h-24*2)<=16)or ((h-24*3)>0 and (h-24*3)<=16)or ((h-24*4)>0 and (h-24*4)<=16):\n",
1245 | " return('Second')\n",
1246 | " else:\n",
1247 | " return('Third')\n",
1248 | " \n",
1249 | "def roll5(h):\n",
1250 | " if (h <= 12) or ((h-24)>0 and (h-24)<=12)or ((h-24*2)>0 and (h-24*2)<=12)or ((h-24*3)>0 and (h-24*3)<=12)or ((h-24*4)>0 and (h-24*4)<=12):\n",
1251 | " return('F_half')\n",
1252 | " else:\n",
1253 | " return('S_half')"
1254 | ],
1255 | "execution_count": 0,
1256 | "outputs": []
1257 | },
1258 | {
1259 | "cell_type": "code",
1260 | "metadata": {
1261 | "trusted": true,
1262 | "id": "HUtseFywnDfB",
1263 | "colab_type": "code",
1264 | "colab": {}
1265 | },
1266 | "source": [
1267 | "data['type_of_day3'] = data['d'].apply(roll5)\n",
1268 | "data['type_of_day2'] = data['d'].apply(roll4)\n",
1269 | "data['type_of_day1'] = data['d'].apply(roll3)\n",
1270 | "data['type_of_day'] = data['d'].apply(roll2)\n",
1271 | "data['day'] = data['d'].apply(roll1)"
1272 | ],
1273 | "execution_count": 0,
1274 | "outputs": []
1275 | },
1276 | {
1277 | "cell_type": "code",
1278 | "metadata": {
1279 | "trusted": true,
1280 | "id": "W5n5GQWMnDfC",
1281 | "colab_type": "code",
1282 | "colab": {}
1283 | },
1284 | "source": [
1285 | "data.groupby(['train','day','type_of_day3']).count()['temp']"
1286 | ],
1287 | "execution_count": 0,
1288 | "outputs": []
1289 | },
1290 | {
1291 | "cell_type": "code",
1292 | "metadata": {
1293 | "trusted": true,
1294 | "id": "2mX5kNPvnDfG",
1295 | "colab_type": "code",
1296 | "colab": {}
1297 | },
1298 | "source": [
1299 | "data[(data['train'] == True) & (data['day'] == 'day5') & (data['type_of_day'] == 'Morning')].groupby('d').count()['temp']"
1300 | ],
1301 | "execution_count": 0,
1302 | "outputs": []
1303 | },
1304 | {
1305 | "cell_type": "code",
1306 | "metadata": {
1307 | "trusted": true,
1308 | "id": "U3fivT72nDfH",
1309 | "colab_type": "code",
1310 | "colab": {}
1311 | },
1312 | "source": [
1313 | "data = reduce_mem_usage(data)"
1314 | ],
1315 | "execution_count": 0,
1316 | "outputs": []
1317 | },
1318 | {
1319 | "cell_type": "code",
1320 | "metadata": {
1321 | "trusted": true,
1322 | "id": "h2QQZFk7nDfI",
1323 | "colab_type": "code",
1324 | "colab": {}
1325 | },
1326 | "source": [
1327 | "def percentile(n) :\n",
1328 | " def percentile_(x) : \n",
1329 | " return np.percentile(x, n)\n",
1330 | " percentile_.__name__ = 'percentile_%s' % n\n",
1331 | " return percentile_"
1332 | ],
1333 | "execution_count": 0,
1334 | "outputs": []
1335 | },
1336 | {
1337 | "cell_type": "code",
1338 | "metadata": {
1339 | "trusted": true,
1340 | "id": "beNOLTNVnDfK",
1341 | "colab_type": "code",
1342 | "colab": {}
1343 | },
1344 | "source": [
1345 | "%%time\n",
1346 | "grp_data1 = data.drop(['d','train'],axis=1).groupby(['ID','day']).agg(['mean','max','min','std',percentile(1),percentile(5),percentile(10),percentile(20)\n",
1347 | " ,percentile(30),percentile(40),percentile(50),percentile(60),percentile(70),percentile(80)\n",
1348 | " ,percentile(90),percentile(95),percentile(99)])\n",
1349 | "\n",
1350 | "grp_data1.columns = ['_'.join(col).strip() for col in grp_data1.columns.values]\n",
1351 | "grp_data1.reset_index(inplace = True)"
1352 | ],
1353 | "execution_count": 0,
1354 | "outputs": []
1355 | },
1356 | {
1357 | "cell_type": "code",
1358 | "metadata": {
1359 | "trusted": true,
1360 | "id": "yCdrAS4hnDfL",
1361 | "colab_type": "code",
1362 | "colab": {}
1363 | },
1364 | "source": [
1365 | "grp_data1.head()"
1366 | ],
1367 | "execution_count": 0,
1368 | "outputs": []
1369 | },
1370 | {
1371 | "cell_type": "code",
1372 | "metadata": {
1373 | "trusted": true,
1374 | "id": "ntdn73R5nDfM",
1375 | "colab_type": "code",
1376 | "colab": {}
1377 | },
1378 | "source": [
1379 | "%%time\n",
1380 | "grp_data2 = data.drop(['d','train'],axis=1).groupby(['ID','type_of_day']).agg(['mean','max','min','std',percentile(1),percentile(5),percentile(10),percentile(20)\n",
1381 | " ,percentile(30),percentile(40),percentile(50),percentile(60),percentile(70),percentile(80)\n",
1382 | " ,percentile(90),percentile(95),percentile(99)])\n",
1383 | "\n",
1384 | "grp_data2.columns = ['_'.join(col).strip() for col in grp_data2.columns.values]\n",
1385 | "grp_data2.reset_index(inplace=True)"
1386 | ],
1387 | "execution_count": 0,
1388 | "outputs": []
1389 | },
1390 | {
1391 | "cell_type": "code",
1392 | "metadata": {
1393 | "trusted": true,
1394 | "id": "d2JYuFCznDfO",
1395 | "colab_type": "code",
1396 | "colab": {}
1397 | },
1398 | "source": [
1399 | "grp_data2.head()"
1400 | ],
1401 | "execution_count": 0,
1402 | "outputs": []
1403 | },
1404 | {
1405 | "cell_type": "code",
1406 | "metadata": {
1407 | "trusted": true,
1408 | "id": "x_9ql5MInDfQ",
1409 | "colab_type": "code",
1410 | "colab": {}
1411 | },
1412 | "source": [
1413 | "%%time\n",
1414 | "#data['day_type_of_day'] = data['day']+ '_' + data['type_of_day']\n",
1415 | "\n",
1416 | "\n",
1417 | "grp_data3 = data.drop(['d','train'],axis=1).groupby(['ID','type_of_day1']).agg(['mean','max','min','std',percentile(1),percentile(5),percentile(10),percentile(20)\n",
1418 | " ,percentile(30),percentile(40),percentile(50),percentile(60),percentile(70),percentile(80)\n",
1419 | " ,percentile(90),percentile(95),percentile(99)])\n",
1420 | "\n",
1421 | "grp_data3.columns = ['_'.join(col).strip() for col in grp_data3.columns.values]\n",
1422 | "grp_data3.reset_index(inplace=True)"
1423 | ],
1424 | "execution_count": 0,
1425 | "outputs": []
1426 | },
1427 | {
1428 | "cell_type": "code",
1429 | "metadata": {
1430 | "trusted": true,
1431 | "id": "jaqnN8nTnDfR",
1432 | "colab_type": "code",
1433 | "colab": {}
1434 | },
1435 | "source": [
1436 | "grp_data3.shape"
1437 | ],
1438 | "execution_count": 0,
1439 | "outputs": []
1440 | },
1441 | {
1442 | "cell_type": "code",
1443 | "metadata": {
1444 | "trusted": true,
1445 | "id": "5eQJvsYZnDfS",
1446 | "colab_type": "code",
1447 | "colab": {}
1448 | },
1449 | "source": [
1450 | "%%time\n",
1451 | "grp_data4 = data.drop(['d','train'],axis=1).groupby(['ID','type_of_day2']).agg(['mean','max','min','std',percentile(1),percentile(5),percentile(10),percentile(20)\n",
1452 | " ,percentile(30),percentile(40),percentile(50),percentile(60),percentile(70),percentile(80)\n",
1453 | " ,percentile(90),percentile(95),percentile(99)])\n",
1454 | "\n",
1455 | "grp_data4.columns = ['_'.join(col).strip() for col in grp_data4.columns.values]\n",
1456 | "grp_data4.reset_index(inplace=True)"
1457 | ],
1458 | "execution_count": 0,
1459 | "outputs": []
1460 | },
1461 | {
1462 | "cell_type": "code",
1463 | "metadata": {
1464 | "trusted": true,
1465 | "id": "mlyp-_ZnnDfT",
1466 | "colab_type": "code",
1467 | "colab": {}
1468 | },
1469 | "source": [
1470 | "grp_data4.shape"
1471 | ],
1472 | "execution_count": 0,
1473 | "outputs": []
1474 | },
1475 | {
1476 | "cell_type": "code",
1477 | "metadata": {
1478 | "trusted": true,
1479 | "id": "a9-Lzf8UnDfU",
1480 | "colab_type": "code",
1481 | "colab": {}
1482 | },
1483 | "source": [
1484 | "%%time\n",
1485 | "#data['day_type_of_day1'] = data['day']+ '_' + data['type_of_day1']\n",
1486 | "\n",
1487 | "\n",
1488 | "grp_data5 = data.drop(['d','train'],axis=1).groupby(['ID','type_of_day3']).agg(['mean','max','min','std',percentile(1),percentile(5),percentile(10),percentile(20)\n",
1489 | " ,percentile(30),percentile(40),percentile(50),percentile(60),percentile(70),percentile(80)\n",
1490 | " ,percentile(90),percentile(95),percentile(99)])\n",
1491 | "\n",
1492 | "grp_data5.columns = ['_'.join(col).strip() for col in grp_data5.columns.values]\n",
1493 | "grp_data5.reset_index(inplace=True)"
1494 | ],
1495 | "execution_count": 0,
1496 | "outputs": []
1497 | },
1498 | {
1499 | "cell_type": "code",
1500 | "metadata": {
1501 | "trusted": true,
1502 | "id": "Ox43UwZmnDfW",
1503 | "colab_type": "code",
1504 | "colab": {}
1505 | },
1506 | "source": [
1507 | "grp_data5.shape"
1508 | ],
1509 | "execution_count": 0,
1510 | "outputs": []
1511 | },
1512 | {
1513 | "cell_type": "code",
1514 | "metadata": {
1515 | "trusted": true,
1516 | "id": "muiCFc8enDfZ",
1517 | "colab_type": "code",
1518 | "colab": {}
1519 | },
1520 | "source": [
1521 | "grp_data5"
1522 | ],
1523 | "execution_count": 0,
1524 | "outputs": []
1525 | },
1526 | {
1527 | "cell_type": "code",
1528 | "metadata": {
1529 | "trusted": true,
1530 | "id": "mpICUUQenDfb",
1531 | "colab_type": "code",
1532 | "colab": {}
1533 | },
1534 | "source": [
1535 | "grp_data11 = pd.pivot_table(data = grp_data1,index='ID',columns = 'day',values = 'temp_mean')\n",
1536 | "grp_data11.columns = ['temp_mean'+ i for i in grp_data11.columns]\n",
1537 | "grp_data11.reset_index(inplace=True)\n",
1538 | "for i in tqdm(grp_data1.drop(['ID','day','temp_mean'],axis=1).columns):\n",
1539 | " temp = pd.pivot_table(data = grp_data1,index='ID',columns = 'day',values = i)\n",
1540 | " temp.columns = [i+ j for j in temp.columns]\n",
1541 | " temp.reset_index(inplace=True)\n",
1542 | " grp_data11 = pd.merge(grp_data11,temp,on='ID',how='left')"
1543 | ],
1544 | "execution_count": 0,
1545 | "outputs": []
1546 | },
1547 | {
1548 | "cell_type": "code",
1549 | "metadata": {
1550 | "trusted": true,
1551 | "id": "9DBVhIuLnDfd",
1552 | "colab_type": "code",
1553 | "colab": {}
1554 | },
1555 | "source": [
1556 | "grp_data11_col = list(grp_data11.columns)"
1557 | ],
1558 | "execution_count": 0,
1559 | "outputs": []
1560 | },
1561 | {
1562 | "cell_type": "code",
1563 | "metadata": {
1564 | "trusted": true,
1565 | "id": "UMXdBe5VnDfe",
1566 | "colab_type": "code",
1567 | "colab": {}
1568 | },
1569 | "source": [
1570 | "grp_data21 = pd.pivot_table(data = grp_data2,index='ID',columns = 'type_of_day',values = 'temp_mean')\n",
1571 | "grp_data21.columns = ['temp_mean'+ i for i in grp_data21.columns]\n",
1572 | "grp_data21.reset_index(inplace=True)\n",
1573 | "for i in tqdm(grp_data2.drop(['ID','type_of_day','temp_mean'],axis=1).columns):\n",
1574 | " temp = pd.pivot_table(data = grp_data2,index='ID',columns = 'type_of_day',values = i)\n",
1575 | " temp.columns = [i+ j for j in temp.columns]\n",
1576 | " temp.reset_index(inplace=True)\n",
1577 | " grp_data21 = pd.merge(grp_data21,temp,on='ID',how='left')"
1578 | ],
1579 | "execution_count": 0,
1580 | "outputs": []
1581 | },
1582 | {
1583 | "cell_type": "code",
1584 | "metadata": {
1585 | "trusted": true,
1586 | "id": "otD9qfeVnDff",
1587 | "colab_type": "code",
1588 | "colab": {}
1589 | },
1590 | "source": [
1591 | "grp_data21_col = list(grp_data21.columns)"
1592 | ],
1593 | "execution_count": 0,
1594 | "outputs": []
1595 | },
1596 | {
1597 | "cell_type": "code",
1598 | "metadata": {
1599 | "trusted": true,
1600 | "id": "cEsFR3afnDfg",
1601 | "colab_type": "code",
1602 | "colab": {}
1603 | },
1604 | "source": [
1605 | "grp_data31 = pd.pivot_table(data = grp_data3,index='ID',columns = 'type_of_day1',values = 'temp_mean')\n",
1606 | "grp_data31.columns = ['temp_mean'+ i for i in grp_data31.columns]\n",
1607 | "grp_data31.reset_index(inplace=True)\n",
1608 | "for i in tqdm(grp_data3.drop(['ID','type_of_day1','temp_mean'],axis=1).columns):\n",
1609 | " temp = pd.pivot_table(data = grp_data3,index='ID',columns = 'type_of_day1',values = i)\n",
1610 | " temp.columns = [i+ j for j in temp.columns]\n",
1611 | " temp.reset_index(inplace=True)\n",
1612 | " grp_data31 = pd.merge(grp_data31,temp,on='ID',how='left')"
1613 | ],
1614 | "execution_count": 0,
1615 | "outputs": []
1616 | },
1617 | {
1618 | "cell_type": "code",
1619 | "metadata": {
1620 | "trusted": true,
1621 | "id": "kNik__8AnDfi",
1622 | "colab_type": "code",
1623 | "colab": {}
1624 | },
1625 | "source": [
1626 | "grp_data31_col = list(grp_data31.columns)"
1627 | ],
1628 | "execution_count": 0,
1629 | "outputs": []
1630 | },
1631 | {
1632 | "cell_type": "code",
1633 | "metadata": {
1634 | "trusted": true,
1635 | "id": "e5upctQonDfj",
1636 | "colab_type": "code",
1637 | "colab": {}
1638 | },
1639 | "source": [
1640 | "grp_data41 = pd.pivot_table(data = grp_data4,index='ID',columns = 'type_of_day2',values = 'temp_mean')\n",
1641 | "grp_data41.columns = ['temp_mean'+ i for i in grp_data41.columns]\n",
1642 | "grp_data41.reset_index(inplace=True)\n",
1643 | "for i in tqdm(grp_data4.drop(['ID','type_of_day2','temp_mean'],axis=1).columns):\n",
1644 | " temp = pd.pivot_table(data = grp_data4,index='ID',columns = 'type_of_day2',values = i)\n",
1645 | " temp.columns = [i+ j for j in temp.columns]\n",
1646 | " temp.reset_index(inplace=True)\n",
1647 | " grp_data41 = pd.merge(grp_data41,temp,on='ID',how='left')\n",
1648 | " "
1649 | ],
1650 | "execution_count": 0,
1651 | "outputs": []
1652 | },
1653 | {
1654 | "cell_type": "code",
1655 | "metadata": {
1656 | "trusted": true,
1657 | "id": "56CMRr_unDfk",
1658 | "colab_type": "code",
1659 | "colab": {}
1660 | },
1661 | "source": [
1662 | "grp_data41_col = list(grp_data41.columns)"
1663 | ],
1664 | "execution_count": 0,
1665 | "outputs": []
1666 | },
1667 | {
1668 | "cell_type": "code",
1669 | "metadata": {
1670 | "trusted": true,
1671 | "id": "4wXi_1_anDfl",
1672 | "colab_type": "code",
1673 | "colab": {}
1674 | },
1675 | "source": [
1676 | "grp_data51 = pd.pivot_table(data = grp_data5,index='ID',columns = 'type_of_day3',values = 'temp_mean')\n",
1677 | "grp_data51.columns = ['temp_mean'+ i for i in grp_data51.columns]\n",
1678 | "grp_data51.reset_index(inplace=True)\n",
1679 | "for i in tqdm(grp_data5.drop(['ID','type_of_day3','temp_mean'],axis=1).columns):\n",
1680 | " temp = pd.pivot_table(data = grp_data5,index='ID',columns = 'type_of_day3',values = i)\n",
1681 | " temp.columns = [i+ j for j in temp.columns]\n",
1682 | " temp.reset_index(inplace=True)\n",
1683 | " grp_data51 = pd.merge(grp_data51,temp,on='ID',how='left')\n",
1684 | " "
1685 | ],
1686 | "execution_count": 0,
1687 | "outputs": []
1688 | },
1689 | {
1690 | "cell_type": "code",
1691 | "metadata": {
1692 | "trusted": true,
1693 | "id": "Yf3BFI8bnDfm",
1694 | "colab_type": "code",
1695 | "colab": {}
1696 | },
1697 | "source": [
1698 | "grp_data51_col = list(grp_data51.columns)"
1699 | ],
1700 | "execution_count": 0,
1701 | "outputs": []
1702 | },
1703 | {
1704 | "cell_type": "code",
1705 | "metadata": {
1706 | "trusted": true,
1707 | "id": "p6hV54JvnDfo",
1708 | "colab_type": "code",
1709 | "colab": {}
1710 | },
1711 | "source": [
1712 | "grp_data_all = pd.merge(grp_data11,grp_data21,on='ID')\n",
1713 | "grp_data_all = pd.merge(grp_data_all,grp_data31,on='ID')\n",
1714 | "grp_data_all = pd.merge(grp_data_all,grp_data41,on='ID')\n",
1715 | "grp_data_all = pd.merge(grp_data_all,grp_data51,on='ID')\n",
1716 | "grp_data_all.shape"
1717 | ],
1718 | "execution_count": 0,
1719 | "outputs": []
1720 | },
1721 | {
1722 | "cell_type": "code",
1723 | "metadata": {
1724 | "trusted": true,
1725 | "id": "eYYTPQE7nDfp",
1726 | "colab_type": "code",
1727 | "colab": {}
1728 | },
1729 | "source": [
1730 | "train_df = pd.merge(train,grp_data_all,on='ID',how='left')\n",
1731 | "test_df = pd.merge(test,grp_data_all,on='ID',how='left')\n",
1732 | "train_df.shape,test_df.shape,train.shape,test.shape"
1733 | ],
1734 | "execution_count": 0,
1735 | "outputs": []
1736 | },
1737 | {
1738 | "cell_type": "code",
1739 | "metadata": {
1740 | "trusted": true,
1741 | "id": "Vo3f1vLunDfq",
1742 | "colab_type": "code",
1743 | "colab": {}
1744 | },
1745 | "source": [
1746 | "train_df.to_csv('final_train_df.csv',index=False)\n",
1747 | "test_df.to_csv('final_test_df.csv',index=False)"
1748 | ],
1749 | "execution_count": 0,
1750 | "outputs": []
1751 | },
1752 | {
1753 | "cell_type": "code",
1754 | "metadata": {
1755 | "trusted": true,
1756 | "id": "oVRYGGGjnDfq",
1757 | "colab_type": "code",
1758 | "colab": {}
1759 | },
1760 | "source": [
1761 | "pd.DataFrame(rem_org).to_csv('rem_org.csv',index=False)\n",
1762 | "pd.DataFrame(grp_data11_col).to_csv('grp_data11_col.csv',index=False)\n",
1763 | "pd.DataFrame(grp_data21_col).to_csv('grp_data21_col.csv',index=False)\n",
1764 | "pd.DataFrame(grp_data31_col).to_csv('grp_data31_col.csv',index=False)\n",
1765 | "pd.DataFrame(grp_data41_col).to_csv('grp_data41_col.csv',index=False)\n",
1766 | "pd.DataFrame(grp_data51_col).to_csv('grp_data51_col.csv',index=False)\n"
1767 | ],
1768 | "execution_count": 0,
1769 | "outputs": []
1770 | },
1771 | {
1772 | "cell_type": "code",
1773 | "metadata": {
1774 | "trusted": true,
1775 | "id": "adpmLxbSnDfu",
1776 | "colab_type": "code",
1777 | "outputId": "bfeceb2f-8f95-4b86-8ba9-6f355a888f95",
1778 | "colab": {}
1779 | },
1780 | "source": [
1781 | "train_df.shape,test_df.shape,train.shape,test.shape"
1782 | ],
1783 | "execution_count": 0,
1784 | "outputs": [
1785 | {
1786 | "output_type": "execute_result",
1787 | "data": {
1788 | "text/plain": [
1789 | "((15539, 3087), (5035, 3086), (15539, 843), (5035, 842))"
1790 | ]
1791 | },
1792 | "metadata": {
1793 | "tags": []
1794 | },
1795 | "execution_count": 102
1796 | }
1797 | ]
1798 | }
1799 | ]
1800 | }
--------------------------------------------------------------------------------
/indianda.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "kernelspec": {
6 | "display_name": "Python 3",
7 | "language": "python",
8 | "name": "python3"
9 | },
10 | "language_info": {
11 | "codemirror_mode": {
12 | "name": "ipython",
13 | "version": 3
14 | },
15 | "file_extension": ".py",
16 | "mimetype": "text/x-python",
17 | "name": "python",
18 | "nbconvert_exporter": "python",
19 | "pygments_lexer": "ipython3",
20 | "version": "3.7.6"
21 | },
22 | "colab": {
23 | "name": "F7_33_29.ipynb",
24 | "provenance": [],
25 | "collapsed_sections": [],
26 | "include_colab_link": true
27 | }
28 | },
29 | "cells": [
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {
33 | "id": "view-in-github",
34 | "colab_type": "text"
35 | },
36 | "source": [
  37 |         ""
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "metadata": {
43 | "id": "28tisEbR66ru",
44 | "colab_type": "code",
45 | "outputId": "95b52c1b-9542-455c-bc29-d52b6a7a5c64",
46 | "colab": {
47 | "base_uri": "https://localhost:8080/",
48 | "height": 343
49 | }
50 | },
51 | "source": [
52 | "pip install catboost"
53 | ],
54 | "execution_count": 0,
55 | "outputs": [
56 | {
57 | "output_type": "stream",
58 | "text": [
59 | "Collecting catboost\n",
60 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/b2/aa/e61819d04ef2bbee778bf4b3a748db1f3ad23512377e43ecfdc3211437a0/catboost-0.23.2-cp36-none-manylinux1_x86_64.whl (64.8MB)\n",
61 | "\u001b[K |████████████████████████████████| 64.8MB 122kB/s \n",
62 | "\u001b[?25hRequirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.6/dist-packages (from catboost) (1.0.3)\n",
63 | "Requirement already satisfied: plotly in /usr/local/lib/python3.6/dist-packages (from catboost) (4.4.1)\n",
64 | "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from catboost) (1.12.0)\n",
65 | "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from catboost) (1.4.1)\n",
66 | "Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from catboost) (1.18.4)\n",
67 | "Requirement already satisfied: graphviz in /usr/local/lib/python3.6/dist-packages (from catboost) (0.10.1)\n",
68 | "Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from catboost) (3.2.1)\n",
69 | "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.24.0->catboost) (2018.9)\n",
70 | "Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.24.0->catboost) (2.8.1)\n",
71 | "Requirement already satisfied: retrying>=1.3.3 in /usr/local/lib/python3.6/dist-packages (from plotly->catboost) (1.3.3)\n",
72 | "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (0.10.0)\n",
73 | "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (1.2.0)\n",
74 | "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (2.4.7)\n",
75 | "Installing collected packages: catboost\n",
76 | "Successfully installed catboost-0.23.2\n"
77 | ],
78 | "name": "stdout"
79 | }
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "metadata": {
85 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
86 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
87 | "id": "Bt7cbLC6m9NS",
88 | "colab_type": "code",
89 | "colab": {}
90 | },
91 | "source": [
92 | "import pandas as pd\n",
93 | "import numpy as np\n",
94 | "import matplotlib.pyplot as plt\n",
95 | "import seaborn as sns\n",
96 | "import warnings\n",
97 | "warnings.filterwarnings('ignore')\n",
98 | "\n",
99 | "from math import sqrt \n",
100 | "import lightgbm as lgb\n",
101 | "from sklearn.metrics import mean_squared_error \n",
102 | "from sklearn.model_selection import KFold, train_test_split\n",
103 | "\n",
104 | "\n",
105 | "from scipy.sparse import csr_matrix\n",
106 | "import gc\n",
107 | "\n",
108 | "import pandas as pd\n",
109 | "import numpy as np\n",
110 | "import matplotlib.pyplot as plt\n",
111 | "import seaborn as sns\n",
112 | "from xgboost import XGBRegressor\n",
113 | "from catboost import CatBoostRegressor\n",
114 | "from lightgbm import LGBMRegressor\n",
115 | "from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,BaggingRegressor,AdaBoostRegressor,ExtraTreesRegressor\n",
116 | "from sklearn.tree import DecisionTreeRegressor\n",
117 | "from sklearn.linear_model import LinearRegression\n",
118 | "from sklearn.ensemble import StackingRegressor\n",
119 | "#from ngboost import NGBRegressor\n",
120 | "from sklearn.metrics import mean_squared_log_error\n",
121 | "from sklearn.preprocessing import LabelEncoder\n",
122 | "from sklearn.model_selection import KFold,StratifiedKFold, GroupKFold\n",
123 | "import datetime\n",
124 | "from statsmodels.graphics.tsaplots import plot_acf, plot_pacf\n",
125 | "# from fastai.tabular import *\n",
126 | "import warnings\n",
127 | "warnings.filterwarnings(\"ignore\")\n",
128 | "np.random.seed(0)\n",
129 | "pd.set_option('display.max_rows', 500)\n",
130 | "pd.set_option('display.max_columns', 500)\n",
131 | "pd.set_option('display.width', 1000)"
132 | ],
133 | "execution_count": 0,
134 | "outputs": []
135 | },
136 | {
137 | "cell_type": "code",
138 | "metadata": {
139 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
140 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
141 | "id": "iIXDQzmam9NZ",
142 | "colab_type": "code",
143 | "colab": {}
144 | },
145 | "source": [
146 | "# Memory reduction helper function:\n",
147 | "def reduce_mem_usage(df, verbose=True):\n",
148 | " numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n",
149 | " start_mem = df.memory_usage().sum() / 1024**2 \n",
150 | " for col in df.columns: #columns\n",
151 | " col_type = df[col].dtypes\n",
152 | " if col_type in numerics: #numerics\n",
153 | " c_min = df[col].min()\n",
154 | " c_max = df[col].max()\n",
155 | " if str(col_type)[:3] == 'int':\n",
156 | " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
157 | " df[col] = df[col].astype(np.int8)\n",
158 | " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
159 | " df[col] = df[col].astype(np.int16)\n",
160 | " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
161 | " df[col] = df[col].astype(np.int32)\n",
162 | " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
163 | " df[col] = df[col].astype(np.int64) \n",
164 | " else:\n",
165 | " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
166 | " df[col] = df[col].astype(np.float16)\n",
167 | " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
168 | " df[col] = df[col].astype(np.float32)\n",
169 | " else:\n",
170 | " df[col] = df[col].astype(np.float64) \n",
171 | " end_mem = df.memory_usage().sum() / 1024**2\n",
172 | " if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))\n",
173 | " return df"
174 | ],
175 | "execution_count": 0,
176 | "outputs": []
177 | },
178 | {
179 | "cell_type": "code",
180 | "metadata": {
181 | "id": "o0q54GGtm9Nc",
182 | "colab_type": "code",
183 | "colab": {}
184 | },
185 | "source": [
186 | "from IPython.display import HTML\n",
187 | "import pandas as pd\n",
188 | "import numpy as np\n",
189 | "import base64\n",
190 | "\n",
191 | "# download it (will only work for files < 2MB or so)\n",
192 | "def create_download_link(df, title = \"Download CSV file\", filename = \"rf.csv\"): \n",
193 | " csv = df.to_csv(index=False)\n",
194 | " b64 = base64.b64encode(csv.encode())\n",
195 | " payload = b64.decode()\n",
 196 |         "    html = '<a download=\"{filename}\" href=\"data:text/csv;base64,{payload}\" target=\"_blank\">{title}</a>'\n",
197 | " html = html.format(payload=payload,title=title,filename=filename)\n",
198 | " return HTML(html)"
199 | ],
200 | "execution_count": 0,
201 | "outputs": []
202 | },
203 | {
204 | "cell_type": "code",
205 | "metadata": {
206 | "id": "16tEwM7Zy2LT",
207 | "colab_type": "code",
208 | "outputId": "72843c89-5503-46f7-83ce-cd9c47fc0d64",
209 | "colab": {
210 | "base_uri": "https://localhost:8080/",
211 | "height": 122
212 | }
213 | },
214 | "source": [
215 | "from google.colab import drive\n",
216 | "drive.mount('/content/drive')"
217 | ],
218 | "execution_count": 0,
219 | "outputs": [
220 | {
221 | "output_type": "stream",
222 | "text": [
223 | "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n",
224 | "\n",
225 | "Enter your authorization code:\n",
226 | "··········\n",
227 | "Mounted at /content/drive\n"
228 | ],
229 | "name": "stdout"
230 | }
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "metadata": {
236 | "id": "Ijyk-OtEm9Nf",
237 | "colab_type": "code",
238 | "colab": {}
239 | },
240 | "source": [
241 | "path = '/content/drive/My Drive/'"
242 | ],
243 | "execution_count": 0,
244 | "outputs": []
245 | },
246 | {
247 | "cell_type": "code",
248 | "metadata": {
249 | "id": "3KlNJQizm9Ni",
250 | "colab_type": "code",
251 | "outputId": "517007c7-f366-4887-8476-8be272e14b5c",
252 | "colab": {
253 | "base_uri": "https://localhost:8080/",
254 | "height": 34
255 | }
256 | },
257 | "source": [
258 | "train_df = pd.read_csv(path+'train_df.csv')\n",
259 | "test_df = pd.read_csv(path+'test_df.csv')\n",
260 | "train_df.shape,test_df.shape"
261 | ],
262 | "execution_count": 0,
263 | "outputs": [
264 | {
265 | "output_type": "execute_result",
266 | "data": {
267 | "text/plain": [
268 | "((15539, 3087), (5035, 3086))"
269 | ]
270 | },
271 | "metadata": {
272 | "tags": []
273 | },
274 | "execution_count": 48
275 | }
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "metadata": {
281 | "id": "UcJaZ8Gqm9Nq",
282 | "colab_type": "code",
283 | "colab": {}
284 | },
285 | "source": [
286 | "temp = [\"temp{}\".format(i) for i in range(121)]\n",
287 | "precip = [\"precip{}\".format(i) for i in range(121)]\n",
288 | "rel_humidity = [\"rel_humidity{}\".format(i) for i in range(121)]\n",
289 | "wind_dir = [\"wind_dir{}\".format(i) for i in range(121)]\n",
290 | "wind_spd = [\"wind_spd{}\".format(i) for i in range(121)]\n",
291 | "atmos_press = [\"atmos_press{}\".format(i) for i in range(121)]"
292 | ],
293 | "execution_count": 0,
294 | "outputs": []
295 | },
296 | {
297 | "cell_type": "code",
298 | "metadata": {
299 | "id": "5e37zAVpm9Nu",
300 | "colab_type": "code",
301 | "colab": {}
302 | },
303 | "source": [
304 | "grp_data11_col = pd.read_csv(path+'grp_data11_col.csv')\n",
305 | "grp_data11_col = list(grp_data11_col['0'].values)\n",
306 | "\n",
307 | "grp_data21_col = pd.read_csv(path+'grp_data21_col.csv')\n",
308 | "grp_data21_col = list(grp_data21_col['0'].values)\n",
309 | "\n",
310 | "grp_data31_col = pd.read_csv(path+'grp_data31_col.csv')\n",
311 | "grp_data31_col = list(grp_data31_col['0'].values)\n",
312 | "\n",
313 | "grp_data41_col = pd.read_csv(path+'grp_data41_col.csv')\n",
314 | "grp_data41_col = list(grp_data41_col['0'].values)\n",
315 | "\n",
316 | "grp_data51_col = pd.read_csv(path+'grp_data51_col.csv')\n",
317 | "grp_data51_col = list(grp_data51_col['0'].values)\n",
318 | "\n",
319 | "rem_org = pd.read_csv(path+'rem_org.csv')\n",
320 | "rem_org = list(rem_org['0'].values)"
321 | ],
322 | "execution_count": 0,
323 | "outputs": []
324 | },
325 | {
326 | "cell_type": "code",
327 | "metadata": {
328 | "id": "kwukjWmwm9Nz",
329 | "colab_type": "code",
330 | "outputId": "1c245c4e-5a90-4f1d-beac-d1eb85dd94c3",
331 | "colab": {
332 | "base_uri": "https://localhost:8080/",
333 | "height": 34
334 | }
335 | },
336 | "source": [
337 | "len(rem_org),len(grp_data11_col),len(grp_data21_col),len(grp_data31_col),len(grp_data41_col),len(grp_data51_col)"
338 | ],
339 | "execution_count": 0,
340 | "outputs": [
341 | {
342 | "output_type": "execute_result",
343 | "data": {
344 | "text/plain": [
345 | "(116, 511, 409, 817, 307, 205)"
346 | ]
347 | },
348 | "metadata": {
349 | "tags": []
350 | },
351 | "execution_count": 52
352 | }
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "metadata": {
358 | "id": "LEvsmujUm9N4",
359 | "colab_type": "code",
360 | "colab": {}
361 | },
362 | "source": [
363 | "f1 = list(set(temp + precip + rel_humidity + wind_dir + wind_spd + atmos_press + rem_org)) # orig\n",
364 | "f2 = list(set(f1 + grp_data11_col)) #day\n",
365 | "f3 = list(set(f1 + grp_data21_col)) #typeofday\n",
366 | "f4 = list(set(f1 + grp_data31_col)) #type_of_day1\n",
367 | "f5 = list(set(f1 + grp_data41_col)) #type_of_day2\n",
368 | "f6 = list(set(f1 + grp_data51_col)) #type_of_day3\n",
369 | "f7 = list(set(f1 + f3 + f5)) #type_of_day day_type_of_day2\n",
370 | "f8 = list(set(f1 + f3 + f6)) #type_of_day day_type_of_day3\n",
371 | "f9 = list(set(f1 + f4 + f5)) #type_of_day1 day_type_of_day2\n",
372 | "f10 = list(set(f1 + f4 + f6)) #type_of_day1 day_type_of_day3\n",
373 | "f11 = list(set(f1 + f5 + f6)) #type_of_day2 day_type_of_day3\n",
374 | "f12 = list(set(f1 + f3+ f5 + f6)) #type_of_day type_of_day2 day_type_of_day3\n",
375 | "f13 = list(set(f1 + f4+ f5 + f6)) #type_of_day1 type_of_day2 day_type_of_day3\n",
376 | "f14 = list(set(f1 +f2+ f4+ f5 + f3 + f6)) #all groups: day type_of_day type_of_day1 type_of_day2 type_of_day3"
377 | ],
378 | "execution_count": 0,
379 | "outputs": []
380 | },
381 | {
382 | "cell_type": "code",
383 | "metadata": {
384 | "id": "5Hb8Jukxm9N6",
385 | "colab_type": "code",
386 | "outputId": "4ea33593-48c8-4634-e69b-8f44b6106567",
387 | "colab": {
388 | "base_uri": "https://localhost:8080/",
389 | "height": 34
390 | }
391 | },
392 | "source": [
393 | "train = train_df[f7+['target']]\n",
394 | "test = test_df[f7]\n",
395 | "train.shape,test.shape"
396 | ],
397 | "execution_count": 0,
398 | "outputs": [
399 | {
400 | "output_type": "execute_result",
401 | "data": {
402 | "text/plain": [
403 | "((15539, 1557), (5035, 1556))"
404 | ]
405 | },
406 | "metadata": {
407 | "tags": []
408 | },
409 | "execution_count": 109
410 | }
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "metadata": {
416 | "id": "jfT0g2KSm9OA",
417 | "colab_type": "code",
418 | "outputId": "3d45143c-ffb8-4b37-867e-a81282d9a584",
419 | "colab": {
420 | "base_uri": "https://localhost:8080/",
421 | "height": 34
422 | }
423 | },
424 | "source": [
425 | "train_id = train['ID']\n",
426 | "test_id = test['ID']\n",
427 | "y = train['target']\n",
428 | "\n",
429 | "train.drop(['ID','target'],axis=1,inplace=True)\n",
430 | "test.drop('ID',axis=1,inplace=True)\n",
431 | "train.shape,test.shape"
432 | ],
433 | "execution_count": 0,
434 | "outputs": [
435 | {
436 | "output_type": "execute_result",
437 | "data": {
438 | "text/plain": [
439 | "((15539, 1555), (5035, 1555))"
440 | ]
441 | },
442 | "metadata": {
443 | "tags": []
444 | },
445 | "execution_count": 110
446 | }
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "metadata": {
452 | "id": "6FPdKg3d5xsq",
453 | "colab_type": "code",
454 | "colab": {}
455 | },
456 | "source": [
457 | "forecastwindte=pd.read_csv('/content/winddirforecasttest ADD THOSE FEATURES TO TEST.csv')\n",
458 | "forecastwindtr=pd.read_csv('/content/winddirforecasttrain ADD THOSE COLUMNS TO TRAIN.csv')"
459 | ],
460 | "execution_count": 0,
461 | "outputs": []
462 | },
463 | {
464 | "cell_type": "code",
465 | "metadata": {
466 | "id": "liV-ze6x1EZe",
467 | "colab_type": "code",
468 | "colab": {}
469 | },
470 | "source": [
471 | "X = train.copy()\n",
472 | "Xtest = test.copy()"
473 | ],
474 | "execution_count": 0,
475 | "outputs": []
476 | },
477 | {
478 | "cell_type": "code",
479 | "metadata": {
480 | "id": "W9Lwdhd3pv4M",
481 | "colab_type": "code",
482 | "colab": {}
483 | },
484 | "source": [
485 | "for i in forecastwindte.columns :\n",
486 | " X[i]=forecastwindtr[i]\n",
487 | " Xtest[i]=forecastwindte[i]"
488 | ],
489 | "execution_count": 0,
490 | "outputs": []
491 | },
492 | {
493 | "cell_type": "code",
494 | "metadata": {
495 | "id": "jyHCFKUBm9OI",
496 | "colab_type": "code",
497 | "outputId": "622d0c9c-1098-4396-8887-1191d120d292",
498 | "colab": {
499 | "base_uri": "https://localhost:8080/",
500 | "height": 51
501 | }
502 | },
503 | "source": [
504 | "%%time\n",
505 | "from lightgbm import LGBMRegressor\n",
506 | "lgb = LGBMRegressor()\n",
507 | "lgb.fit(X,y)\n",
508 | "\n",
509 | "imp = pd.DataFrame(lgb.feature_importances_,index=X.columns)\n",
510 | "l = list(imp[imp[0]>3].index)\n",
511 | "len(l)"
512 | ],
513 | "execution_count": 0,
514 | "outputs": [
515 | {
516 | "output_type": "stream",
517 | "text": [
518 | "CPU times: user 1min 10s, sys: 119 ms, total: 1min 11s\n",
519 | "Wall time: 36.8 s\n"
520 | ],
521 | "name": "stdout"
522 | }
523 | ]
524 | },
525 | {
526 | "cell_type": "code",
527 | "metadata": {
528 | "id": "o4AbdDPW7-62",
529 | "colab_type": "code",
530 | "outputId": "f2eb1295-08a0-4018-b962-f92967e6d81f",
531 | "colab": {
532 | "base_uri": "https://localhost:8080/",
533 | "height": 34
534 | }
535 | },
536 | "source": [
537 | "len(l),X.shape"
538 | ],
539 | "execution_count": 0,
540 | "outputs": [
541 | {
542 | "output_type": "execute_result",
543 | "data": {
544 | "text/plain": [
545 | "(251, (15539, 1573))"
546 | ]
547 | },
548 | "metadata": {
549 | "tags": []
550 | },
551 | "execution_count": 115
552 | }
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "metadata": {
558 | "id": "XFezR9QtJr03",
559 | "colab_type": "code",
560 | "outputId": "1c23c1af-cbe0-4a21-92a1-1070abd88fd6",
561 | "colab": {
562 | "base_uri": "https://localhost:8080/",
563 | "height": 34
564 | }
565 | },
566 | "source": [
567 | "X[l].shape"
568 | ],
569 | "execution_count": 0,
570 | "outputs": [
571 | {
572 | "output_type": "execute_result",
573 | "data": {
574 | "text/plain": [
575 | "(15539, 251)"
576 | ]
577 | },
578 | "metadata": {
579 | "tags": []
580 | },
581 | "execution_count": 116
582 | }
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "metadata": {
588 | "id": "cJBxSH73m9OL",
589 | "colab_type": "code",
590 | "outputId": "47da5b3d-1904-47c4-a9a8-d9f13ad871d7",
591 | "colab": {
592 | "base_uri": "https://localhost:8080/",
593 | "height": 1000
594 | }
595 | },
596 | "source": [
597 | "from catboost import CatBoostRegressor\n",
598 | "errcb2=[]\n",
599 | "y_pred_totcb2=[]\n",
600 | "from sklearn.model_selection import KFold,StratifiedKFold, TimeSeriesSplit\n",
601 | "from sklearn.metrics import mean_squared_error\n",
602 | "fold=KFold(n_splits=20)#15#5#10\n",
603 | "i=1\n",
604 | "for train_index, test_index in fold.split(X,y):\n",
605 | " X_train, X_test = X[l].values[train_index], X[l].values[test_index]\n",
606 | " y_train, y_test = y.values[train_index], y.values[test_index]\n",
607 | " m2 = CatBoostRegressor(n_estimators=5000,eval_metric='RMSE',learning_rate=0.175, random_seed= 42, use_best_model=True )\n",
608 | " m2.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=200,verbose=200)\n",
609 | " preds=m2.predict(X_test)\n",
610 | " print(\"err: \",np.sqrt(mean_squared_error(y_test,preds)))\n",
611 | " errcb2.append(np.sqrt(mean_squared_error(y_test,preds)))\n",
612 | " p2 = m2.predict(Xtest[l])\n",
613 | " y_pred_totcb2.append(p2)\n",
614 | "np.mean(errcb2)"
615 | ],
616 | "execution_count": 0,
617 | "outputs": [
618 | {
619 | "output_type": "stream",
620 | "text": [
621 | "0:\tlearn: 40.0876723\ttest: 40.0876723\ttest1: 40.8671914\tbest: 40.8671914 (0)\ttotal: 82.9ms\tremaining: 6m 54s\n",
622 | "200:\tlearn: 18.7261668\ttest: 18.7261668\ttest1: 24.3206390\tbest: 24.2671957 (195)\ttotal: 15.7s\tremaining: 6m 14s\n",
623 | "400:\tlearn: 14.0273746\ttest: 14.0273746\ttest1: 23.2844559\tbest: 23.2426862 (384)\ttotal: 31.2s\tremaining: 5m 58s\n",
624 | "600:\tlearn: 11.1481095\ttest: 11.1481095\ttest1: 22.9916124\tbest: 22.9214612 (544)\ttotal: 46.8s\tremaining: 5m 42s\n",
625 | "800:\tlearn: 9.1067324\ttest: 9.1067324\ttest1: 22.9154239\tbest: 22.8916184 (796)\ttotal: 1m 2s\tremaining: 5m 26s\n",
626 | "1000:\tlearn: 7.5375543\ttest: 7.5375543\ttest1: 22.8136649\tbest: 22.8033906 (983)\ttotal: 1m 17s\tremaining: 5m 10s\n",
627 | "1200:\tlearn: 6.2964674\ttest: 6.2964674\ttest1: 22.7658360\tbest: 22.7564960 (1192)\ttotal: 1m 33s\tremaining: 4m 54s\n",
628 | "1400:\tlearn: 5.3083733\ttest: 5.3083733\ttest1: 22.7173545\tbest: 22.6998592 (1353)\ttotal: 1m 49s\tremaining: 4m 41s\n",
629 | "1600:\tlearn: 4.4913219\ttest: 4.4913219\ttest1: 22.6589477\tbest: 22.6589477 (1600)\ttotal: 2m 5s\tremaining: 4m 27s\n",
630 | "1800:\tlearn: 3.8271519\ttest: 3.8271519\ttest1: 22.6417856\tbest: 22.6403727 (1793)\ttotal: 2m 21s\tremaining: 4m 10s\n",
631 | "2000:\tlearn: 3.2675556\ttest: 3.2675556\ttest1: 22.6056521\tbest: 22.6013030 (1996)\ttotal: 2m 36s\tremaining: 3m 55s\n",
632 | "2200:\tlearn: 2.7916963\ttest: 2.7916963\ttest1: 22.5825413\tbest: 22.5472933 (2089)\ttotal: 2m 52s\tremaining: 3m 39s\n",
633 | "Stopped by overfitting detector (200 iterations wait)\n",
634 | "\n",
635 | "bestTest = 22.54729334\n",
636 | "bestIteration = 2089\n",
637 | "\n",
638 | "Shrink model to first 2090 iterations.\n",
639 | "err: 22.54729355867787\n",
640 | "0:\tlearn: 40.2335396\ttest: 40.2335396\ttest1: 37.5305630\tbest: 37.5305630 (0)\ttotal: 79.2ms\tremaining: 6m 35s\n",
641 | "200:\tlearn: 18.7921257\ttest: 18.7921257\ttest1: 22.5270938\tbest: 22.5066513 (193)\ttotal: 15.5s\tremaining: 6m 10s\n",
642 | "400:\tlearn: 14.1558333\ttest: 14.1558333\ttest1: 21.6030513\tbest: 21.5935892 (389)\ttotal: 30.9s\tremaining: 5m 54s\n",
643 | "600:\tlearn: 11.2241101\ttest: 11.2241101\ttest1: 21.2615330\tbest: 21.2415566 (587)\ttotal: 46.4s\tremaining: 5m 39s\n",
644 | "800:\tlearn: 9.1447559\ttest: 9.1447559\ttest1: 21.1092259\tbest: 21.0980380 (788)\ttotal: 1m 1s\tremaining: 5m 24s\n",
645 | "1000:\tlearn: 7.5405212\ttest: 7.5405212\ttest1: 20.9789300\tbest: 20.9686746 (996)\ttotal: 1m 17s\tremaining: 5m 9s\n",
646 | "1200:\tlearn: 6.2914202\ttest: 6.2914202\ttest1: 20.9586354\tbest: 20.9396207 (1183)\ttotal: 1m 32s\tremaining: 4m 53s\n",
647 | "1400:\tlearn: 5.2881884\ttest: 5.2881884\ttest1: 20.9648652\tbest: 20.9364999 (1228)\ttotal: 1m 48s\tremaining: 4m 38s\n",
648 | "Stopped by overfitting detector (200 iterations wait)\n",
649 | "\n",
650 | "bestTest = 20.93649987\n",
651 | "bestIteration = 1228\n",
652 | "\n",
653 | "Shrink model to first 1229 iterations.\n",
654 | "err: 20.936499717501444\n",
655 | "0:\tlearn: 40.2699643\ttest: 40.2699643\ttest1: 40.8415667\tbest: 40.8415667 (0)\ttotal: 76.5ms\tremaining: 6m 22s\n",
656 | "200:\tlearn: 18.6753351\ttest: 18.6753351\ttest1: 27.4187183\tbest: 27.4088922 (193)\ttotal: 15.5s\tremaining: 6m 10s\n",
657 | "400:\tlearn: 13.9656204\ttest: 13.9656204\ttest1: 26.2749454\tbest: 26.2749454 (400)\ttotal: 31s\tremaining: 5m 55s\n",
658 | "600:\tlearn: 11.1377151\ttest: 11.1377151\ttest1: 25.7939350\tbest: 25.7758090 (597)\ttotal: 46.6s\tremaining: 5m 40s\n",
659 | "800:\tlearn: 9.0698954\ttest: 9.0698954\ttest1: 25.6473063\tbest: 25.6330944 (796)\ttotal: 1m 2s\tremaining: 5m 26s\n",
660 | "1000:\tlearn: 7.4735173\ttest: 7.4735173\ttest1: 25.6020918\tbest: 25.5582752 (956)\ttotal: 1m 17s\tremaining: 5m 10s\n",
661 | "1200:\tlearn: 6.2397293\ttest: 6.2397293\ttest1: 25.4641451\tbest: 25.4639263 (1199)\ttotal: 1m 33s\tremaining: 4m 55s\n",
662 | "1400:\tlearn: 5.2201605\ttest: 5.2201605\ttest1: 25.4301693\tbest: 25.4189535 (1337)\ttotal: 1m 48s\tremaining: 4m 39s\n",
663 | "1600:\tlearn: 4.4155659\ttest: 4.4155659\ttest1: 25.4454091\tbest: 25.3828358 (1479)\ttotal: 2m 4s\tremaining: 4m 24s\n",
664 | "Stopped by overfitting detector (200 iterations wait)\n",
665 | "\n",
666 | "bestTest = 25.38283585\n",
667 | "bestIteration = 1479\n",
668 | "\n",
669 | "Shrink model to first 1480 iterations.\n",
670 | "err: 25.382835740606435\n",
671 | "0:\tlearn: 40.0010233\ttest: 40.0010233\ttest1: 42.8422200\tbest: 42.8422200 (0)\ttotal: 77.1ms\tremaining: 6m 25s\n",
672 | "200:\tlearn: 18.5276182\ttest: 18.5276182\ttest1: 26.6084511\tbest: 26.5904701 (199)\ttotal: 15.5s\tremaining: 6m 11s\n",
673 | "400:\tlearn: 13.9511841\ttest: 13.9511841\ttest1: 25.2719486\tbest: 25.2719486 (400)\ttotal: 31s\tremaining: 5m 55s\n",
674 | "600:\tlearn: 11.0352126\ttest: 11.0352126\ttest1: 24.5883537\tbest: 24.5883537 (600)\ttotal: 46.5s\tremaining: 5m 40s\n",
675 | "800:\tlearn: 8.9755095\ttest: 8.9755095\ttest1: 24.3519888\tbest: 24.3087064 (740)\ttotal: 1m 1s\tremaining: 5m 24s\n",
676 | "1000:\tlearn: 7.4336900\ttest: 7.4336900\ttest1: 24.0587788\tbest: 24.0576351 (999)\ttotal: 1m 17s\tremaining: 5m 9s\n",
677 | "1200:\tlearn: 6.1865076\ttest: 6.1865076\ttest1: 23.8666247\tbest: 23.8666247 (1200)\ttotal: 1m 33s\tremaining: 4m 54s\n",
678 | "1400:\tlearn: 5.2120808\ttest: 5.2120808\ttest1: 23.8334914\tbest: 23.8274649 (1387)\ttotal: 1m 48s\tremaining: 4m 38s\n",
679 | "1600:\tlearn: 4.4131917\ttest: 4.4131917\ttest1: 23.7687539\tbest: 23.7601979 (1578)\ttotal: 2m 4s\tremaining: 4m 23s\n",
680 | "1800:\tlearn: 3.7598475\ttest: 3.7598475\ttest1: 23.7438139\tbest: 23.7426417 (1687)\ttotal: 2m 19s\tremaining: 4m 8s\n",
681 | "2000:\tlearn: 3.2096719\ttest: 3.2096719\ttest1: 23.6954691\tbest: 23.6877001 (1940)\ttotal: 2m 35s\tremaining: 3m 53s\n",
682 | "2200:\tlearn: 2.7333636\ttest: 2.7333636\ttest1: 23.6486949\tbest: 23.6438895 (2193)\ttotal: 2m 51s\tremaining: 3m 38s\n",
683 | "2400:\tlearn: 2.3576374\ttest: 2.3576374\ttest1: 23.6143687\tbest: 23.6135243 (2398)\ttotal: 3m 7s\tremaining: 3m 22s\n",
684 | "2600:\tlearn: 2.0288923\ttest: 2.0288923\ttest1: 23.6012689\tbest: 23.6000219 (2593)\ttotal: 3m 22s\tremaining: 3m 7s\n",
685 | "2800:\tlearn: 1.7491326\ttest: 1.7491326\ttest1: 23.5908282\tbest: 23.5875003 (2782)\ttotal: 3m 38s\tremaining: 2m 51s\n",
686 | "3000:\tlearn: 1.5159515\ttest: 1.5159515\ttest1: 23.5843994\tbest: 23.5841580 (2971)\ttotal: 3m 54s\tremaining: 2m 36s\n",
687 | "3200:\tlearn: 1.3248261\ttest: 1.3248261\ttest1: 23.5876218\tbest: 23.5837679 (3127)\ttotal: 4m 9s\tremaining: 2m 20s\n",
688 | "3400:\tlearn: 1.1520374\ttest: 1.1520374\ttest1: 23.5751287\tbest: 23.5742021 (3396)\ttotal: 4m 25s\tremaining: 2m 4s\n",
689 | "3600:\tlearn: 1.0056445\ttest: 1.0056445\ttest1: 23.5690721\tbest: 23.5683305 (3566)\ttotal: 4m 41s\tremaining: 1m 49s\n",
690 | "3800:\tlearn: 0.8768814\ttest: 0.8768814\ttest1: 23.5550729\tbest: 23.5541394 (3796)\ttotal: 4m 58s\tremaining: 1m 34s\n",
691 | "4000:\tlearn: 0.7579140\ttest: 0.7579140\ttest1: 23.5492676\tbest: 23.5476309 (3974)\ttotal: 5m 14s\tremaining: 1m 18s\n",
692 | "4200:\tlearn: 0.6576364\ttest: 0.6576364\ttest1: 23.5353984\tbest: 23.5342956 (4181)\ttotal: 5m 31s\tremaining: 1m 2s\n",
693 | "4400:\tlearn: 0.5711441\ttest: 0.5711441\ttest1: 23.5296140\tbest: 23.5290636 (4375)\ttotal: 5m 48s\tremaining: 47.4s\n",
694 | "4600:\tlearn: 0.5002357\ttest: 0.5002357\ttest1: 23.5301061\tbest: 23.5279730 (4528)\ttotal: 6m 3s\tremaining: 31.6s\n",
695 | "Stopped by overfitting detector (200 iterations wait)\n",
696 | "\n",
697 | "bestTest = 23.52797302\n",
698 | "bestIteration = 4528\n",
699 | "\n",
700 | "Shrink model to first 4529 iterations.\n",
701 | "err: 23.527972862069507\n",
702 | "0:\tlearn: 40.1718664\ttest: 40.1718664\ttest1: 41.1290550\tbest: 41.1290550 (0)\ttotal: 77.2ms\tremaining: 6m 26s\n",
703 | "200:\tlearn: 18.4836731\ttest: 18.4836731\ttest1: 25.4569672\tbest: 25.4569672 (200)\ttotal: 15.5s\tremaining: 6m 10s\n",
704 | "400:\tlearn: 13.9355300\ttest: 13.9355300\ttest1: 24.4773893\tbest: 24.4763157 (392)\ttotal: 31.2s\tremaining: 5m 57s\n",
705 | "600:\tlearn: 11.0812480\ttest: 11.0812480\ttest1: 23.9179170\tbest: 23.9087132 (596)\ttotal: 46.7s\tremaining: 5m 41s\n",
706 | "800:\tlearn: 9.0671201\ttest: 9.0671201\ttest1: 23.4813391\tbest: 23.4790011 (796)\ttotal: 1m 2s\tremaining: 5m 26s\n",
707 | "1000:\tlearn: 7.4996077\ttest: 7.4996077\ttest1: 23.3210801\tbest: 23.2980099 (978)\ttotal: 1m 17s\tremaining: 5m 10s\n",
708 | "1200:\tlearn: 6.2619113\ttest: 6.2619113\ttest1: 23.0835750\tbest: 23.0835750 (1200)\ttotal: 1m 33s\tremaining: 4m 55s\n",
709 | "1400:\tlearn: 5.2647254\ttest: 5.2647254\ttest1: 23.0231194\tbest: 23.0185763 (1397)\ttotal: 1m 49s\tremaining: 4m 40s\n",
710 | "1600:\tlearn: 4.4365986\ttest: 4.4365986\ttest1: 22.8960975\tbest: 22.8952064 (1599)\ttotal: 2m 4s\tremaining: 4m 24s\n",
711 | "1800:\tlearn: 3.7980562\ttest: 3.7980562\ttest1: 22.8099496\tbest: 22.8013149 (1793)\ttotal: 2m 20s\tremaining: 4m 8s\n",
712 | "2000:\tlearn: 3.2304707\ttest: 3.2304707\ttest1: 22.7436463\tbest: 22.7402474 (1986)\ttotal: 2m 35s\tremaining: 3m 53s\n",
713 | "2200:\tlearn: 2.7697644\ttest: 2.7697644\ttest1: 22.7045545\tbest: 22.7005346 (2180)\ttotal: 2m 51s\tremaining: 3m 38s\n",
714 | "2400:\tlearn: 2.3801329\ttest: 2.3801329\ttest1: 22.6786484\tbest: 22.6739218 (2396)\ttotal: 3m 7s\tremaining: 3m 22s\n",
715 | "2600:\tlearn: 2.0552460\ttest: 2.0552460\ttest1: 22.6356006\tbest: 22.6291453 (2583)\ttotal: 3m 22s\tremaining: 3m 6s\n",
716 | "2800:\tlearn: 1.7799421\ttest: 1.7799421\ttest1: 22.5960161\tbest: 22.5948527 (2787)\ttotal: 3m 38s\tremaining: 2m 51s\n",
717 | "3000:\tlearn: 1.5323343\ttest: 1.5323343\ttest1: 22.5770603\tbest: 22.5754886 (2946)\ttotal: 3m 53s\tremaining: 2m 35s\n",
718 | "3200:\tlearn: 1.3334044\ttest: 1.3334044\ttest1: 22.5594997\tbest: 22.5587520 (3148)\ttotal: 4m 9s\tremaining: 2m 20s\n",
719 | "3400:\tlearn: 1.1560733\ttest: 1.1560733\ttest1: 22.5518747\tbest: 22.5495954 (3345)\ttotal: 4m 25s\tremaining: 2m 4s\n",
720 | "3600:\tlearn: 1.0070794\ttest: 1.0070794\ttest1: 22.5210004\tbest: 22.5210004 (3600)\ttotal: 4m 40s\tremaining: 1m 49s\n",
721 | "3800:\tlearn: 0.8806402\ttest: 0.8806402\ttest1: 22.5110279\tbest: 22.5082813 (3784)\ttotal: 4m 56s\tremaining: 1m 33s\n",
722 | "4000:\tlearn: 0.7648709\ttest: 0.7648709\ttest1: 22.5080329\tbest: 22.5075276 (3993)\ttotal: 5m 12s\tremaining: 1m 17s\n",
723 | "4200:\tlearn: 0.6667834\ttest: 0.6667834\ttest1: 22.5034791\tbest: 22.5006112 (4108)\ttotal: 5m 29s\tremaining: 1m 2s\n",
724 | "Stopped by overfitting detector (200 iterations wait)\n",
725 | "\n",
726 | "bestTest = 22.50061121\n",
727 | "bestIteration = 4108\n",
728 | "\n",
729 | "Shrink model to first 4109 iterations.\n",
730 | "err: 22.500611177040856\n",
731 | "0:\tlearn: 40.3333876\ttest: 40.3333876\ttest1: 37.7283832\tbest: 37.7283832 (0)\ttotal: 77.2ms\tremaining: 6m 25s\n",
732 | "200:\tlearn: 18.5427660\ttest: 18.5427660\ttest1: 24.1868932\tbest: 24.1868932 (200)\ttotal: 15.6s\tremaining: 6m 12s\n",
733 | "400:\tlearn: 14.0666296\ttest: 14.0666296\ttest1: 23.2303523\tbest: 23.2043217 (382)\ttotal: 31s\tremaining: 5m 55s\n",
734 | "600:\tlearn: 11.2061832\ttest: 11.2061832\ttest1: 22.7633444\tbest: 22.7618894 (589)\ttotal: 46.5s\tremaining: 5m 40s\n",
735 | "800:\tlearn: 9.0756666\ttest: 9.0756666\ttest1: 22.5403978\tbest: 22.5362058 (799)\ttotal: 1m 2s\tremaining: 5m 25s\n",
736 | "1000:\tlearn: 7.4452019\ttest: 7.4452019\ttest1: 22.4006724\tbest: 22.3917867 (972)\ttotal: 1m 17s\tremaining: 5m 10s\n",
737 | "1200:\tlearn: 6.2142752\ttest: 6.2142752\ttest1: 22.3990171\tbest: 22.3441165 (1105)\ttotal: 1m 33s\tremaining: 4m 55s\n",
738 | "1400:\tlearn: 5.2024268\ttest: 5.2024268\ttest1: 22.3002244\tbest: 22.2999298 (1392)\ttotal: 1m 49s\tremaining: 4m 40s\n",
739 | "1600:\tlearn: 4.3919601\ttest: 4.3919601\ttest1: 22.2671572\tbest: 22.2620270 (1555)\ttotal: 2m 4s\tremaining: 4m 24s\n",
740 | "1800:\tlearn: 3.7456035\ttest: 3.7456035\ttest1: 22.2398679\tbest: 22.2380782 (1792)\ttotal: 2m 20s\tremaining: 4m 9s\n",
741 | "2000:\tlearn: 3.1848984\ttest: 3.1848984\ttest1: 22.2351871\tbest: 22.2207260 (1898)\ttotal: 2m 36s\tremaining: 3m 53s\n",
742 | "2200:\tlearn: 2.7256742\ttest: 2.7256742\ttest1: 22.2073729\tbest: 22.2030415 (2179)\ttotal: 2m 51s\tremaining: 3m 38s\n",
743 | "2400:\tlearn: 2.3578896\ttest: 2.3578896\ttest1: 22.1812838\tbest: 22.1704177 (2369)\ttotal: 3m 8s\tremaining: 3m 24s\n",
744 | "Stopped by overfitting detector (200 iterations wait)\n",
745 | "\n",
746 | "bestTest = 22.17041767\n",
747 | "bestIteration = 2369\n",
748 | "\n",
749 | "Shrink model to first 2370 iterations.\n",
750 | "err: 22.170417520833542\n",
751 | "0:\tlearn: 40.1457357\ttest: 40.1457357\ttest1: 41.3746201\tbest: 41.3746201 (0)\ttotal: 76ms\tremaining: 6m 19s\n",
752 | "200:\tlearn: 18.8420693\ttest: 18.8420693\ttest1: 25.4619658\tbest: 25.4619658 (200)\ttotal: 15.5s\tremaining: 6m 9s\n",
753 | "400:\tlearn: 14.2407549\ttest: 14.2407549\ttest1: 24.6336516\tbest: 24.6336516 (400)\ttotal: 31s\tremaining: 5m 55s\n",
754 | "600:\tlearn: 11.2038467\ttest: 11.2038467\ttest1: 24.2377383\tbest: 24.2286849 (599)\ttotal: 46.5s\tremaining: 5m 40s\n",
755 | "800:\tlearn: 9.2022496\ttest: 9.2022496\ttest1: 24.1784844\tbest: 24.1441411 (779)\ttotal: 1m 2s\tremaining: 5m 25s\n",
756 | "1000:\tlearn: 7.5905481\ttest: 7.5905481\ttest1: 24.0048152\tbest: 24.0048152 (1000)\ttotal: 1m 17s\tremaining: 5m 9s\n",
757 | "1200:\tlearn: 6.2990399\ttest: 6.2990399\ttest1: 23.8977774\tbest: 23.8908460 (1198)\ttotal: 1m 33s\tremaining: 4m 54s\n",
758 | "1400:\tlearn: 5.2766369\ttest: 5.2766369\ttest1: 23.7992180\tbest: 23.7963697 (1398)\ttotal: 1m 48s\tremaining: 4m 39s\n",
759 | "1600:\tlearn: 4.4485887\ttest: 4.4485887\ttest1: 23.7374916\tbest: 23.7374916 (1600)\ttotal: 2m 4s\tremaining: 4m 23s\n",
760 | "1800:\tlearn: 3.7643137\ttest: 3.7643137\ttest1: 23.6571016\tbest: 23.6542180 (1786)\ttotal: 2m 19s\tremaining: 4m 8s\n",
761 | "2000:\tlearn: 3.2122577\ttest: 3.2122577\ttest1: 23.6518104\tbest: 23.6267640 (1845)\ttotal: 2m 35s\tremaining: 3m 53s\n",
762 | "Stopped by overfitting detector (200 iterations wait)\n",
763 | "\n",
764 | "bestTest = 23.62676395\n",
765 | "bestIteration = 1845\n",
766 | "\n",
767 | "Shrink model to first 1846 iterations.\n",
768 | "err: 23.626763973224094\n",
769 | "0:\tlearn: 40.0831508\ttest: 40.0831508\ttest1: 40.2759953\tbest: 40.2759953 (0)\ttotal: 76.9ms\tremaining: 6m 24s\n",
770 | "200:\tlearn: 18.5545627\ttest: 18.5545627\ttest1: 25.7563384\tbest: 25.7297570 (199)\ttotal: 15.7s\tremaining: 6m 14s\n",
771 | "400:\tlearn: 14.0818511\ttest: 14.0818511\ttest1: 24.6033475\tbest: 24.5966082 (399)\ttotal: 31.1s\tremaining: 5m 57s\n",
772 | "600:\tlearn: 11.1993655\ttest: 11.1993655\ttest1: 24.0628108\tbest: 24.0190008 (591)\ttotal: 46.7s\tremaining: 5m 41s\n",
773 | "800:\tlearn: 9.1069544\ttest: 9.1069544\ttest1: 23.8031567\tbest: 23.8022204 (758)\ttotal: 1m 2s\tremaining: 5m 26s\n",
774 | "1000:\tlearn: 7.5044513\ttest: 7.5044513\ttest1: 23.6677070\tbest: 23.6677070 (1000)\ttotal: 1m 17s\tremaining: 5m 11s\n",
775 | "1200:\tlearn: 6.2820095\ttest: 6.2820095\ttest1: 23.6479612\tbest: 23.5872076 (1140)\ttotal: 1m 33s\tremaining: 4m 56s\n",
776 | "Stopped by overfitting detector (200 iterations wait)\n",
777 | "\n",
778 | "bestTest = 23.58720765\n",
779 | "bestIteration = 1140\n",
780 | "\n",
781 | "Shrink model to first 1141 iterations.\n",
782 | "err: 23.587207766724326\n",
783 | "0:\tlearn: 40.1447603\ttest: 40.1447603\ttest1: 40.0493867\tbest: 40.0493867 (0)\ttotal: 80.4ms\tremaining: 6m 41s\n",
784 | "200:\tlearn: 18.5434333\ttest: 18.5434333\ttest1: 26.1527334\tbest: 26.1379897 (198)\ttotal: 15.6s\tremaining: 6m 11s\n",
785 | "400:\tlearn: 13.9166313\ttest: 13.9166313\ttest1: 24.9135312\tbest: 24.9033703 (390)\ttotal: 31.1s\tremaining: 5m 56s\n",
786 | "600:\tlearn: 11.0496011\ttest: 11.0496011\ttest1: 24.5134111\tbest: 24.5005027 (595)\ttotal: 46.6s\tremaining: 5m 41s\n",
787 | "800:\tlearn: 8.9726530\ttest: 8.9726530\ttest1: 24.1596452\tbest: 24.1402399 (776)\ttotal: 1m 2s\tremaining: 5m 26s\n",
788 | "1000:\tlearn: 7.4108886\ttest: 7.4108886\ttest1: 23.9606031\tbest: 23.9555570 (991)\ttotal: 1m 17s\tremaining: 5m 10s\n",
789 | "1200:\tlearn: 6.1814401\ttest: 6.1814401\ttest1: 23.8160093\tbest: 23.8134094 (1184)\ttotal: 1m 33s\tremaining: 4m 55s\n",
790 | "1400:\tlearn: 5.2224432\ttest: 5.2224432\ttest1: 23.7993644\tbest: 23.7636155 (1331)\ttotal: 1m 49s\tremaining: 4m 40s\n",
791 | "1600:\tlearn: 4.4005124\ttest: 4.4005124\ttest1: 23.7306468\tbest: 23.7220672 (1584)\ttotal: 2m 4s\tremaining: 4m 25s\n",
792 | "1800:\tlearn: 3.7469906\ttest: 3.7469906\ttest1: 23.7112497\tbest: 23.6937853 (1660)\ttotal: 2m 20s\tremaining: 4m 9s\n",
793 | "2000:\tlearn: 3.2017034\ttest: 3.2017034\ttest1: 23.6699849\tbest: 23.6622811 (1989)\ttotal: 2m 36s\tremaining: 3m 54s\n",
794 | "2200:\tlearn: 2.7308572\ttest: 2.7308572\ttest1: 23.6678084\tbest: 23.6493446 (2068)\ttotal: 2m 52s\tremaining: 3m 38s\n",
795 | "2400:\tlearn: 2.3556090\ttest: 2.3556090\ttest1: 23.6214624\tbest: 23.6193422 (2398)\ttotal: 3m 7s\tremaining: 3m 23s\n",
796 | "2600:\tlearn: 2.0327111\ttest: 2.0327111\ttest1: 23.6190175\tbest: 23.6180589 (2423)\ttotal: 3m 23s\tremaining: 3m 7s\n",
797 | "2800:\tlearn: 1.7515269\ttest: 1.7515269\ttest1: 23.6220019\tbest: 23.6106827 (2615)\ttotal: 3m 39s\tremaining: 2m 52s\n",
798 | "Stopped by overfitting detector (200 iterations wait)\n",
799 | "\n",
800 | "bestTest = 23.6106827\n",
801 | "bestIteration = 2615\n",
802 | "\n",
803 | "Shrink model to first 2616 iterations.\n",
804 | "err: 23.610682571289285\n",
805 | "0:\tlearn: 40.2442856\ttest: 40.2442856\ttest1: 40.2821978\tbest: 40.2821978 (0)\ttotal: 82.8ms\tremaining: 6m 54s\n",
806 | "200:\tlearn: 18.5585820\ttest: 18.5585820\ttest1: 25.1762684\tbest: 25.1686103 (199)\ttotal: 15.6s\tremaining: 6m 13s\n",
807 | "400:\tlearn: 14.0746033\ttest: 14.0746033\ttest1: 24.1080371\tbest: 24.1080371 (400)\ttotal: 31.2s\tremaining: 5m 57s\n",
808 | "600:\tlearn: 11.1698903\ttest: 11.1698903\ttest1: 23.6591978\tbest: 23.6344024 (597)\ttotal: 46.7s\tremaining: 5m 41s\n",
809 | "800:\tlearn: 9.0968928\ttest: 9.0968928\ttest1: 23.4967794\tbest: 23.4480566 (731)\ttotal: 1m 2s\tremaining: 5m 27s\n",
810 | "1000:\tlearn: 7.4999226\ttest: 7.4999226\ttest1: 23.3029310\tbest: 23.3029310 (1000)\ttotal: 1m 18s\tremaining: 5m 12s\n",
811 | "1200:\tlearn: 6.2268303\ttest: 6.2268303\ttest1: 23.2306710\tbest: 23.2032763 (1158)\ttotal: 1m 33s\tremaining: 4m 56s\n",
812 | "1400:\tlearn: 5.2526449\ttest: 5.2526449\ttest1: 23.1735124\tbest: 23.1647173 (1382)\ttotal: 1m 50s\tremaining: 4m 44s\n",
813 | "1600:\tlearn: 4.4453553\ttest: 4.4453553\ttest1: 23.0764276\tbest: 23.0760354 (1599)\ttotal: 2m 6s\tremaining: 4m 28s\n",
814 | "1800:\tlearn: 3.7782981\ttest: 3.7782981\ttest1: 23.0401895\tbest: 23.0269641 (1790)\ttotal: 2m 22s\tremaining: 4m 12s\n",
815 | "2000:\tlearn: 3.2257521\ttest: 3.2257521\ttest1: 23.0495526\tbest: 23.0257095 (1863)\ttotal: 2m 37s\tremaining: 3m 56s\n",
816 | "Stopped by overfitting detector (200 iterations wait)\n",
817 | "\n",
818 | "bestTest = 23.02570946\n",
819 | "bestIteration = 1863\n",
820 | "\n",
821 | "Shrink model to first 1864 iterations.\n",
822 | "err: 23.025709845930972\n",
823 | "0:\tlearn: 40.3255443\ttest: 40.3255443\ttest1: 38.6459117\tbest: 38.6459117 (0)\ttotal: 78.7ms\tremaining: 6m 33s\n",
824 | "200:\tlearn: 18.5589204\ttest: 18.5589204\ttest1: 24.6792365\tbest: 24.6682080 (198)\ttotal: 15.6s\tremaining: 6m 11s\n",
825 | "400:\tlearn: 13.9842886\ttest: 13.9842886\ttest1: 23.3331800\tbest: 23.3331800 (400)\ttotal: 31.1s\tremaining: 5m 56s\n",
826 | "600:\tlearn: 11.1243029\ttest: 11.1243029\ttest1: 22.7763060\tbest: 22.7763060 (600)\ttotal: 46.6s\tremaining: 5m 41s\n",
827 | "800:\tlearn: 9.0916540\ttest: 9.0916540\ttest1: 22.5603758\tbest: 22.5405960 (745)\ttotal: 1m 2s\tremaining: 5m 25s\n",
828 | "1000:\tlearn: 7.4839058\ttest: 7.4839058\ttest1: 22.3351773\tbest: 22.3351773 (1000)\ttotal: 1m 17s\tremaining: 5m 10s\n",
829 | "1200:\tlearn: 6.2634279\ttest: 6.2634279\ttest1: 22.1963710\tbest: 22.1869120 (1197)\ttotal: 1m 33s\tremaining: 4m 54s\n",
830 | "1400:\tlearn: 5.2682685\ttest: 5.2682685\ttest1: 22.1296374\tbest: 22.1175826 (1371)\ttotal: 1m 48s\tremaining: 4m 39s\n",
831 | "1600:\tlearn: 4.4702299\ttest: 4.4702299\ttest1: 22.0735495\tbest: 22.0672424 (1571)\ttotal: 2m 4s\tremaining: 4m 23s\n",
832 | "Stopped by overfitting detector (200 iterations wait)\n",
833 | "\n",
834 | "bestTest = 22.06724241\n",
835 | "bestIteration = 1571\n",
836 | "\n",
837 | "Shrink model to first 1572 iterations.\n",
838 | "err: 22.067242551903032\n",
839 | "0:\tlearn: 40.3360136\ttest: 40.3360136\ttest1: 39.0164885\tbest: 39.0164885 (0)\ttotal: 77.5ms\tremaining: 6m 27s\n",
840 | "200:\tlearn: 18.6665069\ttest: 18.6665069\ttest1: 24.0657402\tbest: 24.0657402 (200)\ttotal: 15.5s\tremaining: 6m 11s\n",
841 | "400:\tlearn: 14.0490032\ttest: 14.0490032\ttest1: 23.1715470\tbest: 23.1636967 (393)\ttotal: 31.1s\tremaining: 5m 56s\n",
842 | "600:\tlearn: 11.1426550\ttest: 11.1426550\ttest1: 22.6796131\tbest: 22.6796131 (600)\ttotal: 46.5s\tremaining: 5m 40s\n",
843 | "800:\tlearn: 9.1014829\ttest: 9.1014829\ttest1: 22.4250826\tbest: 22.4250826 (800)\ttotal: 1m 1s\tremaining: 5m 24s\n",
844 | "1000:\tlearn: 7.5011249\ttest: 7.5011249\ttest1: 22.3179993\tbest: 22.3145331 (994)\ttotal: 1m 17s\tremaining: 5m 10s\n",
845 | "1200:\tlearn: 6.2489588\ttest: 6.2489588\ttest1: 22.2562969\tbest: 22.2363510 (1178)\ttotal: 1m 33s\tremaining: 4m 54s\n",
846 | "1400:\tlearn: 5.2631956\ttest: 5.2631956\ttest1: 22.2078329\tbest: 22.1983166 (1371)\ttotal: 1m 48s\tremaining: 4m 39s\n",
847 | "1600:\tlearn: 4.4421702\ttest: 4.4421702\ttest1: 22.1234305\tbest: 22.1234305 (1600)\ttotal: 2m 4s\tremaining: 4m 23s\n",
848 | "1800:\tlearn: 3.7661850\ttest: 3.7661850\ttest1: 22.0167844\tbest: 22.0161885 (1798)\ttotal: 2m 20s\tremaining: 4m 8s\n",
849 | "2000:\tlearn: 3.2289727\ttest: 3.2289727\ttest1: 21.9929626\tbest: 21.9918824 (1982)\ttotal: 2m 35s\tremaining: 3m 53s\n",
850 | "2200:\tlearn: 2.7692197\ttest: 2.7692197\ttest1: 21.9787016\tbest: 21.9774238 (2196)\ttotal: 2m 51s\tremaining: 3m 37s\n",
851 | "2400:\tlearn: 2.3751767\ttest: 2.3751767\ttest1: 21.9644917\tbest: 21.9644917 (2400)\ttotal: 3m 6s\tremaining: 3m 22s\n",
852 | "2600:\tlearn: 2.0464960\ttest: 2.0464960\ttest1: 21.9390869\tbest: 21.9305328 (2526)\ttotal: 3m 22s\tremaining: 3m 6s\n",
853 | "2800:\tlearn: 1.7718744\ttest: 1.7718744\ttest1: 21.9219081\tbest: 21.9219081 (2800)\ttotal: 3m 37s\tremaining: 2m 51s\n",
854 | "3000:\tlearn: 1.5257647\ttest: 1.5257647\ttest1: 21.9343371\tbest: 21.9192063 (2817)\ttotal: 3m 53s\tremaining: 2m 35s\n",
855 | "Stopped by overfitting detector (200 iterations wait)\n",
856 | "\n",
857 | "bestTest = 21.91920631\n",
858 | "bestIteration = 2817\n",
859 | "\n",
860 | "Shrink model to first 2818 iterations.\n",
861 | "err: 21.91920650501892\n",
862 | "0:\tlearn: 40.2495261\ttest: 40.2495261\ttest1: 40.2830735\tbest: 40.2830735 (0)\ttotal: 76.6ms\tremaining: 6m 22s\n",
863 | "200:\tlearn: 18.7101338\ttest: 18.7101338\ttest1: 22.9273356\tbest: 22.9273356 (200)\ttotal: 15.5s\tremaining: 6m 10s\n",
864 | "400:\tlearn: 13.9303330\ttest: 13.9303330\ttest1: 21.7517536\tbest: 21.7506623 (399)\ttotal: 31s\tremaining: 5m 55s\n",
865 | "600:\tlearn: 11.1010006\ttest: 11.1010006\ttest1: 21.2733640\tbest: 21.2726085 (598)\ttotal: 46.4s\tremaining: 5m 39s\n",
866 | "800:\tlearn: 9.0670233\ttest: 9.0670233\ttest1: 21.0638435\tbest: 21.0273690 (761)\ttotal: 1m 2s\tremaining: 5m 25s\n",
867 | "Stopped by overfitting detector (200 iterations wait)\n",
868 | "\n",
869 | "bestTest = 21.02736896\n",
870 | "bestIteration = 761\n",
871 | "\n",
872 | "Shrink model to first 762 iterations.\n",
873 | "err: 21.02736888901848\n",
874 | "0:\tlearn: 40.2029701\ttest: 40.2029701\ttest1: 41.6243888\tbest: 41.6243888 (0)\ttotal: 77.3ms\tremaining: 6m 26s\n",
875 | "200:\tlearn: 18.6679265\ttest: 18.6679265\ttest1: 23.4785938\tbest: 23.4755171 (195)\ttotal: 15.6s\tremaining: 6m 12s\n",
876 | "400:\tlearn: 13.9525682\ttest: 13.9525682\ttest1: 22.4108766\tbest: 22.4050359 (399)\ttotal: 31.1s\tremaining: 5m 56s\n",
877 | "600:\tlearn: 11.0323003\ttest: 11.0323003\ttest1: 22.1950919\tbest: 22.1655481 (591)\ttotal: 46.6s\tremaining: 5m 40s\n",
878 | "800:\tlearn: 8.9495728\ttest: 8.9495728\ttest1: 21.8626894\tbest: 21.8481166 (787)\ttotal: 1m 2s\tremaining: 5m 25s\n",
879 | "1000:\tlearn: 7.3931876\ttest: 7.3931876\ttest1: 21.7407306\tbest: 21.7274289 (979)\ttotal: 1m 17s\tremaining: 5m 10s\n",
880 | "1200:\tlearn: 6.1955991\ttest: 6.1955991\ttest1: 21.6722801\tbest: 21.6695920 (1198)\ttotal: 1m 34s\tremaining: 4m 59s\n",
881 | "1400:\tlearn: 5.2090663\ttest: 5.2090663\ttest1: 21.5923481\tbest: 21.5894501 (1396)\ttotal: 1m 50s\tremaining: 4m 43s\n",
882 | "1600:\tlearn: 4.4209683\ttest: 4.4209683\ttest1: 21.5497588\tbest: 21.5369012 (1475)\ttotal: 2m 5s\tremaining: 4m 27s\n",
883 | "1800:\tlearn: 3.7330390\ttest: 3.7330390\ttest1: 21.4947114\tbest: 21.4829271 (1734)\ttotal: 2m 21s\tremaining: 4m 11s\n",
884 | "2000:\tlearn: 3.1849787\ttest: 3.1849787\ttest1: 21.4697018\tbest: 21.4556831 (1926)\ttotal: 2m 37s\tremaining: 3m 55s\n",
885 | "Stopped by overfitting detector (200 iterations wait)\n",
886 | "\n",
887 | "bestTest = 21.45568305\n",
888 | "bestIteration = 1926\n",
889 | "\n",
890 | "Shrink model to first 1927 iterations.\n",
891 | "err: 21.45568292565179\n",
892 | "0:\tlearn: 39.9752611\ttest: 39.9752611\ttest1: 42.7216821\tbest: 42.7216821 (0)\ttotal: 76.1ms\tremaining: 6m 20s\n",
893 | "200:\tlearn: 18.6873486\ttest: 18.6873486\ttest1: 25.3251105\tbest: 25.3226036 (198)\ttotal: 15.6s\tremaining: 6m 11s\n",
894 | "400:\tlearn: 14.1755577\ttest: 14.1755577\ttest1: 24.2777607\tbest: 24.2654741 (394)\ttotal: 31s\tremaining: 5m 55s\n",
895 | "600:\tlearn: 11.2402725\ttest: 11.2402725\ttest1: 23.7855228\tbest: 23.7843534 (599)\ttotal: 46.5s\tremaining: 5m 40s\n",
896 | "800:\tlearn: 9.1229660\ttest: 9.1229660\ttest1: 23.4983693\tbest: 23.4983693 (800)\ttotal: 1m 1s\tremaining: 5m 24s\n",
897 | "1000:\tlearn: 7.5522934\ttest: 7.5522934\ttest1: 23.3819595\tbest: 23.3819595 (1000)\ttotal: 1m 17s\tremaining: 5m 9s\n",
898 | "1200:\tlearn: 6.2628687\ttest: 6.2628687\ttest1: 23.3963507\tbest: 23.3748201 (1129)\ttotal: 1m 33s\tremaining: 4m 54s\n",
899 | "1400:\tlearn: 5.2614718\ttest: 5.2614718\ttest1: 23.2582987\tbest: 23.2582987 (1400)\ttotal: 1m 48s\tremaining: 4m 39s\n",
900 | "1600:\tlearn: 4.4622843\ttest: 4.4622843\ttest1: 23.1452926\tbest: 23.1452926 (1600)\ttotal: 2m 4s\tremaining: 4m 23s\n",
901 | "1800:\tlearn: 3.7903927\ttest: 3.7903927\ttest1: 23.1218691\tbest: 23.1180035 (1641)\ttotal: 2m 19s\tremaining: 4m 8s\n",
902 | "2000:\tlearn: 3.2200017\ttest: 3.2200017\ttest1: 23.0990783\tbest: 23.0876150 (1955)\ttotal: 2m 35s\tremaining: 3m 53s\n",
903 | "2200:\tlearn: 2.7512936\ttest: 2.7512936\ttest1: 23.0878790\tbest: 23.0819325 (2192)\ttotal: 2m 51s\tremaining: 3m 37s\n",
904 | "Stopped by overfitting detector (200 iterations wait)\n",
905 | "\n",
906 | "bestTest = 23.08193251\n",
907 | "bestIteration = 2192\n",
908 | "\n",
909 | "Shrink model to first 2193 iterations.\n",
910 | "err: 23.081932696315334\n",
911 | "0:\tlearn: 40.3101864\ttest: 40.3101864\ttest1: 38.3758215\tbest: 38.3758215 (0)\ttotal: 76.3ms\tremaining: 6m 21s\n",
912 | "200:\tlearn: 18.5300099\ttest: 18.5300099\ttest1: 24.4798489\tbest: 24.4798489 (200)\ttotal: 15.6s\tremaining: 6m 12s\n",
913 | "400:\tlearn: 13.9776861\ttest: 13.9776861\ttest1: 23.5322861\tbest: 23.5322861 (400)\ttotal: 31.2s\tremaining: 5m 57s\n",
914 | "600:\tlearn: 11.1768234\ttest: 11.1768234\ttest1: 22.9532575\tbest: 22.9493678 (599)\ttotal: 46.7s\tremaining: 5m 41s\n",
915 | "800:\tlearn: 9.0597343\ttest: 9.0597343\ttest1: 22.6141236\tbest: 22.6009812 (796)\ttotal: 1m 2s\tremaining: 5m 25s\n",
916 | "1000:\tlearn: 7.5045138\ttest: 7.5045138\ttest1: 22.5008777\tbest: 22.4964678 (995)\ttotal: 1m 17s\tremaining: 5m 10s\n",
917 | "1200:\tlearn: 6.2556477\ttest: 6.2556477\ttest1: 22.3478979\tbest: 22.3394701 (1193)\ttotal: 1m 33s\tremaining: 4m 54s\n",
918 | "1400:\tlearn: 5.2750707\ttest: 5.2750707\ttest1: 22.2989177\tbest: 22.2841951 (1361)\ttotal: 1m 48s\tremaining: 4m 39s\n",
919 | "1600:\tlearn: 4.4615398\ttest: 4.4615398\ttest1: 22.2866401\tbest: 22.2627218 (1571)\ttotal: 2m 4s\tremaining: 4m 23s\n",
920 | "1800:\tlearn: 3.7752326\ttest: 3.7752326\ttest1: 22.2431147\tbest: 22.2390858 (1792)\ttotal: 2m 20s\tremaining: 4m 8s\n",
921 | "2000:\tlearn: 3.2245801\ttest: 3.2245801\ttest1: 22.2071219\tbest: 22.2071219 (2000)\ttotal: 2m 35s\tremaining: 3m 53s\n",
922 | "2200:\tlearn: 2.7548921\ttest: 2.7548921\ttest1: 22.2134860\tbest: 22.1926314 (2082)\ttotal: 2m 51s\tremaining: 3m 37s\n",
923 | "Stopped by overfitting detector (200 iterations wait)\n",
924 | "\n",
925 | "bestTest = 22.19263139\n",
926 | "bestIteration = 2082\n",
927 | "\n",
928 | "Shrink model to first 2083 iterations.\n",
929 | "err: 22.19263108025873\n",
930 | "0:\tlearn: 40.2372751\ttest: 40.2372751\ttest1: 39.5062699\tbest: 39.5062699 (0)\ttotal: 77.6ms\tremaining: 6m 28s\n",
931 | "200:\tlearn: 18.6177315\ttest: 18.6177315\ttest1: 25.8164432\tbest: 25.7954700 (199)\ttotal: 15.6s\tremaining: 6m 11s\n",
932 | "400:\tlearn: 13.9433224\ttest: 13.9433224\ttest1: 24.5489698\tbest: 24.5489698 (400)\ttotal: 31.1s\tremaining: 5m 56s\n",
933 | "600:\tlearn: 11.0678258\ttest: 11.0678258\ttest1: 23.9059608\tbest: 23.8966854 (597)\ttotal: 46.6s\tremaining: 5m 41s\n",
934 | "800:\tlearn: 8.9866349\ttest: 8.9866349\ttest1: 23.5694571\tbest: 23.5635749 (796)\ttotal: 1m 2s\tremaining: 5m 25s\n",
935 | "1000:\tlearn: 7.3946596\ttest: 7.3946596\ttest1: 23.3933467\tbest: 23.3920246 (999)\ttotal: 1m 17s\tremaining: 5m 10s\n",
936 | "1200:\tlearn: 6.1721142\ttest: 6.1721142\ttest1: 23.3044636\tbest: 23.3044636 (1200)\ttotal: 1m 33s\tremaining: 4m 55s\n",
937 | "1400:\tlearn: 5.1788409\ttest: 5.1788409\ttest1: 23.2013175\tbest: 23.2013175 (1400)\ttotal: 1m 48s\tremaining: 4m 39s\n",
938 | "1600:\tlearn: 4.3836433\ttest: 4.3836433\ttest1: 23.1140350\tbest: 23.1140350 (1600)\ttotal: 2m 4s\tremaining: 4m 24s\n",
939 | "1800:\tlearn: 3.7432574\ttest: 3.7432574\ttest1: 23.0750452\tbest: 23.0733526 (1796)\ttotal: 2m 20s\tremaining: 4m 8s\n",
940 | "2000:\tlearn: 3.1976351\ttest: 3.1976351\ttest1: 23.0241154\tbest: 23.0220076 (1994)\ttotal: 2m 35s\tremaining: 3m 53s\n",
941 | "2200:\tlearn: 2.7444357\ttest: 2.7444357\ttest1: 23.0101091\tbest: 23.0057872 (2129)\ttotal: 2m 53s\tremaining: 3m 40s\n",
942 | "2400:\tlearn: 2.3606729\ttest: 2.3606729\ttest1: 22.9854733\tbest: 22.9845754 (2395)\ttotal: 3m 8s\tremaining: 3m 24s\n",
943 | "2600:\tlearn: 2.0322732\ttest: 2.0322732\ttest1: 22.9570772\tbest: 22.9544701 (2583)\ttotal: 3m 24s\tremaining: 3m 8s\n",
944 | "2800:\tlearn: 1.7603679\ttest: 1.7603679\ttest1: 22.9709213\tbest: 22.9529765 (2658)\ttotal: 3m 39s\tremaining: 2m 52s\n",
945 | "Stopped by overfitting detector (200 iterations wait)\n",
946 | "\n",
947 | "bestTest = 22.95297654\n",
948 | "bestIteration = 2658\n",
949 | "\n",
950 | "Shrink model to first 2659 iterations.\n",
951 | "err: 22.952976358290744\n",
952 | "0:\tlearn: 40.3027167\ttest: 40.3027167\ttest1: 39.9886165\tbest: 39.9886165 (0)\ttotal: 78.2ms\tremaining: 6m 31s\n",
953 | "200:\tlearn: 18.5956369\ttest: 18.5956369\ttest1: 25.3158600\tbest: 25.3158600 (200)\ttotal: 15.5s\tremaining: 6m 10s\n",
954 | "400:\tlearn: 14.0106798\ttest: 14.0106798\ttest1: 24.8249253\tbest: 24.8005189 (384)\ttotal: 31s\tremaining: 5m 55s\n",
955 | "600:\tlearn: 11.1547859\ttest: 11.1547859\ttest1: 24.3765404\tbest: 24.3721730 (598)\ttotal: 46.4s\tremaining: 5m 39s\n",
956 | "800:\tlearn: 9.1013138\ttest: 9.1013138\ttest1: 24.0427740\tbest: 24.0427740 (800)\ttotal: 1m 1s\tremaining: 5m 24s\n",
957 | "1000:\tlearn: 7.4668754\ttest: 7.4668754\ttest1: 23.8872275\tbest: 23.8788433 (991)\ttotal: 1m 17s\tremaining: 5m 9s\n",
958 | "1200:\tlearn: 6.2548365\ttest: 6.2548365\ttest1: 23.7607930\tbest: 23.7607930 (1200)\ttotal: 1m 32s\tremaining: 4m 53s\n",
959 | "1400:\tlearn: 5.2538671\ttest: 5.2538671\ttest1: 23.7367607\tbest: 23.7348041 (1366)\ttotal: 1m 48s\tremaining: 4m 38s\n",
960 | "1600:\tlearn: 4.4275497\ttest: 4.4275497\ttest1: 23.7398416\tbest: 23.7182286 (1578)\ttotal: 2m 3s\tremaining: 4m 23s\n",
961 | "1800:\tlearn: 3.7709991\ttest: 3.7709991\ttest1: 23.7015974\tbest: 23.6992448 (1769)\ttotal: 2m 19s\tremaining: 4m 7s\n",
962 | "2000:\tlearn: 3.2059532\ttest: 3.2059532\ttest1: 23.6612279\tbest: 23.6556684 (1946)\ttotal: 2m 35s\tremaining: 3m 52s\n",
963 | "2200:\tlearn: 2.7559935\ttest: 2.7559935\ttest1: 23.6300102\tbest: 23.6285448 (2183)\ttotal: 2m 50s\tremaining: 3m 37s\n",
964 | "2400:\tlearn: 2.3588065\ttest: 2.3588065\ttest1: 23.6128305\tbest: 23.6101937 (2336)\ttotal: 3m 6s\tremaining: 3m 21s\n",
965 | "2600:\tlearn: 2.0310939\ttest: 2.0310939\ttest1: 23.5988171\tbest: 23.5938532 (2478)\ttotal: 3m 21s\tremaining: 3m 6s\n",
966 | "Stopped by overfitting detector (200 iterations wait)\n",
967 | "\n",
968 | "bestTest = 23.59385323\n",
969 | "bestIteration = 2478\n",
970 | "\n",
971 | "Shrink model to first 2479 iterations.\n",
972 | "err: 23.593853180670912\n",
973 | "0:\tlearn: 40.3133567\ttest: 40.3133567\ttest1: 38.4216752\tbest: 38.4216752 (0)\ttotal: 77.9ms\tremaining: 6m 29s\n",
974 | "200:\tlearn: 18.5351295\ttest: 18.5351295\ttest1: 25.0794284\tbest: 25.0794284 (200)\ttotal: 15.6s\tremaining: 6m 11s\n",
975 | "400:\tlearn: 14.0712090\ttest: 14.0712090\ttest1: 24.1763458\tbest: 24.1742213 (398)\ttotal: 31.1s\tremaining: 5m 56s\n",
976 | "600:\tlearn: 11.2283119\ttest: 11.2283119\ttest1: 23.3785283\tbest: 23.3785283 (600)\ttotal: 46.6s\tremaining: 5m 41s\n",
977 | "800:\tlearn: 9.1316227\ttest: 9.1316227\ttest1: 23.0947359\tbest: 23.0947359 (800)\ttotal: 1m 2s\tremaining: 5m 25s\n",
978 | "1000:\tlearn: 7.4893484\ttest: 7.4893484\ttest1: 22.7073351\tbest: 22.7073351 (1000)\ttotal: 1m 17s\tremaining: 5m 10s\n",
979 | "1200:\tlearn: 6.2431172\ttest: 6.2431172\ttest1: 22.6216763\tbest: 22.6155442 (1198)\ttotal: 1m 33s\tremaining: 4m 54s\n",
980 | "1400:\tlearn: 5.2307179\ttest: 5.2307179\ttest1: 22.4836826\tbest: 22.4834448 (1396)\ttotal: 1m 48s\tremaining: 4m 39s\n",
981 | "1600:\tlearn: 4.4057232\ttest: 4.4057232\ttest1: 22.4289766\tbest: 22.4213644 (1596)\ttotal: 2m 4s\tremaining: 4m 24s\n",
982 | "1800:\tlearn: 3.7532065\ttest: 3.7532065\ttest1: 22.3976587\tbest: 22.3788089 (1705)\ttotal: 2m 19s\tremaining: 4m 8s\n",
983 | "Stopped by overfitting detector (200 iterations wait)\n",
984 | "\n",
985 | "bestTest = 22.37880893\n",
986 | "bestIteration = 1705\n",
987 | "\n",
988 | "Shrink model to first 1706 iterations.\n",
989 | "err: 22.37880911443636\n",
990 | "0:\tlearn: 40.1291292\ttest: 40.1291292\ttest1: 42.9847112\tbest: 42.9847112 (0)\ttotal: 77.3ms\tremaining: 6m 26s\n",
991 | "200:\tlearn: 18.5334325\ttest: 18.5334325\ttest1: 26.8958084\tbest: 26.8958084 (200)\ttotal: 15.4s\tremaining: 6m 8s\n",
992 | "400:\tlearn: 14.0019331\ttest: 14.0019331\ttest1: 25.7488425\tbest: 25.7488425 (400)\ttotal: 30.8s\tremaining: 5m 53s\n",
993 | "600:\tlearn: 11.1403528\ttest: 11.1403528\ttest1: 25.1925107\tbest: 25.1925107 (600)\ttotal: 46.3s\tremaining: 5m 38s\n",
994 | "800:\tlearn: 9.0911354\ttest: 9.0911354\ttest1: 24.9427465\tbest: 24.9427465 (800)\ttotal: 1m 1s\tremaining: 5m 23s\n",
995 | "1000:\tlearn: 7.4959946\ttest: 7.4959946\ttest1: 24.6866179\tbest: 24.6866179 (1000)\ttotal: 1m 17s\tremaining: 5m 8s\n",
996 | "1200:\tlearn: 6.2674539\ttest: 6.2674539\ttest1: 24.5538038\tbest: 24.5230785 (1170)\ttotal: 1m 32s\tremaining: 4m 53s\n",
997 | "1400:\tlearn: 5.2540337\ttest: 5.2540337\ttest1: 24.4766775\tbest: 24.4731437 (1393)\ttotal: 1m 48s\tremaining: 4m 38s\n",
998 | "1600:\tlearn: 4.4462344\ttest: 4.4462344\ttest1: 24.4213427\tbest: 24.4165306 (1554)\ttotal: 2m 3s\tremaining: 4m 23s\n",
999 | "1800:\tlearn: 3.7754953\ttest: 3.7754953\ttest1: 24.3620337\tbest: 24.3616876 (1799)\ttotal: 2m 19s\tremaining: 4m 7s\n",
1000 | "2000:\tlearn: 3.2344382\ttest: 3.2344382\ttest1: 24.3428412\tbest: 24.3394612 (1989)\ttotal: 2m 35s\tremaining: 3m 52s\n",
1001 | "2200:\tlearn: 2.7685777\ttest: 2.7685777\ttest1: 24.3236314\tbest: 24.3221403 (2180)\ttotal: 2m 50s\tremaining: 3m 36s\n",
1002 | "2400:\tlearn: 2.3949253\ttest: 2.3949253\ttest1: 24.2920106\tbest: 24.2873041 (2381)\ttotal: 3m 7s\tremaining: 3m 23s\n",
1003 | "2600:\tlearn: 2.0633171\ttest: 2.0633171\ttest1: 24.2726954\tbest: 24.2704865 (2591)\ttotal: 3m 23s\tremaining: 3m 7s\n",
1004 | "2800:\tlearn: 1.7723446\ttest: 1.7723446\ttest1: 24.2615666\tbest: 24.2615666 (2800)\ttotal: 3m 38s\tremaining: 2m 51s\n",
1005 | "3000:\tlearn: 1.5351622\ttest: 1.5351622\ttest1: 24.2568203\tbest: 24.2562337 (2999)\ttotal: 3m 54s\tremaining: 2m 36s\n",
1006 | "3200:\tlearn: 1.3320583\ttest: 1.3320583\ttest1: 24.2472974\tbest: 24.2448250 (3073)\ttotal: 4m 9s\tremaining: 2m 20s\n",
1007 | "3400:\tlearn: 1.1575297\ttest: 1.1575297\ttest1: 24.2437464\tbest: 24.2397267 (3377)\ttotal: 4m 25s\tremaining: 2m 4s\n",
1008 | "3600:\tlearn: 1.0090671\ttest: 1.0090671\ttest1: 24.2403057\tbest: 24.2383831 (3579)\ttotal: 4m 41s\tremaining: 1m 49s\n",
1009 | "3800:\tlearn: 0.8775672\ttest: 0.8775672\ttest1: 24.2342476\tbest: 24.2341390 (3798)\ttotal: 4m 56s\tremaining: 1m 33s\n",
1010 | "4000:\tlearn: 0.7692448\ttest: 0.7692448\ttest1: 24.2295141\tbest: 24.2291590 (3999)\ttotal: 5m 12s\tremaining: 1m 17s\n",
1011 | "4200:\tlearn: 0.6720400\ttest: 0.6720400\ttest1: 24.2257068\tbest: 24.2257068 (4200)\ttotal: 5m 29s\tremaining: 1m 2s\n",
1012 | "4400:\tlearn: 0.5881158\ttest: 0.5881158\ttest1: 24.2220132\tbest: 24.2199265 (4352)\ttotal: 5m 46s\tremaining: 47.1s\n",
1013 | "4600:\tlearn: 0.5165693\ttest: 0.5165693\ttest1: 24.2179745\tbest: 24.2179409 (4599)\ttotal: 6m 1s\tremaining: 31.4s\n",
1014 | "4800:\tlearn: 0.4526341\ttest: 0.4526341\ttest1: 24.2183527\tbest: 24.2169859 (4780)\ttotal: 6m 17s\tremaining: 15.7s\n",
1015 | "4999:\tlearn: 0.3940131\ttest: 0.3940131\ttest1: 24.2156999\tbest: 24.2153069 (4902)\ttotal: 6m 33s\tremaining: 0us\n",
1016 | "\n",
1017 | "bestTest = 24.21530691\n",
1018 | "bestIteration = 4902\n",
1019 | "\n",
1020 | "Shrink model to first 4903 iterations.\n",
1021 | "err: 24.215306777079714\n"
1022 | ],
1023 | "name": "stdout"
1024 | },
1025 | {
1026 | "output_type": "execute_result",
1027 | "data": {
1028 | "text/plain": [
1029 | "22.790050240627117"
1030 | ]
1031 | },
1032 | "metadata": {
1033 | "tags": []
1034 | },
1035 | "execution_count": 117
1036 | }
1037 | ]
1038 | },
1039 | {
1040 | "cell_type": "code",
1041 | "metadata": {
1042 | "id": "3Oy1kH8gm9OO",
1043 | "colab_type": "code",
1044 | "outputId": "27b74a6b-2e32-4f91-b23b-9a82e4a0a4c5",
1045 | "colab": {
1046 | "base_uri": "https://localhost:8080/",
1047 | "height": 34
1048 | }
1049 | },
1050 | "source": [
1051 | "np.mean(errcb2)"
1052 | ],
1053 | "execution_count": 0,
1054 | "outputs": [
1055 | {
1056 | "output_type": "execute_result",
1057 | "data": {
1058 | "text/plain": [
1059 | "22.790050240627117"
1060 | ]
1061 | },
1062 | "metadata": {
1063 | "tags": []
1064 | },
1065 | "execution_count": 118
1066 | }
1067 | ]
1068 | },
1069 | {
1070 | "cell_type": "code",
1071 | "metadata": {
1072 | "id": "SnMxF4rWm9OR",
1073 | "colab_type": "code",
1074 | "colab": {}
1075 | },
1076 | "source": [
1077 | "d = {'ID': test_id, 'target': np.mean(y_pred_totcb2, 0)}\n",
1078 | "sub = pd.DataFrame(data=d)\n",
1079 | "sub = sub[['ID', 'target']]"
1080 | ],
1081 | "execution_count": 0,
1082 | "outputs": []
1083 | },
1084 | {
1085 | "cell_type": "code",
1086 | "metadata": {
1087 | "id": "_azuEaJpICfS",
1088 | "colab_type": "code",
1089 | "colab": {}
1090 | },
1091 | "source": [
1092 | "sub.to_csv('indianda.csv',index=False)\n",
1093 | "from google.colab import files\n",
1094 | "files.download(\"indianda.csv\")"
1095 | ],
1096 | "execution_count": 0,
1097 | "outputs": []
1098 | }
1099 | ]
1100 | }
--------------------------------------------------------------------------------