├── README.md
├── zind-airqo-final-blend.ipynb
├── darius-model-1.ipynb
├── darius-model-2.ipynb
└── zindi-airqo-cnn-quick.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # zindi-airqo
2 |
3 |
4 |
5 | 1. Run the 4 notebooks 'darius-model-1.ipynb', 'darius-model-2.ipynb', 'model-3.ipynb' and
6 | 'zindi-airqo-cnn-quick.ipynb' in any order. The outputs generated by these notebooks are already placed in the current folder.
7 | 2. After running the above 4 notebooks (in any order), run the final notebook 'zind-airqo-final-blend.ipynb' to get the final output 'zindi_airqo_final_sub.csv'.
8 |
--------------------------------------------------------------------------------
/zind-airqo-final-blend.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import pandas as pd"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "name": "stdout",
22 | "output_type": "stream",
23 | "text": [
24 | "airqo-ugandan-air-quality-forecast-challenge #2 3 STRoNG !!!.zip\n",
25 | "cnn_preds.csv\n",
26 | "darius-model-1.ipynb\n",
27 | "darius-model-2.ipynb\n",
28 | "input\n",
29 | "model-3.ipynb\n",
30 | "model_12_blend.csv\n",
31 | "model_2.csv\n",
32 | "model_3.csv\n",
33 | "readme.txt\n",
34 | "zind-airqo-final-blend.ipynb\n",
35 | "zindi-airqo-cnn-quick.ipynb\n",
36 | "zindi_airqo_final_sub.csv\n"
37 | ]
38 | }
39 | ],
40 | "source": [
41 | "!ls"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 3,
47 | "metadata": {
48 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
49 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
50 | "collapsed": true
51 | },
52 | "outputs": [],
53 | "source": [
54 | "df = pd.read_csv('model_2.csv')[['ID']]\n",
55 | "df = pd.merge(df, pd.read_csv('model_12_blend.csv').rename({'target': 'A'}, axis=1), on = 'ID', how='left')\n",
56 | "df = pd.merge(df, pd.read_csv('model_2.csv').rename({'target': 'B'}, axis=1), on = 'ID', how='left')\n",
57 | "df = pd.merge(df, pd.read_csv('model_3.csv').rename({'target': 'C'}, axis=1), on = 'ID', how='left')\n",
58 | "df = pd.merge(df, pd.read_csv('cnn_preds.csv').rename({'target': 'D'}, axis=1), on = 'ID', how='left')"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "data": {
68 | "text/html": [
69 | "
\n",
70 | "\n",
83 | "
\n",
84 | " \n",
85 | " \n",
86 | " | \n",
87 | " A | \n",
88 | " B | \n",
89 | " C | \n",
90 | " D | \n",
91 | "
\n",
92 | " \n",
93 | " \n",
94 | " \n",
95 | " | A | \n",
96 | " 1.000000 | \n",
97 | " 0.972463 | \n",
98 | " 0.981905 | \n",
99 | " 0.937631 | \n",
100 | "
\n",
101 | " \n",
102 | " | B | \n",
103 | " 0.972463 | \n",
104 | " 1.000000 | \n",
105 | " 0.966836 | \n",
106 | " 0.932886 | \n",
107 | "
\n",
108 | " \n",
109 | " | C | \n",
110 | " 0.981905 | \n",
111 | " 0.966836 | \n",
112 | " 1.000000 | \n",
113 | " 0.941585 | \n",
114 | "
\n",
115 | " \n",
116 | " | D | \n",
117 | " 0.937631 | \n",
118 | " 0.932886 | \n",
119 | " 0.941585 | \n",
120 | " 1.000000 | \n",
121 | "
\n",
122 | " \n",
123 | "
\n",
124 | "
"
125 | ],
126 | "text/plain": [
127 | " A B C D\n",
128 | "A 1.000000 0.972463 0.981905 0.937631\n",
129 | "B 0.972463 1.000000 0.966836 0.932886\n",
130 | "C 0.981905 0.966836 1.000000 0.941585\n",
131 | "D 0.937631 0.932886 0.941585 1.000000"
132 | ]
133 | },
134 | "execution_count": 4,
135 | "metadata": {},
136 | "output_type": "execute_result"
137 | }
138 | ],
139 | "source": [
140 | "df[['A', 'B', 'C', 'D']].corr()"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 5,
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "name": "stderr",
150 | "output_type": "stream",
151 | "text": [
152 | "C:\\Anaconda5\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
153 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
154 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
155 | "\n",
156 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
157 | " \n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 | "final_sub_df = df[['ID']].copy()  # independent copy, so adding a column below does not raise SettingWithCopyWarning\n",
163 | "final_sub_df['target'] = ((df['A']*0.6 + df['B']*0.4)*0.35 + df['C']*0.65)*0.85 + df['D']*0.15  # nested weighted blend of the four model outputs\n",
164 | "final_sub_df.to_csv('zindi_airqo_final_sub.csv', index=False)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 6,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "name": "stderr",
174 | "output_type": "stream",
175 | "text": [
176 | "C:\\Anaconda5\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
177 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
178 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
179 | "\n",
180 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
181 | " \n"
182 | ]
183 | }
184 | ],
185 | "source": [
186 | "final_sub_df = df[['ID']].copy()  # independent copy, so adding a column below does not raise SettingWithCopyWarning\n",
187 | "final_sub_df['target'] = (df['A']*0.6 + df['B']*0.4)  # blend of columns A (model_12_blend.csv) and B (model_2.csv) only\n",
188 | "final_sub_df.to_csv('zindi_airqo_model_1_and_2_only.csv', index=False)"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {
195 | "collapsed": true
196 | },
197 | "outputs": [],
198 | "source": []
199 | }
200 | ],
201 | "metadata": {
202 | "kernelspec": {
203 | "display_name": "Python 3",
204 | "language": "python",
205 | "name": "python3"
206 | },
207 | "language_info": {
208 | "codemirror_mode": {
209 | "name": "ipython",
210 | "version": 3
211 | },
212 | "file_extension": ".py",
213 | "mimetype": "text/x-python",
214 | "name": "python",
215 | "nbconvert_exporter": "python",
216 | "pygments_lexer": "ipython3",
217 | "version": "3.6.3"
218 | }
219 | },
220 | "nbformat": 4,
221 | "nbformat_minor": 4
222 | }
223 |
--------------------------------------------------------------------------------
/darius-model-1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true,
8 | "id": "pk5WiopT9oJw",
9 | "outputId": "446b6db5-1cbe-4beb-e1e2-4bcdd512bab6"
10 | },
11 | "outputs": [],
12 | "source": [
13 | "# installing catboost\n",
14 | "# Catboost == 0.22 was the version of catboost at the start of this competition\n",
15 | "!pip install catboost==0.22 --quiet"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": null,
21 | "metadata": {
22 | "collapsed": true,
23 | "id": "jVXG4QlTuVYr"
24 | },
25 | "outputs": [],
26 | "source": [
27 | "# Importing libraries\n",
28 | "import pandas as pd\n",
29 | "import numpy as np\n",
30 | "import warnings\n",
31 | "import joblib\n",
32 | "\n",
33 | "from tqdm import tqdm, tqdm_notebook\n",
34 | "from functools import reduce\n",
35 | "from time import time\n",
36 | "\n",
37 | "from catboost import CatBoostRegressor, CatBoostClassifier\n",
38 | "from sklearn.utils import shuffle\n",
39 | "\n",
40 | "pd.set_option('display.max_rows', 1000) \n",
41 | "warnings.filterwarnings('ignore')"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {
48 | "collapsed": true,
49 | "id": "VJjtuM3kvCAT"
50 | },
51 | "outputs": [],
52 | "source": [
53 | "# Loading data\n",
54 | "train = pd.read_csv('./input/Train.csv')\n",
55 | "test = pd.read_csv('./input/Test.csv')"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {
62 | "collapsed": true
63 | },
64 | "outputs": [],
65 | "source": [
66 | "start = time()"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {
73 | "collapsed": true,
74 | "id": "4MhDKOpYF9dm"
75 | },
76 | "outputs": [],
77 | "source": [
78 | "# Separating the target variable from the training dataframe\n",
79 | "#\n",
80 | "target = train.target\n",
81 | "\n",
82 | "# Aligning the train and test dataframes\n",
83 | "#\n",
84 | "train, test = train.align(test, join='inner', axis=1)\n",
85 | "\n",
86 | "# Adding a separator column to both train and test;\n",
87 | "# it is used later to split the combined dataframe back apart\n",
88 | "#\n",
89 | "train['separator'] = 0\n",
90 | "test['separator'] = 1\n",
91 | "\n",
92 | "# Combining the train and test dataframes together\n",
93 | "#\n",
94 | "comb = pd.concat([train, test])\n",
95 | "\n",
96 | "# Creating a function to replace all spaces in the dataframe with np.nan\n",
97 | "#\n",
98 | "def replace_nan(x):\n",
99 | " if x == \" \":\n",
100 | " return np.nan\n",
101 | " else:\n",
102 | " return float(x)\n",
103 | "\n",
104 | "# Creating a list of the main columns\n",
105 | "#\n",
106 | "main_cols = [\"temp\", \"precip\", \"rel_humidity\", \"wind_dir\", \"wind_spd\", \"atmos_press\"]\n",
107 | "\n",
108 | "# Replacing spaces with np.nan\n",
109 | "#\n",
110 | "for col in main_cols: \n",
111 | " comb[col] = comb[col].apply(lambda x: [replace_nan(X) for X in x.replace(\"nan\", \" \").split(\",\")])\n",
112 | "\n",
113 | "def make_columns(feature):\n",
114 | " return [f\"{feature}_{i}\" for i in range(1, 122)]\n",
115 | " \n",
116 | "# Generating dataframes of hours for each main column\n",
117 | "#\n",
118 | "comb_temp = pd.DataFrame([x for x in comb.temp], columns=make_columns('temp'))\n",
119 | "comb_precip = pd.DataFrame([x for x in comb.precip], columns=make_columns('precip'))\n",
120 | "comb_rel_humidity = pd.DataFrame([x for x in comb.rel_humidity], columns=make_columns('rel_humidity'))\n",
121 | "comb_wind_dir = pd.DataFrame([x for x in comb.wind_dir], columns=make_columns('wind_dir'))\n",
122 | "comb_wind_spd = pd.DataFrame([x for x in comb.wind_spd], columns=make_columns('wind_spd'))\n",
123 | "comb_atmos_press = pd.DataFrame([x for x in comb.atmos_press], columns=make_columns('atmos_press'))\n",
124 | "\n",
125 | "comb_temp['ID'], comb_precip['ID'], comb_rel_humidity['ID'], comb_wind_dir['ID'], comb_wind_spd['ID'], comb_atmos_press['ID'] = [list(comb.ID)] * 6\n",
126 | "\n",
127 | "# Combining the generated dataframes together\n",
128 | "#\n",
129 | "comb_dfs = [comb, comb_temp, comb_precip, comb_rel_humidity, comb_wind_dir, comb_wind_spd, comb_atmos_press]\n",
130 | "comb = reduce(lambda left, right: pd.merge(left, right, on=['ID'], how='outer'), comb_dfs)\n",
131 | "comb.drop(main_cols, axis=1, inplace=True)\n",
132 | "df = comb.copy()"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {
139 | "code_folding": [],
140 | "collapsed": true,
141 | "id": "3FLjxrAXYbRV",
142 | "outputId": "5d01c7d2-cf39-49ce-d234-6c735a3971d7"
143 | },
144 | "outputs": [],
145 | "source": [
146 | "# Creating original series for each feature\n",
147 | "orig_cols_dict = {}\n",
148 | "weather_cols = ['temp', 'precip', 'rel_humidity', 'wind_dir','wind_spd', 'atmos_press']\n",
149 | "\n",
150 | "for w in tqdm_notebook(weather_cols):\n",
151 | " selected_cols = [c for c in df.columns if w in c]\n",
152 | " orig_cols_dict[w] = pd.Series(selected_cols)"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {
159 | "collapsed": true,
160 | "id": "oAGDsJ92YS2Z",
161 | "outputId": "3c4077e0-d77c-4feb-e744-4ef904531c04"
162 | },
163 | "outputs": [],
164 | "source": [
165 | "# Aggregating features per hour of day: row-wise statistics over the columns that share an hour\n",
166 | "\n",
167 | "for w in tqdm_notebook(weather_cols):\n",
168 | "    tmp_df = pd.DataFrame()\n",
169 | "    tmp_df['weather_col_orig'] = orig_cols_dict[w]\n",
170 | "    tmp_df['hours_since_start'] = tmp_df['weather_col_orig'].apply(lambda x: x.split('_')[-1]).astype('int')\n",
171 | "    tmp_df['hour_of_day'] = (tmp_df['hours_since_start'] - 1) % 24 + 1  # 1..24 to match range(1, 25); plain % 24 gave 0..23, leaving hour 24 empty and offsets 24, 48, ... unused\n",
172 | "\n",
173 | "    for hour in range(1, 25):\n",
174 | "        selected_cols = tmp_df[tmp_df['hour_of_day'] == hour]['weather_col_orig'].tolist()\n",
175 | "        df_cols = df[selected_cols]  # all columns of feature w that fall on this hour of day\n",
176 | "\n",
177 | "        df[f'{w}_hour_{hour}_mean'] = df_cols.mean(axis=1)\n",
178 | "        df[f'{w}_hour_{hour}_min'] = df_cols.min(axis=1)\n",
179 | "        df[f'{w}_hour_{hour}_max'] = df_cols.max(axis=1)\n",
180 | "        df[f'{w}_hour_{hour}_range'] = df[f'{w}_hour_{hour}_max'] - df[f'{w}_hour_{hour}_min']\n",
181 | "        df[f'{w}_hour_{hour}_skew'] = df_cols.skew(axis=1)  # axis=1: per-row value; the default axis returns one value per column, which does not align with df's rows\n",
182 | "        df[f'{w}_hour_{hour}_kurt'] = df_cols.kurt(axis=1)  # axis=1 for the same reason as skew above\n",
183 | "\n",
184 | "        if hour - 3 > 0 and hour % 3 == 0:\n",
185 | "            df[f'{w}_hour_{hour}_prev_hour_mean_diff'] = df[f'{w}_hour_{hour}_mean'] - df[f'{w}_hour_{hour - 3}_mean']\n",
186 | "        if hour - 5 > 0 and hour % 3 == 0:\n",
187 | "            df[f'{w}_hour_{hour}_prev_hour_mean_diff_5'] = df[f'{w}_hour_{hour}_mean'] - df[f'{w}_hour_{hour - 5}_mean']\n"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {
194 | "collapsed": true,
195 | "id": "pEyFVf1lqInY"
196 | },
197 | "outputs": [],
198 | "source": [
199 | "comb = df.copy()"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": null,
205 | "metadata": {
206 | "collapsed": true,
207 | "id": "WCp_Ukh-NTso",
208 | "outputId": "104e385d-04c6-47af-8bec-582bc0fdfbac"
209 | },
210 | "outputs": [],
211 | "source": [
212 | "comb.head()"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {
219 | "collapsed": true,
220 | "id": "xzn_w6FYftB8",
221 | "outputId": "33af157e-0c05-4be9-fd8f-3290f64fcd47"
222 | },
223 | "outputs": [],
224 | "source": [
225 | "# Creating aggregation features for each variable\n",
226 | "aggs = ['mean', 'std', 'var', 'kurt', 'skew', 'max', 'median', 'sum', 'mode', 'sem', 'min']\n",
227 | "\n",
228 | "for col in tqdm_notebook(main_cols):\n",
229 | " for ag in tqdm(aggs):\n",
230 | " selected_cols = [x for x in comb.columns if x.startswith(col)]\n",
231 | "\n",
232 | " if ag == 'mode':\n",
233 | " aggregate = comb[selected_cols].agg(ag, axis=1)[0]\n",
234 | " else:\n",
235 | " aggregate = comb[selected_cols].agg(ag, axis=1)\n",
236 | " \n",
237 | " comb[col[0] + col[-1] + '_' + ag] = aggregate\n",
238 | "\n",
239 | "# Creating separate dataframes for each variable\n",
240 | "# Creating a list of columns for each separate dataframe\n",
241 | "temp_cols = [x for x in comb.columns if x.startswith('temp')]\n",
242 | "temp = comb[temp_cols]\n",
243 | "\n",
244 | "precip_cols = [x for x in comb.columns if x.startswith('precip')]\n",
245 | "precip = comb[precip_cols]\n",
246 | "\n",
247 | "humid_cols = [x for x in comb.columns if x.startswith('rel_humidity')]\n",
248 | "humid = comb[humid_cols]\n",
249 | "\n",
250 | "wind_dir_cols = [x for x in comb.columns if x.startswith('wind_dir')]\n",
251 | "wind_dir = comb[wind_dir_cols]\n",
252 | "\n",
253 | "wind_spd_cols = [x for x in comb.columns if x.startswith('wind_spd')]\n",
254 | "wind_spd = comb[wind_spd_cols]\n",
255 | "\n",
256 | "atmp_cols = [x for x in comb.columns if x.startswith('atmos_press')]\n",
257 | "atmp = comb[atmp_cols]\n",
258 | "\n",
259 | "fill_cols = comb.columns"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "metadata": {
266 | "collapsed": true,
267 | "id": "qdVrf3ZmcFZn"
268 | },
269 | "outputs": [],
270 | "source": [
271 | "# Generating new features, by adding each variable per hour\n",
272 | "for x, y, z, a, b in zip(temp.columns, precip.columns, humid.columns, wind_spd.columns, atmp.columns):\n",
273 | " comb['add_tp' + y[-4:]] = temp[x] + precip[y] + humid[z] + wind_spd[a] + atmp[b]"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {
280 | "collapsed": true,
281 | "id": "IU2Z7uDigrgW"
282 | },
283 | "outputs": [],
284 | "source": [
285 | "# Filling missing values using forward fill\n",
286 | "comb = comb.ffill(axis=1)"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {
293 | "collapsed": true,
294 | "id": "aQTxY80kZ6rH",
295 | "outputId": "a2e1dfe2-2032-4666-9921-e523a4e566e6"
296 | },
297 | "outputs": [],
298 | "source": [
299 | "comb.head()"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {
306 | "collapsed": true
307 | },
308 | "outputs": [],
309 | "source": [
310 | "def apply_qcut(feat):\n",
311 | " return pd.qcut(comb[feat], 24, labels=False, duplicates='drop')"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "metadata": {
318 | "collapsed": true
319 | },
320 | "outputs": [],
321 | "source": [
322 | "other_features = [x for x in comb.columns if x not in ['separator', 'ID', 'location']]\n",
323 | "\n",
324 | "# Multiprocessing trick: 15 seconds instead of 7 minutes !\n",
325 | "binned_data = joblib.Parallel(n_jobs=-1, backend='multiprocessing')(\n",
326 | " joblib.delayed(apply_qcut)(feat) for feat in tqdm_notebook(other_features))\n",
327 | "\n",
328 | "comb_binned_data = pd.concat(binned_data, axis=1)\n",
329 | "comb = pd.concat([comb[['separator', 'ID', 'location']], comb_binned_data], axis=1)"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "metadata": {
336 | "collapsed": true
337 | },
338 | "outputs": [],
339 | "source": [
340 | "comb.head()"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {
347 | "collapsed": true,
348 | "id": "w-60osCYdgjT",
349 | "outputId": "7dc3c3bd-ef95-4d34-f090-3c7efb05081a"
350 | },
351 | "outputs": [],
352 | "source": [
353 | "# Separating train and test from the combined dataframe\n",
354 | "train = comb[comb.separator == 0]\n",
355 | "test = comb[comb.separator == 1]\n",
356 | "train.drop('separator', axis=1, inplace=True)\n",
357 | "test.drop('separator', axis=1, inplace=True)\n",
358 | "\n",
359 | "# Creating a list of test ids in the order in which the per-location predictions are generated\n",
360 | "testA = test[test.location == 'A']\n",
361 | "testB = test[test.location == 'B']\n",
362 | "testC = test[test.location == 'C']\n",
363 | "testD = test[test.location == 'D']\n",
364 | "testE = test[test.location == 'E']\n",
365 | "\n",
366 | "tA, tD, tE, tBC = testA.ID, testD.ID, testE.ID, test[(test.location == 'B') | (test.location == 'C')].ID\n",
367 | "test_id = pd.concat([tA, tD, tE, tBC])\n",
368 | "\n",
369 | "# Adding back target to the train set\n",
370 | "train['target'] = target"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": null,
376 | "metadata": {
377 | "collapsed": true
378 | },
379 | "outputs": [],
380 | "source": [
381 | "end = time()\n",
382 | "print(f\"Total preprocessing time = {end - start:.1f}\")"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": null,
388 | "metadata": {
389 | "collapsed": true,
390 | "id": "130zceoDarsp"
391 | },
392 | "outputs": [],
393 | "source": [
394 | "%%time\n",
395 | "# Creating X and y values\n",
396 | "X = train.drop(['ID', 'location', 'target'], axis=1)\n",
397 | "y = target.values\n",
398 | "\n",
399 | "# Shuffling the X, y values\n",
400 | "X, y = shuffle(X, y, random_state=0)\n",
401 | "tes = test.drop(['ID', 'location'], axis=1)\n",
402 | "\n",
403 | "# Training the model across multiple seeds\n",
404 | "predictions = []\n",
405 | "for i in tqdm_notebook(range(25)):\n",
406 | " cat = CatBoostRegressor(verbose=False, random_seed=i)\n",
407 | " cat.fit(X, y)\n",
408 | " \n",
409 | " preds = cat.predict(tes)\n",
410 | " predictions.append(preds)\n",
411 | "\n",
412 | "# Averaging the predictions\n",
413 | "avg_preds = np.mean(predictions, axis=0)\n",
414 | "\n",
415 | "# Post processing of the predictions\n",
416 | "# This post processing was done with the help of a validation set.\n",
417 | "# The validation set was adversarial, i.e. we chose the examples from the training set closest to the test set, and applied post processing to it.\n",
418 | "post_proc = [((((((((((x-0.85)*1.015)-0.85)*1.012)-0.75)*1.0095)-0.55)*1.0065)-0.8)*1.007) for x in avg_preds]\n",
419 | "post_proc = predzz = [((x-0.85)*1.015) for x in post_proc]\n",
420 | "\n",
421 | "# Creating a submission file\n",
422 | "sub_df = pd.DataFrame({'ID': test.ID, 'target': post_proc})\n",
423 | "sub_df.to_csv('model_1_1.csv', index=False)"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {
430 | "collapsed": true,
431 | "id": "nrK5eVysjIna",
432 | "outputId": "8846d85a-7616-49b3-b0db-d87fc502c993"
433 | },
434 | "outputs": [],
435 | "source": [
436 | "%%time\n",
437 | "# Creating a list to hold predictions per seed\n",
438 | "predzz = []\n",
439 | "for i in tqdm_notebook(range(25), leave=False):\n",
440 | " # Creating a list to hold predictions per location\n",
441 | " # Training model per location per seed\n",
442 | " predictions = []\n",
443 | " for area in tqdm_notebook(['A', 'D', 'E'], leave=False):\n",
444 | " # Separating training data per location\n",
445 | " X = train[train.location == area]\n",
446 | " y = X.target\n",
447 | " X = X.drop(['ID', 'location', 'target'], axis=1)\n",
448 | "\n",
449 | " # Shuffling data\n",
450 | " X, y = shuffle(X, y, random_state=0)\n",
451 | "\n",
452 | " # Separating testing data per location\n",
453 | " tes = test[test.location == area]\n",
454 | " tes = tes.drop(['ID', 'location'], axis=1)\n",
455 | "\n",
456 | " # Training the model and making predictions per seed, per location\n",
457 | " preds = CatBoostRegressor(verbose=False, random_seed=i).fit(X, y).predict(tes)\n",
458 | " predictions.extend(preds)\n",
459 | "\n",
460 | " X = train[(train.location == 'B') | (train.location == 'C')]\n",
461 | " y = X.target\n",
462 | " X = X.drop(['ID', 'location', 'target'], axis=1)\n",
463 | " X, y = shuffle(X, y, random_state=0)\n",
464 | "\n",
465 | " tes = test[(test.location == 'B') | (test.location == 'C')]\n",
466 | " tes = tes.drop(['ID', 'location'], axis=1)\n",
467 | " preds = CatBoostRegressor(verbose=False, random_seed=i).fit(X, y).predict(tes)\n",
468 | " predictions.extend(preds)\n",
469 | "\n",
470 | " predzz.append(predictions)\n",
471 | "\n",
472 | "# Averaging the predictions\n",
473 | "preds_av = np.mean(predzz, axis=0)\n",
474 | "\n",
475 | "# Post processing of the predictions\n",
476 | "# This post processing was done with the help of a validation set.\n",
477 | "# The validation set was adversarial, i.e. we chose the examples from the training set closest to the test set, and applied post processing to it.\n",
478 | "predz = [((((((((((x-0.85)*1.015)-0.85)*1.012)-0.75)*1.0095)-0.55)*1.0065)-0.8)*1.007) for x in preds_av]\n",
479 | "predzz = [((x-0.85)*1.015) for x in predz]\n",
480 | "\n",
481 | "# Creating a submission file\n",
482 | "sub_df = pd.DataFrame({'ID': test_id, 'target': predzz})\n",
483 | "sub_df.to_csv('model_1_2.csv', index = False)"
484 | ]
485 | },
486 | {
487 | "cell_type": "code",
488 | "execution_count": 48,
489 | "metadata": {},
490 | "outputs": [
491 | {
492 | "data": {
493 | "text/html": [
494 | "\n",
495 | "\n",
508 | "
\n",
509 | " \n",
510 | " \n",
511 | " | \n",
512 | " A | \n",
513 | " B | \n",
514 | "
\n",
515 | " \n",
516 | " \n",
517 | " \n",
518 | " | A | \n",
519 | " 1.000000 | \n",
520 | " 0.979202 | \n",
521 | "
\n",
522 | " \n",
523 | " | B | \n",
524 | " 0.979202 | \n",
525 | " 1.000000 | \n",
526 | "
\n",
527 | " \n",
528 | "
\n",
529 | "
"
530 | ],
531 | "text/plain": [
532 | " A B\n",
533 | "A 1.000000 0.979202\n",
534 | "B 0.979202 1.000000"
535 | ]
536 | },
537 | "execution_count": 48,
538 | "metadata": {},
539 | "output_type": "execute_result"
540 | }
541 | ],
542 | "source": [
543 | "blend_df = pd.read_csv('model_1_1.csv')[['ID']]\n",
544 | "blend_df['A'] = pd.read_csv('model_1_1.csv')['target']\n",
545 | "blend_df = pd.merge(blend_df, pd.read_csv('model_1_2.csv').rename({'target': 'B'}, axis=1), on = 'ID', how = 'left')\n",
546 | "blend_df.corr()"
547 | ]
548 | },
549 | {
550 | "cell_type": "code",
551 | "execution_count": 53,
552 | "metadata": {
553 | "collapsed": true
554 | },
555 | "outputs": [],
556 | "source": [
557 | "blend_df['target'] = blend_df['A']*0.5 + blend_df['B']*0.5\n",
558 | "blend_df[['ID', 'target']].to_csv('model_12_blend.csv', index=False)"
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": 54,
564 | "metadata": {},
565 | "outputs": [
566 | {
567 | "data": {
568 | "text/html": [
569 | "\n",
570 | "\n",
583 | "
\n",
584 | " \n",
585 | " \n",
586 | " | \n",
587 | " A | \n",
588 | " B | \n",
589 | " target | \n",
590 | "
\n",
591 | " \n",
592 | " \n",
593 | " \n",
594 | " | A | \n",
595 | " 1.000000 | \n",
596 | " 0.979202 | \n",
597 | " 0.994761 | \n",
598 | "
\n",
599 | " \n",
600 | " | B | \n",
601 | " 0.979202 | \n",
602 | " 1.000000 | \n",
603 | " 0.994813 | \n",
604 | "
\n",
605 | " \n",
606 | " | target | \n",
607 | " 0.994761 | \n",
608 | " 0.994813 | \n",
609 | " 1.000000 | \n",
610 | "
\n",
611 | " \n",
612 | "
\n",
613 | "
"
614 | ],
615 | "text/plain": [
616 | " A B target\n",
617 | "A 1.000000 0.979202 0.994761\n",
618 | "B 0.979202 1.000000 0.994813\n",
619 | "target 0.994761 0.994813 1.000000"
620 | ]
621 | },
622 | "execution_count": 54,
623 | "metadata": {},
624 | "output_type": "execute_result"
625 | }
626 | ],
627 | "source": [
628 | "blend_df.corr()"
629 | ]
630 | },
631 | {
632 | "cell_type": "code",
633 | "execution_count": 55,
634 | "metadata": {
635 | "collapsed": true
636 | },
637 | "outputs": [],
638 | "source": [
639 | "SUB_FILE_NAME = 'model_12_blend.csv'"
640 | ]
641 | },
642 | {
643 | "cell_type": "code",
644 | "execution_count": 57,
645 | "metadata": {},
646 | "outputs": [
647 | {
648 | "data": {
649 | "text/html": [
650 | "\n",
651 | "\n",
664 | "
\n",
665 | " \n",
666 | " \n",
667 | " | \n",
668 | " ID | \n",
669 | " target | \n",
670 | "
\n",
671 | " \n",
672 | " \n",
673 | " \n",
674 | " | 0 | \n",
675 | " ID_test_0 | \n",
676 | " 158.123774 | \n",
677 | "
\n",
678 | " \n",
679 | " | 1 | \n",
680 | " ID_test_1 | \n",
681 | " 97.217908 | \n",
682 | "
\n",
683 | " \n",
684 | " | 2 | \n",
685 | " ID_test_10 | \n",
686 | " 21.393733 | \n",
687 | "
\n",
688 | " \n",
689 | " | 3 | \n",
690 | " ID_test_100 | \n",
691 | " 63.222891 | \n",
692 | "
\n",
693 | " \n",
694 | " | 4 | \n",
695 | " ID_test_1000 | \n",
696 | " 92.046200 | \n",
697 | "
\n",
698 | " \n",
699 | " | 5 | \n",
700 | " ID_test_1001 | \n",
701 | " 44.955298 | \n",
702 | "
\n",
703 | " \n",
704 | " | 6 | \n",
705 | " ID_test_1002 | \n",
706 | " 83.270765 | \n",
707 | "
\n",
708 | " \n",
709 | " | 7 | \n",
710 | " ID_test_1003 | \n",
711 | " 36.458014 | \n",
712 | "
\n",
713 | " \n",
714 | " | 8 | \n",
715 | " ID_test_1004 | \n",
716 | " 34.101068 | \n",
717 | "
\n",
718 | " \n",
719 | " | 9 | \n",
720 | " ID_test_1005 | \n",
721 | " 47.728921 | \n",
722 | "
\n",
723 | " \n",
724 | "
\n",
725 | "
"
726 | ],
727 | "text/plain": [
728 | " ID target\n",
729 | "0 ID_test_0 158.123774\n",
730 | "1 ID_test_1 97.217908\n",
731 | "2 ID_test_10 21.393733\n",
732 | "3 ID_test_100 63.222891\n",
733 | "4 ID_test_1000 92.046200\n",
734 | "5 ID_test_1001 44.955298\n",
735 | "6 ID_test_1002 83.270765\n",
736 | "7 ID_test_1003 36.458014\n",
737 | "8 ID_test_1004 34.101068\n",
738 | "9 ID_test_1005 47.728921"
739 | ]
740 | },
741 | "execution_count": 57,
742 | "metadata": {},
743 | "output_type": "execute_result"
744 | }
745 | ],
746 | "source": [
747 | "blend_df[['ID', 'target']].head(10)"
748 | ]
749 | },
750 | {
751 | "cell_type": "code",
752 | "execution_count": 56,
753 | "metadata": {},
754 | "outputs": [
755 | {
756 | "data": {
757 | "text/html": [
758 | "Download CSV file"
759 | ],
760 | "text/plain": [
761 | ""
762 | ]
763 | },
764 | "execution_count": 56,
765 | "metadata": {},
766 | "output_type": "execute_result"
767 | }
768 | ],
769 | "source": [
770 | "from IPython.display import HTML\n",
771 | "def create_download_link(title = \"Download CSV file\", filename = \"data.csv\"): \n",
772 | " html = '{title}'\n",
773 | " html = html.format(title=title,filename=filename)\n",
774 | " return HTML(html)\n",
775 | "create_download_link(filename = SUB_FILE_NAME)"
776 | ]
777 | }
778 | ],
779 | "metadata": {
780 | "kernelspec": {
781 | "display_name": "Python 3",
782 | "language": "python",
783 | "name": "python3"
784 | },
785 | "language_info": {
786 | "codemirror_mode": {
787 | "name": "ipython",
788 | "version": 3
789 | },
790 | "file_extension": ".py",
791 | "mimetype": "text/x-python",
792 | "name": "python",
793 | "nbconvert_exporter": "python",
794 | "pygments_lexer": "ipython3",
795 | "version": "3.6.3"
796 | }
797 | },
798 | "nbformat": 4,
799 | "nbformat_minor": 4
800 | }
801 |
--------------------------------------------------------------------------------
/darius-model-2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 25,
6 | "metadata": {
7 | "id": "pk5WiopT9oJw",
8 | "outputId": "f81ff94a-3399-4a56-9365-6c5e54d80782"
9 | },
10 | "outputs": [
11 | {
12 | "name": "stdout",
13 | "output_type": "stream",
14 | "text": [
15 | "\u001b[33mWARNING: You are using pip version 20.1; however, version 20.1.1 is available.\r\n",
16 | "You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.\u001b[0m\r\n"
17 | ]
18 | }
19 | ],
20 | "source": [
21 | "# installing catboost\n",
22 | "# Catboost == 0.22 was the version of catboost at the start of this competition\n",
23 | "!pip install catboost==0.22 --quiet"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 27,
29 | "metadata": {
30 | "collapsed": true,
31 | "id": "jVXG4QlTuVYr"
32 | },
33 | "outputs": [],
34 | "source": [
35 | "# Importing libraries\n",
36 | "import pandas as pd\n",
37 | "import numpy as np\n",
38 | "from lightgbm import LGBMRegressor\n",
39 | "from xgboost import XGBRegressor, XGBRFRegressor\n",
40 | "from sklearn.model_selection import KFold, cross_val_score, train_test_split\n",
41 | "from sklearn.metrics import mean_squared_error\n",
42 | "from sklearn.utils import shuffle\n",
43 | "from tqdm import tqdm, tqdm_notebook\n",
44 | "from functools import reduce\n",
45 | "from catboost import CatBoostRegressor, CatBoostClassifier\n",
46 | "import joblib\n",
47 | "\n",
48 | "\n",
49 | "import warnings\n",
50 | "warnings.filterwarnings('ignore')\n"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 28,
56 | "metadata": {
57 | "collapsed": true,
58 | "id": "agGrUcYwQ7on",
59 | "outputId": "5d4d760c-135f-4e6e-ae14-bc275b174734"
60 | },
61 | "outputs": [],
62 | "source": [
63 | "# Loading data\n",
64 | "train = pd.read_csv('./input/Train.csv')\n",
65 | "test = pd.read_csv('./input/Test.csv')"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 29,
71 | "metadata": {
72 | "collapsed": true,
73 | "id": "jNB_PDTNpxu4"
74 | },
75 | "outputs": [],
76 | "source": [
77 | "# Feature interaction functions\n",
78 | "# There are 4 types of interactions: product interactions, division interactions, sum interactions and difference interactions\n",
79 | "\n",
80 | "def add_prod_interacts(df, inter_cols): \n",
81 | " def apply_interacts(x, inter_cols):\n",
82 | " cols = [x + '_prod_' + c for c in inter_cols[inter_cols.index(x)+1:]]\n",
83 | " interacts_df[cols] = pd.concat([df[x] * df[c] for c in inter_cols[inter_cols.index(x)+1:]], axis=1)\n",
84 | " \n",
85 | " interacts_df = pd.DataFrame()\n",
86 | " _ = df[inter_cols[:-1]].apply(lambda x: apply_interacts(x.name, inter_cols))\n",
87 | " df = pd.concat([df, interacts_df], axis=1)\n",
88 | " return df\n",
89 | "\n",
90 | "\n",
91 | "def add_div_interacts(df, inter_cols): \n",
92 | " def apply_interacts(x, inter_cols):\n",
93 | " cols = [x + '_div_' + c for c in inter_cols[inter_cols.index(x)+1:]]\n",
94 | " interacts_df[cols] = pd.concat([df[x] / df[c] for c in inter_cols[inter_cols.index(x)+1:]], axis=1)\n",
95 | " \n",
96 | " interacts_df = pd.DataFrame()\n",
97 | " _ = df[inter_cols[:-1]].apply(lambda x: apply_interacts(x.name, inter_cols))\n",
98 | " df = pd.concat([df, interacts_df], axis=1)\n",
99 | "\n",
100 | " return df\n",
101 | "\n",
102 | "def add_sum_interacts(df, inter_cols): \n",
103 | " def apply_interacts(x, inter_cols):\n",
104 | " cols = [x + '_sum_' + c for c in inter_cols[inter_cols.index(x)+1:]]\n",
105 | " interacts_df[cols] = pd.concat([df[x] + df[c] for c in inter_cols[inter_cols.index(x)+1:]], axis=1)\n",
106 | " \n",
107 | " interacts_df = pd.DataFrame()\n",
108 | " _ = df[inter_cols[:-1]].apply(lambda x: apply_interacts(x.name, inter_cols))\n",
109 | " df = pd.concat([df, interacts_df], axis=1)\n",
110 | "\n",
111 | " return df\n",
112 | "\n",
113 | "def add_diff_interacts(df, inter_cols): \n",
114 | " def apply_interacts(x, inter_cols):\n",
115 | " cols = [x + '_diff_' + c for c in inter_cols[inter_cols.index(x)+1:]]\n",
116 | " interacts_df[cols] = pd.concat([df[x] - df[c] for c in inter_cols[inter_cols.index(x)+1:]], axis=1)\n",
117 | " \n",
118 | " interacts_df = pd.DataFrame()\n",
119 | " _ = df[inter_cols[:-1]].apply(lambda x: apply_interacts(x.name, inter_cols))\n",
120 | " df = pd.concat([df, interacts_df], axis=1)\n",
121 | "\n",
122 | " return df"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 30,
128 | "metadata": {
129 | "collapsed": true,
130 | "id": "VJjtuM3kvCAT"
131 | },
132 | "outputs": [],
133 | "source": [
134 | "# Loading data\n",
135 | "train = pd.read_csv('./input/Train.csv')\n",
136 | "test = pd.read_csv('./input/Test.csv')"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 31,
142 | "metadata": {
143 | "collapsed": true,
144 | "id": "4MhDKOpYF9dm"
145 | },
146 | "outputs": [],
147 | "source": [
148 | "# Separating the target variable from the training dataframe\n",
149 | "#\n",
150 | "target = train.target\n",
151 | "\n",
152 | "# Aligning the train and test dataframes\n",
153 | "#\n",
154 | "train, test = train.align(test, join = 'inner',axis = 1)\n",
155 | "\n",
156 | "# Adding a separator column to both train and test;\n",
157 | "# it is used later to split the combined dataframe back into train and test\n",
158 | "#\n",
159 | "train['separator'] = 0\n",
160 | "test['separator'] = 1\n",
161 | "# Combining the train and test dataframes together\n",
162 | "#\n",
163 | "comb = pd.concat([train, test])\n",
164 | "\n",
165 | "# Creating a function to replace all spaces in the dataframe with np.nan\n",
166 | "#\n",
167 | "def replace_nan(x):\n",
168 | " if x==\" \":\n",
169 | " return np.nan\n",
170 | " else :\n",
171 | " return float(x)\n",
172 | "\n",
173 | "# Creating a list of the main columns\n",
174 | "#\n",
175 | "main_cols = [\"temp\",\"precip\",\"rel_humidity\",\"wind_dir\",\"wind_spd\",\"atmos_press\"]\n",
176 | "\n",
177 | "# Replacing spaces with np.nan\n",
178 | "#\n",
179 | "for col in main_cols: \n",
180 | " comb[col]=comb[col].apply(lambda x: [ replace_nan(X) for X in x.replace(\"nan\",\" \").split(\",\")])\n",
181 | "\n",
182 | "def make_columns(feature):\n",
183 | " return [f\"{feature}_{i}\" for i in range(1, 122)]\n",
184 | " \n",
185 | "# Generating dataframes of hours for each main column\n",
186 | "#\n",
187 | "comb_temp = pd.DataFrame([x for x in comb.temp], columns=make_columns('temp'))\n",
188 | "comb_precip = pd.DataFrame([x for x in comb.precip], columns=make_columns('precip'))\n",
189 | "comb_rel_humidity = pd.DataFrame([x for x in comb.rel_humidity], columns=make_columns('rel_humidity'))\n",
190 | "comb_wind_dir = pd.DataFrame([x for x in comb.wind_dir], columns=make_columns('wind_dir'))\n",
191 | "comb_wind_spd = pd.DataFrame([x for x in comb.wind_spd], columns=make_columns('wind_spd'))\n",
192 | "comb_atmos_press = pd.DataFrame([x for x in comb.atmos_press], columns=make_columns('atmos_press'))\n",
193 | "\n",
194 | "comb_temp['ID'], comb_precip['ID'], comb_rel_humidity['ID'], comb_wind_dir['ID'], comb_wind_spd['ID'], comb_atmos_press['ID'] = [list(comb.ID)] * 6\n",
195 | "\n",
196 | "# Combining the generated dataframes together\n",
197 | "#\n",
198 | "comb_dfs = [comb, comb_temp, comb_precip, comb_rel_humidity, comb_wind_dir, comb_wind_spd, comb_atmos_press]\n",
199 | "comb = reduce(lambda left,right: pd.merge(left,right,on=['ID'], how='outer'), comb_dfs)\n",
200 | "comb.drop(main_cols, axis = 1, inplace = True)\n",
201 | "df = comb.copy()"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": 32,
207 | "metadata": {
208 | "collapsed": true,
209 | "id": "pEyFVf1lqInY"
210 | },
211 | "outputs": [],
212 | "source": [
213 | "comb = df.copy()"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 33,
219 | "metadata": {
220 | "id": "xzn_w6FYftB8",
221 | "outputId": "9c6b5360-766f-4c1b-c223-c00ceaa834f9"
222 | },
223 | "outputs": [
224 | {
225 | "data": {
226 | "application/vnd.jupyter.widget-view+json": {
227 | "model_id": "999f9adfcfd444af978f061a15a74ea8",
228 | "version_major": 2,
229 | "version_minor": 0
230 | },
231 | "text/plain": [
232 | "HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))"
233 | ]
234 | },
235 | "metadata": {},
236 | "output_type": "display_data"
237 | },
238 | {
239 | "data": {
240 | "application/vnd.jupyter.widget-view+json": {
241 | "model_id": "",
242 | "version_major": 2,
243 | "version_minor": 0
244 | },
245 | "text/plain": [
246 | "HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))"
247 | ]
248 | },
249 | "metadata": {},
250 | "output_type": "display_data"
251 | },
252 | {
253 | "data": {
254 | "application/vnd.jupyter.widget-view+json": {
255 | "model_id": "",
256 | "version_major": 2,
257 | "version_minor": 0
258 | },
259 | "text/plain": [
260 | "HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))"
261 | ]
262 | },
263 | "metadata": {},
264 | "output_type": "display_data"
265 | },
266 | {
267 | "data": {
268 | "application/vnd.jupyter.widget-view+json": {
269 | "model_id": "",
270 | "version_major": 2,
271 | "version_minor": 0
272 | },
273 | "text/plain": [
274 | "HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))"
275 | ]
276 | },
277 | "metadata": {},
278 | "output_type": "display_data"
279 | },
280 | {
281 | "data": {
282 | "application/vnd.jupyter.widget-view+json": {
283 | "model_id": "",
284 | "version_major": 2,
285 | "version_minor": 0
286 | },
287 | "text/plain": [
288 | "HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))"
289 | ]
290 | },
291 | "metadata": {},
292 | "output_type": "display_data"
293 | },
294 | {
295 | "data": {
296 | "application/vnd.jupyter.widget-view+json": {
297 | "model_id": "",
298 | "version_major": 2,
299 | "version_minor": 0
300 | },
301 | "text/plain": [
302 | "HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))"
303 | ]
304 | },
305 | "metadata": {},
306 | "output_type": "display_data"
307 | },
308 | {
309 | "data": {
310 | "application/vnd.jupyter.widget-view+json": {
311 | "model_id": "",
312 | "version_major": 2,
313 | "version_minor": 0
314 | },
315 | "text/plain": [
316 | "HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))"
317 | ]
318 | },
319 | "metadata": {},
320 | "output_type": "display_data"
321 | },
322 | {
323 | "name": "stdout",
324 | "output_type": "stream",
325 | "text": [
326 | "\n"
327 | ]
328 | }
329 | ],
330 | "source": [
331 | "# Adding aggregation features for each variable\n",
332 | "#\n",
333 | "aggs = ['mean', 'std', 'var', 'kurt', 'skew', 'max', 'median', 'sum', 'mode', 'sem', 'min']\n",
334 | "for col in tqdm_notebook(main_cols):\n",
335 | " for ag in tqdm_notebook(aggs, leave = False):\n",
336 | " if ag == 'mode':\n",
337 | " comb[col[0] + col[-1] + '_'+ag] = comb[[x for x in comb.columns if x.startswith(col)]].agg(ag, axis = 1)[0]\n",
338 | " else:\n",
339 | " comb[col[0] + col[-1] + '_'+ag] = comb[[x for x in comb.columns if x.startswith(col)]].agg(ag, axis = 1)\n",
340 | "\n",
341 | "# Creating separate dataframes for each variable\n",
342 | "# Creating a list of columns for each separate dataframe\n",
343 | "#\n",
344 | "temp, temp_cols = comb[[x for x in comb.columns if x.startswith('temp')]], [x for x in comb.columns if x.startswith('temp')]\n",
345 | "precip, precip_cols = comb[[x for x in comb.columns if x.startswith('precip')]], [x for x in comb.columns if x.startswith('precip')]\n",
346 | "humid, humid_cols = comb[[x for x in comb.columns if x.startswith('rel_humidity')]], [x for x in comb.columns if x.startswith('rel_humidity')]\n",
347 | "wind_dir, wind_dir_cols = comb[[x for x in comb.columns if x.startswith('wind_dir')]], [x for x in comb.columns if x.startswith('wind_dir')]\n",
348 | "wind_spd, wind_spd_cols = comb[[x for x in comb.columns if x.startswith('wind_spd')]], [x for x in comb.columns if x.startswith('wind_spd')]\n",
349 | "atmp, atmp_cols = comb[[x for x in comb.columns if x.startswith('atmos_press')]], [x for x in comb.columns if x.startswith('atmos_press')]\n",
350 | "fill_cols = comb.columns"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 34,
356 | "metadata": {
357 | "id": "XHCE-yW1r3q8",
358 | "outputId": "dd3d6870-4a06-4794-c94c-eba320fa9335"
359 | },
360 | "outputs": [
361 | {
362 | "data": {
363 | "text/html": [
364 | "\n",
365 | "\n",
378 | "
\n",
379 | " \n",
380 | " \n",
381 | " | \n",
382 | " ID | \n",
383 | " location | \n",
384 | " separator | \n",
385 | " temp_1 | \n",
386 | " temp_2 | \n",
387 | " temp_3 | \n",
388 | " temp_4 | \n",
389 | " temp_5 | \n",
390 | " temp_6 | \n",
391 | " temp_7 | \n",
392 | " ... | \n",
393 | " as_std | \n",
394 | " as_var | \n",
395 | " as_kurt | \n",
396 | " as_skew | \n",
397 | " as_max | \n",
398 | " as_median | \n",
399 | " as_sum | \n",
400 | " as_mode | \n",
401 | " as_sem | \n",
402 | " as_min | \n",
403 | "
\n",
404 | " \n",
405 | " \n",
406 | " \n",
407 | " | 0 | \n",
408 | " ID_train_0 | \n",
409 | " C | \n",
410 | " 0 | \n",
411 | " NaN | \n",
412 | " NaN | \n",
413 | " NaN | \n",
414 | " NaN | \n",
415 | " NaN | \n",
416 | " NaN | \n",
417 | " NaN | \n",
418 | " ... | \n",
419 | " 0.072682 | \n",
420 | " 0.005283 | \n",
421 | " -0.158696 | \n",
422 | " -0.383144 | \n",
423 | " 87.871667 | \n",
424 | " 87.762083 | \n",
425 | " 1404.038939 | \n",
426 | " 87.614167 | \n",
427 | " 0.018170 | \n",
428 | " 87.614167 | \n",
429 | "
\n",
430 | " \n",
431 | " | 1 | \n",
432 | " ID_train_1 | \n",
433 | " D | \n",
434 | " 0 | \n",
435 | " 22.533333 | \n",
436 | " 21.716667 | \n",
437 | " 20.833333 | \n",
438 | " 20.983333 | \n",
439 | " 20.875000 | \n",
440 | " 20.141667 | \n",
441 | " 19.375000 | \n",
442 | " ... | \n",
443 | " 0.156648 | \n",
444 | " 0.024539 | \n",
445 | " -0.446340 | \n",
446 | " -0.173356 | \n",
447 | " 90.725000 | \n",
448 | " 90.429167 | \n",
449 | " 10942.020833 | \n",
450 | " 90.219167 | \n",
451 | " 0.014241 | \n",
452 | " 90.056667 | \n",
453 | "
\n",
454 | " \n",
455 | " | 2 | \n",
456 | " ID_train_10 | \n",
457 | " A | \n",
458 | " 0 | \n",
459 | " 28.975000 | \n",
460 | " 27.950000 | \n",
461 | " 29.600000 | \n",
462 | " 26.425000 | \n",
463 | " 22.091667 | \n",
464 | " 21.775000 | \n",
465 | " 22.333333 | \n",
466 | " ... | \n",
467 | " 0.180233 | \n",
468 | " 0.032484 | \n",
469 | " -0.227481 | \n",
470 | " -0.243561 | \n",
471 | " 88.813333 | \n",
472 | " 88.425000 | \n",
473 | " 10610.511667 | \n",
474 | " 88.287500 | \n",
475 | " 0.016453 | \n",
476 | " 87.982500 | \n",
477 | "
\n",
478 | " \n",
479 | " | 3 | \n",
480 | " ID_train_100 | \n",
481 | " A | \n",
482 | " 0 | \n",
483 | " 22.966667 | \n",
484 | " 24.266667 | \n",
485 | " 25.275000 | \n",
486 | " 25.625000 | \n",
487 | " 25.866667 | \n",
488 | " 25.091667 | \n",
489 | " 24.025000 | \n",
490 | " ... | \n",
491 | " 0.162430 | \n",
492 | " 0.026384 | \n",
493 | " -0.462889 | \n",
494 | " -0.347610 | \n",
495 | " 88.685000 | \n",
496 | " 88.400000 | \n",
497 | " 10693.606667 | \n",
498 | " 88.271667 | \n",
499 | " 0.014766 | \n",
500 | " 87.965000 | \n",
501 | "
\n",
502 | " \n",
503 | " | 4 | \n",
504 | " ID_train_1000 | \n",
505 | " A | \n",
506 | " 0 | \n",
507 | " 21.875000 | \n",
508 | " 21.575000 | \n",
509 | " 21.525000 | \n",
510 | " 21.433333 | \n",
511 | " 20.508333 | \n",
512 | " 19.916667 | \n",
513 | " 18.991667 | \n",
514 | " ... | \n",
515 | " 0.120393 | \n",
516 | " 0.014494 | \n",
517 | " -0.062557 | \n",
518 | " -0.705667 | \n",
519 | " 88.719167 | \n",
520 | " 88.552500 | \n",
521 | " 2656.143106 | \n",
522 | " 88.268333 | \n",
523 | " 0.021981 | \n",
524 | " 88.268333 | \n",
525 | "
\n",
526 | " \n",
527 | "
\n",
528 | "
5 rows × 795 columns
\n",
529 | "
"
530 | ],
531 | "text/plain": [
532 | " ID location separator temp_1 temp_2 temp_3 \\\n",
533 | "0 ID_train_0 C 0 NaN NaN NaN \n",
534 | "1 ID_train_1 D 0 22.533333 21.716667 20.833333 \n",
535 | "2 ID_train_10 A 0 28.975000 27.950000 29.600000 \n",
536 | "3 ID_train_100 A 0 22.966667 24.266667 25.275000 \n",
537 | "4 ID_train_1000 A 0 21.875000 21.575000 21.525000 \n",
538 | "\n",
539 | " temp_4 temp_5 temp_6 temp_7 ... as_std as_var \\\n",
540 | "0 NaN NaN NaN NaN ... 0.072682 0.005283 \n",
541 | "1 20.983333 20.875000 20.141667 19.375000 ... 0.156648 0.024539 \n",
542 | "2 26.425000 22.091667 21.775000 22.333333 ... 0.180233 0.032484 \n",
543 | "3 25.625000 25.866667 25.091667 24.025000 ... 0.162430 0.026384 \n",
544 | "4 21.433333 20.508333 19.916667 18.991667 ... 0.120393 0.014494 \n",
545 | "\n",
546 | " as_kurt as_skew as_max as_median as_sum as_mode \\\n",
547 | "0 -0.158696 -0.383144 87.871667 87.762083 1404.038939 87.614167 \n",
548 | "1 -0.446340 -0.173356 90.725000 90.429167 10942.020833 90.219167 \n",
549 | "2 -0.227481 -0.243561 88.813333 88.425000 10610.511667 88.287500 \n",
550 | "3 -0.462889 -0.347610 88.685000 88.400000 10693.606667 88.271667 \n",
551 | "4 -0.062557 -0.705667 88.719167 88.552500 2656.143106 88.268333 \n",
552 | "\n",
553 | " as_sem as_min \n",
554 | "0 0.018170 87.614167 \n",
555 | "1 0.014241 90.056667 \n",
556 | "2 0.016453 87.982500 \n",
557 | "3 0.014766 87.965000 \n",
558 | "4 0.021981 88.268333 \n",
559 | "\n",
560 | "[5 rows x 795 columns]"
561 | ]
562 | },
563 | "execution_count": 34,
564 | "metadata": {},
565 | "output_type": "execute_result"
566 | }
567 | ],
568 | "source": [
569 | "# Previewing the head of the generated dataframe\n",
570 | "#\n",
571 | "comb.head()"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": 35,
577 | "metadata": {
578 | "id": "v4DlbAzKp_c8",
579 | "outputId": "26c4e9d8-68ae-4ed1-8e9a-e9a009f7343c"
580 | },
581 | "outputs": [
582 | {
583 | "data": {
584 | "application/vnd.jupyter.widget-view+json": {
585 | "model_id": "27712189b04b46b68969a0482c5243a5",
586 | "version_major": 2,
587 | "version_minor": 0
588 | },
589 | "text/plain": [
590 | "HBox(children=(FloatProgress(value=0.0, max=795.0), HTML(value='')))"
591 | ]
592 | },
593 | "metadata": {},
594 | "output_type": "display_data"
595 | },
596 | {
597 | "name": "stdout",
598 | "output_type": "stream",
599 | "text": [
600 | "\n"
601 | ]
602 | },
603 | {
604 | "data": {
605 | "application/vnd.jupyter.widget-view+json": {
606 | "model_id": "",
607 | "version_major": 2,
608 | "version_minor": 0
609 | },
610 | "text/plain": [
611 | "HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))"
612 | ]
613 | },
614 | "metadata": {},
615 | "output_type": "display_data"
616 | }
617 | ],
618 | "source": [
619 | "# Creating a list of columns containing aggregates only\n",
620 | "cols_mean = []\n",
621 | "cols_max = []\n",
622 | "cols_min = []\n",
623 | "for x in tqdm_notebook(comb.columns):\n",
624 | " if 'mean' in x:\n",
625 | " cols_mean.append(x)\n",
626 | " elif 'max' in x:\n",
627 | " cols_max.append(x)\n",
628 | " elif 'min' in x:\n",
629 | " cols_min.append(x)\n",
630 | " else:\n",
631 | " pass\n",
632 | "\n",
633 | "# Generating feature interactions between aggregates only\n",
634 | "for num_cols in tqdm_notebook([cols_mean, cols_max, cols_min], leave = False):\n",
635 | " comb = add_prod_interacts(comb, num_cols)\n",
636 | " comb = add_div_interacts(comb, num_cols)\n",
637 | " comb = add_diff_interacts(comb, num_cols)"
638 | ]
639 | },
640 | {
641 | "cell_type": "code",
642 | "execution_count": 36,
643 | "metadata": {
644 | "collapsed": true,
645 | "id": "qdVrf3ZmcFZn"
646 | },
647 | "outputs": [],
648 | "source": [
649 | "# Generating new features, by adding each variable per hour\n",
650 | "for x, y, z, a, b in zip(temp.columns, precip.columns, humid.columns, wind_spd.columns, atmp.columns):\n",
651 | " comb['add_tp' +y[-4:]] = temp[x] + precip[y] + humid[z] + wind_spd[a] + atmp[b]"
652 | ]
653 | },
654 | {
655 | "cell_type": "code",
656 | "execution_count": 37,
657 | "metadata": {
658 | "collapsed": true,
659 | "id": "IU2Z7uDigrgW"
660 | },
661 | "outputs": [],
662 | "source": [
663 | "# Filling missing values using forward fill along each row (axis = 1)\n",
664 | "comb = comb.ffill(axis = 1)"
665 | ]
666 | },
667 | {
668 | "cell_type": "code",
669 | "execution_count": 38,
670 | "metadata": {
671 | "collapsed": true,
672 | "id": "YnsfoPv_R5yN"
673 | },
674 | "outputs": [],
675 | "source": [
676 | "# Calculating the difference between features per each dataframe\n",
677 | "\n",
678 | "dfs = [temp, precip, humid, wind_spd, atmp]\n",
679 | "\n",
680 | "diff_dfs = []\n",
681 | "for i in range(5):\n",
682 | " i = dfs[i]\n",
683 | " name = str(i.columns[0].split('_')[0])\n",
684 | " temp_df = i.diff(axis = 1).values\n",
685 | " temp_df = pd.DataFrame(temp_df, columns=['diff_' +name + '_' + str(i) for i in range(1, 122)])\n",
686 | " diff_dfs.append(temp_df)\n",
687 | "\n",
688 | "diff_dfs.append(comb)\n",
689 | "comb = reduce(lambda left,right: pd.merge(left,right, right_index=True, left_index=True, how='outer'), diff_dfs)"
690 | ]
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": 39,
695 | "metadata": {
696 | "id": "TUeOD_AMUMyQ",
697 | "outputId": "ebd84e15-b073-45aa-d096-c2f270ce2e10"
698 | },
699 | "outputs": [
700 | {
701 | "data": {
702 | "text/html": [
703 | "\n",
704 | "\n",
717 | "
\n",
718 | " \n",
719 | " \n",
720 | " | \n",
721 | " diff_temp_1 | \n",
722 | " diff_temp_2 | \n",
723 | " diff_temp_3 | \n",
724 | " diff_temp_4 | \n",
725 | " diff_temp_5 | \n",
726 | " diff_temp_6 | \n",
727 | " diff_temp_7 | \n",
728 | " diff_temp_8 | \n",
729 | " diff_temp_9 | \n",
730 | " diff_temp_10 | \n",
731 | " ... | \n",
732 | " add_tp_112 | \n",
733 | " add_tp_113 | \n",
734 | " add_tp_114 | \n",
735 | " add_tp_115 | \n",
736 | " add_tp_116 | \n",
737 | " add_tp_117 | \n",
738 | " add_tp_118 | \n",
739 | " add_tp_119 | \n",
740 | " add_tp_120 | \n",
741 | " add_tp_121 | \n",
742 | "
\n",
743 | " \n",
744 | " \n",
745 | " \n",
746 | " | 0 | \n",
747 | " NaN | \n",
748 | " NaN | \n",
749 | " NaN | \n",
750 | " NaN | \n",
751 | " NaN | \n",
752 | " NaN | \n",
753 | " NaN | \n",
754 | " NaN | \n",
755 | " NaN | \n",
756 | " NaN | \n",
757 | " ... | \n",
758 | " 111.009 | \n",
759 | " 110.396 | \n",
760 | " 109.991 | \n",
761 | " 110.031 | \n",
762 | " 110.085 | \n",
763 | " 109.618 | \n",
764 | " 109.161 | \n",
765 | " 108.627 | \n",
766 | " 108.839 | \n",
767 | " 109.081 | \n",
768 | "
\n",
769 | " \n",
770 | " | 1 | \n",
771 | " NaN | \n",
772 | " -0.816667 | \n",
773 | " -0.883333 | \n",
774 | " 0.150000 | \n",
775 | " -0.108333 | \n",
776 | " -0.733333 | \n",
777 | " -0.766667 | \n",
778 | " -0.583333 | \n",
779 | " -0.016667 | \n",
780 | " -0.133333 | \n",
781 | " ... | \n",
782 | " 118.268 | \n",
783 | " 119.334 | \n",
784 | " 120.191 | \n",
785 | " 122.078 | \n",
786 | " 122.734 | \n",
787 | " 123.072 | \n",
788 | " 121.885 | \n",
789 | " 119.338 | \n",
790 | " 118.589 | \n",
791 | " 114.357 | \n",
792 | "
\n",
793 | " \n",
794 | " | 2 | \n",
795 | " NaN | \n",
796 | " -1.025000 | \n",
797 | " 1.650000 | \n",
798 | " -3.175000 | \n",
799 | " -4.333333 | \n",
800 | " -0.316667 | \n",
801 | " 0.558333 | \n",
802 | " -0.383333 | \n",
803 | " -1.508333 | \n",
804 | " 0.000000 | \n",
805 | " ... | \n",
806 | " 111.332 | \n",
807 | " 110.389 | \n",
808 | " 110.174 | \n",
809 | " 110.092 | \n",
810 | " 110.746 | \n",
811 | " 113.401 | \n",
812 | " 116.527 | \n",
813 | " 118.449 | \n",
814 | " 119.659 | \n",
815 | " 120.692 | \n",
816 | "
\n",
817 | " \n",
818 | " | 3 | \n",
819 | " NaN | \n",
820 | " 1.300000 | \n",
821 | " 1.008333 | \n",
822 | " 0.350000 | \n",
823 | " 0.241667 | \n",
824 | " -0.775000 | \n",
825 | " -1.066667 | \n",
826 | " -1.600000 | \n",
827 | " -2.191667 | \n",
828 | " -2.816667 | \n",
829 | " ... | \n",
830 | " 109.088 | \n",
831 | " 108.214 | \n",
832 | " 108.176 | \n",
833 | " 107.477 | \n",
834 | " 107.332 | \n",
835 | " 108.305 | \n",
836 | " 110.405 | \n",
837 | " 112.255 | \n",
838 | " 114.158 | \n",
839 | " 115.68 | \n",
840 | "
\n",
841 | " \n",
842 | " | 4 | \n",
843 | " NaN | \n",
844 | " -0.300000 | \n",
845 | " -0.050000 | \n",
846 | " -0.091667 | \n",
847 | " -0.925000 | \n",
848 | " -0.591667 | \n",
849 | " -0.925000 | \n",
850 | " -0.400000 | \n",
851 | " -0.541667 | \n",
852 | " -0.133333 | \n",
853 | " ... | \n",
854 | " 110.649 | \n",
855 | " 110.649 | \n",
856 | " 110.649 | \n",
857 | " 110.649 | \n",
858 | " 110.649 | \n",
859 | " 110.649 | \n",
860 | " 110.649 | \n",
861 | " 110.649 | \n",
862 | " 110.649 | \n",
863 | " 110.649 | \n",
864 | "
\n",
865 | " \n",
866 | "
\n",
867 | "
5 rows × 1656 columns
\n",
868 | "
"
869 | ],
870 | "text/plain": [
871 | " diff_temp_1 diff_temp_2 diff_temp_3 diff_temp_4 diff_temp_5 \\\n",
872 | "0 NaN NaN NaN NaN NaN \n",
873 | "1 NaN -0.816667 -0.883333 0.150000 -0.108333 \n",
874 | "2 NaN -1.025000 1.650000 -3.175000 -4.333333 \n",
875 | "3 NaN 1.300000 1.008333 0.350000 0.241667 \n",
876 | "4 NaN -0.300000 -0.050000 -0.091667 -0.925000 \n",
877 | "\n",
878 | " diff_temp_6 diff_temp_7 diff_temp_8 diff_temp_9 diff_temp_10 ... \\\n",
879 | "0 NaN NaN NaN NaN NaN ... \n",
880 | "1 -0.733333 -0.766667 -0.583333 -0.016667 -0.133333 ... \n",
881 | "2 -0.316667 0.558333 -0.383333 -1.508333 0.000000 ... \n",
882 | "3 -0.775000 -1.066667 -1.600000 -2.191667 -2.816667 ... \n",
883 | "4 -0.591667 -0.925000 -0.400000 -0.541667 -0.133333 ... \n",
884 | "\n",
885 | " add_tp_112 add_tp_113 add_tp_114 add_tp_115 add_tp_116 add_tp_117 \\\n",
886 | "0 111.009 110.396 109.991 110.031 110.085 109.618 \n",
887 | "1 118.268 119.334 120.191 122.078 122.734 123.072 \n",
888 | "2 111.332 110.389 110.174 110.092 110.746 113.401 \n",
889 | "3 109.088 108.214 108.176 107.477 107.332 108.305 \n",
890 | "4 110.649 110.649 110.649 110.649 110.649 110.649 \n",
891 | "\n",
892 | " add_tp_118 add_tp_119 add_tp_120 add_tp_121 \n",
893 | "0 109.161 108.627 108.839 109.081 \n",
894 | "1 121.885 119.338 118.589 114.357 \n",
895 | "2 116.527 118.449 119.659 120.692 \n",
896 | "3 110.405 112.255 114.158 115.68 \n",
897 | "4 110.649 110.649 110.649 110.649 \n",
898 | "\n",
899 | "[5 rows x 1656 columns]"
900 | ]
901 | },
902 | "execution_count": 39,
903 | "metadata": {},
904 | "output_type": "execute_result"
905 | }
906 | ],
907 | "source": [
908 | "comb.head()"
909 | ]
910 | },
911 | {
912 | "cell_type": "code",
913 | "execution_count": 40,
914 | "metadata": {
915 | "collapsed": true
916 | },
917 | "outputs": [],
918 | "source": [
919 | "def apply_qcut(feat):\n",
920 | " return pd.qcut(comb[feat], 24, labels=False, duplicates='drop')"
921 | ]
922 | },
923 | {
924 | "cell_type": "code",
925 | "execution_count": 41,
926 | "metadata": {
927 | "id": "Z1eJlRcDUdrq",
928 | "outputId": "4b5cf990-88bb-4147-b6cc-27f8f9af991a"
929 | },
930 | "outputs": [
931 | {
932 | "data": {
933 | "application/vnd.jupyter.widget-view+json": {
934 | "model_id": "3567c0b91c18424a881e581f27602992",
935 | "version_major": 2,
936 | "version_minor": 0
937 | },
938 | "text/plain": [
939 | "HBox(children=(FloatProgress(value=0.0, max=1653.0), HTML(value='')))"
940 | ]
941 | },
942 | "metadata": {},
943 | "output_type": "display_data"
944 | },
945 | {
946 | "name": "stdout",
947 | "output_type": "stream",
948 | "text": [
949 | "\n",
950 | "CPU times: user 2.11 s, sys: 1.36 s, total: 3.48 s\n",
951 | "Wall time: 18.5 s\n"
952 | ]
953 | }
954 | ],
955 | "source": [
956 | "%%time\n",
957 | "# Binning features\n",
958 | "other_features = [x for x in comb.columns if x not in ['separator', 'ID', 'location']]\n",
959 | "\n",
960 | "# Multiprocessing trick: 15 seconds instead of 7 minutes !\n",
961 | "binned_data = joblib.Parallel(n_jobs=-1, backend='multiprocessing')(\n",
962 | " joblib.delayed(apply_qcut)(feat) for feat in tqdm_notebook(other_features))\n",
963 | "\n",
964 | "comb_binned_data = pd.concat(binned_data, axis=1)\n",
965 | "comb = pd.concat([comb[['separator', 'ID', 'location']], comb_binned_data], axis=1)\n",
966 | "\n",
967 | "# Separating train and test from the combined dataframe\n",
968 | "train = comb[comb.separator == 0]\n",
969 | "test = comb[comb.separator == 1]\n",
970 | "train.drop('separator', axis = 1, inplace = True)\n",
971 | "test.drop('separator', axis = 1, inplace = True)\n",
972 | "\n",
973 | "# Creating a list of test ids in the order that they will be trained\n",
974 | "testA = test[test.location == 'A']\n",
975 | "testB = test[test.location == 'B']\n",
976 | "testC = test[test.location == 'C']\n",
977 | "testD = test[test.location == 'D']\n",
978 | "testE = test[test.location == 'E']\n",
979 | "\n",
980 | "tA, tD, tE, tBC = testA.ID, testD.ID, testE.ID, test[(test.location == 'B') | (test.location == 'C')].ID\n",
981 | "test_id = pd.concat([tA, tD, tE, tBC])\n",
982 | "\n",
983 | "# Adding back target to the train set\n",
984 | "train['target'] = target"
985 | ]
986 | },
987 | {
988 | "cell_type": "code",
989 | "execution_count": 42,
990 | "metadata": {
991 | "id": "95P-K-ZljXYv",
992 | "outputId": "0e8d98eb-fbcb-4100-a4a8-fc434398a399"
993 | },
994 | "outputs": [
995 | {
996 | "data": {
997 | "application/vnd.jupyter.widget-view+json": {
998 | "model_id": "fd94cfe7d8b84087bd429024bb637184",
999 | "version_major": 2,
1000 | "version_minor": 0
1001 | },
1002 | "text/plain": [
1003 | "HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))"
1004 | ]
1005 | },
1006 | "metadata": {},
1007 | "output_type": "display_data"
1008 | },
1009 | {
1010 | "name": "stdout",
1011 | "output_type": "stream",
1012 | "text": [
1013 | "\n",
1014 | "CPU times: user 2h 35min 4s, sys: 5min 2s, total: 2h 40min 7s\n",
1015 | "Wall time: 42min 26s\n"
1016 | ]
1017 | }
1018 | ],
1019 | "source": [
1020 | "%%time\n",
1021 | "# Creating X and y values\n",
1022 | "X = train.drop(['ID', 'location', 'target'], axis = 1)\n",
1023 | "y = target.values\n",
1024 | "\n",
1025 | "# Shuffling the X, y values\n",
1026 | "X, y = shuffle(X, y, random_state = 0)\n",
1027 | "tes = test.drop(['ID', 'location'], axis = 1)\n",
1028 | "\n",
1029 | "# Training the model across multiple seeds\n",
1030 | "predictions = []\n",
1031 | "for i in tqdm_notebook(range(25)):\n",
1032 | " cat = CatBoostRegressor(verbose = False, random_seed=i)\n",
1033 | " cat.fit(X, y)\n",
1034 | "\n",
1035 | " preds = cat.predict(tes)\n",
1036 | " predictions.append(preds)\n",
1037 | "\n",
1038 | "# Averaging the predictions\n",
1039 | "avg_preds = np.mean(predictions, axis = 0)\n",
1040 | "\n",
1041 | "# Post processing of the predictions\n",
1042 | "post_proc = [((((((((((x-0.85)*1.015)-0.85)*1.012)-0.75)*1.0095)-0.55)*1.0065)-0.8)*1.007) for x in avg_preds]\n",
1043 | "post_proc = predzz = [((x-0.85)*1.015) for x in post_proc]\n",
1044 | "\n",
1045 | "# Creating a submission file\n",
1046 | "sub_df = pd.DataFrame({'ID': test.ID, 'target':post_proc})\n",
1047 | "sub_df.to_csv('model_2.csv', index = False)"
1048 | ]
1049 | },
1050 | {
1051 | "cell_type": "code",
1052 | "execution_count": 43,
1053 | "metadata": {},
1054 | "outputs": [
1055 | {
1056 | "data": {
1057 | "text/html": [
1058 | "\n",
1059 | "\n",
1072 | "
\n",
1073 | " \n",
1074 | " \n",
1075 | " | \n",
1076 | " ID | \n",
1077 | " target | \n",
1078 | "
\n",
1079 | " \n",
1080 | " \n",
1081 | " \n",
1082 | " | 15539 | \n",
1083 | " ID_test_0 | \n",
1084 | " 154.512605 | \n",
1085 | "
\n",
1086 | " \n",
1087 | " | 15540 | \n",
1088 | " ID_test_1 | \n",
1089 | " 117.225585 | \n",
1090 | "
\n",
1091 | " \n",
1092 | " | 15541 | \n",
1093 | " ID_test_10 | \n",
1094 | " 26.247779 | \n",
1095 | "
\n",
1096 | " \n",
1097 | " | 15542 | \n",
1098 | " ID_test_100 | \n",
1099 | " 63.167372 | \n",
1100 | "
\n",
1101 | " \n",
1102 | " | 15543 | \n",
1103 | " ID_test_1000 | \n",
1104 | " 92.044408 | \n",
1105 | "
\n",
1106 | " \n",
1107 | " | 15544 | \n",
1108 | " ID_test_1001 | \n",
1109 | " 41.451966 | \n",
1110 | "
\n",
1111 | " \n",
1112 | " | 15545 | \n",
1113 | " ID_test_1002 | \n",
1114 | " 84.857269 | \n",
1115 | "
\n",
1116 | " \n",
1117 | " | 15546 | \n",
1118 | " ID_test_1003 | \n",
1119 | " 37.807807 | \n",
1120 | "
\n",
1121 | " \n",
1122 | " | 15547 | \n",
1123 | " ID_test_1004 | \n",
1124 | " 30.820292 | \n",
1125 | "
\n",
1126 | " \n",
1127 | " | 15548 | \n",
1128 | " ID_test_1005 | \n",
1129 | " 45.990773 | \n",
1130 | "
\n",
1131 | " \n",
1132 | "
\n",
1133 | "
"
1134 | ],
1135 | "text/plain": [
1136 | " ID target\n",
1137 | "15539 ID_test_0 154.512605\n",
1138 | "15540 ID_test_1 117.225585\n",
1139 | "15541 ID_test_10 26.247779\n",
1140 | "15542 ID_test_100 63.167372\n",
1141 | "15543 ID_test_1000 92.044408\n",
1142 | "15544 ID_test_1001 41.451966\n",
1143 | "15545 ID_test_1002 84.857269\n",
1144 | "15546 ID_test_1003 37.807807\n",
1145 | "15547 ID_test_1004 30.820292\n",
1146 | "15548 ID_test_1005 45.990773"
1147 | ]
1148 | },
1149 | "execution_count": 43,
1150 | "metadata": {},
1151 | "output_type": "execute_result"
1152 | }
1153 | ],
1154 | "source": [
1155 | "sub_df.head(10)"
1156 | ]
1157 | },
1158 | {
1159 | "cell_type": "code",
1160 | "execution_count": null,
1161 | "metadata": {
1162 | "collapsed": true
1163 | },
1164 | "outputs": [],
1165 | "source": []
1166 | }
1167 | ],
1168 | "metadata": {
1169 | "kernelspec": {
1170 | "display_name": "Python 3",
1171 | "language": "python",
1172 | "name": "python3"
1173 | },
1174 | "language_info": {
1175 | "codemirror_mode": {
1176 | "name": "ipython",
1177 | "version": 3
1178 | },
1179 | "file_extension": ".py",
1180 | "mimetype": "text/x-python",
1181 | "name": "python",
1182 | "nbconvert_exporter": "python",
1183 | "pygments_lexer": "ipython3",
1184 | "version": "3.6.3"
1185 | }
1186 | },
1187 | "nbformat": 4,
1188 | "nbformat_minor": 4
1189 | }
1190 |
--------------------------------------------------------------------------------
/zindi-airqo-cnn-quick.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 28,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "### CNN output may vary on every run due to reproducibility issues\n",
12 | "### Using Kaggle GPU it takes only 22 minutes to run.\n",
13 | "### CNN is the most feasible option for implementation as the data size grows.\n",
14 | "### It does not require any kind of feature engineering - CNN does auto feature engineering.\n",
15 | "### GPU makes crunching a big dataset easier and faster.\n",
16 | "### CNN performance improves as the data increases - Also CNN has a really good CV score.\n",
17 | "\n",
18 | "# -> Our CNN can easily handle different features. So even if new meteorological features are added no worries !!!"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {
25 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
26 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
27 | "collapsed": true
28 | },
29 | "outputs": [],
30 | "source": [
31 | "import pandas as pd \n",
32 | "import numpy as np \n",
33 | "from tqdm.notebook import tqdm\n",
34 | "import math\n",
35 | "import gc\n",
36 | "from sklearn.preprocessing import LabelEncoder\n",
37 | "import matplotlib.pyplot as plt\n",
38 | "from sklearn.model_selection import KFold, train_test_split\n",
39 | "\n",
40 | "import os\n",
41 | "import shutil\n",
42 | "import datetime\n",
43 | "from tqdm import tqdm, tqdm_notebook\n",
44 | "\n",
45 | "%matplotlib inline\n",
46 | "\n",
47 | "pd.set_option(\"display.max_rows\", 200)\n",
48 | "pd.set_option(\"display.max_columns\", 200)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 2,
54 | "metadata": {
55 | "collapsed": true
56 | },
57 | "outputs": [],
58 | "source": [
59 | "from datetime import datetime\n",
60 | "\n",
61 | "from sklearn.model_selection import train_test_split\n",
62 | "from sklearn.metrics import mean_squared_error\n",
63 | "from sklearn.preprocessing import StandardScaler\n",
64 | "\n",
65 | "# import keras\n",
66 | "import tensorflow as tf\n",
67 | "from tensorflow.keras.models import Sequential, Model\n",
68 | "from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout, Input, BatchNormalization\n",
69 | "from tensorflow.keras.layers import *\n",
70 | "from tensorflow.keras import callbacks, optimizers\n",
71 | "from tensorflow.keras.optimizers import Adam\n",
72 | "import tensorflow.keras.backend as K\n",
73 | "from joblib import Parallel, delayed\n",
74 | "from functools import partial\n",
75 | "import gc\n",
76 | "from tensorflow.keras.layers import MaxPooling1D"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 1,
82 | "metadata": {
83 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
84 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
85 | "collapsed": true
86 | },
87 | "outputs": [],
88 | "source": [
89 | "# !mkdir input\n",
90 | "# !cp /kaggle/input/zindi-airqo/* ./input"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 4,
96 | "metadata": {
97 | "collapsed": true
98 | },
99 | "outputs": [],
100 | "source": [
101 | "# Loading data\n",
102 | "train = pd.read_csv('./input/Train.csv')\n",
103 | "test = pd.read_csv('./input/Test.csv')"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 8,
109 | "metadata": {},
110 | "outputs": [
111 | {
112 | "name": "stderr",
113 | "output_type": "stream",
114 | "text": [
115 | "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:5: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n",
116 | "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n",
117 | " \"\"\"\n"
118 | ]
119 | },
120 | {
121 | "data": {
122 | "application/vnd.jupyter.widget-view+json": {
123 | "model_id": "6e4983244c3c4439be3e2b17ec9374ab",
124 | "version_major": 2,
125 | "version_minor": 0
126 | },
127 | "text/plain": [
128 | "HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))"
129 | ]
130 | },
131 | "metadata": {},
132 | "output_type": "display_data"
133 | },
134 | {
135 | "name": "stdout",
136 | "output_type": "stream",
137 | "text": [
138 | "\n"
139 | ]
140 | },
141 | {
142 | "name": "stderr",
143 | "output_type": "stream",
144 | "text": [
145 | "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:16: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n",
146 | "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n",
147 | " app.launch_new_instance()\n"
148 | ]
149 | },
150 | {
151 | "data": {
152 | "application/vnd.jupyter.widget-view+json": {
153 | "model_id": "266970b55163443f9c9029b908024b24",
154 | "version_major": 2,
155 | "version_minor": 0
156 | },
157 | "text/plain": [
158 | "HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))"
159 | ]
160 | },
161 | "metadata": {},
162 | "output_type": "display_data"
163 | },
164 | {
165 | "name": "stdout",
166 | "output_type": "stream",
167 | "text": [
168 | "\n",
169 | "(20574, 857)\n",
170 | "144018\n",
171 | "CPU times: user 22 s, sys: 2.7 s, total: 24.7 s\n",
172 | "Wall time: 33.5 s\n"
173 | ]
174 | },
175 | {
176 | "data": {
177 | "text/html": [
178 | "\n",
179 | "\n",
192 | "
\n",
193 | " \n",
194 | " \n",
195 | " | \n",
196 | " ID | \n",
197 | " location | \n",
198 | " target | \n",
199 | " temp | \n",
200 | " precip | \n",
201 | " rel_humidity | \n",
202 | " wind_spd | \n",
203 | " atmos_press | \n",
204 | " temp_0 | \n",
205 | " temp_1 | \n",
206 | " temp_2 | \n",
207 | " temp_3 | \n",
208 | " temp_4 | \n",
209 | " temp_5 | \n",
210 | " temp_6 | \n",
211 | " temp_7 | \n",
212 | " temp_8 | \n",
213 | " temp_9 | \n",
214 | " temp_10 | \n",
215 | " temp_11 | \n",
216 | " temp_12 | \n",
217 | " temp_13 | \n",
218 | " temp_14 | \n",
219 | " temp_15 | \n",
220 | " temp_16 | \n",
221 | " temp_17 | \n",
222 | " temp_18 | \n",
223 | " temp_19 | \n",
224 | " temp_20 | \n",
225 | " temp_21 | \n",
226 | " temp_22 | \n",
227 | " temp_23 | \n",
228 | " temp_24 | \n",
229 | " temp_25 | \n",
230 | " temp_26 | \n",
231 | " temp_27 | \n",
232 | " temp_28 | \n",
233 | " temp_29 | \n",
234 | " temp_30 | \n",
235 | " temp_31 | \n",
236 | " temp_32 | \n",
237 | " temp_33 | \n",
238 | " temp_34 | \n",
239 | " temp_35 | \n",
240 | " temp_36 | \n",
241 | " temp_37 | \n",
242 | " temp_38 | \n",
243 | " temp_39 | \n",
244 | " temp_40 | \n",
245 | " temp_41 | \n",
246 | " temp_42 | \n",
247 | " temp_43 | \n",
248 | " temp_44 | \n",
249 | " temp_45 | \n",
250 | " temp_46 | \n",
251 | " temp_47 | \n",
252 | " temp_48 | \n",
253 | " temp_49 | \n",
254 | " temp_50 | \n",
255 | " temp_51 | \n",
256 | " temp_52 | \n",
257 | " temp_53 | \n",
258 | " temp_54 | \n",
259 | " temp_55 | \n",
260 | " temp_56 | \n",
261 | " temp_57 | \n",
262 | " temp_58 | \n",
263 | " temp_59 | \n",
264 | " temp_60 | \n",
265 | " temp_61 | \n",
266 | " temp_62 | \n",
267 | " temp_63 | \n",
268 | " temp_64 | \n",
269 | " temp_65 | \n",
270 | " temp_66 | \n",
271 | " temp_67 | \n",
272 | " temp_68 | \n",
273 | " temp_69 | \n",
274 | " temp_70 | \n",
275 | " temp_71 | \n",
276 | " temp_72 | \n",
277 | " temp_73 | \n",
278 | " temp_74 | \n",
279 | " temp_75 | \n",
280 | " temp_76 | \n",
281 | " temp_77 | \n",
282 | " temp_78 | \n",
283 | " temp_79 | \n",
284 | " temp_80 | \n",
285 | " temp_81 | \n",
286 | " temp_82 | \n",
287 | " temp_83 | \n",
288 | " temp_84 | \n",
289 | " temp_85 | \n",
290 | " temp_86 | \n",
291 | " temp_87 | \n",
292 | " temp_88 | \n",
293 | " temp_89 | \n",
294 | " temp_90 | \n",
295 | " temp_91 | \n",
296 | " ... | \n",
297 | " wind_dir_sin_21 | \n",
298 | " wind_dir_sin_22 | \n",
299 | " wind_dir_sin_23 | \n",
300 | " wind_dir_sin_24 | \n",
301 | " wind_dir_sin_25 | \n",
302 | " wind_dir_sin_26 | \n",
303 | " wind_dir_sin_27 | \n",
304 | " wind_dir_sin_28 | \n",
305 | " wind_dir_sin_29 | \n",
306 | " wind_dir_sin_30 | \n",
307 | " wind_dir_sin_31 | \n",
308 | " wind_dir_sin_32 | \n",
309 | " wind_dir_sin_33 | \n",
310 | " wind_dir_sin_34 | \n",
311 | " wind_dir_sin_35 | \n",
312 | " wind_dir_sin_36 | \n",
313 | " wind_dir_sin_37 | \n",
314 | " wind_dir_sin_38 | \n",
315 | " wind_dir_sin_39 | \n",
316 | " wind_dir_sin_40 | \n",
317 | " wind_dir_sin_41 | \n",
318 | " wind_dir_sin_42 | \n",
319 | " wind_dir_sin_43 | \n",
320 | " wind_dir_sin_44 | \n",
321 | " wind_dir_sin_45 | \n",
322 | " wind_dir_sin_46 | \n",
323 | " wind_dir_sin_47 | \n",
324 | " wind_dir_sin_48 | \n",
325 | " wind_dir_sin_49 | \n",
326 | " wind_dir_sin_50 | \n",
327 | " wind_dir_sin_51 | \n",
328 | " wind_dir_sin_52 | \n",
329 | " wind_dir_sin_53 | \n",
330 | " wind_dir_sin_54 | \n",
331 | " wind_dir_sin_55 | \n",
332 | " wind_dir_sin_56 | \n",
333 | " wind_dir_sin_57 | \n",
334 | " wind_dir_sin_58 | \n",
335 | " wind_dir_sin_59 | \n",
336 | " wind_dir_sin_60 | \n",
337 | " wind_dir_sin_61 | \n",
338 | " wind_dir_sin_62 | \n",
339 | " wind_dir_sin_63 | \n",
340 | " wind_dir_sin_64 | \n",
341 | " wind_dir_sin_65 | \n",
342 | " wind_dir_sin_66 | \n",
343 | " wind_dir_sin_67 | \n",
344 | " wind_dir_sin_68 | \n",
345 | " wind_dir_sin_69 | \n",
346 | " wind_dir_sin_70 | \n",
347 | " wind_dir_sin_71 | \n",
348 | " wind_dir_sin_72 | \n",
349 | " wind_dir_sin_73 | \n",
350 | " wind_dir_sin_74 | \n",
351 | " wind_dir_sin_75 | \n",
352 | " wind_dir_sin_76 | \n",
353 | " wind_dir_sin_77 | \n",
354 | " wind_dir_sin_78 | \n",
355 | " wind_dir_sin_79 | \n",
356 | " wind_dir_sin_80 | \n",
357 | " wind_dir_sin_81 | \n",
358 | " wind_dir_sin_82 | \n",
359 | " wind_dir_sin_83 | \n",
360 | " wind_dir_sin_84 | \n",
361 | " wind_dir_sin_85 | \n",
362 | " wind_dir_sin_86 | \n",
363 | " wind_dir_sin_87 | \n",
364 | " wind_dir_sin_88 | \n",
365 | " wind_dir_sin_89 | \n",
366 | " wind_dir_sin_90 | \n",
367 | " wind_dir_sin_91 | \n",
368 | " wind_dir_sin_92 | \n",
369 | " wind_dir_sin_93 | \n",
370 | " wind_dir_sin_94 | \n",
371 | " wind_dir_sin_95 | \n",
372 | " wind_dir_sin_96 | \n",
373 | " wind_dir_sin_97 | \n",
374 | " wind_dir_sin_98 | \n",
375 | " wind_dir_sin_99 | \n",
376 | " wind_dir_sin_100 | \n",
377 | " wind_dir_sin_101 | \n",
378 | " wind_dir_sin_102 | \n",
379 | " wind_dir_sin_103 | \n",
380 | " wind_dir_sin_104 | \n",
381 | " wind_dir_sin_105 | \n",
382 | " wind_dir_sin_106 | \n",
383 | " wind_dir_sin_107 | \n",
384 | " wind_dir_sin_108 | \n",
385 | " wind_dir_sin_109 | \n",
386 | " wind_dir_sin_110 | \n",
387 | " wind_dir_sin_111 | \n",
388 | " wind_dir_sin_112 | \n",
389 | " wind_dir_sin_113 | \n",
390 | " wind_dir_sin_114 | \n",
391 | " wind_dir_sin_115 | \n",
392 | " wind_dir_sin_116 | \n",
393 | " wind_dir_sin_117 | \n",
394 | " wind_dir_sin_118 | \n",
395 | " wind_dir_sin_119 | \n",
396 | " wind_dir_sin_120 | \n",
397 | "
\n",
398 | " \n",
399 | " \n",
400 | " \n",
401 | " | 0 | \n",
402 | " ID_train_0 | \n",
403 | " C | \n",
404 | " 45.126304 | \n",
405 | " NaN | \n",
406 | " NaN | \n",
407 | " NaN | \n",
408 | " NaN | \n",
409 | " NaN | \n",
410 | " 22.533333 | \n",
411 | " 21.716667 | \n",
412 | " 20.833333 | \n",
413 | " 20.983333 | \n",
414 | " 20.875000 | \n",
415 | " 20.141667 | \n",
416 | " 19.375000 | \n",
417 | " 18.791667 | \n",
418 | " 18.775000 | \n",
419 | " 18.641667 | \n",
420 | " 18.558333 | \n",
421 | " 18.533333 | \n",
422 | " 19.608333 | \n",
423 | " 21.916667 | \n",
424 | " 24.716667 | \n",
425 | " 26.658333 | \n",
426 | " 29.175000 | \n",
427 | " 30.700000 | \n",
428 | " 31.433333 | \n",
429 | " 32.333333 | \n",
430 | " 33.008333 | \n",
431 | " 33.391667 | \n",
432 | " 33.616667 | \n",
433 | " 31.091667 | \n",
434 | " 27.550000 | \n",
435 | " 26.658333 | \n",
436 | " 25.675000 | \n",
437 | " 26.425000 | \n",
438 | " 23.783333 | \n",
439 | " 22.416667 | \n",
440 | " 22.041667 | \n",
441 | " 21.008333 | \n",
442 | " 21.475000 | \n",
443 | " 21.825000 | \n",
444 | " 21.158333 | \n",
445 | " 22.308333 | \n",
446 | " 23.116667 | \n",
447 | " 22.333333 | \n",
448 | " 23.850000 | \n",
449 | " 23.825000 | \n",
450 | " 24.983333 | \n",
451 | " 28.050000 | \n",
452 | " 30.841667 | \n",
453 | " 31.991667 | \n",
454 | " 32.041667 | \n",
455 | " 32.000000 | \n",
456 | " 31.666667 | \n",
457 | " 29.158333 | \n",
458 | " 25.866667 | \n",
459 | " 24.300000 | \n",
460 | " 23.683333 | \n",
461 | " 23.758333 | \n",
462 | " 22.991667 | \n",
463 | " 21.891667 | \n",
464 | " 21.158333 | \n",
465 | " 20.416667 | \n",
466 | " 19.533333 | \n",
467 | " 19.125000 | \n",
468 | " 18.766667 | \n",
469 | " 17.983333 | \n",
470 | " 18.458333 | \n",
471 | " 21.933333 | \n",
472 | " 23.791667 | \n",
473 | " 25.558333 | \n",
474 | " 27.758333 | \n",
475 | " 29.658333 | \n",
476 | " 31.391667 | \n",
477 | " 31.883333 | \n",
478 | " 32.358333 | \n",
479 | " 32.708333 | \n",
480 | " 31.983333 | \n",
481 | " 30.850000 | \n",
482 | " 28.800 | \n",
483 | " 26.491667 | \n",
484 | " 24.866667 | \n",
485 | " 24.366667 | \n",
486 | " 23.025000 | \n",
487 | " 22.325000 | \n",
488 | " 21.650000 | \n",
489 | " 20.750000 | \n",
490 | " 20.475000 | \n",
491 | " 19.641667 | \n",
492 | " 19.516667 | \n",
493 | " 19.575000 | \n",
494 | " 20.000000 | \n",
495 | " 23.358333 | \n",
496 | " 25.608333 | \n",
497 | " 26.883333 | \n",
498 | " 26.358333 | \n",
499 | " 27.325000 | \n",
500 | " 29.008333 | \n",
501 | " 28.433333 | \n",
502 | " ... | \n",
503 | " -0.141620 | \n",
504 | " 0.677692 | \n",
505 | " -0.691042 | \n",
506 | " -0.570434 | \n",
507 | " -0.621553 | \n",
508 | " 0.054916 | \n",
509 | " -0.068411 | \n",
510 | " -0.427471 | \n",
511 | " -0.748119 | \n",
512 | " 0.795807 | \n",
513 | " 0.997073 | \n",
514 | " -0.105681 | \n",
515 | " -0.170296 | \n",
516 | " 0.850561 | \n",
517 | " 0.971100 | \n",
518 | " -0.002620 | \n",
519 | " -0.897881 | \n",
520 | " 0.855572 | \n",
521 | " 0.562276 | \n",
522 | " 0.606049 | \n",
523 | " 0.902810 | \n",
524 | " -0.853162 | \n",
525 | " -0.186615 | \n",
526 | " -0.032798 | \n",
527 | " 0.937824 | \n",
528 | " -0.317277 | \n",
529 | " -0.451505 | \n",
530 | " -0.797416 | \n",
531 | " -0.159037 | \n",
532 | " -0.842704 | \n",
533 | " 0.590000 | \n",
534 | " 0.249485 | \n",
535 | " 0.643785 | \n",
536 | " 0.369674 | \n",
537 | " -0.910274 | \n",
538 | " -0.390897 | \n",
539 | " 0.531411 | \n",
540 | " -0.117671 | \n",
541 | " 0.682072 | \n",
542 | " 0.997411 | \n",
543 | " -0.445040 | \n",
544 | " 0.641554 | \n",
545 | " 0.917998 | \n",
546 | " 0.647900 | \n",
547 | " 0.843400 | \n",
548 | " 0.831011 | \n",
549 | " -0.780758 | \n",
550 | " -0.402907 | \n",
551 | " -0.970055 | \n",
552 | " -0.775132 | \n",
553 | " -0.737531 | \n",
554 | " -0.877086 | \n",
555 | " 0.706791 | \n",
556 | " -0.991253 | \n",
557 | " 0.870372 | \n",
558 | " 0.235040 | \n",
559 | " -0.399780 | \n",
560 | " -0.890181 | \n",
561 | " -0.429068 | \n",
562 | " 0.536505 | \n",
563 | " 0.849812 | \n",
564 | " -0.921784 | \n",
565 | " 0.890076 | \n",
566 | " 0.325722 | \n",
567 | " -0.968946 | \n",
568 | " 0.422955 | \n",
569 | " -0.908680 | \n",
570 | " -0.238531 | \n",
571 | " 0.994556 | \n",
572 | " -0.784712 | \n",
573 | " -0.327369 | \n",
574 | " 0.793215 | \n",
575 | " 0.921165 | \n",
576 | " 0.900036 | \n",
577 | " -0.244630 | \n",
578 | " -0.858170 | \n",
579 | " -0.980876 | \n",
580 | " 0.890335 | \n",
581 | " -0.992381 | \n",
582 | " -0.861434 | \n",
583 | " -0.629305 | \n",
584 | " -0.290513 | \n",
585 | " 0.999024 | \n",
586 | " -0.985627 | \n",
587 | " 0.403930 | \n",
588 | " -0.778758 | \n",
589 | " -0.623189 | \n",
590 | " 0.921356 | \n",
591 | " -0.949976 | \n",
592 | " -0.938622 | \n",
593 | " 0.539989 | \n",
594 | " 0.833571 | \n",
595 | " -0.566473 | \n",
596 | " 0.249451 | \n",
597 | " 0.703657 | \n",
598 | " -0.775581 | \n",
599 | " 0.150566 | \n",
600 | " 0.117369 | \n",
601 | " 0.635717 | \n",
602 | " -0.947955 | \n",
603 | "
\n",
604 | " \n",
605 | " | 1 | \n",
606 | " ID_train_1 | \n",
607 | " D | \n",
608 | " 79.131702 | \n",
609 | " NaN | \n",
610 | " NaN | \n",
611 | " NaN | \n",
612 | " NaN | \n",
613 | " NaN | \n",
614 | " 22.533333 | \n",
615 | " 21.716667 | \n",
616 | " 20.833333 | \n",
617 | " 20.983333 | \n",
618 | " 20.875000 | \n",
619 | " 20.141667 | \n",
620 | " 19.375000 | \n",
621 | " 18.791667 | \n",
622 | " 18.775000 | \n",
623 | " 18.641667 | \n",
624 | " 18.558333 | \n",
625 | " 18.533333 | \n",
626 | " 19.608333 | \n",
627 | " 21.916667 | \n",
628 | " 24.716667 | \n",
629 | " 26.658333 | \n",
630 | " 29.175000 | \n",
631 | " 30.700000 | \n",
632 | " 31.433333 | \n",
633 | " 32.333333 | \n",
634 | " 33.008333 | \n",
635 | " 33.391667 | \n",
636 | " 33.616667 | \n",
637 | " 31.091667 | \n",
638 | " 27.550000 | \n",
639 | " 26.658333 | \n",
640 | " 25.675000 | \n",
641 | " 26.425000 | \n",
642 | " 23.783333 | \n",
643 | " 22.416667 | \n",
644 | " 22.041667 | \n",
645 | " 21.008333 | \n",
646 | " 21.475000 | \n",
647 | " 21.825000 | \n",
648 | " 21.158333 | \n",
649 | " 22.308333 | \n",
650 | " 23.116667 | \n",
651 | " 22.333333 | \n",
652 | " 23.850000 | \n",
653 | " 23.825000 | \n",
654 | " 24.983333 | \n",
655 | " 28.050000 | \n",
656 | " 30.841667 | \n",
657 | " 31.991667 | \n",
658 | " 32.041667 | \n",
659 | " 32.000000 | \n",
660 | " 31.666667 | \n",
661 | " 29.158333 | \n",
662 | " 25.866667 | \n",
663 | " 24.300000 | \n",
664 | " 23.683333 | \n",
665 | " 23.758333 | \n",
666 | " 22.991667 | \n",
667 | " 21.891667 | \n",
668 | " 21.158333 | \n",
669 | " 20.416667 | \n",
670 | " 19.533333 | \n",
671 | " 19.125000 | \n",
672 | " 18.766667 | \n",
673 | " 17.983333 | \n",
674 | " 18.458333 | \n",
675 | " 21.933333 | \n",
676 | " 23.791667 | \n",
677 | " 25.558333 | \n",
678 | " 27.758333 | \n",
679 | " 29.658333 | \n",
680 | " 31.391667 | \n",
681 | " 31.883333 | \n",
682 | " 32.358333 | \n",
683 | " 32.708333 | \n",
684 | " 31.983333 | \n",
685 | " 30.850000 | \n",
686 | " 28.800 | \n",
687 | " 26.491667 | \n",
688 | " 24.866667 | \n",
689 | " 24.366667 | \n",
690 | " 23.025000 | \n",
691 | " 22.325000 | \n",
692 | " 21.650000 | \n",
693 | " 20.750000 | \n",
694 | " 20.475000 | \n",
695 | " 19.641667 | \n",
696 | " 19.516667 | \n",
697 | " 19.575000 | \n",
698 | " 20.000000 | \n",
699 | " 23.358333 | \n",
700 | " 25.608333 | \n",
701 | " 26.883333 | \n",
702 | " 26.358333 | \n",
703 | " 27.325000 | \n",
704 | " 29.008333 | \n",
705 | " 28.433333 | \n",
706 | " ... | \n",
707 | " -0.141620 | \n",
708 | " 0.677692 | \n",
709 | " -0.691042 | \n",
710 | " -0.570434 | \n",
711 | " -0.621553 | \n",
712 | " 0.054916 | \n",
713 | " -0.068411 | \n",
714 | " -0.427471 | \n",
715 | " -0.748119 | \n",
716 | " 0.795807 | \n",
717 | " 0.997073 | \n",
718 | " -0.105681 | \n",
719 | " -0.170296 | \n",
720 | " 0.850561 | \n",
721 | " 0.971100 | \n",
722 | " -0.002620 | \n",
723 | " -0.897881 | \n",
724 | " 0.855572 | \n",
725 | " 0.562276 | \n",
726 | " 0.606049 | \n",
727 | " 0.902810 | \n",
728 | " -0.853162 | \n",
729 | " -0.186615 | \n",
730 | " -0.032798 | \n",
731 | " 0.937824 | \n",
732 | " -0.317277 | \n",
733 | " -0.451505 | \n",
734 | " -0.797416 | \n",
735 | " -0.159037 | \n",
736 | " -0.842704 | \n",
737 | " 0.590000 | \n",
738 | " 0.249485 | \n",
739 | " 0.643785 | \n",
740 | " 0.369674 | \n",
741 | " -0.910274 | \n",
742 | " -0.390897 | \n",
743 | " 0.531411 | \n",
744 | " -0.117671 | \n",
745 | " 0.682072 | \n",
746 | " 0.997411 | \n",
747 | " -0.445040 | \n",
748 | " 0.641554 | \n",
749 | " 0.917998 | \n",
750 | " 0.647900 | \n",
751 | " 0.843400 | \n",
752 | " 0.831011 | \n",
753 | " -0.780758 | \n",
754 | " -0.402907 | \n",
755 | " -0.970055 | \n",
756 | " -0.775132 | \n",
757 | " -0.737531 | \n",
758 | " -0.877086 | \n",
759 | " 0.706791 | \n",
760 | " -0.991253 | \n",
761 | " 0.870372 | \n",
762 | " 0.235040 | \n",
763 | " -0.399780 | \n",
764 | " -0.890181 | \n",
765 | " -0.429068 | \n",
766 | " 0.536505 | \n",
767 | " 0.849812 | \n",
768 | " -0.921784 | \n",
769 | " 0.890076 | \n",
770 | " 0.325722 | \n",
771 | " -0.968946 | \n",
772 | " 0.422955 | \n",
773 | " -0.908680 | \n",
774 | " -0.238531 | \n",
775 | " 0.994556 | \n",
776 | " -0.784712 | \n",
777 | " -0.327369 | \n",
778 | " 0.793215 | \n",
779 | " 0.921165 | \n",
780 | " 0.900036 | \n",
781 | " -0.244630 | \n",
782 | " -0.858170 | \n",
783 | " -0.980876 | \n",
784 | " 0.890335 | \n",
785 | " -0.992381 | \n",
786 | " -0.861434 | \n",
787 | " -0.629305 | \n",
788 | " -0.290513 | \n",
789 | " 0.999024 | \n",
790 | " -0.985627 | \n",
791 | " 0.923642 | \n",
792 | " -0.553773 | \n",
793 | " 0.455011 | \n",
794 | " -0.989001 | \n",
795 | " 0.211568 | \n",
796 | " -0.230857 | \n",
797 | " 0.971538 | \n",
798 | " 0.669779 | \n",
799 | " -0.839579 | \n",
800 | " -0.212372 | \n",
801 | " -0.029492 | \n",
802 | " 0.903239 | \n",
803 | " 0.881671 | \n",
804 | " 0.304360 | \n",
805 | " -0.955722 | \n",
806 | " 0.996240 | \n",
807 | "
\n",
808 | " \n",
809 | " | 2 | \n",
810 | " ID_train_10 | \n",
811 | " A | \n",
812 | " 32.661304 | \n",
813 | " NaN | \n",
814 | " NaN | \n",
815 | " NaN | \n",
816 | " NaN | \n",
817 | " NaN | \n",
818 | " 28.975000 | \n",
819 | " 27.950000 | \n",
820 | " 29.600000 | \n",
821 | " 26.425000 | \n",
822 | " 22.091667 | \n",
823 | " 21.775000 | \n",
824 | " 22.333333 | \n",
825 | " 21.950000 | \n",
826 | " 20.441667 | \n",
827 | " 20.441667 | \n",
828 | " 20.950000 | \n",
829 | " 19.800000 | \n",
830 | " 19.591667 | \n",
831 | " 19.575000 | \n",
832 | " 19.516667 | \n",
833 | " 19.550000 | \n",
834 | " 19.783333 | \n",
835 | " 19.908333 | \n",
836 | " 19.516667 | \n",
837 | " 19.666667 | \n",
838 | " 20.683333 | \n",
839 | " 22.491667 | \n",
840 | " 23.708333 | \n",
841 | " 24.900000 | \n",
842 | " 26.058333 | \n",
843 | " 27.325000 | \n",
844 | " 27.866667 | \n",
845 | " 28.291667 | \n",
846 | " 22.136364 | \n",
847 | " 19.141667 | \n",
848 | " 19.991667 | \n",
849 | " 20.216667 | \n",
850 | " 20.433333 | \n",
851 | " 20.466667 | \n",
852 | " 20.800000 | \n",
853 | " 20.958333 | \n",
854 | " 20.641667 | \n",
855 | " 20.033333 | \n",
856 | " 19.825000 | \n",
857 | " 19.616667 | \n",
858 | " 19.225000 | \n",
859 | " 18.716667 | \n",
860 | " 18.458333 | \n",
861 | " 19.000000 | \n",
862 | " 20.758333 | \n",
863 | " 27.358333 | \n",
864 | " 24.491667 | \n",
865 | " 26.350000 | \n",
866 | " 27.950000 | \n",
867 | " 29.166667 | \n",
868 | " 30.066667 | \n",
869 | " 30.550000 | \n",
870 | " 30.400000 | \n",
871 | " 30.008333 | \n",
872 | " 28.908333 | \n",
873 | " 26.700000 | \n",
874 | " 25.716667 | \n",
875 | " 24.858333 | \n",
876 | " 24.291667 | \n",
877 | " 23.308333 | \n",
878 | " 23.008333 | \n",
879 | " 22.041667 | \n",
880 | " 21.375000 | \n",
881 | " 21.133333 | \n",
882 | " 21.158333 | \n",
883 | " 20.658333 | \n",
884 | " 20.766667 | \n",
885 | " 21.633333 | \n",
886 | " 22.708333 | \n",
887 | " 24.808333 | \n",
888 | " 27.108333 | \n",
889 | " 28.775000 | \n",
890 | " 29.475 | \n",
891 | " 29.766667 | \n",
892 | " 29.875000 | \n",
893 | " 29.150000 | \n",
894 | " 27.716667 | \n",
895 | " 27.491667 | \n",
896 | " 26.483333 | \n",
897 | " 25.475000 | \n",
898 | " 25.008333 | \n",
899 | " 24.600000 | \n",
900 | " 24.033333 | \n",
901 | " 23.358333 | \n",
902 | " 22.366667 | \n",
903 | " 22.608333 | \n",
904 | " 22.741667 | \n",
905 | " 21.908333 | \n",
906 | " 21.550000 | \n",
907 | " 21.758333 | \n",
908 | " 20.316667 | \n",
909 | " 20.650000 | \n",
910 | " ... | \n",
911 | " 0.992270 | \n",
912 | " -0.744836 | \n",
913 | " 0.312328 | \n",
914 | " -0.613390 | \n",
915 | " -0.691339 | \n",
916 | " -0.996273 | \n",
917 | " 0.877801 | \n",
918 | " -0.619629 | \n",
919 | " -0.852653 | \n",
920 | " 0.952294 | \n",
921 | " 0.272820 | \n",
922 | " 0.887248 | \n",
923 | " 0.161753 | \n",
924 | " 0.710564 | \n",
925 | " 0.493320 | \n",
926 | " 0.831038 | \n",
927 | " 0.922995 | \n",
928 | " -0.206542 | \n",
929 | " -0.905140 | \n",
930 | " 0.261427 | \n",
931 | " -0.834561 | \n",
932 | " 0.999585 | \n",
933 | " 0.969601 | \n",
934 | " -0.304580 | \n",
935 | " -0.134351 | \n",
936 | " 0.926241 | \n",
937 | " -0.050912 | \n",
938 | " -0.989347 | \n",
939 | " 0.825994 | \n",
940 | " 0.949567 | \n",
941 | " 0.105761 | \n",
942 | " -0.235078 | \n",
943 | " 0.986250 | \n",
944 | " 0.526325 | \n",
945 | " -0.741661 | \n",
946 | " 0.819501 | \n",
947 | " 0.090216 | \n",
948 | " -0.537516 | \n",
949 | " 0.846942 | \n",
950 | " -0.385990 | \n",
951 | " -0.578650 | \n",
952 | " 0.616461 | \n",
953 | " 0.490440 | \n",
954 | " -0.002268 | \n",
955 | " -0.768049 | \n",
956 | " -0.745449 | \n",
957 | " 0.941508 | \n",
958 | " 0.847960 | \n",
959 | " -0.947888 | \n",
960 | " -0.421278 | \n",
961 | " 0.855621 | \n",
962 | " -0.841248 | \n",
963 | " -0.738750 | \n",
964 | " -0.616005 | \n",
965 | " 0.000840 | \n",
966 | " -0.962296 | \n",
967 | " -0.913448 | \n",
968 | " 0.914193 | \n",
969 | " -0.680750 | \n",
970 | " -0.764314 | \n",
971 | " -0.668780 | \n",
972 | " 0.553494 | \n",
973 | " -0.927593 | \n",
974 | " -0.647122 | \n",
975 | " 0.326478 | \n",
976 | " 0.968075 | \n",
977 | " 0.946287 | \n",
978 | " -0.426579 | \n",
979 | " 0.674657 | \n",
980 | " 0.955783 | \n",
981 | " -0.978836 | \n",
982 | " 0.512857 | \n",
983 | " 0.356463 | \n",
984 | " -0.867495 | \n",
985 | " 0.154614 | \n",
986 | " -0.958078 | \n",
987 | " 0.485347 | \n",
988 | " 0.984303 | \n",
989 | " -0.659389 | \n",
990 | " -0.967570 | \n",
991 | " 0.280092 | \n",
992 | " -0.011816 | \n",
993 | " -0.862490 | \n",
994 | " -0.785080 | \n",
995 | " 0.212533 | \n",
996 | " 0.099250 | \n",
997 | " 0.970914 | \n",
998 | " 0.841250 | \n",
999 | " 0.772792 | \n",
1000 | " 0.999893 | \n",
1001 | " -0.620373 | \n",
1002 | " -0.999964 | \n",
1003 | " -0.328710 | \n",
1004 | " 0.549286 | \n",
1005 | " -0.995617 | \n",
1006 | " -0.654751 | \n",
1007 | " 0.089768 | \n",
1008 | " 0.955292 | \n",
1009 | " 0.772715 | \n",
1010 | " -0.939837 | \n",
1011 | "
\n",
1012 | " \n",
1013 | " | 3 | \n",
1014 | " ID_train_100 | \n",
1015 | " A | \n",
1016 | " 53.850238 | \n",
1017 | " NaN | \n",
1018 | " NaN | \n",
1019 | " NaN | \n",
1020 | " NaN | \n",
1021 | " NaN | \n",
1022 | " 22.966667 | \n",
1023 | " 24.266667 | \n",
1024 | " 25.275000 | \n",
1025 | " 25.625000 | \n",
1026 | " 25.866667 | \n",
1027 | " 25.091667 | \n",
1028 | " 24.025000 | \n",
1029 | " 22.425000 | \n",
1030 | " 20.233333 | \n",
1031 | " 17.416667 | \n",
1032 | " 17.391667 | \n",
1033 | " 17.083333 | \n",
1034 | " 17.516667 | \n",
1035 | " 17.825000 | \n",
1036 | " 18.258333 | \n",
1037 | " 17.950000 | \n",
1038 | " 17.575000 | \n",
1039 | " 17.425000 | \n",
1040 | " 17.475000 | \n",
1041 | " 16.941667 | \n",
1042 | " 18.233333 | \n",
1043 | " 21.166667 | \n",
1044 | " 23.708333 | \n",
1045 | " 24.941667 | \n",
1046 | " 25.825000 | \n",
1047 | " 26.691667 | \n",
1048 | " 27.275000 | \n",
1049 | " 27.491667 | \n",
1050 | " 21.136364 | \n",
1051 | " 19.650000 | \n",
1052 | " 19.391667 | \n",
1053 | " 19.116667 | \n",
1054 | " 19.075000 | \n",
1055 | " 19.541667 | \n",
1056 | " 19.450000 | \n",
1057 | " 19.175000 | \n",
1058 | " 18.891667 | \n",
1059 | " 19.058333 | \n",
1060 | " 19.325000 | \n",
1061 | " 19.400000 | \n",
1062 | " 19.458333 | \n",
1063 | " 19.466667 | \n",
1064 | " 19.441667 | \n",
1065 | " 19.408333 | \n",
1066 | " 19.850000 | \n",
1067 | " 22.716667 | \n",
1068 | " 23.091667 | \n",
1069 | " 23.841667 | \n",
1070 | " 25.166667 | \n",
1071 | " 26.875000 | \n",
1072 | " 27.125000 | \n",
1073 | " 27.183333 | \n",
1074 | " 26.500000 | \n",
1075 | " 26.075000 | \n",
1076 | " 23.391667 | \n",
1077 | " 21.558333 | \n",
1078 | " 20.525000 | \n",
1079 | " 18.016667 | \n",
1080 | " 18.333333 | \n",
1081 | " 18.533333 | \n",
1082 | " 18.641667 | \n",
1083 | " 18.808333 | \n",
1084 | " 18.741667 | \n",
1085 | " 18.733333 | \n",
1086 | " 18.575000 | \n",
1087 | " 18.491667 | \n",
1088 | " 18.225000 | \n",
1089 | " 18.250000 | \n",
1090 | " 18.458333 | \n",
1091 | " 19.266667 | \n",
1092 | " 21.025000 | \n",
1093 | " 23.166667 | \n",
1094 | " 24.000 | \n",
1095 | " 24.708333 | \n",
1096 | " 25.558333 | \n",
1097 | " 26.500000 | \n",
1098 | " 25.716667 | \n",
1099 | " 20.758333 | \n",
1100 | " 17.358333 | \n",
1101 | " 18.116667 | \n",
1102 | " 18.133333 | \n",
1103 | " 18.058333 | \n",
1104 | " 18.350000 | \n",
1105 | " 18.266667 | \n",
1106 | " 18.066667 | \n",
1107 | " 18.133333 | \n",
1108 | " 18.016667 | \n",
1109 | " 17.450000 | \n",
1110 | " 17.066667 | \n",
1111 | " 17.325000 | \n",
1112 | " 17.341667 | \n",
1113 | " 17.375000 | \n",
1114 | " ... | \n",
1115 | " 0.106538 | \n",
1116 | " -0.656670 | \n",
1117 | " 0.127297 | \n",
1118 | " 0.936974 | \n",
1119 | " -0.755199 | \n",
1120 | " 0.225715 | \n",
1121 | " -0.876023 | \n",
1122 | " -0.780112 | \n",
1123 | " -0.930897 | \n",
1124 | " 0.999544 | \n",
1125 | " -0.888784 | \n",
1126 | " -0.874682 | \n",
1127 | " -0.153198 | \n",
1128 | " 0.534190 | \n",
1129 | " -0.293604 | \n",
1130 | " 0.923470 | \n",
1131 | " -0.319239 | \n",
1132 | " -0.991811 | \n",
1133 | " -0.277354 | \n",
1134 | " -0.121781 | \n",
1135 | " -0.040652 | \n",
1136 | " -0.881798 | \n",
1137 | " -0.630439 | \n",
1138 | " 0.606618 | \n",
1139 | " -0.811544 | \n",
1140 | " -0.991091 | \n",
1141 | " -0.999107 | \n",
1142 | " 0.925944 | \n",
1143 | " 0.997952 | \n",
1144 | " 0.885999 | \n",
1145 | " 0.684460 | \n",
1146 | " -0.414167 | \n",
1147 | " 0.989378 | \n",
1148 | " -0.226863 | \n",
1149 | " 0.045114 | \n",
1150 | " -0.104814 | \n",
1151 | " 0.307146 | \n",
1152 | " -0.383696 | \n",
1153 | " -0.425937 | \n",
1154 | " 0.429735 | \n",
1155 | " 0.996446 | \n",
1156 | " -0.875655 | \n",
1157 | " 0.925057 | \n",
1158 | " -0.823499 | \n",
1159 | " -0.208259 | \n",
1160 | " 0.860431 | \n",
1161 | " 0.047207 | \n",
1162 | " -0.357716 | \n",
1163 | " 0.947255 | \n",
1164 | " -0.989517 | \n",
1165 | " 0.777529 | \n",
1166 | " -0.200849 | \n",
1167 | " 0.847853 | \n",
1168 | " 0.269371 | \n",
1169 | " -0.584174 | \n",
1170 | " -0.657106 | \n",
1171 | " 0.458151 | \n",
1172 | " -0.933249 | \n",
1173 | " 0.824032 | \n",
1174 | " 0.722134 | \n",
1175 | " 0.991245 | \n",
1176 | " -0.613111 | \n",
1177 | " -0.168375 | \n",
1178 | " 0.209887 | \n",
1179 | " 0.349680 | \n",
1180 | " 0.415632 | \n",
1181 | " 0.890022 | \n",
1182 | " 0.992949 | \n",
1183 | " 0.875658 | \n",
1184 | " 0.960588 | \n",
1185 | " 0.202772 | \n",
1186 | " -0.415800 | \n",
1187 | " 0.881197 | \n",
1188 | " 0.178436 | \n",
1189 | " -0.713055 | \n",
1190 | " -0.977995 | \n",
1191 | " -0.868449 | \n",
1192 | " 0.082654 | \n",
1193 | " -0.936302 | \n",
1194 | " 0.559380 | \n",
1195 | " -0.926564 | \n",
1196 | " 0.485755 | \n",
1197 | " 1.000000 | \n",
1198 | " -0.202556 | \n",
1199 | " -0.498867 | \n",
1200 | " -0.267156 | \n",
1201 | " -0.901780 | \n",
1202 | " 0.605642 | \n",
1203 | " 0.994990 | \n",
1204 | " -0.552490 | \n",
1205 | " 0.580466 | \n",
1206 | " 0.945695 | \n",
1207 | " 0.669056 | \n",
1208 | " -0.074219 | \n",
1209 | " 0.816325 | \n",
1210 | " 0.049529 | \n",
1211 | " 0.357827 | \n",
1212 | " 0.236227 | \n",
1213 | " 0.301397 | \n",
1214 | " -0.949435 | \n",
1215 | "
\n",
1216 | " \n",
1217 | " | 4 | \n",
1218 | " ID_train_1000 | \n",
1219 | " A | \n",
1220 | " 177.418750 | \n",
1221 | " NaN | \n",
1222 | " NaN | \n",
1223 | " NaN | \n",
1224 | " NaN | \n",
1225 | " NaN | \n",
1226 | " 21.875000 | \n",
1227 | " 21.575000 | \n",
1228 | " 21.525000 | \n",
1229 | " 21.433333 | \n",
1230 | " 20.508333 | \n",
1231 | " 19.916667 | \n",
1232 | " 18.991667 | \n",
1233 | " 18.591667 | \n",
1234 | " 18.050000 | \n",
1235 | " 17.916667 | \n",
1236 | " 18.166667 | \n",
1237 | " 17.525000 | \n",
1238 | " 19.191667 | \n",
1239 | " 22.458333 | \n",
1240 | " 25.016667 | \n",
1241 | " 25.858333 | \n",
1242 | " 26.850000 | \n",
1243 | " 27.625000 | \n",
1244 | " 28.358333 | \n",
1245 | " 28.750000 | \n",
1246 | " 27.850000 | \n",
1247 | " 23.766667 | \n",
1248 | " 20.891667 | \n",
1249 | " 21.008333 | \n",
1250 | " 20.608333 | \n",
1251 | " 20.191667 | \n",
1252 | " 20.108333 | \n",
1253 | " 19.633333 | \n",
1254 | " 24.247348 | \n",
1255 | " 23.537500 | \n",
1256 | " 23.416667 | \n",
1257 | " 22.900000 | \n",
1258 | " 22.100000 | \n",
1259 | " 21.554167 | \n",
1260 | " 21.116667 | \n",
1261 | " 20.970833 | \n",
1262 | " 20.691667 | \n",
1263 | " 20.587500 | \n",
1264 | " 20.820833 | \n",
1265 | " 21.108333 | \n",
1266 | " 20.829167 | \n",
1267 | " 20.879167 | \n",
1268 | " 20.625000 | \n",
1269 | " 18.608333 | \n",
1270 | " 18.516667 | \n",
1271 | " 20.020833 | \n",
1272 | " 20.225000 | \n",
1273 | " 20.612500 | \n",
1274 | " 21.675000 | \n",
1275 | " 22.925000 | \n",
1276 | " 23.857955 | \n",
1277 | " 25.779167 | \n",
1278 | " 21.263636 | \n",
1279 | " 20.616667 | \n",
1280 | " 24.316667 | \n",
1281 | " 23.008333 | \n",
1282 | " 22.145833 | \n",
1283 | " 20.095833 | \n",
1284 | " 19.970833 | \n",
1285 | " 20.083333 | \n",
1286 | " 20.062500 | \n",
1287 | " 19.933333 | \n",
1288 | " 19.691667 | \n",
1289 | " 19.816667 | \n",
1290 | " 19.433333 | \n",
1291 | " 19.441667 | \n",
1292 | " 19.154167 | \n",
1293 | " 19.016667 | \n",
1294 | " 19.016667 | \n",
1295 | " 19.525000 | \n",
1296 | " 21.487500 | \n",
1297 | " 22.937500 | \n",
1298 | " 23.400 | \n",
1299 | " 24.316667 | \n",
1300 | " 25.304167 | \n",
1301 | " 26.208333 | \n",
1302 | " 26.145833 | \n",
1303 | " 23.670833 | \n",
1304 | " 21.725000 | \n",
1305 | " 21.837500 | \n",
1306 | " 21.279167 | \n",
1307 | " 20.570833 | \n",
1308 | " 20.387500 | \n",
1309 | " 19.991667 | \n",
1310 | " 19.833333 | \n",
1311 | " 19.245833 | \n",
1312 | " 18.691667 | \n",
1313 | " 18.383333 | \n",
1314 | " 18.325000 | \n",
1315 | " 18.433333 | \n",
1316 | " 18.437500 | \n",
1317 | " 18.541667 | \n",
1318 | " ... | \n",
1319 | " -0.996037 | \n",
1320 | " 0.763370 | \n",
1321 | " 0.632792 | \n",
1322 | " -0.142443 | \n",
1323 | " 0.883681 | \n",
1324 | " 0.967957 | \n",
1325 | " 0.978257 | \n",
1326 | " 0.176707 | \n",
1327 | " 0.989523 | \n",
1328 | " 0.881470 | \n",
1329 | " -0.928733 | \n",
1330 | " 0.166814 | \n",
1331 | " -0.959935 | \n",
1332 | " -0.936613 | \n",
1333 | " 0.054542 | \n",
1334 | " -0.414697 | \n",
1335 | " -0.866427 | \n",
1336 | " 0.173500 | \n",
1337 | " 0.620190 | \n",
1338 | " -0.906068 | \n",
1339 | " -0.035412 | \n",
1340 | " 0.999959 | \n",
1341 | " -0.830783 | \n",
1342 | " -0.269391 | \n",
1343 | " 0.114415 | \n",
1344 | " 0.054568 | \n",
1345 | " -0.071428 | \n",
1346 | " -0.999393 | \n",
1347 | " -0.916862 | \n",
1348 | " 0.966995 | \n",
1349 | " 0.827238 | \n",
1350 | " -0.550644 | \n",
1351 | " -0.426123 | \n",
1352 | " -0.532556 | \n",
1353 | " -0.587063 | \n",
1354 | " -0.920966 | \n",
1355 | " -0.655409 | \n",
1356 | " -0.218329 | \n",
1357 | " -0.129844 | \n",
1358 | " 0.959347 | \n",
1359 | " -0.707344 | \n",
1360 | " -0.337041 | \n",
1361 | " 0.961337 | \n",
1362 | " -0.935177 | \n",
1363 | " 0.440540 | \n",
1364 | " -0.942829 | \n",
1365 | " -0.969093 | \n",
1366 | " -0.985078 | \n",
1367 | " -0.946619 | \n",
1368 | " -0.662913 | \n",
1369 | " -0.327247 | \n",
1370 | " -0.979234 | \n",
1371 | " 0.738470 | \n",
1372 | " 0.558170 | \n",
1373 | " 0.496131 | \n",
1374 | " -0.045062 | \n",
1375 | " 0.861434 | \n",
1376 | " 0.406506 | \n",
1377 | " 0.440201 | \n",
1378 | " -0.677080 | \n",
1379 | " -0.840071 | \n",
1380 | " 0.669422 | \n",
1381 | " 0.750988 | \n",
1382 | " 0.277206 | \n",
1383 | " 0.372669 | \n",
1384 | " -0.823120 | \n",
1385 | " 0.290495 | \n",
1386 | " -0.201678 | \n",
1387 | " -0.264323 | \n",
1388 | " 0.965121 | \n",
1389 | " 0.817075 | \n",
1390 | " -0.984789 | \n",
1391 | " 0.975569 | \n",
1392 | " 0.637579 | \n",
1393 | " -0.985222 | \n",
1394 | " -0.991592 | \n",
1395 | " 0.728332 | \n",
1396 | " -0.008841 | \n",
1397 | " -0.205911 | \n",
1398 | " 0.967486 | \n",
1399 | " 0.379974 | \n",
1400 | " -0.949185 | \n",
1401 | " 0.983956 | \n",
1402 | " -0.404907 | \n",
1403 | " -0.988736 | \n",
1404 | " -0.293643 | \n",
1405 | " 0.981650 | \n",
1406 | " -0.891477 | \n",
1407 | " 0.962301 | \n",
1408 | " -0.941072 | \n",
1409 | " 0.461160 | \n",
1410 | " 0.970801 | \n",
1411 | " -0.165447 | \n",
1412 | " -0.791261 | \n",
1413 | " -0.901593 | \n",
1414 | " -0.491141 | \n",
1415 | " 0.996079 | \n",
1416 | " -0.391008 | \n",
1417 | " -0.302821 | \n",
1418 | " -0.618876 | \n",
1419 | "
\n",
1420 | " \n",
1421 | "
\n",
1422 | "
5 rows × 857 columns
\n",
1423 | "
"
1424 | ],
1425 | "text/plain": [
1426 | " ID location target temp precip rel_humidity wind_spd \\\n",
1427 | "0 ID_train_0 C 45.126304 NaN NaN NaN NaN \n",
1428 | "1 ID_train_1 D 79.131702 NaN NaN NaN NaN \n",
1429 | "2 ID_train_10 A 32.661304 NaN NaN NaN NaN \n",
1430 | "3 ID_train_100 A 53.850238 NaN NaN NaN NaN \n",
1431 | "4 ID_train_1000 A 177.418750 NaN NaN NaN NaN \n",
1432 | "\n",
1433 | " atmos_press temp_0 temp_1 temp_2 temp_3 temp_4 \\\n",
1434 | "0 NaN 22.533333 21.716667 20.833333 20.983333 20.875000 \n",
1435 | "1 NaN 22.533333 21.716667 20.833333 20.983333 20.875000 \n",
1436 | "2 NaN 28.975000 27.950000 29.600000 26.425000 22.091667 \n",
1437 | "3 NaN 22.966667 24.266667 25.275000 25.625000 25.866667 \n",
1438 | "4 NaN 21.875000 21.575000 21.525000 21.433333 20.508333 \n",
1439 | "\n",
1440 | " temp_5 temp_6 temp_7 temp_8 temp_9 temp_10 \\\n",
1441 | "0 20.141667 19.375000 18.791667 18.775000 18.641667 18.558333 \n",
1442 | "1 20.141667 19.375000 18.791667 18.775000 18.641667 18.558333 \n",
1443 | "2 21.775000 22.333333 21.950000 20.441667 20.441667 20.950000 \n",
1444 | "3 25.091667 24.025000 22.425000 20.233333 17.416667 17.391667 \n",
1445 | "4 19.916667 18.991667 18.591667 18.050000 17.916667 18.166667 \n",
1446 | "\n",
1447 | " temp_11 temp_12 temp_13 temp_14 temp_15 temp_16 \\\n",
1448 | "0 18.533333 19.608333 21.916667 24.716667 26.658333 29.175000 \n",
1449 | "1 18.533333 19.608333 21.916667 24.716667 26.658333 29.175000 \n",
1450 | "2 19.800000 19.591667 19.575000 19.516667 19.550000 19.783333 \n",
1451 | "3 17.083333 17.516667 17.825000 18.258333 17.950000 17.575000 \n",
1452 | "4 17.525000 19.191667 22.458333 25.016667 25.858333 26.850000 \n",
1453 | "\n",
1454 | " temp_17 temp_18 temp_19 temp_20 temp_21 temp_22 \\\n",
1455 | "0 30.700000 31.433333 32.333333 33.008333 33.391667 33.616667 \n",
1456 | "1 30.700000 31.433333 32.333333 33.008333 33.391667 33.616667 \n",
1457 | "2 19.908333 19.516667 19.666667 20.683333 22.491667 23.708333 \n",
1458 | "3 17.425000 17.475000 16.941667 18.233333 21.166667 23.708333 \n",
1459 | "4 27.625000 28.358333 28.750000 27.850000 23.766667 20.891667 \n",
1460 | "\n",
1461 | " temp_23 temp_24 temp_25 temp_26 temp_27 temp_28 \\\n",
1462 | "0 31.091667 27.550000 26.658333 25.675000 26.425000 23.783333 \n",
1463 | "1 31.091667 27.550000 26.658333 25.675000 26.425000 23.783333 \n",
1464 | "2 24.900000 26.058333 27.325000 27.866667 28.291667 22.136364 \n",
1465 | "3 24.941667 25.825000 26.691667 27.275000 27.491667 21.136364 \n",
1466 | "4 21.008333 20.608333 20.191667 20.108333 19.633333 24.247348 \n",
1467 | "\n",
1468 | " temp_29 temp_30 temp_31 temp_32 temp_33 temp_34 \\\n",
1469 | "0 22.416667 22.041667 21.008333 21.475000 21.825000 21.158333 \n",
1470 | "1 22.416667 22.041667 21.008333 21.475000 21.825000 21.158333 \n",
1471 | "2 19.141667 19.991667 20.216667 20.433333 20.466667 20.800000 \n",
1472 | "3 19.650000 19.391667 19.116667 19.075000 19.541667 19.450000 \n",
1473 | "4 23.537500 23.416667 22.900000 22.100000 21.554167 21.116667 \n",
1474 | "\n",
1475 | " temp_35 temp_36 temp_37 temp_38 temp_39 temp_40 \\\n",
1476 | "0 22.308333 23.116667 22.333333 23.850000 23.825000 24.983333 \n",
1477 | "1 22.308333 23.116667 22.333333 23.850000 23.825000 24.983333 \n",
1478 | "2 20.958333 20.641667 20.033333 19.825000 19.616667 19.225000 \n",
1479 | "3 19.175000 18.891667 19.058333 19.325000 19.400000 19.458333 \n",
1480 | "4 20.970833 20.691667 20.587500 20.820833 21.108333 20.829167 \n",
1481 | "\n",
1482 | " temp_41 temp_42 temp_43 temp_44 temp_45 temp_46 \\\n",
1483 | "0 28.050000 30.841667 31.991667 32.041667 32.000000 31.666667 \n",
1484 | "1 28.050000 30.841667 31.991667 32.041667 32.000000 31.666667 \n",
1485 | "2 18.716667 18.458333 19.000000 20.758333 27.358333 24.491667 \n",
1486 | "3 19.466667 19.441667 19.408333 19.850000 22.716667 23.091667 \n",
1487 | "4 20.879167 20.625000 18.608333 18.516667 20.020833 20.225000 \n",
1488 | "\n",
1489 | " temp_47 temp_48 temp_49 temp_50 temp_51 temp_52 \\\n",
1490 | "0 29.158333 25.866667 24.300000 23.683333 23.758333 22.991667 \n",
1491 | "1 29.158333 25.866667 24.300000 23.683333 23.758333 22.991667 \n",
1492 | "2 26.350000 27.950000 29.166667 30.066667 30.550000 30.400000 \n",
1493 | "3 23.841667 25.166667 26.875000 27.125000 27.183333 26.500000 \n",
1494 | "4 20.612500 21.675000 22.925000 23.857955 25.779167 21.263636 \n",
1495 | "\n",
1496 | " temp_53 temp_54 temp_55 temp_56 temp_57 temp_58 \\\n",
1497 | "0 21.891667 21.158333 20.416667 19.533333 19.125000 18.766667 \n",
1498 | "1 21.891667 21.158333 20.416667 19.533333 19.125000 18.766667 \n",
1499 | "2 30.008333 28.908333 26.700000 25.716667 24.858333 24.291667 \n",
1500 | "3 26.075000 23.391667 21.558333 20.525000 18.016667 18.333333 \n",
1501 | "4 20.616667 24.316667 23.008333 22.145833 20.095833 19.970833 \n",
1502 | "\n",
1503 | " temp_59 temp_60 temp_61 temp_62 temp_63 temp_64 \\\n",
1504 | "0 17.983333 18.458333 21.933333 23.791667 25.558333 27.758333 \n",
1505 | "1 17.983333 18.458333 21.933333 23.791667 25.558333 27.758333 \n",
1506 | "2 23.308333 23.008333 22.041667 21.375000 21.133333 21.158333 \n",
1507 | "3 18.533333 18.641667 18.808333 18.741667 18.733333 18.575000 \n",
1508 | "4 20.083333 20.062500 19.933333 19.691667 19.816667 19.433333 \n",
1509 | "\n",
1510 | " temp_65 temp_66 temp_67 temp_68 temp_69 temp_70 \\\n",
1511 | "0 29.658333 31.391667 31.883333 32.358333 32.708333 31.983333 \n",
1512 | "1 29.658333 31.391667 31.883333 32.358333 32.708333 31.983333 \n",
1513 | "2 20.658333 20.766667 21.633333 22.708333 24.808333 27.108333 \n",
1514 | "3 18.491667 18.225000 18.250000 18.458333 19.266667 21.025000 \n",
1515 | "4 19.441667 19.154167 19.016667 19.016667 19.525000 21.487500 \n",
1516 | "\n",
1517 | " temp_71 temp_72 temp_73 temp_74 temp_75 temp_76 temp_77 \\\n",
1518 | "0 30.850000 28.800 26.491667 24.866667 24.366667 23.025000 22.325000 \n",
1519 | "1 30.850000 28.800 26.491667 24.866667 24.366667 23.025000 22.325000 \n",
1520 | "2 28.775000 29.475 29.766667 29.875000 29.150000 27.716667 27.491667 \n",
1521 | "3 23.166667 24.000 24.708333 25.558333 26.500000 25.716667 20.758333 \n",
1522 | "4 22.937500 23.400 24.316667 25.304167 26.208333 26.145833 23.670833 \n",
1523 | "\n",
1524 | " temp_78 temp_79 temp_80 temp_81 temp_82 temp_83 \\\n",
1525 | "0 21.650000 20.750000 20.475000 19.641667 19.516667 19.575000 \n",
1526 | "1 21.650000 20.750000 20.475000 19.641667 19.516667 19.575000 \n",
1527 | "2 26.483333 25.475000 25.008333 24.600000 24.033333 23.358333 \n",
1528 | "3 17.358333 18.116667 18.133333 18.058333 18.350000 18.266667 \n",
1529 | "4 21.725000 21.837500 21.279167 20.570833 20.387500 19.991667 \n",
1530 | "\n",
1531 | " temp_84 temp_85 temp_86 temp_87 temp_88 temp_89 \\\n",
1532 | "0 20.000000 23.358333 25.608333 26.883333 26.358333 27.325000 \n",
1533 | "1 20.000000 23.358333 25.608333 26.883333 26.358333 27.325000 \n",
1534 | "2 22.366667 22.608333 22.741667 21.908333 21.550000 21.758333 \n",
1535 | "3 18.066667 18.133333 18.016667 17.450000 17.066667 17.325000 \n",
1536 | "4 19.833333 19.245833 18.691667 18.383333 18.325000 18.433333 \n",
1537 | "\n",
1538 | " temp_90 temp_91 ... wind_dir_sin_21 wind_dir_sin_22 \\\n",
1539 | "0 29.008333 28.433333 ... -0.141620 0.677692 \n",
1540 | "1 29.008333 28.433333 ... -0.141620 0.677692 \n",
1541 | "2 20.316667 20.650000 ... 0.992270 -0.744836 \n",
1542 | "3 17.341667 17.375000 ... 0.106538 -0.656670 \n",
1543 | "4 18.437500 18.541667 ... -0.996037 0.763370 \n",
1544 | "\n",
1545 | " wind_dir_sin_23 wind_dir_sin_24 wind_dir_sin_25 wind_dir_sin_26 \\\n",
1546 | "0 -0.691042 -0.570434 -0.621553 0.054916 \n",
1547 | "1 -0.691042 -0.570434 -0.621553 0.054916 \n",
1548 | "2 0.312328 -0.613390 -0.691339 -0.996273 \n",
1549 | "3 0.127297 0.936974 -0.755199 0.225715 \n",
1550 | "4 0.632792 -0.142443 0.883681 0.967957 \n",
1551 | "\n",
1552 | " wind_dir_sin_27 wind_dir_sin_28 wind_dir_sin_29 wind_dir_sin_30 \\\n",
1553 | "0 -0.068411 -0.427471 -0.748119 0.795807 \n",
1554 | "1 -0.068411 -0.427471 -0.748119 0.795807 \n",
1555 | "2 0.877801 -0.619629 -0.852653 0.952294 \n",
1556 | "3 -0.876023 -0.780112 -0.930897 0.999544 \n",
1557 | "4 0.978257 0.176707 0.989523 0.881470 \n",
1558 | "\n",
1559 | " wind_dir_sin_31 wind_dir_sin_32 wind_dir_sin_33 wind_dir_sin_34 \\\n",
1560 | "0 0.997073 -0.105681 -0.170296 0.850561 \n",
1561 | "1 0.997073 -0.105681 -0.170296 0.850561 \n",
1562 | "2 0.272820 0.887248 0.161753 0.710564 \n",
1563 | "3 -0.888784 -0.874682 -0.153198 0.534190 \n",
1564 | "4 -0.928733 0.166814 -0.959935 -0.936613 \n",
1565 | "\n",
1566 | " wind_dir_sin_35 wind_dir_sin_36 wind_dir_sin_37 wind_dir_sin_38 \\\n",
1567 | "0 0.971100 -0.002620 -0.897881 0.855572 \n",
1568 | "1 0.971100 -0.002620 -0.897881 0.855572 \n",
1569 | "2 0.493320 0.831038 0.922995 -0.206542 \n",
1570 | "3 -0.293604 0.923470 -0.319239 -0.991811 \n",
1571 | "4 0.054542 -0.414697 -0.866427 0.173500 \n",
1572 | "\n",
1573 | " wind_dir_sin_39 wind_dir_sin_40 wind_dir_sin_41 wind_dir_sin_42 \\\n",
1574 | "0 0.562276 0.606049 0.902810 -0.853162 \n",
1575 | "1 0.562276 0.606049 0.902810 -0.853162 \n",
1576 | "2 -0.905140 0.261427 -0.834561 0.999585 \n",
1577 | "3 -0.277354 -0.121781 -0.040652 -0.881798 \n",
1578 | "4 0.620190 -0.906068 -0.035412 0.999959 \n",
1579 | "\n",
1580 | " wind_dir_sin_43 wind_dir_sin_44 wind_dir_sin_45 wind_dir_sin_46 \\\n",
1581 | "0 -0.186615 -0.032798 0.937824 -0.317277 \n",
1582 | "1 -0.186615 -0.032798 0.937824 -0.317277 \n",
1583 | "2 0.969601 -0.304580 -0.134351 0.926241 \n",
1584 | "3 -0.630439 0.606618 -0.811544 -0.991091 \n",
1585 | "4 -0.830783 -0.269391 0.114415 0.054568 \n",
1586 | "\n",
1587 | " wind_dir_sin_47 wind_dir_sin_48 wind_dir_sin_49 wind_dir_sin_50 \\\n",
1588 | "0 -0.451505 -0.797416 -0.159037 -0.842704 \n",
1589 | "1 -0.451505 -0.797416 -0.159037 -0.842704 \n",
1590 | "2 -0.050912 -0.989347 0.825994 0.949567 \n",
1591 | "3 -0.999107 0.925944 0.997952 0.885999 \n",
1592 | "4 -0.071428 -0.999393 -0.916862 0.966995 \n",
1593 | "\n",
1594 | " wind_dir_sin_51 wind_dir_sin_52 wind_dir_sin_53 wind_dir_sin_54 \\\n",
1595 | "0 0.590000 0.249485 0.643785 0.369674 \n",
1596 | "1 0.590000 0.249485 0.643785 0.369674 \n",
1597 | "2 0.105761 -0.235078 0.986250 0.526325 \n",
1598 | "3 0.684460 -0.414167 0.989378 -0.226863 \n",
1599 | "4 0.827238 -0.550644 -0.426123 -0.532556 \n",
1600 | "\n",
1601 | " wind_dir_sin_55 wind_dir_sin_56 wind_dir_sin_57 wind_dir_sin_58 \\\n",
1602 | "0 -0.910274 -0.390897 0.531411 -0.117671 \n",
1603 | "1 -0.910274 -0.390897 0.531411 -0.117671 \n",
1604 | "2 -0.741661 0.819501 0.090216 -0.537516 \n",
1605 | "3 0.045114 -0.104814 0.307146 -0.383696 \n",
1606 | "4 -0.587063 -0.920966 -0.655409 -0.218329 \n",
1607 | "\n",
1608 | " wind_dir_sin_59 wind_dir_sin_60 wind_dir_sin_61 wind_dir_sin_62 \\\n",
1609 | "0 0.682072 0.997411 -0.445040 0.641554 \n",
1610 | "1 0.682072 0.997411 -0.445040 0.641554 \n",
1611 | "2 0.846942 -0.385990 -0.578650 0.616461 \n",
1612 | "3 -0.425937 0.429735 0.996446 -0.875655 \n",
1613 | "4 -0.129844 0.959347 -0.707344 -0.337041 \n",
1614 | "\n",
1615 | " wind_dir_sin_63 wind_dir_sin_64 wind_dir_sin_65 wind_dir_sin_66 \\\n",
1616 | "0 0.917998 0.647900 0.843400 0.831011 \n",
1617 | "1 0.917998 0.647900 0.843400 0.831011 \n",
1618 | "2 0.490440 -0.002268 -0.768049 -0.745449 \n",
1619 | "3 0.925057 -0.823499 -0.208259 0.860431 \n",
1620 | "4 0.961337 -0.935177 0.440540 -0.942829 \n",
1621 | "\n",
1622 | " wind_dir_sin_67 wind_dir_sin_68 wind_dir_sin_69 wind_dir_sin_70 \\\n",
1623 | "0 -0.780758 -0.402907 -0.970055 -0.775132 \n",
1624 | "1 -0.780758 -0.402907 -0.970055 -0.775132 \n",
1625 | "2 0.941508 0.847960 -0.947888 -0.421278 \n",
1626 | "3 0.047207 -0.357716 0.947255 -0.989517 \n",
1627 | "4 -0.969093 -0.985078 -0.946619 -0.662913 \n",
1628 | "\n",
1629 | " wind_dir_sin_71 wind_dir_sin_72 wind_dir_sin_73 wind_dir_sin_74 \\\n",
1630 | "0 -0.737531 -0.877086 0.706791 -0.991253 \n",
1631 | "1 -0.737531 -0.877086 0.706791 -0.991253 \n",
1632 | "2 0.855621 -0.841248 -0.738750 -0.616005 \n",
1633 | "3 0.777529 -0.200849 0.847853 0.269371 \n",
1634 | "4 -0.327247 -0.979234 0.738470 0.558170 \n",
1635 | "\n",
1636 | " wind_dir_sin_75 wind_dir_sin_76 wind_dir_sin_77 wind_dir_sin_78 \\\n",
1637 | "0 0.870372 0.235040 -0.399780 -0.890181 \n",
1638 | "1 0.870372 0.235040 -0.399780 -0.890181 \n",
1639 | "2 0.000840 -0.962296 -0.913448 0.914193 \n",
1640 | "3 -0.584174 -0.657106 0.458151 -0.933249 \n",
1641 | "4 0.496131 -0.045062 0.861434 0.406506 \n",
1642 | "\n",
1643 | " wind_dir_sin_79 wind_dir_sin_80 wind_dir_sin_81 wind_dir_sin_82 \\\n",
1644 | "0 -0.429068 0.536505 0.849812 -0.921784 \n",
1645 | "1 -0.429068 0.536505 0.849812 -0.921784 \n",
1646 | "2 -0.680750 -0.764314 -0.668780 0.553494 \n",
1647 | "3 0.824032 0.722134 0.991245 -0.613111 \n",
1648 | "4 0.440201 -0.677080 -0.840071 0.669422 \n",
1649 | "\n",
1650 | " wind_dir_sin_83 wind_dir_sin_84 wind_dir_sin_85 wind_dir_sin_86 \\\n",
1651 | "0 0.890076 0.325722 -0.968946 0.422955 \n",
1652 | "1 0.890076 0.325722 -0.968946 0.422955 \n",
1653 | "2 -0.927593 -0.647122 0.326478 0.968075 \n",
1654 | "3 -0.168375 0.209887 0.349680 0.415632 \n",
1655 | "4 0.750988 0.277206 0.372669 -0.823120 \n",
1656 | "\n",
1657 | " wind_dir_sin_87 wind_dir_sin_88 wind_dir_sin_89 wind_dir_sin_90 \\\n",
1658 | "0 -0.908680 -0.238531 0.994556 -0.784712 \n",
1659 | "1 -0.908680 -0.238531 0.994556 -0.784712 \n",
1660 | "2 0.946287 -0.426579 0.674657 0.955783 \n",
1661 | "3 0.890022 0.992949 0.875658 0.960588 \n",
1662 | "4 0.290495 -0.201678 -0.264323 0.965121 \n",
1663 | "\n",
1664 | " wind_dir_sin_91 wind_dir_sin_92 wind_dir_sin_93 wind_dir_sin_94 \\\n",
1665 | "0 -0.327369 0.793215 0.921165 0.900036 \n",
1666 | "1 -0.327369 0.793215 0.921165 0.900036 \n",
1667 | "2 -0.978836 0.512857 0.356463 -0.867495 \n",
1668 | "3 0.202772 -0.415800 0.881197 0.178436 \n",
1669 | "4 0.817075 -0.984789 0.975569 0.637579 \n",
1670 | "\n",
1671 | " wind_dir_sin_95 wind_dir_sin_96 wind_dir_sin_97 wind_dir_sin_98 \\\n",
1672 | "0 -0.244630 -0.858170 -0.980876 0.890335 \n",
1673 | "1 -0.244630 -0.858170 -0.980876 0.890335 \n",
1674 | "2 0.154614 -0.958078 0.485347 0.984303 \n",
1675 | "3 -0.713055 -0.977995 -0.868449 0.082654 \n",
1676 | "4 -0.985222 -0.991592 0.728332 -0.008841 \n",
1677 | "\n",
1678 | " wind_dir_sin_99 wind_dir_sin_100 wind_dir_sin_101 wind_dir_sin_102 \\\n",
1679 | "0 -0.992381 -0.861434 -0.629305 -0.290513 \n",
1680 | "1 -0.992381 -0.861434 -0.629305 -0.290513 \n",
1681 | "2 -0.659389 -0.967570 0.280092 -0.011816 \n",
1682 | "3 -0.936302 0.559380 -0.926564 0.485755 \n",
1683 | "4 -0.205911 0.967486 0.379974 -0.949185 \n",
1684 | "\n",
1685 | " wind_dir_sin_103 wind_dir_sin_104 wind_dir_sin_105 wind_dir_sin_106 \\\n",
1686 | "0 0.999024 -0.985627 0.403930 -0.778758 \n",
1687 | "1 0.999024 -0.985627 0.923642 -0.553773 \n",
1688 | "2 -0.862490 -0.785080 0.212533 0.099250 \n",
1689 | "3 1.000000 -0.202556 -0.498867 -0.267156 \n",
1690 | "4 0.983956 -0.404907 -0.988736 -0.293643 \n",
1691 | "\n",
1692 | " wind_dir_sin_107 wind_dir_sin_108 wind_dir_sin_109 wind_dir_sin_110 \\\n",
1693 | "0 -0.623189 0.921356 -0.949976 -0.938622 \n",
1694 | "1 0.455011 -0.989001 0.211568 -0.230857 \n",
1695 | "2 0.970914 0.841250 0.772792 0.999893 \n",
1696 | "3 -0.901780 0.605642 0.994990 -0.552490 \n",
1697 | "4 0.981650 -0.891477 0.962301 -0.941072 \n",
1698 | "\n",
1699 | " wind_dir_sin_111 wind_dir_sin_112 wind_dir_sin_113 wind_dir_sin_114 \\\n",
1700 | "0 0.539989 0.833571 -0.566473 0.249451 \n",
1701 | "1 0.971538 0.669779 -0.839579 -0.212372 \n",
1702 | "2 -0.620373 -0.999964 -0.328710 0.549286 \n",
1703 | "3 0.580466 0.945695 0.669056 -0.074219 \n",
1704 | "4 0.461160 0.970801 -0.165447 -0.791261 \n",
1705 | "\n",
1706 | " wind_dir_sin_115 wind_dir_sin_116 wind_dir_sin_117 wind_dir_sin_118 \\\n",
1707 | "0 0.703657 -0.775581 0.150566 0.117369 \n",
1708 | "1 -0.029492 0.903239 0.881671 0.304360 \n",
1709 | "2 -0.995617 -0.654751 0.089768 0.955292 \n",
1710 | "3 0.816325 0.049529 0.357827 0.236227 \n",
1711 | "4 -0.901593 -0.491141 0.996079 -0.391008 \n",
1712 | "\n",
1713 | " wind_dir_sin_119 wind_dir_sin_120 \n",
1714 | "0 0.635717 -0.947955 \n",
1715 | "1 -0.955722 0.996240 \n",
1716 | "2 0.772715 -0.939837 \n",
1717 | "3 0.301397 -0.949435 \n",
1718 | "4 -0.302821 -0.618876 \n",
1719 | "\n",
1720 | "[5 rows x 857 columns]"
1721 | ]
1722 | },
1723 | "execution_count": 8,
1724 | "metadata": {},
1725 | "output_type": "execute_result"
1726 | }
1727 | ],
1728 | "source": [
1729 | "%%time\n",
1730 | "### Preprocessing: one numeric column per weather variable and hour (takes about 30s)\n",
1731 | "weather_cols = ['temp', 'precip', 'rel_humidity', 'wind_dir','wind_spd', 'atmos_press']\n",
1732 | "data = pd.concat([train,test]).reset_index(drop=True)\n",
1733 | "### Start from the id/target columns only, so the raw comma-joined strings are not carried along as all-NaN columns\n",
1734 | "df = data[['ID', 'location', 'target']].copy()\n",
1735 | "for c in tqdm_notebook(weather_cols):\n",
1736 | "    tmp = data[c].str.split(',', expand=True)\n",
1737 | "    tmp.columns = [c + '_' + str(x) for x in tmp.columns]\n",
1738 | "    df = pd.concat([df, tmp], axis=1)\n",
1739 | "obj_cols = [c for c in df.select_dtypes('object').columns if c not in ['ID', 'location']]\n",
1740 | "tmp = Parallel(n_jobs=4)(delayed(partial(pd.to_numeric, errors='coerce'))(df[c]) for c in obj_cols)\n",
1741 | "df = df.drop(obj_cols, axis=1)\n",
1742 | "df = pd.concat([df, pd.DataFrame(tmp).T], axis=1)\n",
1743 | "\n",
1744 | "### Filling NaNs by interpolation. NOTE(review): interpolate() defaults to axis=0, i.e. it fills down each column (between neighbouring rows), not across hours - confirm intent\n",
1745 | "for w in tqdm_notebook(weather_cols):\n",
1746 | "    selected_cols = [c for c in df.columns if w in c]\n",
1747 | "    df[selected_cols] = df[selected_cols].interpolate(limit_direction='both')\n",
1748 | "    ### Wind direction is split into cos and sin components and the original columns are dropped. NOTE(review): np.cos/np.sin take radians - confirm wind_dir is not in degrees\n",
1749 | "    if w == 'wind_dir':\n",
1750 | "        df[['wind_dir_cos_' + c.split('_')[-1] for c in selected_cols]] = df[selected_cols].apply(np.cos)\n",
1751 | "        df[['wind_dir_sin_' + c.split('_')[-1] for c in selected_cols]] = df[selected_cols].apply(np.sin)\n",
1752 | "        df = df.drop(selected_cols, axis=1)\n",
1753 | "print(df.shape)\n",
1754 | "features = [c for c in df.columns if c not in ['ID', 'location', 'target']]\n",
1755 | "print(df[features].isnull().sum().sum())\n",
1756 | "df.head()"
1757 | ]
1758 | },
1759 | {
1760 | "cell_type": "code",
1761 | "execution_count": 9,
1762 | "metadata": {
1763 | "collapsed": true
1764 | },
1765 | "outputs": [],
1766 | "source": [
1767 | "### Target-encode `location`: each label is replaced by the mean target of its rows (rows with a NaN target are skipped by mean())\n",
1768 | "df['location'] = df['location'].map(df['target'].groupby(df['location']).mean())\n",
1769 | "features = [\"temp\",\"precip\",\"rel_humidity\",\"wind_spd\",\"atmos_press\", \"wind_dir_cos\", \"wind_dir_sin\"]"
1770 | ]
1771 | },
1772 | {
1773 | "cell_type": "code",
1774 | "execution_count": 10,
1775 | "metadata": {},
1776 | "outputs": [
1777 | {
1778 | "name": "stderr",
1779 | "output_type": "stream",
1780 | "text": [
1781 | "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:4: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n",
1782 | "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n",
1783 | " after removing the cwd from sys.path.\n"
1784 | ]
1785 | },
1786 | {
1787 | "data": {
1788 | "application/vnd.jupyter.widget-view+json": {
1789 | "model_id": "d3873ff95cce472896986a0bc35f665b",
1790 | "version_major": 2,
1791 | "version_minor": 0
1792 | },
1793 | "text/plain": [
1794 | "HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))"
1795 | ]
1796 | },
1797 | "metadata": {},
1798 | "output_type": "display_data"
1799 | },
1800 | {
1801 | "name": "stdout",
1802 | "output_type": "stream",
1803 | "text": [
1804 | "\n"
1805 | ]
1806 | }
1807 | ],
1808 | "source": [
1809 | "### Creating more features: for every series, the ratio of each hour to the value 24h and 6h earlier.\n",
1810 | "### Hours without such a predecessor are compared with hour 0; 1e-5 is added to the denominator (presumably to avoid an exact zero).\n",
1811 | "new_features = []\n",
1812 | "for f in tqdm_notebook(features):\n",
1813 | "    for lag in (24, 6):\n",
1814 | "        new_features.append(f'{f}_{lag}hrs_ratio')\n",
1815 | "    for hour in range(121):\n",
1816 | "        for lag in (24, 6):\n",
1817 | "            past = max(hour - lag, 0)\n",
1818 | "            df[f'{f}_{lag}hrs_ratio_{hour}'] = df[f'{f}_{hour}'] / (1e-5 + df[f'{f}_{past}'])"
1829 | ]
1830 | },
1831 | {
1832 | "cell_type": "code",
1833 | "execution_count": 12,
1834 | "metadata": {},
1835 | "outputs": [
1836 | {
1837 | "data": {
1838 | "text/plain": [
1839 | "['temp',\n",
1840 | " 'precip',\n",
1841 | " 'rel_humidity',\n",
1842 | " 'wind_spd',\n",
1843 | " 'atmos_press',\n",
1844 | " 'wind_dir_cos',\n",
1845 | " 'wind_dir_sin',\n",
1846 | " 'temp_24hrs_ratio',\n",
1847 | " 'temp_6hrs_ratio',\n",
1848 | " 'precip_24hrs_ratio',\n",
1849 | " 'precip_6hrs_ratio',\n",
1850 | " 'rel_humidity_24hrs_ratio',\n",
1851 | " 'rel_humidity_6hrs_ratio',\n",
1852 | " 'wind_spd_24hrs_ratio',\n",
1853 | " 'wind_spd_6hrs_ratio',\n",
1854 | " 'atmos_press_24hrs_ratio',\n",
1855 | " 'atmos_press_6hrs_ratio',\n",
1856 | " 'wind_dir_cos_24hrs_ratio',\n",
1857 | " 'wind_dir_cos_6hrs_ratio',\n",
1858 | " 'wind_dir_sin_24hrs_ratio',\n",
1859 | " 'wind_dir_sin_6hrs_ratio']"
1860 | ]
1861 | },
1862 | "execution_count": 12,
1863 | "metadata": {},
1864 | "output_type": "execute_result"
1865 | }
1866 | ],
1867 | "source": [
1868 | "features = features + new_features\n",
1869 | "features"
1870 | ]
1871 | },
1872 | {
1873 | "cell_type": "code",
1874 | "execution_count": 13,
1875 | "metadata": {
1876 | "collapsed": true
1877 | },
1878 | "outputs": [],
1879 | "source": [
1880 | "train=df[df.target.notnull()].reset_index(drop=True)\n",
1881 | "test=df[df.target.isna()].reset_index(drop=True)"
1882 | ]
1883 | },
1884 | {
1885 | "cell_type": "code",
1886 | "execution_count": 14,
1887 | "metadata": {
1888 | "collapsed": true
1889 | },
1890 | "outputs": [],
1891 | "source": [
1892 | "def get_sample(x):\n",
1893 | "    '''\n",
1894 | "    Turn one row of the feature frame into the 2-D input block used for the CNN.\n",
1895 | "    x: a row (Series) holding the `<feature>_<hour>` columns and `location`\n",
1896 | "    Returns a float array of shape (len(features) + 1, 121): one line per entry of the\n",
1897 | "    global `features` list, then a line repeating the row's `location` value.\n",
1898 | "    '''\n",
1899 | "    lines = []\n",
1900 | "    for name in features + [\"location\"]:\n",
1901 | "        if name != \"location\":\n",
1902 | "            lines.append(x[[f\"{name}_{h}\" for h in range(121)]].values.astype(float))\n",
1903 | "            continue\n",
1904 | "        lines.append(x[\"location\"] * np.ones(121))\n",
1905 | "    return np.vstack(lines).astype('float')"
1906 | ]
1907 | },
1908 | {
1909 | "cell_type": "code",
1910 | "execution_count": 15,
1911 | "metadata": {},
1912 | "outputs": [
1913 | {
1914 | "data": {
1915 | "text/plain": [
1916 | "(15539, 2551)"
1917 | ]
1918 | },
1919 | "execution_count": 15,
1920 | "metadata": {},
1921 | "output_type": "execute_result"
1922 | }
1923 | ],
1924 | "source": [
1925 | "train.shape"
1926 | ]
1927 | },
1928 | {
1929 | "cell_type": "code",
1930 | "execution_count": 16,
1931 | "metadata": {},
1932 | "outputs": [
1933 | {
1934 | "name": "stdout",
1935 | "output_type": "stream",
1936 | "text": [
1937 | "CPU times: user 1min 43s, sys: 3.45 s, total: 1min 47s\n",
1938 | "Wall time: 6min 39s\n"
1939 | ]
1940 | }
1941 | ],
1942 | "source": [
1943 | "%%time\n",
1944 | "train_samples = Parallel(n_jobs=4)(delayed(get_sample)(row[1]) for row in train.iterrows())\n",
1945 | "test_samples = Parallel(n_jobs=4)(delayed(get_sample)(row[1]) for row in test.iterrows())"
1946 | ]
1947 | },
1948 | {
1949 | "cell_type": "code",
1950 | "execution_count": 17,
1951 | "metadata": {},
1952 | "outputs": [
1953 | {
1954 | "data": {
1955 | "text/plain": [
1956 | "((15539, 121, 22), (5035, 121, 22))"
1957 | ]
1958 | },
1959 | "execution_count": 17,
1960 | "metadata": {},
1961 | "output_type": "execute_result"
1962 | }
1963 | ],
1964 | "source": [
1965 | "### Swap the last two axes: (rows, lines, 121) -> (rows, 121, lines), matching the (121, N_FEATS) input of the Conv1D stack\n",
1966 | "X_train = np.swapaxes(np.array(train_samples), 1, 2)\n",
1967 | "X_test = np.swapaxes(np.array(test_samples), 1, 2)\n",
1968 | "X_train.shape, X_test.shape"
1970 | ]
1971 | },
1972 | {
1973 | "cell_type": "code",
1974 | "execution_count": 18,
1975 | "metadata": {},
1976 | "outputs": [
1977 | {
1978 | "name": "stdout",
1979 | "output_type": "stream",
1980 | "text": [
1981 | "Train std = 1.020, Test std = 0.935, \n",
1982 | "Train mean = -0.001, Test mean = 0.002\n"
1983 | ]
1984 | }
1985 | ],
1986 | "source": [
1987 | "### Standardise each feature channel with a mean/std pooled over train+test rows and all 121 hours\n",
1988 | "X_all = np.concatenate((X_train, X_test), axis=0)\n",
1989 | "my_mean = X_all.mean(axis=(0, 1))\n",
1990 | "my_std = X_all.astype(float).std(axis=(0, 1))\n",
1991 | "\n",
1992 | "### In-place updates: X_train and X_test keep their identity and shape\n",
1993 | "for split in (X_train, X_test):\n",
1994 | "    split -= my_mean\n",
1995 | "    split /= my_std\n",
1997 | "\n",
1998 | "print(f\"Train std = {X_train.std(): .3f}, Test std = {X_test.std(): .3f}, \\nTrain mean = {X_train.mean(): .3f}, Test mean = {X_test.mean(): .3f}\")"
1999 | ]
2000 | },
2001 | {
2002 | "cell_type": "code",
2003 | "execution_count": 19,
2004 | "metadata": {},
2005 | "outputs": [
2006 | {
2007 | "data": {
2008 | "text/plain": [
2009 | "(15539,)"
2010 | ]
2011 | },
2012 | "execution_count": 19,
2013 | "metadata": {},
2014 | "output_type": "execute_result"
2015 | }
2016 | ],
2017 | "source": [
2018 | "y_train = train[\"target\"].values\n",
2019 | "y_train.shape"
2020 | ]
2021 | },
2022 | {
2023 | "cell_type": "code",
2024 | "execution_count": 20,
2025 | "metadata": {},
2026 | "outputs": [
2027 | {
2028 | "name": "stdout",
2029 | "output_type": "stream",
2030 | "text": [
2031 | "Model: \"sequential\"\n",
2032 | "_________________________________________________________________\n",
2033 | "Layer (type) Output Shape Param # \n",
2034 | "=================================================================\n",
2035 | "batch_normalization (BatchNo multiple 88 \n",
2036 | "_________________________________________________________________\n",
2037 | "conv1d (Conv1D) multiple 4288 \n",
2038 | "_________________________________________________________________\n",
2039 | "conv1d_1 (Conv1D) multiple 20544 \n",
2040 | "_________________________________________________________________\n",
2041 | "conv1d_2 (Conv1D) multiple 57472 \n",
2042 | "_________________________________________________________________\n",
2043 | "conv1d_3 (Conv1D) multiple 180352 \n",
2044 | "_________________________________________________________________\n",
2045 | "conv1d_4 (Conv1D) multiple 557312 \n",
2046 | "_________________________________________________________________\n",
2047 | "max_pooling1d (MaxPooling1D) multiple 0 \n",
2048 | "_________________________________________________________________\n",
2049 | "flatten (Flatten) multiple 0 \n",
2050 | "_________________________________________________________________\n",
2051 | "batch_normalization_1 (Batch multiple 20480 \n",
2052 | "_________________________________________________________________\n",
2053 | "dropout (Dropout) multiple 0 \n",
2054 | "_________________________________________________________________\n",
2055 | "output_cnn_simple (Dense) multiple 163872 \n",
2056 | "_________________________________________________________________\n",
2057 | "dense (Dense) multiple 33 \n",
2058 | "=================================================================\n",
2059 | "Total params: 1,004,441\n",
2060 | "Trainable params: 994,157\n",
2061 | "Non-trainable params: 10,284\n",
2062 | "_________________________________________________________________\n"
2063 | ]
2064 | }
2065 | ],
2066 | "source": [
2067 | "N_FEATS = X_train.shape[2]\n",
2068 | "\n",
2069 | "def root_mean_squared_error(y_true, y_pred):\n",
2070 | "    ### Square root of the squared error averaged over the last axis only. NOTE(review): with a single output unit that axis holds one value, so this is |y_pred - y_true| per sample - confirm intended\n",
2071 | "    squared_error = K.square(y_pred - y_true)\n",
2072 | "    return K.sqrt(K.mean(squared_error, axis=-1))\n",
2073 | "\n",
2074 | "def make_model():\n",
2075 | "    '''\n",
2076 | "    Build and compile the 1D-CNN regressor: input BatchNorm, five Conv1D layers (kernels 3/5/7/11/17),\n",
2077 | "    max-pooling, flatten, BatchNorm + dropout, then a 32-unit dense layer and a single linear output.\n",
2078 | "    Returns the model compiled with root_mean_squared_error as loss and Adam (learning rate 5e-4).\n",
2079 | "    '''\n",
2080 | "    model = Sequential()\n",
2081 | "    ### input_shape goes on the first layer; when it was passed to the second layer the model was left unbuilt (summary showed 'multiple' shapes)\n",
2082 | "    model.add(BatchNormalization(input_shape=(121, N_FEATS)))\n",
2083 | "    model.add(Conv1D(filters=64, kernel_size=(3,), strides=1, activation='relu'))\n",
2084 | "    model.add(Conv1D(filters=64, kernel_size=(5,), strides=1, activation='relu'))\n",
2085 | "    model.add(Conv1D(filters=128, kernel_size=(7,), strides=1, activation='relu'))\n",
2086 | "    model.add(Conv1D(filters=128, kernel_size=(11,), strides=1, activation='relu'))\n",
2087 | "    model.add(Conv1D(filters=256, kernel_size=(17,), strides=1, activation='relu'))\n",
2088 | "    model.add(MaxPooling1D(4))\n",
2089 | "    model.add(Flatten())\n",
2090 | "    model.add(BatchNormalization())\n",
2091 | "    model.add(Dropout(0.3))\n",
2092 | "    model.add(Dense(32, activation='relu', name='output_cnn_simple'))\n",
2093 | "    model.add(Dense(1))\n",
2094 | "    model.compile(loss=root_mean_squared_error, optimizer=Adam(learning_rate=5e-4))\n",
2095 | "    return model\n",
2096 | "\n",
2097 | "m = make_model()\n",
2098 | "m.build((None, 121, N_FEATS))\n",
2099 | "m.summary()"
2100 | ]
2101 | },
2102 | {
2103 | "cell_type": "code",
2104 | "execution_count": 21,
2105 | "metadata": {},
2106 | "outputs": [
2107 | {
2108 | "name": "stdout",
2109 | "output_type": "stream",
2110 | "text": [
2111 | "\n",
2112 | "_____________________Round n-1\n",
2113 | "\n",
2114 | "\n",
2115 | "Fold n-1\n",
2116 | "\n",
2117 | "Restoring model weights from the end of the best epoch.\n",
2118 | "Epoch 00060: early stopping\n",
2119 | "Erreur = 23.69\n",
2120 | "\n",
2121 | "\n",
2122 | "\n",
2123 | "Fold n-2\n",
2124 | "\n",
2125 | "Restoring model weights from the end of the best epoch.\n",
2126 | "Epoch 00050: early stopping\n",
2127 | "Erreur = 23.92\n",
2128 | "\n",
2129 | "\n",
2130 | "\n",
2131 | "Fold n-3\n",
2132 | "\n",
2133 | "Restoring model weights from the end of the best epoch.\n",
2134 | "Epoch 00069: early stopping\n",
2135 | "Erreur = 20.91\n",
2136 | "\n",
2137 | "\n",
2138 | "\n",
2139 | "Fold n-4\n",
2140 | "\n",
2141 | "Restoring model weights from the end of the best epoch.\n",
2142 | "Epoch 00039: early stopping\n",
2143 | "Erreur = 22.98\n",
2144 | "\n",
2145 | "\n",
2146 | "\n",
2147 | "Fold n-5\n",
2148 | "\n",
2149 | "Restoring model weights from the end of the best epoch.\n",
2150 | "Epoch 00055: early stopping\n",
2151 | "Erreur = 21.24\n",
2152 | "\n",
2153 | "\n",
2154 | "\n",
2155 | "Fold n-6\n",
2156 | "\n",
2157 | "Restoring model weights from the end of the best epoch.\n",
2158 | "Epoch 00046: early stopping\n",
2159 | "Erreur = 24.60\n",
2160 | "\n",
2161 | "\n",
2162 | "\n",
2163 | "Fold n-7\n",
2164 | "\n",
2165 | "Restoring model weights from the end of the best epoch.\n",
2166 | "Epoch 00054: early stopping\n",
2167 | "Erreur = 24.30\n",
2168 | "\n",
2169 | "\n",
2170 | "\n",
2171 | "Fold n-8\n",
2172 | "\n",
2173 | "Restoring model weights from the end of the best epoch.\n",
2174 | "Epoch 00045: early stopping\n",
2175 | "Erreur = 24.49\n",
2176 | "\n",
2177 | "\n",
2178 | "\n",
2179 | "Fold n-9\n",
2180 | "\n",
2181 | "Restoring model weights from the end of the best epoch.\n",
2182 | "Epoch 00046: early stopping\n",
2183 | "Erreur = 23.19\n",
2184 | "\n",
2185 | "\n",
2186 | "\n",
2187 | "Fold n-10\n",
2188 | "\n",
2189 | "Restoring model weights from the end of the best epoch.\n",
2190 | "Epoch 00050: early stopping\n",
2191 | "Erreur = 22.19\n",
2192 | "\n",
2193 | "\n",
2194 | "Total error = 23.151\n"
2195 | ]
2196 | }
2197 | ],
2198 | "source": [
2199 | "errcb2 = list()\n",
2200 | "y_pred_test = list()\n",
2201 | "\n",
2202 | "\n",
2203 | "for my_round in range(1):\n",
2204 | "    i = 0\n",
2205 | "    print(f\"\\n_____________________Round n-{my_round + 1}\\n\")\n",
2206 | "    \n",
2207 | "    fold = KFold(n_splits=10, shuffle=True, random_state=my_round)\n",
2208 | "    \n",
2209 | "    for train_index, test_index in fold.split(X_train, y_train):\n",
2210 | "        i += 1\n",
2211 | "        print(f\"\\nFold n-{i}\\n\")\n",
2212 | "\n",
2213 | "        X_entr, X_val = X_train[train_index], X_train[test_index]\n",
2214 | "        y_entr, y_val = y_train[train_index], y_train[test_index]\n",
2215 | "\n",
2216 | "        es = tf.keras.callbacks.EarlyStopping(patience=10, \n",
2217 | "                                              verbose=1, \n",
2218 | "                                              restore_best_weights=True)\n",
2219 | "\n",
2220 | "        m = make_model()\n",
2221 | "\n",
2222 | "        m.fit(X_entr,\n",
2223 | "              y_entr,\n",
2224 | "              epochs=150,\n",
2225 | "              verbose=0,\n",
2226 | "              batch_size=128,\n",
2227 | "              validation_data=(X_val, y_val),\n",
2228 | "              callbacks=[es],\n",
2229 | "              )\n",
2230 | "\n",
2231 | "        # val score\n",
2232 | "        preds = m.predict(X_val)[:, 0]\n",
2233 | "        rmse_val = mean_squared_error(y_val, preds) ** .5\n",
2234 | "        print(f\"Erreur = {rmse_val:.2f}\\n\\n\")\n",
2235 | "        errcb2.append(rmse_val)\n",
2236 | "\n",
2237 | "        # prediction\n",
2238 | "        p2 = m.predict(X_test)[:, 0]\n",
2239 | "        p2 = np.clip(p2, train.target.min(), train.target.max())\n",
2240 | "\n",
2241 | "        y_pred_test.append(p2)\n",
2242 | "\n",
2243 | "print(f\"Total error = {np.mean(errcb2): .3f}\")"
2244 | ]
2245 | },
2246 | {
2247 | "cell_type": "code",
2248 | "execution_count": 22,
2249 | "metadata": {
2250 | "collapsed": true
2251 | },
2252 | "outputs": [],
2253 | "source": [
2254 | "d = {'ID': test[\"ID\"], 'target': np.mean(y_pred_test, axis=0)}\n",
2255 | "sub = pd.DataFrame(data=d)\n",
2256 | "sub = sub[['ID', 'target']]\n",
2257 | "sub['target'] = sub['target'].clip(train.target.min(), train.target.max())"
2258 | ]
2259 | },
2260 | {
2261 | "cell_type": "code",
2262 | "execution_count": 25,
2263 | "metadata": {},
2264 | "outputs": [
2265 | {
2266 | "data": {
2267 | "text/html": [
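2268 | "<div>\n",
2269 | "<style scoped>\n",
2270 | "    .dataframe tbody tr th:only-of-type {\n",
2271 | "        vertical-align: middle;\n",
2272 | "    }\n",
2273 | "\n",
2274 | "    .dataframe tbody tr th {\n",
2275 | "        vertical-align: top;\n",
2276 | "    }\n",
2277 | "\n",
2278 | "    .dataframe thead th {\n",
2279 | "        text-align: right;\n",
2280 | "    }\n",
2281 | "</style>\n",
2282 | "<table border=\"1\" class=\"dataframe\">\n",
2283 | "  <thead>\n",
2284 | "    <tr style=\"text-align: right;\">\n",
2285 | "      <th></th>\n",
2286 | "      <th>ID</th>\n",
2287 | "      <th>target</th>\n",
2288 | "    </tr>\n",
2289 | "  </thead>\n",
2290 | "  <tbody>\n",
2291 | "    <tr>\n",
2292 | "      <th>0</th>\n",
2293 | "      <td>ID_test_0</td>\n",
2294 | "      <td>146.448013</td>\n",
2295 | "    </tr>\n",
2296 | "    <tr>\n",
2297 | "      <th>1</th>\n",
2298 | "      <td>ID_test_1</td>\n",
2299 | "      <td>78.831528</td>\n",
2300 | "    </tr>\n",
2301 | "    <tr>\n",
2302 | "      <th>2</th>\n",
2303 | "      <td>ID_test_10</td>\n",
2304 | "      <td>33.348366</td>\n",
2305 | "    </tr>\n",
2306 | "    <tr>\n",
2307 | "      <th>3</th>\n",
2308 | "      <td>ID_test_100</td>\n",
2309 | "      <td>55.782494</td>\n",
2310 | "    </tr>\n",
2311 | "    <tr>\n",
2312 | "      <th>4</th>\n",
2313 | "      <td>ID_test_1000</td>\n",
2314 | "      <td>88.687622</td>\n",
2315 | "    </tr>\n",
2316 | "    <tr>\n",
2317 | "      <th>5</th>\n",
2318 | "      <td>ID_test_1001</td>\n",
2319 | "      <td>30.877085</td>\n",
2320 | "    </tr>\n",
2321 | "    <tr>\n",
2322 | "      <th>6</th>\n",
2323 | "      <td>ID_test_1002</td>\n",
2324 | "      <td>75.518860</td>\n",
2325 | "    </tr>\n",
2326 | "    <tr>\n",
2327 | "      <th>7</th>\n",
2328 | "      <td>ID_test_1003</td>\n",
2329 | "      <td>37.986477</td>\n",
2330 | "    </tr>\n",
2331 | "    <tr>\n",
2332 | "      <th>8</th>\n",
2333 | "      <td>ID_test_1004</td>\n",
2334 | "      <td>34.908413</td>\n",
2335 | "    </tr>\n",
2336 | "    <tr>\n",
2337 | "      <th>9</th>\n",
2338 | "      <td>ID_test_1005</td>\n",
2339 | "      <td>52.110146</td>\n",
2340 | "    </tr>\n",
2341 | "  </tbody>\n",
2342 | "</table>\n",
2343 | "</div>"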
2344 | ],
2345 | "text/plain": [
2346 | " ID target\n",
2347 | "0 ID_test_0 146.448013\n",
2348 | "1 ID_test_1 78.831528\n",
2349 | "2 ID_test_10 33.348366\n",
2350 | "3 ID_test_100 55.782494\n",
2351 | "4 ID_test_1000 88.687622\n",
2352 | "5 ID_test_1001 30.877085\n",
2353 | "6 ID_test_1002 75.518860\n",
2354 | "7 ID_test_1003 37.986477\n",
2355 | "8 ID_test_1004 34.908413\n",
2356 | "9 ID_test_1005 52.110146"
2357 | ]
2358 | },
2359 | "execution_count": 25,
2360 | "metadata": {},
2361 | "output_type": "execute_result"
2362 | }
2363 | ],
2364 | "source": [
2365 | "sub.head(10)"
2366 | ]
2367 | },
2368 | {
2369 | "cell_type": "code",
2370 | "execution_count": 29,
2371 | "metadata": {},
2372 | "outputs": [
2373 | {
2374 | "data": {
2375 | "text/plain": [
2376 | "23.151077529050763"
2377 | ]
2378 | },
2379 | "execution_count": 29,
2380 | "metadata": {},
2381 | "output_type": "execute_result"
2382 | }
2383 | ],
2384 | "source": [
2385 | "np.mean(errcb2)"
2386 | ]
2387 | },
2388 | {
2389 | "cell_type": "code",
2390 | "execution_count": 32,
2391 | "metadata": {
2392 | "collapsed": true
2393 | },
2394 | "outputs": [],
2395 | "source": [
2396 | "sub.to_csv(f\"cnn_preds.csv\", index=False)"
2397 | ]
2398 | },
2399 | {
2400 | "cell_type": "code",
2401 | "execution_count": 33,
2402 | "metadata": {},
2403 | "outputs": [
2404 | {
2405 | "data": {
2406 | "text/html": [
2407 | "<a href=cnn_preds.csv>Download CSV file</a>"
2408 | ],
2409 | "text/plain": [
2410 | "<IPython.core.display.HTML object>"
2411 | ]
2412 | },
2413 | "execution_count": 33,
2414 | "metadata": {},
2415 | "output_type": "execute_result"
2416 | }
2417 | ],
2418 | "source": [
2419 | "SUB_FILE_NAME = f\"cnn_preds.csv\"\n",
2420 | "from IPython.display import HTML\n",
2421 | "def create_download_link(title = \"Download CSV file\", filename = \"data.csv\"): \n",
2422 | "    html = '<a href={filename}>{title}</a>'\n",
2423 | "    html = html.format(title=title,filename=filename)\n",
2424 | "    return HTML(html)\n",
2425 | "create_download_link(filename = SUB_FILE_NAME)"
2426 | ]
2427 | }
2428 | ],
2429 | "metadata": {
2430 | "kernelspec": {
2431 | "display_name": "Python 3",
2432 | "language": "python",
2433 | "name": "python3"
2434 | },
2435 | "language_info": {
2436 | "codemirror_mode": {
2437 | "name": "ipython",
2438 | "version": 3
2439 | },
2440 | "file_extension": ".py",
2441 | "mimetype": "text/x-python",
2442 | "name": "python",
2443 | "nbconvert_exporter": "python",
2444 | "pygments_lexer": "ipython3",
2445 | "version": "3.6.3"
2446 | }
2447 | },
2448 | "nbformat": 4,
2449 | "nbformat_minor": 4
2450 | }
2451 |
--------------------------------------------------------------------------------