├── README.md
├── meli2021
├── 19_xgb_barely_tuned_yt.ipynb
├── 32_baseline_yt.ipynb
├── 61_active_model_yt.ipynb
├── README
└── utils.py
└── multiple_time_series
├── README
└── workshop_notebook.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # english_tutorials
--------------------------------------------------------------------------------
/meli2021/19_xgb_barely_tuned_yt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "75638528-4609-4ee2-93c9-28538579e471",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd\n",
11 | "import numpy as np\n",
12 | "import utils\n",
13 | "\n",
14 | "from sklearn.model_selection import GroupKFold, KFold\n",
15 | "from sklearn.linear_model import LinearRegression\n",
16 | "from sklearn.ensemble import RandomForestRegressor\n",
17 | "from sklearn.metrics import mean_squared_error\n",
18 | "from xgboost import XGBRegressor\n",
19 | "import tweedie\n",
20 | "\n",
21 | "\n",
22 | "from importlib import reload\n",
23 | "reload(utils)\n",
24 | "from skopt import gp_minimize"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "id": "3be069d6-c17e-43a2-8252-2acf5e9c26ad",
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "train = pd.read_parquet(\"./train/0.parquet\")\n",
35 | "train['date'] = pd.to_datetime(train['date'])\n",
36 | "train['fold'] = train['date'].dt.month"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 3,
42 | "id": "b9975ad7-ff3a-4b58-9d4c-b1435d0c5535",
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "data": {
47 | "text/html": [
48 | "
\n",
49 | "\n",
62 | "
\n",
63 | " \n",
64 | " \n",
65 | " | \n",
66 | " sku | \n",
67 | " date | \n",
68 | " sold_quantity | \n",
69 | " current_price | \n",
70 | " currency | \n",
71 | " listing_type | \n",
72 | " shipping_logistic_type | \n",
73 | " shipping_payment | \n",
74 | " minutes_active | \n",
75 | " item_domain_id | \n",
76 | " site_id | \n",
77 | " fold | \n",
78 | "
\n",
79 | " \n",
80 | " \n",
81 | " \n",
82 | " 0 | \n",
83 | " 464801 | \n",
84 | " 2021-02-01 | \n",
85 | " 0 | \n",
86 | " 156.78 | \n",
87 | " REA | \n",
88 | " classic | \n",
89 | " fulfillment | \n",
90 | " free_shipping | \n",
91 | " 1440.0 | \n",
92 | " MLB-NEBULIZERS | \n",
93 | " MLB | \n",
94 | " 2 | \n",
95 | "
\n",
96 | " \n",
97 | " 1 | \n",
98 | " 464801 | \n",
99 | " 2021-02-02 | \n",
100 | " 0 | \n",
101 | " 156.78 | \n",
102 | " REA | \n",
103 | " classic | \n",
104 | " fulfillment | \n",
105 | " free_shipping | \n",
106 | " 1440.0 | \n",
107 | " MLB-NEBULIZERS | \n",
108 | " MLB | \n",
109 | " 2 | \n",
110 | "
\n",
111 | " \n",
112 | " 2 | \n",
113 | " 464801 | \n",
114 | " 2021-02-03 | \n",
115 | " 0 | \n",
116 | " 156.78 | \n",
117 | " REA | \n",
118 | " classic | \n",
119 | " fulfillment | \n",
120 | " free_shipping | \n",
121 | " 1440.0 | \n",
122 | " MLB-NEBULIZERS | \n",
123 | " MLB | \n",
124 | " 2 | \n",
125 | "
\n",
126 | " \n",
127 | " 3 | \n",
128 | " 464801 | \n",
129 | " 2021-02-04 | \n",
130 | " 0 | \n",
131 | " 156.78 | \n",
132 | " REA | \n",
133 | " classic | \n",
134 | " fulfillment | \n",
135 | " free_shipping | \n",
136 | " 1440.0 | \n",
137 | " MLB-NEBULIZERS | \n",
138 | " MLB | \n",
139 | " 2 | \n",
140 | "
\n",
141 | " \n",
142 | " 4 | \n",
143 | " 464801 | \n",
144 | " 2021-02-05 | \n",
145 | " 1 | \n",
146 | " 156.78 | \n",
147 | " REA | \n",
148 | " classic | \n",
149 | " fulfillment | \n",
150 | " free_shipping | \n",
151 | " 1440.0 | \n",
152 | " MLB-NEBULIZERS | \n",
153 | " MLB | \n",
154 | " 2 | \n",
155 | "
\n",
156 | " \n",
157 | "
\n",
158 | "
"
159 | ],
160 | "text/plain": [
161 | " sku date sold_quantity current_price currency listing_type \\\n",
162 | "0 464801 2021-02-01 0 156.78 REA classic \n",
163 | "1 464801 2021-02-02 0 156.78 REA classic \n",
164 | "2 464801 2021-02-03 0 156.78 REA classic \n",
165 | "3 464801 2021-02-04 0 156.78 REA classic \n",
166 | "4 464801 2021-02-05 1 156.78 REA classic \n",
167 | "\n",
168 | " shipping_logistic_type shipping_payment minutes_active item_domain_id \\\n",
169 | "0 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
170 | "1 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
171 | "2 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
172 | "3 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
173 | "4 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
174 | "\n",
175 | " site_id fold \n",
176 | "0 MLB 2 \n",
177 | "1 MLB 2 \n",
178 | "2 MLB 2 \n",
179 | "3 MLB 2 \n",
180 | "4 MLB 2 "
181 | ]
182 | },
183 | "execution_count": 3,
184 | "metadata": {},
185 | "output_type": "execute_result"
186 | }
187 | ],
188 | "source": [
189 | "train.head()"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 3,
195 | "id": "smaller-boulder",
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "test = pd.read_csv(\"test_data.csv\", index_col=0).squeeze()"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 4,
205 | "id": "finnish-canadian",
206 | "metadata": {},
207 | "outputs": [
208 | {
209 | "data": {
210 | "text/plain": [
211 | "sku int64\n",
212 | "date datetime64[ns]\n",
213 | "sold_quantity int64\n",
214 | "current_price float64\n",
215 | "currency object\n",
216 | "listing_type object\n",
217 | "shipping_logistic_type object\n",
218 | "shipping_payment object\n",
219 | "minutes_active float64\n",
220 | "item_domain_id object\n",
221 | "site_id object\n",
222 | "fold int64\n",
223 | "dtype: object"
224 | ]
225 | },
226 | "execution_count": 4,
227 | "metadata": {},
228 | "output_type": "execute_result"
229 | }
230 | ],
231 | "source": [
232 | "train.dtypes"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 5,
238 | "id": "fc10499b-4c46-4f7e-a88e-a4233fb05504",
239 | "metadata": {},
240 | "outputs": [
241 | {
242 | "data": {
243 | "text/html": [
244 | "\n",
245 | "\n",
258 | "
\n",
259 | " \n",
260 | " \n",
261 | " | \n",
262 | " sku | \n",
263 | " date | \n",
264 | " sold_quantity | \n",
265 | " current_price | \n",
266 | " currency | \n",
267 | " listing_type | \n",
268 | " shipping_logistic_type | \n",
269 | " shipping_payment | \n",
270 | " minutes_active | \n",
271 | " item_domain_id | \n",
272 | " site_id | \n",
273 | " fold | \n",
274 | "
\n",
275 | " \n",
276 | " \n",
277 | " \n",
278 | " 0 | \n",
279 | " 464801 | \n",
280 | " 2021-02-01 | \n",
281 | " 0 | \n",
282 | " 156.78 | \n",
283 | " REA | \n",
284 | " classic | \n",
285 | " fulfillment | \n",
286 | " free_shipping | \n",
287 | " 1440.0 | \n",
288 | " MLB-NEBULIZERS | \n",
289 | " MLB | \n",
290 | " 2 | \n",
291 | "
\n",
292 | " \n",
293 | " 1 | \n",
294 | " 464801 | \n",
295 | " 2021-02-02 | \n",
296 | " 0 | \n",
297 | " 156.78 | \n",
298 | " REA | \n",
299 | " classic | \n",
300 | " fulfillment | \n",
301 | " free_shipping | \n",
302 | " 1440.0 | \n",
303 | " MLB-NEBULIZERS | \n",
304 | " MLB | \n",
305 | " 2 | \n",
306 | "
\n",
307 | " \n",
308 | " 2 | \n",
309 | " 464801 | \n",
310 | " 2021-02-03 | \n",
311 | " 0 | \n",
312 | " 156.78 | \n",
313 | " REA | \n",
314 | " classic | \n",
315 | " fulfillment | \n",
316 | " free_shipping | \n",
317 | " 1440.0 | \n",
318 | " MLB-NEBULIZERS | \n",
319 | " MLB | \n",
320 | " 2 | \n",
321 | "
\n",
322 | " \n",
323 | " 3 | \n",
324 | " 464801 | \n",
325 | " 2021-02-04 | \n",
326 | " 0 | \n",
327 | " 156.78 | \n",
328 | " REA | \n",
329 | " classic | \n",
330 | " fulfillment | \n",
331 | " free_shipping | \n",
332 | " 1440.0 | \n",
333 | " MLB-NEBULIZERS | \n",
334 | " MLB | \n",
335 | " 2 | \n",
336 | "
\n",
337 | " \n",
338 | " 4 | \n",
339 | " 464801 | \n",
340 | " 2021-02-05 | \n",
341 | " 1 | \n",
342 | " 156.78 | \n",
343 | " REA | \n",
344 | " classic | \n",
345 | " fulfillment | \n",
346 | " free_shipping | \n",
347 | " 1440.0 | \n",
348 | " MLB-NEBULIZERS | \n",
349 | " MLB | \n",
350 | " 2 | \n",
351 | "
\n",
352 | " \n",
353 | "
\n",
354 | "
"
355 | ],
356 | "text/plain": [
357 | " sku date sold_quantity current_price currency listing_type \\\n",
358 | "0 464801 2021-02-01 0 156.78 REA classic \n",
359 | "1 464801 2021-02-02 0 156.78 REA classic \n",
360 | "2 464801 2021-02-03 0 156.78 REA classic \n",
361 | "3 464801 2021-02-04 0 156.78 REA classic \n",
362 | "4 464801 2021-02-05 1 156.78 REA classic \n",
363 | "\n",
364 | " shipping_logistic_type shipping_payment minutes_active item_domain_id \\\n",
365 | "0 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
366 | "1 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
367 | "2 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
368 | "3 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
369 | "4 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
370 | "\n",
371 | " site_id fold \n",
372 | "0 MLB 2 \n",
373 | "1 MLB 2 \n",
374 | "2 MLB 2 \n",
375 | "3 MLB 2 \n",
376 | "4 MLB 2 "
377 | ]
378 | },
379 | "execution_count": 5,
380 | "metadata": {},
381 | "output_type": "execute_result"
382 | }
383 | ],
384 | "source": [
385 | "train.head()"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 6,
391 | "id": "refined-string",
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "cats = ['item_domain_id', 'currency', 'listing_type', 'shipping_logistic_type', 'shipping_payment', 'site_id']"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": 7,
401 | "id": "99b8015a-8c95-4936-9ba6-9d655aa19848",
402 | "metadata": {},
403 | "outputs": [],
404 | "source": [
405 | "from category_encoders import OrdinalEncoder\n",
406 | "enc = OrdinalEncoder(cats)\n",
407 | "train = enc.fit_transform(train)"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 8,
413 | "id": "monthly-general",
414 | "metadata": {},
415 | "outputs": [],
416 | "source": [
417 | "def gen_tr_ts():\n",
418 | "    \"\"\"Yield (index of fold rows, index of latest-date rows outside the fold, fold) for folds 2 and 3.\"\"\"\n",
419 | "    for fold in (2, 3):\n",
420 | "        in_fold = train['fold'] == fold\n",
421 | "        last_day = train.loc[~in_fold, 'date'].max()\n",
422 | "        last_day_idx = train[~in_fold & (train['date'] == last_day)].index\n",
423 | "        yield train.index[in_fold], last_day_idx, fold"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "id": "bdb25520-ed36-4c4c-a9fe-23387cb3f918",
430 | "metadata": {},
431 | "outputs": [
432 | {
433 | "name": "stdout",
434 | "output_type": "stream",
435 | "text": [
436 | "Iteration No: 1 started. Evaluating function at random point.\n",
437 | "[0.09871192514273254, 9, 0.16531200313642108, 0.9491364637917304, 1.2337280871824563, 120]\n",
438 | "8.619976237016095\n",
439 | "8.863340114023996\n",
440 | "Iteration No: 1 ended. Evaluation done at random point.\n",
441 | "Time taken: 164.3399\n",
442 | "Function value obtained: 8.7417\n",
443 | "Current minimum: 8.7417\n",
444 | "Iteration No: 2 started. Evaluating function at random point.\n",
445 | "[0.0059678992438367785, 7, 0.8919851637254288, 0.8116798250174155, 1.3101407817629525, 158]\n",
446 | "8.969453535454983\n",
447 | "9.180772268605745\n",
448 | "Iteration No: 2 ended. Evaluation done at random point.\n",
449 | "Time taken: 161.5522\n",
450 | "Function value obtained: 9.0751\n",
451 | "Current minimum: 8.7417\n",
452 | "Iteration No: 3 started. Evaluating function at random point.\n",
453 | "[0.007707362534461022, 3, 0.5309725180523154, 0.8725658221213098, 1.4526327599071185, 130]\n",
454 | "9.148086951536671\n"
455 | ]
456 | }
457 | ],
458 | "source": [
459 | "def tune(params):\n",
460 | "    \"\"\"Objective for gp_minimize: utils.rps averaged over the folds of gen_tr_ts().\n",
461 | "\n",
462 | "    params = [learning_rate, max_depth, subsample, colsample_bytree,\n",
463 | "    tweedie_variance_power, min_child_weight], in the order of `space`.\n",
464 | "    \"\"\"\n",
465 | "    print(params)\n",
466 | "    features = [\"current_price\", \"minutes_active\"] + cats\n",
467 | "    # Loop-invariant: the same feature matrix and target serve every fold.\n",
468 | "    X = train[features]\n",
469 | "    y = train['sold_quantity']\n",
470 | "\n",
471 | "    total_rps = 0.\n",
472 | "    n_folds = 0\n",
473 | "    for tr, ts, fold in gen_tr_ts():\n",
474 | "        Xtr, ytr = X.iloc[tr], y.iloc[tr]\n",
475 | "        Xval = X.iloc[ts]\n",
476 | "\n",
477 | "        mdl = XGBRegressor(n_estimators=1000, learning_rate=params[0],\n",
478 | "                           max_depth=params[1],\n",
479 | "                           subsample=params[2],\n",
480 | "                           colsample_bytree=params[3],\n",
481 | "                           tweedie_variance_power=params[4],\n",
482 | "                           min_child_weight=params[5],\n",
483 | "                           random_state=0, objective=\"reg:tweedie\",\n",
484 | "                           base_score=1e-3,\n",
485 | "                           tree_method='gpu_hist')\n",
486 | "        mdl.fit(Xtr, ytr)\n",
487 | "        p = mdl.predict(Xval)\n",
488 | "\n",
489 | "        ## EVAL: observed days until cumulative sales reach the stock given in `test`\n",
490 | "        pp = train[train['fold'] != fold][['sku', 'date', 'sold_quantity']]\n",
491 | "        pp['stock'] = pp['sku'].map(test)\n",
492 | "        pp = pp.sort_values([\"sku\",\"date\"])\n",
493 | "        pp['cumulative_y'] = pp.groupby(\"sku\")['sold_quantity'].cumsum()\n",
494 | "\n",
495 | "        pp = pp.dropna(subset=['stock'])\n",
496 | "        pp['stockout_y'] = pp['cumulative_y'] >= pp['stock']\n",
497 | "\n",
498 | "        first_so_y = pp[pp['stockout_y']].groupby(\"sku\").first()\n",
499 | "        days_to_so_y = (first_so_y[\"date\"] - pp[\"date\"].min()) / np.timedelta64(1, 'D')\n",
500 | "        days_to_so_y = days_to_so_y.reindex(pp['sku'].unique()).fillna(30.).clip(1,30)\n",
501 | "\n",
502 | "        # Predicted days to stockout: stock divided by the predicted sold_quantity.\n",
503 | "        ppp = train.iloc[ts][['sku']].copy()\n",
504 | "        ppp['p'] = p\n",
505 | "        ppp['stock'] = ppp['sku'].map(test)\n",
506 | "        ppp = ppp.dropna(subset=['stock'])\n",
507 | "        # fillna/clip before astype(int): casting NaN/inf to int raises, and a\n",
508 | "        # fillna placed after the cast can never fill anything.\n",
509 | "        ppp['days_to_so'] = (ppp['stock'] / ppp['p']).fillna(30.).clip(1,30).astype(int)\n",
510 | "        days_to_so_p = ppp[['sku', 'days_to_so']].set_index(\"sku\").squeeze().reindex(days_to_so_y.index)\n",
511 | "\n",
512 | "        # tweedie distribution -> [0.05, 0.07, ... .13, 0.12]\n",
513 | "        days_to_so_p2 = utils.pred_list_to_tweedie(days_to_so_p, phi=2, p=1.5)\n",
514 | "        rps = utils.rps(days_to_so_y, days_to_so_p2, probs=True)\n",
515 | "        total_rps += rps\n",
516 | "        n_folds += 1\n",
517 | "        print(rps)\n",
518 | "    return total_rps / n_folds\n",
519 | "\n",
520 | "space = [(1e-3, 1e-1, 'log-uniform'),\n",
521 | " (1, 10),\n",
522 | " (0.05, 0.95),\n",
523 | " (0.05, 0.95),\n",
524 | " (1.0,1.99),\n",
525 | " (1,300)]\n",
526 | "res = gp_minimize(tune, space, random_state=1, verbose=1)\n",
527 | "\n",
528 | " "
529 | ]
530 | },
531 | {
532 | "cell_type": "code",
533 | "execution_count": null,
534 | "id": "pretty-literature",
535 | "metadata": {},
536 | "outputs": [],
537 | "source": [
538 | "# 15a \n",
539 | "# Mean CV 9.0805\n",
540 | "# LB 6.2598\n",
541 | "\n",
542 | "\n",
543 | "# Iteration No: 2 started. Evaluating function at random point.\n",
544 | "# [0.003936128001463711, 2, 0.29539066512210194, 0.47989860558921493, 1.8040470414877383, 145]\n",
545 | "# 6.131413939395725\n",
546 | "# 6.4664243315180086\n",
547 | "# Iteration No: 2 ended. Evaluation done at random point.\n",
548 | "# Time taken: 91.9157\n",
549 | "# Function value obtained: 6.2989\n",
550 | "# Current minimum: 6.2989"
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "id": "comfortable-hypothesis",
556 | "metadata": {},
557 | "source": [
558 | "# sub"
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": 10,
564 | "id": "dominant-machine",
565 | "metadata": {},
566 | "outputs": [
567 | {
568 | "name": "stdout",
569 | "output_type": "stream",
570 | "text": [
571 | "True\n"
572 | ]
573 | }
574 | ],
575 | "source": [
576 | "test_df = train[train['date'] == \"2021-03-31\"]\n",
577 | "test_df = test_df[test_df['sku'].isin(test.index)]\n",
578 | "print(np.all(test_df['sku'] == test.index))\n",
579 | "\n",
580 | "features = [\"current_price\", \"minutes_active\"] + cats\n",
581 | "params = [0.003936128001463711, 2, 0.29539066512210194, 0.47989860558921493, 1.8040470414877383, 145]\n",
582 | "mdl = XGBRegressor(n_estimators=1000, learning_rate=params[0],\n",
583 | " max_depth=params[1],\n",
584 | " subsample=params[2],\n",
585 | " colsample_bytree=params[3],\n",
586 | " tweedie_variance_power=params[4],\n",
587 | " min_child_weight=params[5],\n",
588 | " random_state=0, objective=\"reg:tweedie\", \n",
589 | " base_score=1e-3,\n",
590 | " tree_method='gpu_hist')\n",
591 | "mdl.fit(train[features], train['sold_quantity'])\n",
592 | "p = mdl.predict(test_df[features])"
593 | ]
594 | },
595 | {
596 | "cell_type": "code",
597 | "execution_count": 14,
598 | "id": "important-rugby",
599 | "metadata": {},
600 | "outputs": [],
601 | "source": [
602 | "spp = test_df[['sku']].copy()\n",
603 | "spp['p'] = p\n",
604 | "spp['stock'] = spp['sku'].map(test)\n",
605 | "spp['days_to_so'] = (spp['stock'] / spp['p']).fillna(30.).clip(1,30).astype(int)\n"
606 | ]
607 | },
608 | {
609 | "cell_type": "code",
610 | "execution_count": 15,
611 | "id": "excess-porter",
612 | "metadata": {},
613 | "outputs": [
614 | {
615 | "data": {
616 | "text/plain": [
617 | "1.0"
618 | ]
619 | },
620 | "execution_count": 15,
621 | "metadata": {},
622 | "output_type": "execute_result"
623 | }
624 | ],
625 | "source": [
626 | "test.index.isin(spp['sku']).mean()"
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": 16,
632 | "id": "joint-reservation",
633 | "metadata": {},
634 | "outputs": [],
635 | "source": [
636 | "prob_array = utils.pred_list_to_tweedie(spp['days_to_so'].values, phi=2., p=1.5)\n",
637 | "pd.set_option(\"display.max_columns\", 31)\n",
638 | "pd.DataFrame(prob_array).round(4).to_csv(\"19.csv.gz\", header=False, index=False, compression=\"gzip\")"
639 | ]
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": 18,
644 | "id": "fitting-hamilton",
645 | "metadata": {},
646 | "outputs": [
647 | {
648 | "data": {
649 | "text/html": [
650 | "\n",
651 | "\n",
664 | "
\n",
665 | " \n",
666 | " \n",
667 | " | \n",
668 | " 0 | \n",
669 | " 1 | \n",
670 | " 2 | \n",
671 | " 3 | \n",
672 | " 4 | \n",
673 | " 5 | \n",
674 | " 6 | \n",
675 | " 7 | \n",
676 | " 8 | \n",
677 | " 9 | \n",
678 | " 10 | \n",
679 | " 11 | \n",
680 | " 12 | \n",
681 | " 13 | \n",
682 | " 14 | \n",
683 | " 15 | \n",
684 | " 16 | \n",
685 | " 17 | \n",
686 | " 18 | \n",
687 | " 19 | \n",
688 | " 20 | \n",
689 | " 21 | \n",
690 | " 22 | \n",
691 | " 23 | \n",
692 | " 24 | \n",
693 | " 25 | \n",
694 | " 26 | \n",
695 | " 27 | \n",
696 | " 28 | \n",
697 | " 29 | \n",
698 | "
\n",
699 | " \n",
700 | " \n",
701 | " \n",
702 | " 0 | \n",
703 | " 0.0215 | \n",
704 | " 0.0145 | \n",
705 | " 0.0175 | \n",
706 | " 0.0204 | \n",
707 | " 0.0233 | \n",
708 | " 0.0259 | \n",
709 | " 0.0284 | \n",
710 | " 0.0307 | \n",
711 | " 0.0328 | \n",
712 | " 0.0346 | \n",
713 | " 0.0362 | \n",
714 | " 0.0375 | \n",
715 | " 0.0386 | \n",
716 | " 0.0395 | \n",
717 | " 0.0401 | \n",
718 | " 0.0405 | \n",
719 | " 0.0407 | \n",
720 | " 0.0407 | \n",
721 | " 0.0405 | \n",
722 | " 0.0402 | \n",
723 | " 0.0397 | \n",
724 | " 0.0390 | \n",
725 | " 0.0383 | \n",
726 | " 0.0374 | \n",
727 | " 0.0364 | \n",
728 | " 0.0354 | \n",
729 | " 0.0343 | \n",
730 | " 0.0331 | \n",
731 | " 0.0319 | \n",
732 | " 0.0307 | \n",
733 | "
\n",
734 | " \n",
735 | " 1 | \n",
736 | " 0.0161 | \n",
737 | " 0.0111 | \n",
738 | " 0.0137 | \n",
739 | " 0.0162 | \n",
740 | " 0.0188 | \n",
741 | " 0.0213 | \n",
742 | " 0.0238 | \n",
743 | " 0.0261 | \n",
744 | " 0.0284 | \n",
745 | " 0.0305 | \n",
746 | " 0.0325 | \n",
747 | " 0.0343 | \n",
748 | " 0.0359 | \n",
749 | " 0.0373 | \n",
750 | " 0.0386 | \n",
751 | " 0.0397 | \n",
752 | " 0.0406 | \n",
753 | " 0.0413 | \n",
754 | " 0.0418 | \n",
755 | " 0.0422 | \n",
756 | " 0.0424 | \n",
757 | " 0.0424 | \n",
758 | " 0.0423 | \n",
759 | " 0.0421 | \n",
760 | " 0.0417 | \n",
761 | " 0.0412 | \n",
762 | " 0.0406 | \n",
763 | " 0.0399 | \n",
764 | " 0.0391 | \n",
765 | " 0.0383 | \n",
766 | "
\n",
767 | " \n",
768 | " 2 | \n",
769 | " 0.0161 | \n",
770 | " 0.0111 | \n",
771 | " 0.0137 | \n",
772 | " 0.0162 | \n",
773 | " 0.0188 | \n",
774 | " 0.0213 | \n",
775 | " 0.0238 | \n",
776 | " 0.0261 | \n",
777 | " 0.0284 | \n",
778 | " 0.0305 | \n",
779 | " 0.0325 | \n",
780 | " 0.0343 | \n",
781 | " 0.0359 | \n",
782 | " 0.0373 | \n",
783 | " 0.0386 | \n",
784 | " 0.0397 | \n",
785 | " 0.0406 | \n",
786 | " 0.0413 | \n",
787 | " 0.0418 | \n",
788 | " 0.0422 | \n",
789 | " 0.0424 | \n",
790 | " 0.0424 | \n",
791 | " 0.0423 | \n",
792 | " 0.0421 | \n",
793 | " 0.0417 | \n",
794 | " 0.0412 | \n",
795 | " 0.0406 | \n",
796 | " 0.0399 | \n",
797 | " 0.0391 | \n",
798 | " 0.0383 | \n",
799 | "
\n",
800 | " \n",
801 | " 3 | \n",
802 | " 0.0161 | \n",
803 | " 0.0111 | \n",
804 | " 0.0137 | \n",
805 | " 0.0162 | \n",
806 | " 0.0188 | \n",
807 | " 0.0213 | \n",
808 | " 0.0238 | \n",
809 | " 0.0261 | \n",
810 | " 0.0284 | \n",
811 | " 0.0305 | \n",
812 | " 0.0325 | \n",
813 | " 0.0343 | \n",
814 | " 0.0359 | \n",
815 | " 0.0373 | \n",
816 | " 0.0386 | \n",
817 | " 0.0397 | \n",
818 | " 0.0406 | \n",
819 | " 0.0413 | \n",
820 | " 0.0418 | \n",
821 | " 0.0422 | \n",
822 | " 0.0424 | \n",
823 | " 0.0424 | \n",
824 | " 0.0423 | \n",
825 | " 0.0421 | \n",
826 | " 0.0417 | \n",
827 | " 0.0412 | \n",
828 | " 0.0406 | \n",
829 | " 0.0399 | \n",
830 | " 0.0391 | \n",
831 | " 0.0383 | \n",
832 | "
\n",
833 | " \n",
834 | " 4 | \n",
835 | " 0.0161 | \n",
836 | " 0.0111 | \n",
837 | " 0.0137 | \n",
838 | " 0.0162 | \n",
839 | " 0.0188 | \n",
840 | " 0.0213 | \n",
841 | " 0.0238 | \n",
842 | " 0.0261 | \n",
843 | " 0.0284 | \n",
844 | " 0.0305 | \n",
845 | " 0.0325 | \n",
846 | " 0.0343 | \n",
847 | " 0.0359 | \n",
848 | " 0.0373 | \n",
849 | " 0.0386 | \n",
850 | " 0.0397 | \n",
851 | " 0.0406 | \n",
852 | " 0.0413 | \n",
853 | " 0.0418 | \n",
854 | " 0.0422 | \n",
855 | " 0.0424 | \n",
856 | " 0.0424 | \n",
857 | " 0.0423 | \n",
858 | " 0.0421 | \n",
859 | " 0.0417 | \n",
860 | " 0.0412 | \n",
861 | " 0.0406 | \n",
862 | " 0.0399 | \n",
863 | " 0.0391 | \n",
864 | " 0.0383 | \n",
865 | "
\n",
866 | " \n",
867 | " ... | \n",
868 | " ... | \n",
869 | " ... | \n",
870 | " ... | \n",
871 | " ... | \n",
872 | " ... | \n",
873 | " ... | \n",
874 | " ... | \n",
875 | " ... | \n",
876 | " ... | \n",
877 | " ... | \n",
878 | " ... | \n",
879 | " ... | \n",
880 | " ... | \n",
881 | " ... | \n",
882 | " ... | \n",
883 | " ... | \n",
884 | " ... | \n",
885 | " ... | \n",
886 | " ... | \n",
887 | " ... | \n",
888 | " ... | \n",
889 | " ... | \n",
890 | " ... | \n",
891 | " ... | \n",
892 | " ... | \n",
893 | " ... | \n",
894 | " ... | \n",
895 | " ... | \n",
896 | " ... | \n",
897 | " ... | \n",
898 | "
\n",
899 | " \n",
900 | " 551467 | \n",
901 | " 0.0161 | \n",
902 | " 0.0111 | \n",
903 | " 0.0137 | \n",
904 | " 0.0162 | \n",
905 | " 0.0188 | \n",
906 | " 0.0213 | \n",
907 | " 0.0238 | \n",
908 | " 0.0261 | \n",
909 | " 0.0284 | \n",
910 | " 0.0305 | \n",
911 | " 0.0325 | \n",
912 | " 0.0343 | \n",
913 | " 0.0359 | \n",
914 | " 0.0373 | \n",
915 | " 0.0386 | \n",
916 | " 0.0397 | \n",
917 | " 0.0406 | \n",
918 | " 0.0413 | \n",
919 | " 0.0418 | \n",
920 | " 0.0422 | \n",
921 | " 0.0424 | \n",
922 | " 0.0424 | \n",
923 | " 0.0423 | \n",
924 | " 0.0421 | \n",
925 | " 0.0417 | \n",
926 | " 0.0412 | \n",
927 | " 0.0406 | \n",
928 | " 0.0399 | \n",
929 | " 0.0391 | \n",
930 | " 0.0383 | \n",
931 | "
\n",
932 | " \n",
933 | " 551468 | \n",
934 | " 0.0161 | \n",
935 | " 0.0111 | \n",
936 | " 0.0137 | \n",
937 | " 0.0162 | \n",
938 | " 0.0188 | \n",
939 | " 0.0213 | \n",
940 | " 0.0238 | \n",
941 | " 0.0261 | \n",
942 | " 0.0284 | \n",
943 | " 0.0305 | \n",
944 | " 0.0325 | \n",
945 | " 0.0343 | \n",
946 | " 0.0359 | \n",
947 | " 0.0373 | \n",
948 | " 0.0386 | \n",
949 | " 0.0397 | \n",
950 | " 0.0406 | \n",
951 | " 0.0413 | \n",
952 | " 0.0418 | \n",
953 | " 0.0422 | \n",
954 | " 0.0424 | \n",
955 | " 0.0424 | \n",
956 | " 0.0423 | \n",
957 | " 0.0421 | \n",
958 | " 0.0417 | \n",
959 | " 0.0412 | \n",
960 | " 0.0406 | \n",
961 | " 0.0399 | \n",
962 | " 0.0391 | \n",
963 | " 0.0383 | \n",
964 | "
\n",
965 | " \n",
966 | " 551469 | \n",
967 | " 0.0161 | \n",
968 | " 0.0111 | \n",
969 | " 0.0137 | \n",
970 | " 0.0162 | \n",
971 | " 0.0188 | \n",
972 | " 0.0213 | \n",
973 | " 0.0238 | \n",
974 | " 0.0261 | \n",
975 | " 0.0284 | \n",
976 | " 0.0305 | \n",
977 | " 0.0325 | \n",
978 | " 0.0343 | \n",
979 | " 0.0359 | \n",
980 | " 0.0373 | \n",
981 | " 0.0386 | \n",
982 | " 0.0397 | \n",
983 | " 0.0406 | \n",
984 | " 0.0413 | \n",
985 | " 0.0418 | \n",
986 | " 0.0422 | \n",
987 | " 0.0424 | \n",
988 | " 0.0424 | \n",
989 | " 0.0423 | \n",
990 | " 0.0421 | \n",
991 | " 0.0417 | \n",
992 | " 0.0412 | \n",
993 | " 0.0406 | \n",
994 | " 0.0399 | \n",
995 | " 0.0391 | \n",
996 | " 0.0383 | \n",
997 | "
\n",
998 | " \n",
999 | " 551470 | \n",
1000 | " 0.0399 | \n",
1001 | " 0.0256 | \n",
1002 | " 0.0296 | \n",
1003 | " 0.0331 | \n",
1004 | " 0.0361 | \n",
1005 | " 0.0386 | \n",
1006 | " 0.0405 | \n",
1007 | " 0.0419 | \n",
1008 | " 0.0429 | \n",
1009 | " 0.0434 | \n",
1010 | " 0.0435 | \n",
1011 | " 0.0432 | \n",
1012 | " 0.0427 | \n",
1013 | " 0.0418 | \n",
1014 | " 0.0407 | \n",
1015 | " 0.0394 | \n",
1016 | " 0.0379 | \n",
1017 | " 0.0364 | \n",
1018 | " 0.0347 | \n",
1019 | " 0.0330 | \n",
1020 | " 0.0312 | \n",
1021 | " 0.0294 | \n",
1022 | " 0.0276 | \n",
1023 | " 0.0259 | \n",
1024 | " 0.0242 | \n",
1025 | " 0.0225 | \n",
1026 | " 0.0209 | \n",
1027 | " 0.0193 | \n",
1028 | " 0.0178 | \n",
1029 | " 0.0164 | \n",
1030 | "
\n",
1031 | " \n",
1032 | " 551471 | \n",
1033 | " 0.0161 | \n",
1034 | " 0.0111 | \n",
1035 | " 0.0137 | \n",
1036 | " 0.0162 | \n",
1037 | " 0.0188 | \n",
1038 | " 0.0213 | \n",
1039 | " 0.0238 | \n",
1040 | " 0.0261 | \n",
1041 | " 0.0284 | \n",
1042 | " 0.0305 | \n",
1043 | " 0.0325 | \n",
1044 | " 0.0343 | \n",
1045 | " 0.0359 | \n",
1046 | " 0.0373 | \n",
1047 | " 0.0386 | \n",
1048 | " 0.0397 | \n",
1049 | " 0.0406 | \n",
1050 | " 0.0413 | \n",
1051 | " 0.0418 | \n",
1052 | " 0.0422 | \n",
1053 | " 0.0424 | \n",
1054 | " 0.0424 | \n",
1055 | " 0.0423 | \n",
1056 | " 0.0421 | \n",
1057 | " 0.0417 | \n",
1058 | " 0.0412 | \n",
1059 | " 0.0406 | \n",
1060 | " 0.0399 | \n",
1061 | " 0.0391 | \n",
1062 | " 0.0383 | \n",
1063 | "
\n",
1064 | " \n",
1065 | "
\n",
1066 | "
551472 rows × 30 columns
\n",
1067 | "
"
1068 | ],
1069 | "text/plain": [
1070 | " 0 1 2 3 4 5 6 7 \\\n",
1071 | "0 0.0215 0.0145 0.0175 0.0204 0.0233 0.0259 0.0284 0.0307 \n",
1072 | "1 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n",
1073 | "2 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n",
1074 | "3 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n",
1075 | "4 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n",
1076 | "... ... ... ... ... ... ... ... ... \n",
1077 | "551467 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n",
1078 | "551468 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n",
1079 | "551469 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n",
1080 | "551470 0.0399 0.0256 0.0296 0.0331 0.0361 0.0386 0.0405 0.0419 \n",
1081 | "551471 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n",
1082 | "\n",
1083 | " 8 9 10 11 12 13 14 15 \\\n",
1084 | "0 0.0328 0.0346 0.0362 0.0375 0.0386 0.0395 0.0401 0.0405 \n",
1085 | "1 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n",
1086 | "2 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n",
1087 | "3 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n",
1088 | "4 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n",
1089 | "... ... ... ... ... ... ... ... ... \n",
1090 | "551467 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n",
1091 | "551468 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n",
1092 | "551469 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n",
1093 | "551470 0.0429 0.0434 0.0435 0.0432 0.0427 0.0418 0.0407 0.0394 \n",
1094 | "551471 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n",
1095 | "\n",
1096 | " 16 17 18 19 20 21 22 23 \\\n",
1097 | "0 0.0407 0.0407 0.0405 0.0402 0.0397 0.0390 0.0383 0.0374 \n",
1098 | "1 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n",
1099 | "2 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n",
1100 | "3 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n",
1101 | "4 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n",
1102 | "... ... ... ... ... ... ... ... ... \n",
1103 | "551467 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n",
1104 | "551468 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n",
1105 | "551469 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n",
1106 | "551470 0.0379 0.0364 0.0347 0.0330 0.0312 0.0294 0.0276 0.0259 \n",
1107 | "551471 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n",
1108 | "\n",
1109 | " 24 25 26 27 28 29 \n",
1110 | "0 0.0364 0.0354 0.0343 0.0331 0.0319 0.0307 \n",
1111 | "1 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n",
1112 | "2 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n",
1113 | "3 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n",
1114 | "4 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n",
1115 | "... ... ... ... ... ... ... \n",
1116 | "551467 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n",
1117 | "551468 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n",
1118 | "551469 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n",
1119 | "551470 0.0242 0.0225 0.0209 0.0193 0.0178 0.0164 \n",
1120 | "551471 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n",
1121 | "\n",
1122 | "[551472 rows x 30 columns]"
1123 | ]
1124 | },
1125 | "execution_count": 18,
1126 | "metadata": {},
1127 | "output_type": "execute_result"
1128 | }
1129 | ],
1130 | "source": [
1131 | "pd.read_csv(\"19.csv.gz\",header=None)#.sum(axis=1)"
1132 | ]
1133 | },
1134 | {
1135 | "cell_type": "code",
1136 | "execution_count": null,
1137 | "id": "eleven-intelligence",
1138 | "metadata": {},
1139 | "outputs": [],
1140 | "source": []
1141 | }
1142 | ],
1143 | "metadata": {
1144 | "kernelspec": {
1145 | "display_name": "Python 3",
1146 | "language": "python",
1147 | "name": "python3"
1148 | },
1149 | "language_info": {
1150 | "codemirror_mode": {
1151 | "name": "ipython",
1152 | "version": 3
1153 | },
1154 | "file_extension": ".py",
1155 | "mimetype": "text/x-python",
1156 | "name": "python",
1157 | "nbconvert_exporter": "python",
1158 | "pygments_lexer": "ipython3",
1159 | "version": "3.8.5"
1160 | }
1161 | },
1162 | "nbformat": 4,
1163 | "nbformat_minor": 5
1164 | }
1165 |
--------------------------------------------------------------------------------
/meli2021/32_baseline_yt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "id": "75638528-4609-4ee2-93c9-28538579e471",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd\n",
11 | "import numpy as np\n",
12 | "import utils\n",
13 | "\n",
14 | "from sklearn.model_selection import GroupKFold, KFold\n",
15 | "from sklearn.linear_model import LinearRegression\n",
16 | "from sklearn.ensemble import RandomForestRegressor\n",
17 | "from sklearn.metrics import mean_squared_error\n",
18 | "from xgboost import XGBRegressor\n",
19 | "import tweedie\n",
20 | "\n",
21 | "from importlib import reload\n",
22 | "reload(utils)\n",
23 | "from skopt import gp_minimize\n",
24 | "%matplotlib inline"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 5,
30 | "id": "3be069d6-c17e-43a2-8252-2acf5e9c26ad",
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "train = pd.read_parquet(\"./train/0.parquet\")\n",
35 | "train['date'] = pd.to_datetime(train['date'])\n",
36 | "train['fold'] = train['date'].dt.month"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 6,
42 | "id": "smaller-boulder",
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "test = pd.read_csv(\"test_data.csv\", index_col=0).squeeze()"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 7,
52 | "id": "finnish-canadian",
53 | "metadata": {},
54 | "outputs": [
55 | {
56 | "data": {
57 | "text/plain": [
58 | "sku int64\n",
59 | "date datetime64[ns]\n",
60 | "sold_quantity int64\n",
61 | "current_price float64\n",
62 | "currency object\n",
63 | "listing_type object\n",
64 | "shipping_logistic_type object\n",
65 | "shipping_payment object\n",
66 | "minutes_active float64\n",
67 | "item_domain_id object\n",
68 | "site_id object\n",
69 | "fold int64\n",
70 | "dtype: object"
71 | ]
72 | },
73 | "execution_count": 7,
74 | "metadata": {},
75 | "output_type": "execute_result"
76 | }
77 | ],
78 | "source": [
79 | "train.dtypes"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 8,
85 | "id": "fc10499b-4c46-4f7e-a88e-a4233fb05504",
86 | "metadata": {},
87 | "outputs": [
88 | {
89 | "data": {
90 | "text/html": [
91 | "\n",
92 | "\n",
105 | "
\n",
106 | " \n",
107 | " \n",
108 | " | \n",
109 | " sku | \n",
110 | " date | \n",
111 | " sold_quantity | \n",
112 | " current_price | \n",
113 | " currency | \n",
114 | " listing_type | \n",
115 | " shipping_logistic_type | \n",
116 | " shipping_payment | \n",
117 | " minutes_active | \n",
118 | " item_domain_id | \n",
119 | " site_id | \n",
120 | " fold | \n",
121 | "
\n",
122 | " \n",
123 | " \n",
124 | " \n",
125 | " 0 | \n",
126 | " 464801 | \n",
127 | " 2021-02-01 | \n",
128 | " 0 | \n",
129 | " 156.78 | \n",
130 | " REA | \n",
131 | " classic | \n",
132 | " fulfillment | \n",
133 | " free_shipping | \n",
134 | " 1440.0 | \n",
135 | " MLB-NEBULIZERS | \n",
136 | " MLB | \n",
137 | " 2 | \n",
138 | "
\n",
139 | " \n",
140 | " 1 | \n",
141 | " 464801 | \n",
142 | " 2021-02-02 | \n",
143 | " 0 | \n",
144 | " 156.78 | \n",
145 | " REA | \n",
146 | " classic | \n",
147 | " fulfillment | \n",
148 | " free_shipping | \n",
149 | " 1440.0 | \n",
150 | " MLB-NEBULIZERS | \n",
151 | " MLB | \n",
152 | " 2 | \n",
153 | "
\n",
154 | " \n",
155 | " 2 | \n",
156 | " 464801 | \n",
157 | " 2021-02-03 | \n",
158 | " 0 | \n",
159 | " 156.78 | \n",
160 | " REA | \n",
161 | " classic | \n",
162 | " fulfillment | \n",
163 | " free_shipping | \n",
164 | " 1440.0 | \n",
165 | " MLB-NEBULIZERS | \n",
166 | " MLB | \n",
167 | " 2 | \n",
168 | "
\n",
169 | " \n",
170 | " 3 | \n",
171 | " 464801 | \n",
172 | " 2021-02-04 | \n",
173 | " 0 | \n",
174 | " 156.78 | \n",
175 | " REA | \n",
176 | " classic | \n",
177 | " fulfillment | \n",
178 | " free_shipping | \n",
179 | " 1440.0 | \n",
180 | " MLB-NEBULIZERS | \n",
181 | " MLB | \n",
182 | " 2 | \n",
183 | "
\n",
184 | " \n",
185 | " 4 | \n",
186 | " 464801 | \n",
187 | " 2021-02-05 | \n",
188 | " 1 | \n",
189 | " 156.78 | \n",
190 | " REA | \n",
191 | " classic | \n",
192 | " fulfillment | \n",
193 | " free_shipping | \n",
194 | " 1440.0 | \n",
195 | " MLB-NEBULIZERS | \n",
196 | " MLB | \n",
197 | " 2 | \n",
198 | "
\n",
199 | " \n",
200 | "
\n",
201 | "
"
202 | ],
203 | "text/plain": [
204 | " sku date sold_quantity current_price currency listing_type \\\n",
205 | "0 464801 2021-02-01 0 156.78 REA classic \n",
206 | "1 464801 2021-02-02 0 156.78 REA classic \n",
207 | "2 464801 2021-02-03 0 156.78 REA classic \n",
208 | "3 464801 2021-02-04 0 156.78 REA classic \n",
209 | "4 464801 2021-02-05 1 156.78 REA classic \n",
210 | "\n",
211 | " shipping_logistic_type shipping_payment minutes_active item_domain_id \\\n",
212 | "0 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
213 | "1 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
214 | "2 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
215 | "3 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
216 | "4 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n",
217 | "\n",
218 | " site_id fold \n",
219 | "0 MLB 2 \n",
220 | "1 MLB 2 \n",
221 | "2 MLB 2 \n",
222 | "3 MLB 2 \n",
223 | "4 MLB 2 "
224 | ]
225 | },
226 | "execution_count": 8,
227 | "metadata": {},
228 | "output_type": "execute_result"
229 | }
230 | ],
231 | "source": [
232 | "train.head()"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 9,
238 | "id": "monthly-general",
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "def gen_tr_ts():\n",
243 | " for fold in [2,3]:\n",
244 | " ts = train[train['fold'] != fold]['date'].max()\n",
245 | " ts = train[(train['fold'] != fold) & (train['date'] == ts)].index\n",
246 | " yield train.index[train['fold'] == fold], ts, fold\n",
247 | "\n",
248 | " "
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": 10,
254 | "id": "bdb25520-ed36-4c4c-a9fe-23387cb3f918",
255 | "metadata": {},
256 | "outputs": [],
257 | "source": [
258 | "\n",
259 | "def gen_p(na=30., power=1.5):\n",
260 | " dts = list()\n",
261 | " for tr,ts, fold in gen_tr_ts():\n",
262 | "\n",
263 | " ## EVAL\n",
264 | " pp = train[train['fold'] != fold][['sku', 'date', 'sold_quantity']]\n",
265 | " pp['stock'] = pp['sku'].map(test)\n",
266 | " pp = pp.sort_values([\"sku\",\"date\"])\n",
267 | " pp['cumulative_y'] = pp.groupby(\"sku\")['sold_quantity'].cumsum()\n",
268 | "\n",
269 | " pp = pp.dropna(subset=['stock'])\n",
270 | " pp['stockout_y'] = pp['cumulative_y'] >= pp['stock']\n",
271 | "\n",
272 | " first_so_y = pp[pp['stockout_y']].groupby(\"sku\").first()\n",
273 | " days_to_so_y = (first_so_y[\"date\"] - pp[\"date\"].min()) / np.timedelta64(1, 'D')\n",
274 | " days_to_so_y = days_to_so_y.reindex(pp['sku'].unique()).fillna(na).astype(int).clip(1)\n",
275 | " dts.append(days_to_so_y)\n",
276 | "\n",
277 | " m = utils.pred_list_to_distro(dts[0], wei=False, total_days=max(na, 30), phi=2, power=power)\n",
278 | " f = utils.pred_list_to_distro(dts[1].reindex(dts[0].index).fillna(dts[0]), wei=False, total_days=max(na, 30), phi=2, power=power)\n",
279 | "\n",
280 | " m = pd.DataFrame(m,index=dts[0].index)\n",
281 | " f = pd.DataFrame(f,index=dts[0].index)\n",
282 | "\n",
283 | " p = (m + f)/2\n",
284 | "\n",
285 | " p = p.div(p.sum(axis=1), axis=0)\n",
286 | "\n",
287 | " p = p.loc[test.index]\n",
288 | " \n",
289 | " p = p.round(4)\n",
290 | "\n",
291 | " return p"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 11,
297 | "id": "outer-certificate",
298 | "metadata": {},
299 | "outputs": [],
300 | "source": [
301 | "r = {i:gen_p(30, power=i/10) for i in range(11,20,1)}\n"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": 52,
307 | "id": "1834ec6c-f4e4-4272-8b08-f6a3cbadb705",
308 | "metadata": {},
309 | "outputs": [
310 | {
311 | "data": {
312 | "text/plain": [
313 | ""
314 | ]
315 | },
316 | "execution_count": 52,
317 | "metadata": {},
318 | "output_type": "execute_result"
319 | },
320 | {
321 | "data": {
322 | "image/png": "\n",
323 | "text/plain": [
324 | ""
325 | ]
326 | },
327 | "metadata": {
328 | "needs_background": "light"
329 | },
330 | "output_type": "display_data"
331 | }
332 | ],
333 | "source": [
334 | "r[11].mean(axis=0).plot()"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 53,
340 | "id": "bd7c0392-db7e-4361-a7ed-64cf9a949504",
341 | "metadata": {},
342 | "outputs": [
343 | {
344 | "data": {
345 | "text/plain": [
346 | ""
347 | ]
348 | },
349 | "execution_count": 53,
350 | "metadata": {},
351 | "output_type": "execute_result"
352 | },
353 | {
354 | "data": {
355 | "image/png": "\n",
356 | "text/plain": [
357 | ""
358 | ]
359 | },
360 | "metadata": {
361 | "needs_background": "light"
362 | },
363 | "output_type": "display_data"
364 | }
365 | ],
366 | "source": [
367 | "r[12].mean(axis=0).plot()"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": 13,
373 | "id": "7b22e456-318b-4ab0-bf80-31807d2f268c",
374 | "metadata": {},
375 | "outputs": [
376 | {
377 | "data": {
378 | "text/plain": [
379 | ""
380 | ]
381 | },
382 | "execution_count": 13,
383 | "metadata": {},
384 | "output_type": "execute_result"
385 | },
386 | {
387 | "data": {
388 | "image/png": "\n",
389 | "text/plain": [
390 | ""
391 | ]
392 | },
393 | "metadata": {
394 | "needs_background": "light"
395 | },
396 | "output_type": "display_data"
397 | }
398 | ],
399 | "source": [
400 | "r[13].mean(axis=0).plot()"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": 59,
406 | "id": "2efa8e12-e62a-463a-82e4-bdca3babdcd3",
407 | "metadata": {},
408 | "outputs": [],
409 | "source": [
410 | "# phi=2, power=1.3, na=30\n",
411 | "r[13].round(4).to_csv(\"32b.csv.gz\", header=False, index=False, compression=\"gzip\")\n",
412 | "# LB 4.94"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": null,
418 | "id": "57e747a1-1d2b-4339-8be8-ffdd1c6fb0a3",
419 | "metadata": {},
420 | "outputs": [],
421 | "source": [
422 | "# BONUS\n",
423 | "# (xgb 19 + baseline 32) / 2 -> LB 4.44"
424 | ]
425 | }
426 | ],
427 | "metadata": {
428 | "kernelspec": {
429 | "display_name": "Python 3",
430 | "language": "python",
431 | "name": "python3"
432 | },
433 | "language_info": {
434 | "codemirror_mode": {
435 | "name": "ipython",
436 | "version": 3
437 | },
438 | "file_extension": ".py",
439 | "mimetype": "text/x-python",
440 | "name": "python",
441 | "nbconvert_exporter": "python",
442 | "pygments_lexer": "ipython3",
443 | "version": "3.8.5"
444 | }
445 | },
446 | "nbformat": 4,
447 | "nbformat_minor": 5
448 | }
449 |
--------------------------------------------------------------------------------
/meli2021/61_active_model_yt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "7fe10a1f-3c36-4466-89c5-5dc2622daf87",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd\n",
11 | "%matplotlib inline\n",
12 | "import numpy as np\n",
13 | "from matplotlib import pyplot as plt"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "id": "c171946b-edfc-4a51-90fe-cc483428228f",
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "# Y -> how many days until this product becomes active?"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "id": "84e02a3e-1668-40a5-83cd-3029def45fd8",
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "test = pd.read_csv(\"test_data.csv\").set_index(\"sku\").squeeze()\n",
34 | "train = pd.read_parquet(\"./train/0.parquet\")\n",
35 | "train['date'] = pd.to_datetime(train['date'])\n",
36 | "cats = ['item_domain_id', 'currency', 'listing_type', 'shipping_logistic_type', 'shipping_payment', 'site_id']\n",
37 | "for cat in cats:\n",
38 | " train[cat] = train[cat].astype(\"category\").cat.codes"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 3,
44 | "id": "8ca0f337-197d-4a22-9191-7bef0b33b705",
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "train.loc[train[\"minutes_active\"] == 0, \"active\"] = 0\n",
49 | "train[\"active\"] = train[\"active\"].fillna(1)"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 4,
55 | "id": "311a92d9-55b1-484d-a53b-ef755681d75c",
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "data": {
60 | "text/html": [
61 | "\n",
62 | "\n",
75 | "
\n",
76 | " \n",
77 | " \n",
78 | " | \n",
79 | " sku | \n",
80 | " shipping_logistic_type | \n",
81 | " shipping_payment | \n",
82 | " listing_type | \n",
83 | " currency | \n",
84 | " current_price | \n",
85 | " item_domain_id | \n",
86 | " site_id | \n",
87 | " days_to_active | \n",
88 | " days_since_inactive | \n",
89 | "
\n",
90 | " \n",
91 | " \n",
92 | " \n",
93 | " sku | \n",
94 | " 1.000000 | \n",
95 | " -0.000341 | \n",
96 | " -0.002559 | \n",
97 | " -0.000866 | \n",
98 | " -0.002091 | \n",
99 | " 0.002278 | \n",
100 | " -0.001439 | \n",
101 | " 0.000258 | \n",
102 | " -0.001643 | \n",
103 | " -0.001465 | \n",
104 | "
\n",
105 | " \n",
106 | " shipping_logistic_type | \n",
107 | " -0.000341 | \n",
108 | " 1.000000 | \n",
109 | " 0.074183 | \n",
110 | " 0.040746 | \n",
111 | " 0.050840 | \n",
112 | " -0.089014 | \n",
113 | " -0.031270 | \n",
114 | " -0.019540 | \n",
115 | " -0.009456 | \n",
116 | " 0.075642 | \n",
117 | "
\n",
118 | " \n",
119 | " shipping_payment | \n",
120 | " -0.002559 | \n",
121 | " 0.074183 | \n",
122 | " 1.000000 | \n",
123 | " -0.074338 | \n",
124 | " 0.107770 | \n",
125 | " -0.620856 | \n",
126 | " -0.118603 | \n",
127 | " -0.111873 | \n",
128 | " -0.012127 | \n",
129 | " -0.009789 | \n",
130 | "
\n",
131 | " \n",
132 | " listing_type | \n",
133 | " -0.000866 | \n",
134 | " 0.040746 | \n",
135 | " -0.074338 | \n",
136 | " 1.000000 | \n",
137 | " 0.152052 | \n",
138 | " -0.035744 | \n",
139 | " 0.072715 | \n",
140 | " 0.096491 | \n",
141 | " 0.029645 | \n",
142 | " -0.007888 | \n",
143 | "
\n",
144 | " \n",
145 | " currency | \n",
146 | " -0.002091 | \n",
147 | " 0.050840 | \n",
148 | " 0.107770 | \n",
149 | " 0.152052 | \n",
150 | " 1.000000 | \n",
151 | " -0.657856 | \n",
152 | " -0.422581 | \n",
153 | " -0.472765 | \n",
154 | " -0.014007 | \n",
155 | " -0.011959 | \n",
156 | "
\n",
157 | " \n",
158 | " current_price | \n",
159 | " 0.002278 | \n",
160 | " -0.089014 | \n",
161 | " -0.620856 | \n",
162 | " -0.035744 | \n",
163 | " -0.657856 | \n",
164 | " 1.000000 | \n",
165 | " 0.267825 | \n",
166 | " 0.274570 | \n",
167 | " 0.014325 | \n",
168 | " 0.006590 | \n",
169 | "
\n",
170 | " \n",
171 | " item_domain_id | \n",
172 | " -0.001439 | \n",
173 | " -0.031270 | \n",
174 | " -0.118603 | \n",
175 | " 0.072715 | \n",
176 | " -0.422581 | \n",
177 | " 0.267825 | \n",
178 | " 1.000000 | \n",
179 | " 0.893819 | \n",
180 | " 0.040363 | \n",
181 | " 0.022034 | \n",
182 | "
\n",
183 | " \n",
184 | " site_id | \n",
185 | " 0.000258 | \n",
186 | " -0.019540 | \n",
187 | " -0.111873 | \n",
188 | " 0.096491 | \n",
189 | " -0.472765 | \n",
190 | " 0.274570 | \n",
191 | " 0.893819 | \n",
192 | " 1.000000 | \n",
193 | " 0.038815 | \n",
194 | " 0.023721 | \n",
195 | "
\n",
196 | " \n",
197 | " days_to_active | \n",
198 | " -0.001643 | \n",
199 | " -0.009456 | \n",
200 | " -0.012127 | \n",
201 | " 0.029645 | \n",
202 | " -0.014007 | \n",
203 | " 0.014325 | \n",
204 | " 0.040363 | \n",
205 | " 0.038815 | \n",
206 | " 1.000000 | \n",
207 | " -0.118599 | \n",
208 | "
\n",
209 | " \n",
210 | " days_since_inactive | \n",
211 | " -0.001465 | \n",
212 | " 0.075642 | \n",
213 | " -0.009789 | \n",
214 | " -0.007888 | \n",
215 | " -0.011959 | \n",
216 | " 0.006590 | \n",
217 | " 0.022034 | \n",
218 | " 0.023721 | \n",
219 | " -0.118599 | \n",
220 | " 1.000000 | \n",
221 | "
\n",
222 | " \n",
223 | "
\n",
224 | "
"
225 | ],
226 | "text/plain": [
227 | " sku shipping_logistic_type shipping_payment \\\n",
228 | "sku 1.000000 -0.000341 -0.002559 \n",
229 | "shipping_logistic_type -0.000341 1.000000 0.074183 \n",
230 | "shipping_payment -0.002559 0.074183 1.000000 \n",
231 | "listing_type -0.000866 0.040746 -0.074338 \n",
232 | "currency -0.002091 0.050840 0.107770 \n",
233 | "current_price 0.002278 -0.089014 -0.620856 \n",
234 | "item_domain_id -0.001439 -0.031270 -0.118603 \n",
235 | "site_id 0.000258 -0.019540 -0.111873 \n",
236 | "days_to_active -0.001643 -0.009456 -0.012127 \n",
237 | "days_since_inactive -0.001465 0.075642 -0.009789 \n",
238 | "\n",
239 | " listing_type currency current_price item_domain_id \\\n",
240 | "sku -0.000866 -0.002091 0.002278 -0.001439 \n",
241 | "shipping_logistic_type 0.040746 0.050840 -0.089014 -0.031270 \n",
242 | "shipping_payment -0.074338 0.107770 -0.620856 -0.118603 \n",
243 | "listing_type 1.000000 0.152052 -0.035744 0.072715 \n",
244 | "currency 0.152052 1.000000 -0.657856 -0.422581 \n",
245 | "current_price -0.035744 -0.657856 1.000000 0.267825 \n",
246 | "item_domain_id 0.072715 -0.422581 0.267825 1.000000 \n",
247 | "site_id 0.096491 -0.472765 0.274570 0.893819 \n",
248 | "days_to_active 0.029645 -0.014007 0.014325 0.040363 \n",
249 | "days_since_inactive -0.007888 -0.011959 0.006590 0.022034 \n",
250 | "\n",
251 | " site_id days_to_active days_since_inactive \n",
252 | "sku 0.000258 -0.001643 -0.001465 \n",
253 | "shipping_logistic_type -0.019540 -0.009456 0.075642 \n",
254 | "shipping_payment -0.111873 -0.012127 -0.009789 \n",
255 | "listing_type 0.096491 0.029645 -0.007888 \n",
256 | "currency -0.472765 -0.014007 -0.011959 \n",
257 | "current_price 0.274570 0.014325 0.006590 \n",
258 | "item_domain_id 0.893819 0.040363 0.022034 \n",
259 | "site_id 1.000000 0.038815 0.023721 \n",
260 | "days_to_active 0.038815 1.000000 -0.118599 \n",
261 | "days_since_inactive 0.023721 -0.118599 1.000000 "
262 | ]
263 | },
264 | "execution_count": 4,
265 | "metadata": {},
266 | "output_type": "execute_result"
267 | }
268 | ],
269 | "source": [
270 | "act = train[train['active'] == 1][['sku', 'date']].sort_values(\"date\")\n",
271 | "act['active_date'] = act['date']\n",
272 | "inact = train[train['active'] == 0][['sku', 'date', 'shipping_logistic_type', 'shipping_payment', \n",
273 | " 'listing_type', 'currency', 'current_price', 'item_domain_id', 'site_id']].sort_values(\"date\")\n",
274 | "all_ = pd.merge_asof(inact, act, on=['date'], direction='forward', by=['sku']).dropna(subset=['active_date'])\n",
275 | "all_['days_to_active'] = (all_['active_date'] - all_['date']) / np.timedelta64(1,'D')\n",
276 | "all_['days_since_inactive'] = (all_['date'] - all_.groupby(\"sku\")[\"date\"].transform(\"min\")) / np.timedelta64(1,'D')\n",
277 | "y = all_['days_to_active'].copy()\n",
278 | "all_.corr(method='spearman')"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 5,
284 | "id": "bf5ab194-5311-4008-807e-8024a0e7b599",
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "import xgboost as xgb\n",
289 | "from sklearn.metrics import mean_squared_error"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 6,
295 | "id": "141e9fc5-9b36-4933-bf31-317f5fa88f47",
296 | "metadata": {},
297 | "outputs": [
298 | {
299 | "data": {
300 | "text/plain": [
301 | "7.689729948430816"
302 | ]
303 | },
304 | "execution_count": 6,
305 | "metadata": {},
306 | "output_type": "execute_result"
307 | }
308 | ],
309 | "source": [
310 | "Xtr = all_.loc[all_['date'] < \"2021-03-01\", ['days_since_inactive', 'current_price'] + cats]\n",
311 | "Xval = all_.loc[all_['date'] >= \"2021-03-01\", ['days_since_inactive', 'current_price'] + cats]\n",
312 | "\n",
313 | "ytr = y[all_['date'] < \"2021-03-01\"]\n",
314 | "yval = y[all_['date'] >= \"2021-03-01\"]\n",
315 | "\n",
316 | "#mdl = DecisionTreeRegressor(max_depth=3)\n",
317 | "#mdl = RandomForestRegressor(n_estimators=100, max_depth=1, random_state=0, n_jobs=6)\n",
318 | "mdl = xgb.XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=0, n_jobs=6, tree_method='hist')\n",
319 | "mdl.fit(Xtr, ytr)\n",
320 | "p = mdl.predict(Xval)\n",
321 | "np.sqrt(mean_squared_error(yval, p))"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 7,
327 | "id": "f7df5b24-3d9d-4412-864b-b0f32da1cbd9",
328 | "metadata": {},
329 | "outputs": [
330 | {
331 | "data": {
332 | "text/plain": [
333 | ""
334 | ]
335 | },
336 | "execution_count": 7,
337 | "metadata": {},
338 | "output_type": "execute_result"
339 | },
340 | {
341 | "data": {
342 | "image/png": "\n",
343 | "text/plain": [
344 | ""
345 | ]
346 | },
347 | "metadata": {
348 | "needs_background": "light"
349 | },
350 | "output_type": "display_data"
351 | }
352 | ],
353 | "source": [
354 | "Xval['p'] = p\n",
355 | "Xval['error'] = Xval['p'] - yval\n",
356 | "\n",
357 | "Xval.groupby(\"days_since_inactive\")['error'].mean().plot.bar(figsize=(15,10))"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": 8,
363 | "id": "638ead4d-b6bb-4c3f-a11e-ac1ee378a80c",
364 | "metadata": {},
365 | "outputs": [],
366 | "source": [
367 | "all_t = train[train['active'] == 0].copy()\n",
368 | "\n",
369 | "all_t['days_since_inactive'] = (all_t['date'] - all_t.groupby(\"sku\")[\"date\"].transform(\"min\")) / np.timedelta64(1,'D')\n",
370 | "all_t = all_t.groupby(\"sku\").last()\n",
371 | "all_t = all_t[all_t['date'] == \"2021-03-31\"].copy()"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": 9,
377 | "id": "f17ca4b5-3352-4747-88ce-b6de1fc0b480",
378 | "metadata": {},
379 | "outputs": [
380 | {
381 | "data": {
382 | "text/html": [
383 | "\n",
384 | "\n",
397 | "
\n",
398 | " \n",
399 | " \n",
400 | " | \n",
401 | " date | \n",
402 | " sold_quantity | \n",
403 | " current_price | \n",
404 | " currency | \n",
405 | " listing_type | \n",
406 | " shipping_logistic_type | \n",
407 | " shipping_payment | \n",
408 | " minutes_active | \n",
409 | " item_domain_id | \n",
410 | " site_id | \n",
411 | " active | \n",
412 | " days_since_inactive | \n",
413 | "
\n",
414 | " \n",
415 | " sku | \n",
416 | " | \n",
417 | " | \n",
418 | " | \n",
419 | " | \n",
420 | " | \n",
421 | " | \n",
422 | " | \n",
423 | " | \n",
424 | " | \n",
425 | " | \n",
426 | " | \n",
427 | " | \n",
428 | "
\n",
429 | " \n",
430 | " \n",
431 | " \n",
432 | " 4 | \n",
433 | " 2021-03-31 | \n",
434 | " 0 | \n",
435 | " 118.00 | \n",
436 | " 3 | \n",
437 | " 1 | \n",
438 | " 2 | \n",
439 | " 0 | \n",
440 | " 0.0 | \n",
441 | " 5263 | \n",
442 | " 1 | \n",
443 | " 0.0 | \n",
444 | " 54.0 | \n",
445 | "
\n",
446 | " \n",
447 | " 8 | \n",
448 | " 2021-03-31 | \n",
449 | " 0 | \n",
450 | " 58.49 | \n",
451 | " 2 | \n",
452 | " 1 | \n",
453 | " 2 | \n",
454 | " 1 | \n",
455 | " 0.0 | \n",
456 | " 6116 | \n",
457 | " 2 | \n",
458 | " 0.0 | \n",
459 | " 41.0 | \n",
460 | "
\n",
461 | " \n",
462 | " 9 | \n",
463 | " 2021-03-31 | \n",
464 | " 0 | \n",
465 | " 199.00 | \n",
466 | " 2 | \n",
467 | " 1 | \n",
468 | " 2 | \n",
469 | " 0 | \n",
470 | " 0.0 | \n",
471 | " 7991 | \n",
472 | " 2 | \n",
473 | " 0.0 | \n",
474 | " 6.0 | \n",
475 | "
\n",
476 | " \n",
477 | " 11 | \n",
478 | " 2021-03-31 | \n",
479 | " 0 | \n",
480 | " 109.90 | \n",
481 | " 3 | \n",
482 | " 0 | \n",
483 | " 2 | \n",
484 | " 0 | \n",
485 | " 0.0 | \n",
486 | " 3645 | \n",
487 | " 1 | \n",
488 | " 0.0 | \n",
489 | " 19.0 | \n",
490 | "
\n",
491 | " \n",
492 | " 13 | \n",
493 | " 2021-03-31 | \n",
494 | " 0 | \n",
495 | " 474.05 | \n",
496 | " 2 | \n",
497 | " 1 | \n",
498 | " 2 | \n",
499 | " 0 | \n",
500 | " 0.0 | \n",
501 | " 5658 | \n",
502 | " 2 | \n",
503 | " 0.0 | \n",
504 | " 0.0 | \n",
505 | "
\n",
506 | " \n",
507 | " ... | \n",
508 | " ... | \n",
509 | " ... | \n",
510 | " ... | \n",
511 | " ... | \n",
512 | " ... | \n",
513 | " ... | \n",
514 | " ... | \n",
515 | " ... | \n",
516 | " ... | \n",
517 | " ... | \n",
518 | " ... | \n",
519 | " ... | \n",
520 | "
\n",
521 | " \n",
522 | " 660897 | \n",
523 | " 2021-03-31 | \n",
524 | " 0 | \n",
525 | " 79.00 | \n",
526 | " 2 | \n",
527 | " 0 | \n",
528 | " 2 | \n",
529 | " 1 | \n",
530 | " 0.0 | \n",
531 | " 6201 | \n",
532 | " 2 | \n",
533 | " 0.0 | \n",
534 | " 58.0 | \n",
535 | "
\n",
536 | " \n",
537 | " 660904 | \n",
538 | " 2021-03-31 | \n",
539 | " 0 | \n",
540 | " 98.90 | \n",
541 | " 3 | \n",
542 | " 1 | \n",
543 | " 2 | \n",
544 | " 1 | \n",
545 | " 0.0 | \n",
546 | " 3678 | \n",
547 | " 1 | \n",
548 | " 0.0 | \n",
549 | " 24.0 | \n",
550 | "
\n",
551 | " \n",
552 | " 660907 | \n",
553 | " 2021-03-31 | \n",
554 | " 0 | \n",
555 | " 24.69 | \n",
556 | " 3 | \n",
557 | " 1 | \n",
558 | " 2 | \n",
559 | " 1 | \n",
560 | " 0.0 | \n",
561 | " 4703 | \n",
562 | " 1 | \n",
563 | " 0.0 | \n",
564 | " 56.0 | \n",
565 | "
\n",
566 | " \n",
567 | " 660910 | \n",
568 | " 2021-03-31 | \n",
569 | " 0 | \n",
570 | " 480.58 | \n",
571 | " 2 | \n",
572 | " 1 | \n",
573 | " 2 | \n",
574 | " 0 | \n",
575 | " 0.0 | \n",
576 | " 7878 | \n",
577 | " 2 | \n",
578 | " 0.0 | \n",
579 | " 42.0 | \n",
580 | "
\n",
581 | " \n",
582 | " 660915 | \n",
583 | " 2021-03-31 | \n",
584 | " 0 | \n",
585 | " 99.99 | \n",
586 | " 2 | \n",
587 | " 0 | \n",
588 | " 2 | \n",
589 | " 1 | \n",
590 | " 0.0 | \n",
591 | " 7994 | \n",
592 | " 2 | \n",
593 | " 0.0 | \n",
594 | " 58.0 | \n",
595 | "
\n",
596 | " \n",
597 | "
\n",
598 | "
181606 rows × 12 columns
\n",
599 | "
"
600 | ],
601 | "text/plain": [
602 | " date sold_quantity current_price currency listing_type \\\n",
603 | "sku \n",
604 | "4 2021-03-31 0 118.00 3 1 \n",
605 | "8 2021-03-31 0 58.49 2 1 \n",
606 | "9 2021-03-31 0 199.00 2 1 \n",
607 | "11 2021-03-31 0 109.90 3 0 \n",
608 | "13 2021-03-31 0 474.05 2 1 \n",
609 | "... ... ... ... ... ... \n",
610 | "660897 2021-03-31 0 79.00 2 0 \n",
611 | "660904 2021-03-31 0 98.90 3 1 \n",
612 | "660907 2021-03-31 0 24.69 3 1 \n",
613 | "660910 2021-03-31 0 480.58 2 1 \n",
614 | "660915 2021-03-31 0 99.99 2 0 \n",
615 | "\n",
616 | " shipping_logistic_type shipping_payment minutes_active \\\n",
617 | "sku \n",
618 | "4 2 0 0.0 \n",
619 | "8 2 1 0.0 \n",
620 | "9 2 0 0.0 \n",
621 | "11 2 0 0.0 \n",
622 | "13 2 0 0.0 \n",
623 | "... ... ... ... \n",
624 | "660897 2 1 0.0 \n",
625 | "660904 2 1 0.0 \n",
626 | "660907 2 1 0.0 \n",
627 | "660910 2 0 0.0 \n",
628 | "660915 2 1 0.0 \n",
629 | "\n",
630 | " item_domain_id site_id active days_since_inactive \n",
631 | "sku \n",
632 | "4 5263 1 0.0 54.0 \n",
633 | "8 6116 2 0.0 41.0 \n",
634 | "9 7991 2 0.0 6.0 \n",
635 | "11 3645 1 0.0 19.0 \n",
636 | "13 5658 2 0.0 0.0 \n",
637 | "... ... ... ... ... \n",
638 | "660897 6201 2 0.0 58.0 \n",
639 | "660904 3678 1 0.0 24.0 \n",
640 | "660907 4703 1 0.0 56.0 \n",
641 | "660910 7878 2 0.0 42.0 \n",
642 | "660915 7994 2 0.0 58.0 \n",
643 | "\n",
644 | "[181606 rows x 12 columns]"
645 | ]
646 | },
647 | "execution_count": 9,
648 | "metadata": {},
649 | "output_type": "execute_result"
650 | }
651 | ],
652 | "source": [
653 | "all_t"
654 | ]
655 | },
656 | {
657 | "cell_type": "code",
658 | "execution_count": 10,
659 | "id": "a8513a7c-3e3d-46ec-8725-b9076633ee7a",
660 | "metadata": {},
661 | "outputs": [],
662 | "source": [
663 | "X = all_.loc[:, ['days_since_inactive', 'current_price'] + cats]\n",
664 | "Xt = all_t.loc[:, ['days_since_inactive', 'current_price'] + cats]\n",
665 | "\n",
666 | "mdl = xgb.XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=0, n_jobs=6, tree_method='hist')\n",
667 | "mdl.fit(X, y)\n",
668 | "p = mdl.predict(Xt)"
669 | ]
670 | },
671 | {
672 | "cell_type": "code",
673 | "execution_count": 11,
674 | "id": "96be5acc-dc34-4d44-9322-fc452364666c",
675 | "metadata": {},
676 | "outputs": [],
677 | "source": [
678 | "p2 = pd.Series(p.round(), index=Xt.index).reindex(test.index).dropna().astype(int)"
679 | ]
680 | },
681 | {
682 | "cell_type": "code",
683 | "execution_count": 12,
684 | "id": "42bf0345-5db8-4da9-8657-b570024386d7",
685 | "metadata": {},
686 | "outputs": [
687 | {
688 | "data": {
689 | "text/html": [
690 | "\n",
691 | "\n",
704 | "
\n",
705 | " \n",
706 | " \n",
707 | " | \n",
708 | " days_since_inactive | \n",
709 | " current_price | \n",
710 | " item_domain_id | \n",
711 | " currency | \n",
712 | " listing_type | \n",
713 | " shipping_logistic_type | \n",
714 | " shipping_payment | \n",
715 | " site_id | \n",
716 | "
\n",
717 | " \n",
718 | " sku | \n",
719 | " | \n",
720 | " | \n",
721 | " | \n",
722 | " | \n",
723 | " | \n",
724 | " | \n",
725 | " | \n",
726 | " | \n",
727 | "
\n",
728 | " \n",
729 | " \n",
730 | " \n",
731 | " 4 | \n",
732 | " 54.0 | \n",
733 | " 118.00 | \n",
734 | " 5263 | \n",
735 | " 3 | \n",
736 | " 1 | \n",
737 | " 2 | \n",
738 | " 0 | \n",
739 | " 1 | \n",
740 | "
\n",
741 | " \n",
742 | " 8 | \n",
743 | " 41.0 | \n",
744 | " 58.49 | \n",
745 | " 6116 | \n",
746 | " 2 | \n",
747 | " 1 | \n",
748 | " 2 | \n",
749 | " 1 | \n",
750 | " 2 | \n",
751 | "
\n",
752 | " \n",
753 | " 9 | \n",
754 | " 6.0 | \n",
755 | " 199.00 | \n",
756 | " 7991 | \n",
757 | " 2 | \n",
758 | " 1 | \n",
759 | " 2 | \n",
760 | " 0 | \n",
761 | " 2 | \n",
762 | "
\n",
763 | " \n",
764 | " 11 | \n",
765 | " 19.0 | \n",
766 | " 109.90 | \n",
767 | " 3645 | \n",
768 | " 3 | \n",
769 | " 0 | \n",
770 | " 2 | \n",
771 | " 0 | \n",
772 | " 1 | \n",
773 | "
\n",
774 | " \n",
775 | " 13 | \n",
776 | " 0.0 | \n",
777 | " 474.05 | \n",
778 | " 5658 | \n",
779 | " 2 | \n",
780 | " 1 | \n",
781 | " 2 | \n",
782 | " 0 | \n",
783 | " 2 | \n",
784 | "
\n",
785 | " \n",
786 | "
\n",
787 | "
"
788 | ],
789 | "text/plain": [
790 | " days_since_inactive current_price item_domain_id currency \\\n",
791 | "sku \n",
792 | "4 54.0 118.00 5263 3 \n",
793 | "8 41.0 58.49 6116 2 \n",
794 | "9 6.0 199.00 7991 2 \n",
795 | "11 19.0 109.90 3645 3 \n",
796 | "13 0.0 474.05 5658 2 \n",
797 | "\n",
798 | " listing_type shipping_logistic_type shipping_payment site_id \n",
799 | "sku \n",
800 | "4 1 2 0 1 \n",
801 | "8 1 2 1 2 \n",
802 | "9 1 2 0 2 \n",
803 | "11 0 2 0 1 \n",
804 | "13 1 2 0 2 "
805 | ]
806 | },
807 | "execution_count": 12,
808 | "metadata": {},
809 | "output_type": "execute_result"
810 | }
811 | ],
812 | "source": [
813 | "Xt.head()"
814 | ]
815 | },
816 | {
817 | "cell_type": "code",
818 | "execution_count": 14,
819 | "id": "b6880938-d9bf-456c-b8dd-c5f60dd293cd",
820 | "metadata": {},
821 | "outputs": [
822 | {
823 | "data": {
824 | "text/plain": [
825 | "sku\n",
826 | "431262 2\n",
827 | "94157 9\n",
828 | "394886 9\n",
829 | "434156 9\n",
830 | "197550 12\n",
831 | " ..\n",
832 | "575227 10\n",
833 | "470249 10\n",
834 | "24226 10\n",
835 | "297331 10\n",
836 | "511077 10\n",
837 | "Length: 97692, dtype: int64"
838 | ]
839 | },
840 | "execution_count": 14,
841 | "metadata": {},
842 | "output_type": "execute_result"
843 | }
844 | ],
845 | "source": [
846 | "p2"
847 | ]
848 | },
849 | {
850 | "cell_type": "code",
851 | "execution_count": 13,
852 | "id": "14642acd-37f0-420c-9951-b90ce6b6f195",
853 | "metadata": {},
854 | "outputs": [
855 | {
856 | "name": "stdout",
857 | "output_type": "stream",
858 | "text": [
859 | "(551472, 30)\n",
860 | "CPU times: user 2min 10s, sys: 836 ms, total: 2min 10s\n",
861 | "Wall time: 2min 10s\n"
862 | ]
863 | }
864 | ],
865 | "source": [
866 | "%%time\n",
867 | "sub = pd.read_csv(\"45d.csv.gz\", header=None) # 4.31\n",
868 | "sub_ = sub.copy()\n",
869 | "sub_.index = test.index\n",
870 | "\n",
871 | "for sku in p2.index:\n",
872 | " s = sub_.loc[sku].copy()\n",
873 | " days = p2.loc[sku]\n",
874 | " #print(s)\n",
875 | " s.iloc[:days] = s.iloc[:days]*0.5\n",
876 | " s = s / s.sum()\n",
877 | " sub_.loc[sku, :] = s\n",
878 | "print(sub_.shape)\n",
879 | "sub_.round(4).to_csv(\"61byt.csv.gz\", header=False, index=False, compression=\"gzip\")\n",
880 | "# LB 4.2772"
881 | ]
882 | },
883 | {
884 | "cell_type": "code",
885 | "execution_count": null,
886 | "id": "c8e7e49f-4e84-4a50-a324-76ca6abb9641",
887 | "metadata": {},
888 | "outputs": [],
889 | "source": []
890 | }
891 | ],
892 | "metadata": {
893 | "kernelspec": {
894 | "display_name": "Python 3",
895 | "language": "python",
896 | "name": "python3"
897 | },
898 | "language_info": {
899 | "codemirror_mode": {
900 | "name": "ipython",
901 | "version": 3
902 | },
903 | "file_extension": ".py",
904 | "mimetype": "text/x-python",
905 | "name": "python",
906 | "nbconvert_exporter": "python",
907 | "pygments_lexer": "ipython3",
908 | "version": "3.8.5"
909 | }
910 | },
911 | "nbformat": 4,
912 | "nbformat_minor": 5
913 | }
914 |
--------------------------------------------------------------------------------
/meli2021/README:
--------------------------------------------------------------------------------
1 | Solutions to #MeliDataChallenge 2021
2 |
3 | August 2nd, 2021 -> 19_xgb_barely_tuned gives you a leaderboard (LB) score of 4.64 (5th place)
4 | August 3rd, 2021 -> 32_baseline gives you LB 4.94 (7th place)
5 |
6 | If you average the predictions from 19 and 32 you get LB 4.44 (3rd place)
7 |
8 | August 9th, 2021 -> 61_active applied to the average above should give ~ LB 4.30 (4th place)
9 |
10 | Video Tutorials: https://www.youtube.com/playlist?list=PLV_itENB3unp-g1tgybj5-gs_4FGL8aA8
11 |
12 |
--------------------------------------------------------------------------------
/meli2021/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import tweedie
4 | import scipy.stats as st
5 |
def pred_list_to_prob_array(pred_list, cumulative=False, total_days=30):
    """Expand a vector of day values into a (len(pred_list), total_days) array.

    Each value is clipped to [1, total_days]; day ``d`` maps to column
    ``int(d - 1)``.

    * ``cumulative=False``: every row is a step function, 0 before that
      column and 1 from it onwards.
    * ``cumulative=True``: every row starts as a one-hot at that column,
      gets 1e-4 added to every cell, is normalised to sum to 1 and is then
      cumulatively summed along the day axis.

    ``pred_list`` must expose ``.shape`` and be iterable.
    """
    n_rows = pred_list.shape[0]
    grid = np.zeros((n_rows, total_days))
    clipped_days = np.clip(pred_list, 1, total_days)

    if not cumulative:
        for idx, day in enumerate(clipped_days):
            grid[idx, int(day - 1):] = 1.
        return grid

    for idx, day in enumerate(clipped_days):
        grid[idx, int(day - 1)] = 1.

    # Small constant so no day ends up with exactly zero mass.
    grid = grid + 1e-4
    row_totals = grid.sum(axis=1).reshape(-1, 1)
    grid = np.divide(grid, row_totals)
    return grid.cumsum(axis=1)
21 |
def pred_list_to_prob_array_mc(pred_list, total_days=30):
    """Build a (len(pred_list), total_days) step array, one column later.

    Each value is clipped to [1, total_days]; the row is 0 up to and
    including column ``int(d) - 1`` and 1 from column ``int(d)`` onwards.
    A value equal to ``total_days`` therefore yields an all-zero row.

    NOTE(review): this is one column later than pred_list_to_prob_array;
    presumably intentional for the "mc" variant -- confirm with callers.
    """
    n_rows = pred_list.shape[0]
    steps = np.zeros((n_rows, total_days))
    clipped_days = np.clip(pred_list, 1, total_days)
    for idx, day in enumerate(clipped_days):
        start_col = int(day)
        steps[idx, start_col:] = 1.

    return steps
29 |
def rps(y, p, probs=False, total_days=30):
    """Return the mean over rows of the summed squared gap between two curves.

    ``y`` is converted to a step array with pred_list_to_prob_array.
    ``p`` is either

    * per-day probabilities (``probs=True``), cumulatively summed along
      axis 1, or
    * a vector of day values (``probs=False``), converted with
      ``pred_list_to_prob_array(..., cumulative=True)``.

    The squared difference is summed per row and averaged across rows.
    """
    target_curve = pred_list_to_prob_array(y, total_days=total_days)
    predicted_curve = (
        p.cumsum(axis=1)
        if probs
        else pred_list_to_prob_array(p, cumulative=True, total_days=total_days)
    )
    squared_gap = (predicted_curve - target_curve)**2
    per_row_score = squared_gap.sum(axis=1)
    return per_row_score.mean()
37 |
38 |
def rps_mc(y, p, probs=False, total_days=30):
    """Score per-day probabilities ``p`` against the "mc" step encoding of ``y``.

    ``y`` is converted with pred_list_to_prob_array_mc, ``p`` is cumulatively
    summed along axis 1, and the squared difference is summed per row and
    averaged across rows.

    Only ``probs=True`` is supported. There is no point-prediction conversion
    for this variant, so ``probs=False`` (the default, kept for signature
    compatibility) raises ``ValueError`` up front instead of failing later
    with an ``UnboundLocalError`` on ``p_array``.

    Raises:
        ValueError: if ``probs`` is falsy.
    """
    if not probs:
        raise ValueError(
            "rps_mc only supports probs=True: p must be an array of per-day probabilities"
        )
    y_array = pred_list_to_prob_array_mc(y, total_days=total_days)
    p_array = p.cumsum(axis=1)
    return ((p_array - y_array)**2).sum(axis=1).mean()
44 |
def rps_raw(y, p, probs=False, total_days=30):
    """Return the per-row summed squared gap (same as ``rps`` without the mean).

    ``y`` is converted to a step array with pred_list_to_prob_array.
    ``p`` is either per-day probabilities (``probs=True``, cumulatively
    summed along axis 1) or a vector of day values (``probs=False``,
    converted with ``cumulative=True``).

    ``total_days`` sets the horizon passed to both conversions. It defaults
    to 30, the value previously hard-coded through the helper's default, so
    existing calls behave as before.

    Returns one score per row.
    """
    y_array = pred_list_to_prob_array(y, total_days=total_days)
    if probs:
        p_array = p.cumsum(axis=1)
    else:
        p_array = pred_list_to_prob_array(p, cumulative=True, total_days=total_days)
    return ((p_array - y_array)**2).sum(axis=1)
52 |
53 |
def pred_list_to_tweedie(pred_list, phi=1, p=1.5):
    """Map each value in ``pred_list`` to a 30-bin Tweedie-based distribution.

    For every integer mean 1..30 a row is precomputed from the Tweedie CDF
    (power ``p``, dispersion ``phi``) evaluated at days 1..30: the first bin
    keeps cdf(1), later bins hold successive CDF differences. The row is then
    normalised to sum to 1 and rounded to 4 decimals.

    Returns an array of shape (len(pred_list), 30); every value in
    ``pred_list`` must be one of 1..30, since it is used as a lookup key.
    """
    # Original author's note, behaviour kept on purpose: the first day has a
    # bug (it's the wrong probability), but results are worse without it.
    day_grid = range(1, 31, 1)
    lookup = {}
    for mean_days in range(1, 31):
        frozen = tweedie.tweedie(p=p, mu=mean_days, phi=phi)
        cdf_values = [frozen.cdf(day) for day in day_grid]
        # First bin is cdf(1) itself; the rest are day-to-day increments.
        mass = np.concatenate(([cdf_values[0]], np.diff(cdf_values)))
        lookup[mean_days] = np.round(mass / np.sum(mass), 4)

    prob_array = np.zeros((pred_list.shape[0], 30))
    for idx, mean_days in enumerate(pred_list):
        prob_array[idx, :] = lookup[mean_days]

    return prob_array
69 |
70 |
71 |
def pred_list_to_distro(pred_list, wei=False, total_days=30, phi=2, power=1.5):
    """Map each value in ``pred_list`` to a per-day probability row.

    For every integer mean 1..total_days a row is precomputed from a CDF
    evaluated at 0..total_days:

    * ``wei=True``: normal distribution with ``loc=mu`` and ``scale=1``
      (despite the flag's name, the code uses ``st.norm``);
    * ``wei=False``: Tweedie distribution with the given ``power`` and ``phi``.

    Successive CDF differences give the mass per day, which is normalised to
    sum to 1 and rounded to 4 decimals (so rows may be slightly off 1). If
    the CDF values do not sum to a positive number, the row falls back to the
    CDF values for days 1..total_days with the last day forced to 1.

    Returns an array of shape (len(pred_list), total_days); every value in
    ``pred_list`` must be one of 1..total_days, since it is a lookup key.
    """
    day_edges = range(0, total_days + 1, 1)
    lookup = {}
    for mean_days in range(1, total_days + 1):
        if wei:
            cdf_values = [st.norm.cdf(edge, loc=mean_days, scale=1) for edge in day_edges]
        else:
            frozen = tweedie.tweedie(p=power, mu=mean_days, phi=phi)
            cdf_values = [frozen.cdf(edge) for edge in day_edges]

        if np.sum(cdf_values) > 0:
            mass = np.diff(cdf_values)
            lookup[mean_days] = np.round(mass / np.sum(mass), 4)
            continue

        # Degenerate CDF: drop the day-0 edge and put the mass on the last day.
        fallback = cdf_values[1:]
        fallback[-1] = 1
        lookup[mean_days] = fallback

    prob_array = np.zeros((pred_list.shape[0], total_days))
    for idx, mean_days in enumerate(pred_list):
        prob_array[idx, :] = lookup[mean_days]

    return prob_array
97 |
def pred_list_to_distro_smooth(pred_list, total_days=30, phi=2, power=1.5, smooth_factor=0.3):
    """Map each value in ``pred_list`` to a Tweedie row blended with its neighbours.

    For every integer mean 1..total_days a row is precomputed from the Tweedie
    CDF (``power``, ``phi``) evaluated at 0..total_days: successive differences
    are normalised to sum to 1 and rounded to 4 decimals. If the CDF values do
    not sum to a positive number, the row falls back to the CDF values for
    days 1..total_days with the last day forced to 1.

    Each prediction ``mu`` then gets a blend of neighbouring rows:

    * ``mu == 1``: ``(1 - s) * row[1] + s * row[2]``
    * ``mu == total_days``: ``s * row[mu - 1] + (1 - s) * row[mu]``
    * otherwise: ``s/2 * row[mu - 1] + (1 - s) * row[mu] + s/2 * row[mu + 1]``

    where ``s`` is ``smooth_factor``. With ``total_days == 1`` there are no
    neighbours, so the single row is used unblended.

    Returns an array of shape (len(pred_list), total_days); every value in
    ``pred_list`` must be one of 1..total_days, since it is a lookup key.
    """
    distros = dict()
    for mu in range(1, total_days + 1):
        frozen = tweedie.tweedie(p=power, mu=mu, phi=phi)
        # Keep rows as numpy arrays throughout: the blend below multiplies
        # rows by floats, which raises TypeError on a plain Python list.
        cdf_values = np.asarray([frozen.cdf(days) for days in range(0, total_days + 1, 1)])
        if np.sum(cdf_values) > 0:
            mass = np.diff(cdf_values)
            distros[mu] = np.round(mass / np.sum(mass), 4)
        else:
            # Degenerate CDF: drop the day-0 edge and put the mass on the last day.
            fallback = cdf_values[1:].copy()
            fallback[-1] = 1
            distros[mu] = fallback

    prob_array = np.zeros((pred_list.shape[0], total_days))

    for row, mu in enumerate(pred_list):
        if mu == 1 and mu == total_days:
            # Single-day horizon: no neighbour exists to blend with.
            prob_array[row, :] = distros[mu]
        elif mu == 1:
            prob_array[row, :] = (1-smooth_factor)*distros[mu] + smooth_factor*distros[mu+1]
        elif mu == total_days:
            prob_array[row, :] = smooth_factor*distros[mu-1] + (1-smooth_factor)*distros[mu]
        else:
            prob_array[row, :] = (smooth_factor/2)*distros[mu-1] + (1-smooth_factor)*distros[mu] + (smooth_factor/2)*distros[mu+1]

    return prob_array
--------------------------------------------------------------------------------
/multiple_time_series/README:
--------------------------------------------------------------------------------
1 | Notebooks for https://youtu.be/RRd2wzMRpOc
2 |
--------------------------------------------------------------------------------