├── README.md ├── meli2021 ├── 19_xgb_barely_tuned_yt.ipynb ├── 32_baseline_yt.ipynb ├── 61_active_model_yt.ipynb ├── README └── utils.py └── multiple_time_series ├── README └── workshop_notebook.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # english_tutorials -------------------------------------------------------------------------------- /meli2021/19_xgb_barely_tuned_yt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "75638528-4609-4ee2-93c9-28538579e471", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import utils\n", 13 | "\n", 14 | "from sklearn.model_selection import GroupKFold, KFold\n", 15 | "from sklearn.linear_model import LinearRegression\n", 16 | "from sklearn.ensemble import RandomForestRegressor\n", 17 | "from sklearn.metrics import mean_squared_error\n", 18 | "from xgboost import XGBRegressor\n", 19 | "import tweedie\n", 20 | "\n", 21 | "\n", 22 | "from importlib import reload\n", 23 | "reload(utils)\n", 24 | "from skopt import gp_minimize" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "id": "3be069d6-c17e-43a2-8252-2acf5e9c26ad", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "train = pd.read_parquet(\"./train/0.parquet\")\n", 35 | "train['date'] = pd.to_datetime(train['date'])\n", 36 | "train['fold'] = train['date'].dt.month" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "id": "b9975ad7-ff3a-4b58-9d4c-b1435d0c5535", 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/html": [ 48 | "
\n", 49 | "\n", 62 | "\n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | "
skudatesold_quantitycurrent_pricecurrencylisting_typeshipping_logistic_typeshipping_paymentminutes_activeitem_domain_idsite_idfold
04648012021-02-010156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
14648012021-02-020156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
24648012021-02-030156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
34648012021-02-040156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
44648012021-02-051156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
\n", 158 | "
" 159 | ], 160 | "text/plain": [ 161 | " sku date sold_quantity current_price currency listing_type \\\n", 162 | "0 464801 2021-02-01 0 156.78 REA classic \n", 163 | "1 464801 2021-02-02 0 156.78 REA classic \n", 164 | "2 464801 2021-02-03 0 156.78 REA classic \n", 165 | "3 464801 2021-02-04 0 156.78 REA classic \n", 166 | "4 464801 2021-02-05 1 156.78 REA classic \n", 167 | "\n", 168 | " shipping_logistic_type shipping_payment minutes_active item_domain_id \\\n", 169 | "0 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 170 | "1 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 171 | "2 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 172 | "3 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 173 | "4 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 174 | "\n", 175 | " site_id fold \n", 176 | "0 MLB 2 \n", 177 | "1 MLB 2 \n", 178 | "2 MLB 2 \n", 179 | "3 MLB 2 \n", 180 | "4 MLB 2 " 181 | ] 182 | }, 183 | "execution_count": 3, 184 | "metadata": {}, 185 | "output_type": "execute_result" 186 | } 187 | ], 188 | "source": [ 189 | "train.head()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 3, 195 | "id": "smaller-boulder", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "test = pd.read_csv(\"test_data.csv\", index_col=0).squeeze()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 4, 205 | "id": "finnish-canadian", 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "data": { 210 | "text/plain": [ 211 | "sku int64\n", 212 | "date datetime64[ns]\n", 213 | "sold_quantity int64\n", 214 | "current_price float64\n", 215 | "currency object\n", 216 | "listing_type object\n", 217 | "shipping_logistic_type object\n", 218 | "shipping_payment object\n", 219 | "minutes_active float64\n", 220 | "item_domain_id object\n", 221 | "site_id object\n", 222 | "fold int64\n", 223 | "dtype: object" 224 | ] 225 | }, 226 | "execution_count": 4, 227 | "metadata": {}, 228 | "output_type": 
"execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "train.dtypes" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 5, 238 | "id": "fc10499b-4c46-4f7e-a88e-a4233fb05504", 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/html": [ 244 | "
\n", 245 | "\n", 258 | "\n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | "
skudatesold_quantitycurrent_pricecurrencylisting_typeshipping_logistic_typeshipping_paymentminutes_activeitem_domain_idsite_idfold
04648012021-02-010156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
14648012021-02-020156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
24648012021-02-030156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
34648012021-02-040156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
44648012021-02-051156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
\n", 354 | "
" 355 | ], 356 | "text/plain": [ 357 | " sku date sold_quantity current_price currency listing_type \\\n", 358 | "0 464801 2021-02-01 0 156.78 REA classic \n", 359 | "1 464801 2021-02-02 0 156.78 REA classic \n", 360 | "2 464801 2021-02-03 0 156.78 REA classic \n", 361 | "3 464801 2021-02-04 0 156.78 REA classic \n", 362 | "4 464801 2021-02-05 1 156.78 REA classic \n", 363 | "\n", 364 | " shipping_logistic_type shipping_payment minutes_active item_domain_id \\\n", 365 | "0 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 366 | "1 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 367 | "2 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 368 | "3 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 369 | "4 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 370 | "\n", 371 | " site_id fold \n", 372 | "0 MLB 2 \n", 373 | "1 MLB 2 \n", 374 | "2 MLB 2 \n", 375 | "3 MLB 2 \n", 376 | "4 MLB 2 " 377 | ] 378 | }, 379 | "execution_count": 5, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "train.head()" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 6, 391 | "id": "refined-string", 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "cats = ['item_domain_id', 'currency', 'listing_type', 'shipping_logistic_type', 'shipping_payment', 'site_id']" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 7, 401 | "id": "99b8015a-8c95-4936-9ba6-9d655aa19848", 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "from category_encoders import OrdinalEncoder\n", 406 | "enc = OrdinalEncoder(cats)\n", 407 | "train = enc.fit_transform(train)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 8, 413 | "id": "monthly-general", 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "def gen_tr_ts():\n", 418 | " for fold in [2,3]:\n", 419 | " ts = train[train['fold'] != fold]['date'].max()\n", 420 | " ts = 
train[(train['fold'] != fold) & (train['date'] == ts)].index\n", 421 | " yield train.index[train['fold'] == fold], ts, fold\n", 422 | "\n", 423 | " " 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "id": "bdb25520-ed36-4c4c-a9fe-23387cb3f918", 430 | "metadata": {}, 431 | "outputs": [ 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | "Iteration No: 1 started. Evaluating function at random point.\n", 437 | "[0.09871192514273254, 9, 0.16531200313642108, 0.9491364637917304, 1.2337280871824563, 120]\n", 438 | "8.619976237016095\n", 439 | "8.863340114023996\n", 440 | "Iteration No: 1 ended. Evaluation done at random point.\n", 441 | "Time taken: 164.3399\n", 442 | "Function value obtained: 8.7417\n", 443 | "Current minimum: 8.7417\n", 444 | "Iteration No: 2 started. Evaluating function at random point.\n", 445 | "[0.0059678992438367785, 7, 0.8919851637254288, 0.8116798250174155, 1.3101407817629525, 158]\n", 446 | "8.969453535454983\n", 447 | "9.180772268605745\n", 448 | "Iteration No: 2 ended. Evaluation done at random point.\n", 449 | "Time taken: 161.5522\n", 450 | "Function value obtained: 9.0751\n", 451 | "Current minimum: 8.7417\n", 452 | "Iteration No: 3 started. 
Evaluating function at random point.\n", 453 | "[0.007707362534461022, 3, 0.5309725180523154, 0.8725658221213098, 1.4526327599071185, 130]\n", 454 | "9.148086951536671\n" 455 | ] 456 | } 457 | ], 458 | "source": [ 459 | "def tune(params):\n", 460 | " print(params)\n", 461 | " features = [\"current_price\", \"minutes_active\"] + cats\n", 462 | "\n", 463 | " mean_rps = 0.\n", 464 | " for tr,ts, fold in gen_tr_ts():\n", 465 | " #print(tr.shape, ts.shape)\n", 466 | " X = train[features]\n", 467 | " y = train['sold_quantity']\n", 468 | "\n", 469 | " Xtr = X.iloc[tr]\n", 470 | " ytr = y.iloc[tr]\n", 471 | " Xval = X.iloc[ts]\n", 472 | " yval = y.iloc[ts]\n", 473 | "\n", 474 | " #mdl = LinearRegression(normalize=True)\n", 475 | " #mdl = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)\n", 476 | " mdl = XGBRegressor(n_estimators=1000, learning_rate=params[0],\n", 477 | " max_depth=params[1],\n", 478 | " subsample=params[2],\n", 479 | " colsample_bytree=params[3],\n", 480 | " tweedie_variance_power=params[4],\n", 481 | " min_child_weight=params[5],\n", 482 | " random_state=0, objective=\"reg:tweedie\", \n", 483 | " base_score=1e-3,\n", 484 | " tree_method='gpu_hist')\n", 485 | " mdl.fit(Xtr, ytr)\n", 486 | " p = mdl.predict(Xval)\n", 487 | "\n", 488 | "\n", 489 | " ## EVAL\n", 490 | " pp = train[train['fold'] != fold][['sku', 'date', 'sold_quantity']]\n", 491 | " pp['stock'] = pp['sku'].map(test)\n", 492 | " pp = pp.sort_values([\"sku\",\"date\"])\n", 493 | " pp['cumulative_y'] = pp.groupby(\"sku\")['sold_quantity'].cumsum()\n", 494 | "\n", 495 | " pp = pp.dropna(subset=['stock'])\n", 496 | " pp['stockout_y'] = pp['cumulative_y'] >= pp['stock']\n", 497 | "\n", 498 | " first_so_y = pp[pp['stockout_y']].groupby(\"sku\").first()\n", 499 | " days_to_so_y = (first_so_y[\"date\"] - pp[\"date\"].min()) / np.timedelta64(1, 'D')\n", 500 | " days_to_so_y = days_to_so_y.reindex(pp['sku'].unique()).fillna(30.).clip(1,30)\n", 501 | "\n", 502 | "\n", 503 | " ppp = 
train.iloc[ts][['sku']]\n", 504 | " #p[~np.isfinite(p)] = 17.\n", 505 | " ppp['p'] = p\n", 506 | " ppp['stock'] = ppp['sku'].map(test)\n", 507 | " ppp = ppp.dropna(subset=['stock'])\n", 508 | " ppp['days_to_so'] = (ppp['stock'] / ppp['p']).astype(int).fillna(30.).clip(1,30)\n", 509 | " days_to_so_p = ppp[['sku', 'days_to_so']].set_index(\"sku\").squeeze().reindex(days_to_so_y.index) \n", 510 | "\n", 511 | " days_to_so_p2 = utils.pred_list_to_tweedie(days_to_so_p, phi=2, p=1.5)\n", 512 | " \n", 513 | " #tweedie distribution -> [0.05, 0.07, ... .13, 0.12]\n", 514 | "\n", 515 | " rps = utils.rps(days_to_so_y, days_to_so_p2, probs=True)\n", 516 | " mean_rps += rps\n", 517 | " print(rps)\n", 518 | " return mean_rps / 2\n", 519 | "\n", 520 | "space = [(1e-3, 1e-1, 'log-uniform'),\n", 521 | " (1, 10),\n", 522 | " (0.05, 0.95),\n", 523 | " (0.05, 0.95),\n", 524 | " (1.0,1.99),\n", 525 | " (1,300)]\n", 526 | "res = gp_minimize(tune, space, random_state=1, verbose=1)\n", 527 | "\n", 528 | " " 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "id": "pretty-literature", 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [ 538 | "# 15a \n", 539 | "# Mean CV 9.0805\n", 540 | "# LB 6.2598\n", 541 | "\n", 542 | "\n", 543 | "Iteration No: 2 started. Evaluating function at random point.\n", 544 | "[0.003936128001463711, 2, 0.29539066512210194, 0.47989860558921493, 1.8040470414877383, 145]\n", 545 | "6.131413939395725\n", 546 | "6.4664243315180086\n", 547 | "Iteration No: 2 ended. 
Evaluation done at random point.\n", 548 | "Time taken: 91.9157\n", 549 | "Function value obtained: 6.2989\n", 550 | "Current minimum: 6.2989" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "id": "comfortable-hypothesis", 556 | "metadata": {}, 557 | "source": [ 558 | "# sub" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 10, 564 | "id": "dominant-machine", 565 | "metadata": {}, 566 | "outputs": [ 567 | { 568 | "name": "stdout", 569 | "output_type": "stream", 570 | "text": [ 571 | "True\n" 572 | ] 573 | } 574 | ], 575 | "source": [ 576 | "test_df = train[train['date'] == \"2021-03-31\"]\n", 577 | "test_df = test_df[test_df['sku'].isin(test.index)]\n", 578 | "print(np.all(test_df['sku'] == test.index))\n", 579 | "\n", 580 | "features = [\"current_price\", \"minutes_active\"] + cats\n", 581 | "params = [0.003936128001463711, 2, 0.29539066512210194, 0.47989860558921493, 1.8040470414877383, 145]\n", 582 | "mdl = XGBRegressor(n_estimators=1000, learning_rate=params[0],\n", 583 | " max_depth=params[1],\n", 584 | " subsample=params[2],\n", 585 | " colsample_bytree=params[3],\n", 586 | " tweedie_variance_power=params[4],\n", 587 | " min_child_weight=params[5],\n", 588 | " random_state=0, objective=\"reg:tweedie\", \n", 589 | " base_score=1e-3,\n", 590 | " tree_method='gpu_hist')\n", 591 | "mdl.fit(train[features], train['sold_quantity'])\n", 592 | "p = mdl.predict(test_df[features])" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 14, 598 | "id": "important-rugby", 599 | "metadata": {}, 600 | "outputs": [], 601 | "source": [ 602 | "spp = test_df[['sku']].copy()\n", 603 | "spp['p'] = p\n", 604 | "spp['stock'] = spp['sku'].map(test)\n", 605 | "spp['days_to_so'] = (spp['stock'] / spp['p']).fillna(30.).clip(1,30).astype(int)\n" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": 15, 611 | "id": "excess-porter", 612 | "metadata": {}, 613 | "outputs": [ 614 | { 615 | "data": { 616 | 
"text/plain": [ 617 | "1.0" 618 | ] 619 | }, 620 | "execution_count": 15, 621 | "metadata": {}, 622 | "output_type": "execute_result" 623 | } 624 | ], 625 | "source": [ 626 | "test.index.isin(spp['sku']).mean()" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 16, 632 | "id": "joint-reservation", 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "prob_array = utils.pred_list_to_tweedie(spp['days_to_so'].values, phi=2., p=1.5)\n", 637 | "pd.set_option(\"display.max_columns\", 31)\n", 638 | "pd.DataFrame(prob_array).round(4).to_csv(\"19.csv.gz\", header=False, index=False, compression=\"gzip\")" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": 18, 644 | "id": "fitting-hamilton", 645 | "metadata": {}, 646 | "outputs": [ 647 | { 648 | "data": { 649 | "text/html": [ 650 | "
\n", 651 | "\n", 664 | "\n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " 
\n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | 
" \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | "
01234567891011121314151617181920212223242526272829
00.02150.01450.01750.02040.02330.02590.02840.03070.03280.03460.03620.03750.03860.03950.04010.04050.04070.04070.04050.04020.03970.03900.03830.03740.03640.03540.03430.03310.03190.0307
10.01610.01110.01370.01620.01880.02130.02380.02610.02840.03050.03250.03430.03590.03730.03860.03970.04060.04130.04180.04220.04240.04240.04230.04210.04170.04120.04060.03990.03910.0383
20.01610.01110.01370.01620.01880.02130.02380.02610.02840.03050.03250.03430.03590.03730.03860.03970.04060.04130.04180.04220.04240.04240.04230.04210.04170.04120.04060.03990.03910.0383
30.01610.01110.01370.01620.01880.02130.02380.02610.02840.03050.03250.03430.03590.03730.03860.03970.04060.04130.04180.04220.04240.04240.04230.04210.04170.04120.04060.03990.03910.0383
40.01610.01110.01370.01620.01880.02130.02380.02610.02840.03050.03250.03430.03590.03730.03860.03970.04060.04130.04180.04220.04240.04240.04230.04210.04170.04120.04060.03990.03910.0383
.............................................................................................
5514670.01610.01110.01370.01620.01880.02130.02380.02610.02840.03050.03250.03430.03590.03730.03860.03970.04060.04130.04180.04220.04240.04240.04230.04210.04170.04120.04060.03990.03910.0383
5514680.01610.01110.01370.01620.01880.02130.02380.02610.02840.03050.03250.03430.03590.03730.03860.03970.04060.04130.04180.04220.04240.04240.04230.04210.04170.04120.04060.03990.03910.0383
5514690.01610.01110.01370.01620.01880.02130.02380.02610.02840.03050.03250.03430.03590.03730.03860.03970.04060.04130.04180.04220.04240.04240.04230.04210.04170.04120.04060.03990.03910.0383
5514700.03990.02560.02960.03310.03610.03860.04050.04190.04290.04340.04350.04320.04270.04180.04070.03940.03790.03640.03470.03300.03120.02940.02760.02590.02420.02250.02090.01930.01780.0164
5514710.01610.01110.01370.01620.01880.02130.02380.02610.02840.03050.03250.03430.03590.03730.03860.03970.04060.04130.04180.04220.04240.04240.04230.04210.04170.04120.04060.03990.03910.0383
\n", 1066 | "

551472 rows × 30 columns

\n", 1067 | "
" 1068 | ], 1069 | "text/plain": [ 1070 | " 0 1 2 3 4 5 6 7 \\\n", 1071 | "0 0.0215 0.0145 0.0175 0.0204 0.0233 0.0259 0.0284 0.0307 \n", 1072 | "1 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n", 1073 | "2 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n", 1074 | "3 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n", 1075 | "4 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n", 1076 | "... ... ... ... ... ... ... ... ... \n", 1077 | "551467 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n", 1078 | "551468 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n", 1079 | "551469 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n", 1080 | "551470 0.0399 0.0256 0.0296 0.0331 0.0361 0.0386 0.0405 0.0419 \n", 1081 | "551471 0.0161 0.0111 0.0137 0.0162 0.0188 0.0213 0.0238 0.0261 \n", 1082 | "\n", 1083 | " 8 9 10 11 12 13 14 15 \\\n", 1084 | "0 0.0328 0.0346 0.0362 0.0375 0.0386 0.0395 0.0401 0.0405 \n", 1085 | "1 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n", 1086 | "2 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n", 1087 | "3 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n", 1088 | "4 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n", 1089 | "... ... ... ... ... ... ... ... ... 
\n", 1090 | "551467 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n", 1091 | "551468 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n", 1092 | "551469 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n", 1093 | "551470 0.0429 0.0434 0.0435 0.0432 0.0427 0.0418 0.0407 0.0394 \n", 1094 | "551471 0.0284 0.0305 0.0325 0.0343 0.0359 0.0373 0.0386 0.0397 \n", 1095 | "\n", 1096 | " 16 17 18 19 20 21 22 23 \\\n", 1097 | "0 0.0407 0.0407 0.0405 0.0402 0.0397 0.0390 0.0383 0.0374 \n", 1098 | "1 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n", 1099 | "2 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n", 1100 | "3 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n", 1101 | "4 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n", 1102 | "... ... ... ... ... ... ... ... ... \n", 1103 | "551467 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n", 1104 | "551468 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n", 1105 | "551469 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n", 1106 | "551470 0.0379 0.0364 0.0347 0.0330 0.0312 0.0294 0.0276 0.0259 \n", 1107 | "551471 0.0406 0.0413 0.0418 0.0422 0.0424 0.0424 0.0423 0.0421 \n", 1108 | "\n", 1109 | " 24 25 26 27 28 29 \n", 1110 | "0 0.0364 0.0354 0.0343 0.0331 0.0319 0.0307 \n", 1111 | "1 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n", 1112 | "2 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n", 1113 | "3 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n", 1114 | "4 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n", 1115 | "... ... ... ... ... ... ... 
\n", 1116 | "551467 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n", 1117 | "551468 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n", 1118 | "551469 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n", 1119 | "551470 0.0242 0.0225 0.0209 0.0193 0.0178 0.0164 \n", 1120 | "551471 0.0417 0.0412 0.0406 0.0399 0.0391 0.0383 \n", 1121 | "\n", 1122 | "[551472 rows x 30 columns]" 1123 | ] 1124 | }, 1125 | "execution_count": 18, 1126 | "metadata": {}, 1127 | "output_type": "execute_result" 1128 | } 1129 | ], 1130 | "source": [ 1131 | "pd.read_csv(\"19.csv.gz\",header=None)#.sum(axis=1)" 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "code", 1136 | "execution_count": null, 1137 | "id": "eleven-intelligence", 1138 | "metadata": {}, 1139 | "outputs": [], 1140 | "source": [] 1141 | } 1142 | ], 1143 | "metadata": { 1144 | "kernelspec": { 1145 | "display_name": "Python 3", 1146 | "language": "python", 1147 | "name": "python3" 1148 | }, 1149 | "language_info": { 1150 | "codemirror_mode": { 1151 | "name": "ipython", 1152 | "version": 3 1153 | }, 1154 | "file_extension": ".py", 1155 | "mimetype": "text/x-python", 1156 | "name": "python", 1157 | "nbconvert_exporter": "python", 1158 | "pygments_lexer": "ipython3", 1159 | "version": "3.8.5" 1160 | } 1161 | }, 1162 | "nbformat": 4, 1163 | "nbformat_minor": 5 1164 | } 1165 | -------------------------------------------------------------------------------- /meli2021/32_baseline_yt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "id": "75638528-4609-4ee2-93c9-28538579e471", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import utils\n", 13 | "\n", 14 | "from sklearn.model_selection import GroupKFold, KFold\n", 15 | "from sklearn.linear_model import LinearRegression\n", 16 | "from sklearn.ensemble import RandomForestRegressor\n", 17 | "from sklearn.metrics 
import mean_squared_error\n", 18 | "from xgboost import XGBRegressor\n", 19 | "import tweedie\n", 20 | "\n", 21 | "from importlib import reload\n", 22 | "reload(utils)\n", 23 | "from skopt import gp_minimize\n", 24 | "%matplotlib inline" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 5, 30 | "id": "3be069d6-c17e-43a2-8252-2acf5e9c26ad", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "train = pd.read_parquet(\"./train/0.parquet\")\n", 35 | "train['date'] = pd.to_datetime(train['date'])\n", 36 | "train['fold'] = train['date'].dt.month" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 6, 42 | "id": "smaller-boulder", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "test = pd.read_csv(\"test_data.csv\", index_col=0).squeeze()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 7, 52 | "id": "finnish-canadian", 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "sku int64\n", 59 | "date datetime64[ns]\n", 60 | "sold_quantity int64\n", 61 | "current_price float64\n", 62 | "currency object\n", 63 | "listing_type object\n", 64 | "shipping_logistic_type object\n", 65 | "shipping_payment object\n", 66 | "minutes_active float64\n", 67 | "item_domain_id object\n", 68 | "site_id object\n", 69 | "fold int64\n", 70 | "dtype: object" 71 | ] 72 | }, 73 | "execution_count": 7, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "train.dtypes" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 8, 85 | "id": "fc10499b-4c46-4f7e-a88e-a4233fb05504", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/html": [ 91 | "
\n", 92 | "\n", 105 | "\n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | "
skudatesold_quantitycurrent_pricecurrencylisting_typeshipping_logistic_typeshipping_paymentminutes_activeitem_domain_idsite_idfold
04648012021-02-010156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
14648012021-02-020156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
24648012021-02-030156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
34648012021-02-040156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
44648012021-02-051156.78REAclassicfulfillmentfree_shipping1440.0MLB-NEBULIZERSMLB2
\n", 201 | "
" 202 | ], 203 | "text/plain": [ 204 | " sku date sold_quantity current_price currency listing_type \\\n", 205 | "0 464801 2021-02-01 0 156.78 REA classic \n", 206 | "1 464801 2021-02-02 0 156.78 REA classic \n", 207 | "2 464801 2021-02-03 0 156.78 REA classic \n", 208 | "3 464801 2021-02-04 0 156.78 REA classic \n", 209 | "4 464801 2021-02-05 1 156.78 REA classic \n", 210 | "\n", 211 | " shipping_logistic_type shipping_payment minutes_active item_domain_id \\\n", 212 | "0 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 213 | "1 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 214 | "2 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 215 | "3 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 216 | "4 fulfillment free_shipping 1440.0 MLB-NEBULIZERS \n", 217 | "\n", 218 | " site_id fold \n", 219 | "0 MLB 2 \n", 220 | "1 MLB 2 \n", 221 | "2 MLB 2 \n", 222 | "3 MLB 2 \n", 223 | "4 MLB 2 " 224 | ] 225 | }, 226 | "execution_count": 8, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "train.head()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 9, 238 | "id": "monthly-general", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "def gen_tr_ts():\n", 243 | " for fold in [2,3]:\n", 244 | " ts = train[train['fold'] != fold]['date'].max()\n", 245 | " ts = train[(train['fold'] != fold) & (train['date'] == ts)].index\n", 246 | " yield train.index[train['fold'] == fold], ts, fold\n", 247 | "\n", 248 | " " 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 10, 254 | "id": "bdb25520-ed36-4c4c-a9fe-23387cb3f918", 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "\n", 259 | "def gen_p(na=30., power=1.5):\n", 260 | " dts = list()\n", 261 | " for tr,ts, fold in gen_tr_ts():\n", 262 | "\n", 263 | " ## EVAL\n", 264 | " pp = train[train['fold'] != fold][['sku', 'date', 'sold_quantity']]\n", 265 | " pp['stock'] = 
pp['sku'].map(test)\n", 266 | " pp = pp.sort_values([\"sku\",\"date\"])\n", 267 | " pp['cumulative_y'] = pp.groupby(\"sku\")['sold_quantity'].cumsum()\n", 268 | "\n", 269 | " pp = pp.dropna(subset=['stock'])\n", 270 | " pp['stockout_y'] = pp['cumulative_y'] >= pp['stock']\n", 271 | "\n", 272 | " first_so_y = pp[pp['stockout_y']].groupby(\"sku\").first()\n", 273 | " days_to_so_y = (first_so_y[\"date\"] - pp[\"date\"].min()) / np.timedelta64(1, 'D')\n", 274 | " days_to_so_y = days_to_so_y.reindex(pp['sku'].unique()).fillna(na).astype(int).clip(1)\n", 275 | " dts.append(days_to_so_y)\n", 276 | "\n", 277 | " m = utils.pred_list_to_distro(dts[0], wei=False, total_days=max(na, 30), phi=2, power=power)\n", 278 | " f = utils.pred_list_to_distro(dts[1].reindex(dts[0].index).fillna(dts[0]), wei=False, total_days=max(na, 30), phi=2, power=power)\n", 279 | "\n", 280 | " m = pd.DataFrame(m,index=dts[0].index)\n", 281 | " f = pd.DataFrame(f,index=dts[0].index)\n", 282 | "\n", 283 | " p = (m + f)/2\n", 284 | "\n", 285 | " p = p.div(p.sum(axis=1), axis=0)\n", 286 | "\n", 287 | " p = p.loc[test.index]\n", 288 | " \n", 289 | " p = p.round(4)\n", 290 | "\n", 291 | " return p" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 11, 297 | "id": "outer-certificate", 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "r = {i:gen_p(30, power=i/10) for i in range(11,20,1)}\n" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 52, 307 | "id": "1834ec6c-f4e4-4272-8b08-f6a3cbadb705", 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "" 314 | ] 315 | }, 316 | "execution_count": 52, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | }, 320 | { 321 | "data": { 322 | "image/png": "\n", 323 | "text/plain": [ 324 | "
" 325 | ] 326 | }, 327 | "metadata": { 328 | "needs_background": "light" 329 | }, 330 | "output_type": "display_data" 331 | } 332 | ], 333 | "source": [ 334 | "r[11].mean(axis=0).plot()" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 53, 340 | "id": "bd7c0392-db7e-4361-a7ed-64cf9a949504", 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "" 347 | ] 348 | }, 349 | "execution_count": 53, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | }, 353 | { 354 | "data": { 355 | "image/png": "\n", 356 | "text/plain": [ 357 | "
" 358 | ] 359 | }, 360 | "metadata": { 361 | "needs_background": "light" 362 | }, 363 | "output_type": "display_data" 364 | } 365 | ], 366 | "source": [ 367 | "r[12].mean(axis=0).plot()" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 13, 373 | "id": "7b22e456-318b-4ab0-bf80-31807d2f268c", 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "data": { 378 | "text/plain": [ 379 | "" 380 | ] 381 | }, 382 | "execution_count": 13, 383 | "metadata": {}, 384 | "output_type": "execute_result" 385 | }, 386 | { 387 | "data": { 388 | "image/png": "\n", 389 | "text/plain": [ 390 | "
" 391 | ] 392 | }, 393 | "metadata": { 394 | "needs_background": "light" 395 | }, 396 | "output_type": "display_data" 397 | } 398 | ], 399 | "source": [ 400 | "r[13].mean(axis=0).plot()" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 59, 406 | "id": "2efa8e12-e62a-463a-82e4-bdca3babdcd3", 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "# phi=2, power=1.3, na=30\n", 411 | "r[13].round(4).to_csv(\"32b.csv.gz\", header=False, index=False, compression=\"gzip\")\n", 412 | "# LB 4.94" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "id": "57e747a1-1d2b-4339-8be8-ffdd1c6fb0a3", 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [ 422 | "# BONUS\n", 423 | "# xgb 19 + baseline 32 / 2 - LB 4.44" 424 | ] 425 | } 426 | ], 427 | "metadata": { 428 | "kernelspec": { 429 | "display_name": "Python 3", 430 | "language": "python", 431 | "name": "python3" 432 | }, 433 | "language_info": { 434 | "codemirror_mode": { 435 | "name": "ipython", 436 | "version": 3 437 | }, 438 | "file_extension": ".py", 439 | "mimetype": "text/x-python", 440 | "name": "python", 441 | "nbconvert_exporter": "python", 442 | "pygments_lexer": "ipython3", 443 | "version": "3.8.5" 444 | } 445 | }, 446 | "nbformat": 4, 447 | "nbformat_minor": 5 448 | } 449 | -------------------------------------------------------------------------------- /meli2021/61_active_model_yt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "7fe10a1f-3c36-4466-89c5-5dc2622daf87", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "%matplotlib inline\n", 12 | "import numpy as np\n", 13 | "from matplotlib import pyplot as plt" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "id": "c171946b-edfc-4a51-90fe-cc483428228f", 20 | "metadata": {}, 21 | 
"outputs": [], 22 | "source": [ 23 | "Y -> how many days until this product becomes active?" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "id": "84e02a3e-1668-40a5-83cd-3029def45fd8", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "test = pd.read_csv(\"test_data.csv\").set_index(\"sku\").squeeze()\n", 34 | "train = pd.read_parquet(\"./train/0.parquet\")\n", 35 | "train['date'] = pd.to_datetime(train['date'])\n", 36 | "cats = ['item_domain_id', 'currency', 'listing_type', 'shipping_logistic_type', 'shipping_payment', 'site_id']\n", 37 | "for cat in cats:\n", 38 | " train[cat] = train[cat].astype(\"category\").cat.codes" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "id": "8ca0f337-197d-4a22-9191-7bef0b33b705", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "train.loc[train[\"minutes_active\"] == 0, \"active\"] = 0\n", 49 | "train[\"active\"] = train[\"active\"].fillna(1)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "id": "311a92d9-55b1-484d-a53b-ef755681d75c", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/html": [ 61 | "
\n", 62 | "\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | "
skushipping_logistic_typeshipping_paymentlisting_typecurrencycurrent_priceitem_domain_idsite_iddays_to_activedays_since_inactive
sku1.000000-0.000341-0.002559-0.000866-0.0020910.002278-0.0014390.000258-0.001643-0.001465
shipping_logistic_type-0.0003411.0000000.0741830.0407460.050840-0.089014-0.031270-0.019540-0.0094560.075642
shipping_payment-0.0025590.0741831.000000-0.0743380.107770-0.620856-0.118603-0.111873-0.012127-0.009789
listing_type-0.0008660.040746-0.0743381.0000000.152052-0.0357440.0727150.0964910.029645-0.007888
currency-0.0020910.0508400.1077700.1520521.000000-0.657856-0.422581-0.472765-0.014007-0.011959
current_price0.002278-0.089014-0.620856-0.035744-0.6578561.0000000.2678250.2745700.0143250.006590
item_domain_id-0.001439-0.031270-0.1186030.072715-0.4225810.2678251.0000000.8938190.0403630.022034
site_id0.000258-0.019540-0.1118730.096491-0.4727650.2745700.8938191.0000000.0388150.023721
days_to_active-0.001643-0.009456-0.0121270.029645-0.0140070.0143250.0403630.0388151.000000-0.118599
days_since_inactive-0.0014650.075642-0.009789-0.007888-0.0119590.0065900.0220340.023721-0.1185991.000000
\n", 224 | "
" 225 | ], 226 | "text/plain": [ 227 | " sku shipping_logistic_type shipping_payment \\\n", 228 | "sku 1.000000 -0.000341 -0.002559 \n", 229 | "shipping_logistic_type -0.000341 1.000000 0.074183 \n", 230 | "shipping_payment -0.002559 0.074183 1.000000 \n", 231 | "listing_type -0.000866 0.040746 -0.074338 \n", 232 | "currency -0.002091 0.050840 0.107770 \n", 233 | "current_price 0.002278 -0.089014 -0.620856 \n", 234 | "item_domain_id -0.001439 -0.031270 -0.118603 \n", 235 | "site_id 0.000258 -0.019540 -0.111873 \n", 236 | "days_to_active -0.001643 -0.009456 -0.012127 \n", 237 | "days_since_inactive -0.001465 0.075642 -0.009789 \n", 238 | "\n", 239 | " listing_type currency current_price item_domain_id \\\n", 240 | "sku -0.000866 -0.002091 0.002278 -0.001439 \n", 241 | "shipping_logistic_type 0.040746 0.050840 -0.089014 -0.031270 \n", 242 | "shipping_payment -0.074338 0.107770 -0.620856 -0.118603 \n", 243 | "listing_type 1.000000 0.152052 -0.035744 0.072715 \n", 244 | "currency 0.152052 1.000000 -0.657856 -0.422581 \n", 245 | "current_price -0.035744 -0.657856 1.000000 0.267825 \n", 246 | "item_domain_id 0.072715 -0.422581 0.267825 1.000000 \n", 247 | "site_id 0.096491 -0.472765 0.274570 0.893819 \n", 248 | "days_to_active 0.029645 -0.014007 0.014325 0.040363 \n", 249 | "days_since_inactive -0.007888 -0.011959 0.006590 0.022034 \n", 250 | "\n", 251 | " site_id days_to_active days_since_inactive \n", 252 | "sku 0.000258 -0.001643 -0.001465 \n", 253 | "shipping_logistic_type -0.019540 -0.009456 0.075642 \n", 254 | "shipping_payment -0.111873 -0.012127 -0.009789 \n", 255 | "listing_type 0.096491 0.029645 -0.007888 \n", 256 | "currency -0.472765 -0.014007 -0.011959 \n", 257 | "current_price 0.274570 0.014325 0.006590 \n", 258 | "item_domain_id 0.893819 0.040363 0.022034 \n", 259 | "site_id 1.000000 0.038815 0.023721 \n", 260 | "days_to_active 0.038815 1.000000 -0.118599 \n", 261 | "days_since_inactive 0.023721 -0.118599 1.000000 " 262 | ] 263 | }, 264 | 
"execution_count": 4, 265 | "metadata": {}, 266 | "output_type": "execute_result" 267 | } 268 | ], 269 | "source": [ 270 | "act = train[train['active'] == 1][['sku', 'date']].sort_values(\"date\")\n", 271 | "act['active_date'] = act['date']\n", 272 | "inact = train[train['active'] == 0][['sku', 'date', 'shipping_logistic_type', 'shipping_payment', \n", 273 | " 'listing_type', 'currency', 'current_price', 'item_domain_id', 'site_id']].sort_values(\"date\")\n", 274 | "all_ = pd.merge_asof(inact, act, on=['date'], direction='forward', by=['sku']).dropna(subset=['active_date'])\n", 275 | "all_['days_to_active'] = (all_['active_date'] - all_['date']) / np.timedelta64(1,'D')\n", 276 | "all_['days_since_inactive'] = (all_['date'] - all_.groupby(\"sku\")[\"date\"].transform(\"min\")) / np.timedelta64(1,'D')\n", 277 | "y = all_['days_to_active'].copy()\n", 278 | "all_.corr(method='spearman')" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 5, 284 | "id": "bf5ab194-5311-4008-807e-8024a0e7b599", 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "import xgboost as xgb\n", 289 | "from sklearn.metrics import mean_squared_error" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 6, 295 | "id": "141e9fc5-9b36-4933-bf31-317f5fa88f47", 296 | "metadata": {}, 297 | "outputs": [ 298 | { 299 | "data": { 300 | "text/plain": [ 301 | "7.689729948430816" 302 | ] 303 | }, 304 | "execution_count": 6, 305 | "metadata": {}, 306 | "output_type": "execute_result" 307 | } 308 | ], 309 | "source": [ 310 | "Xtr = all_.loc[all_['date'] < \"2021-03-01\", ['days_since_inactive', 'current_price'] + cats]\n", 311 | "Xval = all_.loc[all_['date'] >= \"2021-03-01\", ['days_since_inactive', 'current_price'] + cats]\n", 312 | "\n", 313 | "ytr = y[all_['date'] < \"2021-03-01\"]\n", 314 | "yval = y[all_['date'] >= \"2021-03-01\"]\n", 315 | "\n", 316 | "#mdl = DecisionTreeRegressor(max_depth=3)\n", 317 | "#mdl = 
RandomForestRegressor(n_estimators=100, max_depth=1, random_state=0, n_jobs=6)\n", 318 | "mdl = xgb.XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=0, n_jobs=6, tree_method='hist')\n", 319 | "mdl.fit(Xtr, ytr)\n", 320 | "p = mdl.predict(Xval)\n", 321 | "np.sqrt(mean_squared_error(yval, p))" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 7, 327 | "id": "f7df5b24-3d9d-4412-864b-b0f32da1cbd9", 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "data": { 332 | "text/plain": [ 333 | "" 334 | ] 335 | }, 336 | "execution_count": 7, 337 | "metadata": {}, 338 | "output_type": "execute_result" 339 | }, 340 | { 341 | "data": { 342 | "image/png": "\n", 343 | "text/plain": [ 344 | "
" 345 | ] 346 | }, 347 | "metadata": { 348 | "needs_background": "light" 349 | }, 350 | "output_type": "display_data" 351 | } 352 | ], 353 | "source": [ 354 | "Xval['p'] = p\n", 355 | "Xval['error'] = Xval['p'] - yval\n", 356 | "\n", 357 | "Xval.groupby(\"days_since_inactive\")['error'].mean().plot.bar(figsize=(15,10))" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 8, 363 | "id": "638ead4d-b6bb-4c3f-a11e-ac1ee378a80c", 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "all_t = train[train['active'] == 0].copy()\n", 368 | "\n", 369 | "all_t['days_since_inactive'] = (all_t['date'] - all_t.groupby(\"sku\")[\"date\"].transform(\"min\")) / np.timedelta64(1,'D')\n", 370 | "all_t = all_t.groupby(\"sku\").last()\n", 371 | "all_t = all_t[all_t['date'] == \"2021-03-31\"].copy()" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 9, 377 | "id": "f17ca4b5-3352-4747-88ce-b6de1fc0b480", 378 | "metadata": {}, 379 | "outputs": [ 380 | { 381 | "data": { 382 | "text/html": [ 383 | "
\n", 384 | "\n", 397 | "\n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " 
\n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | "
datesold_quantitycurrent_pricecurrencylisting_typeshipping_logistic_typeshipping_paymentminutes_activeitem_domain_idsite_idactivedays_since_inactive
sku
42021-03-310118.0031200.0526310.054.0
82021-03-31058.4921210.0611620.041.0
92021-03-310199.0021200.0799120.06.0
112021-03-310109.9030200.0364510.019.0
132021-03-310474.0521200.0565820.00.0
.......................................
6608972021-03-31079.0020210.0620120.058.0
6609042021-03-31098.9031210.0367810.024.0
6609072021-03-31024.6931210.0470310.056.0
6609102021-03-310480.5821200.0787820.042.0
6609152021-03-31099.9920210.0799420.058.0
\n", 598 | "

181606 rows × 12 columns

\n", 599 | "
" 600 | ], 601 | "text/plain": [ 602 | " date sold_quantity current_price currency listing_type \\\n", 603 | "sku \n", 604 | "4 2021-03-31 0 118.00 3 1 \n", 605 | "8 2021-03-31 0 58.49 2 1 \n", 606 | "9 2021-03-31 0 199.00 2 1 \n", 607 | "11 2021-03-31 0 109.90 3 0 \n", 608 | "13 2021-03-31 0 474.05 2 1 \n", 609 | "... ... ... ... ... ... \n", 610 | "660897 2021-03-31 0 79.00 2 0 \n", 611 | "660904 2021-03-31 0 98.90 3 1 \n", 612 | "660907 2021-03-31 0 24.69 3 1 \n", 613 | "660910 2021-03-31 0 480.58 2 1 \n", 614 | "660915 2021-03-31 0 99.99 2 0 \n", 615 | "\n", 616 | " shipping_logistic_type shipping_payment minutes_active \\\n", 617 | "sku \n", 618 | "4 2 0 0.0 \n", 619 | "8 2 1 0.0 \n", 620 | "9 2 0 0.0 \n", 621 | "11 2 0 0.0 \n", 622 | "13 2 0 0.0 \n", 623 | "... ... ... ... \n", 624 | "660897 2 1 0.0 \n", 625 | "660904 2 1 0.0 \n", 626 | "660907 2 1 0.0 \n", 627 | "660910 2 0 0.0 \n", 628 | "660915 2 1 0.0 \n", 629 | "\n", 630 | " item_domain_id site_id active days_since_inactive \n", 631 | "sku \n", 632 | "4 5263 1 0.0 54.0 \n", 633 | "8 6116 2 0.0 41.0 \n", 634 | "9 7991 2 0.0 6.0 \n", 635 | "11 3645 1 0.0 19.0 \n", 636 | "13 5658 2 0.0 0.0 \n", 637 | "... ... ... ... ... 
\n", 638 | "660897 6201 2 0.0 58.0 \n", 639 | "660904 3678 1 0.0 24.0 \n", 640 | "660907 4703 1 0.0 56.0 \n", 641 | "660910 7878 2 0.0 42.0 \n", 642 | "660915 7994 2 0.0 58.0 \n", 643 | "\n", 644 | "[181606 rows x 12 columns]" 645 | ] 646 | }, 647 | "execution_count": 9, 648 | "metadata": {}, 649 | "output_type": "execute_result" 650 | } 651 | ], 652 | "source": [ 653 | "all_t" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 10, 659 | "id": "a8513a7c-3e3d-46ec-8725-b9076633ee7a", 660 | "metadata": {}, 661 | "outputs": [], 662 | "source": [ 663 | "X = all_.loc[:, ['days_since_inactive', 'current_price'] + cats]\n", 664 | "Xt = all_t.loc[:, ['days_since_inactive', 'current_price'] + cats]\n", 665 | "\n", 666 | "mdl = xgb.XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=0, n_jobs=6, tree_method='hist')\n", 667 | "mdl.fit(X, y)\n", 668 | "p = mdl.predict(Xt)" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 11, 674 | "id": "96be5acc-dc34-4d44-9322-fc452364666c", 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "p2 = pd.Series(p.round(), index=Xt.index).reindex(test.index).dropna().astype(int)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": 12, 684 | "id": "42bf0345-5db8-4da9-8657-b570024386d7", 685 | "metadata": {}, 686 | "outputs": [ 687 | { 688 | "data": { 689 | "text/html": [ 690 | "
\n", 691 | "\n", 704 | "\n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | "
days_since_inactivecurrent_priceitem_domain_idcurrencylisting_typeshipping_logistic_typeshipping_paymentsite_id
sku
454.0118.00526331201
841.058.49611621212
96.0199.00799121202
1119.0109.90364530201
130.0474.05565821202
\n", 787 | "
" 788 | ], 789 | "text/plain": [ 790 | " days_since_inactive current_price item_domain_id currency \\\n", 791 | "sku \n", 792 | "4 54.0 118.00 5263 3 \n", 793 | "8 41.0 58.49 6116 2 \n", 794 | "9 6.0 199.00 7991 2 \n", 795 | "11 19.0 109.90 3645 3 \n", 796 | "13 0.0 474.05 5658 2 \n", 797 | "\n", 798 | " listing_type shipping_logistic_type shipping_payment site_id \n", 799 | "sku \n", 800 | "4 1 2 0 1 \n", 801 | "8 1 2 1 2 \n", 802 | "9 1 2 0 2 \n", 803 | "11 0 2 0 1 \n", 804 | "13 1 2 0 2 " 805 | ] 806 | }, 807 | "execution_count": 12, 808 | "metadata": {}, 809 | "output_type": "execute_result" 810 | } 811 | ], 812 | "source": [ 813 | "Xt.head()" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": 14, 819 | "id": "b6880938-d9bf-456c-b8dd-c5f60dd293cd", 820 | "metadata": {}, 821 | "outputs": [ 822 | { 823 | "data": { 824 | "text/plain": [ 825 | "sku\n", 826 | "431262 2\n", 827 | "94157 9\n", 828 | "394886 9\n", 829 | "434156 9\n", 830 | "197550 12\n", 831 | " ..\n", 832 | "575227 10\n", 833 | "470249 10\n", 834 | "24226 10\n", 835 | "297331 10\n", 836 | "511077 10\n", 837 | "Length: 97692, dtype: int64" 838 | ] 839 | }, 840 | "execution_count": 14, 841 | "metadata": {}, 842 | "output_type": "execute_result" 843 | } 844 | ], 845 | "source": [ 846 | "p2" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": 13, 852 | "id": "14642acd-37f0-420c-9951-b90ce6b6f195", 853 | "metadata": {}, 854 | "outputs": [ 855 | { 856 | "name": "stdout", 857 | "output_type": "stream", 858 | "text": [ 859 | "(551472, 30)\n", 860 | "CPU times: user 2min 10s, sys: 836 ms, total: 2min 10s\n", 861 | "Wall time: 2min 10s\n" 862 | ] 863 | } 864 | ], 865 | "source": [ 866 | "%%time\n", 867 | "sub = pd.read_csv(\"45d.csv.gz\", header=None) # 4.31\n", 868 | "sub_ = sub.copy()\n", 869 | "sub_.index = test.index\n", 870 | "\n", 871 | "for sku in p2.index:\n", 872 | " s = sub_.loc[sku].copy()\n", 873 | " days = p2.loc[sku]\n", 874 | " #print(s)\n", 875 
| " s.iloc[:days] = s.iloc[:days]*0.5\n", 876 | " s = s / s.sum()\n", 877 | " sub_.loc[sku, :] = s\n", 878 | "print(sub_.shape)\n", 879 | "sub_.round(4).to_csv(\"61byt.csv.gz\", header=False, index=False, compression=\"gzip\")\n", 880 | "# LB 4.2772" 881 | ] 882 | }, 883 | { 884 | "cell_type": "code", 885 | "execution_count": null, 886 | "id": "c8e7e49f-4e84-4a50-a324-76ca6abb9641", 887 | "metadata": {}, 888 | "outputs": [], 889 | "source": [] 890 | } 891 | ], 892 | "metadata": { 893 | "kernelspec": { 894 | "display_name": "Python 3", 895 | "language": "python", 896 | "name": "python3" 897 | }, 898 | "language_info": { 899 | "codemirror_mode": { 900 | "name": "ipython", 901 | "version": 3 902 | }, 903 | "file_extension": ".py", 904 | "mimetype": "text/x-python", 905 | "name": "python", 906 | "nbconvert_exporter": "python", 907 | "pygments_lexer": "ipython3", 908 | "version": "3.8.5" 909 | } 910 | }, 911 | "nbformat": 4, 912 | "nbformat_minor": 5 913 | } 914 | -------------------------------------------------------------------------------- /meli2021/README: -------------------------------------------------------------------------------- 1 | Solutions to #MeliDataChallenge 2021 2 | 3 | August 2nd, 2021 -> 19_xgb_barely_tuned gives you LB 4.64 (5th place) 4 | August 3rd, 2021 -> 32_baseline gives you LB 4.94 (7th place) 5 | 6 | If you average the predictions from 19 and 32 you get LB 4.44 (3rd place) 7 | 8 | August 9th, 2021 -> 61_active applied to the average above should give ~ LB 4.30 (4th place) 9 | 10 | Video Tutorials: https://www.youtube.com/playlist?list=PLV_itENB3unp-g1tgybj5-gs_4FGL8aA8 11 | 12 | -------------------------------------------------------------------------------- /meli2021/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import tweedie 4 | import scipy.stats as st 5 | 6 | def pred_list_to_prob_array(pred_list, cumulative=False, total_days=30): 7 | 
def pred_list_to_prob_array(pred_list, cumulative=False, total_days=30):
    """Expand day predictions into one row per prediction over ``total_days``.

    Every prediction is clipped to ``[1, total_days]`` and mapped to the
    zero-based column ``int(day - 1)``.

    * ``cumulative=False``: the row is a step function -- 0 before that
      column and 1 from that column onwards.
    * ``cumulative=True``: the row starts as a one-hot vector at that
      column, ``1e-4`` is added to every cell, the row is normalized to sum
      to 1 and finally accumulated with ``cumsum`` (so it ends at ~1).

    Args:
        pred_list: 1-D array-like exposing ``.shape`` (e.g. a NumPy array).
        cumulative: selects between the two encodings described above.
        total_days: number of columns in the result.

    Returns:
        ``numpy.ndarray`` of shape ``(pred_list.shape[0], total_days)``.
    """
    prob_array = np.zeros((pred_list.shape[0], total_days))
    pred_list = np.clip(pred_list, 1, total_days)
    for row, e in enumerate(pred_list):
        if cumulative:
            prob_array[row, int(e-1)] = 1.
        else:
            prob_array[row, int(e-1):] = 1.

    if cumulative:
        # Smooth the one-hot rows slightly before turning them into a CDF.
        prob_array = prob_array+1e-4
        prob_array = np.divide(prob_array, prob_array.sum(axis=1).reshape(-1,1))
        prob_array = prob_array.cumsum(axis=1)

    return prob_array


def pred_list_to_prob_array_mc(pred_list, total_days=30):
    """Step-function encoding shifted one column later than the default one.

    Predictions are clipped to ``[1, total_days]``; row ``i`` is 0 up to and
    including column ``int(day) - 1`` and 1 from column ``int(day)`` onwards.
    A prediction equal to ``total_days`` therefore yields an all-zero row.

    Returns:
        ``numpy.ndarray`` of shape ``(pred_list.shape[0], total_days)``.
    """
    prob_array = np.zeros((pred_list.shape[0], total_days))
    pred_list = np.clip(pred_list, 1, total_days)
    for row, e in enumerate(pred_list):
        prob_array[row, int(e):] = 1.

    return prob_array


def rps(y, p, probs=False, total_days=30):
    """Mean over rows of the summed squared gap between two cumulative arrays.

    (Presumably the ranked probability score, going by the name.)

    Args:
        y: true days, encoded with ``pred_list_to_prob_array``.
        p: if ``probs`` is true, a 2-D array of per-day probabilities that is
            accumulated with ``cumsum(axis=1)``; otherwise a list of
            predicted days encoded with
            ``pred_list_to_prob_array(..., cumulative=True)``.
        probs: tells how ``p`` must be interpreted.
        total_days: number of day columns used for the encodings.

    Returns:
        A scalar: the per-row sums of squared differences, averaged.
    """
    y_array = pred_list_to_prob_array(y, total_days=total_days)
    if probs:
        p_array = p.cumsum(axis=1)
    else:
        p_array = pred_list_to_prob_array(p, cumulative=True, total_days=total_days)
    return ((p_array - y_array)**2).sum(axis=1).mean()


def rps_mc(y, p, probs=False, total_days=30):
    """Same score as ``rps`` but with ``y`` encoded by the ``_mc`` variant.

    Only per-day probabilities are supported: the original code never
    defined the prediction array for ``probs=False`` and crashed with an
    ``UnboundLocalError``; that case is now rejected explicitly.

    Args:
        y: true days, encoded with ``pred_list_to_prob_array_mc``.
        p: 2-D array of per-day probabilities (accumulated with ``cumsum``).
        probs: must be true.
        total_days: number of day columns used for the encoding of ``y``.

    Returns:
        A scalar: the per-row sums of squared differences, averaged.

    Raises:
        ValueError: if ``probs`` is false.
    """
    if not probs:
        raise ValueError(
            "rps_mc only supports probability inputs; call it with probs=True"
        )
    y_array = pred_list_to_prob_array_mc(y, total_days=total_days)
    p_array = p.cumsum(axis=1)
    return ((p_array - y_array)**2).sum(axis=1).mean()


def rps_raw(y, p, probs=False, total_days=30):
    """Per-row version of ``rps``: returns the scores without averaging.

    Args:
        y: true days, encoded with ``pred_list_to_prob_array``.
        p: per-day probabilities (``probs=True``) or predicted days.
        probs: tells how ``p`` must be interpreted.
        total_days: number of day columns; defaults to 30, the horizon the
            function used before this parameter existed.

    Returns:
        1-D array with one summed squared difference per row.
    """
    y_array = pred_list_to_prob_array(y, total_days=total_days)
    if probs:
        p_array = p.cumsum(axis=1)
    else:
        p_array = pred_list_to_prob_array(p, cumulative=True, total_days=total_days)
    return ((p_array - y_array)**2).sum(axis=1)


def pred_list_to_tweedie(pred_list, phi=1, p=1.5):
    """Map each predicted day to a 30-day distribution built from a Tweedie CDF.

    For every ``mu`` in 1..30 the Tweedie CDF (``p``, ``mu``, ``phi``) is
    evaluated at days 1..30; entries 1.. are replaced by successive
    differences while entry 0 keeps the raw CDF value at day 1. The vector
    is then normalized and rounded to 4 decimals.

    Args:
        pred_list: 1-D array-like exposing ``.shape``; every value must be
            one of 1..30 because it is used as a lookup key.
        phi: dispersion passed to ``tweedie.tweedie``.
        p: power passed to ``tweedie.tweedie``.

    Returns:
        ``numpy.ndarray`` of shape ``(pred_list.shape[0], 30)``.
    """
    # has a bug in the first day, it's the wrong probability, but it's worse without the bug
    distros = dict()
    for mu in range(1,31):
        distros[mu] = [tweedie.tweedie(p=p, mu=mu, phi=phi).cdf(days) for days in range(1,31,1)]
        distros[mu][1:] = np.diff(distros[mu])
        distros[mu] = np.round(distros[mu] / np.sum(distros[mu]), 4)

    prob_array = np.zeros((pred_list.shape[0], 30))

    for row, mu in enumerate(pred_list):
        prob_array[row, :] = distros[mu]

    return prob_array
np.zeros((pred_list.shape[0], 30)) 63 | 64 | for row, mu in enumerate(pred_list): 65 | prob_array[row, :] = distros[mu]#.cumsum() 66 | #prob_array[row, -1] = 1. 67 | 68 | return prob_array 69 | 70 | 71 | 72 | def pred_list_to_distro(pred_list, wei=False, total_days=30, phi=2, power=1.5): 73 | distros = dict() 74 | for mu in range(1,total_days+1): 75 | if wei: 76 | distros[mu] = [st.norm.cdf(days, loc=mu, scale=1) for days in range(0,total_days+1,1)] 77 | else: 78 | distros[mu] = [tweedie.tweedie(p=power, mu=mu, phi=phi).cdf(days) for days in range(0,total_days+1,1)] 79 | #distros[mu] = [st.lognorm.cdf(days, s=0.5, loc=mu, scale=0.5) for days in range(0,31,1)] 80 | #distros[mu] = [st.expon.cdf(days, loc=mu, scale=0.01) for days in range(0,31,1)] 81 | #distros[mu] = [st.gengamma.cdf(days, loc=mu, scale=1, a=mu, c=1) for days in range(1,31,1)] 82 | if np.sum(distros[mu]) > 0: 83 | distros[mu] = np.diff(distros[mu]) 84 | distros[mu] = np.round(distros[mu] / np.sum(distros[mu]), 4) 85 | else: 86 | distros[mu] = distros[mu][1:] 87 | distros[mu][-1] = 1 88 | 89 | 90 | prob_array = np.zeros((pred_list.shape[0], total_days)) 91 | 92 | for row, mu in enumerate(pred_list): 93 | prob_array[row, :] = distros[mu]#.cumsum() 94 | #prob_array[row, -1] = 1. 
95 | 96 | return prob_array 97 | 98 | def pred_list_to_distro_smooth(pred_list, total_days=30, phi=2, power=1.5, smooth_factor=0.3): 99 | distros = dict() 100 | for mu in range(1,total_days+1): 101 | distros[mu] = [tweedie.tweedie(p=power, mu=mu, phi=phi).cdf(days) for days in range(0,total_days+1,1)] 102 | if np.sum(distros[mu]) > 0: 103 | distros[mu] = np.diff(distros[mu]) 104 | distros[mu] = np.round(distros[mu] / np.sum(distros[mu]), 4) 105 | else: 106 | distros[mu] = distros[mu][1:] 107 | distros[mu][-1] = 1 108 | 109 | 110 | prob_array = np.zeros((pred_list.shape[0], total_days)) 111 | 112 | for row, mu in enumerate(pred_list): 113 | if mu == 1: 114 | prob_array[row, :] = (1-smooth_factor)*distros[mu] + smooth_factor*distros[mu+1] 115 | elif mu == total_days: 116 | prob_array[row, :] = smooth_factor*distros[mu-1] + (1-smooth_factor)*distros[mu] 117 | else: 118 | prob_array[row, :] = (smooth_factor/2)*distros[mu-1] + (1-smooth_factor)*distros[mu] + (smooth_factor/2)*distros[mu+1] 119 | 120 | return prob_array -------------------------------------------------------------------------------- /multiple_time_series/README: -------------------------------------------------------------------------------- 1 | Notebooks for https://youtu.be/RRd2wzMRpOc 2 | --------------------------------------------------------------------------------