├── README.md ├── main.ipynb ├── main_baseline_evaluation.ipynb └── main_feature_engine.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # invent-prediction 2 | Model to accurately forecast inventory demand based on historical sales data. 3 | -------------------------------------------------------------------------------- /main.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Model to forecast inventory demand based on historical sales data. " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%matplotlib inline\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "import seaborn as sns\n", 22 | "from scipy import stats\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "import time\n", 25 | "import random\n", 26 | "import pickle" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Model accuracy is RMSLE" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "def rmsle(y, y0):\n", 45 | " assert len(y) == len(y0)\n", 46 | " return np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(y0), 2)))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Load Training Data \n", 54 | "The size of the training data is quite large (~4 GB). Large datasets require significant amount of memory to process. Instead, we will sample the data randomly for our initial data analysis and visualization. " 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": false, 62 | "scrolled": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "def load_samp_data(filename='train.csv', columns=[], load_pkl=1):\n", 67 | " \"\"\" \n", 68 | " Function returns a dataframe containing the training data sampled randomly. 
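The competition metric is RMSLE, and the `rmsle` helper above computes it with `np.log1p`, the numerically stable form of log(1 + x). Below is a minimal sanity check of that helper (assuming only NumPy and the function as defined above); the toy arrays are hypothetical:

```python
import numpy as np

def rmsle(y, y0):
    # Root Mean Squared Logarithmic Error over two equal-length arrays.
    y, y0 = np.asarray(y, dtype=float), np.asarray(y0, dtype=float)
    assert len(y) == len(y0)
    return np.sqrt(np.mean((np.log1p(y) - np.log1p(y0)) ** 2))

# Identical predictions score 0; the log transform penalises relative error,
# so being off by 2x costs roughly the same at small and large volumes.
print(rmsle([3, 5, 8], [3, 5, 8]))   # 0.0
print(rmsle([10], [20]))             # ~0.65
print(rmsle([100], [200]))           # ~0.69
```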
\n", 69 | " The data is also stored in a pickle file for later processing.\n", 70 | " \"\"\"\n", 71 | " if load_pkl:\n", 72 | " inputfile = open('train_samp_data.pkl', 'rb')\n", 73 | " data = pickle.load(inputfile)\n", 74 | " inputfile.close()\n", 75 | " return data\n", 76 | " \n", 77 | " chunksize= 10 ** 6\n", 78 | " datasize = 74180464 #datasize = sum(1 for line in open(filename)) - 1 #number of records in file (excludes header)\n", 79 | " samplesize = 3*10 ** 4 # samples per chunk of data read from the file.\n", 80 | " \n", 81 | " data = pd.DataFrame([],columns=columns)\n", 82 | " chunks = pd.read_csv(filename, iterator=True, chunksize=chunksize)\n", 83 | " for chunk in chunks:\n", 84 | " chunk.columns = columns\n", 85 | " data = data.append(chunk.sample(samplesize)) \n", 86 | " \n", 87 | " # write data to a pickle file.\n", 88 | " outputfile = open('train_samp_data.pkl','wb')\n", 89 | " pickle.dump(data,outputfile)\n", 90 | " outputfile.close()\n", 91 | " \n", 92 | " return data\n", 93 | " \n", 94 | "load_pkl = 0\n", 95 | "columns = ['week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client_id', 'prod_id', 'saleunit_curr_wk', 'saleamt_curr_wk', 'retunit_next_week', 'retamt_next_wk', 'y_pred_demand']\n", 96 | "tic = time.time()\n", 97 | "train_data_samp = load_samp_data('train.csv', columns, load_pkl)\n", 98 | "toc = time.time()\n", 99 | "print '**'\n", 100 | "print 'Time to load: ', toc-tic, 'sec'\n", 101 | "print \n", 102 | "print train_data_samp.describe()\n", 103 | "print '**'\n", 104 | "print train_data_samp[['week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client_id', 'prod_id']]" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Preliminary evaluation using Linear Regression" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "from sklearn import linear_model\n", 123 | "from sklearn.cross_validation import train_test_split\n", 124 | "features_train = train_data_samp[['week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client_id', 'prod_id']].values\n", 125 | "labels_train = train_data_samp[['y_pred_demand']].values\n", 126 | "\n", 127 | "# Split the data samples into train and test.\n", 128 | "X_train, X_test, y_train, y_test = train_test_split(features_train, labels_train, test_size=0.33, random_state=42)\n", 129 | "\n", 130 | "# Linear regression\n", 131 | "tic = time.time()\n", 132 | "clf = linear_model.LinearRegression()\n", 133 | "clf.fit(X_train, y_train)\n", 134 | "pred = clf.predict(X_test)\n", 135 | "pred[pred<0] = 0\n", 136 | "tac = time.time()\n", 137 | "print '----------'\n", 138 | "print 'Time:', tac-tic, 'RMSLE (LinearRegression):', rmsle(pred, y_test)\n", 139 | "print '----------'\n" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "## Preliminary evaluation using gradient boosting (xgboost)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 7, 152 | "metadata": { 153 | "collapsed": true 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "# Utility function to report best scores\n", 158 | "def report(grid_scores, n_top=3):\n", 159 | " top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]\n", 160 | " for i, score in enumerate(top_scores):\n", 161 | " print(\"Model with rank: {0}\".format(i + 1))\n", 162 | " print(\"Mean validation score: {0:.3f} (std: 
{1:.3f})\".format(\n", 163 | " score.mean_validation_score,\n", 164 | " np.std(score.cv_validation_scores)))\n", 165 | " print(\"Parameters: {0}\".format(score.parameters))\n", 166 | " print(\"\")" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 8, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "Time: 0.00601291656494 RMSLE (LinearRegression): 0.935833411032\n", 186 | "----------\n", 187 | "RandomizedSearchCV took 1190.15 seconds for 20 candidates parameter settings.\n", 188 | "Model with rank: 1\n", 189 | "Mean validation score: 0.237 (std: 0.003)\n", 190 | "Parameters: {'bootstrap': False, 'min_samples_leaf': 3, 'min_samples_split': 5, 'criterion': 'entropy', 'max_features': 5, 'max_depth': 10}\n", 191 | "\n", 192 | "Model with rank: 2\n", 193 | "Mean validation score: 0.236 (std: 0.002)\n", 194 | "Parameters: {'bootstrap': False, 'min_samples_leaf': 2, 'min_samples_split': 2, 'criterion': 'entropy', 'max_features': 5, 'max_depth': 10}\n", 195 | "\n", 196 | "Model with rank: 3\n", 197 | "Mean validation score: 0.236 (std: 0.003)\n", 198 | "Parameters: {'bootstrap': True, 'min_samples_leaf': 5, 'min_samples_split': 3, 'criterion': 'gini', 'max_features': 5, 'max_depth': 10}\n", 199 | "\n", 200 | "Time: 0.00601291656494 RMSLE (RF): 0.786554235216\n", 201 | "----------\n" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "\n", 207 | "from sklearn.tree import DecisionTreeRegressor\n", 208 | "from sklearn.ensemble import AdaBoostRegressor\n", 209 | "from sklearn.ensemble import RandomForestClassifier\n", 210 | "\n", 211 | "from sklearn.grid_search import RandomizedSearchCV\n", 212 | "\n", 213 | "from scipy.stats import randint as sp_randint\n", 214 | "from operator import itemgetter\n", 215 | "\n", 216 | "\n", 217 | "\n", 218 | "\n", 219 | "\n", 220 | "clf = RandomForestClassifier(n_estimators=30)\n", 221 | "# specify parameters and distributions to sample from\n", 222 | "param_dist = {\"max_depth\": [10, None],\n", 223 | " \"max_features\": sp_randint(1, 6),\n", 224 | " \"min_samples_split\": sp_randint(1, 6),\n", 225 | " \"min_samples_leaf\": sp_randint(1, 6),\n", 226 | " \"bootstrap\": [True, False],\n", 227 | " \"criterion\": [\"gini\", \"entropy\"]}\n", 228 | "\n", 229 | "# run randomized search\n", 230 | "n_iter_search = 20\n", 231 | "random_search = RandomizedSearchCV(clf, param_distributions=param_dist,\n", 232 | " n_iter=n_iter_search, n_jobs=2)\n", 233 | "start = time.time()\n", 234 | "random_search.fit(X_train, np.ravel(y_train))\n", 235 | "\n", 236 | "print(\"RandomizedSearchCV took %.2f seconds for %d candidates\"\n", 237 | " \" parameter settings.\" % ((time.time() - start), n_iter_search))\n", 238 | "report(random_search.grid_scores_)\n", 239 | "random_search.best_score_ \n", 240 | "pred = random_search.predict(X_test)\n", 241 | "pred[pred<0] = 0\n", 242 | "print 'Time:', tac-tic, 'RMSLE (RF):', rmsle(pred, np.ravel(y_test))\n", 243 | "print '----------'" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [], 253 | "source": [] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": false 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "clientnameid_data = 
pd.read_csv('cliente_tabla.csv')\n", 264 | "townstate_data = pd.read_csv('town_state.csv')\n", 265 | "print clientnameid_data.head()\n", 266 | "print '----'\n", 267 | "print townstate_data.head()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## Data Cleaning\n", 275 | "There are duplicate client ids in cliente_table, which means one client id may have multiple client name that are very similar. We will cluster them based on a hash function and use a clustering algorithm to evaluate similarity. " 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": { 282 | "collapsed": false 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "import re\n", 287 | "def hash_eval(s):\n", 288 | " hash_base = 4\n", 289 | " s = re.sub('[., ]', '', s)\n", 290 | " seqlen = len(s)\n", 291 | " n = seqlen - 1\n", 292 | " h = 0\n", 293 | " for c in s:\n", 294 | " h += ord(c) * (hash_base ** n)\n", 295 | " n -= 1\n", 296 | " curhash = h\n", 297 | " return curhash\n", 298 | "\n", 299 | "# In the client table, same clients are assigned different client ID. We create a new client table where clients are assigned unique ID. \n", 300 | "clientid_hash = dict()\n", 301 | "new_client_id = [-1] \n", 302 | "for idx, s in enumerate(clientnameid_data.NombreCliente):\n", 303 | " t = hash_eval(s)\n", 304 | " clientid_hash.setdefault(t, []).append(clientnameid_data.Cliente_ID[idx])\n", 305 | " if t in clientid_hash:\n", 306 | " a = clientid_hash[t]\n", 307 | " new_client_id.append(a[0])\n", 308 | "\n", 309 | "# In the agency table, same agencies (town, state) are assigned different agency ID. We create a new agency table where agencies (town, state) are assigned unique ID. \n", 310 | "agencyid_hash = dict()\n", 311 | "new_agency_id = [-1] \n", 312 | "for idx, s in enumerate(townstate_data.Town+townstate_data.State):\n", 313 | " t = hash_eval(s)\n", 314 | " agencyid_hash.setdefault(t, []).append(townstate_data.Agencia_ID[idx])\n", 315 | " if t in agencyid_hash:\n", 316 | " a = agencyid_hash[t]\n", 317 | " new_agency_id.append(a[0])\n" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": { 324 | "collapsed": false 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "clientnameid_data['New_Cliente_ID'] = new_client_id[1:]\n", 329 | "townstate_data['New_Agencia_ID'] = new_agency_id[1:]" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": { 336 | "collapsed": false 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "print clientnameid_data.head(10)\n", 341 | "print '---'\n", 342 | "print townstate_data.head()\n", 343 | "print '---'\n", 344 | "print train_data_samp.head(10)\n" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": { 351 | "collapsed": false 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "print train_data_samp.head(10)\n", 356 | "print '------'\n", 357 | "for idx, cid in enumerate(train_data_samp.client_id):\n", 358 | " train_data_samp.client_id.values[idx] = clientnameid_data.New_Cliente_ID[train_data_samp.client_id.values[idx] == clientnameid_data.Cliente_ID.values].values[0]\n", 359 | " train_data_samp.sales_depot_id.values[idx] = townstate_data.New_Agencia_ID[train_data_samp.sales_depot_id.values[idx] == townstate_data.Agencia_ID.values].values[0]\n", 360 | "print '-----'\n", 361 | "print train_data_samp.head()\n" 362 | ] 363 | }, 364 | { 365 | "cell_type": 
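The cleaning loop above collapses near-duplicate client names by stripping punctuation, hashing the normalised string, and reusing the first `Cliente_ID` seen for each hash; agencies are treated the same way on `Town + State`. The same idea can be expressed with a pandas group-by, which avoids the explicit hash and the Python-level loops. A sketch assuming the `cliente_tabla.csv` / `town_state.csv` column names used above:

```python
import re
import pandas as pd

def normalize(name):
    # same normalisation as hash_eval: drop dots, commas and spaces
    return re.sub(r'[., ]', '', str(name))

clientnameid_data = pd.read_csv('cliente_tabla.csv')
townstate_data = pd.read_csv('town_state.csv')

# Map every normalised client name to the first Cliente_ID that used it.
client_key = clientnameid_data['NombreCliente'].map(normalize)
clientnameid_data['New_Cliente_ID'] = (
    clientnameid_data.groupby(client_key)['Cliente_ID'].transform('first'))

# Same idea for agencies, keyed on the concatenated (Town, State) string.
agency_key = (townstate_data['Town'] + townstate_data['State']).map(normalize)
townstate_data['New_Agencia_ID'] = (
    townstate_data.groupby(agency_key)['Agencia_ID'].transform('first'))
```

Grouping on the normalised string itself also rules out the collisions that a positional base-4 hash can produce for different names.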
"markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "## Load Test Data" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": { 375 | "collapsed": false 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "test_data = pd.read_csv('test.csv')\n", 380 | "test_data.columns = ['id', 'week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client id', 'prod_id']\n", 381 | "test_labels = pd.read_csv('sample_submission.csv')\n", 382 | "test_data = test_data.drop('id', 1)\n", 383 | "print test_data.head()" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": { 390 | "collapsed": false 391 | }, 392 | "outputs": [], 393 | "source": [] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": { 399 | "collapsed": false 400 | }, 401 | "outputs": [], 402 | "source": [ 403 | "g = sns.PairGrid(data_t)\n", 404 | "g.map(plt.scatter)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": { 411 | "collapsed": false, 412 | "scrolled": true 413 | }, 414 | "outputs": [], 415 | "source": [] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": { 421 | "collapsed": true 422 | }, 423 | "outputs": [], 424 | "source": [ 425 | "a = [[1, 2, 3, 4]]" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": { 432 | "collapsed": false 433 | }, 434 | "outputs": [], 435 | "source": [ 436 | "print a" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": { 443 | "collapsed": false 444 | }, 445 | "outputs": [], 446 | "source": [ 447 | "np.array(a)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": { 454 | "collapsed": false 455 | }, 456 | "outputs": [], 457 | "source": [ 458 | "print np.array(a)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": { 465 | "collapsed": false 466 | }, 467 | "outputs": [], 468 | "source": [ 469 | "a = np.array(a)" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": { 476 | "collapsed": false 477 | }, 478 | "outputs": [], 479 | "source": [ 480 | "a" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": { 487 | "collapsed": false 488 | }, 489 | "outputs": [], 490 | "source": [ 491 | "print a.reshape(-1,)" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": { 498 | "collapsed": true 499 | }, 500 | "outputs": [], 501 | "source": [] 502 | } 503 | ], 504 | "metadata": { 505 | "kernelspec": { 506 | "display_name": "Python 2", 507 | "language": "python", 508 | "name": "python2" 509 | }, 510 | "language_info": { 511 | "codemirror_mode": { 512 | "name": "ipython", 513 | "version": 2 514 | }, 515 | "file_extension": ".py", 516 | "mimetype": "text/x-python", 517 | "name": "python", 518 | "nbconvert_exporter": "python", 519 | "pygments_lexer": "ipython2", 520 | "version": "2.7.11" 521 | } 522 | }, 523 | "nbformat": 4, 524 | "nbformat_minor": 0 525 | } 526 | -------------------------------------------------------------------------------- /main_baseline_evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Model to 
forecast inventory demand based on historical sales data. " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%matplotlib inline\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "from scipy import stats\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "import time\n", 24 | "import random\n", 25 | "import pickle\n", 26 | "import math" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Model accuracy is RMSLE" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 5, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "import warnings\n", 45 | "warnings.filterwarnings(\"ignore\")\n", 46 | "\n", 47 | "def rmsle(y, y_pred):\n", 48 | " assert len(y) == len(y_pred)\n", 49 | " terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]\n", 50 | " return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Load Training Data \n", 58 | "The size of the training data is quite large (~4 GB). Large datasets require significant amount of memory to process. Instead, we will sample the data randomly for our initial data analysis and visualization. " 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": { 65 | "collapsed": false, 66 | "scrolled": true 67 | }, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "*********\n", 74 | "Time to load: 0.296472787857 sec\n", 75 | "\n", 76 | " week_num sales_depot_id sales_chan_id route_id \\\n", 77 | "count 150000.000000 150000.000000 150000.000000 150000.000000 \n", 78 | "mean 5.982820 2785.281267 1.378807 2116.544753 \n", 79 | "std 2.027124 4660.731926 1.454210 1486.426860 \n", 80 | "min 3.000000 1110.000000 1.000000 1.000000 \n", 81 | "25% 4.000000 1312.000000 1.000000 1162.000000 \n", 82 | "50% 6.000000 1614.000000 1.000000 1287.000000 \n", 83 | "75% 8.000000 2038.000000 1.000000 2803.000000 \n", 84 | "max 9.000000 25759.000000 11.000000 9892.000000 \n", 85 | "\n", 86 | " client_id prod_id saleunit_curr_wk saleamt_curr_wk \\\n", 87 | "count 1.500000e+05 150000.000000 150000.000000 150000.000000 \n", 88 | "mean 1.808100e+06 20917.492180 7.290980 68.851745 \n", 89 | "std 1.839784e+06 18654.697725 21.046499 307.624295 \n", 90 | "min 1.050000e+02 72.000000 0.000000 0.000000 \n", 91 | "25% 3.597660e+05 1242.000000 2.000000 16.760000 \n", 92 | "50% 1.200138e+06 30549.000000 3.000000 30.000000 \n", 93 | "75% 2.376863e+06 37427.000000 7.000000 56.100000 \n", 94 | "max 1.035181e+07 49994.000000 2108.000000 38620.960000 \n", 95 | "\n", 96 | " retunit_next_week retamt_next_wk y_pred_demand \n", 97 | "count 150000.000000 150000.000000 150000.000000 \n", 98 | "mean 0.123680 1.205767 7.206833 \n", 99 | "std 1.646204 17.002674 20.899182 \n", 100 | "min 0.000000 0.000000 0.000000 \n", 101 | "25% 0.000000 0.000000 2.000000 \n", 102 | "50% 0.000000 0.000000 3.000000 \n", 103 | "75% 0.000000 0.000000 6.000000 \n", 104 | "max 193.000000 2696.980000 2108.000000 \n", 105 | "*********\n", 106 | " week_num sales_depot_id sales_chan_id route_id client_id prod_id\n", 107 | "407495 3.0 1119.0 1.0 1260.0 4655180.0 41843.0\n", 108 | "998128 3.0 1130.0 1.0 1210.0 937277.0 35651.0\n", 109 | "435331 3.0 1119.0 1.0 1481.0 1883488.0 
1216.0\n", 110 | "570645 3.0 1121.0 1.0 1639.0 2195156.0 323.0\n", 111 | "475547 3.0 1120.0 1.0 1476.0 170945.0 1250.0\n", 112 | "862766 3.0 1126.0 1.0 1219.0 4570976.0 1284.0\n", 113 | "425898 3.0 1119.0 1.0 1468.0 66840.0 1242.0\n", 114 | "627653 3.0 1122.0 1.0 1454.0 4187031.0 1278.0\n", 115 | "899892 3.0 1126.0 1.0 1439.0 433880.0 32819.0\n", 116 | "246230 3.0 1117.0 1.0 1048.0 64516.0 1182.0\n", 117 | "210464 3.0 1116.0 1.0 1609.0 16743.0 2505.0\n", 118 | "58040 3.0 1111.0 1.0 1641.0 4490132.0 30415.0\n", 119 | "194851 3.0 1116.0 1.0 1460.0 16581.0 1278.0\n", 120 | "839854 3.0 1126.0 1.0 1034.0 352604.0 1150.0\n", 121 | "452316 3.0 1120.0 1.0 1072.0 1585169.0 4767.0\n", 122 | "96311 3.0 1112.0 1.0 1412.0 324366.0 1220.0\n", 123 | "110290 3.0 1112.0 1.0 1606.0 4557386.0 4280.0\n", 124 | "187089 3.0 1116.0 1.0 1402.0 4167588.0 1238.0\n", 125 | "773597 3.0 1124.0 1.0 1066.0 402786.0 1125.0\n", 126 | "114850 3.0 1112.0 1.0 2103.0 339523.0 30575.0\n", 127 | "858502 3.0 1126.0 1.0 1213.0 1586252.0 1250.0\n", 128 | "492218 3.0 1120.0 1.0 1623.0 176409.0 3609.0\n", 129 | "427737 3.0 1119.0 1.0 1470.0 1268536.0 32819.0\n", 130 | "111759 3.0 1112.0 1.0 1608.0 1719768.0 3609.0\n", 131 | "88220 3.0 1112.0 1.0 1401.0 1579043.0 41938.0\n", 132 | "92080 3.0 1112.0 1.0 1407.0 330536.0 972.0\n", 133 | "830434 3.0 1126.0 1.0 1014.0 1043097.0 1182.0\n", 134 | "789436 3.0 1124.0 1.0 1214.0 4335456.0 972.0\n", 135 | "282203 3.0 1117.0 1.0 1450.0 1272568.0 35651.0\n", 136 | "152778 3.0 1113.0 1.0 2110.0 1244042.0 31310.0\n", 137 | "... ... ... ... ... ... ...\n", 138 | "6290 9.0 22560.0 1.0 1263.0 1815971.0 1687.0\n", 139 | "49359 9.0 23719.0 1.0 1160.0 4662738.0 1278.0\n", 140 | "50076 9.0 23719.0 1.0 1162.0 2334116.0 41938.0\n", 141 | "108971 9.0 24049.0 1.0 1204.0 599042.0 2233.0\n", 142 | "102354 9.0 23899.0 1.0 4503.0 4669766.0 1212.0\n", 143 | "116515 9.0 24049.0 1.0 4103.0 1172563.0 5000.0\n", 144 | "40629 9.0 23669.0 1.0 2123.0 1978422.0 32934.0\n", 145 | "168039 9.0 25759.0 1.0 1225.0 4483421.0 43200.0\n", 146 | "177718 9.0 25759.0 1.0 5503.0 2058383.0 45112.0\n", 147 | "17402 9.0 22560.0 1.0 2132.0 1888536.0 30531.0\n", 148 | "70202 9.0 23719.0 1.0 4405.0 2009286.0 43307.0\n", 149 | "97183 9.0 23899.0 1.0 2815.0 4572035.0 5337.0\n", 150 | "153604 9.0 25699.0 1.0 1205.0 2337869.0 37401.0\n", 151 | "121078 9.0 24539.0 1.0 1104.0 150132.0 43207.0\n", 152 | "139704 9.0 24669.0 1.0 1212.0 1315264.0 43196.0\n", 153 | "81464 9.0 23899.0 1.0 1226.0 2193803.0 972.0\n", 154 | "30064 9.0 23669.0 1.0 1115.0 250957.0 35651.0\n", 155 | "115242 9.0 24049.0 1.0 2904.0 1564516.0 30531.0\n", 156 | "41417 9.0 23669.0 1.0 2831.0 246772.0 30549.0\n", 157 | "150935 9.0 25699.0 1.0 1105.0 160661.0 43197.0\n", 158 | "50411 9.0 23719.0 1.0 1163.0 1116025.0 1216.0\n", 159 | "86511 9.0 23899.0 1.0 1234.0 1838655.0 1230.0\n", 160 | "31504 9.0 23669.0 1.0 1117.0 1989373.0 1232.0\n", 161 | "137566 9.0 24669.0 1.0 1207.0 120350.0 43200.0\n", 162 | "55541 9.0 23719.0 1.0 1276.0 2225575.0 2233.0\n", 163 | "109651 9.0 24049.0 1.0 1205.0 648349.0 1160.0\n", 164 | "105371 9.0 24049.0 1.0 1102.0 326425.0 34053.0\n", 165 | "20426 9.0 22560.0 1.0 2854.0 2442238.0 4245.0\n", 166 | "124517 9.0 24539.0 1.0 1203.0 4601950.0 34213.0\n", 167 | "140989 9.0 24669.0 1.0 1214.0 4461021.0 43203.0\n", 168 | "\n", 169 | "[150000 rows x 6 columns]\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "def load_samp_data(filename='train.csv', columns=[], load_pkl=1):\n", 175 | " \"\"\" \n", 176 | " Function returns a dataframe containing the training 
data sampled randomly. \n", 177 | " The data is also stored in a pickle file for later processing.\n", 178 | " \"\"\"\n", 179 | " if load_pkl:\n", 180 | " inputfile = open('train_samp_data.pkl', 'rb')\n", 181 | " data = pickle.load(inputfile)\n", 182 | " inputfile.close()\n", 183 | " return data\n", 184 | " \n", 185 | " chunksize= 10 ** 6\n", 186 | " datasize = 74180464 #datasize = sum(1 for line in open(filename)) - 1 #number of records in file (excludes header)\n", 187 | " samplesize = 2*10 ** 3 # samples per chunk of data read from the file.\n", 188 | " \n", 189 | " data = pd.DataFrame([],columns=columns)\n", 190 | " chunks = pd.read_csv(filename, iterator=True, chunksize=chunksize)\n", 191 | " for chunk in chunks:\n", 192 | " chunk.columns = columns\n", 193 | " data = data.append(chunk.sample(samplesize)) \n", 194 | " \n", 195 | " # write data to a pickle file.\n", 196 | " outputfile = open('train_samp_data.pkl','wb')\n", 197 | " pickle.dump(data,outputfile)\n", 198 | " outputfile.close()\n", 199 | " \n", 200 | " return data\n", 201 | " \n", 202 | "load_pkl = 1\n", 203 | "columns = ['week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client_id', 'prod_id', 'saleunit_curr_wk', 'saleamt_curr_wk', 'retunit_next_week', 'retamt_next_wk', 'y_pred_demand']\n", 204 | "tic = time.time()\n", 205 | "train_data_samp = load_samp_data('train.csv', columns, load_pkl)\n", 206 | "toc = time.time()\n", 207 | "print '*********'\n", 208 | "print 'Time to load: ', toc-tic, 'sec'\n", 209 | "print \n", 210 | "print train_data_samp.describe()\n", 211 | "print '*********'\n", 212 | "print train_data_samp[['week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client_id', 'prod_id']]" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "## Preliminary analysis \n", 220 | "### 1. Linear Regression" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 4, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "----------\n", 235 | "Time: 0.0148620605469 RMSLE (LinearRegression): 0.938368243688\n", 236 | "----------\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "from sklearn import linear_model\n", 242 | "from sklearn.cross_validation import train_test_split\n", 243 | "features_train = train_data_samp[['week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client_id', 'prod_id']].values\n", 244 | "labels_train = train_data_samp[['y_pred_demand']].values\n", 245 | "\n", 246 | "# Split the data samples into train and test.\n", 247 | "X_train, X_test, y_train, y_test = train_test_split(features_train, labels_train, test_size=0.33, random_state=42)\n", 248 | "\n", 249 | "# Linear regression\n", 250 | "tic = time.time()\n", 251 | "clf = linear_model.LinearRegression()\n", 252 | "clf.fit(X_train, y_train)\n", 253 | "pred = clf.predict(X_test)\n", 254 | "pred[pred<0] = 0\n", 255 | "tac = time.time()\n", 256 | "print '----------'\n", 257 | "print 'Time:', tac-tic, 'RMSLE (LinearRegression):', rmsle(pred, y_test)\n", 258 | "print '----------'" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "### 2. 
Random Forest Classifier" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 6, 271 | "metadata": { 272 | "collapsed": true 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "# Utility function to report best scores\n", 277 | "def report(grid_scores, n_top=3):\n", 278 | " top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]\n", 279 | " for i, score in enumerate(top_scores):\n", 280 | " print(\"Model with rank: {0}\".format(i + 1))\n", 281 | " print(\"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n", 282 | " score.mean_validation_score,\n", 283 | " np.std(score.cv_validation_scores)))\n", 284 | " print(\"Parameters: {0}\".format(score.parameters))\n", 285 | " print(\"\")" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 7, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [ 295 | { 296 | "name": "stdout", 297 | "output_type": "stream", 298 | "text": [ 299 | "\n", 300 | "Model Report ********\n", 301 | "Accuracy : 0.7968\n", 302 | "\n", 303 | "Model Report ********\n", 304 | "\n", 305 | "RandomizedSearchCV took 150.64 seconds for 10 candidates parameter settings.\n", 306 | "Model with rank: 1\n", 307 | "Mean validation score: 0.242 (std: 0.003)\n", 308 | "Parameters: {'max_features': 5, 'max_depth': 10}\n", 309 | "\n", 310 | "Model with rank: 2\n", 311 | "Mean validation score: 0.241 (std: 0.003)\n", 312 | "Parameters: {'max_features': 6, 'max_depth': 10}\n", 313 | "\n", 314 | "Model with rank: 3\n", 315 | "Mean validation score: 0.241 (std: 0.004)\n", 316 | "Parameters: {'max_features': 6, 'max_depth': 10}\n", 317 | "\n", 318 | "0.241613333333\n", 319 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 320 | " max_depth=10, max_features=5, max_leaf_nodes=None,\n", 321 | " min_samples_leaf=1, min_samples_split=2,\n", 322 | " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", 323 | " oob_score=False, random_state=None, verbose=0,\n", 324 | " warm_start=False)\n" 325 | ] 326 | } 327 | ], 328 | "source": [ 329 | "from sklearn.ensemble import RandomForestClassifier\n", 330 | "from sklearn.grid_search import RandomizedSearchCV\n", 331 | "from scipy.stats import randint as sp_randint\n", 332 | "from operator import itemgetter\n", 333 | "\n", 334 | "clf = RandomForestClassifier(n_estimators=10)\n", 335 | "\n", 336 | "# specify parameters and distributions to sample from\n", 337 | "param_dist = {\"max_depth\": [10],\n", 338 | " \"max_features\": sp_randint(4, 7),\n", 339 | " }\n", 340 | "\n", 341 | "# run randomized search\n", 342 | "n_iter_search = 10\n", 343 | "random_search = RandomizedSearchCV(clf, param_distributions=param_dist,\n", 344 | " n_iter=n_iter_search, n_jobs=4, cv=5)\n", 345 | "start = time.time()\n", 346 | "random_search.fit(features_train, np.ravel(labels_train))\n", 347 | "predict = random_search.predict(features_train)\n", 348 | "print '\\nModel Report ********'\n", 349 | "print \"Accuracy : %.4g\" % rmsle(np.ravel(labels_train), predict)\n", 350 | "print '\\nModel Report ********'\n", 351 | "print\n", 352 | "print(\"RandomizedSearchCV took %.2f seconds for %d candidates\"\n", 353 | " \" parameter settings.\" % ((time.time() - start), n_iter_search))\n", 354 | "report(random_search.grid_scores_)\n", 355 | "print random_search.best_score_ \n", 356 | "print random_search.best_estimator_ " 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 8, 362 | "metadata": { 363 | "collapsed": false 364 | }, 365 | 
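`report` above iterates over `grid_scores_`, which newer scikit-learn releases replaced with the `cv_results_` dictionary. Below is a sketch of an equivalent summary against that API; it assumes a fitted search object such as `random_search` from the cell above:

```python
import numpy as np

def report_cv_results(cv_results, n_top=3):
    """Print the n_top parameter settings ranked by mean validation score."""
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(cv_results['rank_test_score'] == rank):
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                cv_results['mean_test_score'][idx],
                cv_results['std_test_score'][idx]))
            print("Parameters: {0}".format(cv_results['params'][idx]))
            print("")

# usage: report_cv_results(random_search.cv_results_)
```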
"outputs": [ 366 | { 367 | "data": { 368 | "text/plain": [ 369 | "" 370 | ] 371 | }, 372 | "execution_count": 8, 373 | "metadata": {}, 374 | "output_type": "execute_result" 375 | }, 376 | { 377 | "data": { 378 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXEAAAEGCAYAAACToKXdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFBVJREFUeJzt3X2QXXd93/H3R3ZkamjcgAfLlmupgxMIDMQhoIiBlAUX\nkCFBTNKCzTQQmjQqU4M7TFq7STPeJmWKZ/KMmyZuXE+hUzzBECxPAJsm3BLCk3iwQ4uEhMHCsrGM\nnwA/AEL+9o9zLK7Xd3evdu/u3d/yfs3c0T3n/M75fc+9u58993fOuUpVIUlq04ZpFyBJWjpDXJIa\nZohLUsMMcUlqmCEuSQ0zxCWpYYa4JDXMENdjJLklyYNJvpnkW/2/m5a5zRcmuXVSNY7Z51VJfms1\n+5xPkkuTvGPadWj9OXHaBWhNKuAVVfXhCW4z/XaXtnJyQlUdnWA9qybJCdOuQeuXR+KaT0bOTLYn\n+dsk9yb5XJIXDi37pSRf6I/cv5TkV/v5JwPvB84YPrKfe6Q892g9yVeS/LskNwH3J9mQ5PQk1yS5\nM8nNSd401s4kW5I83Nf41SR3J9mV5DlJbkpyT5K3D7V/fZKPJnl7kvv6/Xrx0PLTk1zbb2d/kl8Z\nWnZpkncneWeS+4B/Bfw68Jp+/z+30Os1/FokeUuSw0luS/JLQ8sfl+R3+09N9yb5SJKTxnyPbu77\nvDnJBeO8flrDqsqHj0c9gK8ALx4x/wzgLuBl/fS5/fST+unzgK39858BHgDO6adfCHx1zvauAn5r\naPpRbfo6Ptv3exLdH5ZPA78BnABsBb4EvGSe/Ti2fWAL8DDwx8BG4J8ADwHvBZ7U93EY+Jm+/euB\nI8Cb+75eDdwH/IN++UeAtwM/BPwEcCcw0y+7FPgO8HP99En9vHfMqW+x1+tIv94JfdsHgFP65f8F\n+GtgU/+6bO9rmfc9Ak4GvgGc3S87Dfjxaf+8+VjewyNxzed9/dHpPUne28/758BfVtX1AFX1V3Sh\n+vJ++gNVdUv//G+AG+jCaTn+sKpur6rvAM8FTq2qt1bV0b6vPwPOH3NbRRfq362q/00Xiu+qqrur\n6nbgb4CfHGp/uKr+qO/rz4EvAq9IcibwPODiqjpSVTf1dbxuaN2PV9V1AH3tjy1m8dfru8Bv9/1/\nALgfeGqSAG8A3lxVd1TnE1V1hEXeI+Ao8Mwkj6uqw1W1d8zXTmuUIa757KyqJ/aPn+/nbQFePRTu\n9wLPB04HSHJeko/3Qwz30h09nrrMOg4NPd8CbJ7T/78Hnnwc27tz6PlDdEffw9NPGJq+bc66B+mO\ndM8A7qmqB+cs2zw0vehJ3DFer7ur6uGh6Qf7+k6lO7r/8ojNzvse9fW+Bngj8LUk1yV56mJ1am3z\nxKbmM2pM/Fa6IYFdj2mcbASuoTsSvLaqHk7yF0PbGXVS8wG6j/iPOH1Em+H1bgW+XFWrFTyb50yf\nBVwL3A48Mcnjq+qBoWXDoT93fx81PcbrtZC7gG8DTwE+P2fZvO8RQFV9CPhQP37+VuC/Af94jD61\nRnkkruPxP4GfS/LS/iTj4/oTcGfQjTNvBO7qA+k84KVD6x4GnpTkh4fm3Qi8PMmPpLuE8aJF+v8U\n8K3+ZOfjkpyQ5BlJnjNm/eME5LAnJ3lTkhOT/DPgaXRDFYeAjwH/OclJSZ4F/DLwzgW2dRjY2g+F\nwOKv17yqqujG+3+vP8G6oT+Z+UMs8B4leXKSV6Y70XyEbnimySt+9H2GuEYZeSlgH1476a60+Drd\nEMKvARuq6n66k4DvTnIP3Tj1tUPrfhF4F/Dl/mP+JrrQ+zvgFuCDwNUL1dEPLfwscA7dSc876Y4k\nf5jxLHh0PGL6k8CP0h35/jbwC1V1X7/sAuAf0R2Vvwf4zVr4ksx30/0RuTvJp/vX6yLmeb3GqP/X\n6I7C9wB3A2+jex/mfY/6x1voPjHcRXcE/sZF+tQal+6P+iKNkh3AH9D9EFxZVZeNaDMD/D7dGfKv\nV9WLJluqtHqSvB745apyqEFr2qJj4kk2AJfTXap0O7AnybVVtW+ozSl0lzy9tKpuS7Lck1mSpDGM\nM5yyDThQVQf7S5iupvu4Nuy1wHuq6jaAqrprsmVKkkYZJ8Q38+jLpQ7x2LP2P0Z3tv7DSfYk+cVJ\nFShNQ1X9D4dS1IJJXWJ4IvBs4MXA44GPJ/l4VX1puFES/1dmSVqCqhp5ddU4R+K30V0D+4gzeexN\nEIeA66vq21V1N90tyT8xTyGr9rj00kunfkus++f+/aDtm/s3+cdCxgnxPcDZ6b5AaCPdpVC757S5\nFnhBf93uycBPA97OK0krbNHhlKo6muRCuu91eOQSw71JdnWL64qq2pfkerprfo8CV1TVF1a0cknS\neGPiVfVB4Klz5v3pnOnfAX5ncqUt38zMzLRLWFHuX7vW876B+7eaxrrZZ2KdJbWa/UnSepCEWsaJ\nTUnSGmWIS1LDDHFJapghLkkNM8QlqWFNhPimTVtJsmqPTZu2TnuXJWksTVxi2P1nKKt5aWIWvdVV\nklaLlxhK0jpliEtSwwxxSWqYIS5JDTPEJalhhrgkNcwQl6SGGeKS1DBDXJIaZohLUsMMcUlqmCEu\nSQ0zxCWpYYa4JDXMEJekhhniktQwQ1ySGmaIS1LDDHFJapghLkkNGyvEk+xIsi/J/iQXj1j+wiT3\nJfls//gPky9VkjTXiYs1SLIBuBw4F7gd2JPk2qraN6fpR6rqlStQoyRpHuMciW8DDlTVwao6AlwN\n7BzRLhOtTJK0qHFCfDNw69D0oX7eXM9LcmOSv0zy9IlUJ0la0KLDKWP6DHBWVT2Y5DzgfcCPjWo4\nOzt77PnMzAwzMzMTKkGS1ofBYMBgMBirbapq4QbJdmC2qnb005cAVVWXLbDOV4Cfqqp75syvxfqb\nZ3vA8a+3dGEpdUrSSkhCVY0csh5nOGUPcHaSLUk2AucDu+d0cNrQ8210fxzuQZK0ohYdTqmqo0ku\nBG6gC/0rq2pvkl3d4roC+KdJ3ggcAR4CXrOSRUuSOosOp0y0M4dTJOm4LXc4RZK0RhniktQwQ1yS\nGmaIS1LDDHFJapghLkkNM8QlqWGGuCQ1zBCXpIYZ4pLUMENckhpmiEtSwwxxSWqYIS5JDTPEJalh\nhrgkNcwQl6SGGeKS1DBDXJIaZohLUsMMcUlqmCEuSQ0zxCWpYYa4JDXMEJekhhniktQwQ1ySGmaI\nS1LDDHFJathYIZ5kR5J9SfYnuXiBds9NciTJz0+uREnSfBYN8SQbg
MuBlwHPAC5I8rR52r0NuH7S\nRUqSRhvnSHwbcKCqDlbVEeBqYOeIdm8CrgHunGB9kqQFjBPim4Fbh6YP9fOOSXIG8Kqq+q9AJlee\nJGkhJ05oO38ADI+Vzxvks7Ozx57PzMwwMzMzoRIkaX0YDAYMBoOx2qaqFm6QbAdmq2pHP30JUFV1\n2VCbLz/yFDgVeAD41araPWdbtVh/89QAHP96SxeWUqckrYQkVNXIg+NxQvwE4IvAucDXgE8BF1TV\n3nnaXwVcV1XvHbHMEJek47RQiC86nFJVR5NcCNxAN4Z+ZVXtTbKrW1xXzF1l2RVLksay6JH4RDvz\nSFySjttCR+LesSlJDTPEJalhhrgkNcwQl6SGGeKS1DBDXJIaZohLUsMMcUlqmCEuSQ0zxCWpYYa4\nJDXMEJekhhniktQwQ1ySGmaIS1LDDHFJapghLkkNM8QlqWGGuCQ1zBCXpIYZ4pLUMENckhpmiEtS\nwwxxSWqYIS5JDTPEJalhhrgkNcwQl6SGjRXiSXYk2Zdkf5KLRyx/ZZKbknwuyaeSPH/ypUqS5kpV\nLdwg2QDsB84Fbgf2AOdX1b6hNidX1YP982cCf15VPz5iW7VYf/PUABz/eksXllKnJK2EJFRVRi0b\n50h8G3Cgqg5W1RHgamDncINHArz3BODhpRYrSRrfOCG+Gbh1aPpQP+9RkrwqyV7gOuBfTKY8SdJC\nTpzUhqrqfcD7krwA+E/AS0a1m52dPfZ8ZmaGmZmZSZUgSevCYDBgMBiM1XacMfHtwGxV7einLwGq\nqi5bYJ2bgedW1T1z5jsmLknHablj4nuAs5NsSbIROB/YPaeDpww9fzawcW6AS5Imb9HhlKo6muRC\n4Aa60L+yqvYm2dUtriuAX0jyOuC7wEPAq1eyaElSZ9HhlIl25nCKJB235Q6nSJLWKENckhpmiEtS\nwwxxSWqYIS5JDTPEJalhhrgkNcwQl6SGGeKS1DBDXJIaZohLUsMMcUlqmCEuSQ0zxCWpYYa4JDXM\nEJekhhniktQwQ1ySGmaIS1LDDHFJapghLkkNM8QlqWGGuCQ1zBCXpIYZ4pLUMENckhpmiEtSwwxx\nSWrYWCGeZEeSfUn2J7l4xPLXJrmpf3w0yTMnX6okaa5U1cINkg3AfuBc4HZgD3B+Ve0barMd2FtV\n30iyA5itqu0jtlWL9TdPDcDxr7d0YSl1StJKSEJVZdSycY7EtwEHqupgVR0BrgZ2Djeoqk9U1Tf6\nyU8Am5dTsCRpPOOE+Gbg1qHpQywc0r8CfGA5RUmSxnPiJDeW5EXAG4AXzNdmdnb22POZmRlmZmYm\nWYIkNW8wGDAYDMZqO86Y+Ha6Me4d/fQlQFXVZXPaPQt4D7Cjqm6eZ1uOiUvScVrumPge4OwkW5Js\nBM4Hds/p4Cy6AP/F+QJckjR5iw6nVNXRJBcCN9CF/pVVtTfJrm5xXQH8JvBE4I/THTYfqaptK1m4\nJGmM4ZSJduZwiiQdt+UOp0iS1ihDXJIaZohLUsMMcUlqmCG+BmzatJUkq/bYtGnrtHdZ0oR4dcro\nHlf16pT1vn+SlserUyRpnTLEJalhhrgkNcwQl6SGGeKS1DBDXJIaZohLUsMMcUlqmCEuSQ0zxCWp\nYYa4JDXMEJekhhniktQwQ1ySGmaIS1LDDHFJapghLkkNM8QlqWGGuCQ1zBCXpIYZ4pLUMENckho2\nVogn2ZFkX5L9SS4esfypST6W5NtJ3jL5MiVJo5y4WIMkG4DLgXOB24E9Sa6tqn1Dze4G3gS8akWq\nlCSNNM6R+DbgQFUdrKojwNXAzuEGVXVXVX0G+N4K1ChJmsc4Ib4ZuHVo+lA/T5I0ZYsOp0za7Ozs\nseczMzPMzMysdgmStKYNBgMGg8FYbVNVCzdItgOzVbWjn74EqKq6bETbS4FvVdXvzbOtWqy/edYD\njn+9pQtLqXPJva3z/ZO0PEmoqoxaNs5wyh7g7CRbkmwEzgd2L9TfEmqUJC3BosMpVXU0yYXADXSh\nf2VV7U2yq1tcVyQ5Dfg08PeBh5NcBDy9qu5fyeIl6QfdosMpE+3M4ZTRva3z/du0aSuHDx9ctf5O\nO20Ld9xxy6r1J620hYZTDPHRPRrik+xtne+ftNKWOyYuSVqjDHFJapghLkkNM8QlqWGGuCQ1zBCX\npIYZ4tIybdq0lSSr8ti0aeu0d1drjNeJj+7R66gn2Zv7N8nevFHrB5A3+xx/j+s4BMD9m3Bv6zjE\n1/t71wpv9pGkdcoQl6SGGeKS1DBDXJIaZohLUsMMcUlqmCEuSQ0zxCWpYYa4JDXMEJekhhniktQw\nQ1zSD6zV/AbKlfoWSr8Aa3SPfsnQJHtz/ybZ2zreN3D/5lnLL8CSpPXJEJekhhniktQwQ1ySGmaI\nS1LDxgrxJDuS7EuyP8nF87T5oyQHktyY5JzJlrlUg2kXsMIG0y5ghQ2mXcAKGky7gBU2mHYBK2ww\n7QKOWTTEk2wALgdeBjwDuCDJ0+a0OQ94SlX9KLAL+JMVqHUJBtMuYIUNpl3AChtMu4AVNJh2ASts\nMO0CVthg2gUcM86R+DbgQFUdrKojwNXAzjltdgLvAKiqTwKnJDltopVKkh5jnBDfDNw6NH2on7dQ\nm9tGtJEkTdiJq91hd4fUktZc4nr/cWm9LbnOpXL/RlvP+7ee9w3cv3l6m/D+jRPitwFnDU2f2c+b\n2+YfLtJm3ttGJUlLM85wyh7g7CRbkmwEzgd2z2mzG3gdQJLtwH1VdXiilUqSHmPRI/GqOprkQuAG\nutC/sqr2JtnVLa4rqur9SV6e5EvAA8AbVrZsSRKs8rcYSpImyzs2JalhhrgkNWzVLzFcLUleQHej\n0v+tqhumXc8kJNlGdx5iT5KnAzuAfVX1/imXNnFJ3lFVr5t2HRpPfxf3ZuCTVXX/0PwdVfXB6VU2\nGf3+7eT797/cBuyuqr3Tq6qzbsbEk3yqqrb1z/8l8K+BvwBeClxXVW+bZn3LleRS4Dy6P7wfAn4a\n+DDwEuD6qnrrFMtbliRzr3YK8CLgrwGq6pWrXtQqSvKGqrpq2nUsVZI30/2+7QXOAS6qqmv7ZZ+t\nqmdPs77l6r8v6gK6u9UP9bPPpLtS7+ppZ8t6CvHPVdVP9s/3AC+vqq8neTzwiap65nQrXJ4kn6f7\nBTkJuAM4s6q+meTv0R39PGuqBS5Dks8CXwD+jO7/ygrwLrpfEqrq/0yvupWX5KtVddbiLdem/mfz\neVV1f5KtwDXAO6vqD4d/L1uVZD/wjP5rR4bnbwT+X/+dUVOznoZTNiT5Ebpx/hOq6usAVfVAku9N\nt7SJ+F5VHQUeTHJzVX0ToKoeSvLwlGtbrucAFwG/AfzbqroxyUPrKbyT/N18i4DWv2dowyNDKFV1\nS5IZ4JokW1j67ZBrycPAGcDBOfNP75dN1XoK8VOAz9D90FSS06vqa0mewPr4QfpukpOr6kHgpx6Z\nmeQU1sAP0nJU1cPA7yd5
d//vYdbXzyZ0Qf0y4N458wN8bPXLmajDSc6pqhsB+iPynwX+O9D0J+De\nvwH+KskBvv8dUWcBZwMXTq2q3roZTplPkpOB06rqK9OuZTmSnFRV3xkx/1Tg9Kr6/BTKWhFJXgE8\nv6p+fdq1TEqSK4GrquqjI5b9r6p67RTKmogkZ9J9UrxjxLLnV9XfTqGsieq/knsbjz6xuaf/dDxV\n6z7EJWk98zpxSWqYIS5JDTPEJalhhrgkNez/A6lE5/ZXfhvHAAAAAElFTkSuQmCC\n", 379 | "text/plain": [ 380 | "" 381 | ] 382 | }, 383 | "metadata": {}, 384 | "output_type": "display_data" 385 | } 386 | ], 387 | "source": [ 388 | "feat_imp = pd.Series(random_search.best_estimator_.feature_importances_).sort_values(ascending=False)\n", 389 | "feat_imp.plot(kind='bar', title='Feature Importances')" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": { 395 | "collapsed": false 396 | }, 397 | "source": [ 398 | "### 3. Gradient Boosting" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 9, 404 | "metadata": { 405 | "collapsed": false 406 | }, 407 | "outputs": [], 408 | "source": [ 409 | "import xgboost as xgb\n", 410 | "from xgboost.sklearn import XGBClassifier\n", 411 | "from matplotlib.pylab import rcParams\n", 412 | "rcParams['figure.figsize'] = 12, 4\n", 413 | "from sklearn import metrics" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 12, 419 | "metadata": { 420 | "collapsed": true 421 | }, 422 | "outputs": [], 423 | "source": [ 424 | "def modelfit(alg, Xtrain, ytrain, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):\n", 425 | " if useTrainCV:\n", 426 | " xgb_param = alg.get_xgb_params()\n", 427 | " xgtrain = xgb.DMatrix(Xtrain, label=ytrain)\n", 428 | " print alg.get_params()['n_estimators']\n", 429 | " cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round = alg.get_params()['n_estimators'], early_stopping_rounds=early_stopping_rounds)\n", 430 | " alg.set_params(n_estimators=cvresult.shape[0])\n", 431 | " alg.fit(Xtrain, ytrain, eval_metric='auc')\n", 432 | " predict = alg.predict(Xtrain)\n", 433 | " return predict" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "## Step 1 Fix learning rate and number of estimators for tuning tree-based parameters" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 11, 446 | "metadata": { 447 | "collapsed": false 448 | }, 449 | "outputs": [ 450 | { 451 | "name": "stdout", 452 | "output_type": "stream", 453 | "text": [ 454 | "100\n" 455 | ] 456 | }, 457 | { 458 | "ename": "KeyboardInterrupt", 459 | "evalue": "", 460 | "output_type": "error", 461 | "traceback": [ 462 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 463 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 464 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 11\u001b[0m seed=27)\n\u001b[0;32m 12\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 13\u001b[1;33m \u001b[0mpredict\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmodelfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mxgb1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeatures_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabels_train\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 465 | "\u001b[1;32m\u001b[0m in \u001b[0;36mmodelfit\u001b[1;34m(alg, Xtrain, ytrain, useTrainCV, cv_folds, early_stopping_rounds)\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[0mcvresult\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mxgb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mxgb_param\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mxgtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnum_boost_round\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0malg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_params\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'n_estimators'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mearly_stopping_rounds\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mearly_stopping_rounds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0malg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_params\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mn_estimators\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcvresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 8\u001b[1;33m \u001b[0malg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mXtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mytrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0meval_metric\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'auc'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 9\u001b[0m \u001b[0mpredict\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0malg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mXtrain\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mpredict\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 466 | "\u001b[1;32m/home/ubuntu/anaconda2/lib/python2.7/site-packages/xgboost-0.4-py2.7.egg/xgboost/sklearn.pyc\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose)\u001b[0m\n\u001b[0;32m 440\u001b[0m \u001b[0mearly_stopping_rounds\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mearly_stopping_rounds\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 441\u001b[0m \u001b[0mevals_result\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mevals_result\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeval\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mfeval\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 442\u001b[1;33m verbose_eval=verbose)\n\u001b[0m\u001b[0;32m 443\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 444\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobjective\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mxgb_options\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"objective\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 467 | "\u001b[1;32m/home/ubuntu/anaconda2/lib/python2.7/site-packages/xgboost-0.4-py2.7.egg/xgboost/training.pyc\u001b[0m in \u001b[0;36mtrain\u001b[1;34m(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, learning_rates, xgb_model, callbacks)\u001b[0m\n\u001b[0;32m 203\u001b[0m \u001b[0mevals\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mevals\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 204\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeval\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mfeval\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 
205\u001b[1;33m xgb_model=xgb_model, callbacks=callbacks)\n\u001b[0m\u001b[0;32m 206\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 468 | "\u001b[1;32m/home/ubuntu/anaconda2/lib/python2.7/site-packages/xgboost-0.4-py2.7.egg/xgboost/training.pyc\u001b[0m in \u001b[0;36m_train_internal\u001b[1;34m(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)\u001b[0m\n\u001b[0;32m 74\u001b[0m \u001b[1;31m# Skip the first update if it is a recovery step.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 75\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mversion\u001b[0m \u001b[1;33m%\u001b[0m \u001b[1;36m2\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 76\u001b[1;33m \u001b[0mbst\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 77\u001b[0m \u001b[0mbst\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msave_rabit_checkpoint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 78\u001b[0m \u001b[0mversion\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 469 | "\u001b[1;32m/home/ubuntu/anaconda2/lib/python2.7/site-packages/xgboost-0.4-py2.7.egg/xgboost/core.pyc\u001b[0m in \u001b[0;36mupdate\u001b[1;34m(self, dtrain, iteration, fobj)\u001b[0m\n\u001b[0;32m 804\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 805\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mfobj\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 806\u001b[1;33m \u001b[0m_check_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_LIB\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mXGBoosterUpdateOneIter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhandle\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0miteration\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtrain\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhandle\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 807\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 808\u001b[0m \u001b[0mpred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 470 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: " 471 | ] 472 | } 473 | ], 474 | "source": [ 475 | "xgb1 = XGBClassifier(\n", 476 | " learning_rate =0.05,\n", 477 | " n_estimators=100,\n", 478 | " max_depth=15,\n", 479 | " min_child_weight=4,\n", 480 | " gamma=0,\n", 481 | " subsample=0.8,\n", 482 | " colsample_bytree=0.8,\n", 483 | " objective= 'reg:linear',\n", 484 | " scale_pos_weight=1,\n", 485 | " seed=27)\n", 486 | "\n", 487 | "predict = modelfit(xgb1, features_train, np.ravel(labels_train))" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": { 494 | "collapsed": false 495 | }, 496 | "outputs": [], 497 | "source": [ 498 | "#print model report:\n", 499 | "print '\\nModel Report ********'\n", 500 | "print \"Accuracy : %.4g\" % rmsle(np.ravel(labels_train), predict)\n", 501 | "print '\\nModel Report ********'\n", 502 | 
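`modelfit` above uses `xgb.cv` with early stopping to pick `n_estimators` before the final fit, but it pairs `XGBClassifier` with `objective='reg:linear'` and an `'auc'` eval metric. For a demand target scored by RMSLE, a more consistent setup is a regressor trained on `log1p(demand)` with an `'rmse'` metric, since RMSE in log space is RMSLE. A sketch under that assumption (function and variable names are hypothetical; the data comes from the loading cell above):

```python
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

def fit_with_cv(reg, X, y, cv_folds=5, early_stopping_rounds=50):
    """Use xgb.cv to choose n_estimators, then refit on the full data."""
    dtrain = xgb.DMatrix(X, label=y)
    cv_result = xgb.cv(reg.get_xgb_params(), dtrain,
                       num_boost_round=reg.get_params()['n_estimators'],
                       nfold=cv_folds, metrics='rmse',
                       early_stopping_rounds=early_stopping_rounds)
    reg.set_params(n_estimators=cv_result.shape[0])
    reg.fit(X, y)
    return reg

# Train on log1p(demand); predictions are mapped back with expm1.
y_log = np.log1p(np.ravel(labels_train))
xgb_reg = XGBRegressor(learning_rate=0.05, n_estimators=100, max_depth=10,
                       min_child_weight=4, subsample=0.8,
                       colsample_bytree=0.8, objective='reg:linear', seed=27)
xgb_reg = fit_with_cv(xgb_reg, features_train, y_log)

pred = np.clip(np.expm1(xgb_reg.predict(features_train)), 0, None)
print(rmsle(np.ravel(labels_train), pred))
```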
"feat_imp = pd.Series(xgb1.booster().get_fscore()).sort_values(ascending=False)\n", 503 | "feat_imp.plot(kind='bar', title='Feature Importances')\n", 504 | "plt.ylabel('Feature Importance Score')" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": { 510 | "collapsed": false 511 | }, 512 | "source": [ 513 | "## Step 2: Tune max_depth and min_child_weight" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": { 520 | "collapsed": false 521 | }, 522 | "outputs": [], 523 | "source": [ 524 | "from sklearn.grid_search import GridSearchCV\n", 525 | "param_test1 = {\n", 526 | " 'max_depth':range(3,10,2),\n", 527 | " 'min_child_weight':range(1,6,2)\n", 528 | "}\n", 529 | "gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=100, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1, seed=27), param_grid = param_test1, scoring='roc_auc', n_jobs=4,iid=False)\n", 530 | "gsearch1.fit(features_train,np.ravel(labels_train))\n", 531 | "gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": {}, 537 | "source": [ 538 | "## Load Test Data" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": 59, 544 | "metadata": { 545 | "collapsed": false 546 | }, 547 | "outputs": [ 548 | { 549 | "name": "stdout", 550 | "output_type": "stream", 551 | "text": [ 552 | " week_num sales_depot_id sales_chan_id route_id client_id prod_id\n", 553 | "0 11 4037 1 2209 4639078 35305\n", 554 | "1 11 2237 1 1226 4705135 1238\n", 555 | "2 10 2045 1 2831 4549769 32940\n", 556 | "3 11 1227 1 4448 4717855 43066\n", 557 | "4 11 1219 1 1130 966351 1277\n" 558 | ] 559 | } 560 | ], 561 | "source": [ 562 | "test_data = pd.read_csv('test.csv')\n", 563 | "test_data.columns = ['id', 'week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client_id', 'prod_id']\n", 564 | "test_labels = pd.read_csv('sample_submission.csv')\n", 565 | "test_data = test_data.drop('id', 1)\n", 566 | "print test_data.head()" 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "metadata": {}, 572 | "source": [ 573 | "## Submission" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": { 580 | "collapsed": false 581 | }, 582 | "outputs": [ 583 | { 584 | "name": "stdout", 585 | "output_type": "stream", 586 | "text": [ 587 | "0 0.000306844711304\n", 588 | "100000 72.7710800171\n", 589 | "200000 145.477372885\n", 590 | "300000 218.045004845\n", 591 | "400000 290.581101894\n", 592 | "500000 363.356217861\n", 593 | "600000 436.15059185\n", 594 | "700000 509.013458014\n", 595 | "800000 581.852082014\n", 596 | "900000 654.530701876\n", 597 | "1000000 727.205648899\n", 598 | "1100000 800.080030918\n", 599 | "1200000 872.742459059\n", 600 | "1300000 945.787395\n", 601 | "1400000 1018.40329695\n", 602 | "1500000 1090.79495907\n", 603 | "1600000 1163.17800498\n", 604 | "1700000 1235.77449584\n", 605 | "1800000 1308.24206305\n" 606 | ] 607 | } 608 | ], 609 | "source": [ 610 | "Xtest = test_data[['week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client_id', 'prod_id']].values\n", 611 | "y_pred = []\n", 612 | "tic = time.time()\n", 613 | "for ipred in xrange(len(Xtest)):\n", 614 | " if ipred%10e4 == 0:\n", 615 | " print ipred, ' ', time.time()-tic\n", 616 | " y_pred.append(max(0, random_search.predict(Xtest[ipred,:])[0]))\n", 617 | "sub_dict = {'Demanda_uni_equil': 
np.ravel(y_pred)}\n", 618 | "sub_df = pd.DataFrame(sub_dict)\n", 619 | "sub_df.to_csv('sample_submission.csv', sep='\\t')" 620 | ] 621 | } 622 | ], 623 | "metadata": { 624 | "kernelspec": { 625 | "display_name": "Python 2", 626 | "language": "python", 627 | "name": "python2" 628 | }, 629 | "language_info": { 630 | "codemirror_mode": { 631 | "name": "ipython", 632 | "version": 2 633 | }, 634 | "file_extension": ".py", 635 | "mimetype": "text/x-python", 636 | "name": "python", 637 | "nbconvert_exporter": "python", 638 | "pygments_lexer": "ipython2", 639 | "version": "2.7.11" 640 | } 641 | }, 642 | "nbformat": 4, 643 | "nbformat_minor": 0 644 | } 645 | -------------------------------------------------------------------------------- /main_feature_engine.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Model to forecast inventory demand based on historical sales data. " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%matplotlib inline\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "from scipy import stats\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "import time\n", 24 | "import random\n", 25 | "import pickle\n", 26 | "import math\n", 27 | "import warnings\n", 28 | "warnings.filterwarnings(\"ignore\")" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Model accuracy is RMSLE" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "def rmsle(y, y_pred):\n", 47 | " assert len(y) == len(y_pred)\n", 48 | " terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]\n", 49 | " return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Load Training Data \n", 57 | "The size of the training data is quite large (~4 GB). Large datasets require significant amount of memory to process. Instead, we will sample the data randomly for our initial data analysis and visualization. 
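The submission loop above calls `predict` once per test row, which is why the timing log climbs past 1300 seconds before covering the 1.8M rows. Scikit-learn estimators predict whole matrices at once, so a single vectorised call produces the same values far faster. A sketch assuming the fitted `random_search` and the `test_data`/`test_labels` frames loaded earlier, and assuming `sample_submission.csv` carries the `id` column alongside `Demanda_uni_equil`:

```python
import numpy as np
import pandas as pd

Xtest = test_data[['week_num', 'sales_depot_id', 'sales_chan_id',
                   'route_id', 'client_id', 'prod_id']].values

# One vectorised call instead of a Python loop over every row.
y_pred = np.clip(random_search.predict(Xtest), 0, None)

submission = pd.DataFrame({'id': test_labels['id'],
                           'Demanda_uni_equil': y_pred})
submission[['id', 'Demanda_uni_equil']].to_csv('submission.csv', index=False)
```

Writing to a separate `submission.csv` also avoids overwriting the original sample file.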
" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": { 64 | "collapsed": false, 65 | "scrolled": true 66 | }, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "*********\n", 73 | "Time to load: 75.1288080215 sec\n", 74 | "\n", 75 | " week_num sales_depot_id sales_chan_id route_id \\\n", 76 | "count 75000.000000 75000.000000 75000.000000 75000.000000 \n", 77 | "mean 5.982800 2761.836040 1.378893 2114.386307 \n", 78 | "std 2.027004 4603.625646 1.455427 1492.045185 \n", 79 | "min 3.000000 1110.000000 1.000000 1.000000 \n", 80 | "25% 4.000000 1312.000000 1.000000 1161.000000 \n", 81 | "50% 6.000000 1614.000000 1.000000 1283.000000 \n", 82 | "75% 8.000000 2040.000000 1.000000 2802.000000 \n", 83 | "max 9.000000 25759.000000 11.000000 9935.000000 \n", 84 | "\n", 85 | " client_id prod_id saleunit_curr_wk saleamt_curr_wk \\\n", 86 | "count 7.500000e+04 75000.000000 75000.000000 75000.000000 \n", 87 | "mean 1.798237e+06 20879.108587 7.275120 67.941156 \n", 88 | "std 1.832623e+06 18659.089843 19.898198 258.362483 \n", 89 | "min 1.050000e+02 72.000000 0.000000 0.000000 \n", 90 | "25% 3.525328e+05 1242.000000 2.000000 16.760000 \n", 91 | "50% 1.192627e+06 30549.000000 3.000000 30.000000 \n", 92 | "75% 2.370167e+06 37519.000000 7.000000 56.580000 \n", 93 | "max 1.169326e+07 49994.000000 1920.000000 15724.800000 \n", 94 | "\n", 95 | " retunit_next_week retamt_next_wk y_pred_demand \n", 96 | "count 75000.000000 75000.000000 75000.00000 \n", 97 | "mean 0.130547 1.179629 7.19192 \n", 98 | "std 3.653853 16.583161 19.76362 \n", 99 | "min 0.000000 0.000000 0.00000 \n", 100 | "25% 0.000000 0.000000 2.00000 \n", 101 | "50% 0.000000 0.000000 3.00000 \n", 102 | "75% 0.000000 0.000000 6.00000 \n", 103 | "max 880.000000 1882.580000 1920.00000 \n", 104 | "*********\n", 105 | " week_num sales_depot_id sales_chan_id route_id client_id prod_id\n", 106 | "603469 3.0 1122.0 1.0 1072.0 80777.0 1109.0\n", 107 | "616160 3.0 1122.0 1.0 1225.0 2404123.0 1284.0\n", 108 | "900348 3.0 1126.0 1.0 1439.0 2344751.0 1240.0\n", 109 | "265531 3.0 1117.0 1.0 1274.0 58433.0 1109.0\n", 110 | "980436 3.0 1129.0 2.0 91.0 20626.0 34469.0\n", 111 | "374339 3.0 1118.0 1.0 1436.0 174077.0 1238.0\n", 112 | "809719 3.0 1124.0 1.0 1487.0 1269824.0 1230.0\n", 113 | "576696 3.0 1121.0 1.0 2110.0 4102682.0 36745.0\n", 114 | "401710 3.0 1119.0 1.0 1252.0 61120.0 1212.0\n", 115 | "547145 3.0 1121.0 1.0 1436.0 833293.0 1242.0\n", 116 | "911606 3.0 1127.0 1.0 1001.0 427990.0 1125.0\n", 117 | "876703 3.0 1126.0 1.0 1412.0 694607.0 1212.0\n", 118 | "927400 3.0 1127.0 1.0 1203.0 433950.0 1232.0\n", 119 | "678649 3.0 1123.0 1.0 1208.0 4381733.0 31423.0\n", 120 | "849482 3.0 1126.0 1.0 1202.0 1048736.0 1240.0\n", 121 | "853790 3.0 1126.0 1.0 1207.0 694019.0 41938.0\n", 122 | "389931 3.0 1119.0 1.0 1073.0 413854.0 1146.0\n", 123 | "62658 3.0 1111.0 1.0 2110.0 61646.0 36748.0\n", 124 | "135061 3.0 1113.0 1.0 1402.0 18505.0 32819.0\n", 125 | "312260 3.0 1117.0 1.0 1604.0 311478.0 40217.0\n", 126 | "305331 3.0 1117.0 1.0 1480.0 4182885.0 1212.0\n", 127 | "194110 3.0 1116.0 1.0 1458.0 4181549.0 1284.0\n", 128 | "617067 3.0 1122.0 1.0 1228.0 4389655.0 1278.0\n", 129 | "546132 3.0 1121.0 1.0 1433.0 885310.0 1242.0\n", 130 | "834210 3.0 1126.0 1.0 1022.0 595177.0 1125.0\n", 131 | "47702 3.0 1111.0 1.0 1620.0 4353154.0 4259.0\n", 132 | "677824 3.0 1123.0 1.0 1208.0 409837.0 35651.0\n", 133 | "724090 3.0 1123.0 1.0 1456.0 1123212.0 1242.0\n", 134 | "972101 3.0 1127.0 1.0 4450.0 
4427233.0 43285.0\n", 135 | "545002 3.0 1121.0 1.0 1432.0 32683.0 1278.0\n", 136 | "... ... ... ... ... ... ...\n", 137 | "101366 9.0 23899.0 1.0 4501.0 7897208.0 1309.0\n", 138 | "27435 9.0 23669.0 1.0 1110.0 250172.0 1309.0\n", 139 | "174774 9.0 25759.0 1.0 2122.0 1090879.0 31471.0\n", 140 | "5244 9.0 22560.0 1.0 1260.0 2126508.0 3270.0\n", 141 | "42172 9.0 23669.0 1.0 2832.0 892995.0 37361.0\n", 142 | "33660 9.0 23669.0 1.0 1222.0 436729.0 1250.0\n", 143 | "68348 9.0 23719.0 1.0 2813.0 8012178.0 37058.0\n", 144 | "31810 9.0 23669.0 1.0 1220.0 246772.0 1250.0\n", 145 | "110113 9.0 24049.0 1.0 1205.0 1895047.0 41938.0\n", 146 | "87675 9.0 23899.0 1.0 1235.0 2067518.0 1240.0\n", 147 | "180384 9.0 25759.0 1.0 5517.0 4358772.0 43159.0\n", 148 | "51471 9.0 23719.0 1.0 1166.0 153159.0 1220.0\n", 149 | "93783 9.0 23899.0 1.0 2810.0 2210613.0 30531.0\n", 150 | "52075 9.0 23719.0 1.0 1166.0 4643704.0 1687.0\n", 151 | "156596 9.0 25699.0 1.0 2007.0 2023508.0 36610.0\n", 152 | "47824 9.0 23719.0 1.0 1063.0 506341.0 1109.0\n", 153 | "155877 9.0 25699.0 1.0 2006.0 655030.0 37058.0\n", 154 | "178431 9.0 25759.0 1.0 5507.0 54143.0 40886.0\n", 155 | "33386 9.0 23669.0 1.0 1221.0 4223242.0 1309.0\n", 156 | "47828 9.0 23719.0 1.0 1063.0 506363.0 1109.0\n", 157 | "24807 9.0 23669.0 1.0 1011.0 250102.0 1125.0\n", 158 | "108525 9.0 24049.0 1.0 1203.0 2491991.0 1242.0\n", 159 | "134417 9.0 24669.0 1.0 1201.0 4247222.0 43118.0\n", 160 | "149301 9.0 24669.0 1.0 5505.0 4592929.0 36598.0\n", 161 | "131958 9.0 24669.0 1.0 1054.0 186848.0 31393.0\n", 162 | "103212 9.0 23899.0 1.0 5006.0 2501968.0 31031.0\n", 163 | "94955 9.0 23899.0 1.0 2812.0 1752067.0 5000.0\n", 164 | "140145 9.0 24669.0 1.0 1213.0 442488.0 36711.0\n", 165 | "3153 9.0 22560.0 1.0 1230.0 158892.0 1242.0\n", 166 | "175686 9.0 25759.0 1.0 2128.0 4554772.0 31586.0\n", 167 | "\n", 168 | "[75000 rows x 6 columns]\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "def load_samp_data(filename='train.csv', columns=[], load_pkl=1):\n", 174 | " \"\"\" \n", 175 | " Function returns a dataframe containing the training data sampled randomly. 
\n", 176 | " The data is also stored in a pickle file for later processing.\n", 177 | " \"\"\"\n", 178 | " if load_pkl:\n", 179 | " inputfile = open('train_samp_data.pkl', 'rb')\n", 180 | " data = pickle.load(inputfile)\n", 181 | " inputfile.close()\n", 182 | " return data\n", 183 | " \n", 184 | " chunksize= 10 ** 6\n", 185 | " datasize = 74180464 #datasize = sum(1 for line in open(filename)) - 1 #number of records in file (excludes header)\n", 186 | " samplesize = 10 ** 3 # samples per chunk of data read from the file.\n", 187 | " \n", 188 | " data = pd.DataFrame([],columns=columns)\n", 189 | " chunks = pd.read_csv(filename, iterator=True, chunksize=chunksize)\n", 190 | " for chunk in chunks:\n", 191 | " chunk.columns = columns\n", 192 | " data = data.append(chunk.sample(samplesize)) \n", 193 | " \n", 194 | " # write data to a pickle file.\n", 195 | " outputfile = open('train_samp_data.pkl','wb')\n", 196 | " pickle.dump(data,outputfile)\n", 197 | " outputfile.close()\n", 198 | " \n", 199 | " return data\n", 200 | " \n", 201 | "load_pkl = 0\n", 202 | "columns = ['week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client_id', 'prod_id', 'saleunit_curr_wk', 'saleamt_curr_wk', 'retunit_next_week', 'retamt_next_wk', 'y_pred_demand']\n", 203 | "tic = time.time()\n", 204 | "train_data_samp = load_samp_data('train.csv', columns, load_pkl)\n", 205 | "toc = time.time()\n", 206 | "print '*********'\n", 207 | "print 'Time to load: ', toc-tic, 'sec'\n", 208 | "print \n", 209 | "print train_data_samp.describe()\n", 210 | "print '*********'\n", 211 | "print train_data_samp[['week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client_id', 'prod_id']]\n", 212 | "\n", 213 | "features_train = train_data_samp[['week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client_id', 'prod_id']].values\n", 214 | "labels_train_sale = train_data_samp[['saleunit_curr_wk']].values\n", 215 | "labels_train_return = train_data_samp[['retunit_next_week']].values\n", 216 | "labels_train = train_data_samp[['y_pred_demand']].values" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "## Feature Engineering \n" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 33, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [ 233 | { 234 | "data": { 235 | "text/html": [ 236 | "
\n", 237 | "\n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 
510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | 
" \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " 
\n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | "
week_numsales_depot_idsales_chan_idroute_idsaleunit_curr_wksaleamt_curr_wkretunit_next_weekretamt_next_wky_pred_demand
client_idprod_id
105.043065.04.02061.02.07222.033.0163.680.00.033.0
791.035651.09.01227.01.01263.01.07.500.00.01.0
1311.031717.05.02095.011.03911.012.072.000.00.012.0
2003.043285.09.02012.01.02006.08.042.240.00.08.0
2025.01242.04.02012.01.01159.05.038.200.00.05.0
2056.037058.09.02012.01.02804.01.07.500.00.01.0
2059.01232.05.02012.01.01157.06.0109.440.00.06.0
2073.043066.05.02012.01.02006.01.09.630.00.01.0
2098.01242.05.02012.01.01159.02.015.280.00.02.0
2100.043285.08.02012.01.02006.05.026.400.00.05.0
2102.040886.07.02012.01.02803.02.011.520.00.02.0
2109.09217.04.02012.01.05012.02.018.000.00.02.0
2136.043147.05.02012.01.02803.04.018.160.00.04.0
2142.01250.07.02012.01.01162.09.068.760.00.09.0
2147.01125.04.02012.01.01073.01.09.600.00.01.0
2170.043069.05.02012.01.02005.01.07.410.00.01.0
2182.032819.07.02012.01.01162.02.017.780.00.02.0
2241.01250.09.02012.01.01159.05.038.200.00.05.0
2266.073.09.02012.01.01058.01.021.320.00.01.0
2269.01220.03.02012.01.01158.02.015.280.00.02.0
2291.04270.09.02012.01.05012.01.08.380.00.01.0
2386.01146.06.02012.01.01073.02.042.780.00.02.0
35651.04.02012.01.01173.05.037.500.00.05.0
2400.01250.08.02012.01.01173.02.015.280.00.02.0
2408.031423.06.02012.01.01158.03.032.070.00.03.0
2464.01109.09.02012.01.01074.01.015.010.00.01.0
2467.01232.04.02012.01.01174.06.0109.440.00.06.0
2486.01146.05.02012.01.01073.07.0149.730.00.07.0
1278.07.02012.01.01174.019.085.500.00.019.0
2488.043285.06.02012.01.02013.010.052.800.00.010.0
.................................
9725161.01242.09.01236.01.01121.08.061.120.00.08.0
9725591.030532.07.01337.01.02111.08.062.240.00.08.0
9725945.0972.07.01616.01.01147.02.037.960.00.02.0
9732294.036093.03.02024.01.01604.010.037.000.00.010.0
9733562.034204.09.01464.02.01529.017.0316.880.00.017.0
9733592.01220.05.01616.01.04531.02.015.280.00.02.0
9733759.01250.04.01312.01.01120.011.084.040.00.011.0
9733973.043342.04.01380.02.01603.078.01343.160.00.078.0
9741737.01278.04.01617.01.01208.010.045.000.00.010.0
9745917.037058.08.01617.01.04405.00.00.001.07.50.0
9746013.041938.05.01617.01.01208.01.09.910.00.01.0
9746335.043058.08.01617.01.02105.06.056.100.00.06.0
9746451.032934.07.01616.01.02812.03.021.120.00.03.0
9746583.01238.09.02238.01.01210.02.019.660.00.02.0
9746982.035307.07.01679.07.05713.010.062.500.00.010.0
9754172.03631.07.01336.01.01070.02.032.700.00.02.0
9765754.03270.05.02256.04.06608.02.025.420.00.02.0
9765766.037005.03.02256.04.06605.08.068.000.00.08.0
9770675.02233.07.02214.01.01001.016.0319.040.00.016.0
9780571.037058.08.01475.04.04709.015.0113.700.00.015.0
9780679.037005.06.022362.04.06612.033.0280.500.00.033.0
9838234.030314.08.02655.02.04177.048.0887.520.00.048.0
9853756.037058.05.04061.02.02579.00.00.0010.076.00.0
9881395.032303.09.01473.04.04803.010.059.200.00.010.0
9891352.037058.06.01544.04.06607.05.037.900.00.05.0
9982898.031514.08.01588.02.01534.014.0163.100.00.014.0
10303963.01230.04.02023.01.01105.03.050.010.00.03.0
1242.03.02023.01.01105.015.0114.600.00.015.0
10351814.01240.09.01618.01.01131.010.084.000.00.010.0
11693264.01109.03.01945.01.01055.014.0210.140.00.014.0
\n", 1059 | "

74832 rows × 9 columns

\n", 1060 | "
" 1061 | ], 1062 | "text/plain": [ 1063 | " week_num sales_depot_id sales_chan_id route_id \\\n", 1064 | "client_id prod_id \n", 1065 | "105.0 43065.0 4.0 2061.0 2.0 7222.0 \n", 1066 | "791.0 35651.0 9.0 1227.0 1.0 1263.0 \n", 1067 | "1311.0 31717.0 5.0 2095.0 11.0 3911.0 \n", 1068 | "2003.0 43285.0 9.0 2012.0 1.0 2006.0 \n", 1069 | "2025.0 1242.0 4.0 2012.0 1.0 1159.0 \n", 1070 | "2056.0 37058.0 9.0 2012.0 1.0 2804.0 \n", 1071 | "2059.0 1232.0 5.0 2012.0 1.0 1157.0 \n", 1072 | "2073.0 43066.0 5.0 2012.0 1.0 2006.0 \n", 1073 | "2098.0 1242.0 5.0 2012.0 1.0 1159.0 \n", 1074 | "2100.0 43285.0 8.0 2012.0 1.0 2006.0 \n", 1075 | "2102.0 40886.0 7.0 2012.0 1.0 2803.0 \n", 1076 | "2109.0 9217.0 4.0 2012.0 1.0 5012.0 \n", 1077 | "2136.0 43147.0 5.0 2012.0 1.0 2803.0 \n", 1078 | "2142.0 1250.0 7.0 2012.0 1.0 1162.0 \n", 1079 | "2147.0 1125.0 4.0 2012.0 1.0 1073.0 \n", 1080 | "2170.0 43069.0 5.0 2012.0 1.0 2005.0 \n", 1081 | "2182.0 32819.0 7.0 2012.0 1.0 1162.0 \n", 1082 | "2241.0 1250.0 9.0 2012.0 1.0 1159.0 \n", 1083 | "2266.0 73.0 9.0 2012.0 1.0 1058.0 \n", 1084 | "2269.0 1220.0 3.0 2012.0 1.0 1158.0 \n", 1085 | "2291.0 4270.0 9.0 2012.0 1.0 5012.0 \n", 1086 | "2386.0 1146.0 6.0 2012.0 1.0 1073.0 \n", 1087 | " 35651.0 4.0 2012.0 1.0 1173.0 \n", 1088 | "2400.0 1250.0 8.0 2012.0 1.0 1173.0 \n", 1089 | "2408.0 31423.0 6.0 2012.0 1.0 1158.0 \n", 1090 | "2464.0 1109.0 9.0 2012.0 1.0 1074.0 \n", 1091 | "2467.0 1232.0 4.0 2012.0 1.0 1174.0 \n", 1092 | "2486.0 1146.0 5.0 2012.0 1.0 1073.0 \n", 1093 | " 1278.0 7.0 2012.0 1.0 1174.0 \n", 1094 | "2488.0 43285.0 6.0 2012.0 1.0 2013.0 \n", 1095 | "... ... ... ... ... \n", 1096 | "9725161.0 1242.0 9.0 1236.0 1.0 1121.0 \n", 1097 | "9725591.0 30532.0 7.0 1337.0 1.0 2111.0 \n", 1098 | "9725945.0 972.0 7.0 1616.0 1.0 1147.0 \n", 1099 | "9732294.0 36093.0 3.0 2024.0 1.0 1604.0 \n", 1100 | "9733562.0 34204.0 9.0 1464.0 2.0 1529.0 \n", 1101 | "9733592.0 1220.0 5.0 1616.0 1.0 4531.0 \n", 1102 | "9733759.0 1250.0 4.0 1312.0 1.0 1120.0 \n", 1103 | "9733973.0 43342.0 4.0 1380.0 2.0 1603.0 \n", 1104 | "9741737.0 1278.0 4.0 1617.0 1.0 1208.0 \n", 1105 | "9745917.0 37058.0 8.0 1617.0 1.0 4405.0 \n", 1106 | "9746013.0 41938.0 5.0 1617.0 1.0 1208.0 \n", 1107 | "9746335.0 43058.0 8.0 1617.0 1.0 2105.0 \n", 1108 | "9746451.0 32934.0 7.0 1616.0 1.0 2812.0 \n", 1109 | "9746583.0 1238.0 9.0 2238.0 1.0 1210.0 \n", 1110 | "9746982.0 35307.0 7.0 1679.0 7.0 5713.0 \n", 1111 | "9754172.0 3631.0 7.0 1336.0 1.0 1070.0 \n", 1112 | "9765754.0 3270.0 5.0 2256.0 4.0 6608.0 \n", 1113 | "9765766.0 37005.0 3.0 2256.0 4.0 6605.0 \n", 1114 | "9770675.0 2233.0 7.0 2214.0 1.0 1001.0 \n", 1115 | "9780571.0 37058.0 8.0 1475.0 4.0 4709.0 \n", 1116 | "9780679.0 37005.0 6.0 22362.0 4.0 6612.0 \n", 1117 | "9838234.0 30314.0 8.0 2655.0 2.0 4177.0 \n", 1118 | "9853756.0 37058.0 5.0 4061.0 2.0 2579.0 \n", 1119 | "9881395.0 32303.0 9.0 1473.0 4.0 4803.0 \n", 1120 | "9891352.0 37058.0 6.0 1544.0 4.0 6607.0 \n", 1121 | "9982898.0 31514.0 8.0 1588.0 2.0 1534.0 \n", 1122 | "10303963.0 1230.0 4.0 2023.0 1.0 1105.0 \n", 1123 | " 1242.0 3.0 2023.0 1.0 1105.0 \n", 1124 | "10351814.0 1240.0 9.0 1618.0 1.0 1131.0 \n", 1125 | "11693264.0 1109.0 3.0 1945.0 1.0 1055.0 \n", 1126 | "\n", 1127 | " saleunit_curr_wk saleamt_curr_wk retunit_next_week \\\n", 1128 | "client_id prod_id \n", 1129 | "105.0 43065.0 33.0 163.68 0.0 \n", 1130 | "791.0 35651.0 1.0 7.50 0.0 \n", 1131 | "1311.0 31717.0 12.0 72.00 0.0 \n", 1132 | "2003.0 43285.0 8.0 42.24 0.0 \n", 1133 | "2025.0 1242.0 5.0 38.20 0.0 \n", 1134 | "2056.0 37058.0 1.0 7.50 
0.0 \n", 1135 | "2059.0 1232.0 6.0 109.44 0.0 \n", 1136 | "2073.0 43066.0 1.0 9.63 0.0 \n", 1137 | "2098.0 1242.0 2.0 15.28 0.0 \n", 1138 | "2100.0 43285.0 5.0 26.40 0.0 \n", 1139 | "2102.0 40886.0 2.0 11.52 0.0 \n", 1140 | "2109.0 9217.0 2.0 18.00 0.0 \n", 1141 | "2136.0 43147.0 4.0 18.16 0.0 \n", 1142 | "2142.0 1250.0 9.0 68.76 0.0 \n", 1143 | "2147.0 1125.0 1.0 9.60 0.0 \n", 1144 | "2170.0 43069.0 1.0 7.41 0.0 \n", 1145 | "2182.0 32819.0 2.0 17.78 0.0 \n", 1146 | "2241.0 1250.0 5.0 38.20 0.0 \n", 1147 | "2266.0 73.0 1.0 21.32 0.0 \n", 1148 | "2269.0 1220.0 2.0 15.28 0.0 \n", 1149 | "2291.0 4270.0 1.0 8.38 0.0 \n", 1150 | "2386.0 1146.0 2.0 42.78 0.0 \n", 1151 | " 35651.0 5.0 37.50 0.0 \n", 1152 | "2400.0 1250.0 2.0 15.28 0.0 \n", 1153 | "2408.0 31423.0 3.0 32.07 0.0 \n", 1154 | "2464.0 1109.0 1.0 15.01 0.0 \n", 1155 | "2467.0 1232.0 6.0 109.44 0.0 \n", 1156 | "2486.0 1146.0 7.0 149.73 0.0 \n", 1157 | " 1278.0 19.0 85.50 0.0 \n", 1158 | "2488.0 43285.0 10.0 52.80 0.0 \n", 1159 | "... ... ... ... \n", 1160 | "9725161.0 1242.0 8.0 61.12 0.0 \n", 1161 | "9725591.0 30532.0 8.0 62.24 0.0 \n", 1162 | "9725945.0 972.0 2.0 37.96 0.0 \n", 1163 | "9732294.0 36093.0 10.0 37.00 0.0 \n", 1164 | "9733562.0 34204.0 17.0 316.88 0.0 \n", 1165 | "9733592.0 1220.0 2.0 15.28 0.0 \n", 1166 | "9733759.0 1250.0 11.0 84.04 0.0 \n", 1167 | "9733973.0 43342.0 78.0 1343.16 0.0 \n", 1168 | "9741737.0 1278.0 10.0 45.00 0.0 \n", 1169 | "9745917.0 37058.0 0.0 0.00 1.0 \n", 1170 | "9746013.0 41938.0 1.0 9.91 0.0 \n", 1171 | "9746335.0 43058.0 6.0 56.10 0.0 \n", 1172 | "9746451.0 32934.0 3.0 21.12 0.0 \n", 1173 | "9746583.0 1238.0 2.0 19.66 0.0 \n", 1174 | "9746982.0 35307.0 10.0 62.50 0.0 \n", 1175 | "9754172.0 3631.0 2.0 32.70 0.0 \n", 1176 | "9765754.0 3270.0 2.0 25.42 0.0 \n", 1177 | "9765766.0 37005.0 8.0 68.00 0.0 \n", 1178 | "9770675.0 2233.0 16.0 319.04 0.0 \n", 1179 | "9780571.0 37058.0 15.0 113.70 0.0 \n", 1180 | "9780679.0 37005.0 33.0 280.50 0.0 \n", 1181 | "9838234.0 30314.0 48.0 887.52 0.0 \n", 1182 | "9853756.0 37058.0 0.0 0.00 10.0 \n", 1183 | "9881395.0 32303.0 10.0 59.20 0.0 \n", 1184 | "9891352.0 37058.0 5.0 37.90 0.0 \n", 1185 | "9982898.0 31514.0 14.0 163.10 0.0 \n", 1186 | "10303963.0 1230.0 3.0 50.01 0.0 \n", 1187 | " 1242.0 15.0 114.60 0.0 \n", 1188 | "10351814.0 1240.0 10.0 84.00 0.0 \n", 1189 | "11693264.0 1109.0 14.0 210.14 0.0 \n", 1190 | "\n", 1191 | " retamt_next_wk y_pred_demand \n", 1192 | "client_id prod_id \n", 1193 | "105.0 43065.0 0.0 33.0 \n", 1194 | "791.0 35651.0 0.0 1.0 \n", 1195 | "1311.0 31717.0 0.0 12.0 \n", 1196 | "2003.0 43285.0 0.0 8.0 \n", 1197 | "2025.0 1242.0 0.0 5.0 \n", 1198 | "2056.0 37058.0 0.0 1.0 \n", 1199 | "2059.0 1232.0 0.0 6.0 \n", 1200 | "2073.0 43066.0 0.0 1.0 \n", 1201 | "2098.0 1242.0 0.0 2.0 \n", 1202 | "2100.0 43285.0 0.0 5.0 \n", 1203 | "2102.0 40886.0 0.0 2.0 \n", 1204 | "2109.0 9217.0 0.0 2.0 \n", 1205 | "2136.0 43147.0 0.0 4.0 \n", 1206 | "2142.0 1250.0 0.0 9.0 \n", 1207 | "2147.0 1125.0 0.0 1.0 \n", 1208 | "2170.0 43069.0 0.0 1.0 \n", 1209 | "2182.0 32819.0 0.0 2.0 \n", 1210 | "2241.0 1250.0 0.0 5.0 \n", 1211 | "2266.0 73.0 0.0 1.0 \n", 1212 | "2269.0 1220.0 0.0 2.0 \n", 1213 | "2291.0 4270.0 0.0 1.0 \n", 1214 | "2386.0 1146.0 0.0 2.0 \n", 1215 | " 35651.0 0.0 5.0 \n", 1216 | "2400.0 1250.0 0.0 2.0 \n", 1217 | "2408.0 31423.0 0.0 3.0 \n", 1218 | "2464.0 1109.0 0.0 1.0 \n", 1219 | "2467.0 1232.0 0.0 6.0 \n", 1220 | "2486.0 1146.0 0.0 7.0 \n", 1221 | " 1278.0 0.0 19.0 \n", 1222 | "2488.0 43285.0 0.0 10.0 \n", 1223 | "... ... ... 
\n", 1224 | "9725161.0 1242.0 0.0 8.0 \n", 1225 | "9725591.0 30532.0 0.0 8.0 \n", 1226 | "9725945.0 972.0 0.0 2.0 \n", 1227 | "9732294.0 36093.0 0.0 10.0 \n", 1228 | "9733562.0 34204.0 0.0 17.0 \n", 1229 | "9733592.0 1220.0 0.0 2.0 \n", 1230 | "9733759.0 1250.0 0.0 11.0 \n", 1231 | "9733973.0 43342.0 0.0 78.0 \n", 1232 | "9741737.0 1278.0 0.0 10.0 \n", 1233 | "9745917.0 37058.0 7.5 0.0 \n", 1234 | "9746013.0 41938.0 0.0 1.0 \n", 1235 | "9746335.0 43058.0 0.0 6.0 \n", 1236 | "9746451.0 32934.0 0.0 3.0 \n", 1237 | "9746583.0 1238.0 0.0 2.0 \n", 1238 | "9746982.0 35307.0 0.0 10.0 \n", 1239 | "9754172.0 3631.0 0.0 2.0 \n", 1240 | "9765754.0 3270.0 0.0 2.0 \n", 1241 | "9765766.0 37005.0 0.0 8.0 \n", 1242 | "9770675.0 2233.0 0.0 16.0 \n", 1243 | "9780571.0 37058.0 0.0 15.0 \n", 1244 | "9780679.0 37005.0 0.0 33.0 \n", 1245 | "9838234.0 30314.0 0.0 48.0 \n", 1246 | "9853756.0 37058.0 76.0 0.0 \n", 1247 | "9881395.0 32303.0 0.0 10.0 \n", 1248 | "9891352.0 37058.0 0.0 5.0 \n", 1249 | "9982898.0 31514.0 0.0 14.0 \n", 1250 | "10303963.0 1230.0 0.0 3.0 \n", 1251 | " 1242.0 0.0 15.0 \n", 1252 | "10351814.0 1240.0 0.0 10.0 \n", 1253 | "11693264.0 1109.0 0.0 14.0 \n", 1254 | "\n", 1255 | "[74832 rows x 9 columns]" 1256 | ] 1257 | }, 1258 | "execution_count": 33, 1259 | "metadata": {}, 1260 | "output_type": "execute_result" 1261 | } 1262 | ], 1263 | "source": [ 1264 | "\n", 1265 | "train_data_samp.groupby(['client_id', 'prod_id']).sum()" 1266 | ] 1267 | }, 1268 | { 1269 | "cell_type": "markdown", 1270 | "metadata": {}, 1271 | "source": [ 1272 | "### Predict sale units $y_{sale}$ and returns $y_{return}$ using two different classifiers. We will use xgboost to fit $y_{sale}$ and $y_{return}$ with the input data." 1273 | ] 1274 | }, 1275 | { 1276 | "cell_type": "code", 1277 | "execution_count": null, 1278 | "metadata": { 1279 | "collapsed": true 1280 | }, 1281 | "outputs": [], 1282 | "source": [ 1283 | "# Utility function to report best scores\n", 1284 | "def report(grid_scores, n_top=3):\n", 1285 | " top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]\n", 1286 | " for i, score in enumerate(top_scores):\n", 1287 | " print(\"Model with rank: {0}\".format(i + 1))\n", 1288 | " print(\"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n", 1289 | " score.mean_validation_score,\n", 1290 | " np.std(score.cv_validation_scores)))\n", 1291 | " print(\"Parameters: {0}\".format(score.parameters))\n", 1292 | " print(\"\")" 1293 | ] 1294 | }, 1295 | { 1296 | "cell_type": "code", 1297 | "execution_count": null, 1298 | "metadata": { 1299 | "collapsed": false 1300 | }, 1301 | "outputs": [], 1302 | "source": [ 1303 | "import warnings\n", 1304 | "warnings.filterwarnings(\"ignore\")\n", 1305 | "\n", 1306 | "from sklearn.ensemble import RandomForestClassifier\n", 1307 | "from sklearn.grid_search import RandomizedSearchCV\n", 1308 | "from scipy.stats import randint as sp_randint\n", 1309 | "from operator import itemgetter\n", 1310 | "\n", 1311 | "clf = RandomForestClassifier(n_estimators=10)\n", 1312 | "\n", 1313 | "# specify parameters and distributions to sample from\n", 1314 | "param_dist = {\"max_depth\": [10],\n", 1315 | " \"max_features\": sp_randint(4, 7),\n", 1316 | " }\n", 1317 | "\n", 1318 | "# run randomized search\n", 1319 | "n_iter_search = 10\n", 1320 | "random_search_sale = RandomizedSearchCV(clf, param_distributions=param_dist,\n", 1321 | " n_iter=n_iter_search, n_jobs=4, cv=5)\n", 1322 | "start = time.time()\n", 1323 | "random_search_sale.fit(features_train, 
np.ravel(labels_train_sale))\n", 1324 | "predict = random_search_sale.predict(features_train)\n", 1325 | "\n", 1326 | "print 'Model Report ********'\n", 1327 | "print 'Accuracy : ', rmsle(np.ravel(labels_train_sale), predict)\n", 1328 | "print 'Model Report ********'\n", 1329 | "\n", 1330 | "print(\"RandomizedSearchCV took %.2f seconds for %d candidates\"\n", 1331 | " \" parameter settings.\" % ((time.time() - start), n_iter_search))\n", 1332 | "report(random_search_sale.grid_scores_)\n", 1333 | "print random_search_sale.best_score_ \n", 1334 | "print random_search_sale.best_estimator_ \n", 1335 | "feat_imp = pd.Series(random_search_sale.best_estimator_.feature_importances_).sort_values(ascending=False)\n", 1336 | "feat_imp.plot(kind='bar', title='Feature Importances')" 1337 | ] 1338 | }, 1339 | { 1340 | "cell_type": "code", 1341 | "execution_count": null, 1342 | "metadata": { 1343 | "collapsed": false 1344 | }, 1345 | "outputs": [], 1346 | "source": [ 1347 | "import warnings\n", 1348 | "warnings.filterwarnings(\"ignore\")\n", 1349 | "\n", 1350 | "from sklearn.ensemble import RandomForestClassifier\n", 1351 | "from sklearn.grid_search import RandomizedSearchCV\n", 1352 | "from scipy.stats import randint as sp_randint\n", 1353 | "from operator import itemgetter\n", 1354 | "\n", 1355 | "clf = RandomForestClassifier(n_estimators=15)\n", 1356 | "\n", 1357 | "# specify parameters and distributions to sample from\n", 1358 | "param_dist = {\"max_depth\": [10],\n", 1359 | " \"max_features\": sp_randint(3, 5),\n", 1360 | " }\n", 1361 | "\n", 1362 | "# run randomized search\n", 1363 | "n_iter_search = 10\n", 1364 | "random_search_return = RandomizedSearchCV(clf, param_distributions=param_dist,\n", 1365 | " n_iter=n_iter_search, n_jobs=4, cv=5)\n", 1366 | "start = time.time()\n", 1367 | "random_search_return.fit(features_train, np.ravel(labels_train_return))\n", 1368 | "predict = random_search_return.predict(features_train)\n", 1369 | "\n", 1370 | "print 'Model Report ********'\n", 1371 | "print 'Accuracy : ', rmsle(np.ravel(labels_train_return), predict)\n", 1372 | "print 'Model Report ********'\n", 1373 | "\n", 1374 | "print(\"RandomizedSearchCV took %.2f seconds for %d candidates\"\n", 1375 | " \" parameter settings.\" % ((time.time() - start), n_iter_search))\n", 1376 | "report(random_search_return.grid_scores_)\n", 1377 | "print random_search_return.best_score_ \n", 1378 | "print random_search_return.best_estimator_ \n", 1379 | "feat_imp = pd.Series(random_search_return.best_estimator_.feature_importances_).sort_values(ascending=False)\n", 1380 | "feat_imp.plot(kind='bar', title='Feature Importances')" 1381 | ] 1382 | }, 1383 | { 1384 | "cell_type": "code", 1385 | "execution_count": null, 1386 | "metadata": { 1387 | "collapsed": false 1388 | }, 1389 | "outputs": [], 1390 | "source": [ 1391 | "predict_sale = random_search_sale.predict(features_train)\n", 1392 | "predict_return = random_search_return.predict(features_train)\n", 1393 | "y_pred = [max(0,(predict_sale[i]-predict_return[i])) for i in xrange(len(predict_return))]\n", 1394 | "plt.scatter(y_pred,np.ravel(labels_train))\n", 1395 | "print 'Model Report ********'\n", 1396 | "print 'Accuracy : ', rmsle(y_pred, np.ravel(labels_train))\n", 1397 | "print 'Model Report ********'\n" 1398 | ] 1399 | }, 1400 | { 1401 | "cell_type": "markdown", 1402 | "metadata": { 1403 | "collapsed": false 1404 | }, 1405 | "source": [ 1406 | "### 3. 
Gradient Boosting" 1407 | ] 1408 | }, 1409 | { 1410 | "cell_type": "code", 1411 | "execution_count": null, 1412 | "metadata": { 1413 | "collapsed": false 1414 | }, 1415 | "outputs": [], 1416 | "source": [ 1417 | "import xgboost as xgb\n", 1418 | "from xgboost.sklearn import XGBClassifier\n", 1419 | "from matplotlib.pylab import rcParams\n", 1420 | "rcParams['figure.figsize'] = 12, 4\n", 1421 | "from sklearn import metrics" 1422 | ] 1423 | }, 1424 | { 1425 | "cell_type": "code", 1426 | "execution_count": null, 1427 | "metadata": { 1428 | "collapsed": true 1429 | }, 1430 | "outputs": [], 1431 | "source": [ 1432 | "def modelfit(alg, Xtrain, ytrain, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):\n", 1433 | " if useTrainCV:\n", 1434 | " xgb_param = alg.get_xgb_params()\n", 1435 | " xgtrain = xgb.DMatrix(Xtrain, label=ytrain)\n", 1436 | " print alg.get_params()['n_estimators']\n", 1437 | " cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round = alg.get_params()['n_estimators'], early_stopping_rounds=early_stopping_rounds)\n", 1438 | " alg.set_params(n_estimators=cvresult.shape[0])\n", 1439 | " alg.fit(Xtrain, ytrain, eval_metric='auc')\n", 1440 | " predict = alg.predict(Xtrain)\n", 1441 | " return predict" 1442 | ] 1443 | }, 1444 | { 1445 | "cell_type": "markdown", 1446 | "metadata": {}, 1447 | "source": [ 1448 | "## Step 1 Fix learning rate and number of estimators for tuning tree-based parameters" 1449 | ] 1450 | }, 1451 | { 1452 | "cell_type": "code", 1453 | "execution_count": null, 1454 | "metadata": { 1455 | "collapsed": false 1456 | }, 1457 | "outputs": [], 1458 | "source": [ 1459 | "xgb1 = XGBClassifier(\n", 1460 | " learning_rate =0.05,\n", 1461 | " n_estimators=100,\n", 1462 | " max_depth=15,\n", 1463 | " min_child_weight=4,\n", 1464 | " gamma=0,\n", 1465 | " subsample=0.8,\n", 1466 | " colsample_bytree=0.8,\n", 1467 | " objective= 'reg:linear',\n", 1468 | " scale_pos_weight=1,\n", 1469 | " seed=27)\n", 1470 | "\n", 1471 | "predict = modelfit(xgb1, features_train, np.ravel(labels_train))" 1472 | ] 1473 | }, 1474 | { 1475 | "cell_type": "code", 1476 | "execution_count": null, 1477 | "metadata": { 1478 | "collapsed": false 1479 | }, 1480 | "outputs": [], 1481 | "source": [ 1482 | "#print model report:\n", 1483 | "print '\\nModel Report ********'\n", 1484 | "print \"Accuracy : %.4g\" % rmsle(np.ravel(labels_train), predict)\n", 1485 | "print '\\nModel Report ********'\n", 1486 | "feat_imp = pd.Series(xgb1.booster().get_fscore()).sort_values(ascending=False)\n", 1487 | "feat_imp.plot(kind='bar', title='Feature Importances')\n", 1488 | "plt.ylabel('Feature Importance Score')" 1489 | ] 1490 | }, 1491 | { 1492 | "cell_type": "markdown", 1493 | "metadata": { 1494 | "collapsed": false 1495 | }, 1496 | "source": [ 1497 | "## Step 2: Tune max_depth and min_child_weight" 1498 | ] 1499 | }, 1500 | { 1501 | "cell_type": "code", 1502 | "execution_count": null, 1503 | "metadata": { 1504 | "collapsed": false 1505 | }, 1506 | "outputs": [], 1507 | "source": [ 1508 | "from sklearn.grid_search import GridSearchCV\n", 1509 | "param_test1 = {\n", 1510 | " 'max_depth':range(3,10,2),\n", 1511 | " 'min_child_weight':range(1,6,2)\n", 1512 | "}\n", 1513 | "gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=100, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1, seed=27), param_grid = param_test1, scoring='roc_auc', n_jobs=4,iid=False)\n", 1514 | "gsearch1.fit(features_train,np.ravel(labels_train))\n", 1515 | 
"gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_" 1516 | ] 1517 | }, 1518 | { 1519 | "cell_type": "markdown", 1520 | "metadata": {}, 1521 | "source": [ 1522 | "## Data Cleaning\n", 1523 | "There are duplicate client ids in cliente_table, which means one client id may have multiple client name that are very similar. We will cluster them based on a hash function and use a clustering algorithm to evaluate similarity. " 1524 | ] 1525 | }, 1526 | { 1527 | "cell_type": "code", 1528 | "execution_count": null, 1529 | "metadata": { 1530 | "collapsed": false 1531 | }, 1532 | "outputs": [], 1533 | "source": [ 1534 | "import re\n", 1535 | "def hash_eval(s):\n", 1536 | " hash_base = 4\n", 1537 | " s = re.sub('[., ]', '', s)\n", 1538 | " seqlen = len(s)\n", 1539 | " n = seqlen - 1\n", 1540 | " h = 0\n", 1541 | " for c in s:\n", 1542 | " h += ord(c) * (hash_base ** n)\n", 1543 | " n -= 1\n", 1544 | " curhash = h\n", 1545 | " return curhash\n", 1546 | "\n", 1547 | "# In the client table, same clients are assigned different client ID. We create a new client table where clients are assigned unique ID. \n", 1548 | "clientid_hash = dict()\n", 1549 | "new_client_id = [-1] \n", 1550 | "for idx, s in enumerate(clientnameid_data.NombreCliente):\n", 1551 | " t = hash_eval(s)\n", 1552 | " clientid_hash.setdefault(t, []).append(clientnameid_data.Cliente_ID[idx])\n", 1553 | " if t in clientid_hash:\n", 1554 | " a = clientid_hash[t]\n", 1555 | " new_client_id.append(a[0])\n", 1556 | "\n", 1557 | "# In the agency table, same agencies (town, state) are assigned different agency ID. We create a new agency table where agencies (town, state) are assigned unique ID. \n", 1558 | "agencyid_hash = dict()\n", 1559 | "new_agency_id = [-1] \n", 1560 | "for idx, s in enumerate(townstate_data.Town+townstate_data.State):\n", 1561 | " t = hash_eval(s)\n", 1562 | " agencyid_hash.setdefault(t, []).append(townstate_data.Agencia_ID[idx])\n", 1563 | " if t in agencyid_hash:\n", 1564 | " a = agencyid_hash[t]\n", 1565 | " new_agency_id.append(a[0])\n" 1566 | ] 1567 | }, 1568 | { 1569 | "cell_type": "code", 1570 | "execution_count": null, 1571 | "metadata": { 1572 | "collapsed": false 1573 | }, 1574 | "outputs": [], 1575 | "source": [ 1576 | "clientnameid_data['New_Cliente_ID'] = new_client_id[1:]\n", 1577 | "townstate_data['New_Agencia_ID'] = new_agency_id[1:]" 1578 | ] 1579 | }, 1580 | { 1581 | "cell_type": "code", 1582 | "execution_count": null, 1583 | "metadata": { 1584 | "collapsed": false 1585 | }, 1586 | "outputs": [], 1587 | "source": [ 1588 | "print clientnameid_data.head(10)\n", 1589 | "print '---'\n", 1590 | "print townstate_data.head()\n", 1591 | "print '---'\n", 1592 | "print train_data_samp.head(10)\n" 1593 | ] 1594 | }, 1595 | { 1596 | "cell_type": "code", 1597 | "execution_count": null, 1598 | "metadata": { 1599 | "collapsed": false 1600 | }, 1601 | "outputs": [], 1602 | "source": [ 1603 | "print train_data_samp.head(10)\n", 1604 | "print '------'\n", 1605 | "for idx, cid in enumerate(train_data_samp.client_id):\n", 1606 | " train_data_samp.client_id.values[idx] = clientnameid_data.New_Cliente_ID[train_data_samp.client_id.values[idx] == clientnameid_data.Cliente_ID.values].values[0]\n", 1607 | " train_data_samp.sales_depot_id.values[idx] = townstate_data.New_Agencia_ID[train_data_samp.sales_depot_id.values[idx] == townstate_data.Agencia_ID.values].values[0]\n", 1608 | "print '-----'\n", 1609 | "print train_data_samp.head()\n" 1610 | ] 1611 | }, 1612 | { 1613 | "cell_type": "markdown", 1614 | "metadata": {}, 
1615 | "source": [ 1616 | "## Load Test Data" 1617 | ] 1618 | }, 1619 | { 1620 | "cell_type": "code", 1621 | "execution_count": null, 1622 | "metadata": { 1623 | "collapsed": false 1624 | }, 1625 | "outputs": [], 1626 | "source": [ 1627 | "test_data = pd.read_csv('test.csv')\n", 1628 | "test_data.columns = ['id', 'week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client id', 'prod_id']\n", 1629 | "test_labels = pd.read_csv('sample_submission.csv')\n", 1630 | "test_data = test_data.drop('id', 1)\n", 1631 | "print test_data.head()" 1632 | ] 1633 | }, 1634 | { 1635 | "cell_type": "code", 1636 | "execution_count": null, 1637 | "metadata": { 1638 | "collapsed": false 1639 | }, 1640 | "outputs": [], 1641 | "source": [] 1642 | }, 1643 | { 1644 | "cell_type": "code", 1645 | "execution_count": null, 1646 | "metadata": { 1647 | "collapsed": false 1648 | }, 1649 | "outputs": [], 1650 | "source": [ 1651 | "g = sns.PairGrid(data_t)\n", 1652 | "g.map(plt.scatter)" 1653 | ] 1654 | }, 1655 | { 1656 | "cell_type": "code", 1657 | "execution_count": null, 1658 | "metadata": { 1659 | "collapsed": false, 1660 | "scrolled": true 1661 | }, 1662 | "outputs": [], 1663 | "source": [] 1664 | }, 1665 | { 1666 | "cell_type": "code", 1667 | "execution_count": null, 1668 | "metadata": { 1669 | "collapsed": true 1670 | }, 1671 | "outputs": [], 1672 | "source": [ 1673 | "a = [[1, 2, 3, 4]]" 1674 | ] 1675 | }, 1676 | { 1677 | "cell_type": "code", 1678 | "execution_count": null, 1679 | "metadata": { 1680 | "collapsed": false 1681 | }, 1682 | "outputs": [], 1683 | "source": [ 1684 | "print a" 1685 | ] 1686 | }, 1687 | { 1688 | "cell_type": "code", 1689 | "execution_count": null, 1690 | "metadata": { 1691 | "collapsed": false 1692 | }, 1693 | "outputs": [], 1694 | "source": [ 1695 | "np.array(a)" 1696 | ] 1697 | }, 1698 | { 1699 | "cell_type": "code", 1700 | "execution_count": null, 1701 | "metadata": { 1702 | "collapsed": false 1703 | }, 1704 | "outputs": [], 1705 | "source": [ 1706 | "print np.array(a)" 1707 | ] 1708 | }, 1709 | { 1710 | "cell_type": "code", 1711 | "execution_count": null, 1712 | "metadata": { 1713 | "collapsed": false 1714 | }, 1715 | "outputs": [], 1716 | "source": [ 1717 | "a = np.array(a)" 1718 | ] 1719 | }, 1720 | { 1721 | "cell_type": "code", 1722 | "execution_count": null, 1723 | "metadata": { 1724 | "collapsed": false 1725 | }, 1726 | "outputs": [], 1727 | "source": [ 1728 | "a = sp_randint(10,2)" 1729 | ] 1730 | }, 1731 | { 1732 | "cell_type": "code", 1733 | "execution_count": null, 1734 | "metadata": { 1735 | "collapsed": false 1736 | }, 1737 | "outputs": [], 1738 | "source": [] 1739 | }, 1740 | { 1741 | "cell_type": "code", 1742 | "execution_count": null, 1743 | "metadata": { 1744 | "collapsed": false 1745 | }, 1746 | "outputs": [], 1747 | "source": [ 1748 | "range(3,10,2)" 1749 | ] 1750 | }, 1751 | { 1752 | "cell_type": "code", 1753 | "execution_count": null, 1754 | "metadata": { 1755 | "collapsed": false 1756 | }, 1757 | "outputs": [], 1758 | "source": [ 1759 | "sp_randint(1, 6)" 1760 | ] 1761 | }, 1762 | { 1763 | "cell_type": "code", 1764 | "execution_count": null, 1765 | "metadata": { 1766 | "collapsed": false 1767 | }, 1768 | "outputs": [], 1769 | "source": [ 1770 | "import subprocess \n", 1771 | "subprocess.call(['ec2kill'])" 1772 | ] 1773 | }, 1774 | { 1775 | "cell_type": "code", 1776 | "execution_count": null, 1777 | "metadata": { 1778 | "collapsed": false 1779 | }, 1780 | "outputs": [], 1781 | "source": [ 1782 | "from subprocess import call\n", 1783 | 
"call([\"ec2-terminate-instances\", \"i-308b33ed \"])" 1784 | ] 1785 | }, 1786 | { 1787 | "cell_type": "code", 1788 | "execution_count": null, 1789 | "metadata": { 1790 | "collapsed": true 1791 | }, 1792 | "outputs": [], 1793 | "source": [ 1794 | "# Utility function to report best scores\n", 1795 | "def report(grid_scores, n_top=3):\n", 1796 | " top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]\n", 1797 | " for i, score in enumerate(top_scores):\n", 1798 | " print(\"Model with rank: {0}\".format(i + 1))\n", 1799 | " print(\"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n", 1800 | " score.mean_validation_score,\n", 1801 | " np.std(score.cv_validation_scores)))\n", 1802 | " print(\"Parameters: {0}\".format(score.parameters))\n", 1803 | " print(\"\")" 1804 | ] 1805 | }, 1806 | { 1807 | "cell_type": "code", 1808 | "execution_count": null, 1809 | "metadata": { 1810 | "collapsed": true 1811 | }, 1812 | "outputs": [], 1813 | "source": [ 1814 | "from sklearn.ensemble import RandomForestClassifier\n", 1815 | "from sklearn.grid_search import RandomizedSearchCV\n", 1816 | "from scipy.stats import randint as sp_randint\n", 1817 | "from operator import itemgetter\n", 1818 | "\n", 1819 | "clf = RandomForestClassifier(n_estimators=30)\n", 1820 | "\n", 1821 | "# specify parameters and distributions to sample from\n", 1822 | "param_dist = {\"max_depth\": [10, None],\n", 1823 | " \"max_features\": sp_randint(1, 6),\n", 1824 | " \"min_samples_split\": sp_randint(1, 6),\n", 1825 | " \"min_samples_leaf\": sp_randint(1, 6),\n", 1826 | " \"bootstrap\": [True, False],\n", 1827 | " \"criterion\": [\"gini\", \"entropy\"]}\n", 1828 | "\n", 1829 | "# run randomized search\n", 1830 | "n_iter_search = 20\n", 1831 | "random_search = RandomizedSearchCV(clf, param_distributions=param_dist,\n", 1832 | " n_iter=n_iter_search, n_jobs=4, cv=3)\n", 1833 | "start = time.time()\n", 1834 | "random_search.fit(features_train, np.ravel(labels_train))\n", 1835 | "\n", 1836 | "print(\"RandomizedSearchCV took %.2f seconds for %d candidates\"\n", 1837 | " \" parameter settings.\" % ((time.time() - start), n_iter_search))\n", 1838 | "report(random_search.grid_scores_)\n", 1839 | "print random_search.best_score_ \n", 1840 | "print random_search.best_estimator_ " 1841 | ] 1842 | } 1843 | ], 1844 | "metadata": { 1845 | "kernelspec": { 1846 | "display_name": "Python 2", 1847 | "language": "python", 1848 | "name": "python2" 1849 | }, 1850 | "language_info": { 1851 | "codemirror_mode": { 1852 | "name": "ipython", 1853 | "version": 2 1854 | }, 1855 | "file_extension": ".py", 1856 | "mimetype": "text/x-python", 1857 | "name": "python", 1858 | "nbconvert_exporter": "python", 1859 | "pygments_lexer": "ipython2", 1860 | "version": "2.7.11" 1861 | } 1862 | }, 1863 | "nbformat": 4, 1864 | "nbformat_minor": 0 1865 | } 1866 | --------------------------------------------------------------------------------