├── README.md └── 14.auto-ml-energy-demand-forecasting.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # AutomatedMLEnergyForecast 2 | Energy demand forecasting on Azure Machine Learning with Automated ML 3 | -------------------------------------------------------------------------------- /14.auto-ml-energy-demand-forecasting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Copyright (c) Microsoft Corporation. All rights reserved.\n", 8 | "\n", 9 | "Licensed under the MIT License." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# AutoML: Energy Demand Forecasting\n", 17 | "Time series forecasting is the task of predicting future values in a time-ordered sequence of observations. It is a common problem and has applications in many industries. For example, retail companies need to forecast future product sales so they can effectively organize their supply chains to meet demand. Similarly, package delivery companies need to estimate the demand for their services so they can plan workforce requirements and delivery routes ahead of time. In many cases, the financial risks of inaccurate forecasts can be significant. Therefore, forecasting is often a business critical activity.\n", 18 | "\n", 19 | "This sample shows how time series forecasting can be performed through AutoML package withing AzureML Services.\n", 20 | "\n", 21 | "Make sure you have executed the [00.configuration](00.configuration.ipynb) before running this notebook.\n", 22 | "\n", 23 | "In this notebook you would see\n", 24 | "1. Creating an Experiment in an existing Workspace\n", 25 | "2. Instantiating AutoMLConfig\n", 26 | "3. Training the Model using local compute\n", 27 | "4. Exploring the results\n", 28 | "5. 
Testing the fitted model\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Create Experiment\n", 36 | "\n", 37 | "As part of the setup you have already created a Workspace. For AutoML you would need to create an Experiment. An Experiment is a named object in a Workspace, which is used to run experiments." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 1, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "ename": "ModuleNotFoundError", 47 | "evalue": "No module named 'azureml.core.experiment'", 48 | "output_type": "error", 49 | "traceback": [ 50 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 51 | "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", 52 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mazureml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mworkspace\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mWorkspace\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexperiment\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mExperiment\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mautoml\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mAutoMLConfig\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;32mfrom\u001b[0m 
\u001b[0mazureml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mautoml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mAutoMLRun\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 53 | "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'azureml.core.experiment'" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "import azureml.core\n", 59 | "import pandas as pd\n", 60 | "import azureml.core\n", 61 | "from azureml.core.workspace import Workspace\n", 62 | "from azureml.core.experiment import Experiment\n", 63 | "from azureml.train.automl import AutoMLConfig\n", 64 | "from azureml.train.automl.run import AutoMLRun\n", 65 | "import time\n", 66 | "import logging\n", 67 | "import os\n", 68 | "from sklearn import datasets\n", 69 | "import seaborn as sns\n", 70 | "from matplotlib import pyplot as plt\n", 71 | "from matplotlib.pyplot import imshow\n", 72 | "import random\n", 73 | "import numpy as np\n", 74 | "from scipy import stats" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 2, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "Found the config file in: C:\\Users\\lazzeri\\Desktop\\MachineLearningNotebooks-master\\automl\\aml_config\\config.json\n" 87 | ] 88 | }, 89 | { 90 | "data": { 91 | "text/html": [ 92 | "
\n", 93 | "\n", 106 | "\n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | "
SDK version0.1.4
Subscription IDff18d7a8-962a-406c-858f-49acd23d6c01
Workspace Namemyws
Resource Groupmyrg
Locationeastus2
Project DirectoryC:\\Users\\lazzeri\\Desktop\\MachineLearningNotebooks-master\\automl\\sample_projects\\automl-energydemandforecasting
Run History Nameautoml-energydemandforecasting
\n", 144 | "
" 145 | ], 146 | "text/plain": [ 147 | " \n", 148 | "SDK version 0.1.4 \n", 149 | "Subscription ID ff18d7a8-962a-406c-858f-49acd23d6c01 \n", 150 | "Workspace Name myws \n", 151 | "Resource Group myrg \n", 152 | "Location eastus2 \n", 153 | "Project Directory C:\\Users\\lazzeri\\Desktop\\MachineLearningNotebooks-master\\automl\\sample_projects\\automl-energydemandforecasting\n", 154 | "Run History Name automl-energydemandforecasting " 155 | ] 156 | }, 157 | "execution_count": 2, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "ws = Workspace.from_config()\n", 164 | "\n", 165 | "# choose a name for the run history container in the workspace\n", 166 | "history_name = 'automl-energydemandforecasting'\n", 167 | "\n", 168 | "# project folder\n", 169 | "project_folder = './sample_projects/automl-energydemandforecasting'\n", 170 | "\n", 171 | "import os\n", 172 | "from azureml.core.project import Project\n", 173 | "\n", 174 | "project = Project.attach(ws,\n", 175 | " history_name,\n", 176 | " project_folder)\n", 177 | "\n", 178 | "output = {}\n", 179 | "output['SDK version'] = azureml.core.VERSION\n", 180 | "output['Subscription ID'] = ws.subscription_id\n", 181 | "output['Workspace Name'] = ws.name\n", 182 | "output['Resource Group'] = ws.resource_group\n", 183 | "output['Location'] = ws.location\n", 184 | "output['Project Directory'] = project.project_directory\n", 185 | "output['Run History Name'] = project.history.name\n", 186 | "pd.set_option('display.max_colwidth', -1)\n", 187 | "pd.DataFrame(data=output, index=['']).T" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "### Update Conda Dependency file to have AutoML SDK\n", 195 | "\n", 196 | "Currently AutoML SDK is not installed with Azure ML SDK by default, Due to this we update the conda dependency file to add a dependency on AutoML SDK." 
197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 3, 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "name": "stdout", 206 | "output_type": "stream", 207 | "text": [ 208 | "Overwriting ./sample_projects/automl-energydemandforecasting/aml_config/conda_dependencies.yml\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "%%writefile $project_folder/aml_config/conda_dependencies.yml\n", 214 | "# Conda environment specification. The dependencies defined in this file will\n", 215 | "# be automatically provisioned for managed runs. These include runs against\n", 216 | "# the localdocker, remotedocker, and cluster compute targets.\n", 217 | "\n", 218 | "# Note that this file is NOT used to automatically manage dependencies for the\n", 219 | "# local compute target. To provision these dependencies locally, run:\n", 220 | "# conda env update --file conda_dependencies.yml\n", 221 | "\n", 222 | "# Details about the Conda environment file format:\n", 223 | "# https://conda.io/docs/using/envs.html#create-environment-file-by-hand\n", 224 | "\n", 225 | "# For managing Spark packages and configuration, see spark_dependencies.yml.\n", 226 | "\n", 227 | "# Version of this configuration file's structure and semantics in AzureML.\n", 228 | "# This directive is stored in a comment to preserve the Conda file structure.\n", 229 | "# [AzureMlVersion] = 2\n", 230 | "\n", 231 | "name: project_environment\n", 232 | "dependencies:\n", 233 | " # The python interpreter version.\n", 234 | " # Currently Azure ML Workbench only supports 3.5.2 and later.\n", 235 | " - python=3.6.2\n", 236 | " # Required by azureml-requirements, installed separately through Conda to\n", 237 | " # get a prebuilt version and not require build tools for the install.\n", 238 | " - psutil=5.4.5\n", 239 | " - numpy\n", 240 | "\n", 241 | " - pip:\n", 242 | " # Required packages for AzureML execution, history, and data preparation.\n", 243 | " - --index-url 
https://azuremlsdktestpypi.azureedge.net/sdk-release/Preview/E7501C02541B433786111FE8E140CAA1\n", 244 | " - --extra-index-url https://pypi.python.org/simple\n", 245 | " - azureml-sdk\n", 246 | " - azureml-train-automl" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "## Create Get Data File\n", 254 | "For remote executions you should author a get_data.py file containing a get_data() function. This file should be in the root directory of the project. You can encapsulate code to read data either from a blob storage or local disk in this file.\n", 255 | "\n", 256 | "The *get_data()* function returns a [dictionary](README.md#getdata)." 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 4, 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "Writing ./sample_projects/automl-energydemandforecasting/get_data.py\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "%%writefile $project_folder/get_data.py\n", 274 | "\n", 275 | "import pandas as pd\n", 276 | "from sklearn.model_selection import train_test_split\n", 277 | "from sklearn.preprocessing import LabelEncoder\n", 278 | "\n", 279 | "def get_data():\n", 280 | " \n", 281 | " demand = pd.read_csv(\"https://antaignitedata.blob.core.windows.net/antaignitedata/nyc_demand.csv\", parse_dates=['timeStamp'])\n", 282 | " weather = pd.read_csv(\"https://antaignitedata.blob.core.windows.net/antaignitedata/nyc_weather.csv\", parse_dates=['timeStamp'])\n", 283 | " df = pd.merge(demand, weather, on=['timeStamp'], how='outer')\n", 284 | " \n", 285 | " return { \"X\" : df }" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "### View data\n", 293 | "\n", 294 | "You can execute the *get_data()* function locally to view the *energy* data" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 5, 300 | "metadata": 
{}, 301 | "outputs": [ 302 | { 303 | "data": { 304 | "text/html": [ 305 | "
\n", 306 | "\n", 319 | "\n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | "
timeStampdemandpreciptemp
02012-01-01 00:00:004937.50.046.13
12012-01-01 01:00:004752.10.045.89
22012-01-01 02:00:004542.60.045.04
32012-01-01 03:00:004357.70.045.03
42012-01-01 04:00:004275.50.042.61
\n", 367 | "
" 368 | ], 369 | "text/plain": [ 370 | " timeStamp demand precip temp\n", 371 | "0 2012-01-01 00:00:00 4937.5 0.0 46.13\n", 372 | "1 2012-01-01 01:00:00 4752.1 0.0 45.89\n", 373 | "2 2012-01-01 02:00:00 4542.6 0.0 45.04\n", 374 | "3 2012-01-01 03:00:00 4357.7 0.0 45.03\n", 375 | "4 2012-01-01 04:00:00 4275.5 0.0 42.61" 376 | ] 377 | }, 378 | "execution_count": 5, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "%run $project_folder/get_data.py\n", 385 | "data_dict = get_data()\n", 386 | "df = data_dict[\"X\"]\n", 387 | "df.head()" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "## Fill gaps in the time series\n", 395 | "Some periods in the time series are missing. This occurs if the period was missing in both the original demand and weather datasets. To identify these gaps, first we create an index of time periods that we would expect to be in the time series. There should be one record for every hour between the minimum and maximum datetimes in our dataset." 
396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 6, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "text/plain": [ 406 | "Timestamp('2012-01-01 00:00:00')" 407 | ] 408 | }, 409 | "execution_count": 6, 410 | "metadata": {}, 411 | "output_type": "execute_result" 412 | } 413 | ], 414 | "source": [ 415 | "min_time = min(df['timeStamp'])\n", 416 | "min_time" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 7, 422 | "metadata": {}, 423 | "outputs": [ 424 | { 425 | "data": { 426 | "text/plain": [ 427 | "Timestamp('2017-08-12 06:00:00')" 428 | ] 429 | }, 430 | "execution_count": 7, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "max_time = max(df['timeStamp'])\n", 437 | "max_time" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 8, 443 | "metadata": {}, 444 | "outputs": [ 445 | { 446 | "data": { 447 | "text/plain": [ 448 | "DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 01:00:00',\n", 449 | " '2012-01-01 02:00:00', '2012-01-01 03:00:00',\n", 450 | " '2012-01-01 04:00:00', '2012-01-01 05:00:00',\n", 451 | " '2012-01-01 06:00:00', '2012-01-01 07:00:00',\n", 452 | " '2012-01-01 08:00:00', '2012-01-01 09:00:00',\n", 453 | " ...\n", 454 | " '2017-08-11 21:00:00', '2017-08-11 22:00:00',\n", 455 | " '2017-08-11 23:00:00', '2017-08-12 00:00:00',\n", 456 | " '2017-08-12 01:00:00', '2017-08-12 02:00:00',\n", 457 | " '2017-08-12 03:00:00', '2017-08-12 04:00:00',\n", 458 | " '2017-08-12 05:00:00', '2017-08-12 06:00:00'],\n", 459 | " dtype='datetime64[ns]', length=49207, freq='H')" 460 | ] 461 | }, 462 | "execution_count": 8, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "dt_idx = pd.date_range(min_time, max_time, freq='H')\n", 469 | "dt_idx" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "Now we index the dataframe 
according to this datetime index to insert missing records into the time series:" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 9, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "data": { 486 | "text/html": [ 487 | "
\n", 488 | "\n", 501 | "\n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | "
timeStampdemandpreciptemp
2016-03-13 02:00:00NaTNaNNaNNaN
2017-03-12 02:00:00NaTNaNNaNNaN
\n", 528 | "
" 529 | ], 530 | "text/plain": [ 531 | " timeStamp demand precip temp\n", 532 | "2016-03-13 02:00:00 NaT NaN NaN NaN \n", 533 | "2017-03-12 02:00:00 NaT NaN NaN NaN " 534 | ] 535 | }, 536 | "execution_count": 9, 537 | "metadata": {}, 538 | "output_type": "execute_result" 539 | } 540 | ], 541 | "source": [ 542 | "df.index = df['timeStamp']\n", 543 | "df = df.reindex(dt_idx)\n", 544 | "\n", 545 | "df[df.isnull().all(axis=1)]" 546 | ] 547 | }, 548 | { 549 | "cell_type": "markdown", 550 | "metadata": {}, 551 | "source": [ 552 | "Now that there are no missing periods in the time series, we can start handling missing values by filling as many as possible. Firstly, count the number of missing values in each column:" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 10, 558 | "metadata": {}, 559 | "outputs": [ 560 | { 561 | "data": { 562 | "text/plain": [ 563 | "timeStamp 2 \n", 564 | "demand 83 \n", 565 | "precip 232\n", 566 | "temp 188\n", 567 | "dtype: int64" 568 | ] 569 | }, 570 | "execution_count": 10, 571 | "metadata": {}, 572 | "output_type": "execute_result" 573 | } 574 | ], 575 | "source": [ 576 | "df.isnull().sum()" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": {}, 582 | "source": [ 583 | "Missing timeStamp can be filled from the dataframe index:\n", 584 | "\n" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 11, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "df.loc[df.isnull().all(axis=1), 'timeStamp'] = df.loc[df.isnull().all(axis=1)].index" 594 | ] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "For the other columns, we can fill many missing values by interpolating between the two closest non-missing values. Here, we use a quadratic function and set a limit of 6. This limit means that if more than 6 missing values occur consecutively, the missing values are not interpolated over and they remain missing. 
This is to avoid spurious interpolation between very distant time periods." 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 12, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "df = df.interpolate(limit=6, method='linear')" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "metadata": {}, 615 | "source": [ 616 | "Fill missing precip values with common value of 0:" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 13, 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [ 625 | "precip_mode = np.asscalar(stats.mode(df['precip']).mode)\n", 626 | "df['precip'] = df['precip'].fillna(precip_mode)" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 14, 632 | "metadata": {}, 633 | "outputs": [ 634 | { 635 | "data": { 636 | "text/plain": [ 637 | "timeStamp 0 \n", 638 | "demand 43\n", 639 | "precip 0 \n", 640 | "temp 86\n", 641 | "dtype: int64" 642 | ] 643 | }, 644 | "execution_count": 14, 645 | "metadata": {}, 646 | "output_type": "execute_result" 647 | } 648 | ], 649 | "source": [ 650 | "df.isnull().sum()" 651 | ] 652 | }, 653 | { 654 | "cell_type": "markdown", 655 | "metadata": {}, 656 | "source": [ 657 | "The number of missing values has now been greatly reduced. Records containing the remaining missing values will be removed later after model features have been created." 658 | ] 659 | }, 660 | { 661 | "cell_type": "markdown", 662 | "metadata": {}, 663 | "source": [ 664 | "## Compute features for forecasting models\n", 665 | "After exploring the data, it is clear that the energy demand follows seasonal trends, with daily, weekly and annual periodicity. We will create features that encode this information. First, we compute time driven features based on timeStamp. Note for dayofweek, Monday=0 and Sunday=6." 
666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": 15, 671 | "metadata": {}, 672 | "outputs": [], 673 | "source": [ 674 | "df_features = df.copy()" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": 16, 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [ 683 | "df_features['hour'] = df_features.timeStamp.dt.hour\n", 684 | "df_features['month'] = df_features.timeStamp.dt.month-1\n", 685 | "df_features['dayofweek'] = df_features.timeStamp.dt.dayofweek" 686 | ] 687 | }, 688 | { 689 | "cell_type": "markdown", 690 | "metadata": {}, 691 | "source": [ 692 | "Compute lagged demand features" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": 17, 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [ 701 | "def generate_lagged_features(df, var, max_lag):\n", 702 | " for t in range(1, max_lag+1):\n", 703 | " df[var+'_lag'+str(t)] = df[var].shift(t, freq='1H')" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 18, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [ 712 | "generate_lagged_features(df_features, 'temp', 6)\n", 713 | "generate_lagged_features(df_features, 'demand', 6)" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": 19, 719 | "metadata": {}, 720 | "outputs": [ 721 | { 722 | "data": { 723 | "text/html": [ 724 | "
\n", 725 | "\n", 738 | "\n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | "
timeStampdemandpreciptemphourmonthdayofweektemp_lag1temp_lag2temp_lag3temp_lag4temp_lag5temp_lag6demand_lag1demand_lag2demand_lag3demand_lag4demand_lag5demand_lag6
2012-01-01 00:00:002012-01-01 00:00:004937.50.046.13006NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2012-01-01 01:00:002012-01-01 01:00:004752.10.045.8910646.13NaNNaNNaNNaNNaN4937.5NaNNaNNaNNaNNaN
2012-01-01 02:00:002012-01-01 02:00:004542.60.045.0420645.8946.13NaNNaNNaNNaN4752.14937.5NaNNaNNaNNaN
2012-01-01 03:00:002012-01-01 03:00:004357.70.045.0330645.0445.8946.13NaNNaNNaN4542.64752.14937.5NaNNaNNaN
2012-01-01 04:00:002012-01-01 04:00:004275.50.042.6140645.0345.0445.8946.13NaNNaN4357.74542.64752.14937.5NaNNaN
\n", 876 | "
" 877 | ], 878 | "text/plain": [ 879 | " timeStamp demand precip temp hour month \\\n", 880 | "2012-01-01 00:00:00 2012-01-01 00:00:00 4937.5 0.0 46.13 0 0 \n", 881 | "2012-01-01 01:00:00 2012-01-01 01:00:00 4752.1 0.0 45.89 1 0 \n", 882 | "2012-01-01 02:00:00 2012-01-01 02:00:00 4542.6 0.0 45.04 2 0 \n", 883 | "2012-01-01 03:00:00 2012-01-01 03:00:00 4357.7 0.0 45.03 3 0 \n", 884 | "2012-01-01 04:00:00 2012-01-01 04:00:00 4275.5 0.0 42.61 4 0 \n", 885 | "\n", 886 | " dayofweek temp_lag1 temp_lag2 temp_lag3 temp_lag4 \\\n", 887 | "2012-01-01 00:00:00 6 NaN NaN NaN NaN \n", 888 | "2012-01-01 01:00:00 6 46.13 NaN NaN NaN \n", 889 | "2012-01-01 02:00:00 6 45.89 46.13 NaN NaN \n", 890 | "2012-01-01 03:00:00 6 45.04 45.89 46.13 NaN \n", 891 | "2012-01-01 04:00:00 6 45.03 45.04 45.89 46.13 \n", 892 | "\n", 893 | " temp_lag5 temp_lag6 demand_lag1 demand_lag2 \\\n", 894 | "2012-01-01 00:00:00 NaN NaN NaN NaN \n", 895 | "2012-01-01 01:00:00 NaN NaN 4937.5 NaN \n", 896 | "2012-01-01 02:00:00 NaN NaN 4752.1 4937.5 \n", 897 | "2012-01-01 03:00:00 NaN NaN 4542.6 4752.1 \n", 898 | "2012-01-01 04:00:00 NaN NaN 4357.7 4542.6 \n", 899 | "\n", 900 | " demand_lag3 demand_lag4 demand_lag5 demand_lag6 \n", 901 | "2012-01-01 00:00:00 NaN NaN NaN NaN \n", 902 | "2012-01-01 01:00:00 NaN NaN NaN NaN \n", 903 | "2012-01-01 02:00:00 NaN NaN NaN NaN \n", 904 | "2012-01-01 03:00:00 4937.5 NaN NaN NaN \n", 905 | "2012-01-01 04:00:00 4752.1 4937.5 NaN NaN " 906 | ] 907 | }, 908 | "execution_count": 19, 909 | "metadata": {}, 910 | "output_type": "execute_result" 911 | } 912 | ], 913 | "source": [ 914 | "df_features.head()" 915 | ] 916 | }, 917 | { 918 | "cell_type": "markdown", 919 | "metadata": {}, 920 | "source": [ 921 | "## Final data cleaning and write out training and test datasets\n", 922 | "Count remaining null values." 
923 | ] 924 | }, 925 | { 926 | "cell_type": "code", 927 | "execution_count": 20, 928 | "metadata": {}, 929 | "outputs": [ 930 | { 931 | "data": { 932 | "text/plain": [ 933 | "timeStamp 0 \n", 934 | "demand 43\n", 935 | "precip 0 \n", 936 | "temp 86\n", 937 | "hour 0 \n", 938 | "month 0 \n", 939 | "dayofweek 0 \n", 940 | "temp_lag1 87\n", 941 | "temp_lag2 88\n", 942 | "temp_lag3 89\n", 943 | "temp_lag4 90\n", 944 | "temp_lag5 91\n", 945 | "temp_lag6 92\n", 946 | "demand_lag1 43\n", 947 | "demand_lag2 43\n", 948 | "demand_lag3 43\n", 949 | "demand_lag4 43\n", 950 | "demand_lag5 43\n", 951 | "demand_lag6 43\n", 952 | "dtype: int64" 953 | ] 954 | }, 955 | "execution_count": 20, 956 | "metadata": {}, 957 | "output_type": "execute_result" 958 | } 959 | ], 960 | "source": [ 961 | "df_features.isnull().sum()" 962 | ] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "metadata": {}, 967 | "source": [ 968 | "Count number of rows with any null values" 969 | ] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": 21, 974 | "metadata": {}, 975 | "outputs": [ 976 | { 977 | "data": { 978 | "text/plain": [ 979 | "153" 980 | ] 981 | }, 982 | "execution_count": 21, 983 | "metadata": {}, 984 | "output_type": "execute_result" 985 | } 986 | ], 987 | "source": [ 988 | "df_features.loc[df_features.isnull().any(axis=1), ].shape[0]" 989 | ] 990 | }, 991 | { 992 | "cell_type": "markdown", 993 | "metadata": {}, 994 | "source": [ 995 | "This is a very small proportion of the overall dataset so can be safely dropped." 996 | ] 997 | }, 998 | { 999 | "cell_type": "code", 1000 | "execution_count": 22, 1001 | "metadata": {}, 1002 | "outputs": [], 1003 | "source": [ 1004 | "df_features.dropna(how='any', inplace=True)" 1005 | ] 1006 | }, 1007 | { 1008 | "cell_type": "markdown", 1009 | "metadata": {}, 1010 | "source": [ 1011 | "Split data into training and test datasets. All data after 1st July 2016 is reserved for the test set." 
1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": 23, 1017 | "metadata": {}, 1018 | "outputs": [], 1019 | "source": [ 1020 | "train, test = (df_features.loc[df_features['timeStamp']<'2016-07-01'], df_features.loc[df_features['timeStamp']>='2016-07-01'])" 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "markdown", 1025 | "metadata": {}, 1026 | "source": [ 1027 | "## Instantiate Auto ML Regressor\n", 1028 | "\n", 1029 | "Instantiate a AutoML Object This creates an Experiment in Azure ML. You can reuse this objects to trigger multiple runs. Each run will be part of the same experiment.\n", 1030 | "\n", 1031 | "|Property|Description|\n", 1032 | "|-|-|\n", 1033 | "|**primary_metric**|This is the metric that you want to optimize.
Auto ML Regressor supports the following primary metrics
spearman_correlation
normalized_root_mean_squared_error
r2_score|\n", 1034 | "|**max_time_sec**|Time limit in seconds for each iterations|\n", 1035 | "|**iterations**|Number of iterations. In each iteration Auto ML Classifier trains the data with a specific pipeline|\n", 1036 | "|**n_cross_validations**|Number of cross validation splits|" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "code", 1041 | "execution_count": 24, 1042 | "metadata": {}, 1043 | "outputs": [], 1044 | "source": [ 1045 | "from azureml.train.automl import AutoMLRegressor\n", 1046 | "import time\n", 1047 | "import logging\n", 1048 | "\n", 1049 | "automl_regressor = AutoMLRegressor(project = project,\n", 1050 | " name = \"AutoML_Demo_Experiment_v3\",\n", 1051 | " max_time_sec = 600,\n", 1052 | " iterations = 10,\n", 1053 | " primary_metric = 'normalized_root_mean_squared_error', \n", 1054 | " n_cross_validations = 5,\n", 1055 | " debug_log = 'automl.log',\n", 1056 | " verbosity = logging.INFO)" 1057 | ] 1058 | }, 1059 | { 1060 | "cell_type": "code", 1061 | "execution_count": 25, 1062 | "metadata": {}, 1063 | "outputs": [ 1064 | { 1065 | "data": { 1066 | "text/plain": [ 1067 | "numpy.ndarray" 1068 | ] 1069 | }, 1070 | "execution_count": 25, 1071 | "metadata": {}, 1072 | "output_type": "execute_result" 1073 | } 1074 | ], 1075 | "source": [ 1076 | "X = train.drop(['demand', 'timeStamp'], axis=1)\n", 1077 | "X.head()\n", 1078 | "type(X)\n", 1079 | "y = train['demand']\n", 1080 | "y = y.values\n", 1081 | "type(y)\n", 1082 | "#X = X.values\n", 1083 | "type(y)" 1084 | ] 1085 | }, 1086 | { 1087 | "cell_type": "markdown", 1088 | "metadata": {}, 1089 | "source": [ 1090 | "## Training the Model\n", 1091 | "\n", 1092 | "You can call the fit method on the AutoML instance and pass the run configuration. For Local runs the execution is synchronous. 
Depending on the data and number of iterations this can run for while.\n", 1093 | "You will see the currently running iterations printing to the console.\n", 1094 | "\n", 1095 | "*fit* method on Auto ML Regressor triggers the training of the model. It can be called with the following parameters\n", 1096 | "\n", 1097 | "|**Parameter**|**Description**|\n", 1098 | "|-|-|\n", 1099 | "|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n", 1100 | "|**y**|(sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]
Multi-class targets. An indicator matrix turns on multilabel classification.|\n", 1101 | "|**compute_target**|Indicates the compute used for training. local indicates train on the same compute which hosts the jupyter notebook.
For DSVM and Batch AI please refer to the relevant notebooks.|\n", 1102 | "|**show_output**| True/False to turn on/off console output|" 1103 | ] 1104 | }, 1105 | { 1106 | "cell_type": "code", 1107 | "execution_count": 26, 1108 | "metadata": {}, 1109 | "outputs": [ 1110 | { 1111 | "name": "stdout", 1112 | "output_type": "stream", 1113 | "text": [ 1114 | "Running locally\n", 1115 | "Parent Run ID: AutoML_eeecbb34-59f9-4e58-a757-6897e2d6666e\n", 1116 | "***********************************************************************************************\n", 1117 | "ITERATION: The iteration being evaluated.\n", 1118 | "PIPELINE: A summary description of the pipeline being evaluated.\n", 1119 | "DURATION: Time taken for the current iteration.\n", 1120 | "METRIC: The result of computing score on the fitted pipeline.\n", 1121 | "BEST: The best observed score thus far.\n", 1122 | "***********************************************************************************************\n", 1123 | "\n", 1124 | " ITERATION PIPELINE DURATION METRIC BEST\n", 1125 | " 0 Normalize RF regressor 0:00:09.988100 0.050 0.050\n", 1126 | " 1 Normalize lightGBM regressor 0:00:14.730993 0.010 0.010\n", 1127 | " 2 Robust Scaler extra trees regressor 0:00:20.434624 0.021 0.010\n", 1128 | " 3 Scale 0/1 Lasso lars 0:00:08.538191 0.010 0.010\n", 1129 | " 4 Normalize lightGBM regressor 0:00:11.922587 0.008 0.008\n", 1130 | " 5 Normalize lightGBM regressor 0:00:19.237972 0.006 0.006\n", 1131 | " 6 Normalizer Elastic net 0:00:08.036231 0.139 0.006\n", 1132 | " 7 Robust Scaler DT regressor 0:00:08.702193 0.043 0.006\n", 1133 | " 8 Robust Scaler DT regressor 0:00:08.811703 0.021 0.006\n", 1134 | " 9 Robust Scaler lightGBM regressor 0:05:14.171169 0.023 0.006\n" 1135 | ] 1136 | } 1137 | ], 1138 | "source": [ 1139 | "local_run = automl_regressor.fit( X = X, \n", 1140 | " y = y, \n", 1141 | " compute_target = 'local', \n", 1142 | " show_output = True ) " 1143 | ] 1144 | }, 1145 | { 1146 | "cell_type": "markdown", 
1147 | "metadata": {}, 1148 | "source": [ 1149 | "## Exploring the results" 1150 | ] 1151 | }, 1152 | { 1153 | "cell_type": "markdown", 1154 | "metadata": {}, 1155 | "source": [ 1156 | "#### Widget for monitoring runs\n", 1157 | "\n", 1158 | "The widget will sit on \"loading\" until the first iteration has completed, then you will see an auto-updating graph and table show up. It refreshes once per minute, so you should see the graph update as child runs complete.\n", 1159 | "\n", 1160 | "NOTE: The widget displays a link at the bottom. This links to a web UI to explore the individual run details." 1161 | ] 1162 | }, 1163 | { 1164 | "cell_type": "code", 1165 | "execution_count": 27, 1166 | "metadata": {}, 1167 | "outputs": [ 1168 | { 1169 | "data": { 1170 | "application/vnd.jupyter.widget-view+json": { 1171 | "model_id": "be4cc19f50e14d6997ce57722a2d2d2e", 1172 | "version_major": 2, 1173 | "version_minor": 0 1174 | }, 1175 | "text/plain": [ 1176 | "AutoML(widget_settings={'childWidgetDisplay': 'popup'})" 1177 | ] 1178 | }, 1179 | "metadata": {}, 1180 | "output_type": "display_data" 1181 | } 1182 | ], 1183 | "source": [ 1184 | "from azureml.train.widgets import RunDetails\n", 1185 | "RunDetails(local_run).show() " 1186 | ] 1187 | }, 1188 | { 1189 | "cell_type": "markdown", 1190 | "metadata": {}, 1191 | "source": [ 1192 | "\n", 1193 | "#### Retrieve All Child Runs\n", 1194 | "You can also use SDK methods to fetch all the child runs and see individual metrics that we log. 
" 1195 | ] 1196 | }, 1197 | { 1198 | "cell_type": "code", 1199 | "execution_count": 28, 1200 | "metadata": { 1201 | "scrolled": false 1202 | }, 1203 | "outputs": [ 1204 | { 1205 | "data": { 1206 | "text/html": [ 1207 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " 
\n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | "
0123456789
explained_variance0.8835690.9948740.9796240.9949530.996720.9984580.08820750.9144480.9783540.99693
mean_absolute_error336.03656.7647120.51664.204950.271733.2532918.14256.774131.19181.768
median_absolute_error283.91238.216180.902744.587935.780122.9815738.36175.63494.0057172.511
normalized_mean_absolute_error0.03909040.006603320.01401940.007468810.005847990.003868270.1068050.02986990.0152610.0211446
normalized_median_absolute_error0.03302680.004445590.009411230.005186810.004162220.002673390.08589180.02043110.01093550.0200678
normalized_root_mean_squared_error0.04972620.01043450.02080410.0103560.008346780.005721730.1392030.0425920.02144370.0225197
normalized_root_mean_squared_error_min0.04972620.01043450.01043450.0103560.008346780.005721730.005721730.005721730.005721730.00572173
normalized_root_mean_squared_log_error8.07741e-061.62331e-063.27666e-061.7441e-061.34689e-069.06972e-072.25201e-056.57259e-063.479e-063.68513e-06
r2_score0.8835480.9948730.9796240.9949520.996720.9984580.08805610.9144410.9783530.976128
root_mean_squared_error427.46689.699178.84189.024571.752349.18631196.64366.138184.338193.589
root_mean_squared_log_error0.06943660.01395460.02816740.0149930.01157840.007796690.1935920.05650060.02990680.0316789
spearman_correlation0.9442060.9979630.9918090.9967290.9982240.999220.6396450.952670.9892390.999091
" 1609 | ], 1610 | "text/plain": [ 1611 | "" 1612 | ] 1613 | }, 1614 | "execution_count": 28, 1615 | "metadata": {}, 1616 | "output_type": "execute_result" 1617 | } 1618 | ], 1619 | "source": [ 1620 | "children = list(local_run.get_children())\n", 1621 | "metricslist = {}\n", 1622 | "for run in children:\n", 1623 | " properties = run.get_properties()\n", 1624 | " metrics = {k: v for k, v in run.get_metrics().items() if not isinstance(v, list)} \n", 1625 | " metricslist[int(properties['iteration'])] = metrics\n", 1626 | " \n", 1627 | "import pandas as pd\n", 1628 | "import seaborn as sns\n", 1629 | "rundata = pd.DataFrame(metricslist).sort_index(1)\n", 1630 | "cm = sns.light_palette(\"lightgreen\", as_cmap = True)\n", 1631 | "s = rundata.style.background_gradient(cmap = cm)\n", 1632 | "s" 1633 | ] 1634 | }, 1635 | { 1636 | "cell_type": "markdown", 1637 | "metadata": {}, 1638 | "source": [ 1639 | "### Retrieve the Best Model\n", 1640 | "\n", 1641 | "Below we select the best pipeline from our iterations. The *get_output* method on automl_classifier returns the best run and the fitted model for the last *fit* invocation. There are overloads on *get_output* that allow you to retrieve the best run and fitted model for *any* logged metric or a particular *iteration*." 
1642 | ] 1643 | }, 1644 | { 1645 | "cell_type": "code", 1646 | "execution_count": 29, 1647 | "metadata": { 1648 | "scrolled": true 1649 | }, 1650 | "outputs": [ 1651 | { 1652 | "name": "stdout", 1653 | "output_type": "stream", 1654 | "text": [ 1655 | "Run({'id': 'AutoML_eeecbb34-59f9-4e58-a757-6897e2d6666e_5', 'type': None, 'status': 'Completed'})\n", 1656 | "Pipeline(memory=None,\n", 1657 | " steps=[('Normalize', StandardScaler(copy=True, with_mean=False, with_std=True)), ('lightGBM regressor', )])\n" 1658 | ] 1659 | } 1660 | ], 1661 | "source": [ 1662 | "best_run, fitted_model = local_run.get_output()\n", 1663 | "print(best_run)\n", 1664 | "print(fitted_model)" 1665 | ] 1666 | }, 1667 | { 1668 | "cell_type": "markdown", 1669 | "metadata": {}, 1670 | "source": [ 1671 | "#### Best Model based on any other metric\n", 1672 | "Show the run and model that has the smallest `spearman_correlation` value:" 1673 | ] 1674 | }, 1675 | { 1676 | "cell_type": "code", 1677 | "execution_count": 37, 1678 | "metadata": {}, 1679 | "outputs": [ 1680 | { 1681 | "name": "stdout", 1682 | "output_type": "stream", 1683 | "text": [ 1684 | "Run({'id': 'AutoML_eeecbb34-59f9-4e58-a757-6897e2d6666e_5', 'type': None, 'status': 'Completed'})\n", 1685 | "Pipeline(memory=None,\n", 1686 | " steps=[('Normalize', StandardScaler(copy=True, with_mean=False, with_std=True)), ('lightGBM regressor', )])\n" 1687 | ] 1688 | } 1689 | ], 1690 | "source": [ 1691 | "lookup_metric = \"spearman_correlation\"\n", 1692 | "best_run, fitted_model = local_run.get_output(metric=lookup_metric)\n", 1693 | "print(best_run)\n", 1694 | "print(fitted_model)" 1695 | ] 1696 | }, 1697 | { 1698 | "cell_type": "markdown", 1699 | "metadata": {}, 1700 | "source": [ 1701 | "#### Best Model based on any iteration\n", 1702 | "Simply show the run and model from the 3rd iteration:" 1703 | ] 1704 | }, 1705 | { 1706 | "cell_type": "code", 1707 | "execution_count": 38, 1708 | "metadata": {}, 1709 | "outputs": [ 1710 | { 1711 | "name": 
"stdout", 1712 | "output_type": "stream", 1713 | "text": [ 1714 | "Run({'id': 'AutoML_eeecbb34-59f9-4e58-a757-6897e2d6666e_3', 'type': None, 'status': 'Completed'})\n", 1715 | "Pipeline(memory=None,\n", 1716 | " steps=[('Scale 0/1', MinMaxScaler(copy=True, feature_range=(0, 1))), ('Lasso lars', LassoLars(alpha=0.001, copy_X=True, eps=2.220446049250313e-16,\n", 1717 | " fit_intercept=True, fit_path=True, max_iter=500, normalize=True,\n", 1718 | " positive=False, precompute='auto', verbose=False))])\n" 1719 | ] 1720 | } 1721 | ], 1722 | "source": [ 1723 | "iteration = 3\n", 1724 | "third_run, third_model = local_run.get_output(iteration = iteration)\n", 1725 | "print(third_run)\n", 1726 | "print(third_model)" 1727 | ] 1728 | }, 1729 | { 1730 | "cell_type": "markdown", 1731 | "metadata": {}, 1732 | "source": [ 1733 | "### Register fitted model for deployment" 1734 | ] 1735 | }, 1736 | { 1737 | "cell_type": "code", 1738 | "execution_count": 39, 1739 | "metadata": { 1740 | "scrolled": true 1741 | }, 1742 | "outputs": [ 1743 | { 1744 | "name": "stdout", 1745 | "output_type": "stream", 1746 | "text": [ 1747 | "Registering model AutoMLeeecbb345best\n", 1748 | "AutoMLeeecbb345best\n" 1749 | ] 1750 | } 1751 | ], 1752 | "source": [ 1753 | "description = 'AutoML Model'\n", 1754 | "tags = None\n", 1755 | "local_run.register_model(description = description, tags = tags)\n", 1756 | "print(local_run.model_id) # Use this id to deploy the model as a web service in Azure" 1757 | ] 1758 | }, 1759 | { 1760 | "cell_type": "markdown", 1761 | "metadata": {}, 1762 | "source": [ 1763 | "### Testing the Fitted Model" 1764 | ] 1765 | }, 1766 | { 1767 | "cell_type": "markdown", 1768 | "metadata": {}, 1769 | "source": [ 1770 | "Predict on training and test set, and calculate residual values." 
1771 | ] 1772 | }, 1773 | { 1774 | "cell_type": "code", 1775 | "execution_count": 40, 1776 | "metadata": { 1777 | "scrolled": false 1778 | }, 1779 | "outputs": [], 1780 | "source": [ 1781 | "x_train = train.drop(['demand', 'timeStamp'], axis=1)\n", 1782 | "y_train = train['demand']\n", 1783 | "y_pred_train = fitted_model.predict(x_train)\n", 1784 | "#y_residual_train = y_train - y_pred_train\n", 1785 | "\n", 1786 | "x_test = test.drop(['demand', 'timeStamp'], axis=1)\n", 1787 | "y_test = test['demand']\n", 1788 | "y_pred_test = fitted_model.predict(x_test)\n", 1789 | "#y_residual_test = y_test - y_pred_test\n", 1790 | "\n", 1791 | "\n", 1792 | "expected = y_train\n", 1793 | "predictions = y_pred_train\n", 1794 | "forecast_errors = [expected[i]-predictions[i] for i in range(len(expected))]\n", 1795 | "#print('Forecast Errors: %s' % forecast_errors)" 1796 | ] 1797 | } 1798 | ], 1799 | "metadata": { 1800 | "kernelspec": { 1801 | "display_name": "Python [conda env:myenv]", 1802 | "language": "python", 1803 | "name": "conda-env-myenv-py" 1804 | }, 1805 | "language_info": { 1806 | "codemirror_mode": { 1807 | "name": "ipython", 1808 | "version": 3 1809 | }, 1810 | "file_extension": ".py", 1811 | "mimetype": "text/x-python", 1812 | "name": "python", 1813 | "nbconvert_exporter": "python", 1814 | "pygments_lexer": "ipython3", 1815 | "version": "3.6.6" 1816 | } 1817 | }, 1818 | "nbformat": 4, 1819 | "nbformat_minor": 2 1820 | } 1821 | --------------------------------------------------------------------------------