├── .gitignore ├── 01-Create-Datasets ├── 01-create-retail-datasets.ipynb ├── 02-create-online-retail-II-datasets.ipynb ├── 03-create-air-quality-dataset.ipynb ├── 04-create-air-passengers-dataset.ipynb └── 05-create-electricity-demand-dataset.ipynb ├── 02-Tabularizing-Time-Series ├── 01-data-analysis-air-pollutants.ipynb ├── 02-feature-engineering-air-pollutants.ipynb └── 03-forecasting-air-pollutants.ipynb ├── 03-Challenges-in-Time-Series-Forecasting ├── 01-Refactoring-feature-engineering.ipynb ├── 02-forecasting-one-step-ahead.ipynb ├── 03-multistep-forecasting-direct.ipynb ├── 04-multistep-forecasting-recursive.ipynb └── 05-multistep-forecasting-recursive-continued.ipynb ├── 04-Time-Series-Decomposition ├── 01-box-cox-transform.ipynb ├── 02-compute-moving-averages.ipynb ├── 03-classical-decomposition-to-compute-trend-and-seasonality.ipynb ├── 04-LOWESS-to-compute-trend.ipynb ├── 05-STL-to-compute-trend-and-seasonality.ipynb └── 06-MSTL-decomposition.ipynb ├── 05-Missing-Data ├── 01-impute-missing-data-using-forward-fill-backward-fill.ipynb ├── 02-impute-missing-data-using-linear-and-spline-interpolation.ipynb └── 03-impute-missing-data-using-STL-decomposition-and-interpolation.ipynb ├── 06-Outliers ├── 01-detect-outliers-using-rolling-statistics.ipynb ├── 02-detect-outliers-using-residuals-LOWESS.ipynb ├── 03-detect-outliers-using-residuals-STL.ipynb └── 04-modelling-outliers-with-dummy-variables.ipynb ├── 07-Lag-Features ├── 01-computing-lags.ipynb ├── 02-lag-plots.ipynb ├── 03-autocorrelation-function.ipynb ├── 04-partial-autocorrelation-function.ipynb ├── 05-cross-correlation-function.ipynb ├── 06-air-pollution-example-domain-knowledge.ipynb ├── 07-air-pollution-example-modelling.ipynb └── 08-air-pollution-example-correlation.ipynb ├── 08-Window-Features ├── 01-rolling-window-features.ipynb ├── 02-expanding-window-features.ipynb ├── 03-weighted-rolling-window-features.ipynb ├── 04-exponential-weights.ipynb └── 05-window-features-with-feature-selection.ipynb 
├── 09-Trend-Features ├── 01-time-linear-trend.ipynb ├── 02-time-non-linear-trend.ipynb ├── 03-recursive-forecasting-example.ipynb ├── 04-piecewise-linear-trend-and-changepoints.ipynb ├── 05-tree-based-models-and-trend.ipynb ├── 06-linear-trees-lightgbm.ipynb └── images │ ├── forecast_with_just_time.png │ └── recursive_forecasting │ ├── Slide1.png │ ├── Slide2.png │ ├── Slide3.png │ └── Slide4.png ├── 10-Seasonality-Features ├── 01-seasonal-lags.ipynb ├── 02-datetime-features-seasonality.ipynb ├── 03-seasonal-dummies.ipynb └── 04-fourier-features.ipynb ├── 11-Time-Features ├── 01-Extracting-date-related-features.ipynb ├── 02-Extracting-time-related-features.ipynb ├── 03-datetime-with-Feature-engine.ipynb ├── 04-periodic-features.ipynb ├── 05-highlighting-holidays-sandbox.ipynb └── 05-highlighting-holidays.ipynb ├── 12-Categorical-Encoding ├── 1-one-hot-encoding.ipynb ├── 2-ordinal-encoding.ipynb ├── 3-mean-encoding-simple.ipynb └── 4-mean-encoding-expanding-window.ipynb ├── Appendix └── 00-pandas-period.ipynb ├── Datasets └── .gitkeep ├── LICENSE ├── README.md ├── assignments └── 02-tabularizing-time-series │ ├── assignment.ipynb │ └── solution.ipynb ├── images ├── FETSF_banner.png ├── forecasting_framework.png ├── lag_features.png ├── trainindata.png └── window_features.png └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Jupyter Notebook 2 | .ipynb_checkpoints 3 | 4 | # datasets 5 | *.csv 6 | *.zip 7 | *.xlsx 8 | 9 | # folders 10 | 11 | -------------------------------------------------------------------------------- /01-Create-Datasets/01-create-retail-datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c81efda5", 6 | "metadata": {}, 7 | "source": [ 8 | "# Retail sales\n", 9 | "\n", 10 | "In this notebook we will prepare and store the retail sales dataset found 
[here](https://raw.githubusercontent.com/facebook/prophet/master/examples/example_retail_sales.csv).\n", 11 | "\n", 12 | "**Description of data:**\n", 13 | "\n", 14 | "The timeseries is collected between January 1992 and May 2016. It consists of a single series of monthly values representing sales volumes. " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "id": "888749e6", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "import matplotlib.pyplot as plt\n", 27 | "\n", 28 | "from statsmodels.tsa.seasonal import STL" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "25cc2a1f", 34 | "metadata": {}, 35 | "source": [ 36 | "# Get the dataset" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "73ac5d57", 42 | "metadata": {}, 43 | "source": [ 44 | "The dataset can be obtained from this [link](https://raw.githubusercontent.com/facebook/prophet/master/examples/example_retail_sales.csv). It will open a raw file in GitHub. A simple way of obtaining the data is to copy and paste the values from your browser into a text editor of your choice. \n", 45 | "Save it in the Datasets directory, which is found at the root of this project, with the filename `example_retail_sales.csv`. \n", 46 | "\n", 47 | "Alternatively, download it using Pandas by running:\n", 48 | "\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "id": "15c6a149", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "url = \"https://raw.githubusercontent.com/facebook/prophet/master/examples/example_retail_sales.csv\"\n", 59 | "df = pd.read_csv(url)\n", 60 | "df.to_csv(\"../Datasets/example_retail_sales.csv\", index=False)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "5feac9ec", 66 | "metadata": {}, 67 | "source": [ 68 | "Now follow the rest of the notebook." 
69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "id": "707768c5", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "df = pd.read_csv(\n", 79 | " \"../Datasets/example_retail_sales.csv\",\n", 80 | " parse_dates=[\"ds\"],\n", 81 | " index_col=[\"ds\"],\n", 82 | " nrows=160,\n", 83 | ")" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "three-blind", 89 | "metadata": {}, 90 | "source": [ 91 | "# Create dataset with missing data" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "id": "112f9b90", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# copy dataframe\n", 102 | "df_with_missing_data = df.copy()\n", 103 | "\n", 104 | "# Insert missing data into dataframe\n", 105 | "df_with_missing_data.iloc[10:11] = np.NaN\n", 106 | "df_with_missing_data.iloc[25:28] = np.NaN\n", 107 | "df_with_missing_data.iloc[40:45] = np.NaN\n", 108 | "df_with_missing_data.iloc[70:94] = np.NaN" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "id": "45acce8b", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Save dataset in Datasets directory\n", 119 | "df_with_missing_data.to_csv(\"../Datasets/example_retail_sales_with_missing_data.csv\")" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "80293d1b", 125 | "metadata": {}, 126 | "source": [ 127 | "# Create dataset with outliers" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 6, 133 | "id": "b78e8d57", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "df_with_outliers = df.copy()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "id": "57bf7198", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# Insert outliers into dataframe\n", 148 | "outlier_idx = [20, 33, 66, 150]\n", 149 | "df_with_outliers.iloc[outlier_idx] = df_with_outliers.iloc[outlier_idx] * 1.7" 150 
| ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 8, 155 | "id": "ce560e64", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "# Save dataset in Datasets directory\n", 160 | "df_with_outliers.to_csv(\"../Datasets/example_retail_sales_with_outliers.csv\")" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "41606a6b", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "fets", 175 | "language": "python", 176 | "name": "fets" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.8.2" 189 | }, 190 | "toc": { 191 | "base_numbering": 1, 192 | "nav_menu": {}, 193 | "number_sections": true, 194 | "sideBar": true, 195 | "skip_h1_title": false, 196 | "title_cell": "Table of Contents", 197 | "title_sidebar": "Contents", 198 | "toc_cell": false, 199 | "toc_position": {}, 200 | "toc_section_display": true, 201 | "toc_window_display": false 202 | } 203 | }, 204 | "nbformat": 4, 205 | "nbformat_minor": 5 206 | } 207 | -------------------------------------------------------------------------------- /01-Create-Datasets/02-create-online-retail-II-datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Online Retail II Data Set\n", 8 | "\n", 9 | "In this notebook we will prepare and store the Online Retail II Data Set stored on the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Online+Retail+II)\n", 10 | "\n", 11 | "\n", 12 | "**Citation:**\n", 13 | "\n", 14 | "Dua, D. and Graff, C. (2019). 
UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.\n", 15 | "\n", 16 | "## Download the data\n", 17 | "\n", 18 | "- Navigate to the [data folder](https://archive.ics.uci.edu/dataset/502/online+retail+ii).\n", 19 | "- Download the file called **online_retail_II.xlsx**.\n", 20 | "- Save the Excel file into the **datasets** folder at the root of this repository." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import pandas as pd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# Load data" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# If you downloaded and stored the file as explained\n", 46 | "# above, it should be located here:\n", 47 | "\n", 48 | "file = \"../Datasets/online_retail_II.xlsx\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# The data is provided as two sheets in a single Excel file.\n", 58 | "# Each sheet contains a different time period.\n", 59 | "# Load both and join into a single dataframe.\n", 60 | "\n", 61 | "df_1 = pd.read_excel(file, sheet_name=\"Year 2009-2010\")\n", 62 | "df_2 = pd.read_excel(file, sheet_name=\"Year 2010-2011\")\n", 63 | "\n", 64 | "df = pd.concat([df_1, df_2])" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/html": [ 75 | "
\n", 76 | "\n", 89 | "\n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | "
InvoiceStockCodeDescriptionQuantityInvoiceDatePriceCustomer IDCountry
04894348504815CM CHRISTMAS GLASS BALL 20 LIGHTS122009-12-01 07:45:006.9513085.0United Kingdom
148943479323PPINK CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom
248943479323WWHITE CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom
348943422041RECORD FRAME 7\" SINGLE SIZE482009-12-01 07:45:002.1013085.0United Kingdom
448943421232STRAWBERRY CERAMIC TRINKET BOX242009-12-01 07:45:001.2513085.0United Kingdom
\n", 161 | "
" 162 | ], 163 | "text/plain": [ 164 | " Invoice StockCode Description Quantity \\\n", 165 | "0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 \n", 166 | "1 489434 79323P PINK CHERRY LIGHTS 12 \n", 167 | "2 489434 79323W WHITE CHERRY LIGHTS 12 \n", 168 | "3 489434 22041 RECORD FRAME 7\" SINGLE SIZE 48 \n", 169 | "4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 \n", 170 | "\n", 171 | " InvoiceDate Price Customer ID Country \n", 172 | "0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom \n", 173 | "1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n", 174 | "2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n", 175 | "3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom \n", 176 | "4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom " 177 | ] 178 | }, 179 | "execution_count": 4, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "# Inspect dataframe\n", 186 | "\n", 187 | "df.head()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 5, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "# Rename columns\n", 197 | "\n", 198 | "df.columns = [\n", 199 | " \"invoice\",\n", 200 | " \"stock_code\",\n", 201 | " \"description\",\n", 202 | " \"quantity\",\n", 203 | " \"invoice_date\",\n", 204 | " \"price\",\n", 205 | " \"customer_id\",\n", 206 | " \"country\",\n", 207 | "]" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "# Process data" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "Remove null customer ids." 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 6, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "mask = ~df[\"customer_id\"].isnull()\n", 231 | "df = df[mask]" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "Create a flag for when an order is cancelled. 
Cancelled orders contain \n", 239 | "the letter `C` at the start of the invoice." 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 7, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "df[\"is_cancelled\"] = df[\"invoice\"].apply(lambda x: str(x)[0] == \"C\")" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "Remove transactions which are negative quantities sold and are not cancelled orders." 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 8, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "mask = ~(~df[\"is_cancelled\"] & df[\"quantity\"] < 0)\n", 265 | "\n", 266 | "df = df[mask]" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "Compute revenue." 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 9, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "df[\"revenue\"] = df[\"quantity\"] * df[\"price\"]" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "To compute gross revenue and quantity sold we filter out cancelled orders.\n", 290 | "\n", 291 | "After this, we resample the data at a weekly level." 
292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 10, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "mask = ~df[\"is_cancelled\"]\n", 301 | "\n", 302 | "# If running this raises an UnsupportedFunctionCall error\n", 303 | "# try upgrading your version of pandas.\n", 304 | "df_gross = (\n", 305 | " df.loc[mask, [\"invoice_date\", \"quantity\", \"revenue\", \"country\"]]\n", 306 | " .groupby(\"country\")\n", 307 | " .resample(\"W\", on=\"invoice_date\")\n", 308 | " .sum(numeric_only=True)\n", 309 | ")" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 11, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "df_gross.index.rename([\"country\", \"week\"], inplace=True)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "# Save data\n", 326 | "\n", 327 | "We will save 3 different versions of the preprocessed dataset for different demos.\n", 328 | "\n", 329 | "## Weekly sampled" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 12, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "df_gross_countries = df_gross.reset_index(level=\"country\")" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 13, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "countries = [\n", 348 | " 'United Kingdom',\n", 349 | " 'Belgium',\n", 350 | " \"EIRE\",\n", 351 | " 'Germany',\n", 352 | " \"France\",\n", 353 | " 'Spain',\n", 354 | "]" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 14, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "df_gross_countries[df_gross_countries[\"country\"].isin(countries)].to_csv(\n", 364 | " \"../Datasets/online_retail_dataset_countries.csv\",\n", 365 | " index=True,\n", 366 | ")" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "## 
Unstacked countries" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 15, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "y = df_gross.unstack(\"country\")[\"revenue\"]" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 16, 388 | "metadata": {}, 389 | "outputs": [ 390 | { 391 | "data": { 392 | "text/html": [ 393 | "
\n", 394 | "\n", 407 | "\n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " 
\n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | "
countryAustraliaAustriaBahrainBelgiumBrazilCanadaChannel IslandsCyprusCzech RepublicDenmark...SingaporeSpainSwedenSwitzerlandThailandUSAUnited Arab EmiratesUnited KingdomUnspecifiedWest Indies
week
2009-12-06196.1NaNNaN439.1NaNNaN989.18760.69NaN1008.00...NaN435.88NaNNaNNaN141.0NaN213000.35NaNNaN
2009-12-130.01429.83NaN8.5NaNNaN0.000.00NaN0.00...NaN412.60285.3NaNNaN0.0517.7195810.04NaNNaN
2009-12-2075.00.00NaN0.0NaNNaN0.002796.29NaN429.66...NaN1952.640.0589.4NaN0.00.0182396.74NaNNaN
2009-12-270.0568.51NaN0.0NaNNaN0.000.00NaN0.00...NaN5149.060.00.0NaN0.00.022007.77NaNNaN
2010-01-030.00.00NaN0.0NaNNaN0.000.00NaN0.00...NaN0.000.00.0NaN0.00.00.00NaNNaN
\n", 581 | "

5 rows × 41 columns

\n", 582 | "
" 583 | ], 584 | "text/plain": [ 585 | "country Australia Austria Bahrain Belgium Brazil Canada \\\n", 586 | "week \n", 587 | "2009-12-06 196.1 NaN NaN 439.1 NaN NaN \n", 588 | "2009-12-13 0.0 1429.83 NaN 8.5 NaN NaN \n", 589 | "2009-12-20 75.0 0.00 NaN 0.0 NaN NaN \n", 590 | "2009-12-27 0.0 568.51 NaN 0.0 NaN NaN \n", 591 | "2010-01-03 0.0 0.00 NaN 0.0 NaN NaN \n", 592 | "\n", 593 | "country Channel Islands Cyprus Czech Republic Denmark ... Singapore \\\n", 594 | "week ... \n", 595 | "2009-12-06 989.18 760.69 NaN 1008.00 ... NaN \n", 596 | "2009-12-13 0.00 0.00 NaN 0.00 ... NaN \n", 597 | "2009-12-20 0.00 2796.29 NaN 429.66 ... NaN \n", 598 | "2009-12-27 0.00 0.00 NaN 0.00 ... NaN \n", 599 | "2010-01-03 0.00 0.00 NaN 0.00 ... NaN \n", 600 | "\n", 601 | "country Spain Sweden Switzerland Thailand USA \\\n", 602 | "week \n", 603 | "2009-12-06 435.88 NaN NaN NaN 141.0 \n", 604 | "2009-12-13 412.60 285.3 NaN NaN 0.0 \n", 605 | "2009-12-20 1952.64 0.0 589.4 NaN 0.0 \n", 606 | "2009-12-27 5149.06 0.0 0.0 NaN 0.0 \n", 607 | "2010-01-03 0.00 0.0 0.0 NaN 0.0 \n", 608 | "\n", 609 | "country United Arab Emirates United Kingdom Unspecified West Indies \n", 610 | "week \n", 611 | "2009-12-06 NaN 213000.35 NaN NaN \n", 612 | "2009-12-13 517.7 195810.04 NaN NaN \n", 613 | "2009-12-20 0.0 182396.74 NaN NaN \n", 614 | "2009-12-27 0.0 22007.77 NaN NaN \n", 615 | "2010-01-03 0.0 0.00 NaN NaN \n", 616 | "\n", 617 | "[5 rows x 41 columns]" 618 | ] 619 | }, 620 | "execution_count": 16, 621 | "metadata": {}, 622 | "output_type": "execute_result" 623 | } 624 | ], 625 | "source": [ 626 | "y.head()" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 17, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "y.to_csv(\"../Datasets/online_retail_dataset.csv\")" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": {}, 641 | "source": [ 642 | "## Raw data" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": 18, 
648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "# columns needed for demo\n", 652 | "cols = [\"invoice_date\", \"description\", \"revenue\"]\n", 653 | "\n", 654 | "# just UK\n", 655 | "df = df[df[\"country\"] == \"United Kingdom\"]\n", 656 | "\n", 657 | "# save\n", 658 | "df[cols].to_csv(\"../Datasets/online_retail_dataset_all.csv\", index=False)" 659 | ] 660 | } 661 | ], 662 | "metadata": { 663 | "kernelspec": { 664 | "display_name": "Python 3 (ipykernel)", 665 | "language": "python", 666 | "name": "python3" 667 | }, 668 | "language_info": { 669 | "codemirror_mode": { 670 | "name": "ipython", 671 | "version": 3 672 | }, 673 | "file_extension": ".py", 674 | "mimetype": "text/x-python", 675 | "name": "python", 676 | "nbconvert_exporter": "python", 677 | "pygments_lexer": "ipython3", 678 | "version": "3.8.7" 679 | }, 680 | "toc": { 681 | "base_numbering": 1, 682 | "nav_menu": {}, 683 | "number_sections": true, 684 | "sideBar": true, 685 | "skip_h1_title": false, 686 | "title_cell": "Table of Contents", 687 | "title_sidebar": "Contents", 688 | "toc_cell": false, 689 | "toc_position": {}, 690 | "toc_section_display": true, 691 | "toc_window_display": false 692 | } 693 | }, 694 | "nbformat": 4, 695 | "nbformat_minor": 4 696 | } 697 | -------------------------------------------------------------------------------- /01-Create-Datasets/03-create-air-quality-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Air Quality Data Set\n", 8 | "\n", 9 | "In this notebook we will prepare and store the Air Quality Data Set from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Air+Quality)\n", 10 | "\n", 11 | "**Citation:**\n", 12 | "\n", 13 | "Dua, D. and Graff, C. (2019). [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml). 
Irvine, CA: University of California, School of Information and Computer Science.\n", 14 | "\n", 15 | "\n", 16 | "## Download and unzip the data\n", 17 | "\n", 18 | "- Navigate to the [data folder](https://archive.ics.uci.edu/dataset/360/air+quality).\n", 19 | "- Download the zip file called **AirQualityUCI.zip**.\n", 20 | "- Unzip it.\n", 21 | "- Save the csv file called **AirQualityUCI.csv** into the **datasets** folder at the root of this repository." 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import pandas as pd\n", 31 | "import matplotlib.pyplot as plt" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# If you downloaded and stored the file as explained\n", 41 | "# above, it should be located here:\n", 42 | "\n", 43 | "filename = '../Datasets/AirQualityUCI.csv'" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "(9357, 14)" 55 | ] 56 | }, 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "output_type": "execute_result" 60 | } 61 | ], 62 | "source": [ 63 | "# load the data\n", 64 | "\n", 65 | "data = pd.read_csv(\n", 66 | " filename, sep=';', parse_dates=[['Date', 'Time']]\n", 67 | ").iloc[:, :-2] # drops last 2 columns, not real variables\n", 68 | "\n", 69 | "# drop missing values\n", 70 | "# these are added at the end of the file during reading\n", 71 | "data.dropna(inplace=True)\n", 72 | "\n", 73 | "data.shape" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/html": [ 84 | "
\n", 85 | "\n", 98 | "\n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | "
Date_TimeCO(GT)PT08.S1(CO)NMHC(GT)C6H6(GT)PT08.S2(NMHC)NOx(GT)PT08.S3(NOx)NO2(GT)PT08.S4(NO2)PT08.S5(O3)TRHAH
010/03/2004 18.00.002,61360.0150.011,91046.0166.01056.0113.01692.01268.013,648,90,7578
110/03/2004 19.00.0021292.0112.09,4955.0103.01174.092.01559.0972.013,347,70,7255
210/03/2004 20.00.002,21402.088.09,0939.0131.01140.0114.01555.01074.011,954,00,7502
310/03/2004 21.00.002,21376.080.09,2948.0172.01092.0122.01584.01203.011,060,00,7867
410/03/2004 22.00.001,61272.051.06,5836.0131.01205.0116.01490.01110.011,259,60,7888
\n", 206 | "
" 207 | ], 208 | "text/plain": [ 209 | " Date_Time CO(GT) PT08.S1(CO) NMHC(GT) C6H6(GT) PT08.S2(NMHC) \\\n", 210 | "0 10/03/2004 18.00.00 2,6 1360.0 150.0 11,9 1046.0 \n", 211 | "1 10/03/2004 19.00.00 2 1292.0 112.0 9,4 955.0 \n", 212 | "2 10/03/2004 20.00.00 2,2 1402.0 88.0 9,0 939.0 \n", 213 | "3 10/03/2004 21.00.00 2,2 1376.0 80.0 9,2 948.0 \n", 214 | "4 10/03/2004 22.00.00 1,6 1272.0 51.0 6,5 836.0 \n", 215 | "\n", 216 | " NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) T RH \\\n", 217 | "0 166.0 1056.0 113.0 1692.0 1268.0 13,6 48,9 \n", 218 | "1 103.0 1174.0 92.0 1559.0 972.0 13,3 47,7 \n", 219 | "2 131.0 1140.0 114.0 1555.0 1074.0 11,9 54,0 \n", 220 | "3 172.0 1092.0 122.0 1584.0 1203.0 11,0 60,0 \n", 221 | "4 131.0 1205.0 116.0 1490.0 1110.0 11,2 59,6 \n", 222 | "\n", 223 | " AH \n", 224 | "0 0,7578 \n", 225 | "1 0,7255 \n", 226 | "2 0,7502 \n", 227 | "3 0,7867 \n", 228 | "4 0,7888 " 229 | ] 230 | }, 231 | "execution_count": 4, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "data.head()" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "### Attribute Information:\n", 245 | "\n", 246 | "Taken from the [original website](https://archive.ics.uci.edu/ml/datasets/Air+Quality).\n", 247 | "\n", 248 | "- 0 Date (DD/MM/YYYY)\n", 249 | "- 1 Time (HH.MM.SS)\n", 250 | "\n", 251 | "The above were merged during loading into the Date_Time column\n", 252 | "\n", 253 | "\n", 254 | "- 2 True hourly averaged concentration CO in mg/m^3 (reference analyzer)\n", 255 | "- 3 PT08.S1 (tin oxide) hourly averaged sensor response (nominally CO targeted)\n", 256 | "- 4 True hourly averaged overall Non Metanic HydroCarbons concentration in microg/m^3 (reference analyzer)\n", 257 | "- 5 True hourly averaged Benzene concentration in microg/m^3 (reference analyzer)\n", 258 | "- 6 PT08.S2 (titania) hourly averaged sensor response (nominally NMHC targeted)\n", 259 | "- 7 True 
hourly averaged NOx concentration in ppb (reference analyzer)\n", 260 | "- 8 PT08.S3 (tungsten oxide) hourly averaged sensor response (nominally NOx targeted)\n", 261 | "- 9 True hourly averaged NO2 concentration in microg/m^3 (reference analyzer)\n", 262 | "- 10 PT08.S4 (tungsten oxide) hourly averaged sensor response (nominally NO2 targeted)\n", 263 | "- 11 PT08.S5 (indium oxide) hourly averaged sensor response (nominally O3 targeted)\n", 264 | "- 12 Temperature in °C\n", 265 | "- 13 Relative Humidity (%)\n", 266 | "- 14 AH Absolute Humidity " 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 5, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "# I will give the variables simpler names\n", 276 | "# more details at the end of the notebook\n", 277 | "\n", 278 | "new_var_names = [\n", 279 | " 'Date_Time',\n", 280 | " 'CO_true',\n", 281 | " 'CO_sensor',\n", 282 | " 'NMHC_true',\n", 283 | " 'C6H6_true',\n", 284 | " 'NMHC_sensor',\n", 285 | " 'NOX_true',\n", 286 | " 'NOX_sensor',\n", 287 | " 'NO2_true',\n", 288 | " 'NO2_sensor',\n", 289 | " 'O3_sensor',\n", 290 | " 'T',\n", 291 | " 'RH',\n", 292 | " 'AH', \n", 293 | "]" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 6, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "Index(['Date_Time', 'CO_true', 'CO_sensor', 'NMHC_true', 'C6H6_true',\n", 305 | " 'NMHC_sensor', 'NOX_true', 'NOX_sensor', 'NO2_true', 'NO2_sensor',\n", 306 | " 'O3_sensor', 'T', 'RH', 'AH'],\n", 307 | " dtype='object')" 308 | ] 309 | }, 310 | "execution_count": 6, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "data.columns = new_var_names\n", 317 | "\n", 318 | "data.columns" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 7, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "data": { 328 | "text/plain": [ 329 | "Index(['CO_true', 'CO_sensor', 
'NMHC_true', 'C6H6_true', 'NMHC_sensor',\n", 330 | " 'NOX_true', 'NOX_sensor', 'NO2_true', 'NO2_sensor', 'O3_sensor', 'T',\n", 331 | " 'RH', 'AH'],\n", 332 | " dtype='object')" 333 | ] 334 | }, 335 | "execution_count": 7, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "# let's capture the variables\n", 342 | "\n", 343 | "predictors = data.columns[1:]\n", 344 | "\n", 345 | "predictors" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 8, 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "data": { 355 | "text/html": [ 356 | "
\n", 357 | "\n", 370 | "\n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | "
Date_TimeCO_trueCO_sensorNMHC_trueC6H6_trueNMHC_sensorNOX_trueNOX_sensorNO2_trueNO2_sensorO3_sensorTRHAH
010/03/2004 18.00.002.61360.0150.011.91046.0166.01056.0113.01692.01268.013.648.90.7578
110/03/2004 19.00.002.01292.0112.09.4955.0103.01174.092.01559.0972.013.347.70.7255
210/03/2004 20.00.002.21402.088.09.0939.0131.01140.0114.01555.01074.011.954.00.7502
310/03/2004 21.00.002.21376.080.09.2948.0172.01092.0122.01584.01203.011.060.00.7867
410/03/2004 22.00.001.61272.051.06.5836.0131.01205.0116.01490.01110.011.259.60.7888
\n", 478 | "
" 479 | ], 480 | "text/plain": [ 481 | " Date_Time CO_true CO_sensor NMHC_true C6H6_true NMHC_sensor \\\n", 482 | "0 10/03/2004 18.00.00 2.6 1360.0 150.0 11.9 1046.0 \n", 483 | "1 10/03/2004 19.00.00 2.0 1292.0 112.0 9.4 955.0 \n", 484 | "2 10/03/2004 20.00.00 2.2 1402.0 88.0 9.0 939.0 \n", 485 | "3 10/03/2004 21.00.00 2.2 1376.0 80.0 9.2 948.0 \n", 486 | "4 10/03/2004 22.00.00 1.6 1272.0 51.0 6.5 836.0 \n", 487 | "\n", 488 | " NOX_true NOX_sensor NO2_true NO2_sensor O3_sensor T RH AH \n", 489 | "0 166.0 1056.0 113.0 1692.0 1268.0 13.6 48.9 0.7578 \n", 490 | "1 103.0 1174.0 92.0 1559.0 972.0 13.3 47.7 0.7255 \n", 491 | "2 131.0 1140.0 114.0 1555.0 1074.0 11.9 54.0 0.7502 \n", 492 | "3 172.0 1092.0 122.0 1584.0 1203.0 11.0 60.0 0.7867 \n", 493 | "4 131.0 1205.0 116.0 1490.0 1110.0 11.2 59.6 0.7888 " 494 | ] 495 | }, 496 | "execution_count": 8, 497 | "metadata": {}, 498 | "output_type": "execute_result" 499 | } 500 | ], 501 | "source": [ 502 | "# cast variables as numeric (they are strings by defo)\n", 503 | "# need to replace the , by . to cast as numeric\n", 504 | "\n", 505 | "for var in predictors:\n", 506 | " if data[var].dtype =='O':\n", 507 | " data[var] = data[var].str.replace(',', '.')\n", 508 | " data[var] = pd.to_numeric(data[var])\n", 509 | "\n", 510 | "data.head()" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 9, 516 | "metadata": {}, 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/html": [ 521 | "
\n", 522 | "\n", 535 | "\n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | "
Date_TimeCO_trueCO_sensorNMHC_trueC6H6_trueNMHC_sensorNOX_trueNOX_sensorNO2_trueNO2_sensorO3_sensorTRHAH
\n", 558 | "
" 559 | ], 560 | "text/plain": [ 561 | "Empty DataFrame\n", 562 | "Columns: [Date_Time, CO_true, CO_sensor, NMHC_true, C6H6_true, NMHC_sensor, NOX_true, NOX_sensor, NO2_true, NO2_sensor, O3_sensor, T, RH, AH]\n", 563 | "Index: []" 564 | ] 565 | }, 566 | "execution_count": 9, 567 | "metadata": {}, 568 | "output_type": "execute_result" 569 | } 570 | ], 571 | "source": [ 572 | "data[data['Date_Time'].apply(lambda x: len(x))>19]" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 10, 578 | "metadata": {}, 579 | "outputs": [ 580 | { 581 | "data": { 582 | "text/html": [ 583 | "
\n", 584 | "\n", 597 | "\n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | "
Date_TimeCO_trueCO_sensorNMHC_trueC6H6_trueNMHC_sensorNOX_trueNOX_sensorNO2_trueNO2_sensorO3_sensorTRHAH
02004-10-03 18:00:002.61360.0150.011.91046.0166.01056.0113.01692.01268.013.648.90.7578
12004-10-03 19:00:002.01292.0112.09.4955.0103.01174.092.01559.0972.013.347.70.7255
22004-10-03 20:00:002.21402.088.09.0939.0131.01140.0114.01555.01074.011.954.00.7502
32004-10-03 21:00:002.21376.080.09.2948.0172.01092.0122.01584.01203.011.060.00.7867
42004-10-03 22:00:001.61272.051.06.5836.0131.01205.0116.01490.01110.011.259.60.7888
\n", 705 | "
" 706 | ], 707 | "text/plain": [ 708 | " Date_Time CO_true CO_sensor NMHC_true C6H6_true NMHC_sensor \\\n", 709 | "0 2004-10-03 18:00:00 2.6 1360.0 150.0 11.9 1046.0 \n", 710 | "1 2004-10-03 19:00:00 2.0 1292.0 112.0 9.4 955.0 \n", 711 | "2 2004-10-03 20:00:00 2.2 1402.0 88.0 9.0 939.0 \n", 712 | "3 2004-10-03 21:00:00 2.2 1376.0 80.0 9.2 948.0 \n", 713 | "4 2004-10-03 22:00:00 1.6 1272.0 51.0 6.5 836.0 \n", 714 | "\n", 715 | " NOX_true NOX_sensor NO2_true NO2_sensor O3_sensor T RH AH \n", 716 | "0 166.0 1056.0 113.0 1692.0 1268.0 13.6 48.9 0.7578 \n", 717 | "1 103.0 1174.0 92.0 1559.0 972.0 13.3 47.7 0.7255 \n", 718 | "2 131.0 1140.0 114.0 1555.0 1074.0 11.9 54.0 0.7502 \n", 719 | "3 172.0 1092.0 122.0 1584.0 1203.0 11.0 60.0 0.7867 \n", 720 | "4 131.0 1205.0 116.0 1490.0 1110.0 11.2 59.6 0.7888 " 721 | ] 722 | }, 723 | "execution_count": 10, 724 | "metadata": {}, 725 | "output_type": "execute_result" 726 | } 727 | ], 728 | "source": [ 729 | "# cast date and time variable as datetime\n", 730 | "# replace . 
by : to transform to datetime format\n", 731 | "\n", 732 | "data['Date_Time'] = data['Date_Time'].str.replace('.', ':', regex=False)\n", 733 | "\n", 734 | "data['Date_Time'] = pd.to_datetime(data['Date_Time'])\n", 735 | "# use dayfirst=True parameter if format is dd/mm/yyyy HH:mm:ss Eg: pd.to_datetime(data['Date_Time'], dayfirst=True)\n", 736 | "\n", 737 | "data.head()" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": 11, 743 | "metadata": {}, 744 | "outputs": [], 745 | "source": [ 746 | "# sort index\n", 747 | "# we want the data in time order\n", 748 | "\n", 749 | "data.sort_index(inplace=True)" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 12, 755 | "metadata": {}, 756 | "outputs": [ 757 | { 758 | "data": { 759 | "text/plain": [ 760 | "Date_Time datetime64[ns]\n", 761 | "CO_true float64\n", 762 | "CO_sensor float64\n", 763 | "NMHC_true float64\n", 764 | "C6H6_true float64\n", 765 | "NMHC_sensor float64\n", 766 | "NOX_true float64\n", 767 | "NOX_sensor float64\n", 768 | "NO2_true float64\n", 769 | "NO2_sensor float64\n", 770 | "O3_sensor float64\n", 771 | "T float64\n", 772 | "RH float64\n", 773 | "AH float64\n", 774 | "dtype: object" 775 | ] 776 | }, 777 | "execution_count": 12, 778 | "metadata": {}, 779 | "output_type": "execute_result" 780 | } 781 | ], 782 | "source": [ 783 | "# check the format\n", 784 | "\n", 785 | "data.dtypes" 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": 13, 791 | "metadata": {}, 792 | "outputs": [ 793 | { 794 | "data": { 795 | "text/plain": [ 796 | "0" 797 | ] 798 | }, 799 | "execution_count": 13, 800 | "metadata": {}, 801 | "output_type": "execute_result" 802 | } 803 | ], 804 | "source": [ 805 | "# sanity check: duplicates in dt variable\n", 806 | "\n", 807 | "data['Date_Time'].duplicated().sum()" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 14, 813 | "metadata": {}, 814 | "outputs": [ 815 | { 816 | "data": { 817 | 
"text/plain": [ 818 | "Date_Time 0\n", 819 | "CO_true 0\n", 820 | "CO_sensor 0\n", 821 | "NMHC_true 0\n", 822 | "C6H6_true 0\n", 823 | "NMHC_sensor 0\n", 824 | "NOX_true 0\n", 825 | "NOX_sensor 0\n", 826 | "NO2_true 0\n", 827 | "NO2_sensor 0\n", 828 | "O3_sensor 0\n", 829 | "T 0\n", 830 | "RH 0\n", 831 | "AH 0\n", 832 | "dtype: int64" 833 | ] 834 | }, 835 | "execution_count": 14, 836 | "metadata": {}, 837 | "output_type": "execute_result" 838 | } 839 | ], 840 | "source": [ 841 | "# check NA\n", 842 | "\n", 843 | "data.isnull().sum()" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": 15, 849 | "metadata": {}, 850 | "outputs": [ 851 | { 852 | "data": { 853 | "text/plain": [ 854 | "min 2004-01-04 00:00:00\n", 855 | "max 2005-12-03 23:00:00\n", 856 | "Name: Date_Time, dtype: datetime64[ns]" 857 | ] 858 | }, 859 | "execution_count": 15, 860 | "metadata": {}, 861 | "output_type": "execute_result" 862 | } 863 | ], 864 | "source": [ 865 | "# check time span\n", 866 | "\n", 867 | "data['Date_Time'].agg(['min', 'max'])" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": 16, 873 | "metadata": {}, 874 | "outputs": [], 875 | "source": [ 876 | "# save preprocessed data\n", 877 | "\n", 878 | "data.to_csv('../Datasets/AirQualityUCI_ready.csv', index=False)" 879 | ] 880 | }, 881 | { 882 | "cell_type": "markdown", 883 | "metadata": {}, 884 | "source": [ 885 | "## Data set Summary\n", 886 | "\n", 887 | "The dataset was collected between January 2004 and March 2005.\n", 888 | "\n", 889 | "It consists of hourly measurements of the different air pollutants, NO2, NOX, CO, C6H6, O3 and NMHC. The measurements are accompanied by local temperature and humidity values, also recorded hourly.\n", 890 | "\n", 891 | "In the data collection experiments, scientists were testing new pollutant sensors. The values from the new sensors are stored in the variables called _sensors. 
\n", 892 | "\n", 893 | "For comparison, data for the pollutants was also gathered from fixed stations, that regularly measure the concentration of these gases. Those values are stored in the variables called _true." 894 | ] 895 | }, 896 | { 897 | "cell_type": "code", 898 | "execution_count": null, 899 | "metadata": {}, 900 | "outputs": [], 901 | "source": [] 902 | } 903 | ], 904 | "metadata": { 905 | "kernelspec": { 906 | "display_name": "Python 3 (ipykernel)", 907 | "language": "python", 908 | "name": "python3" 909 | }, 910 | "language_info": { 911 | "codemirror_mode": { 912 | "name": "ipython", 913 | "version": 3 914 | }, 915 | "file_extension": ".py", 916 | "mimetype": "text/x-python", 917 | "name": "python", 918 | "nbconvert_exporter": "python", 919 | "pygments_lexer": "ipython3", 920 | "version": "3.10.5" 921 | }, 922 | "toc": { 923 | "base_numbering": 1, 924 | "nav_menu": {}, 925 | "number_sections": true, 926 | "sideBar": true, 927 | "skip_h1_title": false, 928 | "title_cell": "Table of Contents", 929 | "title_sidebar": "Contents", 930 | "toc_cell": false, 931 | "toc_position": {}, 932 | "toc_section_display": true, 933 | "toc_window_display": true 934 | } 935 | }, 936 | "nbformat": 4, 937 | "nbformat_minor": 4 938 | } 939 | -------------------------------------------------------------------------------- /01-Create-Datasets/04-create-air-passengers-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "threatened-orbit", 6 | "metadata": {}, 7 | "source": [ 8 | "# Air Passengers Dataset\n", 9 | "\n", 10 | "In this notebook we will prepare and store the air passengers dataset found [here](https://github.com/facebook/prophet/blob/main/examples/example_air_passengers.csv).\n", 11 | "\n", 12 | "**Citation:**\n", 13 | "\n", 14 | "Box, G. E. P., Jenkins, G. M. and Reinsel, G. C. (1976) Time Series Analysis, Forecasting and Control. Third Edition. Holden-Day. 
Series G.\n", 15 | "\n", 16 | "**Description of data:**\n", 17 | "\n", 18 | "The data is a monthly time series measuring the number of international airline passengers, in thousands, from 1949 to 1960." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "aggressive-license", 24 | "metadata": {}, 25 | "source": [ 26 | "In this notebook we will:\n", 27 | "\n", 28 | "1. Provide instructions to download the air passengers data set\n", 29 | "\n", 30 | "2. Save the time series data in the correct location for use in the course\n", 31 | "\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "competitive-robertson", 37 | "metadata": {}, 38 | "source": [ 39 | "# Get the dataset" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "outdoor-architecture", 45 | "metadata": {}, 46 | "source": [ 47 | "The dataset can be obtained from this [link](https://raw.githubusercontent.com/facebook/prophet/master/examples/example_air_passengers.csv). It will open a raw file in GitHub. A simple way of obtaining the data is to copy and paste the values from your browser into a text editor of your choice. \n", 48 | "Save it in the Datasets directory, which is found at the root of this project, with the filename `example_air_passengers.csv`.\n", 49 | "\n", 50 | "Alternatively, run the code below." 
51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 1, 56 | "id": "5045cf1c", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "import pandas as pd\n", 61 | "\n", 62 | "url = \"https://raw.githubusercontent.com/facebook/prophet/main/examples/example_air_passengers.csv\"\n", 63 | "df = pd.read_csv(url)\n", 64 | "df.to_csv(\"../Datasets/example_air_passengers.csv\", index=False)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "respected-worth", 70 | "metadata": {}, 71 | "source": [ 72 | "# Data set synopsis" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "mediterranean-toilet", 78 | "metadata": {}, 79 | "source": [ 80 | "The air passengers dataset is a monthly timeseries representing the number of US air passengers collected between January 1949 and December 1960." 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "italic-serial", 86 | "metadata": {}, 87 | "source": [ 88 | "# Check that you can load the data " 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 2, 94 | "id": "established-clinic", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "import pandas as pd" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 3, 104 | "id": "developmental-roulette", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "df = pd.read_csv(\n", 109 | " \"../Datasets/example_air_passengers.csv\", parse_dates=[\"ds\"], index_col=[\"ds\"]\n", 110 | ")" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 4, 116 | "id": "quantitative-missouri", 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/html": [ 122 | "
\n", 123 | "\n", 136 | "\n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | "
y
ds
1949-01-01112
1949-02-01118
1949-03-01132
1949-04-01129
1949-05-01121
\n", 170 | "
" 171 | ], 172 | "text/plain": [ 173 | " y\n", 174 | "ds \n", 175 | "1949-01-01 112\n", 176 | "1949-02-01 118\n", 177 | "1949-03-01 132\n", 178 | "1949-04-01 129\n", 179 | "1949-05-01 121" 180 | ] 181 | }, 182 | "execution_count": 4, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "df.head()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 5, 194 | "id": "straight-mouth", 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "" 201 | ] 202 | }, 203 | "execution_count": 5, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | }, 207 | { 208 | "data": { 209 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEGCAYAAACevtWaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAABFZElEQVR4nO3dd3xcZ5no8d+rGUmjNupdsuUi23GJY8fpvZGEAEkoISwlJGGzQGBhWe6SvbCUZdkF7l3a3V12QyAJGyChZZMA6YnTiOMS917ULVldGpXp7/3jnDMayRppRnPGkuXn+/n449GZmVNcnnnnOc/7vEprjRBCiPklbbZPQAghhP0kuAshxDwkwV0IIeYhCe5CCDEPSXAXQoh5yDnbJwBQUlKi6+rqZvs0hBDitLJt27ZurXXpZM/NieBeV1fH1q1bZ/s0hBDitKKUaor1nKRlhBBiHpLgLoQQ85AEdyGEmIfmRM59MoFAgNbWVrxe72yfSkwul4uamhrS09Nn+1SEEGKcORvcW1tbycvLo66uDqXUbJ/OSbTW9PT00NrayqJFi2b7dIQQYpw5m5bxer0UFxfPycAOoJSiuLh4Tn+zEEKcueZscAfmbGC3zPXzE0KcueZ0cBdCiDNFY/cwLx04Ydv+JLgLIcQccP9rx7jn59vo8vhs2Z8EdyGEmAP6hv0Ew5r/2d5my/4kuMfw1a9+lR/84AeRn7/85S/zwx/+cPZOSAgxr/WPBAB4bGsLdqyQN2dLIaN946m97Ds+aOs+V1a5+dq7V8V8/q677uK9730vn//85wmHwzz66KNs3rzZ1nMQQghL/2iAdIfiSOcQ21v6Wb+gMKn9ycg9hrq6OoqLi9m+fTvPPfcc69ato7i4eLZPSwgxTw2M+Ln2rHKy0h38ZmtL0vuLa+SulCoAHgBWAxq4CzgIPAbUAY3AbVrrPmXUB/4QeCcwAnxca/12Mic51Qg7lT7xiU/w0EMP0dHRwV133TUr5yCEODP0jQSoKcziprMreWpnO//wrpVkZ8w8uRLvyP2HwDNa6xXAWmA/cB/wota6HnjR/BngRqDe/HUP8OMZn90su/XWW3nmmWfYsmUL119//WyfjhBinvIGQowGQhRkZ/DutVUM+YK83dSf1D6n/VhQSuUDlwMfB9Ba+wG/Uupm4ErzZQ8DG4EvATcDP9fGHYFNSqkCpVSl1ro9qTOdBRkZGVx11VUUFBTgcDhm+3SEEPPU4KhxMzU/K50KtwuAAXPbTMUzcl8EdAEPKqW2K6UeUErlAOVRAbsDK
DcfVwPRCaNWc9tpJxwOs2nTJu6+++7ZPhUhxDzWbwbygux03FnGmHvQm/rg7gTWAz/WWq8DhhlLwQBgjtITqt1RSt2jlNqqlNra1dWVyFtPiX379rF06VKuueYa6uvrZ/t0hBDzmFUGWZCVgdtldJkdTHLkHk+2vhVo1Vq/Zf78W4zgfsJKtyilKoFO8/k2oDbq/TXmtnG01vcD9wNs2LAh+aJOm61cuZJjx47N9mkIIc4A/SN+wBi5Z2c4cKSp1I/ctdYdQItSarm56RpgH/AkcIe57Q7gCfPxk8DHlOFCYGCm+XY7CvlTaa6fnxDi9NAflXNXSuF2ORkcDSa1z3jrbD4L/EIplQEcA+7E+GD4tVLqbqAJuM187Z8wyiCPYJRC3jmTE3O5XPT09MzZtr9WP3eXyzXbpyKEOM0NjIzl3AHcWelJj9zjCu5a6x3AhkmeumaS12rg3qTOCqipqaG1tZW5mI+3WCsxCSFEMvpH/TjSFLmZRkh2u9JPSc59VqSnp8sKR0KIM0L/SIACMyUDRnpm0JtcWkbaDwghxCzrHw2Qnz22FrM7y5n0yF2CuxBCzLIBc+RucbuSz7lLcBdCiFnWP+qnIDsj8rM7Kz3pahkJ7kIIMcv6hieO3J2MBkL4g+EZ71OCuxBCzLKBk3LuxmNPEqkZCe5CCDGLAqEwQ74ghdFpGasFQRIVMxLchRBiFg2Mjp/ABIw1D0uiYkaCuxBCzCKraVj+hGoZSK4zpAR3IYSYwog/yBM72lLWS2pg1GoaNr5aBkiqYkaCuxBCTOG7zxzkc4/u4HDnUEr2P9buV0buQghxShzrGuKRTU0A9A37U3KM/hHJuQshxCn17acPEAwb6Zhkl72LJbIKU9ZYWiYr3YEzyZ7uEtyFEGISm4718Ny+E9y2wej8mmwjr1gGRvwoBXmusT6OSqmkZ6lKcBdCiEn895tNlORm8DfXLQOSX/Yulv7RAPlZ6aSljV+3wu1yyshdCCHsdmLQS31ZHmV5xoI8KUvLTGgaZjFG7hLchRDCVr3DfopyMnCkKfIykxtFT8Vo95tx0najM6SkZYQQwla9I0ZwB3u6NMbSP+KPMXJPrqe7BHchhJggGArTPxKIBPc8lzO1aZnsSYJ7kj3dJbgLIcQEfWbtuRXc821YsDr2sWKN3KVaRgghbNU3YkxYGp+WsT+4ewMhPN4gZW7XSc8l29NdgrsQQkzQMzQhuLtSE9w7B30AlOZlnvRcsj3dJbgLIcQEvcPjg7uRlrH/hmqnxwtA2WTB3ewvM9NcvwR3IYSYoNdMyxRH0jJOhnxBgqGZL3s3mU7PVCN3s7/MDD9UJLgLIcQEvUPj2/Bao2iPzaP3zkFr5D5Zzt1q+ysjdyGEsEXvsI88l5MMpxEirYU07K6Y6Rry4UhTkW8I0dxJHlOCuxBCTNA7EhgXcK1Aa3ete+egj5LcjJP6ykD0yF3SMkIIYYveYR+FUcE934aVkSbT6fFNmpKB6Jy7jNyFEMIWPUP+CSP35AJtLEZwP/lmKkT1dJecuxBC2KMvqq8MJF+WGEuXx0uZe/LgbvV0T2kppFKqUSm1Wym1Qym11dxWpJR6Xil12Py90NyulFI/UkodUUrtUkqtn9GZCSHEJN482sPHH9xMwOayRIvWmt5hf4y0jH3BPRgK0zPspzRGWgagwu2ifcA7o/0nMnK/Smt9jtZ6g/nzfcCLWut64EXzZ4AbgXrz1z3Aj2d0ZkIIMYn/+9xBNh7sonvIl5L9e3xBAiE9Li2TneHAkeSydxN1D/nRevIJTJbaoiyae0dmtP9k0jI3Aw+bjx8Gbona/nNt2AQUKKUqkziOEEIAsKOln21NfYD9NeeWvsjs1LGgq5TCbXNnyKlmp1oWFGXT0juC1jrh/ccb3DXwnFJqm1LqHnNbuda63XzcAZSbj6uBlqj3tprbxlFK3aOU2qqU2trV1ZXwiQshzjwPvtEQeZyqZe96IsF9fKfGfJt7ult9ZSZrG
mapLcrGFwzT5Un8W0q8wf1SrfV6jJTLvUqpy6Of1MbHSkIfLVrr+7XWG7TWG0pLSxN5qxDiDNQx4OWPu9pZv6AASN3IvXfo5JE7kNTNzclM1XrAUluYDUBLX+KpmbiCu9a6zfy9E3gcOB84YaVbzN87zZe3AbVRb68xtwkhxIz996ZGwlrz2WvqAfvLEi0T+8pYkl08YyIrLVOaO1XO3QzuvaMJ73/a4K6UylFK5VmPgXcAe4AngTvMl90BPGE+fhL4mFk1cyEwEJW+EUKIGdl4sIuLlhSzqsoNpC4tY3WELJwQ3PNt7une6fFRmJ0eaXEwmZrCLIAZ3VR1xvGacuBxpZT1+l9qrZ9RSm0Bfq2UuhtoAm4zX/8n4J3AEWAEuDPhsxJCiAk6BrysrS0Ym5afqrTMsJ8MZxo5GY5x291ZTgZszLl3TTE71eJKd1DuzqQlFcFda30MWDvJ9h7gmkm2a+DehM9ECCFi8AZC9Az7qcp3kelMI8ORlrq0zLAxO9Uc0Ea4bV5qr9PjizmBKVptYfaMRu4yQ1UIMedZlSUV+VkopchzOVN3Q3XYT2H2JF0aXen4g2G8gZAtx+ka9E55M9VSW5RNa18Kcu5CCDHbjg8Ywa0y30hjpGpNUzBKIYtzp2jBa8NxtdZ0DU2flgEjuLcPjCa8lqoEdyHEnNdhTsGvMIN7KkfufcPj+8pY7Ozp3jcSIBDSU05gstQWZhHWcLw/sdG7BHchxJxn9VepMCf82F2WGC12Wsa4RWnHTdXI7NQ4cu4LimZW6y7BXQgx53UMjOJ2OcnJNAKsOys1I3dvIMSQL0hJitMybzf1A1BdkDXta61a90RvqkpwF0Ik5Uinhz1tAyk9RvuAl8r8sUCYl5manLv1DSH6WBa70jLDviA/eOEQ6xcUcE5twbSvL3e7SHeohCcyxVPnLoQQJ/nV5mZ++noDRzqHyEp3sPcb10+6XJwdOga9kXw7pG7k3mZWpVRNMqK2q6f7/a8eo9Pj48cfOfekcsvJONIUNYXZCde6y8hdCDEj33nmAFprrj2rnNFAKDJtPxWMkftYcM9zpTMaCNne0926aTlZusRajWlgZObB/cSgl/tfPcZNayo5d2Fh3O+rKcySnLsQIvX6R/z0jwT40PkLeN96o+mrVYtuN38wTPeQb/zI3by5affova1/FKUYdyxLptNBnsuZVB/5B147RjAc5ks3rEjofbVFMnIXQpwCjT1GoFlYnBOp+LAqQOzW6fGiNSeN3MH+/jLH+0cpy8uM2e+lMn/mKyMB7GsfZFVVPguKsxN6X3meyyyfjP+bigR3IUTCGruHAVhUkk1prhF0O2fQczweYzXuY6kSq3LF7pH78YHRSfPtlor8LDoGZx7cG7qGWVSSk/D7rElVVlOzeEhwF0IkrLFnGKWgpjA7MnKfyYIS8RirYIkeuRtpGbtr3dv6pg7ulUmsaeoNhDg+4KWuOPHgbpVmJpISkuAuhEhYY/cwVflZuNIduNKNXHRnEiPaqUycnQpjlSseG4N7OKw5PuCdsva8It9F95Av4VYAYHwgAiwqncnI3fgA7RmSkbsQIoUae0aoKxnLG5flZaYsLdM+4CUnw0Fe5ljltlW5Yueydz3DfvzB8JTBvTLfhdYzu78QSWXNYORuLRzSMywjdyFECjX2DLMwKkiV5blSl3MfHKUi3zWuJjxyQ9XGkbtVBjl1zt349tAxg9RMQ7dxEzr6QzFeMnIXQqScVQYZPQItc2emrFpm4uxUgLxMJ0rZu2DHWHCP3anROo+Z5N0buocoyc2MfDAlwu1ykuFIo1uCuxAiVcbKICekZQZ9GGv12Ku933tS3XlamiI3w2lrKWTbFBOYLMmM3Bu7R1g0g1E7gFKK4twMuaEqhEidJuvGYMn4tIwvGLZ96btgKEynZ/zsVIs7K93WUsi2/lGyMxyRHjKTcbucZGc4ZjZy75lZGaSlODeDHgnuQohUaeg2yiCtboVAysohu4f8h
PXkM0bzXE7bc+7VBVlT9ntRSlGR76JjMLEmXh5vgC6Pj7pkgntOJj1Rde6j/qlXhJLgLoRISHQZpMVaLs7uvLuVhijJPbnvuduVbmsp5PF+75Q3Uy0zmaXaZKayZlIpYzFG7mPB/aYfvTbl6yW4CyES0tgzMi7fDkSWi7N/5G4F98n6qzttLYU83j/1BCZLhTsr4Zz7se6Z17hbSnIz6R4y7msEQ+Fp+7tLcBdCJKSxZ/ik9EKkv4zNzcOs6fbFOSeP3PNc6Xh89ozcvYEQPcN+qqeolLFUFRhln8EE+rxYNe4Li5JJy2TgC4YZ9odo7RslGJ765rUEdyFE3KwyyLoJI/e8TCeZzjTb0zJWGmLSBatd9o3c2+KocbdU5LsIhY0FruNlpLJcZGU4pn9xDGO17j4azJvaU5HgLoSIm5UKWDghd6yUMmvdbU7LDPvIcKaRm3nyukJ5Zs7djvLLqfq4T2RV7iSSdz/WffK3nUQVR/WXaeiS4C6EsJGVdil3n5y+KMtz2Z6W6RnyU5yTMWkFizvLSVjD8DRVI/E40jkEEFcr3gq38QEQK+8+6A1EvgkAaK1psCG4l5oj9+4hP409w5HmabFIcBdCxM26wWlVx0Qz+svYnZbxTZqSgbEWBHZUzLx5tIcFRdmTrp060XQj908/8ja3/eebkW8UTT0jDIwGWF2Vn9Q5Wn8OPUN+Grqnr5mX4C7EPPHNP+zjhX0nUnoMqxrGamQVLRXNw3qH/ZPeTIWxzpDJ5t1DYc2mYz1ctLg4rtcXZKeT6UyjY+DkWvc3j/bw+pFu2vpHIwtab2/pA2DdgoKkzrPIah425JPgLsSZwuMN8NPXG/jsr7Zz+IQnZcfpHvKR53KOq3G3lLldeLxBvIHk0yRjx/NPMXK3ltpLbuS+v32QQW+Qi5bEF9yVUpPWumut+d7zB8kxb5puaewFYHtzPzkZDpaV5yV1ntYyf8cHvLT1j07bF16CuxDzwFHzBps3GOKTj2xj2GdvGwBL15Bv0pQMRE1ksinvrrWmZ9g36QQmGFuNaSazVKPLGP98tBsg7uAORsXMxJz760e62dLYx9/dsAK3yzkuuK+tLcCRFnvma7xKcjPZ3tyH1tg3cldKOZRS25VSfzB/XqSUekspdUQp9ZhSKsPcnmn+fMR8vi6ZixFCTO+oeUPwmzevpqF7mK8+sTclx+n2+GMG2zKbZ6mO+EN4A+FIOmKimS6S7QuGuOCfX+T+V48CRiplcWnOpDeJY6nKzxp30xTg+88forogi9vPr2VDXRFbGnsZ9YfY3z6YdErGUpyTwUHzm5mdaZnPAfujfv4O8H2t9VKgD7jb3H430Gdu/775OiFECh3tGsKZpvjgebW8/9wantvbkZLjdE8xcrdmqdqVd4/UuMcI7gXZia8rCsY3i55hP//63CGOdQ2xuaGXixMYtYPRV6dj0IsvaKSghn1B3m7u54Pn1ZLpdHBeXRFHu4Z55VAXwbBmXW1hQvuPpTg3A6vyc7rqm7iCu1KqBrgJeMD8WQFXA781X/IwcIv5+GbzZ8znr1FTdeIRQiTtSOcQC4uzSXekUV+Wh8cXZMDGdriWLo8vUpI3UWUS7XAnY606FOubQmF2OtkZjmmn4U9kffj4gmHuemgLw/4QFy0uSWgfC4uz0ZrITVNrCb2lZbkAnFdnBPMHXjsGwDl2jdzNP4uinIwpu1dC/CP3HwB/B1iJqmKgX2ttfR9qBarNx9VAC4D5/ID5+nGUUvcopbYqpbZ2dXXFeRpCiMkc7RqKBJaaQqOcr7UvsaA3HW8ghMcXjDlyL8hOJ8OZRodNa6lONTsVjBubC4qyae5J7Dq7zLTR+9bXRHrTX7i4KKF9WL11mnuNoN40ocf9mpp8MpxpbG3qY0FRdswPqERZ+4mndfC0wV0p9S6gU2u9Lekzi6K1vl9rvUFrvaG0tNTOXQtxRgmEwjT1jLCk1Aju1ZHgnlhb2ulYZZCTNfGCsSoSu0fuxVMEx
gVF2TQlOHK3ruOL1y9jVZWbs2vypzzG5Mc1gqsV1BvM3jFWBUum08E5NQVA8iWQ0aw/++kqZQCmnuJkuAR4j1LqnYALcAM/BAqUUk5zdF4DtJmvbwNqgVallBPIB3oSvAYhRJyae0cIhnUkuNcUGqNHu4P7VBOYLOVu+4J79zQ5dzBGyhsPdREOa9LirEbp9PhIU8Y9gl/dcyHBUOLtC0pyM8jOcESCe1PPMKV5meREtUk4b1Ehmxt7WVdbkPD+Y7Fq/uNZ0WnakbvW+u+11jVa6zrgduAlrfWHgZeB95svuwN4wnz8pPkz5vMv6VSsvSWEAMamzi8x0zJWLrotZSP32MG9Mt9la1omJ8MxaU29ZUFxDv5gmBMJVOh0Dvoozs3EkaZwu9JjVuNMJZISMr81NHaPnNSr/crlZTjTFBcvTSyfPxXrg3VRSe60r02mzv1LwBeUUkcwcuo/Nbf/FCg2t38BuC+JYwghpnG0ywzuZq9wpRQ1hVm259ytkfRUwb3CHLnbMZ7rGfZNmy5ZaK4GlUjevWso9k3hRCwszo4sOdjYM3xSj/vz6orY8bV3JD15Kdq5Cwv5p1tWc+3KsmlfG09aJkJrvRHYaD4+Bpw/yWu8wAcS2a8QYuaOdg5T7s6M9FoBo7thqtIysW5wgjG5xx8KG20Dkgygxj6mHlVbAbWpd4QL4mwf0OnxRvrPJ2NhcQ4vH+zC4w3QGWMJvcm6WSbDkab4yIUL43qtzFAV4jR3tGsokm+31BRm2z5y7/L4yM9KJ9MZO01SYU4EsiM10z0Uu6+MpaogC0eaSmzk7vFFJlwlY0FRNv5gODITNZnFr1NBgrsQpzGtNUc7JwvuWQx6g7YuID3VBCZLhY217j1DvpiVOZZ0RxrVBVlxV8yEwpruIf+01xGPBWZK6NVDRvuCiWmZ2SbBXYjTWJfHh8cXjNS4W6yKGTtvqnZ5pg+2VsvcZEfu4bCmd9gf183OhcXZNMexMhFA34ifUFhHZtMmwwrmrxwy5unEU554KklwF+I0dsS8mbp4wsLLqah1N0buUwfFktwM0lTyI/dBb4BgWMeVt0+k1t1qambHyN1KCTV0n1wGORdIcBfiNGYF74kLL1uzVNtszLvHM3J3OtIoy0u+1r1n2KrMmX7kvqAom/6RQFztFqx1T+3IuVspIeCkMsi5QIK7EKcxK4hOrP4ozsnAlZ5m28h9xB9k2B+Ka8RbbkOt+1jTsOmPF2kFEMdN1U7zvOwYuUcfe67l20GCuxApcyrm7rUPeM1APr6CRSllazlkt2f6GndLpfvkhSwS1RNH2aUl0gqgd/q8e1ccs2wTYd1UTXZ91FSQ4C5ECngDRs/wx7e3pvQ4HQOjkQqViWoKs2nttyctEwmKcQT3inwXJxIM7gc6BgmHxz4Mu4enbz1gsRa1bopr5O4jN9NJdoY9+XFrxD7XbqaCBHchUqKld4ROj4+fv9mU0uO0D3gjrXYnqinMsq1aJp6+MpaKfBceX5ChOFeDaukd4YYfvMbP3miIbHv1UBcluRlx3VDNzXRSkpsRMy2zramPN48a7a26huypcbesqsonTcFZlfbNQrWLBHchUsBKh2xv7qexO74yvZnoGPROOXLvGwnEHWSnEk9fGUuifd2t3jgPvtFIMBTmeP8oL+4/wW0bauNems6omJn8z/nvf7+Lv350O6Gwpmtw+lr9RFyytIQ3//4aFpdO3+vlVJPgLkQKRM8OfXx72xSvnLlRf4j+kUCktnyi6kjFTGKj95beEc7/1guR+m0YuxEZTw7cWq4u3uBuLXTR1j/K8/tO8OjmZjTwofMXxH3OtUXZJy17B8Z5HzoxRJfHx+aG3inXgJ2pRJbnO5UkuAuRAq19o2Q407h4STH/s6MtJTdXrYqUihjBZaaLduxpG6DT4+Nzj26nrX+U/e2D/OyNRtbWFpDumD5kREbucVbMNPWMkJPhoLYoi5+8doxHt7Rw5bJSaovir0CpKcyivd87b
uFrgDfMxa/TFDy16zidg15bJjCdDiS4C5ECrX2j1BRk8d71NTT1jPB2c7/tx2gfMEaqU+XcgUlHtFPv1wjK3kCIT/73Nu58cAu5mU7+8yPr43r/2Mg9vuM29QyzsDiHj1+8iLeb++n0+OJujmWpKcwmGNacmLB+6+uHeyjMTufGNZX8YefxuMs55wMJ7kKkQGvfCNWFWVy/qhxXelpKqmastEesnHtpbiaZzsRr3dsHRsl0pvG9285hd9sAw74gD955Xsz0z0SudAeF2ekJjdzrSrK5bUMNuZlOqguyuHL59C1to1kfZC1RM1W11rx+pIuLl5bwnrVVDHqNew923lCdy+bWfFkh5onWvlHeUZVPniudy+pLeeOI/YuRWSPsWEF3rNY9sbSMVYHzzjWV/PD2c1hUksNZle6E9lEVZ419MBSmpW+E61dXkOdK5/99aB1ZGY64b6RaJlt96mjXECcGfVy6tIQrlpWSl+mccg3Y+UZG7kLYbMQfpGfYHxlNrqx009gzzKg/ZOtxOga8FGSnk5URuwVv9QzKIY3gbpz7zedUc7a5Fmgi6kpy4qoSah/wEgjpyKIbV60o48I4+7JHqyowvr1Ef5C9ftjIt1+6tARXuoPrVpYDJ8/mna8kuAthM2v0aAX3FRV5aA2HOz22Hqd9wBvzZqrF6OueWHDvmKJ2Pl6LinNo6RslMOEGZ0P3MHc/tIX7Xz0KjE08WpjkJKBMp4Nyd+a4a339SA8Li7MjN2bvunQRVy4vnZMTjlJBgrsQNrNGj1ZQWV5hTHA52GFvcO8YHJ02CNcUZtEz7GfEH1+teyis6Rj0UlmQZHAvySEU1uNy4A+8dowbfvAqLx7o5ME3GtFaR8og6+JY8Hk60QuUBENhNh3r4ZKo9UtXV+fz0J3nT7km63wiwV0Im00cuS8szsGVnmZ/cB/wUjHNTc6aBGvdu4d8hMJ62v1Ox+q10mCmZvYeH+Cf/rifi5cU84XrltE+4OVw5xBNPcNkONMot6E80Vg31rjO/e0ehnzBGaV45gsJ7kLYrLXPqDax+rA40hT1ZXkcPGFfcPcFQ3QP+eMauQO0xlkOedx8XVWyaZkJwX1nywAA33jPat5/bg1gtBho6hlhYVE2aQneQJ1MTWEW7QNGrfu2JmPpuw0LC5Pe7+lKqmWEsJlVBqnUWMBaXpE3bsZnsqxFJ2KVQVomqyKZynQVOPEqzE4nPys9knbZ3TaA2+Wktsj4c6kvy+WVQ110DvqSzrdbagqzI2mlbc39VOa7qCpI7jpOZzJyF8JmrX2jkaBqWV6eR5fHR6/Z7TBZY0F46uBemptJhiMt7nLIePc7HaUUdSU549Iyq6vzIx94ly8r5a2GXhp7hqmzqRd6TdTqU9saezn3DB61gwR3IWxnBPfxI0a7b6pONzvVkpamqCpwxT9y7x/FlZ5GQXZ60ue4uCSHxu4R/MEwB9o9rKnOjzx3+bJS/MEwvmDYtoUurA/ULQ29HB/wSnCf7RMQYj4Z9gXpjapxt6yIBPdBW44zNjt1+rRDTWF23DdU2weNGvfolNJM1RXncHxglN1tA/hDYVZFBfcLFhWR6TTCj11pmaoCF0rBEzuPA0hwn+0TEGI+sfq4TEzLlOZlUpidPuObqhPrxdv6R8nLdJIbx6LM0VUk02nvn768Ml51JdloDX/c1Q4wbuTuSndwgVnJYlfdeabTQXmeiyOdQ7jS0xKeVTvfSHAXwkZWbnviyF0pxfKKPA7MIC2zuaGX5V95mm88tZcRf5D/3tTEL99q5pwFBXG9v7ogi+4hH97A5DNkj3R6GPQai0sb5ZX2BPfFJUaP8z/uPk5epjMyC9XyvvXVLC/Pi8wutYP15762Jr4OlvPZmX31QtispXd8jXu0FRVuDnV4xi0nF4+tTb2EtbGYxcXffol/+J89XL6slH//cHxdGmuKxm40TnSgY5B3/vB1vvDYDkJmV8WqJCtlLNbEpBODPlZWuU8qd7z5nGqe/
ZvLcdoYhK0/9w11Z3ZKBiS4C2Grpp4RsjMck641urwij2F/KOEWvEc7hyl3Z/LYPReyoCibz1y1lJ98bANuV3w3Pa0U0cTj+oNh/vbXO/GHwrywv5PNDb2Ewjrp2amWPFd6ZOWm6JRMKlnXeqbn20GCuxC2au4dZkFR9qQ3JOvLjDSFtaxcvI51D7G4JJcLFhfz5Gcu5YvXL0+oa2KsRTv+7aXD7D0+yLduXU2GI41vP70fSL4MMtoic/S+pubUBPf1Cwsoyc3k3AVFp+R4c9m0wV0p5VJKbVZK7VRK7VVKfcPcvkgp9ZZS6ohS6jGlVIa5PdP8+Yj5fF2Kr0GIOaOpZ4QFMVYQWmoG90QaiGmtOdY1zOLSmd90LMtzke5QkZQRwLGuIf5941Heu76aD1+wkHevrWJnqzGLNNkJTNGsmaqrT9HI/eoV5Wz9yrXk21DKebqLZ+TuA67WWq8FzgFuUEpdCHwH+L7WeinQB9xtvv5uoM/c/n3zdULMuo0HO2PeVLRDOKxp7h2JWbddkJ1BaV4mh0/EP3LvHfYzMBpgSRILMDvSFAuLc8Z9Y7BSMJ+9uh6AOy+pizxn58j9imVlnLuwkEVnSCfGuWTa4K4N1r+KdPOXBq4Gfmtufxi4xXx8s/kz5vPXKDuKZoVIwsEODx9/cAvfeGpvyo7RNeTDFwyzYIpAVl+Wy+EE0jJHu4wZnsmM3MHI9x88MVZjf6DDQ1a6I1LBsro6nwsWFZGT4SA/y75R701nV/K7T11sS+8YkZi4cu5KKYdSagfQCTwPHAX6tdZWH9FWoNp8XA20AJjPDwAntWZTSt2jlNqqlNra1WVfzw0hJnPAnDz0q80tvHywMyXHsHqTx0rLgBHcj3YOxb1g9rEu44MgmZE7wIryPFp6RxnyGf9lD53wsKw8d1zQ/e77z+bfPrzelglMYvbFFdy11iGt9TlADXA+sCLZA2ut79dab9BabygtLU12d0JM6WjnEGnKCK5f+u0u+kfs6fESrclskjWxnjva0vI8PL4gJwZ9MV8T7Vi30RI32QZYVvuDw+YkqoMdnsg2y8LiHK5KcO1SMXclVC2jte4HXgYuAgqUUtb0uBqgzXzcBtQCmM/nA/YvIClEAg53DrGwOIfvf/Aceof9/Otzh2w/RnPvCI40RfUkNe6WpaWJ3VQ92jnE4pKchNcUnWhFhTFb82CHhy6Pj55hP8srzuwZnPNdPNUypUqpAvNxFnAdsB8jyL/ffNkdwBPm4yfNnzGff0nH+x1UiBQ50jnE0rJcVlfnc8WyUjY39Np+jKaeEaoKXFPOjKwvN4N7nDdVj3UnVyljqSnMIjvDwYEOT6R52YoJI3cxv8Qzcq8EXlZK7QK2AM9rrf8AfAn4glLqCEZO/afm638KFJvbvwDcZ/9pCxG/QChMY89wpBTxrEo3R7uG8AXtrZxp6h1hYdHUgbg4J4PC7PS4bqr6g2Gae0ci0/iTkZamqC/P42CHJ3L/YWJaRswv03Yd0lrvAtZNsv0YRv594nYv8AFbzk4IGzT1jBAI6UhKZEVlHsGw5kjnEKuq7Ku/bu4Z5sY1lVO+xlioIo8jMdIy4bDmN9tauHBxMYGQJhTWtozcwbip+vz+E9QUZlGSmxGZPSrmJ5mhKuY9q77bSolY+ecD7fYtezfoDdA3EpjyZqplaXkuh06cXDETDmv+/ve7+dLvdvMXP3mLN48Zt6qSrZSxLK/Io3fYz5+P9sio/QwgwV3Me0cnlBPWFWeT6UyLpCfs0GyWQcaz8ER9WS4DowG6h8YqdkJhzRd/u5PHtrbwwQ219I34+fqTRk2+bSN3M6C39Y+yvFxups53EtzFvHf4hIeqfBc5Zu9zpyONZeUza78bi1XjXhvHyL2+zCxLjErNPLOng9+/3cbnr63nO+8/O9LxsTQvk7w4G4RNJ3q0vrzCnm8DYu6S4C7mvSNdQywtH5+GOKsyj/3tNo7ce62R+/SjbOvG7tGom6oHOgZJU
/CpK5cAcNXyMv79L9bzv65fbts5FudmRvLsUgY5/0lwF7NqxB/kg//1Jj9/szEl+w+bN06XTshbr6hw0z3kp8sT32SiqWitOdo1RHFORlwrI5W7M8nLdI6rmDnWPUxtUTaZTkdk2w2rK7htQ23S5xdteUUuSsGychm5z3fT/0sUIkW01nz58T281dBLhjONj11UZ/sx2vpH8QbCkdGyZUWlMZI/0DFIaV7iM6SbeoZ5ek8HL+4/wf52D0O+IOfXxddmVinF0vLccbXuDV3DkQ6KqXTj6krys9LJzpD/+vOd/A2LWfPLzc08vr0Nt8tpa/472hHzZupJwT2qYuay+sSC+8sHOrnzoS0ArK5287711Swtz+PKZfHvp74sl5cOGD2VtNY0dA9zweLU9yD/yIUL+ciFC1N+HDH7JLiLWdHYPcw3ntzHlctLuWhxMf/y9AF6h/0U5WTYepwjJyYP7kU5GZS7M9k/g4qZlw92kpPh4JnPXx7XDdTJ1Jfl8eutrfQN+/EGQ4wGQiy2qeRRCJCcu5glf9rTjj8U5p9vXRNZpd7O0kTL3uMDVLhdk35orKhws38Gte4H2j2sqHTPOLCDUesOxjeLBqut7ylIy4gzhwR3MStePtDJqio3VQVZkRK9gylIzexqHYi5xNuKSmOmqD8Yjnt/Wmv2dwxyVmVyk4CsJfcOnxjiWLcR3E9Fzl2cOSS4i1Ouf8TPtqY+rl5htJcty8ukIDvd9uA+6A1wrHuYtTGC++qqfAIhzaET8R+3rX8UjzcYydnPVFW+0cjrcKeHY13DuNLTqHDbtwKSEBLcxSn36uFuwhquMoO7Uorl5XkcTCDIxmOPuSbompqCSZ9fa27fZb4uHlbLgmRH7mlpiiWluRzpHKKhe4hFJbmyWpGwlQR3ccq9fKCTopyMSHAFY2r8oQ4P4bB93aF3tRlB++wYizPXFmWRn5XO7rb+uPdp3RdYVp58b5b6Miu4D0u+XdhOgrs4pUJhzcaDnVyxrHTcAhTLK9wM+0O09Y/adqzdrQPUFmVRGKMCRynF2TX57GyJf+S+v8NDbVGWLS0Blpbn0j7gpbl3RPLtwnYS3MUptbO1n76RQCQlY7FuqtpZ776ztZ+zqwumfM3ZNfkcOuHBG4ivt/uB9kHOsmnqvtVjJqztaw4mhEWCuzjJH3e10z5g3wg62gv7TpCm4IoJE4fGKmbsKYfsHfbT2jfK2TFuplrWVBcQDGv2xegzo7XmD7uOG/XogRAN3cOsqLQruI/VtcvIXdhNgrsYZ1tTH/f+8m0eeK3B9n37giF+vbWVK5aVkp89Pq2Rm+mkpjDLtpH7rtZ+gJhlkJa1tcbzu2PcVH1xfyef+eV2PvfYDg52eAhrOMumXui1RdlkOI3/gnastiRENJmhKsb53vMHAdh3fOYjaG8gRPuAl5rCrHHriT61s53uIR93X7p40vetqMizrRzSCtZrYtxMtVS4XZTkZk5aMaO15j82HiHDmcarh7rw+o3UjV0jd4dZMdM56D3pw06IZElwFxGbjvXwxpEe3C4n+9oH0VqjVOLlef/yp/08/GYTzjTFsvI8vvv+s1lV5eZnrzewrDyXS5YWT/q+lZVuXjrQyZAvGFd3RcvGg538Zmsr//cDa8nKMLoq7mwdYHFpzrQ3PpVSrK3Jj4z0o73V0Mvbzf184z2reH7fCV4/0k1WuoMFScxMneg9a6ts6UwpxESSlhGAMUr93nOHKMvL5LNX1zMwGuD4gHdG+9pzfJAlpTncc/li+kf8/MVPNvHAaw3sax/krksWxfzA2FBXRFjD2019CR3v2b0d/HF3O199Yg8Arxzq4uWDnVy0ePIPkYnW1ORzpGuIYV9w3Pb/2HiUktwMPnheLd99/9nkZTpZUZk3rsonWZ+6cglfffdK2/YnhEWCuwDgjSM9bG7s5d6rlrJ+YQEA+2eYmjnWNcT5i4r5uxtW8OtPXkR+djrf+tN+inIyuGVddcz3rVtQQJqCr
Y29CR2vuXeENAW/2dbKvzy9n08/so1l5Xncd+OKuN5/dk0+WsOetrHUzJ62AV491MWdlyzCle6gqiCLX/7lhfzLe9ckdG5CzBYJ7gKtNf/6/EGq8l3cfn5tZJWeWBUkU+kd9tM3EmCJWdpXU5jNr//qIs6pLeCvr16KK90R8715rnRWVrnZ0pjYyL2ld5Qb11Ry6dIS/uuVY7iz0nnw4+fFXYtuTaba3tIf2fbUruOkOxQfvWisPe6amvyk2w4IcapIcBdsPNjF9uZ+PnN1PZlOB7mZTuqKs2e0DN2xCYtRA1TmZ/E/917Cxy9ZNO37NywsYntLX9zNvIKhMMf7R1lYlM0Pbz+H28+r5eG7zqciP/4+LcW5mSwuyRn3jWFLQy9n1xTgtmn9UiFONQnuZzitNd97/hC1RVl8YENNZPtZle4ZjdyPWe1rZzgp5/xFRXgDYfYej2/WaPuAl2BYs6Aom+LcTL79vrNn1BrgvLoitjT2EQ5rvIEQu9sG2FBXmPB+hJgrJLif4Z7bd4LdbQP89dX148oWV1a6aeoZYWjCTcbpHO0eIsORRk3hzCpKrIC6Jc68e0ufsTB1Mr3VreMOjAY40jXEzpZ+AiEd97J5QsxFEtzPcI9ubqamMItbJ9zoXFllLUOX2Oj9aOcwC4uzZ1xRUpbnoq44O+68e0uvEdyTLU88zwzkWxp7Ix8s5y6Ukbs4fUlwP8Ptb/dwXl0RTsf4fwrW6kiJ5t2PdQ+Ny7fPxIa6IrY29sbVIbKldxRHmqIygRz7ZBYWZ1Oal8mWhl62NPaxvDyPgmx7l/wT4lSS4H4GGxgJ0DHojfR1iVaZ76IgOz2hvHsgFKa5ZyTpJljn1RXSNxLgqHlzdiotfSNU5rtO+nBKlFKK8+oK2dzQy9tNfZJvF6c9Ce5nMGtxjMmCu1KKsyrc7GmLP7g3944QDOukF3q20iE7okoTpzqmXTNGz6sr4viAF48vyPmLJN8uTm/TBnelVK1S6mWl1D6l1F6l1OfM7UVKqeeVUofN3wvN7Uop9SOl1BGl1C6l1PpUX4SYGasD44oYjbAuXlLM7rYBjsfZYz3ZShnLopJcsjMc7I1jElVL7yi1M7x5O9F5UTdQN8jNVHGai2fkHgT+Vmu9ErgQuFcptRK4D3hRa10PvGj+DHAjUG/+ugf4se1nLWxxoMNDnssZc+3Od62tAuBPu9vj2l+kxj3JDoeONMWqKje728aXQ7YPjPL1J/dy3rdeYEdLPyP+IN1DPhYU2xPcV1TkkZPhoCrfRXVBli37FGK2TBvctdbtWuu3zcceYD9QDdwMPGy+7GHgFvPxzcDPtWETUKCUqrT7xM8EfcN+uod89I/4U7L/gx0eVlTkxez1sqgkh1VVbp7aFV9wP9o1REluhi0dDldV5bPv+CAh86bq07vbueK7G3lkUxMDowEe/nMjrX3GN4qaQnsCsdORxkcvquPDFy6c/sVCzHEJ5dyVUnXAOuAtoFxrbf2v7wDKzcfVQEvU21rNbRP3dY9SaqtSamtXV1ei5z3v/XprC+u++Twb/ukFzvnH5/ndtlZb96+15uAJz6T59mjvXlvFzpZ+mntGpt3nsa5h2/qSr6nOZzQQinwbeOjPjVQVuHj5i1dy24Ya/rS7PdKWONka92j33biCe69aatv+hJgtcQd3pVQu8Dvg81rrcclQrbUGElrZWGt9v9Z6g9Z6Q2lp6fRvOMP8cVc71QVZfPPmVSwuzeFnbzRg/DHb4/iAF483GOkjE8tNa4wvXX/YfXzcdm8gxJM7j/OpR7ax7h+fY83XnmVbc59ty8VZi2zsbhtgYDTA1qY+3rmmktqibG7bUIsvGObHG48Cyde4CzEfxdU0WymVjhHYf6G1/r25+YRSqlJr3W6mXTrN7W1AbdTba8xtIk4j/iBvHuvhIxcs5KMX1aGU4iv/s4cdLf2sW5B4id7PXm9gYDTA31y3LLJtupupltqibM6pLeAPO9v59JVjI9pvP
LWXX21uoTQvk+tWlpObmU6agg+eVzvF3uK3uCQHV3oau9sGyHCmEQprrjbXXV1Tnc+KijwOdHjISndQHGMBbCHOZPFUyyjgp8B+rfX3op56ErjDfHwH8ETU9o+ZVTMXAgNR6RsRhz8f6cEfDEeC2S3rqsnJcPDIpuaE99Xl8fGdZw7wo5cOj1vlyFrOLp4+LO9eW8W+9sFI3XkorHlmTwc3rankrb+/hu++fy1fffdKvvKuldTPoK/LZJyONFZWutnbNshLBzopyE6PfLAppbhtg/EhsqAoe0YLiggx38WTlrkE+ChwtVJqh/nrncC3geuUUoeBa82fAf4EHAOOAD8BPm3/ac9vLx/sJCfDEam1zs10csu6av6w63jCN1cffKMBfyhMVrqD7z9/KLL9YIeHqnwX+VnT3/y8cXUFAM/s6QCMXud9IwGuW1lOmo0LV0y0pjqfvccHeOVgF5fXl45raXDLumrSHYraIqlqEWIy8VTLvK61Vlrrs7XW55i//qS17tFaX6O1rtdaX6u17jVfr7XW92qtl2it12itt6b+MuYPrTUvH+jk0vqSyOLJAB++YCG+YJjfJnBjddAb4L/fbOLG1RX85WWLeWZvR2RBioMdHpbFudBzVUEWa2sLeHavEdxfPWTcAL+0viTuc5mJVdX5DPtD9Az7I99iLEU5GXz3/WfzySuWpPQchDhdyQzVOebgCQ/HB7wnBbOVVW7OqS3g8e3x3774xaZmPL4gn7piKXdftgi3y8nXn9zLF3+zk8OdQwktPHHj6gp2tQ7Q2jfCK4e6WFOdT0luZtzvnwlrcWul4IplJ990v3VdjUw2EiIGCe6zQGsdsynWSweM+9JXLi876bnrVpaz9/hgXAsqe7wBfvp6A5fVl7CmJh+3K52/umIJW5v6eHZvBzevreIvL5t+8QzLDauM1MxvtrayvaWfy5eldtQOUF+WS6YzjXW1BRTKTVMhEhL/EvPCNnc+tIXC7Ay+/8Fzxm3v9Hh5bEsLq6rclE8ya/Ty+lL+z7MHee1wF+9dX3PS89H+6Q/76R328YXrzo1s++QVS7hkaQkrK93jUj7xqCvJYUVFHv/5ylFCYc0Vy07+8LGb05HGV961kiUl9pRXCnEmkZH7KeYNhHjjSDdP7GijfWCsZ0vHgJfb/2sTXR4fX33Xyknfu6rKTXFORiTnHctLB07w2NYW/uqKJeNKJx1pinNqCxIO7JYbV1fiC4bJzXSybkHBjPaRqI9euJCLl6b+W4IQ840E9wQEQmECoXBkSvxM7D0+SCCkCWt4dLMxkXdgJMDt979Jp8fHw3edzwWLiyd9b1qa4rL6El493B0zrdMz5ONLv9vNioo8Pn9t/YzPczI3mFUzlywtHrdqkxBi7pH/oXF68I0G6r/8NPVffprlX3maN450z2g/25uNFYbWVOfz6JZmAqEwX31yD619ozx053njOhNO5orlpfQO+yftmHj4hIf3/fjPDIwE+Nfb1pLpdMzoHGNZVp7Lp65cwl9ettjW/Qoh7CfBPU6Pb29jcUkOX3zHMsryMvnOMwdm1A5ge0s/1QVZ/PU19ZwY9PGl3+3iiR3H+ezV9XFVflxWb1SNvHKoc9z21w53cet//JkhX4hf3XMBq6ryEz636Sil+NINK6RCRYjTgAT3OHR6vOxqHeDWddV85up6Pn/tMna1DvDC/s7p3zzBjuZ+1i0o4OoVZVTlu/j9222cXZPPp6+Kr167JDeTVVVuXj00/pvD157cS7k7kyc/cwnnLpTgK8SZToJ7HDYeNG5gXmXWnr93fTV1xdl87/lDca3zaTkx6KWtf5R1CwpxpCnuvmwxuZlO/vUDaxPKYV+xrJRtzX2R2apHu4Y41jXMxy6qo0r6kAshkOAel40HOyl3GyNmMEr0PndtPfvbB3nGnLUZj+3N/QCRSpO7L13Eli9fm3A/lneuqSQU1jy10+jU+MK+EwBcu7J8qrcJIc4gEtynEQiFee1QN1ctL
xvXoOo9a6tZUJTNo1tapnj3eNtb+shwpEU+JACyMhK/6bm6Op+VlW5+vdVoRfD8vhOsrHTL6kFCiAgJ7tPY0tiLxxeMpGQsjjTF5ctK2NbYSzAUjmtf25v7WVnltqWK5bYNNexuG+D1w91sa+7jOhm1CyGiSHCfxssHOkl3KC6ZZCLNBYuKGfaH2BPHQs6BUJhdrf22Tf65+ZxqMhxp/N1vd6I1EtyFEOOctu0H3jjSze/fNppo5WQ6+LsbVpCbaf/lbDzYxQWLiifd9wWLjaqUt471cE5tQcx9DHoDfP7RHXgDYS5ZYs9sy8KcDK5bVc4fd7VTme8al+oRQojTcuQeDIX50u928cyedjYd6+HnbzbxwGvHbD9O56CXw51DXBajtW1ZnovFJTm81dAbcx8N3cPc+u9v8MqhLv7x5lVcc5Z9PVmsBSuuPatcFqwQQoxzWgb3p3Ydp7VvlB/evo437rua61eV89PXGhJeyGI6bx7rAeCiJZO3AwBj9L6lsXfSlgSvHOri5n97nd5hP4/cfQEfM5fMs8ulS0v43DX1fCKB7o5CiDPDaRfcw2HNjzceZVl5bqTn+d9ct4whf5CfzGD0rrXmroe28LUn9pwUoN882kOeyznlbM8LFhXj8QbZ3z4+7/7Lt5q588HNVBVk8eRnLp3yA2KmHGmKv7luGQuLpWuiEGK80y64v3igk0MnhvjUlUsiS7ytqHBz05pKHnyjkZ6h6XudR3uroZeXDnTy8JtN/K/f7BwX4P98tIcLFhWPW95tImspvOjUjNaa//PsAc6rK+J3n7qY2qLshM5JCCGSddoF9x9vPEJNYRbvPrtq3PbPX7sMbyDEg280JrS/RzY14XY5+ezVS/n99ja++JudaK1p7RuhuXeEi6cZcVcVZFFblMVbZgoHjBmjfSMB3nduDTkpuMkrhBDTmZPB/YkdbWye5CbltqY+3m7u5xOXLsI5Ybr+0rJcrlhWyu/fbo27JUCXx8ezezt4/7m1/O07lvO5a+p5fHsbz+zp4M2j0+fbLRcsKmZzVN59c4PR+XG6Do9CCJEqcy647zs+yOce3cFt//Umt/3Xm2xrGgvyP3ujgTyXkw+YVSIT3bKumuMD3imrV6L9emsLgZDmwxcuAOCzVy9lZaWbbzy1jxf2n6AoJ4PlcbQGuGJZKf0jAbY1GUF9a2MvJbkZ1BVLOkYIMTvmXHD/2RsNZKU7+N/vXEFTzzB/8ZO32N8+SFv/KM/s6eBD5y+Imep4x8oKcjIcPL69ddrjhMKaX77VzMVLillSmgsYPWP+6dbVnPB4eXbvCS5cXBTJ60/lqhVlZDjTeHpPOwCbG3s5r65IyhOFELNmTgX3Lo+PJ3cc5wMbarjn8iU89dlLyc9K59O/eJv/ePkIWms+dtHCmO/PynBww+pKnt7dgTcQmvJYf9rdTlv/KB++YPz+1i8o5PbzjJH8RXFOOMrNdHJ5fQnP7umgfWCU1r5R6XkuhJhVcyq4/+KtJvyhMB+/uA4wJgn9vw+to7l3hF+81cwNqyuoKZw61XHrumo8viAvTtFr3RsI8Z1nDrCiIi+ydFy0+25cwScuXcS71lTGfe43rK7k+ICXn73eAMD5EtyFELNozgR3XzDEI5uauHpFGYvNNAnABYuLue+GFaQ7FJ+IY3m3i5YUU+7OPCk1MzAawBc0RvMP/7mR1r5RvnLTyknLHPOz0vnKu1ZSmJMR9/lfe1YZzjTFQ39uJDvDwVmVibXxFUIIO82ZOr1v/mEf3UN+7rrk5NmWf3n5Yj54fi1uV/q0+3GkKW5ZV80DrzVwYtBLudvFqD/Etd97BYA7L6njxxuPcuXyUi6N0VZgJgqyM7hoSTGvHe7mgkWFJ1XzCCHEqTQnIlBb/yiPbGrmry5fzCVLJy89jCewWz503gJCYc2jm41e649vb6PL46My38V3nznIsC/I/37nWbace7TrVxkpHimBFELMtjkxcu8d9vOtq5byt+9YZkuFS
V1JDpfVl/Crzc18+qol/OyNBlZVuXni3kt4u7kfjzfAsgRXP4rHTWsqeXpPOzedHX+uXgghUkFpHf8aoKmyYNka3XRwl62lg8/s6eCTj2zjIxcu4JFNzXzvtrW8d32NbfsXQojZppTaprXeMNlz06ZllFI/U0p1KqX2RG0rUko9r5Q6bP5eaG5XSqkfKaWOKKV2KaXWx3OCZe5M22vCrz2rjAq3i0c2NVOalymjaSHEGSWenPtDwA0Ttt0HvKi1rgdeNH8GuBGoN3/dA/zYntNMnNORxu3nGzNZP3rhQluWthNCiNPFtDl3rfWrSqm6CZtvBq40Hz8MbAS+ZG7/uTZyPZuUUgVKqUqtdbttZ5yAOy6qo38kwB0X1c3G4YUQYtbMtFqmPCpgdwDWAp7VQEvU61rNbSdRSt2jlNqqlNra1dU1w9OYWmFOBl9/zyrys+OvtBFCiPkg6VJIc5Se8F1ZrfX9WusNWusNpaWlyZ6GEEKIKDMN7ieUUpUA5u/WXP82ILplY425TQghxCk00+D+JHCH+fgO4Imo7R8zq2YuBAZmK98uhBBnsmlvqCqlfoVx87REKdUKfA34NvBrpdTdQBNwm/nyPwHvBI4AI8CdKThnIYQQ04inWuZDMZ66ZpLXauDeZE9KCCFEcuZEbxkhhBD2kuAuhBDzkAR3IYSYh+ZE4zCllAc4eAoPmQ8MnMLjlQDdp/B48/36YP5fo1yfvebr9S3UWk86UWhOtPwFDsbqbJYKSqn7tdb3nMLjbZXrs/2Y8/oa5fpsP968vr7JnKlpmadm+wRSbL5fH8z/a5TrO73N+vWdkcFdaz3rf/CpNN+vD+b/Ncr1nd7mwvXNleB+/2yfQIrJ9Z3+5vs1yvXNM3PihqoQQgh7zZWRuxBCCBtJcBdCiHkoZcE9xtqra5VSbyqldiulnlJKuSe8Z4FSakgp9cWobZ9TSu1RSu1VSn0+VeebqESuTylVp5QaVUrtMH/9Z9R7vqWUalFKDc3GdcRi4/U9o5Taaf79/adSak6sd2jj9W1USh2Meq5sNq5nIjuuTymVF7Vth1KqWyn1g1m6pHFs/Pv7oDLWe96rlPrObFxLymitU/ILuBxYD+yJ2rYFuMJ8fBfwzQnv+S3wG+CL5s+rgT1ANkZN/gvA0lSdc6quD6iLft2E/VwIVAJDs31NKbo+t/m7An4H3D7b12bz9W0ENsz29aTq+ibscxtw+Wxfm13XBxQDzUCp+fPDwDWzfW12/UrZyF1r/SrQO2HzMuBV8/HzwPusJ5RStwANwN6o158FvKW1HtFaB4FXgPem6pwTkej1TbGfTXoO9ry38foGzYdOIIMZrNqVCnZd31xl9/UppZYBZcBrtpxgkmy6vsXAYa21tc7nC3G857RxqnPuezEW0Qb4AOaqTUqpXIwFtr8x4fV7gMuUUsVKqWyMXvG1zF2TXp9pkVJqu1LqFaXUZaf+1Gwxo+tTSj2LsVqXB+Pb2Vw107+/B82v+/+glFKn5ExnJpl/n7cDj2lziDtHJXp9R4DlZtrGCdzC3I4vCTnVwf0u4NNKqW1AHuA3t38d+L7WelzeWWu9H/gO8BzwDLADCJ2qk52BWNfXDizQWq8DvgD8Uk2433CamNH1aa2vx0g9ZQJXn9pTTshMru/DWus1wGXmr4+e4nNORDL/Pm8HfnXKznRmEro+rXUf8CngMYxvJI3M7fiSmBTnxeqInatcBmw2H1t/sI1AP8bXrc9M8p5/Bj4927msRK9vkuc2MiFPyxzLudt9feb2jwH/NtvXlcLr+/h8vD5gLXBotq/nFPz93QN8d7avy65fp3TkblUSKKXSgK8A/wmgtb5Ma12nta4DfgD8s9b63ya8ZwFGvv2Xp/KcExHr+pRSpVaViFJqMVAPHJut85ypRK9PKZWrxhZSdwI3AQdm49zjMYPrcyqlSszt6cC7MFKJc1IS/z4/xNwftc/o+qLeUwh8Gnjg1J95aqSsK6SafO3VXKWUt
Qzf74EH49jV75RSxUAAuFdr3Z+C001Ygtd3OfCPSqkAEAY+qbXuNffzXeAvgGxzPw9orb9+yi4kBjuuTylVDjyplMrESAG+jPkfbrbZdH05wLNmYHdg3JD7ySm8jJjs+vdpug3jftecYeP1/VAptdZ8/I9a60On5AJOAWk/IIQQ85DMUBVCiHlIgrsQQsxDEtyFEGIekuAuhBDzkAR3IYSYhyS4CxGDUurrKqpDqRCnEwnuQggxD0lwFyKKUurLSqlDSqnXgeXmtr9WSu0z+34/OsunKERcUjZDVYjTjVLqXIwGWedg/N94G6OH+X3AIq21TylVMGsnKEQCZOQuxJjLgMe1sX7AIPCkuX0X8Aul1EeA4KydnRAJkOAuxPRuAv4dY+WfLWYTNCHmNAnuQox5FbhFKZWllMoD3o3xf6RWa/0yxoIy+UDuLJ6jEHGREYgQJq3120qpx4CdGCtHbcFYFvARpVQ+xjqwP5ornUmFmIp0hRRCiHlI0jJCCDEPSXAXQoh5SIK7EELMQxLchRBiHpLgLoQQ85AEdyGEmIckuAshxDz0/wGOa+jMLGt66wAAAABJRU5ErkJggg==\n", 210 | "text/plain": [ 211 | "
" 212 | ] 213 | }, 214 | "metadata": { 215 | "needs_background": "light" 216 | }, 217 | "output_type": "display_data" 218 | } 219 | ], 220 | "source": [ 221 | "df.plot()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "id": "aaba673e", 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [] 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 3 (ipykernel)", 236 | "language": "python", 237 | "name": "python3" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 3 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython3", 249 | "version": "3.10.5" 250 | }, 251 | "toc": { 252 | "base_numbering": 1, 253 | "nav_menu": {}, 254 | "number_sections": true, 255 | "sideBar": true, 256 | "skip_h1_title": false, 257 | "title_cell": "Table of Contents", 258 | "title_sidebar": "Contents", 259 | "toc_cell": false, 260 | "toc_position": {}, 261 | "toc_section_display": true, 262 | "toc_window_display": false 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 5 267 | } 268 | -------------------------------------------------------------------------------- /01-Create-Datasets/05-create-electricity-demand-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Electricity Demand in Victoria, Australia \n", 8 | "\n", 9 | "In this notebook we will prepare and store the electricity demand dataset found [here](https://github.com/tidyverts/tsibbledata/tree/master/data-raw/vic_elec/VIC2015).\n", 10 | "\n", 11 | "**Citation:**\n", 12 | "\n", 13 | "Godahewa, Rakshitha, Bergmeir, Christoph, Webb, Geoff, Hyndman, Rob, & Montero-Manso, Pablo. (2021). 
Australian Electricity Demand Dataset (Version 1) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.4659727\n", 14 | "\n", 15 | "**Description of data:**\n", 16 | "\n", 17 | "A description of the data can be found [here](https://rdrr.io/cran/tsibbledata/man/vic_elec.html). The data contains electricity demand in Victoria, Australia, at 30 minute intervals over a period of 12 years, from 2002 to early 2015. There is also the temperature in Melbourne at 30 minute intervals and public holiday dates." 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Download the data via the URL below and pandas" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import pandas as pd\n", 34 | "import numpy as np" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Electricity demand.\n", 44 | "url = \"https://raw.githubusercontent.com/tidyverts/tsibbledata/master/data-raw/vic_elec/VIC2015/demand.csv\"\n", 45 | "demand = pd.read_csv(url)\n", 46 | "\n", 47 | "# Temperature of Melbourne (BOM site 086071).\n", 48 | "url = \"https://raw.githubusercontent.com/tidyverts/tsibbledata/master/data-raw/vic_elec/VIC2015/temperature.csv\"\n", 49 | "temp = pd.read_csv(url)\n", 50 | "df = demand.merge(temp, on=[\"Date\", \"Period\"], how=\"left\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Public holidays in Australia\n", 60 | "url = \"https://raw.githubusercontent.com/tidyverts/tsibbledata/master/data-raw/vic_elec/VIC2015/holidays.txt\"\n", 61 | "holidays = pd.read_csv(url, header=None, parse_dates=[0], dayfirst=True)\n", 62 | "holidays.columns = [\"date\"]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "# Process and save the data" 
70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "We will only use the `OperationLessIndustrial` demand. So let's drop `Industrial`." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "df.drop(columns=[\"Industrial\"], inplace=True)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Let's extract the date and date-time." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Convert the integer Date to an actual date with datetime type\n", 102 | "df[\"date\"] = df[\"Date\"].apply(\n", 103 | " lambda x: pd.Timestamp(\"1899-12-30\") + pd.Timedelta(x, unit=\"days\")\n", 104 | ")\n", 105 | "\n", 106 | "# Create a timestamp from the integer Period representing 30 minute intervals\n", 107 | "df[\"date_time\"] = df[\"date\"] + pd.to_timedelta((df[\"Period\"] - 1) * 30, unit=\"m\")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "Drop the null rows." 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "df.dropna(inplace=True)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "Create holidays column." 
131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 7, 136 | "metadata": { 137 | "tags": [] 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "holidays[\"is_holiday\"] = 1\n", 142 | "df = df.merge(holidays, on=[\"date\"], how=\"left\")\n", 143 | "df[\"is_holiday\"] = df[\"is_holiday\"].fillna(0).astype(int)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "We now just use the timestamp and the electricity demand and resample to hourly." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 8, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/html": [ 161 | "
\n", 162 | "\n", 175 | "\n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | "
demandtemperatureis_holiday
date_time
2002-01-01 00:00:006919.36609232.61
2002-01-01 01:00:007165.97418832.61
2002-01-01 02:00:006406.54299432.61
2002-01-01 03:00:005815.53782832.61
2002-01-01 04:00:005497.73292232.61
\n", 223 | "
" 224 | ], 225 | "text/plain": [ 226 | " demand temperature is_holiday\n", 227 | "date_time \n", 228 | "2002-01-01 00:00:00 6919.366092 32.6 1\n", 229 | "2002-01-01 01:00:00 7165.974188 32.6 1\n", 230 | "2002-01-01 02:00:00 6406.542994 32.6 1\n", 231 | "2002-01-01 03:00:00 5815.537828 32.6 1\n", 232 | "2002-01-01 04:00:00 5497.732922 32.6 1" 233 | ] 234 | }, 235 | "execution_count": 8, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "# Rename columns\n", 242 | "timeseries = df[[\"date_time\", \"OperationalLessIndustrial\", \"Temp\", \"is_holiday\"]]\n", 243 | "\n", 244 | "timeseries.columns = [\"date_time\", \"demand\", \"temperature\", \"is_holiday\"]\n", 245 | "\n", 246 | "# Resample to hourly\n", 247 | "timeseries = (\n", 248 | " timeseries.set_index(\"date_time\")\n", 249 | " .resample(\"H\")\n", 250 | " .agg(\n", 251 | " {\n", 252 | " \"demand\": \"sum\",\n", 253 | " \"temperature\": \"mean\",\n", 254 | " \"is_holiday\": np.min,\n", 255 | " }\n", 256 | " )\n", 257 | ")\n", 258 | "timeseries.head()" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "Save the timeseries in the datasets folder." 
266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 9, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "timeseries.to_csv(\"../Datasets/victoria_electricity_demand.csv\")" 275 | ] 276 | } 277 | ], 278 | "metadata": { 279 | "kernelspec": { 280 | "display_name": "Python 3 (ipykernel)", 281 | "language": "python", 282 | "name": "python3" 283 | }, 284 | "language_info": { 285 | "codemirror_mode": { 286 | "name": "ipython", 287 | "version": 3 288 | }, 289 | "file_extension": ".py", 290 | "mimetype": "text/x-python", 291 | "name": "python", 292 | "nbconvert_exporter": "python", 293 | "pygments_lexer": "ipython3", 294 | "version": "3.8.7" 295 | }, 296 | "toc": { 297 | "base_numbering": 1, 298 | "nav_menu": {}, 299 | "number_sections": true, 300 | "sideBar": true, 301 | "skip_h1_title": false, 302 | "title_cell": "Table of Contents", 303 | "title_sidebar": "Contents", 304 | "toc_cell": false, 305 | "toc_position": {}, 306 | "toc_section_display": true, 307 | "toc_window_display": true 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 4 312 | } 313 | -------------------------------------------------------------------------------- /09-Trend-Features/images/forecast_with_just_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/forecast_with_just_time.png -------------------------------------------------------------------------------- /09-Trend-Features/images/recursive_forecasting/Slide1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/recursive_forecasting/Slide1.png 
-------------------------------------------------------------------------------- /09-Trend-Features/images/recursive_forecasting/Slide2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/recursive_forecasting/Slide2.png -------------------------------------------------------------------------------- /09-Trend-Features/images/recursive_forecasting/Slide3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/recursive_forecasting/Slide3.png -------------------------------------------------------------------------------- /09-Trend-Features/images/recursive_forecasting/Slide4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/recursive_forecasting/Slide4.png -------------------------------------------------------------------------------- /11-Time-Features/02-Extracting-time-related-features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Time features from the datetime variable\n", 8 | "\n", 9 | "[Feature Engineering for Time Series Forecasting](https://www.trainindata.com/p/feature-engineering-for-forecasting)\n", 10 | "\n", 11 | "Time series data are, by definition, time-indexed. The \"time\" component has information about the date and time. 
We can extract a number of features from the time component of the index.\n", 12 | "\n", 13 | "In this notebook, we will see how we can easily derive many time-related features.\n", 14 | "\n", 15 | "\n", 16 | "## Features from the time part:\n", 17 | "\n", 18 | "Below are some of the features that we can extract off-the-shelf using [pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-date-components):\n", 19 | "\n", 20 | "- pandas.Series.dt.hour\n", 21 | "- pandas.Series.dt.minute\n", 22 | "- pandas.Series.dt.second\n", 23 | "- pandas.Series.dt.microsecond\n", 24 | "- pandas.Series.dt.nanosecond\n", 25 | "\n", 26 | "\n", 27 | "## The dataset\n", 28 | "\n", 29 | "We will use the Online Retail II Data Set available in the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/machine-learning-databases/00502/).\n", 30 | "\n", 31 | "Download the xlsx file from the link above and save it in the **Datasets** folder within this repo.\n", 32 | "\n", 33 | "**Citation**:\n", 34 | "\n", 35 | "Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.\n", 36 | "\n", 37 | "## In this demo\n", 38 | "\n", 39 | "We will extract different time-related features from the datetime variable: **InvoiceDate**" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 1, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import numpy as np\n", 49 | "import pandas as pd\n", 50 | "import matplotlib.pyplot as plt" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Load the data" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "(1067371, 8)\n" 70 | ] 71 | }, 72 | { 73 | "data": { 74 | "text/html": [ 75 | "
\n", 76 | "\n", 89 | "\n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | "
InvoiceStockCodeDescriptionQuantityInvoiceDatePriceCustomer IDCountry
04894348504815CM CHRISTMAS GLASS BALL 20 LIGHTS122009-12-01 07:45:006.9513085.0United Kingdom
148943479323PPINK CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom
248943479323WWHITE CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom
348943422041RECORD FRAME 7\" SINGLE SIZE482009-12-01 07:45:002.1013085.0United Kingdom
448943421232STRAWBERRY CERAMIC TRINKET BOX242009-12-01 07:45:001.2513085.0United Kingdom
\n", 161 | "
" 162 | ], 163 | "text/plain": [ 164 | " Invoice StockCode Description Quantity \\\n", 165 | "0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 \n", 166 | "1 489434 79323P PINK CHERRY LIGHTS 12 \n", 167 | "2 489434 79323W WHITE CHERRY LIGHTS 12 \n", 168 | "3 489434 22041 RECORD FRAME 7\" SINGLE SIZE 48 \n", 169 | "4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 \n", 170 | "\n", 171 | " InvoiceDate Price Customer ID Country \n", 172 | "0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom \n", 173 | "1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n", 174 | "2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n", 175 | "3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom \n", 176 | "4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom " 177 | ] 178 | }, 179 | "execution_count": 2, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "# File path:\n", 186 | "file = \"../Datasets/online_retail_II.xlsx\"\n", 187 | "\n", 188 | "# The data is provided as two sheets in a single Excel file.\n", 189 | "# Each sheet contains a different time period.\n", 190 | "# Load both and join them into a single dataframe\n", 191 | "# as shown below:\n", 192 | "\n", 193 | "df_1 = pd.read_excel(file, sheet_name=\"Year 2009-2010\")\n", 194 | "df_2 = pd.read_excel(file, sheet_name=\"Year 2010-2011\")\n", 195 | "\n", 196 | "data = pd.concat([df_1, df_2])\n", 197 | "\n", 198 | "print(data.shape)\n", 199 | "\n", 200 | "data.head()" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "In this dataset, we have the datetime variable in a column called InvoiceDate. We could also have it in the dataframe index. The procedure for extracting the date and time features is identical. That is, we would use the methods from pandas dt as shown below.\n", 208 | "\n", 209 | "The dataset contains sales information for different customers in different countries. 
Customers may have made one or multiple purchases from the business that provided the data.\n", 210 | "\n", 211 | "## Variable format" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 3, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "dtype('\n", 254 | "\n", 267 | "\n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | "
dateInvoiceDate
02009-12-01 07:45:002009-12-01 07:45:00
12009-12-01 07:45:002009-12-01 07:45:00
22009-12-01 07:45:002009-12-01 07:45:00
32009-12-01 07:45:002009-12-01 07:45:00
42009-12-01 07:45:002009-12-01 07:45:00
\n", 303 | "" 304 | ], 305 | "text/plain": [ 306 | " date InvoiceDate\n", 307 | "0 2009-12-01 07:45:00 2009-12-01 07:45:00\n", 308 | "1 2009-12-01 07:45:00 2009-12-01 07:45:00\n", 309 | "2 2009-12-01 07:45:00 2009-12-01 07:45:00\n", 310 | "3 2009-12-01 07:45:00 2009-12-01 07:45:00\n", 311 | "4 2009-12-01 07:45:00 2009-12-01 07:45:00" 312 | ] 313 | }, 314 | "execution_count": 4, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "# This is how we parse date strings into datetime format.\n", 321 | "\n", 322 | "data[\"date\"] = pd.to_datetime(data[\"InvoiceDate\"])\n", 323 | "\n", 324 | "data[[\"date\", \"InvoiceDate\"]].head()" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "## Extract the time part" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 5, 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "data": { 341 | "text/plain": [ 342 | "0 07:45:00\n", 343 | "1 07:45:00\n", 344 | "2 07:45:00\n", 345 | "3 07:45:00\n", 346 | "4 07:45:00\n", 347 | "Name: time_part, dtype: object" 348 | ] 349 | }, 350 | "execution_count": 5, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "# Extract time part.\n", 357 | "\n", 358 | "# (We would normally not use this as a predictive feature,\n", 359 | "# but it might be handy for data analysis).\n", 360 | "\n", 361 | "data[\"time_part\"] = data[\"date\"].dt.time\n", 362 | "\n", 363 | "data[\"time_part\"].head()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "### Extract the hr, minute and second" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 6, 376 | "metadata": {}, 377 | "outputs": [ 378 | { 379 | "data": { 380 | "text/html": [ 381 | "
\n", 382 | "\n", 395 | "\n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | "
InvoiceStockCodeDescriptionQuantityInvoiceDatePriceCustomer IDCountrydatetime_parthourminsecmicrosecnanosec
04894348504815CM CHRISTMAS GLASS BALL 20 LIGHTS122009-12-01 07:45:006.9513085.0United Kingdom2009-12-01 07:45:0007:45:00745000
148943479323PPINK CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom2009-12-01 07:45:0007:45:00745000
248943479323WWHITE CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom2009-12-01 07:45:0007:45:00745000
348943422041RECORD FRAME 7\" SINGLE SIZE482009-12-01 07:45:002.1013085.0United Kingdom2009-12-01 07:45:0007:45:00745000
448943421232STRAWBERRY CERAMIC TRINKET BOX242009-12-01 07:45:001.2513085.0United Kingdom2009-12-01 07:45:0007:45:00745000
\n", 509 | "
" 510 | ], 511 | "text/plain": [ 512 | " Invoice StockCode Description Quantity \\\n", 513 | "0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 \n", 514 | "1 489434 79323P PINK CHERRY LIGHTS 12 \n", 515 | "2 489434 79323W WHITE CHERRY LIGHTS 12 \n", 516 | "3 489434 22041 RECORD FRAME 7\" SINGLE SIZE 48 \n", 517 | "4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 \n", 518 | "\n", 519 | " InvoiceDate Price Customer ID Country date \\\n", 520 | "0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 521 | "1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 522 | "2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 523 | "3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 524 | "4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 525 | "\n", 526 | " time_part hour min sec microsec nanosec \n", 527 | "0 07:45:00 7 45 0 0 0 \n", 528 | "1 07:45:00 7 45 0 0 0 \n", 529 | "2 07:45:00 7 45 0 0 0 \n", 530 | "3 07:45:00 7 45 0 0 0 \n", 531 | "4 07:45:00 7 45 0 0 0 " 532 | ] 533 | }, 534 | "execution_count": 6, 535 | "metadata": {}, 536 | "output_type": "execute_result" 537 | } 538 | ], 539 | "source": [ 540 | "data[\"hour\"] = data[\"date\"].dt.hour\n", 541 | "data[\"min\"] = data[\"date\"].dt.minute\n", 542 | "data[\"sec\"] = data[\"date\"].dt.second\n", 543 | "\n", 544 | "# We do not have micro and nano seconds in this dataset,\n", 545 | "# but if we did, we can extract them as follows:\n", 546 | "\n", 547 | "data[\"microsec\"] = data[\"date\"].dt.microsecond\n", 548 | "data[\"nanosec\"] = data[\"date\"].dt.nanosecond\n", 549 | "\n", 550 | "data.head()" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "### Extract hr, min, sec, at the same time" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 7, 563 | "metadata": {}, 564 | "outputs": [ 565 | { 566 | "data": { 567 | "text/html": [ 
568 | "
\n", 569 | "\n", 582 | "\n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | "
InvoiceStockCodeDescriptionQuantityInvoiceDatePriceCustomer IDCountrydatetime_parthourminsecmicrosecnanosechms
04894348504815CM CHRISTMAS GLASS BALL 20 LIGHTS122009-12-01 07:45:006.9513085.0United Kingdom2009-12-01 07:45:0007:45:007450007450
148943479323PPINK CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom2009-12-01 07:45:0007:45:007450007450
248943479323WWHITE CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom2009-12-01 07:45:0007:45:007450007450
348943422041RECORD FRAME 7\" SINGLE SIZE482009-12-01 07:45:002.1013085.0United Kingdom2009-12-01 07:45:0007:45:007450007450
448943421232STRAWBERRY CERAMIC TRINKET BOX242009-12-01 07:45:001.2513085.0United Kingdom2009-12-01 07:45:0007:45:007450007450
\n", 714 | "
" 715 | ], 716 | "text/plain": [ 717 | " Invoice StockCode Description Quantity \\\n", 718 | "0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 \n", 719 | "1 489434 79323P PINK CHERRY LIGHTS 12 \n", 720 | "2 489434 79323W WHITE CHERRY LIGHTS 12 \n", 721 | "3 489434 22041 RECORD FRAME 7\" SINGLE SIZE 48 \n", 722 | "4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 \n", 723 | "\n", 724 | " InvoiceDate Price Customer ID Country date \\\n", 725 | "0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 726 | "1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 727 | "2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 728 | "3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 729 | "4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 730 | "\n", 731 | " time_part hour min sec microsec nanosec h m s \n", 732 | "0 07:45:00 7 45 0 0 0 7 45 0 \n", 733 | "1 07:45:00 7 45 0 0 0 7 45 0 \n", 734 | "2 07:45:00 7 45 0 0 0 7 45 0 \n", 735 | "3 07:45:00 7 45 0 0 0 7 45 0 \n", 736 | "4 07:45:00 7 45 0 0 0 7 45 0 " 737 | ] 738 | }, 739 | "execution_count": 7, 740 | "metadata": {}, 741 | "output_type": "execute_result" 742 | } 743 | ], 744 | "source": [ 745 | "# Now, let's repeat what we did in the previous cell in 1 command.\n", 746 | "\n", 747 | "data[[\"h\", \"m\", \"s\"]] = pd.DataFrame(\n", 748 | " [(x.hour, x.minute, x.second) for x in data[\"date\"]]\n", 749 | ")\n", 750 | "\n", 751 | "data.head()" 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "## Work with different timezones\n", 759 | "\n", 760 | "In the next few cells, we will see how to work with timestamps that are in different time zones." 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": 8, 766 | "metadata": {}, 767 | "outputs": [ 768 | { 769 | "data": { 770 | "text/html": [ 771 | "
\n", 772 | "\n", 785 | "\n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | "
time
02014-08-01 09:00:00+02:00
12014-08-01 10:00:00+02:00
22014-08-01 11:00:00+02:00
02014-08-01 09:00:00-05:00
12014-08-01 10:00:00-05:00
22014-08-01 11:00:00-05:00
\n", 819 | "
" 820 | ], 821 | "text/plain": [ 822 | " time\n", 823 | "0 2014-08-01 09:00:00+02:00\n", 824 | "1 2014-08-01 10:00:00+02:00\n", 825 | "2 2014-08-01 11:00:00+02:00\n", 826 | "0 2014-08-01 09:00:00-05:00\n", 827 | "1 2014-08-01 10:00:00-05:00\n", 828 | "2 2014-08-01 11:00:00-05:00" 829 | ] 830 | }, 831 | "execution_count": 8, 832 | "metadata": {}, 833 | "output_type": "execute_result" 834 | } 835 | ], 836 | "source": [ 837 | "# First, let's create a toy dataframe with some timestamps in different time zones.\n", 838 | "\n", 839 | "df = pd.DataFrame()\n", 840 | "\n", 841 | "df[\"time\"] = pd.concat(\n", 842 | " [\n", 843 | " pd.Series(\n", 844 | " pd.date_range(\n", 845 | " start=\"2014-08-01 09:00\", freq=\"H\", periods=3, tz=\"Europe/Berlin\"\n", 846 | " )\n", 847 | " ),\n", 848 | " pd.Series(\n", 849 | " pd.date_range(\n", 850 | " start=\"2014-08-01 09:00\", freq=\"H\", periods=3, tz=\"US/Central\"\n", 851 | " )\n", 852 | " ),\n", 853 | " ],\n", 854 | " axis=0,\n", 855 | ")\n", 856 | "\n", 857 | "df" 858 | ] 859 | }, 860 | { 861 | "cell_type": "markdown", 862 | "metadata": {}, 863 | "source": [ 864 | "We can see the different timezones indicated by the +02:00 and -05:00 offsets, that is, the number of hours ahead of or behind UTC (Coordinated Universal Time)." 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": 9, 870 | "metadata": {}, 871 | "outputs": [ 872 | { 873 | "data": { 874 | "text/html": [ 875 | "
\n", 876 | "\n", 889 | "\n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | "
timetime_utctime_london
02014-08-01 09:00:00+02:002014-08-01 07:00:00+00:002014-08-01 08:00:00+01:00
12014-08-01 10:00:00+02:002014-08-01 08:00:00+00:002014-08-01 09:00:00+01:00
22014-08-01 11:00:00+02:002014-08-01 09:00:00+00:002014-08-01 10:00:00+01:00
02014-08-01 09:00:00-05:002014-08-01 14:00:00+00:002014-08-01 15:00:00+01:00
12014-08-01 10:00:00-05:002014-08-01 15:00:00+00:002014-08-01 16:00:00+01:00
22014-08-01 11:00:00-05:002014-08-01 16:00:00+00:002014-08-01 17:00:00+01:00
\n", 937 | "
" 938 | ], 939 | "text/plain": [ 940 | " time time_utc \\\n", 941 | "0 2014-08-01 09:00:00+02:00 2014-08-01 07:00:00+00:00 \n", 942 | "1 2014-08-01 10:00:00+02:00 2014-08-01 08:00:00+00:00 \n", 943 | "2 2014-08-01 11:00:00+02:00 2014-08-01 09:00:00+00:00 \n", 944 | "0 2014-08-01 09:00:00-05:00 2014-08-01 14:00:00+00:00 \n", 945 | "1 2014-08-01 10:00:00-05:00 2014-08-01 15:00:00+00:00 \n", 946 | "2 2014-08-01 11:00:00-05:00 2014-08-01 16:00:00+00:00 \n", 947 | "\n", 948 | " time_london \n", 949 | "0 2014-08-01 08:00:00+01:00 \n", 950 | "1 2014-08-01 09:00:00+01:00 \n", 951 | "2 2014-08-01 10:00:00+01:00 \n", 952 | "0 2014-08-01 15:00:00+01:00 \n", 953 | "1 2014-08-01 16:00:00+01:00 \n", 954 | "2 2014-08-01 17:00:00+01:00 " 955 | ] 956 | }, 957 | "execution_count": 9, 958 | "metadata": {}, 959 | "output_type": "execute_result" 960 | } 961 | ], 962 | "source": [ 963 | "# To work with different time zones, first we unify the\n", 964 | "# timestamps to UTC (Coordinated Universal Time) by setting utc = True.\n", 965 | "\n", 966 | "df[\"time_utc\"] = pd.to_datetime(df[\"time\"], utc=True)\n", 967 | "\n", 968 | "# Next, we change all timestamps to the desired timezone,\n", 969 | "# e.g., Europe/London, as in this example.\n", 970 | "\n", 971 | "df[\"time_london\"] = df[\"time_utc\"].dt.tz_convert(\"Europe/London\")\n", 972 | "\n", 973 | "\n", 974 | "df" 975 | ] 976 | }, 977 | { 978 | "cell_type": "markdown", 979 | "metadata": {}, 980 | "source": [ 981 | "Whether to unify the timezone depends on the use case. If we are forecasting sales for different countries, perhaps it is better to keep each country's respective time zone, since we will treat those series independently.\n", 982 | "\n", 983 | "If we have a small company that sells mostly inland and occasionally sells something abroad, we probably have the local timezone already, but if we do not, we may want to localize the time stamp to our time zone."
984 | ] 985 | }, 986 | { 987 | "cell_type": "code", 988 | "execution_count": null, 989 | "metadata": {}, 990 | "outputs": [], 991 | "source": [] 992 | } 993 | ], 994 | "metadata": { 995 | "kernelspec": { 996 | "display_name": "fets", 997 | "language": "python", 998 | "name": "fets" 999 | }, 1000 | "language_info": { 1001 | "codemirror_mode": { 1002 | "name": "ipython", 1003 | "version": 3 1004 | }, 1005 | "file_extension": ".py", 1006 | "mimetype": "text/x-python", 1007 | "name": "python", 1008 | "nbconvert_exporter": "python", 1009 | "pygments_lexer": "ipython3", 1010 | "version": "3.8.2" 1011 | }, 1012 | "toc": { 1013 | "base_numbering": 1, 1014 | "nav_menu": {}, 1015 | "number_sections": true, 1016 | "sideBar": true, 1017 | "skip_h1_title": false, 1018 | "title_cell": "Table of Contents", 1019 | "title_sidebar": "Contents", 1020 | "toc_cell": false, 1021 | "toc_position": {}, 1022 | "toc_section_display": "block", 1023 | "toc_window_display": true 1024 | } 1025 | }, 1026 | "nbformat": 4, 1027 | "nbformat_minor": 2 1028 | } 1029 | -------------------------------------------------------------------------------- /12-Categorical-Encoding/3-mean-encoding-simple.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b699e295", 6 | "metadata": {}, 7 | "source": [ 8 | "# Mean encoding - simple\n", 9 | "\n", 10 | "[Feature Engineering for Time Series Forecasting](https://www.trainindata.com/p/feature-engineering-for-forecasting)\n", 11 | "\n", 12 | "In this notebook, we will encode static features with mean encoding. 
We will split the data into train and test sets, learn the mean target value per category using the train set, and then encode both the train and test sets with those learned parameters.\n", 13 | "\n", 14 | "It has the advantage that this logic is implemented by open-source libraries.\n", 15 | "\n", 16 | "The drawback is that we may overfit because we are leaking future data into the past. \n", 17 | "\n", 18 | "We will use the online retail dataset, which we prepared in the notebook `02-create-online-retail-II-datasets.ipynb` located in the `01-Create-Datasets` folder." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "id": "49b2f0bf", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "from feature_engine.encoding import MeanEncoder" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "5a174f3b", 36 | "metadata": {}, 37 | "source": [ 38 | "## Load data" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "id": "67a2af74", 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/html": [ 50 | "
\n", 51 | "\n", 64 | "\n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | "
countryquantityrevenue
week
2009-12-06Belgium143439.1
2009-12-13Belgium108.5
2009-12-20Belgium00.0
2009-12-27Belgium00.0
2010-01-03Belgium00.0
\n", 112 | "
" 113 | ], 114 | "text/plain": [ 115 | " country quantity revenue\n", 116 | "week \n", 117 | "2009-12-06 Belgium 143 439.1\n", 118 | "2009-12-13 Belgium 10 8.5\n", 119 | "2009-12-20 Belgium 0 0.0\n", 120 | "2009-12-27 Belgium 0 0.0\n", 121 | "2010-01-03 Belgium 0 0.0" 122 | ] 123 | }, 124 | "execution_count": 2, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "df = pd.read_csv(\"../Datasets/online_retail_dataset_countries.csv\",\n", 131 | " parse_dates=[\"week\"],\n", 132 | " index_col=\"week\",\n", 133 | " )\n", 134 | "\n", 135 | "df.head()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "id": "4a419d6a", 141 | "metadata": {}, 142 | "source": [ 143 | "## Split into train and test" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 3, 149 | "id": "1f4c0763", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "# Split the data before and after June 2011\n", 154 | "\n", 155 | "X_train = df[df.index <= pd.to_datetime('2011-06-30')]\n", 156 | "X_test = df[df.index > pd.to_datetime('2011-06-30')]\n", 157 | "\n", 158 | "y_train = X_train[\"revenue\"]\n", 159 | "y_test = X_test[\"revenue\"]" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 4, 165 | "id": "928be034", 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "(Timestamp('2009-12-06 00:00:00'), Timestamp('2011-06-26 00:00:00'))" 172 | ] 173 | }, 174 | "execution_count": 4, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "# sanity check\n", 181 | "\n", 182 | "X_train.index.min(), X_train.index.max()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 5, 188 | "id": "6e838b49", 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "(Timestamp('2011-07-03 00:00:00'), Timestamp('2011-12-11 00:00:00'))" 195 | ] 196 | }, 197 | 
"execution_count": 5, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "# sanity check\n", 204 | "\n", 205 | "X_test.index.min(), X_test.index.max()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "d5de7aa0", 211 | "metadata": {}, 212 | "source": [ 213 | "## Encode" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 6, 219 | "id": "2402ebb9", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "# Set up the mean encoder\n", 224 | "\n", 225 | "enc = MeanEncoder()" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 7, 231 | "id": "74ef4a1a", 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "data": { 236 | "text/html": [ 237 | "
MeanEncoder()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" 238 | ], 239 | "text/plain": [ 240 | "MeanEncoder()" 241 | ] 242 | }, 243 | "execution_count": 7, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "# Find mean target value per category\n", 250 | "# (it uses the entire train set)\n", 251 | "\n", 252 | "enc.fit(X_train, y_train)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 8, 258 | "id": "1667b70c", 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "['country']" 265 | ] 266 | }, 267 | "execution_count": 8, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "# Feature-engine's encoder finds categorical variables\n", 274 | "# by default\n", 275 | "\n", 276 | "enc.variables_" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 9, 282 | "id": "90a34078", 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "{'country': {'Belgium': 511.37853658536585,\n", 289 | " 'EIRE': 5579.161829268293,\n", 290 | " 'France': 2872.7475609756098,\n", 291 | " 'Germany': 3764.180012195122,\n", 292 | " 'Spain': 919.3335365853659,\n", 293 | " 'United Kingdom': 129124.83931707316}}" 294 | ] 295 | }, 296 | "execution_count": 9, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "# the encoding values\n", 303 | "\n", 304 | "enc.encoder_dict_" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 10, 310 | "id": "2c4cf198", 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "data": { 315 | "text/html": [ 316 | "
\n", 317 | "\n", 330 | "\n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | "
countryquantityrevenue
week
2009-12-06511.378537143439.1
2009-12-13511.378537108.5
2009-12-20511.37853700.0
2009-12-27511.37853700.0
2010-01-03511.37853700.0
\n", 378 | "
" 379 | ], 380 | "text/plain": [ 381 | " country quantity revenue\n", 382 | "week \n", 383 | "2009-12-06 511.378537 143 439.1\n", 384 | "2009-12-13 511.378537 10 8.5\n", 385 | "2009-12-20 511.378537 0 0.0\n", 386 | "2009-12-27 511.378537 0 0.0\n", 387 | "2010-01-03 511.378537 0 0.0" 388 | ] 389 | }, 390 | "execution_count": 10, 391 | "metadata": {}, 392 | "output_type": "execute_result" 393 | } 394 | ], 395 | "source": [ 396 | "# Encode datasets\n", 397 | "\n", 398 | "X_train_t = enc.transform(X_train)\n", 399 | "X_test_t = enc.transform(X_test)\n", 400 | "\n", 401 | "X_train_t.head()" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "id": "85599ce7", 407 | "metadata": {}, 408 | "source": [ 409 | "Note that Belgium was replaced by 511.37 in all rows, even though on various occasions the revenue was 0. This may result in a \"look ahead\" bias." 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "id": "60a6c207", 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [] 419 | } 420 | ], 421 | "metadata": { 422 | "kernelspec": { 423 | "display_name": "fsml", 424 | "language": "python", 425 | "name": "fsml" 426 | }, 427 | "language_info": { 428 | "codemirror_mode": { 429 | "name": "ipython", 430 | "version": 3 431 | }, 432 | "file_extension": ".py", 433 | "mimetype": "text/x-python", 434 | "name": "python", 435 | "nbconvert_exporter": "python", 436 | "pygments_lexer": "ipython3", 437 | "version": "3.10.5" 438 | }, 439 | "toc": { 440 | "base_numbering": 1, 441 | "nav_menu": {}, 442 | "number_sections": true, 443 | "sideBar": true, 444 | "skip_h1_title": false, 445 | "title_cell": "Table of Contents", 446 | "title_sidebar": "Contents", 447 | "toc_cell": false, 448 | "toc_position": { 449 | "height": "calc(100% - 180px)", 450 | "left": "10px", 451 | "top": "150px", 452 | "width": "165px" 453 | }, 454 | "toc_section_display": true, 455 | "toc_window_display": true 456 | } 457 | }, 458 | "nbformat": 4, 459 | 
"nbformat_minor": 5 460 | } 461 | -------------------------------------------------------------------------------- /12-Categorical-Encoding/4-mean-encoding-expanding-window.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b699e295", 6 | "metadata": {}, 7 | "source": [ 8 | "# Mean encoding - expanding window\n", 9 | "\n", 10 | "[Feature Engineering for Time Series Forecasting](https://www.trainindata.com/p/feature-engineering-for-forecasting)\n", 11 | "\n", 12 | "In this notebook, we will encode static features with mean encoding by using expanding windows. This implementation avoids look-ahead bias.\n", 13 | "\n", 14 | "We will use the online retail dataset, which we prepared in the notebook `02-create-online-retail-II-datasets.ipynb` located in the `01-Create-Datasets` folder." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "id": "49b2f0bf", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import pandas as pd" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "id": "5a174f3b", 31 | "metadata": {}, 32 | "source": [ 33 | "## Load data" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "67a2af74", 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/html": [ 45 | "
\n", 46 | "\n", 59 | "\n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | "
countryquantityrevenue
week
2009-12-06Belgium143439.1
2009-12-13Belgium108.5
2009-12-20Belgium00.0
2009-12-27Belgium00.0
2010-01-03Belgium00.0
\n", 107 | "
" 108 | ], 109 | "text/plain": [ 110 | " country quantity revenue\n", 111 | "week \n", 112 | "2009-12-06 Belgium 143 439.1\n", 113 | "2009-12-13 Belgium 10 8.5\n", 114 | "2009-12-20 Belgium 0 0.0\n", 115 | "2009-12-27 Belgium 0 0.0\n", 116 | "2010-01-03 Belgium 0 0.0" 117 | ] 118 | }, 119 | "execution_count": 2, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "df = pd.read_csv(\"../Datasets/online_retail_dataset_countries.csv\",\n", 126 | " parse_dates=[\"week\"],\n", 127 | " index_col=\"week\",\n", 128 | " )\n", 129 | "\n", 130 | "df.head()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "id": "50846272", 136 | "metadata": {}, 137 | "source": [ 138 | "## Split into train and test" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 3, 144 | "id": "1f4c0763", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# Split data before and after June 2011\n", 149 | "\n", 150 | "X_train = df[df.index <= pd.to_datetime('2011-06-30')]\n", 151 | "\n", 152 | "# We need the past data for the expanding window.\n", 153 | "X_test = df.copy()\n", 154 | "\n", 155 | "# the target variable\n", 156 | "y_train = X_train[\"revenue\"]\n", 157 | "y_test = X_test[\"revenue\"]" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 4, 163 | "id": "e1418b42", 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "(Timestamp('2009-12-06 00:00:00'), Timestamp('2011-06-26 00:00:00'))" 170 | ] 171 | }, 172 | "execution_count": 4, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "# sanity check\n", 179 | "\n", 180 | "X_train.index.min(), X_train.index.max()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 5, 186 | "id": "1faf10f7", 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "(Timestamp('2009-12-06 
00:00:00'), Timestamp('2011-12-11 00:00:00'))" 193 | ] 194 | }, 195 | "execution_count": 5, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "# sanity check\n", 202 | "\n", 203 | "X_test.index.min(), X_test.index.max()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "id": "d5de7aa0", 209 | "metadata": {}, 210 | "source": [ 211 | "## Encode countries" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 6, 217 | "id": "931e9ef9", 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/html": [ 223 | "
\n", 224 | "\n", 237 | "\n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | "
countryweekcountry_enc
0Belgium2009-12-06NaN
1Belgium2009-12-13439.100000
2Belgium2009-12-20223.800000
3Belgium2009-12-27149.200000
4Belgium2010-01-03111.900000
............
487United Kingdom2011-05-29129923.850701
488United Kingdom2011-06-05129810.417487
489United Kingdom2011-06-12129208.338025
490United Kingdom2011-06-19129708.159425
491United Kingdom2011-06-26129598.153506
\n", 315 | "

492 rows × 3 columns

\n", 316 | "
" 317 | ], 318 | "text/plain": [ 319 | " country week country_enc\n", 320 | "0 Belgium 2009-12-06 NaN\n", 321 | "1 Belgium 2009-12-13 439.100000\n", 322 | "2 Belgium 2009-12-20 223.800000\n", 323 | "3 Belgium 2009-12-27 149.200000\n", 324 | "4 Belgium 2010-01-03 111.900000\n", 325 | ".. ... ... ...\n", 326 | "487 United Kingdom 2011-05-29 129923.850701\n", 327 | "488 United Kingdom 2011-06-05 129810.417487\n", 328 | "489 United Kingdom 2011-06-12 129208.338025\n", 329 | "490 United Kingdom 2011-06-19 129708.159425\n", 330 | "491 United Kingdom 2011-06-26 129598.153506\n", 331 | "\n", 332 | "[492 rows x 3 columns]" 333 | ] 334 | }, 335 | "execution_count": 6, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "# train set first\n", 342 | "\n", 343 | "train_enc = (\n", 344 | " X_train\n", 345 | " .groupby(['country'])['revenue']\n", 346 | " .expanding()\n", 347 | " .mean()\n", 348 | " .shift()\n", 349 | ").reset_index()\n", 350 | "\n", 351 | "train_enc.rename(columns = {\"revenue\": \"country_enc\"}, inplace = True)\n", 352 | "\n", 353 | "train_enc" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 7, 359 | "id": "6d3d07a7", 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/html": [ 365 | "
\n", 366 | "\n", 379 | "\n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | "
weekcountryquantityrevenuecountry_enc
02009-12-06Belgium143439.10NaN
12009-12-13Belgium108.50439.100000
22009-12-20Belgium00.00223.800000
32009-12-27Belgium00.00149.200000
42010-01-03Belgium00.00111.900000
..................
4872011-05-29United Kingdom67666121076.06129923.850701
4882011-06-05United Kingdom4442282246.14129810.417487
4892011-06-12United Kingdom77850169194.05129208.338025
4902011-06-19United Kingdom68207120797.68129708.159425
4912011-06-26United Kingdom5710290786.39129598.153506
\n", 481 | "

492 rows × 5 columns

\n", 482 | "
" 483 | ], 484 | "text/plain": [ 485 | " week country quantity revenue country_enc\n", 486 | "0 2009-12-06 Belgium 143 439.10 NaN\n", 487 | "1 2009-12-13 Belgium 10 8.50 439.100000\n", 488 | "2 2009-12-20 Belgium 0 0.00 223.800000\n", 489 | "3 2009-12-27 Belgium 0 0.00 149.200000\n", 490 | "4 2010-01-03 Belgium 0 0.00 111.900000\n", 491 | ".. ... ... ... ... ...\n", 492 | "487 2011-05-29 United Kingdom 67666 121076.06 129923.850701\n", 493 | "488 2011-06-05 United Kingdom 44422 82246.14 129810.417487\n", 494 | "489 2011-06-12 United Kingdom 77850 169194.05 129208.338025\n", 495 | "490 2011-06-19 United Kingdom 68207 120797.68 129708.159425\n", 496 | "491 2011-06-26 United Kingdom 57102 90786.39 129598.153506\n", 497 | "\n", 498 | "[492 rows x 5 columns]" 499 | ] 500 | }, 501 | "execution_count": 7, 502 | "metadata": {}, 503 | "output_type": "execute_result" 504 | } 505 | ], 506 | "source": [ 507 | "# Add encoded variable to original train set\n", 508 | "\n", 509 | "X_train_enc = X_train.reset_index().merge(train_enc)\n", 510 | "\n", 511 | "X_train_enc" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 8, 517 | "id": "5f6bf153", 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "data": { 522 | "text/html": [ 523 | "
\n", 524 | "\n", 537 | "\n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | "
quantityrevenuecountry_enc
week
2009-12-06143439.1NaN
2009-12-13108.5439.1
2009-12-2000.0223.8
2009-12-2700.0149.2
2010-01-0300.0111.9
\n", 585 | "
" 586 | ], 587 | "text/plain": [ 588 | " quantity revenue country_enc\n", 589 | "week \n", 590 | "2009-12-06 143 439.1 NaN\n", 591 | "2009-12-13 10 8.5 439.1\n", 592 | "2009-12-20 0 0.0 223.8\n", 593 | "2009-12-27 0 0.0 149.2\n", 594 | "2010-01-03 0 0.0 111.9" 595 | ] 596 | }, 597 | "execution_count": 8, 598 | "metadata": {}, 599 | "output_type": "execute_result" 600 | } 601 | ], 602 | "source": [ 603 | "# Now we drop the static variable\n", 604 | "\n", 605 | "X_train_enc = X_train_enc.drop(\"country\", axis=1)\n", 606 | "\n", 607 | "# Reset the index\n", 608 | "X_train_enc.set_index(\"week\", inplace=True)\n", 609 | "\n", 610 | "X_train_enc.head()" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 9, 616 | "id": "2402ebb9", 617 | "metadata": {}, 618 | "outputs": [ 619 | { 620 | "data": { 621 | "text/html": [ 622 | "
\n", 623 | "\n", 636 | "\n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | "
quantityrevenuecountry_enc
week
2011-07-03103163.90511.378537
2011-07-106661022.82507.192048
2011-07-171345.60513.330476
2011-07-2400.00507.827765
2011-07-3110001407.15501.922791
\n", 684 | "
" 685 | ], 686 | "text/plain": [ 687 | " quantity revenue country_enc\n", 688 | "week \n", 689 | "2011-07-03 103 163.90 511.378537\n", 690 | "2011-07-10 666 1022.82 507.192048\n", 691 | "2011-07-17 13 45.60 513.330476\n", 692 | "2011-07-24 0 0.00 507.827765\n", 693 | "2011-07-31 1000 1407.15 501.922791" 694 | ] 695 | }, 696 | "execution_count": 9, 697 | "metadata": {}, 698 | "output_type": "execute_result" 699 | } 700 | ], 701 | "source": [ 702 | "# Now we repeat for the test set\n", 703 | "\n", 704 | "# Find the encoding values\n", 705 | "test_enc = (\n", 706 | " X_test\n", 707 | " .groupby(['country'])['revenue']\n", 708 | " .expanding()\n", 709 | " .mean()\n", 710 | " .shift()\n", 711 | ").reset_index()\n", 712 | "\n", 713 | "test_enc.rename(columns = {\"revenue\": \"country_enc\"}, inplace = True)\n", 714 | "\n", 715 | "# join encoded variable\n", 716 | "X_test_enc = X_test.reset_index().merge(test_enc)\n", 717 | "\n", 718 | "# Drop original variable\n", 719 | "X_test_enc = X_test_enc.drop(\"country\", axis=1)\n", 720 | "\n", 721 | "# Reset the index\n", 722 | "X_test_enc.set_index(\"week\", inplace=True)\n", 723 | "\n", 724 | "# Remove data that belongs to the train set\n", 725 | "X_test_enc = X_test_enc[X_test_enc.index > pd.to_datetime('2011-06-30')]\n", 726 | "\n", 727 | "X_test_enc.head()" 728 | ] 729 | }, 730 | { 731 | "cell_type": "markdown", 732 | "id": "86a89e3e", 733 | "metadata": {}, 734 | "source": [ 735 | "That's it!\n", 736 | "\n", 737 | "As you can see, with this way of encoding the static feature, we need to do a lot of the work manually, and we need to be careful to have enough data in the train set, and to split the data correctly after the encoding." 
738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": null, 743 | "id": "77b803d1", 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [] 747 | } 748 | ], 749 | "metadata": { 750 | "kernelspec": { 751 | "display_name": "fsml", 752 | "language": "python", 753 | "name": "fsml" 754 | }, 755 | "language_info": { 756 | "codemirror_mode": { 757 | "name": "ipython", 758 | "version": 3 759 | }, 760 | "file_extension": ".py", 761 | "mimetype": "text/x-python", 762 | "name": "python", 763 | "nbconvert_exporter": "python", 764 | "pygments_lexer": "ipython3", 765 | "version": "3.10.5" 766 | }, 767 | "toc": { 768 | "base_numbering": 1, 769 | "nav_menu": {}, 770 | "number_sections": true, 771 | "sideBar": true, 772 | "skip_h1_title": false, 773 | "title_cell": "Table of Contents", 774 | "title_sidebar": "Contents", 775 | "toc_cell": false, 776 | "toc_position": { 777 | "height": "calc(100% - 180px)", 778 | "left": "10px", 779 | "top": "150px", 780 | "width": "173.267px" 781 | }, 782 | "toc_section_display": true, 783 | "toc_window_display": true 784 | } 785 | }, 786 | "nbformat": 4, 787 | "nbformat_minor": 5 788 | } 789 | -------------------------------------------------------------------------------- /Appendix/00-pandas-period.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "24d78855-9712-419b-8201-486452f5120a", 6 | "metadata": {}, 7 | "source": [ 8 | "# Pandas Period" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "bdb234fa-ee2e-403d-a0ff-4b2c0fdced43", 14 | "metadata": {}, 15 | "source": [ 16 | "[Feature Engineering for Time Series Forecasting](https://www.trainindata.com/p/feature-engineering-for-forecasting)\n", 17 | "\n", 18 | "In this notebook we'll discuss the Pandas `Period` and `PeriodIndex` type to handle time span related data." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "010aee50-728d-4c24-a6f6-9282a71364c1", 24 | "metadata": {}, 25 | "source": [ 26 | "# Load example data" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "55065bd0-a3fe-4d4b-970c-ea09d514fb12", 32 | "metadata": {}, 33 | "source": [ 34 | "The air passengers dataset is the monthly totals of international airline passengers, from 1949 to 1960, in units of 1000s. \n", 35 | "\n", 36 | "For instructions on how to download, prepare, and store the dataset, refer to notebook number 5, in the folder \"01-Create-Datasets\" from this repo." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 1, 42 | "id": "7e30d3c0-baa1-4fb0-86c4-6196e46641c0", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "import pandas as pd\n", 47 | "import numpy as np" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "id": "d8d74785-9082-4711-8dad-de0d3b333ab6", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "df = pd.read_csv(\n", 58 | " \"../Datasets/example_air_passengers.csv\",\n", 59 | " parse_dates=[\"ds\"],\n", 60 | " index_col=[\"ds\"],\n", 61 | ")" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "id": "f258096a-1171-43b0-97d7-70ce59f74e00", 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "DatetimeIndex(['1949-01-01', '1949-02-01', '1949-03-01', '1949-04-01',\n", 74 | " '1949-05-01', '1949-06-01', '1949-07-01', '1949-08-01',\n", 75 | " '1949-09-01', '1949-10-01',\n", 76 | " ...\n", 77 | " '1960-03-01', '1960-04-01', '1960-05-01', '1960-06-01',\n", 78 | " '1960-07-01', '1960-08-01', '1960-09-01', '1960-10-01',\n", 79 | " '1960-11-01', '1960-12-01'],\n", 80 | " dtype='datetime64[ns]', name='ds', length=144, freq=None)" 81 | ] 82 | }, 83 | "execution_count": 3, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "df.index" 90 | ] 91 
| }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "id": "777e7fb4-3554-41e4-8a0d-8b1d8086e14f", 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "pandas._libs.tslibs.timestamps.Timestamp" 102 | ] 103 | }, 104 | "execution_count": 4, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "type(df.index[0])" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "id": "d9d5b80c-3b03-427e-a474-de54022904b9", 116 | "metadata": {}, 117 | "source": [ 118 | "The current type of our index is a `DatetimeIndex` where each element is a `Timestamp`." 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "bf881c8f-77b0-4dea-89a3-9a39046a5e64", 124 | "metadata": {}, 125 | "source": [ 126 | "# Pandas Period - what is it and when to use it." 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "7e98b0a0-27a6-4515-91fa-961a76ae0a8a", 132 | "metadata": {}, 133 | "source": [ 134 | "When working with time related information which refers to a time span (e.g., the sales of products over each month) rather than an instance in time (e.g., an event that occurs at a specific timestamp), it can be more convenient to work with a data type in Pandas called `Period`." 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "995f40b2-fab4-40a0-9b43-643da70f2b56", 140 | "metadata": {}, 141 | "source": [ 142 | "To read more about the `Period` type in Pandas see the [docs](https://pandas.pydata.org/docs/user_guide/timeseries.html), in particular the section titled \"timestamps vs. time spans\".\n", 143 | " \n", 144 | " > \"A `Period` represents a span of time (e.g., a day, a month, a quarter, etc).\"\n", 145 | " \n", 146 | " > \"Under the hood, pandas represents timestamps using instances of `Timestamp` and sequences of timestamps using instances of `DatetimeIndex`. 
For regular time spans, pandas uses `Period` objects for scalar values and `PeriodIndex` for sequences of spans.\"" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "id": "1d47571a-47c0-4611-b5a1-f90e456a72eb", 152 | "metadata": {}, 153 | "source": [ 154 | "`Period` objects can be created just as easily as `Timestamp` objects." 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 5, 160 | "id": "f979b672-257f-459e-9b55-84d4aab3760b", 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "Timestamp('2020-01-01 00:00:00')" 167 | ] 168 | }, 169 | "execution_count": 5, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "pd.Timestamp(\"2020-01-01\") # Create a timestamp representing 1st January 2020 at time 00:00:00" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 6, 181 | "id": "1ee94d2b-94a0-4f7e-9a86-5ee68430f8b2", 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "Period('2020-01', 'M')" 188 | ] 189 | }, 190 | "execution_count": 6, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "pd.Period(\"2020-01\", freq=\"M\") # Create a time period representing the month of January 2020" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "id": "bb8d418b-64b5-4c1b-a214-fe64a2e7eb5d", 202 | "metadata": {}, 203 | "source": [ 204 | "For example, our dataset index currently is a `DatetimeIndex` where there is a day (and even a time) associated with each month (e.g., 1960-12-01 00:00:00), despite the day and time being meaningless for this data set. What we're trying to represent is the sales over the time span of a given month."
205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 7, 210 | "id": "463cb570-fbab-4a1b-926e-ff911d628868", 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/html": [ 216 | "
\n", 217 | "\n", 230 | "\n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | "
y
ds
1949-01-01112
1949-02-01118
1949-03-01132
1949-04-01129
1949-05-01121
\n", 264 | "
" 265 | ], 266 | "text/plain": [ 267 | " y\n", 268 | "ds \n", 269 | "1949-01-01 112\n", 270 | "1949-02-01 118\n", 271 | "1949-03-01 132\n", 272 | "1949-04-01 129\n", 273 | "1949-05-01 121" 274 | ] 275 | }, 276 | "execution_count": 7, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "df.head()" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "id": "0c107454-d49a-49e4-adbf-2c0a97f6597d", 288 | "metadata": {}, 289 | "source": [ 290 | "We can convert the index from `datetime` to `Period` as follows:" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 8, 296 | "id": "bba662e9-82ee-47a3-ad78-88047a12a911", 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "df.index = df.index.to_period()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 9, 306 | "id": "eda8d569-a678-4d35-90a6-d288cf53986d", 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/html": [ 312 | "
\n", 313 | "\n", 326 | "\n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | "
y
ds
1949-01112
1949-02118
1949-03132
1949-04129
1949-05121
\n", 360 | "
" 361 | ], 362 | "text/plain": [ 363 | " y\n", 364 | "ds \n", 365 | "1949-01 112\n", 366 | "1949-02 118\n", 367 | "1949-03 132\n", 368 | "1949-04 129\n", 369 | "1949-05 121" 370 | ] 371 | }, 372 | "execution_count": 9, 373 | "metadata": {}, 374 | "output_type": "execute_result" 375 | } 376 | ], 377 | "source": [ 378 | "df.head()" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 10, 384 | "id": "e6d889f0-8c74-4eec-a367-da6c005b067c", 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "data": { 389 | "text/plain": [ 390 | "PeriodIndex(['1949-01', '1949-02', '1949-03', '1949-04', '1949-05', '1949-06',\n", 391 | " '1949-07', '1949-08', '1949-09', '1949-10',\n", 392 | " ...\n", 393 | " '1960-03', '1960-04', '1960-05', '1960-06', '1960-07', '1960-08',\n", 394 | " '1960-09', '1960-10', '1960-11', '1960-12'],\n", 395 | " dtype='period[M]', name='ds', length=144)" 396 | ] 397 | }, 398 | "execution_count": 10, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "df.index" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "id": "4f351009-6800-470c-845c-5f7338a8db97", 410 | "metadata": {}, 411 | "source": [ 412 | "We now have a `PeriodIndex` with monthly frequency which better represents the time series (i.e., the sales over the whole month)." 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "id": "1e2f4e38-9bee-46dc-88b1-8933bf5a0394", 418 | "metadata": {}, 419 | "source": [ 420 | "`Period` objects can make it easier to do certain calculations. 
Let's add one month to a given period:" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 11, 426 | "id": "9eae6521-ed45-4bb9-b1c4-c375c5ebdf6b", 427 | "metadata": {}, 428 | "outputs": [ 429 | { 430 | "data": { 431 | "text/plain": [ 432 | "Period('1949-01', 'M')" 433 | ] 434 | }, 435 | "execution_count": 11, 436 | "metadata": {}, 437 | "output_type": "execute_result" 438 | } 439 | ], 440 | "source": [ 441 | "df.index[0]" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 12, 447 | "id": "e86cbaa9-ce01-47cc-b5e0-b8ef6931d77e", 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "data": { 452 | "text/plain": [ 453 | "Period('1949-02', 'M')" 454 | ] 455 | }, 456 | "execution_count": 12, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | } 460 | ], 461 | "source": [ 462 | "df.index[0] + 1" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "id": "3d567f8f-a0f7-4c3d-84ba-4385f44daeb5", 468 | "metadata": {}, 469 | "source": [ 470 | "`Period` is also the preferred type when calculating the **exact** differences in dates in terms of calendar events (e.g., what is the exact integer difference between the week numbers of the two following timestamps: \"2012-01-15 10:00:00\" (week 2, year 2012) and \"2014-04-01 01:30:00\" (week 14, year 2014))" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "id": "95c877ef-d1a1-4f44-a0e1-5f7c700cc064", 476 | "metadata": {}, 477 | "source": [ 478 | "Using `Period`" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 13, 484 | "id": "fd40ddc1-00bd-4289-a018-27a918d72e68", 485 | "metadata": {}, 486 | "outputs": [ 487 | { 488 | "data": { 489 | "text/plain": [ 490 | "<-116 * Weeks: weekday=6>" 491 | ] 492 | }, 493 | "execution_count": 13, 494 | "metadata": {}, 495 | "output_type": "execute_result" 496 | } 497 | ], 498 | "source": [ 499 | "delta = pd.Period(\"2012-01-15 10:00:00\", freq=\"W\") - pd.Period(\"2014-04-01 
01:30:00\", freq=\"W\")\n", 500 | "delta" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "id": "63f43b42-ffcc-4f7e-ad2e-133bb2a056c8", 506 | "metadata": {}, 507 | "source": [ 508 | "We can get the integer using the `n` attribute:" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 14, 514 | "id": "bfa9b0fb-3e19-4465-baaa-d99d0cebb9d4", 515 | "metadata": {}, 516 | "outputs": [ 517 | { 518 | "data": { 519 | "text/plain": [ 520 | "-116" 521 | ] 522 | }, 523 | "execution_count": 14, 524 | "metadata": {}, 525 | "output_type": "execute_result" 526 | } 527 | ], 528 | "source": [ 529 | "delta.n" 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "id": "050c54fa-fc33-4b6e-97bb-e594fe148897", 535 | "metadata": {}, 536 | "source": [ 537 | "Using `Timestamp` and `timedelta` objects we only get approximate, and sometimes incorrect, answers:" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 15, 543 | "id": "be49da06-d33f-4be8-9a6b-443450331a92", 544 | "metadata": {}, 545 | "outputs": [ 546 | { 547 | "data": { 548 | "text/plain": [ 549 | "-115.23511904761905" 550 | ] 551 | }, 552 | "execution_count": 15, 553 | "metadata": {}, 554 | "output_type": "execute_result" 555 | } 556 | ], 557 | "source": [ 558 | "(pd.Timestamp(\"2012-01-15 10:00:00\") - pd.Timestamp(\"2014-04-01 01:30:00\")) / np.timedelta64(1, \"W\")" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "id": "3119df1a-2812-4133-b9e6-4e7580c8cf64", 564 | "metadata": {}, 565 | "source": [ 566 | "Whether we use `Period` or `datetime` should not change the forecasting workflow, but it will make some calculations easier depending on the time series." 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "id": "a3bf4812-70e1-4a36-9658-90d164395bc8", 572 | "metadata": {}, 573 | "source": [ 574 | "In general, if your data represents a timespan then `Period` (e.g., sales over one month) can make handling the data more convenient. 
If your data represents events that occurred at a timepoint then `datetime` or `Timestamp` is preferred." 575 | ] 576 | } 577 | ], 578 | "metadata": { 579 | "kernelspec": { 580 | "display_name": "Python 3 (ipykernel)", 581 | "language": "python", 582 | "name": "python3" 583 | }, 584 | "language_info": { 585 | "codemirror_mode": { 586 | "name": "ipython", 587 | "version": 3 588 | }, 589 | "file_extension": ".py", 590 | "mimetype": "text/x-python", 591 | "name": "python", 592 | "nbconvert_exporter": "python", 593 | "pygments_lexer": "ipython3", 594 | "version": "3.10.5" 595 | }, 596 | "toc": { 597 | "base_numbering": 1, 598 | "nav_menu": {}, 599 | "number_sections": true, 600 | "sideBar": true, 601 | "skip_h1_title": false, 602 | "title_cell": "Table of Contents", 603 | "title_sidebar": "Contents", 604 | "toc_cell": false, 605 | "toc_position": {}, 606 | "toc_section_display": true, 607 | "toc_window_display": false 608 | } 609 | }, 610 | "nbformat": 4, 611 | "nbformat_minor": 5 612 | } 613 | -------------------------------------------------------------------------------- /Datasets/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/Datasets/.gitkeep -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021-2024, Kishan Manani, Soledad Galli 4 | Feature Engineering for Time Series - Online Course: 5 | https://www.trainindata.com/p/feature-engineering-for-forecasting 6 | 7 | 8 | Redistribution and use in source and binary forms, with or without 9 | modification, are permitted provided that the following conditions are met: 10 | 11 | 1. 
Redistributions of source code must retain the above copyright notice, this 12 | list of conditions and the following disclaimer. 13 | 14 | 2. Redistributions in binary form must reproduce the above copyright notice, 15 | this list of conditions and the following disclaimer in the documentation 16 | and/or other materials provided with the distribution. 17 | 18 | 3. Neither the name of the copyright holder nor the names of its 19 | contributors may be used to endorse or promote products derived from 20 | this software without specific prior written permission. 21 | 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 27 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 29 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 30 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Feature Engineering for Time Series Forecasting - Code Repository 2 | 3 | [](https://www.trainindata.com/p/feature-engineering-for-forecasting) 4 | 5 | 6 | ![PythonVersion](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10-success) 7 | [![License https://github.com/trainindata/feature-engineering-for-time-series-forecasting/blob/master/LICENSE](https://img.shields.io/badge/license-BSD-success.svg)](https://github.com/trainindata/feature-engineering-for-time-series-forecasting/blob/master/LICENSE) 8 | [![Sponsorship https://www.trainindata.com/](https://img.shields.io/badge/Powered%20By-TrainInData-orange.svg)](https://www.trainindata.com/) 9 | 10 | Published October, 2022 11 | 12 | Actively maintained. 13 | 14 | ## Links 15 | 16 | - [Online Course](https://www.trainindata.com/p/feature-engineering-for-forecasting) 17 | 18 | 19 | ## Table of Contents 20 | 21 | 1. **Tabularizing time series data** 22 | 1. Features from the target 23 | 2. Features from exogenous variables 24 | 3. Single step forecasting 25 | 26 | 2. **Challenges in feature engineering for time series** 27 | 1. Train-test split 28 | 2. Pipelines 29 | 3. Multistep forecasting 30 | 4. Direct forecasting 31 | 5. Recursive forecasting 32 | 33 | 3. **Time series decomposition** 34 | 1. Components of a time series: trend and seasonality 35 | 2. Multiplicative and additive models 36 | 3. Log transform and Box-Cox 37 | 4. Moving averages 38 | 5. LOWESS, STL, and multiseasonal time series decomposition 39 | 40 | 4. **Missing data imputation** 41 | 1. Forward and backward filling 42 | 2. Linear and spline interpolation 43 | 3. Seasonal decomposition and interpolation 44 | 45 | 5. **Outliers** 46 | 1. Rolling statistics for outlier detection 47 | 2. LOWESS for outlier detection 48 | 3. 
STL for outlier detection 49 | 50 | 6. **Lag features** 51 | 1. Autoregressive processes 52 | 2. Lag plots 53 | 3. ACF, PACF, CCF 54 | 4. Seasonal lags 55 | 5. Creating lags with open-source 56 | 57 | 7. **Window features** 58 | 1. Rolling windows 59 | 2. Expanding windows 60 | 3. Exponentially weighted windows 61 | 4. Creating window features with open-source 62 | 63 | 8. **Trend features** 64 | 1. Using time to model linear trend 65 | 2. Polynomial features of time to model non-linear trend 66 | 3. Changepoints & piecewise linear trends to model non-linear trend 67 | 4. Forecasting time series with trend using tree-based models 68 | 5. Creating trend features with open-source 69 | 70 | 9. **Seasonality features** 71 | 1. Seasonal lags 72 | 2. Seasonal dummies 73 | 3. Seasonal decomposition methods 74 | 4. Fourier terms 75 | 5. Creating seasonality features with open-source 76 | 77 | 10. **Datetime features** 78 | 1. Extracting features from date and time 79 | 2. Periodic features 80 | 3. Calendar events 81 | 4. Creating datetime features with open-source 82 | 83 | 11. **Categorical Features** 84 | 1. One hot encoding 85 | 2. Target encoding 86 | 3.
Rolling entropy and rolling majority 87 | 88 | 89 | - [Online Course](https://www.trainindata.com/p/feature-engineering-for-forecasting) 90 | -------------------------------------------------------------------------------- /assignments/02-tabularizing-time-series/assignment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2d1a73ab", 6 | "metadata": {}, 7 | "source": [ 8 | "# Tabularize time series\n", 9 | "\n", 10 | "In this assignment, your task is to convert **time series data** into a **tabular data set**.\n", 11 | "\n", 12 | "You need to create suitable input features from a time series containing weekly sales to be able to forecast sales for the next week.\n", 13 | "\n", 14 | "To prepare the dataset for this assignment, please follow the guidelines in the notebook `02-create-online-retail-II-datasets.ipynb` in the `01-Create-Datasets` folder." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "id": "f53976d3", 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/html": [ 26 | "
\n", 27 | "\n", 40 | "\n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | "
sales
week
2009-12-06213000.35
2009-12-13195810.04
2009-12-20182396.74
2009-12-2722007.77
2010-01-030.00
\n", 74 | "
" 75 | ], 76 | "text/plain": [ 77 | " sales\n", 78 | "week \n", 79 | "2009-12-06 213000.35\n", 80 | "2009-12-13 195810.04\n", 81 | "2009-12-20 182396.74\n", 82 | "2009-12-27 22007.77\n", 83 | "2010-01-03 0.00" 84 | ] 85 | }, 86 | "execution_count": 2, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "# load weekly sales dataset\n", 93 | "\n", 94 | "filename = \"../../Datasets/online_retail_dataset.csv\"\n", 95 | "\n", 96 | "df = pd.read_csv(\n", 97 | " filename,\n", 98 | " usecols=[\"week\", \"United Kingdom\"],\n", 99 | " parse_dates=[\"week\"],\n", 100 | " index_col=[\"week\"],\n", 101 | ")\n", 102 | "\n", 103 | "df.columns = ['sales']\n", 104 | "\n", 105 | "df.head()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "cdfe9415", 111 | "metadata": {}, 112 | "source": [ 113 | "# Data analysis\n", 114 | "\n", 115 | "First, explore the time series.\n", 116 | "\n", 117 | "## Plot time series" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "id": "6ceabd79", 123 | "metadata": {}, 124 | "source": [ 125 | "## Missing data\n", 126 | "\n", 127 | "Check if there are missing values in the time series." 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "9c484bca", 133 | "metadata": {}, 134 | "source": [ 135 | "## Missing timestamps\n", 136 | "\n", 137 | "Check if there are missing timestamps in the index." 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "id": "444ca303", 143 | "metadata": {}, 144 | "source": [ 145 | "## Seasonality\n", 146 | "\n", 147 | "Does the time series show any obvious seasonal pattern?" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "id": "e81565cb", 153 | "metadata": {}, 154 | "source": [ 155 | "# Feature engineering\n", 156 | "\n", 157 | "Now, let's begin to tabularize the data." 
158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "id": "20ae8079", 163 | "metadata": {}, 164 | "source": [ 165 | "## Split data\n", 166 | "\n", 167 | "Separate the data into training and testing sets, leaving the data after the last week of September to evaluate the forecasts, that is, in the testing set." 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "id": "820803d5", 173 | "metadata": {}, 174 | "source": [ 175 | "## Naive forecast\n", 176 | "\n", 177 | "Predict sales in the next week (t) as the value of sales in the previous week (t-1)." 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "id": "4058260e", 183 | "metadata": {}, 184 | "source": [ 185 | "## Machine Learning" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "4957673a", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "fsml", 200 | "language": "python", 201 | "name": "fsml" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.10.5" 214 | }, 215 | "toc": { 216 | "base_numbering": 1, 217 | "nav_menu": {}, 218 | "number_sections": true, 219 | "sideBar": true, 220 | "skip_h1_title": false, 221 | "title_cell": "Table of Contents", 222 | "title_sidebar": "Contents", 223 | "toc_cell": false, 224 | "toc_position": {}, 225 | "toc_section_display": true, 226 | "toc_window_display": true 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 5 231 | } 232 | -------------------------------------------------------------------------------- /images/FETSF_banner.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/FETSF_banner.png -------------------------------------------------------------------------------- /images/forecasting_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/forecasting_framework.png -------------------------------------------------------------------------------- /images/lag_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/lag_features.png -------------------------------------------------------------------------------- /images/trainindata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/trainindata.png -------------------------------------------------------------------------------- /images/window_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/window_features.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openpyxl>=3.0.6 2 | xlrd>=2.0.1 3 | 4 | # Numerical computing libraries 5 | pandas>=1.4.0 6 | numpy>=1.18.0 7 | scikit-learn>=1.0.0 8 | scipy>=1.6.0 9 | statsmodels>=0.12.1 10 | 11 | # plotting libraries 12 | matplotlib>=3.3.4 13 | 
seaborn>=0.11.1 14 | 15 | # jupyter notebook 16 | jupyterlab>=3.0.6 17 | ipykernel>=5.5.5 18 | 19 | # feature engineering libraries 20 | feature-engine>=1.3.0 21 | featuretools>=1.2.0 --------------------------------------------------------------------------------