├── .gitignore ├── 01-Create-Datasets ├── 01-create-retail-datasets.ipynb ├── 02-create-online-retail-II-datasets.ipynb ├── 03-create-air-quality-dataset.ipynb ├── 04-create-air-passengers-dataset.ipynb └── 05-create-electricity-demand-dataset.ipynb ├── 02-Tabularizing-Time-Series ├── 01-data-analysis-air-pollutants.ipynb ├── 02-feature-engineering-air-pollutants.ipynb └── 03-forecasting-air-pollutants.ipynb ├── 03-Challenges-in-Time-Series-Forecasting ├── 01-Refactoring-feature-engineering.ipynb ├── 02-forecasting-one-step-ahead.ipynb ├── 03-multistep-forecasting-direct.ipynb ├── 04-multistep-forecasting-recursive.ipynb └── 05-multistep-forecasting-recursive-continued.ipynb ├── 04-Time-Series-Decomposition ├── 01-box-cox-transform.ipynb ├── 02-compute-moving-averages.ipynb ├── 03-classical-decomposition-to-compute-trend-and-seasonality.ipynb ├── 04-LOWESS-to-compute-trend.ipynb ├── 05-STL-to-compute-trend-and-seasonality.ipynb └── 06-MSTL-decomposition.ipynb ├── 05-Missing-Data ├── 01-impute-missing-data-using-forward-fill-backward-fill.ipynb ├── 02-impute-missing-data-using-linear-and-spline-interpolation.ipynb └── 03-impute-missing-data-using-STL-decomposition-and-interpolation.ipynb ├── 06-Outliers ├── 01-detect-outliers-using-rolling-statistics.ipynb ├── 02-detect-outliers-using-residuals-LOWESS.ipynb ├── 03-detect-outliers-using-residuals-STL.ipynb └── 04-modelling-outliers-with-dummy-variables.ipynb ├── 07-Lag-Features ├── 01-computing-lags.ipynb ├── 02-lag-plots.ipynb ├── 03-autocorrelation-function.ipynb ├── 04-partial-autocorrelation-function.ipynb ├── 05-cross-correlation-function.ipynb ├── 06-air-pollution-example-domain-knowledge.ipynb ├── 07-air-pollution-example-modelling.ipynb └── 08-air-pollution-example-correlation.ipynb ├── 08-Window-Features ├── 01-rolling-window-features.ipynb ├── 02-expanding-window-features.ipynb ├── 03-weighted-rolling-window-features.ipynb ├── 04-exponential-weights.ipynb └── 05-window-features-with-feature-selection.ipynb 
├── 09-Trend-Features ├── 01-time-linear-trend.ipynb ├── 02-time-non-linear-trend.ipynb ├── 03-recursive-forecasting-example.ipynb ├── 04-piecewise-linear-trend-and-changepoints.ipynb ├── 05-tree-based-models-and-trend.ipynb ├── 06-linear-trees-lightgbm.ipynb └── images │ ├── forecast_with_just_time.png │ └── recursive_forecasting │ ├── Slide1.png │ ├── Slide2.png │ ├── Slide3.png │ └── Slide4.png ├── 10-Seasonality-Features ├── 01-seasonal-lags.ipynb ├── 02-datetime-features-seasonality.ipynb ├── 03-seasonal-dummies.ipynb └── 04-fourier-features.ipynb ├── 11-Time-Features ├── 01-Extracting-date-related-features.ipynb ├── 02-Extracting-time-related-features.ipynb ├── 03-datetime-with-Feature-engine.ipynb ├── 04-periodic-features.ipynb ├── 05-highlighting-holidays-sandbox.ipynb └── 05-highlighting-holidays.ipynb ├── 12-Categorical-Encoding ├── 1-one-hot-encoding.ipynb ├── 2-ordinal-encoding.ipynb ├── 3-mean-encoding-simple.ipynb └── 4-mean-encoding-expanding-window.ipynb ├── Appendix └── 00-pandas-period.ipynb ├── Datasets └── .gitkeep ├── LICENSE ├── README.md ├── assignments └── 02-tabularizing-time-series │ ├── assignment.ipynb │ └── solution.ipynb ├── images ├── FETSF_banner.png ├── forecasting_framework.png ├── lag_features.png ├── trainindata.png └── window_features.png └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Jupyter Notebook 2 | .ipynb_checkpoints 3 | 4 | # datasets 5 | *.csv 6 | *.zip 7 | *.xlsx 8 | 9 | # folders 10 | 11 | -------------------------------------------------------------------------------- /01-Create-Datasets/01-create-retail-datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c81efda5", 6 | "metadata": {}, 7 | "source": [ 8 | "# Retail sales\n", 9 | "\n", 10 | "In this notebook we will prepare and store the retail sales dataset found 
[here](https://raw.githubusercontent.com/facebook/prophet/master/examples/example_retail_sales.csv).\n", 11 | "\n", 12 | "**Description of data:**\n", 13 | "\n", 14 | "The timeseries is collected between January 1992 and May 2016. It consists of a single series of monthly values representing sales volumes. " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "id": "888749e6", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "import matplotlib.pyplot as plt\n", 27 | "\n", 28 | "from statsmodels.tsa.seasonal import STL" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "25cc2a1f", 34 | "metadata": {}, 35 | "source": [ 36 | "# Get the dataset" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "73ac5d57", 42 | "metadata": {}, 43 | "source": [ 44 | "The dataset can be obtained from this [link](https://raw.githubusercontent.com/facebook/prophet/master/examples/example_retail_sales.csv). It will open a raw file in GitHub. A simple way of obtaining the data is to copy and paste the values from your browser into a text editor of your choice. \n", 45 | "Save it in the Datasets directory, which is found at the root of this project, with the filename `example_retail_sales.csv`. \n", 46 | "\n", 47 | "Alternatively, download it using Pandas by running:\n", 48 | "\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "id": "15c6a149", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "url = \"https://raw.githubusercontent.com/facebook/prophet/master/examples/example_retail_sales.csv\"\n", 59 | "df = pd.read_csv(url)\n", 60 | "df.to_csv(\"../Datasets/example_retail_sales.csv\", index=False)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "5feac9ec", 66 | "metadata": {}, 67 | "source": [ 68 | "Now follow the rest of the notebook." 
69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "id": "707768c5", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "df = pd.read_csv(\n", 79 | " \"../Datasets/example_retail_sales.csv\",\n", 80 | " parse_dates=[\"ds\"],\n", 81 | " index_col=[\"ds\"],\n", 82 | " nrows=160,\n", 83 | ")" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "three-blind", 89 | "metadata": {}, 90 | "source": [ 91 | "# Create dataset with missing data" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "id": "112f9b90", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# copy dataframe\n", 102 | "df_with_missing_data = df.copy()\n", 103 | "\n", 104 | "# Insert missing data into dataframe\n", 105 | "df_with_missing_data.iloc[10:11] = np.NaN\n", 106 | "df_with_missing_data.iloc[25:28] = np.NaN\n", 107 | "df_with_missing_data.iloc[40:45] = np.NaN\n", 108 | "df_with_missing_data.iloc[70:94] = np.NaN" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "id": "45acce8b", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Save dataset in Datasets directory\n", 119 | "df_with_missing_data.to_csv(\"../Datasets/example_retail_sales_with_missing_data.csv\")" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "80293d1b", 125 | "metadata": {}, 126 | "source": [ 127 | "# Create dataset with outliers" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 6, 133 | "id": "b78e8d57", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "df_with_outliers = df.copy()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "id": "57bf7198", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# Insert outliers into dataframe\n", 148 | "outlier_idx = [20, 33, 66, 150]\n", 149 | "df_with_outliers.iloc[outlier_idx] = df_with_outliers.iloc[outlier_idx] * 1.7" 150 
| ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 8, 155 | "id": "ce560e64", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "# Save dataset in Datasets directory\n", 160 | "df_with_outliers.to_csv(\"../Datasets/example_retail_sales_with_outliers.csv\")" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "41606a6b", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "fets", 175 | "language": "python", 176 | "name": "fets" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.8.2" 189 | }, 190 | "toc": { 191 | "base_numbering": 1, 192 | "nav_menu": {}, 193 | "number_sections": true, 194 | "sideBar": true, 195 | "skip_h1_title": false, 196 | "title_cell": "Table of Contents", 197 | "title_sidebar": "Contents", 198 | "toc_cell": false, 199 | "toc_position": {}, 200 | "toc_section_display": true, 201 | "toc_window_display": false 202 | } 203 | }, 204 | "nbformat": 4, 205 | "nbformat_minor": 5 206 | } 207 | -------------------------------------------------------------------------------- /01-Create-Datasets/02-create-online-retail-II-datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Online Retail II Data Set\n", 8 | "\n", 9 | "In this notebook we will prepare and store the Online Retail II Data Set stored on the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Online+Retail+II)\n", 10 | "\n", 11 | "\n", 12 | "**Citation:**\n", 13 | "\n", 14 | "Dua, D. and Graff, C. (2019). 
UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.\n", 15 | "\n", 16 | "## Download the data\n", 17 | "\n", 18 | "- Navigate to the [data folder](https://archive.ics.uci.edu/dataset/502/online+retail+ii).\n", 19 | "- Download the file called **online_retail_II.xlsx**.\n", 20 | "- Save the Excel file into the **datasets** folder at the root of this repository." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import pandas as pd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# Load data" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# If you downloaded and stored the file as explained\n", 46 | "# above, it should be located here:\n", 47 | "\n", 48 | "file = \"../Datasets/online_retail_II.xlsx\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# The data is provided as two sheets in a single Excel file.\n", 58 | "# Each sheet contains a different time period.\n", 59 | "# Load both and join into a single dataframe.\n", 60 | "\n", 61 | "df_1 = pd.read_excel(file, sheet_name=\"Year 2009-2010\")\n", 62 | "df_2 = pd.read_excel(file, sheet_name=\"Year 2010-2011\")\n", 63 | "\n", 64 | "df = pd.concat([df_1, df_2])" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/html": [ 75 | "
\n", 76 | "\n", 89 | "\n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | "
InvoiceStockCodeDescriptionQuantityInvoiceDatePriceCustomer IDCountry
04894348504815CM CHRISTMAS GLASS BALL 20 LIGHTS122009-12-01 07:45:006.9513085.0United Kingdom
148943479323PPINK CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom
248943479323WWHITE CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom
348943422041RECORD FRAME 7\" SINGLE SIZE482009-12-01 07:45:002.1013085.0United Kingdom
448943421232STRAWBERRY CERAMIC TRINKET BOX242009-12-01 07:45:001.2513085.0United Kingdom
\n", 161 | "
" 162 | ], 163 | "text/plain": [ 164 | " Invoice StockCode Description Quantity \\\n", 165 | "0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 \n", 166 | "1 489434 79323P PINK CHERRY LIGHTS 12 \n", 167 | "2 489434 79323W WHITE CHERRY LIGHTS 12 \n", 168 | "3 489434 22041 RECORD FRAME 7\" SINGLE SIZE 48 \n", 169 | "4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 \n", 170 | "\n", 171 | " InvoiceDate Price Customer ID Country \n", 172 | "0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom \n", 173 | "1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n", 174 | "2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n", 175 | "3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom \n", 176 | "4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom " 177 | ] 178 | }, 179 | "execution_count": 4, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "# Inspect dataframe\n", 186 | "\n", 187 | "df.head()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 5, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "# Rename columns\n", 197 | "\n", 198 | "df.columns = [\n", 199 | " \"invoice\",\n", 200 | " \"stock_code\",\n", 201 | " \"description\",\n", 202 | " \"quantity\",\n", 203 | " \"invoice_date\",\n", 204 | " \"price\",\n", 205 | " \"customer_id\",\n", 206 | " \"country\",\n", 207 | "]" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "# Process data" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "Remove null customer ids." 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 6, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "mask = ~df[\"customer_id\"].isnull()\n", 231 | "df = df[mask]" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "Create a flag for when an order is cancelled. 
Cancelled orders contain \n", 239 | "the letter `C` at the start of the invoice." 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 7, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "df[\"is_cancelled\"] = df[\"invoice\"].apply(lambda x: str(x)[0] == \"C\")" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "Remove transactions which are negative quantities sold and are not cancelled orders." 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 8, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "mask = ~(~df[\"is_cancelled\"] & df[\"quantity\"] < 0)\n", 265 | "\n", 266 | "df = df[mask]" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "Compute revenue." 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 9, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "df[\"revenue\"] = df[\"quantity\"] * df[\"price\"]" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "To compute gross revenue and quantity sold we filter out cancelled orders.\n", 290 | "\n", 291 | "After this, we resample the data at a weekly level." 
292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 10, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "mask = ~df[\"is_cancelled\"]\n", 301 | "\n", 302 | "# If running this raises an UnsupportedFunctionCall error\n", 303 | "# try upgrading your version of pandas.\n", 304 | "df_gross = (\n", 305 | " df.loc[mask, [\"invoice_date\", \"quantity\", \"revenue\", \"country\"]]\n", 306 | " .groupby(\"country\")\n", 307 | " .resample(\"W\", on=\"invoice_date\")\n", 308 | " .sum(numeric_only=True)\n", 309 | ")" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 11, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "df_gross.index.rename([\"country\", \"week\"], inplace=True)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "# Save data\n", 326 | "\n", 327 | "We will save 3 different versions of the preprocessed dataset for different demos.\n", 328 | "\n", 329 | "## Weekly sampled" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 12, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "df_gross_countries = df_gross.reset_index(level=\"country\")" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 13, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "countries = [\n", 348 | " 'United Kingdom',\n", 349 | " 'Belgium',\n", 350 | " \"EIRE\",\n", 351 | " 'Germany',\n", 352 | " \"France\",\n", 353 | " 'Spain',\n", 354 | "]" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 14, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "df_gross_countries[df_gross_countries[\"country\"].isin(countries)].to_csv(\n", 364 | " \"../Datasets/online_retail_dataset_countries.csv\",\n", 365 | " index=True,\n", 366 | ")" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "## 
Unstacked countries" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 15, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "y = df_gross.unstack(\"country\")[\"revenue\"]" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 16, 388 | "metadata": {}, 389 | "outputs": [ 390 | { 391 | "data": { 392 | "text/html": [ 393 | "
\n", 394 | "\n", 407 | "\n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " 
\n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | "
countryAustraliaAustriaBahrainBelgiumBrazilCanadaChannel IslandsCyprusCzech RepublicDenmark...SingaporeSpainSwedenSwitzerlandThailandUSAUnited Arab EmiratesUnited KingdomUnspecifiedWest Indies
week
2009-12-06196.1NaNNaN439.1NaNNaN989.18760.69NaN1008.00...NaN435.88NaNNaNNaN141.0NaN213000.35NaNNaN
2009-12-130.01429.83NaN8.5NaNNaN0.000.00NaN0.00...NaN412.60285.3NaNNaN0.0517.7195810.04NaNNaN
2009-12-2075.00.00NaN0.0NaNNaN0.002796.29NaN429.66...NaN1952.640.0589.4NaN0.00.0182396.74NaNNaN
2009-12-270.0568.51NaN0.0NaNNaN0.000.00NaN0.00...NaN5149.060.00.0NaN0.00.022007.77NaNNaN
2010-01-030.00.00NaN0.0NaNNaN0.000.00NaN0.00...NaN0.000.00.0NaN0.00.00.00NaNNaN
\n", 581 | "

5 rows × 41 columns

\n", 582 | "
" 583 | ], 584 | "text/plain": [ 585 | "country Australia Austria Bahrain Belgium Brazil Canada \\\n", 586 | "week \n", 587 | "2009-12-06 196.1 NaN NaN 439.1 NaN NaN \n", 588 | "2009-12-13 0.0 1429.83 NaN 8.5 NaN NaN \n", 589 | "2009-12-20 75.0 0.00 NaN 0.0 NaN NaN \n", 590 | "2009-12-27 0.0 568.51 NaN 0.0 NaN NaN \n", 591 | "2010-01-03 0.0 0.00 NaN 0.0 NaN NaN \n", 592 | "\n", 593 | "country Channel Islands Cyprus Czech Republic Denmark ... Singapore \\\n", 594 | "week ... \n", 595 | "2009-12-06 989.18 760.69 NaN 1008.00 ... NaN \n", 596 | "2009-12-13 0.00 0.00 NaN 0.00 ... NaN \n", 597 | "2009-12-20 0.00 2796.29 NaN 429.66 ... NaN \n", 598 | "2009-12-27 0.00 0.00 NaN 0.00 ... NaN \n", 599 | "2010-01-03 0.00 0.00 NaN 0.00 ... NaN \n", 600 | "\n", 601 | "country Spain Sweden Switzerland Thailand USA \\\n", 602 | "week \n", 603 | "2009-12-06 435.88 NaN NaN NaN 141.0 \n", 604 | "2009-12-13 412.60 285.3 NaN NaN 0.0 \n", 605 | "2009-12-20 1952.64 0.0 589.4 NaN 0.0 \n", 606 | "2009-12-27 5149.06 0.0 0.0 NaN 0.0 \n", 607 | "2010-01-03 0.00 0.0 0.0 NaN 0.0 \n", 608 | "\n", 609 | "country United Arab Emirates United Kingdom Unspecified West Indies \n", 610 | "week \n", 611 | "2009-12-06 NaN 213000.35 NaN NaN \n", 612 | "2009-12-13 517.7 195810.04 NaN NaN \n", 613 | "2009-12-20 0.0 182396.74 NaN NaN \n", 614 | "2009-12-27 0.0 22007.77 NaN NaN \n", 615 | "2010-01-03 0.0 0.00 NaN NaN \n", 616 | "\n", 617 | "[5 rows x 41 columns]" 618 | ] 619 | }, 620 | "execution_count": 16, 621 | "metadata": {}, 622 | "output_type": "execute_result" 623 | } 624 | ], 625 | "source": [ 626 | "y.head()" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 17, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "y.to_csv(\"../Datasets/online_retail_dataset.csv\")" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": {}, 641 | "source": [ 642 | "## Raw data" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": 18, 
648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "# columns needed for demo\n", 652 | "cols = [\"invoice_date\", \"description\", \"revenue\"]\n", 653 | "\n", 654 | "# just UK\n", 655 | "df = df[df[\"country\"] == \"United Kingdom\"]\n", 656 | "\n", 657 | "# save\n", 658 | "df[cols].to_csv(\"../Datasets/online_retail_dataset_all.csv\", index=False)" 659 | ] 660 | } 661 | ], 662 | "metadata": { 663 | "kernelspec": { 664 | "display_name": "Python 3 (ipykernel)", 665 | "language": "python", 666 | "name": "python3" 667 | }, 668 | "language_info": { 669 | "codemirror_mode": { 670 | "name": "ipython", 671 | "version": 3 672 | }, 673 | "file_extension": ".py", 674 | "mimetype": "text/x-python", 675 | "name": "python", 676 | "nbconvert_exporter": "python", 677 | "pygments_lexer": "ipython3", 678 | "version": "3.8.7" 679 | }, 680 | "toc": { 681 | "base_numbering": 1, 682 | "nav_menu": {}, 683 | "number_sections": true, 684 | "sideBar": true, 685 | "skip_h1_title": false, 686 | "title_cell": "Table of Contents", 687 | "title_sidebar": "Contents", 688 | "toc_cell": false, 689 | "toc_position": {}, 690 | "toc_section_display": true, 691 | "toc_window_display": false 692 | } 693 | }, 694 | "nbformat": 4, 695 | "nbformat_minor": 4 696 | } 697 | -------------------------------------------------------------------------------- /01-Create-Datasets/03-create-air-quality-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Air Quality Data Set\n", 8 | "\n", 9 | "In this notebook we will prepare and store the Air Quality Data Set from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Air+Quality)\n", 10 | "\n", 11 | "**Citation:**\n", 12 | "\n", 13 | "Dua, D. and Graff, C. (2019). [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml). 
Irvine, CA: University of California, School of Information and Computer Science.\n", 14 | "\n", 15 | "\n", 16 | "## Download and unzip the data\n", 17 | "\n", 18 | "- Navigate to the [data folder](https://archive.ics.uci.edu/dataset/360/air+quality).\n", 19 | "- Download the zip file called **AirQualityUCI.zip**.\n", 20 | "- Unzip it.\n", 21 | "- Save the csv file called **AirQualityUCI.csv** into the **datasets** folder at the root of this repository." 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import pandas as pd\n", 31 | "import matplotlib.pyplot as plt" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# If you downloaded and stored the file as explained\n", 41 | "# above, it should be located here:\n", 42 | "\n", 43 | "filename = '../Datasets/AirQualityUCI.csv'" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "(9357, 14)" 55 | ] 56 | }, 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "output_type": "execute_result" 60 | } 61 | ], 62 | "source": [ 63 | "# load the data\n", 64 | "\n", 65 | "data = pd.read_csv(\n", 66 | " filename, sep=';', parse_dates=[['Date', 'Time']]\n", 67 | ").iloc[:, :-2] # drops last 2 columns, not real variables\n", 68 | "\n", 69 | "# drop missing values\n", 70 | "# these are added at the end of the file during reading\n", 71 | "data.dropna(inplace=True)\n", 72 | "\n", 73 | "data.shape" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/html": [ 84 | "
\n", 85 | "\n", 98 | "\n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | "
Date_TimeCO(GT)PT08.S1(CO)NMHC(GT)C6H6(GT)PT08.S2(NMHC)NOx(GT)PT08.S3(NOx)NO2(GT)PT08.S4(NO2)PT08.S5(O3)TRHAH
010/03/2004 18.00.002,61360.0150.011,91046.0166.01056.0113.01692.01268.013,648,90,7578
110/03/2004 19.00.0021292.0112.09,4955.0103.01174.092.01559.0972.013,347,70,7255
210/03/2004 20.00.002,21402.088.09,0939.0131.01140.0114.01555.01074.011,954,00,7502
310/03/2004 21.00.002,21376.080.09,2948.0172.01092.0122.01584.01203.011,060,00,7867
410/03/2004 22.00.001,61272.051.06,5836.0131.01205.0116.01490.01110.011,259,60,7888
\n", 206 | "
" 207 | ], 208 | "text/plain": [ 209 | " Date_Time CO(GT) PT08.S1(CO) NMHC(GT) C6H6(GT) PT08.S2(NMHC) \\\n", 210 | "0 10/03/2004 18.00.00 2,6 1360.0 150.0 11,9 1046.0 \n", 211 | "1 10/03/2004 19.00.00 2 1292.0 112.0 9,4 955.0 \n", 212 | "2 10/03/2004 20.00.00 2,2 1402.0 88.0 9,0 939.0 \n", 213 | "3 10/03/2004 21.00.00 2,2 1376.0 80.0 9,2 948.0 \n", 214 | "4 10/03/2004 22.00.00 1,6 1272.0 51.0 6,5 836.0 \n", 215 | "\n", 216 | " NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) T RH \\\n", 217 | "0 166.0 1056.0 113.0 1692.0 1268.0 13,6 48,9 \n", 218 | "1 103.0 1174.0 92.0 1559.0 972.0 13,3 47,7 \n", 219 | "2 131.0 1140.0 114.0 1555.0 1074.0 11,9 54,0 \n", 220 | "3 172.0 1092.0 122.0 1584.0 1203.0 11,0 60,0 \n", 221 | "4 131.0 1205.0 116.0 1490.0 1110.0 11,2 59,6 \n", 222 | "\n", 223 | " AH \n", 224 | "0 0,7578 \n", 225 | "1 0,7255 \n", 226 | "2 0,7502 \n", 227 | "3 0,7867 \n", 228 | "4 0,7888 " 229 | ] 230 | }, 231 | "execution_count": 4, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "data.head()" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "### Attribute Information:\n", 245 | "\n", 246 | "Taken from the [original website](https://archive.ics.uci.edu/ml/datasets/Air+Quality).\n", 247 | "\n", 248 | "- 0 Date (DD/MM/YYYY)\n", 249 | "- 1 Time (HH.MM.SS)\n", 250 | "\n", 251 | "The above were merged during loading into the Date_Time column\n", 252 | "\n", 253 | "\n", 254 | "- 2 True hourly averaged concentration CO in mg/m^3 (reference analyzer)\n", 255 | "- 3 PT08.S1 (tin oxide) hourly averaged sensor response (nominally CO targeted)\n", 256 | "- 4 True hourly averaged overall Non Metanic HydroCarbons concentration in microg/m^3 (reference analyzer)\n", 257 | "- 5 True hourly averaged Benzene concentration in microg/m^3 (reference analyzer)\n", 258 | "- 6 PT08.S2 (titania) hourly averaged sensor response (nominally NMHC targeted)\n", 259 | "- 7 True 
hourly averaged NOx concentration in ppb (reference analyzer)\n", 260 | "- 8 PT08.S3 (tungsten oxide) hourly averaged sensor response (nominally NOx targeted)\n", 261 | "- 9 True hourly averaged NO2 concentration in microg/m^3 (reference analyzer)\n", 262 | "- 10 PT08.S4 (tungsten oxide) hourly averaged sensor response (nominally NO2 targeted)\n", 263 | "- 11 PT08.S5 (indium oxide) hourly averaged sensor response (nominally O3 targeted)\n", 264 | "- 12 Temperature in °C\n", 265 | "- 13 Relative Humidity (%)\n", 266 | "- 14 AH Absolute Humidity " 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 5, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "# I will give the variables simpler names\n", 276 | "# more details at the end of the notebook\n", 277 | "\n", 278 | "new_var_names = [\n", 279 | " 'Date_Time',\n", 280 | " 'CO_true',\n", 281 | " 'CO_sensor',\n", 282 | " 'NMHC_true',\n", 283 | " 'C6H6_true',\n", 284 | " 'NMHC_sensor',\n", 285 | " 'NOX_true',\n", 286 | " 'NOX_sensor',\n", 287 | " 'NO2_true',\n", 288 | " 'NO2_sensor',\n", 289 | " 'O3_sensor',\n", 290 | " 'T',\n", 291 | " 'RH',\n", 292 | " 'AH', \n", 293 | "]" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 6, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "Index(['Date_Time', 'CO_true', 'CO_sensor', 'NMHC_true', 'C6H6_true',\n", 305 | " 'NMHC_sensor', 'NOX_true', 'NOX_sensor', 'NO2_true', 'NO2_sensor',\n", 306 | " 'O3_sensor', 'T', 'RH', 'AH'],\n", 307 | " dtype='object')" 308 | ] 309 | }, 310 | "execution_count": 6, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "data.columns = new_var_names\n", 317 | "\n", 318 | "data.columns" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 7, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "data": { 328 | "text/plain": [ 329 | "Index(['CO_true', 'CO_sensor', 
'NMHC_true', 'C6H6_true', 'NMHC_sensor',\n", 330 | " 'NOX_true', 'NOX_sensor', 'NO2_true', 'NO2_sensor', 'O3_sensor', 'T',\n", 331 | " 'RH', 'AH'],\n", 332 | " dtype='object')" 333 | ] 334 | }, 335 | "execution_count": 7, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "# let's capture the variables\n", 342 | "\n", 343 | "predictors = data.columns[1:]\n", 344 | "\n", 345 | "predictors" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 8, 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "data": { 355 | "text/html": [ 356 | "
\n", 357 | "\n", 370 | "\n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | "
Date_TimeCO_trueCO_sensorNMHC_trueC6H6_trueNMHC_sensorNOX_trueNOX_sensorNO2_trueNO2_sensorO3_sensorTRHAH
010/03/2004 18.00.002.61360.0150.011.91046.0166.01056.0113.01692.01268.013.648.90.7578
110/03/2004 19.00.002.01292.0112.09.4955.0103.01174.092.01559.0972.013.347.70.7255
210/03/2004 20.00.002.21402.088.09.0939.0131.01140.0114.01555.01074.011.954.00.7502
310/03/2004 21.00.002.21376.080.09.2948.0172.01092.0122.01584.01203.011.060.00.7867
410/03/2004 22.00.001.61272.051.06.5836.0131.01205.0116.01490.01110.011.259.60.7888
\n", 478 | "
" 479 | ], 480 | "text/plain": [ 481 | " Date_Time CO_true CO_sensor NMHC_true C6H6_true NMHC_sensor \\\n", 482 | "0 10/03/2004 18.00.00 2.6 1360.0 150.0 11.9 1046.0 \n", 483 | "1 10/03/2004 19.00.00 2.0 1292.0 112.0 9.4 955.0 \n", 484 | "2 10/03/2004 20.00.00 2.2 1402.0 88.0 9.0 939.0 \n", 485 | "3 10/03/2004 21.00.00 2.2 1376.0 80.0 9.2 948.0 \n", 486 | "4 10/03/2004 22.00.00 1.6 1272.0 51.0 6.5 836.0 \n", 487 | "\n", 488 | " NOX_true NOX_sensor NO2_true NO2_sensor O3_sensor T RH AH \n", 489 | "0 166.0 1056.0 113.0 1692.0 1268.0 13.6 48.9 0.7578 \n", 490 | "1 103.0 1174.0 92.0 1559.0 972.0 13.3 47.7 0.7255 \n", 491 | "2 131.0 1140.0 114.0 1555.0 1074.0 11.9 54.0 0.7502 \n", 492 | "3 172.0 1092.0 122.0 1584.0 1203.0 11.0 60.0 0.7867 \n", 493 | "4 131.0 1205.0 116.0 1490.0 1110.0 11.2 59.6 0.7888 " 494 | ] 495 | }, 496 | "execution_count": 8, 497 | "metadata": {}, 498 | "output_type": "execute_result" 499 | } 500 | ], 501 | "source": [ 502 | "# cast variables as numeric (they are strings by defo)\n", 503 | "# need to replace the , by . to cast as numeric\n", 504 | "\n", 505 | "for var in predictors:\n", 506 | " if data[var].dtype =='O':\n", 507 | " data[var] = data[var].str.replace(',', '.')\n", 508 | " data[var] = pd.to_numeric(data[var])\n", 509 | "\n", 510 | "data.head()" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 9, 516 | "metadata": {}, 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/html": [ 521 | "
\n", 522 | "\n", 535 | "\n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | "
Date_TimeCO_trueCO_sensorNMHC_trueC6H6_trueNMHC_sensorNOX_trueNOX_sensorNO2_trueNO2_sensorO3_sensorTRHAH
\n", 558 | "
" 559 | ], 560 | "text/plain": [ 561 | "Empty DataFrame\n", 562 | "Columns: [Date_Time, CO_true, CO_sensor, NMHC_true, C6H6_true, NMHC_sensor, NOX_true, NOX_sensor, NO2_true, NO2_sensor, O3_sensor, T, RH, AH]\n", 563 | "Index: []" 564 | ] 565 | }, 566 | "execution_count": 9, 567 | "metadata": {}, 568 | "output_type": "execute_result" 569 | } 570 | ], 571 | "source": [ 572 | "data[data['Date_Time'].apply(lambda x: len(x))>19]" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 10, 578 | "metadata": {}, 579 | "outputs": [ 580 | { 581 | "data": { 582 | "text/html": [ 583 | "
\n", 584 | "\n", 597 | "\n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | "
Date_TimeCO_trueCO_sensorNMHC_trueC6H6_trueNMHC_sensorNOX_trueNOX_sensorNO2_trueNO2_sensorO3_sensorTRHAH
02004-10-03 18:00:002.61360.0150.011.91046.0166.01056.0113.01692.01268.013.648.90.7578
12004-10-03 19:00:002.01292.0112.09.4955.0103.01174.092.01559.0972.013.347.70.7255
22004-10-03 20:00:002.21402.088.09.0939.0131.01140.0114.01555.01074.011.954.00.7502
32004-10-03 21:00:002.21376.080.09.2948.0172.01092.0122.01584.01203.011.060.00.7867
42004-10-03 22:00:001.61272.051.06.5836.0131.01205.0116.01490.01110.011.259.60.7888
\n", 705 | "
" 706 | ], 707 | "text/plain": [ 708 | " Date_Time CO_true CO_sensor NMHC_true C6H6_true NMHC_sensor \\\n", 709 | "0 2004-10-03 18:00:00 2.6 1360.0 150.0 11.9 1046.0 \n", 710 | "1 2004-10-03 19:00:00 2.0 1292.0 112.0 9.4 955.0 \n", 711 | "2 2004-10-03 20:00:00 2.2 1402.0 88.0 9.0 939.0 \n", 712 | "3 2004-10-03 21:00:00 2.2 1376.0 80.0 9.2 948.0 \n", 713 | "4 2004-10-03 22:00:00 1.6 1272.0 51.0 6.5 836.0 \n", 714 | "\n", 715 | " NOX_true NOX_sensor NO2_true NO2_sensor O3_sensor T RH AH \n", 716 | "0 166.0 1056.0 113.0 1692.0 1268.0 13.6 48.9 0.7578 \n", 717 | "1 103.0 1174.0 92.0 1559.0 972.0 13.3 47.7 0.7255 \n", 718 | "2 131.0 1140.0 114.0 1555.0 1074.0 11.9 54.0 0.7502 \n", 719 | "3 172.0 1092.0 122.0 1584.0 1203.0 11.0 60.0 0.7867 \n", 720 | "4 131.0 1205.0 116.0 1490.0 1110.0 11.2 59.6 0.7888 " 721 | ] 722 | }, 723 | "execution_count": 10, 724 | "metadata": {}, 725 | "output_type": "execute_result" 726 | } 727 | ], 728 | "source": [ 729 | "# cast date and time variable as datetime\n", 730 | "# replace . 
by : to transform to datetime format\n", 731 | "\n", 732 | "data['Date_Time'] = data['Date_Time'].str.replace('.', ':', regex=False)\n", 733 | "\n", 734 | "data['Date_Time'] = pd.to_datetime(data['Date_Time'])\n", 735 | "# use dayfirst=True parameter if format is dd/mm/yyyy HH:mm:ss Eg: pd.to_datetime(data['Date_Time'], dayfirst=True)\n", 736 | "\n", 737 | "data.head()" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": 11, 743 | "metadata": {}, 744 | "outputs": [], 745 | "source": [ 746 | "# sort index\n", 747 | "# we want the data in time order\n", 748 | "\n", 749 | "data.sort_index(inplace=True)" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 12, 755 | "metadata": {}, 756 | "outputs": [ 757 | { 758 | "data": { 759 | "text/plain": [ 760 | "Date_Time datetime64[ns]\n", 761 | "CO_true float64\n", 762 | "CO_sensor float64\n", 763 | "NMHC_true float64\n", 764 | "C6H6_true float64\n", 765 | "NMHC_sensor float64\n", 766 | "NOX_true float64\n", 767 | "NOX_sensor float64\n", 768 | "NO2_true float64\n", 769 | "NO2_sensor float64\n", 770 | "O3_sensor float64\n", 771 | "T float64\n", 772 | "RH float64\n", 773 | "AH float64\n", 774 | "dtype: object" 775 | ] 776 | }, 777 | "execution_count": 12, 778 | "metadata": {}, 779 | "output_type": "execute_result" 780 | } 781 | ], 782 | "source": [ 783 | "# check the format\n", 784 | "\n", 785 | "data.dtypes" 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": 13, 791 | "metadata": {}, 792 | "outputs": [ 793 | { 794 | "data": { 795 | "text/plain": [ 796 | "0" 797 | ] 798 | }, 799 | "execution_count": 13, 800 | "metadata": {}, 801 | "output_type": "execute_result" 802 | } 803 | ], 804 | "source": [ 805 | "# sanity check: duplicates in dt variable\n", 806 | "\n", 807 | "data['Date_Time'].duplicated().sum()" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 14, 813 | "metadata": {}, 814 | "outputs": [ 815 | { 816 | "data": { 817 | 
"text/plain": [ 818 | "Date_Time 0\n", 819 | "CO_true 0\n", 820 | "CO_sensor 0\n", 821 | "NMHC_true 0\n", 822 | "C6H6_true 0\n", 823 | "NMHC_sensor 0\n", 824 | "NOX_true 0\n", 825 | "NOX_sensor 0\n", 826 | "NO2_true 0\n", 827 | "NO2_sensor 0\n", 828 | "O3_sensor 0\n", 829 | "T 0\n", 830 | "RH 0\n", 831 | "AH 0\n", 832 | "dtype: int64" 833 | ] 834 | }, 835 | "execution_count": 14, 836 | "metadata": {}, 837 | "output_type": "execute_result" 838 | } 839 | ], 840 | "source": [ 841 | "# check NA\n", 842 | "\n", 843 | "data.isnull().sum()" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": 15, 849 | "metadata": {}, 850 | "outputs": [ 851 | { 852 | "data": { 853 | "text/plain": [ 854 | "min 2004-01-04 00:00:00\n", 855 | "max 2005-12-03 23:00:00\n", 856 | "Name: Date_Time, dtype: datetime64[ns]" 857 | ] 858 | }, 859 | "execution_count": 15, 860 | "metadata": {}, 861 | "output_type": "execute_result" 862 | } 863 | ], 864 | "source": [ 865 | "# check time span\n", 866 | "\n", 867 | "data['Date_Time'].agg(['min', 'max'])" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": 16, 873 | "metadata": {}, 874 | "outputs": [], 875 | "source": [ 876 | "# save preprocessed data\n", 877 | "\n", 878 | "data.to_csv('../Datasets/AirQualityUCI_ready.csv', index=False)" 879 | ] 880 | }, 881 | { 882 | "cell_type": "markdown", 883 | "metadata": {}, 884 | "source": [ 885 | "## Data set Summary\n", 886 | "\n", 887 | "The dataset was collected between January 2004 and March 2005.\n", 888 | "\n", 889 | "It consists of hourly measurements of the different air pollutants, NO2, NOX, CO, C6H6, O3 and NMHC. The measurements are accompanied by local temperature and humidity values, also recorded hourly.\n", 890 | "\n", 891 | "In the data collection experiments, scientists were testing new pollutant sensors. The values from the new sensors are stored in the variables called _sensors. 
\n", 892 | "\n", 893 | "For comparison, data for the pollutants was also gathered from fixed stations, that regularly measure the concentration of these gases. Those values are stored in the variables called _true." 894 | ] 895 | }, 896 | { 897 | "cell_type": "code", 898 | "execution_count": null, 899 | "metadata": {}, 900 | "outputs": [], 901 | "source": [] 902 | } 903 | ], 904 | "metadata": { 905 | "kernelspec": { 906 | "display_name": "Python 3 (ipykernel)", 907 | "language": "python", 908 | "name": "python3" 909 | }, 910 | "language_info": { 911 | "codemirror_mode": { 912 | "name": "ipython", 913 | "version": 3 914 | }, 915 | "file_extension": ".py", 916 | "mimetype": "text/x-python", 917 | "name": "python", 918 | "nbconvert_exporter": "python", 919 | "pygments_lexer": "ipython3", 920 | "version": "3.10.5" 921 | }, 922 | "toc": { 923 | "base_numbering": 1, 924 | "nav_menu": {}, 925 | "number_sections": true, 926 | "sideBar": true, 927 | "skip_h1_title": false, 928 | "title_cell": "Table of Contents", 929 | "title_sidebar": "Contents", 930 | "toc_cell": false, 931 | "toc_position": {}, 932 | "toc_section_display": true, 933 | "toc_window_display": true 934 | } 935 | }, 936 | "nbformat": 4, 937 | "nbformat_minor": 4 938 | } 939 | -------------------------------------------------------------------------------- /01-Create-Datasets/04-create-air-passengers-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "threatened-orbit", 6 | "metadata": {}, 7 | "source": [ 8 | "# Air Passengers Dataset\n", 9 | "\n", 10 | "In this notebook we will prepare and store the air passengers dataset found [here](https://github.com/facebook/prophet/blob/main/examples/example_air_passengers.csv).\n", 11 | "\n", 12 | "**Citation:**\n", 13 | "\n", 14 | "Box, G. E. P., Jenkins, G. M. and Reinsel, G. C. (1976) Time Series Analysis, Forecasting and Control. Third Edition. Holden-Day. 
Series G.\n", 15 | "\n", 16 | "**Description of data:**\n", 17 | "\n", 18 | "The data is a monthly time series measuring the number of international airline passengers, in thousands, from 1949 to 1960." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "aggressive-license", 24 | "metadata": {}, 25 | "source": [ 26 | "In this notebook we will:\n", 27 | "\n", 28 | "1. Provide instructions to download the air passengers data set\n", 29 | "\n", 30 | "2. Save the time series data in the correct location for use in the course\n", 31 | "\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "competitive-robertson", 37 | "metadata": {}, 38 | "source": [ 39 | "# Get the dataset" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "outdoor-architecture", 45 | "metadata": {}, 46 | "source": [ 47 | "The dataset can be obtained from this [link](https://raw.githubusercontent.com/facebook/prophet/master/examples/example_air_passengers.csv). It will open a raw file in GitHub. A simple way of obtaining the data is to copy and paste the values from your browser into a text editor of your choice. \n", 48 | "Save it in the Datasets directory, which is found at the root of this project, with the filename `example_air_passengers.csv`.\n", 49 | "\n", 50 | "Alternatively, run the code below." 
51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 1, 56 | "id": "5045cf1c", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "import pandas as pd\n", 61 | "\n", 62 | "url = \"https://raw.githubusercontent.com/facebook/prophet/main/examples/example_air_passengers.csv\"\n", 63 | "df = pd.read_csv(url)\n", 64 | "df.to_csv(\"../Datasets/example_air_passengers.csv\", index=False)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "respected-worth", 70 | "metadata": {}, 71 | "source": [ 72 | "# Data set synopsis" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "mediterranean-toilet", 78 | "metadata": {}, 79 | "source": [ 80 | "The air passengers dataset is a monthly timeseries representing the number of US air passengers collected between January 1949 and December 1960." 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "italic-serial", 86 | "metadata": {}, 87 | "source": [ 88 | "# Check that you can load the data " 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 2, 94 | "id": "established-clinic", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "import pandas as pd" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 3, 104 | "id": "developmental-roulette", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "df = pd.read_csv(\n", 109 | " \"../Datasets/example_air_passengers.csv\", parse_dates=[\"ds\"], index_col=[\"ds\"]\n", 110 | ")" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 4, 116 | "id": "quantitative-missouri", 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/html": [ 122 | "
\n", 123 | "\n", 136 | "\n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | "
y
ds
1949-01-01112
1949-02-01118
1949-03-01132
1949-04-01129
1949-05-01121
\n", 170 | "
" 171 | ], 172 | "text/plain": [ 173 | " y\n", 174 | "ds \n", 175 | "1949-01-01 112\n", 176 | "1949-02-01 118\n", 177 | "1949-03-01 132\n", 178 | "1949-04-01 129\n", 179 | "1949-05-01 121" 180 | ] 181 | }, 182 | "execution_count": 4, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "df.head()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 5, 194 | "id": "straight-mouth", 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "" 201 | ] 202 | }, 203 | "execution_count": 5, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | }, 207 | { 208 | "data": { 209 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEGCAYAAACevtWaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAABFZElEQVR4nO3dd3xcZ5no8d+rGUmjNupdsuUi23GJY8fpvZGEAEkoISwlJGGzQGBhWe6SvbCUZdkF7l3a3V12QyAJGyChZZMA6YnTiOMS917ULVldGpXp7/3jnDMayRppRnPGkuXn+/n449GZmVNcnnnnOc/7vEprjRBCiPklbbZPQAghhP0kuAshxDwkwV0IIeYhCe5CCDEPSXAXQoh5yDnbJwBQUlKi6+rqZvs0hBDitLJt27ZurXXpZM/NieBeV1fH1q1bZ/s0hBDitKKUaor1nKRlhBBiHpLgLoQQ85AEdyGEmIfmRM59MoFAgNbWVrxe72yfSkwul4uamhrS09Nn+1SEEGKcORvcW1tbycvLo66uDqXUbJ/OSbTW9PT00NrayqJFi2b7dIQQYpw5m5bxer0UFxfPycAOoJSiuLh4Tn+zEEKcueZscAfmbGC3zPXzE0KcueZ0cBdCiDNFY/cwLx04Ydv+JLgLIcQccP9rx7jn59vo8vhs2Z8EdyGEmAP6hv0Ew5r/2d5my/4kuMfw1a9+lR/84AeRn7/85S/zwx/+cPZOSAgxr/WPBAB4bGsLdqyQN2dLIaN946m97Ds+aOs+V1a5+dq7V8V8/q677uK9730vn//85wmHwzz66KNs3rzZ1nMQQghL/2iAdIfiSOcQ21v6Wb+gMKn9ycg9hrq6OoqLi9m+fTvPPfcc69ato7i4eLZPSwgxTw2M+Ln2rHKy0h38ZmtL0vuLa+SulCoAHgBWAxq4CzgIPAbUAY3AbVrrPmXUB/4QeCcwAnxca/12Mic51Qg7lT7xiU/w0EMP0dHRwV133TUr5yCEODP0jQSoKcziprMreWpnO//wrpVkZ8w8uRLvyP2HwDNa6xXAWmA/cB/wota6HnjR/BngRqDe/HUP8OMZn90su/XWW3nmmWfYsmUL119//WyfjhBinvIGQowGQhRkZ/DutVUM+YK83dSf1D6n/VhQSuUDlwMfB9Ba+wG/Uupm4ErzZQ8DG4EvATcDP9fGHYFNSqkCpVSl1ro9qTOdBRkZGVx11VUUFBTgcDhm+3SEEPPU4KhxMzU/K50KtwuAAXPbTMUzcl8EdAEPKqW2K6UeUErlAOVRAbsDK
DcfVwPRCaNWc9tpJxwOs2nTJu6+++7ZPhUhxDzWbwbygux03FnGmHvQm/rg7gTWAz/WWq8DhhlLwQBgjtITqt1RSt2jlNqqlNra1dWVyFtPiX379rF06VKuueYa6uvrZ/t0hBDzmFUGWZCVgdtldJkdTHLkHk+2vhVo1Vq/Zf78W4zgfsJKtyilKoFO8/k2oDbq/TXmtnG01vcD9wNs2LAh+aJOm61cuZJjx47N9mkIIc4A/SN+wBi5Z2c4cKSp1I/ctdYdQItSarm56RpgH/AkcIe57Q7gCfPxk8DHlOFCYGCm+XY7CvlTaa6fnxDi9NAflXNXSuF2ORkcDSa1z3jrbD4L/EIplQEcA+7E+GD4tVLqbqAJuM187Z8wyiCPYJRC3jmTE3O5XPT09MzZtr9WP3eXyzXbpyKEOM0NjIzl3AHcWelJj9zjCu5a6x3AhkmeumaS12rg3qTOCqipqaG1tZW5mI+3WCsxCSFEMvpH/TjSFLmZRkh2u9JPSc59VqSnp8sKR0KIM0L/SIACMyUDRnpm0JtcWkbaDwghxCzrHw2Qnz22FrM7y5n0yF2CuxBCzLIBc+RucbuSz7lLcBdCiFnWP+qnIDsj8rM7Kz3pahkJ7kIIMcv6hieO3J2MBkL4g+EZ71OCuxBCzLKBk3LuxmNPEqkZCe5CCDGLAqEwQ74ghdFpGasFQRIVMxLchRBiFg2Mjp/ABIw1D0uiYkaCuxBCzCKraVj+hGoZSK4zpAR3IYSYwog/yBM72lLWS2pg1GoaNr5aBkiqYkaCuxBCTOG7zxzkc4/u4HDnUEr2P9buV0buQghxShzrGuKRTU0A9A37U3KM/hHJuQshxCn17acPEAwb6Zhkl72LJbIKU9ZYWiYr3YEzyZ7uEtyFEGISm4718Ny+E9y2wej8mmwjr1gGRvwoBXmusT6OSqmkZ6lKcBdCiEn895tNlORm8DfXLQOSX/Yulv7RAPlZ6aSljV+3wu1yyshdCCHsdmLQS31ZHmV5xoI8KUvLTGgaZjFG7hLchRDCVr3DfopyMnCkKfIykxtFT8Vo95tx0najM6SkZYQQwla9I0ZwB3u6NMbSP+KPMXJPrqe7BHchhJggGArTPxKIBPc8lzO1aZnsSYJ7kj3dJbgLIcQEfWbtuRXc821YsDr2sWKN3KVaRgghbNU3YkxYGp+WsT+4ewMhPN4gZW7XSc8l29NdgrsQQkzQMzQhuLtSE9w7B30AlOZlnvRcsj3dJbgLIcQEvcPjg7uRlrH/hmqnxwtA2WTB3ewvM9NcvwR3IYSYoNdMyxRH0jJOhnxBgqGZL3s3mU7PVCN3s7/MDD9UJLgLIcQEvUPj2/Bao2iPzaP3zkFr5D5Zzt1q+ysjdyGEsEXvsI88l5MMpxEirYU07K6Y6Rry4UhTkW8I0dxJHlOCuxBCTNA7EhgXcK1Aa3ete+egj5LcjJP6ykD0yF3SMkIIYYveYR+FUcE934aVkSbT6fFNmpKB6Jy7jNyFEMIWPUP+CSP35AJtLEZwP/lmKkT1dJecuxBC2KMvqq8MJF+WGEuXx0uZe/LgbvV0T2kppFKqUSm1Wym1Qym11dxWpJR6Xil12Py90NyulFI/UkodUUrtUkqtn9GZCSHEJN482sPHH9xMwOayRIvWmt5hf4y0jH3BPRgK0zPspzRGWgagwu2ifcA7o/0nMnK/Smt9jtZ6g/nzfcCLWut64EXzZ4AbgXrz1z3Aj2d0ZkIIMYn/+9xBNh7sonvIl5L9e3xBAiE9Li2TneHAkeSydxN1D/nRevIJTJbaoiyae0dmtP9k0jI3Aw+bjx8Gbona/nNt2AQUKKUqkziOEEIAsKOln21NfYD9NeeWvsjs1LGgq5TCbXNnyKlmp1oWFGXT0juC1jrh/ccb3DXwnFJqm1LqHnNbuda63XzcAZSbj6uBlqj3tprbxlFK3aOU2qqU2trV1ZXwiQshzjwPvtEQeZyqZe96IsF9fKfGfJt7ult9ZSZrG
mapLcrGFwzT5Un8W0q8wf1SrfV6jJTLvUqpy6Of1MbHSkIfLVrr+7XWG7TWG0pLSxN5qxDiDNQx4OWPu9pZv6AASN3IvXfo5JE7kNTNzclM1XrAUluYDUBLX+KpmbiCu9a6zfy9E3gcOB84YaVbzN87zZe3AbVRb68xtwkhxIz996ZGwlrz2WvqAfvLEi0T+8pYkl08YyIrLVOaO1XO3QzuvaMJ73/a4K6UylFK5VmPgXcAe4AngTvMl90BPGE+fhL4mFk1cyEwEJW+EUKIGdl4sIuLlhSzqsoNpC4tY3WELJwQ3PNt7une6fFRmJ0eaXEwmZrCLIAZ3VR1xvGacuBxpZT1+l9qrZ9RSm0Bfq2UuhtoAm4zX/8n4J3AEWAEuDPhsxJCiAk6BrysrS0Ym5afqrTMsJ8MZxo5GY5x291ZTgZszLl3TTE71eJKd1DuzqQlFcFda30MWDvJ9h7gmkm2a+DehM9ECCFi8AZC9Az7qcp3kelMI8ORlrq0zLAxO9Uc0Ea4bV5qr9PjizmBKVptYfaMRu4yQ1UIMedZlSUV+VkopchzOVN3Q3XYT2H2JF0aXen4g2G8gZAtx+ka9E55M9VSW5RNa18Kcu5CCDHbjg8Ywa0y30hjpGpNUzBKIYtzp2jBa8NxtdZ0DU2flgEjuLcPjCa8lqoEdyHEnNdhTsGvMIN7KkfufcPj+8pY7Ozp3jcSIBDSU05gstQWZhHWcLw/sdG7BHchxJxn9VepMCf82F2WGC12Wsa4RWnHTdXI7NQ4cu4LimZW6y7BXQgx53UMjOJ2OcnJNAKsOys1I3dvIMSQL0hJitMybzf1A1BdkDXta61a90RvqkpwF0Ik5Uinhz1tAyk9RvuAl8r8sUCYl5manLv1DSH6WBa70jLDviA/eOEQ6xcUcE5twbSvL3e7SHeohCcyxVPnLoQQJ/nV5mZ++noDRzqHyEp3sPcb10+6XJwdOga9kXw7pG7k3mZWpVRNMqK2q6f7/a8eo9Pj48cfOfekcsvJONIUNYXZCde6y8hdCDEj33nmAFprrj2rnNFAKDJtPxWMkftYcM9zpTMaCNne0926aTlZusRajWlgZObB/cSgl/tfPcZNayo5d2Fh3O+rKcySnLsQIvX6R/z0jwT40PkLeN96o+mrVYtuN38wTPeQb/zI3by5affova1/FKUYdyxLptNBnsuZVB/5B147RjAc5ks3rEjofbVFMnIXQpwCjT1GoFlYnBOp+LAqQOzW6fGiNSeN3MH+/jLH+0cpy8uM2e+lMn/mKyMB7GsfZFVVPguKsxN6X3meyyyfjP+bigR3IUTCGruHAVhUkk1prhF0O2fQczweYzXuY6kSq3LF7pH78YHRSfPtlor8LDoGZx7cG7qGWVSSk/D7rElVVlOzeEhwF0IkrLFnGKWgpjA7MnKfyYIS8RirYIkeuRtpGbtr3dv6pg7ulUmsaeoNhDg+4KWuOPHgbpVmJpISkuAuhEhYY/cwVflZuNIduNKNXHRnEiPaqUycnQpjlSseG4N7OKw5PuCdsva8It9F95Av4VYAYHwgAiwqncnI3fgA7RmSkbsQIoUae0aoKxnLG5flZaYsLdM+4CUnw0Fe5ljltlW5Yueydz3DfvzB8JTBvTLfhdYzu78QSWXNYORuLRzSMywjdyFECjX2DLMwKkiV5blSl3MfHKUi3zWuJjxyQ9XGkbtVBjl1zt349tAxg9RMQ7dxEzr6QzFeMnIXQqScVQYZPQItc2emrFpm4uxUgLxMJ0rZu2DHWHCP3anROo+Z5N0buocoyc2MfDAlwu1ykuFIo1uCuxAiVcbKICekZQZ9GGv12Ku933tS3XlamiI3w2lrKWTbFBOYLMmM3Bu7R1g0g1E7gFKK4twMuaEqhEidJuvGYMn4tIwvGLZ96btgKEynZ/zsVIs7K93WUsi2/lGyMxyRHjKTcbucZGc4ZjZy75lZGaSlODeDHgnuQohUaeg2yiCtboVAysohu4f8h
PXkM0bzXE7bc+7VBVlT9ntRSlGR76JjMLEmXh5vgC6Pj7pkgntOJj1Rde6j/qlXhJLgLoRISHQZpMVaLs7uvLuVhijJPbnvuduVbmsp5PF+75Q3Uy0zmaXaZKayZlIpYzFG7mPB/aYfvTbl6yW4CyES0tgzMi7fDkSWi7N/5G4F98n6qzttLYU83j/1BCZLhTsr4Zz7se6Z17hbSnIz6R4y7msEQ+Fp+7tLcBdCJKSxZ/ik9EKkv4zNzcOs6fbFOSeP3PNc6Xh89ozcvYEQPcN+qqeolLFUFRhln8EE+rxYNe4Li5JJy2TgC4YZ9odo7RslGJ765rUEdyFE3KwyyLoJI/e8TCeZzjTb0zJWGmLSBatd9o3c2+KocbdU5LsIhY0FruNlpLJcZGU4pn9xDGO17j4azJvaU5HgLoSIm5UKWDghd6yUMmvdbU7LDPvIcKaRm3nyukJ5Zs7djvLLqfq4T2RV7iSSdz/WffK3nUQVR/WXaeiS4C6EsJGVdil3n5y+KMtz2Z6W6RnyU5yTMWkFizvLSVjD8DRVI/E40jkEEFcr3gq38QEQK+8+6A1EvgkAaK1psCG4l5oj9+4hP409w5HmabFIcBdCxM26wWlVx0Qz+svYnZbxTZqSgbEWBHZUzLx5tIcFRdmTrp060XQj908/8ja3/eebkW8UTT0jDIwGWF2Vn9Q5Wn8OPUN+Grqnr5mX4C7EPPHNP+zjhX0nUnoMqxrGamQVLRXNw3qH/ZPeTIWxzpDJ5t1DYc2mYz1ctLg4rtcXZKeT6UyjY+DkWvc3j/bw+pFu2vpHIwtab2/pA2DdgoKkzrPIah425JPgLsSZwuMN8NPXG/jsr7Zz+IQnZcfpHvKR53KOq3G3lLldeLxBvIHk0yRjx/NPMXK3ltpLbuS+v32QQW+Qi5bEF9yVUpPWumut+d7zB8kxb5puaewFYHtzPzkZDpaV5yV1ntYyf8cHvLT1j07bF16CuxDzwFHzBps3GOKTj2xj2GdvGwBL15Bv0pQMRE1ksinvrrWmZ9g36QQmGFuNaSazVKPLGP98tBsg7uAORsXMxJz760e62dLYx9/dsAK3yzkuuK+tLcCRFnvma7xKcjPZ3tyH1tg3cldKOZRS25VSfzB/XqSUekspdUQp9ZhSKsPcnmn+fMR8vi6ZixFCTO+oeUPwmzevpqF7mK8+sTclx+n2+GMG2zKbZ6mO+EN4A+FIOmKimS6S7QuGuOCfX+T+V48CRiplcWnOpDeJY6nKzxp30xTg+88forogi9vPr2VDXRFbGnsZ9YfY3z6YdErGUpyTwUHzm5mdaZnPAfujfv4O8H2t9VKgD7jb3H430Gdu/775OiFECh3tGsKZpvjgebW8/9wantvbkZLjdE8xcrdmqdqVd4/UuMcI7gXZia8rCsY3i55hP//63CGOdQ2xuaGXixMYtYPRV6dj0IsvaKSghn1B3m7u54Pn1ZLpdHBeXRFHu4Z55VAXwbBmXW1hQvuPpTg3A6vyc7rqm7iCu1KqBrgJeMD8WQFXA781X/IwcIv5+GbzZ8znr1FTdeIRQiTtSOcQC4uzSXekUV+Wh8cXZMDGdriWLo8vUpI3UWUS7XAnY606FOubQmF2OtkZjmmn4U9kffj4gmHuemgLw/4QFy0uSWgfC4uz0ZrITVNrCb2lZbkAnFdnBPMHXjsGwDl2jdzNP4uinIwpu1dC/CP3HwB/B1iJqmKgX2ttfR9qBarNx9VAC4D5/ID5+nGUUvcopbYqpbZ2dXXFeRpCiMkc7RqKBJaaQqOcr7UvsaA3HW8ghMcXjDlyL8hOJ8OZRodNa6lONTsVjBubC4qyae5J7Dq7zLTR+9bXRHrTX7i4KKF9WL11mnuNoN40ocf9mpp8MpxpbG3qY0FRdswPqERZ+4mndfC0wV0p9S6gU2u9Lekzi6K1vl9rvUFrvaG0tNTOXQtxRgmEwjT1jLCk1Aju1ZHgnlhb2ulYZZCTNfGCsSoSu0fuxVMEx
gVF2TQlOHK3ruOL1y9jVZWbs2vypzzG5Mc1gqsV1BvM3jFWBUum08E5NQVA8iWQ0aw/++kqZQCmnuJkuAR4j1LqnYALcAM/BAqUUk5zdF4DtJmvbwNqgVallBPIB3oSvAYhRJyae0cIhnUkuNcUGqNHu4P7VBOYLOVu+4J79zQ5dzBGyhsPdREOa9LirEbp9PhIU8Y9gl/dcyHBUOLtC0pyM8jOcESCe1PPMKV5meREtUk4b1Ehmxt7WVdbkPD+Y7Fq/uNZ0WnakbvW+u+11jVa6zrgduAlrfWHgZeB95svuwN4wnz8pPkz5vMv6VSsvSWEAMamzi8x0zJWLrotZSP32MG9Mt9la1omJ8MxaU29ZUFxDv5gmBMJVOh0Dvoozs3EkaZwu9JjVuNMJZISMr81NHaPnNSr/crlZTjTFBcvTSyfPxXrg3VRSe60r02mzv1LwBeUUkcwcuo/Nbf/FCg2t38BuC+JYwghpnG0ywzuZq9wpRQ1hVm259ytkfRUwb3CHLnbMZ7rGfZNmy5ZaK4GlUjevWso9k3hRCwszo4sOdjYM3xSj/vz6orY8bV3JD15Kdq5Cwv5p1tWc+3KsmlfG09aJkJrvRHYaD4+Bpw/yWu8wAcS2a8QYuaOdg5T7s6M9FoBo7thqtIysW5wgjG5xx8KG20Dkgygxj6mHlVbAbWpd4QL4mwf0OnxRvrPJ2NhcQ4vH+zC4w3QGWMJvcm6WSbDkab4yIUL43qtzFAV4jR3tGsokm+31BRm2z5y7/L4yM9KJ9MZO01SYU4EsiM10z0Uu6+MpaogC0eaSmzk7vFFJlwlY0FRNv5gODITNZnFr1NBgrsQpzGtNUc7JwvuWQx6g7YuID3VBCZLhY217j1DvpiVOZZ0RxrVBVlxV8yEwpruIf+01xGPBWZK6NVDRvuCiWmZ2SbBXYjTWJfHh8cXjNS4W6yKGTtvqnZ5pg+2VsvcZEfu4bCmd9gf183OhcXZNMexMhFA34ifUFhHZtMmwwrmrxwy5unEU554KklwF+I0dsS8mbp4wsLLqah1N0buUwfFktwM0lTyI/dBb4BgWMeVt0+k1t1qambHyN1KCTV0n1wGORdIcBfiNGYF74kLL1uzVNtszLvHM3J3OtIoy0u+1r1n2KrMmX7kvqAom/6RQFztFqx1T+3IuVspIeCkMsi5QIK7EKcxK4hOrP4ozsnAlZ5m28h9xB9k2B+Ka8RbbkOt+1jTsOmPF2kFEMdN1U7zvOwYuUcfe67l20GCuxApcyrm7rUPeM1APr6CRSllazlkt2f6GndLpfvkhSwS1RNH2aUl0gqgd/q8e1ccs2wTYd1UTXZ91FSQ4C5ECngDRs/wx7e3pvQ4HQOjkQqViWoKs2nttyctEwmKcQT3inwXJxIM7gc6BgmHxz4Mu4enbz1gsRa1bopr5O4jN9NJdoY9+XFrxD7XbqaCBHchUqKld4ROj4+fv9mU0uO0D3gjrXYnqinMsq1aJp6+MpaKfBceX5ChOFeDaukd4YYfvMbP3miIbHv1UBcluRlx3VDNzXRSkpsRMy2zramPN48a7a26huypcbesqsonTcFZlfbNQrWLBHchUsBKh2xv7qexO74yvZnoGPROOXLvGwnEHWSnEk9fGUuifd2t3jgPvtFIMBTmeP8oL+4/wW0bauNems6omJn8z/nvf7+Lv350O6Gwpmtw+lr9RFyytIQ3//4aFpdO3+vlVJPgLkQKRM8OfXx72xSvnLlRf4j+kUCktnyi6kjFTGKj95beEc7/1guR+m0YuxEZTw7cWq4u3uBuLXTR1j/K8/tO8OjmZjTwofMXxH3OtUXZJy17B8Z5HzoxRJfHx+aG3inXgJ2pRJbnO5UkuAuRAq19o2Q407h4STH/s6MtJTdXrYqUihjBZaaLduxpG6DT4+Nzj26nrX+U/e2D/OyNRtbWFpDumD5kREbucVbMNPWMkJPhoLYoi5+8doxHt7Rw5bJSaovir0CpKcyivd87b
uFrgDfMxa/TFDy16zidg15bJjCdDiS4C5ECrX2j1BRk8d71NTT1jPB2c7/tx2gfMEaqU+XcgUlHtFPv1wjK3kCIT/73Nu58cAu5mU7+8yPr43r/2Mg9vuM29QyzsDiHj1+8iLeb++n0+OJujmWpKcwmGNacmLB+6+uHeyjMTufGNZX8YefxuMs55wMJ7kKkQGvfCNWFWVy/qhxXelpKqmastEesnHtpbiaZzsRr3dsHRsl0pvG9285hd9sAw74gD955Xsz0z0SudAeF2ekJjdzrSrK5bUMNuZlOqguyuHL59C1to1kfZC1RM1W11rx+pIuLl5bwnrVVDHqNew923lCdy+bWfFkh5onWvlHeUZVPniudy+pLeeOI/YuRWSPsWEF3rNY9sbSMVYHzzjWV/PD2c1hUksNZle6E9lEVZ419MBSmpW+E61dXkOdK5/99aB1ZGY64b6RaJlt96mjXECcGfVy6tIQrlpWSl+mccg3Y+UZG7kLYbMQfpGfYHxlNrqx009gzzKg/ZOtxOga8FGSnk5URuwVv9QzKIY3gbpz7zedUc7a5Fmgi6kpy4qoSah/wEgjpyKIbV60o48I4+7JHqyowvr1Ef5C9ftjIt1+6tARXuoPrVpYDJ8/mna8kuAthM2v0aAX3FRV5aA2HOz22Hqd9wBvzZqrF6OueWHDvmKJ2Pl6LinNo6RslMOEGZ0P3MHc/tIX7Xz0KjE08WpjkJKBMp4Nyd+a4a339SA8Li7MjN2bvunQRVy4vnZMTjlJBgrsQNrNGj1ZQWV5hTHA52GFvcO8YHJ02CNcUZtEz7GfEH1+teyis6Rj0UlmQZHAvySEU1uNy4A+8dowbfvAqLx7o5ME3GtFaR8og6+JY8Hk60QuUBENhNh3r4ZKo9UtXV+fz0J3nT7km63wiwV0Im00cuS8szsGVnmZ/cB/wUjHNTc6aBGvdu4d8hMJ62v1Ox+q10mCmZvYeH+Cf/rifi5cU84XrltE+4OVw5xBNPcNkONMot6E80Vg31rjO/e0ehnzBGaV45gsJ7kLYrLXPqDax+rA40hT1ZXkcPGFfcPcFQ3QP+eMauQO0xlkOedx8XVWyaZkJwX1nywAA33jPat5/bg1gtBho6hlhYVE2aQneQJ1MTWEW7QNGrfu2JmPpuw0LC5Pe7+lKqmWEsJlVBqnUWMBaXpE3bsZnsqxFJ2KVQVomqyKZynQVOPEqzE4nPys9knbZ3TaA2+Wktsj4c6kvy+WVQ110DvqSzrdbagqzI2mlbc39VOa7qCpI7jpOZzJyF8JmrX2jkaBqWV6eR5fHR6/Z7TBZY0F46uBemptJhiMt7nLIePc7HaUUdSU549Iyq6vzIx94ly8r5a2GXhp7hqmzqRd6TdTqU9saezn3DB61gwR3IWxnBPfxI0a7b6pONzvVkpamqCpwxT9y7x/FlZ5GQXZ60ue4uCSHxu4R/MEwB9o9rKnOjzx3+bJS/MEwvmDYtoUurA/ULQ29HB/wSnCf7RMQYj4Z9gXpjapxt6yIBPdBW44zNjt1+rRDTWF23DdU2weNGvfolNJM1RXncHxglN1tA/hDYVZFBfcLFhWR6TTCj11pmaoCF0rBEzuPA0hwn+0TEGI+sfq4TEzLlOZlUpidPuObqhPrxdv6R8nLdJIbx6LM0VUk02nvn768Ml51JdloDX/c1Q4wbuTuSndwgVnJYlfdeabTQXmeiyOdQ7jS0xKeVTvfSHAXwkZWbnviyF0pxfKKPA7MIC2zuaGX5V95mm88tZcRf5D/3tTEL99q5pwFBXG9v7ogi+4hH97A5DNkj3R6GPQai0sb5ZX2BPfFJUaP8z/uPk5epjMyC9XyvvXVLC/Pi8wutYP15762Jr4OlvPZmX31QtispXd8jXu0FRVuDnV4xi0nF4+tTb2EtbGYxcXffol/+J89XL6slH//cHxdGmuKxm40TnSgY5B3/vB1vvDYDkJmV8WqJCtlLNbEpBODPlZWuU8qd7z5nGqe/
ZvLcdoYhK0/9w11Z3ZKBiS4C2Grpp4RsjMck641urwij2F/KOEWvEc7hyl3Z/LYPReyoCibz1y1lJ98bANuV3w3Pa0U0cTj+oNh/vbXO/GHwrywv5PNDb2Ewjrp2amWPFd6ZOWm6JRMKlnXeqbn20GCuxC2au4dZkFR9qQ3JOvLjDSFtaxcvI51D7G4JJcLFhfz5Gcu5YvXL0+oa2KsRTv+7aXD7D0+yLduXU2GI41vP70fSL4MMtoic/S+pubUBPf1Cwsoyc3k3AVFp+R4c9m0wV0p5VJKbVZK7VRK7VVKfcPcvkgp9ZZS6ohS6jGlVIa5PdP8+Yj5fF2Kr0GIOaOpZ4QFMVYQWmoG90QaiGmtOdY1zOLSmd90LMtzke5QkZQRwLGuIf5941Heu76aD1+wkHevrWJnqzGLNNkJTNGsmaqrT9HI/eoV5Wz9yrXk21DKebqLZ+TuA67WWq8FzgFuUEpdCHwH+L7WeinQB9xtvv5uoM/c/n3zdULMuo0HO2PeVLRDOKxp7h2JWbddkJ1BaV4mh0/EP3LvHfYzMBpgSRILMDvSFAuLc8Z9Y7BSMJ+9uh6AOy+pizxn58j9imVlnLuwkEVnSCfGuWTa4K4N1r+KdPOXBq4Gfmtufxi4xXx8s/kz5vPXKDuKZoVIwsEODx9/cAvfeGpvyo7RNeTDFwyzYIpAVl+Wy+EE0jJHu4wZnsmM3MHI9x88MVZjf6DDQ1a6I1LBsro6nwsWFZGT4SA/y75R701nV/K7T11sS+8YkZi4cu5KKYdSagfQCTwPHAX6tdZWH9FWoNp8XA20AJjPDwAntWZTSt2jlNqqlNra1WVfzw0hJnPAnDz0q80tvHywMyXHsHqTx0rLgBHcj3YOxb1g9rEu44MgmZE7wIryPFp6RxnyGf9lD53wsKw8d1zQ/e77z+bfPrzelglMYvbFFdy11iGt9TlADXA+sCLZA2ut79dab9BabygtLU12d0JM6WjnEGnKCK5f+u0u+kfs6fESrclskjWxnjva0vI8PL4gJwZ9MV8T7Vi30RI32QZYVvuDw+YkqoMdnsg2y8LiHK5KcO1SMXclVC2jte4HXgYuAgqUUtb0uBqgzXzcBtQCmM/nA/YvIClEAg53DrGwOIfvf/Aceof9/Otzh2w/RnPvCI40RfUkNe6WpaWJ3VQ92jnE4pKchNcUnWhFhTFb82CHhy6Pj55hP8srzuwZnPNdPNUypUqpAvNxFnAdsB8jyL/ffNkdwBPm4yfNnzGff0nH+x1UiBQ50jnE0rJcVlfnc8WyUjY39Np+jKaeEaoKXFPOjKwvN4N7nDdVj3UnVyljqSnMIjvDwYEOT6R52YoJI3cxv8Qzcq8EXlZK7QK2AM9rrf8AfAn4glLqCEZO/afm638KFJvbvwDcZ/9pCxG/QChMY89wpBTxrEo3R7uG8AXtrZxp6h1hYdHUgbg4J4PC7PS4bqr6g2Gae0ci0/iTkZamqC/P42CHJ3L/YWJaRswv03Yd0lrvAtZNsv0YRv594nYv8AFbzk4IGzT1jBAI6UhKZEVlHsGw5kjnEKuq7Ku/bu4Z5sY1lVO+xlioIo8jMdIy4bDmN9tauHBxMYGQJhTWtozcwbip+vz+E9QUZlGSmxGZPSrmJ5mhKuY9q77bSolY+ecD7fYtezfoDdA3EpjyZqplaXkuh06cXDETDmv+/ve7+dLvdvMXP3mLN48Zt6qSrZSxLK/Io3fYz5+P9sio/QwgwV3Me0cnlBPWFWeT6UyLpCfs0GyWQcaz8ER9WS4DowG6h8YqdkJhzRd/u5PHtrbwwQ219I34+fqTRk2+bSN3M6C39Y+yvFxups53EtzFvHf4hIeqfBc5Zu9zpyONZeUza78bi1XjXhvHyL2+zCxLjErNPLOng9+/3cbnr63nO+8/O9LxsTQvk7w4G4RNJ3q0vrzCnm8DYu6S4C7mvSNdQywtH5+GOKsyj/3tNo7ce62R+/SjbOvG7tGom6oHOgZJU
/CpK5cAcNXyMv79L9bzv65fbts5FudmRvLsUgY5/0lwF7NqxB/kg//1Jj9/szEl+w+bN06XTshbr6hw0z3kp8sT32SiqWitOdo1RHFORlwrI5W7M8nLdI6rmDnWPUxtUTaZTkdk2w2rK7htQ23S5xdteUUuSsGychm5z3fT/0sUIkW01nz58T281dBLhjONj11UZ/sx2vpH8QbCkdGyZUWlMZI/0DFIaV7iM6SbeoZ5ek8HL+4/wf52D0O+IOfXxddmVinF0vLccbXuDV3DkQ6KqXTj6krys9LJzpD/+vOd/A2LWfPLzc08vr0Nt8tpa/472hHzZupJwT2qYuay+sSC+8sHOrnzoS0ArK5287711Swtz+PKZfHvp74sl5cOGD2VtNY0dA9zweLU9yD/yIUL+ciFC1N+HDH7JLiLWdHYPcw3ntzHlctLuWhxMf/y9AF6h/0U5WTYepwjJyYP7kU5GZS7M9k/g4qZlw92kpPh4JnPXx7XDdTJ1Jfl8eutrfQN+/EGQ4wGQiy2qeRRCJCcu5glf9rTjj8U5p9vXRNZpd7O0kTL3uMDVLhdk35orKhws38Gte4H2j2sqHTPOLCDUesOxjeLBqut7ylIy4gzhwR3MStePtDJqio3VQVZkRK9gylIzexqHYi5xNuKSmOmqD8Yjnt/Wmv2dwxyVmVyk4CsJfcOnxjiWLcR3E9Fzl2cOSS4i1Ouf8TPtqY+rl5htJcty8ukIDvd9uA+6A1wrHuYtTGC++qqfAIhzaET8R+3rX8UjzcYydnPVFW+0cjrcKeHY13DuNLTqHDbtwKSEBLcxSn36uFuwhquMoO7Uorl5XkcTCDIxmOPuSbompqCSZ9fa27fZb4uHlbLgmRH7mlpiiWluRzpHKKhe4hFJbmyWpGwlQR3ccq9fKCTopyMSHAFY2r8oQ4P4bB93aF3tRlB++wYizPXFmWRn5XO7rb+uPdp3RdYVp58b5b6Miu4D0u+XdhOgrs4pUJhzcaDnVyxrHTcAhTLK9wM+0O09Y/adqzdrQPUFmVRGKMCRynF2TX57GyJf+S+v8NDbVGWLS0Blpbn0j7gpbl3RPLtwnYS3MUptbO1n76RQCQlY7FuqtpZ776ztZ+zqwumfM3ZNfkcOuHBG4ivt/uB9kHOsmnqvtVjJqztaw4mhEWCuzjJH3e10z5g3wg62gv7TpCm4IoJE4fGKmbsKYfsHfbT2jfK2TFuplrWVBcQDGv2xegzo7XmD7uOG/XogRAN3cOsqLQruI/VtcvIXdhNgrsYZ1tTH/f+8m0eeK3B9n37giF+vbWVK5aVkp89Pq2Rm+mkpjDLtpH7rtZ+gJhlkJa1tcbzu2PcVH1xfyef+eV2PvfYDg52eAhrOMumXui1RdlkOI3/gnastiRENJmhKsb53vMHAdh3fOYjaG8gRPuAl5rCrHHriT61s53uIR93X7p40vetqMizrRzSCtZrYtxMtVS4XZTkZk5aMaO15j82HiHDmcarh7rw+o3UjV0jd4dZMdM56D3pw06IZElwFxGbjvXwxpEe3C4n+9oH0VqjVOLlef/yp/08/GYTzjTFsvI8vvv+s1lV5eZnrzewrDyXS5YWT/q+lZVuXjrQyZAvGFd3RcvGg538Zmsr//cDa8nKMLoq7mwdYHFpzrQ3PpVSrK3Jj4z0o73V0Mvbzf184z2reH7fCV4/0k1WuoMFScxMneg9a6ts6UwpxESSlhGAMUr93nOHKMvL5LNX1zMwGuD4gHdG+9pzfJAlpTncc/li+kf8/MVPNvHAaw3sax/krksWxfzA2FBXRFjD2019CR3v2b0d/HF3O199Yg8Arxzq4uWDnVy0ePIPkYnW1ORzpGuIYV9w3Pb/2HiUktwMPnheLd99/9nkZTpZUZk3rsonWZ+6cglfffdK2/YnhEWCuwDgjSM9bG7s5d6rlrJ+YQEA+2eYmjnWNcT5i4r5uxtW8OtPXkR+djrf+tN+inIyuGVddcz3rVtQQJqCr
Y29CR2vuXeENAW/2dbKvzy9n08/so1l5Xncd+OKuN5/dk0+WsOetrHUzJ62AV491MWdlyzCle6gqiCLX/7lhfzLe9ckdG5CzBYJ7gKtNf/6/EGq8l3cfn5tZJWeWBUkU+kd9tM3EmCJWdpXU5jNr//qIs6pLeCvr16KK90R8715rnRWVrnZ0pjYyL2ld5Qb11Ry6dIS/uuVY7iz0nnw4+fFXYtuTaba3tIf2fbUruOkOxQfvWisPe6amvyk2w4IcapIcBdsPNjF9uZ+PnN1PZlOB7mZTuqKs2e0DN2xCYtRA1TmZ/E/917Cxy9ZNO37NywsYntLX9zNvIKhMMf7R1lYlM0Pbz+H28+r5eG7zqciP/4+LcW5mSwuyRn3jWFLQy9n1xTgtmn9UiFONQnuZzitNd97/hC1RVl8YENNZPtZle4ZjdyPWe1rZzgp5/xFRXgDYfYej2/WaPuAl2BYs6Aom+LcTL79vrNn1BrgvLoitjT2EQ5rvIEQu9sG2FBXmPB+hJgrJLif4Z7bd4LdbQP89dX148oWV1a6aeoZYWjCTcbpHO0eIsORRk3hzCpKrIC6Jc68e0ufsTB1Mr3VreMOjAY40jXEzpZ+AiEd97J5QsxFEtzPcI9ubqamMItbJ9zoXFllLUOX2Oj9aOcwC4uzZ1xRUpbnoq44O+68e0uvEdyTLU88zwzkWxp7Ix8s5y6Ukbs4fUlwP8Ptb/dwXl0RTsf4fwrW6kiJ5t2PdQ+Ny7fPxIa6IrY29sbVIbKldxRHmqIygRz7ZBYWZ1Oal8mWhl62NPaxvDyPgmx7l/wT4lSS4H4GGxgJ0DHojfR1iVaZ76IgOz2hvHsgFKa5ZyTpJljn1RXSNxLgqHlzdiotfSNU5rtO+nBKlFKK8+oK2dzQy9tNfZJvF6c9Ce5nMGtxjMmCu1KKsyrc7GmLP7g3944QDOukF3q20iE7okoTpzqmXTNGz6sr4viAF48vyPmLJN8uTm/TBnelVK1S6mWl1D6l1F6l1OfM7UVKqeeVUofN3wvN7Uop9SOl1BGl1C6l1PpUX4SYGasD44oYjbAuXlLM7rYBjsfZYz3ZShnLopJcsjMc7I1jElVL7yi1M7x5O9F5UTdQN8jNVHGai2fkHgT+Vmu9ErgQuFcptRK4D3hRa10PvGj+DHAjUG/+ugf4se1nLWxxoMNDnssZc+3Od62tAuBPu9vj2l+kxj3JDoeONMWqKje728aXQ7YPjPL1J/dy3rdeYEdLPyP+IN1DPhYU2xPcV1TkkZPhoCrfRXVBli37FGK2TBvctdbtWuu3zcceYD9QDdwMPGy+7GHgFvPxzcDPtWETUKCUqrT7xM8EfcN+uod89I/4U7L/gx0eVlTkxez1sqgkh1VVbp7aFV9wP9o1REluhi0dDldV5bPv+CAh86bq07vbueK7G3lkUxMDowEe/nMjrX3GN4qaQnsCsdORxkcvquPDFy6c/sVCzHEJ5dyVUnXAOuAtoFxrbf2v7wDKzcfVQEvU21rNbRP3dY9SaqtSamtXV1ei5z3v/XprC+u++Twb/ukFzvnH5/ndtlZb96+15uAJz6T59mjvXlvFzpZ+mntGpt3nsa5h2/qSr6nOZzQQinwbeOjPjVQVuHj5i1dy24Ya/rS7PdKWONka92j33biCe69aatv+hJgtcQd3pVQu8Dvg81rrcclQrbUGElrZWGt9v9Z6g9Z6Q2lp6fRvOMP8cVc71QVZfPPmVSwuzeFnbzRg/DHb4/iAF483GOkjE8tNa4wvXX/YfXzcdm8gxJM7j/OpR7ax7h+fY83XnmVbc59ty8VZi2zsbhtgYDTA1qY+3rmmktqibG7bUIsvGObHG48Cyde4CzEfxdU0WymVjhHYf6G1/r25+YRSqlJr3W6mXTrN7W1AbdTba8xtIk4j/iBvHuvhIxcs5KMX1aGU4iv/s4cdLf2sW5B4id7PXm9gYDTA31y3LLJtupupltqibM6pLeAPO9v59JVjI9pvP
LWXX21uoTQvk+tWlpObmU6agg+eVzvF3uK3uCQHV3oau9sGyHCmEQprrjbXXV1Tnc+KijwOdHjISndQHGMBbCHOZPFUyyjgp8B+rfX3op56ErjDfHwH8ETU9o+ZVTMXAgNR6RsRhz8f6cEfDEeC2S3rqsnJcPDIpuaE99Xl8fGdZw7wo5cOj1vlyFrOLp4+LO9eW8W+9sFI3XkorHlmTwc3rankrb+/hu++fy1fffdKvvKuldTPoK/LZJyONFZWutnbNshLBzopyE6PfLAppbhtg/EhsqAoe0YLiggx38WTlrkE+ChwtVJqh/nrncC3geuUUoeBa82fAf4EHAOOAD8BPm3/ac9vLx/sJCfDEam1zs10csu6av6w63jCN1cffKMBfyhMVrqD7z9/KLL9YIeHqnwX+VnT3/y8cXUFAM/s6QCMXud9IwGuW1lOmo0LV0y0pjqfvccHeOVgF5fXl45raXDLumrSHYraIqlqEWIy8VTLvK61Vlrrs7XW55i//qS17tFaX6O1rtdaX6u17jVfr7XW92qtl2it12itt6b+MuYPrTUvH+jk0vqSyOLJAB++YCG+YJjfJnBjddAb4L/fbOLG1RX85WWLeWZvR2RBioMdHpbFudBzVUEWa2sLeHavEdxfPWTcAL+0viTuc5mJVdX5DPtD9Az7I99iLEU5GXz3/WfzySuWpPQchDhdyQzVOebgCQ/HB7wnBbOVVW7OqS3g8e3x3774xaZmPL4gn7piKXdftgi3y8nXn9zLF3+zk8OdQwktPHHj6gp2tQ7Q2jfCK4e6WFOdT0luZtzvnwlrcWul4IplJ990v3VdjUw2EiIGCe6zQGsdsynWSweM+9JXLi876bnrVpaz9/hgXAsqe7wBfvp6A5fVl7CmJh+3K52/umIJW5v6eHZvBzevreIvL5t+8QzLDauM1MxvtrayvaWfy5eldtQOUF+WS6YzjXW1BRTKTVMhEhL/EvPCNnc+tIXC7Ay+/8Fzxm3v9Hh5bEsLq6rclE8ya/Ty+lL+z7MHee1wF+9dX3PS89H+6Q/76R328YXrzo1s++QVS7hkaQkrK93jUj7xqCvJYUVFHv/5ylFCYc0Vy07+8LGb05HGV961kiUl9pRXCnEmkZH7KeYNhHjjSDdP7GijfWCsZ0vHgJfb/2sTXR4fX33Xyknfu6rKTXFORiTnHctLB07w2NYW/uqKJeNKJx1pinNqCxIO7JYbV1fiC4bJzXSybkHBjPaRqI9euJCLl6b+W4IQ840E9wQEQmECoXBkSvxM7D0+SCCkCWt4dLMxkXdgJMDt979Jp8fHw3edzwWLiyd9b1qa4rL6El493B0zrdMz5ONLv9vNioo8Pn9t/YzPczI3mFUzlywtHrdqkxBi7pH/oXF68I0G6r/8NPVffprlX3maN450z2g/25uNFYbWVOfz6JZmAqEwX31yD619ozx053njOhNO5orlpfQO+yftmHj4hIf3/fjPDIwE+Nfb1pLpdMzoHGNZVp7Lp65cwl9ettjW/Qoh7CfBPU6Pb29jcUkOX3zHMsryMvnOMwdm1A5ge0s/1QVZ/PU19ZwY9PGl3+3iiR3H+ezV9XFVflxWb1SNvHKoc9z21w53cet//JkhX4hf3XMBq6ryEz636Sil+NINK6RCRYjTgAT3OHR6vOxqHeDWddV85up6Pn/tMna1DvDC/s7p3zzBjuZ+1i0o4OoVZVTlu/j9222cXZPPp6+Kr167JDeTVVVuXj00/pvD157cS7k7kyc/cwnnLpTgK8SZToJ7HDYeNG5gXmXWnr93fTV1xdl87/lDca3zaTkx6KWtf5R1CwpxpCnuvmwxuZlO/vUDaxPKYV+xrJRtzX2R2apHu4Y41jXMxy6qo0r6kAshkOAel40HOyl3GyNmMEr0PndtPfvbB3nGnLUZj+3N/QCRSpO7L13Eli9fm3A/lneuqSQU1jy10+jU+MK+EwBcu7J8qrcJIc4gEtynEQiFee1QN1ctL
xvXoOo9a6tZUJTNo1tapnj3eNtb+shwpEU+JACyMhK/6bm6Op+VlW5+vdVoRfD8vhOsrHTL6kFCiAgJ7tPY0tiLxxeMpGQsjjTF5ctK2NbYSzAUjmtf25v7WVnltqWK5bYNNexuG+D1w91sa+7jOhm1CyGiSHCfxssHOkl3KC6ZZCLNBYuKGfaH2BPHQs6BUJhdrf22Tf65+ZxqMhxp/N1vd6I1EtyFEOOctu0H3jjSze/fNppo5WQ6+LsbVpCbaf/lbDzYxQWLiifd9wWLjaqUt471cE5tQcx9DHoDfP7RHXgDYS5ZYs9sy8KcDK5bVc4fd7VTme8al+oRQojTcuQeDIX50u928cyedjYd6+HnbzbxwGvHbD9O56CXw51DXBajtW1ZnovFJTm81dAbcx8N3cPc+u9v8MqhLv7x5lVcc5Z9PVmsBSuuPatcFqwQQoxzWgb3p3Ydp7VvlB/evo437rua61eV89PXGhJeyGI6bx7rAeCiJZO3AwBj9L6lsXfSlgSvHOri5n97nd5hP4/cfQEfM5fMs8ulS0v43DX1fCKB7o5CiDPDaRfcw2HNjzceZVl5bqTn+d9ct4whf5CfzGD0rrXmroe28LUn9pwUoN882kOeyznlbM8LFhXj8QbZ3z4+7/7Lt5q588HNVBVk8eRnLp3yA2KmHGmKv7luGQuLpWuiEGK80y64v3igk0MnhvjUlUsiS7ytqHBz05pKHnyjkZ6h6XudR3uroZeXDnTy8JtN/K/f7BwX4P98tIcLFhWPW95tImspvOjUjNaa//PsAc6rK+J3n7qY2qLshM5JCCGSddoF9x9vPEJNYRbvPrtq3PbPX7sMbyDEg280JrS/RzY14XY5+ezVS/n99ja++JudaK1p7RuhuXeEi6cZcVcVZFFblMVbZgoHjBmjfSMB3nduDTkpuMkrhBDTmZPB/YkdbWye5CbltqY+3m7u5xOXLsI5Ybr+0rJcrlhWyu/fbo27JUCXx8ezezt4/7m1/O07lvO5a+p5fHsbz+zp4M2j0+fbLRcsKmZzVN59c4PR+XG6Do9CCJEqcy647zs+yOce3cFt//Umt/3Xm2xrGgvyP3ujgTyXkw+YVSIT3bKumuMD3imrV6L9emsLgZDmwxcuAOCzVy9lZaWbbzy1jxf2n6AoJ4PlcbQGuGJZKf0jAbY1GUF9a2MvJbkZ1BVLOkYIMTvmXHD/2RsNZKU7+N/vXEFTzzB/8ZO32N8+SFv/KM/s6eBD5y+Imep4x8oKcjIcPL69ddrjhMKaX77VzMVLillSmgsYPWP+6dbVnPB4eXbvCS5cXBTJ60/lqhVlZDjTeHpPOwCbG3s5r65IyhOFELNmTgX3Lo+PJ3cc5wMbarjn8iU89dlLyc9K59O/eJv/ePkIWms+dtHCmO/PynBww+pKnt7dgTcQmvJYf9rdTlv/KB++YPz+1i8o5PbzjJH8RXFOOMrNdHJ5fQnP7umgfWCU1r5R6XkuhJhVcyq4/+KtJvyhMB+/uA4wJgn9vw+to7l3hF+81cwNqyuoKZw61XHrumo8viAvTtFr3RsI8Z1nDrCiIi+ydFy0+25cwScuXcS71lTGfe43rK7k+ICXn73eAMD5EtyFELNozgR3XzDEI5uauHpFGYvNNAnABYuLue+GFaQ7FJ+IY3m3i5YUU+7OPCk1MzAawBc0RvMP/7mR1r5RvnLTyknLHPOz0vnKu1ZSmJMR9/lfe1YZzjTFQ39uJDvDwVmVibXxFUIIO82ZOr1v/mEf3UN+7rrk5NmWf3n5Yj54fi1uV/q0+3GkKW5ZV80DrzVwYtBLudvFqD/Etd97BYA7L6njxxuPcuXyUi6N0VZgJgqyM7hoSTGvHe7mgkWFJ1XzCCHEqTQnIlBb/yiPbGrmry5fzCVLJy89jCewWz503gJCYc2jm41e649vb6PL46My38V3nznIsC/I/37nWbace7TrVxkpHimBFELMtjkxcu8d9vOtq5byt+9YZkuFS
V1JDpfVl/Crzc18+qol/OyNBlZVuXni3kt4u7kfjzfAsgRXP4rHTWsqeXpPOzedHX+uXgghUkFpHf8aoKmyYNka3XRwl62lg8/s6eCTj2zjIxcu4JFNzXzvtrW8d32NbfsXQojZppTaprXeMNlz06ZllFI/U0p1KqX2RG0rUko9r5Q6bP5eaG5XSqkfKaWOKKV2KaXWx3OCZe5M22vCrz2rjAq3i0c2NVOalymjaSHEGSWenPtDwA0Ttt0HvKi1rgdeNH8GuBGoN3/dA/zYntNMnNORxu3nGzNZP3rhQluWthNCiNPFtDl3rfWrSqm6CZtvBq40Hz8MbAS+ZG7/uTZyPZuUUgVKqUqtdbttZ5yAOy6qo38kwB0X1c3G4YUQYtbMtFqmPCpgdwDWAp7VQEvU61rNbSdRSt2jlNqqlNra1dU1w9OYWmFOBl9/zyrys+OvtBFCiPkg6VJIc5Se8F1ZrfX9WusNWusNpaWlyZ6GEEKIKDMN7ieUUpUA5u/WXP82ILplY425TQghxCk00+D+JHCH+fgO4Imo7R8zq2YuBAZmK98uhBBnsmlvqCqlfoVx87REKdUKfA34NvBrpdTdQBNwm/nyPwHvBI4AI8CdKThnIYQQ04inWuZDMZ66ZpLXauDeZE9KCCFEcuZEbxkhhBD2kuAuhBDzkAR3IYSYh+ZE4zCllAc4eAoPmQ8MnMLjlQDdp/B48/36YP5fo1yfvebr9S3UWk86UWhOtPwFDsbqbJYKSqn7tdb3nMLjbZXrs/2Y8/oa5fpsP968vr7JnKlpmadm+wRSbL5fH8z/a5TrO73N+vWdkcFdaz3rf/CpNN+vD+b/Ncr1nd7mwvXNleB+/2yfQIrJ9Z3+5vs1yvXNM3PihqoQQgh7zZWRuxBCCBtJcBdCiHkoZcE9xtqra5VSbyqldiulnlJKuSe8Z4FSakgp9cWobZ9TSu1RSu1VSn0+VeebqESuTylVp5QaVUrtMH/9Z9R7vqWUalFKDc3GdcRi4/U9o5Taaf79/adSak6sd2jj9W1USh2Meq5sNq5nIjuuTymVF7Vth1KqWyn1g1m6pHFs/Pv7oDLWe96rlPrObFxLymitU/ILuBxYD+yJ2rYFuMJ8fBfwzQnv+S3wG+CL5s+rgT1ANkZN/gvA0lSdc6quD6iLft2E/VwIVAJDs31NKbo+t/m7An4H3D7b12bz9W0ENsz29aTq+ibscxtw+Wxfm13XBxQDzUCp+fPDwDWzfW12/UrZyF1r/SrQO2HzMuBV8/HzwPusJ5RStwANwN6o158FvKW1HtFaB4FXgPem6pwTkej1TbGfTXoO9ry38foGzYdOIIMZrNqVCnZd31xl9/UppZYBZcBrtpxgkmy6vsXAYa21tc7nC3G857RxqnPuezEW0Qb4AOaqTUqpXIwFtr8x4fV7gMuUUsVKqWyMXvG1zF2TXp9pkVJqu1LqFaXUZaf+1Gwxo+tTSj2LsVqXB+Pb2Vw107+/B82v+/+glFKn5ExnJpl/n7cDj2lziDtHJXp9R4DlZtrGCdzC3I4vCTnVwf0u4NNKqW1AHuA3t38d+L7WelzeWWu9H/gO8BzwDLADCJ2qk52BWNfXDizQWq8DvgD8Uk2433CamNH1aa2vx0g9ZQJXn9pTTshMru/DWus1wGXmr4+e4nNORDL/Pm8HfnXKznRmEro+rXUf8CngMYxvJI3M7fiSmBTnxeqInatcBmw2H1t/sI1AP8bXrc9M8p5/Bj4927msRK9vkuc2MiFPyxzLudt9feb2jwH/NtvXlcLr+/h8vD5gLXBotq/nFPz93QN8d7avy65fp3TkblUSKKXSgK8A/wmgtb5Ma12nta4DfgD8s9b63ya8ZwFGvv2Xp/KcExHr+pRSpVaViFJqMVAPHJut85ypRK9PKZWrxhZSdwI3AQdm49zjMYPrcyqlSszt6cC7MFKJc1IS/z4/xNwftc/o+qLeUwh8Gnjg1J95aqSsK6SafO3VXKWUt
Qzf74EH49jV75RSxUAAuFdr3Z+C001Ygtd3OfCPSqkAEAY+qbXuNffzXeAvgGxzPw9orb9+yi4kBjuuTylVDjyplMrESAG+jPkfbrbZdH05wLNmYHdg3JD7ySm8jJjs+vdpug3jftecYeP1/VAptdZ8/I9a60On5AJOAWk/IIQQ85DMUBVCiHlIgrsQQsxDEtyFEGIekuAuhBDzkAR3IYSYhyS4CxGDUurrKqpDqRCnEwnuQggxD0lwFyKKUurLSqlDSqnXgeXmtr9WSu0z+34/OsunKERcUjZDVYjTjVLqXIwGWedg/N94G6OH+X3AIq21TylVMGsnKEQCZOQuxJjLgMe1sX7AIPCkuX0X8Aul1EeA4KydnRAJkOAuxPRuAv4dY+WfLWYTNCHmNAnuQox5FbhFKZWllMoD3o3xf6RWa/0yxoIy+UDuLJ6jEHGREYgQJq3120qpx4CdGCtHbcFYFvARpVQ+xjqwP5ornUmFmIp0hRRCiHlI0jJCCDEPSXAXQoh5SIK7EELMQxLchRBiHpLgLoQQ85AEdyGEmIckuAshxDz0/wGOa+jMLGt66wAAAABJRU5ErkJggg==\n", 210 | "text/plain": [ 211 | "
" 212 | ] 213 | }, 214 | "metadata": { 215 | "needs_background": "light" 216 | }, 217 | "output_type": "display_data" 218 | } 219 | ], 220 | "source": [ 221 | "df.plot()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "id": "aaba673e", 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [] 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 3 (ipykernel)", 236 | "language": "python", 237 | "name": "python3" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 3 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython3", 249 | "version": "3.10.5" 250 | }, 251 | "toc": { 252 | "base_numbering": 1, 253 | "nav_menu": {}, 254 | "number_sections": true, 255 | "sideBar": true, 256 | "skip_h1_title": false, 257 | "title_cell": "Table of Contents", 258 | "title_sidebar": "Contents", 259 | "toc_cell": false, 260 | "toc_position": {}, 261 | "toc_section_display": true, 262 | "toc_window_display": false 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 5 267 | } 268 | -------------------------------------------------------------------------------- /01-Create-Datasets/05-create-electricity-demand-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Electricity Demand in Victoria, Australia \n", 8 | "\n", 9 | "In this notebook we will prepare and store the electricity demand dataset found [here](https://github.com/tidyverts/tsibbledata/tree/master/data-raw/vic_elec/VIC2015).\n", 10 | "\n", 11 | "**Citation:**\n", 12 | "\n", 13 | "Godahewa, Rakshitha, Bergmeir, Christoph, Webb, Geoff, Hyndman, Rob, & Montero-Manso, Pablo. (2021). 
Australian Electricity Demand Dataset (Version 1) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.4659727\n", 14 | "\n", 15 | "**Description of data:**\n", 16 | "\n", 17 | "A description of the data can be found [here](https://rdrr.io/cran/tsibbledata/man/vic_elec.html). The data contains electricity demand in Victoria, Australia, at 30 minute intervals over a period of 12 years, from 2002 to early 2015. There is also the temperature in Melbourne at 30 minute intervals and public holiday dates." 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Download the data via the URL below and pandas" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import pandas as pd\n", 34 | "import numpy as np" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Electricity demand.\n", 44 | "url = \"https://raw.githubusercontent.com/tidyverts/tsibbledata/master/data-raw/vic_elec/VIC2015/demand.csv\"\n", 45 | "demand = pd.read_csv(url)\n", 46 | "\n", 47 | "# Temperature of Melbourne (BOM site 086071).\n", 48 | "url = \"https://raw.githubusercontent.com/tidyverts/tsibbledata/master/data-raw/vic_elec/VIC2015/temperature.csv\"\n", 49 | "temp = pd.read_csv(url)\n", 50 | "df = demand.merge(temp, on=[\"Date\", \"Period\"], how=\"left\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Public holidays in Australia\n", 60 | "url = \"https://raw.githubusercontent.com/tidyverts/tsibbledata/master/data-raw/vic_elec/VIC2015/holidays.txt\"\n", 61 | "holidays = pd.read_csv(url, header=None, parse_dates=[0], dayfirst=True)\n", 62 | "holidays.columns = [\"date\"]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "# Process and save the data" 
70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "We will only use the `OperationLessIndustrial` demand. So let's drop `Industrial`." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "df.drop(columns=[\"Industrial\"], inplace=True)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Let's extract the date and date-time." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Convert the integer Date to an actual date with datetime type\n", 102 | "df[\"date\"] = df[\"Date\"].apply(\n", 103 | " lambda x: pd.Timestamp(\"1899-12-30\") + pd.Timedelta(x, unit=\"days\")\n", 104 | ")\n", 105 | "\n", 106 | "# Create a timestamp from the integer Period representing 30 minute intervals\n", 107 | "df[\"date_time\"] = df[\"date\"] + pd.to_timedelta((df[\"Period\"] - 1) * 30, unit=\"m\")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "Drop the null rows." 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "df.dropna(inplace=True)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "Create holidays column." 
131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 7, 136 | "metadata": { 137 | "tags": [] 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "holidays[\"is_holiday\"] = 1\n", 142 | "df = df.merge(holidays, on=[\"date\"], how=\"left\")\n", 143 | "df[\"is_holiday\"] = df[\"is_holiday\"].fillna(0).astype(int)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "We now just use the timestamp and the electricity demand and resample to hourly." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 8, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/html": [ 161 | "
\n", 162 | "\n", 175 | "\n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | "
demandtemperatureis_holiday
date_time
2002-01-01 00:00:006919.36609232.61
2002-01-01 01:00:007165.97418832.61
2002-01-01 02:00:006406.54299432.61
2002-01-01 03:00:005815.53782832.61
2002-01-01 04:00:005497.73292232.61
\n", 223 | "
" 224 | ], 225 | "text/plain": [ 226 | " demand temperature is_holiday\n", 227 | "date_time \n", 228 | "2002-01-01 00:00:00 6919.366092 32.6 1\n", 229 | "2002-01-01 01:00:00 7165.974188 32.6 1\n", 230 | "2002-01-01 02:00:00 6406.542994 32.6 1\n", 231 | "2002-01-01 03:00:00 5815.537828 32.6 1\n", 232 | "2002-01-01 04:00:00 5497.732922 32.6 1" 233 | ] 234 | }, 235 | "execution_count": 8, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "# Rename columns\n", 242 | "timeseries = df[[\"date_time\", \"OperationalLessIndustrial\", \"Temp\", \"is_holiday\"]]\n", 243 | "\n", 244 | "timeseries.columns = [\"date_time\", \"demand\", \"temperature\", \"is_holiday\"]\n", 245 | "\n", 246 | "# Resample to hourly\n", 247 | "timeseries = (\n", 248 | " timeseries.set_index(\"date_time\")\n", 249 | " .resample(\"H\")\n", 250 | " .agg(\n", 251 | " {\n", 252 | " \"demand\": \"sum\",\n", 253 | " \"temperature\": \"mean\",\n", 254 | " \"is_holiday\": np.min,\n", 255 | " }\n", 256 | " )\n", 257 | ")\n", 258 | "timeseries.head()" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "Save the timeseries in the datasets folder." 
266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 9, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "timeseries.to_csv(\"../Datasets/victoria_electricity_demand.csv\")" 275 | ] 276 | } 277 | ], 278 | "metadata": { 279 | "kernelspec": { 280 | "display_name": "Python 3 (ipykernel)", 281 | "language": "python", 282 | "name": "python3" 283 | }, 284 | "language_info": { 285 | "codemirror_mode": { 286 | "name": "ipython", 287 | "version": 3 288 | }, 289 | "file_extension": ".py", 290 | "mimetype": "text/x-python", 291 | "name": "python", 292 | "nbconvert_exporter": "python", 293 | "pygments_lexer": "ipython3", 294 | "version": "3.8.7" 295 | }, 296 | "toc": { 297 | "base_numbering": 1, 298 | "nav_menu": {}, 299 | "number_sections": true, 300 | "sideBar": true, 301 | "skip_h1_title": false, 302 | "title_cell": "Table of Contents", 303 | "title_sidebar": "Contents", 304 | "toc_cell": false, 305 | "toc_position": {}, 306 | "toc_section_display": true, 307 | "toc_window_display": true 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 4 312 | } 313 | -------------------------------------------------------------------------------- /09-Trend-Features/images/forecast_with_just_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/forecast_with_just_time.png -------------------------------------------------------------------------------- /09-Trend-Features/images/recursive_forecasting/Slide1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/recursive_forecasting/Slide1.png 
-------------------------------------------------------------------------------- /09-Trend-Features/images/recursive_forecasting/Slide2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/recursive_forecasting/Slide2.png -------------------------------------------------------------------------------- /09-Trend-Features/images/recursive_forecasting/Slide3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/recursive_forecasting/Slide3.png -------------------------------------------------------------------------------- /09-Trend-Features/images/recursive_forecasting/Slide4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/recursive_forecasting/Slide4.png -------------------------------------------------------------------------------- /11-Time-Features/02-Extracting-time-related-features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Time features from the datetime variable\n", 8 | "\n", 9 | "[Feature Engineering for Time Series Forecasting](https://www.trainindata.com/p/feature-engineering-for-forecasting)\n", 10 | "\n", 11 | "Time series data are, by definition, time-indexed. The \"time\" component has information about the date and time. 
We can extract a number of features from the time component of the index.\n", 12 | "\n", 13 | "In this notebook, we will see how we can easily derive many time-related features.\n", 14 | "\n", 15 | "\n", 16 | "## Features from the time part:\n", 17 | "\n", 18 | "Below are some of the features that we can extract off-the-shelf using [pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-date-components):\n", 19 | "\n", 20 | "- pandas.Series.dt.hour\n", 21 | "- pandas.Series.dt.minute\n", 22 | "- pandas.Series.dt.second\n", 23 | "- pandas.Series.dt.microsecond\n", 24 | "- pandas.Series.dt.nanosecond\n", 25 | "\n", 26 | "\n", 27 | "## The dataset\n", 28 | "\n", 29 | "We will use the Online Retail II Data Set available in the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/machine-learning-databases/00502/).\n", 30 | "\n", 31 | "Download the xlsx file from the link above and save it in the **Datasets** folder within this repo.\n", 32 | "\n", 33 | "**Citation**:\n", 34 | "\n", 35 | "Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.\n", 36 | "\n", 37 | "## In this demo\n", 38 | "\n", 39 | "We will extract different time-related features from the datetime variable: **InvoiceDate**" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 1, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import numpy as np\n", 49 | "import pandas as pd\n", 50 | "import matplotlib.pyplot as plt" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Load the data" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "(1067371, 8)\n" 70 | ] 71 | }, 72 | { 73 | "data": { 74 | "text/html": [ 75 | "
\n", 76 | "\n", 89 | "\n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | "
InvoiceStockCodeDescriptionQuantityInvoiceDatePriceCustomer IDCountry
04894348504815CM CHRISTMAS GLASS BALL 20 LIGHTS122009-12-01 07:45:006.9513085.0United Kingdom
148943479323PPINK CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom
248943479323WWHITE CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom
348943422041RECORD FRAME 7\" SINGLE SIZE482009-12-01 07:45:002.1013085.0United Kingdom
448943421232STRAWBERRY CERAMIC TRINKET BOX242009-12-01 07:45:001.2513085.0United Kingdom
\n", 161 | "
" 162 | ], 163 | "text/plain": [ 164 | " Invoice StockCode Description Quantity \\\n", 165 | "0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 \n", 166 | "1 489434 79323P PINK CHERRY LIGHTS 12 \n", 167 | "2 489434 79323W WHITE CHERRY LIGHTS 12 \n", 168 | "3 489434 22041 RECORD FRAME 7\" SINGLE SIZE 48 \n", 169 | "4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 \n", 170 | "\n", 171 | " InvoiceDate Price Customer ID Country \n", 172 | "0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom \n", 173 | "1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n", 174 | "2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n", 175 | "3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom \n", 176 | "4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom " 177 | ] 178 | }, 179 | "execution_count": 2, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "# File path:\n", 186 | "file = \"../Datasets/online_retail_II.xlsx\"\n", 187 | "\n", 188 | "# The data is provided as two sheets in a single Excel file.\n", 189 | "# Each sheet contains a different time period.\n", 190 | "# Load both and join them into a single dataframe\n", 191 | "# as shown below:\n", 192 | "\n", 193 | "df_1 = pd.read_excel(file, sheet_name=\"Year 2009-2010\")\n", 194 | "df_2 = pd.read_excel(file, sheet_name=\"Year 2010-2011\")\n", 195 | "\n", 196 | "data = pd.concat([df_1, df_2])\n", 197 | "\n", 198 | "print(data.shape)\n", 199 | "\n", 200 | "data.head()" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "In this dataset, we have the datetime variable in a column called InvoiceDate. We could also have it in the dataframe index. The procedure for extracting the date and time features is identical. That is, we would use the methods from pandas dt as shown below.\n", 208 | "\n", 209 | "The dataset contains sales information for different customers in different countries. 
Customers may have made one or multiple purchases from the business that provided the data.\n", 210 | "\n", 211 | "## Variable format" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 3, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "dtype('\n", 254 | "\n", 267 | "\n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | "
dateInvoiceDate
02009-12-01 07:45:002009-12-01 07:45:00
12009-12-01 07:45:002009-12-01 07:45:00
22009-12-01 07:45:002009-12-01 07:45:00
32009-12-01 07:45:002009-12-01 07:45:00
42009-12-01 07:45:002009-12-01 07:45:00
\n", 303 | "" 304 | ], 305 | "text/plain": [ 306 | " date InvoiceDate\n", 307 | "0 2009-12-01 07:45:00 2009-12-01 07:45:00\n", 308 | "1 2009-12-01 07:45:00 2009-12-01 07:45:00\n", 309 | "2 2009-12-01 07:45:00 2009-12-01 07:45:00\n", 310 | "3 2009-12-01 07:45:00 2009-12-01 07:45:00\n", 311 | "4 2009-12-01 07:45:00 2009-12-01 07:45:00" 312 | ] 313 | }, 314 | "execution_count": 4, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "# This is how we parse date strings into datetime format.\n", 321 | "\n", 322 | "data[\"date\"] = pd.to_datetime(data[\"InvoiceDate\"])\n", 323 | "\n", 324 | "data[[\"date\", \"InvoiceDate\"]].head()" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "## Extract the time part" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 5, 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "data": { 341 | "text/plain": [ 342 | "0 07:45:00\n", 343 | "1 07:45:00\n", 344 | "2 07:45:00\n", 345 | "3 07:45:00\n", 346 | "4 07:45:00\n", 347 | "Name: time_part, dtype: object" 348 | ] 349 | }, 350 | "execution_count": 5, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "# Extract time part.\n", 357 | "\n", 358 | "# (We would normally not use this as a predictive feature,\n", 359 | "# but it might be handy for data analysis).\n", 360 | "\n", 361 | "data[\"time_part\"] = data[\"date\"].dt.time\n", 362 | "\n", 363 | "data[\"time_part\"].head()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "### Extract the hr, minute and second" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 6, 376 | "metadata": {}, 377 | "outputs": [ 378 | { 379 | "data": { 380 | "text/html": [ 381 | "
\n", 382 | "\n", 395 | "\n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | "
InvoiceStockCodeDescriptionQuantityInvoiceDatePriceCustomer IDCountrydatetime_parthourminsecmicrosecnanosec
04894348504815CM CHRISTMAS GLASS BALL 20 LIGHTS122009-12-01 07:45:006.9513085.0United Kingdom2009-12-01 07:45:0007:45:00745000
148943479323PPINK CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom2009-12-01 07:45:0007:45:00745000
248943479323WWHITE CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom2009-12-01 07:45:0007:45:00745000
348943422041RECORD FRAME 7\" SINGLE SIZE482009-12-01 07:45:002.1013085.0United Kingdom2009-12-01 07:45:0007:45:00745000
448943421232STRAWBERRY CERAMIC TRINKET BOX242009-12-01 07:45:001.2513085.0United Kingdom2009-12-01 07:45:0007:45:00745000
\n", 509 | "
" 510 | ], 511 | "text/plain": [ 512 | " Invoice StockCode Description Quantity \\\n", 513 | "0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 \n", 514 | "1 489434 79323P PINK CHERRY LIGHTS 12 \n", 515 | "2 489434 79323W WHITE CHERRY LIGHTS 12 \n", 516 | "3 489434 22041 RECORD FRAME 7\" SINGLE SIZE 48 \n", 517 | "4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 \n", 518 | "\n", 519 | " InvoiceDate Price Customer ID Country date \\\n", 520 | "0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 521 | "1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 522 | "2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 523 | "3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 524 | "4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 525 | "\n", 526 | " time_part hour min sec microsec nanosec \n", 527 | "0 07:45:00 7 45 0 0 0 \n", 528 | "1 07:45:00 7 45 0 0 0 \n", 529 | "2 07:45:00 7 45 0 0 0 \n", 530 | "3 07:45:00 7 45 0 0 0 \n", 531 | "4 07:45:00 7 45 0 0 0 " 532 | ] 533 | }, 534 | "execution_count": 6, 535 | "metadata": {}, 536 | "output_type": "execute_result" 537 | } 538 | ], 539 | "source": [ 540 | "data[\"hour\"] = data[\"date\"].dt.hour\n", 541 | "data[\"min\"] = data[\"date\"].dt.minute\n", 542 | "data[\"sec\"] = data[\"date\"].dt.second\n", 543 | "\n", 544 | "# We do not have micro and nano seconds in this dataset,\n", 545 | "# but if we did, we can extract them as follows:\n", 546 | "\n", 547 | "data[\"microsec\"] = data[\"date\"].dt.microsecond\n", 548 | "data[\"nanosec\"] = data[\"date\"].dt.nanosecond\n", 549 | "\n", 550 | "data.head()" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "### Extract hr, min, sec, at the same time" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 7, 563 | "metadata": {}, 564 | "outputs": [ 565 | { 566 | "data": { 567 | "text/html": [ 
568 | "
\n", 569 | "\n", 582 | "\n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | "
InvoiceStockCodeDescriptionQuantityInvoiceDatePriceCustomer IDCountrydatetime_parthourminsecmicrosecnanosechms
04894348504815CM CHRISTMAS GLASS BALL 20 LIGHTS122009-12-01 07:45:006.9513085.0United Kingdom2009-12-01 07:45:0007:45:007450007450
148943479323PPINK CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom2009-12-01 07:45:0007:45:007450007450
248943479323WWHITE CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom2009-12-01 07:45:0007:45:007450007450
348943422041RECORD FRAME 7\" SINGLE SIZE482009-12-01 07:45:002.1013085.0United Kingdom2009-12-01 07:45:0007:45:007450007450
448943421232STRAWBERRY CERAMIC TRINKET BOX242009-12-01 07:45:001.2513085.0United Kingdom2009-12-01 07:45:0007:45:007450007450
\n", 714 | "
" 715 | ], 716 | "text/plain": [ 717 | " Invoice StockCode Description Quantity \\\n", 718 | "0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 \n", 719 | "1 489434 79323P PINK CHERRY LIGHTS 12 \n", 720 | "2 489434 79323W WHITE CHERRY LIGHTS 12 \n", 721 | "3 489434 22041 RECORD FRAME 7\" SINGLE SIZE 48 \n", 722 | "4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 \n", 723 | "\n", 724 | " InvoiceDate Price Customer ID Country date \\\n", 725 | "0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 726 | "1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 727 | "2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 728 | "3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 729 | "4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom 2009-12-01 07:45:00 \n", 730 | "\n", 731 | " time_part hour min sec microsec nanosec h m s \n", 732 | "0 07:45:00 7 45 0 0 0 7 45 0 \n", 733 | "1 07:45:00 7 45 0 0 0 7 45 0 \n", 734 | "2 07:45:00 7 45 0 0 0 7 45 0 \n", 735 | "3 07:45:00 7 45 0 0 0 7 45 0 \n", 736 | "4 07:45:00 7 45 0 0 0 7 45 0 " 737 | ] 738 | }, 739 | "execution_count": 7, 740 | "metadata": {}, 741 | "output_type": "execute_result" 742 | } 743 | ], 744 | "source": [ 745 | "# Now, let's repeat what we did in the previous cell in 1 command.\n", 746 | "\n", 747 | "data[[\"h\", \"m\", \"s\"]] = pd.DataFrame(\n", 748 | " [(x.hour, x.minute, x.second) for x in data[\"date\"]]\n", 749 | ")\n", 750 | "\n", 751 | "data.head()" 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "## Work with different timezones\n", 759 | "\n", 760 | "In the next few cells, we will see how to work with timestamps that are in different time zones." 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": 8, 766 | "metadata": {}, 767 | "outputs": [ 768 | { 769 | "data": { 770 | "text/html": [ 771 | "
\n", 772 | "\n", 785 | "\n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | "
time
02014-08-01 09:00:00+02:00
12014-08-01 10:00:00+02:00
22014-08-01 11:00:00+02:00
02014-08-01 09:00:00-05:00
12014-08-01 10:00:00-05:00
22014-08-01 11:00:00-05:00
\n", 819 | "
" 820 | ], 821 | "text/plain": [ 822 | " time\n", 823 | "0 2014-08-01 09:00:00+02:00\n", 824 | "1 2014-08-01 10:00:00+02:00\n", 825 | "2 2014-08-01 11:00:00+02:00\n", 826 | "0 2014-08-01 09:00:00-05:00\n", 827 | "1 2014-08-01 10:00:00-05:00\n", 828 | "2 2014-08-01 11:00:00-05:00" 829 | ] 830 | }, 831 | "execution_count": 8, 832 | "metadata": {}, 833 | "output_type": "execute_result" 834 | } 835 | ], 836 | "source": [ 837 | "# First, let's create a toy dataframe with some timestamps in different time zones.\n", 838 | "\n", 839 | "df = pd.DataFrame()\n", 840 | "\n", 841 | "df[\"time\"] = pd.concat(\n", 842 | " [\n", 843 | " pd.Series(\n", 844 | " pd.date_range(\n", 845 | " start=\"2014-08-01 09:00\", freq=\"H\", periods=3, tz=\"Europe/Berlin\"\n", 846 | " )\n", 847 | " ),\n", 848 | " pd.Series(\n", 849 | " pd.date_range(\n", 850 | " start=\"2014-08-01 09:00\", freq=\"H\", periods=3, tz=\"US/Central\"\n", 851 | " )\n", 852 | " ),\n", 853 | " ],\n", 854 | " axis=0,\n", 855 | ")\n", 856 | "\n", 857 | "df" 858 | ] 859 | }, 860 | { 861 | "cell_type": "markdown", 862 | "metadata": {}, 863 | "source": [ 864 | "We can see the different timezones indicated by the +02:00 and -05:00 offsets, that is, the number of hours ahead of or behind UTC (Coordinated Universal Time)." 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": 9, 870 | "metadata": {}, 871 | "outputs": [ 872 | { 873 | "data": { 874 | "text/html": [ 875 | "
\n", 876 | "\n", 889 | "\n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | "
timetime_utctime_london
02014-08-01 09:00:00+02:002014-08-01 07:00:00+00:002014-08-01 08:00:00+01:00
12014-08-01 10:00:00+02:002014-08-01 08:00:00+00:002014-08-01 09:00:00+01:00
22014-08-01 11:00:00+02:002014-08-01 09:00:00+00:002014-08-01 10:00:00+01:00
02014-08-01 09:00:00-05:002014-08-01 14:00:00+00:002014-08-01 15:00:00+01:00
12014-08-01 10:00:00-05:002014-08-01 15:00:00+00:002014-08-01 16:00:00+01:00
22014-08-01 11:00:00-05:002014-08-01 16:00:00+00:002014-08-01 17:00:00+01:00
\n", 937 | "
" 938 | ], 939 | "text/plain": [ 940 | " time time_utc \\\n", 941 | "0 2014-08-01 09:00:00+02:00 2014-08-01 07:00:00+00:00 \n", 942 | "1 2014-08-01 10:00:00+02:00 2014-08-01 08:00:00+00:00 \n", 943 | "2 2014-08-01 11:00:00+02:00 2014-08-01 09:00:00+00:00 \n", 944 | "0 2014-08-01 09:00:00-05:00 2014-08-01 14:00:00+00:00 \n", 945 | "1 2014-08-01 10:00:00-05:00 2014-08-01 15:00:00+00:00 \n", 946 | "2 2014-08-01 11:00:00-05:00 2014-08-01 16:00:00+00:00 \n", 947 | "\n", 948 | " time_london \n", 949 | "0 2014-08-01 08:00:00+01:00 \n", 950 | "1 2014-08-01 09:00:00+01:00 \n", 951 | "2 2014-08-01 10:00:00+01:00 \n", 952 | "0 2014-08-01 15:00:00+01:00 \n", 953 | "1 2014-08-01 16:00:00+01:00 \n", 954 | "2 2014-08-01 17:00:00+01:00 " 955 | ] 956 | }, 957 | "execution_count": 9, 958 | "metadata": {}, 959 | "output_type": "execute_result" 960 | } 961 | ], 962 | "source": [ 963 | "# To work with different time zones, first we unify the\n", 964 | "# timestamps to UTC (Coordinated Universal Time) by setting utc = True.\n", 965 | "\n", 966 | "df[\"time_utc\"] = pd.to_datetime(df[\"time\"], utc=True)\n", 967 | "\n", 968 | "# Next, we change all timestamps to the desired timezone,\n", 969 | "# e.g., Europe/London, as in this example.\n", 970 | "\n", 971 | "df[\"time_london\"] = df[\"time_utc\"].dt.tz_convert(\"Europe/London\")\n", 972 | "\n", 973 | "\n", 974 | "df" 975 | ] 976 | }, 977 | { 978 | "cell_type": "markdown", 979 | "metadata": {}, 980 | "source": [ 981 | "Whether to unify the timezone depends on the use case. If we are forecasting sales for different countries, perhaps it is better to keep each country's respective time zone, since we will treat those series independently.\n", 982 | "\n", 983 | "If we have a small company that sells mostly inland and occasionally sells something abroad, we probably have the local timezone already, but if we do not, we may want to localize the time stamp to our time zone."
984 | ] 985 | }, 986 | { 987 | "cell_type": "code", 988 | "execution_count": null, 989 | "metadata": {}, 990 | "outputs": [], 991 | "source": [] 992 | } 993 | ], 994 | "metadata": { 995 | "kernelspec": { 996 | "display_name": "fets", 997 | "language": "python", 998 | "name": "fets" 999 | }, 1000 | "language_info": { 1001 | "codemirror_mode": { 1002 | "name": "ipython", 1003 | "version": 3 1004 | }, 1005 | "file_extension": ".py", 1006 | "mimetype": "text/x-python", 1007 | "name": "python", 1008 | "nbconvert_exporter": "python", 1009 | "pygments_lexer": "ipython3", 1010 | "version": "3.8.2" 1011 | }, 1012 | "toc": { 1013 | "base_numbering": 1, 1014 | "nav_menu": {}, 1015 | "number_sections": true, 1016 | "sideBar": true, 1017 | "skip_h1_title": false, 1018 | "title_cell": "Table of Contents", 1019 | "title_sidebar": "Contents", 1020 | "toc_cell": false, 1021 | "toc_position": {}, 1022 | "toc_section_display": "block", 1023 | "toc_window_display": true 1024 | } 1025 | }, 1026 | "nbformat": 4, 1027 | "nbformat_minor": 2 1028 | } 1029 | -------------------------------------------------------------------------------- /12-Categorical-Encoding/3-mean-encoding-simple.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b699e295", 6 | "metadata": {}, 7 | "source": [ 8 | "# Mean encoding - simple\n", 9 | "\n", 10 | "[Feature Engineering for Time Series Forecasting](https://www.trainindata.com/p/feature-engineering-for-forecasting)\n", 11 | "\n", 12 | "In this notebook, we will encode static features with mean encoding. 
We will split the data into train and test sets, learn the mean target value per category using the train set, and then encode both the train and test sets with those learned parameters.\n", 13 | "\n", 14 | "It has the advantage that this logic is implemented by open-source libraries.\n", 15 | "\n", 16 | "The drawback is that we may overfit because we are leaking future data into the past. \n", 17 | "\n", 18 | "We will use the online retail dataset, which we prepared in the notebook `02-create-online-retail-II-datasets.ipynb` located in the `01-Create-Datasets` folder." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "id": "49b2f0bf", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "from feature_engine.encoding import MeanEncoder" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "5a174f3b", 36 | "metadata": {}, 37 | "source": [ 38 | "## Load data" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "id": "67a2af74", 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/html": [ 50 | "
\n", 51 | "\n", 64 | "\n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | "
countryquantityrevenue
week
2009-12-06Belgium143439.1
2009-12-13Belgium108.5
2009-12-20Belgium00.0
2009-12-27Belgium00.0
2010-01-03Belgium00.0
\n", 112 | "
" 113 | ], 114 | "text/plain": [ 115 | " country quantity revenue\n", 116 | "week \n", 117 | "2009-12-06 Belgium 143 439.1\n", 118 | "2009-12-13 Belgium 10 8.5\n", 119 | "2009-12-20 Belgium 0 0.0\n", 120 | "2009-12-27 Belgium 0 0.0\n", 121 | "2010-01-03 Belgium 0 0.0" 122 | ] 123 | }, 124 | "execution_count": 2, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "df = pd.read_csv(\"../Datasets/online_retail_dataset_countries.csv\",\n", 131 | " parse_dates=[\"week\"],\n", 132 | " index_col=\"week\",\n", 133 | " )\n", 134 | "\n", 135 | "df.head()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "id": "4a419d6a", 141 | "metadata": {}, 142 | "source": [ 143 | "## Split into train and test" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 3, 149 | "id": "1f4c0763", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "# Split the data before and after June 2011\n", 154 | "\n", 155 | "X_train = df[df.index <= pd.to_datetime('2011-06-30')]\n", 156 | "X_test = df[df.index > pd.to_datetime('2011-06-30')]\n", 157 | "\n", 158 | "y_train = X_train[\"revenue\"]\n", 159 | "y_test = X_test[\"revenue\"]" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 4, 165 | "id": "928be034", 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "(Timestamp('2009-12-06 00:00:00'), Timestamp('2011-06-26 00:00:00'))" 172 | ] 173 | }, 174 | "execution_count": 4, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "# sanity check\n", 181 | "\n", 182 | "X_train.index.min(), X_train.index.max()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 5, 188 | "id": "6e838b49", 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "(Timestamp('2011-07-03 00:00:00'), Timestamp('2011-12-11 00:00:00'))" 195 | ] 196 | }, 197 | 
"execution_count": 5, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "# sanity check\n", 204 | "\n", 205 | "X_test.index.min(), X_test.index.max()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "d5de7aa0", 211 | "metadata": {}, 212 | "source": [ 213 | "## Encode" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 6, 219 | "id": "2402ebb9", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "# Set up the mean encoder\n", 224 | "\n", 225 | "enc = MeanEncoder()" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 7, 231 | "id": "74ef4a1a", 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "data": { 236 | "text/html": [ 237 | "
MeanEncoder()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" 238 | ], 239 | "text/plain": [ 240 | "MeanEncoder()" 241 | ] 242 | }, 243 | "execution_count": 7, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "# Find mean target value per category\n", 250 | "# (it uses the entire train set)\n", 251 | "\n", 252 | "enc.fit(X_train, y_train)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 8, 258 | "id": "1667b70c", 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "['country']" 265 | ] 266 | }, 267 | "execution_count": 8, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "# Feature-engine's encoder finds categorical variables\n", 274 | "# by default\n", 275 | "\n", 276 | "enc.variables_" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 9, 282 | "id": "90a34078", 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "{'country': {'Belgium': 511.37853658536585,\n", 289 | " 'EIRE': 5579.161829268293,\n", 290 | " 'France': 2872.7475609756098,\n", 291 | " 'Germany': 3764.180012195122,\n", 292 | " 'Spain': 919.3335365853659,\n", 293 | " 'United Kingdom': 129124.83931707316}}" 294 | ] 295 | }, 296 | "execution_count": 9, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "# the encoding values\n", 303 | "\n", 304 | "enc.encoder_dict_" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 10, 310 | "id": "2c4cf198", 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "data": { 315 | "text/html": [ 316 | "
\n", 317 | "\n", 330 | "\n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | "
countryquantityrevenue
week
2009-12-06511.378537143439.1
2009-12-13511.378537108.5
2009-12-20511.37853700.0
2009-12-27511.37853700.0
2010-01-03511.37853700.0
\n", 378 | "
" 379 | ], 380 | "text/plain": [ 381 | " country quantity revenue\n", 382 | "week \n", 383 | "2009-12-06 511.378537 143 439.1\n", 384 | "2009-12-13 511.378537 10 8.5\n", 385 | "2009-12-20 511.378537 0 0.0\n", 386 | "2009-12-27 511.378537 0 0.0\n", 387 | "2010-01-03 511.378537 0 0.0" 388 | ] 389 | }, 390 | "execution_count": 10, 391 | "metadata": {}, 392 | "output_type": "execute_result" 393 | } 394 | ], 395 | "source": [ 396 | "# Encode datasets\n", 397 | "\n", 398 | "X_train_t = enc.transform(X_train)\n", 399 | "X_test_t = enc.transform(X_test)\n", 400 | "\n", 401 | "X_train_t.head()" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "id": "85599ce7", 407 | "metadata": {}, 408 | "source": [ 409 | "Note that Belgium was replaced by 511.37 in all rows, even though on various occasions the revenue was 0. This may result in a \"look ahead\" bias." 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "id": "60a6c207", 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [] 419 | } 420 | ], 421 | "metadata": { 422 | "kernelspec": { 423 | "display_name": "fsml", 424 | "language": "python", 425 | "name": "fsml" 426 | }, 427 | "language_info": { 428 | "codemirror_mode": { 429 | "name": "ipython", 430 | "version": 3 431 | }, 432 | "file_extension": ".py", 433 | "mimetype": "text/x-python", 434 | "name": "python", 435 | "nbconvert_exporter": "python", 436 | "pygments_lexer": "ipython3", 437 | "version": "3.10.5" 438 | }, 439 | "toc": { 440 | "base_numbering": 1, 441 | "nav_menu": {}, 442 | "number_sections": true, 443 | "sideBar": true, 444 | "skip_h1_title": false, 445 | "title_cell": "Table of Contents", 446 | "title_sidebar": "Contents", 447 | "toc_cell": false, 448 | "toc_position": { 449 | "height": "calc(100% - 180px)", 450 | "left": "10px", 451 | "top": "150px", 452 | "width": "165px" 453 | }, 454 | "toc_section_display": true, 455 | "toc_window_display": true 456 | } 457 | }, 458 | "nbformat": 4, 459 | 
"nbformat_minor": 5 460 | } 461 | -------------------------------------------------------------------------------- /12-Categorical-Encoding/4-mean-encoding-expanding-window.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b699e295", 6 | "metadata": {}, 7 | "source": [ 8 | "# Mean encoding - expanding window\n", 9 | "\n", 10 | "[Feature Engineering for Time Series Forecasting](https://www.trainindata.com/p/feature-engineering-for-forecasting)\n", 11 | "\n", 12 | "In this notebook, we will encode static features with mean encoding by using expanding windows. This implementation avoids look-ahead bias.\n", 13 | "\n", 14 | "We will use the online retail dataset, which we prepared in the notebook `02-create-online-retail-II-datasets.ipynb` located in the `01-Create-Datasets` folder." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "id": "49b2f0bf", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import pandas as pd" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "id": "5a174f3b", 31 | "metadata": {}, 32 | "source": [ 33 | "## Load data" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "67a2af74", 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/html": [ 45 | "
\n", 46 | "\n", 59 | "\n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | "
countryquantityrevenue
week
2009-12-06Belgium143439.1
2009-12-13Belgium108.5
2009-12-20Belgium00.0
2009-12-27Belgium00.0
2010-01-03Belgium00.0
\n", 107 | "
" 108 | ], 109 | "text/plain": [ 110 | " country quantity revenue\n", 111 | "week \n", 112 | "2009-12-06 Belgium 143 439.1\n", 113 | "2009-12-13 Belgium 10 8.5\n", 114 | "2009-12-20 Belgium 0 0.0\n", 115 | "2009-12-27 Belgium 0 0.0\n", 116 | "2010-01-03 Belgium 0 0.0" 117 | ] 118 | }, 119 | "execution_count": 2, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "df = pd.read_csv(\"../Datasets/online_retail_dataset_countries.csv\",\n", 126 | " parse_dates=[\"week\"],\n", 127 | " index_col=\"week\",\n", 128 | " )\n", 129 | "\n", 130 | "df.head()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "id": "50846272", 136 | "metadata": {}, 137 | "source": [ 138 | "## Split into train and test" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 3, 144 | "id": "1f4c0763", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# Split data before and after June 2011\n", 149 | "\n", 150 | "X_train = df[df.index <= pd.to_datetime('2011-06-30')]\n", 151 | "\n", 152 | "# We need the past data for the expanding window.\n", 153 | "X_test = df.copy()\n", 154 | "\n", 155 | "# the target variable\n", 156 | "y_train = X_train[\"revenue\"]\n", 157 | "y_test = X_test[\"revenue\"]" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 4, 163 | "id": "e1418b42", 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "(Timestamp('2009-12-06 00:00:00'), Timestamp('2011-06-26 00:00:00'))" 170 | ] 171 | }, 172 | "execution_count": 4, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "# sanity check\n", 179 | "\n", 180 | "X_train.index.min(), X_train.index.max()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 5, 186 | "id": "1faf10f7", 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "(Timestamp('2009-12-06 
00:00:00'), Timestamp('2011-12-11 00:00:00'))" 193 | ] 194 | }, 195 | "execution_count": 5, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "# sanity check\n", 202 | "\n", 203 | "X_test.index.min(), X_test.index.max()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "id": "d5de7aa0", 209 | "metadata": {}, 210 | "source": [ 211 | "## Encode countries" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 6, 217 | "id": "931e9ef9", 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/html": [ 223 | "
\n", 224 | "\n", 237 | "\n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | "
countryweekcountry_enc
0Belgium2009-12-06NaN
1Belgium2009-12-13439.100000
2Belgium2009-12-20223.800000
3Belgium2009-12-27149.200000
4Belgium2010-01-03111.900000
............
487United Kingdom2011-05-29129923.850701
488United Kingdom2011-06-05129810.417487
489United Kingdom2011-06-12129208.338025
490United Kingdom2011-06-19129708.159425
491United Kingdom2011-06-26129598.153506
\n", 315 | "

492 rows × 3 columns

\n", 316 | "
" 317 | ], 318 | "text/plain": [ 319 | " country week country_enc\n", 320 | "0 Belgium 2009-12-06 NaN\n", 321 | "1 Belgium 2009-12-13 439.100000\n", 322 | "2 Belgium 2009-12-20 223.800000\n", 323 | "3 Belgium 2009-12-27 149.200000\n", 324 | "4 Belgium 2010-01-03 111.900000\n", 325 | ".. ... ... ...\n", 326 | "487 United Kingdom 2011-05-29 129923.850701\n", 327 | "488 United Kingdom 2011-06-05 129810.417487\n", 328 | "489 United Kingdom 2011-06-12 129208.338025\n", 329 | "490 United Kingdom 2011-06-19 129708.159425\n", 330 | "491 United Kingdom 2011-06-26 129598.153506\n", 331 | "\n", 332 | "[492 rows x 3 columns]" 333 | ] 334 | }, 335 | "execution_count": 6, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "# train set first\n", 342 | "\n", 343 | "train_enc = (\n", 344 | " X_train\n", 345 | " .groupby(['country'])['revenue']\n", 346 | " .expanding()\n", 347 | " .mean()\n", 348 | " .shift()\n", 349 | ").reset_index()\n", 350 | "\n", 351 | "train_enc.rename(columns = {\"revenue\": \"country_enc\"}, inplace = True)\n", 352 | "\n", 353 | "train_enc" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 7, 359 | "id": "6d3d07a7", 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/html": [ 365 | "
\n", 366 | "\n", 379 | "\n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | "
weekcountryquantityrevenuecountry_enc
02009-12-06Belgium143439.10NaN
12009-12-13Belgium108.50439.100000
22009-12-20Belgium00.00223.800000
32009-12-27Belgium00.00149.200000
42010-01-03Belgium00.00111.900000
..................
4872011-05-29United Kingdom67666121076.06129923.850701
4882011-06-05United Kingdom4442282246.14129810.417487
4892011-06-12United Kingdom77850169194.05129208.338025
4902011-06-19United Kingdom68207120797.68129708.159425
4912011-06-26United Kingdom5710290786.39129598.153506
\n", 481 | "

492 rows × 5 columns

\n", 482 | "
" 483 | ], 484 | "text/plain": [ 485 | " week country quantity revenue country_enc\n", 486 | "0 2009-12-06 Belgium 143 439.10 NaN\n", 487 | "1 2009-12-13 Belgium 10 8.50 439.100000\n", 488 | "2 2009-12-20 Belgium 0 0.00 223.800000\n", 489 | "3 2009-12-27 Belgium 0 0.00 149.200000\n", 490 | "4 2010-01-03 Belgium 0 0.00 111.900000\n", 491 | ".. ... ... ... ... ...\n", 492 | "487 2011-05-29 United Kingdom 67666 121076.06 129923.850701\n", 493 | "488 2011-06-05 United Kingdom 44422 82246.14 129810.417487\n", 494 | "489 2011-06-12 United Kingdom 77850 169194.05 129208.338025\n", 495 | "490 2011-06-19 United Kingdom 68207 120797.68 129708.159425\n", 496 | "491 2011-06-26 United Kingdom 57102 90786.39 129598.153506\n", 497 | "\n", 498 | "[492 rows x 5 columns]" 499 | ] 500 | }, 501 | "execution_count": 7, 502 | "metadata": {}, 503 | "output_type": "execute_result" 504 | } 505 | ], 506 | "source": [ 507 | "# Add encoded variable to original train set\n", 508 | "\n", 509 | "X_train_enc = X_train.reset_index().merge(train_enc)\n", 510 | "\n", 511 | "X_train_enc" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 8, 517 | "id": "5f6bf153", 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "data": { 522 | "text/html": [ 523 | "
\n", 524 | "\n", 537 | "\n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | "
quantityrevenuecountry_enc
week
2009-12-06143439.1NaN
2009-12-13108.5439.1
2009-12-2000.0223.8
2009-12-2700.0149.2
2010-01-0300.0111.9
\n", 585 | "
" 586 | ], 587 | "text/plain": [ 588 | " quantity revenue country_enc\n", 589 | "week \n", 590 | "2009-12-06 143 439.1 NaN\n", 591 | "2009-12-13 10 8.5 439.1\n", 592 | "2009-12-20 0 0.0 223.8\n", 593 | "2009-12-27 0 0.0 149.2\n", 594 | "2010-01-03 0 0.0 111.9" 595 | ] 596 | }, 597 | "execution_count": 8, 598 | "metadata": {}, 599 | "output_type": "execute_result" 600 | } 601 | ], 602 | "source": [ 603 | "# Now we drop the static variable\n", 604 | "\n", 605 | "X_train_enc = X_train_enc.drop(\"country\", axis=1)\n", 606 | "\n", 607 | "# Reset the index\n", 608 | "X_train_enc.set_index(\"week\", inplace=True)\n", 609 | "\n", 610 | "X_train_enc.head()" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 9, 616 | "id": "2402ebb9", 617 | "metadata": {}, 618 | "outputs": [ 619 | { 620 | "data": { 621 | "text/html": [ 622 | "
\n", 623 | "\n", 636 | "\n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | "
quantityrevenuecountry_enc
week
2011-07-03103163.90511.378537
2011-07-106661022.82507.192048
2011-07-171345.60513.330476
2011-07-2400.00507.827765
2011-07-3110001407.15501.922791
\n", 684 | "
" 685 | ], 686 | "text/plain": [ 687 | " quantity revenue country_enc\n", 688 | "week \n", 689 | "2011-07-03 103 163.90 511.378537\n", 690 | "2011-07-10 666 1022.82 507.192048\n", 691 | "2011-07-17 13 45.60 513.330476\n", 692 | "2011-07-24 0 0.00 507.827765\n", 693 | "2011-07-31 1000 1407.15 501.922791" 694 | ] 695 | }, 696 | "execution_count": 9, 697 | "metadata": {}, 698 | "output_type": "execute_result" 699 | } 700 | ], 701 | "source": [ 702 | "# Now we repeat for the test set\n", 703 | "\n", 704 | "# Find the encoding values\n", 705 | "test_enc = (\n", 706 | " X_test\n", 707 | " .groupby(['country'])['revenue']\n", 708 | " .expanding()\n", 709 | " .mean()\n", 710 | " .shift()\n", 711 | ").reset_index()\n", 712 | "\n", 713 | "test_enc.rename(columns = {\"revenue\": \"country_enc\"}, inplace = True)\n", 714 | "\n", 715 | "# join encoded variable\n", 716 | "X_test_enc = X_test.reset_index().merge(test_enc)\n", 717 | "\n", 718 | "# Drop original variable\n", 719 | "X_test_enc = X_test_enc.drop(\"country\", axis=1)\n", 720 | "\n", 721 | "# Reset the index\n", 722 | "X_test_enc.set_index(\"week\", inplace=True)\n", 723 | "\n", 724 | "# Remove data that belongs to the train set\n", 725 | "X_test_enc = X_test_enc[X_test_enc.index > pd.to_datetime('2011-06-30')]\n", 726 | "\n", 727 | "X_test_enc.head()" 728 | ] 729 | }, 730 | { 731 | "cell_type": "markdown", 732 | "id": "86a89e3e", 733 | "metadata": {}, 734 | "source": [ 735 | "That's it!\n", 736 | "\n", 737 | "As you can see, with this way of encoding the static feature, we need to do a lot of the work manually, and we need to be careful to have enough data in the train set, and to split the data correctly after the encoding." 
738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": null, 743 | "id": "77b803d1", 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [] 747 | } 748 | ], 749 | "metadata": { 750 | "kernelspec": { 751 | "display_name": "fsml", 752 | "language": "python", 753 | "name": "fsml" 754 | }, 755 | "language_info": { 756 | "codemirror_mode": { 757 | "name": "ipython", 758 | "version": 3 759 | }, 760 | "file_extension": ".py", 761 | "mimetype": "text/x-python", 762 | "name": "python", 763 | "nbconvert_exporter": "python", 764 | "pygments_lexer": "ipython3", 765 | "version": "3.10.5" 766 | }, 767 | "toc": { 768 | "base_numbering": 1, 769 | "nav_menu": {}, 770 | "number_sections": true, 771 | "sideBar": true, 772 | "skip_h1_title": false, 773 | "title_cell": "Table of Contents", 774 | "title_sidebar": "Contents", 775 | "toc_cell": false, 776 | "toc_position": { 777 | "height": "calc(100% - 180px)", 778 | "left": "10px", 779 | "top": "150px", 780 | "width": "173.267px" 781 | }, 782 | "toc_section_display": true, 783 | "toc_window_display": true 784 | } 785 | }, 786 | "nbformat": 4, 787 | "nbformat_minor": 5 788 | } 789 | -------------------------------------------------------------------------------- /Appendix/00-pandas-period.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "24d78855-9712-419b-8201-486452f5120a", 6 | "metadata": {}, 7 | "source": [ 8 | "# Pandas Period" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "bdb234fa-ee2e-403d-a0ff-4b2c0fdced43", 14 | "metadata": {}, 15 | "source": [ 16 | "[Feature Engineering for Time Series Forecasting](https://www.trainindata.com/p/feature-engineering-for-forecasting)\n", 17 | "\n", 18 | "In this notebook we'll discuss the Pandas `Period` and `PeriodIndex` type to handle time span related data." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "010aee50-728d-4c24-a6f6-9282a71364c1", 24 | "metadata": {}, 25 | "source": [ 26 | "# Load example data" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "55065bd0-a3fe-4d4b-970c-ea09d514fb12", 32 | "metadata": {}, 33 | "source": [ 34 | "The air passengers dataset is the monthly totals of international airline passengers, from 1949 to 1960, in units of 1000s. \n", 35 | "\n", 36 | "For instructions on how to download, prepare, and store the dataset, refer to notebook number 5, in the folder \"01-Create-Datasets\" from this repo." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 1, 42 | "id": "7e30d3c0-baa1-4fb0-86c4-6196e46641c0", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "import pandas as pd\n", 47 | "import numpy as np" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "id": "d8d74785-9082-4711-8dad-de0d3b333ab6", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "df = pd.read_csv(\n", 58 | " \"../Datasets/example_air_passengers.csv\",\n", 59 | " parse_dates=[\"ds\"],\n", 60 | " index_col=[\"ds\"],\n", 61 | ")" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "id": "f258096a-1171-43b0-97d7-70ce59f74e00", 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "DatetimeIndex(['1949-01-01', '1949-02-01', '1949-03-01', '1949-04-01',\n", 74 | " '1949-05-01', '1949-06-01', '1949-07-01', '1949-08-01',\n", 75 | " '1949-09-01', '1949-10-01',\n", 76 | " ...\n", 77 | " '1960-03-01', '1960-04-01', '1960-05-01', '1960-06-01',\n", 78 | " '1960-07-01', '1960-08-01', '1960-09-01', '1960-10-01',\n", 79 | " '1960-11-01', '1960-12-01'],\n", 80 | " dtype='datetime64[ns]', name='ds', length=144, freq=None)" 81 | ] 82 | }, 83 | "execution_count": 3, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "df.index" 90 | ] 91 
| }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "id": "777e7fb4-3554-41e4-8a0d-8b1d8086e14f", 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "pandas._libs.tslibs.timestamps.Timestamp" 102 | ] 103 | }, 104 | "execution_count": 4, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "type(df.index[0])" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "id": "d9d5b80c-3b03-427e-a474-de54022904b9", 116 | "metadata": {}, 117 | "source": [ 118 | "The current type of our index is a `DatetimeIndex` where each element is a `Timestamp`." 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "bf881c8f-77b0-4dea-89a3-9a39046a5e64", 124 | "metadata": {}, 125 | "source": [ 126 | "# Pandas Period - what is it and when to use it." 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "7e98b0a0-27a6-4515-91fa-961a76ae0a8a", 132 | "metadata": {}, 133 | "source": [ 134 | "When working with time related information which refers to a time span (e.g., the sales of products over each month) rather than an instance in time (e.g., an event that occurs at a specific timestamp), it can be more convenient to work with a data type in Pandas called `Period`." 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "995f40b2-fab4-40a0-9b43-643da70f2b56", 140 | "metadata": {}, 141 | "source": [ 142 | "To read more about the `Period` type in Pandas see the [docs](https://pandas.pydata.org/docs/user_guide/timeseries.html), in particular the section titled \"timestamps vs. time spans\".\n", 143 | " \n", 144 | " > \"A `Period` represents a span of time (e.g., a day, a month, a quarter, etc).\"\n", 145 | " \n", 146 | " > \"Under the hood, pandas represents timestamps using instances of `Timestamp` and sequences of timestamps using instances of `DatetimeIndex`. 
For regular time spans, pandas uses `Period` objects for scalar values and `PeriodIndex` for sequences of spans.\"" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "id": "1d47571a-47c0-4611-b5a1-f90e456a72eb", 152 | "metadata": {}, 153 | "source": [ 154 | "`Period` objects can be created just as easily as `Timestamp` objects." 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 5, 160 | "id": "f979b672-257f-459e-9b55-84d4aab3760b", 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "Timestamp('2020-01-01 00:00:00')" 167 | ] 168 | }, 169 | "execution_count": 5, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "pd.Timestamp(\"2020-01-01\") # Create a timestamp representing 1st January 2020 at time 00:00:00" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 6, 181 | "id": "1ee94d2b-94a0-4f7e-9a86-5ee68430f8b2", 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "Period('2020-01', 'M')" 188 | ] 189 | }, 190 | "execution_count": 6, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "pd.Period(\"2020-01\", freq=\"M\") # Create a time period representing the month of January 2020" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "id": "bb8d418b-64b5-4c1b-a214-fe64a2e7eb5d", 202 | "metadata": {}, 203 | "source": [ 204 | "For example, our dataset index currently is a `DatetimeIndex` where there is a day (and even a time) associated with each month (e.g., 1960-12-01 00:00:00), despite the day and time being meaningless for this data set. What we're trying to represent is the sales over the time span of a given month."
205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 7, 210 | "id": "463cb570-fbab-4a1b-926e-ff911d628868", 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/html": [ 216 | "
\n", 217 | "\n", 230 | "\n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | "
y
ds
1949-01-01112
1949-02-01118
1949-03-01132
1949-04-01129
1949-05-01121
\n", 264 | "
" 265 | ], 266 | "text/plain": [ 267 | " y\n", 268 | "ds \n", 269 | "1949-01-01 112\n", 270 | "1949-02-01 118\n", 271 | "1949-03-01 132\n", 272 | "1949-04-01 129\n", 273 | "1949-05-01 121" 274 | ] 275 | }, 276 | "execution_count": 7, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "df.head()" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "id": "0c107454-d49a-49e4-adbf-2c0a97f6597d", 288 | "metadata": {}, 289 | "source": [ 290 | "We can convert the index from `datetime` to `Period` as follows:" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 8, 296 | "id": "bba662e9-82ee-47a3-ad78-88047a12a911", 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "df.index = df.index.to_period()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 9, 306 | "id": "eda8d569-a678-4d35-90a6-d288cf53986d", 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/html": [ 312 | "
\n", 313 | "\n", 326 | "\n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | "
y
ds
1949-01112
1949-02118
1949-03132
1949-04129
1949-05121
\n", 360 | "
" 361 | ], 362 | "text/plain": [ 363 | " y\n", 364 | "ds \n", 365 | "1949-01 112\n", 366 | "1949-02 118\n", 367 | "1949-03 132\n", 368 | "1949-04 129\n", 369 | "1949-05 121" 370 | ] 371 | }, 372 | "execution_count": 9, 373 | "metadata": {}, 374 | "output_type": "execute_result" 375 | } 376 | ], 377 | "source": [ 378 | "df.head()" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 10, 384 | "id": "e6d889f0-8c74-4eec-a367-da6c005b067c", 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "data": { 389 | "text/plain": [ 390 | "PeriodIndex(['1949-01', '1949-02', '1949-03', '1949-04', '1949-05', '1949-06',\n", 391 | " '1949-07', '1949-08', '1949-09', '1949-10',\n", 392 | " ...\n", 393 | " '1960-03', '1960-04', '1960-05', '1960-06', '1960-07', '1960-08',\n", 394 | " '1960-09', '1960-10', '1960-11', '1960-12'],\n", 395 | " dtype='period[M]', name='ds', length=144)" 396 | ] 397 | }, 398 | "execution_count": 10, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "df.index" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "id": "4f351009-6800-470c-845c-5f7338a8db97", 410 | "metadata": {}, 411 | "source": [ 412 | "We now have a `PeriodIndex` with monthly frequency which better represents the time series (i.e., the sales over the whole month)." 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "id": "1e2f4e38-9bee-46dc-88b1-8933bf5a0394", 418 | "metadata": {}, 419 | "source": [ 420 | "`Period` objects can make it easier to do certain calculations. 
Let's add one month to a given period:" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 11, 426 | "id": "9eae6521-ed45-4bb9-b1c4-c375c5ebdf6b", 427 | "metadata": {}, 428 | "outputs": [ 429 | { 430 | "data": { 431 | "text/plain": [ 432 | "Period('1949-01', 'M')" 433 | ] 434 | }, 435 | "execution_count": 11, 436 | "metadata": {}, 437 | "output_type": "execute_result" 438 | } 439 | ], 440 | "source": [ 441 | "df.index[0]" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 12, 447 | "id": "e86cbaa9-ce01-47cc-b5e0-b8ef6931d77e", 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "data": { 452 | "text/plain": [ 453 | "Period('1949-02', 'M')" 454 | ] 455 | }, 456 | "execution_count": 12, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | } 460 | ], 461 | "source": [ 462 | "df.index[0] + 1" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "id": "3d567f8f-a0f7-4c3d-84ba-4385f44daeb5", 468 | "metadata": {}, 469 | "source": [ 470 | "`Period` is also the preferred type when calculating the **exact** differences in dates in terms of calendar events (e.g., what is the exact integer difference between the week numbers of the two following timestamps: \"2012-01-15 10:00:00\" (week 2, year 2012) and \"2014-04-01 01:30:00\" (week 14, year 2014))" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "id": "95c877ef-d1a1-4f44-a0e1-5f7c700cc064", 476 | "metadata": {}, 477 | "source": [ 478 | "Using `Period`" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 13, 484 | "id": "fd40ddc1-00bd-4289-a018-27a918d72e68", 485 | "metadata": {}, 486 | "outputs": [ 487 | { 488 | "data": { 489 | "text/plain": [ 490 | "<-116 * Weeks: weekday=6>" 491 | ] 492 | }, 493 | "execution_count": 13, 494 | "metadata": {}, 495 | "output_type": "execute_result" 496 | } 497 | ], 498 | "source": [ 499 | "delta = pd.Period(\"2012-01-15 10:00:00\", freq=\"W\") - pd.Period(\"2014-04-01 
01:30:00\", freq=\"W\")\n", 500 | "delta" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "id": "63f43b42-ffcc-4f7e-ad2e-133bb2a056c8", 506 | "metadata": {}, 507 | "source": [ 508 | "We can get the integer using the `n` attribute:" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 14, 514 | "id": "bfa9b0fb-3e19-4465-baaa-d99d0cebb9d4", 515 | "metadata": {}, 516 | "outputs": [ 517 | { 518 | "data": { 519 | "text/plain": [ 520 | "-116" 521 | ] 522 | }, 523 | "execution_count": 14, 524 | "metadata": {}, 525 | "output_type": "execute_result" 526 | } 527 | ], 528 | "source": [ 529 | "delta.n" 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "id": "050c54fa-fc33-4b6e-97bb-e594fe148897", 535 | "metadata": {}, 536 | "source": [ 537 | "Using `Timestamp` and `timedelta` objects we only get approximate, and sometimes incorrect, answers:" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 15, 543 | "id": "be49da06-d33f-4be8-9a6b-443450331a92", 544 | "metadata": {}, 545 | "outputs": [ 546 | { 547 | "data": { 548 | "text/plain": [ 549 | "-115.23511904761905" 550 | ] 551 | }, 552 | "execution_count": 15, 553 | "metadata": {}, 554 | "output_type": "execute_result" 555 | } 556 | ], 557 | "source": [ 558 | "(pd.Timestamp(\"2012-01-15 10:00:00\") - pd.Timestamp(\"2014-04-01 01:30:00\")) / np.timedelta64(1, \"W\")" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "id": "3119df1a-2812-4133-b9e6-4e7580c8cf64", 564 | "metadata": {}, 565 | "source": [ 566 | "Whether we use `Period` or `datetime` should not change the forecasting workflow, but it will make some calculations easier depending on the time series." 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "id": "a3bf4812-70e1-4a36-9658-90d164395bc8", 572 | "metadata": {}, 573 | "source": [ 574 | "In general, if your data represents a timespan then `Period` (e.g., sales over one month) can make handling the data more convenient. 
If your data represents events that occurred at a timepoint then `datetime` or `Timestamp` is preferred." 575 | ] 576 | } 577 | ], 578 | "metadata": { 579 | "kernelspec": { 580 | "display_name": "Python 3 (ipykernel)", 581 | "language": "python", 582 | "name": "python3" 583 | }, 584 | "language_info": { 585 | "codemirror_mode": { 586 | "name": "ipython", 587 | "version": 3 588 | }, 589 | "file_extension": ".py", 590 | "mimetype": "text/x-python", 591 | "name": "python", 592 | "nbconvert_exporter": "python", 593 | "pygments_lexer": "ipython3", 594 | "version": "3.10.5" 595 | }, 596 | "toc": { 597 | "base_numbering": 1, 598 | "nav_menu": {}, 599 | "number_sections": true, 600 | "sideBar": true, 601 | "skip_h1_title": false, 602 | "title_cell": "Table of Contents", 603 | "title_sidebar": "Contents", 604 | "toc_cell": false, 605 | "toc_position": {}, 606 | "toc_section_display": true, 607 | "toc_window_display": false 608 | } 609 | }, 610 | "nbformat": 4, 611 | "nbformat_minor": 5 612 | } 613 | -------------------------------------------------------------------------------- /Datasets/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/Datasets/.gitkeep -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021-2024, Kishan Manani, Soledad Galli 4 | Feature Engineering for Time Series - Online Course: 5 | https://www.trainindata.com/p/feature-engineering-for-forecasting 6 | 7 | 8 | Redistribution and use in source and binary forms, with or without 9 | modification, are permitted provided that the following conditions are met: 10 | 11 | 1. 
Redistributions of source code must retain the above copyright notice, this 12 | list of conditions and the following disclaimer. 13 | 14 | 2. Redistributions in binary form must reproduce the above copyright notice, 15 | this list of conditions and the following disclaimer in the documentation 16 | and/or other materials provided with the distribution. 17 | 18 | 3. Neither the name of the copyright holder nor the names of its 19 | contributors may be used to endorse or promote products derived from 20 | this software without specific prior written permission. 21 | 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 27 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 29 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 30 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Feature Engineering for Time Series Forecasting - Code Repository 2 | 3 | [](https://www.trainindata.com/p/feature-engineering-for-forecasting) 4 | 5 | 6 | ![PythonVersion](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10-success) 7 | [![License https://github.com/trainindata/feature-engineering-for-time-series-forecasting/blob/master/LICENSE](https://img.shields.io/badge/license-BSD-success.svg)](https://github.com/trainindata/feature-engineering-for-time-series-forecasting/blob/master/LICENSE) 8 | [![Sponsorship https://www.trainindata.com/](https://img.shields.io/badge/Powered%20By-TrainInData-orange.svg)](https://www.trainindata.com/) 9 | 10 | Published October, 2022 11 | 12 | Actively maintained. 13 | 14 | ## Links 15 | 16 | - [Online Course](https://www.trainindata.com/p/feature-engineering-for-forecasting) 17 | 18 | 19 | ## Table of Contents 20 | 21 | 1. **Tabularizing time series data** 22 | 1. Features from the target 23 | 2. Features from exogenous variables 24 | 3. Single step forecasting 25 | 26 | 2. **Challenges in feature engineering for time series** 27 | 1. Train-test split 28 | 2. Pipelines 29 | 3. Multistep forecasting 30 | 4. Direct forecasting 31 | 5. Recursive forecasting 32 | 33 | 3. **Time series decomposition** 34 | 1. Components of a time series: trend and seasonality 35 | 2. Multiplicative and additive models 36 | 3. Log transform and Box-Cox 37 | 4. Moving averages 38 | 5. LOWESS, STL, and multiseasonal time series decomposition 39 | 40 | 4. **Missing data imputation** 41 | 1. Forward and backward filling 42 | 2. Linear and spline interpolation 43 | 3. Seasonal decomposition and interpolation 44 | 45 | 5. **Outliers** 46 | 1. Rolling statistics for outlier detection 47 | 2. LOWESS for outlier detection 48 | 3. 
STL for outlier detection 49 | 50 | 6. **Lag features** 51 | 1. Autoregressive processes 52 | 2. Lag plots 53 | 3. ACF, PACF, CCF 54 | 4. Seasonal lags 55 | 5. Creating lags with open-source 56 | 57 | 7. **Window features** 58 | 1. Rolling windows 59 | 2. Expanding windows 60 | 3. Exponentially weighted windows 61 | 4. Creating window features with open-source 62 | 63 | 8. **Trend features** 64 | 1. Using time to model linear trend 65 | 2. Polynomial features of time to model non-linear trend 66 | 3. Changepoints & piecewise linear trends to model non-linear trend 67 | 4. Forecasting time series with trend using tree-based models 68 | 5. Creating trend features with open-source 69 | 70 | 9. **Seasonality features** 71 | 1. Seasonal lags 72 | 2. Seasonal dummies 73 | 3. Seasonal decomposition methods 74 | 4. Fourier terms 75 | 5. Creating seasonality features with open-source 76 | 77 | 10. **Datetime features** 78 | 1. Extracting features from date and time 79 | 2. Periodic features 80 | 3. Calendar events 81 | 4. Creating datetime features with open-source 82 | 83 | 11. **Categorical Features** 84 | 1. One hot encoding 85 | 2. Target encoding 86 | 3.
Rolling entropy and rolling majority 87 | 88 | 89 | - [Online Course](https://www.trainindata.com/p/feature-engineering-for-forecasting) 90 | -------------------------------------------------------------------------------- /assignments/02-tabularizing-time-series/assignment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2d1a73ab", 6 | "metadata": {}, 7 | "source": [ 8 | "# Tabularize time series\n", 9 | "\n", 10 | "In this assignment, your task is to convert **time series data** into a **tabular data set**.\n", 11 | "\n", 12 | "You need to create suitable input features from a time series containing weekly sales to be able to forecast sales for the next week.\n", 13 | "\n", 14 | "To prepare the dataset for this assignment, please follow the guidelines in the notebook `02-create-online-retail-II-datasets.ipynb` in the `01-Create-Datasets` folder." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "id": "f53976d3", 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/html": [ 26 | "
\n", 27 | "\n", 40 | "\n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | "
sales
week
2009-12-06213000.35
2009-12-13195810.04
2009-12-20182396.74
2009-12-2722007.77
2010-01-030.00
\n", 74 | "
" 75 | ], 76 | "text/plain": [ 77 | " sales\n", 78 | "week \n", 79 | "2009-12-06 213000.35\n", 80 | "2009-12-13 195810.04\n", 81 | "2009-12-20 182396.74\n", 82 | "2009-12-27 22007.77\n", 83 | "2010-01-03 0.00" 84 | ] 85 | }, 86 | "execution_count": 2, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "# load weekly sales dataset\n", 93 | "\n", 94 | "filename = \"../../Datasets/online_retail_dataset.csv\"\n", 95 | "\n", 96 | "df = pd.read_csv(\n", 97 | " filename,\n", 98 | " usecols=[\"week\", \"United Kingdom\"],\n", 99 | " parse_dates=[\"week\"],\n", 100 | " index_col=[\"week\"],\n", 101 | ")\n", 102 | "\n", 103 | "df.columns = ['sales']\n", 104 | "\n", 105 | "df.head()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "cdfe9415", 111 | "metadata": {}, 112 | "source": [ 113 | "# Data analysis\n", 114 | "\n", 115 | "First, explore the time series.\n", 116 | "\n", 117 | "## Plot time series" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "id": "6ceabd79", 123 | "metadata": {}, 124 | "source": [ 125 | "## Missing data\n", 126 | "\n", 127 | "Check if there are missing values in the time series." 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "9c484bca", 133 | "metadata": {}, 134 | "source": [ 135 | "## Missing timestamps\n", 136 | "\n", 137 | "Check if there are missing timestamps in the index." 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "id": "444ca303", 143 | "metadata": {}, 144 | "source": [ 145 | "## Seasonality\n", 146 | "\n", 147 | "Does the time series show any obvious seasonal pattern?" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "id": "e81565cb", 153 | "metadata": {}, 154 | "source": [ 155 | "# Feature engineering\n", 156 | "\n", 157 | "Now, let's begin to tabularize the data." 
158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "id": "20ae8079", 163 | "metadata": {}, 164 | "source": [ 165 | "## Split data\n", 166 | "\n", 167 | "Separate the data into training and testing sets, leaving the data after the last week of September to evaluate the forecasts, that is, in the testing set." 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "id": "820803d5", 173 | "metadata": {}, 174 | "source": [ 175 | "## Naive forecast\n", 176 | "\n", 177 | "Predict sales in the next week (t) as the value of sales in the previous week (t-1)." 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "id": "4058260e", 183 | "metadata": {}, 184 | "source": [ 185 | "## Machine Learning" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "4957673a", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "fsml", 200 | "language": "python", 201 | "name": "fsml" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.10.5" 214 | }, 215 | "toc": { 216 | "base_numbering": 1, 217 | "nav_menu": {}, 218 | "number_sections": true, 219 | "sideBar": true, 220 | "skip_h1_title": false, 221 | "title_cell": "Table of Contents", 222 | "title_sidebar": "Contents", 223 | "toc_cell": false, 224 | "toc_position": {}, 225 | "toc_section_display": true, 226 | "toc_window_display": true 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 5 231 | } 232 | -------------------------------------------------------------------------------- /images/FETSF_banner.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/FETSF_banner.png -------------------------------------------------------------------------------- /images/forecasting_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/forecasting_framework.png -------------------------------------------------------------------------------- /images/lag_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/lag_features.png -------------------------------------------------------------------------------- /images/trainindata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/trainindata.png -------------------------------------------------------------------------------- /images/window_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/window_features.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openpyxl>=3.0.6 2 | xlrd>=2.0.1 3 | 4 | # Numerical computing libraries 5 | pandas>=1.4.0 6 | numpy>=1.18.0 7 | scikit-learn>=1.0.0 8 | scipy>=1.6.0 9 | statsmodels>=0.12.1 10 | 11 | # plotting libraries 12 | matplotlib>=3.3.4 13 | 
seaborn>=0.11.1 14 | 15 | # jupyter notebook 16 | jupyterlab>=3.0.6 17 | ipykernel>=5.5.5 18 | 19 | # feature engineering libraries 20 | feature-engine>=1.3.0 21 | featuretools>=1.2.0 --------------------------------------------------------------------------------