├── .gitignore
├── 01-Create-Datasets
├── 01-create-retail-datasets.ipynb
├── 02-create-online-retail-II-datasets.ipynb
├── 03-create-air-quality-dataset.ipynb
├── 04-create-air-passengers-dataset.ipynb
└── 05-create-electricity-demand-dataset.ipynb
├── 02-Tabularizing-Time-Series
├── 01-data-analysis-air-pollutants.ipynb
├── 02-feature-engineering-air-pollutants.ipynb
└── 03-forecasting-air-pollutants.ipynb
├── 03-Challenges-in-Time-Series-Forecasting
├── 01-Refactoring-feature-engineering.ipynb
├── 02-forecasting-one-step-ahead.ipynb
├── 03-multistep-forecasting-direct.ipynb
├── 04-multistep-forecasting-recursive.ipynb
└── 05-multistep-forecasting-recursive-continued.ipynb
├── 04-Time-Series-Decomposition
├── 01-box-cox-transform.ipynb
├── 02-compute-moving-averages.ipynb
├── 03-classical-decomposition-to-compute-trend-and-seasonality.ipynb
├── 04-LOWESS-to-compute-trend.ipynb
├── 05-STL-to-compute-trend-and-seasonality.ipynb
└── 06-MSTL-decomposition.ipynb
├── 05-Missing-Data
├── 01-impute-missing-data-using-forward-fill-backward-fill.ipynb
├── 02-impute-missing-data-using-linear-and-spline-interpolation.ipynb
└── 03-impute-missing-data-using-STL-decomposition-and-interpolation.ipynb
├── 06-Outliers
├── 01-detect-outliers-using-rolling-statistics.ipynb
├── 02-detect-outliers-using-residuals-LOWESS.ipynb
├── 03-detect-outliers-using-residuals-STL.ipynb
└── 04-modelling-outliers-with-dummy-variables.ipynb
├── 07-Lag-Features
├── 01-computing-lags.ipynb
├── 02-lag-plots.ipynb
├── 03-autocorrelation-function.ipynb
├── 04-partial-autocorrelation-function.ipynb
├── 05-cross-correlation-function.ipynb
├── 06-air-pollution-example-domain-knowledge.ipynb
├── 07-air-pollution-example-modelling.ipynb
└── 08-air-pollution-example-correlation.ipynb
├── 08-Window-Features
├── 01-rolling-window-features.ipynb
├── 02-expanding-window-features.ipynb
├── 03-weighted-rolling-window-features.ipynb
├── 04-exponential-weights.ipynb
└── 05-window-features-with-feature-selection.ipynb
├── 09-Trend-Features
├── 01-time-linear-trend.ipynb
├── 02-time-non-linear-trend.ipynb
├── 03-recursive-forecasting-example.ipynb
├── 04-piecewise-linear-trend-and-changepoints.ipynb
├── 05-tree-based-models-and-trend.ipynb
├── 06-linear-trees-lightgbm.ipynb
└── images
│ ├── forecast_with_just_time.png
│ └── recursive_forecasting
│ ├── Slide1.png
│ ├── Slide2.png
│ ├── Slide3.png
│ └── Slide4.png
├── 10-Seasonality-Features
├── 01-seasonal-lags.ipynb
├── 02-datetime-features-seasonality.ipynb
├── 03-seasonal-dummies.ipynb
└── 04-fourier-features.ipynb
├── 11-Time-Features
├── 01-Extracting-date-related-features.ipynb
├── 02-Extracting-time-related-features.ipynb
├── 03-datetime-with-Feature-engine.ipynb
├── 04-periodic-features.ipynb
├── 05-highlighting-holidays-sandbox.ipynb
└── 05-highlighting-holidays.ipynb
├── 12-Categorical-Encoding
├── 1-one-hot-encoding.ipynb
├── 2-ordinal-encoding.ipynb
├── 3-mean-encoding-simple.ipynb
└── 4-mean-encoding-expanding-window.ipynb
├── Appendix
└── 00-pandas-period.ipynb
├── Datasets
└── .gitkeep
├── LICENSE
├── README.md
├── assignments
└── 02-tabularizing-time-series
│ ├── assignment.ipynb
│ └── solution.ipynb
├── images
├── FETSF_banner.png
├── forecasting_framework.png
├── lag_features.png
├── trainindata.png
└── window_features.png
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | # Jupyter Notebook
2 | .ipynb_checkpoints
3 |
4 | # datasets
5 | *.csv
6 | *.zip
7 | *.xlsx
8 |
9 | # folders
10 |
11 |
--------------------------------------------------------------------------------
/01-Create-Datasets/01-create-retail-datasets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "c81efda5",
6 | "metadata": {},
7 | "source": [
8 | "# Retail sales\n",
9 | "\n",
10 | "In this notebook we will prepare and store the retail sales dataset found [here](https://raw.githubusercontent.com/facebook/prophet/master/examples/example_retail_sales.csv).\n",
11 | "\n",
12 | "**Description of data:**\n",
13 | "\n",
14 | "The time series is collected between January 1992 and May 2016. It consists of a single series of monthly values representing sales volumes. "
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "id": "888749e6",
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import pandas as pd\n",
25 | "import numpy as np\n",
26 | "import matplotlib.pyplot as plt\n",
27 | "\n",
28 | "from statsmodels.tsa.seasonal import STL"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "id": "25cc2a1f",
34 | "metadata": {},
35 | "source": [
36 | "# Get the dataset"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "id": "73ac5d57",
42 | "metadata": {},
43 | "source": [
44 | "The dataset can be obtained from this [link](https://raw.githubusercontent.com/facebook/prophet/master/examples/example_retail_sales.csv). It will open a raw file in GitHub. A simple way of obtaining the data is to copy and paste the values from your browser into a text editor of your choice. \n",
45 | "Save it in the Datasets directory, which is found at the root of this project, with the filename `example_retail_sales.csv`. \n",
46 | "\n",
47 | "Alternatively, download it using Pandas by running:\n",
48 | "\n"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 2,
54 | "id": "15c6a149",
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "url = \"https://raw.githubusercontent.com/facebook/prophet/master/examples/example_retail_sales.csv\"\n",
59 | "df = pd.read_csv(url)\n",
60 | "df.to_csv(\"../Datasets/example_retail_sales.csv\", index=False)"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "id": "5feac9ec",
66 | "metadata": {},
67 | "source": [
68 | "Now follow the rest of the notebook."
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 3,
74 | "id": "707768c5",
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "df = pd.read_csv(\n",
79 | " \"../Datasets/example_retail_sales.csv\",\n",
80 | " parse_dates=[\"ds\"],\n",
81 | " index_col=[\"ds\"],\n",
82 | " nrows=160,\n",
83 | ")"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "id": "three-blind",
89 | "metadata": {},
90 | "source": [
91 | "# Create dataset with missing data"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 4,
97 | "id": "112f9b90",
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "# copy dataframe\n",
102 | "df_with_missing_data = df.copy()\n",
103 | "\n",
104 | "# Insert missing data into dataframe\n",
105 | "df_with_missing_data.iloc[10:11] = np.nan\n",
106 | "df_with_missing_data.iloc[25:28] = np.nan\n",
107 | "df_with_missing_data.iloc[40:45] = np.nan\n",
108 | "df_with_missing_data.iloc[70:94] = np.nan"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 5,
114 | "id": "45acce8b",
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "# Save dataset in Datasets directory\n",
119 | "df_with_missing_data.to_csv(\"../Datasets/example_retail_sales_with_missing_data.csv\")"
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "id": "80293d1b",
125 | "metadata": {},
126 | "source": [
127 | "# Create dataset with outliers"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 6,
133 | "id": "b78e8d57",
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "df_with_outliers = df.copy()"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 7,
143 | "id": "57bf7198",
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "# Insert outliers into dataframe\n",
148 | "outlier_idx = [20, 33, 66, 150]\n",
149 | "df_with_outliers.iloc[outlier_idx] = df_with_outliers.iloc[outlier_idx] * 1.7"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 8,
155 | "id": "ce560e64",
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "# Save dataset in Datasets directory\n",
160 | "df_with_outliers.to_csv(\"../Datasets/example_retail_sales_with_outliers.csv\")"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "id": "41606a6b",
167 | "metadata": {},
168 | "outputs": [],
169 | "source": []
170 | }
171 | ],
172 | "metadata": {
173 | "kernelspec": {
174 | "display_name": "fets",
175 | "language": "python",
176 | "name": "fets"
177 | },
178 | "language_info": {
179 | "codemirror_mode": {
180 | "name": "ipython",
181 | "version": 3
182 | },
183 | "file_extension": ".py",
184 | "mimetype": "text/x-python",
185 | "name": "python",
186 | "nbconvert_exporter": "python",
187 | "pygments_lexer": "ipython3",
188 | "version": "3.8.2"
189 | },
190 | "toc": {
191 | "base_numbering": 1,
192 | "nav_menu": {},
193 | "number_sections": true,
194 | "sideBar": true,
195 | "skip_h1_title": false,
196 | "title_cell": "Table of Contents",
197 | "title_sidebar": "Contents",
198 | "toc_cell": false,
199 | "toc_position": {},
200 | "toc_section_display": true,
201 | "toc_window_display": false
202 | }
203 | },
204 | "nbformat": 4,
205 | "nbformat_minor": 5
206 | }
207 |
--------------------------------------------------------------------------------
/01-Create-Datasets/02-create-online-retail-II-datasets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Online Retail II Data Set\n",
8 | "\n",
9 | "In this notebook we will prepare and store the Online Retail II Data Set stored on the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Online+Retail+II)\n",
10 | "\n",
11 | "\n",
12 | "**Citation:**\n",
13 | "\n",
14 | "Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.\n",
15 | "\n",
16 | "## Download the data\n",
17 | "\n",
18 | "- Navigate to the [data folder](https://archive.ics.uci.edu/dataset/502/online+retail+ii).\n",
19 | "- Download the file called **online_retail_II.xlsx**.\n",
20 | "- Save the Excel file into the **Datasets** folder at the root of this repository."
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 1,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "import pandas as pd"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "# Load data"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# If you downloaded and stored the file as explained\n",
46 | "# above, it should be located here:\n",
47 | "\n",
48 | "file = \"../Datasets/online_retail_II.xlsx\""
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 3,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "# The data is provided as two sheets in a single Excel file.\n",
58 | "# Each sheet contains a different time period.\n",
59 | "# Load both and join into a single dataframe.\n",
60 | "\n",
61 | "df_1 = pd.read_excel(file, sheet_name=\"Year 2009-2010\")\n",
62 | "df_2 = pd.read_excel(file, sheet_name=\"Year 2010-2011\")\n",
63 | "\n",
64 | "df = pd.concat([df_1, df_2])"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 4,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/html": [
75 | "
\n",
76 | "\n",
89 | "
\n",
90 | " \n",
91 | " \n",
92 | " \n",
93 | " Invoice \n",
94 | " StockCode \n",
95 | " Description \n",
96 | " Quantity \n",
97 | " InvoiceDate \n",
98 | " Price \n",
99 | " Customer ID \n",
100 | " Country \n",
101 | " \n",
102 | " \n",
103 | " \n",
104 | " \n",
105 | " 0 \n",
106 | " 489434 \n",
107 | " 85048 \n",
108 | " 15CM CHRISTMAS GLASS BALL 20 LIGHTS \n",
109 | " 12 \n",
110 | " 2009-12-01 07:45:00 \n",
111 | " 6.95 \n",
112 | " 13085.0 \n",
113 | " United Kingdom \n",
114 | " \n",
115 | " \n",
116 | " 1 \n",
117 | " 489434 \n",
118 | " 79323P \n",
119 | " PINK CHERRY LIGHTS \n",
120 | " 12 \n",
121 | " 2009-12-01 07:45:00 \n",
122 | " 6.75 \n",
123 | " 13085.0 \n",
124 | " United Kingdom \n",
125 | " \n",
126 | " \n",
127 | " 2 \n",
128 | " 489434 \n",
129 | " 79323W \n",
130 | " WHITE CHERRY LIGHTS \n",
131 | " 12 \n",
132 | " 2009-12-01 07:45:00 \n",
133 | " 6.75 \n",
134 | " 13085.0 \n",
135 | " United Kingdom \n",
136 | " \n",
137 | " \n",
138 | " 3 \n",
139 | " 489434 \n",
140 | " 22041 \n",
141 | " RECORD FRAME 7\" SINGLE SIZE \n",
142 | " 48 \n",
143 | " 2009-12-01 07:45:00 \n",
144 | " 2.10 \n",
145 | " 13085.0 \n",
146 | " United Kingdom \n",
147 | " \n",
148 | " \n",
149 | " 4 \n",
150 | " 489434 \n",
151 | " 21232 \n",
152 | " STRAWBERRY CERAMIC TRINKET BOX \n",
153 | " 24 \n",
154 | " 2009-12-01 07:45:00 \n",
155 | " 1.25 \n",
156 | " 13085.0 \n",
157 | " United Kingdom \n",
158 | " \n",
159 | " \n",
160 | "
\n",
161 | "
"
162 | ],
163 | "text/plain": [
164 | " Invoice StockCode Description Quantity \\\n",
165 | "0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 \n",
166 | "1 489434 79323P PINK CHERRY LIGHTS 12 \n",
167 | "2 489434 79323W WHITE CHERRY LIGHTS 12 \n",
168 | "3 489434 22041 RECORD FRAME 7\" SINGLE SIZE 48 \n",
169 | "4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 \n",
170 | "\n",
171 | " InvoiceDate Price Customer ID Country \n",
172 | "0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom \n",
173 | "1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n",
174 | "2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n",
175 | "3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom \n",
176 | "4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom "
177 | ]
178 | },
179 | "execution_count": 4,
180 | "metadata": {},
181 | "output_type": "execute_result"
182 | }
183 | ],
184 | "source": [
185 | "# Inspect dataframe\n",
186 | "\n",
187 | "df.head()"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 5,
193 | "metadata": {},
194 | "outputs": [],
195 | "source": [
196 | "# Rename columns\n",
197 | "\n",
198 | "df.columns = [\n",
199 | " \"invoice\",\n",
200 | " \"stock_code\",\n",
201 | " \"description\",\n",
202 | " \"quantity\",\n",
203 | " \"invoice_date\",\n",
204 | " \"price\",\n",
205 | " \"customer_id\",\n",
206 | " \"country\",\n",
207 | "]"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "# Process data"
215 | ]
216 | },
217 | {
218 | "cell_type": "markdown",
219 | "metadata": {},
220 | "source": [
221 | "Remove null customer ids."
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 6,
227 | "metadata": {},
228 | "outputs": [],
229 | "source": [
230 | "mask = ~df[\"customer_id\"].isnull()\n",
231 | "df = df[mask]"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {},
237 | "source": [
238 | "Create a flag for when an order is cancelled. Cancelled orders contain \n",
239 | "the letter `C` at the start of the invoice."
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 7,
245 | "metadata": {},
246 | "outputs": [],
247 | "source": [
248 | "df[\"is_cancelled\"] = df[\"invoice\"].apply(lambda x: str(x)[0] == \"C\")"
249 | ]
250 | },
251 | {
252 | "cell_type": "markdown",
253 | "metadata": {},
254 | "source": [
255 | "Remove transactions that have negative quantities and are not cancelled orders."
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 8,
261 | "metadata": {},
262 | "outputs": [],
263 | "source": [
264 | "mask = ~(~df[\"is_cancelled\"] & (df[\"quantity\"] < 0))\n",
265 | "\n",
266 | "df = df[mask]"
267 | ]
268 | },
269 | {
270 | "cell_type": "markdown",
271 | "metadata": {},
272 | "source": [
273 | "Compute revenue."
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 9,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "df[\"revenue\"] = df[\"quantity\"] * df[\"price\"]"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "metadata": {},
288 | "source": [
289 | "To compute gross revenue and quantity sold we filter out cancelled orders.\n",
290 | "\n",
291 | "After this, we resample the data at a weekly level."
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 10,
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "mask = ~df[\"is_cancelled\"]\n",
301 | "\n",
302 | "# If running this raises an UnsupportedFunctionCall error\n",
303 | "# try upgrading your version of pandas.\n",
304 | "df_gross = (\n",
305 | " df.loc[mask, [\"invoice_date\", \"quantity\", \"revenue\", \"country\"]]\n",
306 | " .groupby(\"country\")\n",
307 | " .resample(\"W\", on=\"invoice_date\")\n",
308 | " .sum(numeric_only=True)\n",
309 | ")"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 11,
315 | "metadata": {},
316 | "outputs": [],
317 | "source": [
318 | "df_gross.index.rename([\"country\", \"week\"], inplace=True)"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {},
324 | "source": [
325 | "# Save data\n",
326 | "\n",
327 | "We will save 3 different versions of the preprocessed dataset for different demos.\n",
328 | "\n",
329 | "## Weekly sampled"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 12,
335 | "metadata": {},
336 | "outputs": [],
337 | "source": [
338 | "df_gross_countries = df_gross.reset_index(level=\"country\")"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 13,
344 | "metadata": {},
345 | "outputs": [],
346 | "source": [
347 | "countries = [\n",
348 | " 'United Kingdom',\n",
349 | " 'Belgium',\n",
350 | " \"EIRE\",\n",
351 | " 'Germany',\n",
352 | " \"France\",\n",
353 | " 'Spain',\n",
354 | "]"
355 | ]
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": 14,
360 | "metadata": {},
361 | "outputs": [],
362 | "source": [
363 | "df_gross_countries[df_gross_countries[\"country\"].isin(countries)].to_csv(\n",
364 | " \"../Datasets/online_retail_dataset_countries.csv\",\n",
365 | " index=True,\n",
366 | ")"
367 | ]
368 | },
369 | {
370 | "cell_type": "markdown",
371 | "metadata": {},
372 | "source": [
373 | "## Unstacked countries"
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": 15,
379 | "metadata": {},
380 | "outputs": [],
381 | "source": [
382 | "y = df_gross.unstack(\"country\")[\"revenue\"]"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": 16,
388 | "metadata": {},
389 | "outputs": [
390 | {
391 | "data": {
392 | "text/html": [
393 | "\n",
394 | "\n",
407 | "
\n",
408 | " \n",
409 | " \n",
410 | " country \n",
411 | " Australia \n",
412 | " Austria \n",
413 | " Bahrain \n",
414 | " Belgium \n",
415 | " Brazil \n",
416 | " Canada \n",
417 | " Channel Islands \n",
418 | " Cyprus \n",
419 | " Czech Republic \n",
420 | " Denmark \n",
421 | " ... \n",
422 | " Singapore \n",
423 | " Spain \n",
424 | " Sweden \n",
425 | " Switzerland \n",
426 | " Thailand \n",
427 | " USA \n",
428 | " United Arab Emirates \n",
429 | " United Kingdom \n",
430 | " Unspecified \n",
431 | " West Indies \n",
432 | " \n",
433 | " \n",
434 | " week \n",
435 | " \n",
436 | " \n",
437 | " \n",
438 | " \n",
439 | " \n",
440 | " \n",
441 | " \n",
442 | " \n",
443 | " \n",
444 | " \n",
445 | " \n",
446 | " \n",
447 | " \n",
448 | " \n",
449 | " \n",
450 | " \n",
451 | " \n",
452 | " \n",
453 | " \n",
454 | " \n",
455 | " \n",
456 | " \n",
457 | " \n",
458 | " \n",
459 | " \n",
460 | " 2009-12-06 \n",
461 | " 196.1 \n",
462 | " NaN \n",
463 | " NaN \n",
464 | " 439.1 \n",
465 | " NaN \n",
466 | " NaN \n",
467 | " 989.18 \n",
468 | " 760.69 \n",
469 | " NaN \n",
470 | " 1008.00 \n",
471 | " ... \n",
472 | " NaN \n",
473 | " 435.88 \n",
474 | " NaN \n",
475 | " NaN \n",
476 | " NaN \n",
477 | " 141.0 \n",
478 | " NaN \n",
479 | " 213000.35 \n",
480 | " NaN \n",
481 | " NaN \n",
482 | " \n",
483 | " \n",
484 | " 2009-12-13 \n",
485 | " 0.0 \n",
486 | " 1429.83 \n",
487 | " NaN \n",
488 | " 8.5 \n",
489 | " NaN \n",
490 | " NaN \n",
491 | " 0.00 \n",
492 | " 0.00 \n",
493 | " NaN \n",
494 | " 0.00 \n",
495 | " ... \n",
496 | " NaN \n",
497 | " 412.60 \n",
498 | " 285.3 \n",
499 | " NaN \n",
500 | " NaN \n",
501 | " 0.0 \n",
502 | " 517.7 \n",
503 | " 195810.04 \n",
504 | " NaN \n",
505 | " NaN \n",
506 | " \n",
507 | " \n",
508 | " 2009-12-20 \n",
509 | " 75.0 \n",
510 | " 0.00 \n",
511 | " NaN \n",
512 | " 0.0 \n",
513 | " NaN \n",
514 | " NaN \n",
515 | " 0.00 \n",
516 | " 2796.29 \n",
517 | " NaN \n",
518 | " 429.66 \n",
519 | " ... \n",
520 | " NaN \n",
521 | " 1952.64 \n",
522 | " 0.0 \n",
523 | " 589.4 \n",
524 | " NaN \n",
525 | " 0.0 \n",
526 | " 0.0 \n",
527 | " 182396.74 \n",
528 | " NaN \n",
529 | " NaN \n",
530 | " \n",
531 | " \n",
532 | " 2009-12-27 \n",
533 | " 0.0 \n",
534 | " 568.51 \n",
535 | " NaN \n",
536 | " 0.0 \n",
537 | " NaN \n",
538 | " NaN \n",
539 | " 0.00 \n",
540 | " 0.00 \n",
541 | " NaN \n",
542 | " 0.00 \n",
543 | " ... \n",
544 | " NaN \n",
545 | " 5149.06 \n",
546 | " 0.0 \n",
547 | " 0.0 \n",
548 | " NaN \n",
549 | " 0.0 \n",
550 | " 0.0 \n",
551 | " 22007.77 \n",
552 | " NaN \n",
553 | " NaN \n",
554 | " \n",
555 | " \n",
556 | " 2010-01-03 \n",
557 | " 0.0 \n",
558 | " 0.00 \n",
559 | " NaN \n",
560 | " 0.0 \n",
561 | " NaN \n",
562 | " NaN \n",
563 | " 0.00 \n",
564 | " 0.00 \n",
565 | " NaN \n",
566 | " 0.00 \n",
567 | " ... \n",
568 | " NaN \n",
569 | " 0.00 \n",
570 | " 0.0 \n",
571 | " 0.0 \n",
572 | " NaN \n",
573 | " 0.0 \n",
574 | " 0.0 \n",
575 | " 0.00 \n",
576 | " NaN \n",
577 | " NaN \n",
578 | " \n",
579 | " \n",
580 | "
\n",
581 | "
5 rows × 41 columns
\n",
582 | "
"
583 | ],
584 | "text/plain": [
585 | "country Australia Austria Bahrain Belgium Brazil Canada \\\n",
586 | "week \n",
587 | "2009-12-06 196.1 NaN NaN 439.1 NaN NaN \n",
588 | "2009-12-13 0.0 1429.83 NaN 8.5 NaN NaN \n",
589 | "2009-12-20 75.0 0.00 NaN 0.0 NaN NaN \n",
590 | "2009-12-27 0.0 568.51 NaN 0.0 NaN NaN \n",
591 | "2010-01-03 0.0 0.00 NaN 0.0 NaN NaN \n",
592 | "\n",
593 | "country Channel Islands Cyprus Czech Republic Denmark ... Singapore \\\n",
594 | "week ... \n",
595 | "2009-12-06 989.18 760.69 NaN 1008.00 ... NaN \n",
596 | "2009-12-13 0.00 0.00 NaN 0.00 ... NaN \n",
597 | "2009-12-20 0.00 2796.29 NaN 429.66 ... NaN \n",
598 | "2009-12-27 0.00 0.00 NaN 0.00 ... NaN \n",
599 | "2010-01-03 0.00 0.00 NaN 0.00 ... NaN \n",
600 | "\n",
601 | "country Spain Sweden Switzerland Thailand USA \\\n",
602 | "week \n",
603 | "2009-12-06 435.88 NaN NaN NaN 141.0 \n",
604 | "2009-12-13 412.60 285.3 NaN NaN 0.0 \n",
605 | "2009-12-20 1952.64 0.0 589.4 NaN 0.0 \n",
606 | "2009-12-27 5149.06 0.0 0.0 NaN 0.0 \n",
607 | "2010-01-03 0.00 0.0 0.0 NaN 0.0 \n",
608 | "\n",
609 | "country United Arab Emirates United Kingdom Unspecified West Indies \n",
610 | "week \n",
611 | "2009-12-06 NaN 213000.35 NaN NaN \n",
612 | "2009-12-13 517.7 195810.04 NaN NaN \n",
613 | "2009-12-20 0.0 182396.74 NaN NaN \n",
614 | "2009-12-27 0.0 22007.77 NaN NaN \n",
615 | "2010-01-03 0.0 0.00 NaN NaN \n",
616 | "\n",
617 | "[5 rows x 41 columns]"
618 | ]
619 | },
620 | "execution_count": 16,
621 | "metadata": {},
622 | "output_type": "execute_result"
623 | }
624 | ],
625 | "source": [
626 | "y.head()"
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": 17,
632 | "metadata": {},
633 | "outputs": [],
634 | "source": [
635 | "y.to_csv(\"../Datasets/online_retail_dataset.csv\")"
636 | ]
637 | },
638 | {
639 | "cell_type": "markdown",
640 | "metadata": {},
641 | "source": [
642 | "## Raw data"
643 | ]
644 | },
645 | {
646 | "cell_type": "code",
647 | "execution_count": 18,
648 | "metadata": {},
649 | "outputs": [],
650 | "source": [
651 | "# columns needed for demo\n",
652 | "cols = [\"invoice_date\", \"description\", \"revenue\"]\n",
653 | "\n",
654 | "# just UK\n",
655 | "df = df[df[\"country\"] == \"United Kingdom\"]\n",
656 | "\n",
657 | "# save\n",
658 | "df[cols].to_csv(\"../Datasets/online_retail_dataset_all.csv\", index=False)"
659 | ]
660 | }
661 | ],
662 | "metadata": {
663 | "kernelspec": {
664 | "display_name": "Python 3 (ipykernel)",
665 | "language": "python",
666 | "name": "python3"
667 | },
668 | "language_info": {
669 | "codemirror_mode": {
670 | "name": "ipython",
671 | "version": 3
672 | },
673 | "file_extension": ".py",
674 | "mimetype": "text/x-python",
675 | "name": "python",
676 | "nbconvert_exporter": "python",
677 | "pygments_lexer": "ipython3",
678 | "version": "3.8.7"
679 | },
680 | "toc": {
681 | "base_numbering": 1,
682 | "nav_menu": {},
683 | "number_sections": true,
684 | "sideBar": true,
685 | "skip_h1_title": false,
686 | "title_cell": "Table of Contents",
687 | "title_sidebar": "Contents",
688 | "toc_cell": false,
689 | "toc_position": {},
690 | "toc_section_display": true,
691 | "toc_window_display": false
692 | }
693 | },
694 | "nbformat": 4,
695 | "nbformat_minor": 4
696 | }
697 |
--------------------------------------------------------------------------------
/01-Create-Datasets/03-create-air-quality-dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Air Quality Data Set\n",
8 | "\n",
9 | "In this notebook we will prepare and store the Air Quality Data Set from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Air+Quality)\n",
10 | "\n",
11 | "**Citation:**\n",
12 | "\n",
13 | "Dua, D. and Graff, C. (2019). [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml). Irvine, CA: University of California, School of Information and Computer Science.\n",
14 | "\n",
15 | "\n",
16 | "## Download and unzip the data\n",
17 | "\n",
18 | "- Navigate to the [data folder](https://archive.ics.uci.edu/dataset/360/air+quality).\n",
19 | "- Download the zip file called **AirQualityUCI.zip**.\n",
20 | "- Unzip it.\n",
21 | "- Save the csv file called **AirQualityUCI.csv** into the **Datasets** folder at the root of this repository."
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 1,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "import pandas as pd\n",
31 | "import matplotlib.pyplot as plt"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "# If you downloaded and stored the file as explained\n",
41 | "# above, it should be located here:\n",
42 | "\n",
43 | "filename = '../Datasets/AirQualityUCI.csv'"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 3,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "data": {
53 | "text/plain": [
54 | "(9357, 14)"
55 | ]
56 | },
57 | "execution_count": 3,
58 | "metadata": {},
59 | "output_type": "execute_result"
60 | }
61 | ],
62 | "source": [
63 | "# load the data\n",
64 | "\n",
65 | "data = pd.read_csv(\n",
66 | " filename, sep=';', parse_dates=[['Date', 'Time']]\n",
67 | ").iloc[:, :-2] # drops last 2 columns, not real variables\n",
68 | "\n",
69 | "# drop missing values\n",
70 | "# these are added at the end of the file during reading\n",
71 | "data.dropna(inplace=True)\n",
72 | "\n",
73 | "data.shape"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 4,
79 | "metadata": {},
80 | "outputs": [
81 | {
82 | "data": {
83 | "text/html": [
84 | "\n",
85 | "\n",
98 | "
\n",
99 | " \n",
100 | " \n",
101 | " \n",
102 | " Date_Time \n",
103 | " CO(GT) \n",
104 | " PT08.S1(CO) \n",
105 | " NMHC(GT) \n",
106 | " C6H6(GT) \n",
107 | " PT08.S2(NMHC) \n",
108 | " NOx(GT) \n",
109 | " PT08.S3(NOx) \n",
110 | " NO2(GT) \n",
111 | " PT08.S4(NO2) \n",
112 | " PT08.S5(O3) \n",
113 | " T \n",
114 | " RH \n",
115 | " AH \n",
116 | " \n",
117 | " \n",
118 | " \n",
119 | " \n",
120 | " 0 \n",
121 | " 10/03/2004 18.00.00 \n",
122 | " 2,6 \n",
123 | " 1360.0 \n",
124 | " 150.0 \n",
125 | " 11,9 \n",
126 | " 1046.0 \n",
127 | " 166.0 \n",
128 | " 1056.0 \n",
129 | " 113.0 \n",
130 | " 1692.0 \n",
131 | " 1268.0 \n",
132 | " 13,6 \n",
133 | " 48,9 \n",
134 | " 0,7578 \n",
135 | " \n",
136 | " \n",
137 | " 1 \n",
138 | " 10/03/2004 19.00.00 \n",
139 | " 2 \n",
140 | " 1292.0 \n",
141 | " 112.0 \n",
142 | " 9,4 \n",
143 | " 955.0 \n",
144 | " 103.0 \n",
145 | " 1174.0 \n",
146 | " 92.0 \n",
147 | " 1559.0 \n",
148 | " 972.0 \n",
149 | " 13,3 \n",
150 | " 47,7 \n",
151 | " 0,7255 \n",
152 | " \n",
153 | " \n",
154 | " 2 \n",
155 | " 10/03/2004 20.00.00 \n",
156 | " 2,2 \n",
157 | " 1402.0 \n",
158 | " 88.0 \n",
159 | " 9,0 \n",
160 | " 939.0 \n",
161 | " 131.0 \n",
162 | " 1140.0 \n",
163 | " 114.0 \n",
164 | " 1555.0 \n",
165 | " 1074.0 \n",
166 | " 11,9 \n",
167 | " 54,0 \n",
168 | " 0,7502 \n",
169 | " \n",
170 | " \n",
171 | " 3 \n",
172 | " 10/03/2004 21.00.00 \n",
173 | " 2,2 \n",
174 | " 1376.0 \n",
175 | " 80.0 \n",
176 | " 9,2 \n",
177 | " 948.0 \n",
178 | " 172.0 \n",
179 | " 1092.0 \n",
180 | " 122.0 \n",
181 | " 1584.0 \n",
182 | " 1203.0 \n",
183 | " 11,0 \n",
184 | " 60,0 \n",
185 | " 0,7867 \n",
186 | " \n",
187 | " \n",
188 | " 4 \n",
189 | " 10/03/2004 22.00.00 \n",
190 | " 1,6 \n",
191 | " 1272.0 \n",
192 | " 51.0 \n",
193 | " 6,5 \n",
194 | " 836.0 \n",
195 | " 131.0 \n",
196 | " 1205.0 \n",
197 | " 116.0 \n",
198 | " 1490.0 \n",
199 | " 1110.0 \n",
200 | " 11,2 \n",
201 | " 59,6 \n",
202 | " 0,7888 \n",
203 | " \n",
204 | " \n",
205 | "
\n",
206 | "
"
207 | ],
208 | "text/plain": [
209 | " Date_Time CO(GT) PT08.S1(CO) NMHC(GT) C6H6(GT) PT08.S2(NMHC) \\\n",
210 | "0 10/03/2004 18.00.00 2,6 1360.0 150.0 11,9 1046.0 \n",
211 | "1 10/03/2004 19.00.00 2 1292.0 112.0 9,4 955.0 \n",
212 | "2 10/03/2004 20.00.00 2,2 1402.0 88.0 9,0 939.0 \n",
213 | "3 10/03/2004 21.00.00 2,2 1376.0 80.0 9,2 948.0 \n",
214 | "4 10/03/2004 22.00.00 1,6 1272.0 51.0 6,5 836.0 \n",
215 | "\n",
216 | " NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) T RH \\\n",
217 | "0 166.0 1056.0 113.0 1692.0 1268.0 13,6 48,9 \n",
218 | "1 103.0 1174.0 92.0 1559.0 972.0 13,3 47,7 \n",
219 | "2 131.0 1140.0 114.0 1555.0 1074.0 11,9 54,0 \n",
220 | "3 172.0 1092.0 122.0 1584.0 1203.0 11,0 60,0 \n",
221 | "4 131.0 1205.0 116.0 1490.0 1110.0 11,2 59,6 \n",
222 | "\n",
223 | " AH \n",
224 | "0 0,7578 \n",
225 | "1 0,7255 \n",
226 | "2 0,7502 \n",
227 | "3 0,7867 \n",
228 | "4 0,7888 "
229 | ]
230 | },
231 | "execution_count": 4,
232 | "metadata": {},
233 | "output_type": "execute_result"
234 | }
235 | ],
236 | "source": [
237 | "data.head()"
238 | ]
239 | },
240 | {
241 | "cell_type": "markdown",
242 | "metadata": {},
243 | "source": [
244 | "### Attribute Information:\n",
245 | "\n",
246 | "Taken from the [original website](https://archive.ics.uci.edu/ml/datasets/Air+Quality).\n",
247 | "\n",
248 | "- 0 Date (DD/MM/YYYY)\n",
249 | "- 1 Time (HH.MM.SS)\n",
250 | "\n",
251 | "The above were merged during loading into the Date_Time column\n",
252 | "\n",
253 | "\n",
254 | "- 2 True hourly averaged concentration CO in mg/m^3 (reference analyzer)\n",
255 | "- 3 PT08.S1 (tin oxide) hourly averaged sensor response (nominally CO targeted)\n",
256 | "- 4 True hourly averaged overall Non Metanic HydroCarbons concentration in microg/m^3 (reference analyzer)\n",
257 | "- 5 True hourly averaged Benzene concentration in microg/m^3 (reference analyzer)\n",
258 | "- 6 PT08.S2 (titania) hourly averaged sensor response (nominally NMHC targeted)\n",
259 | "- 7 True hourly averaged NOx concentration in ppb (reference analyzer)\n",
260 | "- 8 PT08.S3 (tungsten oxide) hourly averaged sensor response (nominally NOx targeted)\n",
261 | "- 9 True hourly averaged NO2 concentration in microg/m^3 (reference analyzer)\n",
262 | "- 10 PT08.S4 (tungsten oxide) hourly averaged sensor response (nominally NO2 targeted)\n",
263 | "- 11 PT08.S5 (indium oxide) hourly averaged sensor response (nominally O3 targeted)\n",
264 | "- 12 Temperature in °C\n",
265 | "- 13 Relative Humidity (%)\n",
266 | "- 14 AH Absolute Humidity "
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 5,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "# I will give the variables simpler names\n",
276 | "# more details at the end of the notebook\n",
277 | "\n",
278 | "new_var_names = [\n",
279 | " 'Date_Time',\n",
280 | " 'CO_true',\n",
281 | " 'CO_sensor',\n",
282 | " 'NMHC_true',\n",
283 | " 'C6H6_true',\n",
284 | " 'NMHC_sensor',\n",
285 | " 'NOX_true',\n",
286 | " 'NOX_sensor',\n",
287 | " 'NO2_true',\n",
288 | " 'NO2_sensor',\n",
289 | " 'O3_sensor',\n",
290 | " 'T',\n",
291 | " 'RH',\n",
292 | " 'AH', \n",
293 | "]"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 6,
299 | "metadata": {},
300 | "outputs": [
301 | {
302 | "data": {
303 | "text/plain": [
304 | "Index(['Date_Time', 'CO_true', 'CO_sensor', 'NMHC_true', 'C6H6_true',\n",
305 | " 'NMHC_sensor', 'NOX_true', 'NOX_sensor', 'NO2_true', 'NO2_sensor',\n",
306 | " 'O3_sensor', 'T', 'RH', 'AH'],\n",
307 | " dtype='object')"
308 | ]
309 | },
310 | "execution_count": 6,
311 | "metadata": {},
312 | "output_type": "execute_result"
313 | }
314 | ],
315 | "source": [
316 | "data.columns = new_var_names\n",
317 | "\n",
318 | "data.columns"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 7,
324 | "metadata": {},
325 | "outputs": [
326 | {
327 | "data": {
328 | "text/plain": [
329 | "Index(['CO_true', 'CO_sensor', 'NMHC_true', 'C6H6_true', 'NMHC_sensor',\n",
330 | " 'NOX_true', 'NOX_sensor', 'NO2_true', 'NO2_sensor', 'O3_sensor', 'T',\n",
331 | " 'RH', 'AH'],\n",
332 | " dtype='object')"
333 | ]
334 | },
335 | "execution_count": 7,
336 | "metadata": {},
337 | "output_type": "execute_result"
338 | }
339 | ],
340 | "source": [
341 | "# let's capture the variables\n",
342 | "\n",
343 | "predictors = data.columns[1:]\n",
344 | "\n",
345 | "predictors"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": 8,
351 | "metadata": {},
352 | "outputs": [
353 | {
354 | "data": {
355 | "text/html": [
356 | "\n",
357 | "\n",
370 | "
\n",
371 | " \n",
372 | " \n",
373 | " \n",
374 | " Date_Time \n",
375 | " CO_true \n",
376 | " CO_sensor \n",
377 | " NMHC_true \n",
378 | " C6H6_true \n",
379 | " NMHC_sensor \n",
380 | " NOX_true \n",
381 | " NOX_sensor \n",
382 | " NO2_true \n",
383 | " NO2_sensor \n",
384 | " O3_sensor \n",
385 | " T \n",
386 | " RH \n",
387 | " AH \n",
388 | " \n",
389 | " \n",
390 | " \n",
391 | " \n",
392 | " 0 \n",
393 | " 10/03/2004 18.00.00 \n",
394 | " 2.6 \n",
395 | " 1360.0 \n",
396 | " 150.0 \n",
397 | " 11.9 \n",
398 | " 1046.0 \n",
399 | " 166.0 \n",
400 | " 1056.0 \n",
401 | " 113.0 \n",
402 | " 1692.0 \n",
403 | " 1268.0 \n",
404 | " 13.6 \n",
405 | " 48.9 \n",
406 | " 0.7578 \n",
407 | " \n",
408 | " \n",
409 | " 1 \n",
410 | " 10/03/2004 19.00.00 \n",
411 | " 2.0 \n",
412 | " 1292.0 \n",
413 | " 112.0 \n",
414 | " 9.4 \n",
415 | " 955.0 \n",
416 | " 103.0 \n",
417 | " 1174.0 \n",
418 | " 92.0 \n",
419 | " 1559.0 \n",
420 | " 972.0 \n",
421 | " 13.3 \n",
422 | " 47.7 \n",
423 | " 0.7255 \n",
424 | " \n",
425 | " \n",
426 | " 2 \n",
427 | " 10/03/2004 20.00.00 \n",
428 | " 2.2 \n",
429 | " 1402.0 \n",
430 | " 88.0 \n",
431 | " 9.0 \n",
432 | " 939.0 \n",
433 | " 131.0 \n",
434 | " 1140.0 \n",
435 | " 114.0 \n",
436 | " 1555.0 \n",
437 | " 1074.0 \n",
438 | " 11.9 \n",
439 | " 54.0 \n",
440 | " 0.7502 \n",
441 | " \n",
442 | " \n",
443 | " 3 \n",
444 | " 10/03/2004 21.00.00 \n",
445 | " 2.2 \n",
446 | " 1376.0 \n",
447 | " 80.0 \n",
448 | " 9.2 \n",
449 | " 948.0 \n",
450 | " 172.0 \n",
451 | " 1092.0 \n",
452 | " 122.0 \n",
453 | " 1584.0 \n",
454 | " 1203.0 \n",
455 | " 11.0 \n",
456 | " 60.0 \n",
457 | " 0.7867 \n",
458 | " \n",
459 | " \n",
460 | " 4 \n",
461 | " 10/03/2004 22.00.00 \n",
462 | " 1.6 \n",
463 | " 1272.0 \n",
464 | " 51.0 \n",
465 | " 6.5 \n",
466 | " 836.0 \n",
467 | " 131.0 \n",
468 | " 1205.0 \n",
469 | " 116.0 \n",
470 | " 1490.0 \n",
471 | " 1110.0 \n",
472 | " 11.2 \n",
473 | " 59.6 \n",
474 | " 0.7888 \n",
475 | " \n",
476 | " \n",
477 | "
\n",
478 | "
"
479 | ],
480 | "text/plain": [
481 | " Date_Time CO_true CO_sensor NMHC_true C6H6_true NMHC_sensor \\\n",
482 | "0 10/03/2004 18.00.00 2.6 1360.0 150.0 11.9 1046.0 \n",
483 | "1 10/03/2004 19.00.00 2.0 1292.0 112.0 9.4 955.0 \n",
484 | "2 10/03/2004 20.00.00 2.2 1402.0 88.0 9.0 939.0 \n",
485 | "3 10/03/2004 21.00.00 2.2 1376.0 80.0 9.2 948.0 \n",
486 | "4 10/03/2004 22.00.00 1.6 1272.0 51.0 6.5 836.0 \n",
487 | "\n",
488 | " NOX_true NOX_sensor NO2_true NO2_sensor O3_sensor T RH AH \n",
489 | "0 166.0 1056.0 113.0 1692.0 1268.0 13.6 48.9 0.7578 \n",
490 | "1 103.0 1174.0 92.0 1559.0 972.0 13.3 47.7 0.7255 \n",
491 | "2 131.0 1140.0 114.0 1555.0 1074.0 11.9 54.0 0.7502 \n",
492 | "3 172.0 1092.0 122.0 1584.0 1203.0 11.0 60.0 0.7867 \n",
493 | "4 131.0 1205.0 116.0 1490.0 1110.0 11.2 59.6 0.7888 "
494 | ]
495 | },
496 | "execution_count": 8,
497 | "metadata": {},
498 | "output_type": "execute_result"
499 | }
500 | ],
501 | "source": [
502 | "# cast variables as numeric (they are strings by default)\n",
503 | "# need to replace the , by . to cast as numeric\n",
504 | "\n",
505 | "for var in predictors:\n",
506 | " if data[var].dtype =='O':\n",
507 | " data[var] = data[var].str.replace(',', '.')\n",
508 | " data[var] = pd.to_numeric(data[var])\n",
509 | "\n",
510 | "data.head()"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": 9,
516 | "metadata": {},
517 | "outputs": [
518 | {
519 | "data": {
520 | "text/html": [
521 | "\n",
522 | "\n",
535 | "
\n",
536 | " \n",
537 | " \n",
538 | " \n",
539 | " Date_Time \n",
540 | " CO_true \n",
541 | " CO_sensor \n",
542 | " NMHC_true \n",
543 | " C6H6_true \n",
544 | " NMHC_sensor \n",
545 | " NOX_true \n",
546 | " NOX_sensor \n",
547 | " NO2_true \n",
548 | " NO2_sensor \n",
549 | " O3_sensor \n",
550 | " T \n",
551 | " RH \n",
552 | " AH \n",
553 | " \n",
554 | " \n",
555 | " \n",
556 | " \n",
557 | "
\n",
558 | "
"
559 | ],
560 | "text/plain": [
561 | "Empty DataFrame\n",
562 | "Columns: [Date_Time, CO_true, CO_sensor, NMHC_true, C6H6_true, NMHC_sensor, NOX_true, NOX_sensor, NO2_true, NO2_sensor, O3_sensor, T, RH, AH]\n",
563 | "Index: []"
564 | ]
565 | },
566 | "execution_count": 9,
567 | "metadata": {},
568 | "output_type": "execute_result"
569 | }
570 | ],
571 | "source": [
572 | "data[data['Date_Time'].apply(lambda x: len(x))>19]"
573 | ]
574 | },
575 | {
576 | "cell_type": "code",
577 | "execution_count": 10,
578 | "metadata": {},
579 | "outputs": [
580 | {
581 | "data": {
582 | "text/html": [
583 | "\n",
584 | "\n",
597 | "
\n",
598 | " \n",
599 | " \n",
600 | " \n",
601 | " Date_Time \n",
602 | " CO_true \n",
603 | " CO_sensor \n",
604 | " NMHC_true \n",
605 | " C6H6_true \n",
606 | " NMHC_sensor \n",
607 | " NOX_true \n",
608 | " NOX_sensor \n",
609 | " NO2_true \n",
610 | " NO2_sensor \n",
611 | " O3_sensor \n",
612 | " T \n",
613 | " RH \n",
614 | " AH \n",
615 | " \n",
616 | " \n",
617 | " \n",
618 | " \n",
619 | " 0 \n",
620 | " 2004-10-03 18:00:00 \n",
621 | " 2.6 \n",
622 | " 1360.0 \n",
623 | " 150.0 \n",
624 | " 11.9 \n",
625 | " 1046.0 \n",
626 | " 166.0 \n",
627 | " 1056.0 \n",
628 | " 113.0 \n",
629 | " 1692.0 \n",
630 | " 1268.0 \n",
631 | " 13.6 \n",
632 | " 48.9 \n",
633 | " 0.7578 \n",
634 | " \n",
635 | " \n",
636 | " 1 \n",
637 | " 2004-10-03 19:00:00 \n",
638 | " 2.0 \n",
639 | " 1292.0 \n",
640 | " 112.0 \n",
641 | " 9.4 \n",
642 | " 955.0 \n",
643 | " 103.0 \n",
644 | " 1174.0 \n",
645 | " 92.0 \n",
646 | " 1559.0 \n",
647 | " 972.0 \n",
648 | " 13.3 \n",
649 | " 47.7 \n",
650 | " 0.7255 \n",
651 | " \n",
652 | " \n",
653 | " 2 \n",
654 | " 2004-10-03 20:00:00 \n",
655 | " 2.2 \n",
656 | " 1402.0 \n",
657 | " 88.0 \n",
658 | " 9.0 \n",
659 | " 939.0 \n",
660 | " 131.0 \n",
661 | " 1140.0 \n",
662 | " 114.0 \n",
663 | " 1555.0 \n",
664 | " 1074.0 \n",
665 | " 11.9 \n",
666 | " 54.0 \n",
667 | " 0.7502 \n",
668 | " \n",
669 | " \n",
670 | " 3 \n",
671 | " 2004-10-03 21:00:00 \n",
672 | " 2.2 \n",
673 | " 1376.0 \n",
674 | " 80.0 \n",
675 | " 9.2 \n",
676 | " 948.0 \n",
677 | " 172.0 \n",
678 | " 1092.0 \n",
679 | " 122.0 \n",
680 | " 1584.0 \n",
681 | " 1203.0 \n",
682 | " 11.0 \n",
683 | " 60.0 \n",
684 | " 0.7867 \n",
685 | " \n",
686 | " \n",
687 | " 4 \n",
688 | " 2004-10-03 22:00:00 \n",
689 | " 1.6 \n",
690 | " 1272.0 \n",
691 | " 51.0 \n",
692 | " 6.5 \n",
693 | " 836.0 \n",
694 | " 131.0 \n",
695 | " 1205.0 \n",
696 | " 116.0 \n",
697 | " 1490.0 \n",
698 | " 1110.0 \n",
699 | " 11.2 \n",
700 | " 59.6 \n",
701 | " 0.7888 \n",
702 | " \n",
703 | " \n",
704 | "
\n",
705 | "
"
706 | ],
707 | "text/plain": [
708 | " Date_Time CO_true CO_sensor NMHC_true C6H6_true NMHC_sensor \\\n",
709 | "0 2004-10-03 18:00:00 2.6 1360.0 150.0 11.9 1046.0 \n",
710 | "1 2004-10-03 19:00:00 2.0 1292.0 112.0 9.4 955.0 \n",
711 | "2 2004-10-03 20:00:00 2.2 1402.0 88.0 9.0 939.0 \n",
712 | "3 2004-10-03 21:00:00 2.2 1376.0 80.0 9.2 948.0 \n",
713 | "4 2004-10-03 22:00:00 1.6 1272.0 51.0 6.5 836.0 \n",
714 | "\n",
715 | " NOX_true NOX_sensor NO2_true NO2_sensor O3_sensor T RH AH \n",
716 | "0 166.0 1056.0 113.0 1692.0 1268.0 13.6 48.9 0.7578 \n",
717 | "1 103.0 1174.0 92.0 1559.0 972.0 13.3 47.7 0.7255 \n",
718 | "2 131.0 1140.0 114.0 1555.0 1074.0 11.9 54.0 0.7502 \n",
719 | "3 172.0 1092.0 122.0 1584.0 1203.0 11.0 60.0 0.7867 \n",
720 | "4 131.0 1205.0 116.0 1490.0 1110.0 11.2 59.6 0.7888 "
721 | ]
722 | },
723 | "execution_count": 10,
724 | "metadata": {},
725 | "output_type": "execute_result"
726 | }
727 | ],
728 | "source": [
729 | "# cast date and time variable as datetime\n",
730 | "# replace . by : to transform to datetime format\n",
731 | "\n",
732 | "data['Date_Time'] = data['Date_Time'].str.replace('.', ':', regex=False)\n",
733 | "\n",
734 | "data['Date_Time'] = pd.to_datetime(data['Date_Time'])\n",
735 | "# NOTE(review): the raw data is dd/mm/yyyy HH:mm:ss (see attribute list above), so dayfirst=True should be passed here, e.g. pd.to_datetime(data['Date_Time'], dayfirst=True); without it pandas misreads day/month\n",
736 | "\n",
737 | "data.head()"
738 | ]
739 | },
740 | {
741 | "cell_type": "code",
742 | "execution_count": 11,
743 | "metadata": {},
744 | "outputs": [],
745 | "source": [
746 | "# sort index\n",
747 | "# we want the data in time order\n",
748 | "\n",
749 | "data.sort_index(inplace=True)"
750 | ]
751 | },
752 | {
753 | "cell_type": "code",
754 | "execution_count": 12,
755 | "metadata": {},
756 | "outputs": [
757 | {
758 | "data": {
759 | "text/plain": [
760 | "Date_Time datetime64[ns]\n",
761 | "CO_true float64\n",
762 | "CO_sensor float64\n",
763 | "NMHC_true float64\n",
764 | "C6H6_true float64\n",
765 | "NMHC_sensor float64\n",
766 | "NOX_true float64\n",
767 | "NOX_sensor float64\n",
768 | "NO2_true float64\n",
769 | "NO2_sensor float64\n",
770 | "O3_sensor float64\n",
771 | "T float64\n",
772 | "RH float64\n",
773 | "AH float64\n",
774 | "dtype: object"
775 | ]
776 | },
777 | "execution_count": 12,
778 | "metadata": {},
779 | "output_type": "execute_result"
780 | }
781 | ],
782 | "source": [
783 | "# check the format\n",
784 | "\n",
785 | "data.dtypes"
786 | ]
787 | },
788 | {
789 | "cell_type": "code",
790 | "execution_count": 13,
791 | "metadata": {},
792 | "outputs": [
793 | {
794 | "data": {
795 | "text/plain": [
796 | "0"
797 | ]
798 | },
799 | "execution_count": 13,
800 | "metadata": {},
801 | "output_type": "execute_result"
802 | }
803 | ],
804 | "source": [
805 | "# sanity check: duplicates in dt variable\n",
806 | "\n",
807 | "data['Date_Time'].duplicated().sum()"
808 | ]
809 | },
810 | {
811 | "cell_type": "code",
812 | "execution_count": 14,
813 | "metadata": {},
814 | "outputs": [
815 | {
816 | "data": {
817 | "text/plain": [
818 | "Date_Time 0\n",
819 | "CO_true 0\n",
820 | "CO_sensor 0\n",
821 | "NMHC_true 0\n",
822 | "C6H6_true 0\n",
823 | "NMHC_sensor 0\n",
824 | "NOX_true 0\n",
825 | "NOX_sensor 0\n",
826 | "NO2_true 0\n",
827 | "NO2_sensor 0\n",
828 | "O3_sensor 0\n",
829 | "T 0\n",
830 | "RH 0\n",
831 | "AH 0\n",
832 | "dtype: int64"
833 | ]
834 | },
835 | "execution_count": 14,
836 | "metadata": {},
837 | "output_type": "execute_result"
838 | }
839 | ],
840 | "source": [
841 | "# check NA\n",
842 | "\n",
843 | "data.isnull().sum()"
844 | ]
845 | },
846 | {
847 | "cell_type": "code",
848 | "execution_count": 15,
849 | "metadata": {},
850 | "outputs": [
851 | {
852 | "data": {
853 | "text/plain": [
854 | "min 2004-01-04 00:00:00\n",
855 | "max 2005-12-03 23:00:00\n",
856 | "Name: Date_Time, dtype: datetime64[ns]"
857 | ]
858 | },
859 | "execution_count": 15,
860 | "metadata": {},
861 | "output_type": "execute_result"
862 | }
863 | ],
864 | "source": [
865 | "# check time span\n",
866 | "\n",
867 | "data['Date_Time'].agg(['min', 'max'])"
868 | ]
869 | },
870 | {
871 | "cell_type": "code",
872 | "execution_count": 16,
873 | "metadata": {},
874 | "outputs": [],
875 | "source": [
876 | "# save preprocessed data\n",
877 | "\n",
878 | "data.to_csv('../Datasets/AirQualityUCI_ready.csv', index=False)"
879 | ]
880 | },
881 | {
882 | "cell_type": "markdown",
883 | "metadata": {},
884 | "source": [
885 | "## Data set Summary\n",
886 | "\n",
887 | "The dataset was collected between March 2004 and April 2005.\n",
888 | "\n",
889 | "It consists of hourly measurements of the different air pollutants, NO2, NOX, CO, C6H6, O3 and NMHC. The measurements are accompanied by local temperature and humidity values, also recorded hourly.\n",
890 | "\n",
891 | "In the data collection experiments, scientists were testing new pollutant sensors. The values from the new sensors are stored in the variables suffixed `_sensor`. \n",
892 | "\n",
893 | "For comparison, data for the pollutants was also gathered from fixed stations, that regularly measure the concentration of these gases. Those values are stored in the variables called _true."
894 | ]
895 | },
896 | {
897 | "cell_type": "code",
898 | "execution_count": null,
899 | "metadata": {},
900 | "outputs": [],
901 | "source": []
902 | }
903 | ],
904 | "metadata": {
905 | "kernelspec": {
906 | "display_name": "Python 3 (ipykernel)",
907 | "language": "python",
908 | "name": "python3"
909 | },
910 | "language_info": {
911 | "codemirror_mode": {
912 | "name": "ipython",
913 | "version": 3
914 | },
915 | "file_extension": ".py",
916 | "mimetype": "text/x-python",
917 | "name": "python",
918 | "nbconvert_exporter": "python",
919 | "pygments_lexer": "ipython3",
920 | "version": "3.10.5"
921 | },
922 | "toc": {
923 | "base_numbering": 1,
924 | "nav_menu": {},
925 | "number_sections": true,
926 | "sideBar": true,
927 | "skip_h1_title": false,
928 | "title_cell": "Table of Contents",
929 | "title_sidebar": "Contents",
930 | "toc_cell": false,
931 | "toc_position": {},
932 | "toc_section_display": true,
933 | "toc_window_display": true
934 | }
935 | },
936 | "nbformat": 4,
937 | "nbformat_minor": 4
938 | }
939 |
--------------------------------------------------------------------------------
/01-Create-Datasets/04-create-air-passengers-dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "threatened-orbit",
6 | "metadata": {},
7 | "source": [
8 | "# Air Passengers Dataset\n",
9 | "\n",
10 | "In this notebook we will prepare and store the air passengers dataset found [here](https://github.com/facebook/prophet/blob/main/examples/example_air_passengers.csv).\n",
11 | "\n",
12 | "**Citation:**\n",
13 | "\n",
14 | "Box, G. E. P., Jenkins, G. M. and Reinsel, G. C. (1976) Time Series Analysis, Forecasting and Control. Third Edition. Holden-Day. Series G.\n",
15 | "\n",
16 | "**Description of data:**\n",
17 | "\n",
18 | "The data is a monthly time series measuring the number of international airline passengers, in thousands, from 1949 to 1960."
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "id": "aggressive-license",
24 | "metadata": {},
25 | "source": [
26 | "In this notebook we will:\n",
27 | "\n",
28 | "1. Provide instructions to download the air passengers data set\n",
29 | "\n",
30 | "2. Save the time series data in the correct location for use in the course\n",
31 | "\n"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "id": "competitive-robertson",
37 | "metadata": {},
38 | "source": [
39 | "# Get the dataset"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "id": "outdoor-architecture",
45 | "metadata": {},
46 | "source": [
47 | "The dataset can be obtained from this [link](https://raw.githubusercontent.com/facebook/prophet/master/examples/example_air_passengers.csv). It will open a raw file in GitHub. A simple way of obtaining the data is to copy and paste the values from your browser into a text editor of your choice. \n",
48 | "Save it in the Datasets directory, which is found at the root of this project, with the filename `example_air_passengers.csv`.\n",
49 | "\n",
50 | "Alternatively, run the code below."
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 1,
56 | "id": "5045cf1c",
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "import pandas as pd\n",
61 | "\n",
62 | "url = \"https://raw.githubusercontent.com/facebook/prophet/main/examples/example_air_passengers.csv\"\n",
63 | "df = pd.read_csv(url)\n",
64 | "df.to_csv(\"../Datasets/example_air_passengers.csv\", index=False)"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "id": "respected-worth",
70 | "metadata": {},
71 | "source": [
72 | "# Data set synopsis"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "id": "mediterranean-toilet",
78 | "metadata": {},
79 | "source": [
80 | "The air passengers dataset is a monthly time series representing the number of international airline passengers, in thousands, collected between January 1949 and December 1960."
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "id": "italic-serial",
86 | "metadata": {},
87 | "source": [
88 | "# Check that you can load the data "
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 2,
94 | "id": "established-clinic",
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "import pandas as pd"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 3,
104 | "id": "developmental-roulette",
105 | "metadata": {},
106 | "outputs": [],
107 | "source": [
108 | "df = pd.read_csv(\n",
109 | " \"../Datasets/example_air_passengers.csv\", parse_dates=[\"ds\"], index_col=[\"ds\"]\n",
110 | ")"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 4,
116 | "id": "quantitative-missouri",
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "data": {
121 | "text/html": [
122 | "\n",
123 | "\n",
136 | "
\n",
137 | " \n",
138 | " \n",
139 | " \n",
140 | " y \n",
141 | " \n",
142 | " \n",
143 | " ds \n",
144 | " \n",
145 | " \n",
146 | " \n",
147 | " \n",
148 | " \n",
149 | " 1949-01-01 \n",
150 | " 112 \n",
151 | " \n",
152 | " \n",
153 | " 1949-02-01 \n",
154 | " 118 \n",
155 | " \n",
156 | " \n",
157 | " 1949-03-01 \n",
158 | " 132 \n",
159 | " \n",
160 | " \n",
161 | " 1949-04-01 \n",
162 | " 129 \n",
163 | " \n",
164 | " \n",
165 | " 1949-05-01 \n",
166 | " 121 \n",
167 | " \n",
168 | " \n",
169 | "
\n",
170 | "
"
171 | ],
172 | "text/plain": [
173 | " y\n",
174 | "ds \n",
175 | "1949-01-01 112\n",
176 | "1949-02-01 118\n",
177 | "1949-03-01 132\n",
178 | "1949-04-01 129\n",
179 | "1949-05-01 121"
180 | ]
181 | },
182 | "execution_count": 4,
183 | "metadata": {},
184 | "output_type": "execute_result"
185 | }
186 | ],
187 | "source": [
188 | "df.head()"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 5,
194 | "id": "straight-mouth",
195 | "metadata": {},
196 | "outputs": [
197 | {
198 | "data": {
199 | "text/plain": [
200 | ""
201 | ]
202 | },
203 | "execution_count": 5,
204 | "metadata": {},
205 | "output_type": "execute_result"
206 | },
207 | {
208 | "data": {
209 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEGCAYAAACevtWaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAABFZElEQVR4nO3dd3xcZ5no8d+rGUmjNupdsuUi23GJY8fpvZGEAEkoISwlJGGzQGBhWe6SvbCUZdkF7l3a3V12QyAJGyChZZMA6YnTiOMS917ULVldGpXp7/3jnDMayRppRnPGkuXn+/n449GZmVNcnnnnOc/7vEprjRBCiPklbbZPQAghhP0kuAshxDwkwV0IIeYhCe5CCDEPSXAXQoh5yDnbJwBQUlKi6+rqZvs0hBDitLJt27ZurXXpZM/NieBeV1fH1q1bZ/s0hBDitKKUaor1nKRlhBBiHpLgLoQQ85AEdyGEmIfmRM59MoFAgNbWVrxe72yfSkwul4uamhrS09Nn+1SEEGKcORvcW1tbycvLo66uDqXUbJ/OSbTW9PT00NrayqJFi2b7dIQQYpw5m5bxer0UFxfPycAOoJSiuLh4Tn+zEEKcueZscAfmbGC3zPXzE0KcueZ0cBdCiDNFY/cwLx04Ydv+JLgLIcQccP9rx7jn59vo8vhs2Z8EdyGEmAP6hv0Ew5r/2d5my/4kuMfw1a9+lR/84AeRn7/85S/zwx/+cPZOSAgxr/WPBAB4bGsLdqyQN2dLIaN946m97Ds+aOs+V1a5+dq7V8V8/q677uK9730vn//85wmHwzz66KNs3rzZ1nMQQghL/2iAdIfiSOcQ21v6Wb+gMKn9ycg9hrq6OoqLi9m+fTvPPfcc69ato7i4eLZPSwgxTw2M+Ln2rHKy0h38ZmtL0vuLa+SulCoAHgBWAxq4CzgIPAbUAY3AbVrrPmXUB/4QeCcwAnxca/12Mic51Qg7lT7xiU/w0EMP0dHRwV133TUr5yCEODP0jQSoKcziprMreWpnO//wrpVkZ8w8uRLvyP2HwDNa6xXAWmA/cB/wota6HnjR/BngRqDe/HUP8OMZn90su/XWW3nmmWfYsmUL119//WyfjhBinvIGQowGQhRkZ/DutVUM+YK83dSf1D6n/VhQSuUDlwMfB9Ba+wG/Uupm4ErzZQ8DG4EvATcDP9fGHYFNSqkCpVSl1ro9qTOdBRkZGVx11VUUFBTgcDhm+3SEEPPU4KhxMzU/K50KtwuAAXPbTMUzcl8EdAEPKqW2K6UeUErlAOVRAbsDKDcfVwPRCaNWc9tpJxwOs2nTJu6+++7ZPhUhxDzWbwbygux03FnGmHvQm/rg7gTWAz/WWq8DhhlLwQBgjtITqt1RSt2jlNqqlNra1dWVyFtPiX379rF06VKuueYa6uvrZ/t0hBDzmFUGWZCVgdtldJkdTHLkHk+2vhVo1Vq/Zf78W4zgfsJKtyilKoFO8/k2oDbq/TXmtnG01vcD9wNs2LAh+aJOm61cuZJjx47N9mkIIc4A/SN+wBi5Z2c4cKSp1I/ctdYdQItSarm56RpgH/AkcIe57Q7gCfPxk8DHlOFCYGCm+XY7CvlTaa6fnxDi9NAflXNXSuF2ORkcDSa1z3jrbD4L/EIplQEcA+7E+GD4tVLqbqAJuM187Z8wyiCPYJRC3jmTE3O5XPT09MzZtr9WP3eXyzXbpyKEOM0NjIzl3AHcWelJj9zjCu5a6x3AhkmeumaS12rg3qTOCqipqaG1tZW5mI+3WCsxCSFEMvpH/TjSFLmZRkh2u9JPSc59VqSnp8sKR0KIM0L/SIACMyUDRnpm0JtcWkbaDwghxCzrHw2Qnz22FrM7y5n0yF2CuxBCzLIBc+RucbuSz7lLcBdCiFnWP+qnIDsj8rM7Kz3pahkJ7kIIMcv6hieO3J2MBkL4g+EZ71OCuxBCzLKBk3LuxmNPEqkZCe5CCDGLAqEwQ74ghdFp
GasFQRIVMxLchRBiFg2Mjp/ABIw1D0uiYkaCuxBCzCKraVj+hGoZSK4zpAR3IYSYwog/yBM72lLWS2pg1GoaNr5aBkiqYkaCuxBCTOG7zxzkc4/u4HDnUEr2P9buV0buQghxShzrGuKRTU0A9A37U3KM/hHJuQshxCn17acPEAwb6Zhkl72LJbIKU9ZYWiYr3YEzyZ7uEtyFEGISm4718Ny+E9y2wej8mmwjr1gGRvwoBXmusT6OSqmkZ6lKcBdCiEn895tNlORm8DfXLQOSX/Yulv7RAPlZ6aSljV+3wu1yyshdCCHsdmLQS31ZHmV5xoI8KUvLTGgaZjFG7hLchRDCVr3DfopyMnCkKfIykxtFT8Vo95tx0najM6SkZYQQwla9I0ZwB3u6NMbSP+KPMXJPrqe7BHchhJggGArTPxKIBPc8lzO1aZnsSYJ7kj3dJbgLIcQEfWbtuRXc821YsDr2sWKN3KVaRgghbNU3YkxYGp+WsT+4ewMhPN4gZW7XSc8l29NdgrsQQkzQMzQhuLtSE9w7B30AlOZlnvRcsj3dJbgLIcQEvcPjg7uRlrH/hmqnxwtA2WTB3ewvM9NcvwR3IYSYoNdMyxRH0jJOhnxBgqGZL3s3mU7PVCN3s7/MDD9UJLgLIcQEvUPj2/Bao2iPzaP3zkFr5D5Zzt1q+ysjdyGEsEXvsI88l5MMpxEirYU07K6Y6Rry4UhTkW8I0dxJHlOCuxBCTNA7EhgXcK1Aa3ete+egj5LcjJP6ykD0yF3SMkIIYYveYR+FUcE934aVkSbT6fFNmpKB6Jy7jNyFEMIWPUP+CSP35AJtLEZwP/lmKkT1dJecuxBC2KMvqq8MJF+WGEuXx0uZe/LgbvV0T2kppFKqUSm1Wym1Qym11dxWpJR6Xil12Py90NyulFI/UkodUUrtUkqtn9GZCSHEJN482sPHH9xMwOayRIvWmt5hf4y0jH3BPRgK0zPspzRGWgagwu2ifcA7o/0nMnK/Smt9jtZ6g/nzfcCLWut64EXzZ4AbgXrz1z3Aj2d0ZkIIMYn/+9xBNh7sonvIl5L9e3xBAiE9Li2TneHAkeSydxN1D/nRevIJTJbaoiyae0dmtP9k0jI3Aw+bjx8Gbona/nNt2AQUKKUqkziOEEIAsKOln21NfYD9NeeWvsjs1LGgq5TCbXNnyKlmp1oWFGXT0juC1jrh/ccb3DXwnFJqm1LqHnNbuda63XzcAZSbj6uBlqj3tprbxlFK3aOU2qqU2trV1ZXwiQshzjwPvtEQeZyqZe96IsF9fKfGfJt7ult9ZSZrGmapLcrGFwzT5Un8W0q8wf1SrfV6jJTLvUqpy6Of1MbHSkIfLVrr+7XWG7TWG0pLSxN5qxDiDNQx4OWPu9pZv6AASN3IvXfo5JE7kNTNzclM1XrAUluYDUBLX+KpmbiCu9a6zfy9E3gcOB84YaVbzN87zZe3AbVRb68xtwkhxIz996ZGwlrz2WvqAfvLEi0T+8pYkl08YyIrLVOaO1XO3QzuvaMJ73/a4K6UylFK5VmPgXcAe4AngTvMl90BPGE+fhL4mFk1cyEwEJW+EUKIGdl4sIuLlhSzqsoNpC4tY3WELJwQ3PNt7une6fFRmJ0eaXEwmZrCLIAZ3VR1xvGacuBxpZT1+l9qrZ9RSm0Bfq2UuhtoAm4zX/8n4J3AEWAEuDPhsxJCiAk6BrysrS0Ym5afqrTMsJ8MZxo5GY5x291ZTgZszLl3TTE71eJKd1DuzqQlFcFda30MWDvJ9h7gmkm2a+DehM9ECCFi8AZC9Az7qcp3kelMI8ORlrq0zLAxO9Uc0Ea4bV5qr9PjizmBKVptYfaMRu4yQ1UIMedZlSUV+VkopchzOVN3Q3XYT2H2JF0aXen4g2G8gZAtx+ka9E55M9VSW5RNa18Kcu5CCDHbjg8Ywa0y30hjpGpNUzBKIYtzp2jBa8NxtdZ0DU2flgEjuLcPjCa8lqoEdyHEnNdhTsGvMIN7
KkfufcPj+8pY7Ozp3jcSIBDSU05gstQWZhHWcLw/sdG7BHchxJxn9VepMCf82F2WGC12Wsa4RWnHTdXI7NQ4cu4LimZW6y7BXQgx53UMjOJ2OcnJNAKsOys1I3dvIMSQL0hJitMybzf1A1BdkDXta61a90RvqkpwF0Ik5Uinhz1tAyk9RvuAl8r8sUCYl5manLv1DSH6WBa70jLDviA/eOEQ6xcUcE5twbSvL3e7SHeohCcyxVPnLoQQJ/nV5mZ++noDRzqHyEp3sPcb10+6XJwdOga9kXw7pG7k3mZWpVRNMqK2q6f7/a8eo9Pj48cfOfekcsvJONIUNYXZCde6y8hdCDEj33nmAFprrj2rnNFAKDJtPxWMkftYcM9zpTMaCNne0926aTlZusRajWlgZObB/cSgl/tfPcZNayo5d2Fh3O+rKcySnLsQIvX6R/z0jwT40PkLeN96o+mrVYtuN38wTPeQb/zI3by5affova1/FKUYdyxLptNBnsuZVB/5B147RjAc5ks3rEjofbVFMnIXQpwCjT1GoFlYnBOp+LAqQOzW6fGiNSeN3MH+/jLH+0cpy8uM2e+lMn/mKyMB7GsfZFVVPguKsxN6X3meyyyfjP+bigR3IUTCGruHAVhUkk1prhF0O2fQczweYzXuY6kSq3LF7pH78YHRSfPtlor8LDoGZx7cG7qGWVSSk/D7rElVVlOzeEhwF0IkrLFnGKWgpjA7MnKfyYIS8RirYIkeuRtpGbtr3dv6pg7ulUmsaeoNhDg+4KWuOPHgbpVmJpISkuAuhEhYY/cwVflZuNIduNKNXHRnEiPaqUycnQpjlSseG4N7OKw5PuCdsva8It9F95Av4VYAYHwgAiwqncnI3fgA7RmSkbsQIoUae0aoKxnLG5flZaYsLdM+4CUnw0Fe5ljltlW5Yueydz3DfvzB8JTBvTLfhdYzu78QSWXNYORuLRzSMywjdyFECjX2DLMwKkiV5blSl3MfHKUi3zWuJjxyQ9XGkbtVBjl1zt349tAxg9RMQ7dxEzr6QzFeMnIXQqScVQYZPQItc2emrFpm4uxUgLxMJ0rZu2DHWHCP3anROo+Z5N0buocoyc2MfDAlwu1ykuFIo1uCuxAiVcbKICekZQZ9GGv12Ku933tS3XlamiI3w2lrKWTbFBOYLMmM3Bu7R1g0g1E7gFKK4twMuaEqhEidJuvGYMn4tIwvGLZ96btgKEynZ/zsVIs7K93WUsi2/lGyMxyRHjKTcbucZGc4ZjZy75lZGaSlODeDHgnuQohUaeg2yiCtboVAysohu4f8hPXkM0bzXE7bc+7VBVlT9ntRSlGR76JjMLEmXh5vgC6Pj7pkgntOJj1Rde6j/qlXhJLgLoRISHQZpMVaLs7uvLuVhijJPbnvuduVbmsp5PF+75Q3Uy0zmaXaZKayZlIpYzFG7mPB/aYfvTbl6yW4CyES0tgzMi7fDkSWi7N/5G4F98n6qzttLYU83j/1BCZLhTsr4Zz7se6Z17hbSnIz6R4y7msEQ+Fp+7tLcBdCJKSxZ/ik9EKkv4zNzcOs6fbFOSeP3PNc6Xh89ozcvYEQPcN+qqeolLFUFRhln8EE+rxYNe4Li5JJy2TgC4YZ9odo7RslGJ765rUEdyFE3KwyyLoJI/e8TCeZzjTb0zJWGmLSBatd9o3c2+KocbdU5LsIhY0FruNlpLJcZGU4pn9xDGO17j4azJvaU5HgLoSIm5UKWDghd6yUMmvdbU7LDPvIcKaRm3nyukJ5Zs7djvLLqfq4T2RV7iSSdz/WffK3nUQVR/WXaeiS4C6EsJGVdil3n5y+KMtz2Z6W6RnyU5yTMWkFizvLSVjD8DRVI/E40jkEEFcr3gq38QEQK+8+6A1EvgkAaK1psCG4l5oj9+4hP409w5HmabFIcBdCxM26wWlVx0Qz+svYnZbxTZqSgbEWBHZUzLx5tIcFRdmTrp060XQj908/8ja3/eebkW8UTT0jDIwGWF2V
n9Q5Wn8OPUN+Grqnr5mX4C7EPPHNP+zjhX0nUnoMqxrGamQVLRXNw3qH/ZPeTIWxzpDJ5t1DYc2mYz1ctLg4rtcXZKeT6UyjY+DkWvc3j/bw+pFu2vpHIwtab2/pA2DdgoKkzrPIah425JPgLsSZwuMN8NPXG/jsr7Zz+IQnZcfpHvKR53KOq3G3lLldeLxBvIHk0yRjx/NPMXK3ltpLbuS+v32QQW+Qi5bEF9yVUpPWumut+d7zB8kxb5puaewFYHtzPzkZDpaV5yV1ntYyf8cHvLT1j07bF16CuxDzwFHzBps3GOKTj2xj2GdvGwBL15Bv0pQMRE1ksinvrrWmZ9g36QQmGFuNaSazVKPLGP98tBsg7uAORsXMxJz760e62dLYx9/dsAK3yzkuuK+tLcCRFnvma7xKcjPZ3tyH1tg3cldKOZRS25VSfzB/XqSUekspdUQp9ZhSKsPcnmn+fMR8vi6ZixFCTO+oeUPwmzevpqF7mK8+sTclx+n2+GMG2zKbZ6mO+EN4A+FIOmKimS6S7QuGuOCfX+T+V48CRiplcWnOpDeJY6nKzxp30xTg+88forogi9vPr2VDXRFbGnsZ9YfY3z6YdErGUpyTwUHzm5mdaZnPAfujfv4O8H2t9VKgD7jb3H430Gdu/775OiFECh3tGsKZpvjgebW8/9wantvbkZLjdE8xcrdmqdqVd4/UuMcI7gXZia8rCsY3i55hP//63CGOdQ2xuaGXixMYtYPRV6dj0IsvaKSghn1B3m7u54Pn1ZLpdHBeXRFHu4Z55VAXwbBmXW1hQvuPpTg3A6vyc7rqm7iCu1KqBrgJeMD8WQFXA781X/IwcIv5+GbzZ8znr1FTdeIRQiTtSOcQC4uzSXekUV+Wh8cXZMDGdriWLo8vUpI3UWUS7XAnY606FOubQmF2OtkZjmmn4U9kffj4gmHuemgLw/4QFy0uSWgfC4uz0ZrITVNrCb2lZbkAnFdnBPMHXjsGwDl2jdzNP4uinIwpu1dC/CP3HwB/B1iJqmKgX2ttfR9qBarNx9VAC4D5/ID5+nGUUvcopbYqpbZ2dXXFeRpCiMkc7RqKBJaaQqOcr7UvsaA3HW8ghMcXjDlyL8hOJ8OZRodNa6lONTsVjBubC4qyae5J7Dq7zLTR+9bXRHrTX7i4KKF9WL11mnuNoN40ocf9mpp8MpxpbG3qY0FRdswPqERZ+4mndfC0wV0p9S6gU2u9Lekzi6K1vl9rvUFrvaG0tNTOXQtxRgmEwjT1jLCk1Aju1ZHgnlhb2ulYZZCTNfGCsSoSu0fuxVMExgVF2TQlOHK3ruOL1y9jVZWbs2vypzzG5Mc1gqsV1BvM3jFWBUum08E5NQVA8iWQ0aw/++kqZQCmnuJkuAR4j1LqnYALcAM/BAqUUk5zdF4DtJmvbwNqgVallBPIB3oSvAYhRJyae0cIhnUkuNcUGqNHu4P7VBOYLOVu+4J79zQ5dzBGyhsPdREOa9LirEbp9PhIU8Y9gl/dcyHBUOLtC0pyM8jOcESCe1PPMKV5meREtUk4b1Ehmxt7WVdbkPD+Y7Fq/uNZ0WnakbvW+u+11jVa6zrgduAlrfWHgZeB95svuwN4wnz8pPkz5vMv6VSsvSWEAMamzi8x0zJWLrotZSP32MG9Mt9la1omJ8MxaU29ZUFxDv5gmBMJVOh0Dvoozs3EkaZwu9JjVuNMJZISMr81NHaPnNSr/crlZTjTFBcvTSyfPxXrg3VRSe60r02mzv1LwBeUUkcwcuo/Nbf/FCg2t38BuC+JYwghpnG0ywzuZq9wpRQ1hVm259ytkfRUwb3CHLnbMZ7rGfZNmy5ZaK4GlUjevWso9k3hRCwszo4sOdjYM3xSj/vz6orY8bV3JD15Kdq5Cwv5p1tWc+3KsmlfG09aJkJrvRHYaD4+Bpw/yWu8wAcS2a8QYuaOdg5T7s6M9FoBo7thqtIysW5wgjG5xx8KG20Dkgygxj6mHlVbAbWpd4QL
4mwf0OnxRvrPJ2NhcQ4vH+zC4w3QGWMJvcm6WSbDkab4yIUL43qtzFAV4jR3tGsokm+31BRm2z5y7/L4yM9KJ9MZO01SYU4EsiM10z0Uu6+MpaogC0eaSmzk7vFFJlwlY0FRNv5gODITNZnFr1NBgrsQpzGtNUc7JwvuWQx6g7YuID3VBCZLhY217j1DvpiVOZZ0RxrVBVlxV8yEwpruIf+01xGPBWZK6NVDRvuCiWmZ2SbBXYjTWJfHh8cXjNS4W6yKGTtvqnZ5pg+2VsvcZEfu4bCmd9gf183OhcXZNMexMhFA34ifUFhHZtMmwwrmrxwy5unEU554KklwF+I0dsS8mbp4wsLLqah1N0buUwfFktwM0lTyI/dBb4BgWMeVt0+k1t1qambHyN1KCTV0n1wGORdIcBfiNGYF74kLL1uzVNtszLvHM3J3OtIoy0u+1r1n2KrMmX7kvqAom/6RQFztFqx1T+3IuVspIeCkMsi5QIK7EKcxK4hOrP4ozsnAlZ5m28h9xB9k2B+Ka8RbbkOt+1jTsOmPF2kFEMdN1U7zvOwYuUcfe67l20GCuxApcyrm7rUPeM1APr6CRSllazlkt2f6GndLpfvkhSwS1RNH2aUl0gqgd/q8e1ccs2wTYd1UTXZ91FSQ4C5ECngDRs/wx7e3pvQ4HQOjkQqViWoKs2nttyctEwmKcQT3inwXJxIM7gc6BgmHxz4Mu4enbz1gsRa1bopr5O4jN9NJdoY9+XFrxD7XbqaCBHchUqKld4ROj4+fv9mU0uO0D3gjrXYnqinMsq1aJp6+MpaKfBceX5ChOFeDaukd4YYfvMbP3miIbHv1UBcluRlx3VDNzXRSkpsRMy2zramPN48a7a26huypcbesqsonTcFZlfbNQrWLBHchUsBKh2xv7qexO74yvZnoGPROOXLvGwnEHWSnEk9fGUuifd2t3jgPvtFIMBTmeP8oL+4/wW0bauNems6omJn8z/nvf7+Lv350O6Gwpmtw+lr9RFyytIQ3//4aFpdO3+vlVJPgLkQKRM8OfXx72xSvnLlRf4j+kUCktnyi6kjFTGKj95beEc7/1guR+m0YuxEZTw7cWq4u3uBuLXTR1j/K8/tO8OjmZjTwofMXxH3OtUXZJy17B8Z5HzoxRJfHx+aG3inXgJ2pRJbnO5UkuAuRAq19o2Q407h4STH/s6MtJTdXrYqUihjBZaaLduxpG6DT4+Nzj26nrX+U/e2D/OyNRtbWFpDumD5kREbucVbMNPWMkJPhoLYoi5+8doxHt7Rw5bJSaovir0CpKcyivd87buFrgDfMxa/TFDy16zidg15bJjCdDiS4C5ECrX2j1BRk8d71NTT1jPB2c7/tx2gfMEaqU+XcgUlHtFPv1wjK3kCIT/73Nu58cAu5mU7+8yPr43r/2Mg9vuM29QyzsDiHj1+8iLeb++n0+OJujmWpKcwmGNacmLB+6+uHeyjMTufGNZX8YefxuMs55wMJ7kKkQGvfCNWFWVy/qhxXelpKqmastEesnHtpbiaZzsRr3dsHRsl0pvG9285hd9sAw74gD955Xsz0z0SudAeF2ekJjdzrSrK5bUMNuZlOqguyuHL59C1to1kfZC1RM1W11rx+pIuLl5bwnrVVDHqNew923lCdy+bWfFkh5onWvlHeUZVPniudy+pLeeOI/YuRWSPsWEF3rNY9sbSMVYHzzjWV/PD2c1hUksNZle6E9lEVZ419MBSmpW+E61dXkOdK5/99aB1ZGY64b6RaJlt96mjXECcGfVy6tIQrlpWSl+mccg3Y+UZG7kLYbMQfpGfYHxlNrqx009gzzKg/ZOtxOga8FGSnk5URuwVv9QzKIY3gbpz7zedUc7a5Fmgi6kpy4qoSah/wEgjpyKIbV60o48I4+7JHqyowvr1Ef5C9ftjIt1+6tARXuoPrVpYDJ8/mna8kuAthM2v0aAX3FRV5aA2HOz22Hqd9wBvzZqrF6OueWHDvmKJ2Pl6L
inNo6RslMOEGZ0P3MHc/tIX7Xz0KjE08WpjkJKBMp4Nyd+a4a339SA8Li7MjN2bvunQRVy4vnZMTjlJBgrsQNrNGj1ZQWV5hTHA52GFvcO8YHJ02CNcUZtEz7GfEH1+teyis6Rj0UlmQZHAvySEU1uNy4A+8dowbfvAqLx7o5ME3GtFaR8og6+JY8Hk60QuUBENhNh3r4ZKo9UtXV+fz0J3nT7km63wiwV0Im00cuS8szsGVnmZ/cB/wUjHNTc6aBGvdu4d8hMJ62v1Ox+q10mCmZvYeH+Cf/rifi5cU84XrltE+4OVw5xBNPcNkONMot6E80Vg31rjO/e0ehnzBGaV45gsJ7kLYrLXPqDax+rA40hT1ZXkcPGFfcPcFQ3QP+eMauQO0xlkOedx8XVWyaZkJwX1nywAA33jPat5/bg1gtBho6hlhYVE2aQneQJ1MTWEW7QNGrfu2JmPpuw0LC5Pe7+lKqmWEsJlVBqnUWMBaXpE3bsZnsqxFJ2KVQVomqyKZynQVOPEqzE4nPys9knbZ3TaA2+Wktsj4c6kvy+WVQ110DvqSzrdbagqzI2mlbc39VOa7qCpI7jpOZzJyF8JmrX2jkaBqWV6eR5fHR6/Z7TBZY0F46uBemptJhiMt7nLIePc7HaUUdSU549Iyq6vzIx94ly8r5a2GXhp7hqmzqRd6TdTqU9saezn3DB61gwR3IWxnBPfxI0a7b6pONzvVkpamqCpwxT9y7x/FlZ5GQXZ60ue4uCSHxu4R/MEwB9o9rKnOjzx3+bJS/MEwvmDYtoUurA/ULQ29HB/wSnCf7RMQYj4Z9gXpjapxt6yIBPdBW44zNjt1+rRDTWF23DdU2weNGvfolNJM1RXncHxglN1tA/hDYVZFBfcLFhWR6TTCj11pmaoCF0rBEzuPA0hwn+0TEGI+sfq4TEzLlOZlUpidPuObqhPrxdv6R8nLdJIbx6LM0VUk02nvn768Ml51JdloDX/c1Q4wbuTuSndwgVnJYlfdeabTQXmeiyOdQ7jS0xKeVTvfSHAXwkZWbnviyF0pxfKKPA7MIC2zuaGX5V95mm88tZcRf5D/3tTEL99q5pwFBXG9v7ogi+4hH97A5DNkj3R6GPQai0sb5ZX2BPfFJUaP8z/uPk5epjMyC9XyvvXVLC/Pi8wutYP15762Jr4OlvPZmX31QtispXd8jXu0FRVuDnV4xi0nF4+tTb2EtbGYxcXffol/+J89XL6slH//cHxdGmuKxm40TnSgY5B3/vB1vvDYDkJmV8WqJCtlLNbEpBODPlZWuU8qd7z5nGqe/ZvLcdoYhK0/9w11Z3ZKBiS4C2Grpp4RsjMck641urwij2F/KOEWvEc7hyl3Z/LYPReyoCibz1y1lJ98bANuV3w3Pa0U0cTj+oNh/vbXO/GHwrywv5PNDb2Ewjrp2amWPFd6ZOWm6JRMKlnXeqbn20GCuxC2au4dZkFR9qQ3JOvLjDSFtaxcvI51D7G4JJcLFhfz5Gcu5YvXL0+oa2KsRTv+7aXD7D0+yLduXU2GI41vP70fSL4MMtoic/S+pubUBPf1Cwsoyc3k3AVFp+R4c9m0wV0p5VJKbVZK7VRK7VVKfcPcvkgp9ZZS6ohS6jGlVIa5PdP8+Yj5fF2Kr0GIOaOpZ4QFMVYQWmoG90QaiGmtOdY1zOLSmd90LMtzke5QkZQRwLGuIf5941Heu76aD1+wkHevrWJnqzGLNNkJTNGsmaqrT9HI/eoV5Wz9yrXk21DKebqLZ+TuA67WWq8FzgFuUEpdCHwH+L7WeinQB9xtvv5uoM/c/n3zdULMuo0HO2PeVLRDOKxp7h2JWbddkJ1BaV4mh0/EP3LvHfYzMBpgSRILMDvSFAuLc8Z9Y7BSMJ+9uh6AOy+pizxn58j9imVlnLuwkEVnSCfGuWTa4K4N1r+KdPOXBq4Gfmtufxi4xXx8s/kz5vPXKDuKZoVIwsEODx9/cAvfeGpvyo7RNeTDFwyzYIpA
Vl+Wy+EE0jJHu4wZnsmM3MHI9x88MVZjf6DDQ1a6I1LBsro6nwsWFZGT4SA/y75R701nV/K7T11sS+8YkZi4cu5KKYdSagfQCTwPHAX6tdZWH9FWoNp8XA20AJjPDwAntWZTSt2jlNqqlNra1WVfzw0hJnPAnDz0q80tvHywMyXHsHqTx0rLgBHcj3YOxb1g9rEu44MgmZE7wIryPFp6RxnyGf9lD53wsKw8d1zQ/e77z+bfPrzelglMYvbFFdy11iGt9TlADXA+sCLZA2ut79dab9BabygtLU12d0JM6WjnEGnKCK5f+u0u+kfs6fESrclskjWxnjva0vI8PL4gJwZ9MV8T7Vi30RI32QZYVvuDw+YkqoMdnsg2y8LiHK5KcO1SMXclVC2jte4HXgYuAgqUUtb0uBqgzXzcBtQCmM/nA/YvIClEAg53DrGwOIfvf/Aceof9/Otzh2w/RnPvCI40RfUkNe6WpaWJ3VQ92jnE4pKchNcUnWhFhTFb82CHhy6Pj55hP8srzuwZnPNdPNUypUqpAvNxFnAdsB8jyL/ffNkdwBPm4yfNnzGff0nH+x1UiBQ50jnE0rJcVlfnc8WyUjY39Np+jKaeEaoKXFPOjKwvN4N7nDdVj3UnVyljqSnMIjvDwYEOT6R52YoJI3cxv8Qzcq8EXlZK7QK2AM9rrf8AfAn4glLqCEZO/afm638KFJvbvwDcZ/9pCxG/QChMY89wpBTxrEo3R7uG8AXtrZxp6h1hYdHUgbg4J4PC7PS4bqr6g2Gae0ci0/iTkZamqC/P42CHJ3L/YWJaRswv03Yd0lrvAtZNsv0YRv594nYv8AFbzk4IGzT1jBAI6UhKZEVlHsGw5kjnEKuq7Ku/bu4Z5sY1lVO+xlioIo8jMdIy4bDmN9tauHBxMYGQJhTWtozcwbip+vz+E9QUZlGSmxGZPSrmJ5mhKuY9q77bSolY+ecD7fYtezfoDdA3EpjyZqplaXkuh06cXDETDmv+/ve7+dLvdvMXP3mLN48Zt6qSrZSxLK/Io3fYz5+P9sio/QwgwV3Me0cnlBPWFWeT6UyLpCfs0GyWQcaz8ER9WS4DowG6h8YqdkJhzRd/u5PHtrbwwQ219I34+fqTRk2+bSN3M6C39Y+yvFxups53EtzFvHf4hIeqfBc5Zu9zpyONZeUza78bi1XjXhvHyL2+zCxLjErNPLOng9+/3cbnr63nO+8/O9LxsTQvk7w4G4RNJ3q0vrzCnm8DYu6S4C7mvSNdQywtH5+GOKsyj/3tNo7ce62R+/SjbOvG7tGom6oHOgZJU/CpK5cAcNXyMv79L9bzv65fbts5FudmRvLsUgY5/0lwF7NqxB/kg//1Jj9/szEl+w+bN06XTshbr6hw0z3kp8sT32SiqWitOdo1RHFORlwrI5W7M8nLdI6rmDnWPUxtUTaZTkdk2w2rK7htQ23S5xdteUUuSsGychm5z3fT/0sUIkW01nz58T281dBLhjONj11UZ/sx2vpH8QbCkdGyZUWlMZI/0DFIaV7iM6SbeoZ5ek8HL+4/wf52D0O+IOfXxddmVinF0vLccbXuDV3DkQ6KqXTj6krys9LJzpD/+vOd/A2LWfPLzc08vr0Nt8tpa/472hHzZupJwT2qYuay+sSC+8sHOrnzoS0ArK5287711Swtz+PKZfHvp74sl5cOGD2VtNY0dA9zweLU9yD/yIUL+ciFC1N+HDH7JLiLWdHYPcw3ntzHlctLuWhxMf/y9AF6h/0U5WTYepwjJyYP7kU5GZS7M9k/g4qZlw92kpPh4JnPXx7XDdTJ1Jfl8eutrfQN+/EGQ4wGQiy2qeRRCJCcu5glf9rTjj8U5p9vXRNZpd7O0kTL3uMDVLhdk35orKhws38Gte4H2j2sqHTPOLCDUesOxjeLBqut7ylIy4gzhwR3MStePtDJqio3VQVZkRK9gylIzexqHYi5xNuKSmOmqD8Yjnt/Wmv2dwxyVmVyk4CsJfcO
nxjiWLcR3E9Fzl2cOSS4i1Ouf8TPtqY+rl5htJcty8ukIDvd9uA+6A1wrHuYtTGC++qqfAIhzaET8R+3rX8UjzcYydnPVFW+0cjrcKeHY13DuNLTqHDbtwKSEBLcxSn36uFuwhquMoO7Uorl5XkcTCDIxmOPuSbompqCSZ9fa27fZb4uHlbLgmRH7mlpiiWluRzpHKKhe4hFJbmyWpGwlQR3ccq9fKCTopyMSHAFY2r8oQ4P4bB93aF3tRlB++wYizPXFmWRn5XO7rb+uPdp3RdYVp58b5b6Miu4D0u+XdhOgrs4pUJhzcaDnVyxrHTcAhTLK9wM+0O09Y/adqzdrQPUFmVRGKMCRynF2TX57GyJf+S+v8NDbVGWLS0Blpbn0j7gpbl3RPLtwnYS3MUptbO1n76RQCQlY7FuqtpZ776ztZ+zqwumfM3ZNfkcOuHBG4ivt/uB9kHOsmnqvtVjJqztaw4mhEWCuzjJH3e10z5g3wg62gv7TpCm4IoJE4fGKmbsKYfsHfbT2jfK2TFuplrWVBcQDGv2xegzo7XmD7uOG/XogRAN3cOsqLQruI/VtcvIXdhNgrsYZ1tTH/f+8m0eeK3B9n37giF+vbWVK5aVkp89Pq2Rm+mkpjDLtpH7rtZ+gJhlkJa1tcbzu2PcVH1xfyef+eV2PvfYDg52eAhrOMumXui1RdlkOI3/gnastiRENJmhKsb53vMHAdh3fOYjaG8gRPuAl5rCrHHriT61s53uIR93X7p40vetqMizrRzSCtZrYtxMtVS4XZTkZk5aMaO15j82HiHDmcarh7rw+o3UjV0jd4dZMdM56D3pw06IZElwFxGbjvXwxpEe3C4n+9oH0VqjVOLlef/yp/08/GYTzjTFsvI8vvv+s1lV5eZnrzewrDyXS5YWT/q+lZVuXjrQyZAvGFd3RcvGg538Zmsr//cDa8nKMLoq7mwdYHFpzrQ3PpVSrK3Jj4z0o73V0Mvbzf184z2reH7fCV4/0k1WuoMFScxMneg9a6ts6UwpxESSlhGAMUr93nOHKMvL5LNX1zMwGuD4gHdG+9pzfJAlpTncc/li+kf8/MVPNvHAaw3sax/krksWxfzA2FBXRFjD2019CR3v2b0d/HF3O199Yg8Arxzq4uWDnVy0ePIPkYnW1ORzpGuIYV9w3Pb/2HiUktwMPnheLd99/9nkZTpZUZk3rsonWZ+6cglfffdK2/YnhEWCuwDgjSM9bG7s5d6rlrJ+YQEA+2eYmjnWNcT5i4r5uxtW8OtPXkR+djrf+tN+inIyuGVddcz3rVtQQJqCrY29CR2vuXeENAW/2dbKvzy9n08/so1l5Xncd+OKuN5/dk0+WsOetrHUzJ62AV491MWdlyzCle6gqiCLX/7lhfzLe9ckdG5CzBYJ7gKtNf/6/EGq8l3cfn5tZJWeWBUkU+kd9tM3EmCJWdpXU5jNr//qIs6pLeCvr16KK90R8715rnRWVrnZ0pjYyL2ld5Qb11Ry6dIS/uuVY7iz0nnw4+fFXYtuTaba3tIf2fbUruOkOxQfvWisPe6amvyk2w4IcapIcBdsPNjF9uZ+PnN1PZlOB7mZTuqKs2e0DN2xCYtRA1TmZ/E/917Cxy9ZNO37NywsYntLX9zNvIKhMMf7R1lYlM0Pbz+H28+r5eG7zqciP/4+LcW5mSwuyRn3jWFLQy9n1xTgtmn9UiFONQnuZzitNd97/hC1RVl8YENNZPtZle4ZjdyPWe1rZzgp5/xFRXgDYfYej2/WaPuAl2BYs6Aom+LcTL79vrNn1BrgvLoitjT2EQ5rvIEQu9sG2FBXmPB+hJgrJLif4Z7bd4LdbQP89dX148oWV1a6aeoZYWjCTcbpHO0eIsORRk3hzCpKrIC6Jc68e0ufsTB1Mr3VreMOjAY40jXEzpZ+AiEd97J5QsxFEtzPcI9ubqamMItbJ9zoXFllLUOX2Oj9aOcwC4uzZ1xRUpbnoq44O+68e0uvEdyTLU88
zwzkWxp7Ix8s5y6Ukbs4fUlwP8Ptb/dwXl0RTsf4fwrW6kiJ5t2PdQ+Ny7fPxIa6IrY29sbVIbKldxRHmqIygRz7ZBYWZ1Oal8mWhl62NPaxvDyPgmx7l/wT4lSS4H4GGxgJ0DHojfR1iVaZ76IgOz2hvHsgFKa5ZyTpJljn1RXSNxLgqHlzdiotfSNU5rtO+nBKlFKK8+oK2dzQy9tNfZJvF6c9Ce5nMGtxjMmCu1KKsyrc7GmLP7g3944QDOukF3q20iE7okoTpzqmXTNGz6sr4viAF48vyPmLJN8uTm/TBnelVK1S6mWl1D6l1F6l1OfM7UVKqeeVUofN3wvN7Uop9SOl1BGl1C6l1PpUX4SYGasD44oYjbAuXlLM7rYBjsfZYz3ZShnLopJcsjMc7I1jElVL7yi1M7x5O9F5UTdQN8jNVHGai2fkHgT+Vmu9ErgQuFcptRK4D3hRa10PvGj+DHAjUG/+ugf4se1nLWxxoMNDnssZc+3Od62tAuBPu9vj2l+kxj3JDoeONMWqKje728aXQ7YPjPL1J/dy3rdeYEdLPyP+IN1DPhYU2xPcV1TkkZPhoCrfRXVBli37FGK2TBvctdbtWuu3zcceYD9QDdwMPGy+7GHgFvPxzcDPtWETUKCUqrT7xM8EfcN+uod89I/4U7L/gx0eVlTkxez1sqgkh1VVbp7aFV9wP9o1REluhi0dDldV5bPv+CAh86bq07vbueK7G3lkUxMDowEe/nMjrX3GN4qaQnsCsdORxkcvquPDFy6c/sVCzHEJ5dyVUnXAOuAtoFxrbf2v7wDKzcfVQEvU21rNbRP3dY9SaqtSamtXV1ei5z3v/XprC+u++Twb/ukFzvnH5/ndtlZb96+15uAJz6T59mjvXlvFzpZ+mntGpt3nsa5h2/qSr6nOZzQQinwbeOjPjVQVuHj5i1dy24Ya/rS7PdKWONka92j33biCe69aatv+hJgtcQd3pVQu8Dvg81rrcclQrbUGElrZWGt9v9Z6g9Z6Q2lp6fRvOMP8cVc71QVZfPPmVSwuzeFnbzRg/DHb4/iAF483GOkjE8tNa4wvXX/YfXzcdm8gxJM7j/OpR7ax7h+fY83XnmVbc59ty8VZi2zsbhtgYDTA1qY+3rmmktqibG7bUIsvGObHG48Cyde4CzEfxdU0WymVjhHYf6G1/r25+YRSqlJr3W6mXTrN7W1AbdTba8xtIk4j/iBvHuvhIxcs5KMX1aGU4iv/s4cdLf2sW5B4id7PXm9gYDTA31y3LLJtupupltqibM6pLeAPO9v59JVjI9pvPLWXX21uoTQvk+tWlpObmU6agg+eVzvF3uK3uCQHV3oau9sGyHCmEQprrjbXXV1Tnc+KijwOdHjISndQHGMBbCHOZPFUyyjgp8B+rfX3op56ErjDfHwH8ETU9o+ZVTMXAgNR6RsRhz8f6cEfDEeC2S3rqsnJcPDIpuaE99Xl8fGdZw7wo5cOj1vlyFrOLp4+LO9eW8W+9sFI3XkorHlmTwc3rankrb+/hu++fy1fffdKvvKuldTPoK/LZJyONFZWutnbNshLBzopyE6PfLAppbhtg/EhsqAoe0YLiggx38WTlrkE+ChwtVJqh/nrncC3geuUUoeBa82fAf4EHAOOAD8BPm3/ac9vLx/sJCfDEam1zs10csu6av6w63jCN1cffKMBfyhMVrqD7z9/KLL9YIeHqnwX+VnT3/y8cXUFAM/s6QCMXud9IwGuW1lOmo0LV0y0pjqfvccHeOVgF5fXl45raXDLumrSHYraIqlqEWIy8VTLvK61Vlrrs7XW55i//qS17tFaX6O1rtdaX6u17jVfr7XW92qtl2it12itt6b+MuYPrTUvH+jk0vqSyOLJAB++YCG+YJjfJnBjddAb4L/fbOLG1RX85WWLeWZvR2RBioMdHpbFudBzVUEWa2sLeHavEdxfPWTcAL+0viTuc5mJVdX5DPtD9Az7I99iLEU5GXz3/Wfz
ySuWpPQchDhdyQzVOebgCQ/HB7wnBbOVVW7OqS3g8e3x3774xaZmPL4gn7piKXdftgi3y8nXn9zLF3+zk8OdQwktPHHj6gp2tQ7Q2jfCK4e6WFOdT0luZtzvnwlrcWul4IplJ990v3VdjUw2EiIGCe6zQGsdsynWSweM+9JXLi876bnrVpaz9/hgXAsqe7wBfvp6A5fVl7CmJh+3K52/umIJW5v6eHZvBzevreIvL5t+8QzLDauM1MxvtrayvaWfy5eldtQOUF+WS6YzjXW1BRTKTVMhEhL/EvPCNnc+tIXC7Ay+/8Fzxm3v9Hh5bEsLq6rclE8ya/Ty+lL+z7MHee1wF+9dX3PS89H+6Q/76R328YXrzo1s++QVS7hkaQkrK93jUj7xqCvJYUVFHv/5ylFCYc0Vy07+8LGb05HGV961kiUl9pRXCnEmkZH7KeYNhHjjSDdP7GijfWCsZ0vHgJfb/2sTXR4fX33Xyknfu6rKTXFORiTnHctLB07w2NYW/uqKJeNKJx1pinNqCxIO7JYbV1fiC4bJzXSybkHBjPaRqI9euJCLl6b+W4IQ840E9wQEQmECoXBkSvxM7D0+SCCkCWt4dLMxkXdgJMDt979Jp8fHw3edzwWLiyd9b1qa4rL6El493B0zrdMz5ONLv9vNioo8Pn9t/YzPczI3mFUzlywtHrdqkxBi7pH/oXF68I0G6r/8NPVffprlX3maN450z2g/25uNFYbWVOfz6JZmAqEwX31yD619ozx053njOhNO5orlpfQO+yftmHj4hIf3/fjPDIwE+Nfb1pLpdMzoHGNZVp7Lp65cwl9ettjW/Qoh7CfBPU6Pb29jcUkOX3zHMsryMvnOMwdm1A5ge0s/1QVZ/PU19ZwY9PGl3+3iiR3H+ezV9XFVflxWb1SNvHKoc9z21w53cet//JkhX4hf3XMBq6ryEz636Sil+NINK6RCRYjTgAT3OHR6vOxqHeDWddV85up6Pn/tMna1DvDC/s7p3zzBjuZ+1i0o4OoVZVTlu/j9222cXZPPp6+Kr167JDeTVVVuXj00/pvD157cS7k7kyc/cwnnLpTgK8SZToJ7HDYeNG5gXmXWnr93fTV1xdl87/lDca3zaTkx6KWtf5R1CwpxpCnuvmwxuZlO/vUDaxPKYV+xrJRtzX2R2apHu4Y41jXMxy6qo0r6kAshkOAel40HOyl3GyNmMEr0PndtPfvbB3nGnLUZj+3N/QCRSpO7L13Eli9fm3A/lneuqSQU1jy10+jU+MK+EwBcu7J8qrcJIc4gEtynEQiFee1QN1ctLxvXoOo9a6tZUJTNo1tapnj3eNtb+shwpEU+JACyMhK/6bm6Op+VlW5+vdVoRfD8vhOsrHTL6kFCiAgJ7tPY0tiLxxeMpGQsjjTF5ctK2NbYSzAUjmtf25v7WVnltqWK5bYNNexuG+D1w91sa+7jOhm1CyGiSHCfxssHOkl3KC6ZZCLNBYuKGfaH2BPHQs6BUJhdrf22Tf65+ZxqMhxp/N1vd6I1EtyFEOOctu0H3jjSze/fNppo5WQ6+LsbVpCbaf/lbDzYxQWLiifd9wWLjaqUt471cE5tQcx9DHoDfP7RHXgDYS5ZYs9sy8KcDK5bVc4fd7VTme8al+oRQojTcuQeDIX50u928cyedjYd6+HnbzbxwGvHbD9O56CXw51DXBajtW1ZnovFJTm81dAbcx8N3cPc+u9v8MqhLv7x5lVcc5Z9PVmsBSuuPatcFqwQQoxzWgb3p3Ydp7VvlB/evo437rua61eV89PXGhJeyGI6bx7rAeCiJZO3AwBj9L6lsXfSlgSvHOri5n97nd5hP4/cfQEfM5fMs8ulS0v43DX1fCKB7o5CiDPDaRfcw2HNjzceZVl5bqTn+d9ct4whf5CfzGD0rrXmroe28LUn9pwUoN882kOeyznlbM8LFhXj8QbZ3z4+7/7Lt5q588HNVBVk8eRnLp3yA2KmHGmKv7luGQuLpWui
EGK80y64v3igk0MnhvjUlUsiS7ytqHBz05pKHnyjkZ6h6XudR3uroZeXDnTy8JtN/K/f7BwX4P98tIcLFhWPW95tImspvOjUjNaa//PsAc6rK+J3n7qY2qLshM5JCCGSddoF9x9vPEJNYRbvPrtq3PbPX7sMbyDEg280JrS/RzY14XY5+ezVS/n99ja++JudaK1p7RuhuXeEi6cZcVcVZFFblMVbZgoHjBmjfSMB3nduDTkpuMkrhBDTmZPB/YkdbWye5CbltqY+3m7u5xOXLsI5Ybr+0rJcrlhWyu/fbo27JUCXx8ezezt4/7m1/O07lvO5a+p5fHsbz+zp4M2j0+fbLRcsKmZzVN59c4PR+XG6Do9CCJEqcy647zs+yOce3cFt//Umt/3Xm2xrGgvyP3ujgTyXkw+YVSIT3bKumuMD3imrV6L9emsLgZDmwxcuAOCzVy9lZaWbbzy1jxf2n6AoJ4PlcbQGuGJZKf0jAbY1GUF9a2MvJbkZ1BVLOkYIMTvmXHD/2RsNZKU7+N/vXEFTzzB/8ZO32N8+SFv/KM/s6eBD5y+Imep4x8oKcjIcPL69ddrjhMKaX77VzMVLillSmgsYPWP+6dbVnPB4eXbvCS5cXBTJ60/lqhVlZDjTeHpPOwCbG3s5r65IyhOFELNmTgX3Lo+PJ3cc5wMbarjn8iU89dlLyc9K59O/eJv/ePkIWms+dtHCmO/PynBww+pKnt7dgTcQmvJYf9rdTlv/KB++YPz+1i8o5PbzjJH8RXFOOMrNdHJ5fQnP7umgfWCU1r5R6XkuhJhVcyq4/+KtJvyhMB+/uA4wJgn9vw+to7l3hF+81cwNqyuoKZw61XHrumo8viAvTtFr3RsI8Z1nDrCiIi+ydFy0+25cwScuXcS71lTGfe43rK7k+ICXn73eAMD5EtyFELNozgR3XzDEI5uauHpFGYvNNAnABYuLue+GFaQ7FJ+IY3m3i5YUU+7OPCk1MzAawBc0RvMP/7mR1r5RvnLTyknLHPOz0vnKu1ZSmJMR9/lfe1YZzjTFQ39uJDvDwVmVibXxFUIIO82ZOr1v/mEf3UN+7rrk5NmWf3n5Yj54fi1uV/q0+3GkKW5ZV80DrzVwYtBLudvFqD/Etd97BYA7L6njxxuPcuXyUi6N0VZgJgqyM7hoSTGvHe7mgkWFJ1XzCCHEqTQnIlBb/yiPbGrmry5fzCVLJy89jCewWz503gJCYc2jm41e649vb6PL46My38V3nznIsC/I/37nWbace7TrVxkpHimBFELMtjkxcu8d9vOtq5byt+9YZkuFSV1JDpfVl/Crzc18+qol/OyNBlZVuXni3kt4u7kfjzfAsgRXP4rHTWsqeXpPOzedHX+uXgghUkFpHf8aoKmyYNka3XRwl62lg8/s6eCTj2zjIxcu4JFNzXzvtrW8d32NbfsXQojZppTaprXeMNlz06ZllFI/U0p1KqX2RG0rUko9r5Q6bP5eaG5XSqkfKaWOKKV2KaXWx3OCZe5M22vCrz2rjAq3i0c2NVOalymjaSHEGSWenPtDwA0Ttt0HvKi1rgdeNH8GuBGoN3/dA/zYntNMnNORxu3nGzNZP3rhQluWthNCiNPFtDl3rfWrSqm6CZtvBq40Hz8MbAS+ZG7/uTZyPZuUUgVKqUqtdbttZ5yAOy6qo38kwB0X1c3G4YUQYtbMtFqmPCpgdwDWAp7VQEvU61rNbSdRSt2jlNqqlNra1dU1w9OYWmFOBl9/zyrys+OvtBFCiPkg6VJIc5Se8F1ZrfX9WusNWusNpaWlyZ6GEEKIKDMN7ieUUpUA5u/WXP82ILplY425TQghxCk00+D+JHCH+fgO4Imo7R8zq2YuBAZmK98uhBBnsmlvqCqlfoVx87REKdUKfA34NvBrpdTdQBNwm/nyPwHvBI4AI8CdKThnIYQQ04inWuZDMZ66ZpLXauDeZE9KCCFEcuZEbxkhhBD2kuAuhBDzkAR3IYSYh+ZE4zCl
lAc4eAoPmQ8MnMLjlQDdp/B48/36YP5fo1yfvebr9S3UWk86UWhOtPwFDsbqbJYKSqn7tdb3nMLjbZXrs/2Y8/oa5fpsP968vr7JnKlpmadm+wRSbL5fH8z/a5TrO73N+vWdkcFdaz3rf/CpNN+vD+b/Ncr1nd7mwvXNleB+/2yfQIrJ9Z3+5vs1yvXNM3PihqoQQgh7zZWRuxBCCBtJcBdCiHkoZcE9xtqra5VSbyqldiulnlJKuSe8Z4FSakgp9cWobZ9TSu1RSu1VSn0+VeebqESuTylVp5QaVUrtMH/9Z9R7vqWUalFKDc3GdcRi4/U9o5Taaf79/adSak6sd2jj9W1USh2Meq5sNq5nIjuuTymVF7Vth1KqWyn1g1m6pHFs/Pv7oDLWe96rlPrObFxLymitU/ILuBxYD+yJ2rYFuMJ8fBfwzQnv+S3wG+CL5s+rgT1ANkZN/gvA0lSdc6quD6iLft2E/VwIVAJDs31NKbo+t/m7An4H3D7b12bz9W0ENsz29aTq+ibscxtw+Wxfm13XBxQDzUCp+fPDwDWzfW12/UrZyF1r/SrQO2HzMuBV8/HzwPusJ5RStwANwN6o158FvKW1HtFaB4FXgPem6pwTkej1TbGfTXoO9ry38foGzYdOIIMZrNqVCnZd31xl9/UppZYBZcBrtpxgkmy6vsXAYa21tc7nC3G857RxqnPuezEW0Qb4AOaqTUqpXIwFtr8x4fV7gMuUUsVKqWyMXvG1zF2TXp9pkVJqu1LqFaXUZaf+1Gwxo+tTSj2LsVqXB+Pb2Vw107+/B82v+/+glFKn5ExnJpl/n7cDj2lziDtHJXp9R4DlZtrGCdzC3I4vCTnVwf0u4NNKqW1AHuA3t38d+L7WelzeWWu9H/gO8BzwDLADCJ2qk52BWNfXDizQWq8DvgD8Uk2433CamNH1aa2vx0g9ZQJXn9pTTshMru/DWus1wGXmr4+e4nNORDL/Pm8HfnXKznRmEro+rXUf8CngMYxvJI3M7fiSmBTnxeqInatcBmw2H1t/sI1AP8bXrc9M8p5/Bj4927msRK9vkuc2MiFPyxzLudt9feb2jwH/NtvXlcLr+/h8vD5gLXBotq/nFPz93QN8d7avy65fp3TkblUSKKXSgK8A/wmgtb5Ma12nta4DfgD8s9b63ya8ZwFGvv2Xp/KcExHr+pRSpVaViFJqMVAPHJut85ypRK9PKZWrxhZSdwI3AQdm49zjMYPrcyqlSszt6cC7MFKJc1IS/z4/xNwftc/o+qLeUwh8Gnjg1J95aqSsK6SafO3VXKWUtQzf74EH49jV75RSxUAAuFdr3Z+C001Ygtd3OfCPSqkAEAY+qbXuNffzXeAvgGxzPw9orb9+yi4kBjuuTylVDjyplMrESAG+jPkfbrbZdH05wLNmYHdg3JD7ySm8jJjs+vdpug3jftecYeP1/VAptdZ8/I9a60On5AJOAWk/IIQQ85DMUBVCiHlIgrsQQsxDEtyFEGIekuAuhBDzkAR3IYSYhyS4CxGDUurrKqpDqRCnEwnuQggxD0lwFyKKUurLSqlDSqnXgeXmtr9WSu0z+34/OsunKERcUjZDVYjTjVLqXIwGWedg/N94G6OH+X3AIq21TylVMGsnKEQCZOQuxJjLgMe1sX7AIPCkuX0X8Aul1EeA4KydnRAJkOAuxPRuAv4dY+WfLWYTNCHmNAnuQox5FbhFKZWllMoD3o3xf6RWa/0yxoIy+UDuLJ6jEHGREYgQJq3120qpx4CdGCtHbcFYFvARpVQ+xjqwP5ornUmFmIp0hRRCiHlI0jJCCDEPSXAXQoh5SIK7EELMQxLchRBiHpLgLoQQ85AEdyGEmIckuAshxDz0/wGOa+jMLGt66wAAAABJRU5ErkJggg==\n",
210 | "text/plain": [
211 | ""
212 | ]
213 | },
214 | "metadata": {
215 | "needs_background": "light"
216 | },
217 | "output_type": "display_data"
218 | }
219 | ],
220 | "source": [
221 | "df.plot()"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "id": "aaba673e",
228 | "metadata": {},
229 | "outputs": [],
230 | "source": []
231 | }
232 | ],
233 | "metadata": {
234 | "kernelspec": {
235 | "display_name": "Python 3 (ipykernel)",
236 | "language": "python",
237 | "name": "python3"
238 | },
239 | "language_info": {
240 | "codemirror_mode": {
241 | "name": "ipython",
242 | "version": 3
243 | },
244 | "file_extension": ".py",
245 | "mimetype": "text/x-python",
246 | "name": "python",
247 | "nbconvert_exporter": "python",
248 | "pygments_lexer": "ipython3",
249 | "version": "3.10.5"
250 | },
251 | "toc": {
252 | "base_numbering": 1,
253 | "nav_menu": {},
254 | "number_sections": true,
255 | "sideBar": true,
256 | "skip_h1_title": false,
257 | "title_cell": "Table of Contents",
258 | "title_sidebar": "Contents",
259 | "toc_cell": false,
260 | "toc_position": {},
261 | "toc_section_display": true,
262 | "toc_window_display": false
263 | }
264 | },
265 | "nbformat": 4,
266 | "nbformat_minor": 5
267 | }
268 |
--------------------------------------------------------------------------------
/01-Create-Datasets/05-create-electricity-demand-dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Electricity Demand in Victoria, Australia \n",
8 | "\n",
9 | "In this notebook we will prepare and store the electricity demand dataset found [here](https://github.com/tidyverts/tsibbledata/tree/master/data-raw/vic_elec/VIC2015).\n",
10 | "\n",
11 | "**Citation:**\n",
12 | "\n",
13 | "Godahewa, Rakshitha, Bergmeir, Christoph, Webb, Geoff, Hyndman, Rob, & Montero-Manso, Pablo. (2021). Australian Electricity Demand Dataset (Version 1) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.4659727\n",
14 | "\n",
15 | "**Description of data:**\n",
16 | "\n",
17 | "A description of the data can be found [here](https://rdrr.io/cran/tsibbledata/man/vic_elec.html). The data contains electricity demand in Victoria, Australia, at 30 minute intervals over a period of 12 years, from 2002 to early 2015. There is also the temperature in Melbourne at 30 minute intervals and public holiday dates."
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "# Download the data via the URL below and pandas"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 1,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import pandas as pd\n",
34 | "import numpy as np"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 2,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# Electricity demand.\n",
44 | "url = \"https://raw.githubusercontent.com/tidyverts/tsibbledata/master/data-raw/vic_elec/VIC2015/demand.csv\"\n",
45 | "demand = pd.read_csv(url)\n",
46 | "\n",
47 | "# Temperature of Melbourne (BOM site 086071).\n",
48 | "url = \"https://raw.githubusercontent.com/tidyverts/tsibbledata/master/data-raw/vic_elec/VIC2015/temperature.csv\"\n",
49 | "temp = pd.read_csv(url)\n",
50 | "df = demand.merge(temp, on=[\"Date\", \"Period\"], how=\"left\")"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 3,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "# Public holidays in Australia\n",
60 | "url = \"https://raw.githubusercontent.com/tidyverts/tsibbledata/master/data-raw/vic_elec/VIC2015/holidays.txt\"\n",
61 | "holidays = pd.read_csv(url, header=None, parse_dates=[0], dayfirst=True)\n",
62 | "holidays.columns = [\"date\"]"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "# Process and save the data"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "We will only use the `OperationalLessIndustrial` demand. So let's drop `Industrial`."
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 4,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "df.drop(columns=[\"Industrial\"], inplace=True)"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "Let's extract the date and date-time."
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 5,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "# Convert the integer Date to an actual date with datetime type\n",
102 | "df[\"date\"] = df[\"Date\"].apply(\n",
103 | " lambda x: pd.Timestamp(\"1899-12-30\") + pd.Timedelta(x, unit=\"days\")\n",
104 | ")\n",
105 | "\n",
106 | "# Create a timestamp from the integer Period representing 30 minute intervals\n",
107 | "df[\"date_time\"] = df[\"date\"] + pd.to_timedelta((df[\"Period\"] - 1) * 30, unit=\"m\")"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "Drop the null rows."
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 6,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "df.dropna(inplace=True)"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "Create holidays column."
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 7,
136 | "metadata": {
137 | "tags": []
138 | },
139 | "outputs": [],
140 | "source": [
141 | "holidays[\"is_holiday\"] = 1\n",
142 | "df = df.merge(holidays, on=[\"date\"], how=\"left\")\n",
143 | "df[\"is_holiday\"] = df[\"is_holiday\"].fillna(0).astype(int)"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "We now keep the timestamp, the electricity demand, the temperature and the holiday flag, and resample to hourly."
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 8,
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "data": {
160 | "text/html": [
161 | "\n",
162 | "\n",
175 | "
\n",
176 | " \n",
177 | " \n",
178 | " \n",
179 | " demand \n",
180 | " temperature \n",
181 | " is_holiday \n",
182 | " \n",
183 | " \n",
184 | " date_time \n",
185 | " \n",
186 | " \n",
187 | " \n",
188 | " \n",
189 | " \n",
190 | " \n",
191 | " \n",
192 | " 2002-01-01 00:00:00 \n",
193 | " 6919.366092 \n",
194 | " 32.6 \n",
195 | " 1 \n",
196 | " \n",
197 | " \n",
198 | " 2002-01-01 01:00:00 \n",
199 | " 7165.974188 \n",
200 | " 32.6 \n",
201 | " 1 \n",
202 | " \n",
203 | " \n",
204 | " 2002-01-01 02:00:00 \n",
205 | " 6406.542994 \n",
206 | " 32.6 \n",
207 | " 1 \n",
208 | " \n",
209 | " \n",
210 | " 2002-01-01 03:00:00 \n",
211 | " 5815.537828 \n",
212 | " 32.6 \n",
213 | " 1 \n",
214 | " \n",
215 | " \n",
216 | " 2002-01-01 04:00:00 \n",
217 | " 5497.732922 \n",
218 | " 32.6 \n",
219 | " 1 \n",
220 | " \n",
221 | " \n",
222 | "
\n",
223 | "
"
224 | ],
225 | "text/plain": [
226 | " demand temperature is_holiday\n",
227 | "date_time \n",
228 | "2002-01-01 00:00:00 6919.366092 32.6 1\n",
229 | "2002-01-01 01:00:00 7165.974188 32.6 1\n",
230 | "2002-01-01 02:00:00 6406.542994 32.6 1\n",
231 | "2002-01-01 03:00:00 5815.537828 32.6 1\n",
232 | "2002-01-01 04:00:00 5497.732922 32.6 1"
233 | ]
234 | },
235 | "execution_count": 8,
236 | "metadata": {},
237 | "output_type": "execute_result"
238 | }
239 | ],
240 | "source": [
241 | "# Select the columns of interest and rename them.\n",
242 | "timeseries = df[[\"date_time\", \"OperationalLessIndustrial\", \"Temp\", \"is_holiday\"]]\n",
243 | "\n",
244 | "timeseries.columns = [\"date_time\", \"demand\", \"temperature\", \"is_holiday\"]\n",
245 | "\n",
246 | "# Resample the 30-minute data to hourly.\n",
247 | "timeseries = (\n",
248 | "    timeseries.set_index(\"date_time\")\n",
249 | "    .resample(\"H\")\n",
250 | "    .agg(\n",
251 | "        {\n",
252 | "            \"demand\": \"sum\",\n",
253 | "            \"temperature\": \"mean\",\n",
254 | "            \"is_holiday\": \"min\",\n",
255 | "        }\n",
256 | "    )\n",
257 | ")\n",
258 | "timeseries.head()"
259 | ]
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "metadata": {},
264 | "source": [
265 | "Save the timeseries in the datasets folder."
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": 9,
271 | "metadata": {},
272 | "outputs": [],
273 | "source": [
274 | "timeseries.to_csv(\"../Datasets/victoria_electricity_demand.csv\")"
275 | ]
276 | }
277 | ],
278 | "metadata": {
279 | "kernelspec": {
280 | "display_name": "Python 3 (ipykernel)",
281 | "language": "python",
282 | "name": "python3"
283 | },
284 | "language_info": {
285 | "codemirror_mode": {
286 | "name": "ipython",
287 | "version": 3
288 | },
289 | "file_extension": ".py",
290 | "mimetype": "text/x-python",
291 | "name": "python",
292 | "nbconvert_exporter": "python",
293 | "pygments_lexer": "ipython3",
294 | "version": "3.8.7"
295 | },
296 | "toc": {
297 | "base_numbering": 1,
298 | "nav_menu": {},
299 | "number_sections": true,
300 | "sideBar": true,
301 | "skip_h1_title": false,
302 | "title_cell": "Table of Contents",
303 | "title_sidebar": "Contents",
304 | "toc_cell": false,
305 | "toc_position": {},
306 | "toc_section_display": true,
307 | "toc_window_display": true
308 | }
309 | },
310 | "nbformat": 4,
311 | "nbformat_minor": 4
312 | }
313 |
--------------------------------------------------------------------------------
/09-Trend-Features/images/forecast_with_just_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/forecast_with_just_time.png
--------------------------------------------------------------------------------
/09-Trend-Features/images/recursive_forecasting/Slide1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/recursive_forecasting/Slide1.png
--------------------------------------------------------------------------------
/09-Trend-Features/images/recursive_forecasting/Slide2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/recursive_forecasting/Slide2.png
--------------------------------------------------------------------------------
/09-Trend-Features/images/recursive_forecasting/Slide3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/recursive_forecasting/Slide3.png
--------------------------------------------------------------------------------
/09-Trend-Features/images/recursive_forecasting/Slide4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/09-Trend-Features/images/recursive_forecasting/Slide4.png
--------------------------------------------------------------------------------
/11-Time-Features/02-Extracting-time-related-features.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Time features from the datetime variable\n",
8 | "\n",
9 | "[Feature Engineering for Time Series Forecasting](https://www.trainindata.com/p/feature-engineering-for-forecasting)\n",
10 | "\n",
11 | "Time series data are, by definition, time-indexed. The \"time\" component has information about the date and time. We can extract a number of features from the time component of the index.\n",
12 | "\n",
13 | "In this notebook, we will see how we can easily derive many time-related features.\n",
14 | "\n",
15 | "\n",
16 | "## Features from the time part:\n",
17 | "\n",
18 | "Below are some of the features that we can extract off-the-shelf using [pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-date-components):\n",
19 | "\n",
20 | "- pandas.Series.dt.hour\n",
21 | "- pandas.Series.dt.minute\n",
22 | "- pandas.Series.dt.second\n",
23 | "- pandas.Series.dt.microsecond\n",
24 | "- pandas.Series.dt.nanosecond\n",
25 | "\n",
26 | "\n",
27 | "## The dataset\n",
28 | "\n",
29 | "We will use the Online Retail II Data Set available in the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/machine-learning-databases/00502/).\n",
30 | "\n",
31 | "Download the xlsx file from the link above and save it in the **Datasets** folder within this repo.\n",
32 | "\n",
33 | "**Citation**:\n",
34 | "\n",
35 | "Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.\n",
36 | "\n",
37 | "## In this demo\n",
38 | "\n",
39 | "We will extract different time-related features from the datetime variable: **InvoiceDate**"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 1,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "import numpy as np\n",
49 | "import pandas as pd\n",
50 | "import matplotlib.pyplot as plt"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "## Load the data"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 2,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | "(1067371, 8)\n"
70 | ]
71 | },
72 | {
73 | "data": {
74 | "text/html": [
75 | "\n",
76 | "\n",
89 | "
\n",
90 | " \n",
91 | " \n",
92 | " \n",
93 | " Invoice \n",
94 | " StockCode \n",
95 | " Description \n",
96 | " Quantity \n",
97 | " InvoiceDate \n",
98 | " Price \n",
99 | " Customer ID \n",
100 | " Country \n",
101 | " \n",
102 | " \n",
103 | " \n",
104 | " \n",
105 | " 0 \n",
106 | " 489434 \n",
107 | " 85048 \n",
108 | " 15CM CHRISTMAS GLASS BALL 20 LIGHTS \n",
109 | " 12 \n",
110 | " 2009-12-01 07:45:00 \n",
111 | " 6.95 \n",
112 | " 13085.0 \n",
113 | " United Kingdom \n",
114 | " \n",
115 | " \n",
116 | " 1 \n",
117 | " 489434 \n",
118 | " 79323P \n",
119 | " PINK CHERRY LIGHTS \n",
120 | " 12 \n",
121 | " 2009-12-01 07:45:00 \n",
122 | " 6.75 \n",
123 | " 13085.0 \n",
124 | " United Kingdom \n",
125 | " \n",
126 | " \n",
127 | " 2 \n",
128 | " 489434 \n",
129 | " 79323W \n",
130 | " WHITE CHERRY LIGHTS \n",
131 | " 12 \n",
132 | " 2009-12-01 07:45:00 \n",
133 | " 6.75 \n",
134 | " 13085.0 \n",
135 | " United Kingdom \n",
136 | " \n",
137 | " \n",
138 | " 3 \n",
139 | " 489434 \n",
140 | " 22041 \n",
141 | " RECORD FRAME 7\" SINGLE SIZE \n",
142 | " 48 \n",
143 | " 2009-12-01 07:45:00 \n",
144 | " 2.10 \n",
145 | " 13085.0 \n",
146 | " United Kingdom \n",
147 | " \n",
148 | " \n",
149 | " 4 \n",
150 | " 489434 \n",
151 | " 21232 \n",
152 | " STRAWBERRY CERAMIC TRINKET BOX \n",
153 | " 24 \n",
154 | " 2009-12-01 07:45:00 \n",
155 | " 1.25 \n",
156 | " 13085.0 \n",
157 | " United Kingdom \n",
158 | " \n",
159 | " \n",
160 | "
\n",
161 | "
"
162 | ],
163 | "text/plain": [
164 | " Invoice StockCode Description Quantity \\\n",
165 | "0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 \n",
166 | "1 489434 79323P PINK CHERRY LIGHTS 12 \n",
167 | "2 489434 79323W WHITE CHERRY LIGHTS 12 \n",
168 | "3 489434 22041 RECORD FRAME 7\" SINGLE SIZE 48 \n",
169 | "4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 \n",
170 | "\n",
171 | " InvoiceDate Price Customer ID Country \n",
172 | "0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom \n",
173 | "1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n",
174 | "2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n",
175 | "3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom \n",
176 | "4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom "
177 | ]
178 | },
179 | "execution_count": 2,
180 | "metadata": {},
181 | "output_type": "execute_result"
182 | }
183 | ],
184 | "source": [
185 | "# File path:\n",
186 | "file = \"../Datasets/online_retail_II.xlsx\"\n",
187 | "\n",
188 | "# The data is provided as two sheets in a single Excel file.\n",
189 | "# Each sheet contains a different time period.\n",
190 | "# Load both and join them into a single dataframe\n",
191 | "# as shown below:\n",
192 | "\n",
193 | "df_1 = pd.read_excel(file, sheet_name=\"Year 2009-2010\")\n",
194 | "df_2 = pd.read_excel(file, sheet_name=\"Year 2010-2011\")\n",
195 | "\n",
196 | "data = pd.concat([df_1, df_2])\n",
197 | "\n",
198 | "print(data.shape)\n",
199 | "\n",
200 | "data.head()"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 | "In this dataset, we have the datetime variable in a column called InvoiceDate. We could also have it in the dataframe index. The procedure for extracting the date and time features is identical. That is, we would use the methods from pandas dt as shown below.\n",
208 | "\n",
209 | "The dataset contains sales information for different customers in different countries. Customers may have made one or multiple purchases from the business that provided the data.\n",
210 | "\n",
211 | "## Variable format"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 3,
217 | "metadata": {},
218 | "outputs": [
219 | {
220 | "data": {
221 | "text/plain": [
222 | "dtype('\n",
254 | "\n",
267 | "\n",
268 | " \n",
269 | " \n",
270 | " \n",
271 | " date \n",
272 | " InvoiceDate \n",
273 | " \n",
274 | " \n",
275 | " \n",
276 | " \n",
277 | " 0 \n",
278 | " 2009-12-01 07:45:00 \n",
279 | " 2009-12-01 07:45:00 \n",
280 | " \n",
281 | " \n",
282 | " 1 \n",
283 | " 2009-12-01 07:45:00 \n",
284 | " 2009-12-01 07:45:00 \n",
285 | " \n",
286 | " \n",
287 | " 2 \n",
288 | " 2009-12-01 07:45:00 \n",
289 | " 2009-12-01 07:45:00 \n",
290 | " \n",
291 | " \n",
292 | " 3 \n",
293 | " 2009-12-01 07:45:00 \n",
294 | " 2009-12-01 07:45:00 \n",
295 | " \n",
296 | " \n",
297 | " 4 \n",
298 | " 2009-12-01 07:45:00 \n",
299 | " 2009-12-01 07:45:00 \n",
300 | " \n",
301 | " \n",
302 | "
\n",
303 | ""
304 | ],
305 | "text/plain": [
306 | " date InvoiceDate\n",
307 | "0 2009-12-01 07:45:00 2009-12-01 07:45:00\n",
308 | "1 2009-12-01 07:45:00 2009-12-01 07:45:00\n",
309 | "2 2009-12-01 07:45:00 2009-12-01 07:45:00\n",
310 | "3 2009-12-01 07:45:00 2009-12-01 07:45:00\n",
311 | "4 2009-12-01 07:45:00 2009-12-01 07:45:00"
312 | ]
313 | },
314 | "execution_count": 4,
315 | "metadata": {},
316 | "output_type": "execute_result"
317 | }
318 | ],
319 | "source": [
320 | "# This is how we parse date strings into datetime format.\n",
321 | "\n",
322 | "data[\"date\"] = pd.to_datetime(data[\"InvoiceDate\"])\n",
323 | "\n",
324 | "data[[\"date\", \"InvoiceDate\"]].head()"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {},
330 | "source": [
331 | "## Extract the time part"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": 5,
337 | "metadata": {},
338 | "outputs": [
339 | {
340 | "data": {
341 | "text/plain": [
342 | "0 07:45:00\n",
343 | "1 07:45:00\n",
344 | "2 07:45:00\n",
345 | "3 07:45:00\n",
346 | "4 07:45:00\n",
347 | "Name: time_part, dtype: object"
348 | ]
349 | },
350 | "execution_count": 5,
351 | "metadata": {},
352 | "output_type": "execute_result"
353 | }
354 | ],
355 | "source": [
356 | "# Extract time part.\n",
357 | "\n",
358 | "# (We would normally not use this as a predictive feature,\n",
359 | "# but it might be handy for data analysis).\n",
360 | "\n",
361 | "data[\"time_part\"] = data[\"date\"].dt.time\n",
362 | "\n",
363 | "data[\"time_part\"].head()"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {},
369 | "source": [
370 | "### Extract the hour, minute, and second"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 6,
376 | "metadata": {},
377 | "outputs": [
378 | {
379 | "data": {
380 | "text/html": [
381 | "\n",
382 | "\n",
395 | "
\n",
396 | " \n",
397 | " \n",
398 | " \n",
399 | " Invoice \n",
400 | " StockCode \n",
401 | " Description \n",
402 | " Quantity \n",
403 | " InvoiceDate \n",
404 | " Price \n",
405 | " Customer ID \n",
406 | " Country \n",
407 | " date \n",
408 | " time_part \n",
409 | " hour \n",
410 | " min \n",
411 | " sec \n",
412 | " microsec \n",
413 | " nanosec \n",
414 | " \n",
415 | " \n",
416 | " \n",
417 | " \n",
418 | " 0 \n",
419 | " 489434 \n",
420 | " 85048 \n",
421 | " 15CM CHRISTMAS GLASS BALL 20 LIGHTS \n",
422 | " 12 \n",
423 | " 2009-12-01 07:45:00 \n",
424 | " 6.95 \n",
425 | " 13085.0 \n",
426 | " United Kingdom \n",
427 | " 2009-12-01 07:45:00 \n",
428 | " 07:45:00 \n",
429 | " 7 \n",
430 | " 45 \n",
431 | " 0 \n",
432 | " 0 \n",
433 | " 0 \n",
434 | " \n",
435 | " \n",
436 | " 1 \n",
437 | " 489434 \n",
438 | " 79323P \n",
439 | " PINK CHERRY LIGHTS \n",
440 | " 12 \n",
441 | " 2009-12-01 07:45:00 \n",
442 | " 6.75 \n",
443 | " 13085.0 \n",
444 | " United Kingdom \n",
445 | " 2009-12-01 07:45:00 \n",
446 | " 07:45:00 \n",
447 | " 7 \n",
448 | " 45 \n",
449 | " 0 \n",
450 | " 0 \n",
451 | " 0 \n",
452 | " \n",
453 | " \n",
454 | " 2 \n",
455 | " 489434 \n",
456 | " 79323W \n",
457 | " WHITE CHERRY LIGHTS \n",
458 | " 12 \n",
459 | " 2009-12-01 07:45:00 \n",
460 | " 6.75 \n",
461 | " 13085.0 \n",
462 | " United Kingdom \n",
463 | " 2009-12-01 07:45:00 \n",
464 | " 07:45:00 \n",
465 | " 7 \n",
466 | " 45 \n",
467 | " 0 \n",
468 | " 0 \n",
469 | " 0 \n",
470 | " \n",
471 | " \n",
472 | " 3 \n",
473 | " 489434 \n",
474 | " 22041 \n",
475 | " RECORD FRAME 7\" SINGLE SIZE \n",
476 | " 48 \n",
477 | " 2009-12-01 07:45:00 \n",
478 | " 2.10 \n",
479 | " 13085.0 \n",
480 | " United Kingdom \n",
481 | " 2009-12-01 07:45:00 \n",
482 | " 07:45:00 \n",
483 | " 7 \n",
484 | " 45 \n",
485 | " 0 \n",
486 | " 0 \n",
487 | " 0 \n",
488 | " \n",
489 | " \n",
490 | " 4 \n",
491 | " 489434 \n",
492 | " 21232 \n",
493 | " STRAWBERRY CERAMIC TRINKET BOX \n",
494 | " 24 \n",
495 | " 2009-12-01 07:45:00 \n",
496 | " 1.25 \n",
497 | " 13085.0 \n",
498 | " United Kingdom \n",
499 | " 2009-12-01 07:45:00 \n",
500 | " 07:45:00 \n",
501 | " 7 \n",
502 | " 45 \n",
503 | " 0 \n",
504 | " 0 \n",
505 | " 0 \n",
506 | " \n",
507 | " \n",
508 | "
\n",
509 | "
"
510 | ],
511 | "text/plain": [
512 | " Invoice StockCode Description Quantity \\\n",
513 | "0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 \n",
514 | "1 489434 79323P PINK CHERRY LIGHTS 12 \n",
515 | "2 489434 79323W WHITE CHERRY LIGHTS 12 \n",
516 | "3 489434 22041 RECORD FRAME 7\" SINGLE SIZE 48 \n",
517 | "4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 \n",
518 | "\n",
519 | " InvoiceDate Price Customer ID Country date \\\n",
520 | "0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom 2009-12-01 07:45:00 \n",
521 | "1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom 2009-12-01 07:45:00 \n",
522 | "2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom 2009-12-01 07:45:00 \n",
523 | "3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom 2009-12-01 07:45:00 \n",
524 | "4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom 2009-12-01 07:45:00 \n",
525 | "\n",
526 | " time_part hour min sec microsec nanosec \n",
527 | "0 07:45:00 7 45 0 0 0 \n",
528 | "1 07:45:00 7 45 0 0 0 \n",
529 | "2 07:45:00 7 45 0 0 0 \n",
530 | "3 07:45:00 7 45 0 0 0 \n",
531 | "4 07:45:00 7 45 0 0 0 "
532 | ]
533 | },
534 | "execution_count": 6,
535 | "metadata": {},
536 | "output_type": "execute_result"
537 | }
538 | ],
539 | "source": [
540 | "data[\"hour\"] = data[\"date\"].dt.hour\n",
541 | "data[\"min\"] = data[\"date\"].dt.minute\n",
542 | "data[\"sec\"] = data[\"date\"].dt.second\n",
543 | "\n",
544 | "# We do not have micro and nano seconds in this dataset,\n",
545 | "# but if we did, we can extract them as follows:\n",
546 | "\n",
547 | "data[\"microsec\"] = data[\"date\"].dt.microsecond\n",
548 | "data[\"nanosec\"] = data[\"date\"].dt.nanosecond\n",
549 | "\n",
550 | "data.head()"
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "metadata": {},
556 | "source": [
557 | "### Extract the hour, minute, and second at the same time"
558 | ]
559 | },
560 | {
561 | "cell_type": "code",
562 | "execution_count": 7,
563 | "metadata": {},
564 | "outputs": [
565 | {
566 | "data": {
567 | "text/html": [
568 | "\n",
569 | "\n",
582 | "
\n",
583 | " \n",
584 | " \n",
585 | " \n",
586 | " Invoice \n",
587 | " StockCode \n",
588 | " Description \n",
589 | " Quantity \n",
590 | " InvoiceDate \n",
591 | " Price \n",
592 | " Customer ID \n",
593 | " Country \n",
594 | " date \n",
595 | " time_part \n",
596 | " hour \n",
597 | " min \n",
598 | " sec \n",
599 | " microsec \n",
600 | " nanosec \n",
601 | " h \n",
602 | " m \n",
603 | " s \n",
604 | " \n",
605 | " \n",
606 | " \n",
607 | " \n",
608 | " 0 \n",
609 | " 489434 \n",
610 | " 85048 \n",
611 | " 15CM CHRISTMAS GLASS BALL 20 LIGHTS \n",
612 | " 12 \n",
613 | " 2009-12-01 07:45:00 \n",
614 | " 6.95 \n",
615 | " 13085.0 \n",
616 | " United Kingdom \n",
617 | " 2009-12-01 07:45:00 \n",
618 | " 07:45:00 \n",
619 | " 7 \n",
620 | " 45 \n",
621 | " 0 \n",
622 | " 0 \n",
623 | " 0 \n",
624 | " 7 \n",
625 | " 45 \n",
626 | " 0 \n",
627 | " \n",
628 | " \n",
629 | " 1 \n",
630 | " 489434 \n",
631 | " 79323P \n",
632 | " PINK CHERRY LIGHTS \n",
633 | " 12 \n",
634 | " 2009-12-01 07:45:00 \n",
635 | " 6.75 \n",
636 | " 13085.0 \n",
637 | " United Kingdom \n",
638 | " 2009-12-01 07:45:00 \n",
639 | " 07:45:00 \n",
640 | " 7 \n",
641 | " 45 \n",
642 | " 0 \n",
643 | " 0 \n",
644 | " 0 \n",
645 | " 7 \n",
646 | " 45 \n",
647 | " 0 \n",
648 | " \n",
649 | " \n",
650 | " 2 \n",
651 | " 489434 \n",
652 | " 79323W \n",
653 | " WHITE CHERRY LIGHTS \n",
654 | " 12 \n",
655 | " 2009-12-01 07:45:00 \n",
656 | " 6.75 \n",
657 | " 13085.0 \n",
658 | " United Kingdom \n",
659 | " 2009-12-01 07:45:00 \n",
660 | " 07:45:00 \n",
661 | " 7 \n",
662 | " 45 \n",
663 | " 0 \n",
664 | " 0 \n",
665 | " 0 \n",
666 | " 7 \n",
667 | " 45 \n",
668 | " 0 \n",
669 | " \n",
670 | " \n",
671 | " 3 \n",
672 | " 489434 \n",
673 | " 22041 \n",
674 | " RECORD FRAME 7\" SINGLE SIZE \n",
675 | " 48 \n",
676 | " 2009-12-01 07:45:00 \n",
677 | " 2.10 \n",
678 | " 13085.0 \n",
679 | " United Kingdom \n",
680 | " 2009-12-01 07:45:00 \n",
681 | " 07:45:00 \n",
682 | " 7 \n",
683 | " 45 \n",
684 | " 0 \n",
685 | " 0 \n",
686 | " 0 \n",
687 | " 7 \n",
688 | " 45 \n",
689 | " 0 \n",
690 | " \n",
691 | " \n",
692 | " 4 \n",
693 | " 489434 \n",
694 | " 21232 \n",
695 | " STRAWBERRY CERAMIC TRINKET BOX \n",
696 | " 24 \n",
697 | " 2009-12-01 07:45:00 \n",
698 | " 1.25 \n",
699 | " 13085.0 \n",
700 | " United Kingdom \n",
701 | " 2009-12-01 07:45:00 \n",
702 | " 07:45:00 \n",
703 | " 7 \n",
704 | " 45 \n",
705 | " 0 \n",
706 | " 0 \n",
707 | " 0 \n",
708 | " 7 \n",
709 | " 45 \n",
710 | " 0 \n",
711 | " \n",
712 | " \n",
713 | "
\n",
714 | "
"
715 | ],
716 | "text/plain": [
717 | " Invoice StockCode Description Quantity \\\n",
718 | "0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 \n",
719 | "1 489434 79323P PINK CHERRY LIGHTS 12 \n",
720 | "2 489434 79323W WHITE CHERRY LIGHTS 12 \n",
721 | "3 489434 22041 RECORD FRAME 7\" SINGLE SIZE 48 \n",
722 | "4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 \n",
723 | "\n",
724 | " InvoiceDate Price Customer ID Country date \\\n",
725 | "0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom 2009-12-01 07:45:00 \n",
726 | "1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom 2009-12-01 07:45:00 \n",
727 | "2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom 2009-12-01 07:45:00 \n",
728 | "3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom 2009-12-01 07:45:00 \n",
729 | "4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom 2009-12-01 07:45:00 \n",
730 | "\n",
731 | " time_part hour min sec microsec nanosec h m s \n",
732 | "0 07:45:00 7 45 0 0 0 7 45 0 \n",
733 | "1 07:45:00 7 45 0 0 0 7 45 0 \n",
734 | "2 07:45:00 7 45 0 0 0 7 45 0 \n",
735 | "3 07:45:00 7 45 0 0 0 7 45 0 \n",
736 | "4 07:45:00 7 45 0 0 0 7 45 0 "
737 | ]
738 | },
739 | "execution_count": 7,
740 | "metadata": {},
741 | "output_type": "execute_result"
742 | }
743 | ],
744 | "source": [
745 | "# Now, let's repeat what we did in the previous cell in 1 command.\n",
746 | "\n",
747 | "data[[\"h\", \"m\", \"s\"]] = pd.DataFrame(\n",
748 | " [(x.hour, x.minute, x.second) for x in data[\"date\"]]\n",
749 | ")\n",
750 | "\n",
751 | "data.head()"
752 | ]
753 | },
754 | {
755 | "cell_type": "markdown",
756 | "metadata": {},
757 | "source": [
758 | "## Work with different timezones\n",
759 | "\n",
760 | "In the next few cells, we will see how to work with timestamps that are in different time zones."
761 | ]
762 | },
763 | {
764 | "cell_type": "code",
765 | "execution_count": 8,
766 | "metadata": {},
767 | "outputs": [
768 | {
769 | "data": {
770 | "text/html": [
771 | "\n",
772 | "\n",
785 | "
\n",
786 | " \n",
787 | " \n",
788 | " \n",
789 | " time \n",
790 | " \n",
791 | " \n",
792 | " \n",
793 | " \n",
794 | " 0 \n",
795 | " 2014-08-01 09:00:00+02:00 \n",
796 | " \n",
797 | " \n",
798 | " 1 \n",
799 | " 2014-08-01 10:00:00+02:00 \n",
800 | " \n",
801 | " \n",
802 | " 2 \n",
803 | " 2014-08-01 11:00:00+02:00 \n",
804 | " \n",
805 | " \n",
806 | " 0 \n",
807 | " 2014-08-01 09:00:00-05:00 \n",
808 | " \n",
809 | " \n",
810 | " 1 \n",
811 | " 2014-08-01 10:00:00-05:00 \n",
812 | " \n",
813 | " \n",
814 | " 2 \n",
815 | " 2014-08-01 11:00:00-05:00 \n",
816 | " \n",
817 | " \n",
818 | "
\n",
819 | "
"
820 | ],
821 | "text/plain": [
822 | " time\n",
823 | "0 2014-08-01 09:00:00+02:00\n",
824 | "1 2014-08-01 10:00:00+02:00\n",
825 | "2 2014-08-01 11:00:00+02:00\n",
826 | "0 2014-08-01 09:00:00-05:00\n",
827 | "1 2014-08-01 10:00:00-05:00\n",
828 | "2 2014-08-01 11:00:00-05:00"
829 | ]
830 | },
831 | "execution_count": 8,
832 | "metadata": {},
833 | "output_type": "execute_result"
834 | }
835 | ],
836 | "source": [
837 | "# First, let's create a toy dataframe with some timestamps in different time zones.\n",
838 | "\n",
839 | "df = pd.DataFrame()\n",
840 | "\n",
841 | "df[\"time\"] = pd.concat(\n",
842 | " [\n",
843 | " pd.Series(\n",
844 | " pd.date_range(\n",
845 | " start=\"2014-08-01 09:00\", freq=\"H\", periods=3, tz=\"Europe/Berlin\"\n",
846 | " )\n",
847 | " ),\n",
848 | " pd.Series(\n",
849 | " pd.date_range(\n",
850 | " start=\"2014-08-01 09:00\", freq=\"H\", periods=3, tz=\"US/Central\"\n",
851 | " )\n",
852 | " ),\n",
853 | " ],\n",
854 | " axis=0,\n",
855 | ")\n",
856 | "\n",
857 | "df"
858 | ]
859 | },
860 | {
861 | "cell_type": "markdown",
862 | "metadata": {},
863 | "source": [
864 | "We can see the different time zones indicated by the +02:00 and -05:00 offsets, which are expressed relative to UTC (Coordinated Universal Time)."
865 | ]
866 | },
867 | {
868 | "cell_type": "code",
869 | "execution_count": 9,
870 | "metadata": {},
871 | "outputs": [
872 | {
873 | "data": {
874 | "text/html": [
875 | "\n",
876 | "\n",
889 | "
\n",
890 | " \n",
891 | " \n",
892 | " \n",
893 | " time \n",
894 | " time_utc \n",
895 | " time_london \n",
896 | " \n",
897 | " \n",
898 | " \n",
899 | " \n",
900 | " 0 \n",
901 | " 2014-08-01 09:00:00+02:00 \n",
902 | " 2014-08-01 07:00:00+00:00 \n",
903 | " 2014-08-01 08:00:00+01:00 \n",
904 | " \n",
905 | " \n",
906 | " 1 \n",
907 | " 2014-08-01 10:00:00+02:00 \n",
908 | " 2014-08-01 08:00:00+00:00 \n",
909 | " 2014-08-01 09:00:00+01:00 \n",
910 | " \n",
911 | " \n",
912 | " 2 \n",
913 | " 2014-08-01 11:00:00+02:00 \n",
914 | " 2014-08-01 09:00:00+00:00 \n",
915 | " 2014-08-01 10:00:00+01:00 \n",
916 | " \n",
917 | " \n",
918 | " 0 \n",
919 | " 2014-08-01 09:00:00-05:00 \n",
920 | " 2014-08-01 14:00:00+00:00 \n",
921 | " 2014-08-01 15:00:00+01:00 \n",
922 | " \n",
923 | " \n",
924 | " 1 \n",
925 | " 2014-08-01 10:00:00-05:00 \n",
926 | " 2014-08-01 15:00:00+00:00 \n",
927 | " 2014-08-01 16:00:00+01:00 \n",
928 | " \n",
929 | " \n",
930 | " 2 \n",
931 | " 2014-08-01 11:00:00-05:00 \n",
932 | " 2014-08-01 16:00:00+00:00 \n",
933 | " 2014-08-01 17:00:00+01:00 \n",
934 | " \n",
935 | " \n",
936 | "
\n",
937 | "
"
938 | ],
939 | "text/plain": [
940 | " time time_utc \\\n",
941 | "0 2014-08-01 09:00:00+02:00 2014-08-01 07:00:00+00:00 \n",
942 | "1 2014-08-01 10:00:00+02:00 2014-08-01 08:00:00+00:00 \n",
943 | "2 2014-08-01 11:00:00+02:00 2014-08-01 09:00:00+00:00 \n",
944 | "0 2014-08-01 09:00:00-05:00 2014-08-01 14:00:00+00:00 \n",
945 | "1 2014-08-01 10:00:00-05:00 2014-08-01 15:00:00+00:00 \n",
946 | "2 2014-08-01 11:00:00-05:00 2014-08-01 16:00:00+00:00 \n",
947 | "\n",
948 | " time_london \n",
949 | "0 2014-08-01 08:00:00+01:00 \n",
950 | "1 2014-08-01 09:00:00+01:00 \n",
951 | "2 2014-08-01 10:00:00+01:00 \n",
952 | "0 2014-08-01 15:00:00+01:00 \n",
953 | "1 2014-08-01 16:00:00+01:00 \n",
954 | "2 2014-08-01 17:00:00+01:00 "
955 | ]
956 | },
957 | "execution_count": 9,
958 | "metadata": {},
959 | "output_type": "execute_result"
960 | }
961 | ],
962 | "source": [
963 | "# To work with different time zones, first we unify all\n",
964 | "# timestamps to a common reference, UTC, by setting utc=True.\n",
965 | "\n",
966 | "df[\"time_utc\"] = pd.to_datetime(df[\"time\"], utc=True)\n",
967 | "\n",
968 | "# Next, we change all timestamps to the desired timezone,\n",
969 | "# e.g., Europe/London, as in this example.\n",
970 | "\n",
971 | "df[\"time_london\"] = df[\"time_utc\"].dt.tz_convert(\"Europe/London\")\n",
972 | "\n",
973 | "\n",
974 | "df"
975 | ]
976 | },
977 | {
978 | "cell_type": "markdown",
979 | "metadata": {},
980 | "source": [
981 | "Whether to unify the timezone depends on the use case. If we are forecasting sales for different countries, perhaps it is better to keep each country's respective time zone, since we will treat those series independently.\n",
982 | "\n",
983 | "If we have a small company that sells mostly inland and occasionally sells something abroad, we probably have the local timezone already, but if we do not, we may want to localize the time stamp to our time zone."
984 | ]
985 | },
986 | {
987 | "cell_type": "code",
988 | "execution_count": null,
989 | "metadata": {},
990 | "outputs": [],
991 | "source": []
992 | }
993 | ],
994 | "metadata": {
995 | "kernelspec": {
996 | "display_name": "fets",
997 | "language": "python",
998 | "name": "fets"
999 | },
1000 | "language_info": {
1001 | "codemirror_mode": {
1002 | "name": "ipython",
1003 | "version": 3
1004 | },
1005 | "file_extension": ".py",
1006 | "mimetype": "text/x-python",
1007 | "name": "python",
1008 | "nbconvert_exporter": "python",
1009 | "pygments_lexer": "ipython3",
1010 | "version": "3.8.2"
1011 | },
1012 | "toc": {
1013 | "base_numbering": 1,
1014 | "nav_menu": {},
1015 | "number_sections": true,
1016 | "sideBar": true,
1017 | "skip_h1_title": false,
1018 | "title_cell": "Table of Contents",
1019 | "title_sidebar": "Contents",
1020 | "toc_cell": false,
1021 | "toc_position": {},
1022 | "toc_section_display": "block",
1023 | "toc_window_display": true
1024 | }
1025 | },
1026 | "nbformat": 4,
1027 | "nbformat_minor": 2
1028 | }
1029 |
--------------------------------------------------------------------------------
/12-Categorical-Encoding/3-mean-encoding-simple.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "b699e295",
6 | "metadata": {},
7 | "source": [
8 | "# Mean encoding - simple\n",
9 | "\n",
10 | "[Feature Engineering for Time Series Forecasting](https://www.trainindata.com/p/feature-engineering-for-forecasting)\n",
11 | "\n",
12 | "In this notebook, we will encode static features with mean encoding. We will split the data into train and test sets, learn the mean target value per category using the train set, and then encode both the train and test sets with those learned parameters.\n",
13 | "\n",
14 | "It has the advantage that this logic is implemented by open-source libraries.\n",
15 | "\n",
16 | "The drawback is that we may overfit because we are leaking future data into the past. \n",
17 | "\n",
18 | "We will use the online retail dataset, which we prepared in the notebook `02-create-online-retail-II-datasets.ipynb` located in the `01-Create-Datasets` folder."
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "id": "49b2f0bf",
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "import numpy as np\n",
29 | "import pandas as pd\n",
30 | "from feature_engine.encoding import MeanEncoder"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "id": "5a174f3b",
36 | "metadata": {},
37 | "source": [
38 | "## Load data"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 2,
44 | "id": "67a2af74",
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/html": [
50 | "\n",
51 | "\n",
64 | "
\n",
65 | " \n",
66 | " \n",
67 | " \n",
68 | " country \n",
69 | " quantity \n",
70 | " revenue \n",
71 | " \n",
72 | " \n",
73 | " week \n",
74 | " \n",
75 | " \n",
76 | " \n",
77 | " \n",
78 | " \n",
79 | " \n",
80 | " \n",
81 | " 2009-12-06 \n",
82 | " Belgium \n",
83 | " 143 \n",
84 | " 439.1 \n",
85 | " \n",
86 | " \n",
87 | " 2009-12-13 \n",
88 | " Belgium \n",
89 | " 10 \n",
90 | " 8.5 \n",
91 | " \n",
92 | " \n",
93 | " 2009-12-20 \n",
94 | " Belgium \n",
95 | " 0 \n",
96 | " 0.0 \n",
97 | " \n",
98 | " \n",
99 | " 2009-12-27 \n",
100 | " Belgium \n",
101 | " 0 \n",
102 | " 0.0 \n",
103 | " \n",
104 | " \n",
105 | " 2010-01-03 \n",
106 | " Belgium \n",
107 | " 0 \n",
108 | " 0.0 \n",
109 | " \n",
110 | " \n",
111 | "
\n",
112 | "
"
113 | ],
114 | "text/plain": [
115 | " country quantity revenue\n",
116 | "week \n",
117 | "2009-12-06 Belgium 143 439.1\n",
118 | "2009-12-13 Belgium 10 8.5\n",
119 | "2009-12-20 Belgium 0 0.0\n",
120 | "2009-12-27 Belgium 0 0.0\n",
121 | "2010-01-03 Belgium 0 0.0"
122 | ]
123 | },
124 | "execution_count": 2,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "df = pd.read_csv(\"../Datasets/online_retail_dataset_countries.csv\",\n",
131 | " parse_dates=[\"week\"],\n",
132 | " index_col=\"week\",\n",
133 | " )\n",
134 | "\n",
135 | "df.head()"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "id": "4a419d6a",
141 | "metadata": {},
142 | "source": [
143 | "## Split into train and test"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 3,
149 | "id": "1f4c0763",
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "# Split the data before and after June 2011\n",
154 | "\n",
155 | "X_train = df[df.index <= pd.to_datetime('2011-06-30')]\n",
156 | "X_test = df[df.index > pd.to_datetime('2011-06-30')]\n",
157 | "\n",
158 | "y_train = X_train[\"revenue\"]\n",
159 | "y_test = X_test[\"revenue\"]"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 4,
165 | "id": "928be034",
166 | "metadata": {},
167 | "outputs": [
168 | {
169 | "data": {
170 | "text/plain": [
171 | "(Timestamp('2009-12-06 00:00:00'), Timestamp('2011-06-26 00:00:00'))"
172 | ]
173 | },
174 | "execution_count": 4,
175 | "metadata": {},
176 | "output_type": "execute_result"
177 | }
178 | ],
179 | "source": [
180 | "# sanity check\n",
181 | "\n",
182 | "X_train.index.min(), X_train.index.max()"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 5,
188 | "id": "6e838b49",
189 | "metadata": {},
190 | "outputs": [
191 | {
192 | "data": {
193 | "text/plain": [
194 | "(Timestamp('2011-07-03 00:00:00'), Timestamp('2011-12-11 00:00:00'))"
195 | ]
196 | },
197 | "execution_count": 5,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | }
201 | ],
202 | "source": [
203 | "# sanity check\n",
204 | "\n",
205 | "X_test.index.min(), X_test.index.max()"
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "id": "d5de7aa0",
211 | "metadata": {},
212 | "source": [
213 | "## Encode"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 6,
219 | "id": "2402ebb9",
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "# Set up the mean encoder\n",
224 | "\n",
225 | "enc = MeanEncoder()"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 7,
231 | "id": "74ef4a1a",
232 | "metadata": {},
233 | "outputs": [
234 | {
235 | "data": {
236 | "text/html": [
237 | "MeanEncoder() In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
238 | ],
239 | "text/plain": [
240 | "MeanEncoder()"
241 | ]
242 | },
243 | "execution_count": 7,
244 | "metadata": {},
245 | "output_type": "execute_result"
246 | }
247 | ],
248 | "source": [
249 | "# Find mean target value per category\n",
250 | "# (it uses the entire train set)\n",
251 | "\n",
252 | "enc.fit(X_train, y_train)"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 8,
258 | "id": "1667b70c",
259 | "metadata": {},
260 | "outputs": [
261 | {
262 | "data": {
263 | "text/plain": [
264 | "['country']"
265 | ]
266 | },
267 | "execution_count": 8,
268 | "metadata": {},
269 | "output_type": "execute_result"
270 | }
271 | ],
272 | "source": [
273 | "# Feature-engine's encoder finds categorical variables\n",
274 | "# by default\n",
275 | "\n",
276 | "enc.variables_"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": 9,
282 | "id": "90a34078",
283 | "metadata": {},
284 | "outputs": [
285 | {
286 | "data": {
287 | "text/plain": [
288 | "{'country': {'Belgium': 511.37853658536585,\n",
289 | " 'EIRE': 5579.161829268293,\n",
290 | " 'France': 2872.7475609756098,\n",
291 | " 'Germany': 3764.180012195122,\n",
292 | " 'Spain': 919.3335365853659,\n",
293 | " 'United Kingdom': 129124.83931707316}}"
294 | ]
295 | },
296 | "execution_count": 9,
297 | "metadata": {},
298 | "output_type": "execute_result"
299 | }
300 | ],
301 | "source": [
302 | "# the encoding values\n",
303 | "\n",
304 | "enc.encoder_dict_"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 10,
310 | "id": "2c4cf198",
311 | "metadata": {},
312 | "outputs": [
313 | {
314 | "data": {
315 | "text/html": [
316 | "\n",
317 | "\n",
330 | "
\n",
331 | " \n",
332 | " \n",
333 | " \n",
334 | " country \n",
335 | " quantity \n",
336 | " revenue \n",
337 | " \n",
338 | " \n",
339 | " week \n",
340 | " \n",
341 | " \n",
342 | " \n",
343 | " \n",
344 | " \n",
345 | " \n",
346 | " \n",
347 | " 2009-12-06 \n",
348 | " 511.378537 \n",
349 | " 143 \n",
350 | " 439.1 \n",
351 | " \n",
352 | " \n",
353 | " 2009-12-13 \n",
354 | " 511.378537 \n",
355 | " 10 \n",
356 | " 8.5 \n",
357 | " \n",
358 | " \n",
359 | " 2009-12-20 \n",
360 | " 511.378537 \n",
361 | " 0 \n",
362 | " 0.0 \n",
363 | " \n",
364 | " \n",
365 | " 2009-12-27 \n",
366 | " 511.378537 \n",
367 | " 0 \n",
368 | " 0.0 \n",
369 | " \n",
370 | " \n",
371 | " 2010-01-03 \n",
372 | " 511.378537 \n",
373 | " 0 \n",
374 | " 0.0 \n",
375 | " \n",
376 | " \n",
377 | "
\n",
378 | "
"
379 | ],
380 | "text/plain": [
381 | " country quantity revenue\n",
382 | "week \n",
383 | "2009-12-06 511.378537 143 439.1\n",
384 | "2009-12-13 511.378537 10 8.5\n",
385 | "2009-12-20 511.378537 0 0.0\n",
386 | "2009-12-27 511.378537 0 0.0\n",
387 | "2010-01-03 511.378537 0 0.0"
388 | ]
389 | },
390 | "execution_count": 10,
391 | "metadata": {},
392 | "output_type": "execute_result"
393 | }
394 | ],
395 | "source": [
396 | "# Encode datasets\n",
397 | "\n",
398 | "X_train_t = enc.transform(X_train)\n",
399 | "X_test_t = enc.transform(X_test)\n",
400 | "\n",
401 | "X_train_t.head()"
402 | ]
403 | },
404 | {
405 | "cell_type": "markdown",
406 | "id": "85599ce7",
407 | "metadata": {},
408 | "source": [
409 | "Note that Belgium was replaced by 511.37 in all rows, even though on various occasions the revenue was 0. This may result in a \"look ahead\" bias."
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": null,
415 | "id": "60a6c207",
416 | "metadata": {},
417 | "outputs": [],
418 | "source": []
419 | }
420 | ],
421 | "metadata": {
422 | "kernelspec": {
423 | "display_name": "fsml",
424 | "language": "python",
425 | "name": "fsml"
426 | },
427 | "language_info": {
428 | "codemirror_mode": {
429 | "name": "ipython",
430 | "version": 3
431 | },
432 | "file_extension": ".py",
433 | "mimetype": "text/x-python",
434 | "name": "python",
435 | "nbconvert_exporter": "python",
436 | "pygments_lexer": "ipython3",
437 | "version": "3.10.5"
438 | },
439 | "toc": {
440 | "base_numbering": 1,
441 | "nav_menu": {},
442 | "number_sections": true,
443 | "sideBar": true,
444 | "skip_h1_title": false,
445 | "title_cell": "Table of Contents",
446 | "title_sidebar": "Contents",
447 | "toc_cell": false,
448 | "toc_position": {
449 | "height": "calc(100% - 180px)",
450 | "left": "10px",
451 | "top": "150px",
452 | "width": "165px"
453 | },
454 | "toc_section_display": true,
455 | "toc_window_display": true
456 | }
457 | },
458 | "nbformat": 4,
459 | "nbformat_minor": 5
460 | }
461 |
--------------------------------------------------------------------------------
/12-Categorical-Encoding/4-mean-encoding-expanding-window.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "b699e295",
6 | "metadata": {},
7 | "source": [
8 | "# Mean encoding - expanding window\n",
9 | "\n",
10 | "[Feature Engineering for Time Series Forecasting](https://www.trainindata.com/p/feature-engineering-for-forecasting)\n",
11 | "\n",
12 | "In this notebook, we will encode static features with mean encoding by using expanding windows. This implementation avoids look-ahead bias.\n",
13 | "\n",
14 | "We will use the online retail dataset, which we prepared in the notebook `02-create-online-retail-II-datasets.ipynb` located in the `01-Create-Datasets` folder."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "id": "49b2f0bf",
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import numpy as np\n",
25 | "import pandas as pd"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "id": "5a174f3b",
31 | "metadata": {},
32 | "source": [
33 | "## Load data"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "id": "67a2af74",
40 | "metadata": {},
41 | "outputs": [
42 | {
43 | "data": {
44 | "text/html": [
45 | "\n",
46 | "\n",
59 | "
\n",
60 | " \n",
61 | " \n",
62 | " \n",
63 | " country \n",
64 | " quantity \n",
65 | " revenue \n",
66 | " \n",
67 | " \n",
68 | " week \n",
69 | " \n",
70 | " \n",
71 | " \n",
72 | " \n",
73 | " \n",
74 | " \n",
75 | " \n",
76 | " 2009-12-06 \n",
77 | " Belgium \n",
78 | " 143 \n",
79 | " 439.1 \n",
80 | " \n",
81 | " \n",
82 | " 2009-12-13 \n",
83 | " Belgium \n",
84 | " 10 \n",
85 | " 8.5 \n",
86 | " \n",
87 | " \n",
88 | " 2009-12-20 \n",
89 | " Belgium \n",
90 | " 0 \n",
91 | " 0.0 \n",
92 | " \n",
93 | " \n",
94 | " 2009-12-27 \n",
95 | " Belgium \n",
96 | " 0 \n",
97 | " 0.0 \n",
98 | " \n",
99 | " \n",
100 | " 2010-01-03 \n",
101 | " Belgium \n",
102 | " 0 \n",
103 | " 0.0 \n",
104 | " \n",
105 | " \n",
106 | "
\n",
107 | "
"
108 | ],
109 | "text/plain": [
110 | " country quantity revenue\n",
111 | "week \n",
112 | "2009-12-06 Belgium 143 439.1\n",
113 | "2009-12-13 Belgium 10 8.5\n",
114 | "2009-12-20 Belgium 0 0.0\n",
115 | "2009-12-27 Belgium 0 0.0\n",
116 | "2010-01-03 Belgium 0 0.0"
117 | ]
118 | },
119 | "execution_count": 2,
120 | "metadata": {},
121 | "output_type": "execute_result"
122 | }
123 | ],
124 | "source": [
125 | "df = pd.read_csv(\"../Datasets/online_retail_dataset_countries.csv\",\n",
126 | " parse_dates=[\"week\"],\n",
127 | " index_col=\"week\",\n",
128 | " )\n",
129 | "\n",
130 | "df.head()"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "id": "50846272",
136 | "metadata": {},
137 | "source": [
138 | "## Split into train and test"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 3,
144 | "id": "1f4c0763",
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "# Split the data before and after June 2011\n",
149 | "\n",
150 | "X_train = df[df.index <= pd.to_datetime('2011-06-30')]\n",
151 | "\n",
152 | "# We need the past data for the expanding window.\n",
153 | "X_test = df.copy()\n",
154 | "\n",
155 | "# the target variable\n",
156 | "y_train = X_train[\"revenue\"]\n",
157 | "y_test = X_test[\"revenue\"]"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 4,
163 | "id": "e1418b42",
164 | "metadata": {},
165 | "outputs": [
166 | {
167 | "data": {
168 | "text/plain": [
169 | "(Timestamp('2009-12-06 00:00:00'), Timestamp('2011-06-26 00:00:00'))"
170 | ]
171 | },
172 | "execution_count": 4,
173 | "metadata": {},
174 | "output_type": "execute_result"
175 | }
176 | ],
177 | "source": [
178 | "# sanity check\n",
179 | "\n",
180 | "X_train.index.min(), X_train.index.max()"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 5,
186 | "id": "1faf10f7",
187 | "metadata": {},
188 | "outputs": [
189 | {
190 | "data": {
191 | "text/plain": [
192 | "(Timestamp('2009-12-06 00:00:00'), Timestamp('2011-12-11 00:00:00'))"
193 | ]
194 | },
195 | "execution_count": 5,
196 | "metadata": {},
197 | "output_type": "execute_result"
198 | }
199 | ],
200 | "source": [
201 | "# sanity check\n",
202 | "\n",
203 | "X_test.index.min(), X_test.index.max()"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "id": "d5de7aa0",
209 | "metadata": {},
210 | "source": [
211 | "## Encode countries"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 6,
217 | "id": "931e9ef9",
218 | "metadata": {},
219 | "outputs": [
220 | {
221 | "data": {
222 | "text/html": [
223 | "\n",
224 | "\n",
237 | "
\n",
238 | " \n",
239 | " \n",
240 | " \n",
241 | " country \n",
242 | " week \n",
243 | " country_enc \n",
244 | " \n",
245 | " \n",
246 | " \n",
247 | " \n",
248 | " 0 \n",
249 | " Belgium \n",
250 | " 2009-12-06 \n",
251 | " NaN \n",
252 | " \n",
253 | " \n",
254 | " 1 \n",
255 | " Belgium \n",
256 | " 2009-12-13 \n",
257 | " 439.100000 \n",
258 | " \n",
259 | " \n",
260 | " 2 \n",
261 | " Belgium \n",
262 | " 2009-12-20 \n",
263 | " 223.800000 \n",
264 | " \n",
265 | " \n",
266 | " 3 \n",
267 | " Belgium \n",
268 | " 2009-12-27 \n",
269 | " 149.200000 \n",
270 | " \n",
271 | " \n",
272 | " 4 \n",
273 | " Belgium \n",
274 | " 2010-01-03 \n",
275 | " 111.900000 \n",
276 | " \n",
277 | " \n",
278 | " ... \n",
279 | " ... \n",
280 | " ... \n",
281 | " ... \n",
282 | " \n",
283 | " \n",
284 | " 487 \n",
285 | " United Kingdom \n",
286 | " 2011-05-29 \n",
287 | " 129923.850701 \n",
288 | " \n",
289 | " \n",
290 | " 488 \n",
291 | " United Kingdom \n",
292 | " 2011-06-05 \n",
293 | " 129810.417487 \n",
294 | " \n",
295 | " \n",
296 | " 489 \n",
297 | " United Kingdom \n",
298 | " 2011-06-12 \n",
299 | " 129208.338025 \n",
300 | " \n",
301 | " \n",
302 | " 490 \n",
303 | " United Kingdom \n",
304 | " 2011-06-19 \n",
305 | " 129708.159425 \n",
306 | " \n",
307 | " \n",
308 | " 491 \n",
309 | " United Kingdom \n",
310 | " 2011-06-26 \n",
311 | " 129598.153506 \n",
312 | " \n",
313 | " \n",
314 | "
\n",
315 | "
492 rows × 3 columns
\n",
316 | "
"
317 | ],
318 | "text/plain": [
319 | " country week country_enc\n",
320 | "0 Belgium 2009-12-06 NaN\n",
321 | "1 Belgium 2009-12-13 439.100000\n",
322 | "2 Belgium 2009-12-20 223.800000\n",
323 | "3 Belgium 2009-12-27 149.200000\n",
324 | "4 Belgium 2010-01-03 111.900000\n",
325 | ".. ... ... ...\n",
326 | "487 United Kingdom 2011-05-29 129923.850701\n",
327 | "488 United Kingdom 2011-06-05 129810.417487\n",
328 | "489 United Kingdom 2011-06-12 129208.338025\n",
329 | "490 United Kingdom 2011-06-19 129708.159425\n",
330 | "491 United Kingdom 2011-06-26 129598.153506\n",
331 | "\n",
332 | "[492 rows x 3 columns]"
333 | ]
334 | },
335 | "execution_count": 6,
336 | "metadata": {},
337 | "output_type": "execute_result"
338 | }
339 | ],
340 | "source": [
341 | "# train set first\n",
342 | "\n",
343 | "train_enc = (\n",
344 | " X_train\n",
345 | " .groupby(['country'])['revenue']\n",
346 | " .expanding()\n",
347 | " .mean()\n",
348 | " .shift()\n",
349 | ").reset_index()\n",
350 | "\n",
351 | "train_enc.rename(columns = {\"revenue\": \"country_enc\"}, inplace = True)\n",
352 | "\n",
353 | "train_enc"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 7,
359 | "id": "6d3d07a7",
360 | "metadata": {},
361 | "outputs": [
362 | {
363 | "data": {
364 | "text/html": [
365 | "\n",
366 | "\n",
379 | "
\n",
380 | " \n",
381 | " \n",
382 | " \n",
383 | " week \n",
384 | " country \n",
385 | " quantity \n",
386 | " revenue \n",
387 | " country_enc \n",
388 | " \n",
389 | " \n",
390 | " \n",
391 | " \n",
392 | " 0 \n",
393 | " 2009-12-06 \n",
394 | " Belgium \n",
395 | " 143 \n",
396 | " 439.10 \n",
397 | " NaN \n",
398 | " \n",
399 | " \n",
400 | " 1 \n",
401 | " 2009-12-13 \n",
402 | " Belgium \n",
403 | " 10 \n",
404 | " 8.50 \n",
405 | " 439.100000 \n",
406 | " \n",
407 | " \n",
408 | " 2 \n",
409 | " 2009-12-20 \n",
410 | " Belgium \n",
411 | " 0 \n",
412 | " 0.00 \n",
413 | " 223.800000 \n",
414 | " \n",
415 | " \n",
416 | " 3 \n",
417 | " 2009-12-27 \n",
418 | " Belgium \n",
419 | " 0 \n",
420 | " 0.00 \n",
421 | " 149.200000 \n",
422 | " \n",
423 | " \n",
424 | " 4 \n",
425 | " 2010-01-03 \n",
426 | " Belgium \n",
427 | " 0 \n",
428 | " 0.00 \n",
429 | " 111.900000 \n",
430 | " \n",
431 | " \n",
432 | " ... \n",
433 | " ... \n",
434 | " ... \n",
435 | " ... \n",
436 | " ... \n",
437 | " ... \n",
438 | " \n",
439 | " \n",
440 | " 487 \n",
441 | " 2011-05-29 \n",
442 | " United Kingdom \n",
443 | " 67666 \n",
444 | " 121076.06 \n",
445 | " 129923.850701 \n",
446 | " \n",
447 | " \n",
448 | " 488 \n",
449 | " 2011-06-05 \n",
450 | " United Kingdom \n",
451 | " 44422 \n",
452 | " 82246.14 \n",
453 | " 129810.417487 \n",
454 | " \n",
455 | " \n",
456 | " 489 \n",
457 | " 2011-06-12 \n",
458 | " United Kingdom \n",
459 | " 77850 \n",
460 | " 169194.05 \n",
461 | " 129208.338025 \n",
462 | " \n",
463 | " \n",
464 | " 490 \n",
465 | " 2011-06-19 \n",
466 | " United Kingdom \n",
467 | " 68207 \n",
468 | " 120797.68 \n",
469 | " 129708.159425 \n",
470 | " \n",
471 | " \n",
472 | " 491 \n",
473 | " 2011-06-26 \n",
474 | " United Kingdom \n",
475 | " 57102 \n",
476 | " 90786.39 \n",
477 | " 129598.153506 \n",
478 | " \n",
479 | " \n",
480 | "
\n",
481 | "
492 rows × 5 columns
\n",
482 | "
"
483 | ],
484 | "text/plain": [
485 | " week country quantity revenue country_enc\n",
486 | "0 2009-12-06 Belgium 143 439.10 NaN\n",
487 | "1 2009-12-13 Belgium 10 8.50 439.100000\n",
488 | "2 2009-12-20 Belgium 0 0.00 223.800000\n",
489 | "3 2009-12-27 Belgium 0 0.00 149.200000\n",
490 | "4 2010-01-03 Belgium 0 0.00 111.900000\n",
491 | ".. ... ... ... ... ...\n",
492 | "487 2011-05-29 United Kingdom 67666 121076.06 129923.850701\n",
493 | "488 2011-06-05 United Kingdom 44422 82246.14 129810.417487\n",
494 | "489 2011-06-12 United Kingdom 77850 169194.05 129208.338025\n",
495 | "490 2011-06-19 United Kingdom 68207 120797.68 129708.159425\n",
496 | "491 2011-06-26 United Kingdom 57102 90786.39 129598.153506\n",
497 | "\n",
498 | "[492 rows x 5 columns]"
499 | ]
500 | },
501 | "execution_count": 7,
502 | "metadata": {},
503 | "output_type": "execute_result"
504 | }
505 | ],
506 | "source": [
507 | "# Add encoded variable to original train set\n",
508 | "\n",
509 | "X_train_enc = X_train.reset_index().merge(train_enc)\n",
510 | "\n",
511 | "X_train_enc"
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": 8,
517 | "id": "5f6bf153",
518 | "metadata": {},
519 | "outputs": [
520 | {
521 | "data": {
522 | "text/html": [
523 | "\n",
524 | "\n",
537 | "
\n",
538 | " \n",
539 | " \n",
540 | " \n",
541 | " quantity \n",
542 | " revenue \n",
543 | " country_enc \n",
544 | " \n",
545 | " \n",
546 | " week \n",
547 | " \n",
548 | " \n",
549 | " \n",
550 | " \n",
551 | " \n",
552 | " \n",
553 | " \n",
554 | " 2009-12-06 \n",
555 | " 143 \n",
556 | " 439.1 \n",
557 | " NaN \n",
558 | " \n",
559 | " \n",
560 | " 2009-12-13 \n",
561 | " 10 \n",
562 | " 8.5 \n",
563 | " 439.1 \n",
564 | " \n",
565 | " \n",
566 | " 2009-12-20 \n",
567 | " 0 \n",
568 | " 0.0 \n",
569 | " 223.8 \n",
570 | " \n",
571 | " \n",
572 | " 2009-12-27 \n",
573 | " 0 \n",
574 | " 0.0 \n",
575 | " 149.2 \n",
576 | " \n",
577 | " \n",
578 | " 2010-01-03 \n",
579 | " 0 \n",
580 | " 0.0 \n",
581 | " 111.9 \n",
582 | " \n",
583 | " \n",
584 | "
\n",
585 | "
"
586 | ],
587 | "text/plain": [
588 | " quantity revenue country_enc\n",
589 | "week \n",
590 | "2009-12-06 143 439.1 NaN\n",
591 | "2009-12-13 10 8.5 439.1\n",
592 | "2009-12-20 0 0.0 223.8\n",
593 | "2009-12-27 0 0.0 149.2\n",
594 | "2010-01-03 0 0.0 111.9"
595 | ]
596 | },
597 | "execution_count": 8,
598 | "metadata": {},
599 | "output_type": "execute_result"
600 | }
601 | ],
602 | "source": [
603 | "# Now we drop the static variable\n",
604 | "\n",
605 | "X_train_enc = X_train_enc.drop(\"country\", axis=1)\n",
606 | "\n",
607 |     "# Set the index\n",
608 | "X_train_enc.set_index(\"week\", inplace=True)\n",
609 | "\n",
610 | "X_train_enc.head()"
611 | ]
612 | },
613 | {
614 | "cell_type": "code",
615 | "execution_count": 9,
616 | "id": "2402ebb9",
617 | "metadata": {},
618 | "outputs": [
619 | {
620 | "data": {
621 | "text/html": [
622 | "\n",
623 | "\n",
636 | "
\n",
637 | " \n",
638 | " \n",
639 | " \n",
640 | " quantity \n",
641 | " revenue \n",
642 | " country_enc \n",
643 | " \n",
644 | " \n",
645 | " week \n",
646 | " \n",
647 | " \n",
648 | " \n",
649 | " \n",
650 | " \n",
651 | " \n",
652 | " \n",
653 | " 2011-07-03 \n",
654 | " 103 \n",
655 | " 163.90 \n",
656 | " 511.378537 \n",
657 | " \n",
658 | " \n",
659 | " 2011-07-10 \n",
660 | " 666 \n",
661 | " 1022.82 \n",
662 | " 507.192048 \n",
663 | " \n",
664 | " \n",
665 | " 2011-07-17 \n",
666 | " 13 \n",
667 | " 45.60 \n",
668 | " 513.330476 \n",
669 | " \n",
670 | " \n",
671 | " 2011-07-24 \n",
672 | " 0 \n",
673 | " 0.00 \n",
674 | " 507.827765 \n",
675 | " \n",
676 | " \n",
677 | " 2011-07-31 \n",
678 | " 1000 \n",
679 | " 1407.15 \n",
680 | " 501.922791 \n",
681 | " \n",
682 | " \n",
683 | "
\n",
684 | "
"
685 | ],
686 | "text/plain": [
687 | " quantity revenue country_enc\n",
688 | "week \n",
689 | "2011-07-03 103 163.90 511.378537\n",
690 | "2011-07-10 666 1022.82 507.192048\n",
691 | "2011-07-17 13 45.60 513.330476\n",
692 | "2011-07-24 0 0.00 507.827765\n",
693 | "2011-07-31 1000 1407.15 501.922791"
694 | ]
695 | },
696 | "execution_count": 9,
697 | "metadata": {},
698 | "output_type": "execute_result"
699 | }
700 | ],
701 | "source": [
702 | "# Now we repeat for the test set\n",
703 | "\n",
704 | "# Find the encoding values\n",
705 | "test_enc = (\n",
706 | " X_test\n",
707 | " .groupby(['country'])['revenue']\n",
708 | " .expanding()\n",
709 | " .mean()\n",
710 | " .shift()\n",
711 | ").reset_index()\n",
712 | "\n",
713 | "test_enc.rename(columns = {\"revenue\": \"country_enc\"}, inplace = True)\n",
714 | "\n",
715 | "# join encoded variable\n",
716 | "X_test_enc = X_test.reset_index().merge(test_enc)\n",
717 | "\n",
718 | "# Drop original variable\n",
719 | "X_test_enc = X_test_enc.drop(\"country\", axis=1)\n",
720 | "\n",
721 |     "# Set the index\n",
722 | "X_test_enc.set_index(\"week\", inplace=True)\n",
723 | "\n",
724 | "# Remove data that belongs to the train set\n",
725 | "X_test_enc = X_test_enc[X_test_enc.index > pd.to_datetime('2011-06-30')]\n",
726 | "\n",
727 | "X_test_enc.head()"
728 | ]
729 | },
730 | {
731 | "cell_type": "markdown",
732 | "id": "86a89e3e",
733 | "metadata": {},
734 | "source": [
735 | "That's it!\n",
736 | "\n",
737 | "As you can see, with this way of encoding the static feature, we need to do a lot of the work manually, and we need to be careful to have enough data in the train set, and to split the data correctly after the encoding."
738 | ]
739 | },
740 | {
741 | "cell_type": "code",
742 | "execution_count": null,
743 | "id": "77b803d1",
744 | "metadata": {},
745 | "outputs": [],
746 | "source": []
747 | }
748 | ],
749 | "metadata": {
750 | "kernelspec": {
751 | "display_name": "fsml",
752 | "language": "python",
753 | "name": "fsml"
754 | },
755 | "language_info": {
756 | "codemirror_mode": {
757 | "name": "ipython",
758 | "version": 3
759 | },
760 | "file_extension": ".py",
761 | "mimetype": "text/x-python",
762 | "name": "python",
763 | "nbconvert_exporter": "python",
764 | "pygments_lexer": "ipython3",
765 | "version": "3.10.5"
766 | },
767 | "toc": {
768 | "base_numbering": 1,
769 | "nav_menu": {},
770 | "number_sections": true,
771 | "sideBar": true,
772 | "skip_h1_title": false,
773 | "title_cell": "Table of Contents",
774 | "title_sidebar": "Contents",
775 | "toc_cell": false,
776 | "toc_position": {
777 | "height": "calc(100% - 180px)",
778 | "left": "10px",
779 | "top": "150px",
780 | "width": "173.267px"
781 | },
782 | "toc_section_display": true,
783 | "toc_window_display": true
784 | }
785 | },
786 | "nbformat": 4,
787 | "nbformat_minor": 5
788 | }
789 |
--------------------------------------------------------------------------------
/Appendix/00-pandas-period.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "24d78855-9712-419b-8201-486452f5120a",
6 | "metadata": {},
7 | "source": [
8 | "# Pandas Period"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "bdb234fa-ee2e-403d-a0ff-4b2c0fdced43",
14 | "metadata": {},
15 | "source": [
16 | "[Feature Engineering for Time Series Forecasting](https://www.trainindata.com/p/feature-engineering-for-forecasting)\n",
17 | "\n",
18 | "In this notebook we'll discuss the Pandas `Period` and `PeriodIndex` type to handle time span related data."
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "id": "010aee50-728d-4c24-a6f6-9282a71364c1",
24 | "metadata": {},
25 | "source": [
26 | "# Load example data"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "id": "55065bd0-a3fe-4d4b-970c-ea09d514fb12",
32 | "metadata": {},
33 | "source": [
34 | "The air passengers dataset is the monthly totals of international airline passengers, from 1949 to 1960, in units of 1000s. \n",
35 | "\n",
36 | "For instructions on how to download, prepare, and store the dataset, refer to notebook number 5, in the folder \"01-Create-Datasets\" from this repo."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 1,
42 | "id": "7e30d3c0-baa1-4fb0-86c4-6196e46641c0",
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "import pandas as pd\n",
47 | "import numpy as np"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 2,
53 | "id": "d8d74785-9082-4711-8dad-de0d3b333ab6",
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "df = pd.read_csv(\n",
58 | " \"../Datasets/example_air_passengers.csv\",\n",
59 | " parse_dates=[\"ds\"],\n",
60 | " index_col=[\"ds\"],\n",
61 | ")"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 3,
67 | "id": "f258096a-1171-43b0-97d7-70ce59f74e00",
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "DatetimeIndex(['1949-01-01', '1949-02-01', '1949-03-01', '1949-04-01',\n",
74 | " '1949-05-01', '1949-06-01', '1949-07-01', '1949-08-01',\n",
75 | " '1949-09-01', '1949-10-01',\n",
76 | " ...\n",
77 | " '1960-03-01', '1960-04-01', '1960-05-01', '1960-06-01',\n",
78 | " '1960-07-01', '1960-08-01', '1960-09-01', '1960-10-01',\n",
79 | " '1960-11-01', '1960-12-01'],\n",
80 | " dtype='datetime64[ns]', name='ds', length=144, freq=None)"
81 | ]
82 | },
83 | "execution_count": 3,
84 | "metadata": {},
85 | "output_type": "execute_result"
86 | }
87 | ],
88 | "source": [
89 | "df.index"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 4,
95 | "id": "777e7fb4-3554-41e4-8a0d-8b1d8086e14f",
96 | "metadata": {},
97 | "outputs": [
98 | {
99 | "data": {
100 | "text/plain": [
101 | "pandas._libs.tslibs.timestamps.Timestamp"
102 | ]
103 | },
104 | "execution_count": 4,
105 | "metadata": {},
106 | "output_type": "execute_result"
107 | }
108 | ],
109 | "source": [
110 | "type(df.index[0])"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "id": "d9d5b80c-3b03-427e-a474-de54022904b9",
116 | "metadata": {},
117 | "source": [
118 | "The current type of our index is a `DatetimeIndex` where each element is a `Timestamp`."
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "id": "bf881c8f-77b0-4dea-89a3-9a39046a5e64",
124 | "metadata": {},
125 | "source": [
126 | "# Pandas Period - what is it and when to use it."
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "id": "7e98b0a0-27a6-4515-91fa-961a76ae0a8a",
132 | "metadata": {},
133 | "source": [
134 | "When working with time related information which refers to a time span (e.g., the sales of products over each month) rather than an instance in time (e.g., an event that occurs at a specific timestamp), it can be more convenient to work with a data type in Pandas called `Period`."
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "id": "995f40b2-fab4-40a0-9b43-643da70f2b56",
140 | "metadata": {},
141 | "source": [
142 | "To read more about the `Period` type in Pandas see the [docs](https://pandas.pydata.org/docs/user_guide/timeseries.html), in particular the section titled \"timestamps vs. time spans\".\n",
143 | " \n",
144 | " > \"A `Period` represents a span of time (e.g., a day, a month, a quarter, etc).\"\n",
145 | " \n",
146 | " > \"Under the hood, pandas represents timestamps using instances of `Timestamp` and sequences of timestamps using instances of `DatetimeIndex`. For regular time spans, pandas uses `Period` objects for scalar values and `PeriodIndex` for sequences of spans.\""
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "id": "1d47571a-47c0-4611-b5a1-f90e456a72eb",
152 | "metadata": {},
153 | "source": [
154 | "`Period` objects can be created just as easily as timestamp `Timestamp` objects."
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 5,
160 | "id": "f979b672-257f-459e-9b55-84d4aab3760b",
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "data": {
165 | "text/plain": [
166 | "Timestamp('2020-01-01 00:00:00')"
167 | ]
168 | },
169 | "execution_count": 5,
170 | "metadata": {},
171 | "output_type": "execute_result"
172 | }
173 | ],
174 | "source": [
175 | "pd.Timestamp(\"2020-01-01\") # Create a timestamp representing 1st January 2020 at time 00:00:00"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 6,
181 | "id": "1ee94d2b-94a0-4f7e-9a86-5ee68430f8b2",
182 | "metadata": {},
183 | "outputs": [
184 | {
185 | "data": {
186 | "text/plain": [
187 | "Period('2020-01', 'M')"
188 | ]
189 | },
190 | "execution_count": 6,
191 | "metadata": {},
192 | "output_type": "execute_result"
193 | }
194 | ],
195 | "source": [
196 | "pd.Period(\"2020-01\", freq=\"M\") # Create a time period representing the month of January 2020"
197 | ]
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "id": "bb8d418b-64b5-4c1b-a214-fe64a2e7eb5d",
202 | "metadata": {},
203 | "source": [
204 | "For example, our dataset index currently is a `DatetimeIndex` where there is a day (and even a time) associated with each month (e.g., 1960-12-01 00:00:00), despite the day and time being meaningless for this data set. What we're trying to represent is the sales over the time span of a given month."
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 7,
210 | "id": "463cb570-fbab-4a1b-926e-ff911d628868",
211 | "metadata": {},
212 | "outputs": [
213 | {
214 | "data": {
215 | "text/html": [
216 | "\n",
217 | "\n",
230 | "
\n",
231 | " \n",
232 | " \n",
233 | " \n",
234 | " y \n",
235 | " \n",
236 | " \n",
237 | " ds \n",
238 | " \n",
239 | " \n",
240 | " \n",
241 | " \n",
242 | " \n",
243 | " 1949-01-01 \n",
244 | " 112 \n",
245 | " \n",
246 | " \n",
247 | " 1949-02-01 \n",
248 | " 118 \n",
249 | " \n",
250 | " \n",
251 | " 1949-03-01 \n",
252 | " 132 \n",
253 | " \n",
254 | " \n",
255 | " 1949-04-01 \n",
256 | " 129 \n",
257 | " \n",
258 | " \n",
259 | " 1949-05-01 \n",
260 | " 121 \n",
261 | " \n",
262 | " \n",
263 | "
\n",
264 | "
"
265 | ],
266 | "text/plain": [
267 | " y\n",
268 | "ds \n",
269 | "1949-01-01 112\n",
270 | "1949-02-01 118\n",
271 | "1949-03-01 132\n",
272 | "1949-04-01 129\n",
273 | "1949-05-01 121"
274 | ]
275 | },
276 | "execution_count": 7,
277 | "metadata": {},
278 | "output_type": "execute_result"
279 | }
280 | ],
281 | "source": [
282 | "df.head()"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "id": "0c107454-d49a-49e4-adbf-2c0a97f6597d",
288 | "metadata": {},
289 | "source": [
290 | "We can convert the index from `datetime` to `Period` as follows:"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 8,
296 | "id": "bba662e9-82ee-47a3-ad78-88047a12a911",
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "df.index = df.index.to_period()"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 9,
306 | "id": "eda8d569-a678-4d35-90a6-d288cf53986d",
307 | "metadata": {},
308 | "outputs": [
309 | {
310 | "data": {
311 | "text/html": [
312 | "\n",
313 | "\n",
326 | "
\n",
327 | " \n",
328 | " \n",
329 | " \n",
330 | " y \n",
331 | " \n",
332 | " \n",
333 | " ds \n",
334 | " \n",
335 | " \n",
336 | " \n",
337 | " \n",
338 | " \n",
339 | " 1949-01 \n",
340 | " 112 \n",
341 | " \n",
342 | " \n",
343 | " 1949-02 \n",
344 | " 118 \n",
345 | " \n",
346 | " \n",
347 | " 1949-03 \n",
348 | " 132 \n",
349 | " \n",
350 | " \n",
351 | " 1949-04 \n",
352 | " 129 \n",
353 | " \n",
354 | " \n",
355 | " 1949-05 \n",
356 | " 121 \n",
357 | " \n",
358 | " \n",
359 | "
\n",
360 | "
"
361 | ],
362 | "text/plain": [
363 | " y\n",
364 | "ds \n",
365 | "1949-01 112\n",
366 | "1949-02 118\n",
367 | "1949-03 132\n",
368 | "1949-04 129\n",
369 | "1949-05 121"
370 | ]
371 | },
372 | "execution_count": 9,
373 | "metadata": {},
374 | "output_type": "execute_result"
375 | }
376 | ],
377 | "source": [
378 | "df.head()"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": 10,
384 | "id": "e6d889f0-8c74-4eec-a367-da6c005b067c",
385 | "metadata": {},
386 | "outputs": [
387 | {
388 | "data": {
389 | "text/plain": [
390 | "PeriodIndex(['1949-01', '1949-02', '1949-03', '1949-04', '1949-05', '1949-06',\n",
391 | " '1949-07', '1949-08', '1949-09', '1949-10',\n",
392 | " ...\n",
393 | " '1960-03', '1960-04', '1960-05', '1960-06', '1960-07', '1960-08',\n",
394 | " '1960-09', '1960-10', '1960-11', '1960-12'],\n",
395 | " dtype='period[M]', name='ds', length=144)"
396 | ]
397 | },
398 | "execution_count": 10,
399 | "metadata": {},
400 | "output_type": "execute_result"
401 | }
402 | ],
403 | "source": [
404 | "df.index"
405 | ]
406 | },
407 | {
408 | "cell_type": "markdown",
409 | "id": "4f351009-6800-470c-845c-5f7338a8db97",
410 | "metadata": {},
411 | "source": [
412 | "We now have a `PeriodIndex` with monthly frequency which better represents the time series (i.e., the sales over the whole month)."
413 | ]
414 | },
415 | {
416 | "cell_type": "markdown",
417 | "id": "1e2f4e38-9bee-46dc-88b1-8933bf5a0394",
418 | "metadata": {},
419 | "source": [
420 | "`Period` objects can make it easier to do certain calculations. Let's add one month to a given period:"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": 11,
426 | "id": "9eae6521-ed45-4bb9-b1c4-c375c5ebdf6b",
427 | "metadata": {},
428 | "outputs": [
429 | {
430 | "data": {
431 | "text/plain": [
432 | "Period('1949-01', 'M')"
433 | ]
434 | },
435 | "execution_count": 11,
436 | "metadata": {},
437 | "output_type": "execute_result"
438 | }
439 | ],
440 | "source": [
441 | "df.index[0]"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": 12,
447 | "id": "e86cbaa9-ce01-47cc-b5e0-b8ef6931d77e",
448 | "metadata": {},
449 | "outputs": [
450 | {
451 | "data": {
452 | "text/plain": [
453 | "Period('1949-02', 'M')"
454 | ]
455 | },
456 | "execution_count": 12,
457 | "metadata": {},
458 | "output_type": "execute_result"
459 | }
460 | ],
461 | "source": [
462 | "df.index[0] + 1"
463 | ]
464 | },
465 | {
466 | "cell_type": "markdown",
467 | "id": "3d567f8f-a0f7-4c3d-84ba-4385f44daeb5",
468 | "metadata": {},
469 | "source": [
470 | "`Period` is also the preferred type when calculating the **exact** differences in dates in terms of calendar events (e.g., what is the exact integer difference between the week numbers of the two following timestamps: \"2012-01-15 10:00:00\" (week 2, year 2012) and \"2014-04-01 01:30:00\" (week 14, year 2014))"
471 | ]
472 | },
473 | {
474 | "cell_type": "markdown",
475 | "id": "95c877ef-d1a1-4f44-a0e1-5f7c700cc064",
476 | "metadata": {},
477 | "source": [
478 | "Using `Period`"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 13,
484 | "id": "fd40ddc1-00bd-4289-a018-27a918d72e68",
485 | "metadata": {},
486 | "outputs": [
487 | {
488 | "data": {
489 | "text/plain": [
490 | "<-116 * Weeks: weekday=6>"
491 | ]
492 | },
493 | "execution_count": 13,
494 | "metadata": {},
495 | "output_type": "execute_result"
496 | }
497 | ],
498 | "source": [
499 | "delta = pd.Period(\"2012-01-15 10:00:00\", freq=\"W\") - pd.Period(\"2014-04-01 01:30:00\", freq=\"W\")\n",
500 | "delta"
501 | ]
502 | },
503 | {
504 | "cell_type": "markdown",
505 | "id": "63f43b42-ffcc-4f7e-ad2e-133bb2a056c8",
506 | "metadata": {},
507 | "source": [
508 | "We can get the integer using the `n` attribute:"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": 14,
514 | "id": "bfa9b0fb-3e19-4465-baaa-d99d0cebb9d4",
515 | "metadata": {},
516 | "outputs": [
517 | {
518 | "data": {
519 | "text/plain": [
520 | "-116"
521 | ]
522 | },
523 | "execution_count": 14,
524 | "metadata": {},
525 | "output_type": "execute_result"
526 | }
527 | ],
528 | "source": [
529 | "delta.n"
530 | ]
531 | },
532 | {
533 | "cell_type": "markdown",
534 | "id": "050c54fa-fc33-4b6e-97bb-e594fe148897",
535 | "metadata": {},
536 | "source": [
537 | "Using `Timestamp` and `timedelta` objects we only get approximate, and sometimes incorrect, answers:"
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": 15,
543 | "id": "be49da06-d33f-4be8-9a6b-443450331a92",
544 | "metadata": {},
545 | "outputs": [
546 | {
547 | "data": {
548 | "text/plain": [
549 | "-115.23511904761905"
550 | ]
551 | },
552 | "execution_count": 15,
553 | "metadata": {},
554 | "output_type": "execute_result"
555 | }
556 | ],
557 | "source": [
558 | "(pd.Timestamp(\"2012-01-15 10:00:00\") - pd.Timestamp(\"2014-04-01 01:30:00\")) / np.timedelta64(1, \"W\")"
559 | ]
560 | },
561 | {
562 | "cell_type": "markdown",
563 | "id": "3119df1a-2812-4133-b9e6-4e7580c8cf64",
564 | "metadata": {},
565 | "source": [
566 | "Whether we use `Period` or `datetime` should not change the forecasting workflow, but it will make some calculations easier depending on the time series."
567 | ]
568 | },
569 | {
570 | "cell_type": "markdown",
571 | "id": "a3bf4812-70e1-4a36-9658-90d164395bc8",
572 | "metadata": {},
573 | "source": [
574 |     "In general, if your data represents a timespan (e.g., sales over one month) then `Period` can make handling the data more convenient. If your data represents events that occurred at a timepoint then `datetime` or `Timestamp` is preferred."
575 | ]
576 | }
577 | ],
578 | "metadata": {
579 | "kernelspec": {
580 | "display_name": "Python 3 (ipykernel)",
581 | "language": "python",
582 | "name": "python3"
583 | },
584 | "language_info": {
585 | "codemirror_mode": {
586 | "name": "ipython",
587 | "version": 3
588 | },
589 | "file_extension": ".py",
590 | "mimetype": "text/x-python",
591 | "name": "python",
592 | "nbconvert_exporter": "python",
593 | "pygments_lexer": "ipython3",
594 | "version": "3.10.5"
595 | },
596 | "toc": {
597 | "base_numbering": 1,
598 | "nav_menu": {},
599 | "number_sections": true,
600 | "sideBar": true,
601 | "skip_h1_title": false,
602 | "title_cell": "Table of Contents",
603 | "title_sidebar": "Contents",
604 | "toc_cell": false,
605 | "toc_position": {},
606 | "toc_section_display": true,
607 | "toc_window_display": false
608 | }
609 | },
610 | "nbformat": 4,
611 | "nbformat_minor": 5
612 | }
613 |
--------------------------------------------------------------------------------
/Datasets/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/Datasets/.gitkeep
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2021-2024, Kishan Manani, Soledad Galli
4 | Feature Engineering for Time Series - Online Course:
5 | https://www.trainindata.com/p/feature-engineering-for-forecasting
6 |
7 |
8 | Redistribution and use in source and binary forms, with or without
9 | modification, are permitted provided that the following conditions are met:
10 |
11 | 1. Redistributions of source code must retain the above copyright notice, this
12 | list of conditions and the following disclaimer.
13 |
14 | 2. Redistributions in binary form must reproduce the above copyright notice,
15 | this list of conditions and the following disclaimer in the documentation
16 | and/or other materials provided with the distribution.
17 |
18 | 3. Neither the name of the copyright holder nor the names of its
19 | contributors may be used to endorse or promote products derived from
20 | this software without specific prior written permission.
21 |
22 |
23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
27 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Feature Engineering for Time Series Forecasting - Code Repository
2 |
3 | [ ](https://www.trainindata.com/p/feature-engineering-for-forecasting)
4 |
5 |
6 | 
7 | [](https://github.com/trainindata/feature-engineering-for-time-series-forecasting/blob/master/LICENSE)
8 | [](https://www.trainindata.com/)
9 |
10 | Published October, 2022
11 |
12 | Actively maintained.
13 |
14 | ## Links
15 |
16 | - [Online Course](https://www.trainindata.com/p/feature-engineering-for-forecasting)
17 |
18 |
19 | ## Table of Contents
20 |
21 | 1. **Tabularizing time series data**
22 | 1. Features from the target
23 | 2. Features from exogenous variables
24 | 3. Single step forecasting
25 |
26 | 2. **Challenges in feature engineering for time series**
27 | 1. Train-test split
28 | 2. Pipelines
29 | 3. Multistep forecasting
30 | 4. Direct forecasting
31 | 5. Recursive forecasting
32 |
33 | 3. **Time series decomposition**
34 | 1. Components of a time series: trend and seasonality
35 | 2. Multiplicative and additive models
36 | 3. Log transform and Box-Cox
37 | 4. Moving averages
38 | 5. LOWESS, STL, and multiseasonal time series decomposition
39 |
40 | 4. **Missing data imputation**
41 | 1. Forward and backward filling
42 | 2. Linear and spline interpolation
43 | 3. Seasonal decomposition and interpolation
44 |
45 | 5. **Outliers**
46 | 1. Rolling statistics for outlier detection
47 | 2. LOWESS for outlier detection
48 |     3. STL for outlier detection
48 |     4. Modelling outliers with dummy variables
49 |
50 | 6. **Lag features**
51 | 1. Autoregressive processes
52 | 2. Lag plots
53 | 3. ACF, PACF, CCF
54 | 4. Seasonal lags
55 |     5. Creating lags with open-source
56 |
57 | 7. **Window features**
58 | 1. Rolling windows
59 | 2. Expanding windows
60 | 3. Exponentially weighted windows
61 | 4. Creating window features with open-source
62 |
63 | 8. **Trend features**
64 | 1. Using time to model linear trend
65 | 2. Polynomial features of time to model non-linear trend
66 |     3. Changepoints & piecewise linear trends to model non-linear trend
67 | 4. Forecasting time series with trend using tree-based models
68 | 5. Creating trend features with open-source
69 |
70 | 9. **Seasonality features**
71 | 1. Seasonal lags
72 | 2. Seasonal dummies
73 | 3. Seasonal decomposition methods
74 | 4. Fourier terms
75 | 5. Creating seasonality features with open-source
76 |
77 | 10. **Datetime features**
78 | 1. Extracting features from date and time
79 | 2. Periodic features
80 | 3. Calendar events
81 | 4. Creating datetime features with open-source
82 |
83 | 11. **Categorical Features**
84 | 1. One hot encoding
85 | 2. Target encoding
86 | 3. Rolling entropy and rolling majority
87 |
88 |
89 | - [Online Course](https://www.trainindata.com/p/feature-engineering-for-forecasting)
90 |
--------------------------------------------------------------------------------
/assignments/02-tabularizing-time-series/assignment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "2d1a73ab",
6 | "metadata": {},
7 | "source": [
8 | "# Tabularize time series\n",
9 | "\n",
10 | "In this assignment, your task is to convert **time series data** into a **tabular data set**.\n",
11 | "\n",
12 | "You need to create suitable input features from a time series containing weekly sales to be able to forecast sales for the next week.\n",
13 | "\n",
14 | "To prepare the dataset for this assignment, please follow the guidelines in the notebook `02-create-online-retail-II-datasets.ipynb` in the `01-Create-Datasets` folder."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "id": "f53976d3",
21 | "metadata": {},
22 | "outputs": [
23 | {
24 | "data": {
25 | "text/html": [
26 | "\n",
27 | "\n",
40 | "
\n",
41 | " \n",
42 | " \n",
43 | " \n",
44 | " sales \n",
45 | " \n",
46 | " \n",
47 | " week \n",
48 | " \n",
49 | " \n",
50 | " \n",
51 | " \n",
52 | " \n",
53 | " 2009-12-06 \n",
54 | " 213000.35 \n",
55 | " \n",
56 | " \n",
57 | " 2009-12-13 \n",
58 | " 195810.04 \n",
59 | " \n",
60 | " \n",
61 | " 2009-12-20 \n",
62 | " 182396.74 \n",
63 | " \n",
64 | " \n",
65 | " 2009-12-27 \n",
66 | " 22007.77 \n",
67 | " \n",
68 | " \n",
69 | " 2010-01-03 \n",
70 | " 0.00 \n",
71 | " \n",
72 | " \n",
73 | "
\n",
74 | "
"
75 | ],
76 | "text/plain": [
77 | " sales\n",
78 | "week \n",
79 | "2009-12-06 213000.35\n",
80 | "2009-12-13 195810.04\n",
81 | "2009-12-20 182396.74\n",
82 | "2009-12-27 22007.77\n",
83 | "2010-01-03 0.00"
84 | ]
85 | },
86 | "execution_count": 2,
87 | "metadata": {},
88 | "output_type": "execute_result"
89 | }
90 | ],
91 | "source": [
92 | "# load weekly sales dataset\n",
93 | "\n",
94 | "filename = \"../../Datasets/online_retail_dataset.csv\"\n",
95 | "\n",
96 | "df = pd.read_csv(\n",
97 | " filename,\n",
98 | " usecols=[\"week\", \"United Kingdom\"],\n",
99 | " parse_dates=[\"week\"],\n",
100 | " index_col=[\"week\"],\n",
101 | ")\n",
102 | "\n",
103 | "df.columns = ['sales']\n",
104 | "\n",
105 | "df.head()"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "id": "cdfe9415",
111 | "metadata": {},
112 | "source": [
113 | "# Data analysis\n",
114 | "\n",
115 | "First, explore the time series.\n",
116 | "\n",
117 | "## Plot time series"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "id": "6ceabd79",
123 | "metadata": {},
124 | "source": [
125 | "## Missing data\n",
126 | "\n",
127 | "Check if there are missing values in the time series."
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "id": "9c484bca",
133 | "metadata": {},
134 | "source": [
135 | "## Missing timestamps\n",
136 | "\n",
137 | "Check if there are missing timestamps in the index."
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "id": "444ca303",
143 | "metadata": {},
144 | "source": [
145 | "## Seasonality\n",
146 | "\n",
147 | "Does the time series show any obvious seasonal pattern?"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "id": "e81565cb",
153 | "metadata": {},
154 | "source": [
155 | "# Feature engineering\n",
156 | "\n",
157 | "Now, let's begin to tabularize the data."
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "id": "20ae8079",
163 | "metadata": {},
164 | "source": [
165 | "## Split data\n",
166 | "\n",
167 | "Separate the data into training and testing sets: keep the data up to the last week of September for training, and leave the remaining data in the testing set to evaluate the forecasts."
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "id": "820803d5",
173 | "metadata": {},
174 | "source": [
175 | "## Naive forecast\n",
176 | "\n",
177 | "Predict sales in the next week (t) as the value of sales in the previous week (t-1)."
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "id": "4058260e",
183 | "metadata": {},
184 | "source": [
185 | "## Machine Learning"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "id": "4957673a",
192 | "metadata": {},
193 | "outputs": [],
194 | "source": []
195 | }
196 | ],
197 | "metadata": {
198 | "kernelspec": {
199 | "display_name": "fsml",
200 | "language": "python",
201 | "name": "fsml"
202 | },
203 | "language_info": {
204 | "codemirror_mode": {
205 | "name": "ipython",
206 | "version": 3
207 | },
208 | "file_extension": ".py",
209 | "mimetype": "text/x-python",
210 | "name": "python",
211 | "nbconvert_exporter": "python",
212 | "pygments_lexer": "ipython3",
213 | "version": "3.10.5"
214 | },
215 | "toc": {
216 | "base_numbering": 1,
217 | "nav_menu": {},
218 | "number_sections": true,
219 | "sideBar": true,
220 | "skip_h1_title": false,
221 | "title_cell": "Table of Contents",
222 | "title_sidebar": "Contents",
223 | "toc_cell": false,
224 | "toc_position": {},
225 | "toc_section_display": true,
226 | "toc_window_display": true
227 | }
228 | },
229 | "nbformat": 4,
230 | "nbformat_minor": 5
231 | }
232 |
--------------------------------------------------------------------------------
/images/FETSF_banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/FETSF_banner.png
--------------------------------------------------------------------------------
/images/forecasting_framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/forecasting_framework.png
--------------------------------------------------------------------------------
/images/lag_features.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/lag_features.png
--------------------------------------------------------------------------------
/images/trainindata.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/trainindata.png
--------------------------------------------------------------------------------
/images/window_features.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trainindata/feature-engineering-for-time-series-forecasting/e2cbe925a4127485902c8878879ea4cb247c919d/images/window_features.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | openpyxl>=3.0.6
2 | xlrd>=2.0.1
3 |
4 | # Numerical computing libraries
5 | pandas>=1.4.0
6 | numpy>=1.18.0
7 | scikit-learn>=1.0.0
8 | scipy>=1.6.0
9 | statsmodels>=0.12.1
10 |
11 | # plotting libraries
12 | matplotlib>=3.3.4
13 | seaborn>=0.11.1
14 |
15 | # jupyter notebook
16 | jupyterlab>=3.0.6
17 | ipykernel>=5.5.5
18 |
19 | # feature engineering libraries
20 | feature-engine>=1.3.0
21 | featuretools>=1.2.0
--------------------------------------------------------------------------------