├── .gitignore
├── .project-metadata.yaml
├── LICENSE.txt
├── README.md
├── apps
├── diagnostics.py
├── forecast.py
├── launch_diagnostics.py
└── launch_forecast.py
├── cml
├── fit_models_parallel.py
└── install_dependencies.py
├── data
└── demand.json
├── img
├── app.png
├── diagnostic-chart.png
└── diagnostic-metrics.png
├── requirements.txt
├── scripts
├── fit_baseline_model.py
├── fit_complex_log_prophet_model.py
├── fit_complex_prophet_model.py
├── fit_simple_prophet_model.py
├── get_csv.py
├── make_forecast.py
└── validation_metrics.py
├── setup.py
└── sts
├── __init__.py
├── data
├── __init__.py
└── loader.py
└── models
├── __init__.py
├── baselines.py
└── prophet.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
137 | # Cython debug symbols
138 | cython_debug/
139 |
140 | # Project specific
141 | R
142 | node_modules
143 | *.pyc
144 | __pycache__
145 | .*
146 | !.gitignore
147 |
--------------------------------------------------------------------------------
/.project-metadata.yaml:
--------------------------------------------------------------------------------
1 | name: Structural Time Series
2 | description: California electricity demand forecasting with Prophet.
3 | author: Cloudera Inc.
4 | specification_version: 1.0
5 | prototype_version: 1.0
6 | date: "2020-10-14"
7 | api_version: 1
8 |
9 | environment_variables:
10 | EIA_API_KEY:
11 | default: "EIA KEY"
12 | description: "Optional EIA open data API key"
13 | prompt_user: false
14 |
15 | tasks:
16 | - type: create_job
17 | name: Install Dependencies
18 | entity_label: install_dependencies
19 | script: cml/install_dependencies.py
20 | arguments: None
21 | cpu: 2
22 | memory: 4
23 | short_summary: Create job to install project dependencies.
24 | environment:
25 | TASK_TYPE: CREATE/RUN_JOB
26 | kernel: python3
27 |
28 | - type: run_job
29 | entity_label: install_dependencies
30 | short_summary: Running install dependencies job.
31 | long_summary: Running the job to install dependencies. Note that this requires at least 4GB of memory
32 |
33 | - type: create_job
34 | name: Launch Parallel Model Fitting
35 | entity_label: fit_models_parallel
36 | script: cml/fit_models_parallel.py
37 | arguments: None
38 | short_summary: Create job to launch parallel training script execution.
39 | long_summary: Creates job to launch independent training workloads for each forecast script in the /scripts directory.
40 | cpu: 1
41 | memory: 2
42 | environment:
43 | TASK_TYPE: CREATE/RUN_JOB
44 | kernel: python3
45 |
46 | - type: run_job
47 | entity_label: fit_models_parallel
48 | short_summary: Running job to train forecasts in parallel.
49 | long_summary: Running job to train forecasts in parallel via CDSW Workers API
50 |
51 | - type: start_application
52 | name: Diagnostic App
53 | subdomain: diagnostics
54 | script: apps/launch_diagnostics.py
55 | short_summary: Starting forecast diagnostics application
56 | environment_variables:
57 | TASK_TYPE: START_APPLICATION
58 | kernel: python3
59 |
60 | - type: start_application
61 | name: Forecast App
62 | subdomain: forecast
63 | script: apps/launch_forecast.py
64 | short_summary: Starting primary forecast application
65 | environment_variables:
66 | TASK_TYPE: START_APPLICATION
67 | kernel: python3
68 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Structural Time Series
2 |
3 | This repo accompanies the Cloudera Fast Forward report [Structural Time Series](https://structural-time-series.fastforwardlabs.com/).
4 | It provides an example application of generalized additive models (via the [Prophet](https://facebook.github.io/prophet/) library) to California hourly electricity demand data.
5 |
6 | The primary output of this repository is a small application exposing a probabilistic forecast and an interface for asking a probabilistic question against it.
7 | The final app looks like this.
8 |
9 |
10 |
11 | Instructions are given both for general use (on a laptop, say), and for Cloudera CML and CDSW.
12 | We'll first describe what's here, then go through how to run everything.
13 |
14 | ## Structure
15 |
16 | The folder structure of the repo is as follows
17 |
18 | ```
19 | .
20 | ├── apps # Two small Streamlit applications.
21 | ├── cml # This folder contains scripts that facilitate the project launch on CML.
22 | ├── data # This folder contains starter data, and is where forecasts will live.
23 | ├── scripts # This is where all the code that does something lives.
24 | └── sts # A small library of useful functions.
25 | ```
26 |
27 | There's also an `img` folder that contains images for this README.
28 | That folder is unimportant and you can ignore it.
29 | Let's examine each of the important folders in turn.
30 |
31 | ### `sts`
32 |
33 | This is a small Python library of utility functions useful to our problem.
34 | Its structure is as follows:
35 |
36 | ```
37 | sts
38 | ├── data
39 | │ └── loader.py
40 | └── models
41 | ├── baselines.py
42 | └── prophet.py
43 | ```
44 |
45 | Building a small library of problem-specific abstractions allows us to reuse them in multiple places.
46 | The code in `data/loader.py` is reused in most of the scripts and applications.
47 | In this case, we have hard-coded model details (such as the number of Fourier terms to include in a given Prophet model) in the library.
48 | It would be trivial to pass these through as arguments though, if we wanted to perform an extensive hyperparameter search for example.
49 |
50 | ### `scripts`
51 |
52 | These imperative scripts are where the _work_ of the analysis is done.
53 | Side-effectful actions such as I/O and model training occur in these scripts.
54 |
55 | ```
56 | scripts
57 | ├── fit_baseline_model.py
58 | ├── fit_simple_prophet_model.py
59 | ├── fit_complex_prophet_model.py
60 | ├── fit_complex_log_prophet_model.py
61 | ├── get_csv.py
62 | ├── make_forecast.py
63 | └── validation_metrics.py
64 | ```
65 |
66 | ### `apps`
67 |
68 | Two applications accompany this project.
69 | Each has a launcher script to assist launching an [Application](https://docs.cloudera.com/machine-learning/cloud/applications/topics/ml-applications.html) with CDSW/CML.
70 | To launch the applications in another environment, run the code inside the launcher files, with the prefixed `!` removed.
71 | You may need to specify different ports.
72 |
73 | ```
74 | apps
75 | ├── diagnostics.py # A model comparison and debugging assistant.
76 | ├── forecast.py # The primary forecasting interface.
77 | ├── launch_diagnostics.py # Launcher script for CDSW/CML
78 | └── launch_forecast.py # Launcher script for CDSW/CML
79 | ```
80 |
81 | #### Diagnostics
82 |
83 | The diagnostic application serves two purposes.
84 | First, it computes and reports top level metrics for any forecasts saved in the `data/forecasts` directory.
85 |
86 |
87 |
88 | Second, it provides a few diagnostic charts, including a zoomable forecast.
89 |
90 |
91 |
92 | #### Forecast
93 |
94 | The primary forecast application (pictured at the top of this README) is a prototype user interface for the forecast this analysis generates.
95 |
96 | ### `cml`
97 |
98 | These scripts serve as launch instructions to facilitate the automated project setup on CML.
99 | Each script is triggered by the declarative pipeline as defined in the `.project-metadata.yaml` file found in the project's root directory.
100 |
101 | ```
102 | cml
103 | ├── install_dependencies.py
104 | └── fit_models_parallel.py
105 | ```
106 |
107 | ## Running through the analysis
108 |
109 | To go from a fresh clone of the repo to the final state, follow these instructions in order.
110 |
111 | ### Installation
112 |
113 | The code and applications within were developed against Python 3.6.9, and are likely also to function with more recent versions of Python.
114 |
115 | To install dependencies, first create and activate a new virtual environment through your preferred means, then pip install from the requirements file. I recommend:
116 |
117 | ```bash
118 | python3 -m venv .venv
119 | source .venv/bin/activate
120 | pip install -r requirements.txt
121 | ```
122 |
123 | In CML or CDSW, no virtual env is necessary. Instead, inside a Python 3 session (with at least 2 vCPU / 4 GiB Memory), simply run
124 |
125 | ```python
126 | !pip3 install -r requirements.txt # notice `pip3`, not `pip`
127 | ```
128 |
129 | Next, install the `sts` module from this repository, with
130 |
131 | ```python
132 | pip3 install -e .
133 | ```
134 |
135 | from inside the root directory of this repo.
136 |
137 | ### Data
138 |
139 | We use historic California electricity demand data from the [US Energy Information Administration](https://www.eia.gov/opendata/qb.php?category=3389936&sdid=EBA.CAL-ALL.D.H).
140 |
141 | A full set of data through October 12th 2020 is included as a starter.
142 | More recent data can be fetched from the [EIA open data API](https://www.eia.gov/opendata/).
143 | Doing so requires an API key, which must be set as the `EIA_API_KEY` environment variable for this project.
144 | To fetch new data, simply call the `load_california_electricity_demand` function from the `sts.data.loader` module.
145 | The code is set up to work directly with the json response to the EIA API.
146 | By default, each time new data is fetched, it will overwrite the existing data.
147 | Similarly, when a new forecast is made, it will overwrite the existing forecast.
148 | It would not be hard to adapt the code to maintain a history of fetched data or forecasts if desired.
149 |
150 | ### Scripts
151 |
152 | To fit models and generate forecasts, we call each script in turn from the `scripts` directory.
153 |
154 | ```bash
155 | python3 scripts/fit_baseline_model.py
156 | python3 scripts/fit_simple_prophet_model.py
157 | python3 scripts/fit_complex_prophet_model.py
158 | python3 scripts/fit_complex_log_prophet_model.py
159 | ```
160 |
161 | This will fit a series of models of increasing complexity and write their outputs (the mean forecast) to the `data/forecasts` directory.
162 | Launching the diagnostic app will show the metrics and diagnostic charts for each model.
163 |
164 | The most complex model wins.
165 | We can view its metrics when trained on the validation data (through 2019) by running the `scripts/validation_metrics.py` script.
166 | We can then generate 1000 samples from the model trained on all available training data with the `scripts/make_forecast.py` script.
167 | When those samples are written to disk, we can use the forecast app to investigate them.
168 |
169 | The additional script, `get_csv.py`, simply fetches and writes data as a csv, which is convenient for any ad hoc analytics and interactive exploration.
170 |
--------------------------------------------------------------------------------
/apps/diagnostics.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
41 | import os
42 |
43 | import streamlit as st
44 | import pandas as pd
45 | import numpy as np
46 | import plotly.express as px
47 | import plotly.graph_objects as go
48 | from statsmodels.tsa.stattools import acf, pacf
49 |
50 | from sts.data.loader import load_california_electricity_demand
51 | from sts.models.baselines import year_ahead_hourly_forecast
52 |
53 |
54 | FORECAST_DIRECTORY = "data/forecasts"
55 |
56 |
57 | st.title("California Electricity Demand Model Diagnostics")
58 |
59 | # first, load true demand data and forecasts
60 |
61 |
def read_forecast(filename):
    """Load one saved forecast file from FORECAST_DIRECTORY.

    The ``yhat`` column is renamed to the portion of ``filename`` that
    precedes its first ``.``, and the ``ds`` column is parsed to datetimes.
    """
    model_name = filename.split(".")[0]
    forecast = pd.read_csv(FORECAST_DIRECTORY + "/" + filename)
    forecast = forecast.rename(columns={"yhat": model_name})
    forecast = forecast.assign(ds=pd.to_datetime(forecast.ds))
    return forecast
71 |
72 |
@st.cache(allow_output_mutation=True)
def load_all_forecasts():
    """Return observed demand joined with every saved forecast.

    Starts from the demand data sorted by ``ds`` and merges in, on ``ds``,
    each forecast file found in FORECAST_DIRECTORY. The merges use the
    pandas default (inner) join, so only timestamps present in every frame
    are kept.
    """
    df = load_california_electricity_demand().sort_values("ds")
    # os.listdir returns entries in arbitrary order; sort them so the model
    # columns (and anything built from their order) are stable between runs.
    # Hidden files such as .DS_Store are not forecasts, so skip them rather
    # than letting read_forecast fail on them.
    forecast_files = sorted(
        entry
        for entry in os.listdir(FORECAST_DIRECTORY)
        if not entry.startswith(".")
    )
    for forecast_file in forecast_files:
        df = df.merge(read_forecast(forecast_file), on="ds")
    return df
80 |
81 |
82 | data_loading = st.text("Loading data...")
83 | df = load_all_forecasts()
84 | data_loading.text("")
85 |
86 | df_train = df[df.ds.dt.year < 2019]
87 | df_2018 = df[df.ds.dt.year == 2018]
88 | df_2019 = df[df.ds.dt.year == 2019]
89 |
90 | model_names = [x for x in df.columns if x not in ["ds", "y"]]
91 |
92 |
93 | f"""
94 | ## Model comparison
95 | There are {len(model_names)} models. Here is a comparison of their MAPE for select data slices.
96 | We compare a held out test set (2019) to the whole training set through 2018 and also
97 | 2018 in isolation. 2018 is included for being one complete period in the training set
98 | of equal length to 2019.
99 | """
100 |
101 |
def ape(df):
    """Absolute percentage error per row, with one column per model.

    For each name in ``model_names`` the error is
    ``|y - forecast| / y``, computed element-wise on ``df``.
    """
    errors = {}
    for model in model_names:
        errors[model] = np.abs(df.y - df[model]) / df.y
    return pd.DataFrame(errors)
104 |
105 |
106 | st.write(
107 | pd.DataFrame({
108 | "all training": ape(df_train).mean().rename("MAPE"),
109 | "2018 (training)": ape(df_2018).mean().rename("MAPE"),
110 | "2019 (holdout)": ape(df_2019).mean().rename("MAPE")
111 | }).transpose()
112 | )
113 |
114 |
115 | """
116 | ---
117 | Another metric for this kind of time series is [MASE](https://en.wikipedia.org/wiki/Mean_absolute_scaled_error).
118 | We will use a seasonal variant, where the season is defined to be 52 weeks long,
119 | so that years are approximately aligned.
120 | MASE measures error relative to the baseline, so a lower score is better.
121 | """
122 |
123 |
def mase_denominator(df):
    """Mean absolute error of the naive baseline forecast on ``df``.

    The summed absolute difference between the baseline and ``df.y``
    (missing differences dropped) is divided by the number of non-missing
    baseline forecasts.
    """
    # NOTE(review): presumably the year-ahead seasonal baseline that the
    # surrounding app text describes; confirm in sts.models.baselines.
    baseline = year_ahead_hourly_forecast(df)
    absolute_errors = np.abs((baseline - df.y).dropna())
    n_forecasts = len(baseline.dropna())
    return np.sum(absolute_errors) / n_forecasts
130 |
131 |
132 | denom = mase_denominator(df_train)
133 |
134 |
def mase(df):
    """Scaled absolute error per row, with one column per model.

    Each model's ``|y - forecast|`` is divided by the module-level
    ``denom``, which is computed once from the training slice.
    """
    scaled_errors = {}
    for model in model_names:
        scaled_errors[model] = np.abs(df.y - df[model]) / denom
    return pd.DataFrame(scaled_errors)
137 |
138 |
139 | st.write(
140 | pd.DataFrame({
141 | "all training": mase(df_train).mean().rename("MASE"),
142 | "2018 (training)": mase(df_2018).mean().rename("MASE"),
143 | "2019 (holdout)": mase(df_2019).mean().rename("MASE")
144 | }).transpose()
145 | )
146 |
147 |
148 | """
149 | ---
150 | ## Model drill-down
151 | We can compute some more detailed diagnostics for each model individually.
152 | """
153 | active_model = st.selectbox("Model", model_names)
154 |
155 |
156 | """
157 | ### The forecast
158 | First, we should see the forecast vs true, observed values.
159 | """
160 |
161 | forecast_chart = px.line(
162 | df, x='ds', y=['y', active_model],
163 | color_discrete_sequence=["#ff8300", "#00828c"]
164 | )
165 | forecast_chart.update_xaxes(
166 | rangeslider_visible=True,
167 | rangeselector=dict(
168 | buttons=list([
169 | dict(count=7, label="1w", step="day", stepmode="backward"),
170 | dict(count=1, label="1m", step="month", stepmode="backward"),
171 | dict(count=3, label="3m", step="month", stepmode="backward"),
172 | dict(count=6, label="6m", step="month", stepmode="backward"),
173 | dict(count=1, label="YTD", step="year", stepmode="todate"),
174 | dict(count=1, label="1y", step="year", stepmode="backward"),
175 | dict(step="all")
176 | ])
177 | )
178 | )
179 | forecast_chart.update_layout(
180 | xaxis_title="Datetime (hourly increments)",
181 | yaxis_title="Demand (Megawatt-hours)",
182 | legend=dict(
183 | orientation="h",
184 | yanchor="bottom",
185 | y=1.02,
186 | xanchor="right",
187 | x=1,
188 |
189 | ),
190 | legend_title_text=""
191 | )
192 | st.plotly_chart(forecast_chart)
193 |
194 |
195 | """
196 | ---
197 | ### Diagnostics
198 | """
199 |
# Choose which slice of the data the diagnostics that follow are computed on.
data_set = st.selectbox("Dataset", ['Train', 'Test', 'Combined'])

# Any selection other than 'Train' or 'Test' (i.e. 'Combined') keeps the
# dataframe that is already bound to ``df``.
df = {'Train': df_train, 'Test': df_2019}.get(data_set, df)
206 |
207 |
208 | """
209 | ---
210 | Scatter plot of the true values vs forecast values.
211 | This plot will be heavily overplotted, but the overall shape should tell us
212 | whether we are over- or under-predicting.
213 | """
214 |
215 | scatter_chart = go.Figure(data=go.Scatter(
216 | x=df.y, y=df[active_model],
217 | mode="markers",
218 | marker=dict(color="#00828c", opacity=0.2),
219 | ))
220 | scatter_chart.update_layout(
221 | xaxis_title="True demand (Megawatt-hours)",
222 | yaxis_title="Forecast demand (Megawatt-hours)"
223 | )
224 |
225 | st.plotly_chart(scatter_chart)
226 |
227 | residuals = (df["y"] - df[active_model]).dropna()
228 |
229 |
"""
---
Here is the marginal distribution of the residuals.
We expect it to be symmetric, approximately normal, and centered at zero.
"""

# Histogram of the residuals computed above.
residual_chart = px.histogram(
    data_frame=df,
    x=residuals,
    color_discrete_sequence=["#00828c"],
)
residual_chart.update_layout(
    yaxis_title="Count",
    xaxis_title="Residual (true demand - forecast demand) (Megawatt-hours)",
)

st.plotly_chart(residual_chart)
245 |
246 |
"""
---
The autocorrelation and partial autocorrelation of the residuals.
Since none of our models try to model the error (with autoregressive terms), we may
expect some autocorrelation.
The orange bands represent the 95% confidence interval for the null hypothesis that
there is no (partial) autocorrelation.
Bars outside those bounds indicate high likelihood of autocorrelation.
"""

# Autocorrelation of the residuals for lags 0..48, with 95% intervals.
autocorrelation, conf_intervals = acf(residuals, nlags=48, alpha=0.05)

# Subtracting the estimate from each interval bound re-centres the band on
# zero, so it can be read as the null hypothesis of zero autocorrelation.
acf_lower_bound = conf_intervals[:, 0]
acf_upper_bound = conf_intervals[:, 1]
autocorrelation_df = pd.DataFrame({
    "autocorrelation": autocorrelation,
    "ci_lower": acf_lower_bound - autocorrelation,
    "ci_upper": acf_upper_bound - autocorrelation,
})

# Overlay the estimate (teal) and both interval bounds (orange) per lag.
autocorrelation_chart = px.bar(
    data_frame=autocorrelation_df,
    x=autocorrelation_df.index,
    y=["autocorrelation", "ci_lower", "ci_upper"],
    barmode="overlay",
    color_discrete_sequence=["#00828c", "#ff8300", "#ff8300"],
)
autocorrelation_chart.update_layout(
    showlegend=False,
    xaxis_title="Timestep (hours)",
    yaxis_title="Autocorrelation",
)

st.plotly_chart(autocorrelation_chart)
280 |
281 |
# Partial autocorrelation of the residuals for lags 0..48, with 95% intervals.
partial_autocorrelation, partial_conf_intervals = pacf(
    residuals, nlags=48, alpha=0.05
)

# Subtracting the estimate from each interval bound re-centres the band on
# zero, so it can be read as the null hypothesis of zero partial
# autocorrelation.
pacf_lower_bound = partial_conf_intervals[:, 0]
pacf_upper_bound = partial_conf_intervals[:, 1]
partial_autocorrelation_df = pd.DataFrame({
    "partial_autocorrelation": partial_autocorrelation,
    "ci_lower": pacf_lower_bound - partial_autocorrelation,
    "ci_upper": pacf_upper_bound - partial_autocorrelation,
})

# Overlay the estimate (teal) and both interval bounds (orange) per lag.
partial_autocorrelation_chart = px.bar(
    data_frame=partial_autocorrelation_df,
    x=partial_autocorrelation_df.index,
    y=["partial_autocorrelation", "ci_lower", "ci_upper"],
    barmode='overlay',
    color_discrete_sequence=["#00828c", "#ff8300", "#ff8300"],
)
partial_autocorrelation_chart.update_layout(
    showlegend=False,
    xaxis_title="Timestep (hours)",
    yaxis_title="Partial autocorrelation",
)

st.plotly_chart(partial_autocorrelation_chart)
307 |
--------------------------------------------------------------------------------
/apps/forecast.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
41 | import datetime
42 |
43 | import streamlit as st
44 | import pandas as pd
45 | import numpy as np
46 | import matplotlib.pyplot as plt
47 | import plotly.express as px
48 |
49 |
# Number of individual forecast samples (columns) drawn on the main chart.
N_SAMPLES = 10

st.title("California Electricity Demand Forecast")


# Data loading and selection

# Placeholder text element; it is blanked once the data has been loaded and
# the date range has been applied.
data_loading = st.text("Loading data...")
58 |
59 |
# Read the forecast CSV, parsing the `ds` column as datetimes and using it as
# the index. Cached by Streamlit so the file is not re-read on every rerun.
@st.cache(allow_output_mutation=True)
def load_data():
    forecast = pd.read_csv("data/forecast.csv", parse_dates=["ds"])
    return forecast.set_index("ds")
65 |
66 |
data = load_data()

st.markdown("""
The forecast is generated for one year ahead of the most recent
observation. Please select the range of interest over which to view and
filter samples from the forecast distribution.
""")

# In range mode st.date_input returns a tuple that contains only the start
# date while the user is part-way through picking the range. Unpacking it
# straight into two names would raise ValueError on that intermediate rerun,
# so wait until both ends of the range are available.
selected_range = st.date_input(
    "Select a forecast range",
    [data.index.min().date(), data.index.max().date()]
)
if len(selected_range) != 2:
    data_loading.text("")
    st.info("Please select both a start date and an end date.")
    st.stop()
start_date, end_date = selected_range

# Keep only the rows whose calendar date falls inside the chosen range
# (both ends inclusive).
row_dates = data.index.date
subset = data[(row_dates >= start_date) & (row_dates <= end_date)].copy()
data_loading.text("")
83 |
84 |
# Pick N_SAMPLES random columns of `df` and reshape them to long format:
# one row per (ds, variable) pair with the reading in `value`.
# NOTE(review): hash_funcs maps every DataFrame to None, so the argument's
# contents are not hashed; this looks like the cached result is reused even
# when `df` changes -- confirm that this is intended.
@st.cache(hash_funcs={pd.DataFrame: lambda _: None})
def samples(df):
    chosen_columns = df.sample(N_SAMPLES, axis="columns")
    long_format = chosen_columns.reset_index().melt(id_vars='ds')
    return long_format
88 |
89 |
# Row-wise mean across all columns of `df` (one value per index entry).
# NOTE(review): hash_funcs maps every DataFrame to None, so the argument's
# contents are not hashed; this looks like the cached result is reused even
# when `df` changes -- confirm that this is intended.
@st.cache(hash_funcs={pd.DataFrame: lambda _: None})
def mean(df):
    row_means = df.mean(axis=1)
    return row_means
93 |
# Main forecast plot


st.markdown(f"""
The chart below shows the mean forecast (based on 1000 samples),
and {N_SAMPLES} individual samples, which can be thought of as
"possible futures".
""")

# Temporary status text, cleared once the chart has been rendered.
generating_chart = st.text("Generating chart")
mean_forecast = mean(subset)
sample_forecasts = samples(subset)


# Faint lines for the individual samples, one line per sampled column.
line_chart = px.line(
    data_frame=sample_forecasts,
    x='ds',
    y='value',
    line_group='variable',
    color_discrete_sequence=["rgba(0,130,140,0.1)"],
)
# Fully opaque line for the mean forecast, drawn on top of the samples.
line_chart.add_scatter(
    x=mean_forecast.index,
    y=mean_forecast,
    mode='lines',
    marker={"color": "rgba(0,130,140,1)"},
)
# Restrict the visible x-range to the dates selected by the user.
line_chart.update_xaxes(range=[start_date, end_date])
line_chart.update_layout(
    xaxis_title="Datetime (hourly increments)",
    yaxis_title="Megawatt-hours",
    showlegend=False,
)
st.plotly_chart(line_chart)
generating_chart.text("")
130 |
131 |
# Marginal plot of sum of values over interval

# Total over the selected period, computed separately for every column of
# the subset; the extremes bound both the slider and the histogram axis.
data_sum = subset.sum()
_min, _max = float(data_sum.min()), float(data_sum.max())

st.markdown(f"""
The mean estimate of the aggregate demand from {start_date} to {end_date}
is **{data_sum.mean():.2e}** Megawatt-hours.
""")

st.markdown("""
We can assess the probability of exceeding a given aggregate demand over
the selected period. Choose the threshold of interest below.
""")

threshold = st.slider(
    "Threshold (Megawatt-hours)",
    min_value=_min,
    max_value=_max,
    format="%.2e",
)

# Share of the per-column totals that lie strictly above the threshold.
above_threshold = data_sum[data_sum > threshold]
prob_exceed = above_threshold.count() / data_sum.count()

st.markdown(f"""
The probability of the aggregate demand between {start_date} and {end_date}
being more than {threshold:.2e} Megawatt-hours is
**{100*prob_exceed:.1f}**%.
""")

st.markdown("""
The histogram below shows the probability distribution of possible
aggregate demands, cut off at the threshold selected.
The higher the count for a given demand, the more likely that future is.
""")

# Only totals above the threshold are drawn, but the x-axis keeps the full
# [_min, _max] range so the cut-off stays visible.
hist = px.histogram(
    above_threshold,
    title="Possible total electricity demand levels",
    color_discrete_sequence=["#00828c"],
)
hist.update_xaxes(range=[_min, _max])
hist.update_layout(
    xaxis_title="Megawatt-hours",
    yaxis_title="Count (of 1000 simulated futures)",
    showlegend=False,
)
st.plotly_chart(hist)
181 |
--------------------------------------------------------------------------------
/apps/launch_diagnostics.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
# Start the diagnostics Streamlit app on 127.0.0.1, using the port taken
# from the CDSW_APP_PORT environment variable.
# NOTE(review): the leading `!` is an IPython-style shell escape, so this
# file is presumably meant to be run inside a CML/CDSW session rather than
# with a plain `python` interpreter -- confirm.
!streamlit run apps/diagnostics.py --server.port $CDSW_APP_PORT --server.address 127.0.0.1
42 |
--------------------------------------------------------------------------------
/apps/launch_forecast.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
# Start the forecast Streamlit app on 127.0.0.1, using the port taken from
# the CDSW_APP_PORT environment variable.
# NOTE(review): the leading `!` is an IPython-style shell escape, so this
# file is presumably meant to be run inside a CML/CDSW session rather than
# with a plain `python` interpreter -- confirm.
!streamlit run apps/forecast.py --server.port $CDSW_APP_PORT --server.address 127.0.0.1
42 |
--------------------------------------------------------------------------------
/cml/fit_models_parallel.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
41 | import os
42 | import time
43 | import cdsw
44 |
45 |
def fit_models_parallel():
    '''
    Use the CDSW Workers API (via Python SDK) to launch each model fitting
    script in parallel, then block until all workers have finished.

    Every entry of ./scripts (relative to the current working directory)
    whose name starts with 'fit' or 'mak' is launched in its own worker
    (n=1, cpu=1, memory=3). The workers are then polled once a minute so
    that this session stays alive while they run.

    Docs - https://docs.cloudera.com/machine-learning/cloud/distributed-computing/topics/ml-workers-api.html

    Raises:
        RuntimeError: if, once every worker has reached a terminal state,
            at least one of them is not 'succeeded'. Before this check a
            single failed worker kept the polling loop running forever.
    '''
    # Launch a separate worker to run each script independently

    script_path = os.path.join(os.getcwd(), 'scripts')
    scripts = [
        os.path.join(script_path, script)
        for script in os.listdir(script_path)
        if script.startswith(('fit', 'mak'))
    ]

    for script in scripts:
        cdsw.launch_workers(n=1, cpu=1, memory=3, script=script)

    # Force session to persist until each worker job has completed
    # Check for completion every minute

    # NOTE(review): the non-success terminal status names are assumed from
    # the CDSW engine/job status vocabulary -- confirm against the Workers
    # API. A name that never occurs is harmless: it simply never matches.
    terminal_states = {'succeeded', 'failed', 'stopped', 'timedout'}

    while True:

        time.sleep(60)

        workers_status = [wkr['status'] for wkr in cdsw.list_workers()]

        if all(status in terminal_states for status in workers_status):
            break

    unsuccessful = [
        status for status in workers_status if status != 'succeeded'
    ]
    if unsuccessful:
        raise RuntimeError(
            f"{len(unsuccessful)} of {len(workers_status)} workers did not "
            f"succeed (statuses: {sorted(set(unsuccessful))})"
        )
79 |
80 |
# Launch the parallel fit only when this file is executed as a script, not
# when it is imported.
if __name__ == "__main__":
    fit_models_parallel()
83 |
--------------------------------------------------------------------------------
/cml/install_dependencies.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
# Install the pinned third-party requirements, then install this project
# itself in editable mode.
# NOTE(review): the leading `!` is an IPython-style shell escape, so this
# file is presumably meant to be run inside a CML/CDSW session rather than
# with a plain `python` interpreter -- confirm.
!pip3 install -r requirements.txt
!pip3 install -e .
43 |
--------------------------------------------------------------------------------
/img/app.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/img/app.png
--------------------------------------------------------------------------------
/img/diagnostic-chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/img/diagnostic-chart.png
--------------------------------------------------------------------------------
/img/diagnostic-metrics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/img/diagnostic-metrics.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fbprophet==0.6
2 | matplotlib==2.0.0
3 | numpy==1.19.1
4 | pandas==1.1.0
5 | plotly==4.9.0
6 | requests==2.22.0
7 | statsmodels==0.12.0
8 | streamlit==0.66.0
--------------------------------------------------------------------------------
/scripts/fit_baseline_model.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
41 | import os
42 |
43 | import numpy as np
44 |
45 | from sts.data.loader import load_california_electricity_demand
46 | from sts.models.baselines import year_ahead_hourly_forecast
47 |
48 |
# Load the data

df = load_california_electricity_demand()

# ## Baseline
# Reproduce observed values exactly 52 weeks prior as forecast.

baseline = (
    df
    .sort_values('ds')
    .assign(yhat=year_ahead_hourly_forecast)
)


# ## Write
# Write the forecast values to csv
DIR = 'data/forecasts/'

# The fit scripts can be launched in parallel (see
# cml/fit_models_parallel.py), so a separate "exists?" check followed by
# makedirs can race with another script creating the same directory and
# fail with FileExistsError. exist_ok=True makes the creation idempotent.
os.makedirs(DIR, exist_ok=True)

baseline[['ds', 'yhat']].to_csv(DIR + 'baseline.csv', index=False)
71 |
--------------------------------------------------------------------------------
/scripts/fit_complex_log_prophet_model.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
41 | import os
42 |
43 | import numpy as np
44 |
45 | from sts.data.loader import load_california_electricity_demand
46 | from sts.models.prophet import (
47 | add_season_weekday_indicators,
48 | seasonal_daily_prophet_model
49 | )
50 |
51 |
# Load the training data (through 2018)

df = load_california_electricity_demand(train_only=True)

# Log transform the target variable (vectorised over the whole column)
df['y'] = np.log(df['y'])


# ## Prophet (with more complicated seasonality)
# FB Prophet model, splitting intra-day seasonalities into four subgroups:
# - summer weekday
# - summer weekend
# - winter weekday
# - winter weekend

model = seasonal_daily_prophet_model(df)

# Extend the training timestamps by 8760 hourly periods (365 days) and add
# the season/weekday indicator columns to the resulting frame.
future = model.make_future_dataframe(periods=8760, freq='H')
seasonal_future = add_season_weekday_indicators(future)

forecast = model.predict(seasonal_future)

# Reverse the log transform on predictions
forecast['yhat'] = np.exp(forecast['yhat'])


# ## Write
# Write the forecast values to csv
DIR = 'data/forecasts/'

# The fit scripts can be launched in parallel (see
# cml/fit_models_parallel.py), so a separate "exists?" check followed by
# makedirs can race with another script creating the same directory and
# fail with FileExistsError. exist_ok=True makes the creation idempotent.
os.makedirs(DIR, exist_ok=True)

forecast[['ds', 'yhat']].to_csv(DIR + 'prophet_complex_log.csv', index=False)
86 |
--------------------------------------------------------------------------------
/scripts/fit_complex_prophet_model.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
41 | import os
42 |
43 | import numpy as np
44 |
45 | from sts.data.loader import load_california_electricity_demand
46 | from sts.models.prophet import (
47 | add_season_weekday_indicators,
48 | seasonal_daily_prophet_model
49 | )
50 |
51 |
# Load the training data (through 2018)

df = load_california_electricity_demand(train_only=True)


# ## Prophet (with more complicated seasonality)
# FB Prophet model, splitting intra-day seasonalities into four subgroups:
# - summer weekday
# - summer weekend
# - winter weekday
# - winter weekend

model = seasonal_daily_prophet_model(df)

# Extend the training timestamps by 8760 hourly periods (365 days) and add
# the season/weekday indicator columns to the resulting frame.
future = model.make_future_dataframe(periods=8760, freq='H')
seasonal_future = add_season_weekday_indicators(future)

forecast = model.predict(seasonal_future)


# ## Write
# Write the forecast values to csv
DIR = 'data/forecasts/'

# The fit scripts can be launched in parallel (see
# cml/fit_models_parallel.py), so a separate "exists?" check followed by
# makedirs can race with another script creating the same directory and
# fail with FileExistsError. exist_ok=True makes the creation idempotent.
os.makedirs(DIR, exist_ok=True)

forecast[['ds', 'yhat']].to_csv(DIR + 'prophet_complex.csv', index=False)
80 |
--------------------------------------------------------------------------------
/scripts/fit_simple_prophet_model.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
41 | import os
42 |
43 | import numpy as np
44 |
45 | from sts.data.loader import load_california_electricity_demand
46 | from sts.models.prophet import default_prophet_model
47 |
48 |
# Load the training data (through 2018)

df = load_california_electricity_demand(train_only=True)


# ## Prophet (Default)
# FB Prophet model, all default parameters.

model = default_prophet_model(df)

# 8760 hourly periods = 24 * 365, i.e. one (non-leap) year of hourly steps.
future = model.make_future_dataframe(periods=8760, freq='H')
forecast = model.predict(future)


# ## Write
# Write the forecast values to csv
DIR = 'data/forecasts/'

# exist_ok=True replaces the separate os.path.exists() check, which could
# race with another process creating the directory in between.
os.makedirs(DIR, exist_ok=True)

forecast[['ds', 'yhat']].to_csv(DIR + 'prophet_simple.csv', index=False)
71 |
--------------------------------------------------------------------------------
/scripts/get_csv.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
41 | from sts.data.loader import load_california_electricity_demand
42 |
# Load the demand data (read from the json file if it exists, otherwise
# downloaded and cached there), then dump the resulting frame as csv.
df = load_california_electricity_demand(filepath='data/demand.json')
df.to_csv('data/demand.csv')
46 |
--------------------------------------------------------------------------------
/scripts/make_forecast.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
41 | import datetime
42 |
43 | import numpy as np
44 | import pandas as pd
45 |
46 | from sts.models.prophet import (
47 | add_season_weekday_indicators,
48 | seasonal_daily_prophet_model
49 | )
50 | from sts.data.loader import load_california_electricity_demand
51 |
52 |
# Load all available data for training

df = load_california_electricity_demand()

# Take log transform for fully multiplicative model
# (np.log is applied to the whole column at once rather than per element)
df['y'] = np.log(df['y'])


# Fit best current model

model = seasonal_daily_prophet_model(df)


# Make predictions for one year ahead of most recent training data

future = add_season_weekday_indicators(
    model.make_future_dataframe(periods=24*365, freq='H')
)

# Only the predictive samples are written out below. The point forecast
# (model.predict) that used to be computed here was never used, so that
# extra prediction pass has been dropped.
samples = model.predictive_samples(future)

# Reverse log transform
predictions = np.exp(samples['yhat'])

# Attach the sample columns to the timestamps, drop the seasonality
# indicator columns, and keep only rows dated today or later.
prediction_df = (
    future
    .merge(pd.DataFrame(predictions), left_index=True, right_index=True)
    .drop(['winter_weekday', 'winter_weekend', 'summer_weekday', 'summer_weekend'],
          axis='columns')
    [future.ds.dt.date >= datetime.date.today()]
)


# Save predictions

prediction_df.to_csv('data/forecast.csv', index=False)
91 |
--------------------------------------------------------------------------------
/scripts/validation_metrics.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
41 | import datetime
42 |
43 | import numpy as np
44 | import pandas as pd
45 |
46 | from sts.models.baselines import year_ahead_hourly_forecast
47 | from sts.models.prophet import (
48 | add_season_weekday_indicators,
49 | seasonal_daily_prophet_model
50 | )
51 | from sts.data.loader import load_california_electricity_demand
52 |
53 |
# Load all available data for training

df = load_california_electricity_demand()

# Restrict to pre-2020 for evaluation on 2020.
# Take an explicit copy: the frame is modified below, and assigning into a
# filtered slice of `df` triggers pandas' chained-assignment warning.
train_df = df[df.ds.dt.year < 2020].copy()

# Take log transform for fully multiplicative model
train_df['y'] = np.log(train_df['y'])


# Fit best current model

model = seasonal_daily_prophet_model(train_df)


# Make predictions for one year ahead of most recent training data

future = add_season_weekday_indicators(
    model.make_future_dataframe(periods=24*365, freq='H')
)

forecast = model.predict(future)

# Reverse log transform
forecast['yhat'] = np.exp(forecast['yhat'])
train_df['y'] = np.exp(train_df['y'])

# Join predictions with the observed values and keep the 2020 rows only.
predictions = (
    forecast[['ds', 'yhat']]
    .merge(df, on='ds')
)
predictions = predictions[predictions.ds.dt.year == 2020]

# ### MAPE
mape = (np.abs(predictions.y - predictions.yhat) / predictions.y).mean()

# Let's compare this to the MAPE of the seasonal naive model
naive_df = df.copy()
naive_df['yhat'] = year_ahead_hourly_forecast(naive_df)
naive_df = naive_df[naive_df.ds.dt.year == 2020]
naive_mape = (np.abs(naive_df.yhat - naive_df.y) / naive_df.y).mean()

# ### MASE
# Note, we have trained on a larger data set than we did for model selection.
# As such, this MASE cannot be compared to the MASEs listed in the diagnostic
# app. It's a measure of performance relative to the baseline on the new
# training set of all data before 2020.
# (The deep reason here is that time series are non-iid, and as such, we
# must make train/dev/validation splits along chronological lines.
# An unfortunate artefact of this is never having the metrics for the exact
# model we deploy.)

# Scale: mean absolute error of the seasonal naive forecast on the training set
# (rows without a year-earlier value are dropped).
naive_forecast = year_ahead_hourly_forecast(train_df)
denom = (
    np.sum(np.abs((naive_forecast - train_df.y).dropna()))
    / len(naive_forecast.dropna())
)
mase = (np.abs(predictions.y - predictions.yhat) / denom).mean()

print(f"The MAPE of our best performing model is: {mape}")
print(f"The MAPE of the seasonal naive baseline: {naive_mape}")
print(f"The MASE of the best performing model is: {mase}")
117 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
41 | from setuptools import setup
42 |
setup(
    name='sts',
    version='0.0.1',
    description='''
        Utilities for structural time series modelling of
        California electricity demand data.
        ''',
    author='Chris J. Wallace',
    # Sub-packages must be listed explicitly: setuptools does not recurse,
    # so listing only 'sts' would leave sts.data and sts.models out of a
    # regular (non-editable) install.
    packages=['sts', 'sts.data', 'sts.models']
)
53 |
--------------------------------------------------------------------------------
/sts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/sts/__init__.py
--------------------------------------------------------------------------------
/sts/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/sts/data/__init__.py
--------------------------------------------------------------------------------
/sts/data/loader.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
41 | import os
42 | import json
43 | import requests
44 |
45 | import pandas as pd
46 |
47 |
def load_california_electricity_demand(
        filepath='data/demand.json',
        api_key_env='EIA_API_KEY',
        train_only=False):
    """
    Load the California electricity demand series as a DataFrame.

    The raw json is read from `filepath`, or downloaded (using the API key
    held in the environment variable named by `api_key_env`) when that file
    does not exist. The two raw columns are renamed to 'ds' and 'y', 'ds' is
    shifted from UTC to PST and stripped of any timezone, and rows are sorted
    by 'ds'. With `train_only=True`, rows from 2019 onwards are removed.
    """
    raw = read_or_download_data(filepath, api_key_env)

    frame = json_to_df(raw).rename(columns={0: 'ds', 1: 'y'})
    frame = frame.assign(ds=utc_to_pst)
    frame = frame.assign(ds=lambda d: d.ds.dt.tz_localize(None))
    frame = frame.sort_values('ds')

    return remove_2019_and_later(frame) if train_only else frame
67 |
68 |
def read_or_download_data(filepath, api_key_env):
    """
    Return the demand data, from the local cache file if present.

    If `filepath` exists it is read and returned. Otherwise the data is
    fetched using the API key found in the environment variable named by
    `api_key_env`, and cached at `filepath` for subsequent runs.
    """
    if os.path.exists(filepath):
        return read_json(filepath)

    api_key = try_get_env(api_key_env)
    response_json = fetch_california_demand(api_key)

    # Only cache payloads that actually contain series data. Previously any
    # response (e.g. an error returned because the API key was missing) was
    # written to disk and then silently reused on every later run, even
    # after the key had been fixed.
    if isinstance(response_json, dict) and 'series' in response_json:
        write_json(response_json, filepath)

    return response_json
80 |
81 |
def read_json(file):
    """Parse the json document stored at path `file` and return the result."""
    with open(file) as handle:
        return json.load(handle)
86 |
87 |
def write_json(data, filepath):
    """
    Serialise `data` as json to `filepath`, overwriting any existing file.

    Missing parent directories are created first, so writing to e.g.
    'data/demand.json' works even when 'data/' does not exist yet.
    """
    parent = os.path.dirname(filepath)
    if parent:  # empty when filepath is a bare file name
        os.makedirs(parent, exist_ok=True)
    with open(filepath, 'w') as file:
        json.dump(data, file)
91 |
92 |
def try_get_env(api_key_env):
    """
    Look up the environment variable named `api_key_env`.

    Returns its value when set and non-empty; otherwise prints a hint and
    returns None.
    """
    value = os.getenv(api_key_env)
    if not value:
        print('Please provide a valid EIA_API_KEY environment variable.')
        return None
    return value
100 |
101 |
def fetch_california_demand(api_key):
    """
    Request the 'EBA.CAL-ALL.D.H' series from the EIA API.

    Returns the decoded json body of the response.
    """
    # NOTE(review): this is the EIA "series" (v1-style) endpoint - confirm it
    # is still served before relying on fresh downloads.
    response = requests.get(
        # https so the API key is not sent in clear text
        'https://api.eia.gov/series',
        params={
            'api_key': api_key,
            'series_id': 'EBA.CAL-ALL.D.H',
            'out': 'json'
        },
        # requests has no default timeout; without one a stalled connection
        # would block the calling script forever
        timeout=60
    )
    return response.json()
112 |
113 |
def json_to_df(data):
    """Build a DataFrame from the 'data' rows of the first entry in data['series']."""
    rows = data['series'][0]['data']
    return pd.DataFrame(rows)
117 |
118 |
def utc_to_pst(df):
    """
    Return the 'ds' column of `df` parsed as datetimes and moved from UTC
    to PST.

    PST is taken as a fixed offset of UTC - 8 hours, i.e. daylight savings
    is deliberately ignored.
    """
    utc = pd.to_datetime(df['ds'])
    eight_hours = pd.Timedelta('8 hours')
    return utc - eight_hours
130 |
131 |
def remove_2019_and_later(df):
    """Keep only the rows of `df` whose 'ds' value is before 2019."""
    before_2019 = df['ds'] < '2019'
    return df[before_2019]
134 |
--------------------------------------------------------------------------------
/sts/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/sts/models/__init__.py
--------------------------------------------------------------------------------
/sts/models/baselines.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
41 | NUM_HOURS_IN_DAY = 24
42 | NUM_DAYS_IN_WEEK = 7
43 |
44 |
45 | # Define some baseline forecasts
46 |
47 | # n-step ahead
48 |
49 | # n-step hourly
50 |
51 |
def hour_ahead_hourly_forecast(df):
    """Naive forecast: each row's prediction is the 'y' value one row earlier."""
    return df.y.shift(periods=1)
54 |
55 |
def day_ahead_hourly_forecast(df):
    """Naive forecast: each row's prediction is the 'y' value 24 rows (hours) earlier."""
    hours_back = 24
    return df.y.shift(periods=hours_back)
58 |
59 |
def week_ahead_hourly_forecast(df):
    """Naive forecast: each row's prediction is the 'y' value one week of hourly rows earlier."""
    hours_in_week = NUM_HOURS_IN_DAY * NUM_DAYS_IN_WEEK
    return df.y.shift(periods=hours_in_week)
62 |
63 |
def month_ahead_hourly_forecast(df):
    """
    Naive forecast: each row's prediction is the 'y' value one month of
    hourly rows earlier, where a month is taken to be exactly four weeks.
    """
    hours_in_month = 4 * NUM_HOURS_IN_DAY * NUM_DAYS_IN_WEEK
    return df.y.shift(periods=hours_in_month)
67 |
68 |
def year_ahead_hourly_forecast(df):
    """
    Naive forecast: each row's prediction is the 'y' value one year of
    hourly rows earlier, where a year is taken to be exactly 52 weeks.
    """
    hours_in_year = 52 * NUM_HOURS_IN_DAY * NUM_DAYS_IN_WEEK
    return df.y.shift(periods=hours_in_year)
72 |
73 |
74 | # n-step daily
75 |
76 |
def day_ahead_daily_forecast(df):
    """Naive forecast: each row's prediction is the 'y' value one row (day) earlier."""
    return df.y.shift(periods=1)
79 |
80 |
def week_ahead_daily_forecast(df):
    """Naive forecast: each row's prediction is the 'y' value one week of daily rows earlier."""
    days_back = NUM_DAYS_IN_WEEK
    return df.y.shift(periods=days_back)
83 |
84 |
def month_ahead_daily_forecast(df):
    """
    Naive forecast: each row's prediction is the 'y' value one month of
    daily rows earlier, where a month is taken to be exactly four weeks.
    """
    days_in_month = 4 * NUM_DAYS_IN_WEEK
    return df.y.shift(periods=days_in_month)
88 |
89 |
def year_ahead_daily_forecast(df):
    """
    Naive forecast: each row's prediction is the 'y' value one year of
    daily rows earlier, where a year is taken to be exactly 52 weeks.
    """
    days_in_year = 52 * NUM_DAYS_IN_WEEK
    return df.y.shift(periods=days_in_year)
93 |
94 |
95 | # Collect baseline forecasts
96 |
97 |
def global_mean_forecast(df):
    """Constant baseline: the mean of the whole 'y' column."""
    observed = df.y
    return observed.mean()
100 |
101 |
def hourly_forecasts(df):
    """
    Return a copy of `df` with one extra column per hourly baseline.

    Each new column is named after the baseline function that produces it
    and holds that function's forecast computed from `df`.
    """
    baselines = {
        'hour_ahead_hourly_forecast': hour_ahead_hourly_forecast,
        'day_ahead_hourly_forecast': day_ahead_hourly_forecast,
        'week_ahead_hourly_forecast': week_ahead_hourly_forecast,
        'month_ahead_hourly_forecast': month_ahead_hourly_forecast,
        'year_ahead_hourly_forecast': year_ahead_hourly_forecast,
    }
    return df.assign(**baselines)
111 |
112 |
def daily_forecasts(df):
    """
    Aggregate `df` to daily totals and add one column per daily baseline.

    `df` is resampled to 1-day bins (summing within each bin), so it must be
    indexed by datetimes, as `resample` requires; for a frame that carries
    its timestamps in a 'ds' column, pass `df.set_index('ds')`. Each added
    column is named after the baseline function that produces it.

    The function previously took no argument and read an undefined global
    `df`, which raised NameError on every call; it now takes the frame
    explicitly, mirroring `hourly_forecasts`.
    """
    forecasts = df.resample('1D').sum().assign(
        day_ahead_daily_forecast=day_ahead_daily_forecast,
        week_ahead_daily_forecast=week_ahead_daily_forecast,
        month_ahead_daily_forecast=month_ahead_daily_forecast,
        year_ahead_daily_forecast=year_ahead_daily_forecast
    )
    return forecasts
121 |
--------------------------------------------------------------------------------
/sts/models/prophet.py:
--------------------------------------------------------------------------------
1 | # ###########################################################################
2 | #
3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
4 | # (C) Cloudera, Inc. 2020
5 | # All rights reserved.
6 | #
7 | # Applicable Open Source License: Apache 2.0
8 | #
9 | # NOTE: Cloudera open source products are modular software products
10 | # made up of hundreds of individual components, each of which was
11 | # individually copyrighted. Each Cloudera open source product is a
12 | # collective work under U.S. Copyright Law. Your license to use the
13 | # collective work is as provided in your written agreement with
14 | # Cloudera. Used apart from the collective work, this file is
15 | # licensed for your use pursuant to the open source license
16 | # identified above.
17 | #
18 | # This code is provided to you pursuant a written agreement with
19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | # this code. If you do not have a written agreement with Cloudera nor
21 | # with an authorized and properly licensed third party, you do not
22 | # have any rights to access nor to use this code.
23 | #
24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | # DATA.
38 | #
39 | # ###########################################################################
40 |
41 | from fbprophet import Prophet
42 |
43 |
def default_prophet_model(df):
    """Fit a Prophet model with all-default settings to `df` and return it."""
    prophet = Prophet()
    prophet.fit(df)
    return prophet
48 |
49 |
def multiplicative_prophet_model(df):
    """Fit a Prophet model with multiplicative seasonality to `df` and return it."""
    prophet = Prophet(seasonality_mode='multiplicative')
    prophet.fit(df)
    return prophet
54 |
55 |
def seasonal_daily_prophet_model(df):
    """
    Fit and return a Prophet model with four conditional daily seasonalities.

    The built-in daily seasonality is switched off and replaced by four
    seasonalities of period 1 (fourier order 12), each active only where the
    indicator column of the same name is true: winter/summer x
    weekday/weekend. US country holidays are added as well. The indicator
    columns are added to `df` before fitting.
    """
    prophet = Prophet(
        daily_seasonality=False,
        yearly_seasonality=20,
        changepoint_prior_scale=0.001
    )

    conditions = (
        'winter_weekday',
        'winter_weekend',
        'summer_weekday',
        'summer_weekend',
    )
    for condition in conditions:
        # the seasonality and the column that switches it on share one name
        prophet.add_seasonality(
            name=condition,
            period=1,
            fourier_order=12,
            condition_name=condition
        )

    prophet.add_country_holidays(country_name='US')
    prophet.fit(add_season_weekday_indicators(df))
    return prophet
90 |
91 |
def add_season_weekday_indicators(df):
    """
    Add the four season/weekday indicator columns to `df`.

    Each column is computed by applying the matching `is_*` predicate to
    every value of the 'ds' column. Note that `df` is modified in place; the
    same frame is also returned for convenience.
    """
    predicates = (
        ('winter_weekday', is_winter_weekday),
        ('winter_weekend', is_winter_weekend),
        ('summer_weekday', is_summer_weekday),
        ('summer_weekend', is_summer_weekend),
    )
    for column, predicate in predicates:
        df[column] = df['ds'].apply(predicate)
    return df
98 |
99 |
def is_winter_weekday(ds):
    """True when `ds` is in October-May and falls on Monday-Friday."""
    in_winter = ds.month < 6 or ds.month >= 10
    if not in_winter:
        return False
    return ds.day_name() not in ['Saturday', 'Sunday']
106 |
107 |
def is_winter_weekend(ds):
    """True when `ds` is in October-May and falls on a Saturday or Sunday."""
    in_winter = ds.month < 6 or ds.month >= 10
    if not in_winter:
        return False
    return ds.day_name() in ['Saturday', 'Sunday']
114 |
115 |
def is_summer_weekday(ds):
    """
    True when `ds` is in June-September and falls on Monday-Friday.

    Summer is the complement of the winter months used by
    `is_winter_weekday` (month < 6 or month >= 10). The month test must
    therefore be a conjunction: the former `month >= 6 or month < 10` held
    for every month, which flagged winter weekdays as summer too.
    """
    in_summer = 6 <= ds.month < 10
    return in_summer and ds.day_name() not in ['Saturday', 'Sunday']
122 |
123 |
def is_summer_weekend(ds):
    """
    True when `ds` is in June-September and falls on a Saturday or Sunday.

    Summer is the complement of the winter months used by
    `is_winter_weekend` (month < 6 or month >= 10). The month test must
    therefore be a conjunction: the former `month >= 6 or month < 10` held
    for every month, which flagged winter weekends as summer too.
    """
    in_summer = 6 <= ds.month < 10
    return in_summer and ds.day_name() in ['Saturday', 'Sunday']
130 |
--------------------------------------------------------------------------------