├── .gitignore
├── .project-metadata.yaml
├── LICENSE.txt
├── README.md
├── apps
    ├── diagnostics.py
    ├── forecast.py
    ├── launch_diagnostics.py
    └── launch_forecast.py
├── cml
    ├── fit_models_parallel.py
    └── install_dependencies.py
├── data
    └── demand.json
├── img
    ├── app.png
    ├── diagnostic-chart.png
    └── diagnostic-metrics.png
├── requirements.txt
├── scripts
    ├── fit_baseline_model.py
    ├── fit_complex_log_prophet_model.py
    ├── fit_complex_prophet_model.py
    ├── fit_simple_prophet_model.py
    ├── get_csv.py
    ├── make_forecast.py
    └── validation_metrics.py
├── setup.py
└── sts
    ├── __init__.py
    ├── data
        ├── __init__.py
        └── loader.py
    └── models
        ├── __init__.py
        ├── baselines.py
        └── prophet.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 98 | __pypackages__/
 99 | 
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 | 
104 | # SageMath parsed files
105 | *.sage.py
106 | 
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 
134 | # pytype static type analyzer
135 | .pytype/
136 | 
137 | # Cython debug symbols
138 | cython_debug/
139 | 
140 | # Project specific
141 | R
142 | node_modules
143 | *.pyc
144 | __pycache__
145 | .*
146 | !.gitignore
147 | 


--------------------------------------------------------------------------------
/.project-metadata.yaml:
--------------------------------------------------------------------------------
 1 | name: Structural Time Series
 2 | description: California electricity demand forecasting with Prophet.
 3 | author: Cloudera Inc.
 4 | specification_version: 1.0
 5 | prototype_version: 1.0
 6 | date: "2020-10-14"
 7 | api_version: 1
 8 | 
 9 | environment_variables:
10 |   EIA_API_KEY:
11 |     default: "EIA KEY"
12 |     description: "Optional EIA open data API key"
13 |     prompt_user: false
14 | 
15 | tasks:
16 |   - type: create_job
17 |     name: Install Dependencies
18 |     entity_label: install_dependencies
19 |     script: cml/install_dependencies.py
20 |     arguments: None
21 |     cpu: 2
22 |     memory: 4
23 |     short_summary: Create job to install project dependencies.
24 |     environment:
25 |       TASK_TYPE: CREATE/RUN_JOB
26 |     kernel: python3
27 | 
28 |   - type: run_job
29 |     entity_label: install_dependencies
30 |     short_summary: Running install dependencies job.
31 |     long_summary: Running the job to install dependencies.  Note that this requires at least 4GB of memory
32 | 
33 |   - type: create_job
34 |     name: Launch Parallel Model Fitting
35 |     entity_label: fit_models_parallel
36 |     script: cml/fit_models_parallel.py
37 |     arguments: None
38 |     short_summary: Create job to launch parallel training script execution.
39 |     long_summary: Creates job to launch independent training workloads for each forecast script in the /scripts directory.
40 |     cpu: 1
41 |     memory: 2
42 |     environment:
43 |       TASK_TYPE: CREATE/RUN_JOB
44 |     kernel: python3
45 | 
46 |   - type: run_job
47 |     entity_label: fit_models_parallel
48 |     short_summary: Running job to train forecasts in parallel.
49 |     long_summary: Running job to train forecasts in parallel via CDSW Workers API
50 | 
51 |   - type: start_application
52 |     name: Diagnostic App
53 |     subdomain: diagnostics
54 |     script: apps/launch_diagnostics.py
55 |     short_summary: Starting forecast diagnostics application
56 |     environment_variables:
57 |       TASK_TYPE: START_APPLICATION
58 |     kernel: python3
59 | 
60 |   - type: start_application
61 |     name: Forecast App
62 |     subdomain: forecast
63 |     script: apps/launch_forecast.py
64 |     short_summary: Starting primary forecast application
65 |     environment_variables:
66 |       TASK_TYPE: START_APPLICATION
67 |     kernel: python3
68 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Structural Time Series
  2 | 
  3 | This repo accompanies the Cloudera Fast Forward report [Structural Time Series](https://structural-time-series.fastforwardlabs.com/).
  4 | It provides an example application of generalized additive models (via the [Prophet](https://facebook.github.io/prophet/) library) to California hourly electricity demand data.
  5 | 
  6 | The primary output of this repository is a small application exposing a probabilistic forecast and an interface for asking a probabilistic question against it.
  7 | The final app looks like this.
  8 | 
  9 | <img src="img/app.png" alt="Forecasting app interface" width="40%">
 10 | 
 11 | Instructions are given both for general use (on a laptop, say), and for Cloudera CML and CDSW.
 12 | We'll first describe what's here, then go through how to run everything.
 13 | 
 14 | ## Structure
 15 | 
 16 | The folder structure of the repo is as follows
 17 | 
 18 | ```
 19 | .
 20 | ├── apps      # Two small Streamlit applications.
 21 | ├── cml       # This folder contains scripts that facilitate the project launch on CML.
 22 | ├── data      # This folder contains starter data, and is where forecasts will live.
 23 | ├── scripts   # This is where all the code that does something lives.
 24 | └── sts       # A small library of useful functions.
 25 | ```
 26 | 
 27 | There's also an `img` folder that contains images for this README.
 28 | That folder is unimportant and you can ignore it.
 29 | Let's examine each of the important folders in turn.
 30 | 
 31 | ### `sts`
 32 | 
 33 | This is a small Python library of utility functions useful to our problem.
 34 | Its structure is as follows:
 35 | 
 36 | ```
 37 | sts
 38 | ├── data
 39 | │   └── loader.py
 40 | └── models
 41 |     ├── baselines.py
 42 |     └── prophet.py
 43 | ```
 44 | 
 45 | Building a small library of problem-specific abstractions allows us to reuse them in multiple places.
 46 | The code in `data/loader.py` is reused in most of the scripts and applications.
 47 | In this case, we have baked model details (such as the number of Fourier terms to include in a given Prophet model) into the library.
 48 | It would be trivial to pass these through as arguments though, if we wanted to perform an extensive hyperparameter search for example.
 49 | 
 50 | ### `scripts`
 51 | 
 52 | These imperative scripts are where the _work_ of the analysis is done.
 53 | Side-effectful actions such as I/O and model training occur in these scripts.
 54 | 
 55 | ```
 56 | scripts
 57 | ├── fit_baseline_model.py
 58 | ├── fit_simple_prophet_model.py
 59 | ├── fit_complex_prophet_model.py
 60 | ├── fit_complex_log_prophet_model.py
 61 | ├── get_csv.py
 62 | ├── make_forecast.py
 63 | └── validation_metrics.py
 64 | ```
 65 | 
 66 | ### `apps`
 67 | 
 68 | Two applications accompany this project.
 69 | Each has a launcher script to assist launching an [Application](https://docs.cloudera.com/machine-learning/cloud/applications/topics/ml-applications.html) with CDSW/CML.
 70 | To launch the applications in another environment, run the code inside the launcher files, with the prefixed `!` removed.
 71 | You may need to specify different ports.
 72 | 
 73 | ```
 74 | apps
 75 | ├── diagnostics.py          # A model comparison and debugging assistant.
 76 | ├── forecast.py             # The primary forecasting interface.
 77 | ├── launch_diagnostics.py   # Launcher script for CDSW/CML
 78 | └── launch_forecast.py      # Launcher script for CDSW/CML
 79 | ```
 80 | 
 81 | #### Diagnostics
 82 | 
 83 | The diagnostic application serves two purposes.
 84 | First, it computes and reports top level metrics for any forecasts saved in the `data/forecasts` directory.
 85 | 
 86 | <img src="img/diagnostic-metrics.png" alt="Diagnostic app showing model metrics" width="40%">
 87 | 
 88 | Second, it provides a few diagnostic charts, including a zoomable forecast.
 89 | 
 90 | <img src="img/diagnostic-chart.png" alt="Diagnostic app showing chart of forecast" width="40%">
 91 | 
 92 | #### Forecast
 93 | 
 94 | The primary forecast application (pictured at the top of this README) is a prototype user interface for the forecast this analysis generates.
 95 | 
 96 | ### `cml`
 97 | 
 98 | These scripts serve as launch instructions to facilitate the automated project setup on CML.
 99 | Each script is triggered by the declarative pipeline as defined in the `.project-metadata.yaml` file found in the project's root directory.
100 | 
101 | ```
102 | cml
103 | ├── install_dependencies.py
104 | └── fit_models_parallel.py
105 | ```
106 | 
107 | ## Running through the analysis
108 | 
109 | To go from a fresh clone of the repo to the final state, follow these instructions in order.
110 | 
111 | ### Installation
112 | 
113 | The code and applications within were developed against Python 3.6.9, and are likely also to function with more recent versions of Python.
114 | 
115 | To install dependencies, first create and activate a new virtual environment through your preferred means, then pip install from the requirements file. I recommend:
116 | 
117 | ```bash
118 | python3 -m venv .venv
119 | source .venv/bin/activate
120 | pip install -r requirements.txt
121 | ```
122 | 
123 | In CML or CDSW, no virtual env is necessary. Instead, inside a Python 3 session (with at least 2 vCPU / 4 GiB Memory), simply run
124 | 
125 | ```python
126 | !pip3 install -r requirements.txt     # notice `pip3`, not `pip`
127 | ```
128 | 
129 | Next, install the `sts` module from this repository, with
130 | 
131 | ```bash
132 | pip3 install -e .
133 | ```
134 | 
135 | from inside the root directory of this repo.
136 | 
137 | ### Data
138 | 
139 | We use historic California electricity demand data from the [US Energy Information Administration](https://www.eia.gov/opendata/qb.php?category=3389936&sdid=EBA.CAL-ALL.D.H).
140 | 
141 | A full set of data through October 12th 2020 is included as a starter.
142 | More recent data can be fetched from the [EIA open data API](https://www.eia.gov/opendata/).
143 | Doing so requires an API key, which must be set as the `EIA_API_KEY` environment variable for this project.
144 | To fetch new data, simply call the `load_california_electricity_demand` function from the `sts.data.loader` module.
145 | The code is set up to work directly with the json response to the EIA API.
146 | By default, each time new data is fetched, it will overwrite the existing data.
147 | Similarly, when a new forecast is made, it will overwrite the existing forecast.
148 | It would not be hard to adapt the code to maintain a history of fetched data or forecasts if desired.
149 | 
150 | ### Scripts
151 | 
152 | To fit models and generate forecasts, we call each script in turn from the `scripts` directory.
153 | 
154 | ```bash
155 | python3 scripts/fit_baseline_model.py
156 | python3 scripts/fit_simple_prophet_model.py
157 | python3 scripts/fit_complex_prophet_model.py
158 | python3 scripts/fit_complex_log_prophet_model.py
159 | ```
160 | 
161 | This will fit a series of models of increasing complexity and write their outputs (the mean forecast) to the `data/forecasts` directory.
162 | Launching the diagnostic app will show the metrics and diagnostic charts for each model.
163 | 
164 | The most complex model wins.
165 | We can view its metrics when trained on the validation data (through 2019) by running the `scripts/validation_metrics.py` script.
166 | We can then generate 1000 samples from the model trained on all available training data with the `scripts/make_forecast.py` script.
167 | When those samples are written to disk, we can use the forecast app to investigate them.
168 | 
169 | The additional script, `get_csv.py`, simply fetches and writes data as a csv, which is convenient for any ad hoc analytics and interactive exploration.
170 | 


--------------------------------------------------------------------------------
/apps/diagnostics.py:
--------------------------------------------------------------------------------
  1 | # ###########################################################################
  2 | #
  3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
  4 | #  (C) Cloudera, Inc. 2020
  5 | #  All rights reserved.
  6 | #
  7 | #  Applicable Open Source License: Apache 2.0
  8 | #
  9 | #  NOTE: Cloudera open source products are modular software products
 10 | #  made up of hundreds of individual components, each of which was
 11 | #  individually copyrighted.  Each Cloudera open source product is a
 12 | #  collective work under U.S. Copyright Law. Your license to use the
 13 | #  collective work is as provided in your written agreement with
 14 | #  Cloudera.  Used apart from the collective work, this file is
 15 | #  licensed for your use pursuant to the open source license
 16 | #  identified above.
 17 | #
 18 | #  This code is provided to you pursuant a written agreement with
 19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
 20 | #  this code. If you do not have a written agreement with Cloudera nor
 21 | #  with an authorized and properly licensed third party, you do not
 22 | #  have any rights to access nor to use this code.
 23 | #
 24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
 25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
 26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
 27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
 28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
 29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
 30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
 31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
 32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
 33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
 34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
 35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
 36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
 37 | #  DATA.
 38 | #
 39 | # ###########################################################################
 40 | 
 41 | import os
 42 | 
 43 | import streamlit as st
 44 | import pandas as pd
 45 | import numpy as np
 46 | import plotly.express as px
 47 | import plotly.graph_objects as go
 48 | from statsmodels.tsa.stattools import acf, pacf
 49 | 
 50 | from sts.data.loader import load_california_electricity_demand
 51 | from sts.models.baselines import year_ahead_hourly_forecast
 52 | 
 53 | 
# Relative path of the directory holding the saved forecast files; it is
# listed by load_all_forecasts and read file-by-file by read_forecast.
FORECAST_DIRECTORY = "data/forecasts"


# Page heading for the diagnostics app.
st.title("California Electricity Demand Model Diagnostics")
 58 | 
 59 | # first, load true demand data and forecasts
 60 | 
 61 | 
 62 | def read_forecast(filename):
 63 |     name = filename.split(".")[0]
 64 |     df = (
 65 |         pd
 66 |         .read_csv(FORECAST_DIRECTORY+"/"+filename)
 67 |         .rename(columns={"yhat": name})
 68 |         .assign(ds=lambda df: pd.to_datetime(df.ds))
 69 |     )
 70 |     return df
 71 | 
 72 | 
 73 | @st.cache(allow_output_mutation=True)
 74 | def load_all_forecasts():
 75 |     df = load_california_electricity_demand().sort_values("ds")
 76 |     forecast_list = os.listdir(FORECAST_DIRECTORY)
 77 |     for f in forecast_list:
 78 |         df = df.merge(read_forecast(f), on="ds")
 79 |     return df
 80 | 
 81 | 
 82 | data_loading = st.text("Loading data...")
 83 | df = load_all_forecasts()
 84 | data_loading.text("")
 85 | 
 86 | df_train = df[df.ds.dt.year < 2019]
 87 | df_2018 = df[df.ds.dt.year == 2018]
 88 | df_2019 = df[df.ds.dt.year == 2019]
 89 | 
 90 | model_names = [x for x in df.columns if x not in ["ds", "y"]]
 91 | 
 92 | 
 93 | f"""
 94 | ## Model comparison
 95 | There are {len(model_names)} models. Here is a comparison of their MAPE for select data slices.
 96 | We compare a held out test set (2019) to the whole training set through 2018 and also
 97 | 2018 in isolation. 2018 is included for being one complete period in the training set
 98 | of equal length to 2019.
 99 | """
100 | 
101 | 
def ape(df):
    """Compute the absolute percentage error of every model, row by row.

    Args:
        df: DataFrame with the observed values in column "y" and one column
            per entry of the module-level ``model_names``.

    Returns:
        A DataFrame with one column per model holding ``|y - forecast| / y``.
    """
    actual = df.y
    errors = {}
    for model in model_names:
        errors[model] = (actual - df[model]).abs() / actual
    return pd.DataFrame(errors)
104 | 
105 | 
# Mean absolute percentage error per model, one row per data slice.
_mape_by_slice = {
    label: ape(subset).mean().rename("MAPE")
    for label, subset in (
        ("all training", df_train),
        ("2018 (training)", df_2018),
        ("2019  (holdout)", df_2019),
    )
}
st.write(pd.DataFrame(_mape_by_slice).transpose())


# Bare string literal: shown on the page through Streamlit's "magic" rendering.
"""
---
Another metric for this kind of time series is [MASE](https://en.wikipedia.org/wiki/Mean_absolute_scaled_error).
We will use a seasonal variant, where the season is defined to be 52 weeks long,
so that years are approximately aligned.
MASE measures error relative to the baseline, so a lower score is better.
"""
122 | 
123 | 
def mase_denominator(df):
    """Compute the scaling term used to normalise errors into MASE.

    The term is the summed absolute difference between the naive
    ``year_ahead_hourly_forecast`` of ``df`` and the observed ``df.y``
    (missing differences dropped), divided by the number of non-missing
    naive forecast values.

    Args:
        df: DataFrame with the observed values in column "y".
            NOTE(review): presumably the naive forecast is aligned with
            ``df.y`` by index; confirm against sts.models.baselines.

    Returns:
        The scaling term as a scalar.
    """
    seasonal_naive = year_ahead_hourly_forecast(df)
    abs_errors = np.abs((seasonal_naive - df.y).dropna())
    n_forecasts = len(seasonal_naive.dropna())
    return np.sum(abs_errors) / n_forecasts
130 | 
131 | 
# Scaling term computed once from the training slice and reused by mase().
denom = mase_denominator(df_train)


def mase(df):
    """Compute each model's absolute error scaled by the baseline term.

    Args:
        df: DataFrame with the observed values in column "y" and one column
            per entry of the module-level ``model_names``.

    Returns:
        A DataFrame with one column per model holding
        ``|y - forecast| / denom``; averaging a column gives that model's
        MASE.
    """
    actual = df.y
    scaled = {}
    for model in model_names:
        scaled[model] = (actual - df[model]).abs() / denom
    return pd.DataFrame(scaled)
137 | 
138 | 
# Mean absolute scaled error per model, one row per data slice.
_mase_by_slice = {
    label: mase(subset).mean().rename("MASE")
    for label, subset in (
        ("all training", df_train),
        ("2018 (training)", df_2018),
        ("2019  (holdout)", df_2019),
    )
}
st.write(pd.DataFrame(_mase_by_slice).transpose())


# Bare string literal: shown on the page through Streamlit's "magic" rendering.
"""
---
## Model drill-down
We can compute some more detailed diagnostics for each model individually.
"""
# The model chosen here drives the charts that follow.
active_model = st.selectbox("Model", model_names)
154 | 
155 | 
156 | """
157 | ### The forecast
158 | First, we should see the forecast vs true, observed values.
159 | """
160 | 
# Line chart of observed demand and the selected model's forecast over time.
forecast_chart = px.line(
    df,
    x='ds',
    y=['y', active_model],
    color_discrete_sequence=["#ff8300", "#00828c"],
)

# Range-selector buttons for common zoom windows.
range_buttons = [
    {"count": 7, "label": "1w", "step": "day", "stepmode": "backward"},
    {"count": 1, "label": "1m", "step": "month", "stepmode": "backward"},
    {"count": 3, "label": "3m", "step": "month", "stepmode": "backward"},
    {"count": 6, "label": "6m", "step": "month", "stepmode": "backward"},
    {"count": 1, "label": "YTD", "step": "year", "stepmode": "todate"},
    {"count": 1, "label": "1y", "step": "year", "stepmode": "backward"},
    {"step": "all"},
]
forecast_chart.update_xaxes(
    rangeslider_visible=True,
    rangeselector={"buttons": range_buttons},
)

# Horizontal legend anchored to the top-right corner of the plot area.
legend_layout = {
    "orientation": "h",
    "yanchor": "bottom",
    "y": 1.02,
    "xanchor": "right",
    "x": 1,
}
forecast_chart.update_layout(
    xaxis_title="Datetime (hourly increments)",
    yaxis_title="Demand (Megawatt-hours)",
    legend=legend_layout,
    legend_title_text="",
)
st.plotly_chart(forecast_chart)
193 | 
194 | 
195 | """
196 | ---
197 | ### Diagnostics
198 | """
199 | 
data_set = st.selectbox("Dataset", ['Train', 'Test', 'Combined'])

# 'Train' and 'Test' swap in the matching slice; any other choice
# ('Combined') keeps the frame that `df` already refers to.
df = {'Train': df_train, 'Test': df_2019}.get(data_set, df)
206 | 
207 | 
208 | """
209 | ---
210 | Scatter plot of the true values vs forecast values.
211 | This plot will be heavily overplotted, but the overall shape should tell us
212 | whether we are over- or under-predicting.
213 | """
214 | 
# Scatter of observed demand (x) against the selected model's forecast (y).
scatter_trace = go.Scatter(
    x=df.y,
    y=df[active_model],
    mode="markers",
    marker={"color": "#00828c", "opacity": 0.2},
)
scatter_chart = go.Figure(data=scatter_trace)
scatter_chart.update_layout(
    xaxis_title="True demand (Megawatt-hours)",
    yaxis_title="Forecast demand (Megawatt-hours)",
)

st.plotly_chart(scatter_chart)

# Residuals (observed minus forecast); rows with a missing value are dropped.
residuals = (df["y"] - df[active_model]).dropna()
228 | 
229 | 
230 | """
231 | ---
232 | Here is the marginal distribution of the residuals.
233 | We expect it to be symmetric, approximately normal, and centered at zero.
234 | """
235 | 
# Histogram of the residuals computed above for the selected model and dataset.
residual_chart = px.histogram(
    df, x=residuals, color_discrete_sequence=["#00828c"]
)
residual_chart.update_layout(
    xaxis_title="Residual (true demand - forecast demand) (Megawatt-hours)",
    yaxis_title="Count"
)

st.plotly_chart(residual_chart)
245 | 
246 | 
247 | """
248 | ---
249 | The autocorrelation and partial autocorrelation of the residuals.
250 | Since none of our models try to model the error (with autoregressive terms), we may
251 | expect some autocorrelation.
252 | The orange bands represent the 95% confidence interval for the null hypothesis that
253 | there is no (partial) autocorrelation.
254 | Bars outside those bounds indicate high likelihood of autocorrelation.
255 | """
256 | 
def _correlogram_chart(estimates, conf_intervals, column, yaxis_title):
    """Build an overlaid bar chart of per-lag estimates and their CI bounds.

    ``estimates`` holds one value per lag and ``conf_intervals`` the matching
    (lower, upper) pairs. ``column`` names the estimate series in the chart
    and ``yaxis_title`` labels the y axis.
    """
    chart_df = pd.DataFrame({
        column: estimates,
        # center confidence intervals on zero,
        # so that null hypothesis is zero (partial) autocorrelation
        "ci_lower": conf_intervals[:, 0] - estimates,
        "ci_upper": conf_intervals[:, 1] - estimates,
    })
    chart = px.bar(
        chart_df,
        x=chart_df.index,
        y=[column, "ci_lower", "ci_upper"],
        color_discrete_sequence=["#00828c", "#ff8300", "#ff8300"],
        barmode="overlay",
    )
    chart.update_layout(
        xaxis_title="Timestep (hours)",
        yaxis_title=yaxis_title,
        showlegend=False,
    )
    return chart


# Autocorrelation of the residuals for lags 0..48 with 95% intervals.
autocorrelation, conf_intervals = acf(residuals, alpha=0.05, nlags=48)
autocorrelation_chart = _correlogram_chart(
    autocorrelation, conf_intervals, "autocorrelation", "Autocorrelation"
)

st.plotly_chart(autocorrelation_chart)


# Partial autocorrelation of the residuals, same lags and interval level.
partial_autocorrelation, partial_conf_intervals = pacf(
    residuals, alpha=0.05, nlags=48
)
partial_autocorrelation_chart = _correlogram_chart(
    partial_autocorrelation,
    partial_conf_intervals,
    "partial_autocorrelation",
    "Partial autocorrelation",
)

st.plotly_chart(partial_autocorrelation_chart)
307 | 


--------------------------------------------------------------------------------
/apps/forecast.py:
--------------------------------------------------------------------------------
  1 | # ###########################################################################
  2 | #
  3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
  4 | #  (C) Cloudera, Inc. 2020
  5 | #  All rights reserved.
  6 | #
  7 | #  Applicable Open Source License: Apache 2.0
  8 | #
  9 | #  NOTE: Cloudera open source products are modular software products
 10 | #  made up of hundreds of individual components, each of which was
 11 | #  individually copyrighted.  Each Cloudera open source product is a
 12 | #  collective work under U.S. Copyright Law. Your license to use the
 13 | #  collective work is as provided in your written agreement with
 14 | #  Cloudera.  Used apart from the collective work, this file is
 15 | #  licensed for your use pursuant to the open source license
 16 | #  identified above.
 17 | #
 18 | #  This code is provided to you pursuant a written agreement with
 19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
 20 | #  this code. If you do not have a written agreement with Cloudera nor
 21 | #  with an authorized and properly licensed third party, you do not
 22 | #  have any rights to access nor to use this code.
 23 | #
 24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
 25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
 26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
 27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
 28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
 29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
 30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
 31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
 32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
 33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
 34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
 35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
 36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
 37 | #  DATA.
 38 | #
 39 | # ###########################################################################
 40 | 
 41 | import datetime
 42 | 
 43 | import streamlit as st
 44 | import pandas as pd
 45 | import numpy as np
 46 | import matplotlib.pyplot as plt
 47 | import plotly.express as px
 48 | 
 49 | 
# Number of individual forecast samples drawn for the main chart.
N_SAMPLES = 10

st.title("California Electricity Demand Forecast")


# Data loading and selection

# Placeholder text; it is blanked out once the data has been loaded and subset.
data_loading = st.text("Loading data...")
 58 | 
 59 | 
 60 | @st.cache(allow_output_mutation=True)
 61 | def load_data():
 62 |     data = pd.read_csv("data/forecast.csv", parse_dates=["ds"])
 63 |     data = data.set_index("ds")
 64 |     return data
 65 | 
 66 | 
 67 | data = load_data()
 68 | 
 69 | st.markdown("""
 70 |     The forecast is generated for one year ahead of the most recent
 71 |     observation. Please select the range of interest over which to view and
 72 |     filter samples from the forecast distribution.
 73 | """)
 74 | 
 75 | start_date, end_date = st.date_input(
 76 |     "Select a forecast range",
 77 |     [data.index.min().date(), data.index.max().date()]
 78 | )
 79 | 
 80 | subset = data[(data.index.date >= start_date) &
 81 |               (data.index.date <= end_date)].copy()
 82 | data_loading.text("")
 83 | 
 84 | 
 85 | @st.cache(hash_funcs={pd.DataFrame: lambda _: None})
 86 | def samples(df):
 87 |     return df.sample(N_SAMPLES, axis="columns").reset_index().melt(id_vars='ds')
 88 | 
 89 | 
 90 | @st.cache(hash_funcs={pd.DataFrame: lambda _: None})
 91 | def mean(df):
 92 |     return df.mean(axis="columns")
 93 | 
 94 | # Main forecast plot
 95 | 
 96 | 
st.markdown(f"""
    The chart below shows the mean forecast (based on 1000 samples),
    and {N_SAMPLES} individual samples, which can be thought of as
    "possible futures".
""")

# Placeholder text; cleared once the chart has been rendered.
generating_chart = st.text("Generating chart")
mean_forecast = mean(subset)
sample_forecasts = samples(subset)


# One faint line per sampled column (grouped by the melted 'variable' column).
line_chart = px.line(
    sample_forecasts,
    x='ds',
    y='value',
    line_group='variable',
    color_discrete_sequence=["rgba(0,130,140,0.1)"],

)
# Overlay the mean forecast as a fully opaque line in the same colour.
line_chart.add_scatter(
    x=mean_forecast.index,
    y=mean_forecast,
    mode='lines',
    marker=dict(color="rgba(0,130,140,1)")
)
# Restrict the x axis to the selected date range.
line_chart.update_xaxes(range=[start_date, end_date])
line_chart.update_layout(
    showlegend=False,
    xaxis_title="Datetime (hourly increments)",
    yaxis_title="Megawatt-hours"
)
st.plotly_chart(line_chart)
generating_chart.text("")
130 | 
131 | 
132 | # Marginal plot of sum of values over interval
133 | 
# Total demand over the selected range, one value per column of `subset`.
data_sum = subset.sum()
_min = float(data_sum.min())
_max = float(data_sum.max())

st.markdown(f"""
    The mean estimate of the aggregate demand from {start_date} to {end_date}
    is **{data_sum.mean():.2e}** Megawatt-hours.
""")

st.markdown("""
    We can assess the probability of exceeding a given aggregate demand over
    the selected period. Choose the threshold of interest below.
""")

threshold = st.slider(
    "Threshold (Megawatt-hours)",
    min_value=_min,
    max_value=_max,
    format="%.2e",
)

# Totals strictly above the threshold; reused for the probability and the plot.
exceeding = data_sum[data_sum > threshold]
prob_exceed = exceeding.count() / data_sum.count()

st.markdown(f"""
    The probability of the aggregate demand between {start_date} and {end_date}
    being more than {threshold:.2e} Megawatt-hours is
    **{100*prob_exceed:.1f}**%.
""")

st.markdown("""
    The histogram below shows the probability distribution of possible
    aggregate demands, cut off at the threshold selected.
    The higher the count for a given demand, the more likely that future is.
""")

hist = px.histogram(
    exceeding,
    title="Possible total electricity demand levels",
    color_discrete_sequence=["#00828c"],
)
# Keep the x axis fixed to the full range of totals, whatever the threshold.
hist.update_xaxes(range=[_min, _max])
hist.update_layout(
    showlegend=False,
    xaxis_title="Megawatt-hours",
    yaxis_title="Count (of 1000 simulated futures)",
)
st.plotly_chart(hist)
181 | 


--------------------------------------------------------------------------------
/apps/launch_diagnostics.py:
--------------------------------------------------------------------------------
 1 | # ###########################################################################
 2 | #
 3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
 4 | #  (C) Cloudera, Inc. 2020
 5 | #  All rights reserved.
 6 | #
 7 | #  Applicable Open Source License: Apache 2.0
 8 | #
 9 | #  NOTE: Cloudera open source products are modular software products
10 | #  made up of hundreds of individual components, each of which was
11 | #  individually copyrighted.  Each Cloudera open source product is a
12 | #  collective work under U.S. Copyright Law. Your license to use the
13 | #  collective work is as provided in your written agreement with
14 | #  Cloudera.  Used apart from the collective work, this file is
15 | #  licensed for your use pursuant to the open source license
16 | #  identified above.
17 | #
18 | #  This code is provided to you pursuant a written agreement with
19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | #  this code. If you do not have a written agreement with Cloudera nor
21 | #  with an authorized and properly licensed third party, you do not
22 | #  have any rights to access nor to use this code.
23 | #
24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | #  DATA.
38 | #
39 | # ###########################################################################
40 | 
# IPython-style shell escape (`!`): starts the Streamlit diagnostics app on
# the port given by $CDSW_APP_PORT, bound to 127.0.0.1.
!streamlit run apps/diagnostics.py --server.port $CDSW_APP_PORT --server.address 127.0.0.1
42 | 


--------------------------------------------------------------------------------
/apps/launch_forecast.py:
--------------------------------------------------------------------------------
 1 | # ###########################################################################
 2 | #
 3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
 4 | #  (C) Cloudera, Inc. 2020
 5 | #  All rights reserved.
 6 | #
 7 | #  Applicable Open Source License: Apache 2.0
 8 | #
 9 | #  NOTE: Cloudera open source products are modular software products
10 | #  made up of hundreds of individual components, each of which was
11 | #  individually copyrighted.  Each Cloudera open source product is a
12 | #  collective work under U.S. Copyright Law. Your license to use the
13 | #  collective work is as provided in your written agreement with
14 | #  Cloudera.  Used apart from the collective work, this file is
15 | #  licensed for your use pursuant to the open source license
16 | #  identified above.
17 | #
18 | #  This code is provided to you pursuant a written agreement with
19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | #  this code. If you do not have a written agreement with Cloudera nor
21 | #  with an authorized and properly licensed third party, you do not
22 | #  have any rights to access nor to use this code.
23 | #
24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | #  DATA.
38 | #
39 | # ###########################################################################
40 | 
# IPython-style shell escape (`!`): starts the Streamlit forecast app on
# the port given by $CDSW_APP_PORT, bound to 127.0.0.1.
!streamlit run apps/forecast.py --server.port $CDSW_APP_PORT --server.address 127.0.0.1
42 | 


--------------------------------------------------------------------------------
/cml/fit_models_parallel.py:
--------------------------------------------------------------------------------
 1 | # ###########################################################################
 2 | #
 3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
 4 | #  (C) Cloudera, Inc. 2020
 5 | #  All rights reserved.
 6 | #
 7 | #  Applicable Open Source License: Apache 2.0
 8 | #
 9 | #  NOTE: Cloudera open source products are modular software products
10 | #  made up of hundreds of individual components, each of which was
11 | #  individually copyrighted.  Each Cloudera open source product is a
12 | #  collective work under U.S. Copyright Law. Your license to use the
13 | #  collective work is as provided in your written agreement with
14 | #  Cloudera.  Used apart from the collective work, this file is
15 | #  licensed for your use pursuant to the open source license
16 | #  identified above.
17 | #
18 | #  This code is provided to you pursuant a written agreement with
19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | #  this code. If you do not have a written agreement with Cloudera nor
21 | #  with an authorized and properly licensed third party, you do not
22 | #  have any rights to access nor to use this code.
23 | #
24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | #  DATA.
38 | #
39 | # ###########################################################################
40 | 
41 | import os
42 | import time
43 | import cdsw
44 | 
45 | 
def fit_models_parallel(poll_interval=60):
    '''
    Use the CDSW Workers API (via Python SDK) to launch each model fitting script in parallel

    Docs - https://docs.cloudera.com/machine-learning/cloud/distributed-computing/topics/ml-workers-api.html

    Every file in ``<cwd>/scripts`` whose name starts with ``fit`` or ``mak``
    is launched in its own worker. The call then blocks, polling the worker
    list every ``poll_interval`` seconds, until all workers report
    ``succeeded``.

    Raises RuntimeError if a worker reports a status it cannot recover from,
    instead of polling forever.
    '''
    # Launch a separate worker to run each script independently

    script_dir = os.path.join(os.getcwd(), 'scripts')
    scripts = [
        os.path.join(script_dir, name)
        for name in os.listdir(script_dir)
        if name.startswith(('fit', 'mak'))
    ]

    for script in scripts:
        cdsw.launch_workers(n=1, cpu=1, memory=3, script=script)

    # Force session to persist until each worker job has completed

    # NOTE(review): these terminal status names are assumed from the CDSW
    # engine states - confirm against the Workers API documentation.
    failure_states = {'failed', 'stopped', 'timedout'}

    while True:
        time.sleep(poll_interval)

        statuses = [wkr['status'] for wkr in cdsw.list_workers()]

        # A worker in one of these states will never reach 'succeeded', so
        # waiting any longer would hang the session indefinitely.
        failed = [status for status in statuses if status in failure_states]
        if failed:
            raise RuntimeError(
                'Worker(s) finished without succeeding: ' + ', '.join(failed)
            )

        if all(status == 'succeeded' for status in statuses):
            return
79 | 
80 | 
# Run the parallel fit only when this file is executed as a script.
if __name__ == "__main__":
    fit_models_parallel()
83 | 


--------------------------------------------------------------------------------
/cml/install_dependencies.py:
--------------------------------------------------------------------------------
 1 | # ###########################################################################
 2 | #
 3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
 4 | #  (C) Cloudera, Inc. 2020
 5 | #  All rights reserved.
 6 | #
 7 | #  Applicable Open Source License: Apache 2.0
 8 | #
 9 | #  NOTE: Cloudera open source products are modular software products
10 | #  made up of hundreds of individual components, each of which was
11 | #  individually copyrighted.  Each Cloudera open source product is a
12 | #  collective work under U.S. Copyright Law. Your license to use the
13 | #  collective work is as provided in your written agreement with
14 | #  Cloudera.  Used apart from the collective work, this file is
15 | #  licensed for your use pursuant to the open source license
16 | #  identified above.
17 | #
18 | #  This code is provided to you pursuant a written agreement with
19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | #  this code. If you do not have a written agreement with Cloudera nor
21 | #  with an authorized and properly licensed third party, you do not
22 | #  have any rights to access nor to use this code.
23 | #
24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | #  DATA.
38 | #
39 | # ###########################################################################
40 | 
# IPython-style shell escapes (`!`): install the pinned requirements, then
# install this project itself in editable mode.
!pip3  install -r requirements.txt
!pip3 install -e .
43 | 


--------------------------------------------------------------------------------
/img/app.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/img/app.png


--------------------------------------------------------------------------------
/img/diagnostic-chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/img/diagnostic-chart.png


--------------------------------------------------------------------------------
/img/diagnostic-metrics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/img/diagnostic-metrics.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fbprophet==0.6
2 | matplotlib==2.0.0
3 | numpy==1.19.1
4 | pandas==1.1.0
5 | plotly==4.9.0
6 | requests==2.22.0
7 | statsmodels==0.12.0
8 | streamlit==0.66.0


--------------------------------------------------------------------------------
/scripts/fit_baseline_model.py:
--------------------------------------------------------------------------------
 1 | # ###########################################################################
 2 | #
 3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
 4 | #  (C) Cloudera, Inc. 2020
 5 | #  All rights reserved.
 6 | #
 7 | #  Applicable Open Source License: Apache 2.0
 8 | #
 9 | #  NOTE: Cloudera open source products are modular software products
10 | #  made up of hundreds of individual components, each of which was
11 | #  individually copyrighted.  Each Cloudera open source product is a
12 | #  collective work under U.S. Copyright Law. Your license to use the
13 | #  collective work is as provided in your written agreement with
14 | #  Cloudera.  Used apart from the collective work, this file is
15 | #  licensed for your use pursuant to the open source license
16 | #  identified above.
17 | #
18 | #  This code is provided to you pursuant a written agreement with
19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | #  this code. If you do not have a written agreement with Cloudera nor
21 | #  with an authorized and properly licensed third party, you do not
22 | #  have any rights to access nor to use this code.
23 | #
24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | #  DATA.
38 | #
39 | # ###########################################################################
40 | 
41 | import os
42 | 
43 | import numpy as np
44 | 
45 | from sts.data.loader import load_california_electricity_demand
46 | from sts.models.baselines import year_ahead_hourly_forecast
47 | 
48 | 
# Load the data

df = load_california_electricity_demand()

# ## Baseline
# Reproduce observed values exactly 52 weeks prior as forecast.

baseline = (
    df
    .sort_values('ds')
    .assign(yhat=year_ahead_hourly_forecast)
)


# ## Write
# Write the forecast values to csv
DIR = 'data/forecasts/'

# exist_ok avoids a check-then-create race: the fit scripts can be launched
# in parallel workers, and two of them may try to create the directory at once.
os.makedirs(DIR, exist_ok=True)

baseline[['ds', 'yhat']].to_csv(DIR + 'baseline.csv', index=False)
71 | 


--------------------------------------------------------------------------------
/scripts/fit_complex_log_prophet_model.py:
--------------------------------------------------------------------------------
 1 | # ###########################################################################
 2 | #
 3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
 4 | #  (C) Cloudera, Inc. 2020
 5 | #  All rights reserved.
 6 | #
 7 | #  Applicable Open Source License: Apache 2.0
 8 | #
 9 | #  NOTE: Cloudera open source products are modular software products
10 | #  made up of hundreds of individual components, each of which was
11 | #  individually copyrighted.  Each Cloudera open source product is a
12 | #  collective work under U.S. Copyright Law. Your license to use the
13 | #  collective work is as provided in your written agreement with
14 | #  Cloudera.  Used apart from the collective work, this file is
15 | #  licensed for your use pursuant to the open source license
16 | #  identified above.
17 | #
18 | #  This code is provided to you pursuant a written agreement with
19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | #  this code. If you do not have a written agreement with Cloudera nor
21 | #  with an authorized and properly licensed third party, you do not
22 | #  have any rights to access nor to use this code.
23 | #
24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | #  DATA.
38 | #
39 | # ###########################################################################
40 | 
41 | import os
42 | 
43 | import numpy as np
44 | 
45 | from sts.data.loader import load_california_electricity_demand
46 | from sts.models.prophet import (
47 |     add_season_weekday_indicators,
48 |     seasonal_daily_prophet_model
49 | )
50 | 
51 | 
# Load the training data (through 2018)

df = load_california_electricity_demand(train_only=True)

# Log transform the target variable (one vectorised call on the whole column)
df['y'] = np.log(df['y'])


# ## Prophet (with more complicated seasonality)
# FB Prophet model, splitting intra-day seasonalities into four subgroups:
# - summer weekday
# - summer weekend
# - winter weekday
# - winter weekend

model = seasonal_daily_prophet_model(df)

# 8760 hourly periods = 365 days beyond the end of the training data
future = model.make_future_dataframe(periods=8760, freq='H')
seasonal_future = add_season_weekday_indicators(future)

forecast = model.predict(seasonal_future)

# Reverse the log transform on predictions
forecast['yhat'] = np.exp(forecast['yhat'])


# ## Write
# Write the forecast values to csv
DIR = 'data/forecasts/'

# exist_ok avoids a check-then-create race: the fit scripts can be launched
# in parallel workers, and two of them may try to create the directory at once.
os.makedirs(DIR, exist_ok=True)

forecast[['ds', 'yhat']].to_csv(DIR + 'prophet_complex_log.csv', index=False)
86 | 


--------------------------------------------------------------------------------
/scripts/fit_complex_prophet_model.py:
--------------------------------------------------------------------------------
 1 | # ###########################################################################
 2 | #
 3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
 4 | #  (C) Cloudera, Inc. 2020
 5 | #  All rights reserved.
 6 | #
 7 | #  Applicable Open Source License: Apache 2.0
 8 | #
 9 | #  NOTE: Cloudera open source products are modular software products
10 | #  made up of hundreds of individual components, each of which was
11 | #  individually copyrighted.  Each Cloudera open source product is a
12 | #  collective work under U.S. Copyright Law. Your license to use the
13 | #  collective work is as provided in your written agreement with
14 | #  Cloudera.  Used apart from the collective work, this file is
15 | #  licensed for your use pursuant to the open source license
16 | #  identified above.
17 | #
18 | #  This code is provided to you pursuant a written agreement with
19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | #  this code. If you do not have a written agreement with Cloudera nor
21 | #  with an authorized and properly licensed third party, you do not
22 | #  have any rights to access nor to use this code.
23 | #
24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | #  DATA.
38 | #
39 | # ###########################################################################
40 | 
41 | import os
42 | 
43 | import numpy as np
44 | 
45 | from sts.data.loader import load_california_electricity_demand
46 | from sts.models.prophet import (
47 |     add_season_weekday_indicators,
48 |     seasonal_daily_prophet_model
49 | )
50 | 
51 | 
# Load the training data (through 2018)

df = load_california_electricity_demand(train_only=True)


# ## Prophet (with more complicated seasonality)
# FB Prophet model, splitting intra-day seasonalities into four subgroups:
# - summer weekday
# - summer weekend
# - winter weekday
# - winter weekend

model = seasonal_daily_prophet_model(df)

# Extend the frame by 8760 hourly steps (24 * 365) and attach the
# season/weekday indicator columns before predicting.
future = model.make_future_dataframe(periods=8760, freq='H')
seasonal_future = add_season_weekday_indicators(future)

forecast = model.predict(seasonal_future)


# ## Write
# Write the forecast values to csv
DIR = 'data/forecasts/'

# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory between the check and the call.
os.makedirs(DIR, exist_ok=True)

forecast[['ds', 'yhat']].to_csv(
    os.path.join(DIR, 'prophet_complex.csv'),
    index=False
)
80 | 


--------------------------------------------------------------------------------
/scripts/fit_simple_prophet_model.py:
--------------------------------------------------------------------------------
 1 | # ###########################################################################
 2 | #
 3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
 4 | #  (C) Cloudera, Inc. 2020
 5 | #  All rights reserved.
 6 | #
 7 | #  Applicable Open Source License: Apache 2.0
 8 | #
 9 | #  NOTE: Cloudera open source products are modular software products
10 | #  made up of hundreds of individual components, each of which was
11 | #  individually copyrighted.  Each Cloudera open source product is a
12 | #  collective work under U.S. Copyright Law. Your license to use the
13 | #  collective work is as provided in your written agreement with
14 | #  Cloudera.  Used apart from the collective work, this file is
15 | #  licensed for your use pursuant to the open source license
16 | #  identified above.
17 | #
18 | #  This code is provided to you pursuant a written agreement with
19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | #  this code. If you do not have a written agreement with Cloudera nor
21 | #  with an authorized and properly licensed third party, you do not
22 | #  have any rights to access nor to use this code.
23 | #
24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | #  DATA.
38 | #
39 | # ###########################################################################
40 | 
41 | import os
42 | 
43 | import numpy as np
44 | 
45 | from sts.data.loader import load_california_electricity_demand
46 | from sts.models.prophet import default_prophet_model
47 | 
48 | 
# Load the training data (through 2018)

df = load_california_electricity_demand(train_only=True)


# ## Prophet (Default)
# FB Prophet model, all default parameters.

model = default_prophet_model(df)

# Extend the frame by 8760 hourly steps (24 * 365) and predict over it.
future = model.make_future_dataframe(periods=8760, freq='H')
forecast = model.predict(future)


# ## Write
# Write the forecast values to csv
DIR = 'data/forecasts/'

# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory between the check and the call.
os.makedirs(DIR, exist_ok=True)

forecast[['ds', 'yhat']].to_csv(
    os.path.join(DIR, 'prophet_simple.csv'),
    index=False
)
71 | 


--------------------------------------------------------------------------------
/scripts/get_csv.py:
--------------------------------------------------------------------------------
 1 | # ###########################################################################
 2 | #
 3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
 4 | #  (C) Cloudera, Inc. 2020
 5 | #  All rights reserved.
 6 | #
 7 | #  Applicable Open Source License: Apache 2.0
 8 | #
 9 | #  NOTE: Cloudera open source products are modular software products
10 | #  made up of hundreds of individual components, each of which was
11 | #  individually copyrighted.  Each Cloudera open source product is a
12 | #  collective work under U.S. Copyright Law. Your license to use the
13 | #  collective work is as provided in your written agreement with
14 | #  Cloudera.  Used apart from the collective work, this file is
15 | #  licensed for your use pursuant to the open source license
16 | #  identified above.
17 | #
18 | #  This code is provided to you pursuant a written agreement with
19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | #  this code. If you do not have a written agreement with Cloudera nor
21 | #  with an authorized and properly licensed third party, you do not
22 | #  have any rights to access nor to use this code.
23 | #
24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | #  DATA.
38 | #
39 | # ###########################################################################
40 | 
41 | from sts.data.loader import load_california_electricity_demand
42 | 
# Read the demand data from the local json file (the loader downloads it
# first when the file is missing), then export the resulting frame as csv.
demand = load_california_electricity_demand(filepath='data/demand.json')
demand.to_csv('data/demand.csv')
46 | 


--------------------------------------------------------------------------------
/scripts/make_forecast.py:
--------------------------------------------------------------------------------
 1 | # ###########################################################################
 2 | #
 3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
 4 | #  (C) Cloudera, Inc. 2020
 5 | #  All rights reserved.
 6 | #
 7 | #  Applicable Open Source License: Apache 2.0
 8 | #
 9 | #  NOTE: Cloudera open source products are modular software products
10 | #  made up of hundreds of individual components, each of which was
11 | #  individually copyrighted.  Each Cloudera open source product is a
12 | #  collective work under U.S. Copyright Law. Your license to use the
13 | #  collective work is as provided in your written agreement with
14 | #  Cloudera.  Used apart from the collective work, this file is
15 | #  licensed for your use pursuant to the open source license
16 | #  identified above.
17 | #
18 | #  This code is provided to you pursuant a written agreement with
19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | #  this code. If you do not have a written agreement with Cloudera nor
21 | #  with an authorized and properly licensed third party, you do not
22 | #  have any rights to access nor to use this code.
23 | #
24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | #  DATA.
38 | #
39 | # ###########################################################################
40 | 
41 | import datetime
42 | 
43 | import numpy as np
44 | import pandas as pd
45 | 
46 | from sts.models.prophet import (
47 |     add_season_weekday_indicators,
48 |     seasonal_daily_prophet_model
49 | )
50 | from sts.data.loader import load_california_electricity_demand
51 | 
52 | 
# Load all available data for training

df = load_california_electricity_demand()

# Take log transform for fully multiplicative model
df['y'] = np.log(df['y'])


# Fit best current model

model = seasonal_daily_prophet_model(df)


# Make predictions for one year ahead of most recent training data

future = add_season_weekday_indicators(
    model.make_future_dataframe(periods=24*365, freq='H')
)

# Only the posterior predictive samples are written out, so the point
# forecast (model.predict) that was previously computed and then discarded
# is no longer calculated.
samples = model.predictive_samples(future)

# Reverse log transform
predictions = np.exp(samples['yhat'])

# Join the samples onto the timestamps by row position, drop the indicator
# columns that were only needed as model inputs, and keep rows dated today
# or later.
# NOTE(review): assumes samples['yhat'] has one row per row of `future`
# -- confirm against the Prophet version in use.
prediction_df = (
    future
    .merge(pd.DataFrame(predictions), left_index=True, right_index=True)
    .drop(['winter_weekday', 'winter_weekend', 'summer_weekday', 'summer_weekend'],
          axis='columns')
    [future.ds.dt.date >= datetime.date.today()]
)


# Save predictions

prediction_df.to_csv('data/forecast.csv', index=False)
91 | 


--------------------------------------------------------------------------------
/scripts/validation_metrics.py:
--------------------------------------------------------------------------------
  1 | # ###########################################################################
  2 | #
  3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
  4 | #  (C) Cloudera, Inc. 2020
  5 | #  All rights reserved.
  6 | #
  7 | #  Applicable Open Source License: Apache 2.0
  8 | #
  9 | #  NOTE: Cloudera open source products are modular software products
 10 | #  made up of hundreds of individual components, each of which was
 11 | #  individually copyrighted.  Each Cloudera open source product is a
 12 | #  collective work under U.S. Copyright Law. Your license to use the
 13 | #  collective work is as provided in your written agreement with
 14 | #  Cloudera.  Used apart from the collective work, this file is
 15 | #  licensed for your use pursuant to the open source license
 16 | #  identified above.
 17 | #
 18 | #  This code is provided to you pursuant a written agreement with
 19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
 20 | #  this code. If you do not have a written agreement with Cloudera nor
 21 | #  with an authorized and properly licensed third party, you do not
 22 | #  have any rights to access nor to use this code.
 23 | #
 24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
 25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
 26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
 27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
 28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
 29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
 30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
 31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
 32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
 33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
 34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
 35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
 36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
 37 | #  DATA.
 38 | #
 39 | # ###########################################################################
 40 | 
 41 | import datetime
 42 | 
 43 | import numpy as np
 44 | import pandas as pd
 45 | 
 46 | from sts.models.baselines import year_ahead_hourly_forecast
 47 | from sts.models.prophet import (
 48 |     add_season_weekday_indicators,
 49 |     seasonal_daily_prophet_model
 50 | )
 51 | from sts.data.loader import load_california_electricity_demand
 52 | 
 53 | 
 54 | # Load all available data for training
 55 | 
 56 | df = load_california_electricity_demand()
 57 | 
 58 | # Restrict to pre-2020 for evaluation on 2020
 59 | train_df = df[df.ds.dt.year < 2020]
 60 | 
 61 | # Take log transform for fully multiplicative model
 62 | train_df['y'] = train_df.y.apply(np.log)
 63 | 
 64 | 
 65 | # Fit best current model
 66 | 
 67 | model = seasonal_daily_prophet_model(train_df)
 68 | 
 69 | 
 70 | # Make predictions for one year ahead of most recent training data
 71 | 
 72 | future = add_season_weekday_indicators(
 73 |     model.make_future_dataframe(periods=24*365, freq='H')
 74 | )
 75 | 
 76 | forecast = model.predict(future)
 77 | 
 78 | # Reverse log transform
 79 | forecast['yhat'] = np.exp(forecast['yhat'])
 80 | train_df['y'] = np.exp(train_df['y'])
 81 | 
 82 | predictions = (
 83 |     forecast[['ds', 'yhat']]
 84 |     .merge(df, on='ds')
 85 | )
 86 | predictions = predictions[predictions.ds.dt.year == 2020]
 87 | 
 88 | # ### MAPE
 89 | mape = (np.abs(predictions.y - predictions.yhat) / predictions.y).mean()
 90 | 
 91 | # Let's compare this to the MAPE of the seasonal naive model
 92 | naive_df = df.copy()
 93 | naive_df['yhat'] = year_ahead_hourly_forecast(naive_df)
 94 | naive_df = naive_df[naive_df.ds.dt.year == 2020]
 95 | naive_mape = (np.abs(naive_df.yhat - naive_df.y) / naive_df.y).mean()
 96 | 
 97 | # ### MASE
 98 | # Note, we have trained on a larger data set than we did for model selection.
 99 | # As such, this MASE cannot be compared to the MASEs listed in the diagnostic
100 | # app. It's a measure of performance relative to the baseline on the new
101 | # training set of all data before 2020.
102 | # (The deep reason here is that time series are non-iid, and as such, we
103 | # must make train/dev/validation splits along choronological lines.
104 | # An unfortunate artefact of this is never having the metrics for the exact
105 | # model we deploy.)
106 | 
107 | naive_forecast = year_ahead_hourly_forecast(train_df)
108 | denom = (
109 |     np.sum(np.abs((naive_forecast - train_df.y).dropna()))
110 |     / len(naive_forecast.dropna())
111 | )
112 | mase = (np.abs(predictions.y - predictions.yhat) / denom).mean()
113 | 
114 | print(f"The MAPE of our best performing model is: {mape}")
115 | print(f"The MAPE of the seasonal naive baseline: {naive_mape}")
116 | print(f"The MASE of the best performing model is: {mase}")
117 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # ###########################################################################
 2 | #
 3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
 4 | #  (C) Cloudera, Inc. 2020
 5 | #  All rights reserved.
 6 | #
 7 | #  Applicable Open Source License: Apache 2.0
 8 | #
 9 | #  NOTE: Cloudera open source products are modular software products
10 | #  made up of hundreds of individual components, each of which was
11 | #  individually copyrighted.  Each Cloudera open source product is a
12 | #  collective work under U.S. Copyright Law. Your license to use the
13 | #  collective work is as provided in your written agreement with
14 | #  Cloudera.  Used apart from the collective work, this file is
15 | #  licensed for your use pursuant to the open source license
16 | #  identified above.
17 | #
18 | #  This code is provided to you pursuant a written agreement with
19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
20 | #  this code. If you do not have a written agreement with Cloudera nor
21 | #  with an authorized and properly licensed third party, you do not
22 | #  have any rights to access nor to use this code.
23 | #
24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
37 | #  DATA.
38 | #
39 | # ###########################################################################
40 | 
41 | from setuptools import setup
42 | 
# Package metadata for the `sts` utilities.
setup(
    name='sts',
    version='0.0.1',
    # The summary is kept on a single line; the previous multi-line string
    # began with a newline and indentation.
    description=(
        'Utilities for structural time series modelling of '
        'California electricity demand data.'
    ),
    author='Chris J. Wallace',
    # List the subpackages explicitly: with only 'sts' named, a regular
    # (non-editable) install leaves out sts.data and sts.models.
    packages=['sts', 'sts.data', 'sts.models']
)
53 | 


--------------------------------------------------------------------------------
/sts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/sts/__init__.py


--------------------------------------------------------------------------------
/sts/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/sts/data/__init__.py


--------------------------------------------------------------------------------
/sts/data/loader.py:
--------------------------------------------------------------------------------
  1 | # ###########################################################################
  2 | #
  3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
  4 | #  (C) Cloudera, Inc. 2020
  5 | #  All rights reserved.
  6 | #
  7 | #  Applicable Open Source License: Apache 2.0
  8 | #
  9 | #  NOTE: Cloudera open source products are modular software products
 10 | #  made up of hundreds of individual components, each of which was
 11 | #  individually copyrighted.  Each Cloudera open source product is a
 12 | #  collective work under U.S. Copyright Law. Your license to use the
 13 | #  collective work is as provided in your written agreement with
 14 | #  Cloudera.  Used apart from the collective work, this file is
 15 | #  licensed for your use pursuant to the open source license
 16 | #  identified above.
 17 | #
 18 | #  This code is provided to you pursuant a written agreement with
 19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
 20 | #  this code. If you do not have a written agreement with Cloudera nor
 21 | #  with an authorized and properly licensed third party, you do not
 22 | #  have any rights to access nor to use this code.
 23 | #
 24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
 25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
 26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
 27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
 28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
 29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
 30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
 31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
 32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
 33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
 34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
 35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
 36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
 37 | #  DATA.
 38 | #
 39 | # ###########################################################################
 40 | 
 41 | import os
 42 | import json
 43 | import requests
 44 | 
 45 | import pandas as pd
 46 | 
 47 | 
 48 | def load_california_electricity_demand(
 49 |         filepath='data/demand.json',
 50 |         api_key_env='EIA_API_KEY',
 51 |         train_only=False):
 52 | 
 53 |     data = read_or_download_data(filepath, api_key_env)
 54 | 
 55 |     df = (
 56 |         json_to_df(data)
 57 |         .rename(columns={0: 'ds', 1: 'y'})
 58 |         .assign(ds=utc_to_pst)
 59 |         .assign(ds=lambda df: df.ds.dt.tz_localize(None))
 60 |         .sort_values('ds')
 61 |     )
 62 | 
 63 |     if train_only:
 64 |         df = remove_2019_and_later(df)
 65 | 
 66 |     return df
 67 | 
 68 | 
 69 | def read_or_download_data(filepath, api_key_env):
 70 | 
 71 |     if os.path.exists(filepath):
 72 |         data = read_json(filepath)
 73 |     else:
 74 |         api_key = try_get_env(api_key_env)
 75 |         response_json = fetch_california_demand(api_key)
 76 |         write_json(response_json, filepath)
 77 |         data = read_json(filepath)
 78 | 
 79 |     return data
 80 | 
 81 | 
 82 | def read_json(file):
 83 |     with open(file) as f:
 84 |         data = json.load(f)
 85 |     return data
 86 | 
 87 | 
 88 | def write_json(data, filepath):
 89 |     with open(filepath, 'w') as file:
 90 |         json.dump(data, file)
 91 | 
 92 | 
 93 | def try_get_env(api_key_env):
 94 |     env = os.getenv(api_key_env)
 95 |     if env:
 96 |         return env
 97 |     else:
 98 |         print('Please provide a valid EIA_API_KEY environment variable.')
 99 |         return None
100 | 
101 | 
def fetch_california_demand(api_key, timeout=60):
    """
    Request the 'EBA.CAL-ALL.D.H' series from the EIA API and return the
    decoded json response.

    `api_key` is sent as the `api_key` query parameter. `timeout` (seconds)
    is passed to `requests.get`; without it, the request could block
    indefinitely if the server stops responding.
    """
    r = requests.get(
        'http://api.eia.gov/series',
        params={
            'api_key': api_key,
            'series_id': 'EBA.CAL-ALL.D.H',
            'out': 'json'
        },
        timeout=timeout
    )
    return r.json()
112 | 
113 | 
def json_to_df(data):
    """
    Build a DataFrame from the `data` entry of the first element of
    `data['series']`.
    """
    first_series = data['series'][0]
    return pd.DataFrame(first_series['data'])
117 | 
118 | 
def utc_to_pst(df):
    """
    Return the `ds` column of `df` parsed as datetimes and moved from UTC
    to PST.

    PST is a fixed offset of UTC -8 hours: daylight savings is ignored.
    """
    timestamps_utc = pd.to_datetime(df['ds'])
    offset = pd.Timedelta('8 hours')
    return timestamps_utc - offset
130 | 
131 | 
132 | def remove_2019_and_later(df):
133 |     return df[df['ds'] < '2019']
134 | 


--------------------------------------------------------------------------------
/sts/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/sts/models/__init__.py


--------------------------------------------------------------------------------
/sts/models/baselines.py:
--------------------------------------------------------------------------------
  1 | # ###########################################################################
  2 | #
  3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
  4 | #  (C) Cloudera, Inc. 2020
  5 | #  All rights reserved.
  6 | #
  7 | #  Applicable Open Source License: Apache 2.0
  8 | #
  9 | #  NOTE: Cloudera open source products are modular software products
 10 | #  made up of hundreds of individual components, each of which was
 11 | #  individually copyrighted.  Each Cloudera open source product is a
 12 | #  collective work under U.S. Copyright Law. Your license to use the
 13 | #  collective work is as provided in your written agreement with
 14 | #  Cloudera.  Used apart from the collective work, this file is
 15 | #  licensed for your use pursuant to the open source license
 16 | #  identified above.
 17 | #
 18 | #  This code is provided to you pursuant a written agreement with
 19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
 20 | #  this code. If you do not have a written agreement with Cloudera nor
 21 | #  with an authorized and properly licensed third party, you do not
 22 | #  have any rights to access nor to use this code.
 23 | #
 24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
 25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
 26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
 27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
 28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
 29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
 30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
 31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
 32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
 33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
 34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
 35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
 36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
 37 | #  DATA.
 38 | #
 39 | # ###########################################################################
 40 | 
# Calendar constants used to convert day/week horizons into a number of
# rows to shift by in the n-step-ahead baseline forecasts below.
NUM_HOURS_IN_DAY = 24
NUM_DAYS_IN_WEEK = 7
 43 | 
 44 | 
 45 | # Define some baseline forecasts
 46 | 
 47 | # n-step ahead
 48 | 
 49 | # n-step hourly
 50 | 
 51 | 
 52 | def hour_ahead_hourly_forecast(df):
 53 |     return df.shift(periods=1).y
 54 | 
 55 | 
 56 | def day_ahead_hourly_forecast(df):
 57 |     return df.shift(periods=24).y
 58 | 
 59 | 
 60 | def week_ahead_hourly_forecast(df):
 61 |     return df.shift(periods=NUM_HOURS_IN_DAY*NUM_DAYS_IN_WEEK).y
 62 | 
 63 | 
 64 | def month_ahead_hourly_forecast(df):
 65 |     """One month is exactly four weeks"""
 66 |     return df.shift(periods=4*NUM_HOURS_IN_DAY*NUM_DAYS_IN_WEEK).y
 67 | 
 68 | 
 69 | def year_ahead_hourly_forecast(df):
 70 |     """One year is exactly 52 weeks"""
 71 |     return df.shift(periods=52*NUM_HOURS_IN_DAY*NUM_DAYS_IN_WEEK).y
 72 | 
 73 | 
 74 | # n-step daily
 75 | 
 76 | 
 77 | def day_ahead_daily_forecast(df):
 78 |     return df.shift(periods=1).y
 79 | 
 80 | 
 81 | def week_ahead_daily_forecast(df):
 82 |     return df.shift(periods=NUM_DAYS_IN_WEEK).y
 83 | 
 84 | 
 85 | def month_ahead_daily_forecast(df):
 86 |     """One month is exactly four weeks"""
 87 |     return df.shift(periods=4*NUM_DAYS_IN_WEEK).y
 88 | 
 89 | 
 90 | def year_ahead_daily_forecast(df):
 91 |     """One year is exactly 52 weeks"""
 92 |     return df.shift(periods=52*NUM_DAYS_IN_WEEK).y
 93 | 
 94 | 
 95 | # Collect baseline forecasts
 96 | 
 97 | 
 98 | def global_mean_forecast(df):
 99 |     return df.y.mean()
100 | 
101 | 
def hourly_forecasts(df):
    """
    Return a copy of `df` with one added column per hourly baseline.

    Each column is named after the baseline function that computes it.
    """
    baselines = {
        'hour_ahead_hourly_forecast': hour_ahead_hourly_forecast,
        'day_ahead_hourly_forecast': day_ahead_hourly_forecast,
        'week_ahead_hourly_forecast': week_ahead_hourly_forecast,
        'month_ahead_hourly_forecast': month_ahead_hourly_forecast,
        'year_ahead_hourly_forecast': year_ahead_hourly_forecast,
    }
    # assign() calls each function with the frame and stores the result
    # under the matching key.
    return df.assign(**baselines)
111 | 
112 | 
def daily_forecasts(df):
    """
    Return daily totals of `df` with one added column per daily baseline.

    `df` is resampled to one row per day by summing, and each baseline
    column is named after the function that computes it.

    The frame is now taken as a parameter, matching `hourly_forecasts`;
    previously the body read an undefined name `df` and raised NameError.

    NOTE: `resample` is called without `on=`, so it operates on the index
    of `df`, which therefore needs to be datetime-like.
    """
    forecasts = df.resample('1D').sum().assign(
        day_ahead_daily_forecast=day_ahead_daily_forecast,
        week_ahead_daily_forecast=week_ahead_daily_forecast,
        month_ahead_daily_forecast=month_ahead_daily_forecast,
        year_ahead_daily_forecast=year_ahead_daily_forecast
    )
    return forecasts
121 | 


--------------------------------------------------------------------------------
/sts/models/prophet.py:
--------------------------------------------------------------------------------
  1 | # ###########################################################################
  2 | #
  3 | #  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
  4 | #  (C) Cloudera, Inc. 2020
  5 | #  All rights reserved.
  6 | #
  7 | #  Applicable Open Source License: Apache 2.0
  8 | #
  9 | #  NOTE: Cloudera open source products are modular software products
 10 | #  made up of hundreds of individual components, each of which was
 11 | #  individually copyrighted.  Each Cloudera open source product is a
 12 | #  collective work under U.S. Copyright Law. Your license to use the
 13 | #  collective work is as provided in your written agreement with
 14 | #  Cloudera.  Used apart from the collective work, this file is
 15 | #  licensed for your use pursuant to the open source license
 16 | #  identified above.
 17 | #
 18 | #  This code is provided to you pursuant a written agreement with
 19 | #  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
 20 | #  this code. If you do not have a written agreement with Cloudera nor
 21 | #  with an authorized and properly licensed third party, you do not
 22 | #  have any rights to access nor to use this code.
 23 | #
 24 | #  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
 25 | #  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
 26 | #  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
 27 | #  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
 28 | #  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
 29 | #  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
 30 | #  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
 31 | #  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
 32 | #  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
 33 | #  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
 34 | #  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
 35 | #  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
 36 | #  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
 37 | #  DATA.
 38 | #
 39 | # ###########################################################################
 40 | 
 41 | from fbprophet import Prophet
 42 | 
 43 | 
 44 | def default_prophet_model(df):
 45 |     model = Prophet()
 46 |     model.fit(df)
 47 |     return model
 48 | 
 49 | 
 50 | def multiplicative_prophet_model(df):
 51 |     model = Prophet(seasonality_mode='multiplicative')
 52 |     model.fit(df)
 53 |     return model
 54 | 
 55 | 
 56 | def seasonal_daily_prophet_model(df):
 57 |     model = Prophet(
 58 |         daily_seasonality=False,
 59 |         yearly_seasonality=20,
 60 |         changepoint_prior_scale=0.001
 61 |     )
 62 |     model.add_seasonality(
 63 |         name='winter_weekday',
 64 |         period=1,
 65 |         fourier_order=12,
 66 |         condition_name='winter_weekday'
 67 |     )
 68 |     model.add_seasonality(
 69 |         name='winter_weekend',
 70 |         period=1,
 71 |         fourier_order=12,
 72 |         condition_name='winter_weekend'
 73 |     )
 74 |     model.add_seasonality(
 75 |         name='summer_weekday',
 76 |         period=1,
 77 |         fourier_order=12,
 78 |         condition_name='summer_weekday'
 79 |     )
 80 |     model.add_seasonality(
 81 |         name='summer_weekend',
 82 |         period=1,
 83 |         fourier_order=12,
 84 |         condition_name='summer_weekend'
 85 |     )
 86 |     model.add_country_holidays(country_name='US')
 87 |     df = add_season_weekday_indicators(df)
 88 |     model.fit(df)
 89 |     return model
 90 | 
 91 | 
 92 | def add_season_weekday_indicators(df):
 93 |     df['winter_weekday'] = df['ds'].apply(is_winter_weekday)
 94 |     df['winter_weekend'] = df['ds'].apply(is_winter_weekend)
 95 |     df['summer_weekday'] = df['ds'].apply(is_summer_weekday)
 96 |     df['summer_weekend'] = df['ds'].apply(is_summer_weekend)
 97 |     return df
 98 | 
 99 | 
def is_winter_weekday(ds):
    """Return True when `ds` is a Monday-Friday in October through May.

    `ds` must expose `.month` and `.day_name()` (e.g. a pandas Timestamp).
    """
    # June-September is the complement of the winter months.
    if 6 <= ds.month < 10:
        return False
    return ds.day_name() not in ['Saturday', 'Sunday']
106 | 
107 | 
def is_winter_weekend(ds):
    """Return True when `ds` is a Saturday or Sunday in October through May.

    `ds` must expose `.month` and `.day_name()` (e.g. a pandas Timestamp).
    """
    # June-September is the complement of the winter months.
    if 6 <= ds.month < 10:
        return False
    return ds.day_name() in ['Saturday', 'Sunday']
114 | 
115 | 
def is_summer_weekday(ds):
    """Return True when `ds` is a Monday-Friday in June through September.

    The previous season test, `month >= 6 or month < 10`, holds for every
    month, so the function behaved as a plain "is weekday" check and
    overlapped with `is_winter_weekday`. Summer is now the exact complement
    of the winter months (October-May).

    Args:
        ds: date-like object exposing `.month` and `.day_name()`, e.g. a
            pandas Timestamp.

    Returns:
        bool: True for summer weekdays, False otherwise.
    """
    if not 6 <= ds.month < 10:
        return False
    return ds.day_name() not in ['Saturday', 'Sunday']
122 | 
123 | 
def is_summer_weekend(ds):
    """Return True when `ds` is a Saturday or Sunday in June through September.

    The previous season test, `month >= 6 or month < 10`, holds for every
    month, so the function behaved as a plain "is weekend" check and
    overlapped with `is_winter_weekend`. Summer is now the exact complement
    of the winter months (October-May).

    Args:
        ds: date-like object exposing `.month` and `.day_name()`, e.g. a
            pandas Timestamp.

    Returns:
        bool: True for summer weekend days, False otherwise.
    """
    if not 6 <= ds.month < 10:
        return False
    return ds.day_name() in ['Saturday', 'Sunday']
130 | 


--------------------------------------------------------------------------------