├── .gitignore ├── .project-metadata.yaml ├── LICENSE.txt ├── README.md ├── apps ├── diagnostics.py ├── forecast.py ├── launch_diagnostics.py └── launch_forecast.py ├── cml ├── fit_models_parallel.py └── install_dependencies.py ├── data └── demand.json ├── img ├── app.png ├── diagnostic-chart.png └── diagnostic-metrics.png ├── requirements.txt ├── scripts ├── fit_baseline_model.py ├── fit_complex_log_prophet_model.py ├── fit_complex_prophet_model.py ├── fit_simple_prophet_model.py ├── get_csv.py ├── make_forecast.py └── validation_metrics.py ├── setup.py └── sts ├── __init__.py ├── data ├── __init__.py └── loader.py └── models ├── __init__.py ├── baselines.py └── prophet.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | # Project specific 141 | R 142 | node_modules 143 | *.pyc 144 | __pycache__ 145 | .* 146 | !.gitignore 147 | -------------------------------------------------------------------------------- /.project-metadata.yaml: -------------------------------------------------------------------------------- 1 | name: Structural Time Series 2 | description: California electricity demand forecasting with Prophet. 3 | author: Cloudera Inc. 4 | specification_version: 1.0 5 | prototype_version: 1.0 6 | date: "2020-10-14" 7 | api_version: 1 8 | 9 | environment_variables: 10 | EIA_API_KEY: 11 | default: "EIA KEY" 12 | description: "Optional EIA open data API key" 13 | prompt_user: false 14 | 15 | tasks: 16 | - type: create_job 17 | name: Install Dependencies 18 | entity_label: install_dependencies 19 | script: cml/install_dependencies.py 20 | arguments: None 21 | cpu: 2 22 | memory: 4 23 | short_summary: Create job to install project dependencies. 24 | environment: 25 | TASK_TYPE: CREATE/RUN_JOB 26 | kernel: python3 27 | 28 | - type: run_job 29 | entity_label: install_dependencies 30 | short_summary: Running install dependencies job. 31 | long_summary: Running the job to install dependencies. 
Note that this requires at least 4GB of memory 32 | 33 | - type: create_job 34 | name: Launch Parallel Model Fitting 35 | entity_label: fit_models_parallel 36 | script: cml/fit_models_parallel.py 37 | arguments: None 38 | short_summary: Create job to launch parallel training script execution. 39 | long_summary: Creates job to launch independent training workloads for each forecast script in the /scripts directory. 40 | cpu: 1 41 | memory: 2 42 | environment: 43 | TASK_TYPE: CREATE/RUN_JOB 44 | kernel: python3 45 | 46 | - type: run_job 47 | entity_label: fit_models_parallel 48 | short_summary: Running job to train forecasts in parallel. 49 | long_summary: Running job to train forecasts in parallel via CDSW Workers API 50 | 51 | - type: start_application 52 | name: Diagnostic App 53 | subdomain: diagnostics 54 | script: apps/launch_diagnostics.py 55 | short_summary: Starting forecast diagnostics application 56 | environment_variables: 57 | TASK_TYPE: START_APPLICATION 58 | kernel: python3 59 | 60 | - type: start_application 61 | name: Forecast App 62 | subdomain: forecast 63 | script: apps/launch_forecast.py 64 | short_summary: Starting primary forecast application 65 | environment_variables: 66 | TASK_TYPE: START_APPLICATION 67 | kernel: python3 68 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Structural Time Series 2 | 3 | This repo accompanies the Cloudera Fast Forward report [Structural Time Series](https://structural-time-series.fastforwardlabs.com/). 
 4 | It provides an example application of generalized additive models (via the [Prophet](https://facebook.github.io/prophet/) library) to California hourly electricity demand data. 5 | 6 | The primary output of this repository is a small application exposing a probabilistic forecast and an interface for asking a probabilistic question against it. 7 | The final app looks like this. 8 | 9 | Forecasting app interface 10 | 11 | Instructions are given both for general use (on a laptop, say), and for Cloudera CML and CDSW. 12 | We'll first describe what's here, then go through how to run everything. 13 | 14 | ## Structure 15 | 16 | The folder structure of the repo is as follows 17 | 18 | ``` 19 | . 20 | ├── apps # Two small Streamlit applications. 21 | ├── cml # This folder contains scripts that facilitate the project launch on CML. 22 | ├── data # This folder contains starter data, and is where forecasts will live. 23 | ├── scripts # This is where all the code that does something lives. 24 | └── sts # A small library of useful functions. 25 | ``` 26 | 27 | There's also an `img` folder that contains images for this README. 28 | That folder is unimportant and you can ignore it. 29 | Let's examine each of the important folders in turn. 30 | 31 | ### `sts` 32 | 33 | This is a small Python library of utility functions useful to our problem. 34 | Its structure is as follows: 35 | 36 | ``` 37 | sts 38 | ├── data 39 | │ └── loader.py 40 | └── models 41 | ├── baselines.py 42 | └── prophet.py 43 | ``` 44 | 45 | Building a small library of problem-specific abstractions allows us to reuse them in multiple places. 46 | The code in `data/loader.py` is reused in most of the scripts and applications. 47 | In this case, we have encapsulated model details (such as the number of Fourier terms to include in a given Prophet model) in the library. 48 | It would be trivial to pass these through as arguments though, if we wanted to perform an extensive hyperparameter search for example. 
49 | 50 | ### `scripts` 51 | 52 | These imperative scripts are where the _work_ of the analysis is done. 53 | Side-effectful actions such as I/O and model training occur in these scripts. 54 | 55 | ``` 56 | scripts 57 | ├── fit_baseline_model.py 58 | ├── fit_simple_prophet_model.py 59 | ├── fit_complex_prophet_model.py 60 | ├── fit_complex_log_prophet_model.py 61 | ├── get_csv.py 62 | ├── make_forecast.py 63 | └── validation_metrics.py 64 | ``` 65 | 66 | ### `apps` 67 | 68 | Two applications accompany this project. 69 | Each has a launcher script to assist launching an [Application](https://docs.cloudera.com/machine-learning/cloud/applications/topics/ml-applications.html) with CDSW/CML. 70 | To launch the applications in another environment, run the code inside the launcher files, with the prefixed `!` removed. 71 | You may need to specify different ports. 72 | 73 | ``` 74 | apps 75 | ├── diagnostics.py # A model comparison and debugging assistant. 76 | ├── forecast.py # The primary forecasting interface. 77 | ├── launch_diagnostics.py # Launcher script for CDSW/CML 78 | └── launch_forecast.py # Launcher script for CDSW/CML 79 | ``` 80 | 81 | #### Diagnostics 82 | 83 | The diagnostic application serves two purposes. 84 | First, it computes and reports top level metrics for any forecasts saved in the `data/forecasts` directory. 85 | 86 | Diagnostic app showing model metrics 87 | 88 | Second, it provides a few diagnostic charts, including a zoomable forecast. 89 | 90 | Diagnostic app showing chart of forecast 91 | 92 | #### Forecast 93 | 94 | The primary forecast application (pictured at the top of this README) is a prototype user interface for the forecast this analysis generates. 95 | 96 | ### `cml` 97 | 98 | These scripts serve as launch instructions to facilitate the automated project setup on CML. 99 | Each script is triggered by the declarative pipeline as defined in the `.project-metadata.yaml` file found in the project's root directory. 
100 | 101 | ``` 102 | cml 103 | ├── install_dependencies.py 104 | └── fit_models_parallel.py 105 | ``` 106 | 107 | ## Running through the analysis 108 | 109 | To go from a fresh clone of the repo to the final state, follow these instructions in order. 110 | 111 | ### Installation 112 | 113 | The code and applications within were developed against Python 3.6.9, and are likely also to function with more recent versions of Python. 114 | 115 | To install dependencies, first create and activate a new virtual environment through your preferred means, then pip install from the requirements file. I recommend: 116 | 117 | ```python 118 | python3 -m venv .venv 119 | source .venv/bin/activate 120 | pip install -r requirements.txt 121 | ``` 122 | 123 | In CML or CDSW, no virtual env is necessary. Instead, inside a Python 3 session (with at least 2 vCPU / 4 GiB Memory), simply run 124 | 125 | ```python 126 | !pip3 install -r requirements.txt # notice `pip3`, not `pip` 127 | ``` 128 | 129 | Next, install the `sts` module from this repository, with 130 | 131 | ```python 132 | pip3 install -e . 133 | ``` 134 | 135 | from inside the root directory of this repo. 136 | 137 | ### Data 138 | 139 | We use historic California electricity demand data from the [US Energy Information Administration](https://www.eia.gov/opendata/qb.php?category=3389936&sdid=EBA.CAL-ALL.D.H). 140 | 141 | A full set of data through October 12th 2020 is included as a starter. 142 | More recent data can be fetched from the [EIA open data API](https://www.eia.gov/opendata/). 143 | Doing so requires an API key, which must be set as the `EIA_API_KEY` environment variable for this project. 144 | To fetch new data, simply call the `load_california_electricity_demand` function from the `sts.data.loader` module. 145 | The code is set up to work directly with the json response to the EIA API. 146 | By default, each time new data is fetched, it will overwrite the existing data. 
 147 | Similarly, when a new forecast is made, it will overwrite the existing forecast. 148 | It would not be hard to adapt the code to maintain a history of fetched data or forecasts if desired. 149 | 150 | ### Scripts 151 | 152 | To fit models and generate forecasts, we call each script in turn from the `scripts` directory. 153 | 154 | ```bash 155 | python3 scripts/fit_baseline_model.py 156 | python3 scripts/fit_simple_prophet_model.py 157 | python3 scripts/fit_complex_prophet_model.py 158 | python3 scripts/fit_complex_log_prophet_model.py 159 | ``` 160 | 161 | This will fit a series of models of increasing complexity and write their outputs (the mean forecast) to the `data/forecasts` directory. 162 | Launching the diagnostic app will show the metrics and diagnostic charts for each model. 163 | 164 | The most complex model wins. 165 | We can view its metrics when trained on the validation data (through 2019) by running the `scripts/validation_metrics.py` script. 166 | We can then generate 1000 samples from the model trained on all available training data with the `scripts/make_forecast.py` script. 167 | When those samples are written to disk, we can use the forecast app to investigate them. 168 | 169 | The additional script, `get_csv.py`, simply fetches and writes data as a csv, which is convenient for any ad hoc analytics and interactive exploration. 170 | -------------------------------------------------------------------------------- /apps/diagnostics.py: -------------------------------------------------------------------------------- 1 | # ########################################################################### 2 | # 3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) 4 | # (C) Cloudera, Inc. 2020 5 | # All rights reserved. 
6 | # 7 | # Applicable Open Source License: Apache 2.0 8 | # 9 | # NOTE: Cloudera open source products are modular software products 10 | # made up of hundreds of individual components, each of which was 11 | # individually copyrighted. Each Cloudera open source product is a 12 | # collective work under U.S. Copyright Law. Your license to use the 13 | # collective work is as provided in your written agreement with 14 | # Cloudera. Used apart from the collective work, this file is 15 | # licensed for your use pursuant to the open source license 16 | # identified above. 17 | # 18 | # This code is provided to you pursuant a written agreement with 19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute 20 | # this code. If you do not have a written agreement with Cloudera nor 21 | # with an authorized and properly licensed third party, you do not 22 | # have any rights to access nor to use this code. 23 | # 24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the 25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY 26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED 27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO 28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND 29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, 30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS 31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE 32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY 33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR 34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES 35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF 36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF 37 | # DATA. 
#
# ###########################################################################

import os

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from statsmodels.tsa.stattools import acf, pacf

from sts.data.loader import load_california_electricity_demand
from sts.models.baselines import year_ahead_hourly_forecast


FORECAST_DIRECTORY = "data/forecasts"

# NOTE: this is a Streamlit script. Bare string expressions at module level
# are rendered as Markdown in the app (Streamlit "magic"), so explanatory
# notes for maintainers must be `#` comments, never bare strings.

st.title("California Electricity Demand Model Diagnostics")

# first, load true demand data and forecasts


def read_forecast(filename):
    """Read one forecast CSV from FORECAST_DIRECTORY.

    The `yhat` column is renamed to the part of `filename` before the first
    ".", so each forecast ends up in a column named after its file, and the
    `ds` column is parsed to datetimes.

    Parameters
    ----------
    filename : str
        Name of a file inside FORECAST_DIRECTORY (not a full path).

    Returns
    -------
    pandas.DataFrame
        The CSV contents with `ds` as datetimes and `yhat` renamed.
        Presumably the file holds only `ds` and `yhat` columns; any other
        column would be carried through unchanged -- verify against the
        scripts that write the forecasts.
    """
    name = filename.split(".")[0]
    path = os.path.join(FORECAST_DIRECTORY, filename)
    forecast = pd.read_csv(path).rename(columns={"yhat": name})
    return forecast.assign(ds=lambda frame: pd.to_datetime(frame.ds))


@st.cache(allow_output_mutation=True)
def load_all_forecasts():
    """Load observed demand and join every saved forecast onto it by `ds`.

    Each forecast is merged with `DataFrame.merge`'s default inner join, so
    only timestamps present in the demand data and in every forecast remain.

    Hidden entries (names starting with ".") and anything that is not a
    regular file are skipped, so that stray files such as `.DS_Store` or a
    `.ipynb_checkpoints` directory do not crash `pd.read_csv`. Files are
    processed in sorted order because `os.listdir` returns entries in
    arbitrary order, which would otherwise make the model column order (and
    therefore the tables and the model selector) vary between machines.

    Returns
    -------
    pandas.DataFrame
        Demand data sorted by `ds`, plus one column per forecast file.
    """
    df = load_california_electricity_demand().sort_values("ds")
    forecast_files = sorted(
        entry
        for entry in os.listdir(FORECAST_DIRECTORY)
        if not entry.startswith(".")
        and os.path.isfile(os.path.join(FORECAST_DIRECTORY, entry))
    )
    for forecast_file in forecast_files:
        df = df.merge(read_forecast(forecast_file), on="ds")
    return df


data_loading = st.text("Loading data...")
df = load_all_forecasts()
data_loading.text("")

# Data slices used throughout: everything before 2019 is treated as training
# data, 2019 as the holdout year, and 2018 as a same-length training slice.
df_train = df[df.ds.dt.year < 2019]
df_2018 = df[df.ds.dt.year == 2018]
df_2019 = df[df.ds.dt.year == 2019]

# Every column other than the timestamp and the observed value is a model.
model_names = [x for x in df.columns if x not in ["ds", "y"]]


f"""
## Model comparison
There are {len(model_names)} models. Here is a comparison of their MAPE for select data slices.
We compare a held out test set (2019) to the whole training set through 2018 and also
2018 in isolation. 2018 is included for being one complete period in the training set
of equal length to 2019.
"""


def ape(df):
    """Return the absolute percentage error of every model, row by row.

    Computed as |y - forecast| / y, i.e. as a fraction (not multiplied by
    100). Averaging a column of the result gives that model's MAPE.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `y` column and one column per entry of `model_names`.

    Returns
    -------
    pandas.DataFrame
        One column per model, indexed like `df`.
    """
    return pd.DataFrame({m: np.abs(df.y - df[m]) / df.y for m in model_names})


st.write(
    pd.DataFrame({
        "all training": ape(df_train).mean().rename("MAPE"),
        "2018 (training)": ape(df_2018).mean().rename("MAPE"),
        "2019 (holdout)": ape(df_2019).mean().rename("MAPE")
    }).transpose()
)


"""
---
Another metric for this kind of time series is [MASE](https://en.wikipedia.org/wiki/Mean_absolute_scaled_error).
We will use a seasonal variant, where the season is defined to be 52 weeks long,
so that years are approximately aligned.
MASE measures error relative to the baseline, so a lower score is better.
"""


def mase_denominator(df):
    """Return the mean absolute error of the naive baseline forecast on `df`.

    The baseline comes from `year_ahead_hourly_forecast` (presumably the
    observation from 52 weeks earlier, per the text rendered above --
    confirm in `sts.models.baselines`).

    The mean is taken over exactly the rows where the error is defined.
    Previously the summed error (rows where forecast *and* `y` are present)
    was divided by the number of rows where only the forecast is present,
    which understates the denominator whenever `y` has gaps. The two agree
    when `y` has no missing values on rows that have a baseline forecast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `y` column, passed as-is to the baseline forecaster.
    """
    naive_forecast = year_ahead_hourly_forecast(df)
    naive_abs_error = np.abs(naive_forecast - df.y).dropna()
    return naive_abs_error.mean()


# The scaling term is always computed on the training slice, so MASE values
# for different evaluation slices are comparable.
denom = mase_denominator(df_train)


def mase(df):
    """Return every model's absolute error scaled by the baseline error.

    Each row is |y - forecast| / `denom`, where `denom` is the module-level
    baseline error computed on the training slice. Averaging a column of the
    result gives that model's MASE.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `y` column and one column per entry of `model_names`.

    Returns
    -------
    pandas.DataFrame
        One column per model, indexed like `df`.
    """
    return pd.DataFrame({m: np.abs(df.y - df[m]) / denom for m in model_names})


st.write(
    pd.DataFrame({
        "all training": mase(df_train).mean().rename("MASE"),
        "2018 (training)": mase(df_2018).mean().rename("MASE"),
        "2019 (holdout)": mase(df_2019).mean().rename("MASE")
    }).transpose()
)


"""
---
## Model drill-down
We can compute some more detailed diagnostics for each model individually.
"""
active_model = st.selectbox("Model", model_names)


"""
### The forecast
First, we should see the forecast vs true, observed values.
"""

forecast_chart = px.line(
    df, x='ds', y=['y', active_model],
    color_discrete_sequence=["#ff8300", "#00828c"]
)
forecast_chart.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=[
            dict(count=7, label="1w", step="day", stepmode="backward"),
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=3, label="3m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(step="all")
        ]
    )
)
forecast_chart.update_layout(
    xaxis_title="Datetime (hourly increments)",
    yaxis_title="Demand (Megawatt-hours)",
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1,
    ),
    legend_title_text=""
)
st.plotly_chart(forecast_chart)


"""
---
### Diagnostics
"""

data_set = st.selectbox("Dataset", ['Train', 'Test', 'Combined'])

# Pick the slice to diagnose under its own name instead of rebinding `df`,
# so `df` always means "all data" wherever it is read in this script.
df_selected = {
    'Train': df_train,
    'Test': df_2019,
    'Combined': df,
}[data_set]


"""
---
Scatter plot of the true values vs forecast values.
This plot will be heavily overplotted, but the overall shape should tell us
whether we are over- or under-predicting.
"""

scatter_chart = go.Figure(data=go.Scatter(
    x=df_selected.y, y=df_selected[active_model],
    mode="markers",
    marker=dict(color="#00828c", opacity=0.2),
))
scatter_chart.update_layout(
    xaxis_title="True demand (Megawatt-hours)",
    yaxis_title="Forecast demand (Megawatt-hours)"
)

st.plotly_chart(scatter_chart)

# Rows where either the observation or the forecast is missing are dropped;
# the histogram and the (partial) autocorrelations below all use these.
residuals = (df_selected["y"] - df_selected[active_model]).dropna()


"""
---
Here is the marginal distribution of the residuals.
We expect it to be symmetric, approximately normal, and centered at zero.
"""

residual_chart = px.histogram(
    df_selected, x=residuals, color_discrete_sequence=["#00828c"]
)
residual_chart.update_layout(
    xaxis_title="Residual (true demand - forecast demand) (Megawatt-hours)",
    yaxis_title="Count"
)

st.plotly_chart(residual_chart)


"""
---
The autocorrelation and partial autocorrelation of the residuals.
Since none of our models try to model the error (with autoregressive terms), we may
expect some autocorrelation.
The orange bands represent the 95% confidence interval for the null hypothesis that
there is no (partial) autocorrelation.
Bars outside those bounds indicate high likelihood of autocorrelation.
"""

# Passing `alpha` makes `acf` return the confidence intervals alongside the
# autocorrelation values; 48 lags covers two days of hourly data.
autocorrelation, conf_intervals = acf(residuals, alpha=0.05, nlags=48)

autocorrelation_df = pd.DataFrame({
    "autocorrelation": autocorrelation,
    # center confidence intervals on zero,
    # so that null hypothesis is zero autocorrelation
    "ci_lower": conf_intervals[:, 0]-autocorrelation,
    "ci_upper": conf_intervals[:, 1]-autocorrelation
})
autocorrelation_chart = px.bar(
    autocorrelation_df,
    x=autocorrelation_df.index,
    y=["autocorrelation", "ci_lower", "ci_upper"],
    color_discrete_sequence=["#00828c", "#ff8300", "#ff8300"],
    barmode="overlay"
)
autocorrelation_chart.update_layout(
    xaxis_title="Timestep (hours)",
    yaxis_title="Autocorrelation",
    showlegend=False
)

st.plotly_chart(autocorrelation_chart)


partial_autocorrelation, partial_conf_intervals = pacf(
    residuals, alpha=0.05, nlags=48
)

partial_autocorrelation_df = pd.DataFrame({
    "partial_autocorrelation": partial_autocorrelation,
    # center confidence intervals on zero,
    # so that null hypothesis is zero partial autocorrelation
    "ci_lower": partial_conf_intervals[:, 0]-partial_autocorrelation,
    "ci_upper": partial_conf_intervals[:, 1]-partial_autocorrelation
})
partial_autocorrelation_chart = px.bar(
    partial_autocorrelation_df,
    x=partial_autocorrelation_df.index,
    y=["partial_autocorrelation", "ci_lower", "ci_upper"],
    color_discrete_sequence=["#00828c", "#ff8300", "#ff8300"],
    barmode='overlay'
)
partial_autocorrelation_chart.update_layout(
    xaxis_title="Timestep (hours)",
    yaxis_title="Partial autocorrelation",
    showlegend=False
)

st.plotly_chart(partial_autocorrelation_chart)

# --------------------------------------------------------------------------------
# /apps/forecast.py:
-------------------------------------------------------------------------------- 1 | # ########################################################################### 2 | # 3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) 4 | # (C) Cloudera, Inc. 2020 5 | # All rights reserved. 6 | # 7 | # Applicable Open Source License: Apache 2.0 8 | # 9 | # NOTE: Cloudera open source products are modular software products 10 | # made up of hundreds of individual components, each of which was 11 | # individually copyrighted. Each Cloudera open source product is a 12 | # collective work under U.S. Copyright Law. Your license to use the 13 | # collective work is as provided in your written agreement with 14 | # Cloudera. Used apart from the collective work, this file is 15 | # licensed for your use pursuant to the open source license 16 | # identified above. 17 | # 18 | # This code is provided to you pursuant a written agreement with 19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute 20 | # this code. If you do not have a written agreement with Cloudera nor 21 | # with an authorized and properly licensed third party, you do not 22 | # have any rights to access nor to use this code. 23 | # 24 | # Absent a written agreement with Cloudera, Inc. 
# (“Cloudera”) to the
# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
# DATA.
#
# ###########################################################################

import datetime

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px


# Number of individual forecast samples drawn on the main chart.
N_SAMPLES = 10

st.title("California Electricity Demand Forecast")


# Data loading and selection

data_loading = st.text("Loading data...")


@st.cache(allow_output_mutation=True)
def load_data():
    """Read data/forecast.csv and return it indexed by the parsed `ds` column.

    The result is cached by Streamlit so the csv is only parsed once.
    """
    data = pd.read_csv("data/forecast.csv", parse_dates=["ds"])
    data = data.set_index("ds")
    return data


data = load_data()

st.markdown("""
The forecast is generated for one year ahead of the most recent
observation. Please select the range of interest over which to view and
filter samples from the forecast distribution.
""")

selected_range = st.date_input(
    "Select a forecast range",
    [data.index.min().date(), data.index.max().date()]
)

# While the user is part-way through choosing a range (start picked, end not
# yet), the widget yields fewer than two dates. Unpacking that directly raises
# a ValueError, so wait for a complete selection instead of crashing.
if len(selected_range) != 2:
    data_loading.text("")
    st.info("Please select both a start and an end date.")
    st.stop()

start_date, end_date = selected_range

subset = data[(data.index.date >= start_date) &
              (data.index.date <= end_date)].copy()
data_loading.text("")


# NOTE(review): hashing every DataFrame to None means the two cached helpers
# below are never invalidated by a change of input frame. This looks
# deliberate (the same sampled paths persist across reruns and the chart's
# x-axis range crops the view) -- confirm before changing.
@st.cache(hash_funcs={pd.DataFrame: lambda _: None})
def samples(df):
    """Pick N_SAMPLES random columns of `df` and melt them to long format.

    The returned frame has the columns `ds`, `variable` and `value`.
    """
    return df.sample(N_SAMPLES, axis="columns").reset_index().melt(id_vars='ds')


@st.cache(hash_funcs={pd.DataFrame: lambda _: None})
def mean(df):
    """Return the row-wise mean across all columns of `df`."""
    return df.mean(axis="columns")

# Main forecast plot


st.markdown(f"""
The chart below shows the mean forecast (based on 1000 samples),
and {N_SAMPLES} individual samples, which can be thought of as
"possible futures".
""")

generating_chart = st.text("Generating chart")
mean_forecast = mean(subset)
sample_forecasts = samples(subset)


# Faint lines for the individual samples, one line per sampled column.
line_chart = px.line(
    sample_forecasts,
    x='ds',
    y='value',
    line_group='variable',
    color_discrete_sequence=["rgba(0,130,140,0.1)"],

)
# Opaque line for the mean forecast drawn on top of the samples.
line_chart.add_scatter(
    x=mean_forecast.index,
    y=mean_forecast,
    mode='lines',
    marker=dict(color="rgba(0,130,140,1)")
)
line_chart.update_xaxes(range=[start_date, end_date])
line_chart.update_layout(
    showlegend=False,
    xaxis_title="Datetime (hourly increments)",
    yaxis_title="Megawatt-hours"
)
st.plotly_chart(line_chart)
generating_chart.text("")


# Marginal plot of sum of values over interval

# Column-wise totals over the selected interval: one aggregate per column.
data_sum = subset.sum()
_min = float(data_sum.min())
_max = float(data_sum.max())

st.markdown(f"""
The mean estimate of the aggregate demand from {start_date} to {end_date}
is **{data_sum.mean():.2e}** Megawatt-hours.
""")

st.markdown("""
We can assess the probability of exceeding a given aggregate demand over
the selected period. Choose the threshold of interest below.
""")

threshold = st.slider(
    "Threshold (Megawatt-hours)",
    min_value=_min,
    max_value=_max,
    format="%.2e"
)

# Fraction of column totals that exceed the chosen threshold.
prob_exceed = data_sum[data_sum > threshold].count() / data_sum.count()

st.markdown(f"""
The probability of the aggregate demand between {start_date} and {end_date}
being more than {threshold:.2e} Megawatt-hours is
**{100*prob_exceed:.1f}**%.
""")

st.markdown("""
The histogram below shows the probability distribution of possible
aggregate demands, cut off at the threshold selected.
The higher the count for a given demand, the more likely that future is.
""")

hist = px.histogram(
    data_sum[data_sum > threshold],
    title="Possible total electricity demand levels",
    color_discrete_sequence=["#00828c"]
)
# Keep the x-axis fixed to the full range so the cut-off is visible.
hist.update_xaxes(range=[_min, _max])
hist.update_layout(
    showlegend=False,
    xaxis_title="Megawatt-hours",
    yaxis_title="Count (of 1000 simulated futures)"
)
st.plotly_chart(hist)
Copyright Law. Your license to use the 13 | # collective work is as provided in your written agreement with 14 | # Cloudera. Used apart from the collective work, this file is 15 | # licensed for your use pursuant to the open source license 16 | # identified above. 17 | # 18 | # This code is provided to you pursuant a written agreement with 19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute 20 | # this code. If you do not have a written agreement with Cloudera nor 21 | # with an authorized and properly licensed third party, you do not 22 | # have any rights to access nor to use this code. 23 | # 24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the 25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY 26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED 27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO 28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND 29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, 30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS 31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE 32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY 33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR 34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES 35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF 36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF 37 | # DATA. 
38 | # 39 | # ########################################################################### 40 | 41 | !streamlit run apps/diagnostics.py --server.port $CDSW_APP_PORT --server.address 127.0.0.1 42 | -------------------------------------------------------------------------------- /apps/launch_forecast.py: -------------------------------------------------------------------------------- 1 | # ########################################################################### 2 | # 3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) 4 | # (C) Cloudera, Inc. 2020 5 | # All rights reserved. 6 | # 7 | # Applicable Open Source License: Apache 2.0 8 | # 9 | # NOTE: Cloudera open source products are modular software products 10 | # made up of hundreds of individual components, each of which was 11 | # individually copyrighted. Each Cloudera open source product is a 12 | # collective work under U.S. Copyright Law. Your license to use the 13 | # collective work is as provided in your written agreement with 14 | # Cloudera. Used apart from the collective work, this file is 15 | # licensed for your use pursuant to the open source license 16 | # identified above. 17 | # 18 | # This code is provided to you pursuant a written agreement with 19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute 20 | # this code. If you do not have a written agreement with Cloudera nor 21 | # with an authorized and properly licensed third party, you do not 22 | # have any rights to access nor to use this code. 23 | # 24 | # Absent a written agreement with Cloudera, Inc. 
(“Cloudera”) to the 25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY 26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED 27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO 28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND 29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, 30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS 31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE 32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY 33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR 34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES 35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF 36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF 37 | # DATA. 38 | # 39 | # ########################################################################### 40 | 41 | !streamlit run apps/forecast.py --server.port $CDSW_APP_PORT --server.address 127.0.0.1 42 | -------------------------------------------------------------------------------- /cml/fit_models_parallel.py: -------------------------------------------------------------------------------- 1 | # ########################################################################### 2 | # 3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) 4 | # (C) Cloudera, Inc. 2020 5 | # All rights reserved. 6 | # 7 | # Applicable Open Source License: Apache 2.0 8 | # 9 | # NOTE: Cloudera open source products are modular software products 10 | # made up of hundreds of individual components, each of which was 11 | # individually copyrighted. Each Cloudera open source product is a 12 | # collective work under U.S. Copyright Law. Your license to use the 13 | # collective work is as provided in your written agreement with 14 | # Cloudera. 
Used apart from the collective work, this file is 15 | # licensed for your use pursuant to the open source license 16 | # identified above. 17 | # 18 | # This code is provided to you pursuant a written agreement with 19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute 20 | # this code. If you do not have a written agreement with Cloudera nor 21 | # with an authorized and properly licensed third party, you do not 22 | # have any rights to access nor to use this code. 23 | # 24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the 25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY 26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED 27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO 28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND 29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, 30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS 31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE 32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY 33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR 34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES 35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF 36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF 37 | # DATA. 
#
# ###########################################################################

import os
import time
import cdsw


# Worker statuses treated as "finished, but not successfully".
# NOTE(review): assumed to be the terminal failure states reported by the
# CDSW workers API -- confirm against the platform documentation.
_FAILED_STATUSES = frozenset({'failed', 'stopped', 'timedout'})


def fit_models_parallel(poll_seconds=60):
    '''
    Use the CDSW Workers API (via Python SDK) to launch each model fitting script in parallel

    Docs - https://docs.cloudera.com/machine-learning/cloud/distributed-computing/topics/ml-workers-api.html

    Every file in ./scripts whose name starts with 'fit' or 'mak' is run on
    its own worker. The call blocks, polling the worker list, until no
    worker is left in a non-terminal state.

    Args:
        poll_seconds: seconds to sleep between status checks (default 60).

    Raises:
        RuntimeError: if all workers have finished and at least one of them
            ended in a failure status. Previously such a run never returned.
    '''
    # Launch a separate worker to run each script independently

    script_path = os.path.join(os.getcwd(), 'scripts')

    scripts = [
        os.path.join(script_path, script)
        for script in os.listdir(script_path)
        if script.startswith(('fit', 'mak'))
    ]

    for script in scripts:
        cdsw.launch_workers(n=1, cpu=1, memory=3, script=script)

    # Force session to persist until each worker job has completed
    # Check for completion every `poll_seconds` seconds

    while True:

        time.sleep(poll_seconds)

        workers_status = [wkr['status'] for wkr in cdsw.list_workers()]
        failed = [
            status for status in workers_status if status in _FAILED_STATUSES
        ]

        # Only stop once nothing is still scheduling/running, so a single
        # early failure does not end the session under the other workers.
        still_running = [
            status for status in workers_status
            if status != 'succeeded' and status not in _FAILED_STATUSES
        ]
        if still_running:
            continue

        if failed:
            raise RuntimeError(
                '{} of {} workers did not succeed (statuses: {})'.format(
                    len(failed), len(workers_status), sorted(set(failed))
                )
            )
        return


if __name__ == "__main__":
    fit_models_parallel()
Your license to use the 13 | # collective work is as provided in your written agreement with 14 | # Cloudera. Used apart from the collective work, this file is 15 | # licensed for your use pursuant to the open source license 16 | # identified above. 17 | # 18 | # This code is provided to you pursuant a written agreement with 19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute 20 | # this code. If you do not have a written agreement with Cloudera nor 21 | # with an authorized and properly licensed third party, you do not 22 | # have any rights to access nor to use this code. 23 | # 24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the 25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY 26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED 27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO 28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND 29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, 30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS 31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE 32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY 33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR 34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES 35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF 36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF 37 | # DATA. 38 | # 39 | # ########################################################################### 40 | 41 | !pip3 install -r requirements.txt 42 | !pip3 install -e . 
43 | -------------------------------------------------------------------------------- /img/app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/img/app.png -------------------------------------------------------------------------------- /img/diagnostic-chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/img/diagnostic-chart.png -------------------------------------------------------------------------------- /img/diagnostic-metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastforwardlabs/structural-time-series/2b51f92df06d1e38a9d59ae32456fd19a7a3ee7b/img/diagnostic-metrics.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fbprophet==0.6 2 | matplotlib==2.0.0 3 | numpy==1.19.1 4 | pandas==1.1.0 5 | plotly==4.9.0 6 | requests==2.22.0 7 | statsmodels==0.12.0 8 | streamlit==0.66.0 -------------------------------------------------------------------------------- /scripts/fit_baseline_model.py: -------------------------------------------------------------------------------- 1 | # ########################################################################### 2 | # 3 | # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) 4 | # (C) Cloudera, Inc. 2020 5 | # All rights reserved. 6 | # 7 | # Applicable Open Source License: Apache 2.0 8 | # 9 | # NOTE: Cloudera open source products are modular software products 10 | # made up of hundreds of individual components, each of which was 11 | # individually copyrighted. 
Each Cloudera open source product is a 12 | # collective work under U.S. Copyright Law. Your license to use the 13 | # collective work is as provided in your written agreement with 14 | # Cloudera. Used apart from the collective work, this file is 15 | # licensed for your use pursuant to the open source license 16 | # identified above. 17 | # 18 | # This code is provided to you pursuant a written agreement with 19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute 20 | # this code. If you do not have a written agreement with Cloudera nor 21 | # with an authorized and properly licensed third party, you do not 22 | # have any rights to access nor to use this code. 23 | # 24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the 25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY 26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED 27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO 28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND 29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, 30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS 31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE 32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY 33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR 34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES 35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF 36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF 37 | # DATA. 
#
# ###########################################################################

import os

import numpy as np

from sts.data.loader import load_california_electricity_demand
from sts.models.baselines import year_ahead_hourly_forecast


# Load the data

df = load_california_electricity_demand()

# ## Baseline
# Reproduce observed values exactly 52 weeks prior as forecast.

baseline = (
    df
    .sort_values('ds')
    .assign(yhat=year_ahead_hourly_forecast)
)


# ## Write
# Write the forecast values to csv
DIR = 'data/forecasts/'

# exist_ok avoids a check-then-create race: cml/fit_models_parallel.py
# launches the fit scripts concurrently, and (assuming the workers share the
# project filesystem) two of them could both find the directory missing.
os.makedirs(DIR, exist_ok=True)

baseline[['ds', 'yhat']].to_csv(DIR + 'baseline.csv', index=False)
If you do not have a written agreement with Cloudera nor 21 | # with an authorized and properly licensed third party, you do not 22 | # have any rights to access nor to use this code. 23 | # 24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the 25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY 26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED 27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO 28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND 29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, 30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS 31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE 32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY 33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR 34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES 35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF 36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF 37 | # DATA. 
#
# ###########################################################################

import os

import numpy as np

from sts.data.loader import load_california_electricity_demand
from sts.models.prophet import (
    add_season_weekday_indicators,
    seasonal_daily_prophet_model
)


# Load the training data (through 2018)

df = load_california_electricity_demand(train_only=True)

# Log transform the target variable
df['y'] = df.y.apply(np.log)


# ## Prophet (with more complicated seasonality)
# FB Prophet model, splitting intra-day seasonalities into four subgroups:
# - summer weekday
# - summer weekend
# - winter weekday
# - winter weekend

model = seasonal_daily_prophet_model(df)

# Extend the training frame by 8760 hourly steps (365 days) to forecast over.
future = model.make_future_dataframe(periods=8760, freq='H')
seasonal_future = add_season_weekday_indicators(future)

forecast = model.predict(seasonal_future)

# Reverse the log transform on predictions
forecast['yhat'] = forecast.yhat.apply(np.exp)


# ## Write
# Write the forecast values to csv
DIR = 'data/forecasts/'

# exist_ok avoids a check-then-create race: cml/fit_models_parallel.py
# launches the fit scripts concurrently, and (assuming the workers share the
# project filesystem) two of them could both find the directory missing.
os.makedirs(DIR, exist_ok=True)

forecast[['ds', 'yhat']].to_csv(DIR + 'prophet_complex_log.csv', index=False)
Each Cloudera open source product is a 12 | # collective work under U.S. Copyright Law. Your license to use the 13 | # collective work is as provided in your written agreement with 14 | # Cloudera. Used apart from the collective work, this file is 15 | # licensed for your use pursuant to the open source license 16 | # identified above. 17 | # 18 | # This code is provided to you pursuant a written agreement with 19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute 20 | # this code. If you do not have a written agreement with Cloudera nor 21 | # with an authorized and properly licensed third party, you do not 22 | # have any rights to access nor to use this code. 23 | # 24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the 25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY 26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED 27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO 28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND 29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, 30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS 31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE 32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY 33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR 34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES 35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF 36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF 37 | # DATA. 
#
# ###########################################################################

import os

import numpy as np

from sts.data.loader import load_california_electricity_demand
from sts.models.prophet import (
    add_season_weekday_indicators,
    seasonal_daily_prophet_model
)


# Load the training data (through 2018)

df = load_california_electricity_demand(train_only=True)


# ## Prophet (with more complicated seasonality)
# FB Prophet model, splitting intra-day seasonalities into four subgroups:
# - summer weekday
# - summer weekend
# - winter weekday
# - winter weekend

model = seasonal_daily_prophet_model(df)

# Extend the training frame by 8760 hourly steps (365 days) to forecast over.
future = model.make_future_dataframe(periods=8760, freq='H')
seasonal_future = add_season_weekday_indicators(future)

forecast = model.predict(seasonal_future)


# ## Write
# Write the forecast values to csv
DIR = 'data/forecasts/'

# exist_ok avoids a check-then-create race: cml/fit_models_parallel.py
# launches the fit scripts concurrently, and (assuming the workers share the
# project filesystem) two of them could both find the directory missing.
os.makedirs(DIR, exist_ok=True)

forecast[['ds', 'yhat']].to_csv(DIR + 'prophet_complex.csv', index=False)
Your license to use the 13 | # collective work is as provided in your written agreement with 14 | # Cloudera. Used apart from the collective work, this file is 15 | # licensed for your use pursuant to the open source license 16 | # identified above. 17 | # 18 | # This code is provided to you pursuant a written agreement with 19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute 20 | # this code. If you do not have a written agreement with Cloudera nor 21 | # with an authorized and properly licensed third party, you do not 22 | # have any rights to access nor to use this code. 23 | # 24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the 25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY 26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED 27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO 28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND 29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, 30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS 31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE 32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY 33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR 34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES 35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF 36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF 37 | # DATA. 
import os

import numpy as np

from sts.data.loader import load_california_electricity_demand
from sts.models.prophet import default_prophet_model


# Forecast horizon: one non-leap year of hourly steps (24 * 365).
HOURS_PER_YEAR = 8760

# Load the training data (through 2018)

df = load_california_electricity_demand(train_only=True)


# ## Prophet (Default)
# FB Prophet model, all default parameters.

model = default_prophet_model(df)

future = model.make_future_dataframe(periods=HOURS_PER_YEAR, freq='H')
forecast = model.predict(future)


# ## Write
# Write the forecast values to csv
DIR = 'data/forecasts/'

# exist_ok avoids the check-then-create race when several fit scripts are
# launched at the same time and all try to create the directory.
os.makedirs(DIR, exist_ok=True)

forecast[['ds', 'yhat']].to_csv(
    os.path.join(DIR, 'prophet_simple.csv'),
    index=False
)
from sts.data.loader import load_california_electricity_demand

# Export the demand data as CSV. The loader reads the JSON file if it is
# already on disk and otherwise downloads it first.
JSON_PATH = 'data/demand.json'
CSV_PATH = 'data/demand.csv'

df = load_california_electricity_demand(JSON_PATH)
df.to_csv(CSV_PATH)
6 | # 7 | # Applicable Open Source License: Apache 2.0 8 | # 9 | # NOTE: Cloudera open source products are modular software products 10 | # made up of hundreds of individual components, each of which was 11 | # individually copyrighted. Each Cloudera open source product is a 12 | # collective work under U.S. Copyright Law. Your license to use the 13 | # collective work is as provided in your written agreement with 14 | # Cloudera. Used apart from the collective work, this file is 15 | # licensed for your use pursuant to the open source license 16 | # identified above. 17 | # 18 | # This code is provided to you pursuant a written agreement with 19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute 20 | # this code. If you do not have a written agreement with Cloudera nor 21 | # with an authorized and properly licensed third party, you do not 22 | # have any rights to access nor to use this code. 23 | # 24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the 25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY 26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED 27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO 28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND 29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, 30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS 31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE 32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY 33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR 34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES 35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF 36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF 37 | # DATA. 
import datetime

import numpy as np
import pandas as pd

from sts.models.prophet import (
    add_season_weekday_indicators,
    seasonal_daily_prophet_model
)
from sts.data.loader import load_california_electricity_demand


# Indicator columns needed by the model but not wanted in the saved output.
INDICATOR_COLUMNS = [
    'winter_weekday', 'winter_weekend', 'summer_weekday', 'summer_weekend'
]


# Load all available data for training

df = load_california_electricity_demand()

# Take log transform for fully multiplicative model
df['y'] = np.log(df['y'])


# Fit best current model

model = seasonal_daily_prophet_model(df)


# Make predictions for one year ahead of most recent training data

future = add_season_weekday_indicators(
    model.make_future_dataframe(periods=24*365, freq='H')
)

# Only the predictive samples are written out. The point forecast from
# `model.predict` was previously computed here and then discarded, so that
# (expensive) call has been removed.
samples = model.predictive_samples(future)

# Reverse log transform
# NOTE(review): assumes samples['yhat'] has one row per row of `future` and
# one column per posterior draw - confirm against the Prophet version in use.
predictions = np.exp(samples['yhat'])

prediction_df = (
    future
    .merge(pd.DataFrame(predictions), left_index=True, right_index=True)
    .drop(INDICATOR_COLUMNS, axis='columns')
)

# Keep only timestamps from today onwards; the mask is built from the merged
# frame itself so the index is guaranteed to line up.
prediction_df = prediction_df[
    prediction_df.ds.dt.date >= datetime.date.today()
]


# Save predictions

prediction_df.to_csv('data/forecast.csv', index=False)
6 | # 7 | # Applicable Open Source License: Apache 2.0 8 | # 9 | # NOTE: Cloudera open source products are modular software products 10 | # made up of hundreds of individual components, each of which was 11 | # individually copyrighted. Each Cloudera open source product is a 12 | # collective work under U.S. Copyright Law. Your license to use the 13 | # collective work is as provided in your written agreement with 14 | # Cloudera. Used apart from the collective work, this file is 15 | # licensed for your use pursuant to the open source license 16 | # identified above. 17 | # 18 | # This code is provided to you pursuant a written agreement with 19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute 20 | # this code. If you do not have a written agreement with Cloudera nor 21 | # with an authorized and properly licensed third party, you do not 22 | # have any rights to access nor to use this code. 23 | # 24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the 25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY 26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED 27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO 28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND 29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, 30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS 31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE 32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY 33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR 34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES 35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF 36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF 37 | # DATA. 
import datetime

import numpy as np
import pandas as pd

from sts.models.baselines import year_ahead_hourly_forecast
from sts.models.prophet import (
    add_season_weekday_indicators,
    seasonal_daily_prophet_model
)
from sts.data.loader import load_california_electricity_demand


# Load all available data for training

df = load_california_electricity_demand()

# Restrict to pre-2020 for evaluation on 2020.
# Explicit copy: the filtered frame is modified downstream (indicator columns
# are added while fitting), and writing into a slice of `df` is a
# chained-assignment hazard.
train_df = df[df.ds.dt.year < 2020].copy()

# Take log transform for fully multiplicative model.
# The log-scale data lives in its own frame so `train_df` stays in the
# original units for the MASE denominator below (no log/exp round trip).
log_train_df = train_df.assign(y=np.log(train_df['y']))


# Fit best current model

model = seasonal_daily_prophet_model(log_train_df)


# Make predictions for one year ahead of most recent training data

future = add_season_weekday_indicators(
    model.make_future_dataframe(periods=24*365, freq='H')
)

forecast = model.predict(future)

# Reverse log transform
forecast['yhat'] = np.exp(forecast['yhat'])

# Join the forecasts to the observed values and keep the evaluation year.
predictions = (
    forecast[['ds', 'yhat']]
    .merge(df, on='ds')
)
predictions = predictions[predictions.ds.dt.year == 2020]

# ### MAPE
mape = (np.abs(predictions.y - predictions.yhat) / predictions.y).mean()

# Let's compare this to the MAPE of the seasonal naive model
naive_df = df.copy()
naive_df['yhat'] = year_ahead_hourly_forecast(naive_df)
naive_df = naive_df[naive_df.ds.dt.year == 2020]
naive_mape = (np.abs(naive_df.yhat - naive_df.y) / naive_df.y).mean()

# ### MASE
# Note, we have trained on a larger data set than we did for model selection.
# As such, this MASE cannot be compared to the MASEs listed in the diagnostic
# app. It's a measure of performance relative to the baseline on the new
# training set of all data before 2020.
# (The deep reason here is that time series are non-iid, and as such, we
# must make train/dev/validation splits along chronological lines.
# An unfortunate artefact of this is never having the metrics for the exact
# model we deploy.)

# Denominator: in-sample mean absolute error of the seasonal naive forecast.
# The first 52 weeks have no naive forecast and drop out as NaN.
naive_forecast = year_ahead_hourly_forecast(train_df)
naive_errors = (naive_forecast - train_df.y).dropna()
denom = naive_errors.abs().mean()
mase = (np.abs(predictions.y - predictions.yhat) / denom).mean()

print(f"The MAPE of our best performing model is: {mape}")
print(f"The MAPE of the seasonal naive baseline: {naive_mape}")
print(f"The MASE of the best performing model is: {mase}")
from setuptools import find_packages, setup

setup(
    name='sts',
    version='0.0.1',
    # Kept on a single line: package summaries are a one-line metadata field.
    description=(
        'Utilities for structural time series modelling of '
        'California electricity demand data.'
    ),
    author='Chris J. Wallace',
    # Discover the subpackages (sts.data, sts.models) as well. Listing only
    # 'sts' leaves them out of a regular (non-editable) install.
    packages=find_packages(include=['sts', 'sts.*']),
)
import json
import os

import pandas as pd


def load_california_electricity_demand(
        filepath='data/demand.json',
        api_key_env='EIA_API_KEY',
        train_only=False):
    """
    Load hourly California electricity demand as a DataFrame.

    The raw JSON is read from `filepath` if that file exists; otherwise it is
    downloaded (using the API key held in the environment variable named by
    `api_key_env`) and cached at `filepath`.

    Returns a frame with columns `ds` (timezone-naive timestamps shifted from
    UTC to PST) and `y` (the demand values), sorted by `ds`. With
    `train_only=True`, rows from 2019 onwards are dropped.
    """
    data = read_or_download_data(filepath, api_key_env)

    df = (
        json_to_df(data)
        .rename(columns={0: 'ds', 1: 'y'})
        .assign(ds=utc_to_pst)
        # Drop the timezone so downstream code sees naive timestamps.
        .assign(ds=lambda df: df.ds.dt.tz_localize(None))
        .sort_values('ds')
    )

    if train_only:
        df = remove_2019_and_later(df)

    return df


def read_or_download_data(filepath, api_key_env):
    """
    Return the demand JSON, using `filepath` as an on-disk cache.

    Raises RuntimeError if a download is needed but no API key is configured,
    or if the response does not contain a `series` entry. Nothing is written
    to `filepath` in either case, so a bad response cannot poison the cache.
    """
    if os.path.exists(filepath):
        return read_json(filepath)

    api_key = try_get_env(api_key_env)
    if api_key is None:
        raise RuntimeError(
            f'Cannot download demand data: environment variable '
            f'{api_key_env} is not set and {filepath} does not exist.'
        )

    response_json = fetch_california_demand(api_key)
    if not isinstance(response_json, dict) or 'series' not in response_json:
        raise RuntimeError(
            f'Unexpected response from the EIA API: {response_json!r}'
        )

    write_json(response_json, filepath)
    # Re-read so the first run returns exactly what later cached runs will.
    return read_json(filepath)


def read_json(file):
    """Parse and return the JSON document stored at path `file`."""
    with open(file) as f:
        data = json.load(f)
    return data


def write_json(data, filepath):
    """Serialize `data` as JSON to `filepath`, creating parent directories."""
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(filepath, 'w') as file:
        json.dump(data, file)


def try_get_env(api_key_env):
    """
    Return the value of environment variable `api_key_env`.

    If it is unset or empty, print a hint and return None.
    """
    env = os.getenv(api_key_env)
    if env:
        return env
    print(f'Please provide a valid {api_key_env} environment variable.')
    return None


def fetch_california_demand(api_key):
    """
    Download the `EBA.CAL-ALL.D.H` series from the EIA API as parsed JSON.

    Raises `requests.HTTPError` on a non-success HTTP status.
    """
    # Imported here so cached data can be loaded without `requests` installed.
    import requests

    # NOTE(review): this is the v1 `series` endpoint; confirm it is still
    # served by EIA before relying on fresh downloads.
    r = requests.get(
        'https://api.eia.gov/series',
        params={
            'api_key': api_key,
            'series_id': 'EBA.CAL-ALL.D.H',
            'out': 'json'
        },
        timeout=60
    )
    r.raise_for_status()
    return r.json()


def json_to_df(data):
    """Build a DataFrame from the rows of the first series in `data`."""
    df = pd.DataFrame(data['series'][0]['data'])
    return df


def utc_to_pst(df):
    """
    Convert from UTC to PST.
    PST is always UTC -8 hours: it ignores daylight savings.
    """
    pst = (
        pd
        .to_datetime(df['ds'])
        .subtract(pd.Timedelta('8 hours'))
    )
    return pst


def remove_2019_and_later(df):
    """Keep only rows whose `ds` is strictly before 2019."""
    return df[df['ds'] < '2019']
6 | # 7 | # Applicable Open Source License: Apache 2.0 8 | # 9 | # NOTE: Cloudera open source products are modular software products 10 | # made up of hundreds of individual components, each of which was 11 | # individually copyrighted. Each Cloudera open source product is a 12 | # collective work under U.S. Copyright Law. Your license to use the 13 | # collective work is as provided in your written agreement with 14 | # Cloudera. Used apart from the collective work, this file is 15 | # licensed for your use pursuant to the open source license 16 | # identified above. 17 | # 18 | # This code is provided to you pursuant a written agreement with 19 | # (i) Cloudera, Inc. or (ii) a third-party authorized to distribute 20 | # this code. If you do not have a written agreement with Cloudera nor 21 | # with an authorized and properly licensed third party, you do not 22 | # have any rights to access nor to use this code. 23 | # 24 | # Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the 25 | # contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY 26 | # KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED 27 | # WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO 28 | # IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND 29 | # FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, 30 | # AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS 31 | # ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE 32 | # OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY 33 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR 34 | # CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES 35 | # RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF 36 | # BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF 37 | # DATA. 
NUM_HOURS_IN_DAY = 24
NUM_DAYS_IN_WEEK = 7


# Define some baseline forecasts

# n-step ahead
#
# Each baseline "forecasts" a row with the value of `y` observed a fixed
# number of rows earlier. The first rows, which have no earlier value, are NaN.

# n-step hourly (one row per hour)


def hour_ahead_hourly_forecast(df):
    """Forecast each hour with the previous hour's `y`."""
    return df['y'].shift(periods=1)


def day_ahead_hourly_forecast(df):
    """Forecast each hour with `y` from the same hour one day earlier."""
    return df['y'].shift(periods=NUM_HOURS_IN_DAY)


def week_ahead_hourly_forecast(df):
    """Forecast each hour with `y` from the same hour one week earlier."""
    return df['y'].shift(periods=NUM_HOURS_IN_DAY*NUM_DAYS_IN_WEEK)


def month_ahead_hourly_forecast(df):
    """One month is exactly four weeks"""
    return df['y'].shift(periods=4*NUM_HOURS_IN_DAY*NUM_DAYS_IN_WEEK)


def year_ahead_hourly_forecast(df):
    """One year is exactly 52 weeks"""
    return df['y'].shift(periods=52*NUM_HOURS_IN_DAY*NUM_DAYS_IN_WEEK)


# n-step daily (one row per day)


def day_ahead_daily_forecast(df):
    """Forecast each day with the previous day's `y`."""
    return df['y'].shift(periods=1)


def week_ahead_daily_forecast(df):
    """Forecast each day with `y` from one week earlier."""
    return df['y'].shift(periods=NUM_DAYS_IN_WEEK)


def month_ahead_daily_forecast(df):
    """One month is exactly four weeks"""
    return df['y'].shift(periods=4*NUM_DAYS_IN_WEEK)


def year_ahead_daily_forecast(df):
    """One year is exactly 52 weeks"""
    return df['y'].shift(periods=52*NUM_DAYS_IN_WEEK)


# Collect baseline forecasts


def global_mean_forecast(df):
    """Return the mean of `y` over the whole frame (a single number)."""
    return df['y'].mean()


def hourly_forecasts(df):
    """Return a copy of hourly `df` with one column per hourly baseline."""
    forecasts = df.assign(
        hour_ahead_hourly_forecast=hour_ahead_hourly_forecast,
        day_ahead_hourly_forecast=day_ahead_hourly_forecast,
        week_ahead_hourly_forecast=week_ahead_hourly_forecast,
        month_ahead_hourly_forecast=month_ahead_hourly_forecast,
        year_ahead_hourly_forecast=year_ahead_hourly_forecast
    )
    return forecasts


def daily_forecasts(df):
    """
    Sum `df` to daily totals and add one column per daily baseline.

    `df` must carry timestamps either in a `ds` column or as its index.
    The result is indexed by day.
    """
    # Previously this function took no argument and read an undefined `df`,
    # so every call failed with NameError.
    on = 'ds' if 'ds' in df.columns else None
    forecasts = df.resample('1D', on=on).sum().assign(
        day_ahead_daily_forecast=day_ahead_daily_forecast,
        week_ahead_daily_forecast=week_ahead_daily_forecast,
        month_ahead_daily_forecast=month_ahead_daily_forecast,
        year_ahead_daily_forecast=year_ahead_daily_forecast
    )
    return forecasts
# Names of the four conditional intra-day seasonalities. Each is also the
# name of the boolean indicator column that switches it on.
_SEASON_WEEKDAY_CONDITIONS = (
    'winter_weekday',
    'winter_weekend',
    'summer_weekday',
    'summer_weekend',
)


def _new_prophet(**kwargs):
    """Construct a Prophet model, importing fbprophet on first use."""
    # Imported lazily so the calendar-indicator helpers below can be used
    # (and tested) without loading the fbprophet package.
    from fbprophet import Prophet
    return Prophet(**kwargs)


def default_prophet_model(df):
    """Fit and return a Prophet model with all default parameters."""
    model = _new_prophet()
    model.fit(df)
    return model


def multiplicative_prophet_model(df):
    """Fit and return a Prophet model with multiplicative seasonality."""
    model = _new_prophet(seasonality_mode='multiplicative')
    model.fit(df)
    return model


def seasonal_daily_prophet_model(df):
    """
    Fit and return a Prophet model with conditional daily seasonalities.

    The built-in daily seasonality is disabled and replaced by four daily
    (period=1) seasonalities, one per combination of summer/winter and
    weekday/weekend, plus US holidays. The indicator columns are added to a
    copy of `df`; the caller's frame is left untouched.
    """
    model = _new_prophet(
        daily_seasonality=False,
        yearly_seasonality=20,
        changepoint_prior_scale=0.001
    )
    for condition in _SEASON_WEEKDAY_CONDITIONS:
        model.add_seasonality(
            name=condition,
            period=1,
            fourier_order=12,
            condition_name=condition
        )
    model.add_country_holidays(country_name='US')
    df = add_season_weekday_indicators(df)
    model.fit(df)
    return model


def add_season_weekday_indicators(df):
    """
    Return a copy of `df` with the four boolean indicator columns added.

    Exactly one of `winter_weekday`, `winter_weekend`, `summer_weekday` and
    `summer_weekend` is True for each row, derived from the `ds` column.
    """
    timestamps = df['ds']
    return df.assign(
        winter_weekday=timestamps.apply(is_winter_weekday),
        winter_weekend=timestamps.apply(is_winter_weekend),
        summer_weekday=timestamps.apply(is_summer_weekday),
        summer_weekend=timestamps.apply(is_summer_weekend),
    )


def _is_summer(ds):
    """True for June through September; every other month is winter."""
    return 6 <= ds.month < 10


def _is_weekend(ds):
    """True on Saturday and Sunday (weekday() is 0 for Monday)."""
    return ds.weekday() >= 5


def is_winter_weekday(ds):
    """True if `ds` is a Monday-Friday in October through May."""
    return not _is_summer(ds) and not _is_weekend(ds)


def is_winter_weekend(ds):
    """True if `ds` is a Saturday or Sunday in October through May."""
    return not _is_summer(ds) and _is_weekend(ds)


def is_summer_weekday(ds):
    """True if `ds` is a Monday-Friday in June through September."""
    # The month test used to be `>= 6 or < 10`, which holds for every month.
    return _is_summer(ds) and not _is_weekend(ds)


def is_summer_weekend(ds):
    """True if `ds` is a Saturday or Sunday in June through September."""
    # The month test used to be `>= 6 or < 10`, which holds for every month.
    return _is_summer(ds) and _is_weekend(ds)