├── .env.default ├── .github └── workflows │ ├── ci_cd_ml_pipeline.yml │ └── ci_cd_web_app.yml ├── .gitignore ├── LICENSE ├── README.md ├── README_CICD.md ├── README_DEPLOY.md ├── airflow ├── .gitignore ├── Dockerfile ├── dags │ ├── .env.default │ ├── __init__.py │ └── ml_pipeline_dag.py ├── docker-compose.yaml ├── poetry.lock └── pyproject.toml ├── app-api ├── .dockerignore ├── .env.default ├── Dockerfile ├── README.md ├── api │ ├── __init__.py │ ├── __main__.py │ ├── application.py │ ├── config.py │ ├── schemas │ │ ├── __init__.py │ │ ├── area_values.py │ │ ├── consumer_type_values.py │ │ ├── health.py │ │ └── predictions.py │ └── views.py ├── poetry.lock ├── pyproject.toml └── run.sh ├── app-frontend ├── .dockerignore ├── .streamlit │ └── config.toml ├── Dockerfile ├── README.md ├── frontend │ ├── __init__.py │ ├── components.py │ ├── main.py │ └── settings.py ├── poetry.lock └── pyproject.toml ├── app-monitoring ├── .dockerignore ├── .streamlit │ └── config.toml ├── Dockerfile ├── README.md ├── monitoring │ ├── __init__.py │ ├── components.py │ ├── main.py │ └── settings.py ├── poetry.lock └── pyproject.toml ├── batch-prediction-pipeline ├── .env.default ├── README.md ├── batch_prediction_pipeline │ ├── __init__.py │ ├── batch.py │ ├── data.py │ ├── monitoring.py │ ├── settings.py │ └── utils.py ├── poetry.lock └── pyproject.toml ├── deploy ├── app-docker-compose.local.yml ├── app-docker-compose.yml └── ml-pipeline.sh ├── feature-pipeline ├── .env.default ├── README.md ├── feature_pipeline │ ├── __init__.py │ ├── clean_feature_store.py │ ├── etl │ │ ├── __init__.py │ │ ├── cleaning.py │ │ ├── extract.py │ │ ├── load.py │ │ └── validation.py │ ├── feature_view.py │ ├── pipeline.py │ ├── settings.py │ └── utils.py ├── poetry.lock └── pyproject.toml ├── images ├── airflow_login_screenshot.png ├── airflow_ml_pipeline_dag_overview_screenshot.png ├── airflow_ml_pipeline_dag_screenshot.png ├── airflow_variables_screenshot.png ├── architecture.png ├── forecasting_demo_screenshot.png ├── gcp_expose_ports_firewall_rule_screenshot.png ├── gcp_gcs_screenshot.png ├── gcp_iap_for_tcp_firewall_rule.png ├── gcp_ssh_screenshot.png ├── gcp_vm_external_ip_screenshot.png ├── github_actions_secrets_screenshot.png ├── github_actions_see_cicd_screenshot.png ├── github_actions_variables_screenshot.png ├── gmail.png ├── linkedin.png ├── medium.png ├── screenshot_introduction_video.png ├── substack.png └── twitter.png ├── scripts └── install_poetry_macos_m1_chip.sh └── training-pipeline ├── .env.default ├── README.md ├── poetry.lock ├── pyproject.toml └── training_pipeline ├── __init__.py ├── best_config.py ├── configs ├── __init__.py └── gridsearch.py ├── data.py ├── hyperparameter_tuning.py ├── models.py ├── settings.py ├── train.py ├── transformers.py └── utils.py /.env.default: -------------------------------------------------------------------------------- 1 | FS_API_KEY = "" 2 | FS_PROJECT_NAME = "" 3 | WANDB_API_KEY = "" 4 | WANDB_ENTITY = "teaching-mlops" 5 | WANDB_PROJECT = "energy_consumption" 6 | GOOGLE_CLOUD_PROJECT = "energy_consumption" 7 | GOOGLE_CLOUD_BUCKET_NAME = "hourly-batch-predictions" 8 | GOOGLE_CLOUD_SERVICE_ACCOUNT_JSON_PATH = "/absolute/path/to/your/service-account.json" 9 | -------------------------------------------------------------------------------- /.github/workflows/ci_cd_ml_pipeline.yml: -------------------------------------------------------------------------------- 1 | name: CD/CD for the ml-pipeline that builds all the pipeline modules and pushes them to the private PyPI 
registry. From where Airflow will install the latest versions and use them in the next run. 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - 'app-api/' 7 | - 'app-frontend/' 8 | - '**/*.yml' 9 | - '**/*.md' 10 | branches: [ "main" ] 11 | 12 | env: 13 | CLOUDSDK_CORE_PROJECT: '${{ vars.CLOUDSDK_CORE_PROJECT }}' 14 | USER: '${{ vars.USER }}' 15 | INSTANCE_NAME: '${{ vars.ML_PIPELINE_INSTANCE_NAME }}' 16 | ZONE: '${{ vars.ZONE }}' 17 | 18 | jobs: 19 | ci_cd: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: 'actions/checkout@v3' 23 | 24 | - id: 'auth' 25 | uses: 'google-github-actions/auth@v0' 26 | with: 27 | credentials_json: '${{ secrets.GCP_CREDENTIALS }}' 28 | - id: 'compute-ssh' 29 | uses: 'google-github-actions/ssh-compute@v0' 30 | with: 31 | project_id: '${{ env.CLOUDSDK_CORE_PROJECT }}' 32 | user: '${{ env.USER }}' 33 | instance_name: '${{ env.INSTANCE_NAME }}' 34 | zone: '${{ env.ZONE }}' 35 | ssh_private_key: '${{ secrets.GCP_SSH_PRIVATE_KEY }}' 36 | command: > 37 | cd ~/energy-forecasting && 38 | git pull && 39 | sh deploy/ml-pipeline.sh 40 | -------------------------------------------------------------------------------- /.github/workflows/ci_cd_web_app.yml: -------------------------------------------------------------------------------- 1 | name: CI/CD for the web app (API + frontend) 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - 'batch-prediction-pipeline/' 7 | - 'feature-pipeline/' 8 | - 'training-pipeline' 9 | - '**/*.yml' 10 | - '**/*.md' 11 | branches: [ "main" ] 12 | 13 | env: 14 | CLOUDSDK_CORE_PROJECT: '${{ vars.CLOUDSDK_CORE_PROJECT }}' 15 | USER: '${{ vars.USER }}' 16 | INSTANCE_NAME: '${{ vars.APP_INSTANCE_NAME }}' 17 | ZONE: '${{ vars.ZONE }}' 18 | 19 | jobs: 20 | ci_cd: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: 'actions/checkout@v3' 24 | 25 | - id: 'auth' 26 | uses: 'google-github-actions/auth@v0' 27 | with: 28 | credentials_json: '${{ secrets.GCP_CREDENTIALS }}' 29 | - id: 'compute-ssh' 30 | uses: 'google-github-actions/ssh-compute@v0' 31 | with: 32 | project_id: '${{ env.CLOUDSDK_CORE_PROJECT }}' 33 | user: '${{ env.USER }}' 34 | instance_name: '${{ env.INSTANCE_NAME }}' 35 | zone: '${{ env.ZONE }}' 36 | ssh_private_key: '${{ secrets.GCP_SSH_PRIVATE_KEY }}' 37 | command: > 38 | cd ~/energy-forecasting && 39 | git pull && 40 | docker compose -f deploy/app-docker-compose.yml --project-directory . up --build -d 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | */.env 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | # IDEs 133 | .idea 134 | .vscode 135 | 136 | # Tools 137 | mage_data/ 138 | wandb/ 139 | energy_consumption/ 140 | 141 | # Data 142 | *.parquet 143 | models/ 144 | output/ 145 | artifacts/ 146 | 147 | # Models 148 | *.pkl 149 | 150 | # Sensitive 151 | credentials/ 152 | .hw_api_key 153 | 154 | # Local notes 155 | deploy_hardcoded_steps.md 156 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Paul Iusztin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README_CICD.md: -------------------------------------------------------------------------------- 1 | # The Full Stack 7-Steps MLOps Framework 2 | 3 | Congratulations, you are close to the whole experience if you reached this far. This is the last step from the 7 lessons of the course. 4 | 5 | **NOTE:** You can finish this lesson only if you deploy your code to GCP. If you haven't. [Check out this section to see how.](https://github.com/iusztinpaul/energy-forecasting/blob/main/README_DEPLOY.md) 6 | 7 | [Access Lesson 7 on Medium for more detailed step-by-step instructions.](https://towardsdatascience.com/seamless-ci-cd-pipelines-with-github-actions-on-gcp-your-tools-for-effective-mlops-96f676f72012). 8 | 9 | # CI/CD 10 | 11 | We will use GitHub Actions to create the CI/CD pipeline. GitHub Actions will let us run various commands on specific triggers, such as a new commit to a branch. 12 | 13 | ## Fork the Repository 14 | 15 | By forking the repository, you will create the exact identical copy of the code on your own GitHub account. Thus, you will have full access to the settings of the repository. 16 | 17 | [Check out this doc to see how to fork a repository on GitHub.](https://docs.github.com/en/get-started/quickstart/fork-a-repo) 18 | 19 | ## Set Actions Variables 20 | 21 | Go to your forked repository. After go to `Settings` -> `Secrets and variables` (in the Security tab) -> `Actions`. 22 | 23 | Now, click `Variables`. You can create a new variable from that section by clicking `New repository variable`. 24 | 25 | See the image below 👇 26 | 27 |

28 | 29 |

30 | 31 | You have to create 5 variables that will be used by the GitHub Actions scripts: 32 | * `APP_INSTANCE_NAME` : the name of the web app VM. | In our case, it is `app`. The default should be ok if you use our recommended naming conventions. 33 | * `CLOUDSDK_CORE_PROJECT` : the ID of your GCP project (this is the variable name the workflow files read). | Here, you have to replace it with your own project ID. 34 | * `ML_PIPELINE_INSTANCE_NAME` : the name of the ML pipeline VM. | In our case, it is `ml-pipeline`. The default should be ok if you use our recommended naming conventions. 35 | * `USER`: the user you used to connect to the VMs while setting up the machine using the SSH connection. | Mine was `pauliusztin`, but you must replace it with yours. 36 | * `ZONE` : the zone where you deployed the VMs. | The default should be ok if you use our recommended naming conventions. 37 | 38 | ## Set Secrets 39 | 40 | In the same `Secrets and variables/Actions` section, hit the `Secrets` tab. 41 | 42 | You can create a new secret by pressing the `New repository secret` button. 43 | 44 | These are similar to the variables we just created, but after you fill in their values, you can't see them anymore. That is why they are called secrets. Here is where you add all your sensitive information: in our case, the GCP credentials and private keys. 45 | 46 | See the image below 👇 47 | 48 |

49 | 50 |

51 | 52 | The `GCP_CREDENTIALS` secret contains the content of the JSON key of your VM admin service account. By setting this up, the CI/CD pipeline will use that service account to authenticate to the VMs. 53 | 54 | Because the key file is in JSON format, you first have to compact it into a single line by running the following commands: 55 | 56 | Install the jq CLI: 57 | ```shell 58 | sudo apt update 59 | sudo apt install -y jq 60 | jq --version 61 | ``` 62 | Format your JSON key file: 63 | ```shell 64 | jq -c . /path/to/your/admin-vm.json 65 | ``` 66 | Take the output of this command and create your `GCP_CREDENTIALS` secret with it. 67 | 68 | The `GCP_SSH_PRIVATE_KEY` is your GCP private SSH key (not your personal one - GCP creates an additional one automatically), which was created on your local computer when you used SSH to connect to the VMs. 69 | 70 | To copy it, run the following: 71 | ```shell 72 | cd ~/.ssh 73 | cat google_compute_engine 74 | ``` 75 | Copy the output from the terminal and create the `GCP_SSH_PRIVATE_KEY` secret. 76 | 77 | 78 | ## Run the CI/CD Pipeline 79 | 80 | Now make any change to the code, push it to the main branch, and the GitHub Actions workflows should trigger automatically. 81 | 82 | To see their results, check your GitHub repository's `Actions` tab. 83 | 84 |

85 | 86 |

87 | 88 | Two actions will be triggered. One will build and deploy the `ml-pipeline` modules to your `ml-pipeline` GCP VM, and one will build and deploy the `web app` to your `app` GCP VM. 89 | 90 | If you want to understand better how we wrote the GitHub Actions scripts under the `.github/workflows` directory [check out the **"CI/CD Pipeline Using GitHub Actions"** section of Lesson 7 on Medium](https://towardsdatascience.com/seamless-ci-cd-pipelines-with-github-actions-on-gcp-your-tools-for-effective-mlops-96f676f72012) that explains everything in detail. 91 | -------------------------------------------------------------------------------- /README_DEPLOY.md: -------------------------------------------------------------------------------- 1 | # The Full Stack 7-Steps MLOps Framework 2 | 3 | ## Deploy to GCP 4 | 5 | This step must only be finished if you want to deploy the code on GCP VMs and build the CI/CD with GitHub Actions. 6 | 7 | Note that this step might result in a few costs on GCP. It won't be much. While developing this course, I spent only ~20$, which will probably be less for you. 8 | 9 | Also, you can get some free credits if you have a new GCP account (I had 300$ free credits). Just be sure to delete the resources after you finish the course. 10 | 11 | [Access Lesson 7 on Medium for more detailed step-by-step instructions.](https://towardsdatascience.com/seamless-ci-cd-pipelines-with-github-actions-on-gcp-your-tools-for-effective-mlops-96f676f72012). 12 | 13 | ------ 14 | 15 | ## General Set Up 16 | 17 | Before setting up the code, you must go to your GCP project and create a few additional resources. After, you can SSH to your machines and deploy your code. 18 | 19 | #### GCP Resources 20 | 21 | ### Admin VM Service Account with IAP Access 22 | 23 | We need a new GCP service account with admin rights & IAP access when working with GCP VMs. You have to create a new service account and assign to the new service account the following roles: 24 | * Compute Instance Admin (v1) 25 | * IAP-secured Tunnel User 26 | * Service Account Token Creator 27 | * Service Account User 28 | 29 | IAP stands for Identity-Aware Proxy. It is a way to create tunnels that route TCP traffic. For your knowledge, you can read more about this topic using the following docs (you don't have to fully understand it to proceed to the next steps): 30 | * [Using IAP for TCP forwarding](https://cloud.google.com/iap/docs/using-tcp-forwarding) 31 | * [Overview of TCP forwarding](https://cloud.google.com/iap/docs/tcp-forwarding-overview) 32 | 33 | ### Expose Ports Firewall Rule 34 | 35 | Create a firewall rule that exposes the following TCP ports: 8501, 8502, 8001. 36 | 37 | Also, add a `target tag` called `energy-forecasting-expose-ports`. 38 | 39 | Here is how my firewall rule looks like: 40 | 41 |

42 | 43 |
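If you prefer the CLI to the console, the same rule can be sketched with `gcloud`. The rule name and the `0.0.0.0/0` source range below are my assumptions (the ports must be publicly reachable for the web app); only the ports and the `energy-forecasting-expose-ports` target tag come from the steps above:
```shell
gcloud compute firewall-rules create energy-forecasting-expose-ports \
    --network=default \
    --direction=INGRESS \
    --action=ALLOW \
    --rules=tcp:8501,tcp:8502,tcp:8001 \
    --target-tags=energy-forecasting-expose-ports \
    --source-ranges=0.0.0.0/0
```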

44 | 45 | Here are 2 docs that helped me create and configure the ports for the firewall rule: 46 | * [Doc 1](https://stackoverflow.com/questions/21065922/how-to-open-a-specific-port-such-as-9090-in-google-compute-engine) 47 | * [Doc 2](https://www.howtogeek.com/devops/how-to-open-firewall-ports-on-a-gcp-compute-engine-instance/) 48 | 49 | 50 | ### IAP for TCP Tunneling Firewall Rule 51 | 52 | Now we will create a firewall rule allowing IAP for TCP Tunneling on all the VMs connected to the `default` network. 53 | 54 | [Docs on how to create the firewall rule.](https://cloud.google.com/iap/docs/using-tcp-forwarding#preparing_your_project_for_tcp_forwarding) 55 | 56 | Here is how my firewall rule looks like: 57 | 58 |

59 | 60 |
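As a rough CLI equivalent of the console flow in the docs linked above, you could create the rule like this. The rule name is my choice, `35.235.240.0/20` is the IAP source range documented by Google, and this sketch only opens SSH (port 22) to that range; double-check the values against the official docs:
```shell
gcloud compute firewall-rules create allow-ingress-from-iap \
    --network=default \
    --direction=INGRESS \
    --action=ALLOW \
    --rules=tcp:22 \
    --source-ranges=35.235.240.0/20
```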

61 | 62 | 63 | ### VM for the Pipeline 64 | 65 | Go to your GCP project -> `VM Instances` -> `Create Instance` 66 | 67 | Choose `e2-standard-2: 2 vCPU cores - 8 GB RAM` as your VM instance type. 68 | 69 | Call it: `ml-pipeline` 70 | 71 | Change the disk to `20 GB Storage` 72 | 73 | Pick region `europe-west3 (Frankfurt)` and zone `europe-west3-c` 74 | 75 | Network: `default` 76 | 77 | Also, check the `HTTP` and `HTTPS` boxes and add the `energy-forecasting-expose-ports` custom firewall rule we did a few steps back. 78 | 79 | Here are 2 docs that helped me create and configure the ports for the firewall rule: 80 | * [Doc 1](https://stackoverflow.com/questions/21065922/how-to-open-a-specific-port-such-as-9090-in-google-compute-engine) 81 | * [Doc 2](https://www.howtogeek.com/devops/how-to-open-firewall-ports-on-a-gcp-compute-engine-instance/) 82 | 83 | 84 | ### VM for the Web App 85 | 86 | Go to your GCP project -> `VM Instances` -> `Create Instance` 87 | 88 | Choose: `e2-micro: 0.25 2 vCPU - 1 GB memory` as your VM instance type. 89 | 90 | Call it: `app` 91 | 92 | Change the disk to: `15 GB standard persisted disk` 93 | 94 | Pick region `europe-west3 (Frankfurt)` and zone `europe-west3-c` 95 | 96 | Network: `default` 97 | 98 | Also, check the `HTTP` and `HTTPS` boxes and add the `energy-forecasting-expose-ports` custom firewall rule we created a few steps back. 99 | 100 | Here are 2 docs that helped me create and configure the ports for the firewall rule: 101 | * [Doc 1](https://stackoverflow.com/questions/21065922/how-to-open-a-specific-port-such-as-9090-in-google-compute-engine) 102 | * [Doc 2](https://www.howtogeek.com/devops/how-to-open-firewall-ports-on-a-gcp-compute-engine-instance/) 103 | 104 | 105 | ### External Static IP 106 | 107 | If we want the external IP for our web app to be static (aka not to change), we have to attach a static address to our web app VM. 108 | 109 | More precisely, we suggest adding it only to the `app` VM we created a few steps ahead. 110 | 111 | That is perfectly fine if you want to also add a static external IP to the `ml-pipeline` VM. 112 | 113 | [Docs on reserving a static external IP address.](https://cloud.google.com/compute/docs/ip-addresses/reserve-static-external-ip-address) 114 | 115 | 116 | ---- 117 | 118 | #### Now that the boring part is finished, let's start deploying the code 👇 👇 👇 119 | 120 | 121 | ## Deploy - General Steps 122 | 123 | ### Configure Your Service Account 124 | 125 | We will use your service account configured with admin rights for VMs and IAP access to SSH from your local machine to the GCP VMs. 126 | 127 | First, we must tell the `gcloud` GCP CLI to use that service account. 128 | 129 | To do so, you have to create a key for your service account and download it as a JSON file (same as you did for the buckets service accounts - [here are some docs to refresh your mind](https://cloud.google.com/iam/docs/keys-create-delete)). 130 | 131 | After you download the file, you just have to run the following `gcloud` command in your terminal: 132 | ```shell 133 | gcloud auth activate-service-account SERVICE_ACCOUNT@DOMAIN.COM --key-file=/path/key.json --project=PROJECT_ID 134 | ``` 135 | 136 | [Check out this doc for more details about the gcloud auth command](https://cloud.google.com/sdk/gcloud/reference/auth/activate-service-account). 137 | 138 | Now whenever you run commands with `gcloud`, it will use this service account to authenticate. 
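To quickly confirm that the service account is the active identity (and that `gcloud` points at the right project) before moving on, you can run these standard `gcloud` commands:
```shell
gcloud auth list
gcloud config get-value project
```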
139 | 140 | 141 | ## Deploy - The Pipeline 142 | 143 | Let's connect through SSH to the `ml-pipeline` GCP VM you created a few steps ahead: 144 | ```shell 145 | gcloud compute ssh ml-pipeline --zone europe-west3-c --quiet --tunnel-through-iap --project 146 | ``` 147 | **NOTE 1:** Change the `zone` if you haven't created a VM within the same zone as us.
148 | **NOTE 2:** Your `project-id` is NOT your `project name`. Go to your GCP projects list and find the project ID. 149 | 150 | From this point on, if you configured the firewalls and the service account correctly, and because everything is Dockerized, all the steps will be 99% similar to the ones from the [Set Up Additional Tools](https://github.com/iusztinpaul/energy-forecasting#tools) and [Usage](https://github.com/iusztinpaul/energy-forecasting#usage) sections. 151 | 152 | You can follow the same steps while your terminal has an SSH connection with the GCP machine. 153 | 154 | Note that the GCP machine is a Linux machine. Thus, you can directly copy & paste the commands from the README regardless of the OS you use on your local machine. 155 | 156 |

157 | 158 |

159 | 160 | Now you must repeat all the steps you followed to set up `The Pipeline` locally, using this SSH connection. 161 | 162 | ### BUT YOU HAVE TO KEEP IN MIND THE FOLLOWING: 163 | 164 | **Clone the code in the home directory of the VM:** 165 | 166 | Just SSH to the VM and run: 167 | ```shell 168 | git clone https://github.com/iusztinpaul/energy-forecasting.git 169 | cd energy-forecasting 170 | ``` 171 | 172 | **Install Docker using the following commands:**<br/>

173 | Install Docker: 174 | ```shell 175 | sudo apt update 176 | sudo apt install --yes apt-transport-https ca-certificates curl gnupg2 software-properties-common 177 | curl -fsSL https://download.docker.com/linux/debian/gpg | sudo apt-key add - 178 | sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable" 179 | sudo apt update 180 | sudo apt install --yes docker-ce 181 | ``` 182 | Add sudo access to Docker: 183 | ```shell 184 | sudo usermod -aG docker $USER 185 | logout 186 | ``` 187 | Login again to your machine: 188 | ```shell 189 | gcloud compute ssh ml-pipeline --zone europe-west3-c --quiet --tunnel-through-iap --project 190 | ``` 191 | 192 | [Check out these docs for the full instructions.](https://tomroth.com.au/gcp-docker/) 193 | 194 | **Replace all `cp` commands with `gcloud compute scp`:**

195 | 196 | This command will help you to copy files from your local machine to the VM. 197 | 198 | For example, instead of running: 199 | ```shell 200 | cp -r /path/to/admin/gcs/credentials/admin-buckets.json credentials/gcp/energy_consumption 201 | ``` 202 | Run in a different terminal (not the one connected with SSH to your VM): 203 | ```shell 204 | gcloud compute scp --recurse --zone europe-west3-c --quiet --tunnel-through-iap --project /local/path/to/admin-buckets.json ml-pipeline:~/energy-forecasting/airflow/dags/credentials/gcp/energy_consumption/ 205 | ``` 206 | This command will copy your local `admin-buckets.json` file to the `ml-pipeline` VM. 207 | 208 | 209 | **!!!** This is all you need to know. All the other steps are the same as the ones run locally. Only Docker has a slightly different installation, and you need a different way to copy files from your local machine to the VM. 210 | 211 | 212 | Now to access the Airflow application, go to your VM view from GCP and go to the `Network tags` section. You will find the `External IP address` column, as shown in the image below. Copy that IP and attach port `8080` to it. 213 | 214 | For example, based on the `External IP address` from the image below, I accessed Airflow using this address: `35.207.134.188:8080`. 215 | 216 | Congrats! You connected to your own self-hosted Airflow application. 217 | 218 | If it doesn't connect, give it a few seconds to load properly. 219 | 220 |

221 | 222 |
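A quick way to check the webserver from any terminal is to hit its `/health` endpoint, which is the same endpoint the `docker-compose.yaml` healthcheck uses (replace the address with your VM's external IP):
```shell
curl http://<EXTERNAL_IP>:8080/health
```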

223 | 224 | 225 | ## Deploy - The Web App 226 | Let's connect through SSH to the `app` GCP VM you created a few steps ahead: 227 | ```shell 228 | gcloud compute ssh app --zone europe-west3-c --quiet --tunnel-through-iap --project 229 | ``` 230 | **NOTE 1:** Change the `zone` if you haven't created a VM within the same zone as us.
231 | **NOTE 2:** Your `project-id` is NOT your `project name`. Go to your GCP projects list and find the project id. 232 | 233 | Here the process is similar to deploying the ML Pipeline. 234 | 235 | You can deploy the web app following the exact same steps described in [Lesson 6](https://towardsdatascience.com/fastapi-and-streamlit-the-python-duo-you-must-know-about-72825def1243) or in the [Set Up Additional Tools](https://github.com/iusztinpaul/energy-forecasting#tools) & [Usage](https://github.com/iusztinpaul/energy-forecasting#usage) sections of the GitHub repository. 236 | 237 | But don't forget to keep in mind the same edge cases described in the [Deploy - The Pipeline](https://github.com/iusztinpaul/energy-forecasting/blob/main/README_DEPLOY.md#deploy---the-pipeline) section. 238 | 239 | ---- 240 | 241 | 👀 **As you can see, here you have done everything manually. If you want to know how to create a simple CI/CD using GitHub Actions [check out this section](https://github.com/iusztinpaul/energy-forecasting/blob/main/README_CICD.md).** 242 | -------------------------------------------------------------------------------- /airflow/.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | .env 3 | -------------------------------------------------------------------------------- /airflow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:2.7.0 2 | 3 | ARG CURRENT_USER=$USER 4 | 5 | USER root 6 | # Install Python dependencies to be able to process the wheels from the private PyPI server. 7 | RUN apt-get -y update && ACCEPT_EULA=Y apt-get -y upgrade 8 | RUN apt-get install -y python3.9-distutils python3.9-dev build-essential 9 | USER ${CURRENT_USER} 10 | -------------------------------------------------------------------------------- /airflow/dags/.env.default: -------------------------------------------------------------------------------- 1 | FS_API_KEY = "" 2 | FS_PROJECT_NAME = "" 3 | WANDB_API_KEY = "" 4 | WANDB_ENTITY = "teaching-mlops" 5 | WANDB_PROJECT = "energy_consumption" 6 | GOOGLE_CLOUD_PROJECT = "energy_consumption" 7 | GOOGLE_CLOUD_BUCKET_NAME = "hourly-batch-predictions" 8 | GOOGLE_CLOUD_SERVICE_ACCOUNT_JSON_PATH = "/opt/airflow/dags/credentials/gcp/energy_consumption/admin-buckets.json" 9 | -------------------------------------------------------------------------------- /airflow/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/airflow/dags/__init__.py -------------------------------------------------------------------------------- /airflow/dags/ml_pipeline_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow.decorators import dag, task 4 | from airflow.models import Variable 5 | from airflow.operators.empty import EmptyOperator 6 | from airflow.utils.trigger_rule import TriggerRule 7 | from airflow.utils.edgemodifier import Label 8 | 9 | 10 | @dag( 11 | dag_id="ml_pipeline", 12 | schedule="@hourly", 13 | start_date=datetime(2023, 4, 14), 14 | catchup=False, 15 | tags=["feature-engineering", "model-training", "batch-prediction"], 16 | max_active_runs=1, 17 | ) 18 | def ml_pipeline(): 19 | @task.virtualenv( 20 | task_id="run_feature_pipeline", 21 | requirements=[ 22 | "--trusted-host 172.17.0.1", 23 | "--extra-index-url 
http://172.17.0.1", 24 | "feature_pipeline", 25 | ], 26 | python_version="3.9", 27 | multiple_outputs=True, 28 | system_site_packages=True, 29 | ) 30 | def run_feature_pipeline( 31 | export_end_reference_datetime: str, 32 | days_delay: int, 33 | days_export: int, 34 | url: str, 35 | feature_group_version: int, 36 | ) -> dict: 37 | """ 38 | Run the feature pipeline. 39 | 40 | Args: 41 | export_end_reference_datetime: The end reference datetime of the export window. If None, the current time is used. 42 | Because the data is always delayed with "days_delay" days, this date is used only as a reference point. 43 | The real extracted window will be computed as [export_end_reference_datetime - days_delay - days_export, export_end_reference_datetime - days_delay]. 44 | 45 | days_delay : int 46 | Data has a delay of N days. Thus, we have to shift our window with N days. 47 | 48 | days_export : int 49 | The number of days to export. 50 | 51 | url : str 52 | URL to the raw data. 53 | 54 | feature_group_version : int 55 | Version of the feature store feature group to use. 56 | 57 | Returns: 58 | Metadata of the feature pipeline run. 59 | """ 60 | 61 | from datetime import datetime 62 | 63 | from feature_pipeline import utils, pipeline 64 | 65 | logger = utils.get_logger(__name__) 66 | 67 | try: 68 | export_end_reference_datetime = datetime.strptime( 69 | export_end_reference_datetime, "%Y-%m-%d %H:%M:%S.%f%z" 70 | ) 71 | except ValueError: 72 | export_end_reference_datetime = datetime.strptime( 73 | export_end_reference_datetime, "%Y-%m-%d %H:%M:%S%z" 74 | ) 75 | export_end_reference_datetime = export_end_reference_datetime.replace( 76 | microsecond=0, tzinfo=None 77 | ) 78 | 79 | logger.info(f"export_end_datetime = {export_end_reference_datetime}") 80 | logger.info(f"days_delay = {days_delay}") 81 | logger.info(f"days_export = {days_export}") 82 | logger.info(f"url = {url}") 83 | logger.info(f"feature_group_version = {feature_group_version}") 84 | 85 | return pipeline.run( 86 | export_end_reference_datetime=export_end_reference_datetime, 87 | days_delay=days_delay, 88 | days_export=days_export, 89 | url=url, 90 | feature_group_version=feature_group_version, 91 | ) 92 | 93 | @task.virtualenv( 94 | task_id="create_feature_view", 95 | requirements=[ 96 | "--trusted-host 172.17.0.1", 97 | "--extra-index-url http://172.17.0.1", 98 | "feature_pipeline", 99 | ], 100 | python_version="3.9", 101 | multiple_outputs=True, 102 | system_site_packages=False, 103 | ) 104 | def create_feature_view(feature_pipeline_metadata: dict) -> dict: 105 | """ 106 | This function creates a feature view based on the feature pipeline computations. The feature view 107 | is created using the feature group version from the feature pipeline metadata. 108 | """ 109 | 110 | from feature_pipeline import feature_view 111 | 112 | return feature_view.create( 113 | feature_group_version=feature_pipeline_metadata["feature_group_version"] 114 | ) 115 | 116 | @task.virtualenv( 117 | task_id="run_hyperparameter_tuning", 118 | requirements=[ 119 | "--trusted-host 172.17.0.1", 120 | "--extra-index-url http://172.17.0.1", 121 | "training_pipeline", 122 | ], 123 | python_version="3.9", 124 | multiple_outputs=True, 125 | system_site_packages=False, 126 | ) 127 | def run_hyperparameter_tuning(feature_view_metadata: dict) -> dict: 128 | """ 129 | This function runs hyperparameter tuning for the training pipeline. 
130 | The feature store feature view version and training dataset version are passed 131 | based on the results from the create_feature_view task. 132 | """ 133 | 134 | from training_pipeline import hyperparameter_tuning 135 | 136 | return hyperparameter_tuning.run( 137 | feature_view_version=feature_view_metadata["feature_view_version"], 138 | training_dataset_version=feature_view_metadata["training_dataset_version"], 139 | ) 140 | 141 | @task.virtualenv( 142 | task_id="upload_best_config", 143 | requirements=[ 144 | "--trusted-host 172.17.0.1", 145 | "--extra-index-url http://172.17.0.1", 146 | "training_pipeline", 147 | ], 148 | python_version="3.9", 149 | multiple_outputs=False, 150 | system_site_packages=False, 151 | ) 152 | def upload_best_config(last_sweep_metadata: dict): 153 | """ 154 | Upload the best config to W&B ML platform found in the hyperparameter tuning step 155 | based on the given sweep id. 156 | """ 157 | 158 | from training_pipeline import best_config 159 | 160 | best_config.upload(sweep_id=last_sweep_metadata["sweep_id"]) 161 | 162 | @task.virtualenv( 163 | task_id="train_from_best_config", 164 | requirements=[ 165 | "--trusted-host 172.17.0.1", 166 | "--extra-index-url http://172.17.0.1", 167 | "training_pipeline", 168 | ], 169 | python_version="3.9", 170 | multiple_outputs=True, 171 | system_site_packages=False, 172 | trigger_rule=TriggerRule.ALL_DONE, 173 | ) 174 | def train_from_best_config(feature_view_metadata: dict) -> dict: 175 | """Trains model from the best config found in hyperparameter tuning. 176 | 177 | Args: 178 | feature_view_metadata (dict): Contains feature store feature view and training dataset version. 179 | 180 | Returns: 181 | metadata from the training run 182 | """ 183 | 184 | from training_pipeline import utils, train 185 | 186 | has_best_config = utils.check_if_artifact_exists("best_config") 187 | if has_best_config is False: 188 | raise RuntimeError( 189 | "No best config found. Please run hyperparameter tuning first." 190 | ) 191 | 192 | return train.from_best_config( 193 | feature_view_version=feature_view_metadata["feature_view_version"], 194 | training_dataset_version=feature_view_metadata["training_dataset_version"], 195 | ) 196 | 197 | @task.virtualenv( 198 | task_id="compute_monitoring", 199 | requirements=[ 200 | "--trusted-host 172.17.0.1", 201 | "--extra-index-url http://172.17.0.1", 202 | "batch_prediction_pipeline", 203 | ], 204 | python_version="3.9", 205 | system_site_packages=False, 206 | ) 207 | def compute_monitoring(feature_view_metadata: dict): 208 | """Compute monitoring metrics for newly obbserved data. 209 | 210 | Args: 211 | feature_view_metadata: metadata containing the version of the feature store feature view version. 
212 | """ 213 | 214 | from batch_prediction_pipeline import monitoring 215 | 216 | monitoring.compute( 217 | feature_view_version=feature_view_metadata["feature_view_version"], 218 | ) 219 | 220 | @task.virtualenv( 221 | task_id="batch_predict", 222 | requirements=[ 223 | "--trusted-host 172.17.0.1", 224 | "--extra-index-url http://172.17.0.1", 225 | "batch_prediction_pipeline", 226 | ], 227 | python_version="3.9", 228 | system_site_packages=False, 229 | ) 230 | def batch_predict( 231 | feature_view_metadata: dict, 232 | train_metadata: dict, 233 | feature_pipeline_metadata: dict, 234 | fh: int = 24, 235 | ): 236 | """ 237 | This is the function that runs the batch prediction pipeline 238 | 239 | Args: 240 | feature_view_metadata (dict): the metadata from the create feature view task 241 | train_metadata (dict): the metadata from the training pipeline task 242 | feature_pipeline_metadata (dict): the metadata from the feature pipeline task 243 | fh (int, optional): forecast horizon. Defaults to 24. 244 | """ 245 | 246 | from datetime import datetime 247 | from batch_prediction_pipeline import batch 248 | 249 | start_datetime = datetime.strptime( 250 | feature_pipeline_metadata["export_datetime_utc_start"], 251 | feature_pipeline_metadata["datetime_format"], 252 | ) 253 | end_datetime = datetime.strptime( 254 | feature_pipeline_metadata["export_datetime_utc_end"], 255 | feature_pipeline_metadata["datetime_format"], 256 | ) 257 | 258 | batch.predict( 259 | fh=fh, 260 | feature_view_version=feature_view_metadata["feature_view_version"], 261 | model_version=train_metadata["model_version"], 262 | start_datetime=start_datetime, 263 | end_datetime=end_datetime, 264 | ) 265 | 266 | @task.branch(task_id="if_run_hyperparameter_tuning_branching") 267 | def if_run_hyperparameter_tuning_branching(run_hyperparameter_tuning: bool) -> bool: 268 | """Task used to branch between hyperparameter tuning and skipping it.""" 269 | if run_hyperparameter_tuning is True: 270 | return ["branch_run_hyperparameter_tuning"] 271 | else: 272 | return ["branch_skip_hyperparameter_tuning"] 273 | 274 | # Define empty operators used for branching between hyperparameter tuning and skipping it. 275 | branch_run_hyperparameter_tuning_operator = EmptyOperator( 276 | task_id="branch_run_hyperparameter_tuning" 277 | ) 278 | branch_skip_hyperparameter_tuning_operator = EmptyOperator( 279 | task_id="branch_skip_hyperparameter_tuning" 280 | ) 281 | 282 | # Define Airflow variables. 
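    # Note: these values can be overridden from the Airflow UI (Admin -> Variables) or with the
    # `airflow variables set <key> <value>` CLI; the `default_var` fallbacks below are used otherwise.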
283 | days_delay = int(Variable.get("ml_pipeline_days_delay", default_var=15)) 284 | days_export = int(Variable.get("ml_pipeline_days_export", default_var=30)) 285 | url = Variable.get( 286 | "ml_pipeline_url", 287 | default_var="https://drive.google.com/uc?export=download&id=1y48YeDymLurOTUO-GeFOUXVNc9MCApG5", 288 | ) 289 | feature_group_version = int( 290 | Variable.get("ml_pipeline_feature_group_version", default_var=1) 291 | ) 292 | should_run_hyperparameter_tuning = ( 293 | Variable.get( 294 | "ml_pipeline_should_run_hyperparameter_tuning", default_var="False" 295 | ) 296 | == "True" 297 | ) 298 | 299 | # Feature pipeline 300 | feature_pipeline_metadata = run_feature_pipeline( 301 | export_end_reference_datetime="{{ dag_run.logical_date }}", 302 | days_delay=days_delay, 303 | days_export=days_export, 304 | url=url, 305 | feature_group_version=feature_group_version, 306 | ) 307 | feature_view_metadata = create_feature_view(feature_pipeline_metadata) 308 | 309 | # Training pipeline 310 | if_run_hyperparameter_tuning_branch = if_run_hyperparameter_tuning_branching( 311 | should_run_hyperparameter_tuning 312 | ) 313 | last_sweep_metadata = run_hyperparameter_tuning(feature_view_metadata) 314 | upload_best_model_step = upload_best_config(last_sweep_metadata) 315 | train_metadata = train_from_best_config(feature_view_metadata) 316 | 317 | # Batch prediction pipeline 318 | compute_monitoring_step = compute_monitoring(feature_view_metadata) 319 | batch_predict_step = batch_predict( 320 | feature_view_metadata, train_metadata, feature_pipeline_metadata 321 | ) 322 | 323 | # Define DAG structure. 324 | ( 325 | feature_view_metadata 326 | >> if_run_hyperparameter_tuning_branch 327 | >> [ 328 | if_run_hyperparameter_tuning_branch 329 | >> Label("Run HPO") 330 | >> branch_run_hyperparameter_tuning_operator 331 | >> last_sweep_metadata 332 | >> upload_best_model_step, 333 | if_run_hyperparameter_tuning_branch 334 | >> Label("Skip HPO") 335 | >> branch_skip_hyperparameter_tuning_operator, 336 | ] 337 | >> train_metadata 338 | >> compute_monitoring_step 339 | >> batch_predict_step 340 | ) 341 | 342 | 343 | ml_pipeline() 344 | -------------------------------------------------------------------------------- /airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 
22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.7.0 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed. 31 | # Default: . 32 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 33 | # 34 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 35 | # Default: airflow 36 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 37 | # Default: airflow 38 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 39 | # Use this option ONLY for quick checks. Installing requirements at container 40 | # startup is done EVERY TIME the service is started. 41 | # A better way is to build a custom image or extend the official image 42 | # as described in https://airflow.apache.org/docs/docker-stack/build.html. 43 | # Default: '' 44 | # 45 | # Feel free to modify this file to suit your needs. 46 | --- 47 | version: '3.8' 48 | x-airflow-common: 49 | &airflow-common 50 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 51 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 52 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 53 | # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.7.0} 54 | build: . 55 | environment: 56 | &airflow-common-env 57 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 58 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 59 | # For backward compatibility, with Airflow <2.3 60 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 61 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 62 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 63 | AIRFLOW__CORE__FERNET_KEY: '' 64 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 65 | AIRFLOW__CORE__LOAD_EXAMPLES: 'true' 66 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' 67 | # yamllint disable rule:line-length 68 | # Use simple http server on scheduler for health checks 69 | # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server 70 | # yamllint enable rule:line-length 71 | AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' 72 | # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks 73 | # for other purpose (development, test and especially production usage) build/extend Airflow image. 
74 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 75 | 76 | # ML pipeline Custom Environment Variables 77 | ML_PIPELINE_ROOT_DIR: ${ML_PIPELINE_ROOT_DIR:-/opt/airflow/dags} 78 | volumes: 79 | - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags 80 | - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs 81 | - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config 82 | - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins 83 | user: "${AIRFLOW_UID:-50000}:0" 84 | depends_on: 85 | &airflow-common-depends-on 86 | redis: 87 | condition: service_healthy 88 | postgres: 89 | condition: service_healthy 90 | 91 | services: 92 | postgres: 93 | image: postgres:13 94 | platform: linux/amd64 95 | environment: 96 | POSTGRES_USER: airflow 97 | POSTGRES_PASSWORD: airflow 98 | POSTGRES_DB: airflow 99 | volumes: 100 | - postgres-db-volume:/var/lib/postgresql/data 101 | healthcheck: 102 | test: ["CMD", "pg_isready", "-U", "airflow"] 103 | interval: 10s 104 | retries: 5 105 | start_period: 5s 106 | restart: always 107 | 108 | redis: 109 | image: redis:latest 110 | expose: 111 | - 6379 112 | healthcheck: 113 | test: ["CMD", "redis-cli", "ping"] 114 | interval: 10s 115 | timeout: 30s 116 | retries: 50 117 | start_period: 30s 118 | restart: always 119 | 120 | airflow-webserver: 121 | <<: *airflow-common 122 | command: webserver 123 | ports: 124 | - "8080:8080" 125 | healthcheck: 126 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 127 | interval: 30s 128 | timeout: 10s 129 | retries: 5 130 | start_period: 30s 131 | restart: always 132 | depends_on: 133 | <<: *airflow-common-depends-on 134 | airflow-init: 135 | condition: service_completed_successfully 136 | 137 | airflow-scheduler: 138 | <<: *airflow-common 139 | command: scheduler 140 | healthcheck: 141 | test: ["CMD", "curl", "--fail", "http://localhost:8974/health"] 142 | interval: 30s 143 | timeout: 10s 144 | retries: 5 145 | start_period: 30s 146 | restart: always 147 | depends_on: 148 | <<: *airflow-common-depends-on 149 | airflow-init: 150 | condition: service_completed_successfully 151 | 152 | airflow-worker: 153 | <<: *airflow-common 154 | command: celery worker 155 | healthcheck: 156 | # yamllint disable rule:line-length 157 | test: 158 | - "CMD-SHELL" 159 | - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 160 | interval: 30s 161 | timeout: 10s 162 | retries: 5 163 | start_period: 30s 164 | environment: 165 | <<: *airflow-common-env 166 | # Required to handle warm shutdown of the celery workers properly 167 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 168 | DUMB_INIT_SETSID: "0" 169 | restart: always 170 | depends_on: 171 | <<: *airflow-common-depends-on 172 | airflow-init: 173 | condition: service_completed_successfully 174 | 175 | airflow-triggerer: 176 | <<: *airflow-common 177 | command: triggerer 178 | healthcheck: 179 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 180 | interval: 30s 181 | timeout: 10s 182 | retries: 5 183 | start_period: 30s 184 | restart: always 185 | depends_on: 186 | <<: *airflow-common-depends-on 187 | airflow-init: 188 | condition: service_completed_successfully 189 | 190 | airflow-init: 191 | <<: *airflow-common 192 | entrypoint: /bin/bash 193 | # yamllint disable rule:line-length 194 | command: 195 | - -c 196 | - | 197 | function ver() { 198 | printf 
"%04d%04d%04d%04d" $${1//./ } 199 | } 200 | airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version) 201 | airflow_version_comparable=$$(ver $${airflow_version}) 202 | min_airflow_version=2.2.0 203 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 204 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 205 | echo 206 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 207 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 208 | echo 209 | exit 1 210 | fi 211 | if [[ -z "${AIRFLOW_UID}" ]]; then 212 | echo 213 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 214 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 215 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 216 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 217 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" 218 | echo 219 | fi 220 | one_meg=1048576 221 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 222 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 223 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 224 | warning_resources="false" 225 | if (( mem_available < 4000 )) ; then 226 | echo 227 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 228 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 229 | echo 230 | warning_resources="true" 231 | fi 232 | if (( cpus_available < 2 )); then 233 | echo 234 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 235 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 236 | echo 237 | warning_resources="true" 238 | fi 239 | if (( disk_available < one_meg * 10 )); then 240 | echo 241 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 242 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 243 | echo 244 | warning_resources="true" 245 | fi 246 | if [[ $${warning_resources} == "true" ]]; then 247 | echo 248 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 249 | echo "Please follow the instructions to increase amount of resources available:" 250 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" 251 | echo 252 | fi 253 | mkdir -p /sources/logs /sources/dags /sources/plugins 254 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 255 | exec /entrypoint airflow version 256 | # yamllint enable rule:line-length 257 | environment: 258 | <<: *airflow-common-env 259 | _AIRFLOW_DB_MIGRATE: 'true' 260 | _AIRFLOW_WWW_USER_CREATE: 'true' 261 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 262 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 263 | _PIP_ADDITIONAL_REQUIREMENTS: '' 264 | user: "0:0" 265 | volumes: 266 | - ${AIRFLOW_PROJ_DIR:-.}:/sources 267 | 268 | airflow-cli: 269 | <<: *airflow-common 270 | profiles: 271 | - debug 272 | environment: 273 | <<: *airflow-common-env 274 | CONNECTION_CHECK_MAX_COUNT: "0" 275 | # Workaround for entrypoint issue. 
See: https://github.com/apache/airflow/issues/16252 276 | command: 277 | - bash 278 | - -c 279 | - airflow 280 | 281 | # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up 282 | # or by explicitly targeted on the command line e.g. docker-compose up flower. 283 | # See: https://docs.docker.com/compose/profiles/ 284 | flower: 285 | <<: *airflow-common 286 | command: celery flower 287 | profiles: 288 | - flower 289 | ports: 290 | - "5555:5555" 291 | healthcheck: 292 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 293 | interval: 30s 294 | timeout: 10s 295 | retries: 5 296 | start_period: 30s 297 | restart: always 298 | depends_on: 299 | <<: *airflow-common-depends-on 300 | airflow-init: 301 | condition: service_completed_successfully 302 | 303 | my-private-pypi: 304 | image: pypiserver/pypiserver:v1.5.2 305 | platform: linux/amd64 306 | restart: always 307 | ports: 308 | - "80:8080" 309 | volumes: 310 | - ~/.htpasswd:/data/.htpasswd 311 | command: 312 | - run 313 | - -P 314 | - .htpasswd/htpasswd.txt 315 | - --overwrite 316 | 317 | volumes: 318 | postgres-db-volume: 319 | -------------------------------------------------------------------------------- /airflow/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "energy-forecasting" 3 | version = "0.1.0" 4 | description = "" 5 | authors = [ 6 | "Paul Iusztin ", 7 | "Kurtis Pykes " 8 | ] 9 | 10 | [tool.poetry.dependencies] 11 | python = "~3.9" 12 | pyarrow = "^11.0.0" 13 | tqdm = "^4.64.1" 14 | category-encoders = "^2.6.0" 15 | wandb = "^0.13.10" 16 | matplotlib = "^3.7.0" 17 | hopsworks = "^3.0.5" 18 | python-dotenv = "^0.21.1" 19 | lightgbm = "^3.3.5" 20 | sktime = "^0.16.1" 21 | seaborn = "^0.12.2" 22 | google-cloud-storage = "^2.7.0" 23 | yarl = "^1.8.2" 24 | fire = "^0.5.0" 25 | Jinja2 = "3.0.1" 26 | 27 | [tool.poetry.dev-dependencies] 28 | 29 | [tool.poetry.group.dev.dependencies] 30 | black = "^23.1.0" 31 | 32 | [build-system] 33 | requires = ["poetry-core>=1.0.0"] 34 | build-backend = "poetry.core.masonry.api" 35 | -------------------------------------------------------------------------------- /app-api/.dockerignore: -------------------------------------------------------------------------------- 1 | jupyter_notebooks* 2 | */env* 3 | */venv* 4 | venv 5 | env 6 | .circleci* 7 | *.env 8 | *.log 9 | .git 10 | .gitignore 11 | .tox -------------------------------------------------------------------------------- /app-api/.env.default: -------------------------------------------------------------------------------- 1 | APP_API_GCP_PROJECT = "energy_consumption" 2 | APP_API_GCP_BUCKET = "hourly-batch-predictions" 3 | APP_API_GCP_SERVICE_ACCOUNT_JSON_PATH = "/app/src/credentials/gcp/energy_consumption/read-buckets.json" 4 | -------------------------------------------------------------------------------- /app-api/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.4 2 | 3 | WORKDIR /app/src 4 | 5 | RUN apt-get update && apt-get upgrade -y 6 | RUN pip install --no-cache -U pip 7 | RUN pip install --no-cache poetry==1.4.2 8 | 9 | # Configuring poetry. 10 | RUN poetry config virtualenvs.create false 11 | 12 | # First copy & install requirements to speed up the build process in case only the code changes. 
13 | COPY ./app-api/pyproject.toml /app/src/ 14 | COPY ./app-api/poetry.lock /app/src/ 15 | 16 | RUN poetry install --no-interaction --no-root -vvv 17 | 18 | # Copy the rest of the files. 19 | ADD ./app-api /app/src 20 | 21 | # Give access to run the run.sh script. 22 | RUN chmod +x run.sh 23 | 24 | CMD ["bash", "./run.sh"] 25 | -------------------------------------------------------------------------------- /app-api/README.md: -------------------------------------------------------------------------------- 1 | # API - Web APP 2 | 3 | Check out [Lesson 6](https://towardsdatascience.com/fastapi-and-streamlit-the-python-duo-you-must-know-about-72825def1243) on Medium to better understand how we built the FastAPI backend. 4 | 5 | ## Install for Development 6 | 7 | Create virtual environment: 8 | ```shell 9 | cd app-api 10 | poetry shell 11 | poetry install 12 | ``` 13 | 14 | Check the [Set Up Additional Tools](https://github.com/iusztinpaul/energy-forecasting#-set-up-additional-tools-) and [Usage](https://github.com/iusztinpaul/energy-forecasting#usage) sections to see **how to set up** the **additional tools** and **credentials** you need to run this project. 15 | 16 | ## Usage for Development 17 | 18 | To start the server, run the following: 19 | ```shell 20 | sh run.sh 21 | ``` 22 | 23 | Access http://127.0.0.1:8001/api/v1/docs to see the docs. 24 | 25 | **NOTE:** Be careful to complete the `.env` file as explained in the main README. 26 | -------------------------------------------------------------------------------- /app-api/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/app-api/api/__init__.py -------------------------------------------------------------------------------- /app-api/api/__main__.py: -------------------------------------------------------------------------------- 1 | import uvicorn 2 | 3 | from api.config import get_settings 4 | 5 | 6 | def main() -> None: 7 | """Entrypoint of the application.""" 8 | uvicorn.run( 9 | "api.application:get_app", 10 | workers=get_settings().WORKERS_COUNT, 11 | host=get_settings().HOST, 12 | port=get_settings().PORT, 13 | reload=get_settings().RELOAD, 14 | log_level=get_settings().LOG_LEVEL.value.lower(), 15 | factory=True, 16 | ) 17 | 18 | 19 | if __name__ == "__main__": 20 | main() 21 | -------------------------------------------------------------------------------- /app-api/api/application.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import uvicorn 4 | from fastapi import APIRouter, FastAPI 5 | from fastapi.middleware.cors import CORSMiddleware 6 | 7 | from api.views import api_router 8 | from api.config import get_settings 9 | 10 | 11 | def get_app() -> FastAPI: 12 | """Create FastAPI app.""" 13 | 14 | app = FastAPI( 15 | title=get_settings().PROJECT_NAME, 16 | docs_url=f"/api/{get_settings().VERSION}/docs", 17 | redoc_url=f"/api/{get_settings().VERSION}/redoc", 18 | openapi_url=f"/api/{get_settings().VERSION}/openapi.json", 19 | ) 20 | # For demo purposes, allow all origins. 
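    # In a real deployment you would restrict allow_origins to the frontend's domain instead of "*".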
21 | app.add_middleware( 22 | CORSMiddleware, 23 | allow_origins=["*"], 24 | allow_credentials=True, 25 | allow_methods=["*"], 26 | allow_headers=["*"], 27 | ) 28 | 29 | app.include_router(api_router, prefix=f"/api/{get_settings().VERSION}") 30 | 31 | return app 32 | -------------------------------------------------------------------------------- /app-api/api/config.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from functools import lru_cache 3 | import logging 4 | import sys 5 | from types import FrameType 6 | from typing import List, Optional, cast 7 | 8 | from pydantic import AnyHttpUrl, BaseSettings 9 | 10 | 11 | class LogLevel(str, enum.Enum): # noqa: WPS600 12 | """Possible log levels.""" 13 | 14 | NOTSET = "NOTSET" 15 | DEBUG = "DEBUG" 16 | INFO = "INFO" 17 | WARNING = "WARNING" 18 | ERROR = "ERROR" 19 | FATAL = "FATAL" 20 | 21 | 22 | class Settings(BaseSettings): 23 | """ 24 | Application settings. 25 | 26 | These parameters can be configured 27 | with environment variables. 28 | """ 29 | 30 | # General configurations. 31 | HOST: str = "0.0.0.0" 32 | PORT: int = 8001 33 | LOG_LEVEL: LogLevel = LogLevel.INFO 34 | # - Current version of the API. 35 | VERSION: str = "v1" 36 | # - Quantity of workers for uvicorn. 37 | WORKERS_COUNT: int = 1 38 | # - Enable uvicorn reloading. 39 | RELOAD: bool = False 40 | 41 | PROJECT_NAME: str = "Energy Consumption API" 42 | 43 | # Google Cloud Platform credentials 44 | GCP_PROJECT: Optional[str] = None 45 | GCP_BUCKET: Optional[str] = None 46 | GCP_SERVICE_ACCOUNT_JSON_PATH: Optional[str] = None 47 | 48 | class Config: 49 | env_file = ".env" 50 | env_prefix = "APP_API_" 51 | case_sensitive = False 52 | env_file_encoding = "utf-8" 53 | 54 | 55 | @lru_cache() 56 | def get_settings(): 57 | return Settings() 58 | -------------------------------------------------------------------------------- /app-api/api/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | from api.schemas.health import Health 2 | from api.schemas.predictions import ( 3 | PredictionResults, 4 | MonitoringMetrics, 5 | MonitoringValues, 6 | ) 7 | from api.schemas.consumer_type_values import UniqueConsumerType 8 | from api.schemas.area_values import UniqueArea 9 | -------------------------------------------------------------------------------- /app-api/api/schemas/area_values.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class UniqueArea(BaseModel): 7 | values: List[int] 8 | -------------------------------------------------------------------------------- /app-api/api/schemas/consumer_type_values.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class UniqueConsumerType(BaseModel): 7 | values: List[int] 8 | -------------------------------------------------------------------------------- /app-api/api/schemas/health.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class Health(BaseModel): 5 | name: str 6 | api_version: str 7 | -------------------------------------------------------------------------------- /app-api/api/schemas/predictions.py: -------------------------------------------------------------------------------- 1 | from typing import List, Any 2 | 3 | 
from pydantic import BaseModel 4 | 5 | 6 | class PredictionResults(BaseModel): 7 | datetime_utc: List[int] 8 | energy_consumption: List[float] 9 | preds_datetime_utc: List[int] 10 | preds_energy_consumption: List[float] 11 | 12 | 13 | class MonitoringMetrics(BaseModel): 14 | datetime_utc: List[int] 15 | mape: List[float] 16 | 17 | 18 | class MonitoringValues(BaseModel): 19 | y_monitoring_datetime_utc: List[int] 20 | y_monitoring_energy_consumption: List[float] 21 | predictions_monitoring_datetime_utc: List[int] 22 | predictions_monitoring_energy_consumptionc: List[float] 23 | -------------------------------------------------------------------------------- /app-api/api/views.py: -------------------------------------------------------------------------------- 1 | import gcsfs 2 | from typing import Any, List 3 | 4 | import pandas as pd 5 | from fastapi import APIRouter, HTTPException 6 | 7 | from api import schemas 8 | from api.config import get_settings 9 | 10 | 11 | fs = gcsfs.GCSFileSystem( 12 | project=get_settings().GCP_PROJECT, 13 | token=get_settings().GCP_SERVICE_ACCOUNT_JSON_PATH, 14 | ) 15 | 16 | api_router = APIRouter() 17 | 18 | 19 | @api_router.get("/health", response_model=schemas.Health, status_code=200) 20 | def health() -> dict: 21 | """ 22 | Health check endpoint. 23 | """ 24 | 25 | health_data = schemas.Health( 26 | name=get_settings().PROJECT_NAME, api_version=get_settings().VERSION 27 | ) 28 | 29 | return health_data.dict() 30 | 31 | 32 | @api_router.get( 33 | "/consumer_type_values", response_model=schemas.UniqueConsumerType, status_code=200 34 | ) 35 | def consumer_type_values() -> List: 36 | """ 37 | Retrieve unique consumer types. 38 | """ 39 | 40 | # Download the data from GCS. 41 | X = pd.read_parquet(f"{get_settings().GCP_BUCKET}/X.parquet", filesystem=fs) 42 | 43 | unique_consumer_type = list(X.index.unique(level="consumer_type")) 44 | 45 | return {"values": unique_consumer_type} 46 | 47 | 48 | @api_router.get("/area_values", response_model=schemas.UniqueArea, status_code=200) 49 | def area_values() -> List: 50 | """ 51 | Retrieve unique areas. 52 | """ 53 | 54 | # Download the data from GCS. 55 | X = pd.read_parquet(f"{get_settings().GCP_BUCKET}/X.parquet", filesystem=fs) 56 | 57 | unique_area = list(X.index.unique(level="area")) 58 | 59 | return {"values": unique_area} 60 | 61 | 62 | @api_router.get( 63 | "/predictions/{area}/{consumer_type}", 64 | response_model=schemas.PredictionResults, 65 | status_code=200, 66 | ) 67 | async def get_predictions(area: int, consumer_type: int) -> Any: 68 | """ 69 | Get forecasted predictions based on the given area and consumer type. 70 | """ 71 | 72 | # Download the data from GCS. 73 | train_df = pd.read_parquet(f"{get_settings().GCP_BUCKET}/y.parquet", filesystem=fs) 74 | preds_df = pd.read_parquet( 75 | f"{get_settings().GCP_BUCKET}/predictions.parquet", filesystem=fs 76 | ) 77 | 78 | # Query the data for the given area and consumer type. 
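    # Both frames are written by the batch prediction pipeline with an (area, consumer_type, datetime_utc) MultiIndex, so .xs() below selects the single time series for the requested area / consumer type combination.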
79 | try: 80 | train_df = train_df.xs((area, consumer_type), level=["area", "consumer_type"]) 81 | preds_df = preds_df.xs((area, consumer_type), level=["area", "consumer_type"]) 82 | except KeyError: 83 | raise HTTPException( 84 | status_code=404, 85 | detail=f"No data found for the given area and consumer type: {area}, {consumer_type}", 86 | ) 87 | 88 | if len(train_df) == 0 or len(preds_df) == 0: 89 | raise HTTPException( 90 | status_code=404, 91 | detail=f"No data found for the given area and consumer type: {area}, {consumer_type}", 92 | ) 93 | 94 | # Return only the latest week of observations. 95 | train_df = train_df.sort_index().tail(24 * 7) 96 | 97 | # Prepare data to be returned. 98 | datetime_utc = train_df.index.get_level_values("datetime_utc").to_list() 99 | energy_consumption = train_df["energy_consumption"].to_list() 100 | 101 | preds_datetime_utc = preds_df.index.get_level_values("datetime_utc").to_list() 102 | preds_energy_consumption = preds_df["energy_consumption"].to_list() 103 | 104 | results = { 105 | "datetime_utc": datetime_utc, 106 | "energy_consumption": energy_consumption, 107 | "preds_datetime_utc": preds_datetime_utc, 108 | "preds_energy_consumption": preds_energy_consumption, 109 | } 110 | 111 | return results 112 | 113 | 114 | @api_router.get( 115 | "/monitoring/metrics", 116 | response_model=schemas.MonitoringMetrics, 117 | status_code=200, 118 | ) 119 | async def get_metrics() -> Any: 120 | """ 121 | Get monitoring metrics. 122 | """ 123 | 124 | # Download the data from GCS. 125 | metrics = pd.read_parquet( 126 | f"{get_settings().GCP_BUCKET}/metrics_monitoring.parquet", filesystem=fs 127 | ) 128 | 129 | datetime_utc = metrics.index.to_list() 130 | mape = metrics["MAPE"].to_list() 131 | 132 | return { 133 | "datetime_utc": datetime_utc, 134 | "mape": mape, 135 | } 136 | 137 | 138 | @api_router.get( 139 | "/monitoring/values/{area}/{consumer_type}", 140 | response_model=schemas.MonitoringValues, 141 | status_code=200, 142 | ) 143 | async def get_predictions(area: int, consumer_type: int) -> Any: 144 | """ 145 | Get forecasted predictions based on the given area and consumer type. 146 | """ 147 | 148 | # Download the data from GCS. 149 | y_monitoring = pd.read_parquet( 150 | f"{get_settings().GCP_BUCKET}/y_monitoring.parquet", filesystem=fs 151 | ) 152 | predictions_monitoring = pd.read_parquet( 153 | f"{get_settings().GCP_BUCKET}/predictions_monitoring.parquet", filesystem=fs 154 | ) 155 | 156 | # Query the data for the given area and consumer type. 157 | try: 158 | y_monitoring = y_monitoring.xs( 159 | (area, consumer_type), level=["area", "consumer_type"] 160 | ) 161 | predictions_monitoring = predictions_monitoring.xs( 162 | (area, consumer_type), level=["area", "consumer_type"] 163 | ) 164 | except KeyError: 165 | raise HTTPException( 166 | status_code=404, 167 | detail=f"No data found for the given area and consumer typefrontend: {area}, {consumer_type}", 168 | ) 169 | 170 | if len(y_monitoring) == 0 or len(predictions_monitoring) == 0: 171 | raise HTTPException( 172 | status_code=404, 173 | detail=f"No data found for the given area and consumer type: {area}, {consumer_type}", 174 | ) 175 | 176 | # Prepare data to be returned. 
177 | y_monitoring_datetime_utc = y_monitoring.index.get_level_values( 178 | "datetime_utc" 179 | ).to_list() 180 | y_monitoring_energy_consumption = y_monitoring["energy_consumption"].to_list() 181 | 182 | predictions_monitoring_datetime_utc = predictions_monitoring.index.get_level_values( 183 | "datetime_utc" 184 | ).to_list() 185 | predictions_monitoring_energy_consumptionc = predictions_monitoring[ 186 | "energy_consumption" 187 | ].to_list() 188 | 189 | results = { 190 | "y_monitoring_datetime_utc": y_monitoring_datetime_utc, 191 | "y_monitoring_energy_consumption": y_monitoring_energy_consumption, 192 | "predictions_monitoring_datetime_utc": predictions_monitoring_datetime_utc, 193 | "predictions_monitoring_energy_consumptionc": predictions_monitoring_energy_consumptionc, 194 | } 195 | 196 | return results 197 | -------------------------------------------------------------------------------- /app-api/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "app-api" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Iusztin Paul ", "Kurtis Pykes "] 6 | readme = "README.md" 7 | packages = [{include = "api"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = "~3.9" 11 | pandas = ">=1.5.3,<1.6.0" 12 | uvicorn = ">=0.21.0,<0.22.0" 13 | fastapi = ">=0.94.1,<0.95.0" 14 | pydantic = ">=1.10.6,<1.11.0" 15 | pyarrow = ">=11.0.0,<11.1.0" 16 | gcsfs = "2023.3.0" 17 | python-dotenv = "0.21.1" 18 | 19 | 20 | [build-system] 21 | requires = ["poetry-core"] 22 | build-backend = "poetry.core.masonry.api" 23 | -------------------------------------------------------------------------------- /app-api/run.sh: -------------------------------------------------------------------------------- 1 | /usr/local/bin/python -m api 2 | -------------------------------------------------------------------------------- /app-frontend/.dockerignore: -------------------------------------------------------------------------------- 1 | jupyter_notebooks* 2 | */env* 3 | */venv* 4 | venv 5 | env 6 | .circleci* 7 | *.env 8 | *.log 9 | .git 10 | .gitignore 11 | .tox -------------------------------------------------------------------------------- /app-frontend/.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | base = "dark" 3 | -------------------------------------------------------------------------------- /app-frontend/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.8 2 | 3 | WORKDIR /app/src 4 | 5 | RUN apt-get update && apt-get upgrade -y 6 | RUN pip install --no-cache -U pip 7 | RUN pip install --no-cache poetry==1.4.2 8 | 9 | # Configuring poetry. 10 | RUN poetry config virtualenvs.create false 11 | 12 | # First copy & install requirements to speed up the build process in case only the code changes. 13 | COPY ./app-frontend/pyproject.toml /app/src/ 14 | COPY ./app-frontend/poetry.lock /app/src/ 15 | 16 | RUN poetry install --no-interaction --no-root -vvv 17 | 18 | # Copy the rest of the files. 
19 | ADD ./app-frontend /app/src 20 | 21 | CMD ["streamlit", "run", "frontend/main.py", "--server.port", "8501"] -------------------------------------------------------------------------------- /app-frontend/README.md: -------------------------------------------------------------------------------- 1 | # Frontend - Web APP 2 | 3 | Check out [Lesson 6](https://towardsdatascience.com/fastapi-and-streamlit-the-python-duo-you-must-know-about-72825def1243) on Medium to better understand how we built the Streamlit predictions dashboard. 4 | 5 | ## Install for Development 6 | 7 | Create virtual environment: 8 | ```shell 9 | cd app-frontend 10 | poetry shell 11 | poetry install 12 | ``` 13 | 14 | **NOTE:** Be sure that the API is already running. 15 | 16 | 17 | ## Usage for Development 18 | 19 | To start the app, run the following: 20 | ```shell 21 | streamlit run frontend/main.py --server.port 8501 22 | ``` 23 | 24 | Access http://127.0.0.1:8501/ to see the app. 25 | -------------------------------------------------------------------------------- /app-frontend/frontend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/app-frontend/frontend/__init__.py -------------------------------------------------------------------------------- /app-frontend/frontend/components.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import requests 3 | 4 | import pandas as pd 5 | import plotly.graph_objects as go 6 | 7 | from settings import API_URL 8 | 9 | 10 | def build_data_plot(area: int, consumer_type: int): 11 | """ 12 | Build plotly graph for data. 13 | """ 14 | 15 | # Get predictions from API. 16 | response = requests.get( 17 | API_URL / "predictions" / f"{area}" / f"{consumer_type}", verify=False 18 | ) 19 | if response.status_code != 200: 20 | # If the response is invalid, build empty dataframes in the proper format. 21 | train_df = build_dataframe([], []) 22 | preds_df = build_dataframe([], []) 23 | 24 | title = "NO DATA AVAILABLE FOR THE GIVEN AREA AND CONSUMER TYPE" 25 | else: 26 | json_response = response.json() 27 | 28 | # Build DataFrames for plotting. 29 | datetime_utc = json_response.get("datetime_utc") 30 | energy_consumption = json_response.get("energy_consumption") 31 | pred_datetime_utc = json_response.get("preds_datetime_utc") 32 | pred_energy_consumption = json_response.get("preds_energy_consumption") 33 | 34 | train_df = build_dataframe(datetime_utc, energy_consumption) 35 | preds_df = build_dataframe(pred_datetime_utc, pred_energy_consumption) 36 | 37 | title = "Energy Consumption per DE35 Industry Code per Hour" 38 | 39 | # Create plot. 40 | fig = go.Figure() 41 | fig.update_layout( 42 | title=dict( 43 | text=title, 44 | font=dict(family="Arial", size=16), 45 | ), 46 | showlegend=True, 47 | ) 48 | fig.update_xaxes(title_text="Datetime UTC") 49 | fig.update_yaxes(title_text="Total Consumption") 50 | fig.add_scatter( 51 | x=train_df["datetime_utc"], 52 | y=train_df["energy_consumption"], 53 | name="Observations", 54 | line=dict(color="#C4B6B6"), 55 | hovertemplate="
".join(["Datetime: %{x}", "Energy Consumption: %{y} kWh"]), 56 | ) 57 | fig.add_scatter( 58 | x=preds_df["datetime_utc"], 59 | y=preds_df["energy_consumption"], 60 | name="Predictions", 61 | line=dict(color="#FFC703"), 62 | hovertemplate="
".join(["Datetime: %{x}", "Total Consumption: %{y} kWh"]), 63 | ) 64 | 65 | return fig 66 | 67 | 68 | def build_dataframe(datetime_utc: List[int], energy_consumption_values: List[float]): 69 | """ 70 | Build DataFrame for plotting from timestamps and energy consumption values. 71 | 72 | Args: 73 | datetime_utc (List[int]): list of timestamp values in UTC 74 | values (List[float]): list of energy consumption values 75 | """ 76 | 77 | df = pd.DataFrame( 78 | list(zip(datetime_utc, energy_consumption_values)), 79 | columns=["datetime_utc", "energy_consumption"], 80 | ) 81 | df["datetime_utc"] = pd.to_datetime(df["datetime_utc"], unit="h") 82 | 83 | # Resample to hourly frequency to make the data continuous. 84 | df = df.set_index("datetime_utc") 85 | df = df.resample("H").asfreq() 86 | df = df.reset_index() 87 | 88 | return df 89 | -------------------------------------------------------------------------------- /app-frontend/frontend/main.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | import streamlit as st 4 | 5 | from settings import API_URL, TITLE 6 | from components import build_data_plot 7 | 8 | 9 | st.set_page_config(page_title=TITLE) 10 | st.title(TITLE) 11 | 12 | 13 | # Create dropdown for area selection. 14 | area_response = requests.get(API_URL / "area_values") 15 | 16 | area = st.selectbox( 17 | label="Denmark is divided in two price areas, or bidding zones,\ 18 | divided by the Great Belt. DK1 (shown as 1) is west of the Great Belt \ 19 | and DK2 (shown as 2) is east of the Great Belt.", 20 | options=area_response.json().get("values", []), 21 | ) 22 | 23 | # Create dropdown for consumer type selection. 24 | consumer_type_response = requests.get(API_URL / "consumer_type_values") 25 | 26 | consumer_type = st.selectbox( 27 | label="The consumer type is the Industry Code DE35 which is owned \ 28 | and maintained by Danish Energy, a non-commercial lobby \ 29 | organization for Danish energy companies. \ 30 | The code is used by Danish energy companies.", 31 | options=consumer_type_response.json().get("values", []), 32 | ) 33 | 34 | 35 | # Check if both area and consumer type have values listed, then create plot for data. 
36 | if area and consumer_type: 37 | st.plotly_chart(build_data_plot(area, consumer_type)) 38 | -------------------------------------------------------------------------------- /app-frontend/frontend/settings.py: -------------------------------------------------------------------------------- 1 | from yarl import URL 2 | 3 | 4 | TITLE = "Energy Consumption Forecasting" 5 | API_URL = URL("http://172.17.0.1:8001/api/v1") 6 | -------------------------------------------------------------------------------- /app-frontend/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "app-frontend" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Iusztin Paul ", "Kurtis Pykes "] 6 | readme = "README.md" 7 | packages = [{include = "frontend"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = ">3.9.7,<3.10" 11 | streamlit = ">=1.20.0,<1.21.0" 12 | plotly = ">=5.14.1,<5.15.0" 13 | yarl = "^1.8.2" 14 | 15 | 16 | [build-system] 17 | requires = ["poetry-core"] 18 | build-backend = "poetry.core.masonry.api" 19 | -------------------------------------------------------------------------------- /app-monitoring/.dockerignore: -------------------------------------------------------------------------------- 1 | jupyter_notebooks* 2 | */env* 3 | */venv* 4 | venv 5 | env 6 | .circleci* 7 | *.env 8 | *.log 9 | .git 10 | .gitignore 11 | .tox -------------------------------------------------------------------------------- /app-monitoring/.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | base = "dark" 3 | -------------------------------------------------------------------------------- /app-monitoring/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.8 2 | 3 | WORKDIR /app/src 4 | 5 | RUN apt-get update && apt-get upgrade -y 6 | RUN pip install --no-cache -U pip 7 | RUN pip install --no-cache poetry==1.4.2 8 | 9 | # Configuring poetry. 10 | RUN poetry config virtualenvs.create false 11 | 12 | # First copy & install requirements to speed up the build process in case only the code changes. 13 | COPY ./app-monitoring/pyproject.toml /app/src/ 14 | COPY ./app-monitoring/poetry.lock /app/src/ 15 | 16 | RUN poetry install --no-interaction --no-root -vvv 17 | 18 | # Copy the rest of the files. 19 | ADD ./app-monitoring /app/src 20 | 21 | CMD ["streamlit", "run", "monitoring/main.py", "--server.port", "8502"] -------------------------------------------------------------------------------- /app-monitoring/README.md: -------------------------------------------------------------------------------- 1 | # Monitoring - Web APP 2 | 3 | Check out [Lesson 6](https://towardsdatascience.com/fastapi-and-streamlit-the-python-duo-you-must-know-about-72825def1243) on Medium to better understand how we built the Streamlit monitoring dashboard. 4 | 5 | ## Install for Development 6 | 7 | Create virtual environment: 8 | ```shell 9 | cd app-monitoring 10 | poetry shell 11 | poetry install 12 | ``` 13 | 14 | **NOTE:** Be sure that the API is already running. 15 | 16 | 17 | ## Usage for Development 18 | 19 | To start the app, run the following: 20 | ```shell 21 | streamlit run monitoring/main.py --server.port 8502 22 | ``` 23 | 24 | Access http://127.0.0.1:8502/ to see the app. 
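If the dashboard cannot reach the backend, a quick sanity check (assuming the API runs locally with its default settings, i.e. port `8001` and the `v1` prefix) is to call its health endpoint:
```shell
curl http://127.0.0.1:8001/api/v1/health
```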
25 | -------------------------------------------------------------------------------- /app-monitoring/monitoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/app-monitoring/monitoring/__init__.py -------------------------------------------------------------------------------- /app-monitoring/monitoring/components.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import requests 3 | 4 | import pandas as pd 5 | import plotly.graph_objects as go 6 | 7 | from settings import API_URL 8 | 9 | 10 | def build_metrics_plot(): 11 | """ 12 | Build plotly graph for metrics. 13 | """ 14 | 15 | response = requests.get(API_URL / "monitoring" / "metrics", verify=False) 16 | if response.status_code != 200: 17 | # If the response is invalid, build empty dataframes in the proper format. 18 | metrics_df = build_dataframe([], [], values_column_name="mape") 19 | 20 | title = "No metrics available." 21 | else: 22 | json_response = response.json() 23 | 24 | # Build DataFrame for plotting. 25 | datetime_utc = json_response.get("datetime_utc", []) 26 | mape = json_response.get("mape", []) 27 | metrics_df = build_dataframe(datetime_utc, mape, values_column_name="mape") 28 | 29 | title = "Predictions vs. Observations | Aggregated Metrics" 30 | 31 | # Create plot. 32 | fig = go.Figure() 33 | fig.update_layout( 34 | title=dict( 35 | text=title, 36 | font=dict(family="Arial", size=16), 37 | ), 38 | showlegend=True, 39 | ) 40 | fig.update_xaxes(title_text="Datetime UTC") 41 | fig.update_yaxes(title_text="MAPE") 42 | fig.add_scatter( 43 | x=metrics_df["datetime_utc"], 44 | y=metrics_df["mape"], 45 | name="MAPE", 46 | line=dict(color="#C4B6B6"), 47 | hovertemplate="
".join(["Datetime UTC: %{x}", "MAPE: %{y} kWh"]), 48 | ) 49 | 50 | return fig 51 | 52 | 53 | def build_data_plot(area: int, consumer_type: int): 54 | """ 55 | Build plotly graph for data. 56 | """ 57 | 58 | # Get predictions from API. 59 | response = requests.get( 60 | API_URL / "monitoring" / "values" / f"{area}" / f"{consumer_type}", verify=False 61 | ) 62 | if response.status_code != 200: 63 | # If the response is invalid, build empty dataframes in the proper format. 64 | train_df = build_dataframe([], []) 65 | preds_df = build_dataframe([], []) 66 | 67 | title = "NO DATA AVAILABLE FOR THE GIVEN AREA AND CONSUMER TYPE" 68 | else: 69 | json_response = response.json() 70 | 71 | # Build DataFrames for plotting. 72 | y_monitoring_datetime_utc = json_response.get("y_monitoring_datetime_utc", []) 73 | y_monitoring_energy_consumption = json_response.get( 74 | "y_monitoring_energy_consumption", [] 75 | ) 76 | predictions_monitoring_datetime_utc = json_response.get( 77 | "predictions_monitoring_datetime_utc", [] 78 | ) 79 | predictions_monitoring_energy_consumptionc = json_response.get( 80 | "predictions_monitoring_energy_consumptionc", [] 81 | ) 82 | 83 | train_df = build_dataframe( 84 | y_monitoring_datetime_utc, y_monitoring_energy_consumption 85 | ) 86 | preds_df = build_dataframe( 87 | predictions_monitoring_datetime_utc, 88 | predictions_monitoring_energy_consumptionc, 89 | ) 90 | 91 | title = "Predictions vs. Observations | Energy Consumption" 92 | 93 | # Create plot. 94 | fig = go.Figure() 95 | fig.update_layout( 96 | title=dict( 97 | text=title, 98 | font=dict(family="Arial", size=16), 99 | ), 100 | showlegend=True, 101 | ) 102 | fig.update_xaxes(title_text="Datetime UTC") 103 | fig.update_yaxes(title_text="Total Consumption") 104 | fig.add_scatter( 105 | x=train_df["datetime_utc"], 106 | y=train_df["energy_consumption"], 107 | name="Observations", 108 | line=dict(color="#C4B6B6"), 109 | hovertemplate="
".join( 110 | ["Datetime UTC: %{x}", "Energy Consumption: %{y} kWh"] 111 | ), 112 | ) 113 | fig.add_scatter( 114 | x=preds_df["datetime_utc"], 115 | y=preds_df["energy_consumption"], 116 | name="Predictions", 117 | line=dict(color="#FFC703"), 118 | hovertemplate="
".join( 119 | ["Datetime UTC: %{x}", "Total Consumption: %{y} kWh"] 120 | ), 121 | ) 122 | 123 | return fig 124 | 125 | 126 | def build_dataframe( 127 | datetime_utc: List[int], 128 | energy_consumption_values: List[float], 129 | values_column_name: str = "energy_consumption", 130 | ): 131 | """ 132 | Build DataFrame for plotting from timestamps and energy consumption values. 133 | 134 | Args: 135 | datetime_utc (List[int]): list of timestamp values in UTC 136 | values (List[float]): list of energy consumption values 137 | values_column_name (str): name of the column containing the values 138 | """ 139 | 140 | df = pd.DataFrame( 141 | list(zip(datetime_utc, energy_consumption_values)), 142 | columns=["datetime_utc", values_column_name], 143 | ) 144 | df["datetime_utc"] = pd.to_datetime(df["datetime_utc"], unit="h") 145 | 146 | # Resample to hourly frequency to make the data continuous. 147 | df = df.set_index("datetime_utc") 148 | df = df.resample("H").asfreq() 149 | df = df.reset_index() 150 | 151 | return df 152 | -------------------------------------------------------------------------------- /app-monitoring/monitoring/main.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | import streamlit as st 4 | 5 | from settings import API_URL, TITLE 6 | from components import build_metrics_plot, build_data_plot 7 | 8 | 9 | st.set_page_config(page_title=TITLE) 10 | st.title(TITLE) 11 | 12 | # Create plot for metrics over time. 13 | st.plotly_chart(build_metrics_plot()) 14 | 15 | st.divider() 16 | 17 | 18 | # Create dropdown for area selection. 19 | area_response = requests.get(API_URL / "area_values") 20 | 21 | area = st.selectbox( 22 | label="Denmark is divided in two price areas, or bidding zones,\ 23 | divided by the Great Belt. DK1 (shown as 1) is west of the Great Belt \ 24 | and DK2 (shown as 2) is east of the Great Belt.", 25 | options=area_response.json().get("values", []), 26 | ) 27 | 28 | # Create dropdown for consumer type selection. 29 | consumer_type_response = requests.get(API_URL / "consumer_type_values") 30 | 31 | consumer_type = st.selectbox( 32 | label="The consumer type is the Industry Code DE35 which is owned \ 33 | and maintained by Danish Energy, a non-commercial lobby \ 34 | organization for Danish energy companies. \ 35 | The code is used by Danish energy companies.", 36 | options=consumer_type_response.json().get("values", []), 37 | ) 38 | 39 | 40 | # Check if both area and consumer type have values listed, then create plot for data. 
41 | if area and consumer_type: 42 | st.plotly_chart(build_data_plot(area, consumer_type)) 43 | -------------------------------------------------------------------------------- /app-monitoring/monitoring/settings.py: -------------------------------------------------------------------------------- 1 | from yarl import URL 2 | 3 | 4 | TITLE = "Monitoring | Energy Consumption" 5 | API_URL = URL("http://172.17.0.1:8001/api/v1") 6 | -------------------------------------------------------------------------------- /app-monitoring/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "app-monitoring" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Iusztin Paul "] 6 | readme = "README.md" 7 | packages = [{include = "monitoring"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = ">3.9.7,<3.10" 11 | streamlit = ">=1.21.0,<1.22.0" 12 | plotly = ">=5.14.1,<5.15.0" 13 | yarl = "^1.8.2" 14 | 15 | 16 | [build-system] 17 | requires = ["poetry-core"] 18 | build-backend = "poetry.core.masonry.api" 19 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/.env.default: -------------------------------------------------------------------------------- 1 | FS_API_KEY = "" 2 | FS_PROJECT_NAME = "" 3 | WANDB_API_KEY = "" 4 | WANDB_ENTITY = "teaching-mlops" 5 | WANDB_PROJECT = "energy_consumption" 6 | GOOGLE_CLOUD_PROJECT = "energy_consumption" 7 | GOOGLE_CLOUD_BUCKET_NAME = "hourly-batch-predictions" 8 | GOOGLE_CLOUD_SERVICE_ACCOUNT_JSON_PATH = "path/to/your/service-account.json" 9 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Batch Prediction Pipeline 2 | 3 | Check out [Lesson 3](https://towardsdatascience.com/unlock-the-secret-to-efficient-batch-prediction-pipelines-using-python-a-feature-store-and-gcs-17a1462ca489) on Medium to better understand how we built the batch prediction pipeline. 4 | 5 | Also, check out [Lesson 5](https://towardsdatascience.com/ensuring-trustworthy-ml-systems-with-data-validation-and-real-time-monitoring-89ab079f4360) to learn how we implemented the monitoring layer to compute the model's real-time performance. 6 | 7 | ## Install for Development 8 | 9 | The batch prediction pipeline uses the training pipeline module as a dependency. Thus, as a first step, we must ensure that the training pipeline module is published to our private PyPi server. 10 | 11 | **NOTE:** Make sure that your private PyPi server is running. Check the [Usage section](https://github.com/iusztinpaul/energy-forecasting#the-pipeline) if it isn't. 12 | 13 | Build & publish the `training-pipeline` to your private PyPi server: 14 | ```shell 15 | cd training-pipeline 16 | poetry build 17 | poetry publish -r my-pypi 18 | cd .. 19 | ``` 20 | 21 | Install the virtual environment for `batch-prediction-pipeline`: 22 | ```shell 23 | cd batch-prediction-pipeline 24 | poetry shell 25 | poetry install 26 | ``` 27 | 28 | Check the [Set Up Additional Tools](https://github.com/iusztinpaul/energy-forecasting#-set-up-additional-tools-) and [Usage](https://github.com/iusztinpaul/energy-forecasting#usage) sections to see **how to set up** the **additional tools** and **credentials** you need to run this project. 
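As a reminder, `poetry publish -r my-pypi` only works once the `my-pypi` repository is registered with Poetry. A minimal sketch of that one-time setup, assuming the private PyPI server from the Airflow `docker-compose` is exposed at `http://localhost` and that you already created the credentials described in the main README (the username and password below are placeholders):

```shell
poetry config repositories.my-pypi "http://localhost"
poetry config http-basic.my-pypi "<your-username>" "<your-password>"
```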
29 | 30 | ## Usage for Development 31 | 32 | To start batch prediction script, run: 33 | ```shell 34 | python -m batch_prediction_pipeline.batch 35 | ``` 36 | 37 | To compute the monitoring metrics based, run the following: 38 | ```shell 39 | python -m batch_prediction_pipeline.monitoring 40 | ``` 41 | 42 | **NOTE:** Be careful to complete the `.env` file and set the `ML_PIPELINE_ROOT_DIR` variable as explained in the [Set Up the ML_PIPELINE_ROOT_DIR Variable](https://github.com/iusztinpaul/energy-forecasting#set-up-the-ml_pipeline_root_dir-variable) section of the main README. 43 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/batch_prediction_pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/batch-prediction-pipeline/batch_prediction_pipeline/__init__.py -------------------------------------------------------------------------------- /batch-prediction-pipeline/batch_prediction_pipeline/batch.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | import hopsworks 6 | import pandas as pd 7 | 8 | from batch_prediction_pipeline import data 9 | from batch_prediction_pipeline import settings 10 | from batch_prediction_pipeline import utils 11 | 12 | 13 | logger = utils.get_logger(__name__) 14 | 15 | 16 | def predict( 17 | fh: int = 24, 18 | feature_view_version: Optional[int] = None, 19 | model_version: Optional[int] = None, 20 | start_datetime: Optional[datetime] = None, 21 | end_datetime: Optional[datetime] = None, 22 | ) -> None: 23 | """Main function used to do batch predictions. 24 | 25 | Args: 26 | fh (int, optional): forecast horizon. Defaults to 24. 27 | feature_view_version (Optional[int], optional): feature store feature view version. If None is provided, it will try to load it from the cached feature_view_metadata.json file. 28 | model_version (Optional[int], optional): model version to load from the model registry. If None is provided, it will try to load it from the cached train_metadata.json file. 29 | start_datetime (Optional[datetime], optional): start datetime used for extracting features for predictions. If None is provided, it will try to load it from the cached feature_pipeline_metadata.json file. 30 | end_datetime (Optional[datetime], optional): end datetime used for extracting features for predictions. If None is provided, it will try to load it from the cached feature_pipeline_metadata.json file. 
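    Example:
        Calling `predict()` with no arguments (as this module does when executed as a script) loads the feature view version, the model version and the datetime window from the metadata JSON files cached in the output directory.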
31 | """ 32 | 33 | if feature_view_version is None: 34 | feature_view_metadata = utils.load_json("feature_view_metadata.json") 35 | feature_view_version = feature_view_metadata["feature_view_version"] 36 | if model_version is None: 37 | train_metadata = utils.load_json("train_metadata.json") 38 | model_version = train_metadata["model_version"] 39 | if start_datetime is None or end_datetime is None: 40 | feature_pipeline_metadata = utils.load_json("feature_pipeline_metadata.json") 41 | start_datetime = datetime.strptime( 42 | feature_pipeline_metadata["export_datetime_utc_start"], 43 | feature_pipeline_metadata["datetime_format"], 44 | ) 45 | end_datetime = datetime.strptime( 46 | feature_pipeline_metadata["export_datetime_utc_end"], 47 | feature_pipeline_metadata["datetime_format"], 48 | ) 49 | 50 | logger.info("Connecting to the feature store...") 51 | project = hopsworks.login( 52 | api_key_value=settings.SETTINGS["FS_API_KEY"], 53 | project=settings.SETTINGS["FS_PROJECT_NAME"], 54 | ) 55 | fs = project.get_feature_store() 56 | logger.info("Successfully connected to the feature store.") 57 | 58 | logger.info("Loading data from feature store...") 59 | logger.info(f"Loading features from {start_datetime} to {end_datetime}.") 60 | X, y = data.load_data_from_feature_store( 61 | fs, 62 | feature_view_version, 63 | start_datetime=start_datetime, 64 | end_datetime=end_datetime, 65 | ) 66 | logger.info("Successfully loaded data from feature store.") 67 | 68 | logger.info("Loading model from model registry...") 69 | model = load_model_from_model_registry(project, model_version) 70 | logger.info("Successfully loaded model from model registry.") 71 | 72 | logger.info("Making predictions...") 73 | predictions = forecast(model, X, fh=fh) 74 | predictions_start_datetime = predictions.index.get_level_values( 75 | level="datetime_utc" 76 | ).min() 77 | predictions_end_datetime = predictions.index.get_level_values( 78 | level="datetime_utc" 79 | ).max() 80 | logger.info( 81 | f"Forecasted energy consumption from {predictions_start_datetime} to {predictions_end_datetime}." 82 | ) 83 | logger.info("Successfully made predictions.") 84 | 85 | logger.info("Saving predictions...") 86 | save(X, y, predictions) 87 | logger.info("Successfully saved predictions.") 88 | 89 | # Save the predictions to the bucket for monitoring. 90 | logger.info("Merging predictions with cached predictions...") 91 | save_for_monitoring(predictions, start_datetime) 92 | logger.info("Successfully merged predictions with cached predictions...") 93 | 94 | 95 | def load_model_from_model_registry(project, model_version: int): 96 | """ 97 | This function loads a model from the Model Registry. 98 | The model is downloaded, saved locally, and loaded into memory. 99 | """ 100 | 101 | mr = project.get_model_registry() 102 | model_registry_reference = mr.get_model(name="best_model", version=model_version) 103 | model_dir = model_registry_reference.download() 104 | model_path = Path(model_dir) / "best_model.pkl" 105 | 106 | model = utils.load_model(model_path) 107 | 108 | return model 109 | 110 | 111 | def forecast(model, X: pd.DataFrame, fh: int = 24): 112 | """ 113 | Get a forecast of the total load for the given areas and consumer types. 114 | 115 | Args: 116 | model (sklearn.base.BaseEstimator): Fitted model that implements the predict method. 117 | X (pd.DataFrame): Exogenous data with area, consumer_type, and datetime_utc as index. 118 | fh (int): Forecast horizon. 
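            The horizon is expressed in hours, as the forecast index is built with an hourly frequency.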
119 | 120 | Returns: 121 | pd.DataFrame: Forecast of total load for each area, consumer_type, and datetime_utc. 122 | """ 123 | 124 | all_areas = X.index.get_level_values(level=0).unique() 125 | all_consumer_types = X.index.get_level_values(level=1).unique() 126 | latest_datetime = X.index.get_level_values(level=2).max() 127 | 128 | start = latest_datetime + 1 129 | end = start + fh - 1 130 | fh_range = pd.date_range( 131 | start=start.to_timestamp(), end=end.to_timestamp(), freq="H" 132 | ) 133 | fh_range = pd.PeriodIndex(fh_range, freq="H") 134 | 135 | index = pd.MultiIndex.from_product( 136 | [all_areas, all_consumer_types, fh_range], 137 | names=["area", "consumer_type", "datetime_utc"], 138 | ) 139 | X_forecast = pd.DataFrame(index=index) 140 | X_forecast["area_exog"] = X_forecast.index.get_level_values(0) 141 | X_forecast["consumer_type_exog"] = X_forecast.index.get_level_values(1) 142 | 143 | predictions = model.predict(X=X_forecast) 144 | 145 | return predictions 146 | 147 | 148 | def save(X: pd.DataFrame, y: pd.DataFrame, predictions: pd.DataFrame): 149 | """Save the input data, target data, and predictions to GCS.""" 150 | 151 | # Get the bucket object from the GCS client. 152 | bucket = utils.get_bucket() 153 | 154 | # Save the input data and target data to the bucket. 155 | for df, blob_name in zip( 156 | [X, y, predictions], ["X.parquet", "y.parquet", "predictions.parquet"] 157 | ): 158 | logger.info(f"Saving {blob_name} to bucket...") 159 | utils.write_blob_to( 160 | bucket=bucket, 161 | blob_name=blob_name, 162 | data=df, 163 | ) 164 | logger.info(f"Successfully saved {blob_name} to bucket.") 165 | 166 | 167 | def save_for_monitoring(predictions: pd.DataFrame, start_datetime: datetime): 168 | """Save predictions to GCS for monitoring. 169 | 170 | The predictions are saved as a parquet file in GCS. 171 | The predictions are saved in a bucket with the following structure: 172 | gs:///predictions_monitoring.parquet 173 | 174 | The predictions are stored in a multiindex dataframe with the following indexes: 175 | - area: The area of the predictions, e.g. "DK1". 176 | - consumer_type: The consumer type of the predictions, e.g. "residential". 177 | - datetime_utc: The timestamp of the predictions, e.g. "2020-01-01 00:00:00" with a frequency of 1 hour. 178 | """ 179 | 180 | bucket = utils.get_bucket() 181 | 182 | cached_predictions = utils.read_blob_from( 183 | bucket=bucket, blob_name=f"predictions_monitoring.parquet" 184 | ) 185 | has_cached_predictions = cached_predictions is not None 186 | if has_cached_predictions is True: 187 | # Merge predictions with cached predictions. 188 | cached_predictions.index = cached_predictions.index.set_levels( 189 | pd.to_datetime(cached_predictions.index.levels[2], unit="h").to_period("H"), 190 | level=2, 191 | ) 192 | 193 | merged_predictions = predictions.merge( 194 | cached_predictions, 195 | left_index=True, 196 | right_index=True, 197 | how="outer", 198 | suffixes=("_new", "_cached"), 199 | ) 200 | new_predictions = merged_predictions.filter(regex=".*?_new") 201 | new_predictions.columns = new_predictions.columns.str.replace("_new", "") 202 | cached_predictions = merged_predictions.filter(regex=".*?_cached") 203 | cached_predictions.columns = cached_predictions.columns.str.replace( 204 | "_cached", "" 205 | ) 206 | 207 | # NOTE: fillna() not working properly on multindex DataFrames. Got nasty bugs because of it. 
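        # DataFrame.update() aligns both frames on the shared (area, consumer_type, datetime_utc) MultiIndex and copies the cache's non-NA values into new_predictions in place, which fills in the rows that exist only in the cached predictions.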
208 | new_predictions.update(cached_predictions) 209 | predictions = new_predictions 210 | 211 | predictions = predictions.loc[ 212 | predictions.index.get_level_values("datetime_utc") 213 | >= pd.Period(start_datetime, freq="H") 214 | ] 215 | predictions = predictions.dropna(subset=["energy_consumption"]) 216 | 217 | utils.write_blob_to( 218 | bucket=bucket, 219 | blob_name=f"predictions_monitoring.parquet", 220 | data=predictions, 221 | ) 222 | logger.info(f"Successfully cached predictions forecasted before {start_datetime}.") 223 | 224 | 225 | if __name__ == "__main__": 226 | predict() 227 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/batch_prediction_pipeline/data.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Tuple 3 | 4 | import pandas as pd 5 | 6 | from hsfs.feature_store import FeatureStore 7 | 8 | 9 | def load_data_from_feature_store( 10 | fs: FeatureStore, 11 | feature_view_version: int, 12 | start_datetime: datetime, 13 | end_datetime: datetime, 14 | target: str = "energy_consumption", 15 | ) -> Tuple[pd.DataFrame, pd.DataFrame]: 16 | """Loads data for a given time range from the feature store. 17 | 18 | Args: 19 | fs: Feature store. 20 | feature_view_version: Feature view version. 21 | start_datetime: Start datetime. 22 | end_datetime: End datetime. 23 | target: Name of the target feature. 24 | 25 | Returns: 26 | Tuple of exogenous variables and the time series to be forecasted. 27 | """ 28 | 29 | feature_view = fs.get_feature_view( 30 | name="energy_consumption_denmark_view", version=feature_view_version 31 | ) 32 | data = feature_view.get_batch_data(start_time=start_datetime, end_time=end_datetime) 33 | 34 | # Set the index as is required by sktime. 35 | data["datetime_utc"] = pd.PeriodIndex(data["datetime_utc"], freq="H") 36 | data = data.set_index(["area", "consumer_type", "datetime_utc"]).sort_index() 37 | 38 | # Prepare exogenous variables. 39 | X = data.drop(columns=[target]) 40 | # Prepare the time series to be forecasted. 41 | y = data[[target]] 42 | 43 | return X, y 44 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/batch_prediction_pipeline/monitoring.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import hopsworks 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from sktime.performance_metrics.forecasting import mean_absolute_percentage_error 8 | 9 | from batch_prediction_pipeline import data 10 | from batch_prediction_pipeline import settings 11 | from batch_prediction_pipeline import utils 12 | 13 | 14 | logger = utils.get_logger(__name__) 15 | 16 | 17 | def compute(feature_view_version: Optional[int] = None) -> None: 18 | """Computes the metrics on the latest n_days of predictions. 19 | 20 | Args: 21 | feature_view_version: The version of the feature view to load data from the feature store. If None is provided, it will try to load it from the cached feature_view_metadata.json file. 
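        The aggregated metric is the MAPE between the cached predictions and the ground truth loaded from the feature store, computed per datetime_utc and written to the metrics_monitoring.parquet blob in GCS.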
22 | """ 23 | 24 | if feature_view_version is None: 25 | feature_view_metadata = utils.load_json("feature_view_metadata.json") 26 | feature_view_version = feature_view_metadata["feature_view_version"] 27 | 28 | logger.info("Loading old predictions...") 29 | bucket = utils.get_bucket() 30 | predictions = utils.read_blob_from( 31 | bucket=bucket, blob_name=f"predictions_monitoring.parquet" 32 | ) 33 | if predictions is None or len(predictions) == 0: 34 | logger.info( 35 | "Haven't found any predictions to compute the metrics on. Exiting..." 36 | ) 37 | 38 | return 39 | predictions.index = predictions.index.set_levels( 40 | pd.to_datetime(predictions.index.levels[2], unit="h").to_period("H"), level=2 41 | ) 42 | logger.info("Successfully loaded old predictions.") 43 | 44 | logger.info("Connecting to the feature store...") 45 | project = hopsworks.login( 46 | api_key_value=settings.SETTINGS["FS_API_KEY"], 47 | project=settings.SETTINGS["FS_PROJECT_NAME"], 48 | ) 49 | fs = project.get_feature_store() 50 | logger.info("Successfully connected to the feature store.") 51 | 52 | logger.info("Loading latest data from feature store...") 53 | predictions_min_datetime_utc = ( 54 | predictions.index.get_level_values("datetime_utc").min().to_timestamp() 55 | ) 56 | predictions_max_datetime_utc = ( 57 | predictions.index.get_level_values("datetime_utc").max().to_timestamp() 58 | ) 59 | logger.info( 60 | f"Loading predictions from {predictions_min_datetime_utc} to {predictions_max_datetime_utc}." 61 | ) 62 | _, latest_observations = data.load_data_from_feature_store( 63 | fs, 64 | feature_view_version, 65 | start_datetime=predictions_min_datetime_utc, 66 | end_datetime=predictions_max_datetime_utc, 67 | ) 68 | logger.info("Successfully loaded latest data from feature store.") 69 | 70 | if len(latest_observations) == 0: 71 | logger.info( 72 | "Haven't found any new ground truths to compute the metrics on. Exiting..." 73 | ) 74 | 75 | return 76 | 77 | logger.info("Computing metrics...") 78 | predictions = predictions.rename( 79 | columns={"energy_consumption": "energy_consumption_predictions"} 80 | ) 81 | latest_observations = latest_observations.rename( 82 | columns={"energy_consumption": "energy_consumption_observations"} 83 | ) 84 | 85 | predictions["energy_consumption_observations"] = np.nan 86 | predictions.update(latest_observations) 87 | 88 | # Compute metrics only on data points that have ground truth. 89 | predictions = predictions.dropna(subset=["energy_consumption_observations"]) 90 | if len(predictions) == 0: 91 | logger.info( 92 | "Haven't found any new ground truths to compute the metrics on. Exiting..." 
93 | ) 94 | 95 | return 96 | 97 | mape_metrics = predictions.groupby("datetime_utc").apply( 98 | lambda point_in_time: mean_absolute_percentage_error( 99 | point_in_time["energy_consumption_observations"], 100 | point_in_time["energy_consumption_predictions"], 101 | symmetric=False, 102 | ) 103 | ) 104 | mape_metrics = mape_metrics.rename("MAPE") 105 | metrics = mape_metrics.to_frame() 106 | logger.info("Successfully computed metrics...") 107 | 108 | logger.info("Saving new metrics...") 109 | utils.write_blob_to( 110 | bucket=bucket, 111 | blob_name=f"metrics_monitoring.parquet", 112 | data=metrics, 113 | ) 114 | latest_observations = latest_observations.rename( 115 | columns={"energy_consumption_observations": "energy_consumption"} 116 | ) 117 | utils.write_blob_to( 118 | bucket=bucket, 119 | blob_name=f"y_monitoring.parquet", 120 | data=latest_observations[["energy_consumption"]], 121 | ) 122 | logger.info("Successfully saved new metrics.") 123 | 124 | 125 | if __name__ == "__main__": 126 | compute() 127 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/batch_prediction_pipeline/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | from dotenv import load_dotenv 7 | 8 | 9 | warnings.filterwarnings(action="ignore", category=FutureWarning, module="sktime") 10 | 11 | 12 | def load_env_vars(root_dir: Union[str, Path]) -> dict: 13 | """ 14 | Load environment variables from .env.default and .env files. 15 | 16 | Args: 17 | root_dir: Root directory of the .env files. 18 | 19 | Returns: 20 | Dictionary with the environment variables. 21 | """ 22 | 23 | if isinstance(root_dir, str): 24 | root_dir = Path(root_dir) 25 | 26 | load_dotenv(dotenv_path=root_dir / ".env.default") 27 | load_dotenv(dotenv_path=root_dir / ".env", override=True) 28 | 29 | return dict(os.environ) 30 | 31 | 32 | def get_root_dir(default_value: str = ".") -> Path: 33 | """ 34 | Get the root directory of the project. 35 | 36 | Args: 37 | default_value: Default value to use if the environment variable is not set. 38 | 39 | Returns: 40 | Path to the root directory of the project. 41 | """ 42 | 43 | return Path(os.getenv("ML_PIPELINE_ROOT_DIR", default_value)) 44 | 45 | 46 | # The settings will be loaded and the outputs will be saved relative to the 'ML_PIPELINE_ROOT_DIR' directory. 47 | ML_PIPELINE_ROOT_DIR = get_root_dir() 48 | OUTPUT_DIR = ML_PIPELINE_ROOT_DIR / "output" 49 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 50 | 51 | SETTINGS = load_env_vars(root_dir=ML_PIPELINE_ROOT_DIR) 52 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/batch_prediction_pipeline/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from pathlib import Path 4 | from typing import Optional, Union 5 | import joblib 6 | 7 | import pandas as pd 8 | 9 | from google.cloud import storage 10 | 11 | from batch_prediction_pipeline import settings 12 | 13 | 14 | def get_logger(name: str) -> logging.Logger: 15 | """ 16 | Template for getting a logger. 17 | 18 | Args: 19 | name: Name of the logger. 20 | 21 | Returns: Logger. 
22 | """ 23 | 24 | logging.basicConfig(level=logging.INFO) 25 | logger = logging.getLogger(name) 26 | 27 | return logger 28 | 29 | 30 | def load_model(model_path: Union[str, Path]): 31 | """ 32 | Template for loading a model. 33 | 34 | Args: 35 | model_path: Path to the model. 36 | 37 | Returns: Loaded model. 38 | """ 39 | 40 | return joblib.load(model_path) 41 | 42 | 43 | def save_json(data: dict, file_name: str, save_dir: str = settings.OUTPUT_DIR): 44 | """ 45 | Save a dictionary as a JSON file. 46 | 47 | Args: 48 | data: data to save. 49 | file_name: Name of the JSON file. 50 | save_dir: Directory to save the JSON file. 51 | 52 | Returns: None 53 | """ 54 | 55 | data_path = Path(save_dir) / file_name 56 | with open(data_path, "w") as f: 57 | json.dump(data, f) 58 | 59 | 60 | def load_json(file_name: str, save_dir: str = settings.OUTPUT_DIR) -> dict: 61 | """ 62 | Load a JSON file. 63 | 64 | Args: 65 | file_name: Name of the JSON file. 66 | save_dir: Directory of the JSON file. 67 | 68 | Returns: Dictionary with the data. 69 | """ 70 | 71 | data_path = Path(save_dir) / file_name 72 | with open(data_path, "r") as f: 73 | return json.load(f) 74 | 75 | 76 | def get_bucket( 77 | bucket_name: str = settings.SETTINGS["GOOGLE_CLOUD_BUCKET_NAME"], 78 | bucket_project: str = settings.SETTINGS["GOOGLE_CLOUD_PROJECT"], 79 | json_credentials_path: str = settings.SETTINGS[ 80 | "GOOGLE_CLOUD_SERVICE_ACCOUNT_JSON_PATH" 81 | ], 82 | ) -> storage.Bucket: 83 | """Get a Google Cloud Storage bucket. 84 | 85 | This function returns a Google Cloud Storage bucket that can be used to upload and download 86 | files from Google Cloud Storage. 87 | 88 | Args: 89 | bucket_name : str 90 | The name of the bucket to connect to. 91 | bucket_project : str 92 | The name of the project in which the bucket resides. 93 | json_credentials_path : str 94 | Path to the JSON credentials file for your Google Cloud Project. 95 | 96 | Returns 97 | storage.Bucket 98 | A storage bucket that can be used to upload and download files from Google Cloud Storage. 99 | """ 100 | 101 | storage_client = storage.Client.from_service_account_json( 102 | json_credentials_path=json_credentials_path, 103 | project=bucket_project, 104 | ) 105 | bucket = storage_client.bucket(bucket_name=bucket_name) 106 | 107 | return bucket 108 | 109 | 110 | def write_blob_to(bucket: storage.Bucket, blob_name: str, data: pd.DataFrame): 111 | """Write a dataframe to a GCS bucket as a parquet file. 112 | 113 | Args: 114 | bucket (google.cloud.storage.Bucket): The bucket to write to. 115 | blob_name (str): The name of the blob to write to. Must be a parquet file. 116 | data (pd.DataFrame): The dataframe to write to GCS. 117 | """ 118 | 119 | blob = bucket.blob(blob_name=blob_name) 120 | with blob.open("wb") as f: 121 | data.to_parquet(f) 122 | 123 | 124 | def read_blob_from(bucket: storage.Bucket, blob_name: str) -> Optional[pd.DataFrame]: 125 | """Reads a blob from a bucket and returns a dataframe. 126 | 127 | Args: 128 | bucket: The bucket to read from. 129 | blob_name: The name of the blob to read. 130 | 131 | Returns: 132 | A dataframe containing the data from the blob. 
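        None is returned if the blob does not exist yet (e.g., before the first batch prediction run).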
133 | """ 134 | 135 | blob = bucket.blob(blob_name=blob_name) 136 | if not blob.exists(): 137 | return None 138 | 139 | with blob.open("rb") as f: 140 | return pd.read_parquet(f) 141 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "batch-prediction-pipeline" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Iusztin Paul "] 6 | readme = "README.md" 7 | packages = [{include = "batch_prediction_pipeline"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = "~3.9" 11 | category-encoders = "^2.6.0" 12 | hopsworks = "3.4.3" 13 | python-dotenv = "^1.0.0" 14 | lightgbm = "^3.3.5" 15 | sktime = "^0.16.1" 16 | google-cloud-storage = "^2.7.0" 17 | fire = "^0.5.0" 18 | training-pipeline = "^0.1.0" 19 | 20 | 21 | [tool.poetry.group.dev.dependencies] 22 | black = "^23.1.0" 23 | 24 | [build-system] 25 | requires = ["poetry-core"] 26 | build-backend = "poetry.core.masonry.api" 27 | 28 | [[tool.poetry.source]] 29 | name = "test" # This name will be used in the configuration to retreive the proper credentials 30 | url = "http://localhost" # URL used to download your packages from 31 | -------------------------------------------------------------------------------- /deploy/app-docker-compose.local.yml: -------------------------------------------------------------------------------- 1 | version: '3.9' 2 | 3 | services: 4 | frontend: 5 | volumes: 6 | - ./app-frontend:/app/src/ 7 | 8 | monitoring: 9 | volumes: 10 | - ./app-monitoring:/app/src/ 11 | 12 | api: 13 | volumes: 14 | - ./app-api:/app/src/ 15 | environment: 16 | # Enables autoreload. 17 | APP_API_RELOAD: "True" 18 | -------------------------------------------------------------------------------- /deploy/app-docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.9' 2 | 3 | services: 4 | frontend: 5 | build: 6 | dockerfile: app-frontend/Dockerfile 7 | image: app-frontend:${APP_FRONTEND_VERSION:-latest} 8 | restart: always 9 | ports: 10 | - 8501:8501 11 | depends_on: 12 | - api 13 | 14 | monitoring: 15 | build: 16 | dockerfile: app-monitoring/Dockerfile 17 | image: app-monitoring:${APP_MONITORING_VERSION:-latest} 18 | restart: always 19 | ports: 20 | - 8502:8502 21 | depends_on: 22 | - api 23 | 24 | api: 25 | build: 26 | dockerfile: app-api/Dockerfile 27 | image: app-api:${APP_API_VERSION:-latest} 28 | restart: always 29 | volumes: 30 | - ./credentials:/app/src/credentials 31 | env_file: 32 | - app-api/.env 33 | ports: 34 | - 8001:8001 -------------------------------------------------------------------------------- /deploy/ml-pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Build and publish the feature-pipeline, training-pipeline, and batch-prediction-pipeline packages. 4 | # This is done so that the pipelines can be run from the CLI. 5 | # The pipelines are executed in the feature-pipeline, training-pipeline, and batch-prediction-pipeline 6 | # directories, so we must change directories before building and publishing the packages. 7 | # The my-pypi repository must be defined in the project's poetry.toml file. 
8 | 9 | cd feature-pipeline 10 | poetry build 11 | poetry publish -r my-pypi 12 | 13 | cd ../training-pipeline 14 | poetry build 15 | poetry publish -r my-pypi 16 | 17 | cd ../batch-prediction-pipeline 18 | poetry build 19 | poetry publish -r my-pypi 20 | -------------------------------------------------------------------------------- /feature-pipeline/.env.default: -------------------------------------------------------------------------------- 1 | FS_API_KEY = "" 2 | FS_PROJECT_NAME = "" 3 | WANDB_API_KEY = "" 4 | WANDB_ENTITY = "teaching-mlops" 5 | WANDB_PROJECT = "energy_consumption" 6 | GOOGLE_CLOUD_PROJECT = "energy_consumption" 7 | GOOGLE_CLOUD_BUCKET_NAME = "hourly-batch-predictions" 8 | GOOGLE_CLOUD_SERVICE_ACCOUNT_JSON_PATH = "path/to/your/service-account.json" 9 | -------------------------------------------------------------------------------- /feature-pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Feature Pipeline 2 | 3 | Check out [Lesson 1](https://medium.com/towards-data-science/a-framework-for-building-a-production-ready-feature-engineering-pipeline-f0b29609b20f) on Medium to better understand how we built the FE pipeline. 4 | 5 | Also, check out [Lesson 5](https://towardsdatascience.com/ensuring-trustworthy-ml-systems-with-data-validation-and-real-time-monitoring-89ab079f4360) to learn how we implemented the data validation layer using Great Expectations. 6 | 7 | ## Install for Development 8 | 9 | Create virtual environment: 10 | ```shell 11 | cd feature-pipeline 12 | poetry shell 13 | poetry install 14 | ``` 15 | 16 | Check the [Set Up Additional Tools](https://github.com/iusztinpaul/energy-forecasting#-set-up-additional-tools-) and [Usage](https://github.com/iusztinpaul/energy-forecasting#usage) sections to see **how to set up** the **additional tools** and **credentials** you need to run this project. 17 | 18 | ## Usage for Development 19 | 20 | To start the ETL pipeline run: 21 | ```shell 22 | python -m feature_pipeline.pipeline 23 | ``` 24 | 25 | To create a new feature view run: 26 | ```shell 27 | python -m feature_pipeline.feature_view 28 | ``` 29 | 30 | **NOTE:** Be careful to complete the `.env` file and set the `ML_PIPELINE_ROOT_DIR` variable as explained in the [Set Up the ML_PIPELINE_ROOT_DIR Variable](https://github.com/iusztinpaul/energy-forecasting#set-up-the-ml_pipeline_root_dir-variable) section of the main README. 31 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/feature-pipeline/feature_pipeline/__init__.py -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/clean_feature_store.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import hopsworks 3 | 4 | from feature_pipeline import settings 5 | 6 | 7 | def clean(): 8 | """ 9 | Utility function used during development to clean all the data from the feature store.
10 | """ 11 | 12 | project = hopsworks.login( 13 | api_key_value=settings.SETTINGS["FS_API_KEY"], 14 | project=settings.SETTINGS["FS_PROJECT_NAME"], 15 | ) 16 | fs = project.get_feature_store() 17 | 18 | print("Deleting feature views and training datasets...") 19 | try: 20 | feature_views = fs.get_feature_views(name="energy_consumption_denmark_view") 21 | 22 | for feature_view in feature_views: 23 | try: 24 | feature_view.delete() 25 | except Exception as e: 26 | print(e) 27 | except Exception as e: 28 | print(e) 29 | 30 | print("Deleting feature groups...") 31 | try: 32 | feature_groups = fs.get_feature_groups(name="energy_consumption_denmark") 33 | for feature_group in feature_groups: 34 | try: 35 | feature_group.delete() 36 | except Exception as e: 37 | print(e) 38 | except Exception as e: 39 | print(e) 40 | 41 | 42 | if __name__ == "__main__": 43 | fire.Fire(clean) 44 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/etl/__init__.py: -------------------------------------------------------------------------------- 1 | from feature_pipeline.etl.cleaning import * 2 | from feature_pipeline.etl.extract import * 3 | from feature_pipeline.etl.load import * 4 | from feature_pipeline.etl.validation import * 5 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/etl/cleaning.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def rename_columns(df: pd.DataFrame) -> pd.DataFrame: 5 | """ 6 | Rename columns to match our schema. 7 | """ 8 | 9 | data = df.copy() 10 | 11 | # Drop irrelevant columns. 12 | data.drop(columns=["HourDK"], inplace=True) 13 | 14 | # Rename columns 15 | data.rename( 16 | columns={ 17 | "HourUTC": "datetime_utc", 18 | "PriceArea": "area", 19 | "ConsumerType_DE35": "consumer_type", 20 | "TotalCon": "energy_consumption", 21 | }, 22 | inplace=True, 23 | ) 24 | 25 | return data 26 | 27 | 28 | def cast_columns(df: pd.DataFrame) -> pd.DataFrame: 29 | """ 30 | Cast columns to the correct data type. 31 | """ 32 | 33 | data = df.copy() 34 | 35 | data["datetime_utc"] = pd.to_datetime(data["datetime_utc"]) 36 | data["area"] = data["area"].astype("string") 37 | data["consumer_type"] = data["consumer_type"].astype("int32") 38 | data["energy_consumption"] = data["energy_consumption"].astype("float64") 39 | 40 | return data 41 | 42 | 43 | def encode_area_column(df: pd.DataFrame) -> pd.DataFrame: 44 | """ 45 | Encode the area column to integers. 
46 | """ 47 | 48 | data = df.copy() 49 | 50 | area_mappings = {"DK": 0, "DK1": 1, "DK2": 2} 51 | 52 | data["area"] = data["area"].map(lambda string_area: area_mappings.get(string_area)) 53 | data["area"] = data["area"].astype("int8") 54 | 55 | return data 56 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/etl/extract.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from json import JSONDecodeError 3 | from pathlib import Path 4 | from pandas.errors import EmptyDataError 5 | from typing import Any, Dict, Tuple, Optional 6 | 7 | import pandas as pd 8 | import requests 9 | 10 | from yarl import URL 11 | 12 | from feature_pipeline import utils, settings 13 | 14 | 15 | logger = utils.get_logger(__name__) 16 | 17 | 18 | def from_file( 19 | export_end_reference_datetime: Optional[datetime.datetime] = None, 20 | days_delay: int = 15, 21 | days_export: int = 30, 22 | url: str = "https://drive.google.com/uc?export=download&id=1y48YeDymLurOTUO-GeFOUXVNc9MCApG5", 23 | datetime_format: str = "%Y-%m-%d %H:%M", 24 | cache_dir: Optional[Path] = None, 25 | ) -> Optional[Tuple[pd.DataFrame, Dict[str, Any]]]: 26 | """ 27 | Extract data from the DK energy consumption API. 28 | 29 | As the official API expired in July 2023, we will use a copy of the data to simulate the same behavior. 30 | We made a copy of the data between '2020-06-30 22:00' and '2023-06-30 21:00'. Thus, there are 3 years of data to play with. 31 | 32 | Here is the link to the official obsolete dataset: https://www.energidataservice.dk/tso-electricity/ConsumptionDE35Hour 33 | Here is the link to the copy of the dataset: https://drive.google.com/file/d/1y48YeDymLurOTUO-GeFOUXVNc9MCApG5/view?usp=drive_link 34 | 35 | Args: 36 | export_end_reference_datetime: The end reference datetime of the export window. If None, the current time is used. 37 | Because the data is always delayed with "days_delay" days, this date is used only as a reference point. 38 | The real extracted window will be computed as [export_end_reference_datetime - days_delay - days_export, export_end_reference_datetime - days_delay]. 39 | days_delay: Data has a delay of N days. Thus, we have to shift our window with N days. 40 | days_export: The number of days to export. 41 | url: The URL of the API. 42 | datetime_format: The datetime format of the fields from the file. 43 | cache_dir: The directory where the downloaded data will be cached. By default it will be downloaded in the standard output directory. 44 | 45 | 46 | Returns: 47 | A tuple of a Pandas DataFrame containing the exported data and a dictionary of metadata. 
48 | """ 49 | 50 | export_start, export_end = _compute_extraction_window(export_end_reference_datetime=export_end_reference_datetime, days_delay=days_delay, days_export=days_export) 51 | records = _extract_records_from_file_url(url=url, export_start=export_start, export_end=export_end, datetime_format=datetime_format, cache_dir=cache_dir) 52 | 53 | metadata = { 54 | "days_delay": days_delay, 55 | "days_export": days_export, 56 | "url": url, 57 | "export_datetime_utc_start": export_start.strftime(datetime_format), 58 | "export_datetime_utc_end": export_end.strftime(datetime_format), 59 | "datetime_format": datetime_format, 60 | "num_unique_samples_per_time_series": len(records["HourUTC"].unique()), 61 | } 62 | 63 | return records, metadata 64 | 65 | 66 | def _extract_records_from_file_url(url: str, export_start: datetime.datetime, export_end: datetime.datetime, datetime_format: str, cache_dir: Optional[Path] = None) -> Optional[pd.DataFrame]: 67 | """Extract records from the file backup based on the given export window.""" 68 | 69 | if cache_dir is None: 70 | cache_dir = settings.OUTPUT_DIR / "data" 71 | cache_dir.mkdir(parents=True, exist_ok=True) 72 | 73 | file_path = cache_dir / "ConsumptionDE35Hour.csv" 74 | if not file_path.exists(): 75 | logger.info(f"Downloading data from: {url}") 76 | 77 | try: 78 | response = requests.get(url) 79 | except requests.exceptions.HTTPError as e: 80 | logger.error( 81 | f"Response status = {response.status_code}. Could not download the file due to: {e}" 82 | ) 83 | 84 | return None 85 | 86 | if response.status_code != 200: 87 | raise ValueError(f"Response status = {response.status_code}. Could not download the file.") 88 | 89 | with file_path.open("w") as f: 90 | f.write(response.text) 91 | 92 | logger.info(f"Successfully downloaded data to: {file_path}") 93 | else: 94 | logger.info(f"Data already downloaded at: {file_path}") 95 | 96 | try: 97 | data = pd.read_csv(file_path, delimiter=";") 98 | except EmptyDataError: 99 | file_path.unlink(missing_ok=True) 100 | 101 | raise ValueError(f"Downloaded file at {file_path} is empty. Could not load it into a DataFrame.") 102 | 103 | records = data[(data["HourUTC"] >= export_start.strftime(datetime_format)) & (data["HourUTC"] < export_end.strftime(datetime_format))] 104 | 105 | return records 106 | 107 | 108 | def from_api( 109 | export_end_reference_datetime: Optional[datetime.datetime] = None, 110 | days_delay: int = 15, 111 | days_export: int = 30, 112 | url: str = "https://api.energidataservice.dk/dataset/ConsumptionDE35Hour", 113 | datetime_format: str = "%Y-%m-%dT%H:%M:%SZ" 114 | ) -> Optional[Tuple[pd.DataFrame, Dict[str, Any]]]: 115 | """ 116 | Extract data from the DK energy consumption API. 117 | 118 | IMPORTANT NOTE: This dataset will not be updated starting July 2023. The dataset will expire during 2023. 119 | Here is the link to the dataset: https://www.energidataservice.dk/tso-electricity/ConsumptionDE35Hour 120 | 121 | Args: 122 | export_end_reference_datetime: The end reference datetime of the export window. If None, the current time is used. 123 | Because the data is always delayed with "days_delay" days, this date is used only as a reference point. 124 | The real extracted window will be computed as [export_end_reference_datetime - days_delay - days_export, export_end_reference_datetime - days_delay]. 125 | days_delay: Data has a delay of N days. Thus, we have to shift our window with N days. 126 | days_export: The number of days to export. 127 | url: The URL of the API. 
128 | datetime_format: The datetime format of the fields in the API response. 129 | 130 | Returns: 131 | A tuple of a Pandas DataFrame containing the exported data and a dictionary of metadata. 132 | """ 133 | 134 | export_start, export_end = _compute_extraction_window(export_end_reference_datetime=export_end_reference_datetime, days_delay=days_delay, days_export=days_export) 135 | 136 | records = _extract_records_from_api_url(url=url, export_start=export_start, export_end=export_end) 137 | 138 | metadata = { 139 | "days_delay": days_delay, 140 | "days_export": days_export, 141 | "url": url, 142 | "export_datetime_utc_start": export_start.strftime(datetime_format), 143 | "export_datetime_utc_end": export_end.strftime(datetime_format), 144 | "datetime_format": datetime_format, 145 | "num_unique_samples_per_time_series": len(records["HourUTC"].unique()), 146 | } 147 | 148 | return records, metadata 149 | 150 | def _extract_records_from_api_url(url: str, export_start: datetime.datetime, export_end: datetime.datetime): 151 | """Extracts records from the official API based on the given export window.""" 152 | 153 | query_params = { 154 | "offset": 0, 155 | "sort": "HourUTC", 156 | "timezone": "utc", 157 | "start": export_start.strftime("%Y-%m-%dT%H:%M"), 158 | "end": export_end.strftime("%Y-%m-%dT%H:%M"), 159 | } 160 | url = URL(url) % query_params 161 | url = str(url) 162 | logger.info(f"Requesting data from API with URL: {url}") 163 | response = requests.get(url) 164 | logger.info(f"Response received from API with status code: {response.status_code} ") 165 | 166 | # Parse API response. 167 | try: 168 | response = response.json() 169 | except JSONDecodeError: 170 | logger.error( 171 | f"Response status = {response.status_code}. Could not decode response from API with URL: {url}" 172 | ) 173 | 174 | return None 175 | 176 | records = response["records"] 177 | records = pd.DataFrame.from_records(records) 178 | 179 | return records 180 | 181 | def _compute_extraction_window(export_end_reference_datetime: datetime.datetime, days_delay: int, days_export: int) -> Tuple[datetime.datetime, datetime.datetime]: 182 | """Compute the extraction window relative to 'export_end_reference_datetime' and take into consideration the maximum and minimum data points available in the dataset.""" 183 | 184 | if export_end_reference_datetime is None: 185 | # As the dataset will expire in July 2023, we set the export end reference datetime to the last day of June 2023 + the delay. 186 | export_end_reference_datetime = datetime.datetime( 187 | 2023, 6, 30, 21, 0, 0 188 | ) + datetime.timedelta(days=days_delay) 189 | export_end_reference_datetime = export_end_reference_datetime.replace( 190 | minute=0, second=0, microsecond=0 191 | ) 192 | else: 193 | export_end_reference_datetime = export_end_reference_datetime.replace( 194 | minute=0, second=0, microsecond=0 195 | ) 196 | 197 | # TODO: Change the API source, until then we have to clamp the export_end_reference_datetime to the last day of June 2023 to simulate the same behavior. 198 | expiring_dataset_datetime = datetime.datetime(2023, 6, 30, 21, 0, 0) + datetime.timedelta( 199 | days=days_delay 200 | ) 201 | if export_end_reference_datetime > expiring_dataset_datetime: 202 | export_end_reference_datetime = expiring_dataset_datetime 203 | 204 | logger.warning( 205 | "We clapped 'export_end_reference_datetime' to 'datetime(2023, 6, 30) + datetime.timedelta(days=days_delay)' as \ 206 | the dataset will not be updated starting from July 2023. 
The dataset will expire during 2023. \ 207 | Check out the following link for more information: https://www.energidataservice.dk/tso-electricity/ConsumptionDE35Hour" 208 | ) 209 | 210 | export_end = export_end_reference_datetime - datetime.timedelta(days=days_delay) 211 | export_start = export_end_reference_datetime - datetime.timedelta( 212 | days=days_delay + days_export 213 | ) 214 | 215 | min_export_start = datetime.datetime(2020, 6, 30, 22, 0, 0) 216 | if export_start < min_export_start: 217 | export_start = min_export_start 218 | export_end = export_start + datetime.timedelta(days=days_export) 219 | 220 | logger.warning( 221 | "We clamped 'export_start' to 'datetime(2020, 6, 30, 22, 0, 0)' and 'export_end' to 'export_start + datetime.timedelta(days=days_export)' as this is the earliest window available in the dataset." 222 | ) 223 | 224 | return export_start, export_end 225 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/etl/load.py: -------------------------------------------------------------------------------- 1 | import hopsworks 2 | import pandas as pd 3 | from great_expectations.core import ExpectationSuite 4 | from hsfs.feature_group import FeatureGroup 5 | 6 | from feature_pipeline.settings import SETTINGS 7 | 8 | 9 | def to_feature_store( 10 | data: pd.DataFrame, 11 | validation_expectation_suite: ExpectationSuite, 12 | feature_group_version: int, 13 | ) -> FeatureGroup: 14 | """ 15 | This function takes in a pandas DataFrame and a validation expectation suite, 16 | performs validation on the data using the suite, and then saves the data to a 17 | feature group in the feature store. 18 | """ 19 | 20 | # Connect to feature store. 21 | project = hopsworks.login( 22 | api_key_value=SETTINGS["FS_API_KEY"], project=SETTINGS["FS_PROJECT_NAME"] 23 | ) 24 | feature_store = project.get_feature_store() 25 | 26 | # Create feature group. 27 | energy_feature_group = feature_store.get_or_create_feature_group( 28 | name="energy_consumption_denmark", 29 | version=feature_group_version, 30 | description="Denmark hourly energy consumption data. Data is uploaded with a 15-day delay.", 31 | primary_key=["area", "consumer_type"], 32 | event_time="datetime_utc", 33 | online_enabled=False, 34 | expectation_suite=validation_expectation_suite, 35 | ) 36 | # Upload data. 37 | energy_feature_group.insert( 38 | features=data, 39 | overwrite=False, 40 | write_options={ 41 | "wait_for_job": True, 42 | }, 43 | ) 44 | 45 | # Add feature descriptions. 46 | feature_descriptions = [ 47 | { 48 | "name": "datetime_utc", 49 | "description": """ 50 | Datetime interval in UTC when the data was observed. 51 | """, 52 | "validation_rules": "Always full hours, i.e. minutes are 00", 53 | }, 54 | { 55 | "name": "area", 56 | "description": """ 57 | Denmark is divided into two price areas, separated by the Great Belt: DK1 and DK2. 58 | If price area is “DK”, the data covers all Denmark. 59 | """, 60 | "validation_rules": "0 (DK), 1 (DK1) or 2 (DK2) (int)", 61 | }, 62 | { 63 | "name": "consumer_type", 64 | "description": """ 65 | The consumer type is the Industry Code DE35 which is owned by Danish Energy. 66 | The code is used by Danish energy companies.
67 | """, 68 | "validation_rules": ">0 (int)", 69 | }, 70 | { 71 | "name": "energy_consumption", 72 | "description": "Total electricity consumption in kWh.", 73 | "validation_rules": ">=0 (float)", 74 | }, 75 | ] 76 | for description in feature_descriptions: 77 | energy_feature_group.update_feature_description( 78 | description["name"], description["description"] 79 | ) 80 | 81 | # Update statistics. 82 | energy_feature_group.statistics_config = { 83 | "enabled": True, 84 | "histograms": True, 85 | "correlations": True, 86 | } 87 | energy_feature_group.update_statistics_config() 88 | energy_feature_group.compute_statistics() 89 | 90 | return energy_feature_group 91 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/etl/validation.py: -------------------------------------------------------------------------------- 1 | from great_expectations.core import ExpectationSuite, ExpectationConfiguration 2 | 3 | 4 | def build_expectation_suite() -> ExpectationSuite: 5 | """ 6 | Builder used to retrieve an instance of the validation expectation suite. 7 | """ 8 | 9 | expectation_suite_energy_consumption = ExpectationSuite( 10 | expectation_suite_name="energy_consumption_suite" 11 | ) 12 | 13 | # Columns. 14 | expectation_suite_energy_consumption.add_expectation( 15 | ExpectationConfiguration( 16 | expectation_type="expect_table_columns_to_match_ordered_list", 17 | kwargs={ 18 | "column_list": [ 19 | "datetime_utc", 20 | "area", 21 | "consumer_type", 22 | "energy_consumption", 23 | ] 24 | }, 25 | ) 26 | ) 27 | expectation_suite_energy_consumption.add_expectation( 28 | ExpectationConfiguration( 29 | expectation_type="expect_table_column_count_to_equal", kwargs={"value": 4} 30 | ) 31 | ) 32 | 33 | # Datetime UTC 34 | expectation_suite_energy_consumption.add_expectation( 35 | ExpectationConfiguration( 36 | expectation_type="expect_column_values_to_not_be_null", 37 | kwargs={"column": "datetime_utc"}, 38 | ) 39 | ) 40 | 41 | # Area 42 | expectation_suite_energy_consumption.add_expectation( 43 | ExpectationConfiguration( 44 | expectation_type="expect_column_distinct_values_to_be_in_set", 45 | kwargs={"column": "area", "value_set": (0, 1, 2)}, 46 | ) 47 | ) 48 | expectation_suite_energy_consumption.add_expectation( 49 | ExpectationConfiguration( 50 | expectation_type="expect_column_values_to_be_of_type", 51 | kwargs={"column": "area", "type_": "int8"}, 52 | ) 53 | ) 54 | 55 | # Consumer type 56 | expectation_suite_energy_consumption.add_expectation( 57 | ExpectationConfiguration( 58 | expectation_type="expect_column_distinct_values_to_be_in_set", 59 | kwargs={ 60 | "column": "consumer_type", 61 | "value_set": ( 62 | 111, 63 | 112, 64 | 119, 65 | 121, 66 | 122, 67 | 123, 68 | 130, 69 | 211, 70 | 212, 71 | 215, 72 | 220, 73 | 310, 74 | 320, 75 | 330, 76 | 340, 77 | 350, 78 | 360, 79 | 370, 80 | 381, 81 | 382, 82 | 390, 83 | 410, 84 | 421, 85 | 422, 86 | 431, 87 | 432, 88 | 433, 89 | 441, 90 | 442, 91 | 443, 92 | 444, 93 | 445, 94 | 446, 95 | 447, 96 | 450, 97 | 461, 98 | 462, 99 | 999, 100 | ), 101 | }, 102 | ) 103 | ) 104 | expectation_suite_energy_consumption.add_expectation( 105 | ExpectationConfiguration( 106 | expectation_type="expect_column_values_to_be_of_type", 107 | kwargs={"column": "consumer_type", "type_": "int32"}, 108 | ) 109 | ) 110 | 111 | # Energy consumption 112 | expectation_suite_energy_consumption.add_expectation( 113 | ExpectationConfiguration( 114 | expectation_type="expect_column_min_to_be_between", 115 | kwargs={ 116 | 
"column": "energy_consumption", 117 | "min_value": 0, 118 | "strict_min": False, 119 | }, 120 | ) 121 | ) 122 | expectation_suite_energy_consumption.add_expectation( 123 | ExpectationConfiguration( 124 | expectation_type="expect_column_values_to_be_of_type", 125 | kwargs={"column": "energy_consumption", "type_": "float64"}, 126 | ) 127 | ) 128 | expectation_suite_energy_consumption.add_expectation( 129 | ExpectationConfiguration( 130 | expectation_type="expect_column_values_to_not_be_null", 131 | kwargs={"column": "energy_consumption"}, 132 | ) 133 | ) 134 | 135 | return expectation_suite_energy_consumption 136 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/feature_view.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Optional 3 | 4 | import fire 5 | import hopsworks 6 | 7 | from feature_pipeline import utils 8 | from feature_pipeline import settings 9 | import hsfs 10 | 11 | 12 | logger = utils.get_logger(__name__) 13 | 14 | 15 | def create( 16 | feature_group_version: Optional[int] = None, 17 | start_datetime: Optional[datetime] = None, 18 | end_datetime: Optional[datetime] = None, 19 | ) -> dict: 20 | """Create a new feature view version and training dataset 21 | based on the given feature group version and start and end datetimes. 22 | 23 | Args: 24 | feature_group_version (Optional[int]): The version of the 25 | feature group. If None is provided, it will try to load it 26 | from the cached feature_pipeline_metadata.json file. 27 | start_datetime (Optional[datetime]): The start 28 | datetime of the training dataset that will be created. 29 | If None is provided, it will try to load it 30 | from the cached feature_pipeline_metadata.json file. 31 | end_datetime (Optional[datetime]): The end 32 | datetime of the training dataset that will be created. 33 | If None is provided, it will try to load it 34 | from the cached feature_pipeline_metadata.json file. 35 | 36 | Returns: 37 | dict: The feature group version. 38 | 39 | """ 40 | 41 | if feature_group_version is None: 42 | feature_pipeline_metadata = utils.load_json("feature_pipeline_metadata.json") 43 | feature_group_version = feature_pipeline_metadata["feature_group_version"] 44 | 45 | if start_datetime is None or end_datetime is None: 46 | feature_pipeline_metadata = utils.load_json("feature_pipeline_metadata.json") 47 | start_datetime = datetime.strptime( 48 | feature_pipeline_metadata["export_datetime_utc_start"], 49 | feature_pipeline_metadata["datetime_format"], 50 | ) 51 | end_datetime = datetime.strptime( 52 | feature_pipeline_metadata["export_datetime_utc_end"], 53 | feature_pipeline_metadata["datetime_format"], 54 | ) 55 | 56 | project = hopsworks.login( 57 | api_key_value=settings.SETTINGS["FS_API_KEY"], 58 | project=settings.SETTINGS["FS_PROJECT_NAME"], 59 | ) 60 | fs = project.get_feature_store() 61 | 62 | # Delete old feature views as the free tier only allows 100 feature views. 63 | # NOTE: Normally you would not want to delete feature views. We do it here just to stay in the free tier. 
64 | try: 65 | feature_views = fs.get_feature_views(name="energy_consumption_denmark_view") 66 | except hsfs.client.exceptions.RestAPIError: 67 | logger.info("No feature views found for energy_consumption_denmark_view.") 68 | 69 | feature_views = [] 70 | 71 | for feature_view in feature_views: 72 | try: 73 | feature_view.delete_all_training_datasets() 74 | except hsfs.client.exceptions.RestAPIError: 75 | logger.error( 76 | f"Failed to delete training datasets for feature view {feature_view.name} with version {feature_view.version}." 77 | ) 78 | 79 | try: 80 | feature_view.delete() 81 | except hsfs.client.exceptions.RestAPIError: 82 | logger.error( 83 | f"Failed to delete feature view {feature_view.name} with version {feature_view.version}." 84 | ) 85 | 86 | # Create feature view in the given feature group version. 87 | energy_consumption_fg = fs.get_feature_group( 88 | "energy_consumption_denmark", version=feature_group_version 89 | ) 90 | ds_query = energy_consumption_fg.select_all() 91 | feature_view = fs.create_feature_view( 92 | name="energy_consumption_denmark_view", 93 | description="Energy consumption for Denmark forecasting model.", 94 | query=ds_query, 95 | labels=[], 96 | ) 97 | 98 | # Create training dataset. 99 | logger.info( 100 | f"Creating training dataset between {start_datetime} and {end_datetime}." 101 | ) 102 | feature_view.create_training_data( 103 | description="Energy consumption training dataset", 104 | data_format="csv", 105 | start_time=start_datetime, 106 | end_time=end_datetime, 107 | write_options={"wait_for_job": True}, 108 | coalesce=False, 109 | ) 110 | 111 | # Save metadata. 112 | metadata = { 113 | "feature_view_version": feature_view.version, 114 | "training_dataset_version": 1, 115 | } 116 | utils.save_json( 117 | metadata, 118 | file_name="feature_view_metadata.json", 119 | ) 120 | 121 | return metadata 122 | 123 | 124 | if __name__ == "__main__": 125 | fire.Fire(create) 126 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import Optional 3 | import fire 4 | import pandas as pd 5 | 6 | from feature_pipeline.etl import cleaning, load, extract, validation 7 | from feature_pipeline import utils 8 | 9 | logger = utils.get_logger(__name__) 10 | 11 | 12 | def run( 13 | export_end_reference_datetime: Optional[datetime.datetime] = None, 14 | days_delay: int = 15, 15 | days_export: int = 30, 16 | url: str = "https://drive.google.com/uc?export=download&id=1y48YeDymLurOTUO-GeFOUXVNc9MCApG5", 17 | feature_group_version: int = 1, 18 | ) -> dict: 19 | """ 20 | Extract data from the API, transform it, and load it to the feature store. 21 | 22 | As the official API expired in July 2023, we will use a copy of the data to simulate the same behavior. 23 | We made a copy of the data between '2020-06-30 22:00' and '2023-06-30 21:00'. Thus, there are 3 years of data to play with. 24 | 25 | Here is the link to the official obsolete dataset: https://www.energidataservice.dk/tso-electricity/ConsumptionDE35Hour 26 | Here is the link to the copy of the dataset: https://drive.google.com/file/d/1y48YeDymLurOTUO-GeFOUXVNc9MCApG5/view?usp=drive_link 27 | 28 | Args: 29 | export_end_reference_datetime: The end reference datetime of the export window. If None, the current time is used. 30 | Because the data is always delayed with "days_delay" days, this date is used only as a reference point. 
31 | The real extracted window will be computed as [export_end_reference_datetime - days_delay - days_export, export_end_reference_datetime - days_delay]. 32 | days_delay: Data has a delay of N days. Thus, we have to shift our window with N days. 33 | days_export: The number of days to export. 34 | url: The URL of the API or of the copy of the data stored on GitHub. 35 | feature_group_version: The version of the feature store feature group to save the data to. 36 | 37 | Returns: 38 | A dictionary containing metadata of the pipeline. 39 | """ 40 | 41 | logger.info(f"Extracting data from API.") 42 | data, metadata = extract.from_file( 43 | export_end_reference_datetime, days_delay, days_export, url 44 | ) 45 | if metadata["num_unique_samples_per_time_series"] < days_export * 24: 46 | raise RuntimeError( 47 | f"Could not extract the expected number of samples from the api: {metadata['num_unique_samples_per_time_series']} < {days_export * 24}. \ 48 | Check out the API at: https://www.energidataservice.dk/tso-electricity/ConsumptionDE35Hour " 49 | ) 50 | logger.info("Successfully extracted data from API.") 51 | 52 | logger.info(f"Transforming data.") 53 | data = transform(data) 54 | logger.info("Successfully transformed data.") 55 | 56 | logger.info("Building validation expectation suite.") 57 | validation_expectation_suite = validation.build_expectation_suite() 58 | logger.info("Successfully built validation expectation suite.") 59 | 60 | logger.info(f"Validating data and loading it to the feature store.") 61 | load.to_feature_store( 62 | data, 63 | validation_expectation_suite=validation_expectation_suite, 64 | feature_group_version=feature_group_version, 65 | ) 66 | metadata["feature_group_version"] = feature_group_version 67 | logger.info("Successfully validated data and loaded it to the feature store.") 68 | 69 | logger.info(f"Wrapping up the pipeline.") 70 | utils.save_json(metadata, file_name="feature_pipeline_metadata.json") 71 | logger.info("Done!") 72 | 73 | return metadata 74 | 75 | 76 | def transform(data: pd.DataFrame): 77 | """ 78 | Wrapper containing all the transformations from the ETL pipeline. 79 | """ 80 | 81 | data = cleaning.rename_columns(data) 82 | data = cleaning.cast_columns(data) 83 | data = cleaning.encode_area_column(data) 84 | 85 | return data 86 | 87 | 88 | if __name__ == "__main__": 89 | fire.Fire(run) 90 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from typing import Union 4 | 5 | from dotenv import load_dotenv 6 | 7 | 8 | def load_env_vars(root_dir: Union[str, Path]) -> dict: 9 | """ 10 | Load environment variables from .env.default and .env files. 11 | 12 | Args: 13 | root_dir: Root directory of the .env files. 14 | 15 | Returns: 16 | Dictionary with the environment variables. 17 | """ 18 | 19 | if isinstance(root_dir, str): 20 | root_dir = Path(root_dir) 21 | 22 | load_dotenv(dotenv_path=root_dir / ".env.default") 23 | load_dotenv(dotenv_path=root_dir / ".env", override=True) 24 | 25 | return dict(os.environ) 26 | 27 | 28 | def get_root_dir(default_value: str = ".") -> Path: 29 | """ 30 | Get the root directory of the project. 31 | 32 | Args: 33 | default_value: Default value to use if the environment variable is not set. 34 | 35 | Returns: 36 | Path to the root directory of the project. 
37 | """ 38 | 39 | return Path(os.getenv("ML_PIPELINE_ROOT_DIR", default_value)) 40 | 41 | 42 | ML_PIPELINE_ROOT_DIR = get_root_dir() 43 | OUTPUT_DIR = ML_PIPELINE_ROOT_DIR / "output" 44 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 45 | 46 | SETTINGS = load_env_vars(root_dir=ML_PIPELINE_ROOT_DIR) 47 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from pathlib import Path 4 | 5 | from feature_pipeline import settings 6 | 7 | 8 | def get_logger(name: str) -> logging.Logger: 9 | """ 10 | Template for getting a logger. 11 | 12 | Args: 13 | name: Name of the logger. 14 | 15 | Returns: Logger. 16 | """ 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | logger = logging.getLogger(name) 20 | 21 | return logger 22 | 23 | 24 | def save_json(data: dict, file_name: str, save_dir: str = settings.OUTPUT_DIR): 25 | """ 26 | Save a dictionary as a JSON file. 27 | 28 | Args: 29 | data: data to save. 30 | file_name: Name of the JSON file. 31 | save_dir: Directory to save the JSON file. 32 | 33 | Returns: None 34 | """ 35 | 36 | data_path = Path(save_dir) / file_name 37 | with open(data_path, "w") as f: 38 | json.dump(data, f) 39 | 40 | 41 | def load_json(file_name: str, save_dir: str = settings.OUTPUT_DIR) -> dict: 42 | """ 43 | Load a JSON file. 44 | 45 | Args: 46 | file_name: Name of the JSON file. 47 | save_dir: Directory of the JSON file. 48 | 49 | Returns: Dictionary with the data. 50 | """ 51 | 52 | data_path = Path(save_dir) / file_name 53 | if not data_path.exists(): 54 | raise FileNotFoundError(f"Cached JSON from {data_path} does not exist.") 55 | 56 | with open(data_path, "r") as f: 57 | return json.load(f) 58 | -------------------------------------------------------------------------------- /feature-pipeline/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "feature-pipeline" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Iusztin Paul ", "Kurtis Pykes "] 6 | readme = "README.md" 7 | packages = [{include = "feature_pipeline"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = "~3.9" 11 | hopsworks = "3.4.3" 12 | fire = "^0.5.0" 13 | yarl = "^1.8.2" 14 | pandas = ">=1.3.5" 15 | requests = "^2.28.2" 16 | python-dotenv = ">=0.21.1" 17 | 18 | [tool.poetry.group.dev.dependencies] 19 | black = "^23.1.0" 20 | 21 | [build-system] 22 | requires = ["poetry-core"] 23 | build-backend = "poetry.core.masonry.api" 24 | -------------------------------------------------------------------------------- /images/airflow_login_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/airflow_login_screenshot.png -------------------------------------------------------------------------------- /images/airflow_ml_pipeline_dag_overview_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/airflow_ml_pipeline_dag_overview_screenshot.png -------------------------------------------------------------------------------- /images/airflow_ml_pipeline_dag_screenshot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/airflow_ml_pipeline_dag_screenshot.png -------------------------------------------------------------------------------- /images/airflow_variables_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/airflow_variables_screenshot.png -------------------------------------------------------------------------------- /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/architecture.png -------------------------------------------------------------------------------- /images/forecasting_demo_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/forecasting_demo_screenshot.png -------------------------------------------------------------------------------- /images/gcp_expose_ports_firewall_rule_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/gcp_expose_ports_firewall_rule_screenshot.png -------------------------------------------------------------------------------- /images/gcp_gcs_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/gcp_gcs_screenshot.png -------------------------------------------------------------------------------- /images/gcp_iap_for_tcp_firewall_rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/gcp_iap_for_tcp_firewall_rule.png -------------------------------------------------------------------------------- /images/gcp_ssh_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/gcp_ssh_screenshot.png -------------------------------------------------------------------------------- /images/gcp_vm_external_ip_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/gcp_vm_external_ip_screenshot.png -------------------------------------------------------------------------------- /images/github_actions_secrets_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/github_actions_secrets_screenshot.png -------------------------------------------------------------------------------- /images/github_actions_see_cicd_screenshot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/github_actions_see_cicd_screenshot.png -------------------------------------------------------------------------------- /images/github_actions_variables_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/github_actions_variables_screenshot.png -------------------------------------------------------------------------------- /images/gmail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/gmail.png -------------------------------------------------------------------------------- /images/linkedin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/linkedin.png -------------------------------------------------------------------------------- /images/medium.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/medium.png -------------------------------------------------------------------------------- /images/screenshot_introduction_video.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/screenshot_introduction_video.png -------------------------------------------------------------------------------- /images/substack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/substack.png -------------------------------------------------------------------------------- /images/twitter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/twitter.png -------------------------------------------------------------------------------- /scripts/install_poetry_macos_m1_chip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Fetch the utils.sh script from a URL and source it 4 | UTILS_SCRIPT=$(curl -s https://raw.githubusercontent.com/gao-hongnan/common-utils/main/scripts/utils.sh) 5 | source /dev/stdin <<<"$UTILS_SCRIPT" 6 | logger "INFO" "Fetched the utils.sh script from a URL and sourced it" 7 | 8 | resolve_hopswork() { 9 | # Check if librdkafka is installed and at the correct version 10 | installed_versions=$(brew list --versions librdkafka) 11 | required_version="1.9.2" # replace this with the version you want 12 | 13 | if ! 
echo "$installed_versions" | grep -q "$required_version"; then 14 | # If librdkafka is not installed or not at the correct version, proceed with installation 15 | 16 | # see https://community.hopsworks.ai/t/ssl-handshake-failed-on-macos-hopsworks-serverless/886/3 17 | curl -O https://raw.githubusercontent.com/Homebrew/homebrew-core/f7d0f40bbc4075177ecf16812fd95951a723a996/Formula/librdkafka.rb 18 | brew install --build-from-source librdkafka.rb 19 | rm librdkafka.rb 20 | else 21 | logger "INFO" "librdkafka is already installed at version: $(brew list --versions librdkafka)" 22 | fi 23 | 24 | # Set VERSION to the required version, assuming it is now installed 25 | VERSION=$required_version 26 | # use below if the librdkafka version is fixed 27 | # VERSION=$(ls /opt/homebrew/Cellar/librdkafka | tail -n 1) 28 | 29 | # Export necessary environment variables 30 | export C_INCLUDE_PATH=/opt/homebrew/Cellar/librdkafka/$VERSION/include 31 | export LIBRARY_PATH=/opt/homebrew/Cellar/librdkafka/$VERSION/lib 32 | } 33 | 34 | resolve_lightgbm() { 35 | # see https://stackoverflow.com/questions/74566704/cannot-install-lightgbm-3-3-3-on-apple-silicon 36 | # Check if cmake is installed 37 | if ! brew list --versions cmake >/dev/null; then 38 | # If cmake is not installed, install it 39 | brew install cmake 40 | else 41 | logger "INFO" "cmake is already installed at version: $(cmake --version)" 42 | fi 43 | 44 | # Check if libomp is installed 45 | if ! brew list --versions libomp >/dev/null; then 46 | # If libomp is not installed, install it 47 | brew install libomp 48 | else 49 | logger "INFO" "libomp is already installed at version: $(brew list --versions libomp)" 50 | fi 51 | } 52 | 53 | 54 | custom_install_hopswork_and_lightgbm_if_arm64() { 55 | # Check if on macOS with M1 or ARM chip 56 | if [[ "$(uname -m)" == "arm64" ]]; then 57 | logger "INFO" "Installing librdkafka for M1 chip" 58 | resolve_hopswork 59 | resolve_lightgbm 60 | fi 61 | } 62 | 63 | custom_install_hopswork_and_lightgbm_if_arm64 -------------------------------------------------------------------------------- /training-pipeline/.env.default: -------------------------------------------------------------------------------- 1 | FS_API_KEY = "" 2 | FS_PROJECT_NAME = "" 3 | WANDB_API_KEY = "" 4 | WANDB_ENTITY = "teaching-mlops" 5 | WANDB_PROJECT = "energy_consumption" 6 | GOOGLE_CLOUD_PROJECT = "energy_consumption" 7 | GOOGLE_CLOUD_BUCKET_NAME = "hourly-batch-predictions" 8 | GOOGLE_CLOUD_SERVICE_ACCOUNT_JSON_PATH = "path/to/your/service-account.json" 9 | -------------------------------------------------------------------------------- /training-pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Training Pipeline 2 | 3 | Check out [Lesson 2](https://medium.com/towards-data-science/a-guide-to-building-effective-training-pipelines-for-maximum-results-6fdaef594cee) on Medium to better understand how we built the training pipeline. 4 | 5 | ## Install for Development 6 | 7 | Create virtual environment: 8 | ```shell 9 | cd training-pipeline 10 | poetry shell 11 | poetry install 12 | ``` 13 | 14 | Check the [Set Up Additional Tools](https://github.com/iusztinpaul/energy-forecasting#-set-up-additional-tools-) and [Usage](https://github.com/iusztinpaul/energy-forecasting#usage) sections to see **how to set up** the **additional tools** and **credentials** you need to run this project. 15 | 16 | 17 | ## Usage for Development 18 | 19 |
**Run the scripts in the following order:**

20 | 21 | 22 | 1. Start the hyperparameter tuning script: 23 | ```shell 24 | python -m training_pipeline.hyperparameter_tuning 25 | ``` 26 | 27 | 2. Upload the best config based on the previous hyperparameter tuning step: 28 | ```shell 29 | python -m training_pipeline.best_config 30 | ``` 31 | 3. Start the training script using the best configuration uploaded in the previous step: 32 | ```shell 33 | python -m training_pipeline.train 34 | ``` 35 | 36 | **NOTE:** Be careful to complete the `.env` file and set the `ML_PIPELINE_ROOT_DIR` variable as explained in the [Set Up the ML_PIPELINE_ROOT_DIR Variable](https://github.com/iusztinpaul/energy-forecasting#set-up-the-ml_pipeline_root_dir-variable) section of the main README. 37 | -------------------------------------------------------------------------------- /training-pipeline/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "training-pipeline" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Iusztin Paul "] 6 | readme = "README.md" 7 | packages = [{include = "training_pipeline"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = "~3.9" 11 | pyarrow = "^11.0.0" 12 | wandb = "^0.14.0" 13 | matplotlib = "^3.7.1" 14 | hopsworks = "3.4.3" 15 | python-dotenv = "^1.0.0" 16 | lightgbm = "^3.3.5" 17 | sktime = "^0.16.1" 18 | seaborn = "^0.12.2" 19 | fire = "^0.5.0" 20 | Jinja2 = "3.0.1" 21 | 22 | [tool.poetry.group.dev.dependencies] 23 | black = "^23.1.0" 24 | 25 | [build-system] 26 | requires = ["poetry-core"] 27 | build-backend = "poetry.core.masonry.api" 28 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/training-pipeline/training_pipeline/__init__.py -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/best_config.py: -------------------------------------------------------------------------------- 1 | import json 2 | import fire 3 | import wandb 4 | 5 | from typing import Optional 6 | 7 | from training_pipeline import utils 8 | from training_pipeline.settings import SETTINGS, OUTPUT_DIR 9 | 10 | logger = utils.get_logger(__name__) 11 | 12 | 13 | """ 14 | NOTE: We moved the logic that logs the best model to a separate process because of a bug in W&B sweeps: 15 | any new run created right after a sweep overwrites the sweep's last run. 16 | This would overwrite the wrong run and make us load the wrong config. 17 | """ 18 | 19 | 20 | def upload(sweep_id: Optional[str] = None): 21 | """Upload the best config from the given sweep to the "best_experiment" wandb Artifact. 22 | 23 | Args: 24 | sweep_id (Optional[str], optional): Sweep ID to look for the best config. If None, it will look for the last sweep in the cached last_sweep_metadata.json file. Defaults to None.
25 | """ 26 | 27 | if sweep_id is None: 28 | last_sweep_metadata = utils.load_json("last_sweep_metadata.json") 29 | sweep_id = last_sweep_metadata["sweep_id"] 30 | 31 | logger.info(f"Loading sweep_id from last_sweep_metadata.json with {sweep_id=}") 32 | 33 | api = wandb.Api() 34 | sweep = api.sweep( 35 | f"{SETTINGS['WANDB_ENTITY']}/{SETTINGS['WANDB_PROJECT']}/{sweep_id}" 36 | ) 37 | best_run = sweep.best_run() 38 | 39 | with utils.init_wandb_run( 40 | name="best_experiment", 41 | job_type="hpo", 42 | group="train", 43 | run_id=best_run.id, 44 | resume="must", 45 | ) as run: 46 | run.use_artifact("config:latest") 47 | 48 | best_config = dict(run.config) 49 | 50 | logger.info(f"Best run {best_run.name}") 51 | logger.info("Best run config:") 52 | logger.info(best_config) 53 | logger.info( 54 | f"Best run = {best_run.name} with results {dict(run.summary['validation'])}" 55 | ) 56 | 57 | config_path = OUTPUT_DIR / "best_config.json" 58 | with open(config_path, "w") as f: 59 | json.dump(best_config, f, indent=4) 60 | 61 | artifact = wandb.Artifact( 62 | name="best_config", 63 | type="model", 64 | metadata={"results": {"validation": dict(run.summary["validation"])}}, 65 | ) 66 | artifact.add_file(str(config_path)) 67 | run.log_artifact(artifact) 68 | 69 | run.finish() 70 | 71 | 72 | if __name__ == "__main__": 73 | fire.Fire(upload) 74 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from training_pipeline.configs import gridsearch 2 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/configs/gridsearch.py: -------------------------------------------------------------------------------- 1 | # NOTE: In a production environment, we would move this to a YAML file and load it from there. 2 | # Also, we would use random or bayesian search + early stopping to speed up the process. 
3 | sweep_configs = { 4 | "method": "grid", 5 | "metric": {"name": "validation.MAPE", "goal": "minimize"}, 6 | "parameters": { 7 | "forecaster__estimator__n_jobs": {"values": [-1]}, 8 | "forecaster__estimator__n_estimators": {"values": [1000, 2000, 2500]}, 9 | "forecaster__estimator__learning_rate": {"values": [0.1, 0.15]}, 10 | "forecaster__estimator__max_depth": {"values": [-1, 5]}, 11 | "forecaster__estimator__reg_lambda": {"values": [0, 0.01, 0.015]}, 12 | "daily_season__manual_selection": {"values": [["day_of_week", "hour_of_day"]]}, 13 | "forecaster_transformers__window_summarizer__lag_feature__lag": { 14 | "values": [list(range(1, 73))] 15 | }, 16 | "forecaster_transformers__window_summarizer__lag_feature__mean": { 17 | "values": [[[1, 24], [1, 48], [1, 72]]] 18 | }, 19 | "forecaster_transformers__window_summarizer__lag_feature__std": { 20 | "values": [[[1, 24], [1, 48]]] 21 | }, 22 | "forecaster_transformers__window_summarizer__n_jobs": {"values": [1]}, 23 | }, 24 | } 25 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/data.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | import hopsworks 3 | import pandas as pd 4 | import wandb 5 | 6 | from sktime.forecasting.model_selection import temporal_train_test_split 7 | 8 | from training_pipeline.utils import init_wandb_run 9 | from training_pipeline.settings import SETTINGS 10 | 11 | 12 | def load_dataset_from_feature_store( 13 | feature_view_version: int, training_dataset_version: int, fh: int = 24 14 | ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: 15 | """Load features from feature store. 16 | 17 | Args: 18 | feature_view_version (int): feature store feature view version to load data from 19 | training_dataset_version (int): feature store training dataset version to load data from 20 | fh (int, optional): Forecast horizon. Defaults to 24. 21 | 22 | Returns: 23 | Train and test splits loaded from the feature store as pandas dataframes. 
24 | """ 25 | 26 | project = hopsworks.login( 27 | api_key_value=SETTINGS["FS_API_KEY"], project=SETTINGS["FS_PROJECT_NAME"] 28 | ) 29 | fs = project.get_feature_store() 30 | 31 | with init_wandb_run( 32 | name="load_training_data", job_type="load_feature_view", group="dataset" 33 | ) as run: 34 | feature_view = fs.get_feature_view( 35 | name="energy_consumption_denmark_view", version=feature_view_version 36 | ) 37 | data, _ = feature_view.get_training_data( 38 | training_dataset_version=training_dataset_version 39 | ) 40 | 41 | fv_metadata = feature_view.to_dict() 42 | fv_metadata["query"] = fv_metadata["query"].to_string() 43 | fv_metadata["features"] = [f.name for f in fv_metadata["features"]] 44 | fv_metadata["link"] = feature_view._feature_view_engine._get_feature_view_url( 45 | feature_view 46 | ) 47 | fv_metadata["feature_view_version"] = feature_view_version 48 | fv_metadata["training_dataset_version"] = training_dataset_version 49 | 50 | raw_data_at = wandb.Artifact( 51 | name="energy_consumption_denmark_feature_view", 52 | type="feature_view", 53 | metadata=fv_metadata, 54 | ) 55 | run.log_artifact(raw_data_at) 56 | 57 | run.finish() 58 | 59 | with init_wandb_run( 60 | name="train_test_split", job_type="prepare_dataset", group="dataset" 61 | ) as run: 62 | run.use_artifact("energy_consumption_denmark_feature_view:latest") 63 | 64 | y_train, y_test, X_train, X_test = prepare_data(data, fh=fh) 65 | 66 | for split in ["train", "test"]: 67 | split_X = locals()[f"X_{split}"] 68 | split_y = locals()[f"y_{split}"] 69 | 70 | split_metadata = { 71 | "timespan": [ 72 | split_X.index.get_level_values(-1).min(), 73 | split_X.index.get_level_values(-1).max(), 74 | ], 75 | "dataset_size": len(split_X), 76 | "num_areas": len(split_X.index.get_level_values(0).unique()), 77 | "num_consumer_types": len(split_X.index.get_level_values(1).unique()), 78 | "y_features": split_y.columns.tolist(), 79 | "X_features": split_X.columns.tolist(), 80 | } 81 | artifact = wandb.Artifact( 82 | name=f"split_{split}", 83 | type="split", 84 | metadata=split_metadata, 85 | ) 86 | run.log_artifact(artifact) 87 | 88 | run.finish() 89 | 90 | return y_train, y_test, X_train, X_test 91 | 92 | 93 | def prepare_data( 94 | data: pd.DataFrame, target: str = "energy_consumption", fh: int = 24 95 | ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: 96 | """ 97 | Structure the data for training: 98 | - Set the index as is required by sktime. 99 | - Prepare exogenous variables. 100 | - Prepare the time series to be forecasted. 101 | - Split the data into train and test sets. 102 | """ 103 | 104 | # Set the index as is required by sktime. 105 | data["datetime_utc"] = pd.PeriodIndex(data["datetime_utc"], freq="H") 106 | data = data.set_index(["area", "consumer_type", "datetime_utc"]).sort_index() 107 | 108 | # Prepare exogenous variables. 109 | X = data.drop(columns=[target]) 110 | # Prepare the time series to be forecasted. 
111 | y = data[[target]] 112 | 113 | y_train, y_test, X_train, X_test = temporal_train_test_split(y, X, test_size=fh) 114 | 115 | return y_train, y_test, X_train, X_test 116 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/hyperparameter_tuning.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from typing import Optional 3 | 4 | import fire 5 | import numpy as np 6 | import pandas as pd 7 | import wandb 8 | 9 | from matplotlib import pyplot as plt 10 | from sktime.forecasting.model_evaluation import evaluate as cv_evaluate 11 | from sktime.forecasting.model_selection import ExpandingWindowSplitter 12 | from sktime.performance_metrics.forecasting import MeanAbsolutePercentageError 13 | from sktime.utils.plotting import plot_windows 14 | 15 | from training_pipeline import utils 16 | from training_pipeline.configs import gridsearch as gridsearch_configs 17 | from training_pipeline.data import load_dataset_from_feature_store 18 | from training_pipeline.models import build_model 19 | from training_pipeline.utils import init_wandb_run 20 | from training_pipeline.settings import SETTINGS, OUTPUT_DIR 21 | 22 | 23 | logger = utils.get_logger(__name__) 24 | 25 | 26 | def run( 27 | fh: int = 24, 28 | feature_view_version: Optional[int] = None, 29 | training_dataset_version: Optional[int] = None, 30 | ) -> dict: 31 | """Run hyperparameter optimization search. 32 | 33 | Args: 34 | fh (int, optional): Forecasting horizon. Defaults to 24. 35 | feature_view_version (Optional[int], optional): feature store - feature view version. 36 | If none, it will try to load the version from the cached feature_view_metadata.json file. Defaults to None. 37 | training_dataset_version (Optional[int], optional): feature store - feature view - training dataset version. 38 | If none, it will try to load the version from the cached feature_view_metadata.json file. Defaults to None. 39 | 40 | Returns: 41 | dict: Dictionary containing metadata about the hyperparameter optimization run. 
42 | """ 43 | 44 | feature_view_metadata = utils.load_json("feature_view_metadata.json") 45 | if feature_view_version is None: 46 | feature_view_version = feature_view_metadata["feature_view_version"] 47 | if training_dataset_version is None: 48 | training_dataset_version = feature_view_metadata["training_dataset_version"] 49 | 50 | y_train, _, X_train, _ = load_dataset_from_feature_store( 51 | feature_view_version=feature_view_version, 52 | training_dataset_version=training_dataset_version, 53 | fh=fh, 54 | ) 55 | 56 | sweep_id = run_hyperparameter_optimization(y_train, X_train, fh=fh) 57 | 58 | metadata = {"sweep_id": sweep_id} 59 | utils.save_json(metadata, file_name="last_sweep_metadata.json") 60 | 61 | return metadata 62 | 63 | 64 | def run_hyperparameter_optimization( 65 | y_train: pd.DataFrame, X_train: pd.DataFrame, fh: int 66 | ): 67 | """Runs hyperparameter optimization search using W&B sweeps.""" 68 | 69 | sweep_id = wandb.sweep( 70 | sweep=gridsearch_configs.sweep_configs, project=SETTINGS["WANDB_PROJECT"] 71 | ) 72 | 73 | wandb.agent( 74 | project=SETTINGS["WANDB_PROJECT"], 75 | sweep_id=sweep_id, 76 | function=partial(run_sweep, y_train=y_train, X_train=X_train, fh=fh), 77 | ) 78 | 79 | return sweep_id 80 | 81 | 82 | def run_sweep(y_train: pd.DataFrame, X_train: pd.DataFrame, fh: int): 83 | """Runs a single hyperparameter optimization step (train + CV eval) using W&B sweeps.""" 84 | 85 | with init_wandb_run( 86 | name="experiment", job_type="hpo", group="train", add_timestamp_to_name=True 87 | ) as run: 88 | run.use_artifact("split_train:latest") 89 | 90 | config = wandb.config 91 | config = dict(config) 92 | model = build_model(config) 93 | 94 | model, results = train_model_cv(model, y_train, X_train, fh=fh) 95 | wandb.log(results) 96 | 97 | metadata = { 98 | "experiment": {"name": run.name, "fh": fh}, 99 | "results": results, 100 | "config": config, 101 | } 102 | artifact = wandb.Artifact( 103 | name=f"config", 104 | type="model", 105 | metadata=metadata, 106 | ) 107 | run.log_artifact(artifact) 108 | 109 | run.finish() 110 | 111 | 112 | def train_model_cv( 113 | model, y_train: pd.DataFrame, X_train: pd.DataFrame, fh: int, k: int = 3 114 | ): 115 | """Train and evaluate the given model using cross-validation.""" 116 | 117 | data_length = len(y_train.index.get_level_values(-1).unique()) 118 | assert data_length >= fh * 10, "Not enough data to perform a 3 fold CV." 
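    # The splitter below advances by `data_length // k` samples per fold and starts from an
    # initial window of at least three forecasting horizons, so every expanding-window fold
    # has enough history to fit the model before being scored on the next `fh` hours.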
119 | 120 | cv_step_length = data_length // k 121 | initial_window = max(fh * 3, cv_step_length - fh) 122 | cv = ExpandingWindowSplitter( 123 | step_length=cv_step_length, fh=np.arange(fh) + 1, initial_window=initial_window 124 | ) 125 | render_cv_scheme(cv, y_train) 126 | 127 | results = cv_evaluate( 128 | forecaster=model, 129 | y=y_train, 130 | X=X_train, 131 | cv=cv, 132 | strategy="refit", 133 | scoring=MeanAbsolutePercentageError(symmetric=False), 134 | error_score="raise", 135 | return_data=False, 136 | ) 137 | 138 | results = results.rename( 139 | columns={ 140 | "test_MeanAbsolutePercentageError": "MAPE", 141 | "fit_time": "fit_time", 142 | "pred_time": "prediction_time", 143 | } 144 | ) 145 | mean_results = results[["MAPE", "fit_time", "prediction_time"]].mean(axis=0) 146 | mean_results = mean_results.to_dict() 147 | results = {"validation": mean_results} 148 | 149 | logger.info(f"Validation MAPE: {results['validation']['MAPE']:.2f}") 150 | logger.info(f"Mean fit time: {results['validation']['fit_time']:.2f} s") 151 | logger.info(f"Mean predict time: {results['validation']['prediction_time']:.2f} s") 152 | 153 | return model, results 154 | 155 | 156 | def render_cv_scheme(cv, y_train: pd.DataFrame) -> str: 157 | """Render the CV scheme used for training and log it to W&B.""" 158 | 159 | random_time_series = ( 160 | y_train.groupby(level=[0, 1]) 161 | .get_group((1, 111)) 162 | .reset_index(level=[0, 1], drop=True) 163 | ) 164 | plot_windows(cv, random_time_series) 165 | 166 | save_path = str(OUTPUT_DIR / "cv_scheme.png") 167 | plt.savefig(save_path) 168 | wandb.log({"cv_scheme": wandb.Image(save_path)}) 169 | 170 | return save_path 171 | 172 | 173 | if __name__ == "__main__": 174 | fire.Fire(run) 175 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/models.py: -------------------------------------------------------------------------------- 1 | import lightgbm as lgb 2 | 3 | from sktime.forecasting.compose import make_reduction, ForecastingPipeline 4 | from sktime.forecasting.naive import NaiveForecaster 5 | from sktime.transformations.series.date import DateTimeFeatures 6 | from sktime.transformations.series.summarize import WindowSummarizer 7 | 8 | from training_pipeline import transformers 9 | 10 | 11 | def build_model(config: dict): 12 | """ 13 | Build an Sktime model using the given config. 
14 | 15 | It supports defaults for windowing the following parameters: 16 | - lag: list(range(1, 72 + 1)) 17 | - mean: [[1, 24], [1, 48], [1, 72]] 18 | - std: [[1, 24], [1, 48], [1, 72]] 19 | """ 20 | 21 | lag = config.pop( 22 | "forecaster_transformers__window_summarizer__lag_feature__lag", 23 | list(range(1, 72 + 1)), 24 | ) 25 | mean = config.pop( 26 | "forecaster_transformers__window_summarizer__lag_feature__mean", 27 | [[1, 24], [1, 48], [1, 72]], 28 | ) 29 | std = config.pop( 30 | "forecaster_transformers__window_summarizer__lag_feature__std", 31 | [[1, 24], [1, 48], [1, 72]], 32 | ) 33 | n_jobs = config.pop("forecaster_transformers__window_summarizer__n_jobs", 1) 34 | window_summarizer = WindowSummarizer( 35 | **{"lag_feature": {"lag": lag, "mean": mean, "std": std}}, 36 | n_jobs=n_jobs, 37 | ) 38 | 39 | regressor = lgb.LGBMRegressor() 40 | forecaster = make_reduction( 41 | regressor, 42 | transformers=[window_summarizer], 43 | strategy="recursive", 44 | pooling="global", 45 | window_length=None, 46 | ) 47 | 48 | pipe = ForecastingPipeline( 49 | steps=[ 50 | ("attach_area_and_consumer_type", transformers.AttachAreaConsumerType()), 51 | ( 52 | "daily_season", 53 | DateTimeFeatures( 54 | manual_selection=["day_of_week", "hour_of_day"], 55 | keep_original_columns=True, 56 | ), 57 | ), 58 | ("forecaster", forecaster), 59 | ] 60 | ) 61 | pipe = pipe.set_params(**config) 62 | 63 | return pipe 64 | 65 | 66 | def build_baseline_model(seasonal_periodicity: int): 67 | """Builds a naive forecaster baseline model using Sktime that predicts the last value given a seasonal periodicity.""" 68 | 69 | return NaiveForecaster(sp=seasonal_periodicity) 70 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | import matplotlib 7 | from dotenv import load_dotenv 8 | 9 | 10 | warnings.filterwarnings(action="ignore", category=FutureWarning, module="sktime") 11 | matplotlib.use("Agg") 12 | 13 | 14 | def load_env_vars(root_dir: Union[str, Path]) -> dict: 15 | """ 16 | Load environment variables from .env.default and .env files. 17 | 18 | Args: 19 | root_dir: Root directory of the .env files. 20 | 21 | Returns: 22 | Dictionary with the environment variables. 23 | """ 24 | 25 | if isinstance(root_dir, str): 26 | root_dir = Path(root_dir) 27 | 28 | load_dotenv(dotenv_path=root_dir / ".env.default") 29 | load_dotenv(dotenv_path=root_dir / ".env", override=True) 30 | 31 | return dict(os.environ) 32 | 33 | 34 | def get_root_dir(default_value: str = ".") -> Path: 35 | """ 36 | Get the root directory of the project. 37 | 38 | Args: 39 | default_value: Default value to use if the environment variable is not set. 40 | 41 | Returns: 42 | Path to the root directory of the project. 
43 | """ 44 | 45 | return Path(os.getenv("ML_PIPELINE_ROOT_DIR", default_value)) 46 | 47 | 48 | ML_PIPELINE_ROOT_DIR = get_root_dir() 49 | OUTPUT_DIR = ML_PIPELINE_ROOT_DIR / "output" 50 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 51 | 52 | SETTINGS = load_env_vars(root_dir=ML_PIPELINE_ROOT_DIR) 53 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/train.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import OrderedDict 3 | import os 4 | from pathlib import Path 5 | from typing import OrderedDict as OrderedDictType, Optional, Tuple 6 | 7 | import fire 8 | import hopsworks 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import pandas as pd 12 | import wandb 13 | from sktime.performance_metrics.forecasting import ( 14 | mean_squared_percentage_error, 15 | mean_absolute_percentage_error, 16 | ) 17 | from sktime.utils.plotting import plot_series 18 | 19 | 20 | from training_pipeline import utils 21 | from training_pipeline.settings import SETTINGS, OUTPUT_DIR 22 | from training_pipeline.data import load_dataset_from_feature_store 23 | from training_pipeline.models import build_model, build_baseline_model 24 | 25 | 26 | logger = utils.get_logger(__name__) 27 | 28 | 29 | def from_best_config( 30 | fh: int = 24, 31 | feature_view_version: Optional[int] = None, 32 | training_dataset_version: Optional[int] = None, 33 | ) -> dict: 34 | """Train and evaluate on the test set the best model found in the hyperparameter optimization run. 35 | After training and evaluating it uploads the artifacts to wandb & hopsworks model registries. 36 | 37 | Args: 38 | fh (int, optional): Forecasting horizon. Defaults to 24. 39 | feature_view_version (Optional[int], optional): feature store - feature view version. 40 | If none, it will try to load the version from the cached feature_view_metadata.json file. Defaults to None. 41 | training_dataset_version (Optional[int], optional): feature store - feature view - training dataset version. 42 | If none, it will try to load the version from the cached feature_view_metadata.json file. Defaults to None. 43 | 44 | Returns: 45 | dict: Dictionary containing metadata about the training experiment. 46 | """ 47 | 48 | feature_view_metadata = utils.load_json("feature_view_metadata.json") 49 | if feature_view_version is None: 50 | feature_view_version = feature_view_metadata["feature_view_version"] 51 | if training_dataset_version is None: 52 | training_dataset_version = feature_view_metadata["training_dataset_version"] 53 | 54 | y_train, y_test, X_train, X_test = load_dataset_from_feature_store( 55 | feature_view_version=feature_view_version, 56 | training_dataset_version=training_dataset_version, 57 | fh=fh, 58 | ) 59 | 60 | training_start_datetime = y_train.index.get_level_values("datetime_utc").min() 61 | training_end_datetime = y_train.index.get_level_values("datetime_utc").max() 62 | testing_start_datetime = y_test.index.get_level_values("datetime_utc").min() 63 | testing_end_datetime = y_test.index.get_level_values("datetime_utc").max() 64 | logger.info( 65 | f"Training model on data from {training_start_datetime} to {training_end_datetime}." 66 | ) 67 | logger.info( 68 | f"Testing model on data from {testing_start_datetime} to {testing_end_datetime}." 69 | ) 70 | # Loading predictions from 2023-04-06 22:00:00 to 2023-04-07 21:00:00. 
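    # The W&B run below reuses the `split_train` / `split_test` artifacts and the
    # `best_config` artifact produced by the hyperparameter optimization step, then trains
    # and evaluates both a naive baseline and the best model configuration on the test set.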
71 | 
72 |     with utils.init_wandb_run(
73 |         name="best_model",
74 |         job_type="train_best_model",
75 |         group="train",
76 |         reinit=True,
77 |         add_timestamp_to_name=True,
78 |     ) as run:
79 |         run.use_artifact("split_train:latest")
80 |         run.use_artifact("split_test:latest")
81 |         # Load the best config from sweep.
82 |         best_config_artifact = run.use_artifact(
83 |             "best_config:latest",
84 |             type="model",
85 |         )
86 |         download_dir = best_config_artifact.download()
87 |         config_path = Path(download_dir) / "best_config.json"
88 |         with open(config_path) as f:
89 |             config = json.load(f)
90 |         # Log the config to the experiment.
91 |         run.config.update(config)
92 | 
93 |         # Baseline model.
94 |         baseline_forecaster = build_baseline_model(seasonal_periodicity=fh)
95 |         baseline_forecaster = train_model(baseline_forecaster, y_train, X_train, fh=fh)
96 |         _, metrics_baseline = evaluate(baseline_forecaster, y_test, X_test)
97 |         slices = metrics_baseline.pop("slices")
98 |         for k, v in metrics_baseline.items():
99 |             logger.info(f"Baseline test {k}: {v}")
100 |         wandb.log({"test": {"baseline": metrics_baseline}})
101 |         wandb.log({"test.baseline.slices": wandb.Table(dataframe=slices)})
102 | 
103 |         # Build & train best model.
104 |         best_model = build_model(config)
105 |         best_forecaster = train_model(best_model, y_train, X_train, fh=fh)
106 | 
107 |         # Evaluate best model.
108 |         y_pred, metrics = evaluate(best_forecaster, y_test, X_test)
109 |         slices = metrics.pop("slices")
110 |         for k, v in metrics.items():
111 |             logger.info(f"Model test {k}: {v}")
112 |         wandb.log({"test": {"model": metrics}})
113 |         wandb.log({"test.model.slices": wandb.Table(dataframe=slices)})
114 | 
115 |         # Render best model on the test set.
116 |         results = OrderedDict({"y_train": y_train, "y_test": y_test, "y_pred": y_pred})
117 |         render(results, prefix="images_test")
118 | 
119 |         # Update best model with the test set.
120 |         # NOTE: Method update() is not supported by LightGBM + Sktime. Instead we will retrain the model on the entire dataset.
121 |         # best_forecaster = best_forecaster.update(y_test, X=X_test)
122 |         best_forecaster = train_model(
123 |             model=best_forecaster,
124 |             y_train=pd.concat([y_train, y_test]).sort_index(),
125 |             X_train=pd.concat([X_train, X_test]).sort_index(),
126 |             fh=fh,
127 |         )
128 |         X_forecast = compute_forecast_exogenous_variables(X_test, fh)
129 |         y_forecast = forecast(best_forecaster, X_forecast)
130 |         logger.info(
131 |             f"Forecasted future values for rendering between {X_forecast.index.get_level_values('datetime_utc').min()} and {X_forecast.index.get_level_values('datetime_utc').max()}."
132 |         )
133 |         results = OrderedDict(
134 |             {
135 |                 "y_train": y_train,
136 |                 "y_test": y_test,
137 |                 "y_forecast": y_forecast,
138 |             }
139 |         )
140 |         # Render best model future forecasts.
141 |         render(results, prefix="images_forecast")
142 | 
143 |         # Save best model.
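        # The forecaster is pickled with joblib (utils.save_model), logged to W&B as a
        # versioned `best_model` artifact and, once the run finishes, registered in the
        # Hopsworks model registry via add_best_model_to_model_registry().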
144 | save_model_path = OUTPUT_DIR / "best_model.pkl" 145 | utils.save_model(best_forecaster, save_model_path) 146 | metadata = { 147 | "experiment": { 148 | "fh": fh, 149 | "feature_view_version": feature_view_version, 150 | "training_dataset_version": training_dataset_version, 151 | "training_start_datetime": training_start_datetime.to_timestamp().isoformat(), 152 | "training_end_datetime": training_end_datetime.to_timestamp().isoformat(), 153 | "testing_start_datetime": testing_start_datetime.to_timestamp().isoformat(), 154 | "testing_end_datetime": testing_end_datetime.to_timestamp().isoformat(), 155 | }, 156 | "results": {"test": metrics}, 157 | } 158 | artifact = wandb.Artifact(name="best_model", type="model", metadata=metadata) 159 | artifact.add_file(str(save_model_path)) 160 | run.log_artifact(artifact) 161 | 162 | run.finish() 163 | artifact.wait() 164 | 165 | model_version = add_best_model_to_model_registry(artifact) 166 | 167 | metadata = {"model_version": model_version} 168 | utils.save_json(metadata, file_name="train_metadata.json") 169 | 170 | return metadata 171 | 172 | 173 | def train_model(model, y_train: pd.DataFrame, X_train: pd.DataFrame, fh: int): 174 | """Train the forecaster on the given training set and forecast horizon.""" 175 | 176 | fh = np.arange(fh) + 1 177 | model.fit(y_train, X=X_train, fh=fh) 178 | 179 | return model 180 | 181 | 182 | def evaluate( 183 | forecaster, y_test: pd.DataFrame, X_test: pd.DataFrame 184 | ) -> Tuple[pd.DataFrame, dict]: 185 | """Evaluate the forecaster on the test set by computing the following metrics: 186 | - RMSPE 187 | - MAPE 188 | - Slices: RMSPE, MAPE 189 | 190 | Args: 191 | forecaster: model following the sklearn API 192 | y_test (pd.DataFrame): time series to forecast 193 | X_test (pd.DataFrame): exogenous variables 194 | 195 | Returns: 196 | The predictions as a pd.DataFrame and a dict of metrics. 197 | """ 198 | 199 | y_pred = forecaster.predict(X=X_test) 200 | 201 | # Compute aggregated metrics. 202 | results = dict() 203 | rmspe = mean_squared_percentage_error(y_test, y_pred, squared=False) 204 | results["RMSPE"] = rmspe 205 | mape = mean_absolute_percentage_error(y_test, y_pred, symmetric=False) 206 | results["MAPE"] = mape 207 | 208 | # Compute metrics per slice. 209 | y_test_slices = y_test.groupby(["area", "consumer_type"]) 210 | y_pred_slices = y_pred.groupby(["area", "consumer_type"]) 211 | slices = pd.DataFrame(columns=["area", "consumer_type", "RMSPE", "MAPE"]) 212 | for y_test_slice, y_pred_slice in zip(y_test_slices, y_pred_slices): 213 | (area_y_test, consumer_type_y_test), y_test_slice_data = y_test_slice 214 | (area_y_pred, consumer_type_y_pred), y_pred_slice_data = y_pred_slice 215 | 216 | assert ( 217 | area_y_test == area_y_pred and consumer_type_y_test == consumer_type_y_pred 218 | ), "Slices are not aligned." 
219 | 
220 |         rmspe_slice = mean_squared_percentage_error(
221 |             y_test_slice_data, y_pred_slice_data, squared=False
222 |         )
223 |         mape_slice = mean_absolute_percentage_error(
224 |             y_test_slice_data, y_pred_slice_data, symmetric=False
225 |         )
226 | 
227 |         slice_results = pd.DataFrame(
228 |             {
229 |                 "area": [area_y_test],
230 |                 "consumer_type": [consumer_type_y_test],
231 |                 "RMSPE": [rmspe_slice],
232 |                 "MAPE": [mape_slice],
233 |             }
234 |         )
235 |         slices = pd.concat([slices, slice_results], ignore_index=True)
236 | 
237 |     results["slices"] = slices
238 | 
239 |     return y_pred, results
240 | 
241 | 
242 | def render(
243 |     timeseries: OrderedDictType[str, pd.DataFrame],
244 |     prefix: Optional[str] = None,
245 |     delete_from_disk: bool = True,
246 | ):
247 |     """Render the timeseries as a single plot per (area, consumer_type) and save them to disk and to wandb."""
248 | 
249 |     grouped_timeseries = OrderedDict()
250 |     for split, df in timeseries.items():
251 |         df = df.reset_index(level=[0, 1])
252 |         groups = df.groupby(["area", "consumer_type"])
253 |         for group_name, split_group_values in groups:
254 |             group_values = grouped_timeseries.get(group_name, {})
255 | 
256 |             grouped_timeseries[group_name] = {
257 |                 f"{split}": split_group_values["energy_consumption"],
258 |                 **group_values,
259 |             }
260 | 
261 |     output_dir = OUTPUT_DIR / prefix if prefix else OUTPUT_DIR
262 |     output_dir.mkdir(parents=True, exist_ok=True)
263 |     for group_name, group_values_dict in grouped_timeseries.items():
264 |         fig, ax = plot_series(
265 |             *group_values_dict.values(), labels=group_values_dict.keys()
266 |         )
267 |         fig.suptitle(f"Area: {group_name[0]} - Consumer type: {group_name[1]}")
268 | 
269 |         # save matplotlib image
270 |         image_save_path = str(output_dir / f"{group_name[0]}_{group_name[1]}.png")
271 |         plt.savefig(image_save_path)
272 |         plt.close(fig)
273 | 
274 |         if prefix:
275 |             wandb.log({prefix: wandb.Image(image_save_path)})
276 |         else:
277 |             wandb.log({"images": wandb.Image(image_save_path)})
278 | 
279 |         if delete_from_disk:
280 |             os.remove(image_save_path)
281 | 
282 | 
283 | def compute_forecast_exogenous_variables(X_test: pd.DataFrame, fh: int):
284 |     """Computes the exogenous variables for the forecast horizon."""
285 | 
286 |     X_forecast = X_test.copy()
287 |     X_forecast.index.set_levels(
288 |         X_forecast.index.levels[-1] + fh, level=-1, inplace=True
289 |     )
290 | 
291 |     return X_forecast
292 | 
293 | 
294 | def forecast(forecaster, X_forecast: pd.DataFrame):
295 |     """Forecast the energy consumption for the given exogenous variables and time horizon."""
296 | 
297 |     return forecaster.predict(X=X_forecast)
298 | 
299 | 
300 | def add_best_model_to_model_registry(best_model_artifact: wandb.Artifact) -> int:
301 |     """Adds the best model artifact to the model registry."""
302 | 
303 |     project = hopsworks.login(
304 |         api_key_value=SETTINGS["FS_API_KEY"], project=SETTINGS["FS_PROJECT_NAME"]
305 |     )
306 | 
307 |     # Upload the model to the Hopsworks model registry.
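    # The W&B artifact is downloaded locally and registered as a Python model together with
    # its test metrics; the returned version is what the caller stores in train_metadata.json.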
308 | best_model_dir = best_model_artifact.download() 309 | best_model_path = Path(best_model_dir) / "best_model.pkl" 310 | best_model_metrics = best_model_artifact.metadata["results"]["test"] 311 | 312 | mr = project.get_model_registry() 313 | py_model = mr.python.create_model("best_model", metrics=best_model_metrics) 314 | py_model.save(best_model_path) 315 | 316 | return py_model.version 317 | 318 | 319 | if __name__ == "__main__": 320 | fire.Fire(from_best_config) 321 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/transformers.py: -------------------------------------------------------------------------------- 1 | from sktime.transformations.base import BaseTransformer 2 | from sktime.transformations.compose import CORE_MTYPES 3 | 4 | 5 | class AttachAreaConsumerType(BaseTransformer): 6 | """Transformer used to extract the area and consumer type from the index to the input data.""" 7 | 8 | _tags = { 9 | "capability:inverse_transform": True, # can the transformer inverse transform? 10 | "univariate-only": False, # can the transformer handle multivariate X? 11 | "X_inner_mtype": CORE_MTYPES, # which mtypes do _fit/_predict support for X? 12 | # this can be a Panel mtype even if transform-input is Series, vectorized 13 | "y_inner_mtype": "None", # which mtypes do _fit/_predict support for y? 14 | "fit_is_empty": True, # is fit empty and can be skipped? Yes = True 15 | "transform-returns-same-time-index": True, 16 | # does transform return have the same time index as input X 17 | "handles-missing-data": True, # can estimator handle missing data? 18 | } 19 | 20 | def _transform(self, X, y=None): 21 | X["area_exog"] = X.index.get_level_values(0) 22 | X["consumer_type_exog"] = X.index.get_level_values(1) 23 | 24 | return X 25 | 26 | def _inverse_transform(self, X, y=None): 27 | X = X.drop(columns=["area_exog", "consumer_type_exog"]) 28 | 29 | return X 30 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import joblib 4 | import pandas as pd 5 | import wandb 6 | 7 | from pathlib import Path 8 | from typing import Union, Optional 9 | 10 | 11 | from training_pipeline import settings 12 | 13 | 14 | def save_json(data: dict, file_name: str, save_dir: str = settings.OUTPUT_DIR): 15 | """ 16 | Save a dictionary as a JSON file. 17 | 18 | Args: 19 | data: data to save. 20 | file_name: Name of the JSON file. 21 | save_dir: Directory to save the JSON file. 22 | 23 | Returns: None 24 | """ 25 | 26 | data_path = Path(save_dir) / file_name 27 | with open(data_path, "w") as f: 28 | json.dump(data, f) 29 | 30 | 31 | def load_json(file_name: str, save_dir: str = settings.OUTPUT_DIR) -> dict: 32 | """ 33 | Load a JSON file. 34 | 35 | Args: 36 | file_name: Name of the JSON file. 37 | save_dir: Directory of the JSON file. 38 | 39 | Returns: Dictionary with the data. 40 | """ 41 | 42 | data_path = Path(save_dir) / file_name 43 | with open(data_path, "r") as f: 44 | return json.load(f) 45 | 46 | 47 | def save_model(model, model_path: Union[str, Path]): 48 | """ 49 | Template for saving a model. 50 | 51 | Args: 52 | model: Trained model. 53 | model_path: Path to save the model. 54 | """ 55 | 56 | joblib.dump(model, model_path) 57 | 58 | 59 | def load_model(model_path: Union[str, Path]): 60 | """ 61 | Template for loading a model. 
62 | 
63 |     Args:
64 |         model_path: Path to the model.
65 | 
66 |     Returns: Loaded model.
67 |     """
68 | 
69 |     return joblib.load(model_path)
70 | 
71 | 
72 | def load_data_from_parquet(data_path: str) -> pd.DataFrame:
73 |     """
74 |     Template for loading data from a parquet file.
75 | 
76 |     Args:
77 |         data_path: Path to the parquet file.
78 | 
79 |     Returns: Dataframe with the data.
80 |     """
81 | 
82 |     return pd.read_parquet(data_path)
83 | 
84 | 
85 | def get_logger(name: str) -> logging.Logger:
86 |     """
87 |     Template for getting a logger.
88 | 
89 |     Args:
90 |         name: Name of the logger.
91 | 
92 |     Returns: Logger.
93 |     """
94 | 
95 |     logging.basicConfig(level=logging.INFO)
96 |     logger = logging.getLogger(name)
97 | 
98 |     return logger
99 | 
100 | 
101 | def init_wandb_run(
102 |     name: str,
103 |     group: Optional[str] = None,
104 |     job_type: Optional[str] = None,
105 |     add_timestamp_to_name: bool = False,
106 |     run_id: Optional[str] = None,
107 |     resume: Optional[str] = None,
108 |     reinit: bool = False,
109 |     project: str = settings.SETTINGS["WANDB_PROJECT"],
110 |     entity: str = settings.SETTINGS["WANDB_ENTITY"],
111 | ):
112 |     """Wrapper over the wandb.init function."""
113 | 
114 |     if add_timestamp_to_name:
115 |         name = f"{name}_{pd.Timestamp.now().strftime('%Y-%m-%d_%H-%M-%S')}"
116 | 
117 |     run = wandb.init(
118 |         project=project,
119 |         entity=entity,
120 |         name=name,
121 |         group=group,
122 |         job_type=job_type,
123 |         id=run_id,
124 |         reinit=reinit,
125 |         resume=resume,
126 |     )
127 | 
128 |     return run
129 | 
130 | 
131 | def check_if_artifact_exists(
132 |     artifact_name: str,
133 |     project: str = settings.SETTINGS["WANDB_PROJECT"],
134 |     entity: str = settings.SETTINGS["WANDB_ENTITY"],
135 | ) -> bool:
136 |     """Utility function that checks if a W&B artifact exists."""
137 | 
138 |     try:
139 |         get_artifact(artifact_name, project, entity)
140 | 
141 |         return True
142 |     except wandb.errors.CommError:
143 |         return False
144 | 
145 | 
146 | def get_artifact(
147 |     artifact_name: str,
148 |     project: str = settings.SETTINGS["WANDB_PROJECT"],
149 |     entity: str = settings.SETTINGS["WANDB_ENTITY"],
150 | ) -> wandb.Artifact:
151 |     """Get the latest version of a W&B artifact."""
152 | 
153 |     api = wandb.Api()
154 |     artifact = api.artifact(f"{entity}/{project}/{artifact_name}:latest")
155 | 
156 |     return artifact
157 | 
--------------------------------------------------------------------------------
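A minimal usage sketch of how the training-pipeline modules above fit together, assuming the package and its dependencies are installed, the credentials from .env / .env.default are loaded, and a best_config:latest W&B artifact already exists (presumably produced by training_pipeline/best_config.py, whose contents are not shown here):

from training_pipeline import hyperparameter_tuning, train

# 1. Run the W&B sweep defined in training_pipeline/configs/gridsearch.py on the
#    training split loaded from the Hopsworks feature store.
sweep_metadata = hyperparameter_tuning.run(fh=24)

# 2. Promote the best sweep run to a `best_config:latest` artifact
#    (assumed to be handled by training_pipeline/best_config.py).

# 3. Train that configuration on the full training window, compare it with the naive
#    baseline on the last 24 hours, and register it in the Hopsworks model registry.
train_metadata = train.from_best_config(fh=24)
print(train_metadata["model_version"])

Both entry points are also exposed as command-line interfaces through fire.Fire, as shown in their __main__ guards.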