├── .env.default ├── .github └── workflows │ ├── ci_cd_ml_pipeline.yml │ └── ci_cd_web_app.yml ├── .gitignore ├── LICENSE ├── README.md ├── README_CICD.md ├── README_DEPLOY.md ├── airflow ├── .gitignore ├── Dockerfile ├── dags │ ├── .env.default │ ├── __init__.py │ └── ml_pipeline_dag.py ├── docker-compose.yaml ├── poetry.lock └── pyproject.toml ├── app-api ├── .dockerignore ├── .env.default ├── Dockerfile ├── README.md ├── api │ ├── __init__.py │ ├── __main__.py │ ├── application.py │ ├── config.py │ ├── schemas │ │ ├── __init__.py │ │ ├── area_values.py │ │ ├── consumer_type_values.py │ │ ├── health.py │ │ └── predictions.py │ └── views.py ├── poetry.lock ├── pyproject.toml └── run.sh ├── app-frontend ├── .dockerignore ├── .streamlit │ └── config.toml ├── Dockerfile ├── README.md ├── frontend │ ├── __init__.py │ ├── components.py │ ├── main.py │ └── settings.py ├── poetry.lock └── pyproject.toml ├── app-monitoring ├── .dockerignore ├── .streamlit │ └── config.toml ├── Dockerfile ├── README.md ├── monitoring │ ├── __init__.py │ ├── components.py │ ├── main.py │ └── settings.py ├── poetry.lock └── pyproject.toml ├── batch-prediction-pipeline ├── .env.default ├── README.md ├── batch_prediction_pipeline │ ├── __init__.py │ ├── batch.py │ ├── data.py │ ├── monitoring.py │ ├── settings.py │ └── utils.py ├── poetry.lock └── pyproject.toml ├── deploy ├── app-docker-compose.local.yml ├── app-docker-compose.yml └── ml-pipeline.sh ├── feature-pipeline ├── .env.default ├── README.md ├── feature_pipeline │ ├── __init__.py │ ├── clean_feature_store.py │ ├── etl │ │ ├── __init__.py │ │ ├── cleaning.py │ │ ├── extract.py │ │ ├── load.py │ │ └── validation.py │ ├── feature_view.py │ ├── pipeline.py │ ├── settings.py │ └── utils.py ├── poetry.lock └── pyproject.toml ├── images ├── airflow_login_screenshot.png ├── airflow_ml_pipeline_dag_overview_screenshot.png ├── airflow_ml_pipeline_dag_screenshot.png ├── airflow_variables_screenshot.png ├── architecture.png ├── forecasting_demo_screenshot.png ├── gcp_expose_ports_firewall_rule_screenshot.png ├── gcp_gcs_screenshot.png ├── gcp_iap_for_tcp_firewall_rule.png ├── gcp_ssh_screenshot.png ├── gcp_vm_external_ip_screenshot.png ├── github_actions_secrets_screenshot.png ├── github_actions_see_cicd_screenshot.png ├── github_actions_variables_screenshot.png ├── gmail.png ├── linkedin.png ├── medium.png ├── screenshot_introduction_video.png ├── substack.png └── twitter.png ├── scripts └── install_poetry_macos_m1_chip.sh └── training-pipeline ├── .env.default ├── README.md ├── poetry.lock ├── pyproject.toml └── training_pipeline ├── __init__.py ├── best_config.py ├── configs ├── __init__.py └── gridsearch.py ├── data.py ├── hyperparameter_tuning.py ├── models.py ├── settings.py ├── train.py ├── transformers.py └── utils.py /.env.default: -------------------------------------------------------------------------------- 1 | FS_API_KEY = "" 2 | FS_PROJECT_NAME = "" 3 | WANDB_API_KEY = "" 4 | WANDB_ENTITY = "teaching-mlops" 5 | WANDB_PROJECT = "energy_consumption" 6 | GOOGLE_CLOUD_PROJECT = "energy_consumption" 7 | GOOGLE_CLOUD_BUCKET_NAME = "hourly-batch-predictions" 8 | GOOGLE_CLOUD_SERVICE_ACCOUNT_JSON_PATH = "/absolute/path/to/your/service-account.json" 9 | -------------------------------------------------------------------------------- /.github/workflows/ci_cd_ml_pipeline.yml: -------------------------------------------------------------------------------- 1 | name: CD/CD for the ml-pipeline that builds all the pipeline modules and pushes them to the private PyPI 
registry. From where Airflow will install the latest versions and use them in the next run. 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - 'app-api/' 7 | - 'app-frontend/' 8 | - '**/*.yml' 9 | - '**/*.md' 10 | branches: [ "main" ] 11 | 12 | env: 13 | CLOUDSDK_CORE_PROJECT: '${{ vars.CLOUDSDK_CORE_PROJECT }}' 14 | USER: '${{ vars.USER }}' 15 | INSTANCE_NAME: '${{ vars.ML_PIPELINE_INSTANCE_NAME }}' 16 | ZONE: '${{ vars.ZONE }}' 17 | 18 | jobs: 19 | ci_cd: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: 'actions/checkout@v3' 23 | 24 | - id: 'auth' 25 | uses: 'google-github-actions/auth@v0' 26 | with: 27 | credentials_json: '${{ secrets.GCP_CREDENTIALS }}' 28 | - id: 'compute-ssh' 29 | uses: 'google-github-actions/ssh-compute@v0' 30 | with: 31 | project_id: '${{ env.CLOUDSDK_CORE_PROJECT }}' 32 | user: '${{ env.USER }}' 33 | instance_name: '${{ env.INSTANCE_NAME }}' 34 | zone: '${{ env.ZONE }}' 35 | ssh_private_key: '${{ secrets.GCP_SSH_PRIVATE_KEY }}' 36 | command: > 37 | cd ~/energy-forecasting && 38 | git pull && 39 | sh deploy/ml-pipeline.sh 40 | -------------------------------------------------------------------------------- /.github/workflows/ci_cd_web_app.yml: -------------------------------------------------------------------------------- 1 | name: CI/CD for the web app (API + frontend) 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - 'batch-prediction-pipeline/' 7 | - 'feature-pipeline/' 8 | - 'training-pipeline' 9 | - '**/*.yml' 10 | - '**/*.md' 11 | branches: [ "main" ] 12 | 13 | env: 14 | CLOUDSDK_CORE_PROJECT: '${{ vars.CLOUDSDK_CORE_PROJECT }}' 15 | USER: '${{ vars.USER }}' 16 | INSTANCE_NAME: '${{ vars.APP_INSTANCE_NAME }}' 17 | ZONE: '${{ vars.ZONE }}' 18 | 19 | jobs: 20 | ci_cd: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: 'actions/checkout@v3' 24 | 25 | - id: 'auth' 26 | uses: 'google-github-actions/auth@v0' 27 | with: 28 | credentials_json: '${{ secrets.GCP_CREDENTIALS }}' 29 | - id: 'compute-ssh' 30 | uses: 'google-github-actions/ssh-compute@v0' 31 | with: 32 | project_id: '${{ env.CLOUDSDK_CORE_PROJECT }}' 33 | user: '${{ env.USER }}' 34 | instance_name: '${{ env.INSTANCE_NAME }}' 35 | zone: '${{ env.ZONE }}' 36 | ssh_private_key: '${{ secrets.GCP_SSH_PRIVATE_KEY }}' 37 | command: > 38 | cd ~/energy-forecasting && 39 | git pull && 40 | docker compose -f deploy/app-docker-compose.yml --project-directory . up --build -d 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | */.env 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | # IDEs 133 | .idea 134 | .vscode 135 | 136 | # Tools 137 | mage_data/ 138 | wandb/ 139 | energy_consumption/ 140 | 141 | # Data 142 | *.parquet 143 | models/ 144 | output/ 145 | artifacts/ 146 | 147 | # Models 148 | *.pkl 149 | 150 | # Sensitive 151 | credentials/ 152 | .hw_api_key 153 | 154 | # Local notes 155 | deploy_hardcoded_steps.md 156 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Paul Iusztin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README_CICD.md: -------------------------------------------------------------------------------- 1 | # The Full Stack 7-Steps MLOps Framework 2 | 3 | Congratulations, you are close to the whole experience if you reached this far. This is the last step from the 7 lessons of the course. 4 | 5 | **NOTE:** You can finish this lesson only if you deploy your code to GCP. If you haven't. [Check out this section to see how.](https://github.com/iusztinpaul/energy-forecasting/blob/main/README_DEPLOY.md) 6 | 7 | [Access Lesson 7 on Medium for more detailed step-by-step instructions.](https://towardsdatascience.com/seamless-ci-cd-pipelines-with-github-actions-on-gcp-your-tools-for-effective-mlops-96f676f72012). 8 | 9 | # CI/CD 10 | 11 | We will use GitHub Actions to create the CI/CD pipeline. GitHub Actions will let us run various commands on specific triggers, such as a new commit to a branch. 12 | 13 | ## Fork the Repository 14 | 15 | By forking the repository, you will create the exact identical copy of the code on your own GitHub account. Thus, you will have full access to the settings of the repository. 16 | 17 | [Check out this doc to see how to fork a repository on GitHub.](https://docs.github.com/en/get-started/quickstart/fork-a-repo) 18 | 19 | ## Set Actions Variables 20 | 21 | Go to your forked repository. After go to `Settings` -> `Secrets and variables` (in the Security tab) -> `Actions`. 22 | 23 | Now, click `Variables`. You can create a new variable from that section by clicking `New repository variable`. 24 | 25 | See the image below 👇 26 | 27 |

28 | 29 |

30 | 31 | You have to create 5 variables that will be used by the GitHub Actions scripts: 32 | * `APP_INSTANCE_NAME` : the name of the web app VM. | In our case, it is `app`. The default should be ok if you use our recommended naming conventions. 33 | * `CLOUDSDK_CORE_PROJECT` : the ID of your GCP project (this is the variable name the workflow files read). | Here, you have to replace it with your own project ID. 34 | * `ML_PIPELINE_INSTANCE_NAME` : the name of the ML pipeline VM. | In our case, it is `ml-pipeline`. The default should be ok if you use our recommended naming conventions. 35 | * `USER`: the user you used to connect to the VMs while setting up the machine using the SSH connection. | Mine was `pauliusztin`, but you must replace it with yours. 36 | * `ZONE` : the zone where you deployed the VMs. | The default should be ok if you use our recommended naming conventions. 37 | 38 | ## Set Secrets 39 | 40 | In the same `Secrets and variables/Actions` section, hit the `Secrets` tab. 41 | 42 | You can create a new secret by pressing the `New repository secret` button. 43 | 44 | These are similar to the variables we just created, but after you fill in their values, you can't see them anymore. That is why they are called secrets. Here is where you add all your sensitive information: in our case, the GCP credentials and private keys. 45 | 46 | See the image below 👇 47 | 48 |

49 | 50 |

51 | 52 | The `GCP_CREDENTIALS` secret contains the content of the JSON key of your VM admin service account. By setting this up, the CI/CD pipeline will use that service account to authenticate to the VMs. 53 | 54 | Because the key file is in JSON format, you first have to compact it into a single line by running the following commands: 55 | 56 | Install the jq CLI: 57 | ```shell 58 | sudo apt update 59 | sudo apt install -y jq 60 | jq --version 61 | ``` 62 | Format your JSON key file: 63 | ```shell 64 | jq -c . /path/to/your/admin-vm.json 65 | ``` 66 | Take the output of this command and create your `GCP_CREDENTIALS` secret with it. 67 | 68 | The `GCP_SSH_PRIVATE_KEY` is your GCP private SSH key (not your personal one - GCP creates an additional one automatically), which was created on your local computer when you used SSH to connect to the VMs. 69 | 70 | To copy it, run the following: 71 | ```shell 72 | cd ~/.ssh 73 | cat google_compute_engine 74 | ``` 75 | Copy the output from the terminal and create the `GCP_SSH_PRIVATE_KEY` secret. 76 | 77 | 78 | ## Run the CI/CD Pipeline 79 | 80 | Now make any change to the code, push it to the main branch, and the GitHub Actions workflows should trigger automatically. 81 | 82 | To see their results, check your GitHub repository's `Actions` tab. 83 | 84 |

85 | 86 |

87 | 88 | Two actions will be triggered. One will build and deploy the `ml-pipeline` modules to your `ml-pipeline` GCP VM, and one will build and deploy the `web app` to your `app` GCP VM. 89 | 90 | If you want to understand better how we wrote the GitHub Actions scripts under the `.github/workflows` directory [check out the **"CI/CD Pipeline Using GitHub Actions"** section of Lesson 7 on Medium](https://towardsdatascience.com/seamless-ci-cd-pipelines-with-github-actions-on-gcp-your-tools-for-effective-mlops-96f676f72012) that explains everything in detail. 91 | -------------------------------------------------------------------------------- /README_DEPLOY.md: -------------------------------------------------------------------------------- 1 | # The Full Stack 7-Steps MLOps Framework 2 | 3 | ## Deploy to GCP 4 | 5 | This step must only be finished if you want to deploy the code on GCP VMs and build the CI/CD with GitHub Actions. 6 | 7 | Note that this step might result in a few costs on GCP. It won't be much. While developing this course, I spent only ~20$, which will probably be less for you. 8 | 9 | Also, you can get some free credits if you have a new GCP account (I had 300$ free credits). Just be sure to delete the resources after you finish the course. 10 | 11 | [Access Lesson 7 on Medium for more detailed step-by-step instructions.](https://towardsdatascience.com/seamless-ci-cd-pipelines-with-github-actions-on-gcp-your-tools-for-effective-mlops-96f676f72012). 12 | 13 | ------ 14 | 15 | ## General Set Up 16 | 17 | Before setting up the code, you must go to your GCP project and create a few additional resources. After, you can SSH to your machines and deploy your code. 18 | 19 | #### GCP Resources 20 | 21 | ### Admin VM Service Account with IAP Access 22 | 23 | We need a new GCP service account with admin rights & IAP access when working with GCP VMs. You have to create a new service account and assign to the new service account the following roles: 24 | * Compute Instance Admin (v1) 25 | * IAP-secured Tunnel User 26 | * Service Account Token Creator 27 | * Service Account User 28 | 29 | IAP stands for Identity-Aware Proxy. It is a way to create tunnels that route TCP traffic. For your knowledge, you can read more about this topic using the following docs (you don't have to fully understand it to proceed to the next steps): 30 | * [Using IAP for TCP forwarding](https://cloud.google.com/iap/docs/using-tcp-forwarding) 31 | * [Overview of TCP forwarding](https://cloud.google.com/iap/docs/tcp-forwarding-overview) 32 | 33 | ### Expose Ports Firewall Rule 34 | 35 | Create a firewall rule that exposes the following TCP ports: 8501, 8502, 8001. 36 | 37 | Also, add a `target tag` called `energy-forecasting-expose-ports`. 38 | 39 | Here is how my firewall rule looks like: 40 | 41 |

42 | 43 |
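If you prefer the CLI to the console, the same rule can be sketched with `gcloud`. The rule name and the `0.0.0.0/0` source range below are my assumptions (the ports must be publicly reachable for the web app); only the ports and the `energy-forecasting-expose-ports` target tag come from the steps above:
```shell
gcloud compute firewall-rules create energy-forecasting-expose-ports \
    --network=default \
    --direction=INGRESS \
    --action=ALLOW \
    --rules=tcp:8501,tcp:8502,tcp:8001 \
    --target-tags=energy-forecasting-expose-ports \
    --source-ranges=0.0.0.0/0
```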

44 | 45 | Here are 2 docs that helped me create and configure the ports for the firewall rule: 46 | * [Doc 1](https://stackoverflow.com/questions/21065922/how-to-open-a-specific-port-such-as-9090-in-google-compute-engine) 47 | * [Doc 2](https://www.howtogeek.com/devops/how-to-open-firewall-ports-on-a-gcp-compute-engine-instance/) 48 | 49 | 50 | ### IAP for TCP Tunneling Firewall Rule 51 | 52 | Now we will create a firewall rule allowing IAP for TCP Tunneling on all the VMs connected to the `default` network. 53 | 54 | [Docs on how to create the firewall rule.](https://cloud.google.com/iap/docs/using-tcp-forwarding#preparing_your_project_for_tcp_forwarding) 55 | 56 | Here is how my firewall rule looks like: 57 | 58 |

59 | 60 |
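As a rough CLI equivalent of the console flow in the docs linked above, you could create the rule like this. The rule name is my choice, `35.235.240.0/20` is the IAP source range documented by Google, and this sketch only opens SSH (port 22) to that range; double-check the values against the official docs:
```shell
gcloud compute firewall-rules create allow-ingress-from-iap \
    --network=default \
    --direction=INGRESS \
    --action=ALLOW \
    --rules=tcp:22 \
    --source-ranges=35.235.240.0/20
```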

61 | 62 | 63 | ### VM for the Pipeline 64 | 65 | Go to your GCP project -> `VM Instances` -> `Create Instance` 66 | 67 | Choose `e2-standard-2: 2 vCPU cores - 8 GB RAM` as your VM instance type. 68 | 69 | Call it: `ml-pipeline` 70 | 71 | Change the disk to `20 GB Storage` 72 | 73 | Pick region `europe-west3 (Frankfurt)` and zone `europe-west3-c` 74 | 75 | Network: `default` 76 | 77 | Also, check the `HTTP` and `HTTPS` boxes and add the `energy-forecasting-expose-ports` custom firewall rule we did a few steps back. 78 | 79 | Here are 2 docs that helped me create and configure the ports for the firewall rule: 80 | * [Doc 1](https://stackoverflow.com/questions/21065922/how-to-open-a-specific-port-such-as-9090-in-google-compute-engine) 81 | * [Doc 2](https://www.howtogeek.com/devops/how-to-open-firewall-ports-on-a-gcp-compute-engine-instance/) 82 | 83 | 84 | ### VM for the Web App 85 | 86 | Go to your GCP project -> `VM Instances` -> `Create Instance` 87 | 88 | Choose: `e2-micro: 0.25 2 vCPU - 1 GB memory` as your VM instance type. 89 | 90 | Call it: `app` 91 | 92 | Change the disk to: `15 GB standard persisted disk` 93 | 94 | Pick region `europe-west3 (Frankfurt)` and zone `europe-west3-c` 95 | 96 | Network: `default` 97 | 98 | Also, check the `HTTP` and `HTTPS` boxes and add the `energy-forecasting-expose-ports` custom firewall rule we created a few steps back. 99 | 100 | Here are 2 docs that helped me create and configure the ports for the firewall rule: 101 | * [Doc 1](https://stackoverflow.com/questions/21065922/how-to-open-a-specific-port-such-as-9090-in-google-compute-engine) 102 | * [Doc 2](https://www.howtogeek.com/devops/how-to-open-firewall-ports-on-a-gcp-compute-engine-instance/) 103 | 104 | 105 | ### External Static IP 106 | 107 | If we want the external IP for our web app to be static (aka not to change), we have to attach a static address to our web app VM. 108 | 109 | More precisely, we suggest adding it only to the `app` VM we created a few steps ahead. 110 | 111 | That is perfectly fine if you want to also add a static external IP to the `ml-pipeline` VM. 112 | 113 | [Docs on reserving a static external IP address.](https://cloud.google.com/compute/docs/ip-addresses/reserve-static-external-ip-address) 114 | 115 | 116 | ---- 117 | 118 | #### Now that the boring part is finished, let's start deploying the code 👇 👇 👇 119 | 120 | 121 | ## Deploy - General Steps 122 | 123 | ### Configure Your Service Account 124 | 125 | We will use your service account configured with admin rights for VMs and IAP access to SSH from your local machine to the GCP VMs. 126 | 127 | First, we must tell the `gcloud` GCP CLI to use that service account. 128 | 129 | To do so, you have to create a key for your service account and download it as a JSON file (same as you did for the buckets service accounts - [here are some docs to refresh your mind](https://cloud.google.com/iam/docs/keys-create-delete)). 130 | 131 | After you download the file, you just have to run the following `gcloud` command in your terminal: 132 | ```shell 133 | gcloud auth activate-service-account SERVICE_ACCOUNT@DOMAIN.COM --key-file=/path/key.json --project=PROJECT_ID 134 | ``` 135 | 136 | [Check out this doc for more details about the gcloud auth command](https://cloud.google.com/sdk/gcloud/reference/auth/activate-service-account). 137 | 138 | Now whenever you run commands with `gcloud`, it will use this service account to authenticate. 
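To quickly confirm that the service account is the active identity (and that `gcloud` points at the right project) before moving on, you can run these standard `gcloud` commands:
```shell
gcloud auth list
gcloud config get-value project
```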
139 | 140 | 141 | ## Deploy - The Pipeline 142 | 143 | Let's connect through SSH to the `ml-pipeline` GCP VM you created a few steps ahead: 144 | ```shell 145 | gcloud compute ssh ml-pipeline --zone europe-west3-c --quiet --tunnel-through-iap --project 146 | ``` 147 | **NOTE 1:** Change the `zone` if you haven't created a VM within the same zone as us.
148 | **NOTE 2:** Your `project-id` is NOT your `project name`. Go to your GCP projects list and find the project ID. 149 | 150 | From this point on, if you configured the firewalls and the service account correctly, and because everything is Dockerized, all the steps will be 99% similar to the ones from the [Set Up Additional Tools](https://github.com/iusztinpaul/energy-forecasting#tools) and [Usage](https://github.com/iusztinpaul/energy-forecasting#usage) sections. 151 | 152 | You can follow the same steps while your terminal has an SSH connection with the GCP machine. 153 | 154 | Note that the GCP machine is a Linux machine. Thus, you can directly copy & paste the commands from the README regardless of the OS you use on your local machine. 155 | 156 |

157 | 158 |

159 | 160 | Now you must repeat all the steps you followed to set up `The Pipeline` locally, using this SSH connection. 161 | 162 | ### BUT YOU HAVE TO KEEP IN MIND THE FOLLOWING: 163 | 164 | **Clone the code in the home directory of the VM:** 165 | 166 | Just SSH to the VM and run: 167 | ```shell 168 | git clone https://github.com/iusztinpaul/energy-forecasting.git 169 | cd energy-forecasting 170 | ``` 171 | 172 | **Install Docker using the following commands:**<br/>

173 | Install Docker: 174 | ```shell 175 | sudo apt update 176 | sudo apt install --yes apt-transport-https ca-certificates curl gnupg2 software-properties-common 177 | curl -fsSL https://download.docker.com/linux/debian/gpg | sudo apt-key add - 178 | sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable" 179 | sudo apt update 180 | sudo apt install --yes docker-ce 181 | ``` 182 | Add sudo access to Docker: 183 | ```shell 184 | sudo usermod -aG docker $USER 185 | logout 186 | ``` 187 | Login again to your machine: 188 | ```shell 189 | gcloud compute ssh ml-pipeline --zone europe-west3-c --quiet --tunnel-through-iap --project 190 | ``` 191 | 192 | [Check out these docs for the full instructions.](https://tomroth.com.au/gcp-docker/) 193 | 194 | **Replace all `cp` commands with `gcloud compute scp`:**

195 | 196 | This command will help you to copy files from your local machine to the VM. 197 | 198 | For example, instead of running: 199 | ```shell 200 | cp -r /path/to/admin/gcs/credentials/admin-buckets.json credentials/gcp/energy_consumption 201 | ``` 202 | Run in a different terminal (not the one connected with SSH to your VM): 203 | ```shell 204 | gcloud compute scp --recurse --zone europe-west3-c --quiet --tunnel-through-iap --project /local/path/to/admin-buckets.json ml-pipeline:~/energy-forecasting/airflow/dags/credentials/gcp/energy_consumption/ 205 | ``` 206 | This command will copy your local `admin-buckets.json` file to the `ml-pipeline` VM. 207 | 208 | 209 | **!!!** This is all you need to know. All the other steps are the same as the ones run locally. Only Docker has a slightly different installation, and you need a different way to copy files from your local machine to the VM. 210 | 211 | 212 | Now to access the Airflow application, go to your VM view from GCP and go to the `Network tags` section. You will find the `External IP address` column, as shown in the image below. Copy that IP and attach port `8080` to it. 213 | 214 | For example, based on the `External IP address` from the image below, I accessed Airflow using this address: `35.207.134.188:8080`. 215 | 216 | Congrats! You connected to your own self-hosted Airflow application. 217 | 218 | If it doesn't connect, give it a few seconds to load properly. 219 | 220 |

221 | 222 |
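A quick way to check the webserver from any terminal is to hit its `/health` endpoint, which is the same endpoint the `docker-compose.yaml` healthcheck uses (replace the address with your VM's external IP):
```shell
curl http://<EXTERNAL_IP>:8080/health
```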

223 | 224 | 225 | ## Deploy - The Web App 226 | Let's connect through SSH to the `app` GCP VM you created a few steps ahead: 227 | ```shell 228 | gcloud compute ssh app --zone europe-west3-c --quiet --tunnel-through-iap --project 229 | ``` 230 | **NOTE 1:** Change the `zone` if you haven't created a VM within the same zone as us.
231 | **NOTE 2:** Your `project-id` is NOT your `project name`. Go to your GCP projects list and find the project id. 232 | 233 | Here the process is similar to deploying the ML Pipeline. 234 | 235 | You can deploy the web app following the exact same steps described in [Lesson 6](https://towardsdatascience.com/fastapi-and-streamlit-the-python-duo-you-must-know-about-72825def1243) or in the [Set Up Additional Tools](https://github.com/iusztinpaul/energy-forecasting#tools) & [Usage](https://github.com/iusztinpaul/energy-forecasting#usage) sections of the GitHub repository. 236 | 237 | But don't forget to keep in mind the same edge cases described in the [Deploy - The Pipeline](https://github.com/iusztinpaul/energy-forecasting/blob/main/README_DEPLOY.md#deploy---the-pipeline) section. 238 | 239 | ---- 240 | 241 | 👀 **As you can see, here you have done everything manually. If you want to know how to create a simple CI/CD using GitHub Actions [check out this section](https://github.com/iusztinpaul/energy-forecasting/blob/main/README_CICD.md).** 242 | -------------------------------------------------------------------------------- /airflow/.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | .env 3 | -------------------------------------------------------------------------------- /airflow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:2.7.0 2 | 3 | ARG CURRENT_USER=$USER 4 | 5 | USER root 6 | # Install Python dependencies to be able to process the wheels from the private PyPI server. 7 | RUN apt-get -y update && ACCEPT_EULA=Y apt-get -y upgrade 8 | RUN apt-get install -y python3.9-distutils python3.9-dev build-essential 9 | USER ${CURRENT_USER} 10 | -------------------------------------------------------------------------------- /airflow/dags/.env.default: -------------------------------------------------------------------------------- 1 | FS_API_KEY = "" 2 | FS_PROJECT_NAME = "" 3 | WANDB_API_KEY = "" 4 | WANDB_ENTITY = "teaching-mlops" 5 | WANDB_PROJECT = "energy_consumption" 6 | GOOGLE_CLOUD_PROJECT = "energy_consumption" 7 | GOOGLE_CLOUD_BUCKET_NAME = "hourly-batch-predictions" 8 | GOOGLE_CLOUD_SERVICE_ACCOUNT_JSON_PATH = "/opt/airflow/dags/credentials/gcp/energy_consumption/admin-buckets.json" 9 | -------------------------------------------------------------------------------- /airflow/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/airflow/dags/__init__.py -------------------------------------------------------------------------------- /airflow/dags/ml_pipeline_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow.decorators import dag, task 4 | from airflow.models import Variable 5 | from airflow.operators.empty import EmptyOperator 6 | from airflow.utils.trigger_rule import TriggerRule 7 | from airflow.utils.edgemodifier import Label 8 | 9 | 10 | @dag( 11 | dag_id="ml_pipeline", 12 | schedule="@hourly", 13 | start_date=datetime(2023, 4, 14), 14 | catchup=False, 15 | tags=["feature-engineering", "model-training", "batch-prediction"], 16 | max_active_runs=1, 17 | ) 18 | def ml_pipeline(): 19 | @task.virtualenv( 20 | task_id="run_feature_pipeline", 21 | requirements=[ 22 | "--trusted-host 172.17.0.1", 23 | "--extra-index-url 
http://172.17.0.1", 24 | "feature_pipeline", 25 | ], 26 | python_version="3.9", 27 | multiple_outputs=True, 28 | system_site_packages=True, 29 | ) 30 | def run_feature_pipeline( 31 | export_end_reference_datetime: str, 32 | days_delay: int, 33 | days_export: int, 34 | url: str, 35 | feature_group_version: int, 36 | ) -> dict: 37 | """ 38 | Run the feature pipeline. 39 | 40 | Args: 41 | export_end_reference_datetime: The end reference datetime of the export window. If None, the current time is used. 42 | Because the data is always delayed with "days_delay" days, this date is used only as a reference point. 43 | The real extracted window will be computed as [export_end_reference_datetime - days_delay - days_export, export_end_reference_datetime - days_delay]. 44 | 45 | days_delay : int 46 | Data has a delay of N days. Thus, we have to shift our window with N days. 47 | 48 | days_export : int 49 | The number of days to export. 50 | 51 | url : str 52 | URL to the raw data. 53 | 54 | feature_group_version : int 55 | Version of the feature store feature group to use. 56 | 57 | Returns: 58 | Metadata of the feature pipeline run. 59 | """ 60 | 61 | from datetime import datetime 62 | 63 | from feature_pipeline import utils, pipeline 64 | 65 | logger = utils.get_logger(__name__) 66 | 67 | try: 68 | export_end_reference_datetime = datetime.strptime( 69 | export_end_reference_datetime, "%Y-%m-%d %H:%M:%S.%f%z" 70 | ) 71 | except ValueError: 72 | export_end_reference_datetime = datetime.strptime( 73 | export_end_reference_datetime, "%Y-%m-%d %H:%M:%S%z" 74 | ) 75 | export_end_reference_datetime = export_end_reference_datetime.replace( 76 | microsecond=0, tzinfo=None 77 | ) 78 | 79 | logger.info(f"export_end_datetime = {export_end_reference_datetime}") 80 | logger.info(f"days_delay = {days_delay}") 81 | logger.info(f"days_export = {days_export}") 82 | logger.info(f"url = {url}") 83 | logger.info(f"feature_group_version = {feature_group_version}") 84 | 85 | return pipeline.run( 86 | export_end_reference_datetime=export_end_reference_datetime, 87 | days_delay=days_delay, 88 | days_export=days_export, 89 | url=url, 90 | feature_group_version=feature_group_version, 91 | ) 92 | 93 | @task.virtualenv( 94 | task_id="create_feature_view", 95 | requirements=[ 96 | "--trusted-host 172.17.0.1", 97 | "--extra-index-url http://172.17.0.1", 98 | "feature_pipeline", 99 | ], 100 | python_version="3.9", 101 | multiple_outputs=True, 102 | system_site_packages=False, 103 | ) 104 | def create_feature_view(feature_pipeline_metadata: dict) -> dict: 105 | """ 106 | This function creates a feature view based on the feature pipeline computations. The feature view 107 | is created using the feature group version from the feature pipeline metadata. 108 | """ 109 | 110 | from feature_pipeline import feature_view 111 | 112 | return feature_view.create( 113 | feature_group_version=feature_pipeline_metadata["feature_group_version"] 114 | ) 115 | 116 | @task.virtualenv( 117 | task_id="run_hyperparameter_tuning", 118 | requirements=[ 119 | "--trusted-host 172.17.0.1", 120 | "--extra-index-url http://172.17.0.1", 121 | "training_pipeline", 122 | ], 123 | python_version="3.9", 124 | multiple_outputs=True, 125 | system_site_packages=False, 126 | ) 127 | def run_hyperparameter_tuning(feature_view_metadata: dict) -> dict: 128 | """ 129 | This function runs hyperparameter tuning for the training pipeline. 
130 | The feature store feature view version and training dataset version are passed 131 | based on the results from the create_feature_view task. 132 | """ 133 | 134 | from training_pipeline import hyperparameter_tuning 135 | 136 | return hyperparameter_tuning.run( 137 | feature_view_version=feature_view_metadata["feature_view_version"], 138 | training_dataset_version=feature_view_metadata["training_dataset_version"], 139 | ) 140 | 141 | @task.virtualenv( 142 | task_id="upload_best_config", 143 | requirements=[ 144 | "--trusted-host 172.17.0.1", 145 | "--extra-index-url http://172.17.0.1", 146 | "training_pipeline", 147 | ], 148 | python_version="3.9", 149 | multiple_outputs=False, 150 | system_site_packages=False, 151 | ) 152 | def upload_best_config(last_sweep_metadata: dict): 153 | """ 154 | Upload the best config to W&B ML platform found in the hyperparameter tuning step 155 | based on the given sweep id. 156 | """ 157 | 158 | from training_pipeline import best_config 159 | 160 | best_config.upload(sweep_id=last_sweep_metadata["sweep_id"]) 161 | 162 | @task.virtualenv( 163 | task_id="train_from_best_config", 164 | requirements=[ 165 | "--trusted-host 172.17.0.1", 166 | "--extra-index-url http://172.17.0.1", 167 | "training_pipeline", 168 | ], 169 | python_version="3.9", 170 | multiple_outputs=True, 171 | system_site_packages=False, 172 | trigger_rule=TriggerRule.ALL_DONE, 173 | ) 174 | def train_from_best_config(feature_view_metadata: dict) -> dict: 175 | """Trains model from the best config found in hyperparameter tuning. 176 | 177 | Args: 178 | feature_view_metadata (dict): Contains feature store feature view and training dataset version. 179 | 180 | Returns: 181 | metadata from the training run 182 | """ 183 | 184 | from training_pipeline import utils, train 185 | 186 | has_best_config = utils.check_if_artifact_exists("best_config") 187 | if has_best_config is False: 188 | raise RuntimeError( 189 | "No best config found. Please run hyperparameter tuning first." 190 | ) 191 | 192 | return train.from_best_config( 193 | feature_view_version=feature_view_metadata["feature_view_version"], 194 | training_dataset_version=feature_view_metadata["training_dataset_version"], 195 | ) 196 | 197 | @task.virtualenv( 198 | task_id="compute_monitoring", 199 | requirements=[ 200 | "--trusted-host 172.17.0.1", 201 | "--extra-index-url http://172.17.0.1", 202 | "batch_prediction_pipeline", 203 | ], 204 | python_version="3.9", 205 | system_site_packages=False, 206 | ) 207 | def compute_monitoring(feature_view_metadata: dict): 208 | """Compute monitoring metrics for newly obbserved data. 209 | 210 | Args: 211 | feature_view_metadata: metadata containing the version of the feature store feature view version. 
212 | """ 213 | 214 | from batch_prediction_pipeline import monitoring 215 | 216 | monitoring.compute( 217 | feature_view_version=feature_view_metadata["feature_view_version"], 218 | ) 219 | 220 | @task.virtualenv( 221 | task_id="batch_predict", 222 | requirements=[ 223 | "--trusted-host 172.17.0.1", 224 | "--extra-index-url http://172.17.0.1", 225 | "batch_prediction_pipeline", 226 | ], 227 | python_version="3.9", 228 | system_site_packages=False, 229 | ) 230 | def batch_predict( 231 | feature_view_metadata: dict, 232 | train_metadata: dict, 233 | feature_pipeline_metadata: dict, 234 | fh: int = 24, 235 | ): 236 | """ 237 | This is the function that runs the batch prediction pipeline 238 | 239 | Args: 240 | feature_view_metadata (dict): the metadata from the create feature view task 241 | train_metadata (dict): the metadata from the training pipeline task 242 | feature_pipeline_metadata (dict): the metadata from the feature pipeline task 243 | fh (int, optional): forecast horizon. Defaults to 24. 244 | """ 245 | 246 | from datetime import datetime 247 | from batch_prediction_pipeline import batch 248 | 249 | start_datetime = datetime.strptime( 250 | feature_pipeline_metadata["export_datetime_utc_start"], 251 | feature_pipeline_metadata["datetime_format"], 252 | ) 253 | end_datetime = datetime.strptime( 254 | feature_pipeline_metadata["export_datetime_utc_end"], 255 | feature_pipeline_metadata["datetime_format"], 256 | ) 257 | 258 | batch.predict( 259 | fh=fh, 260 | feature_view_version=feature_view_metadata["feature_view_version"], 261 | model_version=train_metadata["model_version"], 262 | start_datetime=start_datetime, 263 | end_datetime=end_datetime, 264 | ) 265 | 266 | @task.branch(task_id="if_run_hyperparameter_tuning_branching") 267 | def if_run_hyperparameter_tuning_branching(run_hyperparameter_tuning: bool) -> bool: 268 | """Task used to branch between hyperparameter tuning and skipping it.""" 269 | if run_hyperparameter_tuning is True: 270 | return ["branch_run_hyperparameter_tuning"] 271 | else: 272 | return ["branch_skip_hyperparameter_tuning"] 273 | 274 | # Define empty operators used for branching between hyperparameter tuning and skipping it. 275 | branch_run_hyperparameter_tuning_operator = EmptyOperator( 276 | task_id="branch_run_hyperparameter_tuning" 277 | ) 278 | branch_skip_hyperparameter_tuning_operator = EmptyOperator( 279 | task_id="branch_skip_hyperparameter_tuning" 280 | ) 281 | 282 | # Define Airflow variables. 
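    # Note: these values can be overridden from the Airflow UI (Admin -> Variables) or with the
    # `airflow variables set <key> <value>` CLI; the `default_var` fallbacks below are used otherwise.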
283 | days_delay = int(Variable.get("ml_pipeline_days_delay", default_var=15)) 284 | days_export = int(Variable.get("ml_pipeline_days_export", default_var=30)) 285 | url = Variable.get( 286 | "ml_pipeline_url", 287 | default_var="https://drive.google.com/uc?export=download&id=1y48YeDymLurOTUO-GeFOUXVNc9MCApG5", 288 | ) 289 | feature_group_version = int( 290 | Variable.get("ml_pipeline_feature_group_version", default_var=1) 291 | ) 292 | should_run_hyperparameter_tuning = ( 293 | Variable.get( 294 | "ml_pipeline_should_run_hyperparameter_tuning", default_var="False" 295 | ) 296 | == "True" 297 | ) 298 | 299 | # Feature pipeline 300 | feature_pipeline_metadata = run_feature_pipeline( 301 | export_end_reference_datetime="{{ dag_run.logical_date }}", 302 | days_delay=days_delay, 303 | days_export=days_export, 304 | url=url, 305 | feature_group_version=feature_group_version, 306 | ) 307 | feature_view_metadata = create_feature_view(feature_pipeline_metadata) 308 | 309 | # Training pipeline 310 | if_run_hyperparameter_tuning_branch = if_run_hyperparameter_tuning_branching( 311 | should_run_hyperparameter_tuning 312 | ) 313 | last_sweep_metadata = run_hyperparameter_tuning(feature_view_metadata) 314 | upload_best_model_step = upload_best_config(last_sweep_metadata) 315 | train_metadata = train_from_best_config(feature_view_metadata) 316 | 317 | # Batch prediction pipeline 318 | compute_monitoring_step = compute_monitoring(feature_view_metadata) 319 | batch_predict_step = batch_predict( 320 | feature_view_metadata, train_metadata, feature_pipeline_metadata 321 | ) 322 | 323 | # Define DAG structure. 324 | ( 325 | feature_view_metadata 326 | >> if_run_hyperparameter_tuning_branch 327 | >> [ 328 | if_run_hyperparameter_tuning_branch 329 | >> Label("Run HPO") 330 | >> branch_run_hyperparameter_tuning_operator 331 | >> last_sweep_metadata 332 | >> upload_best_model_step, 333 | if_run_hyperparameter_tuning_branch 334 | >> Label("Skip HPO") 335 | >> branch_skip_hyperparameter_tuning_operator, 336 | ] 337 | >> train_metadata 338 | >> compute_monitoring_step 339 | >> batch_predict_step 340 | ) 341 | 342 | 343 | ml_pipeline() 344 | -------------------------------------------------------------------------------- /airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 
22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.7.0 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed. 31 | # Default: . 32 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 33 | # 34 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 35 | # Default: airflow 36 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 37 | # Default: airflow 38 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 39 | # Use this option ONLY for quick checks. Installing requirements at container 40 | # startup is done EVERY TIME the service is started. 41 | # A better way is to build a custom image or extend the official image 42 | # as described in https://airflow.apache.org/docs/docker-stack/build.html. 43 | # Default: '' 44 | # 45 | # Feel free to modify this file to suit your needs. 46 | --- 47 | version: '3.8' 48 | x-airflow-common: 49 | &airflow-common 50 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 51 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 52 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 53 | # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.7.0} 54 | build: . 55 | environment: 56 | &airflow-common-env 57 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 58 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 59 | # For backward compatibility, with Airflow <2.3 60 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 61 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 62 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 63 | AIRFLOW__CORE__FERNET_KEY: '' 64 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 65 | AIRFLOW__CORE__LOAD_EXAMPLES: 'true' 66 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' 67 | # yamllint disable rule:line-length 68 | # Use simple http server on scheduler for health checks 69 | # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server 70 | # yamllint enable rule:line-length 71 | AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' 72 | # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks 73 | # for other purpose (development, test and especially production usage) build/extend Airflow image. 
74 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 75 | 76 | # ML pipeline Custom Environment Variables 77 | ML_PIPELINE_ROOT_DIR: ${ML_PIPELINE_ROOT_DIR:-/opt/airflow/dags} 78 | volumes: 79 | - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags 80 | - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs 81 | - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config 82 | - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins 83 | user: "${AIRFLOW_UID:-50000}:0" 84 | depends_on: 85 | &airflow-common-depends-on 86 | redis: 87 | condition: service_healthy 88 | postgres: 89 | condition: service_healthy 90 | 91 | services: 92 | postgres: 93 | image: postgres:13 94 | platform: linux/amd64 95 | environment: 96 | POSTGRES_USER: airflow 97 | POSTGRES_PASSWORD: airflow 98 | POSTGRES_DB: airflow 99 | volumes: 100 | - postgres-db-volume:/var/lib/postgresql/data 101 | healthcheck: 102 | test: ["CMD", "pg_isready", "-U", "airflow"] 103 | interval: 10s 104 | retries: 5 105 | start_period: 5s 106 | restart: always 107 | 108 | redis: 109 | image: redis:latest 110 | expose: 111 | - 6379 112 | healthcheck: 113 | test: ["CMD", "redis-cli", "ping"] 114 | interval: 10s 115 | timeout: 30s 116 | retries: 50 117 | start_period: 30s 118 | restart: always 119 | 120 | airflow-webserver: 121 | <<: *airflow-common 122 | command: webserver 123 | ports: 124 | - "8080:8080" 125 | healthcheck: 126 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 127 | interval: 30s 128 | timeout: 10s 129 | retries: 5 130 | start_period: 30s 131 | restart: always 132 | depends_on: 133 | <<: *airflow-common-depends-on 134 | airflow-init: 135 | condition: service_completed_successfully 136 | 137 | airflow-scheduler: 138 | <<: *airflow-common 139 | command: scheduler 140 | healthcheck: 141 | test: ["CMD", "curl", "--fail", "http://localhost:8974/health"] 142 | interval: 30s 143 | timeout: 10s 144 | retries: 5 145 | start_period: 30s 146 | restart: always 147 | depends_on: 148 | <<: *airflow-common-depends-on 149 | airflow-init: 150 | condition: service_completed_successfully 151 | 152 | airflow-worker: 153 | <<: *airflow-common 154 | command: celery worker 155 | healthcheck: 156 | # yamllint disable rule:line-length 157 | test: 158 | - "CMD-SHELL" 159 | - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 160 | interval: 30s 161 | timeout: 10s 162 | retries: 5 163 | start_period: 30s 164 | environment: 165 | <<: *airflow-common-env 166 | # Required to handle warm shutdown of the celery workers properly 167 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 168 | DUMB_INIT_SETSID: "0" 169 | restart: always 170 | depends_on: 171 | <<: *airflow-common-depends-on 172 | airflow-init: 173 | condition: service_completed_successfully 174 | 175 | airflow-triggerer: 176 | <<: *airflow-common 177 | command: triggerer 178 | healthcheck: 179 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 180 | interval: 30s 181 | timeout: 10s 182 | retries: 5 183 | start_period: 30s 184 | restart: always 185 | depends_on: 186 | <<: *airflow-common-depends-on 187 | airflow-init: 188 | condition: service_completed_successfully 189 | 190 | airflow-init: 191 | <<: *airflow-common 192 | entrypoint: /bin/bash 193 | # yamllint disable rule:line-length 194 | command: 195 | - -c 196 | - | 197 | function ver() { 198 | printf 
"%04d%04d%04d%04d" $${1//./ } 199 | } 200 | airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version) 201 | airflow_version_comparable=$$(ver $${airflow_version}) 202 | min_airflow_version=2.2.0 203 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 204 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 205 | echo 206 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 207 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 208 | echo 209 | exit 1 210 | fi 211 | if [[ -z "${AIRFLOW_UID}" ]]; then 212 | echo 213 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 214 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 215 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 216 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 217 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" 218 | echo 219 | fi 220 | one_meg=1048576 221 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 222 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 223 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 224 | warning_resources="false" 225 | if (( mem_available < 4000 )) ; then 226 | echo 227 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 228 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 229 | echo 230 | warning_resources="true" 231 | fi 232 | if (( cpus_available < 2 )); then 233 | echo 234 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 235 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 236 | echo 237 | warning_resources="true" 238 | fi 239 | if (( disk_available < one_meg * 10 )); then 240 | echo 241 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 242 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 243 | echo 244 | warning_resources="true" 245 | fi 246 | if [[ $${warning_resources} == "true" ]]; then 247 | echo 248 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 249 | echo "Please follow the instructions to increase amount of resources available:" 250 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" 251 | echo 252 | fi 253 | mkdir -p /sources/logs /sources/dags /sources/plugins 254 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 255 | exec /entrypoint airflow version 256 | # yamllint enable rule:line-length 257 | environment: 258 | <<: *airflow-common-env 259 | _AIRFLOW_DB_MIGRATE: 'true' 260 | _AIRFLOW_WWW_USER_CREATE: 'true' 261 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 262 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 263 | _PIP_ADDITIONAL_REQUIREMENTS: '' 264 | user: "0:0" 265 | volumes: 266 | - ${AIRFLOW_PROJ_DIR:-.}:/sources 267 | 268 | airflow-cli: 269 | <<: *airflow-common 270 | profiles: 271 | - debug 272 | environment: 273 | <<: *airflow-common-env 274 | CONNECTION_CHECK_MAX_COUNT: "0" 275 | # Workaround for entrypoint issue. 
See: https://github.com/apache/airflow/issues/16252 276 | command: 277 | - bash 278 | - -c 279 | - airflow 280 | 281 | # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up 282 | # or by explicitly targeted on the command line e.g. docker-compose up flower. 283 | # See: https://docs.docker.com/compose/profiles/ 284 | flower: 285 | <<: *airflow-common 286 | command: celery flower 287 | profiles: 288 | - flower 289 | ports: 290 | - "5555:5555" 291 | healthcheck: 292 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 293 | interval: 30s 294 | timeout: 10s 295 | retries: 5 296 | start_period: 30s 297 | restart: always 298 | depends_on: 299 | <<: *airflow-common-depends-on 300 | airflow-init: 301 | condition: service_completed_successfully 302 | 303 | my-private-pypi: 304 | image: pypiserver/pypiserver:v1.5.2 305 | platform: linux/amd64 306 | restart: always 307 | ports: 308 | - "80:8080" 309 | volumes: 310 | - ~/.htpasswd:/data/.htpasswd 311 | command: 312 | - run 313 | - -P 314 | - .htpasswd/htpasswd.txt 315 | - --overwrite 316 | 317 | volumes: 318 | postgres-db-volume: 319 | -------------------------------------------------------------------------------- /airflow/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "energy-forecasting" 3 | version = "0.1.0" 4 | description = "" 5 | authors = [ 6 | "Paul Iusztin ", 7 | "Kurtis Pykes " 8 | ] 9 | 10 | [tool.poetry.dependencies] 11 | python = "~3.9" 12 | pyarrow = "^11.0.0" 13 | tqdm = "^4.64.1" 14 | category-encoders = "^2.6.0" 15 | wandb = "^0.13.10" 16 | matplotlib = "^3.7.0" 17 | hopsworks = "^3.0.5" 18 | python-dotenv = "^0.21.1" 19 | lightgbm = "^3.3.5" 20 | sktime = "^0.16.1" 21 | seaborn = "^0.12.2" 22 | google-cloud-storage = "^2.7.0" 23 | yarl = "^1.8.2" 24 | fire = "^0.5.0" 25 | Jinja2 = "3.0.1" 26 | 27 | [tool.poetry.dev-dependencies] 28 | 29 | [tool.poetry.group.dev.dependencies] 30 | black = "^23.1.0" 31 | 32 | [build-system] 33 | requires = ["poetry-core>=1.0.0"] 34 | build-backend = "poetry.core.masonry.api" 35 | -------------------------------------------------------------------------------- /app-api/.dockerignore: -------------------------------------------------------------------------------- 1 | jupyter_notebooks* 2 | */env* 3 | */venv* 4 | venv 5 | env 6 | .circleci* 7 | *.env 8 | *.log 9 | .git 10 | .gitignore 11 | .tox -------------------------------------------------------------------------------- /app-api/.env.default: -------------------------------------------------------------------------------- 1 | APP_API_GCP_PROJECT = "energy_consumption" 2 | APP_API_GCP_BUCKET = "hourly-batch-predictions" 3 | APP_API_GCP_SERVICE_ACCOUNT_JSON_PATH = "/app/src/credentials/gcp/energy_consumption/read-buckets.json" 4 | -------------------------------------------------------------------------------- /app-api/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.4 2 | 3 | WORKDIR /app/src 4 | 5 | RUN apt-get update && apt-get upgrade -y 6 | RUN pip install --no-cache -U pip 7 | RUN pip install --no-cache poetry==1.4.2 8 | 9 | # Configuring poetry. 10 | RUN poetry config virtualenvs.create false 11 | 12 | # First copy & install requirements to speed up the build process in case only the code changes. 
13 | COPY ./app-api/pyproject.toml /app/src/ 14 | COPY ./app-api/poetry.lock /app/src/ 15 | 16 | RUN poetry install --no-interaction --no-root -vvv 17 | 18 | # Copy the rest of the files. 19 | ADD ./app-api /app/src 20 | 21 | # Give access to run the run.sh script. 22 | RUN chmod +x run.sh 23 | 24 | CMD ["bash", "./run.sh"] 25 | -------------------------------------------------------------------------------- /app-api/README.md: -------------------------------------------------------------------------------- 1 | # API - Web APP 2 | 3 | Check out [Lesson 6](https://towardsdatascience.com/fastapi-and-streamlit-the-python-duo-you-must-know-about-72825def1243) on Medium to better understand how we built the FastAPI backend. 4 | 5 | ## Install for Development 6 | 7 | Create virtual environment: 8 | ```shell 9 | cd app-api 10 | poetry shell 11 | poetry install 12 | ``` 13 | 14 | Check the [Set Up Additional Tools](https://github.com/iusztinpaul/energy-forecasting#-set-up-additional-tools-) and [Usage](https://github.com/iusztinpaul/energy-forecasting#usage) sections to see **how to set up** the **additional tools** and **credentials** you need to run this project. 15 | 16 | ## Usage for Development 17 | 18 | To start the server, run the following: 19 | ```shell 20 | sh run.sh 21 | ``` 22 | 23 | Access http://127.0.0.1:8001/api/v1/docs to see the docs. 24 | 25 | **NOTE:** Be careful to complete the `.env` file as explained in the main README. 26 | -------------------------------------------------------------------------------- /app-api/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/app-api/api/__init__.py -------------------------------------------------------------------------------- /app-api/api/__main__.py: -------------------------------------------------------------------------------- 1 | import uvicorn 2 | 3 | from api.config import get_settings 4 | 5 | 6 | def main() -> None: 7 | """Entrypoint of the application.""" 8 | uvicorn.run( 9 | "api.application:get_app", 10 | workers=get_settings().WORKERS_COUNT, 11 | host=get_settings().HOST, 12 | port=get_settings().PORT, 13 | reload=get_settings().RELOAD, 14 | log_level=get_settings().LOG_LEVEL.value.lower(), 15 | factory=True, 16 | ) 17 | 18 | 19 | if __name__ == "__main__": 20 | main() 21 | -------------------------------------------------------------------------------- /app-api/api/application.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import uvicorn 4 | from fastapi import APIRouter, FastAPI 5 | from fastapi.middleware.cors import CORSMiddleware 6 | 7 | from api.views import api_router 8 | from api.config import get_settings 9 | 10 | 11 | def get_app() -> FastAPI: 12 | """Create FastAPI app.""" 13 | 14 | app = FastAPI( 15 | title=get_settings().PROJECT_NAME, 16 | docs_url=f"/api/{get_settings().VERSION}/docs", 17 | redoc_url=f"/api/{get_settings().VERSION}/redoc", 18 | openapi_url=f"/api/{get_settings().VERSION}/openapi.json", 19 | ) 20 | # For demo purposes, allow all origins. 
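    # In a real deployment you would restrict allow_origins to the frontend's domain instead of "*".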
21 | app.add_middleware( 22 | CORSMiddleware, 23 | allow_origins=["*"], 24 | allow_credentials=True, 25 | allow_methods=["*"], 26 | allow_headers=["*"], 27 | ) 28 | 29 | app.include_router(api_router, prefix=f"/api/{get_settings().VERSION}") 30 | 31 | return app 32 | -------------------------------------------------------------------------------- /app-api/api/config.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from functools import lru_cache 3 | import logging 4 | import sys 5 | from types import FrameType 6 | from typing import List, Optional, cast 7 | 8 | from pydantic import AnyHttpUrl, BaseSettings 9 | 10 | 11 | class LogLevel(str, enum.Enum): # noqa: WPS600 12 | """Possible log levels.""" 13 | 14 | NOTSET = "NOTSET" 15 | DEBUG = "DEBUG" 16 | INFO = "INFO" 17 | WARNING = "WARNING" 18 | ERROR = "ERROR" 19 | FATAL = "FATAL" 20 | 21 | 22 | class Settings(BaseSettings): 23 | """ 24 | Application settings. 25 | 26 | These parameters can be configured 27 | with environment variables. 28 | """ 29 | 30 | # General configurations. 31 | HOST: str = "0.0.0.0" 32 | PORT: int = 8001 33 | LOG_LEVEL: LogLevel = LogLevel.INFO 34 | # - Current version of the API. 35 | VERSION: str = "v1" 36 | # - Quantity of workers for uvicorn. 37 | WORKERS_COUNT: int = 1 38 | # - Enable uvicorn reloading. 39 | RELOAD: bool = False 40 | 41 | PROJECT_NAME: str = "Energy Consumption API" 42 | 43 | # Google Cloud Platform credentials 44 | GCP_PROJECT: Optional[str] = None 45 | GCP_BUCKET: Optional[str] = None 46 | GCP_SERVICE_ACCOUNT_JSON_PATH: Optional[str] = None 47 | 48 | class Config: 49 | env_file = ".env" 50 | env_prefix = "APP_API_" 51 | case_sensitive = False 52 | env_file_encoding = "utf-8" 53 | 54 | 55 | @lru_cache() 56 | def get_settings(): 57 | return Settings() 58 | -------------------------------------------------------------------------------- /app-api/api/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | from api.schemas.health import Health 2 | from api.schemas.predictions import ( 3 | PredictionResults, 4 | MonitoringMetrics, 5 | MonitoringValues, 6 | ) 7 | from api.schemas.consumer_type_values import UniqueConsumerType 8 | from api.schemas.area_values import UniqueArea 9 | -------------------------------------------------------------------------------- /app-api/api/schemas/area_values.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class UniqueArea(BaseModel): 7 | values: List[int] 8 | -------------------------------------------------------------------------------- /app-api/api/schemas/consumer_type_values.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class UniqueConsumerType(BaseModel): 7 | values: List[int] 8 | -------------------------------------------------------------------------------- /app-api/api/schemas/health.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class Health(BaseModel): 5 | name: str 6 | api_version: str 7 | -------------------------------------------------------------------------------- /app-api/api/schemas/predictions.py: -------------------------------------------------------------------------------- 1 | from typing import List, Any 2 | 3 | 
from pydantic import BaseModel 4 | 5 | 6 | class PredictionResults(BaseModel): 7 | datetime_utc: List[int] 8 | energy_consumption: List[float] 9 | preds_datetime_utc: List[int] 10 | preds_energy_consumption: List[float] 11 | 12 | 13 | class MonitoringMetrics(BaseModel): 14 | datetime_utc: List[int] 15 | mape: List[float] 16 | 17 | 18 | class MonitoringValues(BaseModel): 19 | y_monitoring_datetime_utc: List[int] 20 | y_monitoring_energy_consumption: List[float] 21 | predictions_monitoring_datetime_utc: List[int] 22 | predictions_monitoring_energy_consumptionc: List[float] 23 | -------------------------------------------------------------------------------- /app-api/api/views.py: -------------------------------------------------------------------------------- 1 | import gcsfs 2 | from typing import Any, List 3 | 4 | import pandas as pd 5 | from fastapi import APIRouter, HTTPException 6 | 7 | from api import schemas 8 | from api.config import get_settings 9 | 10 | 11 | fs = gcsfs.GCSFileSystem( 12 | project=get_settings().GCP_PROJECT, 13 | token=get_settings().GCP_SERVICE_ACCOUNT_JSON_PATH, 14 | ) 15 | 16 | api_router = APIRouter() 17 | 18 | 19 | @api_router.get("/health", response_model=schemas.Health, status_code=200) 20 | def health() -> dict: 21 | """ 22 | Health check endpoint. 23 | """ 24 | 25 | health_data = schemas.Health( 26 | name=get_settings().PROJECT_NAME, api_version=get_settings().VERSION 27 | ) 28 | 29 | return health_data.dict() 30 | 31 | 32 | @api_router.get( 33 | "/consumer_type_values", response_model=schemas.UniqueConsumerType, status_code=200 34 | ) 35 | def consumer_type_values() -> List: 36 | """ 37 | Retrieve unique consumer types. 38 | """ 39 | 40 | # Download the data from GCS. 41 | X = pd.read_parquet(f"{get_settings().GCP_BUCKET}/X.parquet", filesystem=fs) 42 | 43 | unique_consumer_type = list(X.index.unique(level="consumer_type")) 44 | 45 | return {"values": unique_consumer_type} 46 | 47 | 48 | @api_router.get("/area_values", response_model=schemas.UniqueArea, status_code=200) 49 | def area_values() -> List: 50 | """ 51 | Retrieve unique areas. 52 | """ 53 | 54 | # Download the data from GCS. 55 | X = pd.read_parquet(f"{get_settings().GCP_BUCKET}/X.parquet", filesystem=fs) 56 | 57 | unique_area = list(X.index.unique(level="area")) 58 | 59 | return {"values": unique_area} 60 | 61 | 62 | @api_router.get( 63 | "/predictions/{area}/{consumer_type}", 64 | response_model=schemas.PredictionResults, 65 | status_code=200, 66 | ) 67 | async def get_predictions(area: int, consumer_type: int) -> Any: 68 | """ 69 | Get forecasted predictions based on the given area and consumer type. 70 | """ 71 | 72 | # Download the data from GCS. 73 | train_df = pd.read_parquet(f"{get_settings().GCP_BUCKET}/y.parquet", filesystem=fs) 74 | preds_df = pd.read_parquet( 75 | f"{get_settings().GCP_BUCKET}/predictions.parquet", filesystem=fs 76 | ) 77 | 78 | # Query the data for the given area and consumer type. 
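    # Both frames are written by the batch prediction pipeline with an (area, consumer_type, datetime_utc) MultiIndex, so .xs() below selects the single time series for the requested area / consumer type combination.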
79 | try: 80 | train_df = train_df.xs((area, consumer_type), level=["area", "consumer_type"]) 81 | preds_df = preds_df.xs((area, consumer_type), level=["area", "consumer_type"]) 82 | except KeyError: 83 | raise HTTPException( 84 | status_code=404, 85 | detail=f"No data found for the given area and consumer type: {area}, {consumer_type}", 86 | ) 87 | 88 | if len(train_df) == 0 or len(preds_df) == 0: 89 | raise HTTPException( 90 | status_code=404, 91 | detail=f"No data found for the given area and consumer type: {area}, {consumer_type}", 92 | ) 93 | 94 | # Return only the latest week of observations. 95 | train_df = train_df.sort_index().tail(24 * 7) 96 | 97 | # Prepare data to be returned. 98 | datetime_utc = train_df.index.get_level_values("datetime_utc").to_list() 99 | energy_consumption = train_df["energy_consumption"].to_list() 100 | 101 | preds_datetime_utc = preds_df.index.get_level_values("datetime_utc").to_list() 102 | preds_energy_consumption = preds_df["energy_consumption"].to_list() 103 | 104 | results = { 105 | "datetime_utc": datetime_utc, 106 | "energy_consumption": energy_consumption, 107 | "preds_datetime_utc": preds_datetime_utc, 108 | "preds_energy_consumption": preds_energy_consumption, 109 | } 110 | 111 | return results 112 | 113 | 114 | @api_router.get( 115 | "/monitoring/metrics", 116 | response_model=schemas.MonitoringMetrics, 117 | status_code=200, 118 | ) 119 | async def get_metrics() -> Any: 120 | """ 121 | Get monitoring metrics. 122 | """ 123 | 124 | # Download the data from GCS. 125 | metrics = pd.read_parquet( 126 | f"{get_settings().GCP_BUCKET}/metrics_monitoring.parquet", filesystem=fs 127 | ) 128 | 129 | datetime_utc = metrics.index.to_list() 130 | mape = metrics["MAPE"].to_list() 131 | 132 | return { 133 | "datetime_utc": datetime_utc, 134 | "mape": mape, 135 | } 136 | 137 | 138 | @api_router.get( 139 | "/monitoring/values/{area}/{consumer_type}", 140 | response_model=schemas.MonitoringValues, 141 | status_code=200, 142 | ) 143 | async def get_predictions(area: int, consumer_type: int) -> Any: 144 | """ 145 | Get forecasted predictions based on the given area and consumer type. 146 | """ 147 | 148 | # Download the data from GCS. 149 | y_monitoring = pd.read_parquet( 150 | f"{get_settings().GCP_BUCKET}/y_monitoring.parquet", filesystem=fs 151 | ) 152 | predictions_monitoring = pd.read_parquet( 153 | f"{get_settings().GCP_BUCKET}/predictions_monitoring.parquet", filesystem=fs 154 | ) 155 | 156 | # Query the data for the given area and consumer type. 157 | try: 158 | y_monitoring = y_monitoring.xs( 159 | (area, consumer_type), level=["area", "consumer_type"] 160 | ) 161 | predictions_monitoring = predictions_monitoring.xs( 162 | (area, consumer_type), level=["area", "consumer_type"] 163 | ) 164 | except KeyError: 165 | raise HTTPException( 166 | status_code=404, 167 | detail=f"No data found for the given area and consumer typefrontend: {area}, {consumer_type}", 168 | ) 169 | 170 | if len(y_monitoring) == 0 or len(predictions_monitoring) == 0: 171 | raise HTTPException( 172 | status_code=404, 173 | detail=f"No data found for the given area and consumer type: {area}, {consumer_type}", 174 | ) 175 | 176 | # Prepare data to be returned. 
177 | y_monitoring_datetime_utc = y_monitoring.index.get_level_values( 178 | "datetime_utc" 179 | ).to_list() 180 | y_monitoring_energy_consumption = y_monitoring["energy_consumption"].to_list() 181 | 182 | predictions_monitoring_datetime_utc = predictions_monitoring.index.get_level_values( 183 | "datetime_utc" 184 | ).to_list() 185 | predictions_monitoring_energy_consumptionc = predictions_monitoring[ 186 | "energy_consumption" 187 | ].to_list() 188 | 189 | results = { 190 | "y_monitoring_datetime_utc": y_monitoring_datetime_utc, 191 | "y_monitoring_energy_consumption": y_monitoring_energy_consumption, 192 | "predictions_monitoring_datetime_utc": predictions_monitoring_datetime_utc, 193 | "predictions_monitoring_energy_consumptionc": predictions_monitoring_energy_consumptionc, 194 | } 195 | 196 | return results 197 | -------------------------------------------------------------------------------- /app-api/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "app-api" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Iusztin Paul ", "Kurtis Pykes "] 6 | readme = "README.md" 7 | packages = [{include = "api"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = "~3.9" 11 | pandas = ">=1.5.3,<1.6.0" 12 | uvicorn = ">=0.21.0,<0.22.0" 13 | fastapi = ">=0.94.1,<0.95.0" 14 | pydantic = ">=1.10.6,<1.11.0" 15 | pyarrow = ">=11.0.0,<11.1.0" 16 | gcsfs = "2023.3.0" 17 | python-dotenv = "0.21.1" 18 | 19 | 20 | [build-system] 21 | requires = ["poetry-core"] 22 | build-backend = "poetry.core.masonry.api" 23 | -------------------------------------------------------------------------------- /app-api/run.sh: -------------------------------------------------------------------------------- 1 | /usr/local/bin/python -m api 2 | -------------------------------------------------------------------------------- /app-frontend/.dockerignore: -------------------------------------------------------------------------------- 1 | jupyter_notebooks* 2 | */env* 3 | */venv* 4 | venv 5 | env 6 | .circleci* 7 | *.env 8 | *.log 9 | .git 10 | .gitignore 11 | .tox -------------------------------------------------------------------------------- /app-frontend/.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | base = "dark" 3 | -------------------------------------------------------------------------------- /app-frontend/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.8 2 | 3 | WORKDIR /app/src 4 | 5 | RUN apt-get update && apt-get upgrade -y 6 | RUN pip install --no-cache -U pip 7 | RUN pip install --no-cache poetry==1.4.2 8 | 9 | # Configuring poetry. 10 | RUN poetry config virtualenvs.create false 11 | 12 | # First copy & install requirements to speed up the build process in case only the code changes. 13 | COPY ./app-frontend/pyproject.toml /app/src/ 14 | COPY ./app-frontend/poetry.lock /app/src/ 15 | 16 | RUN poetry install --no-interaction --no-root -vvv 17 | 18 | # Copy the rest of the files. 
19 | ADD ./app-frontend /app/src 20 | 21 | CMD ["streamlit", "run", "frontend/main.py", "--server.port", "8501"] -------------------------------------------------------------------------------- /app-frontend/README.md: -------------------------------------------------------------------------------- 1 | # Frontend - Web APP 2 | 3 | Check out [Lesson 6](https://towardsdatascience.com/fastapi-and-streamlit-the-python-duo-you-must-know-about-72825def1243) on Medium to better understand how we built the Streamlit predictions dashboard. 4 | 5 | ## Install for Development 6 | 7 | Create virtual environment: 8 | ```shell 9 | cd app-frontend 10 | poetry shell 11 | poetry install 12 | ``` 13 | 14 | **NOTE:** Be sure that the API is already running. 15 | 16 | 17 | ## Usage for Development 18 | 19 | To start the app, run the following: 20 | ```shell 21 | streamlit run frontend/main.py --server.port 8501 22 | ``` 23 | 24 | Access http://127.0.0.1:8501/ to see the app. 25 | -------------------------------------------------------------------------------- /app-frontend/frontend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/app-frontend/frontend/__init__.py -------------------------------------------------------------------------------- /app-frontend/frontend/components.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import requests 3 | 4 | import pandas as pd 5 | import plotly.graph_objects as go 6 | 7 | from settings import API_URL 8 | 9 | 10 | def build_data_plot(area: int, consumer_type: int): 11 | """ 12 | Build plotly graph for data. 13 | """ 14 | 15 | # Get predictions from API. 16 | response = requests.get( 17 | API_URL / "predictions" / f"{area}" / f"{consumer_type}", verify=False 18 | ) 19 | if response.status_code != 200: 20 | # If the response is invalid, build empty dataframes in the proper format. 21 | train_df = build_dataframe([], []) 22 | preds_df = build_dataframe([], []) 23 | 24 | title = "NO DATA AVAILABLE FOR THE GIVEN AREA AND CONSUMER TYPE" 25 | else: 26 | json_response = response.json() 27 | 28 | # Build DataFrames for plotting. 29 | datetime_utc = json_response.get("datetime_utc") 30 | energy_consumption = json_response.get("energy_consumption") 31 | pred_datetime_utc = json_response.get("preds_datetime_utc") 32 | pred_energy_consumption = json_response.get("preds_energy_consumption") 33 | 34 | train_df = build_dataframe(datetime_utc, energy_consumption) 35 | preds_df = build_dataframe(pred_datetime_utc, pred_energy_consumption) 36 | 37 | title = "Energy Consumption per DE35 Industry Code per Hour" 38 | 39 | # Create plot. 40 | fig = go.Figure() 41 | fig.update_layout( 42 | title=dict( 43 | text=title, 44 | font=dict(family="Arial", size=16), 45 | ), 46 | showlegend=True, 47 | ) 48 | fig.update_xaxes(title_text="Datetime UTC") 49 | fig.update_yaxes(title_text="Total Consumption") 50 | fig.add_scatter( 51 | x=train_df["datetime_utc"], 52 | y=train_df["energy_consumption"], 53 | name="Observations", 54 | line=dict(color="#C4B6B6"), 55 | hovertemplate="
".join(["Datetime: %{x}", "Energy Consumption: %{y} kWh"]), 56 | ) 57 | fig.add_scatter( 58 | x=preds_df["datetime_utc"], 59 | y=preds_df["energy_consumption"], 60 | name="Predictions", 61 | line=dict(color="#FFC703"), 62 | hovertemplate="
".join(["Datetime: %{x}", "Total Consumption: %{y} kWh"]), 63 | ) 64 | 65 | return fig 66 | 67 | 68 | def build_dataframe(datetime_utc: List[int], energy_consumption_values: List[float]): 69 | """ 70 | Build DataFrame for plotting from timestamps and energy consumption values. 71 | 72 | Args: 73 | datetime_utc (List[int]): list of timestamp values in UTC 74 | values (List[float]): list of energy consumption values 75 | """ 76 | 77 | df = pd.DataFrame( 78 | list(zip(datetime_utc, energy_consumption_values)), 79 | columns=["datetime_utc", "energy_consumption"], 80 | ) 81 | df["datetime_utc"] = pd.to_datetime(df["datetime_utc"], unit="h") 82 | 83 | # Resample to hourly frequency to make the data continuous. 84 | df = df.set_index("datetime_utc") 85 | df = df.resample("H").asfreq() 86 | df = df.reset_index() 87 | 88 | return df 89 | -------------------------------------------------------------------------------- /app-frontend/frontend/main.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | import streamlit as st 4 | 5 | from settings import API_URL, TITLE 6 | from components import build_data_plot 7 | 8 | 9 | st.set_page_config(page_title=TITLE) 10 | st.title(TITLE) 11 | 12 | 13 | # Create dropdown for area selection. 14 | area_response = requests.get(API_URL / "area_values") 15 | 16 | area = st.selectbox( 17 | label="Denmark is divided in two price areas, or bidding zones,\ 18 | divided by the Great Belt. DK1 (shown as 1) is west of the Great Belt \ 19 | and DK2 (shown as 2) is east of the Great Belt.", 20 | options=area_response.json().get("values", []), 21 | ) 22 | 23 | # Create dropdown for consumer type selection. 24 | consumer_type_response = requests.get(API_URL / "consumer_type_values") 25 | 26 | consumer_type = st.selectbox( 27 | label="The consumer type is the Industry Code DE35 which is owned \ 28 | and maintained by Danish Energy, a non-commercial lobby \ 29 | organization for Danish energy companies. \ 30 | The code is used by Danish energy companies.", 31 | options=consumer_type_response.json().get("values", []), 32 | ) 33 | 34 | 35 | # Check if both area and consumer type have values listed, then create plot for data. 
36 | if area and consumer_type: 37 | st.plotly_chart(build_data_plot(area, consumer_type)) 38 | -------------------------------------------------------------------------------- /app-frontend/frontend/settings.py: -------------------------------------------------------------------------------- 1 | from yarl import URL 2 | 3 | 4 | TITLE = "Energy Consumption Forecasting" 5 | API_URL = URL("http://172.17.0.1:8001/api/v1") 6 | -------------------------------------------------------------------------------- /app-frontend/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "app-frontend" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Iusztin Paul ", "Kurtis Pykes "] 6 | readme = "README.md" 7 | packages = [{include = "frontend"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = ">3.9.7,<3.10" 11 | streamlit = ">=1.20.0,<1.21.0" 12 | plotly = ">=5.14.1,<5.15.0" 13 | yarl = "^1.8.2" 14 | 15 | 16 | [build-system] 17 | requires = ["poetry-core"] 18 | build-backend = "poetry.core.masonry.api" 19 | -------------------------------------------------------------------------------- /app-monitoring/.dockerignore: -------------------------------------------------------------------------------- 1 | jupyter_notebooks* 2 | */env* 3 | */venv* 4 | venv 5 | env 6 | .circleci* 7 | *.env 8 | *.log 9 | .git 10 | .gitignore 11 | .tox -------------------------------------------------------------------------------- /app-monitoring/.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | base = "dark" 3 | -------------------------------------------------------------------------------- /app-monitoring/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.8 2 | 3 | WORKDIR /app/src 4 | 5 | RUN apt-get update && apt-get upgrade -y 6 | RUN pip install --no-cache -U pip 7 | RUN pip install --no-cache poetry==1.4.2 8 | 9 | # Configuring poetry. 10 | RUN poetry config virtualenvs.create false 11 | 12 | # First copy & install requirements to speed up the build process in case only the code changes. 13 | COPY ./app-monitoring/pyproject.toml /app/src/ 14 | COPY ./app-monitoring/poetry.lock /app/src/ 15 | 16 | RUN poetry install --no-interaction --no-root -vvv 17 | 18 | # Copy the rest of the files. 19 | ADD ./app-monitoring /app/src 20 | 21 | CMD ["streamlit", "run", "monitoring/main.py", "--server.port", "8502"] -------------------------------------------------------------------------------- /app-monitoring/README.md: -------------------------------------------------------------------------------- 1 | # Monitoring - Web APP 2 | 3 | Check out [Lesson 6](https://towardsdatascience.com/fastapi-and-streamlit-the-python-duo-you-must-know-about-72825def1243) on Medium to better understand how we built the Streamlit monitoring dashboard. 4 | 5 | ## Install for Development 6 | 7 | Create virtual environment: 8 | ```shell 9 | cd app-monitoring 10 | poetry shell 11 | poetry install 12 | ``` 13 | 14 | **NOTE:** Be sure that the API is already running. 15 | 16 | 17 | ## Usage for Development 18 | 19 | To start the app, run the following: 20 | ```shell 21 | streamlit run monitoring/main.py --server.port 8502 22 | ``` 23 | 24 | Access http://127.0.0.1:8502/ to see the app. 
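If the dashboard cannot reach the backend, a quick sanity check (assuming the API runs locally with its default settings, i.e. port `8001` and the `v1` prefix) is to call its health endpoint:
```shell
curl http://127.0.0.1:8001/api/v1/health
```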
25 | -------------------------------------------------------------------------------- /app-monitoring/monitoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/app-monitoring/monitoring/__init__.py -------------------------------------------------------------------------------- /app-monitoring/monitoring/components.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import requests 3 | 4 | import pandas as pd 5 | import plotly.graph_objects as go 6 | 7 | from settings import API_URL 8 | 9 | 10 | def build_metrics_plot(): 11 | """ 12 | Build plotly graph for metrics. 13 | """ 14 | 15 | response = requests.get(API_URL / "monitoring" / "metrics", verify=False) 16 | if response.status_code != 200: 17 | # If the response is invalid, build empty dataframes in the proper format. 18 | metrics_df = build_dataframe([], [], values_column_name="mape") 19 | 20 | title = "No metrics available." 21 | else: 22 | json_response = response.json() 23 | 24 | # Build DataFrame for plotting. 25 | datetime_utc = json_response.get("datetime_utc", []) 26 | mape = json_response.get("mape", []) 27 | metrics_df = build_dataframe(datetime_utc, mape, values_column_name="mape") 28 | 29 | title = "Predictions vs. Observations | Aggregated Metrics" 30 | 31 | # Create plot. 32 | fig = go.Figure() 33 | fig.update_layout( 34 | title=dict( 35 | text=title, 36 | font=dict(family="Arial", size=16), 37 | ), 38 | showlegend=True, 39 | ) 40 | fig.update_xaxes(title_text="Datetime UTC") 41 | fig.update_yaxes(title_text="MAPE") 42 | fig.add_scatter( 43 | x=metrics_df["datetime_utc"], 44 | y=metrics_df["mape"], 45 | name="MAPE", 46 | line=dict(color="#C4B6B6"), 47 | hovertemplate="
".join(["Datetime UTC: %{x}", "MAPE: %{y} kWh"]), 48 | ) 49 | 50 | return fig 51 | 52 | 53 | def build_data_plot(area: int, consumer_type: int): 54 | """ 55 | Build plotly graph for data. 56 | """ 57 | 58 | # Get predictions from API. 59 | response = requests.get( 60 | API_URL / "monitoring" / "values" / f"{area}" / f"{consumer_type}", verify=False 61 | ) 62 | if response.status_code != 200: 63 | # If the response is invalid, build empty dataframes in the proper format. 64 | train_df = build_dataframe([], []) 65 | preds_df = build_dataframe([], []) 66 | 67 | title = "NO DATA AVAILABLE FOR THE GIVEN AREA AND CONSUMER TYPE" 68 | else: 69 | json_response = response.json() 70 | 71 | # Build DataFrames for plotting. 72 | y_monitoring_datetime_utc = json_response.get("y_monitoring_datetime_utc", []) 73 | y_monitoring_energy_consumption = json_response.get( 74 | "y_monitoring_energy_consumption", [] 75 | ) 76 | predictions_monitoring_datetime_utc = json_response.get( 77 | "predictions_monitoring_datetime_utc", [] 78 | ) 79 | predictions_monitoring_energy_consumptionc = json_response.get( 80 | "predictions_monitoring_energy_consumptionc", [] 81 | ) 82 | 83 | train_df = build_dataframe( 84 | y_monitoring_datetime_utc, y_monitoring_energy_consumption 85 | ) 86 | preds_df = build_dataframe( 87 | predictions_monitoring_datetime_utc, 88 | predictions_monitoring_energy_consumptionc, 89 | ) 90 | 91 | title = "Predictions vs. Observations | Energy Consumption" 92 | 93 | # Create plot. 94 | fig = go.Figure() 95 | fig.update_layout( 96 | title=dict( 97 | text=title, 98 | font=dict(family="Arial", size=16), 99 | ), 100 | showlegend=True, 101 | ) 102 | fig.update_xaxes(title_text="Datetime UTC") 103 | fig.update_yaxes(title_text="Total Consumption") 104 | fig.add_scatter( 105 | x=train_df["datetime_utc"], 106 | y=train_df["energy_consumption"], 107 | name="Observations", 108 | line=dict(color="#C4B6B6"), 109 | hovertemplate="
".join( 110 | ["Datetime UTC: %{x}", "Energy Consumption: %{y} kWh"] 111 | ), 112 | ) 113 | fig.add_scatter( 114 | x=preds_df["datetime_utc"], 115 | y=preds_df["energy_consumption"], 116 | name="Predictions", 117 | line=dict(color="#FFC703"), 118 | hovertemplate="
".join( 119 | ["Datetime UTC: %{x}", "Total Consumption: %{y} kWh"] 120 | ), 121 | ) 122 | 123 | return fig 124 | 125 | 126 | def build_dataframe( 127 | datetime_utc: List[int], 128 | energy_consumption_values: List[float], 129 | values_column_name: str = "energy_consumption", 130 | ): 131 | """ 132 | Build DataFrame for plotting from timestamps and energy consumption values. 133 | 134 | Args: 135 | datetime_utc (List[int]): list of timestamp values in UTC 136 | values (List[float]): list of energy consumption values 137 | values_column_name (str): name of the column containing the values 138 | """ 139 | 140 | df = pd.DataFrame( 141 | list(zip(datetime_utc, energy_consumption_values)), 142 | columns=["datetime_utc", values_column_name], 143 | ) 144 | df["datetime_utc"] = pd.to_datetime(df["datetime_utc"], unit="h") 145 | 146 | # Resample to hourly frequency to make the data continuous. 147 | df = df.set_index("datetime_utc") 148 | df = df.resample("H").asfreq() 149 | df = df.reset_index() 150 | 151 | return df 152 | -------------------------------------------------------------------------------- /app-monitoring/monitoring/main.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | import streamlit as st 4 | 5 | from settings import API_URL, TITLE 6 | from components import build_metrics_plot, build_data_plot 7 | 8 | 9 | st.set_page_config(page_title=TITLE) 10 | st.title(TITLE) 11 | 12 | # Create plot for metrics over time. 13 | st.plotly_chart(build_metrics_plot()) 14 | 15 | st.divider() 16 | 17 | 18 | # Create dropdown for area selection. 19 | area_response = requests.get(API_URL / "area_values") 20 | 21 | area = st.selectbox( 22 | label="Denmark is divided in two price areas, or bidding zones,\ 23 | divided by the Great Belt. DK1 (shown as 1) is west of the Great Belt \ 24 | and DK2 (shown as 2) is east of the Great Belt.", 25 | options=area_response.json().get("values", []), 26 | ) 27 | 28 | # Create dropdown for consumer type selection. 29 | consumer_type_response = requests.get(API_URL / "consumer_type_values") 30 | 31 | consumer_type = st.selectbox( 32 | label="The consumer type is the Industry Code DE35 which is owned \ 33 | and maintained by Danish Energy, a non-commercial lobby \ 34 | organization for Danish energy companies. \ 35 | The code is used by Danish energy companies.", 36 | options=consumer_type_response.json().get("values", []), 37 | ) 38 | 39 | 40 | # Check if both area and consumer type have values listed, then create plot for data. 
41 | if area and consumer_type: 42 | st.plotly_chart(build_data_plot(area, consumer_type)) 43 | -------------------------------------------------------------------------------- /app-monitoring/monitoring/settings.py: -------------------------------------------------------------------------------- 1 | from yarl import URL 2 | 3 | 4 | TITLE = "Monitoring | Energy Consumption" 5 | API_URL = URL("http://172.17.0.1:8001/api/v1") 6 | -------------------------------------------------------------------------------- /app-monitoring/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "app-monitoring" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Iusztin Paul "] 6 | readme = "README.md" 7 | packages = [{include = "monitoring"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = ">3.9.7,<3.10" 11 | streamlit = ">=1.21.0,<1.22.0" 12 | plotly = ">=5.14.1,<5.15.0" 13 | yarl = "^1.8.2" 14 | 15 | 16 | [build-system] 17 | requires = ["poetry-core"] 18 | build-backend = "poetry.core.masonry.api" 19 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/.env.default: -------------------------------------------------------------------------------- 1 | FS_API_KEY = "" 2 | FS_PROJECT_NAME = "" 3 | WANDB_API_KEY = "" 4 | WANDB_ENTITY = "teaching-mlops" 5 | WANDB_PROJECT = "energy_consumption" 6 | GOOGLE_CLOUD_PROJECT = "energy_consumption" 7 | GOOGLE_CLOUD_BUCKET_NAME = "hourly-batch-predictions" 8 | GOOGLE_CLOUD_SERVICE_ACCOUNT_JSON_PATH = "path/to/your/service-account.json" 9 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Batch Prediction Pipeline 2 | 3 | Check out [Lesson 3](https://towardsdatascience.com/unlock-the-secret-to-efficient-batch-prediction-pipelines-using-python-a-feature-store-and-gcs-17a1462ca489) on Medium to better understand how we built the batch prediction pipeline. 4 | 5 | Also, check out [Lesson 5](https://towardsdatascience.com/ensuring-trustworthy-ml-systems-with-data-validation-and-real-time-monitoring-89ab079f4360) to learn how we implemented the monitoring layer to compute the model's real-time performance. 6 | 7 | ## Install for Development 8 | 9 | The batch prediction pipeline uses the training pipeline module as a dependency. Thus, as a first step, we must ensure that the training pipeline module is published to our private PyPi server. 10 | 11 | **NOTE:** Make sure that your private PyPi server is running. Check the [Usage section](https://github.com/iusztinpaul/energy-forecasting#the-pipeline) if it isn't. 12 | 13 | Build & publish the `training-pipeline` to your private PyPi server: 14 | ```shell 15 | cd training-pipeline 16 | poetry build 17 | poetry publish -r my-pypi 18 | cd .. 19 | ``` 20 | 21 | Install the virtual environment for `batch-prediction-pipeline`: 22 | ```shell 23 | cd batch-prediction-pipeline 24 | poetry shell 25 | poetry install 26 | ``` 27 | 28 | Check the [Set Up Additional Tools](https://github.com/iusztinpaul/energy-forecasting#-set-up-additional-tools-) and [Usage](https://github.com/iusztinpaul/energy-forecasting#usage) sections to see **how to set up** the **additional tools** and **credentials** you need to run this project. 
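As a reminder, `poetry publish -r my-pypi` only works once the `my-pypi` repository is registered with Poetry. A minimal sketch of that one-time setup, assuming the private PyPI server from the Airflow `docker-compose` is exposed at `http://localhost` and that you already created the credentials described in the main README (the username and password below are placeholders):

```shell
poetry config repositories.my-pypi "http://localhost"
poetry config http-basic.my-pypi "<your-username>" "<your-password>"
```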
29 | 30 | ## Usage for Development 31 | 32 | To start batch prediction script, run: 33 | ```shell 34 | python -m batch_prediction_pipeline.batch 35 | ``` 36 | 37 | To compute the monitoring metrics based, run the following: 38 | ```shell 39 | python -m batch_prediction_pipeline.monitoring 40 | ``` 41 | 42 | **NOTE:** Be careful to complete the `.env` file and set the `ML_PIPELINE_ROOT_DIR` variable as explained in the [Set Up the ML_PIPELINE_ROOT_DIR Variable](https://github.com/iusztinpaul/energy-forecasting#set-up-the-ml_pipeline_root_dir-variable) section of the main README. 43 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/batch_prediction_pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/batch-prediction-pipeline/batch_prediction_pipeline/__init__.py -------------------------------------------------------------------------------- /batch-prediction-pipeline/batch_prediction_pipeline/batch.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | import hopsworks 6 | import pandas as pd 7 | 8 | from batch_prediction_pipeline import data 9 | from batch_prediction_pipeline import settings 10 | from batch_prediction_pipeline import utils 11 | 12 | 13 | logger = utils.get_logger(__name__) 14 | 15 | 16 | def predict( 17 | fh: int = 24, 18 | feature_view_version: Optional[int] = None, 19 | model_version: Optional[int] = None, 20 | start_datetime: Optional[datetime] = None, 21 | end_datetime: Optional[datetime] = None, 22 | ) -> None: 23 | """Main function used to do batch predictions. 24 | 25 | Args: 26 | fh (int, optional): forecast horizon. Defaults to 24. 27 | feature_view_version (Optional[int], optional): feature store feature view version. If None is provided, it will try to load it from the cached feature_view_metadata.json file. 28 | model_version (Optional[int], optional): model version to load from the model registry. If None is provided, it will try to load it from the cached train_metadata.json file. 29 | start_datetime (Optional[datetime], optional): start datetime used for extracting features for predictions. If None is provided, it will try to load it from the cached feature_pipeline_metadata.json file. 30 | end_datetime (Optional[datetime], optional): end datetime used for extracting features for predictions. If None is provided, it will try to load it from the cached feature_pipeline_metadata.json file. 
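    Example:
        Calling `predict()` with no arguments (as this module does when executed as a script) loads the feature view version, the model version and the datetime window from the metadata JSON files cached in the output directory.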
31 | """ 32 | 33 | if feature_view_version is None: 34 | feature_view_metadata = utils.load_json("feature_view_metadata.json") 35 | feature_view_version = feature_view_metadata["feature_view_version"] 36 | if model_version is None: 37 | train_metadata = utils.load_json("train_metadata.json") 38 | model_version = train_metadata["model_version"] 39 | if start_datetime is None or end_datetime is None: 40 | feature_pipeline_metadata = utils.load_json("feature_pipeline_metadata.json") 41 | start_datetime = datetime.strptime( 42 | feature_pipeline_metadata["export_datetime_utc_start"], 43 | feature_pipeline_metadata["datetime_format"], 44 | ) 45 | end_datetime = datetime.strptime( 46 | feature_pipeline_metadata["export_datetime_utc_end"], 47 | feature_pipeline_metadata["datetime_format"], 48 | ) 49 | 50 | logger.info("Connecting to the feature store...") 51 | project = hopsworks.login( 52 | api_key_value=settings.SETTINGS["FS_API_KEY"], 53 | project=settings.SETTINGS["FS_PROJECT_NAME"], 54 | ) 55 | fs = project.get_feature_store() 56 | logger.info("Successfully connected to the feature store.") 57 | 58 | logger.info("Loading data from feature store...") 59 | logger.info(f"Loading features from {start_datetime} to {end_datetime}.") 60 | X, y = data.load_data_from_feature_store( 61 | fs, 62 | feature_view_version, 63 | start_datetime=start_datetime, 64 | end_datetime=end_datetime, 65 | ) 66 | logger.info("Successfully loaded data from feature store.") 67 | 68 | logger.info("Loading model from model registry...") 69 | model = load_model_from_model_registry(project, model_version) 70 | logger.info("Successfully loaded model from model registry.") 71 | 72 | logger.info("Making predictions...") 73 | predictions = forecast(model, X, fh=fh) 74 | predictions_start_datetime = predictions.index.get_level_values( 75 | level="datetime_utc" 76 | ).min() 77 | predictions_end_datetime = predictions.index.get_level_values( 78 | level="datetime_utc" 79 | ).max() 80 | logger.info( 81 | f"Forecasted energy consumption from {predictions_start_datetime} to {predictions_end_datetime}." 82 | ) 83 | logger.info("Successfully made predictions.") 84 | 85 | logger.info("Saving predictions...") 86 | save(X, y, predictions) 87 | logger.info("Successfully saved predictions.") 88 | 89 | # Save the predictions to the bucket for monitoring. 90 | logger.info("Merging predictions with cached predictions...") 91 | save_for_monitoring(predictions, start_datetime) 92 | logger.info("Successfully merged predictions with cached predictions...") 93 | 94 | 95 | def load_model_from_model_registry(project, model_version: int): 96 | """ 97 | This function loads a model from the Model Registry. 98 | The model is downloaded, saved locally, and loaded into memory. 99 | """ 100 | 101 | mr = project.get_model_registry() 102 | model_registry_reference = mr.get_model(name="best_model", version=model_version) 103 | model_dir = model_registry_reference.download() 104 | model_path = Path(model_dir) / "best_model.pkl" 105 | 106 | model = utils.load_model(model_path) 107 | 108 | return model 109 | 110 | 111 | def forecast(model, X: pd.DataFrame, fh: int = 24): 112 | """ 113 | Get a forecast of the total load for the given areas and consumer types. 114 | 115 | Args: 116 | model (sklearn.base.BaseEstimator): Fitted model that implements the predict method. 117 | X (pd.DataFrame): Exogenous data with area, consumer_type, and datetime_utc as index. 118 | fh (int): Forecast horizon. 
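            The horizon is expressed in hours, as the forecast index is built with an hourly frequency.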
119 | 120 | Returns: 121 | pd.DataFrame: Forecast of total load for each area, consumer_type, and datetime_utc. 122 | """ 123 | 124 | all_areas = X.index.get_level_values(level=0).unique() 125 | all_consumer_types = X.index.get_level_values(level=1).unique() 126 | latest_datetime = X.index.get_level_values(level=2).max() 127 | 128 | start = latest_datetime + 1 129 | end = start + fh - 1 130 | fh_range = pd.date_range( 131 | start=start.to_timestamp(), end=end.to_timestamp(), freq="H" 132 | ) 133 | fh_range = pd.PeriodIndex(fh_range, freq="H") 134 | 135 | index = pd.MultiIndex.from_product( 136 | [all_areas, all_consumer_types, fh_range], 137 | names=["area", "consumer_type", "datetime_utc"], 138 | ) 139 | X_forecast = pd.DataFrame(index=index) 140 | X_forecast["area_exog"] = X_forecast.index.get_level_values(0) 141 | X_forecast["consumer_type_exog"] = X_forecast.index.get_level_values(1) 142 | 143 | predictions = model.predict(X=X_forecast) 144 | 145 | return predictions 146 | 147 | 148 | def save(X: pd.DataFrame, y: pd.DataFrame, predictions: pd.DataFrame): 149 | """Save the input data, target data, and predictions to GCS.""" 150 | 151 | # Get the bucket object from the GCS client. 152 | bucket = utils.get_bucket() 153 | 154 | # Save the input data and target data to the bucket. 155 | for df, blob_name in zip( 156 | [X, y, predictions], ["X.parquet", "y.parquet", "predictions.parquet"] 157 | ): 158 | logger.info(f"Saving {blob_name} to bucket...") 159 | utils.write_blob_to( 160 | bucket=bucket, 161 | blob_name=blob_name, 162 | data=df, 163 | ) 164 | logger.info(f"Successfully saved {blob_name} to bucket.") 165 | 166 | 167 | def save_for_monitoring(predictions: pd.DataFrame, start_datetime: datetime): 168 | """Save predictions to GCS for monitoring. 169 | 170 | The predictions are saved as a parquet file in GCS. 171 | The predictions are saved in a bucket with the following structure: 172 | gs:///predictions_monitoring.parquet 173 | 174 | The predictions are stored in a multiindex dataframe with the following indexes: 175 | - area: The area of the predictions, e.g. "DK1". 176 | - consumer_type: The consumer type of the predictions, e.g. "residential". 177 | - datetime_utc: The timestamp of the predictions, e.g. "2020-01-01 00:00:00" with a frequency of 1 hour. 178 | """ 179 | 180 | bucket = utils.get_bucket() 181 | 182 | cached_predictions = utils.read_blob_from( 183 | bucket=bucket, blob_name=f"predictions_monitoring.parquet" 184 | ) 185 | has_cached_predictions = cached_predictions is not None 186 | if has_cached_predictions is True: 187 | # Merge predictions with cached predictions. 188 | cached_predictions.index = cached_predictions.index.set_levels( 189 | pd.to_datetime(cached_predictions.index.levels[2], unit="h").to_period("H"), 190 | level=2, 191 | ) 192 | 193 | merged_predictions = predictions.merge( 194 | cached_predictions, 195 | left_index=True, 196 | right_index=True, 197 | how="outer", 198 | suffixes=("_new", "_cached"), 199 | ) 200 | new_predictions = merged_predictions.filter(regex=".*?_new") 201 | new_predictions.columns = new_predictions.columns.str.replace("_new", "") 202 | cached_predictions = merged_predictions.filter(regex=".*?_cached") 203 | cached_predictions.columns = cached_predictions.columns.str.replace( 204 | "_cached", "" 205 | ) 206 | 207 | # NOTE: fillna() not working properly on multindex DataFrames. Got nasty bugs because of it. 
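        # DataFrame.update() aligns both frames on the shared (area, consumer_type, datetime_utc) MultiIndex and copies the cache's non-NA values into new_predictions in place, which fills in the rows that exist only in the cached predictions.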
208 | new_predictions.update(cached_predictions) 209 | predictions = new_predictions 210 | 211 | predictions = predictions.loc[ 212 | predictions.index.get_level_values("datetime_utc") 213 | >= pd.Period(start_datetime, freq="H") 214 | ] 215 | predictions = predictions.dropna(subset=["energy_consumption"]) 216 | 217 | utils.write_blob_to( 218 | bucket=bucket, 219 | blob_name=f"predictions_monitoring.parquet", 220 | data=predictions, 221 | ) 222 | logger.info(f"Successfully cached predictions forecasted before {start_datetime}.") 223 | 224 | 225 | if __name__ == "__main__": 226 | predict() 227 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/batch_prediction_pipeline/data.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Tuple 3 | 4 | import pandas as pd 5 | 6 | from hsfs.feature_store import FeatureStore 7 | 8 | 9 | def load_data_from_feature_store( 10 | fs: FeatureStore, 11 | feature_view_version: int, 12 | start_datetime: datetime, 13 | end_datetime: datetime, 14 | target: str = "energy_consumption", 15 | ) -> Tuple[pd.DataFrame, pd.DataFrame]: 16 | """Loads data for a given time range from the feature store. 17 | 18 | Args: 19 | fs: Feature store. 20 | feature_view_version: Feature view version. 21 | start_datetime: Start datetime. 22 | end_datetime: End datetime. 23 | target: Name of the target feature. 24 | 25 | Returns: 26 | Tuple of exogenous variables and the time series to be forecasted. 27 | """ 28 | 29 | feature_view = fs.get_feature_view( 30 | name="energy_consumption_denmark_view", version=feature_view_version 31 | ) 32 | data = feature_view.get_batch_data(start_time=start_datetime, end_time=end_datetime) 33 | 34 | # Set the index as is required by sktime. 35 | data["datetime_utc"] = pd.PeriodIndex(data["datetime_utc"], freq="H") 36 | data = data.set_index(["area", "consumer_type", "datetime_utc"]).sort_index() 37 | 38 | # Prepare exogenous variables. 39 | X = data.drop(columns=[target]) 40 | # Prepare the time series to be forecasted. 41 | y = data[[target]] 42 | 43 | return X, y 44 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/batch_prediction_pipeline/monitoring.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import hopsworks 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from sktime.performance_metrics.forecasting import mean_absolute_percentage_error 8 | 9 | from batch_prediction_pipeline import data 10 | from batch_prediction_pipeline import settings 11 | from batch_prediction_pipeline import utils 12 | 13 | 14 | logger = utils.get_logger(__name__) 15 | 16 | 17 | def compute(feature_view_version: Optional[int] = None) -> None: 18 | """Computes the metrics on the latest n_days of predictions. 19 | 20 | Args: 21 | feature_view_version: The version of the feature view to load data from the feature store. If None is provided, it will try to load it from the cached feature_view_metadata.json file. 
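        The aggregated metric is the MAPE between the cached predictions and the ground truth loaded from the feature store, computed per datetime_utc and written to the metrics_monitoring.parquet blob in GCS.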
22 | """ 23 | 24 | if feature_view_version is None: 25 | feature_view_metadata = utils.load_json("feature_view_metadata.json") 26 | feature_view_version = feature_view_metadata["feature_view_version"] 27 | 28 | logger.info("Loading old predictions...") 29 | bucket = utils.get_bucket() 30 | predictions = utils.read_blob_from( 31 | bucket=bucket, blob_name=f"predictions_monitoring.parquet" 32 | ) 33 | if predictions is None or len(predictions) == 0: 34 | logger.info( 35 | "Haven't found any predictions to compute the metrics on. Exiting..." 36 | ) 37 | 38 | return 39 | predictions.index = predictions.index.set_levels( 40 | pd.to_datetime(predictions.index.levels[2], unit="h").to_period("H"), level=2 41 | ) 42 | logger.info("Successfully loaded old predictions.") 43 | 44 | logger.info("Connecting to the feature store...") 45 | project = hopsworks.login( 46 | api_key_value=settings.SETTINGS["FS_API_KEY"], 47 | project=settings.SETTINGS["FS_PROJECT_NAME"], 48 | ) 49 | fs = project.get_feature_store() 50 | logger.info("Successfully connected to the feature store.") 51 | 52 | logger.info("Loading latest data from feature store...") 53 | predictions_min_datetime_utc = ( 54 | predictions.index.get_level_values("datetime_utc").min().to_timestamp() 55 | ) 56 | predictions_max_datetime_utc = ( 57 | predictions.index.get_level_values("datetime_utc").max().to_timestamp() 58 | ) 59 | logger.info( 60 | f"Loading predictions from {predictions_min_datetime_utc} to {predictions_max_datetime_utc}." 61 | ) 62 | _, latest_observations = data.load_data_from_feature_store( 63 | fs, 64 | feature_view_version, 65 | start_datetime=predictions_min_datetime_utc, 66 | end_datetime=predictions_max_datetime_utc, 67 | ) 68 | logger.info("Successfully loaded latest data from feature store.") 69 | 70 | if len(latest_observations) == 0: 71 | logger.info( 72 | "Haven't found any new ground truths to compute the metrics on. Exiting..." 73 | ) 74 | 75 | return 76 | 77 | logger.info("Computing metrics...") 78 | predictions = predictions.rename( 79 | columns={"energy_consumption": "energy_consumption_predictions"} 80 | ) 81 | latest_observations = latest_observations.rename( 82 | columns={"energy_consumption": "energy_consumption_observations"} 83 | ) 84 | 85 | predictions["energy_consumption_observations"] = np.nan 86 | predictions.update(latest_observations) 87 | 88 | # Compute metrics only on data points that have ground truth. 89 | predictions = predictions.dropna(subset=["energy_consumption_observations"]) 90 | if len(predictions) == 0: 91 | logger.info( 92 | "Haven't found any new ground truths to compute the metrics on. Exiting..." 
93 | ) 94 | 95 | return 96 | 97 | mape_metrics = predictions.groupby("datetime_utc").apply( 98 | lambda point_in_time: mean_absolute_percentage_error( 99 | point_in_time["energy_consumption_observations"], 100 | point_in_time["energy_consumption_predictions"], 101 | symmetric=False, 102 | ) 103 | ) 104 | mape_metrics = mape_metrics.rename("MAPE") 105 | metrics = mape_metrics.to_frame() 106 | logger.info("Successfully computed metrics...") 107 | 108 | logger.info("Saving new metrics...") 109 | utils.write_blob_to( 110 | bucket=bucket, 111 | blob_name=f"metrics_monitoring.parquet", 112 | data=metrics, 113 | ) 114 | latest_observations = latest_observations.rename( 115 | columns={"energy_consumption_observations": "energy_consumption"} 116 | ) 117 | utils.write_blob_to( 118 | bucket=bucket, 119 | blob_name=f"y_monitoring.parquet", 120 | data=latest_observations[["energy_consumption"]], 121 | ) 122 | logger.info("Successfully saved new metrics.") 123 | 124 | 125 | if __name__ == "__main__": 126 | compute() 127 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/batch_prediction_pipeline/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | from dotenv import load_dotenv 7 | 8 | 9 | warnings.filterwarnings(action="ignore", category=FutureWarning, module="sktime") 10 | 11 | 12 | def load_env_vars(root_dir: Union[str, Path]) -> dict: 13 | """ 14 | Load environment variables from .env.default and .env files. 15 | 16 | Args: 17 | root_dir: Root directory of the .env files. 18 | 19 | Returns: 20 | Dictionary with the environment variables. 21 | """ 22 | 23 | if isinstance(root_dir, str): 24 | root_dir = Path(root_dir) 25 | 26 | load_dotenv(dotenv_path=root_dir / ".env.default") 27 | load_dotenv(dotenv_path=root_dir / ".env", override=True) 28 | 29 | return dict(os.environ) 30 | 31 | 32 | def get_root_dir(default_value: str = ".") -> Path: 33 | """ 34 | Get the root directory of the project. 35 | 36 | Args: 37 | default_value: Default value to use if the environment variable is not set. 38 | 39 | Returns: 40 | Path to the root directory of the project. 41 | """ 42 | 43 | return Path(os.getenv("ML_PIPELINE_ROOT_DIR", default_value)) 44 | 45 | 46 | # The settings will be loaded and the outputs will be saved relative to the 'ML_PIPELINE_ROOT_DIR' directory. 47 | ML_PIPELINE_ROOT_DIR = get_root_dir() 48 | OUTPUT_DIR = ML_PIPELINE_ROOT_DIR / "output" 49 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 50 | 51 | SETTINGS = load_env_vars(root_dir=ML_PIPELINE_ROOT_DIR) 52 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/batch_prediction_pipeline/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from pathlib import Path 4 | from typing import Optional, Union 5 | import joblib 6 | 7 | import pandas as pd 8 | 9 | from google.cloud import storage 10 | 11 | from batch_prediction_pipeline import settings 12 | 13 | 14 | def get_logger(name: str) -> logging.Logger: 15 | """ 16 | Template for getting a logger. 17 | 18 | Args: 19 | name: Name of the logger. 20 | 21 | Returns: Logger. 
22 | """ 23 | 24 | logging.basicConfig(level=logging.INFO) 25 | logger = logging.getLogger(name) 26 | 27 | return logger 28 | 29 | 30 | def load_model(model_path: Union[str, Path]): 31 | """ 32 | Template for loading a model. 33 | 34 | Args: 35 | model_path: Path to the model. 36 | 37 | Returns: Loaded model. 38 | """ 39 | 40 | return joblib.load(model_path) 41 | 42 | 43 | def save_json(data: dict, file_name: str, save_dir: str = settings.OUTPUT_DIR): 44 | """ 45 | Save a dictionary as a JSON file. 46 | 47 | Args: 48 | data: data to save. 49 | file_name: Name of the JSON file. 50 | save_dir: Directory to save the JSON file. 51 | 52 | Returns: None 53 | """ 54 | 55 | data_path = Path(save_dir) / file_name 56 | with open(data_path, "w") as f: 57 | json.dump(data, f) 58 | 59 | 60 | def load_json(file_name: str, save_dir: str = settings.OUTPUT_DIR) -> dict: 61 | """ 62 | Load a JSON file. 63 | 64 | Args: 65 | file_name: Name of the JSON file. 66 | save_dir: Directory of the JSON file. 67 | 68 | Returns: Dictionary with the data. 69 | """ 70 | 71 | data_path = Path(save_dir) / file_name 72 | with open(data_path, "r") as f: 73 | return json.load(f) 74 | 75 | 76 | def get_bucket( 77 | bucket_name: str = settings.SETTINGS["GOOGLE_CLOUD_BUCKET_NAME"], 78 | bucket_project: str = settings.SETTINGS["GOOGLE_CLOUD_PROJECT"], 79 | json_credentials_path: str = settings.SETTINGS[ 80 | "GOOGLE_CLOUD_SERVICE_ACCOUNT_JSON_PATH" 81 | ], 82 | ) -> storage.Bucket: 83 | """Get a Google Cloud Storage bucket. 84 | 85 | This function returns a Google Cloud Storage bucket that can be used to upload and download 86 | files from Google Cloud Storage. 87 | 88 | Args: 89 | bucket_name : str 90 | The name of the bucket to connect to. 91 | bucket_project : str 92 | The name of the project in which the bucket resides. 93 | json_credentials_path : str 94 | Path to the JSON credentials file for your Google Cloud Project. 95 | 96 | Returns 97 | storage.Bucket 98 | A storage bucket that can be used to upload and download files from Google Cloud Storage. 99 | """ 100 | 101 | storage_client = storage.Client.from_service_account_json( 102 | json_credentials_path=json_credentials_path, 103 | project=bucket_project, 104 | ) 105 | bucket = storage_client.bucket(bucket_name=bucket_name) 106 | 107 | return bucket 108 | 109 | 110 | def write_blob_to(bucket: storage.Bucket, blob_name: str, data: pd.DataFrame): 111 | """Write a dataframe to a GCS bucket as a parquet file. 112 | 113 | Args: 114 | bucket (google.cloud.storage.Bucket): The bucket to write to. 115 | blob_name (str): The name of the blob to write to. Must be a parquet file. 116 | data (pd.DataFrame): The dataframe to write to GCS. 117 | """ 118 | 119 | blob = bucket.blob(blob_name=blob_name) 120 | with blob.open("wb") as f: 121 | data.to_parquet(f) 122 | 123 | 124 | def read_blob_from(bucket: storage.Bucket, blob_name: str) -> Optional[pd.DataFrame]: 125 | """Reads a blob from a bucket and returns a dataframe. 126 | 127 | Args: 128 | bucket: The bucket to read from. 129 | blob_name: The name of the blob to read. 130 | 131 | Returns: 132 | A dataframe containing the data from the blob. 
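        None is returned if the blob does not exist yet (e.g., before the first batch prediction run).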
133 | """ 134 | 135 | blob = bucket.blob(blob_name=blob_name) 136 | if not blob.exists(): 137 | return None 138 | 139 | with blob.open("rb") as f: 140 | return pd.read_parquet(f) 141 | -------------------------------------------------------------------------------- /batch-prediction-pipeline/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "batch-prediction-pipeline" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Iusztin Paul "] 6 | readme = "README.md" 7 | packages = [{include = "batch_prediction_pipeline"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = "~3.9" 11 | category-encoders = "^2.6.0" 12 | hopsworks = "3.4.3" 13 | python-dotenv = "^1.0.0" 14 | lightgbm = "^3.3.5" 15 | sktime = "^0.16.1" 16 | google-cloud-storage = "^2.7.0" 17 | fire = "^0.5.0" 18 | training-pipeline = "^0.1.0" 19 | 20 | 21 | [tool.poetry.group.dev.dependencies] 22 | black = "^23.1.0" 23 | 24 | [build-system] 25 | requires = ["poetry-core"] 26 | build-backend = "poetry.core.masonry.api" 27 | 28 | [[tool.poetry.source]] 29 | name = "test" # This name will be used in the configuration to retreive the proper credentials 30 | url = "http://localhost" # URL used to download your packages from 31 | -------------------------------------------------------------------------------- /deploy/app-docker-compose.local.yml: -------------------------------------------------------------------------------- 1 | version: '3.9' 2 | 3 | services: 4 | frontend: 5 | volumes: 6 | - ./app-frontend:/app/src/ 7 | 8 | monitoring: 9 | volumes: 10 | - ./app-monitoring:/app/src/ 11 | 12 | api: 13 | volumes: 14 | - ./app-api:/app/src/ 15 | environment: 16 | # Enables autoreload. 17 | APP_API_RELOAD: "True" 18 | -------------------------------------------------------------------------------- /deploy/app-docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.9' 2 | 3 | services: 4 | frontend: 5 | build: 6 | dockerfile: app-frontend/Dockerfile 7 | image: app-frontend:${APP_FRONTEND_VERSION:-latest} 8 | restart: always 9 | ports: 10 | - 8501:8501 11 | depends_on: 12 | - api 13 | 14 | monitoring: 15 | build: 16 | dockerfile: app-monitoring/Dockerfile 17 | image: app-monitoring:${APP_MONITORING_VERSION:-latest} 18 | restart: always 19 | ports: 20 | - 8502:8502 21 | depends_on: 22 | - api 23 | 24 | api: 25 | build: 26 | dockerfile: app-api/Dockerfile 27 | image: app-api:${APP_API_VERSION:-latest} 28 | restart: always 29 | volumes: 30 | - ./credentials:/app/src/credentials 31 | env_file: 32 | - app-api/.env 33 | ports: 34 | - 8001:8001 -------------------------------------------------------------------------------- /deploy/ml-pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Build and publish the feature-pipeline, training-pipeline, and batch-prediction-pipeline packages. 4 | # This is done so that the pipelines can be run from the CLI. 5 | # The pipelines are executed in the feature-pipeline, training-pipeline, and batch-prediction-pipeline 6 | # directories, so we must change directories before building and publishing the packages. 7 | # The my-pypi repository must be defined in the project's poetry.toml file. 
8 | 9 | cd feature-pipeline 10 | poetry build 11 | poetry publish -r my-pypi 12 | 13 | cd ../training-pipeline 14 | poetry build 15 | poetry publish -r my-pypi 16 | 17 | cd ../batch-prediction-pipeline 18 | poetry build 19 | poetry publish -r my-pypi 20 | -------------------------------------------------------------------------------- /feature-pipeline/.env.default: -------------------------------------------------------------------------------- 1 | FS_API_KEY = "" 2 | FS_PROJECT_NAME = "" 3 | WANDB_API_KEY = "" 4 | WANDB_ENTITY = "teaching-mlops" 5 | WANDB_PROJECT = "energy_consumption" 6 | GOOGLE_CLOUD_PROJECT = "energy_consumption" 7 | GOOGLE_CLOUD_BUCKET_NAME = "hourly-batch-predictions" 8 | GOOGLE_CLOUD_SERVICE_ACCOUNT_JSON_PATH = "path/to/your/service-account.json" 9 | -------------------------------------------------------------------------------- /feature-pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Feature Pipeline 2 | 3 | Check out [Lesson 1](https://medium.com/towards-data-science/a-framework-for-building-a-production-ready-feature-engineering-pipeline-f0b29609b20f) on Medium to better understand how we built the FE pipeline. 4 | 5 | Also, check out [Lesson 5](https://towardsdatascience.com/ensuring-trustworthy-ml-systems-with-data-validation-and-real-time-monitoring-89ab079f4360) to learn how we implemented the data validation layer using Great Expectations. 6 | 7 | ## Install for Development 8 | 9 | Create virtual environment: 10 | ```shell 11 | cd feature-pipeline 12 | poetry shell 13 | poetry install 14 | ``` 15 | 16 | Check the [Set Up Additional Tools](https://github.com/iusztinpaul/energy-forecasting#-set-up-additional-tools-) and [Usage](https://github.com/iusztinpaul/energy-forecasting#usage) sections to see **how to set up** the **additional tools** and **credentials** you need to run this project. 17 | 18 | ## Usage for Development 19 | 20 | To start the ETL pipeline run: 21 | ```shell 22 | python -m feature_pipeline.pipeline 23 | ``` 24 | 25 | To create a new feature view run: 26 | ```shell 27 | python -m feature_pipeline.feature_view 28 | ``` 29 | 30 | **NOTE:** Be careful to complete the `.env` file and set the `ML_PIPELINE_ROOT_DIR` variable as explained in the [Set Up the ML_PIPELINE_ROOT_DIR Variable](https://github.com/iusztinpaul/energy-forecasting#set-up-the-ml_pipeline_root_dir-variable) section of the main README. 31 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/feature-pipeline/feature_pipeline/__init__.py -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/clean_feature_store.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import hopsworks 3 | 4 | from feature_pipeline import settings 5 | 6 | 7 | def clean(): 8 | """ 9 | Utility function used during development to clean all the data from the feature store.
10 | """ 11 | 12 | project = hopsworks.login( 13 | api_key_value=settings.SETTINGS["FS_API_KEY"], 14 | project=settings.SETTINGS["FS_PROJECT_NAME"], 15 | ) 16 | fs = project.get_feature_store() 17 | 18 | print("Deleting feature views and training datasets...") 19 | try: 20 | feature_views = fs.get_feature_views(name="energy_consumption_denmark_view") 21 | 22 | for feature_view in feature_views: 23 | try: 24 | feature_view.delete() 25 | except Exception as e: 26 | print(e) 27 | except Exception as e: 28 | print(e) 29 | 30 | print("Deleting feature groups...") 31 | try: 32 | feature_groups = fs.get_feature_groups(name="energy_consumption_denmark") 33 | for feature_group in feature_groups: 34 | try: 35 | feature_group.delete() 36 | except Exception as e: 37 | print(e) 38 | except Exception as e: 39 | print(e) 40 | 41 | 42 | if __name__ == "__main__": 43 | fire.Fire(clean) 44 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/etl/__init__.py: -------------------------------------------------------------------------------- 1 | from feature_pipeline.etl.cleaning import * 2 | from feature_pipeline.etl.extract import * 3 | from feature_pipeline.etl.load import * 4 | from feature_pipeline.etl.validation import * 5 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/etl/cleaning.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def rename_columns(df: pd.DataFrame) -> pd.DataFrame: 5 | """ 6 | Rename columns to match our schema. 7 | """ 8 | 9 | data = df.copy() 10 | 11 | # Drop irrelevant columns. 12 | data.drop(columns=["HourDK"], inplace=True) 13 | 14 | # Rename columns 15 | data.rename( 16 | columns={ 17 | "HourUTC": "datetime_utc", 18 | "PriceArea": "area", 19 | "ConsumerType_DE35": "consumer_type", 20 | "TotalCon": "energy_consumption", 21 | }, 22 | inplace=True, 23 | ) 24 | 25 | return data 26 | 27 | 28 | def cast_columns(df: pd.DataFrame) -> pd.DataFrame: 29 | """ 30 | Cast columns to the correct data type. 31 | """ 32 | 33 | data = df.copy() 34 | 35 | data["datetime_utc"] = pd.to_datetime(data["datetime_utc"]) 36 | data["area"] = data["area"].astype("string") 37 | data["consumer_type"] = data["consumer_type"].astype("int32") 38 | data["energy_consumption"] = data["energy_consumption"].astype("float64") 39 | 40 | return data 41 | 42 | 43 | def encode_area_column(df: pd.DataFrame) -> pd.DataFrame: 44 | """ 45 | Encode the area column to integers. 
46 | """ 47 | 48 | data = df.copy() 49 | 50 | area_mappings = {"DK": 0, "DK1": 1, "DK2": 2} 51 | 52 | data["area"] = data["area"].map(lambda string_area: area_mappings.get(string_area)) 53 | data["area"] = data["area"].astype("int8") 54 | 55 | return data 56 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/etl/extract.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from json import JSONDecodeError 3 | from pathlib import Path 4 | from pandas.errors import EmptyDataError 5 | from typing import Any, Dict, Tuple, Optional 6 | 7 | import pandas as pd 8 | import requests 9 | 10 | from yarl import URL 11 | 12 | from feature_pipeline import utils, settings 13 | 14 | 15 | logger = utils.get_logger(__name__) 16 | 17 | 18 | def from_file( 19 | export_end_reference_datetime: Optional[datetime.datetime] = None, 20 | days_delay: int = 15, 21 | days_export: int = 30, 22 | url: str = "https://drive.google.com/uc?export=download&id=1y48YeDymLurOTUO-GeFOUXVNc9MCApG5", 23 | datetime_format: str = "%Y-%m-%d %H:%M", 24 | cache_dir: Optional[Path] = None, 25 | ) -> Optional[Tuple[pd.DataFrame, Dict[str, Any]]]: 26 | """ 27 | Extract data from the DK energy consumption API. 28 | 29 | As the official API expired in July 2023, we will use a copy of the data to simulate the same behavior. 30 | We made a copy of the data between '2020-06-30 22:00' and '2023-06-30 21:00'. Thus, there are 3 years of data to play with. 31 | 32 | Here is the link to the official obsolete dataset: https://www.energidataservice.dk/tso-electricity/ConsumptionDE35Hour 33 | Here is the link to the copy of the dataset: https://drive.google.com/file/d/1y48YeDymLurOTUO-GeFOUXVNc9MCApG5/view?usp=drive_link 34 | 35 | Args: 36 | export_end_reference_datetime: The end reference datetime of the export window. If None, the current time is used. 37 | Because the data is always delayed with "days_delay" days, this date is used only as a reference point. 38 | The real extracted window will be computed as [export_end_reference_datetime - days_delay - days_export, export_end_reference_datetime - days_delay]. 39 | days_delay: Data has a delay of N days. Thus, we have to shift our window with N days. 40 | days_export: The number of days to export. 41 | url: The URL of the API. 42 | datetime_format: The datetime format of the fields from the file. 43 | cache_dir: The directory where the downloaded data will be cached. By default it will be downloaded in the standard output directory. 44 | 45 | 46 | Returns: 47 | A tuple of a Pandas DataFrame containing the exported data and a dictionary of metadata. 
48 | """ 49 | 50 | export_start, export_end = _compute_extraction_window(export_end_reference_datetime=export_end_reference_datetime, days_delay=days_delay, days_export=days_export) 51 | records = _extract_records_from_file_url(url=url, export_start=export_start, export_end=export_end, datetime_format=datetime_format, cache_dir=cache_dir) 52 | 53 | metadata = { 54 | "days_delay": days_delay, 55 | "days_export": days_export, 56 | "url": url, 57 | "export_datetime_utc_start": export_start.strftime(datetime_format), 58 | "export_datetime_utc_end": export_end.strftime(datetime_format), 59 | "datetime_format": datetime_format, 60 | "num_unique_samples_per_time_series": len(records["HourUTC"].unique()), 61 | } 62 | 63 | return records, metadata 64 | 65 | 66 | def _extract_records_from_file_url(url: str, export_start: datetime.datetime, export_end: datetime.datetime, datetime_format: str, cache_dir: Optional[Path] = None) -> Optional[pd.DataFrame]: 67 | """Extract records from the file backup based on the given export window.""" 68 | 69 | if cache_dir is None: 70 | cache_dir = settings.OUTPUT_DIR / "data" 71 | cache_dir.mkdir(parents=True, exist_ok=True) 72 | 73 | file_path = cache_dir / "ConsumptionDE35Hour.csv" 74 | if not file_path.exists(): 75 | logger.info(f"Downloading data from: {url}") 76 | 77 | try: 78 | response = requests.get(url) 79 | except requests.exceptions.HTTPError as e: 80 | logger.error( 81 | f"Response status = {response.status_code}. Could not download the file due to: {e}" 82 | ) 83 | 84 | return None 85 | 86 | if response.status_code != 200: 87 | raise ValueError(f"Response status = {response.status_code}. Could not download the file.") 88 | 89 | with file_path.open("w") as f: 90 | f.write(response.text) 91 | 92 | logger.info(f"Successfully downloaded data to: {file_path}") 93 | else: 94 | logger.info(f"Data already downloaded at: {file_path}") 95 | 96 | try: 97 | data = pd.read_csv(file_path, delimiter=";") 98 | except EmptyDataError: 99 | file_path.unlink(missing_ok=True) 100 | 101 | raise ValueError(f"Downloaded file at {file_path} is empty. Could not load it into a DataFrame.") 102 | 103 | records = data[(data["HourUTC"] >= export_start.strftime(datetime_format)) & (data["HourUTC"] < export_end.strftime(datetime_format))] 104 | 105 | return records 106 | 107 | 108 | def from_api( 109 | export_end_reference_datetime: Optional[datetime.datetime] = None, 110 | days_delay: int = 15, 111 | days_export: int = 30, 112 | url: str = "https://api.energidataservice.dk/dataset/ConsumptionDE35Hour", 113 | datetime_format: str = "%Y-%m-%dT%H:%M:%SZ" 114 | ) -> Optional[Tuple[pd.DataFrame, Dict[str, Any]]]: 115 | """ 116 | Extract data from the DK energy consumption API. 117 | 118 | IMPORTANT NOTE: This dataset will not be updated starting July 2023. The dataset will expire during 2023. 119 | Here is the link to the dataset: https://www.energidataservice.dk/tso-electricity/ConsumptionDE35Hour 120 | 121 | Args: 122 | export_end_reference_datetime: The end reference datetime of the export window. If None, the current time is used. 123 | Because the data is always delayed with "days_delay" days, this date is used only as a reference point. 124 | The real extracted window will be computed as [export_end_reference_datetime - days_delay - days_export, export_end_reference_datetime - days_delay]. 125 | days_delay: Data has a delay of N days. Thus, we have to shift our window with N days. 126 | days_export: The number of days to export. 127 | url: The URL of the API. 
128 | datetime_format: The datetime format of the fields in the API response. 129 | 130 | Returns: 131 | A tuple of a Pandas DataFrame containing the exported data and a dictionary of metadata. 132 | """ 133 | 134 | export_start, export_end = _compute_extraction_window(export_end_reference_datetime=export_end_reference_datetime, days_delay=days_delay, days_export=days_export) 135 | 136 | records = _extract_records_from_api_url(url=url, export_start=export_start, export_end=export_end) 137 | 138 | metadata = { 139 | "days_delay": days_delay, 140 | "days_export": days_export, 141 | "url": url, 142 | "export_datetime_utc_start": export_start.strftime(datetime_format), 143 | "export_datetime_utc_end": export_end.strftime(datetime_format), 144 | "datetime_format": datetime_format, 145 | "num_unique_samples_per_time_series": len(records["HourUTC"].unique()), 146 | } 147 | 148 | return records, metadata 149 | 150 | def _extract_records_from_api_url(url: str, export_start: datetime.datetime, export_end: datetime.datetime): 151 | """Extracts records from the official API based on the given export window.""" 152 | 153 | query_params = { 154 | "offset": 0, 155 | "sort": "HourUTC", 156 | "timezone": "utc", 157 | "start": export_start.strftime("%Y-%m-%dT%H:%M"), 158 | "end": export_end.strftime("%Y-%m-%dT%H:%M"), 159 | } 160 | url = URL(url) % query_params 161 | url = str(url) 162 | logger.info(f"Requesting data from API with URL: {url}") 163 | response = requests.get(url) 164 | logger.info(f"Response received from API with status code: {response.status_code} ") 165 | 166 | # Parse API response. 167 | try: 168 | response = response.json() 169 | except JSONDecodeError: 170 | logger.error( 171 | f"Response status = {response.status_code}. Could not decode response from API with URL: {url}" 172 | ) 173 | 174 | return None 175 | 176 | records = response["records"] 177 | records = pd.DataFrame.from_records(records) 178 | 179 | return records 180 | 181 | def _compute_extraction_window(export_end_reference_datetime: datetime.datetime, days_delay: int, days_export: int) -> Tuple[datetime.datetime, datetime.datetime]: 182 | """Compute the extraction window relative to 'export_end_reference_datetime' and take into consideration the maximum and minimum data points available in the dataset.""" 183 | 184 | if export_end_reference_datetime is None: 185 | # As the dataset will expire in July 2023, we set the export end reference datetime to the last day of June 2023 + the delay. 186 | export_end_reference_datetime = datetime.datetime( 187 | 2023, 6, 30, 21, 0, 0 188 | ) + datetime.timedelta(days=days_delay) 189 | export_end_reference_datetime = export_end_reference_datetime.replace( 190 | minute=0, second=0, microsecond=0 191 | ) 192 | else: 193 | export_end_reference_datetime = export_end_reference_datetime.replace( 194 | minute=0, second=0, microsecond=0 195 | ) 196 | 197 | # TODO: Change the API source, until then we have to clamp the export_end_reference_datetime to the last day of June 2023 to simulate the same behavior. 198 | expiring_dataset_datetime = datetime.datetime(2023, 6, 30, 21, 0, 0) + datetime.timedelta( 199 | days=days_delay 200 | ) 201 | if export_end_reference_datetime > expiring_dataset_datetime: 202 | export_end_reference_datetime = expiring_dataset_datetime 203 | 204 | logger.warning( 205 | "We clapped 'export_end_reference_datetime' to 'datetime(2023, 6, 30) + datetime.timedelta(days=days_delay)' as \ 206 | the dataset will not be updated starting from July 2023. 
The dataset will expire during 2023. \ 207 | Check out the following link for more information: https://www.energidataservice.dk/tso-electricity/ConsumptionDE35Hour" 208 | ) 209 | 210 | export_end = export_end_reference_datetime - datetime.timedelta(days=days_delay) 211 | export_start = export_end_reference_datetime - datetime.timedelta( 212 | days=days_delay + days_export 213 | ) 214 | 215 | min_export_start = datetime.datetime(2020, 6, 30, 22, 0, 0) 216 | if export_start < min_export_start: 217 | export_start = min_export_start 218 | export_end = export_start + datetime.timedelta(days=days_export) 219 | 220 | logger.warning( 221 | "We clamped 'export_start' to 'datetime(2020, 6, 30, 22, 0, 0)' and 'export_end' to 'export_start + datetime.timedelta(days=days_export)' as this is the earliest window available in the dataset." 222 | ) 223 | 224 | return export_start, export_end 225 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/etl/load.py: -------------------------------------------------------------------------------- 1 | import hopsworks 2 | import pandas as pd 3 | from great_expectations.core import ExpectationSuite 4 | from hsfs.feature_group import FeatureGroup 5 | 6 | from feature_pipeline.settings import SETTINGS 7 | 8 | 9 | def to_feature_store( 10 | data: pd.DataFrame, 11 | validation_expectation_suite: ExpectationSuite, 12 | feature_group_version: int, 13 | ) -> FeatureGroup: 14 | """ 15 | This function takes in a pandas DataFrame and a validation expectation suite, 16 | performs validation on the data using the suite, and then saves the data to a 17 | feature group in the feature store. 18 | """ 19 | 20 | # Connect to feature store. 21 | project = hopsworks.login( 22 | api_key_value=SETTINGS["FS_API_KEY"], project=SETTINGS["FS_PROJECT_NAME"] 23 | ) 24 | feature_store = project.get_feature_store() 25 | 26 | # Create feature group. 27 | energy_feature_group = feature_store.get_or_create_feature_group( 28 | name="energy_consumption_denmark", 29 | version=feature_group_version, 30 | description="Denmark hourly energy consumption data. Data is uploaded with a 15-day delay.", 31 | primary_key=["area", "consumer_type"], 32 | event_time="datetime_utc", 33 | online_enabled=False, 34 | expectation_suite=validation_expectation_suite, 35 | ) 36 | # Upload data. 37 | energy_feature_group.insert( 38 | features=data, 39 | overwrite=False, 40 | write_options={ 41 | "wait_for_job": True, 42 | }, 43 | ) 44 | 45 | # Add feature descriptions. 46 | feature_descriptions = [ 47 | { 48 | "name": "datetime_utc", 49 | "description": """ 50 | Datetime interval in UTC when the data was observed. 51 | """, 52 | "validation_rules": "Always full hours, i.e. minutes are 00", 53 | }, 54 | { 55 | "name": "area", 56 | "description": """ 57 | Denmark is divided into two price areas, separated by the Great Belt: DK1 and DK2. 58 | If price area is “DK”, the data covers all Denmark. 59 | """, 60 | "validation_rules": "0 (DK), 1 (DK1) or 2 (DK2) (int)", 61 | }, 62 | { 63 | "name": "consumer_type", 64 | "description": """ 65 | The consumer type is the Industry Code DE35 which is owned by Danish Energy. 66 | The code is used by Danish energy companies.
67 | """, 68 | "validation_rules": ">0 (int)", 69 | }, 70 | { 71 | "name": "energy_consumption", 72 | "description": "Total electricity consumption in kWh.", 73 | "validation_rules": ">=0 (float)", 74 | }, 75 | ] 76 | for description in feature_descriptions: 77 | energy_feature_group.update_feature_description( 78 | description["name"], description["description"] 79 | ) 80 | 81 | # Update statistics. 82 | energy_feature_group.statistics_config = { 83 | "enabled": True, 84 | "histograms": True, 85 | "correlations": True, 86 | } 87 | energy_feature_group.update_statistics_config() 88 | energy_feature_group.compute_statistics() 89 | 90 | return energy_feature_group 91 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/etl/validation.py: -------------------------------------------------------------------------------- 1 | from great_expectations.core import ExpectationSuite, ExpectationConfiguration 2 | 3 | 4 | def build_expectation_suite() -> ExpectationSuite: 5 | """ 6 | Builder used to retrieve an instance of the validation expectation suite. 7 | """ 8 | 9 | expectation_suite_energy_consumption = ExpectationSuite( 10 | expectation_suite_name="energy_consumption_suite" 11 | ) 12 | 13 | # Columns. 14 | expectation_suite_energy_consumption.add_expectation( 15 | ExpectationConfiguration( 16 | expectation_type="expect_table_columns_to_match_ordered_list", 17 | kwargs={ 18 | "column_list": [ 19 | "datetime_utc", 20 | "area", 21 | "consumer_type", 22 | "energy_consumption", 23 | ] 24 | }, 25 | ) 26 | ) 27 | expectation_suite_energy_consumption.add_expectation( 28 | ExpectationConfiguration( 29 | expectation_type="expect_table_column_count_to_equal", kwargs={"value": 4} 30 | ) 31 | ) 32 | 33 | # Datetime UTC 34 | expectation_suite_energy_consumption.add_expectation( 35 | ExpectationConfiguration( 36 | expectation_type="expect_column_values_to_not_be_null", 37 | kwargs={"column": "datetime_utc"}, 38 | ) 39 | ) 40 | 41 | # Area 42 | expectation_suite_energy_consumption.add_expectation( 43 | ExpectationConfiguration( 44 | expectation_type="expect_column_distinct_values_to_be_in_set", 45 | kwargs={"column": "area", "value_set": (0, 1, 2)}, 46 | ) 47 | ) 48 | expectation_suite_energy_consumption.add_expectation( 49 | ExpectationConfiguration( 50 | expectation_type="expect_column_values_to_be_of_type", 51 | kwargs={"column": "area", "type_": "int8"}, 52 | ) 53 | ) 54 | 55 | # Consumer type 56 | expectation_suite_energy_consumption.add_expectation( 57 | ExpectationConfiguration( 58 | expectation_type="expect_column_distinct_values_to_be_in_set", 59 | kwargs={ 60 | "column": "consumer_type", 61 | "value_set": ( 62 | 111, 63 | 112, 64 | 119, 65 | 121, 66 | 122, 67 | 123, 68 | 130, 69 | 211, 70 | 212, 71 | 215, 72 | 220, 73 | 310, 74 | 320, 75 | 330, 76 | 340, 77 | 350, 78 | 360, 79 | 370, 80 | 381, 81 | 382, 82 | 390, 83 | 410, 84 | 421, 85 | 422, 86 | 431, 87 | 432, 88 | 433, 89 | 441, 90 | 442, 91 | 443, 92 | 444, 93 | 445, 94 | 446, 95 | 447, 96 | 450, 97 | 461, 98 | 462, 99 | 999, 100 | ), 101 | }, 102 | ) 103 | ) 104 | expectation_suite_energy_consumption.add_expectation( 105 | ExpectationConfiguration( 106 | expectation_type="expect_column_values_to_be_of_type", 107 | kwargs={"column": "consumer_type", "type_": "int32"}, 108 | ) 109 | ) 110 | 111 | # Energy consumption 112 | expectation_suite_energy_consumption.add_expectation( 113 | ExpectationConfiguration( 114 | expectation_type="expect_column_min_to_be_between", 115 | kwargs={ 116 | 
"column": "energy_consumption", 117 | "min_value": 0, 118 | "strict_min": False, 119 | }, 120 | ) 121 | ) 122 | expectation_suite_energy_consumption.add_expectation( 123 | ExpectationConfiguration( 124 | expectation_type="expect_column_values_to_be_of_type", 125 | kwargs={"column": "energy_consumption", "type_": "float64"}, 126 | ) 127 | ) 128 | expectation_suite_energy_consumption.add_expectation( 129 | ExpectationConfiguration( 130 | expectation_type="expect_column_values_to_not_be_null", 131 | kwargs={"column": "energy_consumption"}, 132 | ) 133 | ) 134 | 135 | return expectation_suite_energy_consumption 136 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/feature_view.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Optional 3 | 4 | import fire 5 | import hopsworks 6 | 7 | from feature_pipeline import utils 8 | from feature_pipeline import settings 9 | import hsfs 10 | 11 | 12 | logger = utils.get_logger(__name__) 13 | 14 | 15 | def create( 16 | feature_group_version: Optional[int] = None, 17 | start_datetime: Optional[datetime] = None, 18 | end_datetime: Optional[datetime] = None, 19 | ) -> dict: 20 | """Create a new feature view version and training dataset 21 | based on the given feature group version and start and end datetimes. 22 | 23 | Args: 24 | feature_group_version (Optional[int]): The version of the 25 | feature group. If None is provided, it will try to load it 26 | from the cached feature_pipeline_metadata.json file. 27 | start_datetime (Optional[datetime]): The start 28 | datetime of the training dataset that will be created. 29 | If None is provided, it will try to load it 30 | from the cached feature_pipeline_metadata.json file. 31 | end_datetime (Optional[datetime]): The end 32 | datetime of the training dataset that will be created. 33 | If None is provided, it will try to load it 34 | from the cached feature_pipeline_metadata.json file. 35 | 36 | Returns: 37 | dict: The feature group version. 38 | 39 | """ 40 | 41 | if feature_group_version is None: 42 | feature_pipeline_metadata = utils.load_json("feature_pipeline_metadata.json") 43 | feature_group_version = feature_pipeline_metadata["feature_group_version"] 44 | 45 | if start_datetime is None or end_datetime is None: 46 | feature_pipeline_metadata = utils.load_json("feature_pipeline_metadata.json") 47 | start_datetime = datetime.strptime( 48 | feature_pipeline_metadata["export_datetime_utc_start"], 49 | feature_pipeline_metadata["datetime_format"], 50 | ) 51 | end_datetime = datetime.strptime( 52 | feature_pipeline_metadata["export_datetime_utc_end"], 53 | feature_pipeline_metadata["datetime_format"], 54 | ) 55 | 56 | project = hopsworks.login( 57 | api_key_value=settings.SETTINGS["FS_API_KEY"], 58 | project=settings.SETTINGS["FS_PROJECT_NAME"], 59 | ) 60 | fs = project.get_feature_store() 61 | 62 | # Delete old feature views as the free tier only allows 100 feature views. 63 | # NOTE: Normally you would not want to delete feature views. We do it here just to stay in the free tier. 
64 | try: 65 | feature_views = fs.get_feature_views(name="energy_consumption_denmark_view") 66 | except hsfs.client.exceptions.RestAPIError: 67 | logger.info("No feature views found for energy_consumption_denmark_view.") 68 | 69 | feature_views = [] 70 | 71 | for feature_view in feature_views: 72 | try: 73 | feature_view.delete_all_training_datasets() 74 | except hsfs.client.exceptions.RestAPIError: 75 | logger.error( 76 | f"Failed to delete training datasets for feature view {feature_view.name} with version {feature_view.version}." 77 | ) 78 | 79 | try: 80 | feature_view.delete() 81 | except hsfs.client.exceptions.RestAPIError: 82 | logger.error( 83 | f"Failed to delete feature view {feature_view.name} with version {feature_view.version}." 84 | ) 85 | 86 | # Create feature view in the given feature group version. 87 | energy_consumption_fg = fs.get_feature_group( 88 | "energy_consumption_denmark", version=feature_group_version 89 | ) 90 | ds_query = energy_consumption_fg.select_all() 91 | feature_view = fs.create_feature_view( 92 | name="energy_consumption_denmark_view", 93 | description="Energy consumption for Denmark forecasting model.", 94 | query=ds_query, 95 | labels=[], 96 | ) 97 | 98 | # Create training dataset. 99 | logger.info( 100 | f"Creating training dataset between {start_datetime} and {end_datetime}." 101 | ) 102 | feature_view.create_training_data( 103 | description="Energy consumption training dataset", 104 | data_format="csv", 105 | start_time=start_datetime, 106 | end_time=end_datetime, 107 | write_options={"wait_for_job": True}, 108 | coalesce=False, 109 | ) 110 | 111 | # Save metadata. 112 | metadata = { 113 | "feature_view_version": feature_view.version, 114 | "training_dataset_version": 1, 115 | } 116 | utils.save_json( 117 | metadata, 118 | file_name="feature_view_metadata.json", 119 | ) 120 | 121 | return metadata 122 | 123 | 124 | if __name__ == "__main__": 125 | fire.Fire(create) 126 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import Optional 3 | import fire 4 | import pandas as pd 5 | 6 | from feature_pipeline.etl import cleaning, load, extract, validation 7 | from feature_pipeline import utils 8 | 9 | logger = utils.get_logger(__name__) 10 | 11 | 12 | def run( 13 | export_end_reference_datetime: Optional[datetime.datetime] = None, 14 | days_delay: int = 15, 15 | days_export: int = 30, 16 | url: str = "https://drive.google.com/uc?export=download&id=1y48YeDymLurOTUO-GeFOUXVNc9MCApG5", 17 | feature_group_version: int = 1, 18 | ) -> dict: 19 | """ 20 | Extract data from the API, transform it, and load it to the feature store. 21 | 22 | As the official API expired in July 2023, we will use a copy of the data to simulate the same behavior. 23 | We made a copy of the data between '2020-06-30 22:00' and '2023-06-30 21:00'. Thus, there are 3 years of data to play with. 24 | 25 | Here is the link to the official obsolete dataset: https://www.energidataservice.dk/tso-electricity/ConsumptionDE35Hour 26 | Here is the link to the copy of the dataset: https://drive.google.com/file/d/1y48YeDymLurOTUO-GeFOUXVNc9MCApG5/view?usp=drive_link 27 | 28 | Args: 29 | export_end_reference_datetime: The end reference datetime of the export window. If None, the current time is used. 30 | Because the data is always delayed with "days_delay" days, this date is used only as a reference point. 
31 | The real extracted window will be computed as [export_end_reference_datetime - days_delay - days_export, export_end_reference_datetime - days_delay]. 32 | days_delay: Data has a delay of N days. Thus, we have to shift our window with N days. 33 | days_export: The number of days to export. 34 | url: The URL of the API or of the copy of the data stored on GitHub. 35 | feature_group_version: The version of the feature store feature group to save the data to. 36 | 37 | Returns: 38 | A dictionary containing metadata of the pipeline. 39 | """ 40 | 41 | logger.info(f"Extracting data from API.") 42 | data, metadata = extract.from_file( 43 | export_end_reference_datetime, days_delay, days_export, url 44 | ) 45 | if metadata["num_unique_samples_per_time_series"] < days_export * 24: 46 | raise RuntimeError( 47 | f"Could not extract the expected number of samples from the api: {metadata['num_unique_samples_per_time_series']} < {days_export * 24}. \ 48 | Check out the API at: https://www.energidataservice.dk/tso-electricity/ConsumptionDE35Hour " 49 | ) 50 | logger.info("Successfully extracted data from API.") 51 | 52 | logger.info(f"Transforming data.") 53 | data = transform(data) 54 | logger.info("Successfully transformed data.") 55 | 56 | logger.info("Building validation expectation suite.") 57 | validation_expectation_suite = validation.build_expectation_suite() 58 | logger.info("Successfully built validation expectation suite.") 59 | 60 | logger.info(f"Validating data and loading it to the feature store.") 61 | load.to_feature_store( 62 | data, 63 | validation_expectation_suite=validation_expectation_suite, 64 | feature_group_version=feature_group_version, 65 | ) 66 | metadata["feature_group_version"] = feature_group_version 67 | logger.info("Successfully validated data and loaded it to the feature store.") 68 | 69 | logger.info(f"Wrapping up the pipeline.") 70 | utils.save_json(metadata, file_name="feature_pipeline_metadata.json") 71 | logger.info("Done!") 72 | 73 | return metadata 74 | 75 | 76 | def transform(data: pd.DataFrame): 77 | """ 78 | Wrapper containing all the transformations from the ETL pipeline. 79 | """ 80 | 81 | data = cleaning.rename_columns(data) 82 | data = cleaning.cast_columns(data) 83 | data = cleaning.encode_area_column(data) 84 | 85 | return data 86 | 87 | 88 | if __name__ == "__main__": 89 | fire.Fire(run) 90 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from typing import Union 4 | 5 | from dotenv import load_dotenv 6 | 7 | 8 | def load_env_vars(root_dir: Union[str, Path]) -> dict: 9 | """ 10 | Load environment variables from .env.default and .env files. 11 | 12 | Args: 13 | root_dir: Root directory of the .env files. 14 | 15 | Returns: 16 | Dictionary with the environment variables. 17 | """ 18 | 19 | if isinstance(root_dir, str): 20 | root_dir = Path(root_dir) 21 | 22 | load_dotenv(dotenv_path=root_dir / ".env.default") 23 | load_dotenv(dotenv_path=root_dir / ".env", override=True) 24 | 25 | return dict(os.environ) 26 | 27 | 28 | def get_root_dir(default_value: str = ".") -> Path: 29 | """ 30 | Get the root directory of the project. 31 | 32 | Args: 33 | default_value: Default value to use if the environment variable is not set. 34 | 35 | Returns: 36 | Path to the root directory of the project. 
37 | """ 38 | 39 | return Path(os.getenv("ML_PIPELINE_ROOT_DIR", default_value)) 40 | 41 | 42 | ML_PIPELINE_ROOT_DIR = get_root_dir() 43 | OUTPUT_DIR = ML_PIPELINE_ROOT_DIR / "output" 44 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 45 | 46 | SETTINGS = load_env_vars(root_dir=ML_PIPELINE_ROOT_DIR) 47 | -------------------------------------------------------------------------------- /feature-pipeline/feature_pipeline/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from pathlib import Path 4 | 5 | from feature_pipeline import settings 6 | 7 | 8 | def get_logger(name: str) -> logging.Logger: 9 | """ 10 | Template for getting a logger. 11 | 12 | Args: 13 | name: Name of the logger. 14 | 15 | Returns: Logger. 16 | """ 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | logger = logging.getLogger(name) 20 | 21 | return logger 22 | 23 | 24 | def save_json(data: dict, file_name: str, save_dir: str = settings.OUTPUT_DIR): 25 | """ 26 | Save a dictionary as a JSON file. 27 | 28 | Args: 29 | data: data to save. 30 | file_name: Name of the JSON file. 31 | save_dir: Directory to save the JSON file. 32 | 33 | Returns: None 34 | """ 35 | 36 | data_path = Path(save_dir) / file_name 37 | with open(data_path, "w") as f: 38 | json.dump(data, f) 39 | 40 | 41 | def load_json(file_name: str, save_dir: str = settings.OUTPUT_DIR) -> dict: 42 | """ 43 | Load a JSON file. 44 | 45 | Args: 46 | file_name: Name of the JSON file. 47 | save_dir: Directory of the JSON file. 48 | 49 | Returns: Dictionary with the data. 50 | """ 51 | 52 | data_path = Path(save_dir) / file_name 53 | if not data_path.exists(): 54 | raise FileNotFoundError(f"Cached JSON from {data_path} does not exist.") 55 | 56 | with open(data_path, "r") as f: 57 | return json.load(f) 58 | -------------------------------------------------------------------------------- /feature-pipeline/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "feature-pipeline" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Iusztin Paul ", "Kurtis Pykes "] 6 | readme = "README.md" 7 | packages = [{include = "feature_pipeline"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = "~3.9" 11 | hopsworks = "3.4.3" 12 | fire = "^0.5.0" 13 | yarl = "^1.8.2" 14 | pandas = ">=1.3.5" 15 | requests = "^2.28.2" 16 | python-dotenv = ">=0.21.1" 17 | 18 | [tool.poetry.group.dev.dependencies] 19 | black = "^23.1.0" 20 | 21 | [build-system] 22 | requires = ["poetry-core"] 23 | build-backend = "poetry.core.masonry.api" 24 | -------------------------------------------------------------------------------- /images/airflow_login_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/airflow_login_screenshot.png -------------------------------------------------------------------------------- /images/airflow_ml_pipeline_dag_overview_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/airflow_ml_pipeline_dag_overview_screenshot.png -------------------------------------------------------------------------------- /images/airflow_ml_pipeline_dag_screenshot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/airflow_ml_pipeline_dag_screenshot.png -------------------------------------------------------------------------------- /images/airflow_variables_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/airflow_variables_screenshot.png -------------------------------------------------------------------------------- /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/architecture.png -------------------------------------------------------------------------------- /images/forecasting_demo_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/forecasting_demo_screenshot.png -------------------------------------------------------------------------------- /images/gcp_expose_ports_firewall_rule_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/gcp_expose_ports_firewall_rule_screenshot.png -------------------------------------------------------------------------------- /images/gcp_gcs_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/gcp_gcs_screenshot.png -------------------------------------------------------------------------------- /images/gcp_iap_for_tcp_firewall_rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/gcp_iap_for_tcp_firewall_rule.png -------------------------------------------------------------------------------- /images/gcp_ssh_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/gcp_ssh_screenshot.png -------------------------------------------------------------------------------- /images/gcp_vm_external_ip_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/gcp_vm_external_ip_screenshot.png -------------------------------------------------------------------------------- /images/github_actions_secrets_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/github_actions_secrets_screenshot.png -------------------------------------------------------------------------------- /images/github_actions_see_cicd_screenshot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/github_actions_see_cicd_screenshot.png -------------------------------------------------------------------------------- /images/github_actions_variables_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/github_actions_variables_screenshot.png -------------------------------------------------------------------------------- /images/gmail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/gmail.png -------------------------------------------------------------------------------- /images/linkedin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/linkedin.png -------------------------------------------------------------------------------- /images/medium.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/medium.png -------------------------------------------------------------------------------- /images/screenshot_introduction_video.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/screenshot_introduction_video.png -------------------------------------------------------------------------------- /images/substack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/substack.png -------------------------------------------------------------------------------- /images/twitter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/images/twitter.png -------------------------------------------------------------------------------- /scripts/install_poetry_macos_m1_chip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Fetch the utils.sh script from a URL and source it 4 | UTILS_SCRIPT=$(curl -s https://raw.githubusercontent.com/gao-hongnan/common-utils/main/scripts/utils.sh) 5 | source /dev/stdin <<<"$UTILS_SCRIPT" 6 | logger "INFO" "Fetched the utils.sh script from a URL and sourced it" 7 | 8 | resolve_hopswork() { 9 | # Check if librdkafka is installed and at the correct version 10 | installed_versions=$(brew list --versions librdkafka) 11 | required_version="1.9.2" # replace this with the version you want 12 | 13 | if ! 
echo "$installed_versions" | grep -q "$required_version"; then 14 | # If librdkafka is not installed or not at the correct version, proceed with installation 15 | 16 | # see https://community.hopsworks.ai/t/ssl-handshake-failed-on-macos-hopsworks-serverless/886/3 17 | curl -O https://raw.githubusercontent.com/Homebrew/homebrew-core/f7d0f40bbc4075177ecf16812fd95951a723a996/Formula/librdkafka.rb 18 | brew install --build-from-source librdkafka.rb 19 | rm librdkafka.rb 20 | else 21 | logger "INFO" "librdkafka is already installed at version: $(brew list --versions librdkafka)" 22 | fi 23 | 24 | # Set VERSION to the required version, assuming it is now installed 25 | VERSION=$required_version 26 | # use below if the librdkafka version is fixed 27 | # VERSION=$(ls /opt/homebrew/Cellar/librdkafka | tail -n 1) 28 | 29 | # Export necessary environment variables 30 | export C_INCLUDE_PATH=/opt/homebrew/Cellar/librdkafka/$VERSION/include 31 | export LIBRARY_PATH=/opt/homebrew/Cellar/librdkafka/$VERSION/lib 32 | } 33 | 34 | resolve_lightgbm() { 35 | # see https://stackoverflow.com/questions/74566704/cannot-install-lightgbm-3-3-3-on-apple-silicon 36 | # Check if cmake is installed 37 | if ! brew list --versions cmake >/dev/null; then 38 | # If cmake is not installed, install it 39 | brew install cmake 40 | else 41 | logger "INFO" "cmake is already installed at version: $(cmake --version)" 42 | fi 43 | 44 | # Check if libomp is installed 45 | if ! brew list --versions libomp >/dev/null; then 46 | # If libomp is not installed, install it 47 | brew install libomp 48 | else 49 | logger "INFO" "libomp is already installed at version: $(brew list --versions libomp)" 50 | fi 51 | } 52 | 53 | 54 | custom_install_hopswork_and_lightgbm_if_arm64() { 55 | # Check if on macOS with M1 or ARM chip 56 | if [[ "$(uname -m)" == "arm64" ]]; then 57 | logger "INFO" "Installing librdkafka for M1 chip" 58 | resolve_hopswork 59 | resolve_lightgbm 60 | fi 61 | } 62 | 63 | custom_install_hopswork_and_lightgbm_if_arm64 -------------------------------------------------------------------------------- /training-pipeline/.env.default: -------------------------------------------------------------------------------- 1 | FS_API_KEY = "" 2 | FS_PROJECT_NAME = "" 3 | WANDB_API_KEY = "" 4 | WANDB_ENTITY = "teaching-mlops" 5 | WANDB_PROJECT = "energy_consumption" 6 | GOOGLE_CLOUD_PROJECT = "energy_consumption" 7 | GOOGLE_CLOUD_BUCKET_NAME = "hourly-batch-predictions" 8 | GOOGLE_CLOUD_SERVICE_ACCOUNT_JSON_PATH = "path/to/your/service-account.json" 9 | -------------------------------------------------------------------------------- /training-pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Training Pipeline 2 | 3 | Check out [Lesson 2](https://medium.com/towards-data-science/a-guide-to-building-effective-training-pipelines-for-maximum-results-6fdaef594cee) on Medium to better understand how we built the training pipeline. 4 | 5 | ## Install for Development 6 | 7 | Create virtual environment: 8 | ```shell 9 | cd training-pipeline 10 | poetry shell 11 | poetry install 12 | ``` 13 | 14 | Check the [Set Up Additional Tools](https://github.com/iusztinpaul/energy-forecasting#-set-up-additional-tools-) and [Usage](https://github.com/iusztinpaul/energy-forecasting#usage) sections to see **how to set up** the **additional tools** and **credentials** you need to run this project. 15 | 16 | 17 | ## Usage for Development 18 | 19 |
**Run the scripts in the following order:**

20 | 21 | 22 | 1. Start the hyperparameter tuning script: 23 | ```shell 24 | python -m training_pipeline.hyperparameter_tuning 25 | ``` 26 | 27 | 2. Upload the best config based on the previous hyperparameter tuning step: 28 | ```shell 29 | python -m training_pipeline.best_config 30 | ``` 31 | 3. Start the training script using the best configuration uploaded in the previous step: 32 | ```shell 33 | python -m training_pipeline.train 34 | ``` 35 | 36 | **NOTE:** Be careful to complete the `.env` file and set the `ML_PIPELINE_ROOT_DIR` variable as explained in the [Set Up the ML_PIPELINE_ROOT_DIR Variable](https://github.com/iusztinpaul/energy-forecasting#set-up-the-ml_pipeline_root_dir-variable) section of the main README. 37 | -------------------------------------------------------------------------------- /training-pipeline/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "training-pipeline" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Iusztin Paul "] 6 | readme = "README.md" 7 | packages = [{include = "training_pipeline"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = "~3.9" 11 | pyarrow = "^11.0.0" 12 | wandb = "^0.14.0" 13 | matplotlib = "^3.7.1" 14 | hopsworks = "3.4.3" 15 | python-dotenv = "^1.0.0" 16 | lightgbm = "^3.3.5" 17 | sktime = "^0.16.1" 18 | seaborn = "^0.12.2" 19 | fire = "^0.5.0" 20 | Jinja2 = "3.0.1" 21 | 22 | [tool.poetry.group.dev.dependencies] 23 | black = "^23.1.0" 24 | 25 | [build-system] 26 | requires = ["poetry-core"] 27 | build-backend = "poetry.core.masonry.api" 28 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iusztinpaul/energy-forecasting/78bd9f503f9fa1546802b82eb51a74f78deadf0e/training-pipeline/training_pipeline/__init__.py -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/best_config.py: -------------------------------------------------------------------------------- 1 | import json 2 | import fire 3 | import wandb 4 | 5 | from typing import Optional 6 | 7 | from training_pipeline import utils 8 | from training_pipeline.settings import SETTINGS, OUTPUT_DIR 9 | 10 | logger = utils.get_logger(__name__) 11 | 12 | 13 | """ 14 | NOTE: We moved the logic that logs the best model to a separate process because of a bug in W&B sweeps: 15 | any new run created right after a sweep overwrites the sweep's last run. 16 | This would overwrite the wrong run and make us load the wrong config. 17 | """ 18 | 19 | 20 | def upload(sweep_id: Optional[str] = None): 21 | """Upload the best config from the given sweep to the "best_experiment" wandb Artifact. 22 | 23 | Args: 24 | sweep_id (Optional[str], optional): Sweep ID to look for the best config. If None, it will look for the last sweep in the cached last_sweep_metadata.json file. Defaults to None.
25 | """ 26 | 27 | if sweep_id is None: 28 | last_sweep_metadata = utils.load_json("last_sweep_metadata.json") 29 | sweep_id = last_sweep_metadata["sweep_id"] 30 | 31 | logger.info(f"Loading sweep_id from last_sweep_metadata.json with {sweep_id=}") 32 | 33 | api = wandb.Api() 34 | sweep = api.sweep( 35 | f"{SETTINGS['WANDB_ENTITY']}/{SETTINGS['WANDB_PROJECT']}/{sweep_id}" 36 | ) 37 | best_run = sweep.best_run() 38 | 39 | with utils.init_wandb_run( 40 | name="best_experiment", 41 | job_type="hpo", 42 | group="train", 43 | run_id=best_run.id, 44 | resume="must", 45 | ) as run: 46 | run.use_artifact("config:latest") 47 | 48 | best_config = dict(run.config) 49 | 50 | logger.info(f"Best run {best_run.name}") 51 | logger.info("Best run config:") 52 | logger.info(best_config) 53 | logger.info( 54 | f"Best run = {best_run.name} with results {dict(run.summary['validation'])}" 55 | ) 56 | 57 | config_path = OUTPUT_DIR / "best_config.json" 58 | with open(config_path, "w") as f: 59 | json.dump(best_config, f, indent=4) 60 | 61 | artifact = wandb.Artifact( 62 | name="best_config", 63 | type="model", 64 | metadata={"results": {"validation": dict(run.summary["validation"])}}, 65 | ) 66 | artifact.add_file(str(config_path)) 67 | run.log_artifact(artifact) 68 | 69 | run.finish() 70 | 71 | 72 | if __name__ == "__main__": 73 | fire.Fire(upload) 74 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from training_pipeline.configs import gridsearch 2 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/configs/gridsearch.py: -------------------------------------------------------------------------------- 1 | # NOTE: In a production environment, we would move this to a YAML file and load it from there. 2 | # Also, we would use random or bayesian search + early stopping to speed up the process. 
3 | sweep_configs = { 4 | "method": "grid", 5 | "metric": {"name": "validation.MAPE", "goal": "minimize"}, 6 | "parameters": { 7 | "forecaster__estimator__n_jobs": {"values": [-1]}, 8 | "forecaster__estimator__n_estimators": {"values": [1000, 2000, 2500]}, 9 | "forecaster__estimator__learning_rate": {"values": [0.1, 0.15]}, 10 | "forecaster__estimator__max_depth": {"values": [-1, 5]}, 11 | "forecaster__estimator__reg_lambda": {"values": [0, 0.01, 0.015]}, 12 | "daily_season__manual_selection": {"values": [["day_of_week", "hour_of_day"]]}, 13 | "forecaster_transformers__window_summarizer__lag_feature__lag": { 14 | "values": [list(range(1, 73))] 15 | }, 16 | "forecaster_transformers__window_summarizer__lag_feature__mean": { 17 | "values": [[[1, 24], [1, 48], [1, 72]]] 18 | }, 19 | "forecaster_transformers__window_summarizer__lag_feature__std": { 20 | "values": [[[1, 24], [1, 48]]] 21 | }, 22 | "forecaster_transformers__window_summarizer__n_jobs": {"values": [1]}, 23 | }, 24 | } 25 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/data.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | import hopsworks 3 | import pandas as pd 4 | import wandb 5 | 6 | from sktime.forecasting.model_selection import temporal_train_test_split 7 | 8 | from training_pipeline.utils import init_wandb_run 9 | from training_pipeline.settings import SETTINGS 10 | 11 | 12 | def load_dataset_from_feature_store( 13 | feature_view_version: int, training_dataset_version: int, fh: int = 24 14 | ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: 15 | """Load features from feature store. 16 | 17 | Args: 18 | feature_view_version (int): feature store feature view version to load data from 19 | training_dataset_version (int): feature store training dataset version to load data from 20 | fh (int, optional): Forecast horizon. Defaults to 24. 21 | 22 | Returns: 23 | Train and test splits loaded from the feature store as pandas dataframes. 
24 | """ 25 | 26 | project = hopsworks.login( 27 | api_key_value=SETTINGS["FS_API_KEY"], project=SETTINGS["FS_PROJECT_NAME"] 28 | ) 29 | fs = project.get_feature_store() 30 | 31 | with init_wandb_run( 32 | name="load_training_data", job_type="load_feature_view", group="dataset" 33 | ) as run: 34 | feature_view = fs.get_feature_view( 35 | name="energy_consumption_denmark_view", version=feature_view_version 36 | ) 37 | data, _ = feature_view.get_training_data( 38 | training_dataset_version=training_dataset_version 39 | ) 40 | 41 | fv_metadata = feature_view.to_dict() 42 | fv_metadata["query"] = fv_metadata["query"].to_string() 43 | fv_metadata["features"] = [f.name for f in fv_metadata["features"]] 44 | fv_metadata["link"] = feature_view._feature_view_engine._get_feature_view_url( 45 | feature_view 46 | ) 47 | fv_metadata["feature_view_version"] = feature_view_version 48 | fv_metadata["training_dataset_version"] = training_dataset_version 49 | 50 | raw_data_at = wandb.Artifact( 51 | name="energy_consumption_denmark_feature_view", 52 | type="feature_view", 53 | metadata=fv_metadata, 54 | ) 55 | run.log_artifact(raw_data_at) 56 | 57 | run.finish() 58 | 59 | with init_wandb_run( 60 | name="train_test_split", job_type="prepare_dataset", group="dataset" 61 | ) as run: 62 | run.use_artifact("energy_consumption_denmark_feature_view:latest") 63 | 64 | y_train, y_test, X_train, X_test = prepare_data(data, fh=fh) 65 | 66 | for split in ["train", "test"]: 67 | split_X = locals()[f"X_{split}"] 68 | split_y = locals()[f"y_{split}"] 69 | 70 | split_metadata = { 71 | "timespan": [ 72 | split_X.index.get_level_values(-1).min(), 73 | split_X.index.get_level_values(-1).max(), 74 | ], 75 | "dataset_size": len(split_X), 76 | "num_areas": len(split_X.index.get_level_values(0).unique()), 77 | "num_consumer_types": len(split_X.index.get_level_values(1).unique()), 78 | "y_features": split_y.columns.tolist(), 79 | "X_features": split_X.columns.tolist(), 80 | } 81 | artifact = wandb.Artifact( 82 | name=f"split_{split}", 83 | type="split", 84 | metadata=split_metadata, 85 | ) 86 | run.log_artifact(artifact) 87 | 88 | run.finish() 89 | 90 | return y_train, y_test, X_train, X_test 91 | 92 | 93 | def prepare_data( 94 | data: pd.DataFrame, target: str = "energy_consumption", fh: int = 24 95 | ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: 96 | """ 97 | Structure the data for training: 98 | - Set the index as is required by sktime. 99 | - Prepare exogenous variables. 100 | - Prepare the time series to be forecasted. 101 | - Split the data into train and test sets. 102 | """ 103 | 104 | # Set the index as is required by sktime. 105 | data["datetime_utc"] = pd.PeriodIndex(data["datetime_utc"], freq="H") 106 | data = data.set_index(["area", "consumer_type", "datetime_utc"]).sort_index() 107 | 108 | # Prepare exogenous variables. 109 | X = data.drop(columns=[target]) 110 | # Prepare the time series to be forecasted. 
111 | y = data[[target]] 112 | 113 | y_train, y_test, X_train, X_test = temporal_train_test_split(y, X, test_size=fh) 114 | 115 | return y_train, y_test, X_train, X_test 116 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/hyperparameter_tuning.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from typing import Optional 3 | 4 | import fire 5 | import numpy as np 6 | import pandas as pd 7 | import wandb 8 | 9 | from matplotlib import pyplot as plt 10 | from sktime.forecasting.model_evaluation import evaluate as cv_evaluate 11 | from sktime.forecasting.model_selection import ExpandingWindowSplitter 12 | from sktime.performance_metrics.forecasting import MeanAbsolutePercentageError 13 | from sktime.utils.plotting import plot_windows 14 | 15 | from training_pipeline import utils 16 | from training_pipeline.configs import gridsearch as gridsearch_configs 17 | from training_pipeline.data import load_dataset_from_feature_store 18 | from training_pipeline.models import build_model 19 | from training_pipeline.utils import init_wandb_run 20 | from training_pipeline.settings import SETTINGS, OUTPUT_DIR 21 | 22 | 23 | logger = utils.get_logger(__name__) 24 | 25 | 26 | def run( 27 | fh: int = 24, 28 | feature_view_version: Optional[int] = None, 29 | training_dataset_version: Optional[int] = None, 30 | ) -> dict: 31 | """Run hyperparameter optimization search. 32 | 33 | Args: 34 | fh (int, optional): Forecasting horizon. Defaults to 24. 35 | feature_view_version (Optional[int], optional): feature store - feature view version. 36 | If none, it will try to load the version from the cached feature_view_metadata.json file. Defaults to None. 37 | training_dataset_version (Optional[int], optional): feature store - feature view - training dataset version. 38 | If none, it will try to load the version from the cached feature_view_metadata.json file. Defaults to None. 39 | 40 | Returns: 41 | dict: Dictionary containing metadata about the hyperparameter optimization run. 
42 | """ 43 | 44 | feature_view_metadata = utils.load_json("feature_view_metadata.json") 45 | if feature_view_version is None: 46 | feature_view_version = feature_view_metadata["feature_view_version"] 47 | if training_dataset_version is None: 48 | training_dataset_version = feature_view_metadata["training_dataset_version"] 49 | 50 | y_train, _, X_train, _ = load_dataset_from_feature_store( 51 | feature_view_version=feature_view_version, 52 | training_dataset_version=training_dataset_version, 53 | fh=fh, 54 | ) 55 | 56 | sweep_id = run_hyperparameter_optimization(y_train, X_train, fh=fh) 57 | 58 | metadata = {"sweep_id": sweep_id} 59 | utils.save_json(metadata, file_name="last_sweep_metadata.json") 60 | 61 | return metadata 62 | 63 | 64 | def run_hyperparameter_optimization( 65 | y_train: pd.DataFrame, X_train: pd.DataFrame, fh: int 66 | ): 67 | """Runs hyperparameter optimization search using W&B sweeps.""" 68 | 69 | sweep_id = wandb.sweep( 70 | sweep=gridsearch_configs.sweep_configs, project=SETTINGS["WANDB_PROJECT"] 71 | ) 72 | 73 | wandb.agent( 74 | project=SETTINGS["WANDB_PROJECT"], 75 | sweep_id=sweep_id, 76 | function=partial(run_sweep, y_train=y_train, X_train=X_train, fh=fh), 77 | ) 78 | 79 | return sweep_id 80 | 81 | 82 | def run_sweep(y_train: pd.DataFrame, X_train: pd.DataFrame, fh: int): 83 | """Runs a single hyperparameter optimization step (train + CV eval) using W&B sweeps.""" 84 | 85 | with init_wandb_run( 86 | name="experiment", job_type="hpo", group="train", add_timestamp_to_name=True 87 | ) as run: 88 | run.use_artifact("split_train:latest") 89 | 90 | config = wandb.config 91 | config = dict(config) 92 | model = build_model(config) 93 | 94 | model, results = train_model_cv(model, y_train, X_train, fh=fh) 95 | wandb.log(results) 96 | 97 | metadata = { 98 | "experiment": {"name": run.name, "fh": fh}, 99 | "results": results, 100 | "config": config, 101 | } 102 | artifact = wandb.Artifact( 103 | name=f"config", 104 | type="model", 105 | metadata=metadata, 106 | ) 107 | run.log_artifact(artifact) 108 | 109 | run.finish() 110 | 111 | 112 | def train_model_cv( 113 | model, y_train: pd.DataFrame, X_train: pd.DataFrame, fh: int, k: int = 3 114 | ): 115 | """Train and evaluate the given model using cross-validation.""" 116 | 117 | data_length = len(y_train.index.get_level_values(-1).unique()) 118 | assert data_length >= fh * 10, "Not enough data to perform a 3 fold CV." 
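    # The splitter below advances by `data_length // k` samples per fold and starts from an
    # initial window of at least three forecasting horizons, so every expanding-window fold
    # has enough history to fit the model before being scored on the next `fh` hours.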
119 | 120 | cv_step_length = data_length // k 121 | initial_window = max(fh * 3, cv_step_length - fh) 122 | cv = ExpandingWindowSplitter( 123 | step_length=cv_step_length, fh=np.arange(fh) + 1, initial_window=initial_window 124 | ) 125 | render_cv_scheme(cv, y_train) 126 | 127 | results = cv_evaluate( 128 | forecaster=model, 129 | y=y_train, 130 | X=X_train, 131 | cv=cv, 132 | strategy="refit", 133 | scoring=MeanAbsolutePercentageError(symmetric=False), 134 | error_score="raise", 135 | return_data=False, 136 | ) 137 | 138 | results = results.rename( 139 | columns={ 140 | "test_MeanAbsolutePercentageError": "MAPE", 141 | "fit_time": "fit_time", 142 | "pred_time": "prediction_time", 143 | } 144 | ) 145 | mean_results = results[["MAPE", "fit_time", "prediction_time"]].mean(axis=0) 146 | mean_results = mean_results.to_dict() 147 | results = {"validation": mean_results} 148 | 149 | logger.info(f"Validation MAPE: {results['validation']['MAPE']:.2f}") 150 | logger.info(f"Mean fit time: {results['validation']['fit_time']:.2f} s") 151 | logger.info(f"Mean predict time: {results['validation']['prediction_time']:.2f} s") 152 | 153 | return model, results 154 | 155 | 156 | def render_cv_scheme(cv, y_train: pd.DataFrame) -> str: 157 | """Render the CV scheme used for training and log it to W&B.""" 158 | 159 | random_time_series = ( 160 | y_train.groupby(level=[0, 1]) 161 | .get_group((1, 111)) 162 | .reset_index(level=[0, 1], drop=True) 163 | ) 164 | plot_windows(cv, random_time_series) 165 | 166 | save_path = str(OUTPUT_DIR / "cv_scheme.png") 167 | plt.savefig(save_path) 168 | wandb.log({"cv_scheme": wandb.Image(save_path)}) 169 | 170 | return save_path 171 | 172 | 173 | if __name__ == "__main__": 174 | fire.Fire(run) 175 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/models.py: -------------------------------------------------------------------------------- 1 | import lightgbm as lgb 2 | 3 | from sktime.forecasting.compose import make_reduction, ForecastingPipeline 4 | from sktime.forecasting.naive import NaiveForecaster 5 | from sktime.transformations.series.date import DateTimeFeatures 6 | from sktime.transformations.series.summarize import WindowSummarizer 7 | 8 | from training_pipeline import transformers 9 | 10 | 11 | def build_model(config: dict): 12 | """ 13 | Build an Sktime model using the given config. 
14 | 15 | It supports defaults for windowing the following parameters: 16 | - lag: list(range(1, 72 + 1)) 17 | - mean: [[1, 24], [1, 48], [1, 72]] 18 | - std: [[1, 24], [1, 48], [1, 72]] 19 | """ 20 | 21 | lag = config.pop( 22 | "forecaster_transformers__window_summarizer__lag_feature__lag", 23 | list(range(1, 72 + 1)), 24 | ) 25 | mean = config.pop( 26 | "forecaster_transformers__window_summarizer__lag_feature__mean", 27 | [[1, 24], [1, 48], [1, 72]], 28 | ) 29 | std = config.pop( 30 | "forecaster_transformers__window_summarizer__lag_feature__std", 31 | [[1, 24], [1, 48], [1, 72]], 32 | ) 33 | n_jobs = config.pop("forecaster_transformers__window_summarizer__n_jobs", 1) 34 | window_summarizer = WindowSummarizer( 35 | **{"lag_feature": {"lag": lag, "mean": mean, "std": std}}, 36 | n_jobs=n_jobs, 37 | ) 38 | 39 | regressor = lgb.LGBMRegressor() 40 | forecaster = make_reduction( 41 | regressor, 42 | transformers=[window_summarizer], 43 | strategy="recursive", 44 | pooling="global", 45 | window_length=None, 46 | ) 47 | 48 | pipe = ForecastingPipeline( 49 | steps=[ 50 | ("attach_area_and_consumer_type", transformers.AttachAreaConsumerType()), 51 | ( 52 | "daily_season", 53 | DateTimeFeatures( 54 | manual_selection=["day_of_week", "hour_of_day"], 55 | keep_original_columns=True, 56 | ), 57 | ), 58 | ("forecaster", forecaster), 59 | ] 60 | ) 61 | pipe = pipe.set_params(**config) 62 | 63 | return pipe 64 | 65 | 66 | def build_baseline_model(seasonal_periodicity: int): 67 | """Builds a naive forecaster baseline model using Sktime that predicts the last value given a seasonal periodicity.""" 68 | 69 | return NaiveForecaster(sp=seasonal_periodicity) 70 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | import matplotlib 7 | from dotenv import load_dotenv 8 | 9 | 10 | warnings.filterwarnings(action="ignore", category=FutureWarning, module="sktime") 11 | matplotlib.use("Agg") 12 | 13 | 14 | def load_env_vars(root_dir: Union[str, Path]) -> dict: 15 | """ 16 | Load environment variables from .env.default and .env files. 17 | 18 | Args: 19 | root_dir: Root directory of the .env files. 20 | 21 | Returns: 22 | Dictionary with the environment variables. 23 | """ 24 | 25 | if isinstance(root_dir, str): 26 | root_dir = Path(root_dir) 27 | 28 | load_dotenv(dotenv_path=root_dir / ".env.default") 29 | load_dotenv(dotenv_path=root_dir / ".env", override=True) 30 | 31 | return dict(os.environ) 32 | 33 | 34 | def get_root_dir(default_value: str = ".") -> Path: 35 | """ 36 | Get the root directory of the project. 37 | 38 | Args: 39 | default_value: Default value to use if the environment variable is not set. 40 | 41 | Returns: 42 | Path to the root directory of the project. 
43 | """ 44 | 45 | return Path(os.getenv("ML_PIPELINE_ROOT_DIR", default_value)) 46 | 47 | 48 | ML_PIPELINE_ROOT_DIR = get_root_dir() 49 | OUTPUT_DIR = ML_PIPELINE_ROOT_DIR / "output" 50 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 51 | 52 | SETTINGS = load_env_vars(root_dir=ML_PIPELINE_ROOT_DIR) 53 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/train.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import OrderedDict 3 | import os 4 | from pathlib import Path 5 | from typing import OrderedDict as OrderedDictType, Optional, Tuple 6 | 7 | import fire 8 | import hopsworks 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import pandas as pd 12 | import wandb 13 | from sktime.performance_metrics.forecasting import ( 14 | mean_squared_percentage_error, 15 | mean_absolute_percentage_error, 16 | ) 17 | from sktime.utils.plotting import plot_series 18 | 19 | 20 | from training_pipeline import utils 21 | from training_pipeline.settings import SETTINGS, OUTPUT_DIR 22 | from training_pipeline.data import load_dataset_from_feature_store 23 | from training_pipeline.models import build_model, build_baseline_model 24 | 25 | 26 | logger = utils.get_logger(__name__) 27 | 28 | 29 | def from_best_config( 30 | fh: int = 24, 31 | feature_view_version: Optional[int] = None, 32 | training_dataset_version: Optional[int] = None, 33 | ) -> dict: 34 | """Train and evaluate on the test set the best model found in the hyperparameter optimization run. 35 | After training and evaluating it uploads the artifacts to wandb & hopsworks model registries. 36 | 37 | Args: 38 | fh (int, optional): Forecasting horizon. Defaults to 24. 39 | feature_view_version (Optional[int], optional): feature store - feature view version. 40 | If none, it will try to load the version from the cached feature_view_metadata.json file. Defaults to None. 41 | training_dataset_version (Optional[int], optional): feature store - feature view - training dataset version. 42 | If none, it will try to load the version from the cached feature_view_metadata.json file. Defaults to None. 43 | 44 | Returns: 45 | dict: Dictionary containing metadata about the training experiment. 46 | """ 47 | 48 | feature_view_metadata = utils.load_json("feature_view_metadata.json") 49 | if feature_view_version is None: 50 | feature_view_version = feature_view_metadata["feature_view_version"] 51 | if training_dataset_version is None: 52 | training_dataset_version = feature_view_metadata["training_dataset_version"] 53 | 54 | y_train, y_test, X_train, X_test = load_dataset_from_feature_store( 55 | feature_view_version=feature_view_version, 56 | training_dataset_version=training_dataset_version, 57 | fh=fh, 58 | ) 59 | 60 | training_start_datetime = y_train.index.get_level_values("datetime_utc").min() 61 | training_end_datetime = y_train.index.get_level_values("datetime_utc").max() 62 | testing_start_datetime = y_test.index.get_level_values("datetime_utc").min() 63 | testing_end_datetime = y_test.index.get_level_values("datetime_utc").max() 64 | logger.info( 65 | f"Training model on data from {training_start_datetime} to {training_end_datetime}." 66 | ) 67 | logger.info( 68 | f"Testing model on data from {testing_start_datetime} to {testing_end_datetime}." 69 | ) 70 | # Loading predictions from 2023-04-06 22:00:00 to 2023-04-07 21:00:00. 
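    # The W&B run below reuses the `split_train` / `split_test` artifacts and the
    # `best_config` artifact produced by the hyperparameter optimization step, then trains
    # and evaluates both a naive baseline and the best model configuration on the test set.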
71 | 
72 |     with utils.init_wandb_run(
73 |         name="best_model",
74 |         job_type="train_best_model",
75 |         group="train",
76 |         reinit=True,
77 |         add_timestamp_to_name=True,
78 |     ) as run:
79 |         run.use_artifact("split_train:latest")
80 |         run.use_artifact("split_test:latest")
81 |         # Load the best config from sweep.
82 |         best_config_artifact = run.use_artifact(
83 |             "best_config:latest",
84 |             type="model",
85 |         )
86 |         download_dir = best_config_artifact.download()
87 |         config_path = Path(download_dir) / "best_config.json"
88 |         with open(config_path) as f:
89 |             config = json.load(f)
90 |         # Log the config to the experiment.
91 |         run.config.update(config)
92 | 
93 |         # Baseline model.
94 |         baseline_forecaster = build_baseline_model(seasonal_periodicity=fh)
95 |         baseline_forecaster = train_model(baseline_forecaster, y_train, X_train, fh=fh)
96 |         _, metrics_baseline = evaluate(baseline_forecaster, y_test, X_test)
97 |         slices = metrics_baseline.pop("slices")
98 |         for k, v in metrics_baseline.items():
99 |             logger.info(f"Baseline test {k}: {v}")
100 |         wandb.log({"test": {"baseline": metrics_baseline}})
101 |         wandb.log({"test.baseline.slices": wandb.Table(dataframe=slices)})
102 | 
103 |         # Build & train best model.
104 |         best_model = build_model(config)
105 |         best_forecaster = train_model(best_model, y_train, X_train, fh=fh)
106 | 
107 |         # Evaluate best model.
108 |         y_pred, metrics = evaluate(best_forecaster, y_test, X_test)
109 |         slices = metrics.pop("slices")
110 |         for k, v in metrics.items():
111 |             logger.info(f"Model test {k}: {v}")
112 |         wandb.log({"test": {"model": metrics}})
113 |         wandb.log({"test.model.slices": wandb.Table(dataframe=slices)})
114 | 
115 |         # Render best model on the test set.
116 |         results = OrderedDict({"y_train": y_train, "y_test": y_test, "y_pred": y_pred})
117 |         render(results, prefix="images_test")
118 | 
119 |         # Update best model with the test set.
120 |         # NOTE: Method update() is not supported by LightGBM + Sktime. Instead we will retrain the model on the entire dataset.
121 |         # best_forecaster = best_forecaster.update(y_test, X=X_test)
122 |         best_forecaster = train_model(
123 |             model=best_forecaster,
124 |             y_train=pd.concat([y_train, y_test]).sort_index(),
125 |             X_train=pd.concat([X_train, X_test]).sort_index(),
126 |             fh=fh,
127 |         )
128 |         X_forecast = compute_forecast_exogenous_variables(X_test, fh)
129 |         y_forecast = forecast(best_forecaster, X_forecast)
130 |         logger.info(
131 |             f"Forecasted future values for rendering between {X_forecast.index.get_level_values('datetime_utc').min()} and {X_forecast.index.get_level_values('datetime_utc').max()}."
132 |         )
133 |         results = OrderedDict(
134 |             {
135 |                 "y_train": y_train,
136 |                 "y_test": y_test,
137 |                 "y_forecast": y_forecast,
138 |             }
139 |         )
140 |         # Render best model future forecasts.
141 |         render(results, prefix="images_forecast")
142 | 
143 |         # Save best model.
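        # The forecaster is pickled with joblib (utils.save_model), logged to W&B as a
        # versioned `best_model` artifact and, once the run finishes, registered in the
        # Hopsworks model registry via add_best_model_to_model_registry().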
144 | save_model_path = OUTPUT_DIR / "best_model.pkl" 145 | utils.save_model(best_forecaster, save_model_path) 146 | metadata = { 147 | "experiment": { 148 | "fh": fh, 149 | "feature_view_version": feature_view_version, 150 | "training_dataset_version": training_dataset_version, 151 | "training_start_datetime": training_start_datetime.to_timestamp().isoformat(), 152 | "training_end_datetime": training_end_datetime.to_timestamp().isoformat(), 153 | "testing_start_datetime": testing_start_datetime.to_timestamp().isoformat(), 154 | "testing_end_datetime": testing_end_datetime.to_timestamp().isoformat(), 155 | }, 156 | "results": {"test": metrics}, 157 | } 158 | artifact = wandb.Artifact(name="best_model", type="model", metadata=metadata) 159 | artifact.add_file(str(save_model_path)) 160 | run.log_artifact(artifact) 161 | 162 | run.finish() 163 | artifact.wait() 164 | 165 | model_version = add_best_model_to_model_registry(artifact) 166 | 167 | metadata = {"model_version": model_version} 168 | utils.save_json(metadata, file_name="train_metadata.json") 169 | 170 | return metadata 171 | 172 | 173 | def train_model(model, y_train: pd.DataFrame, X_train: pd.DataFrame, fh: int): 174 | """Train the forecaster on the given training set and forecast horizon.""" 175 | 176 | fh = np.arange(fh) + 1 177 | model.fit(y_train, X=X_train, fh=fh) 178 | 179 | return model 180 | 181 | 182 | def evaluate( 183 | forecaster, y_test: pd.DataFrame, X_test: pd.DataFrame 184 | ) -> Tuple[pd.DataFrame, dict]: 185 | """Evaluate the forecaster on the test set by computing the following metrics: 186 | - RMSPE 187 | - MAPE 188 | - Slices: RMSPE, MAPE 189 | 190 | Args: 191 | forecaster: model following the sklearn API 192 | y_test (pd.DataFrame): time series to forecast 193 | X_test (pd.DataFrame): exogenous variables 194 | 195 | Returns: 196 | The predictions as a pd.DataFrame and a dict of metrics. 197 | """ 198 | 199 | y_pred = forecaster.predict(X=X_test) 200 | 201 | # Compute aggregated metrics. 202 | results = dict() 203 | rmspe = mean_squared_percentage_error(y_test, y_pred, squared=False) 204 | results["RMSPE"] = rmspe 205 | mape = mean_absolute_percentage_error(y_test, y_pred, symmetric=False) 206 | results["MAPE"] = mape 207 | 208 | # Compute metrics per slice. 209 | y_test_slices = y_test.groupby(["area", "consumer_type"]) 210 | y_pred_slices = y_pred.groupby(["area", "consumer_type"]) 211 | slices = pd.DataFrame(columns=["area", "consumer_type", "RMSPE", "MAPE"]) 212 | for y_test_slice, y_pred_slice in zip(y_test_slices, y_pred_slices): 213 | (area_y_test, consumer_type_y_test), y_test_slice_data = y_test_slice 214 | (area_y_pred, consumer_type_y_pred), y_pred_slice_data = y_pred_slice 215 | 216 | assert ( 217 | area_y_test == area_y_pred and consumer_type_y_test == consumer_type_y_pred 218 | ), "Slices are not aligned." 
219 | 
220 |         rmspe_slice = mean_squared_percentage_error(
221 |             y_test_slice_data, y_pred_slice_data, squared=False
222 |         )
223 |         mape_slice = mean_absolute_percentage_error(
224 |             y_test_slice_data, y_pred_slice_data, symmetric=False
225 |         )
226 | 
227 |         slice_results = pd.DataFrame(
228 |             {
229 |                 "area": [area_y_test],
230 |                 "consumer_type": [consumer_type_y_test],
231 |                 "RMSPE": [rmspe_slice],
232 |                 "MAPE": [mape_slice],
233 |             }
234 |         )
235 |         slices = pd.concat([slices, slice_results], ignore_index=True)
236 | 
237 |     results["slices"] = slices
238 | 
239 |     return y_pred, results
240 | 
241 | 
242 | def render(
243 |     timeseries: OrderedDictType[str, pd.DataFrame],
244 |     prefix: Optional[str] = None,
245 |     delete_from_disk: bool = True,
246 | ):
247 |     """Render the timeseries as a single plot per (area, consumer_type) and save them to disk and to wandb."""
248 | 
249 |     grouped_timeseries = OrderedDict()
250 |     for split, df in timeseries.items():
251 |         df = df.reset_index(level=[0, 1])
252 |         groups = df.groupby(["area", "consumer_type"])
253 |         for group_name, split_group_values in groups:
254 |             group_values = grouped_timeseries.get(group_name, {})
255 | 
256 |             grouped_timeseries[group_name] = {
257 |                 f"{split}": split_group_values["energy_consumption"],
258 |                 **group_values,
259 |             }
260 | 
261 |     output_dir = OUTPUT_DIR / prefix if prefix else OUTPUT_DIR
262 |     output_dir.mkdir(parents=True, exist_ok=True)
263 |     for group_name, group_values_dict in grouped_timeseries.items():
264 |         fig, ax = plot_series(
265 |             *group_values_dict.values(), labels=group_values_dict.keys()
266 |         )
267 |         fig.suptitle(f"Area: {group_name[0]} - Consumer type: {group_name[1]}")
268 | 
269 |         # save matplotlib image
270 |         image_save_path = str(output_dir / f"{group_name[0]}_{group_name[1]}.png")
271 |         plt.savefig(image_save_path)
272 |         plt.close(fig)
273 | 
274 |         if prefix:
275 |             wandb.log({prefix: wandb.Image(image_save_path)})
276 |         else:
277 |             wandb.log({"images": wandb.Image(image_save_path)})
278 | 
279 |         if delete_from_disk:
280 |             os.remove(image_save_path)
281 | 
282 | 
283 | def compute_forecast_exogenous_variables(X_test: pd.DataFrame, fh: int):
284 |     """Computes the exogenous variables for the forecast horizon."""
285 | 
286 |     X_forecast = X_test.copy()
287 |     X_forecast.index.set_levels(
288 |         X_forecast.index.levels[-1] + fh, level=-1, inplace=True
289 |     )
290 | 
291 |     return X_forecast
292 | 
293 | 
294 | def forecast(forecaster, X_forecast: pd.DataFrame):
295 |     """Forecast the energy consumption for the given exogenous variables and time horizon."""
296 | 
297 |     return forecaster.predict(X=X_forecast)
298 | 
299 | 
300 | def add_best_model_to_model_registry(best_model_artifact: wandb.Artifact) -> int:
301 |     """Adds the best model artifact to the model registry."""
302 | 
303 |     project = hopsworks.login(
304 |         api_key_value=SETTINGS["FS_API_KEY"], project=SETTINGS["FS_PROJECT_NAME"]
305 |     )
306 | 
307 |     # Upload the model to the Hopsworks model registry.
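    # The W&B artifact is downloaded locally and registered as a Python model together with
    # its test metrics; the returned version is what the caller stores in train_metadata.json.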
308 | best_model_dir = best_model_artifact.download() 309 | best_model_path = Path(best_model_dir) / "best_model.pkl" 310 | best_model_metrics = best_model_artifact.metadata["results"]["test"] 311 | 312 | mr = project.get_model_registry() 313 | py_model = mr.python.create_model("best_model", metrics=best_model_metrics) 314 | py_model.save(best_model_path) 315 | 316 | return py_model.version 317 | 318 | 319 | if __name__ == "__main__": 320 | fire.Fire(from_best_config) 321 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/transformers.py: -------------------------------------------------------------------------------- 1 | from sktime.transformations.base import BaseTransformer 2 | from sktime.transformations.compose import CORE_MTYPES 3 | 4 | 5 | class AttachAreaConsumerType(BaseTransformer): 6 | """Transformer used to extract the area and consumer type from the index to the input data.""" 7 | 8 | _tags = { 9 | "capability:inverse_transform": True, # can the transformer inverse transform? 10 | "univariate-only": False, # can the transformer handle multivariate X? 11 | "X_inner_mtype": CORE_MTYPES, # which mtypes do _fit/_predict support for X? 12 | # this can be a Panel mtype even if transform-input is Series, vectorized 13 | "y_inner_mtype": "None", # which mtypes do _fit/_predict support for y? 14 | "fit_is_empty": True, # is fit empty and can be skipped? Yes = True 15 | "transform-returns-same-time-index": True, 16 | # does transform return have the same time index as input X 17 | "handles-missing-data": True, # can estimator handle missing data? 18 | } 19 | 20 | def _transform(self, X, y=None): 21 | X["area_exog"] = X.index.get_level_values(0) 22 | X["consumer_type_exog"] = X.index.get_level_values(1) 23 | 24 | return X 25 | 26 | def _inverse_transform(self, X, y=None): 27 | X = X.drop(columns=["area_exog", "consumer_type_exog"]) 28 | 29 | return X 30 | -------------------------------------------------------------------------------- /training-pipeline/training_pipeline/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import joblib 4 | import pandas as pd 5 | import wandb 6 | 7 | from pathlib import Path 8 | from typing import Union, Optional 9 | 10 | 11 | from training_pipeline import settings 12 | 13 | 14 | def save_json(data: dict, file_name: str, save_dir: str = settings.OUTPUT_DIR): 15 | """ 16 | Save a dictionary as a JSON file. 17 | 18 | Args: 19 | data: data to save. 20 | file_name: Name of the JSON file. 21 | save_dir: Directory to save the JSON file. 22 | 23 | Returns: None 24 | """ 25 | 26 | data_path = Path(save_dir) / file_name 27 | with open(data_path, "w") as f: 28 | json.dump(data, f) 29 | 30 | 31 | def load_json(file_name: str, save_dir: str = settings.OUTPUT_DIR) -> dict: 32 | """ 33 | Load a JSON file. 34 | 35 | Args: 36 | file_name: Name of the JSON file. 37 | save_dir: Directory of the JSON file. 38 | 39 | Returns: Dictionary with the data. 40 | """ 41 | 42 | data_path = Path(save_dir) / file_name 43 | with open(data_path, "r") as f: 44 | return json.load(f) 45 | 46 | 47 | def save_model(model, model_path: Union[str, Path]): 48 | """ 49 | Template for saving a model. 50 | 51 | Args: 52 | model: Trained model. 53 | model_path: Path to save the model. 54 | """ 55 | 56 | joblib.dump(model, model_path) 57 | 58 | 59 | def load_model(model_path: Union[str, Path]): 60 | """ 61 | Template for loading a model. 
62 | 
63 |     Args:
64 |         model_path: Path to the model.
65 | 
66 |     Returns: Loaded model.
67 |     """
68 | 
69 |     return joblib.load(model_path)
70 | 
71 | 
72 | def load_data_from_parquet(data_path: str) -> pd.DataFrame:
73 |     """
74 |     Template for loading data from a parquet file.
75 | 
76 |     Args:
77 |         data_path: Path to the parquet file.
78 | 
79 |     Returns: Dataframe with the data.
80 |     """
81 | 
82 |     return pd.read_parquet(data_path)
83 | 
84 | 
85 | def get_logger(name: str) -> logging.Logger:
86 |     """
87 |     Template for getting a logger.
88 | 
89 |     Args:
90 |         name: Name of the logger.
91 | 
92 |     Returns: Logger.
93 |     """
94 | 
95 |     logging.basicConfig(level=logging.INFO)
96 |     logger = logging.getLogger(name)
97 | 
98 |     return logger
99 | 
100 | 
101 | def init_wandb_run(
102 |     name: str,
103 |     group: Optional[str] = None,
104 |     job_type: Optional[str] = None,
105 |     add_timestamp_to_name: bool = False,
106 |     run_id: Optional[str] = None,
107 |     resume: Optional[str] = None,
108 |     reinit: bool = False,
109 |     project: str = settings.SETTINGS["WANDB_PROJECT"],
110 |     entity: str = settings.SETTINGS["WANDB_ENTITY"],
111 | ):
112 |     """Wrapper over the wandb.init function."""
113 | 
114 |     if add_timestamp_to_name:
115 |         name = f"{name}_{pd.Timestamp.now().strftime('%Y-%m-%d_%H-%M-%S')}"
116 | 
117 |     run = wandb.init(
118 |         project=project,
119 |         entity=entity,
120 |         name=name,
121 |         group=group,
122 |         job_type=job_type,
123 |         id=run_id,
124 |         reinit=reinit,
125 |         resume=resume,
126 |     )
127 | 
128 |     return run
129 | 
130 | 
131 | def check_if_artifact_exists(
132 |     artifact_name: str,
133 |     project: str = settings.SETTINGS["WANDB_PROJECT"],
134 |     entity: str = settings.SETTINGS["WANDB_ENTITY"],
135 | ) -> bool:
136 |     """Utility function that checks if a W&B artifact exists."""
137 | 
138 |     try:
139 |         get_artifact(artifact_name, project, entity)
140 | 
141 |         return True
142 |     except wandb.errors.CommError:
143 |         return False
144 | 
145 | 
146 | def get_artifact(
147 |     artifact_name: str,
148 |     project: str = settings.SETTINGS["WANDB_PROJECT"],
149 |     entity: str = settings.SETTINGS["WANDB_ENTITY"],
150 | ) -> wandb.Artifact:
151 |     """Get the latest version of a W&B artifact."""
152 | 
153 |     api = wandb.Api()
154 |     artifact = api.artifact(f"{entity}/{project}/{artifact_name}:latest")
155 | 
156 |     return artifact
157 | 
--------------------------------------------------------------------------------
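A minimal usage sketch of how the training-pipeline modules above fit together, assuming the package and its dependencies are installed, the credentials from .env / .env.default are loaded, and a best_config:latest W&B artifact already exists (presumably produced by training_pipeline/best_config.py, whose contents are not shown here):

from training_pipeline import hyperparameter_tuning, train

# 1. Run the W&B sweep defined in training_pipeline/configs/gridsearch.py on the
#    training split loaded from the Hopsworks feature store.
sweep_metadata = hyperparameter_tuning.run(fh=24)

# 2. Promote the best sweep run to a `best_config:latest` artifact
#    (assumed to be handled by training_pipeline/best_config.py).

# 3. Train that configuration on the full training window, compare it with the naive
#    baseline on the last 24 hours, and register it in the Hopsworks model registry.
train_metadata = train.from_best_config(fh=24)
print(train_metadata["model_version"])

Both entry points are also exposed as command-line interfaces through fire.Fire, as shown in their __main__ guards.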