├── .gitignore ├── 00_data_source_exploration ├── energy_demand_api.ipynb ├── environment.yml ├── readme.md ├── weather_forecast_data.ipynb └── weather_station.ipynb ├── 01_terraform ├── .terraform-version ├── main.tf ├── readme.md ├── variables.tf └── vm_init.sh ├── 02_airflow ├── Dockerfile ├── dags │ ├── batch_predict.py │ ├── batch_predict_dag.py │ ├── fix_owm_schema_dag.py │ ├── gcloud_helpers.py │ ├── ingest_historical_weather_data_dag.py │ ├── ingest_live_hourly_weather_dag.py │ ├── ingest_raw_electricity_data_dag.py │ └── ingest_weather_forecast_dag.py ├── docker-compose.yml ├── readme.md └── requirements.txt ├── 03_dbt ├── .gitignore ├── README.md ├── analysis │ └── .gitkeep ├── dbt_project.yml ├── macros │ └── .gitkeep ├── models │ ├── core │ │ ├── fact_eia_demand_forecast.sql │ │ ├── fact_eia_demand_historical.sql │ │ ├── ml_model_metrics.sql │ │ ├── recorded_temperature.sql │ │ └── schema.yml │ ├── mart │ │ └── joined_temp_and_demand.sql │ └── staging │ │ ├── cast_isd_weather.sql │ │ ├── cast_owm_weather.sql │ │ └── union_weather_station.sql ├── seeds │ ├── .gitkeep │ ├── isd_stations.csv │ └── properties.yml ├── snapshots │ └── .gitkeep └── tests │ └── .gitkeep ├── 04_dashboard ├── .streamlit │ └── config.toml ├── app.py ├── info.py ├── readme.md └── requirements.txt ├── 05_model_training ├── 01_EDA.ipynb ├── 02_simple_linear_model.ipynb ├── 03_mlflow.ipynb ├── Untitled.ipynb ├── mlflow_docker │ ├── Dockerfile │ ├── docker-compose.yml │ └── requirements.txt ├── readme.md ├── requirements.txt └── ts_diagnostics.py ├── 06_deployment ├── batch_predict.py └── readme.md ├── 07_monitoring ├── 00_EDA.ipynb ├── app.py ├── readme.md └── requirements.txt ├── img ├── Architecture.PNG ├── batch_predict_dag.PNG ├── dashboard.PNG ├── dashboard1.PNG ├── dashboard_mockup.png ├── dbt_demand.PNG ├── dbt_demand_forecast.PNG ├── dbt_monitoring.PNG ├── dbt_temp.PNG ├── de_architecture.PNG ├── eia_dag.PNG ├── mlflow1.PNG ├── mlflow2.PNG ├── mlops_architecture.PNG ├── monitoring_dashboard_1.PNG ├── monitoring_dashboard_2.PNG ├── noaa_dag.PNG ├── owm_dag.PNG ├── pipeline.png └── weather_forecast_dag.PNG ├── proposal.md ├── readme.md └── steps_to_recreate_project ├── images ├── 01_service_account.PNG ├── 02_service_account_key.PNG └── 03_vm.PNG └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea 3 | *.tfstate 4 | *.tfstate.* 5 | **.terraform 6 | **.terraform.lock.* 7 | **google_credentials.json 8 | **logs/ 9 | **.env 10 | **__pycache__/ 11 | .history 12 | *.csv 13 | .ipynb_checkpoints/ 14 | *.bin 15 | *.xml 16 | secrets.toml 17 | mlflow.db 18 | .vscode -------------------------------------------------------------------------------- /00_data_source_exploration/environment.yml: -------------------------------------------------------------------------------- 1 | name: energy 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=conda_forge 7 | - _openmp_mutex=4.5=2_gnu 8 | - alsa-lib=1.2.6.1=h7f98852_0 9 | - argon2-cffi=21.3.0=pyhd8ed1ab_0 10 | - argon2-cffi-bindings=21.2.0=py310h5764c6d_2 11 | - asttokens=2.0.5=pyhd8ed1ab_0 12 | - attr=2.5.1=h166bdaf_0 13 | - attrs=21.4.0=pyhd8ed1ab_0 14 | - backcall=0.2.0=pyh9f0ad1d_0 15 | - backports=1.0=py_2 16 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 17 | - beautifulsoup4=4.11.1=pyha770c72_0 18 | - bleach=5.0.1=pyhd8ed1ab_0 19 | - brotli=1.0.9=h166bdaf_7 20 | - brotli-bin=1.0.9=h166bdaf_7 21 | - bzip2=1.0.8=h7f98852_4 22 | - 
c-ares=1.18.1=h7f98852_0 23 | - ca-certificates=2022.6.15=ha878542_0 24 | - cartopy=0.20.2=py310hb408dcc_6 25 | - certifi=2022.6.15=py310hff52083_0 26 | - cffi=1.15.0=py310h0fdd8cc_0 27 | - cycler=0.11.0=pyhd8ed1ab_0 28 | - dbus=1.13.6=h5008d03_3 29 | - debugpy=1.6.0=py310hd8f1fbe_0 30 | - decorator=5.1.1=pyhd8ed1ab_0 31 | - defusedxml=0.7.1=pyhd8ed1ab_0 32 | - entrypoints=0.4=pyhd8ed1ab_0 33 | - executing=0.8.3=pyhd8ed1ab_0 34 | - expat=2.4.8=h27087fc_0 35 | - fftw=3.3.10=nompi_h77c792f_102 36 | - flit-core=3.7.1=pyhd8ed1ab_0 37 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 38 | - font-ttf-inconsolata=3.000=h77eed37_0 39 | - font-ttf-source-code-pro=2.038=h77eed37_0 40 | - font-ttf-ubuntu=0.83=hab24e00_0 41 | - fontconfig=2.14.0=h8e229c2_0 42 | - fonts-conda-ecosystem=1=0 43 | - fonts-conda-forge=1=0 44 | - fonttools=4.33.3=py310h5764c6d_0 45 | - freetype=2.10.4=h0708190_1 46 | - geos=3.10.3=h27087fc_0 47 | - gettext=0.19.8.1=h73d1719_1008 48 | - giflib=5.2.1=h36c2ea0_2 49 | - glib=2.70.2=h780b84a_4 50 | - glib-tools=2.70.2=h780b84a_4 51 | - gst-plugins-base=1.20.3=hf6a322e_0 52 | - gstreamer=1.20.3=hd4edc92_0 53 | - icu=70.1=h27087fc_0 54 | - importlib-metadata=4.11.4=py310hff52083_0 55 | - importlib_metadata=4.11.4=hd8ed1ab_0 56 | - importlib_resources=5.8.0=pyhd8ed1ab_0 57 | - ipykernel=6.15.0=pyh210e3f2_0 58 | - ipython=8.4.0=py310hff52083_0 59 | - ipython_genutils=0.2.0=py_1 60 | - ipywidgets=7.7.1=pyhd8ed1ab_0 61 | - jack=1.9.18=h8c3723f_1002 62 | - jedi=0.18.1=py310hff52083_1 63 | - jinja2=3.1.2=pyhd8ed1ab_1 64 | - jpeg=9e=h166bdaf_1 65 | - jsonschema=4.6.1=pyhd8ed1ab_0 66 | - jupyter=1.0.0=py310hff52083_7 67 | - jupyter_client=7.3.4=pyhd8ed1ab_0 68 | - jupyter_console=6.4.4=pyhd8ed1ab_0 69 | - jupyter_core=4.10.0=py310hff52083_0 70 | - jupyterlab_pygments=0.2.2=pyhd8ed1ab_0 71 | - jupyterlab_widgets=1.1.1=pyhd8ed1ab_0 72 | - keyutils=1.6.1=h166bdaf_0 73 | - kiwisolver=1.4.3=py310hbf28c38_0 74 | - krb5=1.19.3=h3790be6_0 75 | - lcms2=2.12=hddcbb42_0 76 | - ld_impl_linux-64=2.36.1=hea4e1c9_2 77 | - lerc=3.0=h9c3ff4c_0 78 | - libblas=3.9.0=15_linux64_openblas 79 | - libbrotlicommon=1.0.9=h166bdaf_7 80 | - libbrotlidec=1.0.9=h166bdaf_7 81 | - libbrotlienc=1.0.9=h166bdaf_7 82 | - libcap=2.64=ha37c62d_0 83 | - libcblas=3.9.0=15_linux64_openblas 84 | - libclang=14.0.6=default_h2e3cab8_0 85 | - libclang13=14.0.6=default_h3a83d3e_0 86 | - libcups=2.3.3=hf5a7f15_1 87 | - libcurl=7.83.1=h7bff187_0 88 | - libdb=6.2.32=h9c3ff4c_0 89 | - libdeflate=1.12=h166bdaf_0 90 | - libedit=3.1.20191231=he28a2e2_2 91 | - libev=4.33=h516909a_1 92 | - libevent=2.1.10=h9b69904_4 93 | - libffi=3.4.2=h7f98852_5 94 | - libflac=1.3.4=h27087fc_0 95 | - libgcc-ng=12.1.0=h8d9b700_16 96 | - libgfortran-ng=12.1.0=h69a702a_16 97 | - libgfortran5=12.1.0=hdcd56e2_16 98 | - libglib=2.70.2=h174f98d_4 99 | - libgomp=12.1.0=h8d9b700_16 100 | - libiconv=1.16=h516909a_0 101 | - liblapack=3.9.0=15_linux64_openblas 102 | - libllvm14=14.0.6=he0ac6c6_0 103 | - libnghttp2=1.47.0=h727a467_0 104 | - libnsl=2.0.0=h7f98852_0 105 | - libogg=1.3.4=h7f98852_1 106 | - libopenblas=0.3.20=pthreads_h78a6416_0 107 | - libopus=1.3.1=h7f98852_1 108 | - libpng=1.6.37=h21135ba_2 109 | - libpq=14.4=hd77ab85_0 110 | - libsndfile=1.0.31=h9c3ff4c_1 111 | - libsodium=1.0.18=h36c2ea0_1 112 | - libssh2=1.10.0=ha56f1ee_2 113 | - libstdcxx-ng=12.1.0=ha89aaad_16 114 | - libtiff=4.4.0=hc85c160_1 115 | - libtool=2.4.6=h9c3ff4c_1008 116 | - libudev1=249=h166bdaf_4 117 | - libuuid=2.32.1=h7f98852_1000 118 | - libvorbis=1.3.7=h9c3ff4c_0 119 | - 
libwebp=1.2.2=h3452ae3_0 120 | - libwebp-base=1.2.2=h7f98852_1 121 | - libxcb=1.13=h7f98852_1004 122 | - libxkbcommon=1.0.3=he3ba5ed_0 123 | - libxml2=2.9.14=h22db469_0 124 | - libzlib=1.2.12=h166bdaf_1 125 | - lz4-c=1.9.3=h9c3ff4c_1 126 | - markupsafe=2.1.1=py310h5764c6d_1 127 | - matplotlib-base=3.5.2=py310h5701ce4_0 128 | - matplotlib-inline=0.1.3=pyhd8ed1ab_0 129 | - mistune=0.8.4=py310h6acc77f_1005 130 | - munkres=1.1.4=pyh9f0ad1d_0 131 | - mysql-common=8.0.29=haf5c9bc_1 132 | - mysql-libs=8.0.29=h28c427c_1 133 | - nbclient=0.6.4=pyhd8ed1ab_1 134 | - nbconvert=6.5.0=pyhd8ed1ab_0 135 | - nbconvert-core=6.5.0=pyhd8ed1ab_0 136 | - nbconvert-pandoc=6.5.0=pyhd8ed1ab_0 137 | - nbformat=5.4.0=pyhd8ed1ab_0 138 | - ncurses=6.3=h27087fc_1 139 | - nest-asyncio=1.5.5=pyhd8ed1ab_0 140 | - notebook=6.4.12=pyha770c72_0 141 | - nspr=4.32=h9c3ff4c_1 142 | - nss=3.78=h2350873_0 143 | - numpy=1.23.0=py310h53a5b5f_0 144 | - openjpeg=2.4.0=hb52868f_1 145 | - openssl=1.1.1p=h166bdaf_0 146 | - packaging=21.3=pyhd8ed1ab_0 147 | - pandas=1.4.3=py310h769672d_0 148 | - pandoc=2.18=ha770c72_0 149 | - pandocfilters=1.5.0=pyhd8ed1ab_0 150 | - parso=0.8.3=pyhd8ed1ab_0 151 | - pcre=8.45=h9c3ff4c_0 152 | - pexpect=4.8.0=pyh9f0ad1d_2 153 | - pickleshare=0.7.5=py_1003 154 | - pillow=9.1.1=py310he619898_1 155 | - pip=22.1.2=pyhd8ed1ab_0 156 | - portaudio=19.6.0=h57a0ea0_5 157 | - proj=9.0.1=h93bde94_0 158 | - prometheus_client=0.14.1=pyhd8ed1ab_0 159 | - prompt-toolkit=3.0.30=pyha770c72_0 160 | - prompt_toolkit=3.0.30=hd8ed1ab_0 161 | - psutil=5.9.1=py310h5764c6d_0 162 | - pthread-stubs=0.4=h36c2ea0_1001 163 | - ptyprocess=0.7.0=pyhd3deb0d_0 164 | - pulseaudio=14.0=h7f54b18_8 165 | - pure_eval=0.2.2=pyhd8ed1ab_0 166 | - pycparser=2.21=pyhd8ed1ab_0 167 | - pygments=2.12.0=pyhd8ed1ab_0 168 | - pyparsing=3.0.9=pyhd8ed1ab_0 169 | - pyproj=3.3.1=py310hf94497c_1 170 | - pyqt=5.15.4=py310h29803b5_1 171 | - pyrsistent=0.18.1=py310h5764c6d_1 172 | - pyshp=2.3.0=pyhd8ed1ab_0 173 | - python=3.10.5=h582c2e5_0_cpython 174 | - python-dateutil=2.8.2=pyhd8ed1ab_0 175 | - python-fastjsonschema=2.15.3=pyhd8ed1ab_0 176 | - python_abi=3.10=2_cp310 177 | - pytz=2022.1=pyhd8ed1ab_0 178 | - pyzmq=23.2.0=py310h330234f_0 179 | - qt-main=5.15.4=ha5833f6_2 180 | - qtconsole=5.3.1=pyhd8ed1ab_0 181 | - qtconsole-base=5.3.1=pyha770c72_0 182 | - qtpy=2.1.0=pyhd8ed1ab_0 183 | - readline=8.1.2=h0f457ee_0 184 | - scipy=1.8.1=py310h7612f91_0 185 | - send2trash=1.8.0=pyhd8ed1ab_0 186 | - setuptools=62.6.0=py310hff52083_0 187 | - shapely=1.8.2=py310h7b2ee30_2 188 | - sip=6.5.1=py310h122e73d_2 189 | - six=1.16.0=pyh6c4a22f_0 190 | - soupsieve=2.3.1=pyhd8ed1ab_0 191 | - sqlite=3.39.0=h4ff8645_0 192 | - stack_data=0.3.0=pyhd8ed1ab_0 193 | - terminado=0.15.0=py310hff52083_0 194 | - tinycss2=1.1.1=pyhd8ed1ab_0 195 | - tk=8.6.12=h27826a3_0 196 | - toml=0.10.2=pyhd8ed1ab_0 197 | - tornado=6.1=py310h5764c6d_3 198 | - traitlets=5.3.0=pyhd8ed1ab_0 199 | - typing_extensions=4.2.0=pyha770c72_1 200 | - tzdata=2022a=h191b570_0 201 | - unicodedata2=14.0.0=py310h5764c6d_1 202 | - wcwidth=0.2.5=pyh9f0ad1d_2 203 | - webencodings=0.5.1=py_1 204 | - wheel=0.37.1=pyhd8ed1ab_0 205 | - widgetsnbextension=3.6.1=pyha770c72_0 206 | - xarray=2022.3.0=pyhd8ed1ab_0 207 | - xcb-util=0.4.0=h166bdaf_0 208 | - xcb-util-image=0.4.0=h166bdaf_0 209 | - xcb-util-keysyms=0.4.0=h166bdaf_0 210 | - xcb-util-renderutil=0.3.9=h166bdaf_0 211 | - xcb-util-wm=0.4.1=h166bdaf_0 212 | - xorg-libxau=1.0.9=h7f98852_0 213 | - xorg-libxdmcp=1.1.3=h7f98852_0 214 | - xz=5.2.5=h516909a_1 215 | - 
zeromq=4.3.4=h9c3ff4c_1 216 | - zipp=3.8.0=pyhd8ed1ab_0 217 | - zlib=1.2.12=h166bdaf_1 218 | - zstd=1.5.2=h8a70e8d_1 219 | - pip: 220 | - charset-normalizer==2.0.12 221 | - idna==3.3 222 | - pygrib==2.1.4 223 | - pyqt5-sip==12.9.0 224 | - python-dotenv==0.20.0 225 | - requests==2.28.0 226 | - urllib3==1.26.9 227 | prefix: /home/michael/miniconda3/envs/energy 228 | -------------------------------------------------------------------------------- /00_data_source_exploration/readme.md: -------------------------------------------------------------------------------- 1 | # Data Source Exploration Notebooks 2 | 3 | These notebooks explore and document the data sources used in the project. Code from these notebooks was adapted into the Airflow DAGs that regularly ingest data from these sources into the data lake on Google Cloud. 4 | 5 | To run the code, you will need to create a conda environment containing the dependencies. You can either create one from the `environment.yml` file with the command: 6 | 7 | ```bash 8 | conda env create -f environment.yml 9 | ``` 10 | 11 | or create the environment yourself with the following commands: 12 | 13 | ```bash 14 | conda create -n energy jupyter requests cartopy xarray 15 | conda activate energy 16 | pip install python-dotenv pygrib 17 | ``` 18 | 19 | Then activate the environment (`conda activate energy`, if you haven't already) and run `jupyter notebook`. 20 | 21 | ## API Keys 22 | To access data from the EIA API and the OpenWeatherMap API, you will need to register for your own free API keys [with the EIA](https://www.eia.gov/opendata/register.php) and [with OWM](https://home.openweathermap.org/users/sign_up). When you receive the keys by email, place them in a file called `.env` in this directory, and be sure to add that file to your `.gitignore` so you don't commit it to your repo. 
The file should look like this: 23 | 24 | ``` 25 | EIA_KEY=your_eia_api_key 26 | OWM_KEY=your_owm_api_key 27 | ``` 28 | -------------------------------------------------------------------------------- /01_terraform/.terraform-version: -------------------------------------------------------------------------------- 1 | 1.0.2 -------------------------------------------------------------------------------- /01_terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.0" 3 | backend "local" {} # Can change from "local" to "gcs" (for google) or "s3" (for aws), if you would like to preserve your tf-state online 4 | required_providers { 5 | google = { 6 | source = "hashicorp/google" 7 | } 8 | } 9 | } 10 | 11 | provider "google" { 12 | project = var.project 13 | region = var.region 14 | # credentials = # Use this if you do not want to set env-var GOOGLE_APPLICATION_CREDENTIALS 15 | } 16 | 17 | # Data Lake Bucket 18 | # Ref: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket 19 | resource "google_storage_bucket" "data-lake-bucket" { 20 | name = "${local.data_lake_bucket}_${var.project}" # Concatenating DL bucket & Project name for unique naming 21 | location = var.region 22 | 23 | # Optional, but recommended settings: 24 | storage_class = var.storage_class 25 | uniform_bucket_level_access = true 26 | 27 | versioning { 28 | enabled = true 29 | } 30 | 31 | lifecycle_rule { 32 | action { 33 | type = "Delete" 34 | } 35 | condition { 36 | age = 30 // days 37 | } 38 | } 39 | 40 | force_destroy = true 41 | } 42 | 43 | # DWH 44 | # Ref: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/bigquery_dataset 45 | resource "google_bigquery_dataset" "dataset" { 46 | dataset_id = var.BQ_DATASET 47 | project = var.project 48 | location = var.region 49 | } 50 | 51 | 52 | # A single Compute Engine instance 53 | resource "google_compute_instance" "default" { 54 | name = "energy-data-proj-vm" 55 | machine_type = "e2-standard-4" 56 | zone = var.zone 57 | 58 | boot_disk { 59 | initialize_params { 60 | image = "ubuntu-1804-lts" 61 | size = "30" 62 | } 63 | } 64 | 65 | network_interface { 66 | network = "default" 67 | access_config { 68 | # Include this section to give the VM an external IP address 69 | } 70 | } 71 | 72 | metadata = { 73 | ssh-keys = "${var.ssh_user}:${file(var.ssh_public_key_file)}" 74 | } 75 | } 76 | 77 | 78 | resource "google_storage_bucket" "mlflow-runs" { 79 | 80 | name = "mlflow-runs-${var.project}" # Concatenating DL bucket & Project name for unique naming 81 | location = var.region 82 | 83 | # Optional, but recommended settings: 84 | storage_class = var.storage_class 85 | uniform_bucket_level_access = true 86 | 87 | versioning { 88 | enabled = true 89 | } 90 | 91 | lifecycle_rule { 92 | action { 93 | type = "Delete" 94 | } 95 | condition { 96 | age = 30 // days 97 | } 98 | } 99 | 100 | force_destroy = true 101 | } 102 | 103 | 104 | 105 | # resource "google_compute_network" "default" { 106 | # provider = google-beta 107 | 108 | # name = "default" 109 | # } 110 | 111 | # resource "google_sql_database_instance" "main" { 112 | # name = "main-instance" 113 | # database_version = "POSTGRES_14" 114 | # region = "us-central1" 115 | # # depends_on = [google_service_networking_connection.private_vpc_connection] 116 | 117 | # settings { 118 | # # Second-generation instance tiers are based on the machine 119 | # # type. See argument reference below. 
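#     # If re-enabled, this block would provision the Cloud SQL Postgres instance intended as the
#     # MLflow backend store (see the mlflow-postgres / mlflow defaults in variables.tf). Note, as an
#     # assumption based on the GCP docs rather than this repo: a private-IP instance also requires the
#     # service networking connection referenced in the commented-out depends_on above.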
120 | # tier = var.machine_type 121 | # ip_configuration { 122 | # ipv4_enabled = false 123 | # private_network = google_compute_network.default.id 124 | # } 125 | # } 126 | 127 | 128 | 129 | # } 130 | 131 | 132 | # module "sql-db_postgresql" { 133 | # # https://registry.terraform.io/modules/GoogleCloudPlatform/sql-db/google/latest/submodules/postgresql 134 | # source = "GoogleCloudPlatform/sql-db/google//modules/postgresql" 135 | # version = "11.0.0" 136 | 137 | # project_id = var.project 138 | # region = var.region 139 | # zone = var.zone 140 | # name = var.instance_name 141 | # db_name = var.db_name 142 | 143 | # database_version = var.postgres_version 144 | # tier = var.machine_type 145 | 146 | 147 | 148 | # ip_configuration = { 149 | # authorized_networks = [], 150 | # ipv4_enabled = false, 151 | # private_network = "default" 152 | # require_ssl = false 153 | # allocated_ip_range = "" 154 | # } 155 | 156 | 157 | # deletion_protection = true 158 | 159 | # # These together will construct the master_user privileges, i.e. 160 | # # 'master_user_name'@'master_user_host' IDENTIFIED BY 'master_user_password'. 161 | # # These should typically be set as the environment variable TF_VAR_master_user_password, etc. 162 | # # so you don't check these into source control." 163 | 164 | 165 | # } -------------------------------------------------------------------------------- /01_terraform/readme.md: -------------------------------------------------------------------------------- 1 | # Use Terraform to create GCP resources 2 | 3 | * 2022-03-03 - set up a GCS bucket called `energy_project_bucket_data-eng-zoomcamp-339102` for the project 4 | * note that in [`variables.tf`](variables.tf), there is no default value given for the project variable. 5 | Terraform will prompt for this variable upon `terraform apply` and `terraform plan`. You can also supply 6 | it in the prompt like this: 7 | 8 | ```shell 9 | # Create new infra 10 | terraform apply -var="project=" 11 | ``` -------------------------------------------------------------------------------- /01_terraform/variables.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | data_lake_bucket = "energy_project_bucket" 3 | } 4 | 5 | variable "project" { 6 | description = "Your GCP Project ID" 7 | } 8 | 9 | variable "region" { 10 | description = "Region for GCP resources. Choose as per your location: https://cloud.google.com/about/locations" 11 | default = "us-central1" 12 | type = string 13 | } 14 | 15 | variable "zone" { 16 | description = "Zone for compute instance" 17 | default = "us-central1-c" 18 | type = string 19 | } 20 | variable "storage_class" { 21 | description = "Storage class type for your bucket. Check official docs for more info." 
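  # Valid GCS storage classes include STANDARD, NEARLINE, COLDLINE, and ARCHIVE;
  # STANDARD is the sensible choice for frequently accessed staging data like this project's.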
22 | default = "STANDARD" 23 | } 24 | 25 | variable "BQ_DATASET" { 26 | description = "BigQuery Dataset that raw data (from GCS) will be written to" 27 | type = string 28 | default = "energy_data" 29 | } 30 | 31 | variable "ssh_public_key_file" { 32 | description = "Path to the public ssh key that will be used to connect to the Compute Instance created by terraform" 33 | type = string 34 | default = "~/.ssh/gcp3.pub" 35 | } 36 | 37 | variable "ssh_user" { 38 | description = "username to connect to the Compute Instance created by terraform via ssh" 39 | type = string 40 | default = "michael" 41 | } 42 | 43 | variable "instance_name" { 44 | description = "Name of the Postgres Instance created by terraform" 45 | type = string 46 | default = "mlflow-postgres" 47 | } 48 | 49 | variable "db_name" { 50 | description = "Name of the Postgres Database created by terraform" 51 | type = string 52 | default = "mlflow" 53 | } 54 | 55 | variable "postgres_version" { 56 | description = "The engine version of the database, e.g. `POSTGRES_9_6`. See https://cloud.google.com/sql/docs/db-versions for supported versions." 57 | type = string 58 | default = "POSTGRES_14" 59 | } 60 | 61 | 62 | variable "machine_type" { 63 | description = "The machine type to use, see https://cloud.google.com/sql/pricing for more details" 64 | type = string 65 | default = "db-custom-1-3840" 66 | } 67 | 68 | variable "user_name" { 69 | description = "The username for mlflow-postgres the default user credentials, i.e. 'master_user_name'@'master_user_host' IDENTIFIED BY 'master_user_password'. This should typically be set as the environment variable TF_VAR_master_user_name so you don't check it into source control." 70 | type = string 71 | default = "mlflow_user" 72 | } 73 | 74 | variable "user_password" { 75 | description = "The password for ml-flow postgres the default user credentials, i.e. 'master_user_name'@'master_user_host' IDENTIFIED BY 'master_user_password'. This should typically be set as the environment variable TF_VAR_master_user_password so you don't check it into source control." 76 | type = string 77 | default = "mlflow_pass" 78 | } 79 | 80 | -------------------------------------------------------------------------------- /01_terraform/vm_init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run using the below command 4 | # bash vm_setup.sh 5 | #!/bin/bash 6 | 7 | # Run using the below command 8 | # bash vm_setup.sh 9 | 10 | echo hello $USER 11 | 12 | if [ -f ~/setup_result.txt ]; 13 | then 14 | 15 | cat ~/setup_result.txt 16 | 17 | else 18 | sudo apt-add-repository ppa:fish-shell/release-3 -y 19 | sudo apt-get update 20 | sudo apt install wget 21 | 22 | echo "installing miniconda..." 23 | mkdir -p ~/miniconda3 24 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh 25 | bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3 26 | rm -rf ~/miniconda3/miniconda.sh 27 | ~/miniconda3/bin/conda init bash 28 | ~/miniconda3/bin/conda init zsh 29 | 30 | echo "Installing Docker..." 31 | sudo apt-get -y install docker.io 32 | 33 | echo "Docker without sudo setup..." 34 | sudo groupadd docker 35 | sudo gpasswd -a $USER docker 36 | sudo service docker restart 37 | 38 | echo "Installing docker-compose..." 
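# The next lines download the standalone docker-compose v2.3.3 binary into ~/bin and mark it
# executable; ~/bin is prepended to PATH via ~/.bashrc below so `docker-compose` resolves for this user.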
39 | cd 40 | mkdir -p bin 41 | cd bin 42 | wget https://github.com/docker/compose/releases/download/v2.3.3/docker-compose-linux-x86_64 -O docker-compose 43 | sudo chmod +x docker-compose 44 | 45 | echo "Setup .bashrc..." 46 | echo '' >> ~/.bashrc 47 | echo 'export PATH=${HOME}/bin:${PATH}' >> ~/.bashrc 48 | eval "$(cat ~/.bashrc | tail -n +10)" # A hack because source .bashrc doesn't work inside the script 49 | 50 | echo "docker-compose version..." 51 | docker-compose --version 52 | 53 | sudo apt -y install fish 54 | echo '' >> ~/.bashrc 55 | echo 'exec fish' >> ~/.bashrc 56 | 57 | echo "The setup script vm_init.sh ran successfully on at `date`" >> ~/setup_result.txt 58 | 59 | fi 60 | 61 | -------------------------------------------------------------------------------- /02_airflow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:2.2.3 2 | 3 | ENV AIRFLOW_HOME=/opt/airflow 4 | 5 | USER root 6 | RUN apt-get update -qq && apt-get install vim -qqq 7 | 8 | COPY requirements.txt . 9 | RUN python -m pip install --upgrade pip 10 | RUN pip install --no-cache-dir -r requirements.txt 11 | 12 | # https://airflow.apache.org/docs/docker-stack/recipes.html 13 | 14 | SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"] 15 | 16 | USER 0 17 | 18 | ARG CLOUD_SDK_VERSION=322.0.0 19 | ENV GCLOUD_HOME=/opt/google-cloud-sdk 20 | 21 | ENV PATH="${GCLOUD_HOME}/bin/:${PATH}" 22 | 23 | RUN DOWNLOAD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz" \ 24 | && TMP_DIR="$(mktemp -d)" \ 25 | && curl -fL "${DOWNLOAD_URL}" --output "${TMP_DIR}/google-cloud-sdk.tar.gz" \ 26 | && mkdir -p "${GCLOUD_HOME}" \ 27 | && tar xzf "${TMP_DIR}/google-cloud-sdk.tar.gz" -C "${GCLOUD_HOME}" --strip-components=1 \ 28 | && "${GCLOUD_HOME}/install.sh" \ 29 | --bash-completion=false \ 30 | --path-update=false \ 31 | --usage-reporting=false \ 32 | --additional-components alpha beta kubectl \ 33 | --quiet \ 34 | && rm -rf "${TMP_DIR}" \ 35 | && gcloud --version 36 | 37 | 38 | WORKDIR $AIRFLOW_HOME 39 | 40 | USER 1002 -------------------------------------------------------------------------------- /02_airflow/dags/batch_predict.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from google.cloud import bigquery 3 | import pandas as pd 4 | import calendar 5 | import pytz 6 | import mlflow 7 | import logging 8 | 9 | 10 | def pull_temp_forecast(start_date, tz='America/Denver'): 11 | q = f""" 12 | WITH forecast_pit as 13 | ( 14 | SELECT 15 | forecast_time, 16 | MAX(creation_time) creation_time 17 | 18 | FROM 19 | `mlops-zoomcamp-354700.energy_data.weather_forecast` 20 | WHERE 21 | creation_time <= TIMESTAMP( "{start_date}", "{tz}") 22 | AND forecast_time >= TIMESTAMP("{start_date}", "{tz}") 23 | GROUP BY 24 | forecast_time 25 | ) 26 | 27 | SELECT 28 | f.forecast_time, 29 | f.temp_f, 30 | f.creation_time 31 | FROM `mlops-zoomcamp-354700.energy_data.weather_forecast` f 32 | INNER JOIN forecast_pit 33 | ON forecast_pit.creation_time = f.creation_time 34 | AND forecast_pit.forecast_time = f.forecast_time 35 | 36 | ORDER BY forecast_time 37 | """ 38 | 39 | tz_info = pytz.timezone(tz) 40 | 41 | # interpolate temp_f forecast on hourly basis (previously it is only every 3 hours) 42 | df = (pd.read_gbq(q, project_id='mlops-zoomcamp-354700') 43 | .assign(temp_F=lambda df_: df_['temp_f'].astype(float)) 44 | .set_index('forecast_time') 45 | 
.loc[:, 'temp_F'] 46 | .resample('H') 47 | .interpolate('cubic') 48 | .reset_index() 49 | .assign(energy_timestamp_mtn=lambda df_: df_['forecast_time'].dt.tz_convert(tz_info)) 50 | .drop(columns=['forecast_time']) 51 | ) 52 | 53 | return df 54 | 55 | 56 | def make_features(df, features): 57 | df_out = (df 58 | .reset_index() 59 | .assign( 60 | year=lambda df_: df_['energy_timestamp_mtn'].dt.year, 61 | day_of_year=lambda df_: df_['energy_timestamp_mtn'].dt.day_of_year, 62 | hour=lambda df_: df_['energy_timestamp_mtn'].dt.hour, 63 | is_weekend=lambda df_: df_['energy_timestamp_mtn'].dt.day_of_week >= 5, # saturday day_of_week = 5, sunday = 6 64 | is_summer=lambda df_: df_['energy_timestamp_mtn'].dt.month.between(5, 9, inclusive='both'), 65 | month=lambda df_: df_['energy_timestamp_mtn'].dt.month, 66 | temp_F_squared=lambda df_: df_['temp_F'] * df_['temp_F'], 67 | hour_squared=lambda df_: df_['hour'] ** 2, 68 | hour_cubed=lambda df_: df_['hour'] ** 3, 69 | ) 70 | 71 | .set_index('energy_timestamp_mtn') 72 | ) 73 | 74 | 75 | for month in calendar.month_name[1:]: 76 | df_out[month] = (df_out.index.month_name() == month).astype(int) # month_name() is a method and must be called to build the one-hot month flags 77 | 78 | return df_out[features] 79 | 80 | 81 | def load_model(run_id): 82 | logged_model = f'gs://mlflow-runs-mlops-zoomcamp-354700/2/{run_id}/artifacts/model' 83 | model = mlflow.pyfunc.load_model(logged_model) 84 | return model 85 | 86 | 87 | def save_results(df, y_pred, run_id, start_date, tz, output_file): 88 | df_result = (pd.DataFrame() 89 | .assign(energy_timestamp_mtn=df.index, 90 | predicted_energy_demand=y_pred, 91 | temp_f_forecast=df['temp_F'].reset_index(drop=True), 92 | model_version=run_id, 93 | prediction_start_date=start_date, 94 | prediction_creation_date=datetime.now()) 95 | ) 96 | logging.info(f'df_result_prediction_start_date: {df_result.prediction_start_date}') 97 | 98 | df_result['prediction_start_date'] = df_result['prediction_start_date'].dt.tz_localize(tz) 99 | 100 | logging.info(f'df_result_prediction_start_date: {df_result.prediction_start_date}') 101 | df_result.to_parquet(output_file, index=False) 102 | 103 | 104 | def apply_model(run_id, features, start_date, date_fmt, output_file, tz='America/Denver'): 105 | logging.info(f'start_date: {start_date}') 106 | logging.info(f'date_fmt: {date_fmt}') 107 | 108 | start_date = datetime.strptime(start_date, date_fmt) 109 | logging.info('converted start_date from string to datetime') 110 | logging.info(f'start_date: {start_date}') 111 | 112 | df = pull_temp_forecast(start_date, tz) 113 | logging.info(f'df: {df.energy_timestamp_mtn.min()}') 114 | logging.info(f'df: {df.energy_timestamp_mtn.max()}') 115 | 116 | df = make_features(df, features=features) 117 | model = load_model(run_id) 118 | y_pred = model.predict(df) 119 | save_results(df, y_pred, run_id, start_date, tz, output_file) 120 | return output_file 121 | 122 | 123 | if __name__ == '__main__': 124 | 125 | start_date = '2022-07-15_00' # apply_model() parses this with a date format string, so pass a string rather than a datetime 126 | tz = "US/Mountain" 127 | run_id = '49c833f911ae43488e67063f410b7b5e' 128 | output_file = 'output.parquet' 129 | 130 | features = ['temp_F', 'year', 'day_of_year', 'hour', 'is_weekend', 131 | 'is_summer', 'month', 'temp_F_squared', 'hour_squared', 'hour_cubed'] 132 | 133 | apply_model(run_id, features, start_date, '%Y-%m-%d_%H', output_file, tz) 134 | 135 | result = pd.read_parquet(output_file) 136 | print(result) 137 | -------------------------------------------------------------------------------- /02_airflow/dags/batch_predict_dag.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | import requests 5 | from datetime import datetime 6 | import pandas as pd 7 | import pyarrow as pa 8 | import pyarrow.parquet as pq 9 | 10 | from airflow import DAG 11 | from airflow.operators.bash import BashOperator 12 | from airflow.operators.python import PythonOperator 13 | from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator 14 | 15 | from gcloud_helpers import upload_to_gcs 16 | from batch_predict import apply_model 17 | import logging 18 | 19 | PROJECT_ID = os.environ.get("GCP_PROJECT_ID") 20 | BUCKET = os.environ.get("GCP_GCS_BUCKET") 21 | OWM_API_KEY = os.environ.get("OWM_API_KEY") 22 | AIRFLOW_HOME = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 23 | 24 | BIGQUERY_DATASET = 'energy_data' 25 | date_fmt = '%Y-%m-%d_%H' 26 | # DATE_string = "{{ execution_date.strftime(date_fmt) }}" 27 | DATE_string = "{{ dag_run.logical_date.strftime('%Y-%m-%d_%H') }}" 28 | 29 | # DATE = datetime.strptime(DATE, date_fmt) 30 | 31 | tz = "US/Mountain" 32 | run_id = '49c833f911ae43488e67063f410b7b5e' 33 | output_file = f'{DATE_string}_{run_id}_output.parquet' 34 | 35 | features = ['temp_F', 'year', 'day_of_year', 'hour', 'is_weekend', 36 | 'is_summer', 'month', 'temp_F_squared', 'hour_squared', 'hour_cubed'] 37 | 38 | default_args = { 39 | 'owner': 'airflow', 40 | 'depends_on_past': False, 41 | 'retries': 1, 42 | } 43 | 44 | # 2022-07-28T00:00:00+00:00 45 | with DAG( 46 | dag_id='batch_predict_dag', 47 | schedule_interval='@daily', 48 | start_date=datetime(2022, 7, 5, 0, 0, 0), 49 | catchup=True, 50 | default_args=default_args, 51 | max_active_runs=3, 52 | tags=['mlops', 'batch_predict'] 53 | ) as dag: 54 | 55 | 56 | 57 | # make predictions and save locally to parquet file 58 | predict_task = PythonOperator( 59 | task_id='predict_task', 60 | python_callable=apply_model, 61 | op_kwargs={'run_id': run_id, 62 | 'features': features, 63 | 'start_date': DATE_string, 64 | 'date_fmt': date_fmt, 65 | 'output_file': f"{AIRFLOW_HOME}/{output_file}", 66 | 'tz': tz} 67 | ) 68 | 69 | # upload to GCS 70 | upload_to_gcs_task = PythonOperator( 71 | task_id='upload_to_gcs_task', 72 | python_callable=upload_to_gcs, 73 | op_kwargs={ 74 | 'bucket': BUCKET, 75 | 'object_name': f"staged/batch_predict/{DATE_string}/{run_id}/{output_file}", 76 | 'local_file': f"{AIRFLOW_HOME}/{output_file}" 77 | } 78 | ) 79 | 80 | # load into BigQuery 81 | load_to_bigquery_task = GCSToBigQueryOperator( 82 | task_id='load_to_bigquery_task', 83 | bucket=BUCKET, 84 | source_objects=f"staged/batch_predict/{DATE_string}/{run_id}/{output_file}", 85 | destination_project_dataset_table=f'{BIGQUERY_DATASET}.energy_demand_forecasts', 86 | source_format='PARQUET', 87 | write_disposition='WRITE_APPEND', 88 | create_disposition='CREATE_IF_NEEDED', 89 | ) 90 | 91 | # delete local file 92 | delete_local_file_task = BashOperator( 93 | task_id='delete_local_file_task', 94 | bash_command=f"rm -f {AIRFLOW_HOME}/{output_file}" 95 | ) 96 | 97 | predict_task >> upload_to_gcs_task >> load_to_bigquery_task >> delete_local_file_task 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /02_airflow/dags/fix_owm_schema_dag.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import json 4 | import logging 5 | import requests 6 | from datetime import datetime 7 | import pyarrow as pa 8 | 
import pyarrow.parquet as pq 9 | import pandas as pd 10 | 11 | from airflow import DAG 12 | from airflow.operators.bash import BashOperator 13 | from airflow.operators.python import PythonOperator 14 | from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator 15 | 16 | from gcloud_helpers import upload_to_gcs, download_from_gcs 17 | 18 | PROJECT_ID = os.environ.get("GCP_PROJECT_ID") 19 | BUCKET = os.environ.get("GCP_GCS_BUCKET") 20 | OWM_API_KEY = os.environ.get("OWM_API_KEY") 21 | AIRFLOW_HOME = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 22 | 23 | BIGQUERY_DATASET = 'energy_data' 24 | DATASET_FILE_SUFFIX = "{{ logical_date.strftime(\'%Y-%m-%d-%H\') }}" 25 | YEAR = "{{ execution_date.strftime(\'%Y\') }}" 26 | 27 | # lat lon of the location that weather data will be downladed from Open Weather Map 28 | # right now the DAG only downloads one location (DIA). This may need to be parameterized better later. 29 | LAT = 39.847 30 | LON = -104.656 31 | 32 | def extract_weather_data(file_suffix): 33 | """ 34 | Extract data from an OWM weather observation json file and store the results locally as a parquet file 35 | """ 36 | 37 | with open(f"{AIRFLOW_HOME}/{file_suffix}.json") as f: 38 | j = json.load(f) 39 | 40 | # extract metadata table from json 41 | df = (pd.DataFrame(j['main'], index=[0]) 42 | .assign(timestamp=pd.to_datetime(j['dt'], unit='s').tz_localize('UTC'), 43 | **j['coord'], 44 | temp=lambda df_: df_['temp'].astype(float), 45 | feels_like=lambda df_: df_['feels_like'].astype(float), 46 | temp_min=lambda df_: df_['temp_min'].astype(float), 47 | temp_max=lambda df_: df_['temp_max'].astype(float), 48 | pressure=lambda df_: df_['pressure'].astype(float), 49 | humidity=lambda df_: df_['humidity'].astype(float), 50 | 51 | ) 52 | ) 53 | logging.info(df.head()) 54 | logging.info(df.dtypes) 55 | fields = [('temp', pa.float64()), 56 | ('feels_like', pa.float64()), 57 | ('temp_min', pa.float64()), 58 | ('temp_max', pa.float64()), 59 | ('pressure', pa.float64()), 60 | ('humidity', pa.float64()), 61 | ('timestamp', pa.timestamp('s')), 62 | ('lon', pa.float64()), 63 | ('lat', pa.float64()) 64 | ] 65 | 66 | schema = pa.schema(fields) 67 | logging.info(schema) 68 | table = pa.Table.from_pandas(df, schema=schema) 69 | logging.info(table.schema) 70 | logging.info(table.to_pandas().head()) 71 | 72 | pq.write_table(table, f'{AIRFLOW_HOME}/{file_suffix}.parquet') 73 | logging.info(f'parquet file written to {AIRFLOW_HOME}/{file_suffix}.parquet') 74 | 75 | 76 | 77 | default_args = { 78 | "owner": "airflow", 79 | "depends_on_past": False, 80 | "retries": 1, 81 | } 82 | 83 | 84 | 85 | with DAG( 86 | dag_id="fix_owm_schema_dag", 87 | schedule_interval="@hourly", 88 | default_args=default_args, 89 | start_date=datetime(2022, 6, 28), 90 | end_date=datetime(2022, 7, 16), 91 | catchup=True, 92 | max_active_runs=5, 93 | tags=['dtc-de', 'weather'], 94 | ) as dag: 95 | 96 | download_raw_from_gcs_task = PythonOperator( 97 | task_id=f"download_raw_record_from_gcs", 98 | python_callable=download_from_gcs, 99 | op_kwargs={ 100 | "bucket": BUCKET, 101 | "object_name": f"raw/owm/{LAT}_{LON}/{DATASET_FILE_SUFFIX}.json", 102 | "local_file_name": f"{AIRFLOW_HOME}/{DATASET_FILE_SUFFIX}.json", 103 | } 104 | ) 105 | 106 | extract_data_task = PythonOperator( 107 | task_id=f"extract_eia_series_data_task", 108 | python_callable=extract_weather_data, 109 | op_kwargs={ 110 | "file_suffix": DATASET_FILE_SUFFIX 111 | } 112 | ) 113 | 114 | local_extracted_to_gcs_task = PythonOperator( 115 | 
task_id=f'local_extracted_to_gcs_task', 116 | python_callable = upload_to_gcs, 117 | op_kwargs={ 118 | 'bucket': BUCKET, 119 | 'object_name': f"staged/live_weather/{LAT}_{LON}/{DATASET_FILE_SUFFIX}.parquet", 120 | 'local_file': f"{AIRFLOW_HOME}/{DATASET_FILE_SUFFIX}.parquet" 121 | } 122 | ) 123 | 124 | # delete all of the files downloaded to the worker 125 | cleanup_task = BashOperator( 126 | task_id=f"cleanup_task", 127 | bash_command=f'rm {AIRFLOW_HOME}/{DATASET_FILE_SUFFIX}.json {AIRFLOW_HOME}/{DATASET_FILE_SUFFIX}.parquet' 128 | ) 129 | 130 | load_to_bq_task = GCSToBigQueryOperator( 131 | task_id='load_to_bq_task', 132 | bucket=BUCKET, 133 | source_objects=f"staged/live_weather/{LAT}_{LON}/{DATASET_FILE_SUFFIX}.parquet", 134 | destination_project_dataset_table=f'{BIGQUERY_DATASET}.hourly_updated_weather', 135 | source_format='parquet', 136 | write_disposition='WRITE_APPEND', 137 | create_disposition='CREATE_IF_NEEDED', 138 | 139 | ) 140 | 141 | 142 | download_raw_from_gcs_task >> extract_data_task >> local_extracted_to_gcs_task >> load_to_bq_task >> cleanup_task -------------------------------------------------------------------------------- /02_airflow/dags/gcloud_helpers.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from google.cloud import storage 3 | 4 | 5 | 6 | def upload_to_gcs(bucket, object_name, local_file): 7 | """ 8 | Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 9 | :param bucket: GCS bucket name 10 | :param object_name: target path & file-name 11 | :param local_file: source path & file-name 12 | :return: 13 | """ 14 | # WORKAROUND to prevent timeout for files > 6 MB on 800 kbps upload speed. 15 | # (Ref: https://github.com/googleapis/python-storage/issues/74) 16 | storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024 # 5 MB 17 | storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024 # 5 MB 18 | # End of Workaround 19 | 20 | client = storage.Client() 21 | bucket = client.bucket(bucket) 22 | 23 | blob = bucket.blob(object_name) 24 | blob.upload_from_filename(local_file) 25 | 26 | 27 | def download_from_gcs(bucket, object_name, local_file_name): 28 | """ 29 | Ref: https://cloud.google.com/storage/docs/downloading-objects#storage-download-object-python 30 | :param bucket: GCS bucket name 31 | :param object_name: target path & file-name 32 | :param local_file_name: source path & file-name 33 | :return: 34 | """ 35 | client = storage.Client() 36 | bucket = client.bucket(bucket) 37 | blob = bucket.blob(object_name) 38 | blob.download_to_filename(local_file_name) 39 | 40 | 41 | def upload_multiple_files_to_gcs(bucket, object_names, local_files): 42 | """ 43 | Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 44 | :param bucket: GCS bucket name 45 | :param object_name: target path & file-name 46 | :param local_file: source path & file-name 47 | :return: 48 | """ 49 | # WORKAROUND to prevent timeout for files > 6 MB on 800 kbps upload speed. 
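    # Lowering the client's multipart threshold and chunk size to 5 MB (the two assignments below)
    # keeps each upload request small enough to finish before the default timeout on slow connections.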
50 | # (Ref: https://github.com/googleapis/python-storage/issues/74) 51 | storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024 # 5 MB 52 | storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024 # 5 MB 53 | 54 | if not type(object_names) == list and not type(local_files) == list: 55 | raise TypeError('object_names and local_files must be lists') 56 | if not len(object_names) == len(local_files): 57 | raise ValueError('object_names and local_files must be the same length') 58 | 59 | for remote, local in zip(object_names, local_files): 60 | upload_to_gcs(bucket, remote, local) 61 | logging.info(f'uploaded {local} to {bucket}/{remote}') 62 | -------------------------------------------------------------------------------- /02_airflow/dags/ingest_historical_weather_data_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | import pandas as pd 4 | import numpy as np 5 | 6 | from airflow import DAG 7 | from airflow.operators.bash import BashOperator 8 | from airflow.operators.python import PythonOperator 9 | from airflow.utils.task_group import TaskGroup 10 | from airflow.providers.google.cloud.operators.bigquery import BigQueryCreateExternalTableOperator, BigQueryInsertJobOperator 11 | 12 | from gcloud_helpers import upload_to_gcs 13 | 14 | PROJECT_ID = os.environ.get("GCP_PROJECT_ID") 15 | BUCKET = os.environ.get("GCP_GCS_BUCKET") 16 | AIRFLOW_HOME = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 17 | BIGQUERY_DATASET = 'energy_data' 18 | 19 | LOCAL_DATASET_FILE_SUFFIX= "{{ logical_date.strftime(\'%Y-%m-%d-%H\') }}" 20 | REMOTE_DATASET_FILE_SUFFIX = "{{ logical_date.strftime(\'%Y-%m-%d\') }}" 21 | YEAR = "{{ logical_date.strftime(\'%Y\') }}" 22 | 23 | # NOAA ISD Station IDs whose historical data will be downloaded by this DAG 24 | STATION_IDS = ['72565003017'] 25 | 26 | 27 | def extract_historical_weather_data(csv): 28 | """ 29 | Extract weather data from NOAA ISD csv and save the result locally in a parquet file 30 | """ 31 | 32 | station_data = pd.read_csv(csv) 33 | 34 | # temp and qc value are stored in the same field and must be separated. 
same for dew point 35 | station_data[['temperature_degC', 'temperature_QC']] = station_data['TMP'].str.split(',', expand=True) 36 | station_data[['dew_point_degC', 'dew_point_QC']] = station_data['DEW'].str.split(',', expand=True) 37 | 38 | station_data = (station_data 39 | .astype({'temperature_degC': float, 'dew_point_degC': float}) 40 | .assign(temperature_degC=lambda df_: (df_['temperature_degC'] / 10).replace(999.9, np.nan), 41 | dew_point_degC=lambda df_: (df_['dew_point_degC'] / 10).replace(999.9, np.nan), 42 | ) 43 | ) 44 | 45 | columns = ['STATION', 'NAME', 'DATE', 'temperature_degC', 'dew_point_degC', 'temperature_QC', 'dew_point_QC'] 46 | station_data[columns].to_parquet(f"{AIRFLOW_HOME}/{station_id}.parquet") 47 | 48 | 49 | default_args = { 50 | "owner": "airflow", 51 | "depends_on_past": False, 52 | "retries": 1, 53 | } 54 | 55 | 56 | with DAG( 57 | dag_id="historical_weather_dag", 58 | schedule_interval="@daily", 59 | default_args=default_args, 60 | start_date=datetime(2015, 1, 1), 61 | end_date=(datetime(2022, 7, 3)), 62 | catchup=True, 63 | max_active_runs=1, 64 | tags=['dtc-de', 'weather'], 65 | ) as dag: 66 | with TaskGroup(group_id='download_and_extract') as dl_and_extract_tg: 67 | for station_id in STATION_IDS: 68 | 69 | download_task = BashOperator( 70 | task_id=f"download_weather_{station_id}_task", 71 | bash_command=f'curl https://noaa-global-hourly-pds.s3.amazonaws.com/{YEAR}/{station_id}.csv -o {AIRFLOW_HOME}/{station_id}.csv' 72 | ) 73 | 74 | local_raw_to_gcs_task = PythonOperator( 75 | task_id=f"local_raw_to_gcs_{station_id}_task", 76 | python_callable=upload_to_gcs, 77 | op_kwargs={ 78 | "bucket": BUCKET, 79 | "object_name": f"raw/weather_station/{YEAR}/{station_id}.csv", 80 | "local_file": f"{AIRFLOW_HOME}/{station_id}.csv", 81 | } 82 | ) 83 | 84 | extract_data_task = PythonOperator( 85 | task_id=f"extract_weather_station_data_task_{station_id}", 86 | python_callable=extract_historical_weather_data, 87 | op_kwargs={ 88 | "csv": f"{AIRFLOW_HOME}/{station_id}.csv", 89 | } 90 | ) 91 | 92 | local_extracted_to_gcs_task = PythonOperator( 93 | task_id=f"local_extracted_to_gcs_{station_id}_task", 94 | python_callable=upload_to_gcs, 95 | op_kwargs={ 96 | "bucket": BUCKET, 97 | "object_name": f"staged/weather_station/{YEAR}/{station_id}.parquet", 98 | "local_file": f"{AIRFLOW_HOME}/{station_id}.parquet", 99 | } 100 | ) 101 | 102 | # delete all of the files downloaded to the worker 103 | cleanup_task = BashOperator( 104 | task_id=f"cleanup_{station_id}_task", 105 | bash_command=f'rm {AIRFLOW_HOME}/{station_id}.csv {AIRFLOW_HOME}/{station_id}.parquet' 106 | ) 107 | 108 | download_task >> local_raw_to_gcs_task >> extract_data_task >> local_extracted_to_gcs_task >> cleanup_task 109 | 110 | gcs_to_bq_ext_task = BigQueryCreateExternalTableOperator( 111 | task_id=f"gcs_to_bq_ext_weather_task", 112 | table_resource={ 113 | "tableReference": { 114 | "projectId": PROJECT_ID, 115 | "datasetId": BIGQUERY_DATASET, 116 | "tableId": f'{YEAR}_weather_station_external', 117 | }, 118 | "externalDataConfiguration": { 119 | "sourceFormat": "PARQUET", 120 | "sourceUris": [f"gs://{BUCKET}/staged/weather_station/{YEAR}/*"], 121 | }, 122 | }, 123 | ) 124 | 125 | CREATE_NATIVE_TABLE_QUERY = f"""CREATE OR REPLACE TABLE {BIGQUERY_DATASET}.{YEAR}_weather_station_native 126 | AS SELECT * FROM {BIGQUERY_DATASET}.{YEAR}_weather_station_external;""" 127 | 128 | create_native_bq_table_task = BigQueryInsertJobOperator( 129 | task_id=f"bq_ext_to_native_task", 130 | configuration={ 131 | "query": { 132 | 
"query": CREATE_NATIVE_TABLE_QUERY, 133 | "useLegacySql": False, 134 | } 135 | }, 136 | ) 137 | 138 | 139 | 140 | dl_and_extract_tg >> gcs_to_bq_ext_task >> create_native_bq_table_task 141 | -------------------------------------------------------------------------------- /02_airflow/dags/ingest_live_hourly_weather_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | import requests 5 | from datetime import datetime 6 | import pandas as pd 7 | import pyarrow as pa 8 | import pyarrow.parquet as pq 9 | 10 | from airflow import DAG 11 | from airflow.operators.bash import BashOperator 12 | from airflow.operators.python import PythonOperator 13 | from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator 14 | 15 | from gcloud_helpers import upload_to_gcs 16 | 17 | PROJECT_ID = os.environ.get("GCP_PROJECT_ID") 18 | BUCKET = os.environ.get("GCP_GCS_BUCKET") 19 | OWM_API_KEY = os.environ.get("OWM_API_KEY") 20 | AIRFLOW_HOME = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 21 | 22 | BIGQUERY_DATASET = 'energy_data' 23 | DATASET_FILE_SUFFIX = "{{ logical_date.strftime(\'%Y-%m-%d-%H\') }}" 24 | YEAR = "{{ execution_date.strftime(\'%Y\') }}" 25 | 26 | # lat lon of the location that weather data will be downladed from Open Weather Map 27 | # right now the DAG only downloads one location (DIA). This may need to be parameterized better later. 28 | LAT = 39.847 29 | LON = -104.656 30 | 31 | 32 | def download_current_weather_data(lat, lon, outfile): 33 | """ 34 | Request weather data for a location from the OWM API and store the result locally as a json file 35 | """ 36 | url = 'https://api.openweathermap.org/data/2.5/weather?' 37 | 38 | params = {'appid': OWM_API_KEY, 39 | 'lat': lat, 40 | 'lon': lon, 41 | 'units': 'metric'} 42 | 43 | 44 | logging.info("requesting data from OWM API") 45 | r = requests.get(url, params) 46 | 47 | if r.status_code == 200: 48 | logging.info(r.status_code) 49 | 50 | with open(outfile, 'w') as f: 51 | json.dump(r.json(), f) 52 | logging.info(f'file written to {outfile}') 53 | 54 | else: 55 | error_message = f'OpenWeatherMap API returned value {r.status_code}' 56 | raise ValueError(error_message) 57 | 58 | 59 | def extract_weather_data(file_suffix): 60 | """ 61 | Extract data from an OWM weather observation json file and store the results locally as a parquet file 62 | """ 63 | 64 | with open(f"{AIRFLOW_HOME}/{file_suffix}.json") as f: 65 | j = json.load(f) 66 | 67 | # extract metadata table from json 68 | df = (pd.DataFrame(j['main'], index=[0]) 69 | .assign(timestamp=pd.to_datetime(j['dt'], unit='s').tz_localize('UTC'), 70 | **j['coord'], 71 | temp=lambda df_: df_['temp'].astype(float), 72 | feels_like=lambda df_: df_['feels_like'].astype(float), 73 | temp_min=lambda df_: df_['temp_min'].astype(float), 74 | temp_max=lambda df_: df_['temp_max'].astype(float), 75 | pressure=lambda df_: df_['pressure'].astype(float), 76 | humidity=lambda df_: df_['humidity'].astype(float), 77 | 78 | ) 79 | ) 80 | logging.info(df.head()) 81 | logging.info(df.dtypes) 82 | fields = [('temp', pa.float64()), 83 | ('feels_like', pa.float64()), 84 | ('temp_min', pa.float64()), 85 | ('temp_max', pa.float64()), 86 | ('pressure', pa.float64()), 87 | ('humidity', pa.float64()), 88 | ('timestamp', pa.timestamp('s')), 89 | ('lon', pa.float64()), 90 | ('lat', pa.float64()) 91 | ] 92 | 93 | schema = pa.schema(fields) 94 | logging.info(schema) 95 | table = pa.Table.from_pandas(df, 
schema=schema) 96 | logging.info(table.schema) 97 | logging.info(table.to_pandas().head()) 98 | 99 | pq.write_table(table, f'{AIRFLOW_HOME}/{file_suffix}.parquet') 100 | logging.info(f'parquet file written to {AIRFLOW_HOME}/{file_suffix}.parquet') 101 | 102 | 103 | 104 | default_args = { 105 | "owner": "airflow", 106 | "depends_on_past": False, 107 | "retries": 1, 108 | } 109 | 110 | 111 | 112 | with DAG( 113 | dag_id="current_weather_owm_dag", 114 | schedule_interval="@hourly", 115 | default_args=default_args, 116 | start_date=datetime(2022, 4, 2), 117 | catchup=False, 118 | max_active_runs=1, 119 | tags=['dtc-de', 'weather'], 120 | ) as dag: 121 | 122 | download_dataset_task = PythonOperator( 123 | task_id=f"download_dataset_task", 124 | python_callable=download_current_weather_data, 125 | op_kwargs={ 126 | "lat": LAT, 127 | "lon": LON, 128 | "outfile": f"{AIRFLOW_HOME}/{DATASET_FILE_SUFFIX}.json" 129 | }, 130 | ) 131 | 132 | local_raw_to_gcs_task = PythonOperator( 133 | task_id=f"local_raw_to_gcs_task", 134 | python_callable=upload_to_gcs, 135 | op_kwargs={ 136 | "bucket": BUCKET, 137 | "object_name": f"raw/live_weather/{LAT}_{LON}/{DATASET_FILE_SUFFIX}.json", 138 | "local_file": f"{AIRFLOW_HOME}/{DATASET_FILE_SUFFIX}.json", 139 | } 140 | ) 141 | 142 | extract_data_task = PythonOperator( 143 | task_id=f"extract_eia_series_data_task", 144 | python_callable=extract_weather_data, 145 | op_kwargs={ 146 | "file_suffix": DATASET_FILE_SUFFIX 147 | } 148 | ) 149 | 150 | local_extracted_to_gcs_task = PythonOperator( 151 | task_id=f'local_extracted_to_gcs_task', 152 | python_callable = upload_to_gcs, 153 | op_kwargs={ 154 | 'bucket': BUCKET, 155 | 'object_name': f"staged/live_weather/{LAT}_{LON}/{DATASET_FILE_SUFFIX}.parquet", 156 | 'local_file': f"{AIRFLOW_HOME}/{DATASET_FILE_SUFFIX}.parquet" 157 | } 158 | ) 159 | 160 | # delete all of the files downloaded to the worker 161 | cleanup_task = BashOperator( 162 | task_id=f"cleanup_task", 163 | bash_command=f'rm {AIRFLOW_HOME}/{DATASET_FILE_SUFFIX}.json {AIRFLOW_HOME}/{DATASET_FILE_SUFFIX}.parquet' 164 | ) 165 | 166 | load_to_bq_task = GCSToBigQueryOperator( 167 | task_id='load_to_bq_task', 168 | bucket=BUCKET, 169 | source_objects=f"staged/live_weather/{LAT}_{LON}/{DATASET_FILE_SUFFIX}.parquet", 170 | destination_project_dataset_table=f'{BIGQUERY_DATASET}.hourly_updated_weather', 171 | source_format='parquet', 172 | write_disposition='WRITE_APPEND', 173 | create_disposition='CREATE_IF_NEEDED', 174 | 175 | ) 176 | 177 | 178 | 179 | download_dataset_task >> local_raw_to_gcs_task >> extract_data_task >> local_extracted_to_gcs_task >> load_to_bq_task >> cleanup_task -------------------------------------------------------------------------------- /02_airflow/dags/ingest_raw_electricity_data_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | import requests 5 | from datetime import datetime 6 | import pandas as pd 7 | 8 | import pyarrow as pa 9 | import pyarrow.parquet as pq 10 | 11 | from airflow import DAG 12 | from airflow.operators.bash import BashOperator 13 | from airflow.operators.python import PythonOperator 14 | from airflow.utils.task_group import TaskGroup 15 | from airflow.providers.google.cloud.operators.bigquery import BigQueryCreateExternalTableOperator, BigQueryInsertJobOperator 16 | 17 | from gcloud_helpers import upload_to_gcs, upload_multiple_files_to_gcs 18 | 19 | PROJECT_ID = os.environ.get("GCP_PROJECT_ID") 20 | BUCKET = 
os.environ.get("GCP_GCS_BUCKET") 21 | EIA_API_KEY = os.environ.get("EIA_API_KEY") 22 | AIRFLOW_HOME = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 23 | BIGQUERY_DATASET = 'energy_data' 24 | 25 | LOCAL_DATASET_FILE_SUFFIX= "{{ logical_date.strftime(\'%Y-%m-%d-%H\') }}" 26 | REMOTE_DATASET_FILE_SUFFIX = "{{ logical_date.strftime(\'%Y-%m-%d\') }}" 27 | 28 | # EIA series ID's that will be downloaded by this DAG 29 | SERIES_LIST = [ 30 | 'EBA.PSCO-ALL.D.H', # Electrical Demand Public Service Company of Colorado in UTC 31 | 'EBA.PSCO-ALL.DF.H' # Day-ahead demand forecast for Public Service Company of Colorado (PSCO), hourly - UTC time 32 | ] 33 | 34 | def download_energy_demand_json(series_id, outfile): 35 | """ 36 | Request data for an EIA series and save the result locally as a json file 37 | """ 38 | 39 | url = 'https://api.eia.gov/series/' 40 | params = {'api_key': EIA_API_KEY, 41 | 'series_id': series_id, 42 | } 43 | 44 | logging.info("requesting data from EIA API") 45 | r = requests.get(url, params) 46 | 47 | if r.status_code == 200: 48 | logging.info(r.status_code) 49 | with open(outfile, 'w') as f: 50 | json.dump(r.json(), f) 51 | logging.info(f'file written to {outfile}') 52 | 53 | else: 54 | error_message = f'EIA API returned value {r.status_code}' 55 | raise ValueError(error_message) 56 | 57 | 58 | def extract_energy_demand_data(series_id, local_file_name, local_file_suffix): 59 | """ 60 | Extract data and metadata of an EIA series json file and store the results locally as a parquet file 61 | """ 62 | 63 | with open(f"{AIRFLOW_HOME}/{local_file_name}") as f: 64 | j = json.load(f) 65 | 66 | # extract metadata table from json 67 | metadata = pd.DataFrame((j['series'][0])).loc[[0], :].drop('data', axis=1) 68 | metadata['series_id'] = metadata['series_id'].str.replace('.', '_') 69 | 70 | # extract data series 71 | data = j['series'][0]['data'] 72 | data_df = (pd.DataFrame(data, columns=['timestamp', 'value']) 73 | .assign(timestamp=lambda df_:pd.to_datetime(df_['timestamp']), 74 | series_id=series_id.replace('.', '_')) 75 | ) 76 | data_df.columns = data_df.columns.str.replace('.', '_') 77 | 78 | # the schema must be defined for the data_df to store the table as parquet. 79 | # Pandas/pyarrow can not automatically define the schema. The value field 80 | # is mostly integers, but does have missing values. 
Automatically defining 81 | # the schema will assign integer which cannot contain missing values 82 | fields = [ 83 | ('timestamp', pa.timestamp(unit='ns')), 84 | ('value', pa.float64()), 85 | ('series_id', pa.string()), 86 | ] 87 | schema = pa.schema(fields) 88 | table = pa.Table.from_pandas(data_df, schema=schema) 89 | 90 | pq.write_table(table, f'{AIRFLOW_HOME}/data_{local_file_suffix}.parquet') 91 | metadata.to_parquet(f'{AIRFLOW_HOME}/metadata_{local_file_suffix}.parquet') 92 | 93 | logging.info('files converted to parquet') 94 | 95 | 96 | 97 | default_args = { 98 | "owner": "airflow", 99 | "depends_on_past": False, 100 | "retries": 1, 101 | } 102 | 103 | with DAG( 104 | dag_id="raw_electricity_ingestion_dag", 105 | schedule_interval="@hourly", 106 | default_args=default_args, 107 | start_date=datetime(2022, 3, 25), 108 | catchup=True, 109 | max_active_runs=5, 110 | tags=['dtc-de', 'eia'], 111 | ) as dag: 112 | with TaskGroup(group_id='download_and_extract') as dl_and_extract_tg: 113 | for series_id in SERIES_LIST: 114 | local_file_suffix = f'{series_id}_{LOCAL_DATASET_FILE_SUFFIX}' 115 | remote_file_suffix= f'{series_id}_{REMOTE_DATASET_FILE_SUFFIX}' 116 | 117 | download_dataset_task = PythonOperator( 118 | task_id=f"download_{series_id}_dataset_task", 119 | python_callable=download_energy_demand_json, 120 | op_kwargs={ 121 | "series_id": series_id, 122 | "outfile": f"{AIRFLOW_HOME}/{local_file_suffix}.json" 123 | }, 124 | ) 125 | 126 | local_raw_to_gcs_task = PythonOperator( 127 | task_id=f"local_raw_to_gcs_{series_id}_task", 128 | python_callable=upload_to_gcs, 129 | op_kwargs={ 130 | "bucket": BUCKET, 131 | "object_name": f"raw/eia/{series_id}/{remote_file_suffix}.json", 132 | "local_file": f"{AIRFLOW_HOME}/{local_file_suffix}.json", 133 | } 134 | ) 135 | 136 | extract_data_task = PythonOperator( 137 | task_id=f"extract_eia_series_data_task_{series_id}", 138 | python_callable=extract_energy_demand_data, 139 | op_kwargs={ 140 | "series_id": series_id, 141 | "local_file_name": f"{local_file_suffix}.json", 142 | "local_file_suffix": local_file_suffix 143 | } 144 | ) 145 | 146 | local_extracted_to_gcs_task = PythonOperator( 147 | task_id=f'local_extracted_to_gcs_{series_id}_task', 148 | python_callable = upload_multiple_files_to_gcs, 149 | op_kwargs={ 150 | 'bucket': BUCKET, 151 | 'object_names': [f"staged/eia/data/{series_id}.parquet", 152 | f"staged/eia/metadata/{series_id}.parquet"], 153 | 'local_files': [f"{AIRFLOW_HOME}/data_{local_file_suffix}.parquet", 154 | f"{AIRFLOW_HOME}/metadata_{local_file_suffix}.parquet"] 155 | } 156 | ) 157 | 158 | # delete all of the files downloaded to the worker 159 | cleanup_task = BashOperator( 160 | task_id=f"cleanup_{series_id}_task", 161 | bash_command=f'rm {AIRFLOW_HOME}/{local_file_suffix}.json {AIRFLOW_HOME}/data_{local_file_suffix}.parquet {AIRFLOW_HOME}/metadata_{local_file_suffix}.parquet' 162 | ) 163 | 164 | download_dataset_task >> local_raw_to_gcs_task >> extract_data_task >> local_extracted_to_gcs_task >> cleanup_task 165 | 166 | with TaskGroup(group_id='load_to_bq') as load_to_bq_tg: 167 | bucket_subfolders = ['data', 'metadata'] 168 | external_table_names = ['demand_data_external', 'demand_metadata_external'] 169 | native_table_names = ['demand_data_native', 'demand_metadata_native'] 170 | 171 | for bucket_subfolder, external_table, native_table in zip(bucket_subfolders, 172 | external_table_names, 173 | native_table_names): 174 | 175 | gcs_to_bq_ext_task = BigQueryCreateExternalTableOperator( 176 | 
task_id=f"gcs_to_bq_ext_eia_series_{bucket_subfolder}_task", 177 | table_resource={ 178 | "tableReference": { 179 | "projectId": PROJECT_ID, 180 | "datasetId": BIGQUERY_DATASET, 181 | "tableId": external_table, 182 | }, 183 | "externalDataConfiguration": { 184 | "sourceFormat": "PARQUET", 185 | "sourceUris": [f"gs://{BUCKET}/staged/eia/{bucket_subfolder}/*"], 186 | }, 187 | }, 188 | ) 189 | 190 | CREATE_NATIVE_TABLE_QUERY = f"""CREATE OR REPLACE TABLE {BIGQUERY_DATASET}.{native_table} 191 | AS SELECT * FROM {BIGQUERY_DATASET}.{external_table};""" 192 | 193 | create_native_bq_table_task = BigQueryInsertJobOperator( 194 | task_id=f"bq_ext_to_native_{bucket_subfolder}_task", 195 | configuration={ 196 | "query": { 197 | "query": CREATE_NATIVE_TABLE_QUERY, 198 | "useLegacySql": False, 199 | } 200 | }, 201 | ) 202 | 203 | gcs_to_bq_ext_task >> create_native_bq_table_task 204 | 205 | 206 | dl_and_extract_tg >> load_to_bq_tg 207 | 208 | -------------------------------------------------------------------------------- /02_airflow/dags/ingest_weather_forecast_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | import logging 4 | import requests 5 | import pandas as pd 6 | import pyarrow as pa 7 | import pyarrow.parquet as pq 8 | import xml.etree.ElementTree as ET 9 | 10 | from airflow import DAG 11 | from airflow.operators.bash_operator import BashOperator 12 | from airflow.operators.python_operator import PythonOperator 13 | from airflow.providers.google.cloud.operators.bigquery import BigQueryCreateExternalTableOperator, BigQueryInsertJobOperator 14 | 15 | from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator 16 | from gcloud_helpers import upload_to_gcs, upload_multiple_files_to_gcs 17 | 18 | 19 | PROJECT_ID = os.environ.get("GCP_PROJECT_ID") 20 | BUCKET = os.environ.get("GCP_GCS_BUCKET") 21 | EIA_API_KEY = os.environ.get("EIA_API_KEY") 22 | AIRFLOW_HOME = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 23 | BIGQUERY_DATASET = 'energy_data' 24 | 25 | DATASET_FILE_SUFFIX= "{{ logical_date.strftime(\'%Y-%m-%d-%H\') }}" 26 | # REMOTE_DATASET_FILE_SUFFIX = "{{ logical_date.strftime(\'%Y-%m-%d\') }}" 27 | 28 | # lat lon of the location that weather data will be downladed from Open Weather Map 29 | # right now the DAG only downloads one location (DIA). This may need to be parameterized better later. 
30 | LAT = 39.847 31 | LON = -104.656 32 | 33 | 34 | def download_temperature_forecast(lat, lon, outfile): 35 | 36 | params = dict(lat=lat, 37 | lon=lon, 38 | product='time-series', 39 | Unit='e', 40 | temp='temp', 41 | ) 42 | 43 | r = requests.get(url='https://graphical.weather.gov/xml/sample_products/browser_interface/ndfdXMLclient.php', params=params) 44 | 45 | if r.status_code == 200: 46 | logging.info(r.status_code) 47 | # save the xml file 48 | with open(outfile, 'wb') as f: 49 | f.write(r.content) 50 | logging.info(f'file written to {outfile}') 51 | else: 52 | error_message = f'NWS API returned value {r.status_code}' 53 | raise ValueError(error_message) 54 | 55 | 56 | def extract_temperature_forecast(local_file_name): 57 | 58 | tree = ET.parse(local_file_name) 59 | root = tree.getroot() 60 | 61 | # extract forecast creation time 62 | for item in root.findall('.head/product/creation-date'): 63 | creation_time = item.text 64 | 65 | # extract location 66 | for item in root.findall('.data/location/point'): 67 | lat = item.attrib.get('latitude') 68 | lon = item.attrib.get('longitude') 69 | 70 | # extract forecast times 71 | fcst_times = list() 72 | for item in root.findall('.data/time-layout/start-valid-time'): 73 | fcst_times.append(item.text) 74 | 75 | # extract temperature values 76 | temps = list() 77 | for item in root.findall('.data/parameters/temperature/value'): 78 | temps.append(item.text) 79 | 80 | 81 | assert len(temps) == len(fcst_times) 82 | 83 | # create dataframe 84 | forecast_df = (pd.DataFrame(data = {'lon': lon, 85 | 'lat': lat, 86 | 'forecast_time': fcst_times, 87 | 'temp_F': temps, 88 | 'creation_time': creation_time}) 89 | # assign correct data types 90 | .assign(forecast_time=lambda df_: pd.to_datetime(df_['forecast_time']), 91 | lon=lambda df_: df_['lon'].astype(float), 92 | lat=lambda df_: df_['lat'].astype(float), 93 | temp_F=lambda df_: df_['temp_F'].astype(int), 94 | creation_time=lambda df_: pd.to_datetime(df_['creation_time']), 95 | 96 | 97 | ) 98 | ) 99 | 100 | fields = [('forecast_time', pa.timestamp('s')), 101 | ('lon', pa.float64()), 102 | ('lat', pa.float64()), 103 | ('temp_F', pa.int64()), 104 | ('creation_time', pa.timestamp('s')) 105 | ] 106 | schema = pa.schema(fields) 107 | table = pa.Table.from_pandas(forecast_df, schema=schema) 108 | pq.write_table(table, local_file_name.replace('.xml', '.parquet')) 109 | 110 | logging.info('files converted to parquet') 111 | 112 | 113 | default_args = { 114 | "owner": "airflow", 115 | "depends_on_past": False, 116 | "retries": 1, 117 | } 118 | 119 | with DAG( 120 | dag_id="ingest_weather_forecast_dag", 121 | schedule_interval="@daily", 122 | default_args=default_args, 123 | start_date=datetime(2022, 7, 1), 124 | catchup=True, 125 | max_active_runs=5, 126 | tags=['dtc-de', 'nws'] 127 | ) as dag: 128 | 129 | download_dataset_task = PythonOperator( 130 | task_id="download_dataset_task", 131 | python_callable=download_temperature_forecast, 132 | op_kwargs={ 133 | 'lat': LAT, 134 | 'lon': LON, 135 | 'outfile': f'{AIRFLOW_HOME}/weather_forecast_{DATASET_FILE_SUFFIX}.xml' 136 | }, 137 | ) 138 | 139 | 140 | local_raw_to_gcs_task = PythonOperator( 141 | task_id="local_raw_to_gcs_task", 142 | python_callable=upload_to_gcs, 143 | op_kwargs={ 144 | "bucket": BUCKET, 145 | "object_name": f"raw/weather_forecast/{LAT}_{LON}_{DATASET_FILE_SUFFIX}.xml", 146 | "local_file": f'{AIRFLOW_HOME}/weather_forecast_{DATASET_FILE_SUFFIX}.xml' 147 | } 148 | ) 149 | 150 | extract_data_task = PythonOperator( 151 | task_id='extract_data_task', 
152 | python_callable=extract_temperature_forecast, 153 | op_kwargs={ 154 | 'local_file_name': f'{AIRFLOW_HOME}/weather_forecast_{DATASET_FILE_SUFFIX}.xml' 155 | } 156 | ) 157 | 158 | local_extracted_to_gcs_task = PythonOperator( 159 | task_id="local_extracted_to_gcs_task", 160 | python_callable=upload_to_gcs, 161 | op_kwargs={ 162 | "bucket": BUCKET, 163 | "object_name": f"staged/weather_forecast/{LAT}_{LON}_{DATASET_FILE_SUFFIX}.parquet", 164 | "local_file": f'{AIRFLOW_HOME}/weather_forecast_{DATASET_FILE_SUFFIX}.parquet' 165 | } 166 | 167 | ) 168 | 169 | # delete the files downloaded locally 170 | cleanup_task = BashOperator( 171 | task_id="cleanup_task", 172 | bash_command = f'rm {AIRFLOW_HOME}/weather_forecast_{DATASET_FILE_SUFFIX}.xml && rm {AIRFLOW_HOME}/weather_forecast_{DATASET_FILE_SUFFIX}.parquet' 173 | ) 174 | 175 | upload_to_bigquery_task = GCSToBigQueryOperator( 176 | task_id="upload_to_bigquery_task", 177 | bucket=BUCKET, 178 | source_objects=[f"staged/weather_forecast/{LAT}_{LON}_{DATASET_FILE_SUFFIX}.parquet"], 179 | destination_project_dataset_table=f"{BIGQUERY_DATASET}.weather_forecast", 180 | source_format="PARQUET", 181 | write_disposition="WRITE_APPEND", 182 | create_disposition="CREATE_IF_NEEDED", 183 | ) 184 | 185 | 186 | (download_dataset_task >> 187 | local_raw_to_gcs_task >> 188 | extract_data_task >> 189 | local_extracted_to_gcs_task >> 190 | cleanup_task >> 191 | upload_to_bigquery_task 192 | ) -------------------------------------------------------------------------------- /02_airflow/docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.2.3 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 31 | # 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 33 | # Default: airflow 34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 35 | # Default: airflow 36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 
37 | # Default: '' 38 | # 39 | # Feel free to modify this file to suit your needs. 40 | --- 41 | version: '3' 42 | x-airflow-common: 43 | &airflow-common 44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 46 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 47 | # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.2.3} 48 | build: . 49 | environment: 50 | &airflow-common-env 51 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 52 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 53 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 54 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 55 | AIRFLOW__CORE__FERNET_KEY: '' 56 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 57 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 58 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' 59 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-mlflow scikit-learn sqlalchemy<1.4.0} 60 | GOOGLE_APPLICATION_CREDENTIALS: /.google/credentials/google_credentials.json 61 | AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT: 'google-cloud-platform://?extra__google_cloud_platform__key_path=/.google/credentials/google_credentials.json' 62 | GCP_PROJECT_ID: 'mlops-zoomcamp-354700' 63 | GCP_GCS_BUCKET: 'energy_project_bucket_mlops-zoomcamp-354700' 64 | EIA_API_KEY: '${EIA_KEY}' 65 | OWM_API_KEY: '${OWM_KEY}' 66 | 67 | volumes: 68 | - ./dags:/opt/airflow/dags 69 | - ./logs:/opt/airflow/logs 70 | - ./plugins:/opt/airflow/plugins 71 | - ~/.google/credentials/:/.google/credentials:ro 72 | 73 | user: "${AIRFLOW_UID:-50000}:0" 74 | depends_on: 75 | &airflow-common-depends-on 76 | redis: 77 | condition: service_healthy 78 | postgres: 79 | condition: service_healthy 80 | 81 | services: 82 | postgres: 83 | image: postgres:13 84 | environment: 85 | POSTGRES_USER: airflow 86 | POSTGRES_PASSWORD: airflow 87 | POSTGRES_DB: airflow 88 | volumes: 89 | - postgres-db-volume:/var/lib/postgresql/data 90 | healthcheck: 91 | test: ["CMD", "pg_isready", "-U", "airflow"] 92 | interval: 5s 93 | retries: 5 94 | restart: always 95 | 96 | redis: 97 | image: redis:latest 98 | expose: 99 | - 6379 100 | healthcheck: 101 | test: ["CMD", "redis-cli", "ping"] 102 | interval: 5s 103 | timeout: 30s 104 | retries: 50 105 | restart: always 106 | 107 | airflow-webserver: 108 | <<: *airflow-common 109 | command: webserver 110 | ports: 111 | - 8080:8080 112 | healthcheck: 113 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 114 | interval: 10s 115 | timeout: 10s 116 | retries: 5 117 | restart: always 118 | depends_on: 119 | <<: *airflow-common-depends-on 120 | airflow-init: 121 | condition: service_completed_successfully 122 | 123 | airflow-scheduler: 124 | <<: *airflow-common 125 | command: scheduler 126 | healthcheck: 127 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] 128 | interval: 10s 129 | timeout: 10s 130 | retries: 5 131 | restart: always 132 | depends_on: 133 | <<: *airflow-common-depends-on 134 | airflow-init: 135 | condition: service_completed_successfully 136 | 137 | airflow-worker: 138 | <<: *airflow-common 139 | command: celery worker 140 | healthcheck: 141 | test: 142 | - "CMD-SHELL" 143 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 144 | interval: 10s 145 
| timeout: 10s 146 | retries: 5 147 | environment: 148 | <<: *airflow-common-env 149 | # Required to handle warm shutdown of the celery workers properly 150 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 151 | DUMB_INIT_SETSID: "0" 152 | restart: always 153 | depends_on: 154 | <<: *airflow-common-depends-on 155 | airflow-init: 156 | condition: service_completed_successfully 157 | 158 | airflow-triggerer: 159 | <<: *airflow-common 160 | command: triggerer 161 | healthcheck: 162 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 163 | interval: 10s 164 | timeout: 10s 165 | retries: 5 166 | restart: always 167 | depends_on: 168 | <<: *airflow-common-depends-on 169 | airflow-init: 170 | condition: service_completed_successfully 171 | 172 | airflow-init: 173 | <<: *airflow-common 174 | entrypoint: /bin/bash 175 | # yamllint disable rule:line-length 176 | command: 177 | - -c 178 | - | 179 | function ver() { 180 | printf "%04d%04d%04d%04d" $${1//./ } 181 | } 182 | airflow_version=$$(gosu airflow airflow version) 183 | airflow_version_comparable=$$(ver $${airflow_version}) 184 | min_airflow_version=2.2.0 185 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 186 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 187 | echo 188 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 189 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 190 | echo 191 | exit 1 192 | fi 193 | if [[ -z "${AIRFLOW_UID}" ]]; then 194 | echo 195 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 196 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 197 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 198 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 199 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user" 200 | echo 201 | fi 202 | one_meg=1048576 203 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 204 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 205 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 206 | warning_resources="false" 207 | if (( mem_available < 4000 )) ; then 208 | echo 209 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 210 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 211 | echo 212 | warning_resources="true" 213 | fi 214 | if (( cpus_available < 2 )); then 215 | echo 216 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 217 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 218 | echo 219 | warning_resources="true" 220 | fi 221 | if (( disk_available < one_meg * 10 )); then 222 | echo 223 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 224 | echo "At least 10 GBs recommended. 
You have $$(numfmt --to iec $$((disk_available * 1024 )))" 225 | echo 226 | warning_resources="true" 227 | fi 228 | if [[ $${warning_resources} == "true" ]]; then 229 | echo 230 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 231 | echo "Please follow the instructions to increase amount of resources available:" 232 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin" 233 | echo 234 | fi 235 | mkdir -p /sources/logs /sources/dags /sources/plugins 236 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 237 | exec /entrypoint airflow version 238 | # yamllint enable rule:line-length 239 | environment: 240 | <<: *airflow-common-env 241 | _AIRFLOW_DB_UPGRADE: 'true' 242 | _AIRFLOW_WWW_USER_CREATE: 'true' 243 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 244 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 245 | user: "0:0" 246 | volumes: 247 | - .:/sources 248 | 249 | airflow-cli: 250 | <<: *airflow-common 251 | profiles: 252 | - debug 253 | environment: 254 | <<: *airflow-common-env 255 | CONNECTION_CHECK_MAX_COUNT: "0" 256 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 257 | command: 258 | - bash 259 | - -c 260 | - airflow 261 | 262 | flower: 263 | <<: *airflow-common 264 | command: celery flower 265 | ports: 266 | - 5555:5555 267 | healthcheck: 268 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 269 | interval: 10s 270 | timeout: 10s 271 | retries: 5 272 | restart: always 273 | depends_on: 274 | <<: *airflow-common-depends-on 275 | airflow-init: 276 | condition: service_completed_successfully 277 | 278 | volumes: 279 | postgres-db-volume: 280 | -------------------------------------------------------------------------------- /02_airflow/readme.md: -------------------------------------------------------------------------------- 1 | # Batch Processing With Airflow 2 | 3 | I used Apache Airflow to create and orchestrate the pipeline that extracts and loads the data to the data lakes and the data warehouse and to batch deploy model predictions. 
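Each ingestion DAG linked below follows the same basic shape: download from the source API to the worker, upload the raw file to GCS, extract and convert it to parquet, upload the staged file to GCS, clean up the worker, and load the staged data into BigQuery. A minimal sketch of that shared pattern is shown here; the task ids mirror the real DAGs, but the callables are placeholder stand-ins, not the actual implementations in `dags/`.

```python
# Minimal sketch of the shared ingestion pattern; the callables are placeholders.
# See the DAG files linked below for the real download/upload/extract functions.
from datetime import datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator

with DAG(
    dag_id="example_ingestion_dag",  # stand-in name
    schedule_interval="@hourly",
    start_date=datetime(2022, 3, 25),
    catchup=True,
) as dag:
    download = PythonOperator(task_id="download_dataset_task", python_callable=lambda: None)
    raw_to_gcs = PythonOperator(task_id="local_raw_to_gcs_task", python_callable=lambda: None)
    extract = PythonOperator(task_id="extract_data_task", python_callable=lambda: None)
    staged_to_gcs = PythonOperator(task_id="local_extracted_to_gcs_task", python_callable=lambda: None)
    cleanup = BashOperator(task_id="cleanup_task", bash_command="echo cleanup")

    # the GCS -> BigQuery load step (external/native tables or GCSToBigQueryOperator) comes last
    download >> raw_to_gcs >> extract >> staged_to_gcs >> cleanup
```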
4 | 
5 | 
6 | 
7 | ## Ingest Historical Weather Data DAG
8 | [dags/ingest_historical_weather_data_dag.py](./dags/ingest_historical_weather_data_dag.py)
9 | 
10 | ![](../img/noaa_dag.PNG)
11 | 
12 | ## Ingest Live Hourly Weather DAG
13 | [dags/ingest_live_hourly_weather_dag.py](./dags/ingest_live_hourly_weather_dag.py)
14 | 
15 | ![](../img/owm_dag.PNG)
16 | 
17 | ## Ingest Raw Electricity DAG
18 | [dags/ingest_raw_electricity_data_dag.py](./dags/ingest_raw_electricity_data_dag.py)
19 | 
20 | ![](../img/eia_dag.PNG)
21 | 
22 | ## Ingest Weather Forecast DAG
23 | [dags/ingest_weather_forecast_dag.py](./dags/ingest_weather_forecast_dag.py)
24 | 
25 | ![](../img/weather_forecast_dag.PNG)
26 | 
27 | ## Batch Deploy Model Predictions DAG
28 | [dags/batch_predict_dag.py](./dags/batch_predict_dag.py)
29 | 
30 | ![](../img/batch_predict_dag.PNG)
31 | 
--------------------------------------------------------------------------------
/02_airflow/requirements.txt:
--------------------------------------------------------------------------------
1 | apache-airflow-providers-google
2 | mlflow
3 | pyarrow
4 | scikit-learn
--------------------------------------------------------------------------------
/03_dbt/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | target/
3 | dbt_packages/
4 | logs/
5 | 
--------------------------------------------------------------------------------
/03_dbt/README.md:
--------------------------------------------------------------------------------
1 | # dbt Transformations in the Warehouse
2 | 
3 | I intentionally left the tables in the data warehouse unpartitioned and unclustered. For tables under about 1 GB, partitioning and clustering add more overhead than benefit, and my fact tables are all below that size.
4 | 
5 | Look inside the [models directory](./models) to see the transformations I applied with dbt.
6 | 
7 | ## dbt transformation model for temperature data
8 | [models/core/recorded_temperature.sql](./models/core/recorded_temperature.sql)
9 | 
10 | ![](../img/dbt_temp.PNG)
11 | 
12 | ## dbt transformation for actual energy demand
13 | [models/core/fact_eia_demand_historical.sql](./models/core/fact_eia_demand_historical.sql)
14 | 
15 | ![](../img/dbt_demand.PNG)
16 | 
17 | ## dbt transformation for forecasted energy demand
18 | [models/core/fact_eia_demand_forecast.sql](./models/core/fact_eia_demand_forecast.sql)
19 | 
20 | ![](../img/dbt_demand_forecast.PNG)
21 | 
22 | ## dbt transformation for model monitoring
23 | [models/core/ml_model_metrics.sql](./models/core/ml_model_metrics.sql)
24 | 
25 | Because predictions from the batch-deployed models are stored in the data warehouse, and the actual hourly data is loaded and transformed the following day, I created a SQL view with dbt that calculates model metrics on the previous day's predictions.
26 | 
27 | ![](../img/dbt_monitoring.PNG)
--------------------------------------------------------------------------------
/03_dbt/analysis/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/03_dbt/analysis/.gitkeep
--------------------------------------------------------------------------------
/03_dbt/dbt_project.yml:
--------------------------------------------------------------------------------
1 | 
2 | # Name your project! Project names should contain only lowercase characters
3 | # and underscores.
A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'energy_data' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'default' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `source-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | target-path: "target" # directory which will store compiled SQL files 23 | clean-targets: # directories to be removed by `dbt clean` 24 | - "target" 25 | - "dbt_packages" 26 | 27 | 28 | # Configuring models 29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 30 | 31 | # In this example config, we tell dbt to build all models in the example/ directory 32 | # as tables. These settings can be overridden in the individual model files 33 | # using the `{{ config(...) }}` macro. 34 | models: 35 | energy_data: 36 | # Applies to all files under models/example/ 37 | example: 38 | materialized: view 39 | -------------------------------------------------------------------------------- /03_dbt/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/03_dbt/macros/.gitkeep -------------------------------------------------------------------------------- /03_dbt/models/core/fact_eia_demand_forecast.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | select 4 | 5 | d.series_id, 6 | d.timestamp as timestamp_UTC, 7 | DATETIME(d.timestamp, 'America/Denver') as timestamp_MTN, 8 | d.value, 9 | md.units, 10 | md.name, 11 | md.updated 12 | 13 | from {{ source('staging', 'demand_data_native') }} d 14 | join {{ source('staging', 'demand_metadata_native') }} md 15 | on d.series_id = md.series_id 16 | 17 | where d.series_id = 'EBA_PSCO-ALL_DF_H' 18 | order by d.timestamp 19 | -- dbt build --m --var 'is_test_run: false' 20 | {% if var('is_test_run', default=true) %} 21 | 22 | limit 100 23 | 24 | {% endif %} -------------------------------------------------------------------------------- /03_dbt/models/core/fact_eia_demand_historical.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | select 4 | 5 | d.series_id, 6 | d.timestamp as timestamp_UTC, 7 | DATETIME(d.timestamp, 'America/Denver') as timestamp_MTN, 8 | d.value, 9 | md.units, 10 | md.name, 11 | md.updated 12 | 13 | from {{ source('staging', 'demand_data_native') }} d 14 | join {{ source('staging', 'demand_metadata_native') }} md 15 | on d.series_id = md.series_id 16 | 17 | where d.series_id = 'EBA_PSCO-ALL_D_H' 18 | order by d.timestamp 19 | -- dbt build --m --var 'is_test_run: false' 20 | {% if var('is_test_run', default=true) %} 21 | 22 | limit 100 23 | 24 | {% endif %} -------------------------------------------------------------------------------- /03_dbt/models/core/ml_model_metrics.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | with 3 | 
forecasts as ( 4 | select 5 | DATETIME(energy_timestamp_mtn, "America/Denver") as energy_timestamp_mtn, 6 | predicted_energy_demand, 7 | temp_f_forecast, 8 | model_version, 9 | DATETIME(prediction_start_date, "America/Denver") as prediction_start_date, 10 | prediction_creation_date, 11 | DATETIME_DIFF(DATETIME(energy_timestamp_mtn, "America/Denver"), DATETIME(prediction_start_date, "America/Denver"), HOUR) as hours_from_pred_start 12 | 13 | from {{ source('staging', 'energy_demand_forecasts' ) }} 14 | ), 15 | 16 | actuals as ( 17 | select * 18 | from {{ ref('joined_temp_and_demand') }} 19 | WHERE energy_timestamp_mtn >= (select min(energy_timestamp_mtn) from forecasts) 20 | ) 21 | 22 | 23 | select a.energy_timestamp_mtn, 24 | predicted_energy_demand, 25 | temp_f_forecast, 26 | model_version, 27 | prediction_start_date, 28 | prediction_creation_date, 29 | hours_from_pred_start, 30 | temp_timestamp_mtn, 31 | energy_demand, 32 | demand_units, 33 | series_name, 34 | temp_F, 35 | date_energy_updated, 36 | 37 | energy_demand - predicted_energy_demand as energy_error, 38 | temp_F - temp_f_forecast as temp_f_error, 39 | abs(energy_demand - predicted_energy_demand) as energy_error_abs, 40 | abs(temp_F - temp_f_forecast) as temp_f_error_abs, 41 | (energy_demand - predicted_energy_demand) / energy_demand as energy_error_pct, 42 | (temp_F - temp_f_forecast) / temp_F as temp_f_error_pct, 43 | abs((energy_demand - predicted_energy_demand) / energy_demand) as energy_error_abs_pct, 44 | abs((temp_F - temp_f_forecast) / temp_F) as temp_f_error_abs_pct 45 | 46 | from forecasts f 47 | left join actuals a 48 | ON f.energy_timestamp_mtn = a.energy_timestamp_mtn 49 | 50 | 51 | -- dbt build --m --var 'is_test_run: false' 52 | {% if var('is_test_run', default=true) %} 53 | limit 100 54 | 55 | {% endif %} -------------------------------------------------------------------------------- /03_dbt/models/core/recorded_temperature.sql: -------------------------------------------------------------------------------- 1 | --https://discourse.getdbt.com/t/unioning-identically-structured-data-sources/921 2 | {{ config(materialized='table') }} 3 | 4 | {% set models = ['isd', 'owm'] %} 5 | 6 | {% for model in models %} 7 | select 8 | *, 9 | '{{ model }}' as source 10 | from {{ ref('cast_'~model~'_weather') }} 11 | {% if not loop.last -%} union all {%- endif %} 12 | {% endfor %} 13 | 14 | 15 | -- dbt build --m --var 'is_test_run: false' 16 | {% if var('is_test_run', default=true) %} 17 | 18 | limit 100 19 | 20 | {% endif %} -------------------------------------------------------------------------------- /03_dbt/models/core/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: staging 5 | database: mlops-zoomcamp-354700 6 | schema: energy_data 7 | 8 | tables: 9 | - name: demand_metadata_native 10 | - name: demand_data_native 11 | - name: hourly_updated_weather 12 | - name: 2015_weather_station_native 13 | - name: 2016_weather_station_native 14 | - name: 2017_weather_station_native 15 | - name: 2018_weather_station_native 16 | - name: 2019_weather_station_native 17 | - name: 2020_weather_station_native 18 | - name: 2021_weather_station_native 19 | - name: 2022_weather_station_native 20 | - name: energy_demand_forecasts 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /03_dbt/models/mart/joined_temp_and_demand.sql: 
-------------------------------------------------------------------------------- 1 | 2 | {{ config(materialized='view') }} 3 | 4 | SELECT 5 | temp.observation_time_MTN as temp_timestamp_mtn, 6 | energy.timestamp_MTN as energy_timestamp_mtn, 7 | energy.value as energy_demand, 8 | energy.units as demand_units, 9 | energy.name as series_name, 10 | temp_F, 11 | temp_C, 12 | lat as temp_location_lat, 13 | lon as temp_location_lon, 14 | source as temp_source, 15 | energy.updated as date_energy_updated 16 | 17 | FROM {{ ref('fact_eia_demand_historical') }} energy 18 | LEFT JOIN {{ ref('recorded_temperature') }} temp 19 | ON DATETIME_TRUNC(temp.observation_time_MTN, HOUR) = DATETIME_TRUNC(energy.timestamp_MTN, HOUR) 20 | -------------------------------------------------------------------------------- /03_dbt/models/staging/cast_isd_weather.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with isd as ( 4 | SELECT 5 | -- CAST( CONCAT(USAF, WBAN) AS int) as station_id, 6 | CONCAT(USAF, WBAN) as station_id, 7 | LAT, 8 | LON 9 | FROM {{ ref('isd_stations') }} 10 | ) 11 | 12 | 13 | select 14 | isd.LAT as lat, 15 | isd.LON as lon, 16 | obs.temperature_degC as temp_C, 17 | ROUND( obs.temperature_degC * (9/5) + 32, 1) as temp_F, 18 | DATETIME(timestamp(obs.DATE), 'UTC') as observation_time_UTC, 19 | DATETIME(timestamp(obs.DATE), 'America/Denver') as observation_time_MTN 20 | 21 | from {{ ref('union_weather_station') }} obs 22 | join isd 23 | on CAST(obs.STATION AS STRING) = isd.station_id 24 | 25 | WHERE obs.temperature_QC in ('1', '5', '9') AND 26 | obs.temperature_degC IS NOT NULL 27 | 28 | -- dbt build --m --var 'is_test_run: false' 29 | {% if var('is_test_run', default=true) %} 30 | 31 | limit 100 32 | 33 | {% endif %} -------------------------------------------------------------------------------- /03_dbt/models/staging/cast_owm_weather.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | select 4 | lat, 5 | lon, 6 | temp as temp_C, 7 | ROUND( temp * (9/5) + 32, 1) as temp_F, 8 | DATETIME(timestamp(timestamp), 'UTC') as observation_time_UTC, 9 | DATETIME(timestamp(timestamp), 'America/Denver') as observation_time_MTN 10 | 11 | from {{ source('staging', 'hourly_updated_weather') }} 12 | 13 | -- dbt build --m --var 'is_test_run: false' 14 | {% if var('is_test_run', default=true) %} 15 | 16 | limit 100 17 | 18 | {% endif %} -------------------------------------------------------------------------------- /03_dbt/models/staging/union_weather_station.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | -- https://discourse.getdbt.com/t/unioning-identically-structured-data-sources/921 4 | {% set years = ['2022', '2021', '2020', '2019', '2018', '2017', '2016', '2015'] %} 5 | 6 | {% for year in years %} 7 | select 8 | *, 9 | from {{ source('staging', year~'_weather_station_native' ) }} 10 | {% if not loop.last -%} union all {%- endif %} 11 | {% endfor %} 12 | 13 | 14 | -- dbt build --m --var 'is_test_run: false' 15 | {% if var('is_test_run', default=true) %} 16 | 17 | limit 100 18 | 19 | {% endif %} -------------------------------------------------------------------------------- /03_dbt/seeds/.gitkeep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/03_dbt/seeds/.gitkeep -------------------------------------------------------------------------------- /03_dbt/seeds/properties.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | seeds: 4 | - name: isd_stations 5 | config: 6 | column_types: 7 | USAF: string 8 | WBAN: string -------------------------------------------------------------------------------- /03_dbt/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/03_dbt/snapshots/.gitkeep -------------------------------------------------------------------------------- /03_dbt/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/03_dbt/tests/.gitkeep -------------------------------------------------------------------------------- /04_dashboard/.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | base="dark" -------------------------------------------------------------------------------- /04_dashboard/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import date, timedelta 3 | 4 | from info import info_text, note 5 | from google.oauth2 import service_account 6 | from google.cloud import bigquery 7 | 8 | import pandas as pd 9 | import pandas_gbq 10 | import plotly.express as px 11 | from plotly.subplots import make_subplots 12 | import streamlit as st 13 | 14 | # https://docs.streamlit.io/knowledge-base/tutorials/databases/bigquery#enable-the-bigquery-api 15 | # https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html 16 | 17 | st.set_page_config(layout="wide", initial_sidebar_state='expanded') 18 | 19 | # Create API client. 20 | credentials = service_account.Credentials.from_service_account_info( 21 | st.secrets["gcp_service_account"] 22 | ) 23 | client = bigquery.Client(credentials=credentials) 24 | 25 | PROJECT_ID = os.environ.get("GCP_PROJECT_ID", "mlops-zoomcamp-354700") 26 | BIGQUERY_DATASET = 'energy_data_prod' 27 | MODEL_VERSION = "49c833f911ae43488e67063f410b7b5e" 28 | 29 | 30 | # Perform query. 31 | # Uses st.experimental_memo to only rerun when the query changes or after 10 min. 32 | @st.experimental_memo(ttl=600) 33 | def run_query(query): 34 | # query_job = client.query(query) 35 | # rows_raw = query_job.result() 36 | # # Convert to list of dicts. Required for st.experimental_memo to hash the return value. 
37 | # rows = [dict(row) for row in rows_raw] 38 | return pd.read_gbq(query, project_id=PROJECT_ID, credentials=credentials) 39 | 40 | 41 | def plot_demand_time_series(eia_forecast_demand, prod_model_demand, actual_demand, weather_2022): 42 | 43 | fig = make_subplots(rows=2, cols=1, shared_xaxes=True) 44 | 45 | fig.add_traces( 46 | list(px.line( 47 | eia_forecast_demand, 48 | x='timestamp_MTN', 49 | y='value', 50 | title="Actual and Forecasted Electrical Demand, Xcel Energy, Colorado", 51 | labels={'value': 'EIA Forecasted Demand (megawatthours)'} 52 | ) 53 | .select_traces() 54 | ) 55 | ) 56 | 57 | fig.add_traces( 58 | list(px.line(actual_demand, x='timestamp_MTN', y='value', labels={'value': 'Actual_Demand (megawatthours)'}).select_traces()) 59 | ) 60 | 61 | fig.add_traces( 62 | list(px.line(prod_model_demand, x='timestamp_MTN', y='predicted_energy_demand', labels={'value': 'Predicted_Demand (megawatthours)'}).select_traces()) 63 | ) 64 | 65 | fig.add_trace( 66 | list(px.scatter(weather_2022, x='observation_time_MTN', y='temp_F').select_traces())[0], 67 | row=2, col=1 68 | ) 69 | 70 | fig['data'][1]['line']['color']='#ef476f' 71 | fig['data'][1]['line']['width']=5 72 | fig['data'][0]['line']['color']='#06d6a0' 73 | fig['data'][0]['line']['width']=2 74 | 75 | fig['data'][1]['showlegend']=True 76 | fig['data'][1]['name']='Actual Demand (MWh)' 77 | fig['data'][0]['showlegend']=True 78 | fig['data'][0]['name']='EIA Demand Forecast (MWh)' 79 | fig['data'][2]['name']='Modeled Demand (MWh)' 80 | fig['data'][2]['showlegend']=True 81 | 82 | fig['data'][3]['showlegend']=True 83 | fig['data'][3]['name']='Denver Airport Actual Temperature (F)' 84 | 85 | return fig 86 | 87 | def main(): 88 | TODAY = date.today() 89 | TOMORROW = TODAY + timedelta(2) 90 | WEEK_PRIOR = TODAY - timedelta(7) 91 | with st.form('date_picker'): 92 | start_date, end_date = st.date_input('Select a date range, then click "Update"', min_value=date(2015,7, 4), max_value=TOMORROW, value=(WEEK_PRIOR, TOMORROW)) 93 | submitted = st.form_submit_button("Update") 94 | 95 | 96 | actual_demand = run_query(f"""SELECT * 97 | FROM 98 | `{PROJECT_ID}.{BIGQUERY_DATASET}.fact_eia_demand_historical` 99 | WHERE 100 | date(timestamp_MTN) BETWEEN date('{start_date}') and date('{end_date}') 101 | ORDER BY timestamp_MTN""") 102 | 103 | eia_forecast_demand = run_query(f"""SELECT * 104 | FROM 105 | `{PROJECT_ID}.{BIGQUERY_DATASET}.fact_eia_demand_forecast` 106 | WHERE 107 | date(timestamp_MTN) BETWEEN date('{start_date}') and date('{end_date}') 108 | ORDER BY timestamp_MTN""") 109 | 110 | prod_model_demand = run_query(f"""SELECT 111 | DATETIME(energy_timestamp_mtn, "America/Denver") as timestamp_MTN, 112 | predicted_energy_demand, 113 | temp_f_forecast, 114 | model_version, 115 | 116 | FROM `{PROJECT_ID}.energy_data.energy_demand_forecasts` 117 | WHERE DATETIME_DIFF(DATETIME(energy_timestamp_mtn, "America/Denver"), DATETIME(prediction_start_date, "America/Denver"), HOUR) < 24 118 | and 119 | model_version = '{MODEL_VERSION}' 120 | and 121 | DATETIME(energy_timestamp_mtn, "America/Denver") BETWEEN date('{start_date}') and date('{end_date}') 122 | ORDER BY energy_timestamp_mtn""" 123 | ) 124 | 125 | 126 | 127 | weather_2022 = run_query(f"""SELECT * 128 | FROM 129 | `{PROJECT_ID}.{BIGQUERY_DATASET}.recorded_temperature` 130 | WHERE 131 | observation_time_MTN BETWEEN date('{start_date}') and date('{end_date}') 132 | ORDER BY observation_time_MTN""") 133 | 134 | fig = plot_demand_time_series(eia_forecast_demand, prod_model_demand, actual_demand, 
weather_2022) 135 | fig.update_layout(height=700) 136 | st.plotly_chart(fig, use_container_width=True) 137 | 138 | if __name__ == '__main__': 139 | 140 | st.title('⚡ Energy Demand and Temperature for Xcel Energy in CO ⚡') 141 | st.sidebar.write(info_text) 142 | # with st.sidebar.expander('Note on missing data between May 20 and May 30, 2022:'): 143 | # st.write(note) 144 | 145 | main() 146 | 147 | -------------------------------------------------------------------------------- /04_dashboard/info.py: -------------------------------------------------------------------------------- 1 | info_text = """ 2 | # Energy Demand Dashboard 3 | Michael Harty 4 | 5 | [Project Repo](https://github.com/mharty3/energy_data_capstone) 6 | 7 | I made this dashboard as part of my Data Engineering and MLOps Zoomcamp Capstone Project. 8 | 9 | It displays data from the data pipeline I created that extracts and transforms data from various sources including the EIA, NOAA, and Open Weather Map API. 10 | 11 | Actual energy demand and weather data is updated hourly, and the EIA energy demand forecast is updated each morning whenever the EIA releases their forecast for that day (usually around 8am MDT). 12 | 13 | 14 | """ 15 | 16 | 17 | note =""" 18 | Historical weather data prior to May 30 is being pulled from the NOAA Integrated Surface Database. 19 | It has hourly weather observation data dating back to 1901, however it is usually updated on a few days delay. When their database is updated with recent data, 20 | the missing data on the dashboard will be backfilled. 21 | 22 | From May 30 onward, weather data is being pulled from a different, live updating source and should be kept up to date. 23 | """ -------------------------------------------------------------------------------- /04_dashboard/readme.md: -------------------------------------------------------------------------------- 1 | [Link to live dashboard](https://share.streamlit.io/mharty3/energy_data_exploration/04_dashboard/app.py) 2 | 3 | 4 | ![](../img/dashboard.PNG) -------------------------------------------------------------------------------- /04_dashboard/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery 2 | streamlit 3 | pandas 4 | pandas_gbq 5 | plotly 6 | click==7.1.2 -------------------------------------------------------------------------------- /05_model_training/03_mlflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 28, 6 | "id": "4edd8f6c-3a6b-488e-b739-a8cedb8dcd9b", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "" 13 | ] 14 | }, 15 | "execution_count": 28, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "%reload_ext autoreload\n", 22 | "%autoreload 2\n", 23 | "\n", 24 | "from google.cloud import bigquery\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import calendar\n", 29 | "from sklearn.linear_model import LinearRegression, Ridge\n", 30 | "from datetime import datetime\n", 31 | "import numpy as np\n", 32 | "import calendar\n", 33 | "from tqdm.auto import tqdm\n", 34 | "import time\n", 35 | "from ts_diagnostics import cross_validation, performance_metrics\n", 36 | "from mlflow.models.signature import infer_signature\n", 37 | "\n", 38 | "# connect to mlflow server\n", 39 | "import mlflow\n", 40 | 
"TRACKING_SERVER_HOST = \"10.128.0.2:5000\"\n", 41 | "mlflow.set_tracking_uri(f\"http://{TRACKING_SERVER_HOST}\")\n", 42 | "mlflow.set_experiment(\"linear_regression_model\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 44, 48 | "id": "8b9cfa80-3f1d-4968-81a2-6991b4793f4a", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "def pull_training_data(data_start, data_end):\n", 53 | " q = \"SELECT * FROM `mlops-zoomcamp-354700.energy_data_prod.joined_temp_and_demand`\"\n", 54 | " df_raw = pd.read_gbq(q, project_id='mlops-zoomcamp-354700')\n", 55 | " return df_raw[df_raw['energy_timestamp_mtn'].between(data_start, data_end)]\n", 56 | " \n", 57 | "\n", 58 | "def trim_data(df, min_val, max_val):\n", 59 | " return df[df['energy_demand'].between(min_val, max_val)].set_index('energy_timestamp_mtn')\n", 60 | "\n", 61 | "\n", 62 | "def make_features(df, min_y_val, max_y_val):\n", 63 | " df_train = (trim_data(df, min_y_val, max_y_val)\n", 64 | " .reset_index()\n", 65 | " .dropna(subset=['energy_demand', 'temp_F'])\n", 66 | " .assign(\n", 67 | " year=lambda df_: df_['energy_timestamp_mtn'].dt.year,\n", 68 | " day_of_year=lambda df_: df_['energy_timestamp_mtn'].dt.day_of_year,\n", 69 | " hour=lambda df_: df_['energy_timestamp_mtn'].dt.hour,\n", 70 | " is_weekend=lambda df_: df_['energy_timestamp_mtn'].dt.day_of_week >= 5, # saturady day_of_week = 5, sunday = 6\n", 71 | " is_summer=lambda df_: df_['energy_timestamp_mtn'].dt.month.between(5, 9, inclusive='both'),\n", 72 | " month=lambda df_: df_['energy_timestamp_mtn'].dt.month,\n", 73 | " temp_F_squared=lambda df_: df_['temp_F'] * df_['temp_F'],\n", 74 | " hour_squared=lambda df_: df_['hour'] ** 2,\n", 75 | " hour_cubed=lambda df_: df_['hour'] ** 3,\n", 76 | " )\n", 77 | "\n", 78 | " .set_index('energy_timestamp_mtn') \n", 79 | " )\n", 80 | "\n", 81 | "\n", 82 | " for month in calendar.month_name[1:]:\n", 83 | " df_train[month] = pd.to_numeric(df_train.index.month_name == month)\n", 84 | " \n", 85 | " return df_train\n", 86 | "\n", 87 | " " 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 41, 93 | "id": "017c68d6-8824-4dce-a447-2afb0d573a62", 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# define the max and min dates to pull training data from BQ\n", 98 | "data_start = datetime(2015, 8, 1)\n", 99 | "data_end = datetime(2021, 6, 1)\n", 100 | "\n", 101 | "# define start and end of training data for the model\n", 102 | "train_start_date = datetime(2015, 8, 1)\n", 103 | "train_end_date = datetime(2021, 1, 1)\n", 104 | "\n", 105 | "min_y_val = 2_000\n", 106 | "max_y_val = 11_000\n", 107 | "\n", 108 | "# define the features to include in the model\n", 109 | "features_to_include = ['temp_F', \n", 110 | " 'year', \n", 111 | " 'day_of_year', \n", 112 | " 'hour', \n", 113 | " 'is_weekend', \n", 114 | " 'is_summer', \n", 115 | " 'month', \n", 116 | " 'temp_F_squared',\n", 117 | " 'hour_squared',\n", 118 | " 'hour_cubed',\n", 119 | " ]\n", 120 | "\n", 121 | "cv_horizon = '2 days'\n", 122 | "cv_initial = f'{4 * 365} days'\n", 123 | "cv_period = '55 days'\n", 124 | "\n", 125 | "df_raw = pull_training_data(data_start, data_end)\n", 126 | "df_train = make_features(df_raw, min_y_val, max_y_val)\n", 127 | "\n", 128 | "# filter \n", 129 | "time_filter = (df_train.index > train_start_date) & (df_train.index < train_end_date)\n", 130 | "X = df_train.loc[time_filter, features_to_include]\n", 131 | "y = df_train.loc[time_filter, 'energy_demand']\n", 132 | " " 133 | ] 134 | }, 135 | { 136 | 
"cell_type": "code", 137 | "execution_count": 43, 138 | "id": "c814c4fe-2af7-40cd-84e1-23e3c7b840fb", 139 | "metadata": { 140 | "tags": [] 141 | }, 142 | "outputs": [ 143 | { 144 | "name": "stderr", 145 | "output_type": "stream", 146 | "text": [ 147 | "2022/07/20 03:56:02 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/home/michael/miniconda3/envs/mlops2/lib/python3.9/site-packages/mlflow/models/signature.py:129: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n" 148 | ] 149 | }, 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "Making 10 forecasts with cutoffs between 2019-08-22 23:00:00 and 2020-12-29 23:00:00\n" 155 | ] 156 | }, 157 | { 158 | "name": "stderr", 159 | "output_type": "stream", 160 | "text": [ 161 | " 0%| | 0/10 [00:00`_ for more details.\"\n", 162 | " 10%|███████████████▌ | 1/10 [00:02<00:24, 2.73s/it]2022/07/20 03:56:09 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/home/michael/miniconda3/envs/mlops2/lib/python3.9/site-packages/mlflow/models/signature.py:129: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", 163 | " 20%|███████████████████████████████ | 2/10 [00:05<00:20, 2.56s/it]2022/07/20 03:56:11 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/home/michael/miniconda3/envs/mlops2/lib/python3.9/site-packages/mlflow/models/signature.py:129: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", 164 | " 30%|██████████████████████████████████████████████▌ | 3/10 [00:07<00:17, 2.50s/it]2022/07/20 03:56:14 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/home/michael/miniconda3/envs/mlops2/lib/python3.9/site-packages/mlflow/models/signature.py:129: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. 
If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", 165 | "2022/07/20 03:56:16 WARNING mlflow.models.model: Logging model metadata to the tracking server has failed, possibly due older server version. The model artifacts have been logged successfully under gs://mlflow-runs-mlops-zoomcamp-354700/2/303079bbc52646c1a92e0c765a45e581/artifacts. In addition to exporting model artifacts, MLflow clients 1.7.0 and above attempt to record model metadata to the tracking store. If logging to a mlflow server via REST, consider upgrading the server version to MLflow 1.7.0 or above.\n", 166 | " 40%|██████████████████████████████████████████████████████████████ | 4/10 [00:09<00:14, 2.40s/it]2022/07/20 03:56:16 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/home/michael/miniconda3/envs/mlops2/lib/python3.9/site-packages/mlflow/models/signature.py:129: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", 167 | "2022/07/20 03:56:18 WARNING mlflow.models.model: Logging model metadata to the tracking server has failed, possibly due older server version. The model artifacts have been logged successfully under gs://mlflow-runs-mlops-zoomcamp-354700/2/303079bbc52646c1a92e0c765a45e581/artifacts. In addition to exporting model artifacts, MLflow clients 1.7.0 and above attempt to record model metadata to the tracking store. If logging to a mlflow server via REST, consider upgrading the server version to MLflow 1.7.0 or above.\n", 168 | " 50%|█████████████████████████████████████████████████████████████████████████████▌ | 5/10 [00:12<00:12, 2.44s/it]2022/07/20 03:56:18 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/home/michael/miniconda3/envs/mlops2/lib/python3.9/site-packages/mlflow/models/signature.py:129: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", 169 | "2022/07/20 03:56:21 WARNING mlflow.models.model: Logging model metadata to the tracking server has failed, possibly due older server version. 
The model artifacts have been logged successfully under gs://mlflow-runs-mlops-zoomcamp-354700/2/303079bbc52646c1a92e0c765a45e581/artifacts. In addition to exporting model artifacts, MLflow clients 1.7.0 and above attempt to record model metadata to the tracking store. If logging to a mlflow server via REST, consider upgrading the server version to MLflow 1.7.0 or above.\n", 170 | " 60%|█████████████████████████████████████████████████████████████████████████████████████████████ | 6/10 [00:14<00:09, 2.46s/it]2022/07/20 03:56:21 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/home/michael/miniconda3/envs/mlops2/lib/python3.9/site-packages/mlflow/models/signature.py:129: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", 171 | "2022/07/20 03:56:23 WARNING mlflow.models.model: Logging model metadata to the tracking server has failed, possibly due older server version. The model artifacts have been logged successfully under gs://mlflow-runs-mlops-zoomcamp-354700/2/303079bbc52646c1a92e0c765a45e581/artifacts. In addition to exporting model artifacts, MLflow clients 1.7.0 and above attempt to record model metadata to the tracking store. If logging to a mlflow server via REST, consider upgrading the server version to MLflow 1.7.0 or above.\n", 172 | " 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 7/10 [00:17<00:07, 2.44s/it]2022/07/20 03:56:23 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/home/michael/miniconda3/envs/mlops2/lib/python3.9/site-packages/mlflow/models/signature.py:129: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", 173 | "2022/07/20 03:56:26 WARNING mlflow.models.model: Logging model metadata to the tracking server has failed, possibly due older server version. The model artifacts have been logged successfully under gs://mlflow-runs-mlops-zoomcamp-354700/2/303079bbc52646c1a92e0c765a45e581/artifacts. In addition to exporting model artifacts, MLflow clients 1.7.0 and above attempt to record model metadata to the tracking store. 
If logging to a mlflow server via REST, consider upgrading the server version to MLflow 1.7.0 or above.\n", 174 | " 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 8/10 [00:19<00:04, 2.46s/it]2022/07/20 03:56:26 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/home/michael/miniconda3/envs/mlops2/lib/python3.9/site-packages/mlflow/models/signature.py:129: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", 175 | "2022/07/20 03:56:28 WARNING mlflow.models.model: Logging model metadata to the tracking server has failed, possibly due older server version. The model artifacts have been logged successfully under gs://mlflow-runs-mlops-zoomcamp-354700/2/303079bbc52646c1a92e0c765a45e581/artifacts. In addition to exporting model artifacts, MLflow clients 1.7.0 and above attempt to record model metadata to the tracking store. If logging to a mlflow server via REST, consider upgrading the server version to MLflow 1.7.0 or above.\n", 176 | " 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 9/10 [00:22<00:02, 2.47s/it]2022/07/20 03:56:28 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/home/michael/miniconda3/envs/mlops2/lib/python3.9/site-packages/mlflow/models/signature.py:129: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", 177 | "2022/07/20 03:56:31 WARNING mlflow.models.model: Logging model metadata to the tracking server has failed, possibly due older server version. The model artifacts have been logged successfully under gs://mlflow-runs-mlops-zoomcamp-354700/2/303079bbc52646c1a92e0c765a45e581/artifacts. In addition to exporting model artifacts, MLflow clients 1.7.0 and above attempt to record model metadata to the tracking store. If logging to a mlflow server via REST, consider upgrading the server version to MLflow 1.7.0 or above.\n", 178 | "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:24<00:00, 2.47s/it]\n", 179 | "/home/michael/miniconda3/envs/mlops2/lib/python3.9/site-packages/mlflow/models/signature.py:129: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. 
If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\n", 180 | " inputs = _infer_schema(model_input)\n", 181 | "2022/07/20 03:56:33 WARNING mlflow.models.model: Logging model metadata to the tracking server has failed, possibly due older server version. The model artifacts have been logged successfully under gs://mlflow-runs-mlops-zoomcamp-354700/2/303079bbc52646c1a92e0c765a45e581/artifacts. In addition to exporting model artifacts, MLflow clients 1.7.0 and above attempt to record model metadata to the tracking store. If logging to a mlflow server via REST, consider upgrading the server version to MLflow 1.7.0 or above.\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "with mlflow.start_run():\n", 187 | " model = LinearRegression(fit_intercept=True)\n", 188 | "\n", 189 | " train_start = time.time()\n", 190 | " model.fit(X, y)\n", 191 | " train_end = time.time()\n", 192 | "\n", 193 | " cv_start = time.time()\n", 194 | " df_cv = cross_validation(model, \n", 195 | " X, \n", 196 | " y, \n", 197 | " cv_horizon, \n", 198 | " cv_period, \n", 199 | " cv_initial)\n", 200 | " \n", 201 | " df_p = (performance_metrics(df_cv, rolling_window=1)\n", 202 | " # convert time_delta to seconds\n", 203 | " .assign(horizon=lambda df_: df_['horizon'].dt.total_seconds())\n", 204 | " )\n", 205 | " cv_end = time.time()\n", 206 | "\n", 207 | " params=dict()\n", 208 | " params['train_data_start_date'] = train_start_date\n", 209 | " params['train_data_end_date'] = train_end_date\n", 210 | " params['cv_initial'] = cv_initial\n", 211 | " params['cv_horizon'] = cv_horizon\n", 212 | " params['cv_period'] = cv_period\n", 213 | " params['features'] = features_to_include\n", 214 | " \n", 215 | " \n", 216 | " metrics = df_p.to_dict('records')[0]\n", 217 | " metrics['train_duration_minutes'] = (train_end - train_start) / 60\n", 218 | " metrics['cv_duration_minutes'] = (cv_end - cv_start) / 60\n", 219 | "\n", 220 | " mlflow.log_params(params)\n", 221 | " mlflow.log_metrics(metrics)\n", 222 | " \n", 223 | " signature = infer_signature(X, model.predict(X))\n", 224 | " mlflow.sklearn.log_model(model, artifact_path=\"models\", signature=signature)\n", 225 | " # mlflow.log_artifacts('ts_diagnostics.py')\n", 226 | " # mlflow.sklearn.log_model(model, 'model')" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 34, 232 | "id": "b49a6ad4-3f58-4ba1-8821-a42a74cbf740", 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "array([ -88.04979886, 137.40104012, 0.66634504, 61.96212739,\n", 239 | " -340.23553386, 51.6999897 , -4.07105147, 0.95781905])" 240 | ] 241 | }, 242 | "execution_count": 34, 243 | "metadata": {}, 244 | "output_type": "execute_result" 245 | } 246 | ], 247 | "source": [ 248 | "model.coef_" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 35, 254 | "id": "900aceda-b62e-46c0-9222-d6edacb1723e", 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "['temp_F', 'year', 'day_of_year', 'hour', 'is_weekend', 'is_summer', 'month', 'temp_F_squared']\n" 262 | ] 263 | 
} 264 | ], 265 | "source": [ 266 | "print(features_to_include)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "id": "727d9798-3cef-4eef-af65-ba0b0ad7d1a0", 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [] 276 | } 277 | ], 278 | "metadata": { 279 | "kernelspec": { 280 | "display_name": "Python 3 (ipykernel)", 281 | "language": "python", 282 | "name": "python3" 283 | }, 284 | "language_info": { 285 | "codemirror_mode": { 286 | "name": "ipython", 287 | "version": 3 288 | }, 289 | "file_extension": ".py", 290 | "mimetype": "text/x-python", 291 | "name": "python", 292 | "nbconvert_exporter": "python", 293 | "pygments_lexer": "ipython3", 294 | "version": "3.9.13" 295 | } 296 | }, 297 | "nbformat": 4, 298 | "nbformat_minor": 5 299 | } 300 | -------------------------------------------------------------------------------- /05_model_training/mlflow_docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3-slim 2 | 3 | WORKDIR /mlflow/ 4 | RUN apt-get update && pip install -U pip 5 | RUN apt-get install postgresql-client -y 6 | COPY requirements.txt . 7 | RUN pip install -r requirements.txt 8 | 9 | EXPOSE 5000 10 | 11 | CMD mlflow server --backend-store-uri postgresql://michael:admin@${DB_PRIVATE_IP}:5432/${DB_NAME} \ 12 | --default-artifact-root gs://${BUCKET_NAME}/mlruns \ 13 | --host 0.0.0.0 \ 14 | --port 5000 15 | -------------------------------------------------------------------------------- /05_model_training/mlflow_docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | mlflow: 5 | build: . 6 | ports: 7 | - 5000:5000 8 | volumes: 9 | - "./data:/mlflow" 10 | - ~/.google/credentials/:/.google/credentials 11 | environment: 12 | DB_USER: '${DB_USER}' 13 | DB_PASSWORD: '${DB_PASSWORD}' 14 | DB_PRIVATE_IP: '${DB_PRIVATE_IP}' 15 | DB_NAME: '${DB_NAME}' 16 | BUCKET_NAME: '${BUCKET_NAME}' 17 | GOOGLE_APPLICATION_CREDENTIALS: /.google/credentials/google_credentials.json 18 | -------------------------------------------------------------------------------- /05_model_training/mlflow_docker/requirements.txt: -------------------------------------------------------------------------------- 1 | mlflow 2 | google-cloud-storage 3 | psycopg2-binary 4 | 5 | -------------------------------------------------------------------------------- /05_model_training/readme.md: -------------------------------------------------------------------------------- 1 | # Model Training, Experiment Tracking, and Registration with MLFlow 2 | 3 | The purpose of this project was to build an MLOps framework for model development, deployment, and monitoring, not to develop the best model. So with that in mind, I did not spend as much time as I would have liked on perfecting the model. Hopefully I will have time to come back and improve the model in the future. 4 | 5 | ## Setting up the MLFlow Tracking Server 6 | 7 | Even though I am working on this project by myself, I wanted to go through the process of setting up the MLFlow tracking server and artifact storage as if I were on a team of data scientists and ML engineers. I modified this [excellent explanation](https://kargarisaac.github.io/blog/mlops/jupyter/2022/06/15/MLFlow-on-GCP.html) from Issac Kargar to dockerize and deploy the MLFlow on my GCP project. 
The tracking server is running on a virtual machine, the backend store is on a CloudSQL Postgres instance, and the artifact storage is in a GCS bucket. More information on the setup can be found in the [steps to recreate project](../steps_to_recreate_project) section of this repo. 8 | 9 | 10 | ## Model Experiment Tracking 11 | The experimentation for the modeling was very simple, but it serves to demonstrate the capabilities of MLFlow very well. I trained a linear model based on the recorded temperature data and some time series feature engineering. Then I recorded experiment runs with various features and interaction terms, and logged the model to MLFlow. The image below shows some of the model runs sorted by cross-validation RMSE. The model with the lowest RMSE was trained on the following features: 12 | 13 | ['temp_F', 'year', 'day_of_year', 'hour', 'is_weekend', 'is_summer', 'month', 'temp_F_squared', 'hour_squared', 'hour_cubed'] 14 | 15 | ![](../img/mlflow1.PNG) 16 | 17 | ## Model Experiment Comparison 18 | 19 | MLFlow has some very useful features for comparing experiments. I wasn't able to take advantage of them for this project, since I did not tune and optimize hyperparameters, but I can see how useful these interactive visualizations would be to understand how various combinations of parameters and hyperparameters affect model experiments. 20 | 21 | For example, see the interactive parallel coordinate plot below. 22 | 23 | ![](../img/mlflow2.PNG) 24 | 25 | ## Model Registration 26 | Based on the highlighted symbol in the "Model" column of the above table, you can see that the best model was logged to the MLFlow model registry so it can be batch deployed using Airflow. 27 | 28 | -------------------------------------------------------------------------------- /05_model_training/requirements.txt: -------------------------------------------------------------------------------- 1 | prophet 2 | jupyterlab 3 | mlflow 4 | google-cloud-BigQuery 5 | google-cloud-storage 6 | db-dtypes 7 | pandas-gbq 8 | seaborn 9 | scikit-learn==1.1 10 | probscale -------------------------------------------------------------------------------- /05_model_training/ts_diagnostics.py: -------------------------------------------------------------------------------- 1 | # rolling window diagnostics 2 | # modified from facebook prophet to be used with sklearn: 3 | # https://github.com/facebook/prophet/blob/main/python/prophet/diagnostics.py 4 | 5 | from tqdm.auto import tqdm 6 | from copy import deepcopy 7 | import concurrent.futures 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | 13 | def generate_cutoffs(X, horizon, initial, period): 14 | """Generate cutoff dates 15 | Parameters 16 | ---------- 17 | X: pd.DataFrame with historical data (datetime index). 18 | horizon: pd.Timedelta forecast horizon. 19 | initial: pd.Timedelta window of the initial forecast period. 20 | period: pd.Timedelta simulated forecasts are done with this period.
21 | Returns 22 | ------- 23 | list of pd.Timestamp 24 | """ 25 | # Last cutoff is 'latest date in data - horizon' date 26 | cutoff = X.index.max() - horizon 27 | if cutoff < X.index.min(): 28 | raise ValueError('Less data than horizon.') 29 | result = [cutoff] 30 | while result[-1] >= min(X.index) + initial: 31 | cutoff -= period 32 | # If data does not exist in data range (cutoff, cutoff + horizon] 33 | if not (((X.index > cutoff) & (X.index <= cutoff + horizon)).any()): 34 | # Next cutoff point is 'last date before cutoff in data - horizon' 35 | if cutoff > X.index.min(): 36 | closest_date = X[X.index <= cutoff].max().index 37 | cutoff = closest_date - horizon 38 | # else no data left, leave cutoff as is, it will be dropped. 39 | result.append(cutoff) 40 | result = result[:-1] 41 | if len(result) == 0: 42 | raise ValueError( 43 | 'Less data than horizon after initial window. ' 44 | 'Make horizon or initial shorter.' 45 | ) 46 | # logger.info('Making {} forecasts with cutoffs between {} and {}'.format( 47 | # len(result), result[-1], result[0] 48 | # )) 49 | print('Making {} forecasts with cutoffs between {} and {}'.format( 50 | len(result), result[-1], result[0] 51 | )) 52 | return list(reversed(result)) 53 | 54 | def cross_validation(model, X, y, horizon, period=None, initial=None, parallel=None, cutoffs=None, disable_tqdm=False): 55 | """Cross-Validation for time series. 56 | Computes forecasts from historical cutoff points, which user can input. 57 | If not provided, begins from (end - horizon) and works backwards, making 58 | cutoffs with a spacing of period until initial is reached. 59 | When period is equal to the time interval of the data, this is the 60 | technique described in https://robjhyndman.com/hyndsight/tscv/ . 61 | Parameters 62 | ---------- 63 | model: Prophet class object. Fitted Prophet model. 64 | horizon: string with pd.Timedelta compatible style, e.g., '5 days', 65 | '3 hours', '10 seconds'. 66 | period: string with pd.Timedelta compatible style. Simulated forecast will 67 | be done at every this period. If not provided, 0.5 * horizon is used. 68 | initial: string with pd.Timedelta compatible style. The first training 69 | period will include at least this much data. If not provided, 70 | 3 * horizon is used. 71 | cutoffs: list of pd.Timestamp specifying cutoffs to be used during 72 | cross validation. If not provided, they are generated as described 73 | above. 74 | parallel : {None, 'processes', 'threads', 'dask', object} 75 | disable_tqdm: if True it disables the progress bar that would otherwise show up when parallel=None 76 | How to parallelize the forecast computation. By default no parallelism 77 | is used. 78 | * None : No parallelism. 79 | * 'processes' : Parallelize with concurrent.futures.ProcessPoolExectuor. 80 | * 'threads' : Parallelize with concurrent.futures.ThreadPoolExecutor. 81 | Note that some operations currently hold Python's Global Interpreter 82 | Lock, so parallelizing with threads may be slower than training 83 | sequentially. 84 | * 'dask': Parallelize with Dask. 85 | This requires that a dask.distributed Client be created. 86 | * object : Any instance with a `.map` method. This method will 87 | be called with :func:`single_cutoff_forecast` and a sequence of 88 | iterables where each element is the tuple of arguments to pass to 89 | :func:`single_cutoff_forecast` 90 | .. 
code-block:: 91 | class MyBackend: 92 | def map(self, func, *iterables): 93 | results = [ 94 | func(*args) 95 | for args in zip(*iterables) 96 | ] 97 | return results 98 | Returns 99 | ------- 100 | A pd.DataFrame with the forecast, actual value and cutoff. 101 | """ 102 | 103 | horizon = pd.Timedelta(horizon) 104 | period = pd.Timedelta(period) 105 | initial = pd.Timedelta(initial) 106 | 107 | 108 | # Compute Cutoffs 109 | cutoffs = generate_cutoffs(X, horizon, initial, period) 110 | 111 | if parallel: 112 | valid = {"threads", "processes"} 113 | 114 | if parallel == "threads": 115 | pool = concurrent.futures.ThreadPoolExecutor() 116 | elif parallel == "processes": 117 | pool = concurrent.futures.ProcessPoolExecutor() 118 | 119 | else: 120 | msg = ("'parallel' should be one of {} for an instance with a " 121 | "'map' method".format(', '.join(valid))) 122 | raise ValueError(msg) 123 | 124 | iterables = ((df, model, cutoff, horizon, predict_columns) 125 | for cutoff in cutoffs) 126 | iterables = zip(*iterables) 127 | 128 | logger.info("Applying in parallel with %s", pool) 129 | predicts = pool.map(single_cutoff_forecast, *iterables) 130 | 131 | else: 132 | predicts = [ 133 | single_cutoff_forecast(model, X, y, cutoff, horizon) 134 | for cutoff in (tqdm(cutoffs) if not disable_tqdm else cutoffs) 135 | ] 136 | 137 | return pd.concat(predicts, axis=0).reset_index(drop=True) 138 | 139 | def single_cutoff_forecast(model, X, y, cutoff, horizon): 140 | """Forecast for single cutoff. Used in cross validation function 141 | when evaluating for multiple cutoffs either sequentially or in parallel . 142 | Parameters 143 | ---------- 144 | df: pd.DataFrame. 145 | DataFrame with history to be used for single 146 | cutoff forecast. 147 | model: Prophet model object. 148 | cutoff: pd.Timestamp cutoff date. 149 | Simulated Forecast will start from this date. 150 | horizon: pd.Timedelta forecast horizon. 151 | predict_columns: List of strings e.g. ['ds', 'yhat']. 152 | Columns with date and forecast to be returned in output. 153 | Returns 154 | ------- 155 | A pd.DataFrame with the forecast, actual value and cutoff. 156 | """ 157 | 158 | # Train model 159 | history_c = X[X.index <= cutoff] 160 | history_c_y = y[y.index <= cutoff] 161 | if history_c.shape[0] < 2: 162 | raise Exception( 163 | 'Less than two datapoints before cutoff. ' 164 | 'Increase initial window.') 165 | 166 | model.fit(history_c, history_c_y) 167 | 168 | # Calculate yhat 169 | index_predicted = (X.index > cutoff) & (X.index <= cutoff + horizon) 170 | # Get the columns for the future dataframe 171 | 172 | 173 | yhat = model.predict(X[index_predicted]) 174 | 175 | 176 | # Merge yhat(predicts), y(X, original data) and cutoff 177 | return pd.concat([ 178 | X[index_predicted], 179 | pd.DataFrame({'ds':X[index_predicted].index, 'y': y[index_predicted], 'yhat': yhat, 'cutoff': [cutoff] * len(yhat)}) 180 | ], axis=1) 181 | 182 | def performance_metrics(df, metrics=None, rolling_window=0.1, monthly=False): 183 | """Compute performance metrics from cross-validation results. 184 | Computes a suite of performance metrics on the output of cross-validation. 
185 | By default the following metrics are included: 186 | 'mse': mean squared error 187 | 'rmse': root mean squared error 188 | 'mae': mean absolute error 189 | 'mape': mean absolute percent error 190 | 'mdape': median absolute percent error 191 | 'smape': symmetric mean absolute percentage error 192 | 'coverage': coverage of the upper and lower intervals 193 | A subset of these can be specified by passing a list of names as the 194 | `metrics` argument. 195 | Metrics are calculated over a rolling window of cross validation 196 | predictions, after sorting by horizon. Averaging is first done within each 197 | value of horizon, and then across horizons as needed to reach the window 198 | size. The size of that window (number of simulated forecast points) is 199 | determined by the rolling_window argument, which specifies a proportion of 200 | simulated forecast points to include in each window. rolling_window=0 will 201 | compute it separately for each horizon. The default of rolling_window=0.1 202 | will use 10% of the rows in df in each window. rolling_window=1 will 203 | compute the metric across all simulated forecast points. The results are 204 | set to the right edge of the window. 205 | If rolling_window < 0, then metrics are computed at each datapoint with no 206 | averaging (i.e., 'mse' will actually be squared error with no mean). 207 | The output is a dataframe containing column 'horizon' along with columns 208 | for each of the metrics computed. 209 | Parameters 210 | ---------- 211 | df: The dataframe returned by cross_validation. 212 | metrics: A list of performance metrics to compute. If not provided, will 213 | use ['mse', 'rmse', 'mae', 'mape', 'mdape', 'smape', 'coverage']. 214 | rolling_window: Proportion of data to use in each rolling window for 215 | computing the metrics. Should be in [0, 1] to average. 216 | monthly: monthly=True will compute horizons as numbers of calendar months 217 | from the cutoff date, starting from 0 for the cutoff month. 
218 | Returns 219 | ------- 220 | Dataframe with a column for each metric, and column 'horizon' 221 | """ 222 | valid_metrics = ['mse', 'rmse', 'mae', 'mape', 'mdape', 'smape', 'coverage'] 223 | if metrics is None: 224 | metrics = valid_metrics 225 | if ('yhat_lower' not in df or 'yhat_upper' not in df) and ('coverage' in metrics): 226 | metrics.remove('coverage') 227 | if len(set(metrics)) != len(metrics): 228 | raise ValueError('Input metrics must be a list of unique values') 229 | if not set(metrics).issubset(set(valid_metrics)): 230 | raise ValueError( 231 | 'Valid values for metrics are: {}'.format(valid_metrics) 232 | ) 233 | df_m = df.copy() 234 | if monthly: 235 | df_m['horizon'] = df_m['ds'].dt.to_period('M').astype(int) - df_m['cutoff'].dt.to_period('M').astype(int) 236 | else: 237 | df_m['horizon'] = df_m['ds'] - df_m['cutoff'] 238 | df_m.sort_values('horizon', inplace=True) 239 | if 'mape' in metrics and df_m['y'].abs().min() < 1e-8: 240 | logger.info('Skipping MAPE because y close to 0') 241 | metrics.remove('mape') 242 | if len(metrics) == 0: 243 | return None 244 | w = int(rolling_window * df_m.shape[0]) 245 | if w >= 0: 246 | w = max(w, 1) 247 | w = min(w, df_m.shape[0]) 248 | # Compute all metrics 249 | dfs = {} 250 | for metric in metrics: 251 | dfs[metric] = eval(metric)(df_m, w) 252 | res = dfs[metrics[0]] 253 | for i in range(1, len(metrics)): 254 | res_m = dfs[metrics[i]] 255 | assert np.array_equal(res['horizon'].values, res_m['horizon'].values) 256 | res[metrics[i]] = res_m[metrics[i]] 257 | return res 258 | 259 | 260 | def rolling_mean_by_h(x, h, w, name): 261 | """Compute a rolling mean of x, after first aggregating by h. 262 | Right-aligned. Computes a single mean for each unique value of h. Each 263 | mean is over at least w samples. 264 | Parameters 265 | ---------- 266 | x: Array. 267 | h: Array of horizon for each value in x. 268 | w: Integer window size (number of elements). 269 | name: Name for metric in result dataframe 270 | Returns 271 | ------- 272 | Dataframe with columns horizon and name, the rolling mean of x. 273 | """ 274 | # Aggregate over h 275 | df = pd.DataFrame({'x': x, 'h': h}) 276 | df2 = ( 277 | df.groupby('h').agg(['sum', 'count']).reset_index().sort_values('h') 278 | ) 279 | xs = df2['x']['sum'].values 280 | ns = df2['x']['count'].values 281 | hs = df2.h.values 282 | 283 | trailing_i = len(df2) - 1 284 | x_sum = 0 285 | n_sum = 0 286 | # We don't know output size but it is bounded by len(df2) 287 | res_x = np.empty(len(df2)) 288 | 289 | # Start from the right and work backwards 290 | for i in range(len(df2) - 1, -1, -1): 291 | x_sum += xs[i] 292 | n_sum += ns[i] 293 | while n_sum >= w: 294 | # Include points from the previous horizon. All of them if still 295 | # less than w, otherwise weight the mean by the difference 296 | excess_n = n_sum - w 297 | excess_x = excess_n * xs[i] / ns[i] 298 | res_x[trailing_i] = (x_sum - excess_x)/ w 299 | x_sum -= xs[trailing_i] 300 | n_sum -= ns[trailing_i] 301 | trailing_i -= 1 302 | 303 | res_h = hs[(trailing_i + 1):] 304 | res_x = res_x[(trailing_i + 1):] 305 | 306 | return pd.DataFrame({'horizon': res_h, name: res_x}) 307 | 308 | 309 | 310 | def rolling_median_by_h(x, h, w, name): 311 | """Compute a rolling median of x, after first aggregating by h. 312 | Right-aligned. Computes a single median for each unique value of h. Each 313 | median is over at least w samples. 314 | For each h where there are fewer than w samples, we take samples from the previous h, 315 | moving backwards. 
(In other words, we ~ assume that the x's are shuffled within each h.) 316 | Parameters 317 | ---------- 318 | x: Array. 319 | h: Array of horizon for each value in x. 320 | w: Integer window size (number of elements). 321 | name: Name for metric in result dataframe 322 | Returns 323 | ------- 324 | Dataframe with columns horizon and name, the rolling median of x. 325 | """ 326 | # Aggregate over h 327 | df = pd.DataFrame({'x': x, 'h': h}) 328 | grouped = df.groupby('h') 329 | df2 = grouped.size().reset_index().sort_values('h') 330 | hs = df2['h'] 331 | 332 | res_h = [] 333 | res_x = [] 334 | # Start from the right and work backwards 335 | i = len(hs) - 1 336 | while i >= 0: 337 | h_i = hs[i] 338 | xs = grouped.get_group(h_i).x.tolist() 339 | 340 | # wrap in array so this works if h is pandas Series with custom index or numpy array 341 | next_idx_to_add = np.array(h == h_i).argmax() - 1 342 | while (len(xs) < w) and (next_idx_to_add >= 0): 343 | # Include points from the previous horizon. All of them if still 344 | # less than w, otherwise just enough to get to w. 345 | xs.append(x[next_idx_to_add]) 346 | next_idx_to_add -= 1 347 | if len(xs) < w: 348 | # Ran out of points before getting enough. 349 | break 350 | res_h.append(hs[i]) 351 | res_x.append(np.median(xs)) 352 | i -= 1 353 | res_h.reverse() 354 | res_x.reverse() 355 | return pd.DataFrame({'horizon': res_h, name: res_x}) 356 | 357 | 358 | # The functions below specify performance metrics for cross-validation results. 359 | # Each takes as input the output of cross_validation, and returns the statistic 360 | # as a dataframe, given a window size for rolling aggregation. 361 | 362 | 363 | def mse(df, w): 364 | """Mean squared error 365 | Parameters 366 | ---------- 367 | df: Cross-validation results dataframe. 368 | w: Aggregation window size. 369 | Returns 370 | ------- 371 | Dataframe with columns horizon and mse. 372 | """ 373 | se = (df['y'] - df['yhat']) ** 2 374 | if w < 0: 375 | return pd.DataFrame({'horizon': df['horizon'], 'mse': se}) 376 | return rolling_mean_by_h( 377 | x=se.values, h=df['horizon'].values, w=w, name='mse' 378 | ) 379 | 380 | 381 | def rmse(df, w): 382 | """Root mean squared error 383 | Parameters 384 | ---------- 385 | df: Cross-validation results dataframe. 386 | w: Aggregation window size. 387 | Returns 388 | ------- 389 | Dataframe with columns horizon and rmse. 390 | """ 391 | res = mse(df, w) 392 | res['mse'] = np.sqrt(res['mse']) 393 | res.rename({'mse': 'rmse'}, axis='columns', inplace=True) 394 | return res 395 | 396 | 397 | def mae(df, w): 398 | """Mean absolute error 399 | Parameters 400 | ---------- 401 | df: Cross-validation results dataframe. 402 | w: Aggregation window size. 403 | Returns 404 | ------- 405 | Dataframe with columns horizon and mae. 406 | """ 407 | ae = np.abs(df['y'] - df['yhat']) 408 | if w < 0: 409 | return pd.DataFrame({'horizon': df['horizon'], 'mae': ae}) 410 | return rolling_mean_by_h( 411 | x=ae.values, h=df['horizon'].values, w=w, name='mae' 412 | ) 413 | 414 | 415 | def mape(df, w): 416 | """Mean absolute percent error 417 | Parameters 418 | ---------- 419 | df: Cross-validation results dataframe. 420 | w: Aggregation window size. 421 | Returns 422 | ------- 423 | Dataframe with columns horizon and mape. 
424 | """ 425 | ape = np.abs((df['y'] - df['yhat']) / df['y']) 426 | if w < 0: 427 | return pd.DataFrame({'horizon': df['horizon'], 'mape': ape}) 428 | return rolling_mean_by_h( 429 | x=ape.values, h=df['horizon'].values, w=w, name='mape' 430 | ) 431 | 432 | 433 | def mdape(df, w): 434 | """Median absolute percent error 435 | Parameters 436 | ---------- 437 | df: Cross-validation results dataframe. 438 | w: Aggregation window size. 439 | Returns 440 | ------- 441 | Dataframe with columns horizon and mdape. 442 | """ 443 | ape = np.abs((df['y'] - df['yhat']) / df['y']) 444 | if w < 0: 445 | return pd.DataFrame({'horizon': df['horizon'], 'mdape': ape}) 446 | return rolling_median_by_h( 447 | x=ape.values, h=df['horizon'], w=w, name='mdape' 448 | ) 449 | 450 | 451 | def smape(df, w): 452 | """Symmetric mean absolute percentage error 453 | based on Chen and Yang (2004) formula 454 | Parameters 455 | ---------- 456 | df: Cross-validation results dataframe. 457 | w: Aggregation window size. 458 | Returns 459 | ------- 460 | Dataframe with columns horizon and smape. 461 | """ 462 | sape = np.abs(df['y'] - df['yhat']) / ((np.abs(df['y']) + np.abs(df['yhat'])) / 2) 463 | if w < 0: 464 | return pd.DataFrame({'horizon': df['horizon'], 'smape': sape}) 465 | return rolling_mean_by_h( 466 | x=sape.values, h=df['horizon'].values, w=w, name='smape' 467 | ) 468 | 469 | 470 | def coverage(df, w): 471 | """Coverage 472 | Parameters 473 | ---------- 474 | df: Cross-validation results dataframe. 475 | w: Aggregation window size. 476 | Returns 477 | ------- 478 | Dataframe with columns horizon and coverage. 479 | """ 480 | is_covered = (df['y'] >= df['yhat_lower']) & (df['y'] <= df['yhat_upper']) 481 | if w < 0: 482 | return pd.DataFrame({'horizon': df['horizon'], 'coverage': is_covered}) 483 | return rolling_mean_by_h( 484 | x=is_covered.values, h=df['horizon'].values, w=w, name='coverage' 485 | ) -------------------------------------------------------------------------------- /06_deployment/batch_predict.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from google.cloud import bigquery 3 | import pandas as pd 4 | import calendar 5 | import pytz 6 | import mlflow 7 | 8 | 9 | def pull_temp_forecast(start_date, tz='America/Denver'): 10 | q = f""" 11 | WITH forecast_pit as 12 | ( 13 | SELECT 14 | forecast_time, 15 | MAX(creation_time) creation_time 16 | 17 | FROM 18 | `mlops-zoomcamp-354700.energy_data.weather_forecast` 19 | WHERE 20 | creation_time <= TIMESTAMP( "{start_date}", "{tz}") 21 | AND forecast_time >= TIMESTAMP("{start_date}", "{tz}") 22 | GROUP BY 23 | forecast_time 24 | ) 25 | 26 | SELECT 27 | f.forecast_time, 28 | f.temp_f, 29 | f.creation_time 30 | FROM `mlops-zoomcamp-354700.energy_data.weather_forecast` f 31 | INNER JOIN forecast_pit 32 | ON forecast_pit.creation_time = f.creation_time 33 | AND forecast_pit.forecast_time = f.forecast_time 34 | 35 | ORDER BY forecast_time 36 | """ 37 | 38 | tz_info = pytz.timezone(tz) 39 | 40 | # interpolate temp_f forecast on hourly basis (previously it is only every 3 hours) 41 | df = (pd.read_gbq(q, project_id='mlops-zoomcamp-354700') 42 | .assign(temp_F=lambda df_: df_['temp_f'].astype(float)) 43 | .set_index('forecast_time') 44 | .loc[:, 'temp_F'] 45 | .resample('H') 46 | .interpolate('cubic') 47 | .reset_index() 48 | .assign(energy_timestamp_mtn=lambda df_: df_['forecast_time'].dt.tz_convert(tz_info)) 49 | .drop(columns=['forecast_time']) 50 | ) 51 | 52 | return df 53 | 54 | 55 | def 
make_features(df, features): 56 | df_out = (df 57 | .reset_index() 58 | .assign( 59 | year=lambda df_: df_['energy_timestamp_mtn'].dt.year, 60 | day_of_year=lambda df_: df_['energy_timestamp_mtn'].dt.day_of_year, 61 | hour=lambda df_: df_['energy_timestamp_mtn'].dt.hour, 62 | is_weekend=lambda df_: df_['energy_timestamp_mtn'].dt.day_of_week >= 5, # saturday day_of_week = 5, sunday = 6 63 | is_summer=lambda df_: df_['energy_timestamp_mtn'].dt.month.between(5, 9, inclusive='both'), 64 | month=lambda df_: df_['energy_timestamp_mtn'].dt.month, 65 | temp_F_squared=lambda df_: df_['temp_F'] * df_['temp_F'], 66 | hour_squared=lambda df_: df_['hour'] ** 2, 67 | hour_cubed=lambda df_: df_['hour'] ** 3, 68 | ) 69 | 70 | .set_index('energy_timestamp_mtn') 71 | ) 72 | 73 | 74 | for month in calendar.month_name[1:]: 75 | df_out[month] = pd.to_numeric(df_out.index.month_name() == month) # month_name must be called as a method 76 | 77 | return df_out[features] 78 | 79 | 80 | def load_model(run_id): 81 | logged_model = f'gs://mlflow-runs-mlops-zoomcamp-354700/2/{run_id}/artifacts/model' 82 | model = mlflow.pyfunc.load_model(logged_model) 83 | return model 84 | 85 | 86 | def save_results(df, y_pred, run_id, start_date, output_file): 87 | df_result = (pd.DataFrame() 88 | .assign(energy_timestamp_mtn=df.index, 89 | predicted_energy_demand=y_pred, 90 | temp_f_forecast=df['temp_F'].reset_index(drop=True), 91 | model_version=run_id, 92 | prediction_start_date=start_date, 93 | prediction_creation_date=datetime.now()) 94 | ) 95 | df_result.to_parquet(output_file, index=False) 96 | 97 | 98 | def apply_model(run_id, features, start_date, output_file, tz='America/Denver'): 99 | df = pull_temp_forecast(start_date, tz=tz) 100 | df = make_features(df, features=features) 101 | model = load_model(run_id) 102 | y_pred = model.predict(df) 103 | save_results(df, y_pred, run_id, start_date, output_file) 104 | return output_file 105 | 106 | 107 | if __name__ == '__main__': 108 | 109 | start_date = datetime(2022, 7, 15, 0, 0, 0) 110 | tz = "US/Mountain" 111 | run_id = '49c833f911ae43488e67063f410b7b5e' 112 | output_file = 'output.parquet' 113 | 114 | features = ['temp_F', 'year', 'day_of_year', 'hour', 'is_weekend', 115 | 'is_summer', 'month', 'temp_F_squared', 'hour_squared', 'hour_cubed'] 116 | 117 | apply_model(run_id, features, start_date, output_file, tz) 118 | 119 | result = pd.read_parquet(output_file) 120 | print(result) 121 | -------------------------------------------------------------------------------- /06_deployment/readme.md: -------------------------------------------------------------------------------- 1 | # Batch Model Deployment with Airflow 2 | 3 | To deploy the model that was trained and logged to the MLFlow model registry, I used the same Airflow instance that is orchestrating the data collection pipeline. 4 | 5 | The batch_predict script in this directory was modified into an [Airflow DAG](../02_airflow/dags/batch_predict_dag.py). Every night at midnight, the DAG pulls the model from MLFlow, pulls the necessary data from the data warehouse, runs the model, and loads the next set of predictions into the data warehouse.
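For reference, the overall shape of that job is roughly the following. This is a minimal sketch rather than the actual `batch_predict_dag.py`: it reuses the `apply_model` helper from the script above, and the DAG id and the BigQuery destination table name are illustrative assumptions.

```python
# Minimal sketch of the nightly batch-prediction job (not the real batch_predict_dag.py).
# Assumptions: apply_model() from the batch_predict module shown above is importable from
# the dags folder, and the destination table name is a placeholder.
from datetime import datetime

import pandas as pd
from airflow import DAG
from airflow.operators.python import PythonOperator

from batch_predict import apply_model

RUN_ID = '49c833f911ae43488e67063f410b7b5e'  # run id of the registered model in MLflow
FEATURES = ['temp_F', 'year', 'day_of_year', 'hour', 'is_weekend',
            'is_summer', 'month', 'temp_F_squared', 'hour_squared', 'hour_cubed']


def predict_and_load(ds, **_):
    """Score the hours after the run date and append the predictions to the warehouse."""
    output_file = f'/tmp/predictions_{ds}.parquet'
    apply_model(RUN_ID, FEATURES, start_date=ds, output_file=output_file)
    # Illustrative load step; the destination table is an assumption.
    pd.read_parquet(output_file).to_gbq('energy_data.ml_predictions',
                                        project_id='mlops-zoomcamp-354700',
                                        if_exists='append')


with DAG(
    dag_id='batch_predict_sketch',
    start_date=datetime(2022, 7, 15),
    schedule_interval='0 0 * * *',  # every night at midnight
    catchup=False,
) as dag:
    PythonOperator(task_id='predict_and_load', python_callable=predict_and_load)
```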
6 | 7 | -------------------------------------------------------------------------------- /07_monitoring/app.py: -------------------------------------------------------------------------------- 1 | from google.oauth2 import service_account 2 | from google.cloud import bigquery 3 | import probscale 4 | import numpy as np 5 | import pandas as pd 6 | import calendar 7 | import hvplot.pandas 8 | import holoviews as hv 9 | from datetime import date, timedelta 10 | import os 11 | import streamlit as st 12 | 13 | st.set_page_config(layout='wide') 14 | 15 | # Query the data from BigQuery. 16 | # Create API client. 17 | credentials = service_account.Credentials.from_service_account_info( 18 | st.secrets["gcp_service_account"] 19 | ) 20 | client = bigquery.Client(credentials=credentials) 21 | 22 | PROJECT_ID = os.environ.get("GCP_PROJECT_ID", "mlops-zoomcamp-354700") 23 | BIGQUERY_DATASET = 'energy_data_prod' 24 | 25 | # Perform query. 26 | # Uses st.experimental_memo to only rerun when the query changes or after 10 min. 27 | @st.experimental_memo(ttl=600) 28 | def run_query(query): 29 | # query_job = client.query(query) 30 | # rows_raw = query_job.result() 31 | # # Convert to list of dicts. Required for st.experimental_memo to hash the return value. 32 | # rows = [dict(row) for row in rows_raw] 33 | return pd.read_gbq(query, project_id=PROJECT_ID, credentials=credentials) 34 | 35 | 36 | # model selection 37 | q = f"""SELECT DISTINCT(model_version) 38 | FROM `mlops-zoomcamp-354700.energy_data_prod.ml_model_metrics` 39 | """ 40 | model_options = run_query(q)['model_version'].tolist() 41 | model_selection = st.sidebar.selectbox("Select Model", model_options) 42 | 43 | 44 | # date range 45 | TODAY = date.today() 46 | TOMORROW = TODAY + timedelta(2) 47 | TWO_WEEK_PRIOR = TODAY - timedelta(14) 48 | with st.sidebar.form('date_picker'): 49 | start_date, end_date = st.date_input('Select a date range, then click "Update"', min_value=date(2015,7, 4), max_value=TOMORROW, value=(TWO_WEEK_PRIOR, TOMORROW)) 50 | submitted = st.form_submit_button("Update") 51 | 52 | q = f"""SELECT * 53 | FROM `mlops-zoomcamp-354700.energy_data_prod.ml_model_metrics` 54 | WHERE hours_from_pred_start <= 24 and 55 | date(energy_timestamp_mtn) BETWEEN date('{start_date}') and date('{end_date}') 56 | """ 57 | metrics = run_query(q) 58 | 59 | st.title("Model Monitoring Dashboard") 60 | st.write(f"Analyzing predictions between {start_date.strftime('%Y-%m-%d')} and {end_date.strftime('%Y-%m-%d')}.\n Use the sidebar to select a different date range.") 61 | 62 | # display key metrics 63 | st.write('## Hourly Energy Demand Prediction Metrics') 64 | col1, col2, col3 = st.columns(3) 65 | col1.metric('Mean Error', f"{round(metrics['energy_error'].mean(), 2)} MWh") 66 | col2.metric('Mean Absolute Error', f"{round(metrics['energy_error_abs'].mean(), 2)} MWh") 67 | col3.metric('Mean Absolute Percentage Error', f"{round(metrics['energy_error_abs_pct'].mean() * 100, 2)}%") 68 | 69 | st.write('## Hourly Temperature Forecast Metrics') 70 | col1, col2, col3 = st.columns(3) 71 | col1.metric('Mean Error', f"{round(metrics['temp_f_error'].mean(), 2)} degF") 72 | col2.metric('Mean Absolute Error', f"{round(metrics['temp_f_error_abs'].mean(), 2)} degF") 73 | col3.metric('Mean Absolute Percentage Error', f"{round(metrics['temp_f_error_abs_pct'].mean() * 100, 2)}%") 74 | 75 | st.markdown("""---""") 76 | st.write('## Hourly Monitoring Plots') 77 | # plot actual vs predicted scatter plot 78 | st.write('### Actual vs Predicted') 79 | e = 
metrics.hvplot.scatter(x='energy_demand', y='predicted_energy_demand', label='Energy Demand (MWh)') 80 | t = metrics.hvplot.scatter(x='temp_F', y='temp_f_forecast', label='Temp (F)') 81 | 82 | st.bokeh_chart( 83 | hv.render(e + t) 84 | ) 85 | 86 | # plot actual vs predicted over time 87 | st.write('### Actual vs Predicted Over Time') 88 | actual = metrics.hvplot(x='energy_timestamp_mtn', y='energy_demand', label='actual') 89 | predicted = metrics.hvplot(x='energy_timestamp_mtn', y='predicted_energy_demand', label='predicted', title='Energy Demand (MWh) - Actual vs Predicted Over Time') 90 | e = actual * predicted 91 | 92 | actual = metrics.hvplot(x='energy_timestamp_mtn', y='temp_F', label='actual') 93 | predicted = metrics.hvplot(x='energy_timestamp_mtn', y='temp_f_forecast', label='predicted', title='Temp (F) - Actual vs Predicted Over Time') 94 | t = actual * predicted 95 | 96 | st.bokeh_chart( 97 | hv.render(e + t) 98 | ) 99 | 100 | # plot error over time 101 | st.write('### Error Over Time') 102 | actual = metrics.hvplot(x='energy_timestamp_mtn', y='energy_error', label='Energy Demand (MWh) Error (actual - predicted) Over Time') 103 | e = actual * hv.HLine(0).opts(color='gray', line_width=1) 104 | 105 | actual = metrics.hvplot(x='energy_timestamp_mtn', y='temp_f_error', label='Temp (F) Error (actual - predicted) Over Time') 106 | t = actual * hv.HLine(0).opts(color='gray', line_width=1) 107 | st.bokeh_chart( 108 | hv.render(e + t) 109 | ) 110 | 111 | # plot absolute percentage error over time 112 | st.write('### Absolute Percentage Error Over Time') 113 | actual = metrics.hvplot(x='energy_timestamp_mtn', y='energy_error_abs_pct', label='Energy Demand (MWh) Absolute Percentage Error') 114 | e = actual 115 | 116 | actual = metrics.hvplot(x='energy_timestamp_mtn', y='temp_f_error_abs_pct', label='Temp (F) Absolute Percentage Error') 117 | t = actual 118 | st.bokeh_chart( 119 | hv.render(e + t) 120 | ) 121 | 122 | # plot error distribution 123 | st.write('### Error Distribution') 124 | e = metrics.hvplot.hist('energy_error', label='Energy Demand (MWh) Error Distribution') 125 | t = metrics.hvplot.hist('temp_f_error', label='Temp (F) Error Distribution') 126 | st.bokeh_chart( 127 | hv.render(e + t) 128 | ) 129 | -------------------------------------------------------------------------------- /07_monitoring/readme.md: -------------------------------------------------------------------------------- 1 | # Model Performance Monitoring 2 | 3 | ## Metrics calculation with dbt in the Warehouse 4 | I took a batch monitoring approach for this project. As described in the previous section, the model is batch deployed on a daily basis using Airflow to predict the next day's hourly energy demand. Those predictions are loaded into the data warehouse. 5 | 6 | The data pipeline is also updating actual recorded energy demand and temperature on an hourly basis. Since all of that data is in the warehouse too, I created a [sql model in dbt](../03_dbt/models/core/ml_model_metrics.sql) to compute metrics on each prediction as soon as the actual data is loaded and transformed. 7 | 8 | ## Model Performance Dashboard 9 | To visualize the model performance, I created a dashboard using Streamlit that reads the calculated metric data from the warehouse. 
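For reference, the error columns the dashboard reads reduce to simple row-level differences between actuals and predictions. The pandas sketch below only illustrates that logic, using the column names from the dashboard code above; the real computation lives in the dbt model `ml_model_metrics.sql`, and the pre-joined input frame here is an assumption.

```python
# Illustrative pandas equivalent of the row-level error columns read by the dashboard.
# The real metrics are computed in 03_dbt/models/core/ml_model_metrics.sql; this sketch
# assumes a dataframe in which actuals and predictions are already joined per hour.
import pandas as pd


def add_error_columns(joined: pd.DataFrame) -> pd.DataFrame:
    return joined.assign(
        # errors follow the dashboard's labelled convention: actual - predicted
        energy_error=lambda d: d['energy_demand'] - d['predicted_energy_demand'],
        energy_error_abs=lambda d: (d['energy_demand'] - d['predicted_energy_demand']).abs(),
        energy_error_abs_pct=lambda d: (d['energy_demand'] - d['predicted_energy_demand']).abs()
                                       / d['energy_demand'],
        temp_f_error=lambda d: d['temp_F'] - d['temp_f_forecast'],
        temp_f_error_abs=lambda d: (d['temp_F'] - d['temp_f_forecast']).abs(),
        temp_f_error_abs_pct=lambda d: (d['temp_F'] - d['temp_f_forecast']).abs() / d['temp_F'],
    )
```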
10 | 11 | [LINK TO DASHBOARD](https://mharty3-energy-data-capstone-07-monitoringapp-o8dnn1.streamlitapp.com/) 12 | 13 | ### Dashboard Screenshots 14 | ![](../img/monitoring_dashboard_1.PNG) 15 | ![](../img/monitoring_dashboard_2.PNG) 16 | 17 | -------------------------------------------------------------------------------- /07_monitoring/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery 2 | streamlit 3 | pandas 4 | pandas_gbq 5 | hvplot 6 | probscale 7 | click==7.1.2 -------------------------------------------------------------------------------- /img/Architecture.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/Architecture.PNG -------------------------------------------------------------------------------- /img/batch_predict_dag.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/batch_predict_dag.PNG -------------------------------------------------------------------------------- /img/dashboard.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/dashboard.PNG -------------------------------------------------------------------------------- /img/dashboard1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/dashboard1.PNG -------------------------------------------------------------------------------- /img/dashboard_mockup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/dashboard_mockup.png -------------------------------------------------------------------------------- /img/dbt_demand.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/dbt_demand.PNG -------------------------------------------------------------------------------- /img/dbt_demand_forecast.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/dbt_demand_forecast.PNG -------------------------------------------------------------------------------- /img/dbt_monitoring.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/dbt_monitoring.PNG -------------------------------------------------------------------------------- /img/dbt_temp.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/dbt_temp.PNG -------------------------------------------------------------------------------- /img/de_architecture.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/de_architecture.PNG -------------------------------------------------------------------------------- /img/eia_dag.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/eia_dag.PNG -------------------------------------------------------------------------------- /img/mlflow1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/mlflow1.PNG -------------------------------------------------------------------------------- /img/mlflow2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/mlflow2.PNG -------------------------------------------------------------------------------- /img/mlops_architecture.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/mlops_architecture.PNG -------------------------------------------------------------------------------- /img/monitoring_dashboard_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/monitoring_dashboard_1.PNG -------------------------------------------------------------------------------- /img/monitoring_dashboard_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/monitoring_dashboard_2.PNG -------------------------------------------------------------------------------- /img/noaa_dag.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/noaa_dag.PNG -------------------------------------------------------------------------------- /img/owm_dag.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/owm_dag.PNG -------------------------------------------------------------------------------- /img/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/pipeline.png -------------------------------------------------------------------------------- /img/weather_forecast_dag.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/img/weather_forecast_dag.PNG -------------------------------------------------------------------------------- /proposal.md: -------------------------------------------------------------------------------- 1 | ## Course Project Proposal 2 | 3 | The goal of this project is to apply everything I learned 4 | in this course to build an end-to-end 
data pipeline. 5 | 6 | Deadlines: 7 | 8 | * Submitting the project: 28 March, 22:00 CET 9 | * Peer reviewing: 4 April, 22:00 CET 10 | 11 | ## Objective 12 | 13 | This project will create the necessary data engineering infrastructure (an "end-to-end data pipeline") to evaluate trends in electricity demand in the Front Range of Colorado, USA. 14 | 15 | Loading historical demand data along with historical temperature records will allow data scientists to develop reliable demand forecasts. I will also load the existing demand forecast from the EIA for baseline comparisons. 16 | 17 | Accurately forecasting energy demand is critical for electrical grid operators. Without significant power storage options, electricity must be used at the time it is generated. Balancing authorities must have an accurate power demand forecast in order to successfully plan generation and interchange over the upcoming day(s). As the shift to renewables accelerates, this will only become more important because renewables are often more intermittent than traditional generation sources of power. 18 | 19 | ## Technologies 20 | 21 | * Cloud: GCP (some data sources will be extracted from AWS) 22 | * Infrastructure as code (IaC): Terraform and Docker 23 | * Workflow orchestration: Airflow 24 | * Data Lake: GCS 25 | * Data Warehouse: BigQuery 26 | * Batch processing: probably dbt 27 | 28 | ## Data Sources 29 | 30 | Multiple data sources of varying types will feed the pipeline. 31 | 32 | * **Historical Energy Demand**: US EIA via REST API [link](https://www.eia.gov/opendata/) 33 | * **Historical Weather Records**: US NOAA via csv files hosted in a public AWS S3 Bucket [link](https://registry.opendata.aws/noaa-isd/) 34 | * **Temperature Forecast Data**: US NOAA via GRIB files hosted in a public AWS S3 Bucket [link](https://registry.opendata.aws/noaa-ndfd/) 35 | 36 | ## Data Pipeline 37 | 38 | The cadence of the pipeline is still TBD. It will be scheduled to run at least daily, perhaps even hourly. 39 | 40 | 1. Extract data from various sources and save raw data to the data lake 41 | 2. Transform raw data to cloud optimized storage formats (parquet and zarr) 42 | 3. Load timeseries data to the data warehouse (will need more research for optimized timeseries storage and processing) 43 | 3. Transform (with dbt) as needed to join historical datasets with forecasted datasets and set up efficient time groupbys 44 | 4. Visualize in Google Cloud Studio 45 | 46 | ![](img/pipeline.png) 47 | 48 | ## Dashboard Concept 49 | ![](img/dashboard_mockup.png) 50 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Capstone Project for Data Engineering Zoomcamp and MLOps Zoomcamp 2 | 3 | This is my capstone project for both the Data Engineering Zoomcamp and the MLOps Zoomcamp from Data Talks Club. Course work and notes for the DE Zoomcamp are in [this other repo](https://github.com/mharty3/data_engineering_zoomcamp_2022). 4 | 5 | ## Objective 6 | With few exceptions, electricity must be generated at the same time it is needed. This creates a challenge for electrical grid operators who have to make a plan to generate the electricity their customers need each hour. Accurately forecasting energy demand is critical for grid operators so they can appropriately plan to meet the generation needs of their customers. 
7 | 8 | This project utilizes Data Engineering and Machine Learning Operations (MLOps) concepts to build a system for forecasting hourly electricity demand in my region of Colorado. Implementing the project demonstrates the data skills I learned in the Data Talks Club's Zoomcamps. 9 | 10 | ### Data Engineering 11 | I applied data engineering and EtLT concepts from the modern data stack including workflow orchestration with Airflow, cloud data warehousing and data lake storage on Google Cloud, and data transformation with dbt. I used these concepts and tools to build a data pipeline that populates a Big Query data warehouse with the data a data scientist needs to develop an hourly day-ahead demand forecast. 12 | 13 | ![](img/de_architecture.PNG) 14 | 15 | ### MLOps 16 | Once the data pipeline was running, I used concepts and tools from MLOps to build a system for developing, deploying, and monitoring machine learning models that predict hourly energy demand. I used experiment tracking and model registration with MLFlow, batch model deployment with Airflow, and model monitoring with dbt and Streamlit. 17 | 18 | ![](img/mlops_architecture.PNG) 19 | 20 | 21 | 22 | ## High level requirements 23 | 24 | * The system should allow users to access historical electricity demand, EIA demand forecasts, historical weather data, and up-to-date weather forecast data in a Big Query data warehouse 25 | * The scope will be limited to the Public Service Company of Colorado Balancing Authority (aka Xcel Energy) 26 | * There should be an interactive dashboard to interact with and visualize the data that is being loaded into the data warehouse 27 | * An MLFlow tracking server, artifact storage, and model registry should be used to track the models that are being developed and deployed 28 | * Production models should be deployed from the model registry to Airflow for batch model deployment 29 | * Model performance metrics should be tracked and visualized in a dashboard 30 | 31 | 32 | ## Data Sources 33 | Notebooks exploring each of these data sources can be found [here](00_data_source_exploration) 34 | 35 | * Electricity Demand and Generation - [EIA Open Data](https://www.eia.gov/opendata/) 36 | * The United States Energy Information Administration (EIA) provides open access to hundreds of thousands of time series datasets via a REST API. The data is in the public domain, and requires [registration and an API key](https://www.eia.gov/opendata/register.php). A minimal request sketch is shown after this list. 37 | 38 | * Historical Weather Data - [NOAA Integrated Surface Data](https://registry.opendata.aws/noaa-isd/) 39 | * The United States National Oceanic and Atmospheric Administration (NOAA) maintains the Integrated Surface Database (ISD) with global hourly weather station observations from nearly 30,000 stations. The data is available in csv format in an open AWS S3 bucket. 40 | 41 | * Live weather data - [Open Weather Map API](https://openweathermap.org/) 42 | * Live weather observation data for anywhere on the globe is available for free (with certain API call limits) via the Open Weather Map REST API. 43 | 44 | * Weather Forecast Data - [NOAA National Digital Forecast Database](https://registry.opendata.aws/noaa-ndfd/) 45 | * NOAA maintains the National Digital Forecast Database (NDFD) which is a suite of gridded forecasts of weather conditions for the United States. The data is available in gridded format in an open AWS S3 bucket or via XML from a REST API.
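As a concrete example of the first source, the sketch below pulls hourly demand for the PSCO balancing authority. It assumes the legacy v1 `series` endpoint (EIA has since moved to APIv2) and an illustrative series id, so treat both as placeholders to adapt.

```python
# Hedged sketch of pulling hourly electricity demand from the EIA API.
# Assumptions: the legacy v1 "series" endpoint and the series id below are illustrative;
# adapt to EIA APIv2 if the v1 endpoint is no longer available.
import os

import pandas as pd
import requests

EIA_KEY = os.environ['EIA_KEY']        # free API key from the registration link above
SERIES_ID = 'EBA.PSCO-ALL.D.H'         # illustrative: hourly demand, PSCO balancing authority

resp = requests.get('https://api.eia.gov/series/',
                    params={'api_key': EIA_KEY, 'series_id': SERIES_ID},
                    timeout=30)
resp.raise_for_status()
series = resp.json()['series'][0]['data']   # list of [period, value] pairs
df = pd.DataFrame(series, columns=['period', 'demand_mwh'])
print(df.head())
```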
46 | 47 | ## Technologies and Tools 48 | - Cloud - [**Google Cloud Platform**](https://cloud.google.com) 49 | - Infrastructure as Code - [**Terraform**](https://www.terraform.io) 50 | - Containerization - [**Docker**](https://www.docker.com) and [**Docker Compose**](https://docs.docker.com/compose/) 51 | - Workflow Orchestration - [**Airflow**](https://airflow.apache.org) 52 | - Pre-Load Transformation - [**pandas**](https://pandas.pydata.org/) and [**pyarrow**](https://arrow.apache.org/docs/python/index.html) 53 | - Data Lake - [**Google Cloud Storage**](https://cloud.google.com/storage) 54 | - Data Warehouse - [**BigQuery**](https://cloud.google.com/bigquery) 55 | - Post-Load Transformation - [**dbt**](https://www.getdbt.com) 56 | - Data Visualization/Dashboard - [**Streamlit**](https://streamlit.io/) and [**Plotly Express**](https://plotly.com/python/plotly-express/) and [**hvplot**](https://hvplot.holoviz.org/) 57 | - Model Development, Experiment Tracking, and Registration - [**scikit-learn**](https://scikit-learn.org/) and [**MLflow**](https://www.mlflow.org/) 58 | - Model Deployment - Batch Deployment with [**Airflow**](https://airflow.apache.org) 59 | - Model Monitoring - [**dbt**](https://www.getdbt.com) and [**Streamlit**](https://streamlit.io/) 60 | 61 | 62 | 63 | ## Data and Forecast Dashboard 64 | [Link](https://share.streamlit.io/mharty3/energy_data_capstone/04_dashboard/app.py) 65 | 66 | Note: I am running low on free GCP credits, so by the time you read this, the apps may no longer work. 67 | 68 | ![](img/dashboard1.PNG) 69 | 70 | ## Monitoring Dashboard 71 | [Link](https://mharty3-energy-data-capstone-07-monitoringapp-o8dnn1.streamlitapp.com/) 72 | ![](img/monitoring_dashboard_1.PNG) 73 | -------------------------------------------------------------------------------- /steps_to_recreate_project/images/01_service_account.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/steps_to_recreate_project/images/01_service_account.PNG -------------------------------------------------------------------------------- /steps_to_recreate_project/images/02_service_account_key.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/steps_to_recreate_project/images/02_service_account_key.PNG -------------------------------------------------------------------------------- /steps_to_recreate_project/images/03_vm.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mharty3/energy_data_capstone/c49d57ddcafd28c12923186dd29de95edde3c7a3/steps_to_recreate_project/images/03_vm.PNG -------------------------------------------------------------------------------- /steps_to_recreate_project/readme.md: -------------------------------------------------------------------------------- 1 | # Steps to recreate project 2 | 3 | ## Provision Cloud Infrastructure 4 | This section will walk you through how to use the existing terraform configuration files to create a Big Query instance, a GCS storage bucket, and a Google Cloud Compute Engine Instance (virtual machine). Then it will walk through configuring the compute instance to be ready to use. 5 | 6 | 1. Create a new project on google cloud platform 7 | 8 | 2. Set up IAM (Identity and Access Management) for a Service Account. 
Grant this account **Storage Admin** + **Storage Object Admin** + **BigQuery Admin** + **Compute Admin** privileges. This service account will be used by terraform to provision the infrastructure for the project. 9 | 10 | ![](./images/01_service_account.PNG) 11 | 12 | 3. Create and download a json key for credentials. Store the file in `~/.config/gcloud`. 13 | Create an environment variable with the path to the credentials json. This will be used by terraform to access GCP. 14 | 15 | ```bash 16 | export GOOGLE_APPLICATION_CREDENTIALS="path/to/credentials.json" 17 | ``` 18 | 19 | ![](./images/02_service_account_key.PNG) 20 | 21 | 4. On GCP, enable the Compute Engine API 22 | 23 | 4. Create an ssh key that will be used to connect to the VM. Update the value of the `metadata: ssh_public_key` variable in the `variables.tf` file with the path to the public key. 24 | 25 | ```bash 26 | ssh-keygen -t rsa -f ~/.ssh/ -C -b 2048 27 | ``` 28 | 29 | 5. Use terraform to provision the infrastructure. Terraform will prompt you to enter the project id from GCP. If you have used terraform in this directory in a different project, you may need to create a new terraform workspace. 30 | 31 | ```bash 32 | terraform workspace new 33 | ``` 34 | 35 | ```bash 36 | cd 01_terraform 37 | terraform init 38 | terraform plan 39 | terraform apply 40 | ``` 41 | 42 | Now we can see that the infrastructure has been created on GCP. For example, the VM instance: 43 | 44 | ![](./images/03_vm.PNG) 45 | 46 | 47 | 5. Configure local ssh to connect to the vm. 48 | 49 | On Linux (or WSL in my case) create or modify a file called `~/.ssh/config` to contain the following block of text: 50 | 51 | ``` 52 | Host energy-vm 53 | HostName 54 | User 55 | IdentityFile 56 | ``` 57 | 58 | For Windows, you will want to add the same text to the Windows ssh config located in 59 | 60 | `/mnt/c/Users//.ssh/config` 61 | 62 | Double check the path to the private key; you may need to copy it to the Windows side from WSL. 63 | 64 | 6. Now connect to the vm from the terminal with `ssh energy-vm` 65 | 66 | 7. In order to authenticate with GitHub, you will need to create a new ssh key and add it to GitHub. Generate the key with the command: 67 | 68 | ```bash 69 | ssh-keygen -t rsa -f ~/.ssh/id_rsa -b 4096 70 | ``` 71 | 72 | On GitHub, go to settings, ssh keys, and click New SSH Key. Paste in the contents of the file `~/.ssh/id_rsa.pub` into the textbox. 73 | 74 | 7. Git clone this repo using ssh 75 | 76 | ```bash 77 | git clone git@github.com:mharty3/energy_data_capstone.git 78 | ``` 79 | 80 | 8. Set up git identity: 81 | 82 | ```bash 83 | git config --global user.email 84 | git config --global user.name 85 | ``` 86 | 87 | 8. Run the bash setup script and the below commands to install the `oh-my-fish` shell and install the theme that I like. 88 | 89 | ```bash 90 | bash energy_data_capstone/01_terraform/vm_init.sh 91 | curl https://raw.githubusercontent.com/oh-my-fish/oh-my-fish/master/bin/install | fish 92 | omf install agnoster 93 | conda init fish 94 | ``` 95 | 96 | 9. Disconnect and re-connect from the VM via SSH and it will be all set up! Remember that if you shut down and restart the VM, the IP will probably change and you will need to update your local ssh config files accordingly.
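As an optional sanity check (not part of the original write-up), the resources terraform just created can be confirmed from Python using the same service account key; the project, bucket, and dataset names below are placeholders for the values in `variables.tf`.

```python
# Optional sanity check that the Terraform-provisioned resources exist.
# Assumes GOOGLE_APPLICATION_CREDENTIALS points at the service account key created above;
# the names below are placeholders for the values defined in variables.tf.
from google.cloud import bigquery, storage

PROJECT_ID = "your-gcp-project-id"     # placeholder
BUCKET_NAME = "your-data-lake-bucket"  # placeholder

storage_client = storage.Client(project=PROJECT_ID)
print("bucket exists:", storage_client.bucket(BUCKET_NAME).exists())

bq_client = bigquery.Client(project=PROJECT_ID)
print("datasets:", [d.dataset_id for d in bq_client.list_datasets()])
```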

### Useful links for GCP and Terraform:

* https://cloud.google.com/community/tutorials/getting-started-on-gcp-with-terraform

* https://stackoverflow.com/questions/62638916/how-to-provide-image-name-in-gcp-terraform-script

## Set up airflow

1. Create a file in `02_airflow` called `.env`. Add the following variables:

```
AIRFLOW_UID=50000
EIA_KEY=<your EIA API key>
OWM_KEY=<your OpenWeatherMap API key>
```

2. Update the values of the variables `GCP_PROJECT_ID` and `GCP_GCS_BUCKET` in `docker-compose.yml` with the values for the new project.

3. Copy the google credentials json file created above (it was saved in `~/.config/gcloud` on the local machine) to the VM using scp. Note that this command is run in the terminal on the local machine, and `<remote host>` should be whatever the remote host is named in the `~/.ssh/config` file.

```bash
scp <path to credentials json> <remote host>:~/.google/credentials/google_credentials.json
```

4. cd into `02_airflow` and run:

```
docker-compose up airflow-init
docker-compose up
```

5. Forward port `8080` from the VM to the local machine to access the web interface at `http://localhost:8080/home`.
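
One way to do this, assuming the `energy-vm` host alias from the ssh config created earlier (VS Code's built-in port forwarding also works):

```bash
# Forward the Airflow webserver port on the VM to localhost:8080
ssh -L 8080:localhost:8080 energy-vm
```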

6. Before activating the dags, it is convenient to set the `ingest_historical_weather_data_dag` to a yearly interval for a quicker backfill. Once it has caught up to the previous year, adjust the `end-date` to several days into the future and change the interval back to daily. There is typically a lag of several days before observations are added to the historical data source, so this will allow the dag to pull the updated observations as they are added to the file.

7. Activate the dags, and the runs will begin populating the data lake and data warehouse.

## dbt

1. Sign into dbt Cloud.

2. Update the BigQuery connection credentials: click the top-left hamburger menu and go to Account Settings. Select the energy_data project and edit the connection information by uploading the credentials JSON file from Google Cloud.

3. Update the schema with the new database name: from the top-left hamburger menu, click Develop, then navigate to `03_dbt/models/core/schema.yml`. Update the database under `sources` to the new database name (e.g. de-zoomcamp-347002).

4. In the hamburger menu, go to Jobs. Click the Build Fact Tables job and click Run Now. This will build the dbt models in the data warehouse.

## Streamlit Dashboard

1. Update the variable `PROJECT_ID` in `04_dashboard/app.py` to the project id of the GCP project.

2. Follow the instructions [here](https://docs.streamlit.io/knowledge-base/tutorials/databases/bigquery) to create a `.streamlit/secrets.toml` file from the json credentials file. This will allow Streamlit to connect to the data warehouse.

3. Create a new app on Streamlit sharing, connect it to the repo location, and paste in the contents of the `secrets.toml` file.

4. If necessary, update the `note` in `04_dashboard/info.py` and the expander header in `app.py` to reflect the dates of the missing data that will be backfilled.


## MLflow
The instructions are modified from this [link](https://kargarisaac.github.io/blog/mlops/jupyter/2022/06/15/MLFlow-on-GCP.html).

1. Set up a GCP Postgres database on the default private network.
2. Create the file `05_model_training/mlflow_docker/.env` containing the following variables, which should match the database created in the previous step and the mlflow_runs bucket created by terraform:
   - DB_USER
   - DB_PASSWORD
   - DB_PRIVATE_IP
   - DB_NAME
   - BUCKET_NAME
3. On the remote machine, cd into `05_model_training/mlflow_docker` and run the following command:
```bash
docker-compose up -d
```
This will start the MLflow tracking server, which will store runs and experiments in the Postgres database and artifacts in the GCS bucket.
--------------------------------------------------------------------------------