├── MLOps Pipeline ├── Utils │ ├── Images │ │ ├── newplot.png │ │ ├── newplot (1).png │ │ ├── newplot (5).png │ │ └── MLOps Architecture (1).png │ └── requirements.txt ├── Workflow Config │ ├── Great Expecations Config.py │ ├── Daily Inference.py │ └── Initial Deployment.py ├── Data Engineering │ ├── 02. Transformation │ │ ├── 01. Training Data Transformation.py │ │ └── 02. Monitoring Data Transformation.py │ ├── 03. Data Quality │ │ ├── 02. Great Expectations.py │ │ ├── 01. Data Quality Checks.py │ │ └── 02. Monitoring Data Quality Checks.py │ └── 01. Ingestion │ │ ├── 02. Monitoring Data Ingestion.py │ │ └── 01. Training Data Ingestion.py └── ML Engineering │ └── Demand Forecasting Daily │ ├── 02.Daily Inference(XGBOOST).py │ ├── 03.Daily Monitoring.py │ ├── 00. Initial Deployment │ ├── 01.Exploratory Data Analysis.py │ ├── 02.Feature Engineering.py │ ├── 04.Unit Test.py │ ├── 03.Model Training.py │ ├── 03.Model Training(Pyspark Edition).py │ └── Model Training(LSTM).py │ ├── 04.Model Retraining Monthly(pyspark edition).py │ ├── 01.Feature Engineering.py │ └── 06.Performance Evaluation.py ├── .gitignore ├── LICENSE ├── README.md └── DOCUMENTATION.md /MLOps Pipeline/Utils/Images/newplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Philippos01/mlops-energy-forecast-thesis/HEAD/MLOps Pipeline/Utils/Images/newplot.png -------------------------------------------------------------------------------- /MLOps Pipeline/Utils/Images/newplot (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Philippos01/mlops-energy-forecast-thesis/HEAD/MLOps Pipeline/Utils/Images/newplot (1).png -------------------------------------------------------------------------------- /MLOps Pipeline/Utils/Images/newplot (5).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Philippos01/mlops-energy-forecast-thesis/HEAD/MLOps Pipeline/Utils/Images/newplot (5).png -------------------------------------------------------------------------------- /MLOps Pipeline/Utils/Images/MLOps Architecture (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Philippos01/mlops-energy-forecast-thesis/HEAD/MLOps Pipeline/Utils/Images/MLOps Architecture (1).png -------------------------------------------------------------------------------- /MLOps Pipeline/Utils/requirements.txt: -------------------------------------------------------------------------------- 1 | mlflow 2 | databricks 3 | databricks-feature-store 4 | xgboost 5 | tensorflow 6 | protobuf 7 | keras 8 | pyspark 9 | matplotlib 10 | pandas 11 | scipy 12 | requests 13 | -------------------------------------------------------------------------------- /MLOps Pipeline/Workflow Config/Great Expecations Config.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %pip install great_expectations 3 | # MAGIC ! 
great_expectations --yes init 4 | # MAGIC %pip install pyyaml 5 | 6 | # COMMAND ---------- 7 | 8 | import datetime 9 | import pandas as pd 10 | import yaml 11 | from pyspark.sql.types import TimestampType,DoubleType 12 | from great_expectations.core.batch import RuntimeBatchRequest 13 | from great_expectations.core.yaml_handler import YAMLHandler 14 | from great_expectations.util import get_context 15 | from great_expectations.data_context.types.base import ( 16 | DataContextConfig, 17 | FilesystemStoreBackendDefaults, 18 | ) 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # .gitignore 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # Databricks notebook files 9 | *.dbc 10 | 11 | # Jupyter Notebook 12 | .ipynb_checkpoints 13 | 14 | # Python environments 15 | .env 16 | .venv 17 | env/ 18 | venv/ 19 | ENV/ 20 | 21 | # Pip related 22 | pip-wheel-metadata/ 23 | *.egg-info/ 24 | *.egg 25 | 26 | # Setuptools distribution folder 27 | /dist/ 28 | 29 | # Installer logs 30 | pip-log.txt 31 | pip-delete-this-directory.txt 32 | 33 | # Unit test / coverage reports 34 | htmlcov/ 35 | .tox/ 36 | .nox/ 37 | .coverage 38 | .coverage.* 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | *.cover 43 | .hypothesis/ 44 | .pytest_cache/ 45 | 46 | # Log files 47 | *.log 48 | 49 | # Data files 50 | *.csv 51 | *.xlsx 52 | *.db 53 | 54 | # Directories with large data files 55 | data/ 56 | model/ 57 | 58 | # Configuration files 59 | *.cfg 60 | *.ini 61 | 62 | # Keys and secrets 63 | *.pem 64 | *.key 65 | *.secret 66 | 67 | # OS generated files 68 | .DS_Store 69 | .DS_Store? 70 | ._* 71 | .Spotlight-V100 72 | .Trashes 73 | ehthumbs.db 74 | Thumbs.db 75 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 FILIPPOS PRIOVOLOS 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MLOps Pipeline/Workflow Config/Daily Inference.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Installations 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %pip install mlflow 8 | # MAGIC %pip install xgboost 9 | # MAGIC %pip install databricks && pip install databricks-feature-store 10 | # MAGIC #%pip install mlflow==2.4 numpy==1.22.4 protobuf==4.23.2 tensorflow==2.12.0 11 | # MAGIC 12 | 13 | # COMMAND ---------- 14 | 15 | # MAGIC %md 16 | # MAGIC ## Imports 17 | 18 | # COMMAND ---------- 19 | 20 | import pandas as pd 21 | import mlflow 22 | from databricks import feature_store 23 | from pyspark.sql.functions import col, sum, date_sub, to_date, hour,lit,add_months,date_format,expr,abs 24 | from pyspark.ml.feature import OneHotEncoder, StringIndexer 25 | from pyspark.ml import Pipeline 26 | from pyspark.ml.functions import vector_to_array 27 | import matplotlib.pyplot as plt 28 | from pyspark.sql import Row 29 | from pyspark.sql.types import DoubleType 30 | from pyspark.mllib.evaluation import RegressionMetrics 31 | 32 | # COMMAND ---------- 33 | 34 | # MAGIC %md 35 | # MAGIC ## Configuration 36 | 37 | # COMMAND ---------- 38 | 39 | countries=['belgium','denmark','france','germany','greece','italy','luxembourg','netherlands','spain','sweden','switzerland'] 40 | model_name = 'pyspark_mlflow_model' 41 | db = 'df_dev' 42 | fs = feature_store.FeatureStoreClient() 43 | 44 | # COMMAND ---------- 45 | 46 | spark.sql("USE df_dev") 47 | 48 | # COMMAND ---------- 49 | 50 | from datetime import datetime, timedelta 51 | substract_days = 162 52 | date = (datetime.today() - timedelta(days=substract_days)).strftime('%Y-%m-%d') 53 | yesterdate = (datetime.today() - timedelta(days=1) - timedelta(days=substract_days)).strftime('%Y-%m-%d') 54 | 55 | # COMMAND ---------- 56 | 57 | 58 | # Check if the row exists 59 | row_exists = spark.sql(f""" 60 | SELECT 1 61 | FROM inference_daily 62 | WHERE execution_date = '{date}' AND execution_yesterdate = '{yesterdate}' 63 | """).collect() 64 | 65 | # If row does not exist, insert it 66 | if not row_exists: 67 | spark.sql(f""" 68 | INSERT INTO inference_daily (execution_date, execution_yesterdate) 69 | VALUES ('{date}', '{yesterdate}') 70 | """) 71 | 72 | # COMMAND ---------- 73 | 74 | # Read the table 75 | df = spark.table("inference_daily") 76 | 77 | # Show the contents of the table 78 | df.show() 79 | -------------------------------------------------------------------------------- /MLOps Pipeline/Data Engineering/02. Transformation/01. Training Data Transformation.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | #%run "/Repos/CI ADO Repo/01.Develop/Workflow Config/Daily Inference" 3 | from pyspark.sql import functions as F 4 | from pyspark.sql.functions import concat, col, lit, lpad 5 | 6 | # COMMAND ---------- 7 | 8 | database= 'df_dev' 9 | 10 | # COMMAND ---------- 11 | 12 | spark.sql(f'USE {database}') 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC * Loads the data from the consumption_countries_hourly table in the df_dev database. 18 | 19 | # COMMAND ---------- 20 | 21 | df = spark.read.table('df_dev.consumption_countries_hourly') 22 | 23 | # COMMAND ---------- 24 | 25 | display(df) 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %md 30 | # MAGIC 1. 
Extracts the date and hour from the DATETIME column. 31 | # MAGIC 1. Groups the data by country, date, and hour, and sums up the hourly consumption. 32 | # MAGIC 1. Renames the summed column to HOURLY_CONSUMPTION_MW. 33 | # MAGIC 1. Constructs a new DATETIME column by concatenating the date and hour. 34 | # MAGIC 1. Converts the DATETIME column to timestamp format. 35 | # MAGIC 1. Selects and reorders the columns to match the desired schema. 36 | 37 | # COMMAND ---------- 38 | 39 | # Extract the date and hour from the start_time 40 | df = df.withColumn('date', F.to_date(df['DATETIME'])) 41 | df = df.withColumn('hour', F.hour(df['DATETIME'])) 42 | 43 | # Group by country, date and hour, and sum up the hourly consumption 44 | df_hourly = df.groupBy('COUNTRY', 'date', 'hour').sum('HOURLY_CONSUMPTION_MW') 45 | 46 | # Rename the sum column 47 | df_hourly = df_hourly.withColumnRenamed('sum(HOURLY_CONSUMPTION_MW)', 'HOURLY_CONSUMPTION_MW') 48 | 49 | # Make sure the hour is a two-digit string 50 | df_hourly = df_hourly.withColumn('hour', lpad(col('hour'), 2, '0')) 51 | 52 | # Construct a new 'DATETIME' column 53 | df_hourly = df_hourly.withColumn('DATETIME', 54 | concat(col('date'), lit(' '), col('hour'), lit(':00:00'))) 55 | 56 | # Convert 'DATETIME' to timestamp type 57 | df_hourly = df_hourly.withColumn('DATETIME', 58 | F.to_timestamp(df_hourly['DATETIME'], 'yyyy-MM-dd HH:mm:ss')) 59 | 60 | # Select and reorder the columns 61 | df_hourly = df_hourly.select('DATETIME', 'HOURLY_CONSUMPTION_MW', 'COUNTRY') 62 | 63 | # COMMAND ---------- 64 | 65 | df_hourly.count() 66 | 67 | # COMMAND ---------- 68 | 69 | display(df_hourly) 70 | 71 | # COMMAND ---------- 72 | 73 | # MAGIC %md 74 | # MAGIC * Writes the transformed DataFrame into a new table named final_consumption_countries_hourly in the df_dev database. The mode overwrite is used to replace the existing data in the table (if any). 75 | 76 | # COMMAND ---------- 77 | 78 | # Write the DataFrame into a new table 79 | df_hourly.write.format('delta').mode('overwrite').saveAsTable('df_dev.final_consumption_countries_hourly') 80 | 81 | # COMMAND ---------- 82 | 83 | 84 | -------------------------------------------------------------------------------- /MLOps Pipeline/Data Engineering/02. Transformation/02. Monitoring Data Transformation.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | #%run "/Repos/CI ADO Repo/01.Develop/Workflow Config/Daily Inference" 3 | from pyspark.sql import functions as F 4 | from pyspark.sql.functions import concat, col, lit, lpad 5 | 6 | # COMMAND ---------- 7 | 8 | database= 'db_monitor' 9 | 10 | # COMMAND ---------- 11 | 12 | spark.sql(f'USE {database}') 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC * Loads the data from the consumption_countries_hourly table in the df_dev database. 18 | 19 | # COMMAND ---------- 20 | 21 | df = spark.read.table('monitoring_consumption_countries_hourly') 22 | 23 | # COMMAND ---------- 24 | 25 | display(df) 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %md 30 | # MAGIC 1. Extracts the date and hour from the DATETIME column. 31 | # MAGIC 1. Groups the data by country, date, and hour, and sums up the hourly consumption. 32 | # MAGIC 1. Renames the summed column to HOURLY_CONSUMPTION_MW. 33 | # MAGIC 1. Constructs a new DATETIME column by concatenating the date and hour. 34 | # MAGIC 1. Converts the DATETIME column to timestamp format. 35 | # MAGIC 1. 
Selects and reorders the columns to match the desired schema. 36 | 37 | # COMMAND ---------- 38 | 39 | # Extract the date and hour from the start_time 40 | df = df.withColumn('date', F.to_date(df['DATETIME'])) 41 | df = df.withColumn('hour', F.hour(df['DATETIME'])) 42 | 43 | # Group by country, date and hour, and sum up the hourly consumption 44 | df_hourly = df.groupBy('COUNTRY', 'date', 'hour').sum('HOURLY_CONSUMPTION_MW') 45 | 46 | # Rename the sum column 47 | df_hourly = df_hourly.withColumnRenamed('sum(HOURLY_CONSUMPTION_MW)', 'HOURLY_CONSUMPTION_MW') 48 | 49 | # Make sure the hour is a two-digit string 50 | df_hourly = df_hourly.withColumn('hour', lpad(col('hour'), 2, '0')) 51 | 52 | # Construct a new 'DATETIME' column 53 | df_hourly = df_hourly.withColumn('DATETIME', 54 | concat(col('date'), lit(' '), col('hour'), lit(':00:00'))) 55 | 56 | # Convert 'DATETIME' to timestamp type 57 | df_hourly = df_hourly.withColumn('DATETIME', 58 | F.to_timestamp(df_hourly['DATETIME'], 'yyyy-MM-dd HH:mm:ss')) 59 | 60 | # Select and reorder the columns 61 | df_hourly = df_hourly.select('DATETIME', 'HOURLY_CONSUMPTION_MW', 'COUNTRY') 62 | 63 | # COMMAND ---------- 64 | 65 | df_hourly.count() 66 | 67 | # COMMAND ---------- 68 | 69 | display(df_hourly) 70 | 71 | # COMMAND ---------- 72 | 73 | # MAGIC %md 74 | # MAGIC * Writes the transformed DataFrame into a new table named final_consumption_countries_hourly in the df_dev database. The mode overwrite is used to replace the existing data in the table (if any). 75 | 76 | # COMMAND ---------- 77 | 78 | # Write the DataFrame into a new table 79 | df_hourly.write.format('delta').mode('overwrite').saveAsTable('final_monitoring_consumption_countries_hourly') 80 | 81 | # COMMAND ---------- 82 | 83 | 84 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/02.Daily Inference(XGBOOST).py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Configuration 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Daily Inference" 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ## Configuration 13 | 14 | # COMMAND ---------- 15 | 16 | input_table = 'hourly_forecasting_features' 17 | output_table = 'predictions_xgb' 18 | 19 | # COMMAND ---------- 20 | 21 | # MAGIC %md 22 | # MAGIC ## Load Inference Input 23 | 24 | # COMMAND ---------- 25 | 26 | inference_df = spark.sql(f"SELECT CONSUMPTION_ID, DATETIME FROM {db}.{input_table} WHERE DATETIME BETWEEN '{date} 00:00:00' AND '{date} 23:00:00'") 27 | #inference_data = inference_df.drop("CONSUMPTION_ID","DATETIME") 28 | display(inference_df) 29 | 30 | # COMMAND ---------- 31 | 32 | # MAGIC %md 33 | # MAGIC ## Model's Prediction 34 | 35 | # COMMAND ---------- 36 | 37 | client=mlflow.tracking.MlflowClient() 38 | latest_version= client.get_latest_versions(model_name,stages=['Production'])[0].version 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md 43 | # MAGIC * The following code performs batch scoring on the inference_df(which is the future date we want to predict), using the latest model deployed in Production 44 | 45 | # COMMAND ---------- 46 | 47 | results = fs.score_batch( 48 | f"models:/{model_name}/{latest_version}", 49 | inference_df, 50 | result_type="float", 51 | ) 52 | display(results) 53 | 54 | # COMMAND ---------- 55 | 56 | 
greece_predictions = results.filter(results["greece"] == 1).select("prediction","HOUR") 57 | greece_predictions.display() 58 | 59 | # COMMAND ---------- 60 | 61 | # MAGIC %md 62 | # MAGIC ## Store Results 63 | 64 | # COMMAND ---------- 65 | 66 | # MAGIC %md 67 | # MAGIC * It selects relevant columns from the initial results and converts them into a Pandas DataFrame for easy manipulation. 68 | # MAGIC * It renames and creates new columns, including predicted consumption, country extracted from the consumption ID, and placeholders for actual consumption and residuals. 69 | # MAGIC * The Pandas DataFrame is converted back to a Spark DataFrame with a specific selection of columns. 70 | # MAGIC * Data types for certain columns are cast to float. 71 | # MAGIC * Finally, the data is registered as a temporary SQL view named 'Inference_Output', allowing for SQL-based analysis and querying. 72 | 73 | # COMMAND ---------- 74 | 75 | df = results.select(['CONSUMPTION_ID', 'DATETIME', 'prediction']).toPandas() 76 | df.rename(columns={'prediction': 'PREDICTED_CONSUMPTION'}, inplace=True) 77 | df['DATETIME'] = df.DATETIME.astype(str) 78 | df['COUNTRY'] = df['CONSUMPTION_ID'].apply(lambda x: x.split('_')[0]) 79 | df['ACTUAL_CONSUMPTION'] = None 80 | df['RESIDUAL'] = None 81 | df['MODEL_USED'] = f"models:/{model_name}/{latest_version}" 82 | output_cols = ['DATETIME', 'COUNTRY', 'PREDICTED_CONSUMPTION', 'ACTUAL_CONSUMPTION', 'RESIDUAL', 'MODEL_USED'] 83 | output_df = spark.createDataFrame(df[output_cols]) 84 | output_df.withColumn('ACTUAL_CONSUMPTION', col('ACTUAL_CONSUMPTION').cast('float'))\ 85 | .withColumn('RESIDUAL', col('RESIDUAL').cast('float'))\ 86 | .createOrReplaceTempView('Inference_Output') 87 | 88 | # COMMAND ---------- 89 | 90 | # MAGIC %md 91 | # MAGIC * It prepares the list of columns to be inserted or updated. 92 | # MAGIC * It uses Spark SQL to merge data from a temporary view Inference_Output into a target table. 93 | # MAGIC * If a record with matching DATETIME and COUNTRY is found, it updates the existing record in the target table with the new data. 94 | # MAGIC * If no matching record is found, it inserts the new data as a new record in the target table. 
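The MERGE statement that follows can also be expressed through the Delta Lake Python API rather than a SQL string. A minimal sketch (not the notebook's code), assuming `df_dev.predictions_xgb` is a Delta table and the `delta` package is importable on the cluster:

```python
# Minimal sketch (not the notebook's code): the same upsert via the Delta Lake Python API.
# Assumes the target table is a Delta table and `db`, `output_table`, `output_cols`
# are defined as in the cells above.
from delta.tables import DeltaTable

source = spark.table("Inference_Output")                 # temp view created above
target = DeltaTable.forName(spark, f"{db}.{output_table}")

(
    target.alias("A")
    .merge(source.alias("B"), "A.DATETIME = B.DATETIME AND A.COUNTRY = B.COUNTRY")
    .whenMatchedUpdate(set={c: f"B.{c}" for c in output_cols})
    .whenNotMatchedInsert(values={c: f"B.{c}" for c in output_cols})
    .execute()
)
```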
95 | 96 | # COMMAND ---------- 97 | 98 | insert_columns = [f"B.{col}" for col in output_cols] 99 | update_columns = [f"{col}=B.{col}" for col in output_cols] 100 | spark.sql(f""" 101 | MERGE INTO {db}.{output_table} A 102 | USING Inference_Output B 103 | ON A.DATETIME = B.DATETIME AND A.COUNTRY = B.COUNTRY 104 | WHEN MATCHED THEN 105 | UPDATE SET 106 | {', '.join(update_columns)} 107 | WHEN NOT MATCHED 108 | THEN INSERT ( 109 | {', '.join(output_cols)} 110 | ) VALUES ( 111 | {', '.join(insert_columns)} 112 | ) 113 | """) 114 | 115 | # COMMAND ---------- 116 | 117 | 118 | -------------------------------------------------------------------------------- /MLOps Pipeline/Workflow Config/Initial Deployment.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Installations 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %pip install mlflow 8 | # MAGIC %pip install databricks && pip install databricks-feature-store 9 | # MAGIC %pip install xgboost 10 | # MAGIC %pip install tensorflow 11 | # MAGIC %pip install protobuf 12 | # MAGIC %pip install mlflow keras 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC ## Imports 18 | 19 | # COMMAND ---------- 20 | 21 | from pyspark.sql import SparkSession 22 | from pyspark.sql.functions import col,concat, when, lit, to_date, date_sub, max as max_, rand, lpad, concat_ws,sum,mean 23 | from pyspark.ml.feature import VectorAssembler, VectorIndexer,OneHotEncoder, StringIndexer 24 | from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 25 | from pyspark.ml import Pipeline 26 | from pyspark.ml.evaluation import RegressionEvaluator 27 | from pyspark.sql.types import DoubleType, TimestampType, DateType 28 | from databricks import feature_store 29 | from databricks.feature_store import feature_table, FeatureLookup 30 | import mlflow 31 | from mlflow.tracking import MlflowClient 32 | import mlflow.keras 33 | import mlflow.sklearn 34 | import mlflow.models.signature as sch 35 | from mlflow.models.signature import ModelSignature 36 | from mlflow.types.schema import Schema, ColSpec 37 | import matplotlib.pyplot as plt 38 | import pandas as pd 39 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 40 | from scipy.stats import ks_2samp 41 | import scipy.stats as stats 42 | from xgboost import plot_importance, plot_tree, XGBRegressor 43 | from xgboost.spark import SparkXGBRegressor 44 | from datetime import datetime, timedelta 45 | from dateutil.relativedelta import relativedelta 46 | import time 47 | import unittest 48 | import requests 49 | import json 50 | 51 | # COMMAND ---------- 52 | 53 | # MAGIC %md 54 | # MAGIC ## Configuration 55 | 56 | # COMMAND ---------- 57 | 58 | train_start = '2015-01-01' 59 | train_end = '2021-12-31' 60 | test_start = '2022-01-01' 61 | test_end = '2023-01-01' 62 | db = 'df_dev' 63 | feauture_store = 'hourly_forecasting_features' 64 | consumption_countries_hourly ='final_consumption_countries_hourly' 65 | model_name = 'pyspark_mlflow_model' 66 | access_token = 'dapie24d3f30586ca9b17dbd6d28ce208086-2' 67 | databricks_instance = 'adb-8855338042472349.9.azuredatabricks.net' 68 | countries = ["belgium", "denmark", "france", "germany", "greece", "italy", "luxembourg", "netherlands", "spain", "sweden","switzerland"] #new 69 | experiment_id_training = '3578670731332255' 70 | experiment_id_retraining = '3578670731332164' 71 | fs = feature_store.FeatureStoreClient() 72 | pip_requirements = ["pyspark==3.4.0", 
"mlflow==2.3.2", "xgboost==1.7.5"] 73 | user = 'filippos.priovolos01@gmail.com' 74 | 75 | # COMMAND ---------- 76 | 77 | # MAGIC %md 78 | # MAGIC ## Schema 79 | 80 | # COMMAND ---------- 81 | 82 | input_schema = Schema([ 83 | ColSpec("integer", "belgium"), 84 | ColSpec("integer", "denmark"), 85 | ColSpec("integer", "france"), 86 | ColSpec("integer", "germany"), 87 | ColSpec("integer", "greece"), 88 | ColSpec("integer", "italy"), 89 | ColSpec("integer", "luxembourg"), 90 | ColSpec("integer", "netherlands"), 91 | ColSpec("integer", "spain"), 92 | ColSpec("integer", "sweden"), 93 | ColSpec("integer", "switzerland"), 94 | ColSpec("integer", "HOUR"), 95 | ColSpec("integer", "DAY_OF_WEEK"), 96 | ColSpec("integer", "MONTH"), 97 | ColSpec("integer", "QUARTER"), 98 | ColSpec("integer", "YEAR"), 99 | ColSpec("integer", "DAY_OF_YEAR"), 100 | ColSpec("integer", "DAY_OF_MONTH"), 101 | ColSpec("integer", "WEEK_OF_YEAR"), 102 | ColSpec("double", "ROLLING_MEAN_24H"), 103 | ColSpec("double", "ROLLING_STD_24H"), 104 | ColSpec("double", "ROLLING_SUM_7D"), 105 | ColSpec("double", "PREV_DAY_CONSUMPTION"), 106 | ColSpec("double", "PREV_WEEK_CONSUMPTION"), 107 | ColSpec("double", "PREVIOUS_MONTH_CONSUMPTION") 108 | ]) 109 | 110 | output_schema = Schema([ColSpec("double", "HOURLY_CONSUMPTION_MW")]) 111 | 112 | # COMMAND ---------- 113 | 114 | # MAGIC %md 115 | # MAGIC ## Model Signature 116 | 117 | # COMMAND ---------- 118 | 119 | # Create a model signature from the input and output schemas 120 | signature = ModelSignature(inputs=input_schema, outputs=output_schema) 121 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/03.Daily Monitoring.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Daily Inference" 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC * The code uses Apache Spark SQL to query and manipulate data. 8 | # MAGIC * It first selects data from a table for a specific date and casts the DATETIME column to a string, storing the result in a temporary view called 'daily_features'. 9 | # MAGIC * It then performs a merge operation between a target table 'predictions_xgb' and the temporary view. 10 | # MAGIC * For rows that have matching DATETIME and COUNTRY in both the target table and temporary view, it updates the RESIDUAL and ACTUAL_CONSUMPTION columns in the target table based on the data in the temporary view. 
11 | 12 | # COMMAND ---------- 13 | 14 | spark.sql(f"""SELECT *, CAST(DATETIME AS STRING) AS STR_DATETIME 15 | FROM db_monitor.final_monitoring_consumption_countries_hourly 16 | WHERE DATETIME >= '{date} 00:00' AND DATETIME <= '{date} 23:59' """).createOrReplaceTempView('daily_features') 17 | 18 | spark.sql(f""" 19 | MERGE INTO df_dev.predictions_xgb A 20 | USING daily_features B 21 | ON A.DATETIME = B.STR_DATETIME AND A.COUNTRY = B.COUNTRY 22 | WHEN MATCHED THEN 23 | UPDATE SET 24 | A.RESIDUAL = A.PREDICTED_CONSUMPTION - B.HOURLY_CONSUMPTION_MW, 25 | A.ACTUAL_CONSUMPTION = B.HOURLY_CONSUMPTION_MW 26 | """) 27 | 28 | 29 | # COMMAND ---------- 30 | 31 | df = spark.sql(f"SELECT * FROM df_dev.predictions_xgb WHERE DATETIME >= '{date} 00:00' AND DATETIME <= '{date} 23:59' ") 32 | 33 | # Convert the data types of ACTUAL_CONSUMPTION and PREDICTED_CONSUMPTION columns to DoubleType 34 | df = df.withColumn('ACTUAL_CONSUMPTION', col('ACTUAL_CONSUMPTION').cast(DoubleType())) 35 | df = df.withColumn('PREDICTED_CONSUMPTION', col('PREDICTED_CONSUMPTION').cast(DoubleType())) 36 | 37 | valuesAndPreds = df.select(['ACTUAL_CONSUMPTION', 'PREDICTED_CONSUMPTION']) 38 | valuesAndPreds = valuesAndPreds.rdd.map(tuple) 39 | 40 | metrics = RegressionMetrics(valuesAndPreds) 41 | 42 | # Squared Error 43 | print("MSE = %s" % metrics.meanSquaredError) 44 | print("RMSE = %s" % metrics.rootMeanSquaredError) 45 | 46 | # Mean absolute error 47 | print("MAE = %s" % metrics.meanAbsoluteError) 48 | 49 | 50 | # COMMAND ---------- 51 | 52 | # Calculate the percentage difference by dividing the difference by the absolute value of actual consumption 53 | df = df.withColumn('PERCENTAGE_DIFFERENCE', (col('RESIDUAL') / abs(col('ACTUAL_CONSUMPTION'))) * 100) 54 | 55 | # Calculate the absolute value of the percentage difference 56 | df = df.withColumn('ABS_PERCENTAGE_DIFFERENCE', abs(col('PERCENTAGE_DIFFERENCE'))) 57 | 58 | # Calculate the average absolute percentage difference 59 | average_absolute_percentage_difference = df.selectExpr('avg(ABS_PERCENTAGE_DIFFERENCE)').collect()[0][0] 60 | 61 | # Calculate the average percentage difference 62 | average_percentage_difference = df.selectExpr('avg(PERCENTAGE_DIFFERENCE)').collect()[0][0] 63 | 64 | display(df) 65 | # Print the average percentage difference 66 | print('Average Percentage Difference:', average_percentage_difference) 67 | # Print the average absolute percentage difference 68 | print('Average Absolute Percentage Difference:', average_absolute_percentage_difference) 69 | 70 | # COMMAND ---------- 71 | 72 | display(df.filter(df['COUNTRY'] == 'greece')) 73 | 74 | # COMMAND ---------- 75 | 76 | # MAGIC %md 77 | # MAGIC ## Save Inference Data to main table 78 | 79 | # COMMAND ---------- 80 | 81 | # Retrieve the predictions_xgb DataFrame using the table name 82 | predictions_xgb = spark.table('df_dev.predictions_xgb') 83 | 84 | # Select the columns from the first table and cast appropriate columns to match the second table's schema 85 | merged_df = predictions_xgb.select( 86 | col('DATETIME').cast('timestamp').alias('DATETIME'), 87 | col('COUNTRY'), 88 | col('PREDICTED_CONSUMPTION').cast(DoubleType()).alias('HOURLY_CONSUMPTION_MW') 89 | ) 90 | 91 | # Perform a merge operation to insert new records into the second table if they don't already exist 92 | merged_df.createOrReplaceTempView('temp_table') 93 | 94 | spark.sql(""" 95 | MERGE INTO df_dev.final_consumption_countries_hourly AS target 96 | USING temp_table AS source 97 | ON target.DATETIME = source.DATETIME AND target.COUNTRY 
= source.COUNTRY 98 | WHEN NOT MATCHED THEN 99 | INSERT (DATETIME, HOURLY_CONSUMPTION_MW, COUNTRY) 100 | VALUES (source.DATETIME, source.HOURLY_CONSUMPTION_MW, source.COUNTRY) 101 | """) 102 | 103 | # COMMAND ---------- 104 | 105 | 106 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/00. Initial Deployment/01.Exploratory Data Analysis.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | !pip install statsmodels 3 | from statsmodels.tsa.seasonal import seasonal_decompose 4 | from pyspark.sql.functions import count, when, isnull, col 5 | from pyspark.sql import functions as F 6 | import plotly.subplots as sp 7 | import plotly.graph_objects as go 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ## Univariate Analysis 13 | 14 | # COMMAND ---------- 15 | 16 | spark.sql('USE df_dev') 17 | df = spark.read.table('final_consumption_countries_hourly') 18 | 19 | # COMMAND ---------- 20 | 21 | # MAGIC %md 22 | # MAGIC * Distribution of records across years, months, days and hours: 23 | 24 | # COMMAND ---------- 25 | 26 | display(df.withColumn('year', F.year('DATETIME')).groupBy('year').count()) 27 | display(df.withColumn('month', F.month('DATETIME')).groupBy('month').count()) 28 | display(df.withColumn('day', F.dayofweek('DATETIME')).groupBy('day').count()) 29 | display(df.withColumn('hour', F.hour('DATETIME')).groupBy('hour').count()) 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md 34 | # MAGIC * Frequency of records for each country 35 | 36 | # COMMAND ---------- 37 | 38 | df.groupBy('COUNTRY').count().show() 39 | 40 | 41 | # COMMAND ---------- 42 | 43 | # MAGIC %md 44 | # MAGIC ## Bivariate Analysis 45 | 46 | # COMMAND ---------- 47 | 48 | # MAGIC %md 49 | # MAGIC * Average hourly consumption per country 50 | 51 | # COMMAND ---------- 52 | 53 | df.groupBy('COUNTRY').agg(F.avg('HOURLY_CONSUMPTION_MW').alias('avg_consumption')).show() 54 | 55 | # COMMAND ---------- 56 | 57 | # MAGIC %md 58 | # MAGIC * Monthly consumption trends per country 59 | 60 | # COMMAND ---------- 61 | 62 | df.withColumn('year', F.year('DATETIME')) \ 63 | .withColumn('month', F.month('DATETIME')) \ 64 | .groupBy('year', 'month', 'COUNTRY') \ 65 | .agg(F.sum('HOURLY_CONSUMPTION_MW').alias('total_consumption')) \ 66 | .orderBy('year', 'month') \ 67 | .show() 68 | 69 | 70 | # COMMAND ---------- 71 | 72 | # MAGIC %md 73 | # MAGIC * Heatmap: Average hourly consumption for each country by hour of the day or by month of the year 74 | 75 | # COMMAND ---------- 76 | 77 | import plotly.graph_objects as go 78 | 79 | # Convert the DataFrame to a 2D list for Plotly 80 | heatmap_data = df_heatmap.values.tolist() 81 | 82 | # Create the heatmap 83 | fig = go.Figure(data=go.Heatmap( 84 | z=heatmap_data, 85 | x=df_heatmap.columns.tolist(), 86 | y=df_heatmap.index.tolist(), 87 | colorscale='RdBu_r', # you can change this to other color scales 88 | )) 89 | 90 | # Set the layout 91 | fig.update_layout( 92 | title='Average Hourly Consumption by Country and Hour of Day', 93 | xaxis_title='Hour of Day', 94 | yaxis_title='Country', 95 | ) 96 | 97 | # Display the figure 98 | fig.show() 99 | 100 | 101 | # COMMAND ---------- 102 | 103 | pandas_df = df.toPandas() 104 | 105 | # COMMAND ---------- 106 | 107 | # MAGIC %md 108 | # MAGIC The decompose_country function takes a DataFrame df and a country name as inputs. 
It performs a time series decomposition on the 'HOURLY_CONSUMPTION_MW' column of the DataFrame for the specified country. 109 | # MAGIC 110 | # MAGIC 1. It filters the DataFrame to include data only for the specified country. 111 | # MAGIC 1. The data is sorted by date. 112 | # MAGIC 1. The date column is set as the index. 113 | # MAGIC 1. The data is resampled to a chosen frequency (monthly in this case). 114 | # MAGIC 1. Any missing values are filled using forward filling. 115 | # MAGIC 1. The seasonal decomposition is performed using an additive model. 116 | # MAGIC 1. The trend, seasonality, and residuals components are extracted. 117 | # MAGIC 1. Subplots are created for the original data, trend, seasonality, and residuals. 118 | # MAGIC 1. Traces are added to the subplots to visualize the components. 119 | # MAGIC 1. The plot layout is updated with appropriate dimensions and a title. 120 | # MAGIC 1. The plot is displayed. 121 | # MAGIC 122 | # MAGIC By calling the decompose_country function with a DataFrame and a country name, the code generates a plot showing the original data, trend, seasonality, and residuals components of the time series for that country. 123 | 124 | # COMMAND ---------- 125 | 126 | def decompose_country(df, country): 127 | # Filter data for the specified country 128 | df_country = df[df['COUNTRY'] == country] 129 | 130 | # Ensure the data is sorted by date 131 | df_country = df_country.sort_values('DATETIME') 132 | 133 | # Set the date as the index 134 | df_country.set_index('DATETIME', inplace=True) 135 | 136 | # Resample to hourly data, you can choose different frequency according to your data 137 | df_country = df_country.resample('M').asfreq() 138 | 139 | # Forward fill to handle the newly created NaNs 140 | df_country = df_country.bfill() 141 | 142 | # Perform the decomposition 143 | decomposition = seasonal_decompose(df_country['HOURLY_CONSUMPTION_MW'], model='additive') 144 | 145 | # Get the trend, seasonality and residuals 146 | trend = decomposition.trend 147 | seasonal = decomposition.seasonal 148 | residual = decomposition.resid 149 | 150 | # Create subplots: 4 rows, 1 column 151 | fig = sp.make_subplots(rows=4, cols=1) 152 | 153 | # Add traces 154 | fig.add_trace(go.Scatter(x=df_country.index, y=df_country['HOURLY_CONSUMPTION_MW'], mode='lines', name='Original'), row=1, col=1) 155 | fig.add_trace(go.Scatter(x=trend.index, y=trend, mode='lines', name='Trend'), row=2, col=1) 156 | fig.add_trace(go.Scatter(x=seasonal.index, y=seasonal, mode='lines', name='Seasonality'), row=3, col=1) 157 | fig.add_trace(go.Scatter(x=residual.index, y=residual, mode='lines', name='Residuals'), row=4, col=1) 158 | 159 | # Update layout 160 | fig.update_layout(height=800, width=1000, title_text="Decomposition for " + country, showlegend=True) 161 | 162 | # Render the plot 163 | fig.show() 164 | 165 | decompose_country(pandas_df, 'greece') 166 | 167 | 168 | # COMMAND ---------- 169 | 170 | 171 | -------------------------------------------------------------------------------- /MLOps Pipeline/Data Engineering/03. Data Quality/02. 
Great Expectations.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC Note: You can find the official documentation of great-expectations [here](https://docs.greatexpectations.io/docs/deployment_patterns/how_to_use_great_expectations_in_databricks/) 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC ## Install Great Expectations 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %run "/Users/filippos.priovolos01@gmail.com/Workflow Config/Great Expecations Config" 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC ## Set up Great Expectations 18 | 19 | # COMMAND ---------- 20 | 21 | root_directory = "/dbfs/great_expectations/" 22 | data_context_config = DataContextConfig( 23 | store_backend_defaults=FilesystemStoreBackendDefaults( 24 | root_directory=root_directory 25 | ), 26 | ) 27 | context = get_context(project_config=data_context_config) 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md 32 | # MAGIC ## Prepare data 33 | 34 | # COMMAND ---------- 35 | 36 | df = spark.read.format("delta") \ 37 | .option("header", "true") \ 38 | .option("inferSchema", "true") \ 39 | .table("df_dev.final_consumption_countries_hourly") 40 | 41 | 42 | # COMMAND ---------- 43 | 44 | display(df) 45 | 46 | # COMMAND ---------- 47 | 48 | # Sort the DataFrame by country and datetime 49 | df_sorted = df.orderBy("COUNTRY","DATETIME") 50 | display(df_sorted) 51 | 52 | # COMMAND ---------- 53 | 54 | # MAGIC %md 55 | # MAGIC ## Connect to the data 56 | 57 | # COMMAND ---------- 58 | 59 | my_spark_datasource_config = { 60 | "name": "delta_datasource", 61 | "class_name": "Datasource", 62 | "execution_engine": {"class_name": "SparkDFExecutionEngine"}, 63 | "data_connectors": { 64 | "delta_connector": { 65 | "module_name": "great_expectations.datasource.data_connector", 66 | "class_name": "RuntimeDataConnector", 67 | "batch_identifiers": [ 68 | "prod", 69 | "run_id1", 70 | ], 71 | } 72 | }, 73 | } 74 | 75 | 76 | # COMMAND ---------- 77 | 78 | context.test_yaml_config(yaml.dump(my_spark_datasource_config)) 79 | 80 | # COMMAND ---------- 81 | 82 | context.add_datasource(**my_spark_datasource_config) 83 | 84 | # COMMAND ---------- 85 | 86 | batch_request = RuntimeBatchRequest( 87 | datasource_name="delta_datasource", 88 | data_connector_name="delta_connector", 89 | data_asset_name="my_data_asset_name", 90 | batch_identifiers={ 91 | "prod": "my_production_data", 92 | "run_id1": f"my_run_id{datetime.date.today().strftime('%Y%m%d')}", 93 | }, 94 | runtime_parameters={"batch_data": df}, 95 | ) 96 | 97 | 98 | # COMMAND ---------- 99 | 100 | expectation_suite_name = "my_expectation_suite" 101 | context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name) 102 | validator = context.get_validator( 103 | batch_request=batch_request, 104 | expectation_suite_name=expectation_suite_name, 105 | ) 106 | 107 | print(validator.head()) 108 | 109 | # COMMAND ---------- 110 | 111 | from datetime import datetime 112 | 113 | # Define Expectations for the columns 114 | validator.expect_column_values_to_not_be_null("DATETIME") 115 | validator.expect_column_values_to_not_be_null("HOURLY_CONSUMPTION_MW") 116 | validator.expect_column_values_to_not_be_null("COUNTRY") 117 | 118 | validator.expect_column_values_to_be_of_type("DATETIME", "TimestampType") 119 | validator.expect_column_values_to_be_of_type("HOURLY_CONSUMPTION_MW", "DoubleType") 120 | validator.expect_column_values_to_be_of_type("COUNTRY", 
"StringType") 121 | 122 | validator.expect_column_values_to_be_in_set("COUNTRY", ["belgium", "denmark", "france", "germany", "greece", "italy", "luxembourg", "netherlands", "spain", "sweden", "switzerland"]) 123 | 124 | validator.expect_column_values_to_be_between("HOURLY_CONSUMPTION_MW", min_value=0) 125 | 126 | # This expectation checks if the mean of the HOURLY_CONSUMPTION_MW is within a certain range. Please adjust the min_value and max_value according to your data. 127 | validator.expect_column_mean_to_be_between("HOURLY_CONSUMPTION_MW", min_value=25000, max_value=50000) 128 | 129 | # This expectation checks if the median of the HOURLY_CONSUMPTION_MW is within a certain range. Please adjust the min_value and max_value according to your data. 130 | validator.expect_column_median_to_be_between("HOURLY_CONSUMPTION_MW", min_value=20000, max_value=35000) 131 | 132 | # This expectation checks if the standard deviation of the HOURLY_CONSUMPTION_MW is within a certain range. Please adjust the min_value and max_value according to your data. 133 | validator.expect_column_stdev_to_be_between("HOURLY_CONSUMPTION_MW", min_value=40000, max_value=70000) 134 | 135 | # Check if timestamps are in the correct range 136 | start_date = datetime(2015, 1, 1) 137 | end_date = datetime(2023, 1, 1) 138 | validator.expect_column_values_to_be_between("DATETIME", min_value=start_date, max_value=end_date) 139 | 140 | 141 | 142 | # COMMAND ---------- 143 | 144 | validator.save_expectation_suite(discard_failed_expectations=False) 145 | 146 | # COMMAND ---------- 147 | 148 | # MAGIC %md 149 | # MAGIC ## Validate data 150 | 151 | # COMMAND ---------- 152 | 153 | my_checkpoint_name = "my_data_validation_checkpoint" 154 | checkpoint_config = { 155 | "name": my_checkpoint_name, 156 | "config_version": 1.0, 157 | "class_name": "SimpleCheckpoint", 158 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 159 | } 160 | 161 | 162 | # COMMAND ---------- 163 | 164 | my_checkpoint = context.test_yaml_config(yaml.dump(checkpoint_config)) 165 | 166 | # COMMAND ---------- 167 | 168 | context.add_or_update_checkpoint(**checkpoint_config) 169 | 170 | # COMMAND ---------- 171 | 172 | checkpoint_result = context.run_checkpoint( 173 | checkpoint_name=my_checkpoint_name, 174 | validations=[ 175 | { 176 | "batch_request": batch_request, 177 | "expectation_suite_name": expectation_suite_name, 178 | } 179 | ], 180 | ) 181 | 182 | # COMMAND ---------- 183 | 184 | # MAGIC %md 185 | # MAGIC ## Build and view Data Docs 186 | 187 | # COMMAND ---------- 188 | 189 | html = '/dbfs/great_expectations/uncommitted/data_docs/local_site/index.html' 190 | with open(html, "r") as f: 191 | data = "".join([l for l in f]) 192 | displayHTML(data) 193 | 194 | # COMMAND ---------- 195 | 196 | 197 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/00. 
Initial Deployment/02.Feature Engineering.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Initial Deployment" 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC ## Configuration 8 | 9 | # COMMAND ---------- 10 | 11 | input_table_name = 'final_consumption_countries_hourly' 12 | output_table_name = 'hourly_forecasting_features' 13 | save_into_feature_store = True 14 | delete_fs = False 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %md 19 | # MAGIC ## Load Dataset 20 | 21 | # COMMAND ---------- 22 | 23 | table = spark.table(f'{db}.{input_table_name}') 24 | table.describe().show() 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md 29 | # MAGIC ## One-Hot-Encoding of Categorical Columns (Countries) 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md 34 | # MAGIC The create_country_features function adds binary country-specific features to a DataFrame. It iterates over distinct country values in the 'COUNTRY' column, creates a new column for each country, and assigns a value of 1 if the row corresponds to that country, and 0 otherwise. The updated DataFrame with the added features is returned and displayed using the display function. 35 | 36 | # COMMAND ---------- 37 | 38 | from pyspark.sql import functions as F 39 | def create_country_features(df): 40 | # for col in df.columns: 41 | countries = [row['COUNTRY'] for row in df.select('COUNTRY').distinct().collect()] 42 | countries.sort() 43 | for country in countries: 44 | df = df.withColumn("{}".format(country), F.when((df['COUNTRY'] == country), 1).otherwise(0)) 45 | return df 46 | 47 | features = create_country_features(table) 48 | display(features) 49 | 50 | # COMMAND ---------- 51 | 52 | # MAGIC %md 53 | # MAGIC ## Create Features for Model Training 54 | 55 | # COMMAND ---------- 56 | 57 | # MAGIC %md 58 | # MAGIC 1. Preprocessing and Sorting 59 | # MAGIC * Convert the 'DATETIME' column to datetime format. 60 | # MAGIC * Set this converted column as the index of the DataFrame. 61 | # MAGIC * Sort the DataFrame by 'COUNTRY' and 'DATETIME' columns. 62 | # MAGIC 2. Extracting Date Features 63 | # MAGIC * Create new columns for various date components: 'HOUR', 'DAY_OF_WEEK', 'MONTH', 'QUARTER', 'YEAR', 'DAY_OF_YEAR', 'DAY_OF_MONTH', and 'WEEK_OF_YEAR'. 64 | # MAGIC 3. Calculate Rolling Statistics & Lagged Features 65 | # MAGIC * For each country, calculate rolling mean, rolling standard deviation, and rolling sum of the 'HOURLY_CONSUMPTION_MW' over specific windows (24 hours and 7 days). 66 | # MAGIC * Create lagged features for 'HOURLY_CONSUMPTION_MW' such as the consumption of the previous day, previous week, and previous month. 67 | # MAGIC 4. Handling Null Values 68 | # MAGIC * Backward fill the null values generated due to shifting (lagged features) and rolling operations. 69 | # MAGIC 5. Drop Original Consumption Column 70 | # MAGIC * Drop the 'HOURLY_CONSUMPTION_MW' column as we have generated statistical features from it. 71 | # MAGIC 6. Return the Modified DataFrame 72 | # MAGIC * The function returns the DataFrame with the newly created features. 
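To make the rolling and lagged features concrete, here is a toy pandas illustration on synthetic hourly data (hypothetical values, not the project's dataset); the `create_features` function below applies the same idea per country:

```python
# Toy illustration with synthetic data (not the project's dataset):
# what rolling(24) and shift(24) produce on an hourly series.
import numpy as np
import pandas as pd

idx = pd.date_range("2022-01-01", periods=72, freq="H")  # three days of hourly timestamps
toy = pd.DataFrame(
    {"HOURLY_CONSUMPTION_MW": np.random.default_rng(0).uniform(20_000, 30_000, len(idx))},
    index=idx,
)

toy["ROLLING_MEAN_24H"] = toy["HOURLY_CONSUMPTION_MW"].rolling(window=24).mean()
toy["PREV_DAY_CONSUMPTION"] = toy["HOURLY_CONSUMPTION_MW"].shift(24)

# The first 23 rolling values and the first 24 lagged values are NaN;
# the pipeline backfills these, as in create_features below.
print(toy.iloc[20:28])
```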
73 | 74 | # COMMAND ---------- 75 | 76 | def create_features(df): 77 | """ 78 | Creates time series features from datetime index in order to save them in Features Store 79 | """ 80 | # Convert 'DATETIME' column to datetime format and set it as the index 81 | df['DATETIME'] = pd.to_datetime(df['DATETIME']) 82 | df.set_index('DATETIME', inplace=True) 83 | df.sort_values(['COUNTRY', 'DATETIME'], inplace=True) 84 | 85 | # Extract date-related features 86 | df['HOUR'] = df.index.hour 87 | df['DAY_OF_WEEK'] = df.index.dayofweek 88 | df['MONTH'] = df.index.month 89 | df['QUARTER'] = df.index.quarter 90 | df['YEAR'] = df.index.year 91 | df['DAY_OF_YEAR'] = df.index.dayofyear 92 | df['DAY_OF_MONTH'] = df.index.day 93 | df['WEEK_OF_YEAR'] = df.index.isocalendar().week 94 | 95 | # Calculate rolling statistics and lagged features for each country 96 | for country in df['COUNTRY'].unique(): 97 | df.loc[df['COUNTRY'] == country, 'ROLLING_MEAN_24H'] = df.loc[df['COUNTRY'] == country, 'HOURLY_CONSUMPTION_MW'].rolling(window=24).mean() 98 | df.loc[df['COUNTRY'] == country, 'ROLLING_STD_24H'] = df.loc[df['COUNTRY'] == country, 'HOURLY_CONSUMPTION_MW'].rolling(window=24).std() 99 | df.loc[df['COUNTRY'] == country, 'ROLLING_SUM_7D'] = df.loc[df['COUNTRY'] == country, 'HOURLY_CONSUMPTION_MW'].rolling(window=7 * 24, min_periods=1).sum() 100 | df.loc[df['COUNTRY'] == country, 'PREV_DAY_CONSUMPTION'] = df.loc[df['COUNTRY'] == country, 'HOURLY_CONSUMPTION_MW'].shift(24) 101 | df.loc[df['COUNTRY'] == country, 'PREV_WEEK_CONSUMPTION'] = df.loc[df['COUNTRY'] == country, 'HOURLY_CONSUMPTION_MW'].shift(24 * 7) 102 | df.loc[df['COUNTRY'] == country, 'PREVIOUS_MONTH_CONSUMPTION'] = df.loc[df['COUNTRY'] == country, 'HOURLY_CONSUMPTION_MW'].shift(24*30) 103 | 104 | # Backward fill only the rows that end up as null after shifting 105 | df['PREV_DAY_CONSUMPTION'] = df['PREV_DAY_CONSUMPTION'].fillna(method='bfill') 106 | df['PREV_WEEK_CONSUMPTION'] = df['PREV_WEEK_CONSUMPTION'].fillna(method='bfill') 107 | df['PREVIOUS_MONTH_CONSUMPTION'] = df['PREVIOUS_MONTH_CONSUMPTION'].fillna(method='bfill') 108 | df['ROLLING_MEAN_24H'] = df['ROLLING_MEAN_24H'].fillna(method='bfill') 109 | df['ROLLING_STD_24H'] = df['ROLLING_STD_24H'].fillna(method='bfill') 110 | 111 | df = df.drop('HOURLY_CONSUMPTION_MW',axis=1) 112 | 113 | return df 114 | 115 | 116 | # COMMAND ---------- 117 | 118 | # Convert features df from spark to pandas and call the create_features() 119 | features = create_features(features.toPandas()) 120 | features 121 | 122 | # COMMAND ---------- 123 | 124 | # MAGIC %md 125 | # MAGIC ## Create Primary Key 126 | 127 | # COMMAND ---------- 128 | 129 | # MAGIC %md 130 | # MAGIC By concatenating the 'COUNTRY' and 'DATETIME' values with an underscore ('_'), the code aims to create a composite key that uniquely identifies each row in the DataFrame 131 | 132 | # COMMAND ---------- 133 | 134 | features.reset_index(inplace=True) 135 | features['CONSUMPTION_ID'] = features.COUNTRY + '_' + features.DATETIME.astype(str) 136 | features.head() 137 | 138 | # COMMAND ---------- 139 | 140 | # MAGIC %md 141 | # MAGIC ## Save features dataset into Feature Store 142 | 143 | # COMMAND ---------- 144 | 145 | if save_into_feature_store: 146 | 147 | features.drop(['COUNTRY'], axis=1, inplace=True) 148 | 149 | features = spark.createDataFrame(features) 150 | 151 | fs = feature_store.FeatureStoreClient() 152 | 153 | fs.create_table( 154 | name=f'{db}.{output_table_name}', 155 | primary_keys=['CONSUMPTION_ID'], 156 | timestamp_keys='DATETIME', 157 | 
df=features 158 | ) 159 | 160 | # COMMAND ---------- 161 | 162 | # MAGIC %md 163 | # MAGIC ## Delete Features Store 164 | 165 | # COMMAND ---------- 166 | 167 | if delete_fs: 168 | from databricks.feature_store import FeatureStoreClient 169 | fs = FeatureStoreClient() 170 | fs.drop_table(name='df_dev.hourly_forecasting_features') 171 | print("Feature Store was succesfuly deleted") 172 | 173 | # COMMAND ---------- 174 | 175 | 176 | -------------------------------------------------------------------------------- /MLOps Pipeline/Data Engineering/01. Ingestion/02. Monitoring Data Ingestion.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Import Libraries 4 | 5 | # COMMAND ---------- 6 | 7 | from pyspark.sql import SparkSession 8 | from pyspark.sql.functions import col, lit, split, to_timestamp, date_format 9 | 10 | # COMMAND ---------- 11 | 12 | # Use df_landing database 13 | spark.sql('USE db_monitor') 14 | # create a Spark session 15 | spark = SparkSession.builder.getOrCreate() 16 | 17 | # COMMAND ---------- 18 | 19 | # Retrieve a list of all tables in the current database 20 | tables = spark.sql('SHOW TABLES') \ 21 | .select('tableName') \ 22 | .rdd.flatMap(lambda x: x) \ 23 | .collect() 24 | 25 | print(tables) 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %md 30 | # MAGIC ## Load & Aggregate Data from Database 31 | 32 | # COMMAND ---------- 33 | 34 | # MAGIC %md 35 | # MAGIC The load_data function in PySpark takes a table name as an input, and performs the following steps: 36 | # MAGIC 37 | # MAGIC 1. Load Data: Reads data from the specified table into a DataFrame. 38 | # MAGIC 2. Split Datetime: Splits the 'datetime' string into start and end times, and assigns the start time to a new column 'start_time'. 39 | # MAGIC 3. Convert Datetime: Transforms the 'start_time' string into a timestamp format. 40 | # MAGIC 4. Extract Hourly Time: Reduces the 'start_time' to an hourly format, discarding minute and second information. 41 | # MAGIC 5. Extract Country Name: Derives the country name from the table name (assumed to be the first part of the table name before an underscore). 42 | # MAGIC 6. Add Country Column: Adds a new column 'country' to the DataFrame, populated with the extracted country name. 43 | # MAGIC 7. Return DataFrame: Returns the modified DataFrame. 44 | # MAGIC 45 | # MAGIC This function prepares the loaded data for further analysis by transforming the timestamp into an hourly format and adding a country identifier. 
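As a quick illustration of steps 2-3, the datetime split and conversion can be exercised on a single hypothetical row; the sample string below is an assumption about the raw format, kept consistent with the 'dd.MM.yyyy HH:mm' pattern used in load_data:

```python
# Minimal sketch of the datetime parsing described above, on one hypothetical row.
from pyspark.sql import functions as F

sample = spark.createDataFrame(
    [("01.01.2022 00:00 - 01.01.2022 01:00", 8500.0)],
    ["datetime", "Actual_MW"],
)

parsed = (
    sample
    .withColumn("start_time", F.split("datetime", " - ").getItem(0))
    .withColumn("start_time", F.to_timestamp("start_time", "dd.MM.yyyy HH:mm"))
)
parsed.show(truncate=False)  # start_time -> 2022-01-01 00:00:00
```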
46 | 47 | # COMMAND ---------- 48 | 49 | # function to load data from a table and add a country column 50 | def load_data(table_name): 51 | df = spark.read.table(table_name) 52 | 53 | # split the datetime string into start and end times 54 | split_col = split(df['datetime'], ' - ') 55 | df = df.withColumn('start_time', split_col.getItem(0)) 56 | 57 | # convert the start time into timestamp format 58 | datetime_format = "dd.MM.yyyy HH:mm" 59 | df = df.withColumn('start_time', to_timestamp(df['start_time'], datetime_format)) 60 | 61 | # floor the start_time to the hour 62 | #df = df.withColumn('start_time', date_format(df['start_time'], 'yyyy-MM-dd HH:00:00').cast('timestamp')) 63 | 64 | # get the country name from the table name 65 | country = table_name.split("_")[0] 66 | 67 | # add the country column 68 | df = df.withColumn("country", lit(country)) 69 | 70 | # sort the values based on start_time in ascending order 71 | df = df.sort("start_time") 72 | 73 | return df 74 | 75 | 76 | # COMMAND ---------- 77 | 78 | # MAGIC %md 79 | # MAGIC ## Save data in each table 80 | 81 | # COMMAND ---------- 82 | 83 | # dictionary to store dataframes 84 | df_dict = {} 85 | 86 | # load data from each table 87 | for table in tables: 88 | df_dict[table.split('_')[0]] = load_data(table) 89 | 90 | # COMMAND ---------- 91 | 92 | # MAGIC %md 93 | # MAGIC 1. Sorts the DataFrame by the 'start_time' column. 94 | # MAGIC 2. Replaces null values in the 'Actual_MW' column with the previous non-null value using forward fill. This is achieved by applying the last() function with the ignorenulls=True argument over a window specification. 95 | # MAGIC 3. Replaces invalid values (0 or less) in the 'Actual_MW' column with the previous non-invalid value using forward fill. This is done by using the when() function to check if the value is less than or equal to 0, and if so, replaces it with the previous non-invalid value from the window. 96 | # MAGIC 4. Updates the DataFrame in the dictionary with the modified DataFrame. 97 | # MAGIC The code ensures that null and invalid values are replaced with appropriate values using forward fill, maintaining the ordering of the data by 'start_time' for each country DataFrame. 98 | 99 | # COMMAND ---------- 100 | 101 | for country, df_country in df_dict.items(): 102 | print(country,df_country) 103 | 104 | # COMMAND ---------- 105 | 106 | # Import the necessary functions 107 | from pyspark.sql.functions import col,when 108 | 109 | # Iterate over each country DataFrame in the dictionary 110 | for country, df_country in df_dict.items(): 111 | # Sort the DataFrame by 'start_time' 112 | df_country = df_country.orderBy('start_time') 113 | 114 | # Replace invalid values (0 or less) with null 115 | df_country = df_country.withColumn('Actual_MW', when(col('Actual_MW') <= 0, None).otherwise(col('Actual_MW'))) 116 | 117 | # Drop rows with null values 118 | df_country = df_country.dropna() 119 | 120 | # Update the DataFrame in the dictionary 121 | df_dict[country] = df_country 122 | 123 | 124 | # COMMAND ---------- 125 | 126 | # MAGIC %md 127 | # MAGIC * The get_hourly_query function is defined to create a SQL query for each table (country). This query selects the start_time, Actual_MW (renamed as HOURLY_CONSUMPTION_MW), and the table name (representing the country). 
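The same union can also be built without assembling SQL strings; a minimal alternative sketch (not the notebook's approach) using the DataFrame API over the df_dict created above:

```python
# Alternative sketch (not the notebook's approach): the same UNION ALL via the DataFrame API.
from functools import reduce
from pyspark.sql import functions as F

hourly_frames = [
    df_country.select(
        F.col("start_time").alias("DATETIME"),
        F.col("Actual_MW").alias("HOURLY_CONSUMPTION_MW"),
        F.lit(country).alias("COUNTRY"),
    )
    for country, df_country in df_dict.items()
]
final_hourly_df_alt = reduce(lambda a, b: a.unionByName(b), hourly_frames)
```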
128 | # MAGIC 129 | # MAGIC 130 | 131 | # COMMAND ---------- 132 | 133 | # function to generate SQL query for a given table 134 | def get_hourly_query(table_name): 135 | return f""" 136 | SELECT 137 | start_time AS DATETIME, 138 | Actual_MW AS HOURLY_CONSUMPTION_MW, 139 | '{table_name}' AS COUNTRY 140 | FROM {table_name} 141 | """ 142 | 143 | # COMMAND ---------- 144 | 145 | # register each DataFrame as a temporary view in Spark 146 | for table_name, df in df_dict.items(): 147 | df.createOrReplaceTempView(table_name) 148 | 149 | # COMMAND ---------- 150 | 151 | # MAGIC %md 152 | # MAGIC 153 | # MAGIC * The final_hourly_query is created by applying the get_hourly_query function to each country's DataFrame in the dictionary, joining the resulting SQL queries with UNION ALL. The UNION ALL SQL operation combines the rows from these separate queries into a single set of results. 154 | 155 | # COMMAND ---------- 156 | 157 | final_hourly_query = ' UNION ALL '.join([get_hourly_query(country) for country in df_dict.keys()]) 158 | 159 | # COMMAND ---------- 160 | 161 | # MAGIC %md 162 | # MAGIC 163 | # MAGIC * The final_hourly_query is then executed using spark.sql(). This command runs the combined SQL query and creates a DataFrame. 164 | # MAGIC 165 | # MAGIC * The .dropDuplicates(['DATETIME', 'COUNTRY']) operation removes any duplicate rows from the DataFrame based on the DATETIME and COUNTRY columns. 166 | # MAGIC 167 | # MAGIC * The .createOrReplaceTempView('final_hourly_df') operation creates a temporary view with the name 'final_hourly_df'. This is a named logical plan that is used as a stand-in for the DataFrame in Spark SQL queries. 168 | 169 | # COMMAND ---------- 170 | 171 | spark.sql(final_hourly_query) \ 172 | .dropDuplicates(['DATETIME', 'COUNTRY']) \ 173 | .createOrReplaceTempView('final_hourly_df') 174 | 175 | spark.sql(""" 176 | SELECT * FROM final_hourly_df 177 | ORDER BY DATETIME,COUNTRY 178 | """).createOrReplaceTempView('final_hourly_df_ordered') 179 | 180 | # COMMAND ---------- 181 | 182 | # MAGIC %md 183 | # MAGIC * The MERGE INTO statement is a SQL command that updates the consumption_countries_hourly table in the database. If a record (based on DATETIME and COUNTRY) already exists in the table, it updates the existing record with the new data. If a record does not exist, it inserts a new record with the data. 184 | # MAGIC 185 | # MAGIC 186 | 187 | # COMMAND ---------- 188 | 189 | spark.sql(f""" 190 | MERGE INTO monitoring_consumption_countries_hourly A 191 | USING final_hourly_df B 192 | ON A.DATETIME = B.DATETIME AND A.COUNTRY = B.COUNTRY 193 | WHEN MATCHED THEN 194 | UPDATE SET 195 | DATETIME = B.DATETIME, 196 | HOURLY_CONSUMPTION_MW = B.HOURLY_CONSUMPTION_MW, 197 | COUNTRY = B.COUNTRY 198 | WHEN NOT MATCHED 199 | THEN INSERT ( 200 | DATETIME, 201 | HOURLY_CONSUMPTION_MW, 202 | COUNTRY 203 | ) VALUES ( 204 | B.DATETIME, 205 | B.HOURLY_CONSUMPTION_MW, 206 | B.COUNTRY 207 | ) 208 | """) 209 | 210 | # COMMAND ---------- 211 | 212 | 213 | -------------------------------------------------------------------------------- /MLOps Pipeline/Data Engineering/01. Ingestion/01. 
Training Data Ingestion.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Import Libraries 4 | 5 | # COMMAND ---------- 6 | 7 | from pyspark.sql import SparkSession 8 | from pyspark.sql.functions import col, lit, split, to_timestamp, date_format 9 | 10 | # COMMAND ---------- 11 | 12 | # Use df_landing database 13 | spark.sql('USE df_landing') 14 | # create a Spark session 15 | spark = SparkSession.builder.getOrCreate() 16 | 17 | # COMMAND ---------- 18 | 19 | # Retrieve a list of all tables in the current database 20 | tables = spark.sql('SHOW TABLES') \ 21 | .select('tableName') \ 22 | .rdd.flatMap(lambda x: x) \ 23 | .collect() 24 | 25 | print(tables) 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %md 30 | # MAGIC ## Load & Aggregate Data from Database 31 | 32 | # COMMAND ---------- 33 | 34 | # MAGIC %md 35 | # MAGIC The load_data function in PySpark takes a table name as an input, and performs the following steps: 36 | # MAGIC 37 | # MAGIC 1. Load Data: Reads data from the specified table into a DataFrame. 38 | # MAGIC 2. Split Datetime: Splits the 'datetime' string into start and end times, and assigns the start time to a new column 'start_time'. 39 | # MAGIC 3. Convert Datetime: Transforms the 'start_time' string into a timestamp format. 40 | # MAGIC 4. Extract Hourly Time: Reduces the 'start_time' to an hourly format, discarding minute and second information. 41 | # MAGIC 5. Extract Country Name: Derives the country name from the table name (assumed to be the first part of the table name before an underscore). 42 | # MAGIC 6. Add Country Column: Adds a new column 'country' to the DataFrame, populated with the extracted country name. 43 | # MAGIC 7. Return DataFrame: Returns the modified DataFrame. 44 | # MAGIC 45 | # MAGIC This function prepares the loaded data for further analysis by transforming the timestamp into an hourly format and adding a country identifier. 46 | 47 | # COMMAND ---------- 48 | 49 | # function to load data from a table and add a country column 50 | def load_data(table_name): 51 | df = spark.read.table(table_name) 52 | 53 | # split the datetime string into start and end times 54 | split_col = split(df['datetime'], ' - ') 55 | df = df.withColumn('start_time', split_col.getItem(0)) 56 | 57 | # convert the start time into timestamp format 58 | datetime_format = "dd.MM.yyyy HH:mm" 59 | df = df.withColumn('start_time', to_timestamp(df['start_time'], datetime_format)) 60 | 61 | # floor the start_time to the hour 62 | #df = df.withColumn('start_time', date_format(df['start_time'], 'yyyy-MM-dd HH:00:00').cast('timestamp')) 63 | 64 | # get the country name from the table name 65 | country = table_name.split("_")[0] 66 | 67 | # add the country column 68 | df = df.withColumn("country", lit(country)) 69 | 70 | # sort the values based on start_time in ascending order 71 | df = df.sort("start_time") 72 | 73 | return df 74 | 75 | 76 | # COMMAND ---------- 77 | 78 | # MAGIC %md 79 | # MAGIC ## Save data in each table 80 | 81 | # COMMAND ---------- 82 | 83 | # dictionary to store dataframes 84 | df_dict = {} 85 | 86 | # load data from each table 87 | for table in tables: 88 | df_dict[table.split('_')[0]] = load_data(table) 89 | 90 | # COMMAND ---------- 91 | 92 | # MAGIC %md 93 | # MAGIC 1. Sorts the DataFrame by the 'start_time' column. 94 | # MAGIC 2. Replaces null values in the 'Actual_MW' column with the previous non-null value using forward fill. 
This is achieved by applying the last() function with the ignorenulls=True argument over a window specification. 95 | # MAGIC 3. Replaces invalid values (0 or less) in the 'Actual_MW' column with the previous non-invalid value using forward fill. This is done by using the when() function to check if the value is less than or equal to 0, and if so, replaces it with the previous non-invalid value from the window. 96 | # MAGIC 4. Updates the DataFrame in the dictionary with the modified DataFrame. 97 | # MAGIC The code ensures that null and invalid values are replaced with appropriate values using forward fill, maintaining the ordering of the data by 'start_time' for each country DataFrame. 98 | 99 | # COMMAND ---------- 100 | 101 | from pyspark.sql.functions import col, when, last 102 | from pyspark.sql.window import Window 103 | 104 | # Iterate over each country DataFrame in the dictionary 105 | for country, df_country in df_dict.items(): 106 | # Sort the DataFrame by 'start_time' 107 | df_country = df_country.orderBy('start_time') 108 | 109 | # Replace invalid values (0 or less) with null 110 | df_country = df_country.withColumn('Actual_MW', when(col('Actual_MW') <= 0, None).otherwise(col('Actual_MW'))) 111 | 112 | # Replace null values with previous non-null values using forward fill 113 | window_spec = Window.partitionBy('country').orderBy('start_time').rowsBetween(Window.unboundedPreceding, 0) 114 | df_country = df_country.withColumn('Actual_MW', last('Actual_MW', ignorenulls=True).over(window_spec)) 115 | 116 | # Update the DataFrame in the dictionary 117 | df_dict[country] = df_country 118 | 119 | 120 | # COMMAND ---------- 121 | 122 | # MAGIC %md 123 | # MAGIC * The get_hourly_query function is defined to create a SQL query for each table (country). This query selects the start_time, Actual_MW (renamed as HOURLY_CONSUMPTION_MW), and the table name (representing the country). 124 | # MAGIC 125 | # MAGIC 126 | 127 | # COMMAND ---------- 128 | 129 | # function to generate SQL query for a given table 130 | def get_hourly_query(table_name): 131 | return f""" 132 | SELECT 133 | start_time AS DATETIME, 134 | Actual_MW AS HOURLY_CONSUMPTION_MW, 135 | '{table_name}' AS COUNTRY 136 | FROM {table_name} 137 | """ 138 | 139 | # COMMAND ---------- 140 | 141 | # register each DataFrame as a temporary view in Spark 142 | for table_name, df in df_dict.items(): 143 | df.createOrReplaceTempView(table_name) 144 | 145 | # COMMAND ---------- 146 | 147 | # MAGIC %md 148 | # MAGIC 149 | # MAGIC * The final_hourly_query is created by applying the get_hourly_query function to each country's DataFrame in the dictionary, joining the resulting SQL queries with UNION ALL. The UNION ALL SQL operation combines the rows from these separate queries into a single set of results. 150 | 151 | # COMMAND ---------- 152 | 153 | final_hourly_query = ' UNION ALL '.join([get_hourly_query(country) for country in df_dict.keys()]) 154 | 155 | # COMMAND ---------- 156 | 157 | # MAGIC %md 158 | # MAGIC 159 | # MAGIC * The final_hourly_query is then executed using spark.sql(). This command runs the combined SQL query and creates a DataFrame. 160 | # MAGIC 161 | # MAGIC * The .dropDuplicates(['DATETIME', 'COUNTRY']) operation removes any duplicate rows from the DataFrame based on the DATETIME and COUNTRY columns. 162 | # MAGIC 163 | # MAGIC * The .createOrReplaceTempView('final_hourly_df') operation creates a temporary view with the name 'final_hourly_df'. 
This is a named logical plan that is used as a stand-in for the DataFrame in Spark SQL queries. 164 | 165 | # COMMAND ---------- 166 | 167 | spark.sql(final_hourly_query) \ 168 | .dropDuplicates(['DATETIME', 'COUNTRY']) \ 169 | .createOrReplaceTempView('final_hourly_df') 170 | 171 | spark.sql(""" 172 | SELECT * FROM final_hourly_df 173 | ORDER BY DATETIME,COUNTRY 174 | """).createOrReplaceTempView('final_hourly_df_ordered') 175 | 176 | # COMMAND ---------- 177 | 178 | database= 'df_dev' 179 | 180 | # COMMAND ---------- 181 | 182 | # MAGIC %md 183 | # MAGIC * The MERGE INTO statement is a SQL command that updates the consumption_countries_hourly table in the database. If a record (based on DATETIME and COUNTRY) already exists in the table, it updates the existing record with the new data. If a record does not exist, it inserts a new record with the data. 184 | # MAGIC 185 | # MAGIC 186 | 187 | # COMMAND ---------- 188 | 189 | spark.sql(f""" 190 | MERGE INTO {database}.consumption_countries_hourly A 191 | USING final_hourly_df B 192 | ON A.DATETIME = B.DATETIME AND A.COUNTRY = B.COUNTRY 193 | WHEN MATCHED THEN 194 | UPDATE SET 195 | DATETIME = B.DATETIME, 196 | HOURLY_CONSUMPTION_MW = B.HOURLY_CONSUMPTION_MW, 197 | COUNTRY = B.COUNTRY 198 | WHEN NOT MATCHED 199 | THEN INSERT ( 200 | DATETIME, 201 | HOURLY_CONSUMPTION_MW, 202 | COUNTRY 203 | ) VALUES ( 204 | B.DATETIME, 205 | B.HOURLY_CONSUMPTION_MW, 206 | B.COUNTRY 207 | ) 208 | """) 209 | 210 | # COMMAND ---------- 211 | 212 | 213 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/00. Initial Deployment/04.Unit Test.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Configuration 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Initial Deployment" 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ## Load Datasets 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC 17 | # MAGIC %md 18 | # MAGIC * Load energy consumption data from a database into a Pandas DataFrame. 19 | # MAGIC * Create a new column CONSUMPTION_ID by concatenating country codes with the date-time information. 20 | # MAGIC * Convert the DATETIME column to a proper datetime data type for time-based operations. 21 | # MAGIC * Define test labels, based on date-time ranges. 
22 | # MAGIC * Convert the subsets back into Spark DataFrames and select only the CONSUMPTION_ID, DATETIME, and HOURLY_CONSUMPTION_MW columns for further processing 23 | 24 | # COMMAND ---------- 25 | 26 | # Load Consumption Region Table 27 | consumption_countries_hourly = spark.table(f'{db}.final_consumption_countries_hourly').toPandas() 28 | consumption_countries_hourly['CONSUMPTION_ID'] = consumption_countries_hourly.COUNTRY + '_' + consumption_countries_hourly.DATETIME.astype(str) 29 | consumption_countries_hourly['DATETIME'] = pd.to_datetime(consumption_countries_hourly['DATETIME']) 30 | # Split the labels into training and test 31 | test_labels = consumption_countries_hourly.loc[(consumption_countries_hourly.DATETIME > test_start) & (consumption_countries_hourly.DATETIME <= test_end)] 32 | # Transforms to Spark DataFranes 33 | test_labels = spark.createDataFrame(test_labels).select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 34 | 35 | # COMMAND ---------- 36 | 37 | # MAGIC %md 38 | # MAGIC * Search for runs: The mlflow.search_runs function is called to search for all runs associated with the specified experiment_id_training. The runs are sorted by start time in descending order, meaning the latest run will be the first one in the list. The result is stored in the runs variable. 39 | # MAGIC 40 | # MAGIC * Select the latest run: The latest_run_id is assigned the run ID of the first run in the runs list (i.e., the latest run). This ID will be used to retrieve the details of the latest run. 41 | # MAGIC 42 | # MAGIC * Get the latest run details: The mlflow.get_run function is called with the latest_run_id to retrieve the details of the latest run. The details are stored in the latest_run variable. 43 | # MAGIC 44 | # MAGIC * Get the logged metrics: The metrics logged during the latest run are extracted from the latest_run.data.metrics attribute and stored in the metrics variable. 45 | 46 | # COMMAND ---------- 47 | 48 | # Search for all runs associated with the experiment ID, sorted by start time 49 | runs = mlflow.search_runs(experiment_ids=experiment_id_training, order_by=["start_time desc"]) 50 | 51 | #Select the first run in the list (i.e., the latest run) 52 | latest_run_id = runs.iloc[0]["run_id"] 53 | latest_run = mlflow.get_run(latest_run_id) 54 | 55 | # Get the metrics logged during the latest run 56 | metrics = latest_run.data.metrics 57 | 58 | # Print the metrics 59 | for key, value in metrics.items(): 60 | print(key, value) 61 | 62 | 63 | # COMMAND ---------- 64 | 65 | # MAGIC %md 66 | # MAGIC ## Model Performance Testing 67 | 68 | # COMMAND ---------- 69 | 70 | # MAGIC %md 71 | # MAGIC * We define some thresholds for our model to meet 72 | 73 | # COMMAND ---------- 74 | 75 | mse_threshold = 1000000000.0 76 | mae_threshold = 30000.0 77 | rmse_threshold = 40000.0 78 | r2_threshold = 0.9 79 | training_time_threshold = 3600.0 80 | 81 | # COMMAND ---------- 82 | 83 | # MAGIC %md 84 | # MAGIC The test_model_performance() function evaluates the performance of a model by comparing specific metrics against defined thresholds. It checks if metrics such as MSE, MAE, RMSE, R2 score, and training time meet the specified thresholds. Success or failure messages are printed for each test, and a boolean variable (all_tests_passed) is updated accordingly. The function returns the overall result indicating whether all tests passed (True) or if any of them failed (False). 
85 | 86 | # COMMAND ---------- 87 | 88 | def test_model_performance(): 89 | all_tests_passed = True 90 | try: 91 | assert metrics['MSE'] < mse_threshold 92 | print(f"MSE test passed with {metrics['MSE']} mean squared error") 93 | except AssertionError: 94 | print(f"MSE test failed. Expected < {mse_threshold} but got {metrics['MSE']}") 95 | all_tests_passed = False 96 | 97 | try: 98 | assert metrics['MAE'] < mae_threshold 99 | print(f"MAE test passed with {metrics['MAE']} mean absolute error") 100 | except AssertionError: 101 | print(f"MAE test failed. Expected < {mae_threshold} but got {metrics['MAE']}") 102 | all_tests_passed = False 103 | 104 | try: 105 | assert metrics['RMSE'] < rmse_threshold 106 | print(f"RMSE test passed with {metrics['RMSE']} root mean squared error") 107 | except AssertionError: 108 | print(f"RMSE test failed. Expected < {rmse_threshold} but got {metrics['RMSE']}") 109 | all_tests_passed = False 110 | 111 | try: 112 | assert metrics['R2'] > r2_threshold 113 | print(f"R2 test passed with {metrics['R2']} score") 114 | except AssertionError: 115 | print(f"R2 test failed. Expected > {r2_threshold} but got {metrics['R2']}") 116 | all_tests_passed = False 117 | 118 | 119 | try: 120 | assert metrics['Training Time(sec)'] < training_time_threshold #1hour 121 | print(f"Model training time test passed with {metrics['Training Time(sec)']} seconds") 122 | except AssertionError: 123 | print(f"Model training time test failed. Expected < {training_time_threshold} seconds but got {metrics['Training Time(sec)']} seconds") 124 | all_tests_passed = False 125 | 126 | return all_tests_passed 127 | 128 | # COMMAND ---------- 129 | 130 | # MAGIC %md 131 | # MAGIC ## Metrics Visualization 132 | 133 | # COMMAND ---------- 134 | 135 | # MAGIC %md 136 | # MAGIC We create a DataFrame that shows the metric values and their corresponding thresholds, along with pass/fail status for each test. It checks if the metric values meet the defined thresholds and assigns "Test Passed" or "Test Failed" based on the comparison. The purpose is to provide a visual representation of the test results for easy interpretation and evaluation of the model's performance against the thresholds. 137 | 138 | # COMMAND ---------- 139 | 140 | # Create a DataFrame with the metric values and their corresponding thresholds 141 | df = spark.createDataFrame([ 142 | ("MSE", metrics['MSE'], mse_threshold), 143 | ("MAE", metrics['MAE'], mae_threshold), 144 | ("RMSE", metrics['RMSE'], rmse_threshold), 145 | ("R2", metrics['R2'], r2_threshold), 146 | ("Training Time(sec)", metrics['Training Time(sec)'], training_time_threshold ) 147 | ], ["Metric", "Value", "Threshold"]) 148 | 149 | # Cast the "Threshold" column to DoubleType 150 | df = df.withColumn("Threshold", df["Threshold"].cast(DoubleType())) 151 | df = df.withColumn("Pass", when(df["Metric"].isin(["MSE", "MAE", "RMSE","Training Time(sec)"]), df["Value"] <= df["Threshold"]).otherwise(df["Value"] >= df["Threshold"])) 152 | # Add a column to show pass/fail as strings 153 | df = df.withColumn("Status", when(df["Pass"], "Test Passed").otherwise("Test Failed")) 154 | # Show the DataFrame 155 | display(df) 156 | 157 | # COMMAND ---------- 158 | 159 | # MAGIC %md 160 | # MAGIC ## Model Staging 161 | 162 | # COMMAND ---------- 163 | 164 | # MAGIC %md 165 | # MAGIC * The code automates the transition of the latest version of a registered model to the staging environment. 166 | # MAGIC * It checks the performance of the model using performance tests. 
167 | # MAGIC * If all performance tests pass, the model is transitioned to the staging environment. 168 | # MAGIC * If any test fails, the model is not staged and a message is printed indicating the failure. 169 | # MAGIC * The purpose is to ensure that only models meeting the performance criteria are moved to the staging environment. 170 | 171 | # COMMAND ---------- 172 | 173 | def proceed_model_to_staging(): 174 | # Get the latest version of the registered model 175 | client = mlflow.tracking.MlflowClient() 176 | model_version = client.get_latest_versions(model_name, stages=["None"])[0].version 177 | 178 | # Define the endpoint URL 179 | endpoint_url = f"https://{databricks_instance}/api/2.0/mlflow/databricks/model-versions/transition-stage" 180 | 181 | stage = 'Staging' #Define the stage you want your model to transit 182 | comment = "Transitioning to staging environment after performance testing" 183 | headers = { "Authorization": "Bearer " + access_token } 184 | 185 | request_body = { 186 | "version": f"{model_version}", 187 | "name": model_name, 188 | "stage" : stage, #Specifies the environment we want to transit our model 189 | "archive_existing_versions": False, #Specifies whether to archive all current model versions in the target stage. 190 | "comment": comment 191 | } 192 | 193 | # Make the request 194 | response = requests.post(endpoint_url, headers=headers,json=request_body) 195 | 196 | # Check the response status code 197 | if response.status_code == 200: 198 | print("Model version transitioned to staging") 199 | else: 200 | print(f"Error transitioning model version to staging: {response.text}") 201 | 202 | 203 | # Call function for staging 204 | 205 | all_tests_passed = test_model_performance() 206 | # run performance tests here 207 | if all_tests_passed: 208 | # proceed with model staging 209 | proceed_model_to_staging() 210 | else: 211 | print("Model performance tests failed. Model will not be staged.") 212 | 213 | # COMMAND ---------- 214 | 215 | 216 | -------------------------------------------------------------------------------- /MLOps Pipeline/Data Engineering/03. Data Quality/01. Data Quality Checks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | ! pip install plotly 3 | from pyspark.sql.functions import count, when, isnull, col 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import pandas as pd 7 | import plotly.graph_objects as go 8 | import numpy as np 9 | 10 | # COMMAND ---------- 11 | 12 | spark.sql('USE df_dev') 13 | 14 | # COMMAND ---------- 15 | 16 | df = spark.read.table('final_consumption_countries_hourly') 17 | 18 | # COMMAND ---------- 19 | 20 | df.count() 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md 25 | # MAGIC ## Data Profiling 26 | 27 | # COMMAND ---------- 28 | 29 | display(df) 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md 34 | # MAGIC ## Sanity & Data Quality Checks 35 | 36 | # COMMAND ---------- 37 | 38 | # MAGIC %md 39 | # MAGIC 1. Missing Values Check: Counts the number of null values in each column of the DataFrame, calculates the total number of nulls, and calculates the percentage of null values relative to the total number of rows. 40 | # MAGIC 41 | # MAGIC 1. Duplicates Check: Determines the count of duplicate rows by subtracting the count of the DataFrame after dropping duplicates from the original count. It also calculates the percentage of duplicate rows relative to the total number of rows. 42 | # MAGIC 43 | # MAGIC 1. 
Invalid Values Check: Counts the number of invalid records in each dataframe(ex. negative/zero energy consumption) 44 | # MAGIC 45 | # MAGIC 1. Outlier Detection: Defines bounds based on the first and third quartiles of the 'Actual_MW' column using the approxQuantile function. It then identifies outliers by counting the number of rows where the 'Actual_MW' value falls outside the defined bounds. 46 | # MAGIC 47 | # MAGIC 1. Schema Verification: Compares the DataFrame's column names to the expected column names ('Datetime', 'Actual_MW', 'start_time', 'country') and checks if any unexpected columns exist. 48 | # MAGIC 49 | # MAGIC 1. Summary Print: Displays the results of the data quality checks, including the count and percentage of missing values, count and percentage of duplicate rows, presence of outliers, and whether the schema matches the expected columns. 50 | # MAGIC 51 | # MAGIC 1. Statistical Checks: Prints basic statistical measures of the DataFrame using the describe function. 52 | 53 | # COMMAND ---------- 54 | 55 | print("\nData quality checks for concatenated dataframe...") 56 | 57 | # 1. Missing values check 58 | null_counts = df.select([count(when(isnull(c), c)).alias(c) for c in df.columns]).collect() 59 | total_nulls = sum(row[c] for row in null_counts for c in df.columns) 60 | nulls_percentage = (total_nulls / df.count()) * 100 61 | 62 | # 2. Invalid values check 63 | invalid_values = df.filter((df['HOURLY_CONSUMPTION_MW'] <= 0)).count() 64 | invalid_percentage = (invalid_values / df.count()) * 100 65 | 66 | # 3. Duplicates check 67 | duplicates_count = df.count() - df.dropDuplicates().count() 68 | duplicates_percentage = (duplicates_count / df.count()) * 100 69 | 70 | # 4. Outlier detection 71 | bounds = { 72 | c: dict( 73 | zip(["q1", "q3"], df.approxQuantile(c, [0.25, 0.75], 0)) 74 | ) 75 | for c in ["HOURLY_CONSUMPTION_MW"] 76 | } 77 | outliers = 0 78 | for c in bounds: 79 | iqr = bounds[c]['q3'] - bounds[c]['q1'] 80 | bounds[c]['lower'] = bounds[c]['q1'] - (iqr * 1.5) 81 | bounds[c]['upper'] = bounds[c]['q3'] + (iqr * 1.5) 82 | outliers += df.filter( 83 | (df[c] < bounds[c]['lower']) | 84 | (df[c] > bounds[c]['upper']) 85 | ).count() 86 | 87 | # 5. Schema verification 88 | expected_columns = ['DATETIME', 'HOURLY_CONSUMPTION_MW', 'COUNTRY'] 89 | schema_check = len(set(df.columns) - set(expected_columns)) == 0 90 | 91 | # Summary print 92 | print(f"Missing values: {total_nulls if total_nulls > 0 else 'None'} ({nulls_percentage:.4f}% of total rows)") 93 | print(f"Duplicate rows: {duplicates_count if duplicates_count > 0 else 'None'} ({duplicates_percentage:.4f}% of total rows)") 94 | print(f"Invalid values: {invalid_values if invalid_values > 0 else 'None'} ({invalid_percentage:.4f}% of total rows)") 95 | print(f"Outliers: {'Found' if outliers else 'None'}") 96 | print(f"Schema check: {'Unexpected schema' if not schema_check else 'Schema as expected'}") 97 | 98 | # 6. 
Statistical checks 99 | print("Basic statistical measures:") 100 | df.describe().show() 101 | 102 | 103 | # COMMAND ---------- 104 | 105 | from pyspark.sql.functions import col, lag, expr 106 | from pyspark.sql.window import Window 107 | from pyspark.sql import functions as F 108 | 109 | # Specify the column names in your DataFrame 110 | datetime_col = "DATETIME" 111 | country_col = "COUNTRY" 112 | 113 | # Sort the DataFrame by 'DATETIME' within each country 114 | window_spec = Window.partitionBy(country_col).orderBy(datetime_col) 115 | df_sorted = df.withColumn("start_time", col(datetime_col).cast("timestamp")).orderBy(country_col, datetime_col) 116 | 117 | # Calculate the time difference between consecutive records within each country 118 | df_sorted = df_sorted.withColumn("time_diff", col("start_time").cast("long") - lag(col("start_time").cast("long")).over(window_spec)) 119 | 120 | # Check if all time differences are exactly 1 hour within each country 121 | country_continuity = df_sorted.groupBy(country_col).agg(F.min(F.when(col("time_diff") == 3600, True)).alias("is_continuous")) 122 | 123 | # Show the results 124 | country_continuity.show() 125 | 126 | 127 | # COMMAND ---------- 128 | 129 | # MAGIC %md 130 | # MAGIC ## Clean Datasets(Duplicates,Null,Invalid) 131 | 132 | # COMMAND ---------- 133 | 134 | pandas_df = df.toPandas() 135 | 136 | # COMMAND ---------- 137 | 138 | # MAGIC %md 139 | # MAGIC ## LinePlot of the Energy Consumption Forecasting 140 | 141 | # COMMAND ---------- 142 | 143 | def create_plot(country, time_range_name, time_range): 144 | # Filter data for the specific country 145 | filtered_df = pandas_df[pandas_df['COUNTRY'] == country] 146 | 147 | # Filter data to the specified time range 148 | filtered_df = filtered_df.loc[(filtered_df['DATETIME'] >= time_range[0]) & (filtered_df['DATETIME'] <= time_range[1])] 149 | 150 | # Aggregate data to daily averages 151 | daily_df = filtered_df.groupby(pd.Grouper(key='DATETIME', freq='D')).mean() 152 | 153 | # Create a rolling average 154 | daily_df['Rolling_MW'] = daily_df['HOURLY_CONSUMPTION_MW'].rolling(window=7).mean() 155 | 156 | # Find the times corresponding to min and max Actual_MW in the filtered data 157 | min_time = filtered_df.loc[filtered_df['HOURLY_CONSUMPTION_MW'].idxmin(), 'DATETIME'] 158 | max_time = filtered_df.loc[filtered_df['HOURLY_CONSUMPTION_MW'].idxmax(), 'DATETIME'] 159 | 160 | # Create a line plot 161 | fig = go.Figure() 162 | 163 | # Add trace for actual MW 164 | fig.add_trace(go.Scatter(x=filtered_df['DATETIME'], y=filtered_df['HOURLY_CONSUMPTION_MW'], mode='markers', 165 | name='Actual MW', 166 | hovertemplate= 167 | "%{x}

" + 168 | "Actual MW: %{y}
" + 169 | "")) 170 | 171 | # Add trace for rolling average 172 | fig.add_trace(go.Scatter(x=daily_df.index, y=daily_df['Rolling_MW'], mode='markers', 173 | name='7-day Rolling Average', 174 | hovertemplate= 175 | "%{x}

" + 176 | "Rolling MW: %{y}
" + 177 | "")) 178 | 179 | # Add markers for min and max values 180 | fig.add_trace(go.Scatter(x=[min_time, max_time], 181 | y=[filtered_df.loc[filtered_df['DATETIME'] == min_time, 'HOURLY_CONSUMPTION_MW'].values[0], 182 | filtered_df.loc[filtered_df['DATETIME'] == max_time, 'HOURLY_CONSUMPTION_MW'].values[0]], 183 | mode='markers+text', 184 | marker=dict(size=[10, 10]), 185 | text=['Min', 'Max'], 186 | textposition="top center", 187 | name='Min/Max', 188 | hovertemplate= 189 | "%{x}

" + 190 | "Actual MW: %{y}
" + 191 | "")) 192 | 193 | # Add vertical lines for min and max values 194 | fig.add_shape( 195 | dict(type="line", x0=min_time, y0=0, x1=min_time, y1=filtered_df['HOURLY_CONSUMPTION_MW'].max(), 196 | line=dict(color="RoyalBlue", width=2))) 197 | fig.add_shape( 198 | dict(type="line", x0=max_time, y0=0, x1=max_time, y1=filtered_df['HOURLY_CONSUMPTION_MW'].max(), 199 | line=dict(color="RoyalBlue", width=2))) 200 | 201 | # Update layout 202 | fig.update_layout(title=f'Daily Energy Consumption for {country.capitalize()} over {time_range_name.capitalize()}', 203 | xaxis_title='DATETIME', 204 | yaxis_title='Energy Consumption (HOURLY_CONSUMPTION_MW)', 205 | hovermode='x') 206 | 207 | fig.show() 208 | 209 | 210 | # COMMAND ---------- 211 | 212 | # Define time ranges 213 | time_ranges = { 214 | 'decade':['2015-01-01','2023-01-01'], 215 | 'year': ['2022-01-01', '2023-01-01'], 216 | 'month': ['2022-12-01', '2023-01-01'], 217 | 'week': ['2022-12-25', '2023-01-01'] 218 | } 219 | 220 | create_plot('greece','year', time_ranges['year']) 221 | 222 | # COMMAND ---------- 223 | 224 | # MAGIC %md 225 | # MAGIC ## Box-and-Whisker Plot 226 | 227 | # COMMAND ---------- 228 | 229 | import plotly.graph_objects as go 230 | 231 | def create_box_plot(country): 232 | # Filter data for the specific country 233 | filtered_df = pandas_df[pandas_df['COUNTRY'] == country] 234 | 235 | # Create a box plot 236 | fig = go.Figure() 237 | 238 | # Add box trace 239 | fig.add_trace(go.Box(y=filtered_df['HOURLY_CONSUMPTION_MW'], name='HOURLY_CONSUMPTION_MW')) 240 | 241 | # Update layout 242 | fig.update_layout(title=f'Boxplot of Energy Consumption for {country.capitalize()}', 243 | yaxis_title='Energy Consumption (HOURLY_CONSUMPTION_MW)') 244 | 245 | fig.show() 246 | 247 | create_box_plot('greece') 248 | 249 | # COMMAND ---------- 250 | 251 | 252 | -------------------------------------------------------------------------------- /MLOps Pipeline/Data Engineering/03. Data Quality/02. Monitoring Data Quality Checks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | ! pip install plotly 3 | from pyspark.sql.functions import count, when, isnull, col 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import pandas as pd 7 | import plotly.graph_objects as go 8 | import numpy as np 9 | 10 | # COMMAND ---------- 11 | 12 | spark.sql('USE db_monitor') 13 | 14 | # COMMAND ---------- 15 | 16 | df = spark.read.table('final_monitoring_consumption_countries_hourly') 17 | 18 | # COMMAND ---------- 19 | 20 | df.count() 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md 25 | # MAGIC ## Data Profiling 26 | 27 | # COMMAND ---------- 28 | 29 | display(df) 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md 34 | # MAGIC ## Sanity & Data Quality Checks 35 | 36 | # COMMAND ---------- 37 | 38 | # MAGIC %md 39 | # MAGIC 1. Missing Values Check: Counts the number of null values in each column of the DataFrame, calculates the total number of nulls, and calculates the percentage of null values relative to the total number of rows. 40 | # MAGIC 41 | # MAGIC 1. Duplicates Check: Determines the count of duplicate rows by subtracting the count of the DataFrame after dropping duplicates from the original count. It also calculates the percentage of duplicate rows relative to the total number of rows. 42 | # MAGIC 43 | # MAGIC 1. Invalid Values Check: Counts the number of invalid records in each dataframe(ex. negative/zero energy consumption) 44 | # MAGIC 45 | # MAGIC 1. 
Outlier Detection: Defines bounds based on the first and third quartiles of the 'Actual_MW' column using the approxQuantile function. It then identifies outliers by counting the number of rows where the 'Actual_MW' value falls outside the defined bounds. 46 | # MAGIC 47 | # MAGIC 1. Schema Verification: Compares the DataFrame's column names to the expected column names ('Datetime', 'Actual_MW', 'start_time', 'country') and checks if any unexpected columns exist. 48 | # MAGIC 49 | # MAGIC 1. Summary Print: Displays the results of the data quality checks, including the count and percentage of missing values, count and percentage of duplicate rows, presence of outliers, and whether the schema matches the expected columns. 50 | # MAGIC 51 | # MAGIC 1. Statistical Checks: Prints basic statistical measures of the DataFrame using the describe function. 52 | 53 | # COMMAND ---------- 54 | 55 | print("\nData quality checks for concatenated dataframe...") 56 | 57 | # 1. Missing values check 58 | null_counts = df.select([count(when(isnull(c), c)).alias(c) for c in df.columns]).collect() 59 | total_nulls = sum(row[c] for row in null_counts for c in df.columns) 60 | nulls_percentage = (total_nulls / df.count()) * 100 61 | 62 | # 2. Invalid values check 63 | invalid_values = df.filter((df['HOURLY_CONSUMPTION_MW'] <= 0)).count() 64 | invalid_percentage = (invalid_values / df.count()) * 100 65 | 66 | # 3. Duplicates check 67 | duplicates_count = df.count() - df.dropDuplicates().count() 68 | duplicates_percentage = (duplicates_count / df.count()) * 100 69 | 70 | # 4. Outlier detection 71 | bounds = { 72 | c: dict( 73 | zip(["q1", "q3"], df.approxQuantile(c, [0.25, 0.75], 0)) 74 | ) 75 | for c in ["HOURLY_CONSUMPTION_MW"] 76 | } 77 | outliers = 0 78 | for c in bounds: 79 | iqr = bounds[c]['q3'] - bounds[c]['q1'] 80 | bounds[c]['lower'] = bounds[c]['q1'] - (iqr * 1.5) 81 | bounds[c]['upper'] = bounds[c]['q3'] + (iqr * 1.5) 82 | outliers += df.filter( 83 | (df[c] < bounds[c]['lower']) | 84 | (df[c] > bounds[c]['upper']) 85 | ).count() 86 | 87 | # 5. Schema verification 88 | expected_columns = ['DATETIME', 'HOURLY_CONSUMPTION_MW', 'COUNTRY'] 89 | schema_check = len(set(df.columns) - set(expected_columns)) == 0 90 | 91 | # Summary print 92 | print(f"Missing values: {total_nulls if total_nulls > 0 else 'None'} ({nulls_percentage:.4f}% of total rows)") 93 | print(f"Duplicate rows: {duplicates_count if duplicates_count > 0 else 'None'} ({duplicates_percentage:.4f}% of total rows)") 94 | print(f"Invalid values: {invalid_values if invalid_values > 0 else 'None'} ({invalid_percentage:.4f}% of total rows)") 95 | print(f"Outliers: {'Found' if outliers else 'None'}") 96 | print(f"Schema check: {'Unexpected schema' if not schema_check else 'Schema as expected'}") 97 | 98 | # 6. 
Statistical checks 99 | print("Basic statistical measures:") 100 | df.describe().show() 101 | 102 | 103 | # COMMAND ---------- 104 | 105 | from pyspark.sql.functions import col, lag, expr 106 | from pyspark.sql.window import Window 107 | from pyspark.sql import functions as F 108 | 109 | # Specify the column names in your DataFrame 110 | datetime_col = "DATETIME" 111 | country_col = "COUNTRY" 112 | 113 | # Sort the DataFrame by 'DATETIME' within each country 114 | window_spec = Window.partitionBy(country_col).orderBy(datetime_col) 115 | df_sorted = df.withColumn("start_time", col(datetime_col).cast("timestamp")).orderBy(country_col, datetime_col) 116 | 117 | # Calculate the time difference between consecutive records within each country 118 | df_sorted = df_sorted.withColumn("time_diff", col("start_time").cast("long") - lag(col("start_time").cast("long")).over(window_spec)) 119 | 120 | # Check if all time differences are exactly 1 hour within each country 121 | country_continuity = df_sorted.groupBy(country_col).agg(F.min(F.when(col("time_diff") == 3600, True)).alias("is_continuous")) 122 | 123 | # Show the results 124 | country_continuity.show() 125 | 126 | 127 | # COMMAND ---------- 128 | 129 | # MAGIC %md 130 | # MAGIC ## Clean Datasets(Duplicates,Null,Invalid) 131 | 132 | # COMMAND ---------- 133 | 134 | pandas_df = df.toPandas() 135 | 136 | # COMMAND ---------- 137 | 138 | # MAGIC %md 139 | # MAGIC ## LinePlot of the Energy Consumption Forecasting 140 | 141 | # COMMAND ---------- 142 | 143 | def create_plot(country, time_range_name, time_range): 144 | # Filter data for the specific country 145 | filtered_df = pandas_df[pandas_df['COUNTRY'] == country] 146 | 147 | # Filter data to the specified time range 148 | filtered_df = filtered_df.loc[(filtered_df['DATETIME'] >= time_range[0]) & (filtered_df['DATETIME'] <= time_range[1])] 149 | 150 | # Aggregate data to daily averages 151 | daily_df = filtered_df.groupby(pd.Grouper(key='DATETIME', freq='D')).mean() 152 | 153 | # Create a rolling average 154 | daily_df['Rolling_MW'] = daily_df['HOURLY_CONSUMPTION_MW'].rolling(window=7).mean() 155 | 156 | # Find the times corresponding to min and max Actual_MW in the filtered data 157 | min_time = filtered_df.loc[filtered_df['HOURLY_CONSUMPTION_MW'].idxmin(), 'DATETIME'] 158 | max_time = filtered_df.loc[filtered_df['HOURLY_CONSUMPTION_MW'].idxmax(), 'DATETIME'] 159 | 160 | # Create a line plot 161 | fig = go.Figure() 162 | 163 | # Add trace for actual MW 164 | fig.add_trace(go.Scatter(x=filtered_df['DATETIME'], y=filtered_df['HOURLY_CONSUMPTION_MW'], mode='markers', 165 | name='Actual MW', 166 | hovertemplate= 167 | "%{x}

" + 168 | "Actual MW: %{y}
" + 169 | "")) 170 | 171 | # Add trace for rolling average 172 | fig.add_trace(go.Scatter(x=daily_df.index, y=daily_df['Rolling_MW'], mode='markers', 173 | name='7-day Rolling Average', 174 | hovertemplate= 175 | "%{x}

" + 176 | "Rolling MW: %{y}
" + 177 | "")) 178 | 179 | # Add markers for min and max values 180 | fig.add_trace(go.Scatter(x=[min_time, max_time], 181 | y=[filtered_df.loc[filtered_df['DATETIME'] == min_time, 'HOURLY_CONSUMPTION_MW'].values[0], 182 | filtered_df.loc[filtered_df['DATETIME'] == max_time, 'HOURLY_CONSUMPTION_MW'].values[0]], 183 | mode='markers+text', 184 | marker=dict(size=[10, 10]), 185 | text=['Min', 'Max'], 186 | textposition="top center", 187 | name='Min/Max', 188 | hovertemplate= 189 | "%{x}

" + 190 | "Actual MW: %{y}
" + 191 | "")) 192 | 193 | # Add vertical lines for min and max values 194 | fig.add_shape( 195 | dict(type="line", x0=min_time, y0=0, x1=min_time, y1=filtered_df['HOURLY_CONSUMPTION_MW'].max(), 196 | line=dict(color="RoyalBlue", width=2))) 197 | fig.add_shape( 198 | dict(type="line", x0=max_time, y0=0, x1=max_time, y1=filtered_df['HOURLY_CONSUMPTION_MW'].max(), 199 | line=dict(color="RoyalBlue", width=2))) 200 | 201 | # Update layout 202 | fig.update_layout(title=f'Daily Energy Consumption for {country.capitalize()} over {time_range_name.capitalize()}', 203 | xaxis_title='DATETIME', 204 | yaxis_title='Energy Consumption (HOURLY_CONSUMPTION_MW)', 205 | hovermode='x') 206 | 207 | fig.show() 208 | 209 | 210 | # COMMAND ---------- 211 | 212 | # Define time ranges 213 | time_ranges = { 214 | 'decade':['2015-01-01','2023-01-01'], 215 | 'year': ['2022-01-01', '2023-01-01'], 216 | 'month': ['2023-01-01', '2023-02-01'], 217 | 'week': ['2022-12-25', '2023-01-01'] 218 | } 219 | 220 | create_plot('greece','month', time_ranges['month']) 221 | 222 | # COMMAND ---------- 223 | 224 | # MAGIC %md 225 | # MAGIC ## Box-and-Whisker Plot 226 | 227 | # COMMAND ---------- 228 | 229 | import plotly.graph_objects as go 230 | 231 | def create_box_plot(country): 232 | # Filter data for the specific country 233 | filtered_df = pandas_df[pandas_df['COUNTRY'] == country] 234 | 235 | # Create a box plot 236 | fig = go.Figure() 237 | 238 | # Add box trace 239 | fig.add_trace(go.Box(y=filtered_df['HOURLY_CONSUMPTION_MW'], name='HOURLY_CONSUMPTION_MW')) 240 | 241 | # Update layout 242 | fig.update_layout(title=f'Boxplot of Energy Consumption for {country.capitalize()}', 243 | yaxis_title='Energy Consumption (HOURLY_CONSUMPTION_MW)') 244 | 245 | fig.show() 246 | 247 | create_box_plot('greece') 248 | 249 | # COMMAND ---------- 250 | 251 | 252 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/00. Initial Deployment/03.Model Training.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Configuration 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Initial Deployment" 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ##Load Datasets 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC * Load energy consumption data from a database into a Pandas DataFrame. 18 | # MAGIC * Create a new column CONSUMPTION_ID by concatenating country codes with the date-time information. 19 | # MAGIC * Convert the DATETIME column to a proper datetime data type for time-based operations. 20 | # MAGIC * Split the data into two subsets: train_labels and test_labels, based on date-time ranges. 
21 | # MAGIC * Convert the subsets back into Spark DataFrames and select only the CONSUMPTION_ID, DATETIME, and HOURLY_CONSUMPTION_MW columns for further processing 22 | 23 | # COMMAND ---------- 24 | 25 | # Load Consumption Region Table 26 | consumption_countries_hourly = spark.table(f'{db}.final_consumption_countries_hourly').toPandas() 27 | consumption_countries_hourly['CONSUMPTION_ID'] = consumption_countries_hourly.COUNTRY + '_' + consumption_countries_hourly.DATETIME.astype(str) 28 | consumption_countries_hourly['DATETIME'] = pd.to_datetime(consumption_countries_hourly['DATETIME']) 29 | 30 | # Split the labels into training and test 31 | train_labels = consumption_countries_hourly.loc[(consumption_countries_hourly.DATETIME >= train_start) & (consumption_countries_hourly.DATETIME <= train_end)] 32 | test_labels = consumption_countries_hourly.loc[(consumption_countries_hourly.DATETIME > test_start) & (consumption_countries_hourly.DATETIME <= test_end)] 33 | #val_labels = consumption_countries_hourly.loc[(consumption_countries_hourly.DATETIME > test_end) & (consumption_countries_hourly.DATETIME <= validation_end)] 34 | 35 | # Transforms to Spark DataFranes 36 | train_labels = spark.createDataFrame(train_labels).select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 37 | test_labels = spark.createDataFrame(test_labels).select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 38 | #val_labels = spark.createDataFrame(val_labels).select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md 43 | # MAGIC * Define load_data function to create training sets by fetching features based on specified keys. 44 | # MAGIC * Inside the function, initialize feature lookups and create a training set by matching keys from input data. 45 | # MAGIC * Convert the training set to a Pandas DataFrame. 46 | # MAGIC * Call the load_data function to create training and test sets, and store them in variables training_set, train_df, and test_df. 
47 | 48 | # COMMAND ---------- 49 | 50 | def load_data(table_name, labels, lookup_key, ts_lookup_key): 51 | # In the FeatureLookup, if you do not provide the `feature_names` parameter, all features except primary keys are returned 52 | model_feature_lookups = [FeatureLookup(table_name=table_name, lookup_key=lookup_key, timestamp_lookup_key=ts_lookup_key)] 53 | 54 | # fs.create_training_set looks up features in model_feature_lookups that match the primary key from inference_data_df 55 | training_set = fs.create_training_set(labels, 56 | model_feature_lookups, 57 | label="HOURLY_CONSUMPTION_MW", 58 | exclude_columns=["CONSUMPTION_ID", "DATETIME"]) 59 | training_pd = training_set.load_df().toPandas() 60 | 61 | return training_set, training_pd 62 | 63 | training_set, train_df = load_data(f'{db}.{feauture_store}', train_labels, 'CONSUMPTION_ID', "DATETIME") 64 | _, test_df = load_data(f'{db}.{feauture_store}', test_labels, 'CONSUMPTION_ID', "DATETIME") 65 | #_, val_df = load_data(f'{db}.{feauture_store}', val_labels, 'CONSUMPTION_ID', "DATE") 66 | 67 | 68 | # COMMAND ---------- 69 | 70 | # MAGIC %md 71 | # MAGIC ## Split to Features/Labels 72 | 73 | # COMMAND ---------- 74 | 75 | X_train = train_df.drop(columns=['HOURLY_CONSUMPTION_MW']) 76 | y_train = train_df['HOURLY_CONSUMPTION_MW'] 77 | X_test = test_df.drop(columns=['HOURLY_CONSUMPTION_MW']) 78 | y_test = test_df['HOURLY_CONSUMPTION_MW'] 79 | 80 | # COMMAND ---------- 81 | 82 | # MAGIC %md 83 | # MAGIC ### Create XGB Regressor and Register Model to Feature Store 84 | 85 | # COMMAND ---------- 86 | 87 | # MAGIC %md 88 | # MAGIC * The regressor is configured with the following hyperparameters: 89 | # MAGIC * n_estimators: The number of trees in the ensemble (200). 90 | # MAGIC * max_depth: The maximum depth of each tree (8). 91 | # MAGIC * learning_rate: The step size shrinkage used in each boosting iteration (0.1). 92 | # MAGIC * objective: The loss function to be optimized, using squared error for regression ('reg:squarederror'). 93 | # MAGIC * booster: The type of booster to use, specifically the gradient boosting tree ('gbtree'). 94 | # MAGIC * subsample: The fraction of training samples used for training each tree (0.8). 95 | # MAGIC * colsample_bytree: The fraction of features used for training each tree (0.8). 96 | # MAGIC * random_state: The random seed used for reproducibility (42). 97 | 98 | # COMMAND ---------- 99 | 100 | def create_regressor(): 101 | return XGBRegressor( 102 | n_estimators=300, 103 | max_depth=8, 104 | learning_rate=0.1, 105 | objective='reg:squarederror', 106 | booster='gbtree', 107 | subsample=0.8, 108 | colsample_bytree=0.8, 109 | random_state=42, 110 | ) 111 | 112 | # COMMAND ---------- 113 | 114 | # MAGIC %md 115 | # MAGIC * mse (Mean Squared Error): It measures the average squared difference between the true and predicted values. 116 | # MAGIC * rmse (Root Mean Squared Error): It is the square root of the MSE, providing a more interpretable measure of the error. 117 | # MAGIC * mae (Mean Absolute Error): It calculates the average absolute difference between the true and predicted values. 118 | # MAGIC * r2 (R-squared): It indicates the proportion of the variance in the true values that is explained by the predicted values. 119 | # MAGIC The calculated metrics are returned as a tuple (mse, rmse, mae, r2). 
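For reference, the four metrics described above have the following standard definitions (a notational summary added for clarity, not part of the original notebook; n is the number of test samples, y_i the observed consumption, ŷ_i the model prediction, and ȳ the mean of the observed values):

```latex
\mathrm{MSE}  = \frac{1}{n}\sum_{i=1}^{n}\bigl(y_i-\hat{y}_i\bigr)^2, \qquad
\mathrm{RMSE} = \sqrt{\mathrm{MSE}}, \qquad
\mathrm{MAE}  = \frac{1}{n}\sum_{i=1}^{n}\bigl|y_i-\hat{y}_i\bigr|, \qquad
R^2 = 1 - \frac{\sum_{i=1}^{n}\bigl(y_i-\hat{y}_i\bigr)^2}{\sum_{i=1}^{n}\bigl(y_i-\bar{y}\bigr)^2}
```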
120 | 121 | # COMMAND ---------- 122 | 123 | def evaluate_model(y_test, y_pred): 124 | mse = mean_squared_error(y_test, y_pred) 125 | rmse = mean_squared_error(y_test, y_pred, squared=False) 126 | mae = mean_absolute_error(y_test, y_pred) 127 | r2 = r2_score(y_test, y_pred) 128 | return mse, rmse, mae, r2 129 | 130 | # COMMAND ---------- 131 | 132 | 133 | def log_metrics(mse, rmse, mae, r2, training_time): 134 | mlflow.log_metric("MAE", mae) 135 | mlflow.log_metric("MSE", mse) 136 | mlflow.log_metric("RMSE", rmse) 137 | mlflow.log_metric("R2", r2) 138 | mlflow.log_metric("Training Time(sec)", training_time) 139 | 140 | # COMMAND ---------- 141 | 142 | # MAGIC %md 143 | # MAGIC * It starts an MLflow run within a with statement to encapsulate the training and logging process. 144 | # MAGIC * An XGBoost regressor is created using the create_regressor() function and trained on the training data using the fit() method. The regressor's predictions are then calculated using the testing data. 145 | # MAGIC * The performance of the model is evaluated using the evaluate_model() function, which calculates metrics such as MSE, RMSE, MAE, and R2. 146 | # MAGIC * The input schema, model hyperparameters, metrics, feature importances, and other relevant information are logged using MLflow's tracking capabilities. 147 | # MAGIC * The trained model is logged as an artifact using the feature store's log_model() function, and various parameters and information are logged for comparison and tracking purposes. 148 | # MAGIC 149 | # MAGIC The purpose of this function is to train and log the model to feature store 150 | 151 | # COMMAND ---------- 152 | 153 | def train_model(X_train, X_test, y_train, y_test, training_set, fs, model_name, input_schema): 154 | experiment_id = experiment_id_training 155 | experiment = mlflow.get_experiment(experiment_id) 156 | 157 | if experiment: 158 | experiment_name = experiment.name 159 | mlflow.set_experiment(experiment_name) 160 | print(f"Active experiment set to '{experiment_name}'") 161 | else: 162 | print(f"No experiment found with name '{experiment_name}'") 163 | 164 | with mlflow.start_run(nested=True) as run: 165 | # Create and train XGBoost regressor 166 | reg = create_regressor() 167 | start_time = time.time() 168 | reg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=10, verbose=False) 169 | end_time = time.time() 170 | 171 | # Make predictions 172 | y_pred = reg.predict(X_test) 173 | 174 | # Evaluate the model 175 | mse, rmse, mae, r2 = evaluate_model(y_test, y_pred) 176 | 177 | # Log the model input schema 178 | input_schema = {"feature_names": list(X_train.columns)} 179 | mlflow.log_dict(input_schema, "input_schema.json") 180 | 181 | # Log some tags for the model 182 | tags = {"model_type": "XGBoost", "dataset": "energy_consumption","Workflow Type": "Initial Training"} 183 | mlflow.set_tags(tags) 184 | 185 | # Log some parameters for the model 186 | params = reg.get_params() 187 | mlflow.log_dict(params, "hyperparams.json") 188 | 189 | # Log metrics 190 | training_time = end_time - start_time 191 | log_metrics(mse, rmse, mae, r2, training_time) 192 | 193 | # Log the feature importances of the model 194 | importance = reg.get_booster().get_score(importance_type="gain") 195 | mlflow.log_dict(importance, "importance.json") 196 | # Log the model and its description as artifacts 197 | description = "This is an XGBoost model trained to predict energy consumption of 11 European Countries in hourly basis." 
198 | mlflow.log_text(description, "description.txt") 199 | 200 | # Log the current timestamp as the code version 201 | current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) 202 | mlflow.log_param("code_version", current_time) 203 | 204 | # Log additional important parameters for comparison 205 | mlflow.log_param("n_estimators", params["n_estimators"]) 206 | mlflow.log_param("max_depth", params["max_depth"]) 207 | mlflow.log_param("learning_rate", params["learning_rate"]) 208 | mlflow.log_param("subsample", params["subsample"]) 209 | mlflow.log_param("colsample_bytree", params["colsample_bytree"]) 210 | mlflow.log_param("random_state", params["random_state"]) 211 | # Log the training data size 212 | training_size = len(X_train) 213 | testing_size = len(X_test) 214 | training_range = { 215 | 'start': train_start, 216 | 'end': train_end 217 | } 218 | testing_range = { 219 | 'start': test_start, 220 | 'end': test_end 221 | } 222 | mlflow.log_param("training_range", training_range) 223 | mlflow.log_param("testing_range", testing_range) 224 | mlflow.log_param("training_data_size", training_size) 225 | mlflow.log_param("testing_data_size", testing_size) 226 | 227 | # Log the model 228 | fs.log_model( 229 | model=reg, 230 | artifact_path=f"{model_name}_artifact_path", 231 | flavor=mlflow.xgboost, 232 | training_set=training_set, 233 | registered_model_name=model_name 234 | ) 235 | 236 | return {"R2": r2, "MSE": mse, "RMSE": rmse, "MAE": mae, "Training Time(sec)": training_time} 237 | 238 | 239 | metrics = train_model(X_train, X_test, y_train, y_test, training_set, fs, model_name, input_schema) 240 | 241 | 242 | # COMMAND ---------- 243 | 244 | 245 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/04.Model Retraining Monthly(pyspark edition).py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Import Libraries 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Initial Deployment" 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ## Configuration 13 | 14 | # COMMAND ---------- 15 | 16 | config = spark.sql('select train_start, train_end, test_end from df_dev.config_retrain_monthly').collect() 17 | train_start, train_end, test_end = config[0] 18 | 19 | # train_start = '2013-06-01' #the retrain start date 20 | # train_end = '2018-06-20' #the retrain end date (20/06/2018 - 30/06/2018) 10 days for testing 21 | # test_end = '2018-06-30' 22 | # Convert start and end dates to datetime objects 23 | start_new_train_date = pd.to_datetime(validation_end) + pd.DateOffset(days=1) # 1 day after validation end 24 | end_new_train_date = pd.to_datetime(train_end) 25 | start_new_test_date = pd.to_datetime(train_end) + pd.DateOffset(days=1) # 1 day after train end 26 | end_new_test_date = pd.to_datetime(test_end) 27 | # Calculate the number of days between start and end dates 28 | num_new_train_days = (end_new_train_date - start_new_train_date).days 29 | num_new_test_days = (end_new_test_date - start_new_test_date).days 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md 34 | # MAGIC ##Load Datasets 35 | 36 | # COMMAND ---------- 37 | 38 | # Load Consumption Region Table 39 | consumption_regions_daily = spark.table(f'{db}.{consumption_regions_daily}') 40 | consumption_regions_daily = 
consumption_regions_daily.withColumn('CONSUMPTION_ID', concat(col('REGION'), lit('_'), col('DATE'))) 41 | consumption_regions_daily = consumption_regions_daily.withColumn('DATE', col('DATE').cast(DateType())) 42 | 43 | # Split the labels into training and test 44 | train_labels = consumption_regions_daily.filter((col('DATE') >= train_start) & (col('DATE') <= train_end)) 45 | test_labels = consumption_regions_daily.filter((col('DATE') > train_end) & (col('DATE') <= test_end)) 46 | #val_labels = consumption_regions_daily.filter((col('DATE') > test_end) & (col('DATE') <= validation_end)) 47 | 48 | # Select the required columns 49 | train_labels = train_labels.select("CONSUMPTION_ID", "DATE", "DAILY_CONSUMPTION_MW") 50 | test_labels = test_labels.select("CONSUMPTION_ID", "DATE", "DAILY_CONSUMPTION_MW") 51 | #val_labels = val_labels.select("CONSUMPTION_ID", "DATE", "DAILY_CONSUMPTION_MW") 52 | 53 | # COMMAND ---------- 54 | 55 | def load_data(table_name, labels, lookup_key, ts_lookup_key): 56 | # In the FeatureLookup, if you do not provide the `feature_names` parameter, all features except primary keys are returned 57 | model_feature_lookups = [FeatureLookup(table_name=table_name, lookup_key=lookup_key, timestamp_lookup_key=ts_lookup_key)] 58 | 59 | # fs.create_training_set looks up features in model_feature_lookups that match the primary key from inference_data_df 60 | training_set = fs.create_training_set(labels, 61 | model_feature_lookups, 62 | label="DAILY_CONSUMPTION_MW", 63 | exclude_columns=["CONSUMPTION_ID", "DATE"]) 64 | training_df = training_set.load_df() 65 | 66 | return training_set, training_df 67 | 68 | # Cast the 'DATE' column to 'TIMESTAMP' data type 69 | train_labels = train_labels.withColumn('DATE', col('DATE').cast(TimestampType())) 70 | test_labels = test_labels.withColumn('DATE', col('DATE').cast(TimestampType())) 71 | #val_labels = val_labels.withColumn('DATE', col('DATE').cast(TimestampType())) 72 | 73 | # Load the data for the training set 74 | training_set, train_df = load_data(f'{db}.forecasting_features_daily', train_labels, 'CONSUMPTION_ID', 'DATE') 75 | 76 | # Load the data for the test set 77 | _, test_df = load_data(f'{db}.forecasting_features_daily', test_labels, 'CONSUMPTION_ID', 'DATE') 78 | 79 | # Load the data for the validation set 80 | #_, val_df = load_data(f'{db}.forecasting_features_daily', val_labels, 'CONSUMPTION_ID', 'DATE') 81 | 82 | 83 | # COMMAND ---------- 84 | 85 | concatenated_df = train_df.union(test_df) 86 | display(concatenated_df) 87 | 88 | # COMMAND ---------- 89 | 90 | # MAGIC %md 91 | # MAGIC ## Data Drift Test 92 | 93 | # COMMAND ---------- 94 | 95 | # Convert year, month, and day columns to string and pad month and day with zeros 96 | train_df_str = train_df.withColumn("YEAR", col("YEAR").cast("string")) 97 | train_df_str = train_df_str.withColumn("MONTH", lpad(col("MONTH").cast("string"), 2, '0')) 98 | train_df_str = train_df_str.withColumn("DAY_OF_MONTH", lpad(col("DAY_OF_MONTH").cast("string"), 2, '0')) 99 | 100 | # Concatenate year, month, and day columns with '-' separator and convert to date 101 | date_df = train_df_str.withColumn( 102 | 'date', 103 | to_date(concat_ws('-', train_df_str["YEAR"], train_df_str["MONTH"], train_df_str["DAY_OF_MONTH"]), 'yyyy-MM-dd') 104 | ) 105 | 106 | # Extract the most recent num_new_train_days days of data 107 | max_date_row = date_df.agg(max_("date").alias("max_date")).first() 108 | max_date = max_date_row["max_date"] 109 | 110 | new_data = date_df.filter(col("date") >= date_sub(lit(max_date), 
num_new_train_days)) 111 | 112 | # Extract a random sample of num_new_train_days * 11 days data 113 | old_data = date_df.filter( 114 | col("date") < date_sub(lit(max_date), num_new_train_days) 115 | ).orderBy(rand()).limit(num_new_train_days * 11) 116 | 117 | # Concatenate the new and old data 118 | all_data = new_data.union(old_data) 119 | 120 | 121 | # COMMAND ---------- 122 | 123 | # Apply the ks_2samp test to each feature 124 | for feature_name in regions: 125 | old_feature_data = old_data.select(feature_name).rdd.flatMap(lambda x: x).collect() 126 | new_feature_data = new_data.select(feature_name).rdd.flatMap(lambda x: x).collect() 127 | 128 | _, p_value = ks_2samp(old_feature_data, new_feature_data) 129 | 130 | if p_value < 0.05: 131 | print(f"The distribution of {feature_name} has drifted.") 132 | else: 133 | print(f"The distribution of {feature_name} has not drifted.") 134 | 135 | 136 | # COMMAND ---------- 137 | 138 | # MAGIC %md 139 | # MAGIC ## Retrain the Machine Learning Pipeline 140 | 141 | # COMMAND ---------- 142 | 143 | # MAGIC %md 144 | # MAGIC * Create temporal views of these dataframes in order to be passed int the source notebook 145 | 146 | # COMMAND ---------- 147 | 148 | concatenated_df.createOrReplaceTempView("concatenated_df_view") 149 | train_df.createOrReplaceTempView("train_df_view") 150 | test_df.createOrReplaceTempView("test_df_view") 151 | 152 | # COMMAND ---------- 153 | 154 | # MAGIC %run "/Repos/CI ADO Repo/01.Develop/Utils/Train ML Pipeline" 155 | 156 | # COMMAND ---------- 157 | 158 | # MAGIC %md 159 | # MAGIC ### Register Retrained Model to MLflow 160 | 161 | # COMMAND ---------- 162 | 163 | 164 | with mlflow.start_run(nested=True) as run: 165 | 166 | experiment = mlflow.get_experiment(experiment_id_retraining) 167 | if experiment: 168 | experiment_name = experiment.name 169 | mlflow.set_experiment(experiment_name) 170 | print(f"Active experiment set to '{experiment_name}'") 171 | else: 172 | print(f"No experiment found with name '{experiment_name}'") 173 | 174 | # Define the output schema 175 | output_schema = sch.Schema([sch.ColSpec("float", "DAILY_CONSUMPTION_MW")]) 176 | 177 | # Create a model signature from the input and output schemas 178 | signature = ModelSignature(inputs=input_schema, outputs=output_schema) 179 | 180 | # Log the model input schema 181 | schema = {"input_schema": list(concatenated_df.columns[:-1]),"output_schema":concatenated_df.columns[-1]} 182 | mlflow.log_dict(schema, "schema.json") 183 | 184 | # Log some tags for the model 185 | mlflow.set_tags(tags) 186 | 187 | # Log some parameters for the model 188 | mlflow.log_dict(hyperparameters, "hyperparams.json") 189 | 190 | # Log the evaluation metrics as metrics 191 | mlflow.log_metric("MAE", mae) 192 | mlflow.log_metric("MSE", mse) 193 | mlflow.log_metric("RMSE", rmse) 194 | mlflow.log_metric("R2", r2) 195 | 196 | #Log the time taken to train as metric 197 | mlflow.log_metric("Training Time(sec)", training_time) 198 | 199 | # Log evaluation metrics as artifact 200 | metrics = {"R2": r2, "MSE": mse, "RMSE": rmse, 'MAE':mae,'Training Time(sec)':training_time} 201 | mlflow.log_dict(metrics, "metrics.json") 202 | 203 | # Log the model description as artifact 204 | mlflow.log_text(description, "description.txt") 205 | 206 | # Log the current timestamp as the code version 207 | mlflow.log_param("code_version", current_time) 208 | 209 | # Log additional important parameters for comparison 210 | mlflow.log_param("n_estimators", hyperparameters["n_estimators"]) 211 | 
mlflow.log_param("max_depth", hyperparameters["max_depth"]) 212 | mlflow.log_param("learning_rate", hyperparameters["learning_rate"]) 213 | mlflow.log_param("training_data_size", training_size) 214 | mlflow.log_param("testing_data_size", testing_size) 215 | 216 | # Log the model with its signature 217 | mlflow.spark.log_model(xgb_model, artifact_path="model", signature=signature,pip_requirements=pip_requirements) 218 | 219 | # Register the model with its signature 220 | model_uri = f"runs:/{mlflow.active_run().info.run_id}/model" 221 | mlflow.register_model(model_uri=model_uri, name="pyspark_mlflow_model") 222 | 223 | # Get the latest model version(The one that we now registered) 224 | client = MlflowClient() 225 | model_version = client.get_latest_versions("pyspark_mlflow_model")[0].version 226 | 227 | # Save your data to a new DBFS directory for each run 228 | data_path = f"dbfs:/FileStore/Data_Versioning/data_model_v{model_version}.parquet" 229 | concatenated_df.write.format("parquet").save(data_path) 230 | 231 | # Log the DBFS path as an artifact 232 | with open("data_path.txt", "w") as f: 233 | f.write(data_path) 234 | mlflow.log_artifact("data_path.txt") 235 | 236 | # COMMAND ---------- 237 | 238 | # MAGIC %md 239 | # MAGIC ## Model Staging 240 | 241 | # COMMAND ---------- 242 | 243 | def proceed_model_to_staging(): 244 | # Get the latest version of the registered model 245 | client = mlflow.tracking.MlflowClient() 246 | model_version = client.get_latest_versions(model_name, stages=["None"])[0].version 247 | 248 | # Define the endpoint URL 249 | endpoint_url = f"https://{databricks_instance}/api/2.0/mlflow/databricks/model-versions/transition-stage" 250 | 251 | stage = 'Staging' #Define the stage you want your model to transit 252 | comment = "Transitioning to staging environment after performance testing" 253 | headers = { "Authorization": "Bearer " + access_token } 254 | 255 | request_body = { 256 | "version": f"{model_version}", 257 | "name": model_name, 258 | "stage" : stage, #Specifies the environment we want to transit our model 259 | "archive_existing_versions": False, #Specifies whether to archive all current model versions in the target stage. 260 | "comment": comment 261 | } 262 | 263 | # Make the request 264 | response = requests.post(endpoint_url, headers=headers,json=request_body) 265 | 266 | # Check the response status code 267 | if response.status_code == 200: 268 | print("Model version transitioned to staging") 269 | else: 270 | print(f"Error transitioning model version to staging: {response.text}") 271 | 272 | 273 | # COMMAND ---------- 274 | 275 | all_tests_passed = True 276 | # run performance tests here 277 | if all_tests_passed: 278 | # proceed with model staging 279 | proceed_model_to_staging() 280 | else: 281 | print("Model performance tests failed. Model will not be staged.") 282 | 283 | 284 | # COMMAND ---------- 285 | 286 | 287 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Streamlining Energy Consumption Forecasting using MLOps 2 | 3 | This project focuses on streamlining the process of forecasting energy consumption by employing Machine Learning Operations (MLOps). It integrates data engineering, machine learning algorithms, and automation to create a scalable and efficient forecasting system. 
4 | 5 | [![Generic badge](https://img.shields.io/badge/Status-Complete-green.svg)](https://shields.io/) 6 | [![Generic badge](https://img.shields.io/badge/Databricks-Powered-blue.svg)](https://shields.io/) 7 | [![made-with-azure](https://img.shields.io/badge/Made%20with-Azure-1f425f.svg)](https://azure.microsoft.com/) 8 | [![made-with-databricks](https://img.shields.io/badge/Made%20with-Databricks-orange.svg)](https://www.databricks.com/) 9 | 10 | 11 | ## Table of Contents 12 | - [Introduction](#-introduction) 13 | - [Requirements](#️-requirements) 14 | - [Setup & Installation](#️-setup--installation) 15 | - [Aim of the Project](#-aim-of-the-project) 16 | - [Results and Findings](#-results-and-findings) 17 | - [Acknowledgments](#-acknowledgments) 18 | - [Contact](#-contact) 19 | - [Related Publication](#-related-publication) 20 | - [Citation](#-citation) 21 | 22 | ## 📌 Introduction 23 | The core objective of this project is to develop and orchestrate an automated pipeline for forecasting energy consumption across eleven European countries, namely Belgium, Denmark, France, Germany, Greece, Italy, Luxembourg, Netherlands, Spain, Sweden, and Switzerland. The pipeline is specifically tailored for processing hourly energy consumption data. 24 | 25 | This project is fully integrated within the Azure Cloud ecosystem and leverages the power and scalability of the Databricks platform. Utilizing these cutting-edge cloud technologies ensures that the pipeline is not only highly scalable but also incredibly efficient and reliable. 26 | 27 | Forecasting energy consumption is pivotal for European countries, as it plays an instrumental role in ensuring energy sustainability, optimizing power generation and distribution, and facilitating informed decision-making. By producing reliable and timely forecasts, this project empowers energy providers and stakeholders with insights that can lead to cost reductions, enhanced operational efficiencies, and the promotion of sustainable energy practices. 28 | 29 | The end goal is to establish a robust, scalable, and automated solution that provides precise forecasting of energy consumption. Through automating the forecasting process, we aim to keep up with the ever-evolving demands of the energy sector and contribute significantly to Europe’s broader economic and environmental objectives. 30 | 31 | 32 | ## 🛠️ Requirements 33 | 34 | ### Data Source 35 | This project utilizes data from the ENTSO-E Transparency Platform, which provides comprehensive information on the European electricity market. To access the dataset, you will need to create an account on the ENTSO-E Transparency Platform. Once you have an account, you can access and download the dataset required for this project. 36 | 37 | [Create an account on ENTSO-E Transparency Platform](https://keycloak-transparency.entsoe.eu/realms/tp/protocol/openid-connect/auth?response_type=code&client_id=tp-web&redirect_uri=https%3A%2F%2Ftransparency.entsoe.eu%2Fsso%2Flogin&state=7135aea4-5563-4a24-9fae-727dcee13294&login=true&scope=openid) 38 | 39 | ### Libraries and Dependencies 40 | This project is dependent on several libraries and frameworks. It's important to ensure that all of the necessary libraries are installed to be able to run the code seamlessly. 41 | 42 | You can install the required libraries using the 'requirements.txt' file included in the repository. 
Run the following command: 43 | 44 | ``` 45 | cd mlops-energy-forecast-thesis/MLOps Pipeline/Utils 46 | pip install -r requirements.txt 47 | ``` 48 | 49 | ### Azure and Databricks 50 | As the project is fully integrated with the Azure Cloud and utilizes the Databricks platform, you will need to have: 51 | 52 | * An active Azure subscription. 53 | * A Databricks workspace set up within Azure. 54 | 55 | ## ⚙️ Setup & Installation 56 | 57 | Follow these simplified steps to set up the project: 58 | 59 | 1. **Create Accounts**: Sign up for [Azure Cloud](https://azure.microsoft.com/), [Databricks](https://databricks.com/), and [ENTSO-E Transparency Platform](https://transparency.entsoe.eu/). 60 | 61 | 2. **Clone the Repository**: Clone this repository to your machine or Azure virtual machine. 62 | 63 | ```sh 64 | git clone https://github.com/Philippos01/mlops-energy-forecast-thesis.git 65 | ``` 66 | 3. **Install Requirements**: Navigate to the project directory and install the required libraries using the requirements.txt file. 67 | ``` 68 | cd mlops-energy-forecast-thesis/MLOps Pipeline/Utils 69 | pip install -r requirements.txt 70 | ``` 71 | 72 | 4. **Set Up Databricks**: Log in to Databricks, and create a new workspace. Within the workspace, create a new cluster and make sure that it's running. Import the project notebooks into your workspace. 73 | 74 | 5. **Configure Azure**: In your Azure account, create a resource group. Within this resource group, create a Databricks workspace (if you haven't already during the Databricks setup) and configure the necessary resources such as storage accounts, networking, etc. 76 | 76 | 6. **Download and Import Dataset**: Log in to the ENTSO-E Transparency Platform and download the dataset. Import this dataset into Databricks. 77 | 78 | 7. **Run Notebooks**: In Databricks, open the notebooks and attach them to the cluster you created earlier. Run the cells in sequence, making sure to input your API keys when prompted. 79 | 80 | 8. **Monitor with MLflow**: You can keep track of experiments, parameters, metrics, and artifacts using MLflow in Databricks. 81 | 82 | 9. **Deploy the Model**: After training and evaluating the model, follow the instructions in the documentation to deploy it for forecasting. 83 | 84 | 10. **Schedule Notebooks**: Optionally, you can schedule the notebooks to run periodically to automate data retrieval and model training. 85 | 86 | ## 🎯 Aim of the Project 87 | 88 | This project aims to implement a data-driven approach for forecasting energy consumption across 11 European countries (Belgium, Denmark, France, Germany, Greece, Italy, Luxembourg, Netherlands, Spain, Sweden, Switzerland) on an hourly basis using Azure Databricks. The technical steps encompassed in the project are as follows: 89 | 90 | * **Data Acquisition**: Download energy consumption data from the ENTSO-E Transparency Platform, an authoritative source for European energy market data. This project utilizes manual downloads and uploads to Databricks, but this process can be automated for future scalability. 91 | 92 | * **Data Processing and Feature Engineering**: Handle any missing or inconsistent data and engineer new features that might improve the performance of the forecasting models. This involves processing the raw data to format it appropriately for machine learning models. 93 | 94 | * **Model Building**: Develop forecasting models using machine learning algorithms such as XGBoost and LSTM (Long Short-Term Memory networks) to predict energy consumption patterns.
The choice of algorithms is based on their proven performance in time-series forecasting tasks. 95 | 96 | * **Model Evaluation**: Evaluate the performance of the forecasting models using metrics such as Mean Absolute Error (MAE), Mean Squared Error (MSE), Root Mean Squared Error (RMSE), and R-squared (R²). This helps in quantifying how well the models are performing. 97 | 98 | * **Deployment and Monitoring**: Save the chosen model in Feature Store and make it available for inference within Databricks. Incorporate monitoring tools to track the model’s performance over time and ensure it stays within acceptable limits. This approach facilitates a seamless integration within the Databricks ecosystem, enabling easy access and utilization of the model for forecasting purposes. 99 | 100 | * **Scalability and Performance**: Leverage the Azure cloud and Databricks platform to ensure that the implemented solution can handle large volumes of data efficiently. This enables the project to scale with the addition of new data or expansion to more countries. 101 | 102 | By successfully implementing these technical steps, this project contributes to the larger goal of enabling better energy management and planning through data-driven insights and forecasts. 103 | 104 | For a comprehensive and in-depth analysis of the project's objectives and how it achieves them, please refer to the detailed documentation: 105 | 106 | [📄 Read the Detailed Documentation](./DOCUMENTATION.md) 107 | 108 | ## 📈 Results and Findings 109 | 110 | This section presents the results and findings obtained through the energy consumption forecasting pipeline. The results are categorized into explanatory analysis, average hourly consumption analysis, model comparison, and evaluation metrics for the deployed model. 111 | 112 | ### Explanatory Analysis 113 | 114 | #### Daily Energy Consumption(e.g. Greece) 115 | 116 | Explanatory data analysis is essential for understanding the patterns and trends in the dataset. Below is a plot illustrating daily energy consumption in Greece. The plot reveals seasonality and trends in energy consumption, which are crucial for accurate forecasting. 117 | 118 | ![Greece Daily Energy Consumption](MLOps%20Pipeline/Utils/Images/newplot.png) 119 | *Daily Energy Consumption in Greece.* 120 | 121 | ### Average Hourly Consumption by Country and Hour of Day 122 | 123 | The plot below provides insights into average hourly energy consumption by country and hour of day. This is crucial to understand which countries consume more energy at different times of the day and can guide resource allocation and energy production planning. 124 | 125 | ![Average Hourly Consumption by Country and Hour of Day](MLOps%20Pipeline/Utils/Images/newplot%20(1).png) 126 | *Average Hourly Consumption by Country and Hour of Day.* 127 | 128 | ### Model Comparison: Daily Staging & Production Model Comparison for Greece 129 | 130 | To evaluate and select the best model for forecasting, we compared the daily staging and production models. 
The plot below illustrates how closely each model's predictions match the actual energy consumption data. (For the sake of the example, we illustrate data from Greece for the first week of April.) 131 | 132 | ![Daily Staging & Production Model Comparison for Greece](MLOps%20Pipeline/Utils/Images/newplot%20(5).png) 133 | *Daily Staging & Production Model Comparison for Greece over one week.* 134 | 135 | ### Evaluation Metrics for Deployed Model 136 | 137 | The current deployed model was evaluated based on various metrics such as Mean Squared Error (MSE), Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and R-squared (R²). These metrics provide a quantitative understanding of the model's performance in forecasting energy consumption. 138 | 139 | - **MSE**: 24742781.8 140 | - **MAE**: 1859.5 141 | - **RMSE**: 4974.2 142 | - **R²**: 0.994 143 | - **Training Time**: 134.2 sec 144 | 145 | These findings and insights are instrumental for utility companies, policy-makers, and consumers in making informed decisions regarding energy consumption, production, and resource allocation. 146 | 147 | ## 🙏 Acknowledgments 148 | 149 | This project was conducted as part of my thesis at the Athens University of Economics and Business, Department of Management Science and Technology. 150 | 151 | ## 👥 Contact 152 | 153 | If you have any questions or would like to discuss this project, feel free to reach out: 154 | 155 | - LinkedIn: [LinkedIn](https://www.linkedin.com/in/fpriovolos/) 156 | - Email: filippos.priovolos01@gmail.com 157 | 158 | ## 📝 Related Publication 159 | 160 | This project is also the subject of a research paper that combines a theoretical and empirical approach. The paper dives into the details of the MLOps methodologies, techniques, and analysis involved in forecasting energy consumption with Azure Databricks. 161 | 162 | - **Title**: "Streamlining MLOps for Energy Consumption Forecasting, A Case Study" 163 | - **Authors**: Filippos Priovolos 164 | If you use the content of this repository or the related paper in your research, please consider citing as shown in the citation section. 165 | 166 | 167 | ## 🧾 Citation 168 | 169 | If you use this project in your research or want to refer to it, please attribute it as follows: 170 | 171 | ```bibtex 172 | @misc{author2023energy, 173 | title={Streamlining MLOps for Energy Consumption Forecasting, A Case Study}, 174 | author={Filippos Priovolos}, 175 | year={2023}, 176 | } 177 | ``` 178 | 179 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/00.
Initial Deployment/03.Model Training(Pyspark Edition).py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Configuration 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Initial Deployment" 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ##Load Datasets 13 | 14 | # COMMAND ---------- 15 | 16 | train_start = '2015-01-01' 17 | train_end = '2021-12-31' 18 | test_start = '2022-01-01' 19 | test_end = '2023-01-01' 20 | 21 | # COMMAND ---------- 22 | 23 | # Load Consumption Region Table 24 | consumption_countries_hourly = spark.table(f'{db}.{consumption_countries_hourly}') 25 | 26 | # Update the key column construction in the PySpark code 27 | consumption_countries_hourly = consumption_countries_hourly.withColumn('CONSUMPTION_ID', concat(col('COUNTRY'), lit('_'), col('DATETIME').cast('string'))) 28 | 29 | # Split the labels into training and test 30 | train_labels = consumption_countries_hourly.filter((col('DATETIME') >= train_start) & (col('DATETIME') <= train_end)) 31 | test_labels = consumption_countries_hourly.filter((col('DATETIME') > test_start) & (col('DATETIME') <= test_end)) 32 | #val_labels = consumption_countries_hourly.filter((col('DATETIME') > test_end) & (col('DATETIME') <= validation_end)) 33 | 34 | # Select the required columns 35 | train_labels = train_labels.select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 36 | test_labels = test_labels.select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 37 | #val_labels = val_labels.select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 38 | 39 | # COMMAND ---------- 40 | 41 | display(train_labels) 42 | 43 | # COMMAND ---------- 44 | 45 | def load_data(table_name, labels, lookup_key, ts_lookup_key): 46 | # In the FeatureLookup, if you do not provide the `feature_names` parameter, all features except primary keys are returned 47 | model_feature_lookups = [FeatureLookup(table_name=table_name, lookup_key=lookup_key, timestamp_lookup_key=ts_lookup_key)] 48 | # fs.create_training_set looks up features in model_feature_lookups that match the primary key from inference_data_df 49 | training_set = fs.create_training_set(labels, 50 | model_feature_lookups, 51 | label="HOURLY_CONSUMPTION_MW", 52 | exclude_columns=["CONSUMPTION_ID", "DATETIME"]) 53 | training_df = training_set.load_df() 54 | 55 | return training_set, training_df 56 | 57 | # Cast the 'DATETIME' column to 'TIMESTAMP' data type 58 | train_labels = train_labels.withColumn('DATETIME', col('DATETIME').cast(TimestampType())) 59 | test_labels = test_labels.withColumn('DATETIME', col('DATETIME').cast(TimestampType())) 60 | #val_labels = val_labels.withColumn('DATETIME', col('DATETIME').cast(TimestampType())) 61 | 62 | # Load the data for the training set 63 | training_set, train_df = load_data(f'{db}.hourly_forecasting_features', train_labels, 'CONSUMPTION_ID', 'DATETIME') 64 | 65 | # Load the data for the test set 66 | _, test_df = load_data(f'{db}.hourly_forecasting_features', test_labels, 'CONSUMPTION_ID', 'DATETIME') 67 | 68 | # Load the data for the validation set 69 | #_, val_df = load_data(f'{db}.hourly_forecasting_features', val_labels, 'CONSUMPTION_ID', 'DATETIME') 70 | 71 | 72 | # COMMAND ---------- 73 | 74 | display(train_df) 75 | 76 | # COMMAND ---------- 77 | 78 | concatenated_df = train_df.union(test_df) 79 | display(concatenated_df) 80 | 
81 | # COMMAND ---------- 82 | 83 | # MAGIC %md 84 | # MAGIC ## Train the Machine Learning Pipeline 85 | 86 | # COMMAND ---------- 87 | 88 | # MAGIC %md 89 | # MAGIC ## Train the machine learning pipeline 90 | # MAGIC Now that we have reviewed the data and prepared it as a DataFrame with numeric values, we're ready to train a model to predict future energy consumption. 91 | # MAGIC 92 | # MAGIC MLlib pipelines combine multiple steps into a single workflow, making it easier to iterate as we develop the model. 93 | # MAGIC 94 | # MAGIC In this example, we create a pipeline using the following functions: 95 | # MAGIC 96 | # MAGIC * `VectorAssembler`: Assembles the feature columns into a feature vector. 97 | # MAGIC * `VectorIndexer`: Identifies columns that should be treated as categorical. This is done heuristically, identifying any column with a small number of distinct values as categorical. In this example, all the region columns are considered categorical(2 values) 98 | # MAGIC * `SparkXGBRegressor`: Uses the SparkXGBRegressor estimator to learn how to predict energy consumption from the feature vectors. 99 | # MAGIC * `CrossValidator`: The XGBoost regression algorithm has several hyperparameters. This notebook illustrates how to use hyperparameter tuning in Spark. This capability automatically tests a grid of hyperparameters and chooses the best resulting model. 100 | 101 | # COMMAND ---------- 102 | 103 | # MAGIC %md 104 | # MAGIC * The first step is to create the VectorAssembler and VectorIndexer steps. 105 | 106 | # COMMAND ---------- 107 | 108 | # Remove the target column from the input feature set. 109 | featuresCols = concatenated_df.columns 110 | featuresCols.remove('HOURLY_CONSUMPTION_MW') 111 | 112 | # vectorAssembler combines all feature columns into a single feature vector column, "rawFeatures". 113 | vectorAssembler = VectorAssembler(inputCols=featuresCols, outputCol="rawFeatures") 114 | 115 | # vectorIndexer identifies categorical features and indexes them, and creates a new column "features". 116 | vectorIndexer = VectorIndexer(inputCol="rawFeatures", outputCol="features", maxCategories=3) 117 | 118 | # COMMAND ---------- 119 | 120 | # MAGIC 121 | # MAGIC %md 122 | # MAGIC * Next, define the model. To use distributed training, set num_workers to the number of spark tasks you want to concurrently run during training xgboost model. 123 | 124 | # COMMAND ---------- 125 | 126 | # The next step is to define the model training stage of the pipeline. 127 | # The following command defines a XgboostRegressor model that takes an input column "features" by default and learns to predict the labels in the "cnt" column. 128 | # Set `num_workers` to the number of spark tasks you want to concurrently run during training xgboost model. 129 | xgb_regressor = SparkXGBRegressor(label_col="HOURLY_CONSUMPTION_MW") 130 | 131 | # COMMAND ---------- 132 | 133 | # MAGIC %md 134 | # MAGIC * The third step is to wrap the model you just defined in a CrossValidator stage. CrossValidator calls the XgboostRegressor estimator with different hyperparameter settings. It trains multiple models and selects the best one, based on minimizing a specified metric. In this example, the metric is root mean squared error (RMSE). 
135 | 136 | # COMMAND ---------- 137 | 138 | # Define a grid of hyperparameters to test: 139 | # - maxDepth: maximum depth of each decision tree 140 | # - maxIter: iterations, or the total number of trees 141 | paramGrid = ParamGridBuilder()\ 142 | .addGrid(xgb_regressor.max_depth, [8])\ 143 | .addGrid(xgb_regressor.n_estimators, [200])\ 144 | .addGrid(xgb_regressor.learning_rate, [0.1])\ 145 | .build() 146 | 147 | # Define an evaluation metric. The CrossValidator compares the true labels with predicted values for each combination of parameters, and calculates this value to determine the best model. 148 | evaluator = RegressionEvaluator(metricName="rmse", 149 | labelCol=xgb_regressor.getLabelCol(), 150 | predictionCol=xgb_regressor.getPredictionCol()) 151 | 152 | 153 | 154 | # COMMAND ---------- 155 | 156 | # MAGIC %md 157 | # MAGIC * Create the pipeline 158 | 159 | # COMMAND ---------- 160 | 161 | pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, xgb_regressor]) 162 | 163 | # COMMAND ---------- 164 | 165 | # Declare the CrossValidator, which performs the model tuning. 166 | cv = CrossValidator(estimator=pipeline, evaluator=evaluator, estimatorParamMaps=paramGrid) 167 | 168 | # COMMAND ---------- 169 | 170 | # MAGIC %md 171 | # MAGIC Train the pipeline: 172 | # MAGIC 173 | # MAGIC Now that we have set up the workflow, we can train the pipeline with a single call. 174 | # MAGIC When we call fit(), the pipeline runs feature processing, model tuning, and training and returns a fitted pipeline with the best model it found. This step takes several minutes. 175 | 176 | # COMMAND ---------- 177 | 178 | start_time = time.time() 179 | cvModel = cv.fit(train_df) 180 | end_time = time.time() 181 | # Retrieve best model in the pipeline 182 | xgb_model = cvModel.bestModel.stages[-1] 183 | 184 | # COMMAND ---------- 185 | 186 | # MAGIC %md 187 | # MAGIC Make predictions and evaluate results: 188 | # MAGIC 189 | # MAGIC The final step is to use the fitted model to make predictions on the test dataset and evaluate the model's performance. The model's performance on the test dataset provides an approximation of how it is likely to perform on new data. 190 | # MAGIC 191 | # MAGIC Computing evaluation metrics is important for understanding the quality of predictions, as well as for comparing models and tuning parameters. 192 | 193 | # COMMAND ---------- 194 | 195 | # MAGIC %md 196 | # MAGIC The `transform()` method of the pipeline model applies the full pipeline to the input dataset. The pipeline applies the feature processing steps to the dataset and then uses the fitted Xgboost Regressor model to make predictions. The pipeline returns a DataFrame with a new column predictions. 197 | 198 | # COMMAND ---------- 199 | 200 | predictions = cvModel.transform(test_df) 201 | 202 | # COMMAND ---------- 203 | 204 | # MAGIC %md 205 | # MAGIC A common way to evaluate the performance of a regression model is the calculate the root mean squared error (RMSE). The value is not very informative on its own, but you can use it to compare different models. `CrossValidator` determines the best model by selecting the one that minimizes RMSE. 
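# MAGIC For reference, a minimal statement of the metric being minimized (this is the standard definition, not anything project-specific): for $n$ test observations with actual consumption $y_i$ and predicted consumption $\hat{y}_i$,
# MAGIC $$\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2}$$
# MAGIC Because the errors are squared before averaging, RMSE penalizes large deviations more heavily than MAE, which makes it a reasonable selection criterion when large forecasting errors are particularly costly.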
206 | 207 | # COMMAND ---------- 208 | 209 | display(predictions.select("HOURLY_CONSUMPTION_MW", "prediction", *featuresCols)) 210 | 211 | # COMMAND ---------- 212 | 213 | rmse = evaluator.evaluate(predictions) 214 | print("RMSE on our test set:", rmse) 215 | 216 | # COMMAND ---------- 217 | 218 | display(predictions.select("HOURLY_CONSUMPTION_MW", "prediction")) 219 | 220 | # COMMAND ---------- 221 | 222 | # MAGIC %md 223 | # MAGIC ## Define Metrics/Parameters to be logged 224 | 225 | # COMMAND ---------- 226 | 227 | # Metrcis 228 | mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"}) 229 | mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"}) 230 | rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"}) 231 | r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"}) 232 | 233 | #Hyperparameters 234 | 235 | # Get the index of the best model 236 | best_model_index = cvModel.avgMetrics.index(min(cvModel.avgMetrics)) 237 | 238 | # Get the parameters of the best model 239 | best_model_params = cvModel.getEstimatorParamMaps()[best_model_index] 240 | 241 | # Store the parameters in a dictionary 242 | hyperparameters = {} 243 | 244 | # Loop over the parameters and store them in the dictionary 245 | for param, value in best_model_params.items(): 246 | hyperparameters[param.name] = value 247 | 248 | #Model Training Time 249 | training_time = end_time - start_time 250 | 251 | #Model Training/Testing Data Size 252 | training_size = train_df.count() 253 | testing_size = test_df.count() 254 | 255 | #Current Time 256 | current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) 257 | 258 | #Description 259 | description = "The logged model is an XGBoost regressor that has been trained to predict DAILY_CONSUMPTION_MW based on various input features. It performs well in accurately estimating energy consumption. The model takes into account important factors and patterns present in the data to make reliable predictions. 
It has been fine-tuned using cross-validation and optimized hyperparameters to ensure its effectiveness" 260 | 261 | #Model Tags 262 | tags = { 263 | "model_type": "XGBoost Regressor", 264 | "dataset": "Energy Consumption", 265 | "application": "Energy Management", 266 | "framework": "PySpark" 267 | } 268 | 269 | # COMMAND ---------- 270 | 271 | # MAGIC %md 272 | # MAGIC ### Register Model to MLflow 273 | 274 | # COMMAND ---------- 275 | 276 | with mlflow.start_run(nested=True) as run: 277 | 278 | # Log the model input schema 279 | schema = {"input_schema": list(concatenated_df.columns[:-1]),"output_schema":concatenated_df.columns[-1]} 280 | mlflow.log_dict(schema, "schema.json") 281 | 282 | # Log some tags for the model 283 | mlflow.set_tags(tags) 284 | 285 | # Log some parameters for the model 286 | mlflow.log_dict(hyperparameters, "hyperparams.json") 287 | 288 | # Log the evaluation metrics as metrics 289 | mlflow.log_metric("MAE", mae) 290 | mlflow.log_metric("MSE", mse) 291 | mlflow.log_metric("RMSE", rmse) 292 | mlflow.log_metric("R2", r2) 293 | 294 | #Log the time taken to train as metric 295 | mlflow.log_metric("Training Time(sec)", training_time) 296 | 297 | # Log evaluation metrics as artifact 298 | metrics = {"R2": r2, "MSE": mse, "RMSE": rmse, 'MAE':mae,'Training Time(sec)':training_time} 299 | mlflow.log_dict(metrics, "metrics.json") 300 | 301 | # Log the model description as artifact 302 | mlflow.log_text(description, "description.txt") 303 | 304 | # Log the current timestamp as the code version 305 | mlflow.log_param("code_version", current_time) 306 | 307 | # Log additional important parameters for comparison 308 | mlflow.log_param("n_estimators", hyperparameters["n_estimators"]) 309 | mlflow.log_param("max_depth", hyperparameters["max_depth"]) 310 | mlflow.log_param("learning_rate", hyperparameters["learning_rate"]) 311 | mlflow.log_param("training_data_size", training_size) 312 | mlflow.log_param("testing_data_size", testing_size) 313 | 314 | # Log the model with its signature 315 | mlflow.spark.log_model(xgb_model, artifact_path="model", signature=signature,pip_requirements=pip_requirements) 316 | 317 | # Register the model with its signature 318 | model_uri = f"runs:/{mlflow.active_run().info.run_id}/model" 319 | mlflow.register_model(model_uri=model_uri, name="pyspark_mlflow_model") 320 | 321 | # Get the latest model version(The one that we now registered) 322 | client = MlflowClient() 323 | # Search for all versions of the registered model 324 | versions = client.search_model_versions("name='pyspark_mlflow_model'") 325 | # Sort the versions by creation timestamp in descending order 326 | sorted_versions = sorted(versions, key=lambda v: v.creation_timestamp, reverse=True) 327 | # Get the latest version 328 | latest_version = sorted_versions[0] 329 | # Access the version number 330 | model_version = latest_version.version 331 | 332 | # Save your data to a new DBFS directory for each run 333 | data_path = f"dbfs:/FileStore/Data_Versioning/data_model_v{model_version}.parquet" 334 | concatenated_df.write.format("parquet").save(data_path) 335 | 336 | # Log the DBFS path as an artifact 337 | with open("data_path.txt", "w") as f: 338 | f.write(data_path) 339 | mlflow.log_artifact("data_path.txt") 340 | 341 | # COMMAND ---------- 342 | 343 | 344 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/01.Feature Engineering.py: 
-------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Daily Inference" 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC * Import inference data and convert them to timestamp 8 | 9 | # COMMAND ---------- 10 | 11 | from datetime import datetime 12 | from pyspark.sql.functions import hour, when 13 | 14 | # Convert string dates to timestamps 15 | df = spark.sql(f""" 16 | SELECT 17 | TO_TIMESTAMP('{yesterdate}', 'yyyy-MM-dd') AS yesterdate_ts, 18 | TO_TIMESTAMP('{date}', 'yyyy-MM-dd') AS date_ts 19 | """) 20 | 21 | 22 | 23 | # COMMAND ---------- 24 | 25 | # MAGIC %md 26 | # MAGIC * Define the date to be predicted as date_to_predict, which is to be extracted from the index of the DataFrame df 27 | # MAGIC * Read data from the table "final_consumption_countries_hourly" 28 | # MAGIC * Filter the df_cons DataFrame to retain only rows from the last 720 hours (30 days) prior to the date_to_predict 29 | # MAGIC * Convert the resulting Spark DataFrame df_cons to a Pandas DataFrame 30 | 31 | # COMMAND ---------- 32 | 33 | # Select the date portion of the timestamp and convert it to string format 34 | date_df = df.select(date_format("date_ts", "yyyy-MM-dd").alias("date_string")) 35 | 36 | # Extract the date string from the DataFrame 37 | date_to_predict = date_df.first()["date_string"] 38 | # Read data from the table 39 | df_cons = spark.read.table("final_consumption_countries_hourly") 40 | 41 | # Before filtering 42 | print("Before filtering:") 43 | print(df_cons.select("DATETIME").agg({"DATETIME": "min"}).collect()[0]) 44 | print(df_cons.select("DATETIME").agg({"DATETIME": "max"}).collect()[0]) 45 | 46 | # Filter the data to include only rows from the exact previous month 47 | df_cons = df_cons.filter( 48 | (col("DATETIME") < to_date(lit(date_to_predict))) & 49 | (col("DATETIME") >= add_months(to_date(lit(date_to_predict)), -1)) 50 | ) 51 | 52 | # After filtering 53 | print("After filtering:") 54 | print(df_cons.select("DATETIME").agg({"DATETIME": "min"}).collect()[0]) 55 | print(df_cons.select("DATETIME").agg({"DATETIME": "max"}).collect()[0]) 56 | 57 | # Convert Spark DataFrame to pandas DataFrame 58 | df_cons = df_cons.toPandas() 59 | 60 | # Sort 'final_consumption' dataframe by 'DATETIME' and 'COUNTRY' 61 | df_cons.sort_values(by=['DATETIME', 'COUNTRY'], inplace=True) 62 | 63 | # Display DataFrame 64 | df_cons 65 | 66 | # COMMAND ---------- 67 | 68 | # MAGIC %md 69 | # MAGIC * Print earliest and latest timestamp to validate dates 70 | 71 | # COMMAND ---------- 72 | 73 | # Display min and max 'DATETIME' 74 | print("Earliest timestamp:", df_cons['DATETIME'].min()) 75 | print("Latest timestamp:", df_cons['DATETIME'].max()) 76 | 77 | # COMMAND ---------- 78 | 79 | # MAGIC %md 80 | # MAGIC * Create a Dataframe with 24x11 rows each, timestamp for each country will have value 1, others will have value 0(one-hot-encoding) 81 | 82 | # COMMAND ---------- 83 | 84 | # Create an array with 24 hours 85 | hours = list(range(24)) 86 | 87 | # Create a DataFrame with 24x11 rows, each timestamp for each country will have value 1, others will have value 0 88 | data = [] 89 | for hour in hours: 90 | for country in countries: 91 | timestamp_str = f"{date} {str(hour).zfill(2)}:00:00" 92 | row = [timestamp_str] + [1 if c == country else 0 for c in countries] 93 | data.append(Row(*row)) 94 | 95 | # Define column names 96 | columns 
= ['DATETIME'] + countries 97 | 98 | # Create DataFrame 99 | df = spark.createDataFrame(data, columns) 100 | 101 | # Convert string timestamp to actual timestamp 102 | df = df.withColumn("DATETIME", expr(f"to_timestamp(DATETIME, 'yyyy-MM-dd HH:mm:ss')")) 103 | 104 | df = df.toPandas() 105 | df 106 | 107 | 108 | # COMMAND ---------- 109 | 110 | # MAGIC %md 111 | # MAGIC The given code transforms df from wide to long format by melting it, filters the rows to include only those with a value of 1, and then drops the "VALUE" column. This reshaping makes the one-hot-encoded categorical (country) variables easier to work with in the subsequent steps. 112 | 113 | # COMMAND ---------- 114 | 115 | df_melted = df.melt(id_vars='DATETIME', var_name='COUNTRY', value_name='VALUE') 116 | df_melted = df_melted[df_melted['VALUE'] == 1] 117 | df_melted = df_melted.drop('VALUE', axis=1) 118 | df_melted 119 | 120 | # COMMAND ---------- 121 | 122 | df_combined = pd.concat([df_melted, df_cons], axis=0) 123 | 124 | # sort the resulting dataframe by 'DATETIME' 125 | df_combined = df_combined.sort_values('DATETIME') 126 | df_combined 127 | 128 | # COMMAND ---------- 129 | 130 | # MAGIC %md 131 | # MAGIC * The function create_lag_features takes in a DataFrame df with columns 'COUNTRY', 'DATETIME', and 'HOURLY_CONSUMPTION_MW'. 132 | # MAGIC * It sorts df based on 'COUNTRY' and 'DATETIME'. 133 | # MAGIC * Creates lag features for the previous day, week, and month's consumption. 134 | # MAGIC * Forward fills NaN values in 'HOURLY_CONSUMPTION_MW' for each country. 135 | # MAGIC * Calculates rolling statistics (mean, standard deviation, sum) for the past 24 hours and 7 days. 136 | # MAGIC * Backward fills NaN values in the lag features for each country. 137 | # MAGIC * Returns the modified DataFrame with new lag and rolling features.
138 | 139 | # COMMAND ---------- 140 | 141 | def create_lag_features(df): 142 | """ 143 | Creates lag features from datetime index 144 | """ 145 | df = df.sort_values(['COUNTRY', 'DATETIME']).reset_index(drop=True) # Sort by 'COUNTRY' and 'DATETIME' and reset index 146 | # Group by country and shift to create lagged features 147 | df['PREV_DAY_CONSUMPTION'] = df.groupby('COUNTRY')['HOURLY_CONSUMPTION_MW'].shift(24) 148 | df['PREV_WEEK_CONSUMPTION'] = df.groupby('COUNTRY')['HOURLY_CONSUMPTION_MW'].shift(24 * 7) 149 | df['PREVIOUS_MONTH_CONSUMPTION'] = df.groupby('COUNTRY')['HOURLY_CONSUMPTION_MW'].shift(24 * 30) 150 | 151 | # Forward fill to handle NaN values in HOURLY_CONSUMPTION_MW for rolling window calculations 152 | df['HOURLY_CONSUMPTION_MW'] = df.groupby('COUNTRY')['HOURLY_CONSUMPTION_MW'].fillna(method='ffill') 153 | 154 | # Calculate rolling statistics for each country 155 | df['ROLLING_MEAN_24H'] = df.groupby('COUNTRY')['HOURLY_CONSUMPTION_MW'].rolling(window=24,min_periods=1).mean().reset_index(0,drop=True) 156 | df['ROLLING_STD_24H'] = df.groupby('COUNTRY')['HOURLY_CONSUMPTION_MW'].rolling(window=24,min_periods=1).std().reset_index(0,drop=True) 157 | df['ROLLING_SUM_7D'] = df.groupby('COUNTRY')['HOURLY_CONSUMPTION_MW'].rolling(window=7 * 24, min_periods=1).sum().reset_index(0,drop=True) 158 | 159 | # Backward fill only the rows that end up as null after shifting for each country 160 | df['PREV_DAY_CONSUMPTION'] = df.groupby('COUNTRY')['PREV_DAY_CONSUMPTION'].fillna(method='bfill') 161 | df['PREV_WEEK_CONSUMPTION'] = df.groupby('COUNTRY')['PREV_WEEK_CONSUMPTION'].fillna(method='bfill') 162 | df['PREVIOUS_MONTH_CONSUMPTION'] = df.groupby('COUNTRY')['PREVIOUS_MONTH_CONSUMPTION'].fillna(method='bfill') 163 | 164 | return df 165 | 166 | df_combined = create_lag_features(df_combined) 167 | 168 | 169 | # COMMAND ---------- 170 | 171 | df_combined 172 | 173 | # COMMAND ---------- 174 | 175 | # MAGIC %md 176 | # MAGIC The given code converts a variable to datetime format, extracts rows from a DataFrame based on a specific date, and drops a specified column from the resulting DataFrame. This allows for working with a subset of data for a specific date and removing unnecessary columns for further analysis or processing. 177 | 178 | # COMMAND ---------- 179 | 180 | # Convert your predicting_date to datetime format 181 | date_to_predict = pd.to_datetime(date_to_predict) 182 | 183 | # Extract the date from the 'DATETIME' column, compare it to predicting_date 184 | df_final = df_combined[df_combined['DATETIME'].dt.date == date_to_predict.date()] 185 | df_final.drop(columns=['HOURLY_CONSUMPTION_MW'],inplace=True) 186 | 187 | # COMMAND ---------- 188 | 189 | df_final 190 | 191 | # COMMAND ---------- 192 | 193 | # MAGIC %md 194 | # MAGIC * The function create_time_features takes a DataFrame df with 'DATETIME' as one of its columns. 195 | # MAGIC * Sets 'DATETIME' as the index of the DataFrame. 196 | # MAGIC * Extracts and creates new features such as 'HOUR', 'DAY_OF_WEEK', 'MONTH', 'QUARTER', 'YEAR', 'DAY_OF_YEAR', 'DAY_OF_MONTH', and 'WEEK_OF_YEAR' from the 'DATETIME' index. 197 | # MAGIC * Sorts the DataFrame based on the 'DATETIME' index. 198 | # MAGIC * Returns the modified DataFrame with the new time-related features. 
199 | 200 | # COMMAND ---------- 201 | 202 | def create_time_features(df): 203 | """ 204 | Creates time series features from datetime index 205 | """ 206 | # Ensure 'DATETIME' is the index 207 | df.set_index('DATETIME', inplace=True) 208 | 209 | # Create date-related features 210 | df['HOUR'] = df.index.hour 211 | df['DAY_OF_WEEK'] = df.index.dayofweek 212 | df['MONTH'] = df.index.month 213 | df['QUARTER'] = df.index.quarter 214 | df['YEAR'] = df.index.year 215 | df['DAY_OF_YEAR'] = df.index.dayofyear 216 | df['DAY_OF_MONTH'] = df.index.day 217 | df['WEEK_OF_YEAR'] = df.index.isocalendar().week 218 | 219 | # Sort the DataFrame by the datetime index 220 | df.sort_index(inplace=True) 221 | 222 | return df 223 | 224 | df_final = create_time_features(df_final) 225 | df_final 226 | 227 | # COMMAND ---------- 228 | 229 | df_final 230 | 231 | # COMMAND ---------- 232 | 233 | # MAGIC %md 234 | # MAGIC * The function one_hot_encode takes a DataFrame df with 'COUNTRY' as one of its columns. 235 | # MAGIC * Defines a list of country names to be one-hot encoded. 236 | # MAGIC * Iterates through each country in the list: 237 | # MAGIC * For each country, it creates a new column in the DataFrame, named after the country. 238 | # MAGIC * Each entry in the new column is set to 1 if the 'COUNTRY' column matches the country name, otherwise it's set to 0. 239 | # MAGIC * Returns the modified DataFrame with new one-hot encoded columns for countries. 240 | 241 | # COMMAND ---------- 242 | 243 | def one_hot_encode(df): 244 | countries = ['belgium','denmark','france','germany','greece','italy','luxembourg','netherlands','spain','sweden','switzerland'] 245 | countries.sort() 246 | for country in countries: 247 | df[country] = df.apply(lambda row: 1 if row['COUNTRY'] == country else 0, axis=1) 248 | return df 249 | 250 | df_final = one_hot_encode(df_final) 251 | df_final = df_final.reset_index() 252 | df_final 253 | 254 | # COMMAND ---------- 255 | 256 | df_final.columns 257 | 258 | # COMMAND ---------- 259 | 260 | df_final[df_final['COUNTRY']=="greece"] 261 | 262 | # COMMAND ---------- 263 | 264 | # MAGIC %md 265 | # MAGIC * The code converts a Pandas DataFrame (df_final) to a Spark DataFrame (spark_df). 266 | # MAGIC * The Spark DataFrame is saved as a table in Databricks using the name specified in table_name. 267 | # MAGIC * A Delta table is created based on the Spark DataFrame. 268 | # MAGIC * The purpose is to store the data in a table format in Databricks, facilitating further analysis and querying using Spark SQL. 269 | 270 | # COMMAND ---------- 271 | 272 | spark_df = spark.createDataFrame(df_final) 273 | # Save the Spark DataFrame as a table in Databricks 274 | table_name = 'inferenece_features' 275 | spark_df.createOrReplaceTempView(table_name) 276 | spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} USING delta AS SELECT * FROM {table_name}") 277 | 278 | # COMMAND ---------- 279 | 280 | df_final.columns 281 | 282 | # COMMAND ---------- 283 | 284 | # MAGIC %md 285 | # MAGIC * Create a Spark DataFrame: The Pandas DataFrame df_final is converted to a Spark DataFrame using the spark.createDataFrame() function. This conversion allows for working with the DataFrame using Spark's distributed computing capabilities. 286 | # MAGIC * Create a temporary view: The Spark DataFrame df_final_spark is assigned as a temporary view named 'df_final' using the createOrReplaceTempView() function. 
This creates a temporary view of the DataFrame within the Spark session, enabling the execution of SQL queries and operations on the DataFrame. 287 | 288 | # COMMAND ---------- 289 | 290 | df_final_spark=spark.createDataFrame(df_final) 291 | df_final_spark.createOrReplaceTempView('df_final') 292 | 293 | # COMMAND ---------- 294 | 295 | # MAGIC %md 296 | # MAGIC * SQL Query: It executes an SQL query to select all columns from the 'df_final' table/view. 297 | # MAGIC * Add CONSUMPTION_ID column: Using the withColumn() function, a new column named 'CONSUMPTION_ID'(Primary Key) is added to the DataFrame. The values in this column are created by concatenating 'COUNTRY' and 'DATETIME' columns with an underscore ('_') separator. 298 | # MAGIC * Convert DATETIME column: Using the withColumn() function again, the 'DATETIME' column is converted to a timestamp data type by casting it with the CAST() function. 299 | # MAGIC * Drop COUNTRY column: The 'COUNTRY' column is dropped from the DataFrame using the drop() function. 300 | # MAGIC * Create temporary view: Finally, the modified DataFrame is used to create a new temporary view named 'daily_features'. 301 | 302 | # COMMAND ---------- 303 | 304 | import pyspark.sql.functions as f 305 | 306 | spark.sql('select * from df_final') \ 307 | .withColumn('CONSUMPTION_ID', f.expr('concat_ws("_", COUNTRY, DATETIME)')) \ 308 | .withColumn('DATETIME', f.expr('CAST(DATETIME AS timestamp)')) \ 309 | .drop('COUNTRY').createOrReplaceTempView('daily_features') 310 | 311 | # COMMAND ---------- 312 | 313 | # MAGIC %md 314 | # MAGIC * The variable columns is assigned the list of column names from a table named daily_features retrieved through a Spark SQL query. 315 | # MAGIC * feature_columns is created by taking all column names in columns except for 'DATETIME' and 'CONSUMPTION_ID'. 316 | # MAGIC * update_columns is a string created by joining elements from feature_columns with ' = B.' prefixed to each element and separated by commas. * This could be used in an SQL UPDATE statement. 317 | # MAGIC * insert_columns is a string created by joining 'B.' prefixed elements from feature_columns, separated by commas. This could be used in an SQL INSERT statement. 318 | 319 | # COMMAND ---------- 320 | 321 | columns = spark.sql('select * from daily_features').columns 322 | feature_columns = [column for column in columns if column not in ('DATETIME', 'CONSUMPTION_ID')] 323 | update_columns = ', '.join([f'{column} = B.{column}' for column in feature_columns]) 324 | insert_columns = ', '.join([f'B.{column}' for column in feature_columns]) 325 | 326 | # COMMAND ---------- 327 | 328 | # MAGIC %md 329 | # MAGIC * The query is merging data from a table named daily_features (aliased as B) into another table called hourly_forecasting_features (aliased as A). 330 | # MAGIC * The merge is based on the condition that the 'DATETIME' and 'CONSUMPTION_ID' columns in both tables must be equal (A.DATETIME = B.DATETIME AND A.CONSUMPTION_ID = B.CONSUMPTION_ID). 331 | # MAGIC * If there is a match between the records in tables A and B (based on 'DATETIME' and 'CONSUMPTION_ID'), then the corresponding records in table A are updated with the values from table B. The columns to be updated are defined by the string update_columns, which was created earlier to have the form column1 = B.column1, column2 = B.column2, .... 332 | # MAGIC * If there is no match between the records in table A and B, then a new record is inserted into table A with values from table B. 
The columns that will be inserted are 'DATETIME', 'CONSUMPTION_ID', and the additional feature columns. The columns to be inserted are defined in the format (column1, column2, ...) and the values to be inserted are in the format (B.column1, B.column2, ...). 333 | 334 | # COMMAND ---------- 335 | 336 | spark.sql(f""" 337 | MERGE INTO hourly_forecasting_features A 338 | USING daily_features B 339 | ON A.DATETIME = B.DATETIME AND A.CONSUMPTION_ID = B.CONSUMPTION_ID 340 | WHEN MATCHED THEN 341 | UPDATE SET 342 | {update_columns} 343 | WHEN NOT MATCHED 344 | THEN INSERT ( 345 | DATETIME, 346 | CONSUMPTION_ID, 347 | {', '.join(feature_columns)} 348 | ) VALUES ( 349 | B.DATETIME, 350 | B.CONSUMPTION_ID, 351 | {insert_columns} 352 | ) 353 | """) 354 | 355 | # COMMAND ---------- 356 | 357 | 358 | -------------------------------------------------------------------------------- /DOCUMENTATION.md: -------------------------------------------------------------------------------- 1 | # Project Documentation: Streamlining Energy Consumption Forecasting using MLOps 2 | 3 | ![Databricks Badge](https://img.shields.io/badge/Databricks-FF3621?style=for-the-badge&logo=databricks&logoColor=white) 4 | ![Azure Badge](https://img.shields.io/badge/Microsoft_Azure-0089D6?style=for-the-badge&logo=microsoft-azure&logoColor=white) 5 | ![MLflow Badge](https://img.shields.io/badge/MLflow-FF3621?style=for-the-badge&logo=mlflow&logoColor=white) 6 | ![Python Badge](https://img.shields.io/badge/Python-3776AB?style=for-the-badge&logo=python&logoColor=white) 7 | 8 | 9 | ## 📊 Data Source 10 | 11 | The data used in this project is sourced from the [ENTSO-E Transparency Platform](https://transparency.entsoe.eu/). ENTSO-E stands for European Network of Transmission System Operators for Electricity. 12 | It is an organization that brings together 42 electricity transmission system operators (TSOs) from 35 countries across Europe. ENTSO-E plays a crucial role in coordinating TSOs and facilitating the European electricity market. 13 | The platform provides authoritative, comprehensive, and transparent energy market data for European countries. 14 | 15 | For this project, hourly energy consumption data from 2015 to 2022 has been used. Additionally, some new data from 2023 has also been retrieved for testing purposes. The dataset includes hourly energy consumption data across 11 selected European countries (Belgium, Denmark, France, Germany, Greece, Italy, Luxembourg, Netherlands, Spain, Sweden, Switzerland). 16 | 17 | 18 | # MLOps Pipeline 19 | 20 | ## 🛠️ Data Engineering 21 | 22 | ### Data Ingestion: 23 | In the data ingestion stage, raw energy consumption data is collected from the ENTSO-E Transparency Platform. The data is downloaded and uploaded into Azure Databricks for processing. 24 | It is important to ensure that data ingestion is efficient and reliable, as it forms the foundation for the subsequent steps in the pipeline. 25 | 26 | ### Data Transformation: 27 | After ingestion, the data undergoes various transformations to convert it into a format that is suitable for analysis and modeling. 28 | This includes converting timestamps to a standardized format, aggregating data, and reshaping datasets. 29 | In this project, hourly energy consumption data is used; however, it is possible to aggregate this data for different time windows (e.g., daily) based on the requirements. 30 | 31 | ### Data Quality Checks: 32 | 33 | Quality checks are essential to ensure the integrity and completeness of the data. 
This includes handling missing values, identifying and rectifying any inconsistencies in the data, and ensuring that it meets the required standards for analysis. 34 | 35 | In this project, the primary tool utilized for this purpose is [Great Expectations](https://greatexpectations.io/), an open-source library for setting, validating, and documenting data expectations. 36 | 37 | Great Expectations was instrumental in defining expectations for data, which serve as automated assertions about data quality that are easy to implement and maintain. If any data does not meet these predefined expectations, the system alerts us, thereby ensuring that any decision made based on the data is as accurate as possible. 38 | 39 | For an example of the data quality reports produced by Great Expectations in this project, see the links below: 40 | 41 | #### Links to Data Quality report files: 42 | 43 | - [**Data Quality Expectations**](https://philippos01.github.io/mlops-energy-forecast-thesis/MLOps%20Pipeline/Utils/Great%20Expectations/my_expectation_suite.html) 44 | - [**Data Quality Validation**](https://philippos01.github.io/mlops-energy-forecast-thesis/MLOps%20Pipeline/Utils/Great%20Expectations/fde64798683368bcaf8fe113b0dd4b14.html) 45 | 46 | 47 | ## 🚀 Initial Deployment 48 | 49 | This section describes the critical steps undertaken during the initial deployment phase of the MLOps pipeline. The pipeline consists of an exploratory data analysis, feature engineering, model training, and unit testing. 50 | 51 | ### 🕵️‍♂️ Exploratory Data Analysis (EDA): 52 | Before diving into model training, it is essential to understand the characteristics of the data. Exploratory Data Analysis (EDA) involves summarizing the main features of the data, usually with visual methods. Through EDA, we can begin to uncover patterns, spot anomalies, and frame hypotheses for testing. 53 | 54 | - **Univariate Analysis**: Involves the examination of single features or variables. For this project, the Univariate Analysis includes: 55 | * Distribution of records across years, months, days, and hours. 56 | * Frequency of records for each country. 57 | 58 | - **Bivariate Analysis**: Investigates the relationship between two features or variables. In this project, the Bivariate Analysis includes: 59 | * Average hourly consumption per country. 60 | * Monthly consumption trends per country. 61 | 62 | - **Visualizations**: Creating graphical representations of the data. For this project, specific visualizations include: 63 | * A heatmap for Average Hourly Consumption by Country and Hour of Day to observe patterns in energy consumption. 64 | * Decomposition plots for each country to examine original, trend, seasonality, and residuals in the time series data. 65 | 66 | ### 🧪 Feature Engineering: 67 | After understanding the data through EDA, the next step is to prepare it for modeling. Feature engineering includes creating new features, transforming existing ones, and encoding categorical variables. 68 | 69 | - **One-Hot Encoding of Countries**: This involves converting the categorical 'country' feature into numerical format, where each country is represented as a binary vector. 70 | 71 | - **Feature Creation**: Generating new features that might improve the model's performance. For example, creating time-based features like the day of the week, month, year. 72 | 73 | - **Primary Key Creation**: Creating a unique identifier for each record. This is essential for indexing and retrieving records efficiently from the database. 
74 | 75 | - **Saving Features to Databricks Feature Store**: After engineering, features are saved in Databricks Feature Store, which acts as a centralized repository for feature data, ensuring consistency across different models and deployments. 76 | 77 | ### 🤖 Model Training: 78 | With the features prepared, we now proceed to model training. This step involves selecting an algorithm, training the model, and evaluating its performance. 79 | 80 | - **Data Loading from Feature Store**: The features engineered previously are loaded from Databricks Feature Store. 81 | 82 | - **Data Splitting**: The dataset is split into training and testing sets by ensuring the continuity of the data to correctly evaluate the model's performance on unseen data. 83 | 84 | - **Model Creation and Training**: The algorithm is selected, and the model is trained using the training dataset. 85 | 86 | - **Logging to Feature Store**: The trained model, along with its metrics and parameters and artifacts is logged in the Databricks Feature Store for versioning and reproducibility. 87 | 88 | ### 🧪 Unit Testing: 89 | After the model is trained, it undergoes unit testing to ensure that it meets the required performance benchmarks. 90 | 91 | - **Performance Testing**: The model is subjected to a set of tests to evaluate its performance. Metrics such as Mean Absolute Error (MAE) and Root Mean Squared Error (RMSE) are used. 92 | 93 | - **Proceed to Staging Environment**: If the model passes performance tests, it is moved to a staging environment. This stage closely resembles the production environment and is used for final testing before the model is deployed for real-world use. 94 | 95 | 96 | ## 🔄 Daily Inference 97 | 98 | This subsection outlines the daily inference procedure which is a crucial aspect of the MLOps pipeline. It ensures that the model continues to provide value by making predictions on new data. 99 | The daily inference procedure comprises three key steps: feature engineering on new data, the inference process itself, and monitoring the predictions. 100 | 101 | ### 🧪 Feature Engineering on New Data: 102 | To make predictions on new data, it's important to transform the data in a manner consistent with the training data. This involves applying the same transformations and encodings that were done during the initial deployment phase. 103 | 104 | - **Data Transformation**: The new data is transformed to ensure it's in a compatible format for the model to make predictions. This includes handling any missing values, encoding categorical variables, and creating new features. 105 | 106 | - **Saving Transformed Data**: Once the data is transformed, it's saved in a structured format that is easily retrievable. This structured data will be used for making predictions. 107 | 108 | ### 🎯 Daily Inference Procedure: 109 | This is the process where the model uses the transformed new data to make predictions. These predictions can be used for various applications such as forecasting energy consumption. 110 | 111 | - **Retrieving New Data**: The transformed new data is retrieved from the database. 112 | 113 | - **Batch Scoring**: The model, using a batch scoring function in the feature store, makes predictions on the new data. Batch scoring is efficient for making predictions on large datasets. 114 | 115 | - **Saving Predictions**: The predictions made by the model are saved back to the database. This data can be retrieved later for analysis and reporting. 
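To make the batch-scoring step concrete, below is a minimal sketch of how the three steps above can be wired together with the Databricks Feature Store client. It assumes the model was logged through the Feature Store so that feature lookups resolve automatically; the table names `inference_features_view` and `daily_predictions`, the key columns shown, and the `Production` stage are illustrative placeholders rather than the exact identifiers used in the notebooks, and only the registered model name `pyspark_mlflow_model` comes from this repository.

```python
from databricks.feature_store import FeatureStoreClient

fs = FeatureStoreClient()

# 1. Retrieve the transformed new data (feature rows for the day to predict).
#    `spark` is the SparkSession that Databricks provides in every notebook.
batch_df = spark.table("inference_features_view")  # placeholder table name

# 2. Batch scoring: score_batch joins the stored features for each key and
#    appends a 'prediction' column with the forecasted consumption.
predictions_df = fs.score_batch(
    "models:/pyspark_mlflow_model/Production",  # registered model URI; stage assumed
    batch_df,
)

# 3. Save the predictions back to a Delta table for later monitoring and reporting.
(predictions_df
    .select("CONSUMPTION_ID", "DATETIME", "prediction")  # key/column names assumed
    .write.format("delta")
    .mode("append")
    .saveAsTable("daily_predictions"))  # placeholder table name
```

Scoring through the Feature Store rather than calling the model directly keeps the feature-lookup logic identical between training and inference, which is the main reason the pipeline logs features centrally.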
116 | 117 | ### 📊 Daily Monitoring Procedure: 118 | After the predictions are made and saved, it is critical to monitor how the model is performing on new data. This involves evaluating predictions and creating visualizations. 119 | 120 | - **Retrieving Predicted Data**: The data that has been predicted by the model is retrieved from the database. 121 | 122 | - **Evaluating Predictions**: The predictions are evaluated through various metrics to understand how well the model is performing. 123 | 124 | - **Creating Visualizations**: Visualizations such as graphs and charts are created to help interpret the predictions. This can include trend analysis and the distribution of predictions over time. 125 | 126 | - **Reporting**: The results from the evaluation and visualizations are documented and reported. This reporting can be used for decision-making and planning. 127 | 128 | A systematic daily inference procedure ensures that the model remains functional and valuable in a real-world setting while its performance is constantly monitored. 129 | 130 | ## 🔄 Model Retraining 131 | 132 | One of the key aspects of maintaining a robust MLOps pipeline is to ensure that the deployed models remain efficient and accurate over time. The Model Retraining subsection focuses on the automated retraining of models on a regular basis, using the latest data. 133 | 134 | ### 🗓️ Scheduled Retraining: 135 | Model retraining is scheduled to occur automatically at regular intervals (every 1, 3, or 6 months). This is essential as data patterns may evolve, and the model needs to adapt to these changes to maintain its accuracy and relevance. 136 | 137 | - **Data Preparation**: The data saved during daily inference, which includes the predicted values and their corresponding actual values, is used for retraining. This dataset is accumulated over the chosen interval (1, 3, or 6 months). 138 | 139 | - **Retraining Process**: The model is retrained using the accumulated data. This ensures that the model learns from the most recent data patterns and adapts its parameters accordingly. 140 | 141 | ### 📈 Performance Evaluation: 142 | After retraining, it is imperative to evaluate the model's performance to determine whether the predictions have improved. 143 | 144 | - **Tracking Progress**: The performance of the models over time is tracked. This includes monitoring the number of training and retraining runs and how the metrics evolve with each iteration. 145 | 146 | - **Comparative Analysis**: The retrained model, which is initially in the staging environment, is compared against the current production model. The evaluation metrics of both models are analyzed to determine whether the retrained model shows improved performance. 147 | 148 | - **Model Promotion**: If the retrained model in the staging environment outperforms the current production model, it is promoted to replace the production model (a condensed sketch follows this list). The model that was in production is archived for record-keeping. 149 | 150 | - **Documentation**: All the steps, decisions, and metrics are documented for future reference and transparency. 151 | 152 | By continuously monitoring and retraining the model, this process ensures that the model remains adaptive to changing data patterns and provides the most accurate and efficient predictions possible.
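The promotion logic above can be sketched with the MLflow client as follows. This is only an illustration under simplifying assumptions: the registered model name is a placeholder, the comparison reuses the MAE logged at training time rather than the SMAPE computed on freshly scored data in this project's Performance Evaluation notebook, and the stage transition is done directly instead of through a Databricks transition request.

```python
# Minimal sketch of staging-vs-production comparison and promotion via MLflow.
# The model name is a placeholder; the project itself compares SMAPE on fresh
# batch scores and routes the promotion through a transition request.
from mlflow.tracking import MlflowClient

client = MlflowClient()
model_name = "daily_consumption_forecast"  # placeholder registered-model name

def latest_version(stage: str):
    """Return the newest model version in the given registry stage, or None."""
    versions = client.get_latest_versions(model_name, stages=[stage])
    return versions[0] if versions else None

def logged_mae(version) -> float:
    """Read the MAE metric logged by the training run that produced this version."""
    return client.get_run(version.run_id).data.metrics["MAE"]

staging = latest_version("Staging")
production = latest_version("Production")

if staging and production and logged_mae(staging) < logged_mae(production):
    # Promote the retrained model and archive the version it replaces.
    client.transition_model_version_stage(
        name=model_name,
        version=staging.version,
        stage="Production",
        archive_existing_versions=True,
    )
    print(f"Promoted version {staging.version} of '{model_name}' to Production.")
else:
    print("Keeping the current Production model.")
```

In this repository, the full comparison and the transition request are implemented in `06.Performance Evaluation.py`, reproduced further below.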
153 | 154 | 155 | ## 🚀 Deployment Strategy 156 | 157 | In the context of MLOps, deployment is a critical phase where the machine learning model is integrated into a production environment, making it accessible for real-world forecasting. A robust deployment strategy ensures that the model is reliable, scalable, and efficient. 158 | 159 | ### MLflow and Databricks Integration: 160 | - After training, the model is logged through the Databricks Feature Store and registered in MLflow, a platform that manages the ML lifecycle, including experimentation, reproducibility, and deployment. 161 | 162 | - MLflow is natively integrated within the Databricks workspace. This seamless integration is crucial as it allows for efficient management and tracking of models within a familiar ecosystem. 163 | 164 | ### Scalability and Performance: 165 | - Databricks, known for its high-performance analytics engine, is particularly suited for handling large datasets and complex computations. By deploying the model within Databricks, we leverage its ability to scale effortlessly to meet data and computational demands. 166 | 167 | # 🔄 Workflow Overview 168 | 169 | In this project, we have established three main workflows that are integral to the systematic functioning and updating of the energy consumption forecasting system: 170 | 171 | ## 1. Initial Deployment / Redeployment 172 | 173 | This workflow encompasses all the steps necessary for the initial deployment of the model, as well as any subsequent redeployments. It includes data engineering, exploratory data analysis, feature engineering, model training, and performance evaluation. This workflow is initiated manually and ensures that the model is properly set up and integrated into the Azure Databricks and MLflow ecosystem. 174 | 175 | ## 2. Daily Inference 176 | 177 | The Daily Inference workflow is automated and triggered every day. Its purpose is to forecast the energy consumption for the next day. This workflow starts by retrieving new data from the database and processing it to be compatible with the model. Through the batch scoring function of the Feature Store, predictions are generated and subsequently saved back into the database for further analysis and utilization. 178 | 179 | ## 3. Model Retraining 180 | 181 | The Model Retraining workflow is designed to ensure that the forecasting model remains up-to-date and incorporates the latest data for higher accuracy. This workflow is automatically triggered every three months. During this process, the model is retrained using newly collected data that has been saved during the Daily Inference workflow. After the retraining process, the model's performance is evaluated and compared to the current production model. If the retrained model exhibits improved performance, it replaces the existing production model, which is then archived. 182 | 183 | These workflows are designed to work seamlessly together to provide an efficient, scalable, and up-to-date energy consumption forecasting system. Through automation and systematic processes, this setup ensures accuracy and sustainability in forecasting energy consumption across multiple European countries. 184 | 185 | 186 | ## Overall Architecture 187 | ![MLOps Architecture](MLOps%20Pipeline/Utils/Images/MLOps%20Architecture%20(1).png) 188 | ## 🎉 Conclusion 189 | 190 | This project efficiently addresses the challenge of forecasting energy consumption across multiple European countries.
By employing Azure Databricks and MLflow, it leverages a powerful and scalable environment for data processing and model deployment. Continuous monitoring and automatic retraining ensure that the model remains accurate and up-to-date. 191 | This solution offers immense value to utilities and grid operators in optimizing energy management and planning. 192 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/00. Initial Deployment/Model Training(LSTM).py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Initial Deployment" 3 | 4 | # COMMAND ---------- 5 | 6 | from pyspark import SparkContext, SparkConf 7 | from pyspark.sql import SQLContext 8 | from pyspark.ml.feature import StringIndexer, StandardScaler, VectorAssembler, MinMaxScaler 9 | from pyspark.sql.functions import rand 10 | from pyspark.mllib.evaluation import MulticlassMetrics 11 | import tensorflow as tf 12 | from tensorflow.keras.layers import Dense 13 | from tensorflow.keras.models import Sequential 14 | # initialize SparkSession 15 | 16 | # COMMAND ---------- 17 | 18 | # Load Consumption Region Table 19 | consumption_countries_hourly = spark.table('df_dev.final_consumption_countries_hourly') 20 | 21 | # Update the key column construction in the PySpark code 22 | consumption_countries_hourly = consumption_countries_hourly.withColumn('CONSUMPTION_ID', concat(col('COUNTRY'), lit('_'), col('DATETIME').cast('string'))) 23 | 24 | # Split the labels into training and test 25 | train_labels = consumption_countries_hourly.filter((col('DATETIME') >= train_start) & (col('DATETIME') <= train_end)) 26 | test_labels = consumption_countries_hourly.filter((col('DATETIME') > train_end) & (col('DATETIME') <= test_end)) 27 | val_labels = consumption_countries_hourly.filter((col('DATETIME') > test_end) & (col('DATETIME') <= validation_end)) 28 | 29 | # Select the required columns 30 | train_labels = train_labels.select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 31 | test_labels = test_labels.select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 32 | val_labels = val_labels.select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 33 | 34 | # COMMAND ---------- 35 | 36 | 37 | def load_data(table_name, labels, lookup_key, ts_lookup_key): 38 | # In the FeatureLookup, if you do not provide the `feature_names` parameter, all features except primary keys are returned 39 | model_feature_lookups = [FeatureLookup(table_name=table_name, lookup_key=lookup_key, timestamp_lookup_key=ts_lookup_key)] 40 | # fs.create_training_set looks up features in model_feature_lookups that match the primary key from inference_data_df 41 | training_set = fs.create_training_set(labels, 42 | model_feature_lookups, 43 | label="HOURLY_CONSUMPTION_MW", 44 | exclude_columns=["CONSUMPTION_ID", "DATETIME"]) 45 | training_df = training_set.load_df() 46 | 47 | return training_set, training_df 48 | 49 | # Cast the 'DATETIME' column to 'TIMESTAMP' data type 50 | train_labels = train_labels.withColumn('DATETIME', col('DATETIME').cast(TimestampType())) 51 | test_labels = test_labels.withColumn('DATETIME', col('DATETIME').cast(TimestampType())) 52 | val_labels = val_labels.withColumn('DATETIME', col('DATETIME').cast(TimestampType())) 53 | 54 | # Load the data for the training set 55 | training_set, 
train_df = load_data(f'{db}.hourly_forecasting_features', train_labels, 'CONSUMPTION_ID', 'DATETIME') 56 | 57 | # Load the data for the test set 58 | _, test_df = load_data(f'{db}.hourly_forecasting_features', test_labels, 'CONSUMPTION_ID', 'DATETIME') 59 | 60 | # Load the data for the validation set 61 | _, val_df = load_data(f'{db}.hourly_forecasting_features', val_labels, 'CONSUMPTION_ID', 'DATETIME') 62 | 63 | 64 | # COMMAND ---------- 65 | 66 | display(train_df) 67 | 68 | # COMMAND ---------- 69 | 70 | train_df 71 | 72 | # COMMAND ---------- 73 | 74 | # MAGIC %md 75 | # MAGIC Define the features and label columns: We first need to specify which columns in the dataframe are features and which column is the label 76 | 77 | # COMMAND ---------- 78 | 79 | featuresCols = train_df.columns[:-1] 80 | target_names = [train_df.columns[-1]] 81 | 82 | # COMMAND ---------- 83 | 84 | # MAGIC %md 85 | # MAGIC Create VectorAssembler and MinMaxScaler objects: VectorAssembler combines the specified feature columns into a single vector column. MinMaxScaler normalizes these feature vectors to be in the range [0, 1]. 86 | 87 | # COMMAND ---------- 88 | 89 | # Assuming you have loaded your dataset into a DataFrame called 'data' 90 | # Assuming the label column name is 'label' 91 | 92 | # Extract the labels from the DataFrame 93 | labels = train_df.select('HOURLY_CONSUMPTION_MW').rdd.flatMap(lambda x: x).collect() 94 | 95 | # Find the minimum and maximum values of the labels 96 | min_label = min(labels) 97 | max_label = max(labels) 98 | 99 | 100 | # COMMAND ---------- 101 | 102 | from pyspark.ml.feature import VectorAssembler, MinMaxScaler 103 | 104 | vectorAssembler = VectorAssembler(inputCols=featuresCols, outputCol='assembled_features') 105 | # Create a separate MinMaxScaler for features 106 | scaler_features = MinMaxScaler(min=0.0, max=1.0, inputCol='assembled_features', outputCol='features') 107 | vectorAssemblerLabel = VectorAssembler(inputCols=target_names, outputCol='label') 108 | scaler_label = MinMaxScaler(min=0.0, max=1.0, inputCol='label', outputCol='scaled_label') 109 | 110 | # COMMAND ---------- 111 | 112 | # MAGIC %md 113 | # MAGIC Create a pipeline of transformations: The pipeline includes vector assembly and scaling stages. 
114 | 115 | # COMMAND ---------- 116 | 117 | from pyspark.ml import Pipeline 118 | 119 | stages = [vectorAssembler,scaler_features,vectorAssemblerLabel,scaler_label] 120 | pipeline = Pipeline(stages=stages) 121 | 122 | # COMMAND ---------- 123 | 124 | # MAGIC %md 125 | # MAGIC Apply the transformations to each DataFrame: 126 | 127 | # COMMAND ---------- 128 | 129 | # Fit the pipeline to the training data 130 | pipeline_model = pipeline.fit(train_df) 131 | 132 | # Transform each DataFrame 133 | train_transformed = pipeline_model.transform(train_df) 134 | val_transformed = pipeline_model.transform(val_df) 135 | test_transformed = pipeline_model.transform(test_df) 136 | 137 | 138 | # COMMAND ---------- 139 | 140 | # Save the fitted pipeline for later use 141 | pipeline_model.write().overwrite().save("/dbfs/FileStore/Fitted_Pipeline") 142 | 143 | # COMMAND ---------- 144 | 145 | train_transformed.show(truncate=False, vertical=True, n=1) 146 | 147 | # COMMAND ---------- 148 | 149 | # MAGIC %md 150 | # MAGIC Convert to Pandas DataFrames 151 | 152 | # COMMAND ---------- 153 | 154 | # Convert to pandas 155 | train_pd = train_transformed.toPandas() 156 | val_pd = val_transformed.toPandas() 157 | test_pd = test_transformed.toPandas() 158 | 159 | # COMMAND ---------- 160 | 161 | # MAGIC %md 162 | # MAGIC Exctract features and labels 163 | 164 | # COMMAND ---------- 165 | 166 | train_pd 167 | 168 | # COMMAND ---------- 169 | 170 | # Extract features and labels 171 | import numpy as np 172 | X_train = np.array(train_pd['features'].to_list()) 173 | y_train = np.array(train_pd['scaled_label'].to_list()) 174 | 175 | X_val = np.array(val_pd['features'].to_list()) 176 | y_val = np.array(val_pd['scaled_label'].to_list()) 177 | 178 | X_test = np.array(test_pd['features'].to_list()) 179 | y_test = np.array(test_pd['scaled_label'].to_list()) 180 | 181 | # COMMAND ---------- 182 | 183 | y_train[0:1] 184 | 185 | # COMMAND ---------- 186 | 187 | X_train[0:1] 188 | 189 | # COMMAND ---------- 190 | 191 | # MAGIC %md 192 | # MAGIC Reshape data for LSTM: 193 | 194 | # COMMAND ---------- 195 | 196 | # Reshape for LSTM 197 | X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1)) 198 | X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1)) 199 | X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1)) 200 | 201 | # COMMAND ---------- 202 | 203 | y_train.shape 204 | 205 | # COMMAND ---------- 206 | 207 | X_train.shape 208 | 209 | # COMMAND ---------- 210 | 211 | # MAGIC %md 212 | # MAGIC Define and compile LSTM model: 213 | 214 | # COMMAND ---------- 215 | 216 | from tensorflow.keras.models import Sequential 217 | from tensorflow.keras.layers import LSTM, Dense 218 | from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint 219 | import os 220 | 221 | experiment_log_dir = f"/dbfs/{user}/tb" 222 | checkpoint_path = f"/dbfs/{user}/keras_checkpoint_weights_day_ckpt" 223 | os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True) 224 | 225 | epochs = 100 226 | early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=10, restore_best_weights = True) 227 | tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=experiment_log_dir) 228 | model_checkpoint = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=True) 229 | 230 | # COMMAND ---------- 231 | 232 | model = Sequential() 233 | model.add(LSTM(100, activation='tanh', return_sequences=True, input_shape=(X_train.shape[1], 1))) 234 | model.add(LSTM(64, activation='tanh', return_sequences=False)) 235 | 
model.add(Dense(1)) 236 | model.compile(optimizer='adam', loss='mse',metrics=['mae']) 237 | 238 | # COMMAND ---------- 239 | 240 | start_time = time.time() 241 | history = model.fit(X_train, y_train, validation_data = (X_val , y_val), epochs=epochs, callbacks=[tensorboard_callback, model_checkpoint, early_stopping],verbose=1) 242 | end_time = time.time() 243 | 244 | # COMMAND ---------- 245 | 246 | # Validate the model 247 | val_loss = model.evaluate(X_val, y_val) 248 | 249 | # Test the model 250 | test_loss = model.evaluate(X_test, y_test) 251 | 252 | # COMMAND ---------- 253 | 254 | y_pred = model.predict(X_test) 255 | 256 | # COMMAND ---------- 257 | 258 | import numpy as np 259 | 260 | # Assuming you have loaded the predicted scaled labels into a variable called 'y_pred' 261 | 262 | # Define the minimum and maximum values for the labels 263 | min_label = 201.0 264 | max_label = 324310.0 265 | 266 | # Compute the scaled label range 267 | scaled_label_range = max_label - min_label 268 | 269 | # Perform inverse scaling on the predicted labels 270 | y_pred_original = (y_pred * scaled_label_range) + min_label 271 | y_test_original = (y_test * scaled_label_range) + min_label 272 | 273 | 274 | # COMMAND ---------- 275 | 276 | y_pred.flatten() 277 | 278 | # COMMAND ---------- 279 | 280 | # create a dataframe 281 | compare_df = pd.DataFrame({'Actual': y_test_original.flatten(), 'Predicted': y_pred_original.flatten()}) 282 | compare_df 283 | 284 | # COMMAND ---------- 285 | 286 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 287 | import numpy as np 288 | 289 | # Generate predictions 290 | y_pred = model.predict(X_test) 291 | 292 | # Flatten y_test and y_pred to 1D arrays (this may not be necessary depending on the shape of your arrays) 293 | y_test_flat = y_test.flatten() 294 | y_pred_flat = y_pred.flatten() 295 | 296 | # Compute RMSE 297 | rmse = np.sqrt(mean_squared_error(y_test_flat, y_pred_flat)) 298 | print("Root Mean Square Error: ", rmse) 299 | 300 | # Compute MAE 301 | mae = mean_absolute_error(y_test_flat, y_pred_flat) 302 | print("Mean Absolute Error: ", mae) 303 | 304 | # Compute R2 Score 305 | r2 = r2_score(y_test_flat, y_pred_flat) 306 | print("R-squared: ", r2) 307 | 308 | 309 | # COMMAND ---------- 310 | 311 | # Since your output might be multi-dimensional, you might want to select a specific dimension for plotting 312 | # Here's an example for the first dimension 313 | dim = 0 314 | y_test_dim = y_test[:, dim] 315 | y_test_pred_dim = y_pred[:, dim] 316 | 317 | # Create a new figure 318 | plt.figure(figsize=(10, 6)) 319 | 320 | # Plot the actual values 321 | plt.plot(y_test_dim, 'b-', label='actual') 322 | 323 | # Plot the predicted values 324 | plt.plot(y_test_pred_dim, 'r-', label='predicted') 325 | 326 | # Create the legend 327 | plt.legend() 328 | 329 | # Show the plot 330 | plt.show() 331 | 332 | # COMMAND ---------- 333 | 334 | # MAGIC %md 335 | # MAGIC ## Define Metrics/Parameters to be logged 336 | 337 | # COMMAND ---------- 338 | 339 | # Metrcis 340 | mse = mean_squared_error(y_test_flat, y_pred_flat) 341 | mae = mean_absolute_error(y_test_flat, y_pred_flat) 342 | rmse = np.sqrt(mse) # or mse**(0.5) 343 | r2 = r2_score(y_test_flat, y_pred_flat) 344 | 345 | #Hyperparameters 346 | hyperparameters = { 347 | "epochs": epochs, 348 | "batch_size": 21088, # if you defined a batch size 349 | "early_stopping_patience": early_stopping.patience, 350 | "optimizer": str(type(model.optimizer).__name__), 351 | "loss_function": model.loss.__name__ if 
callable(model.loss) else str(model.loss), 352 | "first_layer_units": model.layers[0].units, 353 | "first_layer_activation": model.layers[0].activation.__name__ if callable(model.layers[0].activation) else str(model.layers[0].activation), 354 | "second_layer_units": model.layers[1].units, 355 | "second_layer_activation": model.layers[1].activation.__name__ if callable(model.layers[1].activation) else str(model.layers[1].activation), 356 | "min_label" : min_label, 357 | "max_label" : max_label, 358 | "training_size":len(X_train), 359 | "training_range": { 360 | 'start': '2015-01-01', 361 | 'end': '2021-12-31' 362 | }, 363 | "testing_size":len(X_test), 364 | "testing_range":{ 365 | 'start':'2022-01-01', 366 | 'end':'2022-09-30' 367 | }, 368 | "validation_size" : len(X_val), 369 | "validation_range":{ 370 | 'start':'2022-10-01', 371 | 'end':'2023-01-01' 372 | } 373 | 374 | } 375 | 376 | #Model Training Time 377 | training_time = end_time - start_time 378 | 379 | #Current Time 380 | current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) 381 | 382 | # Description 383 | description = "The logged model is an LSTM-based recurrent neural network that has been trained to predict DAILY_CONSUMPTION_MW based on various input features. It leverages the temporal dependencies present in the data, making it suitable for energy consumption prediction. The model has been fine-tuned with the optimal number of epochs and other hyperparameters to ensure its effectiveness." 384 | 385 | # Model Tags 386 | tags = { 387 | "model_type": "RNN LSTM", 388 | "dataset": "Energy Consumption", 389 | "application": "Energy Management", 390 | "framework": "TensorFlow/Keras" 391 | } 392 | 393 | 394 | # COMMAND ---------- 395 | 396 | # MAGIC %md 397 | # MAGIC ## Register Model to Mlflow 398 | 399 | # COMMAND ---------- 400 | 401 | from mlflow.models.signature import infer_signature 402 | signature = infer_signature(X_train, model.predict(X_train)) 403 | 404 | # COMMAND ---------- 405 | 406 | model_name = 'lstm_model' 407 | 408 | # COMMAND ---------- 409 | 410 | with mlflow.start_run(nested=True) as run: 411 | 412 | # Log the model input schema 413 | schema = {"input_schema": list(train_df.columns[:-1]),"output_schema":train_df.columns[-1]} 414 | mlflow.log_dict(schema, "schema.json") 415 | 416 | # Log some tags for the model 417 | mlflow.set_tags(tags) 418 | 419 | # Log some parameters for the model 420 | mlflow.log_dict(hyperparameters, "hyperparams.json") 421 | 422 | # Log the evaluation metrics as metrics 423 | mlflow.log_metric("MAE", mae) 424 | mlflow.log_metric("MSE", mse) 425 | mlflow.log_metric("RMSE", rmse) 426 | mlflow.log_metric("R2", r2) 427 | 428 | #Log the time taken to train as metric 429 | mlflow.log_metric("Training Time(sec)", training_time) 430 | 431 | # Log evaluation metrics as artifact 432 | metrics = {"R2": r2, "MSE": mse, "RMSE": rmse, 'MAE':mae} 433 | mlflow.log_dict(metrics, "metrics.json") 434 | 435 | # Log the model description as artifact 436 | mlflow.log_text(description, "description.txt") 437 | 438 | # Log the current timestamp as the code version 439 | mlflow.log_param("code_version", current_time) 440 | 441 | # Log all hyperparameters 442 | mlflow.log_params(hyperparameters) 443 | 444 | fs.log_model( 445 | model=model, 446 | artifact_path=f"{model_name}_artifact_path", 447 | flavor=mlflow.tensorflow, 448 | training_set = training_set , 449 | registered_model_name = model_name 450 | ) 451 | 452 | # COMMAND ---------- 453 | 454 | with mlflow.start_run(nested=True) as run: 455 | 456 | # 
Log the model input schema 457 | schema = {"input_schema": list(train_df.columns[:-1]),"output_schema":train_df.columns[-1]} 458 | mlflow.log_dict(schema, "schema.json") 459 | 460 | # Log some tags for the model 461 | mlflow.set_tags(tags) 462 | 463 | # Log some parameters for the model 464 | mlflow.log_dict(hyperparameters, "hyperparams.json") 465 | 466 | # Log the evaluation metrics as metrics 467 | mlflow.log_metric("MAE", mae) 468 | mlflow.log_metric("MSE", mse) 469 | mlflow.log_metric("RMSE", rmse) 470 | mlflow.log_metric("R2", r2) 471 | 472 | #Log the time taken to train as metric 473 | #mlflow.log_metric("Training Time(sec)", training_time) 474 | 475 | # Log evaluation metrics as artifact 476 | metrics = {"R2": r2, "MSE": mse, "RMSE": rmse, 'MAE':mae,'Training Time(sec)':training_time} 477 | mlflow.log_dict(metrics, "metrics.json") 478 | 479 | # Log the model description as artifact 480 | mlflow.log_text(description, "description.txt") 481 | 482 | # Log the current timestamp as the code version 483 | mlflow.log_param("code_version", current_time) 484 | 485 | # Log all hyperparameters 486 | mlflow.log_params(hyperparameters) 487 | 488 | # Log the model with its signature 489 | mlflow.keras.log_model(model,artifact_path="model", signature=signature) 490 | 491 | # Register the model with its signature 492 | model_uri = f"runs:/{mlflow.active_run().info.run_id}/model" 493 | mlflow.register_model(model_uri=model_uri, name="lstm_model") 494 | 495 | # Get the latest model version(The one that we now registered) 496 | client = MlflowClient() 497 | # Search for all versions of the registered model 498 | versions = client.search_model_versions("name='lstm_model'") 499 | # Sort the versions by creation timestamp in descending order 500 | sorted_versions = sorted(versions, key=lambda v: v.creation_timestamp, reverse=True) 501 | # Get the latest version 502 | latest_version = sorted_versions[0] 503 | # Access the version number 504 | model_version = latest_version.version 505 | 506 | # Save your data to a new DBFS directory for each run 507 | data_path = f"dbfs:/FileStore/Data_Versioning/data_model_v{model_version}.parquet" 508 | train_df.write.format("parquet").save(data_path) 509 | 510 | # Log the DBFS path as an artifact 511 | with open("data_path.txt", "w") as f: 512 | f.write(data_path) 513 | mlflow.log_artifact("data_path.txt") 514 | 515 | # COMMAND ---------- 516 | 517 | 518 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/06.Performance Evaluation.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Configuration 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Initial Deployment" 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ## Configuration 13 | 14 | # COMMAND ---------- 15 | 16 | date_object = datetime.strptime(train_end, '%Y-%m-%d') 17 | new_train_end = (date_object + relativedelta(months=3)).strftime('%Y-%m-%d') 18 | date_object = datetime.strptime(test_start, '%Y-%m-%d') 19 | new_test_start = (date_object + relativedelta(months=3)).strftime('%Y-%m-%d') 20 | new_test_end = '2023-01-01' 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md 25 | # MAGIC ## Load all the Current Data 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %md 30 | # MAGIC * Load energy consumption data 
from a database into a Pandas DataFrame. 31 | # MAGIC * Create a new column CONSUMPTION_ID by concatenating country codes with the date-time information. 32 | # MAGIC * Convert the DATETIME column to a proper datetime data type for time-based operations. 33 | # MAGIC * Define test labels based on date-time ranges. 34 | # MAGIC * Convert the test labels back into Spark DataFrames and select only the CONSUMPTION_ID, DATETIME, and HOURLY_CONSUMPTION_MW columns for further processing 35 | 36 | # COMMAND ---------- 37 | 38 | # Load Consumption Region Table 39 | consumption_countries_hourly = spark.table('df_dev.final_consumption_countries_hourly').toPandas() 40 | consumption_countries_hourly['CONSUMPTION_ID'] = consumption_countries_hourly.COUNTRY + '_' + consumption_countries_hourly.DATETIME.astype(str) 41 | consumption_countries_hourly['DATETIME'] = pd.to_datetime(consumption_countries_hourly['DATETIME']) 42 | test_labels = consumption_countries_hourly.loc[(consumption_countries_hourly.DATETIME > new_test_start) & (consumption_countries_hourly.DATETIME <= new_test_end)] 43 | test_labels = spark.createDataFrame(test_labels).select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 44 | 45 | # COMMAND ---------- 46 | 47 | # MAGIC %md 48 | # MAGIC ## Get Initial Deployment Training Runs Based on Experiment ID 49 | 50 | # COMMAND ---------- 51 | 52 | # MAGIC %md 53 | # MAGIC This code snippet uses the mlflow library to search for experiment runs based on a specific experiment ID (experiment_id_training), orders them by the Mean Absolute Error (MAE) metric, and stores the results in a DataFrame called runs_training. It then displays the first 5 rows of this DataFrame. 54 | 55 | # COMMAND ---------- 56 | 57 | runs_training = mlflow.search_runs(experiment_ids=experiment_id_training, 58 | order_by=['metrics.MAE']) 59 | runs_training.head(5) 60 | 61 | # COMMAND ---------- 62 | 63 | # MAGIC %md 64 | # MAGIC ## Find Best Runs of Past Month for the Initial Model Training 65 | 66 | # COMMAND ---------- 67 | 68 | #earliest_start_time = (datetime.now() - timedelta(days=14)).strftime('%Y-%m-%d') 69 | #recent_runs = runs_training[runs_training.start_time >= earliest_start_time] 70 | runs_training = runs_training.assign(Run_Date=runs_training.start_time.dt.floor(freq='D')) 71 | 72 | # Filter the rows to only include those with non-null values in the "metrics.MAE" column 73 | runs_training = runs_training[runs_training['metrics.MAE'].notna()] 74 | #print("Length of recent_runs before filtering: ", len(runs_training)) 75 | #print("Length of recent_runs after filtering: ", len(runs_training)) 76 | 77 | best_runs_per_day_idx = runs_training.groupby(['Run_Date'])['metrics.MAE'].idxmin() 78 | best_runs = runs_training.loc[best_runs_per_day_idx] 79 | 80 | # Select the required columns for display 81 | metrics_columns = ['Run_Date', 'metrics.MAE', 'metrics.Training Time(sec)', 'metrics.RMSE', 'metrics.R2', 'metrics.MSE'] 82 | display(best_runs[metrics_columns]) 83 | 84 | # COMMAND ---------- 85 | 86 | # MAGIC %md 87 | # MAGIC ## Get Retraining Best Runs Based on Experiment Id 88 | 89 | # COMMAND ---------- 90 | 91 | runs_retraining = mlflow.search_runs(experiment_ids=experiment_id_retraining, 92 | order_by=['metrics.MAE']) 93 | runs_retraining.head(5) 94 | 95 | # COMMAND ---------- 96 | 97 | #earliest_start_time = (datetime.now() - timedelta(days=14)).strftime('%Y-%m-%d') 98 | #recent_runs = runs_retraining[runs_retraining.start_time >= earliest_start_time] 99 | runs_retraining = 
runs_retraining.assign(Run_Date=runs_retraining.start_time.dt.floor(freq='D')) 100 | 101 | # Filter the rows to only include those with non-null values in the "metrics.MAE" column 102 | runs_retraining = runs_retraining[runs_retraining['metrics.MAE'].notna()] 103 | #print("Length of recent_runs before filtering: ", len(runs_retraining)) 104 | #print("Length of recent_runs after filtering: ", len(recent_runs)) 105 | 106 | best_runs_per_day_idx = runs_retraining.groupby(['Run_Date'])['metrics.MAE'].idxmin() 107 | best_runs = runs_retraining.loc[best_runs_per_day_idx] 108 | 109 | # Select the required columns for display 110 | metrics_columns = ['Run_Date', 'metrics.MAE', 'metrics.Training Time(sec)', 'metrics.RMSE', 'metrics.R2', 'metrics.MSE'] 111 | display(best_runs[metrics_columns]) 112 | 113 | 114 | # COMMAND ---------- 115 | 116 | # MAGIC %md 117 | # MAGIC ## Find Number of Initial Training Runs for Past Month 118 | 119 | # COMMAND ---------- 120 | 121 | # MAGIC %md 122 | # MAGIC * Calculates the date 30 days ago. 123 | # MAGIC * Filters experiment runs from the last 30 days. 124 | # MAGIC * Adds a column representing the date of each run. 125 | # MAGIC * Groups runs by date and counts the number of runs per day. 126 | # MAGIC * Formats the date for display. 127 | # MAGIC * Renames a column for clarity. 128 | # MAGIC * Displays a DataFrame showing the number of experiment runs for each day over the last 30 days. 129 | 130 | # COMMAND ---------- 131 | 132 | earliest_start_time = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d') 133 | recent_runs = runs_training[runs_training.start_time >= earliest_start_time] 134 | 135 | recent_runs['Run Date'] = recent_runs.start_time.dt.floor(freq='D') 136 | 137 | runs_per_day = recent_runs.groupby( 138 | ['Run Date'] 139 | ).count()[['run_id']].reset_index() 140 | runs_per_day['Run Date'] = runs_per_day['Run Date'].dt.strftime('%Y-%m-%d') 141 | runs_per_day.rename({ 'run_id': 'Number of Runs' }, axis='columns', inplace=True) 142 | 143 | display(runs_per_day) 144 | 145 | # COMMAND ---------- 146 | 147 | # MAGIC %md 148 | # MAGIC ## Find Number of Retraining Runs for Past Month 149 | 150 | # COMMAND ---------- 151 | 152 | # MAGIC %md 153 | # MAGIC * Calculates the date 30 days ago. 154 | # MAGIC * Filters experiment runs from the last 30 days. 155 | # MAGIC * Adds a column representing the date of each run. 156 | # MAGIC * Groups runs by date and counts the number of runs per day. 157 | # MAGIC * Formats the date for display. 158 | # MAGIC * Renames a column for clarity. 159 | # MAGIC * Displays a DataFrame showing the number of experiment runs for each day over the last 30 days. 
160 | 161 | # COMMAND ---------- 162 | 163 | earliest_start_time = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d') 164 | recent_runs = runs_retraining[runs_retraining.start_time >= earliest_start_time] 165 | 166 | recent_runs['Run Date'] = recent_runs.start_time.dt.floor(freq='D') 167 | 168 | runs_per_day = recent_runs.groupby( 169 | ['Run Date'] 170 | ).count()[['run_id']].reset_index() 171 | runs_per_day['Run Date'] = runs_per_day['Run Date'].dt.strftime('%Y-%m-%d') 172 | runs_per_day.rename({ 'run_id': 'Number of Runs' }, axis='columns', inplace=True) 173 | 174 | display(runs_per_day) 175 | 176 | # COMMAND ---------- 177 | 178 | # MAGIC %md 179 | # MAGIC ## Model Comparison (Staging - Production) 180 | 181 | # COMMAND ---------- 182 | 183 | # MAGIC %md 184 | # MAGIC ### Request for Latest Models of Each Environment 185 | 186 | # COMMAND ---------- 187 | 188 | # MAGIC %md 189 | # MAGIC * Sets up HTTP headers with an authorization token and query parameters with a model name for an API request. 190 | # MAGIC * Sends a GET request to the MLflow REST API of a Databricks instance to retrieve all versions of a registered machine learning model. 191 | # MAGIC * Checks if the API response was successful. If not, an exception is raised with an error message containing the status code and error message from the API response. 192 | # MAGIC * If the API response was successful, it extracts the registered model details from the response JSON. 193 | # MAGIC * Prints the list of model versions in a JSON formatted output. 194 | 195 | # COMMAND ---------- 196 | 197 | # Set the headers and query parameters for the request 198 | headers = {"Authorization": f"Bearer {access_token}"} 199 | params = {"name": model_name} 200 | 201 | # Send the GET request to the MLflow REST API to retrieve all versions of the model 202 | response = requests.get(f"https://{databricks_instance}/api/2.0/preview/mlflow/registered-models/get", headers=headers, params=params) 203 | 204 | # Check if the response was successful 205 | if response.status_code != 200: 206 | raise Exception(f"Failed to retrieve registered models. Status code: {response.status_code}. 
Error message: {response.json()['error_code']}: {response.json()['message']}") 207 | 208 | model_versions = response.json()['registered_model'] 209 | 210 | # Print the list of model versions 211 | print(json.dumps(model_versions, indent=2)) 212 | 213 | 214 | # COMMAND ---------- 215 | 216 | # MAGIC %md 217 | # MAGIC ## Retrieve Latest Staging & Production Models 218 | 219 | # COMMAND ---------- 220 | 221 | prod_uri = None 222 | staging_uri = None 223 | 224 | for i in range(len(model_versions['latest_versions'])): 225 | if model_versions['latest_versions'][i]['current_stage'] == 'Production': 226 | prod_uri = f"models:/{model_name}/{model_versions['latest_versions'][i]['version']}" 227 | elif model_versions['latest_versions'][i]['current_stage'] == 'Staging': 228 | staging_uri = f"models:/{model_name}/{model_versions['latest_versions'][i]['version']}" 229 | 230 | if prod_uri is None: 231 | print('No model versions found in production') 232 | else: 233 | print(f'Latest production model version: {prod_uri}') 234 | 235 | if staging_uri is None: 236 | print('No model versions found in staging') 237 | else: 238 | print(f'Latest staging model version: {staging_uri}') 239 | 240 | # COMMAND ---------- 241 | 242 | # MAGIC %md 243 | # MAGIC ### Perform Batch Score to the Models 244 | 245 | # COMMAND ---------- 246 | 247 | # MAGIC %md 248 | # MAGIC * Perform batch scoring for the latest deployed staging & production model 249 | 250 | # COMMAND ---------- 251 | 252 | # Check if both the production and staging URIs have been retrieved 253 | if not prod_uri: 254 | raise Exception("Failed to retrieve the production model URI.") 255 | if not staging_uri: 256 | raise Exception("Failed to retrieve the staging model URI.") 257 | 258 | # Score the test dataset using the production and staging models 259 | staging_scores = fs.score_batch(staging_uri, test_labels, result_type='float') 260 | prod_scores = fs.score_batch(prod_uri, test_labels, result_type='float') 261 | 262 | prod_scores = prod_scores.withColumnRenamed("prediction", "prod_prediction") 263 | staging_scores = staging_scores.withColumnRenamed("prediction", "staging_prediction") 264 | 265 | display(prod_scores) 266 | display(staging_scores) 267 | 268 | # COMMAND ---------- 269 | 270 | # MAGIC %md 271 | # MAGIC ### Join Staging & Production Dataframes 272 | 273 | # COMMAND ---------- 274 | 275 | # Join the two dataframes on the `consumption_id` column, keeping all columns from `staging_df` and only the `prod_prediction` column from `prod_df` 276 | merged_df = staging_scores.join(prod_scores.select('consumption_id', 'prod_prediction'), 'consumption_id', 'inner').select(staging_scores.columns + [col('prod_prediction')]) 277 | 278 | # Define the column expression to extract the correct region 279 | country_col_expr = ( 280 | concat(*[ 281 | when(col(country) == 1, country).otherwise("") 282 | for country in countries 283 | ]) 284 | ) 285 | 286 | # Add a new column to the DataFrame with the concatenated region name 287 | merged_df = merged_df.withColumn("COUNTRY", country_col_expr) 288 | 289 | display(merged_df) 290 | 291 | # COMMAND ---------- 292 | 293 | from pyspark.sql.functions import year, month, col, concat, when,weekofyear 294 | # Filter to keep only the data for April 2022 295 | filtered_df = merged_df.filter((year(col('DATETIME')) == 2022) & (month(col('DATETIME')) == 4) & (weekofyear(col('DATETIME')) == 14)) 296 | 297 | # Display the filtered DataFrame 298 | display(filtered_df) 299 | 300 | # COMMAND ---------- 301 | 302 | 
display(merged_df.filter(col('Country') == 'greece')) 303 | 304 | # COMMAND ---------- 305 | 306 | # MAGIC %md 307 | # MAGIC ## Compare Staging vs Production 308 | 309 | # COMMAND ---------- 310 | 311 | # MAGIC %md 312 | # MAGIC A function named calculate_smape is defined. This function takes three arguments: 313 | # MAGIC * df: A DataFrame that contains the prediction and actual values. 314 | # MAGIC * prediction_col: The name of the column that contains the predicted values. 315 | # MAGIC * actual_col: The name of the column that contains the actual values. 316 | # MAGIC 317 | # MAGIC The function computes the SMAPE based on these input values. The SMAPE is calculated as the mean absolute difference between the predicted and actual values, divided by the average of the absolute predicted and actual values, all multiplied by 100. 318 | # MAGIC 319 | # MAGIC 1. The calculate_smape function is used to calculate the SMAPE for the staging and production models, using the respective prediction and actual values. 320 | # MAGIC 1. Based on the calculated SMAPE values, the code determines which model (staging or production) is better. The model with the lower SMAPE is considered the better one since a lower SMAPE indicates a better fit of the model. 321 | 322 | # COMMAND ---------- 323 | 324 | def calculate_smape(df, prediction_col, actual_col): 325 | from pyspark.sql.functions import abs 326 | # Calculate SMAPE using PySpark functions 327 | diff = col(prediction_col) - col(actual_col) 328 | denominator = (abs(col(prediction_col)) + abs(col(actual_col))) / 2 329 | smape = df.select(mean((abs(diff) / denominator) * 100).alias("SMAPE")).collect()[0]["SMAPE"] 330 | 331 | return smape 332 | 333 | # Calculate SMAPE for staging predictions 334 | staging_smape = calculate_smape(staging_scores, 'staging_prediction', 'HOURLY_CONSUMPTION_MW') 335 | print(f"Staging Model SMAPE: {staging_smape}%") 336 | 337 | # Calculate SMAPE for production predictions 338 | prod_smape = calculate_smape(prod_scores, 'prod_prediction', 'HOURLY_CONSUMPTION_MW') 339 | print(f"Production Model SMAPE: {prod_smape}%") 340 | 341 | # Determine which model is better based on SMAPE 342 | if staging_smape < prod_smape: 343 | print(f"Staging Model is better with a SMAPE of {staging_smape:.2f}%.") 344 | best_model = staging_uri 345 | else: 346 | print(f"Production Model is better with a SMAPE of {prod_smape:.2f}%.") 347 | best_model = prod_uri 348 | 349 | # Print the URI of the best model 350 | print(best_model) 351 | 352 | 353 | # COMMAND ---------- 354 | 355 | # MAGIC %md 356 | # MAGIC ## Transit the Best Model to Production Stage 357 | 358 | # COMMAND ---------- 359 | 360 | # MAGIC %md 361 | # MAGIC * The function initializes the MlflowClient and assigns it to the variable client. 362 | # MAGIC * It retrieves the latest version of the registered model in the Staging stage by calling the get_latest_versions method of the MlflowClient and assigns it to the variable model_version. 363 | # MAGIC * It defines the endpoint URL for sending the transition request to the Databricks MLflow API. The URL is assigned to the variable endpoint_url. 364 | # MAGIC * The stage to which the model should be transitioned is defined as 'Production'. Additionally, a comment for the transition request is set. 365 | # MAGIC * It sets the request headers to include the authorization token. 
366 | # MAGIC * It constructs the request body, which includes the version of the model to be transitioned, the model name, the desired stage, a flag indicating whether to archive existing versions in the target stage, the comment, and a flag to indicate that this is a transition request. 367 | # MAGIC * It sends a POST request to the API endpoint with the defined headers and request body. 368 | # MAGIC * Finally, it checks the status code of the response. If the status code is 200, it prints a message indicating that the model transition request was sent successfully. Otherwise, it prints an error message with the response text. 369 | 370 | # COMMAND ---------- 371 | 372 | def request_model_transition_to_production(): 373 | 374 | # Get the latest version of the registered model in the Staging stage 375 | client = mlflow.tracking.MlflowClient() 376 | model_version = client.get_latest_versions(model_name, stages=["Staging"])[0].version 377 | 378 | # Define the endpoint URL 379 | endpoint_url = f"https://{databricks_instance}/api/2.0/mlflow/transition-requests/create" 380 | 381 | stage = 'Production' #Define the stage you want your model to transit 382 | comment = "Requesting transition to Production environment after comparing models" 383 | headers = { "Authorization": "Bearer " + access_token } 384 | 385 | request_body = { 386 | "version": f"{model_version}", 387 | "name": model_name, 388 | "stage" : stage, #Specifies the environment we want to transit our model 389 | "archive_existing_versions": True, #Specifies whether to archive all current model versions in the target stage. 390 | "comment": comment, 391 | "request_transition": True 392 | } 393 | print(model_version,model_name) 394 | # Make the request 395 | response = requests.post(endpoint_url, headers=headers,json=request_body) 396 | 397 | # Check the response status code 398 | if response.status_code == 200: 399 | print("Model version transition request sent") 400 | else: 401 | print(f"Error sending transition request: {response.text}") 402 | 403 | 404 | # COMMAND ---------- 405 | 406 | # MAGIC %md 407 | # MAGIC * Initializes an MLflow Client by assigning it to the variable client. The MLflow Client provides a programmatic way to interact with an MLflow tracking server. 408 | # MAGIC * Extracts the model_name and model_version from the best_model string, which presumably holds a URI for the model. It does so by splitting the string and accessing the relevant parts. 409 | # MAGIC * Queries the current stage (e.g., Staging, Production) of the model version using the get_model_version method of the MLflow Client. It assigns this stage to the variable best_model_stage. 410 | # MAGIC * Checks if the current stage of the best model is not 'Production'. If it isn't, it calls the previously defined function request_model_transition_to_production to request transitioning this model to the Production stage. 411 | # MAGIC * If the best model is already in the Production stage, it prints "Best model is already in Production". 
412 | 413 | # COMMAND ---------- 414 | 415 | client = mlflow.tracking.MlflowClient() 416 | model_name = best_model.split('/')[1] 417 | model_version = best_model.split('/')[-1] 418 | best_model_stage = client.get_model_version(name=model_name, version=model_version).current_stage 419 | if best_model_stage != 'Production': 420 | # transit model to production 421 | request_model_transition_to_production() 422 | else: 423 | print("Best model is already in Production") 424 | 425 | 426 | # COMMAND ---------- 427 | 428 | 429 | --------------------------------------------------------------------------------