├── MLOps Pipeline ├── Utils │ ├── Images │ │ ├── newplot.png │ │ ├── newplot (1).png │ │ ├── newplot (5).png │ │ └── MLOps Architecture (1).png │ └── requirements.txt ├── Workflow Config │ ├── Great Expecations Config.py │ ├── Daily Inference.py │ └── Initial Deployment.py ├── Data Engineering │ ├── 02. Transformation │ │ ├── 01. Training Data Transformation.py │ │ └── 02. Monitoring Data Transformation.py │ ├── 03. Data Quality │ │ ├── 02. Great Expectations.py │ │ ├── 01. Data Quality Checks.py │ │ └── 02. Monitoring Data Quality Checks.py │ └── 01. Ingestion │ │ ├── 02. Monitoring Data Ingestion.py │ │ └── 01. Training Data Ingestion.py └── ML Engineering │ └── Demand Forecasting Daily │ ├── 02.Daily Inference(XGBOOST).py │ ├── 03.Daily Monitoring.py │ ├── 00. Initial Deployment │ ├── 01.Exploratory Data Analysis.py │ ├── 02.Feature Engineering.py │ ├── 04.Unit Test.py │ ├── 03.Model Training.py │ ├── 03.Model Training(Pyspark Edition).py │ └── Model Training(LSTM).py │ ├── 04.Model Retraining Monthly(pyspark edition).py │ ├── 01.Feature Engineering.py │ └── 06.Performance Evaluation.py ├── .gitignore ├── LICENSE ├── README.md └── DOCUMENTATION.md /MLOps Pipeline/Utils/Images/newplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Philippos01/mlops-energy-forecast-thesis/HEAD/MLOps Pipeline/Utils/Images/newplot.png -------------------------------------------------------------------------------- /MLOps Pipeline/Utils/Images/newplot (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Philippos01/mlops-energy-forecast-thesis/HEAD/MLOps Pipeline/Utils/Images/newplot (1).png -------------------------------------------------------------------------------- /MLOps Pipeline/Utils/Images/newplot (5).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Philippos01/mlops-energy-forecast-thesis/HEAD/MLOps Pipeline/Utils/Images/newplot (5).png -------------------------------------------------------------------------------- /MLOps Pipeline/Utils/Images/MLOps Architecture (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Philippos01/mlops-energy-forecast-thesis/HEAD/MLOps Pipeline/Utils/Images/MLOps Architecture (1).png -------------------------------------------------------------------------------- /MLOps Pipeline/Utils/requirements.txt: -------------------------------------------------------------------------------- 1 | mlflow 2 | databricks 3 | databricks-feature-store 4 | xgboost 5 | tensorflow 6 | protobuf 7 | keras 8 | pyspark 9 | matplotlib 10 | pandas 11 | scipy 12 | requests 13 | -------------------------------------------------------------------------------- /MLOps Pipeline/Workflow Config/Great Expecations Config.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %pip install great_expectations 3 | # MAGIC ! 
great_expectations --yes init 4 | # MAGIC %pip install pyyaml 5 | 6 | # COMMAND ---------- 7 | 8 | import datetime 9 | import pandas as pd 10 | import yaml 11 | from pyspark.sql.types import TimestampType,DoubleType 12 | from great_expectations.core.batch import RuntimeBatchRequest 13 | from great_expectations.core.yaml_handler import YAMLHandler 14 | from great_expectations.util import get_context 15 | from great_expectations.data_context.types.base import ( 16 | DataContextConfig, 17 | FilesystemStoreBackendDefaults, 18 | ) 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # .gitignore 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # Databricks notebook files 9 | *.dbc 10 | 11 | # Jupyter Notebook 12 | .ipynb_checkpoints 13 | 14 | # Python environments 15 | .env 16 | .venv 17 | env/ 18 | venv/ 19 | ENV/ 20 | 21 | # Pip related 22 | pip-wheel-metadata/ 23 | *.egg-info/ 24 | *.egg 25 | 26 | # Setuptools distribution folder 27 | /dist/ 28 | 29 | # Installer logs 30 | pip-log.txt 31 | pip-delete-this-directory.txt 32 | 33 | # Unit test / coverage reports 34 | htmlcov/ 35 | .tox/ 36 | .nox/ 37 | .coverage 38 | .coverage.* 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | *.cover 43 | .hypothesis/ 44 | .pytest_cache/ 45 | 46 | # Log files 47 | *.log 48 | 49 | # Data files 50 | *.csv 51 | *.xlsx 52 | *.db 53 | 54 | # Directories with large data files 55 | data/ 56 | model/ 57 | 58 | # Configuration files 59 | *.cfg 60 | *.ini 61 | 62 | # Keys and secrets 63 | *.pem 64 | *.key 65 | *.secret 66 | 67 | # OS generated files 68 | .DS_Store 69 | .DS_Store? 70 | ._* 71 | .Spotlight-V100 72 | .Trashes 73 | ehthumbs.db 74 | Thumbs.db 75 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 FILIPPOS PRIOVOLOS 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MLOps Pipeline/Workflow Config/Daily Inference.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Installations 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %pip install mlflow 8 | # MAGIC %pip install xgboost 9 | # MAGIC %pip install databricks && pip install databricks-feature-store 10 | # MAGIC #%pip install mlflow==2.4 numpy==1.22.4 protobuf==4.23.2 tensorflow==2.12.0 11 | # MAGIC 12 | 13 | # COMMAND ---------- 14 | 15 | # MAGIC %md 16 | # MAGIC ## Imports 17 | 18 | # COMMAND ---------- 19 | 20 | import pandas as pd 21 | import mlflow 22 | from databricks import feature_store 23 | from pyspark.sql.functions import col, sum, date_sub, to_date, hour,lit,add_months,date_format,expr,abs 24 | from pyspark.ml.feature import OneHotEncoder, StringIndexer 25 | from pyspark.ml import Pipeline 26 | from pyspark.ml.functions import vector_to_array 27 | import matplotlib.pyplot as plt 28 | from pyspark.sql import Row 29 | from pyspark.sql.types import DoubleType 30 | from pyspark.mllib.evaluation import RegressionMetrics 31 | 32 | # COMMAND ---------- 33 | 34 | # MAGIC %md 35 | # MAGIC ## Configuration 36 | 37 | # COMMAND ---------- 38 | 39 | countries=['belgium','denmark','france','germany','greece','italy','luxembourg','netherlands','spain','sweden','switzerland'] 40 | model_name = 'pyspark_mlflow_model' 41 | db = 'df_dev' 42 | fs = feature_store.FeatureStoreClient() 43 | 44 | # COMMAND ---------- 45 | 46 | spark.sql("USE df_dev") 47 | 48 | # COMMAND ---------- 49 | 50 | from datetime import datetime, timedelta 51 | substract_days = 162 52 | date = (datetime.today() - timedelta(days=substract_days)).strftime('%Y-%m-%d') 53 | yesterdate = (datetime.today() - timedelta(days=1) - timedelta(days=substract_days)).strftime('%Y-%m-%d') 54 | 55 | # COMMAND ---------- 56 | 57 | 58 | # Check if the row exists 59 | row_exists = spark.sql(f""" 60 | SELECT 1 61 | FROM inference_daily 62 | WHERE execution_date = '{date}' AND execution_yesterdate = '{yesterdate}' 63 | """).collect() 64 | 65 | # If row does not exist, insert it 66 | if not row_exists: 67 | spark.sql(f""" 68 | INSERT INTO inference_daily (execution_date, execution_yesterdate) 69 | VALUES ('{date}', '{yesterdate}') 70 | """) 71 | 72 | # COMMAND ---------- 73 | 74 | # Read the table 75 | df = spark.table("inference_daily") 76 | 77 | # Show the contents of the table 78 | df.show() 79 | -------------------------------------------------------------------------------- /MLOps Pipeline/Data Engineering/02. Transformation/01. Training Data Transformation.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | #%run "/Repos/CI ADO Repo/01.Develop/Workflow Config/Daily Inference" 3 | from pyspark.sql import functions as F 4 | from pyspark.sql.functions import concat, col, lit, lpad 5 | 6 | # COMMAND ---------- 7 | 8 | database= 'df_dev' 9 | 10 | # COMMAND ---------- 11 | 12 | spark.sql(f'USE {database}') 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC * Loads the data from the consumption_countries_hourly table in the df_dev database. 18 | 19 | # COMMAND ---------- 20 | 21 | df = spark.read.table('df_dev.consumption_countries_hourly') 22 | 23 | # COMMAND ---------- 24 | 25 | display(df) 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %md 30 | # MAGIC 1. 
Extracts the date and hour from the DATETIME column. 31 | # MAGIC 1. Groups the data by country, date, and hour, and sums up the hourly consumption. 32 | # MAGIC 1. Renames the summed column to HOURLY_CONSUMPTION_MW. 33 | # MAGIC 1. Constructs a new DATETIME column by concatenating the date and hour. 34 | # MAGIC 1. Converts the DATETIME column to timestamp format. 35 | # MAGIC 1. Selects and reorders the columns to match the desired schema. 36 | 37 | # COMMAND ---------- 38 | 39 | # Extract the date and hour from the start_time 40 | df = df.withColumn('date', F.to_date(df['DATETIME'])) 41 | df = df.withColumn('hour', F.hour(df['DATETIME'])) 42 | 43 | # Group by country, date and hour, and sum up the hourly consumption 44 | df_hourly = df.groupBy('COUNTRY', 'date', 'hour').sum('HOURLY_CONSUMPTION_MW') 45 | 46 | # Rename the sum column 47 | df_hourly = df_hourly.withColumnRenamed('sum(HOURLY_CONSUMPTION_MW)', 'HOURLY_CONSUMPTION_MW') 48 | 49 | # Make sure the hour is a two-digit string 50 | df_hourly = df_hourly.withColumn('hour', lpad(col('hour'), 2, '0')) 51 | 52 | # Construct a new 'DATETIME' column 53 | df_hourly = df_hourly.withColumn('DATETIME', 54 | concat(col('date'), lit(' '), col('hour'), lit(':00:00'))) 55 | 56 | # Convert 'DATETIME' to timestamp type 57 | df_hourly = df_hourly.withColumn('DATETIME', 58 | F.to_timestamp(df_hourly['DATETIME'], 'yyyy-MM-dd HH:mm:ss')) 59 | 60 | # Select and reorder the columns 61 | df_hourly = df_hourly.select('DATETIME', 'HOURLY_CONSUMPTION_MW', 'COUNTRY') 62 | 63 | # COMMAND ---------- 64 | 65 | df_hourly.count() 66 | 67 | # COMMAND ---------- 68 | 69 | display(df_hourly) 70 | 71 | # COMMAND ---------- 72 | 73 | # MAGIC %md 74 | # MAGIC * Writes the transformed DataFrame into a new table named final_consumption_countries_hourly in the df_dev database. The mode overwrite is used to replace the existing data in the table (if any). 75 | 76 | # COMMAND ---------- 77 | 78 | # Write the DataFrame into a new table 79 | df_hourly.write.format('delta').mode('overwrite').saveAsTable('df_dev.final_consumption_countries_hourly') 80 | 81 | # COMMAND ---------- 82 | 83 | 84 | -------------------------------------------------------------------------------- /MLOps Pipeline/Data Engineering/02. Transformation/02. Monitoring Data Transformation.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | #%run "/Repos/CI ADO Repo/01.Develop/Workflow Config/Daily Inference" 3 | from pyspark.sql import functions as F 4 | from pyspark.sql.functions import concat, col, lit, lpad 5 | 6 | # COMMAND ---------- 7 | 8 | database= 'db_monitor' 9 | 10 | # COMMAND ---------- 11 | 12 | spark.sql(f'USE {database}') 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC * Loads the data from the consumption_countries_hourly table in the df_dev database. 18 | 19 | # COMMAND ---------- 20 | 21 | df = spark.read.table('monitoring_consumption_countries_hourly') 22 | 23 | # COMMAND ---------- 24 | 25 | display(df) 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %md 30 | # MAGIC 1. Extracts the date and hour from the DATETIME column. 31 | # MAGIC 1. Groups the data by country, date, and hour, and sums up the hourly consumption. 32 | # MAGIC 1. Renames the summed column to HOURLY_CONSUMPTION_MW. 33 | # MAGIC 1. Constructs a new DATETIME column by concatenating the date and hour. 34 | # MAGIC 1. Converts the DATETIME column to timestamp format. 35 | # MAGIC 1. 
Selects and reorders the columns to match the desired schema. 36 | 37 | # COMMAND ---------- 38 | 39 | # Extract the date and hour from the start_time 40 | df = df.withColumn('date', F.to_date(df['DATETIME'])) 41 | df = df.withColumn('hour', F.hour(df['DATETIME'])) 42 | 43 | # Group by country, date and hour, and sum up the hourly consumption 44 | df_hourly = df.groupBy('COUNTRY', 'date', 'hour').sum('HOURLY_CONSUMPTION_MW') 45 | 46 | # Rename the sum column 47 | df_hourly = df_hourly.withColumnRenamed('sum(HOURLY_CONSUMPTION_MW)', 'HOURLY_CONSUMPTION_MW') 48 | 49 | # Make sure the hour is a two-digit string 50 | df_hourly = df_hourly.withColumn('hour', lpad(col('hour'), 2, '0')) 51 | 52 | # Construct a new 'DATETIME' column 53 | df_hourly = df_hourly.withColumn('DATETIME', 54 | concat(col('date'), lit(' '), col('hour'), lit(':00:00'))) 55 | 56 | # Convert 'DATETIME' to timestamp type 57 | df_hourly = df_hourly.withColumn('DATETIME', 58 | F.to_timestamp(df_hourly['DATETIME'], 'yyyy-MM-dd HH:mm:ss')) 59 | 60 | # Select and reorder the columns 61 | df_hourly = df_hourly.select('DATETIME', 'HOURLY_CONSUMPTION_MW', 'COUNTRY') 62 | 63 | # COMMAND ---------- 64 | 65 | df_hourly.count() 66 | 67 | # COMMAND ---------- 68 | 69 | display(df_hourly) 70 | 71 | # COMMAND ---------- 72 | 73 | # MAGIC %md 74 | # MAGIC * Writes the transformed DataFrame into a new table named final_consumption_countries_hourly in the df_dev database. The mode overwrite is used to replace the existing data in the table (if any). 75 | 76 | # COMMAND ---------- 77 | 78 | # Write the DataFrame into a new table 79 | df_hourly.write.format('delta').mode('overwrite').saveAsTable('final_monitoring_consumption_countries_hourly') 80 | 81 | # COMMAND ---------- 82 | 83 | 84 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/02.Daily Inference(XGBOOST).py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Configuration 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Daily Inference" 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ## Configuration 13 | 14 | # COMMAND ---------- 15 | 16 | input_table = 'hourly_forecasting_features' 17 | output_table = 'predictions_xgb' 18 | 19 | # COMMAND ---------- 20 | 21 | # MAGIC %md 22 | # MAGIC ## Load Inference Input 23 | 24 | # COMMAND ---------- 25 | 26 | inference_df = spark.sql(f"SELECT CONSUMPTION_ID, DATETIME FROM {db}.{input_table} WHERE DATETIME BETWEEN '{date} 00:00:00' AND '{date} 23:00:00'") 27 | #inference_data = inference_df.drop("CONSUMPTION_ID","DATETIME") 28 | display(inference_df) 29 | 30 | # COMMAND ---------- 31 | 32 | # MAGIC %md 33 | # MAGIC ## Model's Prediction 34 | 35 | # COMMAND ---------- 36 | 37 | client=mlflow.tracking.MlflowClient() 38 | latest_version= client.get_latest_versions(model_name,stages=['Production'])[0].version 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md 43 | # MAGIC * The following code performs batch scoring on the inference_df(which is the future date we want to predict), using the latest model deployed in Production 44 | 45 | # COMMAND ---------- 46 | 47 | results = fs.score_batch( 48 | f"models:/{model_name}/{latest_version}", 49 | inference_df, 50 | result_type="float", 51 | ) 52 | display(results) 53 | 54 | # COMMAND ---------- 55 | 56 | 
greece_predictions = results.filter(results["greece"] == 1).select("prediction","HOUR") 57 | greece_predictions.display() 58 | 59 | # COMMAND ---------- 60 | 61 | # MAGIC %md 62 | # MAGIC ## Store Results 63 | 64 | # COMMAND ---------- 65 | 66 | # MAGIC %md 67 | # MAGIC * It selects relevant columns from the initial results and converts them into a Pandas DataFrame for easy manipulation. 68 | # MAGIC * It renames and creates new columns, including predicted consumption, country extracted from the consumption ID, and placeholders for actual consumption and residuals. 69 | # MAGIC * The Pandas DataFrame is converted back to a Spark DataFrame with a specific selection of columns. 70 | # MAGIC * Data types for certain columns are cast to float. 71 | # MAGIC * Finally, the data is registered as a temporary SQL view named 'Inference_Output', allowing for SQL-based analysis and querying. 72 | 73 | # COMMAND ---------- 74 | 75 | df = results.select(['CONSUMPTION_ID', 'DATETIME', 'prediction']).toPandas() 76 | df.rename(columns={'prediction': 'PREDICTED_CONSUMPTION'}, inplace=True) 77 | df['DATETIME'] = df.DATETIME.astype(str) 78 | df['COUNTRY'] = df['CONSUMPTION_ID'].apply(lambda x: x.split('_')[0]) 79 | df['ACTUAL_CONSUMPTION'] = None 80 | df['RESIDUAL'] = None 81 | df['MODEL_USED'] = f"models:/{model_name}/{latest_version}" 82 | output_cols = ['DATETIME', 'COUNTRY', 'PREDICTED_CONSUMPTION', 'ACTUAL_CONSUMPTION', 'RESIDUAL', 'MODEL_USED'] 83 | output_df = spark.createDataFrame(df[output_cols]) 84 | output_df.withColumn('ACTUAL_CONSUMPTION', col('ACTUAL_CONSUMPTION').cast('float'))\ 85 | .withColumn('RESIDUAL', col('RESIDUAL').cast('float'))\ 86 | .createOrReplaceTempView('Inference_Output') 87 | 88 | # COMMAND ---------- 89 | 90 | # MAGIC %md 91 | # MAGIC * It prepares the list of columns to be inserted or updated. 92 | # MAGIC * It uses Spark SQL to merge data from a temporary view Inference_Output into a target table. 93 | # MAGIC * If a record with matching DATETIME and COUNTRY is found, it updates the existing record in the target table with the new data. 94 | # MAGIC * If no matching record is found, it inserts the new data as a new record in the target table. 
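The MERGE statement that follows can also be expressed through the Delta Lake Python API rather than a SQL string. A minimal sketch (not the notebook's code), assuming `df_dev.predictions_xgb` is a Delta table and the `delta` package is importable on the cluster:

```python
# Minimal sketch (not the notebook's code): the same upsert via the Delta Lake Python API.
# Assumes the target table is a Delta table and `db`, `output_table`, `output_cols`
# are defined as in the cells above.
from delta.tables import DeltaTable

source = spark.table("Inference_Output")                 # temp view created above
target = DeltaTable.forName(spark, f"{db}.{output_table}")

(
    target.alias("A")
    .merge(source.alias("B"), "A.DATETIME = B.DATETIME AND A.COUNTRY = B.COUNTRY")
    .whenMatchedUpdate(set={c: f"B.{c}" for c in output_cols})
    .whenNotMatchedInsert(values={c: f"B.{c}" for c in output_cols})
    .execute()
)
```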
95 | 96 | # COMMAND ---------- 97 | 98 | insert_columns = [f"B.{col}" for col in output_cols] 99 | update_columns = [f"{col}=B.{col}" for col in output_cols] 100 | spark.sql(f""" 101 | MERGE INTO {db}.{output_table} A 102 | USING Inference_Output B 103 | ON A.DATETIME = B.DATETIME AND A.COUNTRY = B.COUNTRY 104 | WHEN MATCHED THEN 105 | UPDATE SET 106 | {', '.join(update_columns)} 107 | WHEN NOT MATCHED 108 | THEN INSERT ( 109 | {', '.join(output_cols)} 110 | ) VALUES ( 111 | {', '.join(insert_columns)} 112 | ) 113 | """) 114 | 115 | # COMMAND ---------- 116 | 117 | 118 | -------------------------------------------------------------------------------- /MLOps Pipeline/Workflow Config/Initial Deployment.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Installations 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %pip install mlflow 8 | # MAGIC %pip install databricks && pip install databricks-feature-store 9 | # MAGIC %pip install xgboost 10 | # MAGIC %pip install tensorflow 11 | # MAGIC %pip install protobuf 12 | # MAGIC %pip install mlflow keras 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC ## Imports 18 | 19 | # COMMAND ---------- 20 | 21 | from pyspark.sql import SparkSession 22 | from pyspark.sql.functions import col,concat, when, lit, to_date, date_sub, max as max_, rand, lpad, concat_ws,sum,mean 23 | from pyspark.ml.feature import VectorAssembler, VectorIndexer,OneHotEncoder, StringIndexer 24 | from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 25 | from pyspark.ml import Pipeline 26 | from pyspark.ml.evaluation import RegressionEvaluator 27 | from pyspark.sql.types import DoubleType, TimestampType, DateType 28 | from databricks import feature_store 29 | from databricks.feature_store import feature_table, FeatureLookup 30 | import mlflow 31 | from mlflow.tracking import MlflowClient 32 | import mlflow.keras 33 | import mlflow.sklearn 34 | import mlflow.models.signature as sch 35 | from mlflow.models.signature import ModelSignature 36 | from mlflow.types.schema import Schema, ColSpec 37 | import matplotlib.pyplot as plt 38 | import pandas as pd 39 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 40 | from scipy.stats import ks_2samp 41 | import scipy.stats as stats 42 | from xgboost import plot_importance, plot_tree, XGBRegressor 43 | from xgboost.spark import SparkXGBRegressor 44 | from datetime import datetime, timedelta 45 | from dateutil.relativedelta import relativedelta 46 | import time 47 | import unittest 48 | import requests 49 | import json 50 | 51 | # COMMAND ---------- 52 | 53 | # MAGIC %md 54 | # MAGIC ## Configuration 55 | 56 | # COMMAND ---------- 57 | 58 | train_start = '2015-01-01' 59 | train_end = '2021-12-31' 60 | test_start = '2022-01-01' 61 | test_end = '2023-01-01' 62 | db = 'df_dev' 63 | feauture_store = 'hourly_forecasting_features' 64 | consumption_countries_hourly ='final_consumption_countries_hourly' 65 | model_name = 'pyspark_mlflow_model' 66 | access_token = 'dapie24d3f30586ca9b17dbd6d28ce208086-2' 67 | databricks_instance = 'adb-8855338042472349.9.azuredatabricks.net' 68 | countries = ["belgium", "denmark", "france", "germany", "greece", "italy", "luxembourg", "netherlands", "spain", "sweden","switzerland"] #new 69 | experiment_id_training = '3578670731332255' 70 | experiment_id_retraining = '3578670731332164' 71 | fs = feature_store.FeatureStoreClient() 72 | pip_requirements = ["pyspark==3.4.0", 
"mlflow==2.3.2", "xgboost==1.7.5"] 73 | user = 'filippos.priovolos01@gmail.com' 74 | 75 | # COMMAND ---------- 76 | 77 | # MAGIC %md 78 | # MAGIC ## Schema 79 | 80 | # COMMAND ---------- 81 | 82 | input_schema = Schema([ 83 | ColSpec("integer", "belgium"), 84 | ColSpec("integer", "denmark"), 85 | ColSpec("integer", "france"), 86 | ColSpec("integer", "germany"), 87 | ColSpec("integer", "greece"), 88 | ColSpec("integer", "italy"), 89 | ColSpec("integer", "luxembourg"), 90 | ColSpec("integer", "netherlands"), 91 | ColSpec("integer", "spain"), 92 | ColSpec("integer", "sweden"), 93 | ColSpec("integer", "switzerland"), 94 | ColSpec("integer", "HOUR"), 95 | ColSpec("integer", "DAY_OF_WEEK"), 96 | ColSpec("integer", "MONTH"), 97 | ColSpec("integer", "QUARTER"), 98 | ColSpec("integer", "YEAR"), 99 | ColSpec("integer", "DAY_OF_YEAR"), 100 | ColSpec("integer", "DAY_OF_MONTH"), 101 | ColSpec("integer", "WEEK_OF_YEAR"), 102 | ColSpec("double", "ROLLING_MEAN_24H"), 103 | ColSpec("double", "ROLLING_STD_24H"), 104 | ColSpec("double", "ROLLING_SUM_7D"), 105 | ColSpec("double", "PREV_DAY_CONSUMPTION"), 106 | ColSpec("double", "PREV_WEEK_CONSUMPTION"), 107 | ColSpec("double", "PREVIOUS_MONTH_CONSUMPTION") 108 | ]) 109 | 110 | output_schema = Schema([ColSpec("double", "HOURLY_CONSUMPTION_MW")]) 111 | 112 | # COMMAND ---------- 113 | 114 | # MAGIC %md 115 | # MAGIC ## Model Signature 116 | 117 | # COMMAND ---------- 118 | 119 | # Create a model signature from the input and output schemas 120 | signature = ModelSignature(inputs=input_schema, outputs=output_schema) 121 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/03.Daily Monitoring.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Daily Inference" 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC * The code uses Apache Spark SQL to query and manipulate data. 8 | # MAGIC * It first selects data from a table for a specific date and casts the DATETIME column to a string, storing the result in a temporary view called 'daily_features'. 9 | # MAGIC * It then performs a merge operation between a target table 'predictions_xgb' and the temporary view. 10 | # MAGIC * For rows that have matching DATETIME and COUNTRY in both the target table and temporary view, it updates the RESIDUAL and ACTUAL_CONSUMPTION columns in the target table based on the data in the temporary view. 
11 | 12 | # COMMAND ---------- 13 | 14 | spark.sql(f"""SELECT *, CAST(DATETIME AS STRING) AS STR_DATETIME 15 | FROM db_monitor.final_monitoring_consumption_countries_hourly 16 | WHERE DATETIME >= '{date} 00:00' AND DATETIME <= '{date} 23:59' """).createOrReplaceTempView('daily_features') 17 | 18 | spark.sql(f""" 19 | MERGE INTO df_dev.predictions_xgb A 20 | USING daily_features B 21 | ON A.DATETIME = B.STR_DATETIME AND A.COUNTRY = B.COUNTRY 22 | WHEN MATCHED THEN 23 | UPDATE SET 24 | A.RESIDUAL = A.PREDICTED_CONSUMPTION - B.HOURLY_CONSUMPTION_MW, 25 | A.ACTUAL_CONSUMPTION = B.HOURLY_CONSUMPTION_MW 26 | """) 27 | 28 | 29 | # COMMAND ---------- 30 | 31 | df = spark.sql(f"SELECT * FROM df_dev.predictions_xgb WHERE DATETIME >= '{date} 00:00' AND DATETIME <= '{date} 23:59' ") 32 | 33 | # Convert the data types of ACTUAL_CONSUMPTION and PREDICTED_CONSUMPTION columns to DoubleType 34 | df = df.withColumn('ACTUAL_CONSUMPTION', col('ACTUAL_CONSUMPTION').cast(DoubleType())) 35 | df = df.withColumn('PREDICTED_CONSUMPTION', col('PREDICTED_CONSUMPTION').cast(DoubleType())) 36 | 37 | valuesAndPreds = df.select(['ACTUAL_CONSUMPTION', 'PREDICTED_CONSUMPTION']) 38 | valuesAndPreds = valuesAndPreds.rdd.map(tuple) 39 | 40 | metrics = RegressionMetrics(valuesAndPreds) 41 | 42 | # Squared Error 43 | print("MSE = %s" % metrics.meanSquaredError) 44 | print("RMSE = %s" % metrics.rootMeanSquaredError) 45 | 46 | # Mean absolute error 47 | print("MAE = %s" % metrics.meanAbsoluteError) 48 | 49 | 50 | # COMMAND ---------- 51 | 52 | # Calculate the percentage difference by dividing the difference by the absolute value of actual consumption 53 | df = df.withColumn('PERCENTAGE_DIFFERENCE', (col('RESIDUAL') / abs(col('ACTUAL_CONSUMPTION'))) * 100) 54 | 55 | # Calculate the absolute value of the percentage difference 56 | df = df.withColumn('ABS_PERCENTAGE_DIFFERENCE', abs(col('PERCENTAGE_DIFFERENCE'))) 57 | 58 | # Calculate the average absolute percentage difference 59 | average_absolute_percentage_difference = df.selectExpr('avg(ABS_PERCENTAGE_DIFFERENCE)').collect()[0][0] 60 | 61 | # Calculate the average percentage difference 62 | average_percentage_difference = df.selectExpr('avg(PERCENTAGE_DIFFERENCE)').collect()[0][0] 63 | 64 | display(df) 65 | # Print the average percentage difference 66 | print('Average Percentage Difference:', average_percentage_difference) 67 | # Print the average absolute percentage difference 68 | print('Average Absolute Percentage Difference:', average_absolute_percentage_difference) 69 | 70 | # COMMAND ---------- 71 | 72 | display(df.filter(df['COUNTRY'] == 'greece')) 73 | 74 | # COMMAND ---------- 75 | 76 | # MAGIC %md 77 | # MAGIC ## Save Inference Data to main table 78 | 79 | # COMMAND ---------- 80 | 81 | # Retrieve the predictions_xgb DataFrame using the table name 82 | predictions_xgb = spark.table('df_dev.predictions_xgb') 83 | 84 | # Select the columns from the first table and cast appropriate columns to match the second table's schema 85 | merged_df = predictions_xgb.select( 86 | col('DATETIME').cast('timestamp').alias('DATETIME'), 87 | col('COUNTRY'), 88 | col('PREDICTED_CONSUMPTION').cast(DoubleType()).alias('HOURLY_CONSUMPTION_MW') 89 | ) 90 | 91 | # Perform a merge operation to insert new records into the second table if they don't already exist 92 | merged_df.createOrReplaceTempView('temp_table') 93 | 94 | spark.sql(""" 95 | MERGE INTO df_dev.final_consumption_countries_hourly AS target 96 | USING temp_table AS source 97 | ON target.DATETIME = source.DATETIME AND target.COUNTRY 
= source.COUNTRY 98 | WHEN NOT MATCHED THEN 99 | INSERT (DATETIME, HOURLY_CONSUMPTION_MW, COUNTRY) 100 | VALUES (source.DATETIME, source.HOURLY_CONSUMPTION_MW, source.COUNTRY) 101 | """) 102 | 103 | # COMMAND ---------- 104 | 105 | 106 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/00. Initial Deployment/01.Exploratory Data Analysis.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | !pip install statsmodels 3 | from statsmodels.tsa.seasonal import seasonal_decompose 4 | from pyspark.sql.functions import count, when, isnull, col 5 | from pyspark.sql import functions as F 6 | import plotly.subplots as sp 7 | import plotly.graph_objects as go 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ## Univariate Analysis 13 | 14 | # COMMAND ---------- 15 | 16 | spark.sql('USE df_dev') 17 | df = spark.read.table('final_consumption_countries_hourly') 18 | 19 | # COMMAND ---------- 20 | 21 | # MAGIC %md 22 | # MAGIC * Distribution of records across years, months, days and hours: 23 | 24 | # COMMAND ---------- 25 | 26 | display(df.withColumn('year', F.year('DATETIME')).groupBy('year').count()) 27 | display(df.withColumn('month', F.month('DATETIME')).groupBy('month').count()) 28 | display(df.withColumn('day', F.dayofweek('DATETIME')).groupBy('day').count()) 29 | display(df.withColumn('hour', F.hour('DATETIME')).groupBy('hour').count()) 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md 34 | # MAGIC * Frequency of records for each country 35 | 36 | # COMMAND ---------- 37 | 38 | df.groupBy('COUNTRY').count().show() 39 | 40 | 41 | # COMMAND ---------- 42 | 43 | # MAGIC %md 44 | # MAGIC ## Bivariate Analysis 45 | 46 | # COMMAND ---------- 47 | 48 | # MAGIC %md 49 | # MAGIC * Average hourly consumption per country 50 | 51 | # COMMAND ---------- 52 | 53 | df.groupBy('COUNTRY').agg(F.avg('HOURLY_CONSUMPTION_MW').alias('avg_consumption')).show() 54 | 55 | # COMMAND ---------- 56 | 57 | # MAGIC %md 58 | # MAGIC * Monthly consumption trends per country 59 | 60 | # COMMAND ---------- 61 | 62 | df.withColumn('year', F.year('DATETIME')) \ 63 | .withColumn('month', F.month('DATETIME')) \ 64 | .groupBy('year', 'month', 'COUNTRY') \ 65 | .agg(F.sum('HOURLY_CONSUMPTION_MW').alias('total_consumption')) \ 66 | .orderBy('year', 'month') \ 67 | .show() 68 | 69 | 70 | # COMMAND ---------- 71 | 72 | # MAGIC %md 73 | # MAGIC * Heatmap: Average hourly consumption for each country by hour of the day or by month of the year 74 | 75 | # COMMAND ---------- 76 | 77 | import plotly.graph_objects as go 78 | 79 | # Convert the DataFrame to a 2D list for Plotly 80 | heatmap_data = df_heatmap.values.tolist() 81 | 82 | # Create the heatmap 83 | fig = go.Figure(data=go.Heatmap( 84 | z=heatmap_data, 85 | x=df_heatmap.columns.tolist(), 86 | y=df_heatmap.index.tolist(), 87 | colorscale='RdBu_r', # you can change this to other color scales 88 | )) 89 | 90 | # Set the layout 91 | fig.update_layout( 92 | title='Average Hourly Consumption by Country and Hour of Day', 93 | xaxis_title='Hour of Day', 94 | yaxis_title='Country', 95 | ) 96 | 97 | # Display the figure 98 | fig.show() 99 | 100 | 101 | # COMMAND ---------- 102 | 103 | pandas_df = df.toPandas() 104 | 105 | # COMMAND ---------- 106 | 107 | # MAGIC %md 108 | # MAGIC The decompose_country function takes a DataFrame df and a country name as inputs. 
It performs a time series decomposition on the 'HOURLY_CONSUMPTION_MW' column of the DataFrame for the specified country. 109 | # MAGIC 110 | # MAGIC 1. It filters the DataFrame to include data only for the specified country. 111 | # MAGIC 1. The data is sorted by date. 112 | # MAGIC 1. The date column is set as the index. 113 | # MAGIC 1. The data is resampled to a chosen frequency (monthly in this case). 114 | # MAGIC 1. Any missing values are filled using forward filling. 115 | # MAGIC 1. The seasonal decomposition is performed using an additive model. 116 | # MAGIC 1. The trend, seasonality, and residuals components are extracted. 117 | # MAGIC 1. Subplots are created for the original data, trend, seasonality, and residuals. 118 | # MAGIC 1. Traces are added to the subplots to visualize the components. 119 | # MAGIC 1. The plot layout is updated with appropriate dimensions and a title. 120 | # MAGIC 1. The plot is displayed. 121 | # MAGIC 122 | # MAGIC By calling the decompose_country function with a DataFrame and a country name, the code generates a plot showing the original data, trend, seasonality, and residuals components of the time series for that country. 123 | 124 | # COMMAND ---------- 125 | 126 | def decompose_country(df, country): 127 | # Filter data for the specified country 128 | df_country = df[df['COUNTRY'] == country] 129 | 130 | # Ensure the data is sorted by date 131 | df_country = df_country.sort_values('DATETIME') 132 | 133 | # Set the date as the index 134 | df_country.set_index('DATETIME', inplace=True) 135 | 136 | # Resample to hourly data, you can choose different frequency according to your data 137 | df_country = df_country.resample('M').asfreq() 138 | 139 | # Forward fill to handle the newly created NaNs 140 | df_country = df_country.bfill() 141 | 142 | # Perform the decomposition 143 | decomposition = seasonal_decompose(df_country['HOURLY_CONSUMPTION_MW'], model='additive') 144 | 145 | # Get the trend, seasonality and residuals 146 | trend = decomposition.trend 147 | seasonal = decomposition.seasonal 148 | residual = decomposition.resid 149 | 150 | # Create subplots: 4 rows, 1 column 151 | fig = sp.make_subplots(rows=4, cols=1) 152 | 153 | # Add traces 154 | fig.add_trace(go.Scatter(x=df_country.index, y=df_country['HOURLY_CONSUMPTION_MW'], mode='lines', name='Original'), row=1, col=1) 155 | fig.add_trace(go.Scatter(x=trend.index, y=trend, mode='lines', name='Trend'), row=2, col=1) 156 | fig.add_trace(go.Scatter(x=seasonal.index, y=seasonal, mode='lines', name='Seasonality'), row=3, col=1) 157 | fig.add_trace(go.Scatter(x=residual.index, y=residual, mode='lines', name='Residuals'), row=4, col=1) 158 | 159 | # Update layout 160 | fig.update_layout(height=800, width=1000, title_text="Decomposition for " + country, showlegend=True) 161 | 162 | # Render the plot 163 | fig.show() 164 | 165 | decompose_country(pandas_df, 'greece') 166 | 167 | 168 | # COMMAND ---------- 169 | 170 | 171 | -------------------------------------------------------------------------------- /MLOps Pipeline/Data Engineering/03. Data Quality/02. 
Great Expectations.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC Note: You can find the official documentation of great-expectations [here](https://docs.greatexpectations.io/docs/deployment_patterns/how_to_use_great_expectations_in_databricks/) 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC ## Install Great Expectations 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %run "/Users/filippos.priovolos01@gmail.com/Workflow Config/Great Expecations Config" 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC ## Set up Great Expectations 18 | 19 | # COMMAND ---------- 20 | 21 | root_directory = "/dbfs/great_expectations/" 22 | data_context_config = DataContextConfig( 23 | store_backend_defaults=FilesystemStoreBackendDefaults( 24 | root_directory=root_directory 25 | ), 26 | ) 27 | context = get_context(project_config=data_context_config) 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md 32 | # MAGIC ## Prepare data 33 | 34 | # COMMAND ---------- 35 | 36 | df = spark.read.format("delta") \ 37 | .option("header", "true") \ 38 | .option("inferSchema", "true") \ 39 | .table("df_dev.final_consumption_countries_hourly") 40 | 41 | 42 | # COMMAND ---------- 43 | 44 | display(df) 45 | 46 | # COMMAND ---------- 47 | 48 | # Sort the DataFrame by country and datetime 49 | df_sorted = df.orderBy("COUNTRY","DATETIME") 50 | display(df_sorted) 51 | 52 | # COMMAND ---------- 53 | 54 | # MAGIC %md 55 | # MAGIC ## Connect to the data 56 | 57 | # COMMAND ---------- 58 | 59 | my_spark_datasource_config = { 60 | "name": "delta_datasource", 61 | "class_name": "Datasource", 62 | "execution_engine": {"class_name": "SparkDFExecutionEngine"}, 63 | "data_connectors": { 64 | "delta_connector": { 65 | "module_name": "great_expectations.datasource.data_connector", 66 | "class_name": "RuntimeDataConnector", 67 | "batch_identifiers": [ 68 | "prod", 69 | "run_id1", 70 | ], 71 | } 72 | }, 73 | } 74 | 75 | 76 | # COMMAND ---------- 77 | 78 | context.test_yaml_config(yaml.dump(my_spark_datasource_config)) 79 | 80 | # COMMAND ---------- 81 | 82 | context.add_datasource(**my_spark_datasource_config) 83 | 84 | # COMMAND ---------- 85 | 86 | batch_request = RuntimeBatchRequest( 87 | datasource_name="delta_datasource", 88 | data_connector_name="delta_connector", 89 | data_asset_name="my_data_asset_name", 90 | batch_identifiers={ 91 | "prod": "my_production_data", 92 | "run_id1": f"my_run_id{datetime.date.today().strftime('%Y%m%d')}", 93 | }, 94 | runtime_parameters={"batch_data": df}, 95 | ) 96 | 97 | 98 | # COMMAND ---------- 99 | 100 | expectation_suite_name = "my_expectation_suite" 101 | context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name) 102 | validator = context.get_validator( 103 | batch_request=batch_request, 104 | expectation_suite_name=expectation_suite_name, 105 | ) 106 | 107 | print(validator.head()) 108 | 109 | # COMMAND ---------- 110 | 111 | from datetime import datetime 112 | 113 | # Define Expectations for the columns 114 | validator.expect_column_values_to_not_be_null("DATETIME") 115 | validator.expect_column_values_to_not_be_null("HOURLY_CONSUMPTION_MW") 116 | validator.expect_column_values_to_not_be_null("COUNTRY") 117 | 118 | validator.expect_column_values_to_be_of_type("DATETIME", "TimestampType") 119 | validator.expect_column_values_to_be_of_type("HOURLY_CONSUMPTION_MW", "DoubleType") 120 | validator.expect_column_values_to_be_of_type("COUNTRY", 
"StringType") 121 | 122 | validator.expect_column_values_to_be_in_set("COUNTRY", ["belgium", "denmark", "france", "germany", "greece", "italy", "luxembourg", "netherlands", "spain", "sweden", "switzerland"]) 123 | 124 | validator.expect_column_values_to_be_between("HOURLY_CONSUMPTION_MW", min_value=0) 125 | 126 | # This expectation checks if the mean of the HOURLY_CONSUMPTION_MW is within a certain range. Please adjust the min_value and max_value according to your data. 127 | validator.expect_column_mean_to_be_between("HOURLY_CONSUMPTION_MW", min_value=25000, max_value=50000) 128 | 129 | # This expectation checks if the median of the HOURLY_CONSUMPTION_MW is within a certain range. Please adjust the min_value and max_value according to your data. 130 | validator.expect_column_median_to_be_between("HOURLY_CONSUMPTION_MW", min_value=20000, max_value=35000) 131 | 132 | # This expectation checks if the standard deviation of the HOURLY_CONSUMPTION_MW is within a certain range. Please adjust the min_value and max_value according to your data. 133 | validator.expect_column_stdev_to_be_between("HOURLY_CONSUMPTION_MW", min_value=40000, max_value=70000) 134 | 135 | # Check if timestamps are in the correct range 136 | start_date = datetime(2015, 1, 1) 137 | end_date = datetime(2023, 1, 1) 138 | validator.expect_column_values_to_be_between("DATETIME", min_value=start_date, max_value=end_date) 139 | 140 | 141 | 142 | # COMMAND ---------- 143 | 144 | validator.save_expectation_suite(discard_failed_expectations=False) 145 | 146 | # COMMAND ---------- 147 | 148 | # MAGIC %md 149 | # MAGIC ## Validate data 150 | 151 | # COMMAND ---------- 152 | 153 | my_checkpoint_name = "my_data_validation_checkpoint" 154 | checkpoint_config = { 155 | "name": my_checkpoint_name, 156 | "config_version": 1.0, 157 | "class_name": "SimpleCheckpoint", 158 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 159 | } 160 | 161 | 162 | # COMMAND ---------- 163 | 164 | my_checkpoint = context.test_yaml_config(yaml.dump(checkpoint_config)) 165 | 166 | # COMMAND ---------- 167 | 168 | context.add_or_update_checkpoint(**checkpoint_config) 169 | 170 | # COMMAND ---------- 171 | 172 | checkpoint_result = context.run_checkpoint( 173 | checkpoint_name=my_checkpoint_name, 174 | validations=[ 175 | { 176 | "batch_request": batch_request, 177 | "expectation_suite_name": expectation_suite_name, 178 | } 179 | ], 180 | ) 181 | 182 | # COMMAND ---------- 183 | 184 | # MAGIC %md 185 | # MAGIC ## Build and view Data Docs 186 | 187 | # COMMAND ---------- 188 | 189 | html = '/dbfs/great_expectations/uncommitted/data_docs/local_site/index.html' 190 | with open(html, "r") as f: 191 | data = "".join([l for l in f]) 192 | displayHTML(data) 193 | 194 | # COMMAND ---------- 195 | 196 | 197 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/00. 
Initial Deployment/02.Feature Engineering.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Initial Deployment" 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC ## Configuration 8 | 9 | # COMMAND ---------- 10 | 11 | input_table_name = 'final_consumption_countries_hourly' 12 | output_table_name = 'hourly_forecasting_features' 13 | save_into_feature_store = True 14 | delete_fs = False 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %md 19 | # MAGIC ## Load Dataset 20 | 21 | # COMMAND ---------- 22 | 23 | table = spark.table(f'{db}.{input_table_name}') 24 | table.describe().show() 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md 29 | # MAGIC ## One-Hot-Encoding of Categorical Columns (Countries) 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md 34 | # MAGIC The create_country_features function adds binary country-specific features to a DataFrame. It iterates over distinct country values in the 'COUNTRY' column, creates a new column for each country, and assigns a value of 1 if the row corresponds to that country, and 0 otherwise. The updated DataFrame with the added features is returned and displayed using the display function. 35 | 36 | # COMMAND ---------- 37 | 38 | from pyspark.sql import functions as F 39 | def create_country_features(df): 40 | # for col in df.columns: 41 | countries = [row['COUNTRY'] for row in df.select('COUNTRY').distinct().collect()] 42 | countries.sort() 43 | for country in countries: 44 | df = df.withColumn("{}".format(country), F.when((df['COUNTRY'] == country), 1).otherwise(0)) 45 | return df 46 | 47 | features = create_country_features(table) 48 | display(features) 49 | 50 | # COMMAND ---------- 51 | 52 | # MAGIC %md 53 | # MAGIC ## Create Features for Model Training 54 | 55 | # COMMAND ---------- 56 | 57 | # MAGIC %md 58 | # MAGIC 1. Preprocessing and Sorting 59 | # MAGIC * Convert the 'DATETIME' column to datetime format. 60 | # MAGIC * Set this converted column as the index of the DataFrame. 61 | # MAGIC * Sort the DataFrame by 'COUNTRY' and 'DATETIME' columns. 62 | # MAGIC 2. Extracting Date Features 63 | # MAGIC * Create new columns for various date components: 'HOUR', 'DAY_OF_WEEK', 'MONTH', 'QUARTER', 'YEAR', 'DAY_OF_YEAR', 'DAY_OF_MONTH', and 'WEEK_OF_YEAR'. 64 | # MAGIC 3. Calculate Rolling Statistics & Lagged Features 65 | # MAGIC * For each country, calculate rolling mean, rolling standard deviation, and rolling sum of the 'HOURLY_CONSUMPTION_MW' over specific windows (24 hours and 7 days). 66 | # MAGIC * Create lagged features for 'HOURLY_CONSUMPTION_MW' such as the consumption of the previous day, previous week, and previous month. 67 | # MAGIC 4. Handling Null Values 68 | # MAGIC * Backward fill the null values generated due to shifting (lagged features) and rolling operations. 69 | # MAGIC 5. Drop Original Consumption Column 70 | # MAGIC * Drop the 'HOURLY_CONSUMPTION_MW' column as we have generated statistical features from it. 71 | # MAGIC 6. Return the Modified DataFrame 72 | # MAGIC * The function returns the DataFrame with the newly created features. 
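To make the rolling and lagged features concrete, here is a toy pandas illustration on synthetic hourly data (hypothetical values, not the project's dataset); the `create_features` function below applies the same idea per country:

```python
# Toy illustration with synthetic data (not the project's dataset):
# what rolling(24) and shift(24) produce on an hourly series.
import numpy as np
import pandas as pd

idx = pd.date_range("2022-01-01", periods=72, freq="H")  # three days of hourly timestamps
toy = pd.DataFrame(
    {"HOURLY_CONSUMPTION_MW": np.random.default_rng(0).uniform(20_000, 30_000, len(idx))},
    index=idx,
)

toy["ROLLING_MEAN_24H"] = toy["HOURLY_CONSUMPTION_MW"].rolling(window=24).mean()
toy["PREV_DAY_CONSUMPTION"] = toy["HOURLY_CONSUMPTION_MW"].shift(24)

# The first 23 rolling values and the first 24 lagged values are NaN;
# the pipeline backfills these, as in create_features below.
print(toy.iloc[20:28])
```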
73 | 74 | # COMMAND ---------- 75 | 76 | def create_features(df): 77 | """ 78 | Creates time series features from datetime index in order to save them in Features Store 79 | """ 80 | # Convert 'DATETIME' column to datetime format and set it as the index 81 | df['DATETIME'] = pd.to_datetime(df['DATETIME']) 82 | df.set_index('DATETIME', inplace=True) 83 | df.sort_values(['COUNTRY', 'DATETIME'], inplace=True) 84 | 85 | # Extract date-related features 86 | df['HOUR'] = df.index.hour 87 | df['DAY_OF_WEEK'] = df.index.dayofweek 88 | df['MONTH'] = df.index.month 89 | df['QUARTER'] = df.index.quarter 90 | df['YEAR'] = df.index.year 91 | df['DAY_OF_YEAR'] = df.index.dayofyear 92 | df['DAY_OF_MONTH'] = df.index.day 93 | df['WEEK_OF_YEAR'] = df.index.isocalendar().week 94 | 95 | # Calculate rolling statistics and lagged features for each country 96 | for country in df['COUNTRY'].unique(): 97 | df.loc[df['COUNTRY'] == country, 'ROLLING_MEAN_24H'] = df.loc[df['COUNTRY'] == country, 'HOURLY_CONSUMPTION_MW'].rolling(window=24).mean() 98 | df.loc[df['COUNTRY'] == country, 'ROLLING_STD_24H'] = df.loc[df['COUNTRY'] == country, 'HOURLY_CONSUMPTION_MW'].rolling(window=24).std() 99 | df.loc[df['COUNTRY'] == country, 'ROLLING_SUM_7D'] = df.loc[df['COUNTRY'] == country, 'HOURLY_CONSUMPTION_MW'].rolling(window=7 * 24, min_periods=1).sum() 100 | df.loc[df['COUNTRY'] == country, 'PREV_DAY_CONSUMPTION'] = df.loc[df['COUNTRY'] == country, 'HOURLY_CONSUMPTION_MW'].shift(24) 101 | df.loc[df['COUNTRY'] == country, 'PREV_WEEK_CONSUMPTION'] = df.loc[df['COUNTRY'] == country, 'HOURLY_CONSUMPTION_MW'].shift(24 * 7) 102 | df.loc[df['COUNTRY'] == country, 'PREVIOUS_MONTH_CONSUMPTION'] = df.loc[df['COUNTRY'] == country, 'HOURLY_CONSUMPTION_MW'].shift(24*30) 103 | 104 | # Backward fill only the rows that end up as null after shifting 105 | df['PREV_DAY_CONSUMPTION'] = df['PREV_DAY_CONSUMPTION'].fillna(method='bfill') 106 | df['PREV_WEEK_CONSUMPTION'] = df['PREV_WEEK_CONSUMPTION'].fillna(method='bfill') 107 | df['PREVIOUS_MONTH_CONSUMPTION'] = df['PREVIOUS_MONTH_CONSUMPTION'].fillna(method='bfill') 108 | df['ROLLING_MEAN_24H'] = df['ROLLING_MEAN_24H'].fillna(method='bfill') 109 | df['ROLLING_STD_24H'] = df['ROLLING_STD_24H'].fillna(method='bfill') 110 | 111 | df = df.drop('HOURLY_CONSUMPTION_MW',axis=1) 112 | 113 | return df 114 | 115 | 116 | # COMMAND ---------- 117 | 118 | # Convert features df from spark to pandas and call the create_features() 119 | features = create_features(features.toPandas()) 120 | features 121 | 122 | # COMMAND ---------- 123 | 124 | # MAGIC %md 125 | # MAGIC ## Create Primary Key 126 | 127 | # COMMAND ---------- 128 | 129 | # MAGIC %md 130 | # MAGIC By concatenating the 'COUNTRY' and 'DATETIME' values with an underscore ('_'), the code aims to create a composite key that uniquely identifies each row in the DataFrame 131 | 132 | # COMMAND ---------- 133 | 134 | features.reset_index(inplace=True) 135 | features['CONSUMPTION_ID'] = features.COUNTRY + '_' + features.DATETIME.astype(str) 136 | features.head() 137 | 138 | # COMMAND ---------- 139 | 140 | # MAGIC %md 141 | # MAGIC ## Save features dataset into Feature Store 142 | 143 | # COMMAND ---------- 144 | 145 | if save_into_feature_store: 146 | 147 | features.drop(['COUNTRY'], axis=1, inplace=True) 148 | 149 | features = spark.createDataFrame(features) 150 | 151 | fs = feature_store.FeatureStoreClient() 152 | 153 | fs.create_table( 154 | name=f'{db}.{output_table_name}', 155 | primary_keys=['CONSUMPTION_ID'], 156 | timestamp_keys='DATETIME', 157 | 
df=features 158 | ) 159 | 160 | # COMMAND ---------- 161 | 162 | # MAGIC %md 163 | # MAGIC ## Delete Features Store 164 | 165 | # COMMAND ---------- 166 | 167 | if delete_fs: 168 | from databricks.feature_store import FeatureStoreClient 169 | fs = FeatureStoreClient() 170 | fs.drop_table(name='df_dev.hourly_forecasting_features') 171 | print("Feature Store was succesfuly deleted") 172 | 173 | # COMMAND ---------- 174 | 175 | 176 | -------------------------------------------------------------------------------- /MLOps Pipeline/Data Engineering/01. Ingestion/02. Monitoring Data Ingestion.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Import Libraries 4 | 5 | # COMMAND ---------- 6 | 7 | from pyspark.sql import SparkSession 8 | from pyspark.sql.functions import col, lit, split, to_timestamp, date_format 9 | 10 | # COMMAND ---------- 11 | 12 | # Use df_landing database 13 | spark.sql('USE db_monitor') 14 | # create a Spark session 15 | spark = SparkSession.builder.getOrCreate() 16 | 17 | # COMMAND ---------- 18 | 19 | # Retrieve a list of all tables in the current database 20 | tables = spark.sql('SHOW TABLES') \ 21 | .select('tableName') \ 22 | .rdd.flatMap(lambda x: x) \ 23 | .collect() 24 | 25 | print(tables) 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %md 30 | # MAGIC ## Load & Aggregate Data from Database 31 | 32 | # COMMAND ---------- 33 | 34 | # MAGIC %md 35 | # MAGIC The load_data function in PySpark takes a table name as an input, and performs the following steps: 36 | # MAGIC 37 | # MAGIC 1. Load Data: Reads data from the specified table into a DataFrame. 38 | # MAGIC 2. Split Datetime: Splits the 'datetime' string into start and end times, and assigns the start time to a new column 'start_time'. 39 | # MAGIC 3. Convert Datetime: Transforms the 'start_time' string into a timestamp format. 40 | # MAGIC 4. Extract Hourly Time: Reduces the 'start_time' to an hourly format, discarding minute and second information. 41 | # MAGIC 5. Extract Country Name: Derives the country name from the table name (assumed to be the first part of the table name before an underscore). 42 | # MAGIC 6. Add Country Column: Adds a new column 'country' to the DataFrame, populated with the extracted country name. 43 | # MAGIC 7. Return DataFrame: Returns the modified DataFrame. 44 | # MAGIC 45 | # MAGIC This function prepares the loaded data for further analysis by transforming the timestamp into an hourly format and adding a country identifier. 
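As a quick illustration of steps 2-3, the datetime split and conversion can be exercised on a single hypothetical row; the sample string below is an assumption about the raw format, kept consistent with the 'dd.MM.yyyy HH:mm' pattern used in load_data:

```python
# Minimal sketch of the datetime parsing described above, on one hypothetical row.
from pyspark.sql import functions as F

sample = spark.createDataFrame(
    [("01.01.2022 00:00 - 01.01.2022 01:00", 8500.0)],
    ["datetime", "Actual_MW"],
)

parsed = (
    sample
    .withColumn("start_time", F.split("datetime", " - ").getItem(0))
    .withColumn("start_time", F.to_timestamp("start_time", "dd.MM.yyyy HH:mm"))
)
parsed.show(truncate=False)  # start_time -> 2022-01-01 00:00:00
```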
46 | 47 | # COMMAND ---------- 48 | 49 | # function to load data from a table and add a country column 50 | def load_data(table_name): 51 | df = spark.read.table(table_name) 52 | 53 | # split the datetime string into start and end times 54 | split_col = split(df['datetime'], ' - ') 55 | df = df.withColumn('start_time', split_col.getItem(0)) 56 | 57 | # convert the start time into timestamp format 58 | datetime_format = "dd.MM.yyyy HH:mm" 59 | df = df.withColumn('start_time', to_timestamp(df['start_time'], datetime_format)) 60 | 61 | # floor the start_time to the hour 62 | #df = df.withColumn('start_time', date_format(df['start_time'], 'yyyy-MM-dd HH:00:00').cast('timestamp')) 63 | 64 | # get the country name from the table name 65 | country = table_name.split("_")[0] 66 | 67 | # add the country column 68 | df = df.withColumn("country", lit(country)) 69 | 70 | # sort the values based on start_time in ascending order 71 | df = df.sort("start_time") 72 | 73 | return df 74 | 75 | 76 | # COMMAND ---------- 77 | 78 | # MAGIC %md 79 | # MAGIC ## Save data in each table 80 | 81 | # COMMAND ---------- 82 | 83 | # dictionary to store dataframes 84 | df_dict = {} 85 | 86 | # load data from each table 87 | for table in tables: 88 | df_dict[table.split('_')[0]] = load_data(table) 89 | 90 | # COMMAND ---------- 91 | 92 | # MAGIC %md 93 | # MAGIC 1. Sorts the DataFrame by the 'start_time' column. 94 | # MAGIC 2. Replaces null values in the 'Actual_MW' column with the previous non-null value using forward fill. This is achieved by applying the last() function with the ignorenulls=True argument over a window specification. 95 | # MAGIC 3. Replaces invalid values (0 or less) in the 'Actual_MW' column with the previous non-invalid value using forward fill. This is done by using the when() function to check if the value is less than or equal to 0, and if so, replaces it with the previous non-invalid value from the window. 96 | # MAGIC 4. Updates the DataFrame in the dictionary with the modified DataFrame. 97 | # MAGIC The code ensures that null and invalid values are replaced with appropriate values using forward fill, maintaining the ordering of the data by 'start_time' for each country DataFrame. 98 | 99 | # COMMAND ---------- 100 | 101 | for country, df_country in df_dict.items(): 102 | print(country,df_country) 103 | 104 | # COMMAND ---------- 105 | 106 | # Import the necessary functions 107 | from pyspark.sql.functions import col,when 108 | 109 | # Iterate over each country DataFrame in the dictionary 110 | for country, df_country in df_dict.items(): 111 | # Sort the DataFrame by 'start_time' 112 | df_country = df_country.orderBy('start_time') 113 | 114 | # Replace invalid values (0 or less) with null 115 | df_country = df_country.withColumn('Actual_MW', when(col('Actual_MW') <= 0, None).otherwise(col('Actual_MW'))) 116 | 117 | # Drop rows with null values 118 | df_country = df_country.dropna() 119 | 120 | # Update the DataFrame in the dictionary 121 | df_dict[country] = df_country 122 | 123 | 124 | # COMMAND ---------- 125 | 126 | # MAGIC %md 127 | # MAGIC * The get_hourly_query function is defined to create a SQL query for each table (country). This query selects the start_time, Actual_MW (renamed as HOURLY_CONSUMPTION_MW), and the table name (representing the country). 
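The same union can also be built without assembling SQL strings; a minimal alternative sketch (not the notebook's approach) using the DataFrame API over the df_dict created above:

```python
# Alternative sketch (not the notebook's approach): the same UNION ALL via the DataFrame API.
from functools import reduce
from pyspark.sql import functions as F

hourly_frames = [
    df_country.select(
        F.col("start_time").alias("DATETIME"),
        F.col("Actual_MW").alias("HOURLY_CONSUMPTION_MW"),
        F.lit(country).alias("COUNTRY"),
    )
    for country, df_country in df_dict.items()
]
final_hourly_df_alt = reduce(lambda a, b: a.unionByName(b), hourly_frames)
```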
128 | # MAGIC 129 | # MAGIC 130 | 131 | # COMMAND ---------- 132 | 133 | # function to generate SQL query for a given table 134 | def get_hourly_query(table_name): 135 | return f""" 136 | SELECT 137 | start_time AS DATETIME, 138 | Actual_MW AS HOURLY_CONSUMPTION_MW, 139 | '{table_name}' AS COUNTRY 140 | FROM {table_name} 141 | """ 142 | 143 | # COMMAND ---------- 144 | 145 | # register each DataFrame as a temporary view in Spark 146 | for table_name, df in df_dict.items(): 147 | df.createOrReplaceTempView(table_name) 148 | 149 | # COMMAND ---------- 150 | 151 | # MAGIC %md 152 | # MAGIC 153 | # MAGIC * The final_hourly_query is created by applying the get_hourly_query function to each country's DataFrame in the dictionary, joining the resulting SQL queries with UNION ALL. The UNION ALL SQL operation combines the rows from these separate queries into a single set of results. 154 | 155 | # COMMAND ---------- 156 | 157 | final_hourly_query = ' UNION ALL '.join([get_hourly_query(country) for country in df_dict.keys()]) 158 | 159 | # COMMAND ---------- 160 | 161 | # MAGIC %md 162 | # MAGIC 163 | # MAGIC * The final_hourly_query is then executed using spark.sql(). This command runs the combined SQL query and creates a DataFrame. 164 | # MAGIC 165 | # MAGIC * The .dropDuplicates(['DATETIME', 'COUNTRY']) operation removes any duplicate rows from the DataFrame based on the DATETIME and COUNTRY columns. 166 | # MAGIC 167 | # MAGIC * The .createOrReplaceTempView('final_hourly_df') operation creates a temporary view with the name 'final_hourly_df'. This is a named logical plan that is used as a stand-in for the DataFrame in Spark SQL queries. 168 | 169 | # COMMAND ---------- 170 | 171 | spark.sql(final_hourly_query) \ 172 | .dropDuplicates(['DATETIME', 'COUNTRY']) \ 173 | .createOrReplaceTempView('final_hourly_df') 174 | 175 | spark.sql(""" 176 | SELECT * FROM final_hourly_df 177 | ORDER BY DATETIME,COUNTRY 178 | """).createOrReplaceTempView('final_hourly_df_ordered') 179 | 180 | # COMMAND ---------- 181 | 182 | # MAGIC %md 183 | # MAGIC * The MERGE INTO statement is a SQL command that updates the consumption_countries_hourly table in the database. If a record (based on DATETIME and COUNTRY) already exists in the table, it updates the existing record with the new data. If a record does not exist, it inserts a new record with the data. 184 | # MAGIC 185 | # MAGIC 186 | 187 | # COMMAND ---------- 188 | 189 | spark.sql(f""" 190 | MERGE INTO monitoring_consumption_countries_hourly A 191 | USING final_hourly_df B 192 | ON A.DATETIME = B.DATETIME AND A.COUNTRY = B.COUNTRY 193 | WHEN MATCHED THEN 194 | UPDATE SET 195 | DATETIME = B.DATETIME, 196 | HOURLY_CONSUMPTION_MW = B.HOURLY_CONSUMPTION_MW, 197 | COUNTRY = B.COUNTRY 198 | WHEN NOT MATCHED 199 | THEN INSERT ( 200 | DATETIME, 201 | HOURLY_CONSUMPTION_MW, 202 | COUNTRY 203 | ) VALUES ( 204 | B.DATETIME, 205 | B.HOURLY_CONSUMPTION_MW, 206 | B.COUNTRY 207 | ) 208 | """) 209 | 210 | # COMMAND ---------- 211 | 212 | 213 | -------------------------------------------------------------------------------- /MLOps Pipeline/Data Engineering/01. Ingestion/01. 
Training Data Ingestion.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Import Libraries 4 | 5 | # COMMAND ---------- 6 | 7 | from pyspark.sql import SparkSession 8 | from pyspark.sql.functions import col, lit, split, to_timestamp, date_format 9 | 10 | # COMMAND ---------- 11 | 12 | # Use df_landing database 13 | spark.sql('USE df_landing') 14 | # create a Spark session 15 | spark = SparkSession.builder.getOrCreate() 16 | 17 | # COMMAND ---------- 18 | 19 | # Retrieve a list of all tables in the current database 20 | tables = spark.sql('SHOW TABLES') \ 21 | .select('tableName') \ 22 | .rdd.flatMap(lambda x: x) \ 23 | .collect() 24 | 25 | print(tables) 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %md 30 | # MAGIC ## Load & Aggregate Data from Database 31 | 32 | # COMMAND ---------- 33 | 34 | # MAGIC %md 35 | # MAGIC The load_data function in PySpark takes a table name as an input, and performs the following steps: 36 | # MAGIC 37 | # MAGIC 1. Load Data: Reads data from the specified table into a DataFrame. 38 | # MAGIC 2. Split Datetime: Splits the 'datetime' string into start and end times, and assigns the start time to a new column 'start_time'. 39 | # MAGIC 3. Convert Datetime: Transforms the 'start_time' string into a timestamp format. 40 | # MAGIC 4. Extract Hourly Time: Reduces the 'start_time' to an hourly format, discarding minute and second information. 41 | # MAGIC 5. Extract Country Name: Derives the country name from the table name (assumed to be the first part of the table name before an underscore). 42 | # MAGIC 6. Add Country Column: Adds a new column 'country' to the DataFrame, populated with the extracted country name. 43 | # MAGIC 7. Return DataFrame: Returns the modified DataFrame. 44 | # MAGIC 45 | # MAGIC This function prepares the loaded data for further analysis by transforming the timestamp into an hourly format and adding a country identifier. 46 | 47 | # COMMAND ---------- 48 | 49 | # function to load data from a table and add a country column 50 | def load_data(table_name): 51 | df = spark.read.table(table_name) 52 | 53 | # split the datetime string into start and end times 54 | split_col = split(df['datetime'], ' - ') 55 | df = df.withColumn('start_time', split_col.getItem(0)) 56 | 57 | # convert the start time into timestamp format 58 | datetime_format = "dd.MM.yyyy HH:mm" 59 | df = df.withColumn('start_time', to_timestamp(df['start_time'], datetime_format)) 60 | 61 | # floor the start_time to the hour 62 | #df = df.withColumn('start_time', date_format(df['start_time'], 'yyyy-MM-dd HH:00:00').cast('timestamp')) 63 | 64 | # get the country name from the table name 65 | country = table_name.split("_")[0] 66 | 67 | # add the country column 68 | df = df.withColumn("country", lit(country)) 69 | 70 | # sort the values based on start_time in ascending order 71 | df = df.sort("start_time") 72 | 73 | return df 74 | 75 | 76 | # COMMAND ---------- 77 | 78 | # MAGIC %md 79 | # MAGIC ## Save data in each table 80 | 81 | # COMMAND ---------- 82 | 83 | # dictionary to store dataframes 84 | df_dict = {} 85 | 86 | # load data from each table 87 | for table in tables: 88 | df_dict[table.split('_')[0]] = load_data(table) 89 | 90 | # COMMAND ---------- 91 | 92 | # MAGIC %md 93 | # MAGIC 1. Sorts the DataFrame by the 'start_time' column. 94 | # MAGIC 2. Replaces null values in the 'Actual_MW' column with the previous non-null value using forward fill. 
This is achieved by applying the last() function with the ignorenulls=True argument over a window specification. 95 | # MAGIC 3. Replaces invalid values (0 or less) in the 'Actual_MW' column with the previous non-invalid value using forward fill. This is done by using the when() function to check if the value is less than or equal to 0, and if so, replaces it with the previous non-invalid value from the window. 96 | # MAGIC 4. Updates the DataFrame in the dictionary with the modified DataFrame. 97 | # MAGIC The code ensures that null and invalid values are replaced with appropriate values using forward fill, maintaining the ordering of the data by 'start_time' for each country DataFrame. 98 | 99 | # COMMAND ---------- 100 | 101 | from pyspark.sql.functions import col, when, last 102 | from pyspark.sql.window import Window 103 | 104 | # Iterate over each country DataFrame in the dictionary 105 | for country, df_country in df_dict.items(): 106 | # Sort the DataFrame by 'start_time' 107 | df_country = df_country.orderBy('start_time') 108 | 109 | # Replace invalid values (0 or less) with null 110 | df_country = df_country.withColumn('Actual_MW', when(col('Actual_MW') <= 0, None).otherwise(col('Actual_MW'))) 111 | 112 | # Replace null values with previous non-null values using forward fill 113 | window_spec = Window.partitionBy('country').orderBy('start_time').rowsBetween(Window.unboundedPreceding, 0) 114 | df_country = df_country.withColumn('Actual_MW', last('Actual_MW', ignorenulls=True).over(window_spec)) 115 | 116 | # Update the DataFrame in the dictionary 117 | df_dict[country] = df_country 118 | 119 | 120 | # COMMAND ---------- 121 | 122 | # MAGIC %md 123 | # MAGIC * The get_hourly_query function is defined to create a SQL query for each table (country). This query selects the start_time, Actual_MW (renamed as HOURLY_CONSUMPTION_MW), and the table name (representing the country). 124 | # MAGIC 125 | # MAGIC 126 | 127 | # COMMAND ---------- 128 | 129 | # function to generate SQL query for a given table 130 | def get_hourly_query(table_name): 131 | return f""" 132 | SELECT 133 | start_time AS DATETIME, 134 | Actual_MW AS HOURLY_CONSUMPTION_MW, 135 | '{table_name}' AS COUNTRY 136 | FROM {table_name} 137 | """ 138 | 139 | # COMMAND ---------- 140 | 141 | # register each DataFrame as a temporary view in Spark 142 | for table_name, df in df_dict.items(): 143 | df.createOrReplaceTempView(table_name) 144 | 145 | # COMMAND ---------- 146 | 147 | # MAGIC %md 148 | # MAGIC 149 | # MAGIC * The final_hourly_query is created by applying the get_hourly_query function to each country's DataFrame in the dictionary, joining the resulting SQL queries with UNION ALL. The UNION ALL SQL operation combines the rows from these separate queries into a single set of results. 150 | 151 | # COMMAND ---------- 152 | 153 | final_hourly_query = ' UNION ALL '.join([get_hourly_query(country) for country in df_dict.keys()]) 154 | 155 | # COMMAND ---------- 156 | 157 | # MAGIC %md 158 | # MAGIC 159 | # MAGIC * The final_hourly_query is then executed using spark.sql(). This command runs the combined SQL query and creates a DataFrame. 160 | # MAGIC 161 | # MAGIC * The .dropDuplicates(['DATETIME', 'COUNTRY']) operation removes any duplicate rows from the DataFrame based on the DATETIME and COUNTRY columns. 162 | # MAGIC 163 | # MAGIC * The .createOrReplaceTempView('final_hourly_df') operation creates a temporary view with the name 'final_hourly_df'. 
This is a named logical plan that is used as a stand-in for the DataFrame in Spark SQL queries. 164 | 165 | # COMMAND ---------- 166 | 167 | spark.sql(final_hourly_query) \ 168 | .dropDuplicates(['DATETIME', 'COUNTRY']) \ 169 | .createOrReplaceTempView('final_hourly_df') 170 | 171 | spark.sql(""" 172 | SELECT * FROM final_hourly_df 173 | ORDER BY DATETIME,COUNTRY 174 | """).createOrReplaceTempView('final_hourly_df_ordered') 175 | 176 | # COMMAND ---------- 177 | 178 | database= 'df_dev' 179 | 180 | # COMMAND ---------- 181 | 182 | # MAGIC %md 183 | # MAGIC * The MERGE INTO statement is a SQL command that updates the consumption_countries_hourly table in the database. If a record (based on DATETIME and COUNTRY) already exists in the table, it updates the existing record with the new data. If a record does not exist, it inserts a new record with the data. 184 | # MAGIC 185 | # MAGIC 186 | 187 | # COMMAND ---------- 188 | 189 | spark.sql(f""" 190 | MERGE INTO {database}.consumption_countries_hourly A 191 | USING final_hourly_df B 192 | ON A.DATETIME = B.DATETIME AND A.COUNTRY = B.COUNTRY 193 | WHEN MATCHED THEN 194 | UPDATE SET 195 | DATETIME = B.DATETIME, 196 | HOURLY_CONSUMPTION_MW = B.HOURLY_CONSUMPTION_MW, 197 | COUNTRY = B.COUNTRY 198 | WHEN NOT MATCHED 199 | THEN INSERT ( 200 | DATETIME, 201 | HOURLY_CONSUMPTION_MW, 202 | COUNTRY 203 | ) VALUES ( 204 | B.DATETIME, 205 | B.HOURLY_CONSUMPTION_MW, 206 | B.COUNTRY 207 | ) 208 | """) 209 | 210 | # COMMAND ---------- 211 | 212 | 213 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/00. Initial Deployment/04.Unit Test.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Configuration 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Initial Deployment" 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ## Load Datasets 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC 17 | # MAGIC %md 18 | # MAGIC * Load energy consumption data from a database into a Pandas DataFrame. 19 | # MAGIC * Create a new column CONSUMPTION_ID by concatenating country codes with the date-time information. 20 | # MAGIC * Convert the DATETIME column to a proper datetime data type for time-based operations. 21 | # MAGIC * Define test labels, based on date-time ranges. 
22 | # MAGIC * Convert the subsets back into Spark DataFrames and select only the CONSUMPTION_ID, DATETIME, and HOURLY_CONSUMPTION_MW columns for further processing 23 | 24 | # COMMAND ---------- 25 | 26 | # Load Consumption Region Table 27 | consumption_countries_hourly = spark.table(f'{db}.final_consumption_countries_hourly').toPandas() 28 | consumption_countries_hourly['CONSUMPTION_ID'] = consumption_countries_hourly.COUNTRY + '_' + consumption_countries_hourly.DATETIME.astype(str) 29 | consumption_countries_hourly['DATETIME'] = pd.to_datetime(consumption_countries_hourly['DATETIME']) 30 | # Split the labels into training and test 31 | test_labels = consumption_countries_hourly.loc[(consumption_countries_hourly.DATETIME > test_start) & (consumption_countries_hourly.DATETIME <= test_end)] 32 | # Transforms to Spark DataFranes 33 | test_labels = spark.createDataFrame(test_labels).select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 34 | 35 | # COMMAND ---------- 36 | 37 | # MAGIC %md 38 | # MAGIC * Search for runs: The mlflow.search_runs function is called to search for all runs associated with the specified experiment_id_training. The runs are sorted by start time in descending order, meaning the latest run will be the first one in the list. The result is stored in the runs variable. 39 | # MAGIC 40 | # MAGIC * Select the latest run: The latest_run_id is assigned the run ID of the first run in the runs list (i.e., the latest run). This ID will be used to retrieve the details of the latest run. 41 | # MAGIC 42 | # MAGIC * Get the latest run details: The mlflow.get_run function is called with the latest_run_id to retrieve the details of the latest run. The details are stored in the latest_run variable. 43 | # MAGIC 44 | # MAGIC * Get the logged metrics: The metrics logged during the latest run are extracted from the latest_run.data.metrics attribute and stored in the metrics variable. 45 | 46 | # COMMAND ---------- 47 | 48 | # Search for all runs associated with the experiment ID, sorted by start time 49 | runs = mlflow.search_runs(experiment_ids=experiment_id_training, order_by=["start_time desc"]) 50 | 51 | #Select the first run in the list (i.e., the latest run) 52 | latest_run_id = runs.iloc[0]["run_id"] 53 | latest_run = mlflow.get_run(latest_run_id) 54 | 55 | # Get the metrics logged during the latest run 56 | metrics = latest_run.data.metrics 57 | 58 | # Print the metrics 59 | for key, value in metrics.items(): 60 | print(key, value) 61 | 62 | 63 | # COMMAND ---------- 64 | 65 | # MAGIC %md 66 | # MAGIC ## Model Performance Testing 67 | 68 | # COMMAND ---------- 69 | 70 | # MAGIC %md 71 | # MAGIC * We define some thresholds for our model to meet 72 | 73 | # COMMAND ---------- 74 | 75 | mse_threshold = 1000000000.0 76 | mae_threshold = 30000.0 77 | rmse_threshold = 40000.0 78 | r2_threshold = 0.9 79 | training_time_threshold = 3600.0 80 | 81 | # COMMAND ---------- 82 | 83 | # MAGIC %md 84 | # MAGIC The test_model_performance() function evaluates the performance of a model by comparing specific metrics against defined thresholds. It checks if metrics such as MSE, MAE, RMSE, R2 score, and training time meet the specified thresholds. Success or failure messages are printed for each test, and a boolean variable (all_tests_passed) is updated accordingly. The function returns the overall result indicating whether all tests passed (True) or if any of them failed (False). 
85 | 86 | # COMMAND ---------- 87 | 88 | def test_model_performance(): 89 | all_tests_passed = True 90 | try: 91 | assert metrics['MSE'] < mse_threshold 92 | print(f"MSE test passed with {metrics['MSE']} mean squared error") 93 | except AssertionError: 94 | print(f"MSE test failed. Expected < {mse_threshold} but got {metrics['MSE']}") 95 | all_tests_passed = False 96 | 97 | try: 98 | assert metrics['MAE'] < mae_threshold 99 | print(f"MAE test passed with {metrics['MAE']} mean absolute error") 100 | except AssertionError: 101 | print(f"MAE test failed. Expected < {mae_threshold} but got {metrics['MAE']}") 102 | all_tests_passed = False 103 | 104 | try: 105 | assert metrics['RMSE'] < rmse_threshold 106 | print(f"RMSE test passed with {metrics['RMSE']} root mean squared error") 107 | except AssertionError: 108 | print(f"RMSE test failed. Expected < {rmse_threshold} but got {metrics['RMSE']}") 109 | all_tests_passed = False 110 | 111 | try: 112 | assert metrics['R2'] > r2_threshold 113 | print(f"R2 test passed with {metrics['R2']} score") 114 | except AssertionError: 115 | print(f"R2 test failed. Expected > {r2_threshold} but got {metrics['R2']}") 116 | all_tests_passed = False 117 | 118 | 119 | try: 120 | assert metrics['Training Time(sec)'] < training_time_threshold #1hour 121 | print(f"Model training time test passed with {metrics['Training Time(sec)']} seconds") 122 | except AssertionError: 123 | print(f"Model training time test failed. Expected < {training_time_threshold} seconds but got {metrics['Training Time(sec)']} seconds") 124 | all_tests_passed = False 125 | 126 | return all_tests_passed 127 | 128 | # COMMAND ---------- 129 | 130 | # MAGIC %md 131 | # MAGIC ## Metrics Visualization 132 | 133 | # COMMAND ---------- 134 | 135 | # MAGIC %md 136 | # MAGIC We create a DataFrame that shows the metric values and their corresponding thresholds, along with pass/fail status for each test. It checks if the metric values meet the defined thresholds and assigns "Test Passed" or "Test Failed" based on the comparison. The purpose is to provide a visual representation of the test results for easy interpretation and evaluation of the model's performance against the thresholds. 137 | 138 | # COMMAND ---------- 139 | 140 | # Create a DataFrame with the metric values and their corresponding thresholds 141 | df = spark.createDataFrame([ 142 | ("MSE", metrics['MSE'], mse_threshold), 143 | ("MAE", metrics['MAE'], mae_threshold), 144 | ("RMSE", metrics['RMSE'], rmse_threshold), 145 | ("R2", metrics['R2'], r2_threshold), 146 | ("Training Time(sec)", metrics['Training Time(sec)'], training_time_threshold ) 147 | ], ["Metric", "Value", "Threshold"]) 148 | 149 | # Cast the "Threshold" column to DoubleType 150 | df = df.withColumn("Threshold", df["Threshold"].cast(DoubleType())) 151 | df = df.withColumn("Pass", when(df["Metric"].isin(["MSE", "MAE", "RMSE","Training Time(sec)"]), df["Value"] <= df["Threshold"]).otherwise(df["Value"] >= df["Threshold"])) 152 | # Add a column to show pass/fail as strings 153 | df = df.withColumn("Status", when(df["Pass"], "Test Passed").otherwise("Test Failed")) 154 | # Show the DataFrame 155 | display(df) 156 | 157 | # COMMAND ---------- 158 | 159 | # MAGIC %md 160 | # MAGIC ## Model Staging 161 | 162 | # COMMAND ---------- 163 | 164 | # MAGIC %md 165 | # MAGIC * The code automates the transition of the latest version of a registered model to the staging environment. 166 | # MAGIC * It checks the performance of the model using performance tests. 
167 | # MAGIC * If all performance tests pass, the model is transitioned to the staging environment. 168 | # MAGIC * If any test fails, the model is not staged and a message is printed indicating the failure. 169 | # MAGIC * The purpose is to ensure that only models meeting the performance criteria are moved to the staging environment. 170 | 171 | # COMMAND ---------- 172 | 173 | def proceed_model_to_staging(): 174 | # Get the latest version of the registered model 175 | client = mlflow.tracking.MlflowClient() 176 | model_version = client.get_latest_versions(model_name, stages=["None"])[0].version 177 | 178 | # Define the endpoint URL 179 | endpoint_url = f"https://{databricks_instance}/api/2.0/mlflow/databricks/model-versions/transition-stage" 180 | 181 | stage = 'Staging' #Define the stage you want your model to transit 182 | comment = "Transitioning to staging environment after performance testing" 183 | headers = { "Authorization": "Bearer " + access_token } 184 | 185 | request_body = { 186 | "version": f"{model_version}", 187 | "name": model_name, 188 | "stage" : stage, #Specifies the environment we want to transit our model 189 | "archive_existing_versions": False, #Specifies whether to archive all current model versions in the target stage. 190 | "comment": comment 191 | } 192 | 193 | # Make the request 194 | response = requests.post(endpoint_url, headers=headers,json=request_body) 195 | 196 | # Check the response status code 197 | if response.status_code == 200: 198 | print("Model version transitioned to staging") 199 | else: 200 | print(f"Error transitioning model version to staging: {response.text}") 201 | 202 | 203 | # Call function for staging 204 | 205 | all_tests_passed = test_model_performance() 206 | # run performance tests here 207 | if all_tests_passed: 208 | # proceed with model staging 209 | proceed_model_to_staging() 210 | else: 211 | print("Model performance tests failed. Model will not be staged.") 212 | 213 | # COMMAND ---------- 214 | 215 | 216 | -------------------------------------------------------------------------------- /MLOps Pipeline/Data Engineering/03. Data Quality/01. Data Quality Checks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | ! pip install plotly 3 | from pyspark.sql.functions import count, when, isnull, col 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import pandas as pd 7 | import plotly.graph_objects as go 8 | import numpy as np 9 | 10 | # COMMAND ---------- 11 | 12 | spark.sql('USE df_dev') 13 | 14 | # COMMAND ---------- 15 | 16 | df = spark.read.table('final_consumption_countries_hourly') 17 | 18 | # COMMAND ---------- 19 | 20 | df.count() 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md 25 | # MAGIC ## Data Profiling 26 | 27 | # COMMAND ---------- 28 | 29 | display(df) 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md 34 | # MAGIC ## Sanity & Data Quality Checks 35 | 36 | # COMMAND ---------- 37 | 38 | # MAGIC %md 39 | # MAGIC 1. Missing Values Check: Counts the number of null values in each column of the DataFrame, calculates the total number of nulls, and calculates the percentage of null values relative to the total number of rows. 40 | # MAGIC 41 | # MAGIC 1. Duplicates Check: Determines the count of duplicate rows by subtracting the count of the DataFrame after dropping duplicates from the original count. It also calculates the percentage of duplicate rows relative to the total number of rows. 42 | # MAGIC 43 | # MAGIC 1. 
Invalid Values Check: Counts the number of invalid records in each dataframe(ex. negative/zero energy consumption) 44 | # MAGIC 45 | # MAGIC 1. Outlier Detection: Defines bounds based on the first and third quartiles of the 'Actual_MW' column using the approxQuantile function. It then identifies outliers by counting the number of rows where the 'Actual_MW' value falls outside the defined bounds. 46 | # MAGIC 47 | # MAGIC 1. Schema Verification: Compares the DataFrame's column names to the expected column names ('Datetime', 'Actual_MW', 'start_time', 'country') and checks if any unexpected columns exist. 48 | # MAGIC 49 | # MAGIC 1. Summary Print: Displays the results of the data quality checks, including the count and percentage of missing values, count and percentage of duplicate rows, presence of outliers, and whether the schema matches the expected columns. 50 | # MAGIC 51 | # MAGIC 1. Statistical Checks: Prints basic statistical measures of the DataFrame using the describe function. 52 | 53 | # COMMAND ---------- 54 | 55 | print("\nData quality checks for concatenated dataframe...") 56 | 57 | # 1. Missing values check 58 | null_counts = df.select([count(when(isnull(c), c)).alias(c) for c in df.columns]).collect() 59 | total_nulls = sum(row[c] for row in null_counts for c in df.columns) 60 | nulls_percentage = (total_nulls / df.count()) * 100 61 | 62 | # 2. Invalid values check 63 | invalid_values = df.filter((df['HOURLY_CONSUMPTION_MW'] <= 0)).count() 64 | invalid_percentage = (invalid_values / df.count()) * 100 65 | 66 | # 3. Duplicates check 67 | duplicates_count = df.count() - df.dropDuplicates().count() 68 | duplicates_percentage = (duplicates_count / df.count()) * 100 69 | 70 | # 4. Outlier detection 71 | bounds = { 72 | c: dict( 73 | zip(["q1", "q3"], df.approxQuantile(c, [0.25, 0.75], 0)) 74 | ) 75 | for c in ["HOURLY_CONSUMPTION_MW"] 76 | } 77 | outliers = 0 78 | for c in bounds: 79 | iqr = bounds[c]['q3'] - bounds[c]['q1'] 80 | bounds[c]['lower'] = bounds[c]['q1'] - (iqr * 1.5) 81 | bounds[c]['upper'] = bounds[c]['q3'] + (iqr * 1.5) 82 | outliers += df.filter( 83 | (df[c] < bounds[c]['lower']) | 84 | (df[c] > bounds[c]['upper']) 85 | ).count() 86 | 87 | # 5. Schema verification 88 | expected_columns = ['DATETIME', 'HOURLY_CONSUMPTION_MW', 'COUNTRY'] 89 | schema_check = len(set(df.columns) - set(expected_columns)) == 0 90 | 91 | # Summary print 92 | print(f"Missing values: {total_nulls if total_nulls > 0 else 'None'} ({nulls_percentage:.4f}% of total rows)") 93 | print(f"Duplicate rows: {duplicates_count if duplicates_count > 0 else 'None'} ({duplicates_percentage:.4f}% of total rows)") 94 | print(f"Invalid values: {invalid_values if invalid_values > 0 else 'None'} ({invalid_percentage:.4f}% of total rows)") 95 | print(f"Outliers: {'Found' if outliers else 'None'}") 96 | print(f"Schema check: {'Unexpected schema' if not schema_check else 'Schema as expected'}") 97 | 98 | # 6. 
Statistical checks 99 | print("Basic statistical measures:") 100 | df.describe().show() 101 | 102 | 103 | # COMMAND ---------- 104 | 105 | from pyspark.sql.functions import col, lag, expr 106 | from pyspark.sql.window import Window 107 | from pyspark.sql import functions as F 108 | 109 | # Specify the column names in your DataFrame 110 | datetime_col = "DATETIME" 111 | country_col = "COUNTRY" 112 | 113 | # Sort the DataFrame by 'DATETIME' within each country 114 | window_spec = Window.partitionBy(country_col).orderBy(datetime_col) 115 | df_sorted = df.withColumn("start_time", col(datetime_col).cast("timestamp")).orderBy(country_col, datetime_col) 116 | 117 | # Calculate the time difference between consecutive records within each country 118 | df_sorted = df_sorted.withColumn("time_diff", col("start_time").cast("long") - lag(col("start_time").cast("long")).over(window_spec)) 119 | 120 | # Check if all time differences are exactly 1 hour within each country 121 | country_continuity = df_sorted.groupBy(country_col).agg(F.min(F.when(col("time_diff") == 3600, True)).alias("is_continuous")) 122 | 123 | # Show the results 124 | country_continuity.show() 125 | 126 | 127 | # COMMAND ---------- 128 | 129 | # MAGIC %md 130 | # MAGIC ## Clean Datasets(Duplicates,Null,Invalid) 131 | 132 | # COMMAND ---------- 133 | 134 | pandas_df = df.toPandas() 135 | 136 | # COMMAND ---------- 137 | 138 | # MAGIC %md 139 | # MAGIC ## LinePlot of the Energy Consumption Forecasting 140 | 141 | # COMMAND ---------- 142 | 143 | def create_plot(country, time_range_name, time_range): 144 | # Filter data for the specific country 145 | filtered_df = pandas_df[pandas_df['COUNTRY'] == country] 146 | 147 | # Filter data to the specified time range 148 | filtered_df = filtered_df.loc[(filtered_df['DATETIME'] >= time_range[0]) & (filtered_df['DATETIME'] <= time_range[1])] 149 | 150 | # Aggregate data to daily averages 151 | daily_df = filtered_df.groupby(pd.Grouper(key='DATETIME', freq='D')).mean() 152 | 153 | # Create a rolling average 154 | daily_df['Rolling_MW'] = daily_df['HOURLY_CONSUMPTION_MW'].rolling(window=7).mean() 155 | 156 | # Find the times corresponding to min and max Actual_MW in the filtered data 157 | min_time = filtered_df.loc[filtered_df['HOURLY_CONSUMPTION_MW'].idxmin(), 'DATETIME'] 158 | max_time = filtered_df.loc[filtered_df['HOURLY_CONSUMPTION_MW'].idxmax(), 'DATETIME'] 159 | 160 | # Create a line plot 161 | fig = go.Figure() 162 | 163 | # Add trace for actual MW 164 | fig.add_trace(go.Scatter(x=filtered_df['DATETIME'], y=filtered_df['HOURLY_CONSUMPTION_MW'], mode='markers', 165 | name='Actual MW', 166 | hovertemplate= 167 | "%{x}

" + 168 | "Actual MW: %{y}
" + 169 | "")) 170 | 171 | # Add trace for rolling average 172 | fig.add_trace(go.Scatter(x=daily_df.index, y=daily_df['Rolling_MW'], mode='markers', 173 | name='7-day Rolling Average', 174 | hovertemplate= 175 | "%{x}

" + 176 | "Rolling MW: %{y}
" + 177 | "")) 178 | 179 | # Add markers for min and max values 180 | fig.add_trace(go.Scatter(x=[min_time, max_time], 181 | y=[filtered_df.loc[filtered_df['DATETIME'] == min_time, 'HOURLY_CONSUMPTION_MW'].values[0], 182 | filtered_df.loc[filtered_df['DATETIME'] == max_time, 'HOURLY_CONSUMPTION_MW'].values[0]], 183 | mode='markers+text', 184 | marker=dict(size=[10, 10]), 185 | text=['Min', 'Max'], 186 | textposition="top center", 187 | name='Min/Max', 188 | hovertemplate= 189 | "%{x}

" + 190 | "Actual MW: %{y}
" + 191 | "")) 192 | 193 | # Add vertical lines for min and max values 194 | fig.add_shape( 195 | dict(type="line", x0=min_time, y0=0, x1=min_time, y1=filtered_df['HOURLY_CONSUMPTION_MW'].max(), 196 | line=dict(color="RoyalBlue", width=2))) 197 | fig.add_shape( 198 | dict(type="line", x0=max_time, y0=0, x1=max_time, y1=filtered_df['HOURLY_CONSUMPTION_MW'].max(), 199 | line=dict(color="RoyalBlue", width=2))) 200 | 201 | # Update layout 202 | fig.update_layout(title=f'Daily Energy Consumption for {country.capitalize()} over {time_range_name.capitalize()}', 203 | xaxis_title='DATETIME', 204 | yaxis_title='Energy Consumption (HOURLY_CONSUMPTION_MW)', 205 | hovermode='x') 206 | 207 | fig.show() 208 | 209 | 210 | # COMMAND ---------- 211 | 212 | # Define time ranges 213 | time_ranges = { 214 | 'decade':['2015-01-01','2023-01-01'], 215 | 'year': ['2022-01-01', '2023-01-01'], 216 | 'month': ['2022-12-01', '2023-01-01'], 217 | 'week': ['2022-12-25', '2023-01-01'] 218 | } 219 | 220 | create_plot('greece','year', time_ranges['year']) 221 | 222 | # COMMAND ---------- 223 | 224 | # MAGIC %md 225 | # MAGIC ## Box-and-Whisker Plot 226 | 227 | # COMMAND ---------- 228 | 229 | import plotly.graph_objects as go 230 | 231 | def create_box_plot(country): 232 | # Filter data for the specific country 233 | filtered_df = pandas_df[pandas_df['COUNTRY'] == country] 234 | 235 | # Create a box plot 236 | fig = go.Figure() 237 | 238 | # Add box trace 239 | fig.add_trace(go.Box(y=filtered_df['HOURLY_CONSUMPTION_MW'], name='HOURLY_CONSUMPTION_MW')) 240 | 241 | # Update layout 242 | fig.update_layout(title=f'Boxplot of Energy Consumption for {country.capitalize()}', 243 | yaxis_title='Energy Consumption (HOURLY_CONSUMPTION_MW)') 244 | 245 | fig.show() 246 | 247 | create_box_plot('greece') 248 | 249 | # COMMAND ---------- 250 | 251 | 252 | -------------------------------------------------------------------------------- /MLOps Pipeline/Data Engineering/03. Data Quality/02. Monitoring Data Quality Checks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | ! pip install plotly 3 | from pyspark.sql.functions import count, when, isnull, col 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import pandas as pd 7 | import plotly.graph_objects as go 8 | import numpy as np 9 | 10 | # COMMAND ---------- 11 | 12 | spark.sql('USE db_monitor') 13 | 14 | # COMMAND ---------- 15 | 16 | df = spark.read.table('final_monitoring_consumption_countries_hourly') 17 | 18 | # COMMAND ---------- 19 | 20 | df.count() 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md 25 | # MAGIC ## Data Profiling 26 | 27 | # COMMAND ---------- 28 | 29 | display(df) 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md 34 | # MAGIC ## Sanity & Data Quality Checks 35 | 36 | # COMMAND ---------- 37 | 38 | # MAGIC %md 39 | # MAGIC 1. Missing Values Check: Counts the number of null values in each column of the DataFrame, calculates the total number of nulls, and calculates the percentage of null values relative to the total number of rows. 40 | # MAGIC 41 | # MAGIC 1. Duplicates Check: Determines the count of duplicate rows by subtracting the count of the DataFrame after dropping duplicates from the original count. It also calculates the percentage of duplicate rows relative to the total number of rows. 42 | # MAGIC 43 | # MAGIC 1. Invalid Values Check: Counts the number of invalid records in each dataframe(ex. negative/zero energy consumption) 44 | # MAGIC 45 | # MAGIC 1. 
Outlier Detection: Defines bounds based on the first and third quartiles of the 'Actual_MW' column using the approxQuantile function. It then identifies outliers by counting the number of rows where the 'Actual_MW' value falls outside the defined bounds. 46 | # MAGIC 47 | # MAGIC 1. Schema Verification: Compares the DataFrame's column names to the expected column names ('Datetime', 'Actual_MW', 'start_time', 'country') and checks if any unexpected columns exist. 48 | # MAGIC 49 | # MAGIC 1. Summary Print: Displays the results of the data quality checks, including the count and percentage of missing values, count and percentage of duplicate rows, presence of outliers, and whether the schema matches the expected columns. 50 | # MAGIC 51 | # MAGIC 1. Statistical Checks: Prints basic statistical measures of the DataFrame using the describe function. 52 | 53 | # COMMAND ---------- 54 | 55 | print("\nData quality checks for concatenated dataframe...") 56 | 57 | # 1. Missing values check 58 | null_counts = df.select([count(when(isnull(c), c)).alias(c) for c in df.columns]).collect() 59 | total_nulls = sum(row[c] for row in null_counts for c in df.columns) 60 | nulls_percentage = (total_nulls / df.count()) * 100 61 | 62 | # 2. Invalid values check 63 | invalid_values = df.filter((df['HOURLY_CONSUMPTION_MW'] <= 0)).count() 64 | invalid_percentage = (invalid_values / df.count()) * 100 65 | 66 | # 3. Duplicates check 67 | duplicates_count = df.count() - df.dropDuplicates().count() 68 | duplicates_percentage = (duplicates_count / df.count()) * 100 69 | 70 | # 4. Outlier detection 71 | bounds = { 72 | c: dict( 73 | zip(["q1", "q3"], df.approxQuantile(c, [0.25, 0.75], 0)) 74 | ) 75 | for c in ["HOURLY_CONSUMPTION_MW"] 76 | } 77 | outliers = 0 78 | for c in bounds: 79 | iqr = bounds[c]['q3'] - bounds[c]['q1'] 80 | bounds[c]['lower'] = bounds[c]['q1'] - (iqr * 1.5) 81 | bounds[c]['upper'] = bounds[c]['q3'] + (iqr * 1.5) 82 | outliers += df.filter( 83 | (df[c] < bounds[c]['lower']) | 84 | (df[c] > bounds[c]['upper']) 85 | ).count() 86 | 87 | # 5. Schema verification 88 | expected_columns = ['DATETIME', 'HOURLY_CONSUMPTION_MW', 'COUNTRY'] 89 | schema_check = len(set(df.columns) - set(expected_columns)) == 0 90 | 91 | # Summary print 92 | print(f"Missing values: {total_nulls if total_nulls > 0 else 'None'} ({nulls_percentage:.4f}% of total rows)") 93 | print(f"Duplicate rows: {duplicates_count if duplicates_count > 0 else 'None'} ({duplicates_percentage:.4f}% of total rows)") 94 | print(f"Invalid values: {invalid_values if invalid_values > 0 else 'None'} ({invalid_percentage:.4f}% of total rows)") 95 | print(f"Outliers: {'Found' if outliers else 'None'}") 96 | print(f"Schema check: {'Unexpected schema' if not schema_check else 'Schema as expected'}") 97 | 98 | # 6. 
Statistical checks 99 | print("Basic statistical measures:") 100 | df.describe().show() 101 | 102 | 103 | # COMMAND ---------- 104 | 105 | from pyspark.sql.functions import col, lag, expr 106 | from pyspark.sql.window import Window 107 | from pyspark.sql import functions as F 108 | 109 | # Specify the column names in your DataFrame 110 | datetime_col = "DATETIME" 111 | country_col = "COUNTRY" 112 | 113 | # Sort the DataFrame by 'DATETIME' within each country 114 | window_spec = Window.partitionBy(country_col).orderBy(datetime_col) 115 | df_sorted = df.withColumn("start_time", col(datetime_col).cast("timestamp")).orderBy(country_col, datetime_col) 116 | 117 | # Calculate the time difference between consecutive records within each country 118 | df_sorted = df_sorted.withColumn("time_diff", col("start_time").cast("long") - lag(col("start_time").cast("long")).over(window_spec)) 119 | 120 | # Check if all time differences are exactly 1 hour within each country 121 | country_continuity = df_sorted.groupBy(country_col).agg(F.min(F.when(col("time_diff") == 3600, True)).alias("is_continuous")) 122 | 123 | # Show the results 124 | country_continuity.show() 125 | 126 | 127 | # COMMAND ---------- 128 | 129 | # MAGIC %md 130 | # MAGIC ## Clean Datasets(Duplicates,Null,Invalid) 131 | 132 | # COMMAND ---------- 133 | 134 | pandas_df = df.toPandas() 135 | 136 | # COMMAND ---------- 137 | 138 | # MAGIC %md 139 | # MAGIC ## LinePlot of the Energy Consumption Forecasting 140 | 141 | # COMMAND ---------- 142 | 143 | def create_plot(country, time_range_name, time_range): 144 | # Filter data for the specific country 145 | filtered_df = pandas_df[pandas_df['COUNTRY'] == country] 146 | 147 | # Filter data to the specified time range 148 | filtered_df = filtered_df.loc[(filtered_df['DATETIME'] >= time_range[0]) & (filtered_df['DATETIME'] <= time_range[1])] 149 | 150 | # Aggregate data to daily averages 151 | daily_df = filtered_df.groupby(pd.Grouper(key='DATETIME', freq='D')).mean() 152 | 153 | # Create a rolling average 154 | daily_df['Rolling_MW'] = daily_df['HOURLY_CONSUMPTION_MW'].rolling(window=7).mean() 155 | 156 | # Find the times corresponding to min and max Actual_MW in the filtered data 157 | min_time = filtered_df.loc[filtered_df['HOURLY_CONSUMPTION_MW'].idxmin(), 'DATETIME'] 158 | max_time = filtered_df.loc[filtered_df['HOURLY_CONSUMPTION_MW'].idxmax(), 'DATETIME'] 159 | 160 | # Create a line plot 161 | fig = go.Figure() 162 | 163 | # Add trace for actual MW 164 | fig.add_trace(go.Scatter(x=filtered_df['DATETIME'], y=filtered_df['HOURLY_CONSUMPTION_MW'], mode='markers', 165 | name='Actual MW', 166 | hovertemplate= 167 | "%{x}

" + 168 | "Actual MW: %{y}
" + 169 | "")) 170 | 171 | # Add trace for rolling average 172 | fig.add_trace(go.Scatter(x=daily_df.index, y=daily_df['Rolling_MW'], mode='markers', 173 | name='7-day Rolling Average', 174 | hovertemplate= 175 | "%{x}

" + 176 | "Rolling MW: %{y}
" + 177 | "")) 178 | 179 | # Add markers for min and max values 180 | fig.add_trace(go.Scatter(x=[min_time, max_time], 181 | y=[filtered_df.loc[filtered_df['DATETIME'] == min_time, 'HOURLY_CONSUMPTION_MW'].values[0], 182 | filtered_df.loc[filtered_df['DATETIME'] == max_time, 'HOURLY_CONSUMPTION_MW'].values[0]], 183 | mode='markers+text', 184 | marker=dict(size=[10, 10]), 185 | text=['Min', 'Max'], 186 | textposition="top center", 187 | name='Min/Max', 188 | hovertemplate= 189 | "%{x}

" + 190 | "Actual MW: %{y}
" + 191 | "")) 192 | 193 | # Add vertical lines for min and max values 194 | fig.add_shape( 195 | dict(type="line", x0=min_time, y0=0, x1=min_time, y1=filtered_df['HOURLY_CONSUMPTION_MW'].max(), 196 | line=dict(color="RoyalBlue", width=2))) 197 | fig.add_shape( 198 | dict(type="line", x0=max_time, y0=0, x1=max_time, y1=filtered_df['HOURLY_CONSUMPTION_MW'].max(), 199 | line=dict(color="RoyalBlue", width=2))) 200 | 201 | # Update layout 202 | fig.update_layout(title=f'Daily Energy Consumption for {country.capitalize()} over {time_range_name.capitalize()}', 203 | xaxis_title='DATETIME', 204 | yaxis_title='Energy Consumption (HOURLY_CONSUMPTION_MW)', 205 | hovermode='x') 206 | 207 | fig.show() 208 | 209 | 210 | # COMMAND ---------- 211 | 212 | # Define time ranges 213 | time_ranges = { 214 | 'decade':['2015-01-01','2023-01-01'], 215 | 'year': ['2022-01-01', '2023-01-01'], 216 | 'month': ['2023-01-01', '2023-02-01'], 217 | 'week': ['2022-12-25', '2023-01-01'] 218 | } 219 | 220 | create_plot('greece','month', time_ranges['month']) 221 | 222 | # COMMAND ---------- 223 | 224 | # MAGIC %md 225 | # MAGIC ## Box-and-Whisker Plot 226 | 227 | # COMMAND ---------- 228 | 229 | import plotly.graph_objects as go 230 | 231 | def create_box_plot(country): 232 | # Filter data for the specific country 233 | filtered_df = pandas_df[pandas_df['COUNTRY'] == country] 234 | 235 | # Create a box plot 236 | fig = go.Figure() 237 | 238 | # Add box trace 239 | fig.add_trace(go.Box(y=filtered_df['HOURLY_CONSUMPTION_MW'], name='HOURLY_CONSUMPTION_MW')) 240 | 241 | # Update layout 242 | fig.update_layout(title=f'Boxplot of Energy Consumption for {country.capitalize()}', 243 | yaxis_title='Energy Consumption (HOURLY_CONSUMPTION_MW)') 244 | 245 | fig.show() 246 | 247 | create_box_plot('greece') 248 | 249 | # COMMAND ---------- 250 | 251 | 252 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/00. Initial Deployment/03.Model Training.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Configuration 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Initial Deployment" 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ##Load Datasets 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC * Load energy consumption data from a database into a Pandas DataFrame. 18 | # MAGIC * Create a new column CONSUMPTION_ID by concatenating country codes with the date-time information. 19 | # MAGIC * Convert the DATETIME column to a proper datetime data type for time-based operations. 20 | # MAGIC * Split the data into two subsets: train_labels and test_labels, based on date-time ranges. 
21 | # MAGIC * Convert the subsets back into Spark DataFrames and select only the CONSUMPTION_ID, DATETIME, and HOURLY_CONSUMPTION_MW columns for further processing 22 | 23 | # COMMAND ---------- 24 | 25 | # Load Consumption Region Table 26 | consumption_countries_hourly = spark.table(f'{db}.final_consumption_countries_hourly').toPandas() 27 | consumption_countries_hourly['CONSUMPTION_ID'] = consumption_countries_hourly.COUNTRY + '_' + consumption_countries_hourly.DATETIME.astype(str) 28 | consumption_countries_hourly['DATETIME'] = pd.to_datetime(consumption_countries_hourly['DATETIME']) 29 | 30 | # Split the labels into training and test 31 | train_labels = consumption_countries_hourly.loc[(consumption_countries_hourly.DATETIME >= train_start) & (consumption_countries_hourly.DATETIME <= train_end)] 32 | test_labels = consumption_countries_hourly.loc[(consumption_countries_hourly.DATETIME > test_start) & (consumption_countries_hourly.DATETIME <= test_end)] 33 | #val_labels = consumption_countries_hourly.loc[(consumption_countries_hourly.DATETIME > test_end) & (consumption_countries_hourly.DATETIME <= validation_end)] 34 | 35 | # Transforms to Spark DataFranes 36 | train_labels = spark.createDataFrame(train_labels).select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 37 | test_labels = spark.createDataFrame(test_labels).select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 38 | #val_labels = spark.createDataFrame(val_labels).select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md 43 | # MAGIC * Define load_data function to create training sets by fetching features based on specified keys. 44 | # MAGIC * Inside the function, initialize feature lookups and create a training set by matching keys from input data. 45 | # MAGIC * Convert the training set to a Pandas DataFrame. 46 | # MAGIC * Call the load_data function to create training and test sets, and store them in variables training_set, train_df, and test_df. 
47 | 48 | # COMMAND ---------- 49 | 50 | def load_data(table_name, labels, lookup_key, ts_lookup_key): 51 | # In the FeatureLookup, if you do not provide the `feature_names` parameter, all features except primary keys are returned 52 | model_feature_lookups = [FeatureLookup(table_name=table_name, lookup_key=lookup_key, timestamp_lookup_key=ts_lookup_key)] 53 | 54 | # fs.create_training_set looks up features in model_feature_lookups that match the primary key from inference_data_df 55 | training_set = fs.create_training_set(labels, 56 | model_feature_lookups, 57 | label="HOURLY_CONSUMPTION_MW", 58 | exclude_columns=["CONSUMPTION_ID", "DATETIME"]) 59 | training_pd = training_set.load_df().toPandas() 60 | 61 | return training_set, training_pd 62 | 63 | training_set, train_df = load_data(f'{db}.{feauture_store}', train_labels, 'CONSUMPTION_ID', "DATETIME") 64 | _, test_df = load_data(f'{db}.{feauture_store}', test_labels, 'CONSUMPTION_ID', "DATETIME") 65 | #_, val_df = load_data(f'{db}.{feauture_store}', val_labels, 'CONSUMPTION_ID', "DATE") 66 | 67 | 68 | # COMMAND ---------- 69 | 70 | # MAGIC %md 71 | # MAGIC ## Split to Features/Labels 72 | 73 | # COMMAND ---------- 74 | 75 | X_train = train_df.drop(columns=['HOURLY_CONSUMPTION_MW']) 76 | y_train = train_df['HOURLY_CONSUMPTION_MW'] 77 | X_test = test_df.drop(columns=['HOURLY_CONSUMPTION_MW']) 78 | y_test = test_df['HOURLY_CONSUMPTION_MW'] 79 | 80 | # COMMAND ---------- 81 | 82 | # MAGIC %md 83 | # MAGIC ### Create XGB Regressor and Register Model to Feature Store 84 | 85 | # COMMAND ---------- 86 | 87 | # MAGIC %md 88 | # MAGIC * The regressor is configured with the following hyperparameters: 89 | # MAGIC * n_estimators: The number of trees in the ensemble (200). 90 | # MAGIC * max_depth: The maximum depth of each tree (8). 91 | # MAGIC * learning_rate: The step size shrinkage used in each boosting iteration (0.1). 92 | # MAGIC * objective: The loss function to be optimized, using squared error for regression ('reg:squarederror'). 93 | # MAGIC * booster: The type of booster to use, specifically the gradient boosting tree ('gbtree'). 94 | # MAGIC * subsample: The fraction of training samples used for training each tree (0.8). 95 | # MAGIC * colsample_bytree: The fraction of features used for training each tree (0.8). 96 | # MAGIC * random_state: The random seed used for reproducibility (42). 97 | 98 | # COMMAND ---------- 99 | 100 | def create_regressor(): 101 | return XGBRegressor( 102 | n_estimators=300, 103 | max_depth=8, 104 | learning_rate=0.1, 105 | objective='reg:squarederror', 106 | booster='gbtree', 107 | subsample=0.8, 108 | colsample_bytree=0.8, 109 | random_state=42, 110 | ) 111 | 112 | # COMMAND ---------- 113 | 114 | # MAGIC %md 115 | # MAGIC * mse (Mean Squared Error): It measures the average squared difference between the true and predicted values. 116 | # MAGIC * rmse (Root Mean Squared Error): It is the square root of the MSE, providing a more interpretable measure of the error. 117 | # MAGIC * mae (Mean Absolute Error): It calculates the average absolute difference between the true and predicted values. 118 | # MAGIC * r2 (R-squared): It indicates the proportion of the variance in the true values that is explained by the predicted values. 119 | # MAGIC The calculated metrics are returned as a tuple (mse, rmse, mae, r2). 
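For reference, the four metrics described above have the following standard definitions (a notational summary added for clarity, not part of the original notebook; n is the number of test samples, y_i the observed consumption, ŷ_i the model prediction, and ȳ the mean of the observed values):

```latex
\mathrm{MSE}  = \frac{1}{n}\sum_{i=1}^{n}\bigl(y_i-\hat{y}_i\bigr)^2, \qquad
\mathrm{RMSE} = \sqrt{\mathrm{MSE}}, \qquad
\mathrm{MAE}  = \frac{1}{n}\sum_{i=1}^{n}\bigl|y_i-\hat{y}_i\bigr|, \qquad
R^2 = 1 - \frac{\sum_{i=1}^{n}\bigl(y_i-\hat{y}_i\bigr)^2}{\sum_{i=1}^{n}\bigl(y_i-\bar{y}\bigr)^2}
```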
120 | 121 | # COMMAND ---------- 122 | 123 | def evaluate_model(y_test, y_pred): 124 | mse = mean_squared_error(y_test, y_pred) 125 | rmse = mean_squared_error(y_test, y_pred, squared=False) 126 | mae = mean_absolute_error(y_test, y_pred) 127 | r2 = r2_score(y_test, y_pred) 128 | return mse, rmse, mae, r2 129 | 130 | # COMMAND ---------- 131 | 132 | 133 | def log_metrics(mse, rmse, mae, r2, training_time): 134 | mlflow.log_metric("MAE", mae) 135 | mlflow.log_metric("MSE", mse) 136 | mlflow.log_metric("RMSE", rmse) 137 | mlflow.log_metric("R2", r2) 138 | mlflow.log_metric("Training Time(sec)", training_time) 139 | 140 | # COMMAND ---------- 141 | 142 | # MAGIC %md 143 | # MAGIC * It starts an MLflow run within a with statement to encapsulate the training and logging process. 144 | # MAGIC * An XGBoost regressor is created using the create_regressor() function and trained on the training data using the fit() method. The regressor's predictions are then calculated using the testing data. 145 | # MAGIC * The performance of the model is evaluated using the evaluate_model() function, which calculates metrics such as MSE, RMSE, MAE, and R2. 146 | # MAGIC * The input schema, model hyperparameters, metrics, feature importances, and other relevant information are logged using MLflow's tracking capabilities. 147 | # MAGIC * The trained model is logged as an artifact using the feature store's log_model() function, and various parameters and information are logged for comparison and tracking purposes. 148 | # MAGIC 149 | # MAGIC The purpose of this function is to train and log the model to feature store 150 | 151 | # COMMAND ---------- 152 | 153 | def train_model(X_train, X_test, y_train, y_test, training_set, fs, model_name, input_schema): 154 | experiment_id = experiment_id_training 155 | experiment = mlflow.get_experiment(experiment_id) 156 | 157 | if experiment: 158 | experiment_name = experiment.name 159 | mlflow.set_experiment(experiment_name) 160 | print(f"Active experiment set to '{experiment_name}'") 161 | else: 162 | print(f"No experiment found with name '{experiment_name}'") 163 | 164 | with mlflow.start_run(nested=True) as run: 165 | # Create and train XGBoost regressor 166 | reg = create_regressor() 167 | start_time = time.time() 168 | reg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=10, verbose=False) 169 | end_time = time.time() 170 | 171 | # Make predictions 172 | y_pred = reg.predict(X_test) 173 | 174 | # Evaluate the model 175 | mse, rmse, mae, r2 = evaluate_model(y_test, y_pred) 176 | 177 | # Log the model input schema 178 | input_schema = {"feature_names": list(X_train.columns)} 179 | mlflow.log_dict(input_schema, "input_schema.json") 180 | 181 | # Log some tags for the model 182 | tags = {"model_type": "XGBoost", "dataset": "energy_consumption","Workflow Type": "Initial Training"} 183 | mlflow.set_tags(tags) 184 | 185 | # Log some parameters for the model 186 | params = reg.get_params() 187 | mlflow.log_dict(params, "hyperparams.json") 188 | 189 | # Log metrics 190 | training_time = end_time - start_time 191 | log_metrics(mse, rmse, mae, r2, training_time) 192 | 193 | # Log the feature importances of the model 194 | importance = reg.get_booster().get_score(importance_type="gain") 195 | mlflow.log_dict(importance, "importance.json") 196 | # Log the model and its description as artifacts 197 | description = "This is an XGBoost model trained to predict energy consumption of 11 European Countries in hourly basis." 
198 | mlflow.log_text(description, "description.txt") 199 | 200 | # Log the current timestamp as the code version 201 | current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) 202 | mlflow.log_param("code_version", current_time) 203 | 204 | # Log additional important parameters for comparison 205 | mlflow.log_param("n_estimators", params["n_estimators"]) 206 | mlflow.log_param("max_depth", params["max_depth"]) 207 | mlflow.log_param("learning_rate", params["learning_rate"]) 208 | mlflow.log_param("subsample", params["subsample"]) 209 | mlflow.log_param("colsample_bytree", params["colsample_bytree"]) 210 | mlflow.log_param("random_state", params["random_state"]) 211 | # Log the training data size 212 | training_size = len(X_train) 213 | testing_size = len(X_test) 214 | training_range = { 215 | 'start': train_start, 216 | 'end': train_end 217 | } 218 | testing_range = { 219 | 'start': test_start, 220 | 'end': test_end 221 | } 222 | mlflow.log_param("training_range", training_range) 223 | mlflow.log_param("testing_range", testing_range) 224 | mlflow.log_param("training_data_size", training_size) 225 | mlflow.log_param("testing_data_size", testing_size) 226 | 227 | # Log the model 228 | fs.log_model( 229 | model=reg, 230 | artifact_path=f"{model_name}_artifact_path", 231 | flavor=mlflow.xgboost, 232 | training_set=training_set, 233 | registered_model_name=model_name 234 | ) 235 | 236 | return {"R2": r2, "MSE": mse, "RMSE": rmse, "MAE": mae, "Training Time(sec)": training_time} 237 | 238 | 239 | metrics = train_model(X_train, X_test, y_train, y_test, training_set, fs, model_name, input_schema) 240 | 241 | 242 | # COMMAND ---------- 243 | 244 | 245 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/04.Model Retraining Monthly(pyspark edition).py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Import Libraries 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Initial Deployment" 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ## Configuration 13 | 14 | # COMMAND ---------- 15 | 16 | config = spark.sql('select train_start, train_end, test_end from df_dev.config_retrain_monthly').collect() 17 | train_start, train_end, test_end = config[0] 18 | 19 | # train_start = '2013-06-01' #the retrain start date 20 | # train_end = '2018-06-20' #the retrain end date (20/06/2018 - 30/06/2018) 10 days for testing 21 | # test_end = '2018-06-30' 22 | # Convert start and end dates to datetime objects 23 | start_new_train_date = pd.to_datetime(validation_end) + pd.DateOffset(days=1) # 1 day after validation end 24 | end_new_train_date = pd.to_datetime(train_end) 25 | start_new_test_date = pd.to_datetime(train_end) + pd.DateOffset(days=1) # 1 day after train end 26 | end_new_test_date = pd.to_datetime(test_end) 27 | # Calculate the number of days between start and end dates 28 | num_new_train_days = (end_new_train_date - start_new_train_date).days 29 | num_new_test_days = (end_new_test_date - start_new_test_date).days 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md 34 | # MAGIC ##Load Datasets 35 | 36 | # COMMAND ---------- 37 | 38 | # Load Consumption Region Table 39 | consumption_regions_daily = spark.table(f'{db}.{consumption_regions_daily}') 40 | consumption_regions_daily = 
consumption_regions_daily.withColumn('CONSUMPTION_ID', concat(col('REGION'), lit('_'), col('DATE'))) 41 | consumption_regions_daily = consumption_regions_daily.withColumn('DATE', col('DATE').cast(DateType())) 42 | 43 | # Split the labels into training and test 44 | train_labels = consumption_regions_daily.filter((col('DATE') >= train_start) & (col('DATE') <= train_end)) 45 | test_labels = consumption_regions_daily.filter((col('DATE') > train_end) & (col('DATE') <= test_end)) 46 | #val_labels = consumption_regions_daily.filter((col('DATE') > test_end) & (col('DATE') <= validation_end)) 47 | 48 | # Select the required columns 49 | train_labels = train_labels.select("CONSUMPTION_ID", "DATE", "DAILY_CONSUMPTION_MW") 50 | test_labels = test_labels.select("CONSUMPTION_ID", "DATE", "DAILY_CONSUMPTION_MW") 51 | #val_labels = val_labels.select("CONSUMPTION_ID", "DATE", "DAILY_CONSUMPTION_MW") 52 | 53 | # COMMAND ---------- 54 | 55 | def load_data(table_name, labels, lookup_key, ts_lookup_key): 56 | # In the FeatureLookup, if you do not provide the `feature_names` parameter, all features except primary keys are returned 57 | model_feature_lookups = [FeatureLookup(table_name=table_name, lookup_key=lookup_key, timestamp_lookup_key=ts_lookup_key)] 58 | 59 | # fs.create_training_set looks up features in model_feature_lookups that match the primary key from inference_data_df 60 | training_set = fs.create_training_set(labels, 61 | model_feature_lookups, 62 | label="DAILY_CONSUMPTION_MW", 63 | exclude_columns=["CONSUMPTION_ID", "DATE"]) 64 | training_df = training_set.load_df() 65 | 66 | return training_set, training_df 67 | 68 | # Cast the 'DATE' column to 'TIMESTAMP' data type 69 | train_labels = train_labels.withColumn('DATE', col('DATE').cast(TimestampType())) 70 | test_labels = test_labels.withColumn('DATE', col('DATE').cast(TimestampType())) 71 | #val_labels = val_labels.withColumn('DATE', col('DATE').cast(TimestampType())) 72 | 73 | # Load the data for the training set 74 | training_set, train_df = load_data(f'{db}.forecasting_features_daily', train_labels, 'CONSUMPTION_ID', 'DATE') 75 | 76 | # Load the data for the test set 77 | _, test_df = load_data(f'{db}.forecasting_features_daily', test_labels, 'CONSUMPTION_ID', 'DATE') 78 | 79 | # Load the data for the validation set 80 | #_, val_df = load_data(f'{db}.forecasting_features_daily', val_labels, 'CONSUMPTION_ID', 'DATE') 81 | 82 | 83 | # COMMAND ---------- 84 | 85 | concatenated_df = train_df.union(test_df) 86 | display(concatenated_df) 87 | 88 | # COMMAND ---------- 89 | 90 | # MAGIC %md 91 | # MAGIC ## Data Drift Test 92 | 93 | # COMMAND ---------- 94 | 95 | # Convert year, month, and day columns to string and pad month and day with zeros 96 | train_df_str = train_df.withColumn("YEAR", col("YEAR").cast("string")) 97 | train_df_str = train_df_str.withColumn("MONTH", lpad(col("MONTH").cast("string"), 2, '0')) 98 | train_df_str = train_df_str.withColumn("DAY_OF_MONTH", lpad(col("DAY_OF_MONTH").cast("string"), 2, '0')) 99 | 100 | # Concatenate year, month, and day columns with '-' separator and convert to date 101 | date_df = train_df_str.withColumn( 102 | 'date', 103 | to_date(concat_ws('-', train_df_str["YEAR"], train_df_str["MONTH"], train_df_str["DAY_OF_MONTH"]), 'yyyy-MM-dd') 104 | ) 105 | 106 | # Extract the most recent num_new_train_days days of data 107 | max_date_row = date_df.agg(max_("date").alias("max_date")).first() 108 | max_date = max_date_row["max_date"] 109 | 110 | new_data = date_df.filter(col("date") >= date_sub(lit(max_date), 
num_new_train_days)) 111 | 112 | # Extract a random sample of num_new_train_days * 11 days data 113 | old_data = date_df.filter( 114 | col("date") < date_sub(lit(max_date), num_new_train_days) 115 | ).orderBy(rand()).limit(num_new_train_days * 11) 116 | 117 | # Concatenate the new and old data 118 | all_data = new_data.union(old_data) 119 | 120 | 121 | # COMMAND ---------- 122 | 123 | # Apply the ks_2samp test to each feature 124 | for feature_name in regions: 125 | old_feature_data = old_data.select(feature_name).rdd.flatMap(lambda x: x).collect() 126 | new_feature_data = new_data.select(feature_name).rdd.flatMap(lambda x: x).collect() 127 | 128 | _, p_value = ks_2samp(old_feature_data, new_feature_data) 129 | 130 | if p_value < 0.05: 131 | print(f"The distribution of {feature_name} has drifted.") 132 | else: 133 | print(f"The distribution of {feature_name} has not drifted.") 134 | 135 | 136 | # COMMAND ---------- 137 | 138 | # MAGIC %md 139 | # MAGIC ## Retrain the Machine Learning Pipeline 140 | 141 | # COMMAND ---------- 142 | 143 | # MAGIC %md 144 | # MAGIC * Create temporal views of these dataframes in order to be passed int the source notebook 145 | 146 | # COMMAND ---------- 147 | 148 | concatenated_df.createOrReplaceTempView("concatenated_df_view") 149 | train_df.createOrReplaceTempView("train_df_view") 150 | test_df.createOrReplaceTempView("test_df_view") 151 | 152 | # COMMAND ---------- 153 | 154 | # MAGIC %run "/Repos/CI ADO Repo/01.Develop/Utils/Train ML Pipeline" 155 | 156 | # COMMAND ---------- 157 | 158 | # MAGIC %md 159 | # MAGIC ### Register Retrained Model to MLflow 160 | 161 | # COMMAND ---------- 162 | 163 | 164 | with mlflow.start_run(nested=True) as run: 165 | 166 | experiment = mlflow.get_experiment(experiment_id_retraining) 167 | if experiment: 168 | experiment_name = experiment.name 169 | mlflow.set_experiment(experiment_name) 170 | print(f"Active experiment set to '{experiment_name}'") 171 | else: 172 | print(f"No experiment found with name '{experiment_name}'") 173 | 174 | # Define the output schema 175 | output_schema = sch.Schema([sch.ColSpec("float", "DAILY_CONSUMPTION_MW")]) 176 | 177 | # Create a model signature from the input and output schemas 178 | signature = ModelSignature(inputs=input_schema, outputs=output_schema) 179 | 180 | # Log the model input schema 181 | schema = {"input_schema": list(concatenated_df.columns[:-1]),"output_schema":concatenated_df.columns[-1]} 182 | mlflow.log_dict(schema, "schema.json") 183 | 184 | # Log some tags for the model 185 | mlflow.set_tags(tags) 186 | 187 | # Log some parameters for the model 188 | mlflow.log_dict(hyperparameters, "hyperparams.json") 189 | 190 | # Log the evaluation metrics as metrics 191 | mlflow.log_metric("MAE", mae) 192 | mlflow.log_metric("MSE", mse) 193 | mlflow.log_metric("RMSE", rmse) 194 | mlflow.log_metric("R2", r2) 195 | 196 | #Log the time taken to train as metric 197 | mlflow.log_metric("Training Time(sec)", training_time) 198 | 199 | # Log evaluation metrics as artifact 200 | metrics = {"R2": r2, "MSE": mse, "RMSE": rmse, 'MAE':mae,'Training Time(sec)':training_time} 201 | mlflow.log_dict(metrics, "metrics.json") 202 | 203 | # Log the model description as artifact 204 | mlflow.log_text(description, "description.txt") 205 | 206 | # Log the current timestamp as the code version 207 | mlflow.log_param("code_version", current_time) 208 | 209 | # Log additional important parameters for comparison 210 | mlflow.log_param("n_estimators", hyperparameters["n_estimators"]) 211 | 
mlflow.log_param("max_depth", hyperparameters["max_depth"]) 212 | mlflow.log_param("learning_rate", hyperparameters["learning_rate"]) 213 | mlflow.log_param("training_data_size", training_size) 214 | mlflow.log_param("testing_data_size", testing_size) 215 | 216 | # Log the model with its signature 217 | mlflow.spark.log_model(xgb_model, artifact_path="model", signature=signature,pip_requirements=pip_requirements) 218 | 219 | # Register the model with its signature 220 | model_uri = f"runs:/{mlflow.active_run().info.run_id}/model" 221 | mlflow.register_model(model_uri=model_uri, name="pyspark_mlflow_model") 222 | 223 | # Get the latest model version(The one that we now registered) 224 | client = MlflowClient() 225 | model_version = client.get_latest_versions("pyspark_mlflow_model")[0].version 226 | 227 | # Save your data to a new DBFS directory for each run 228 | data_path = f"dbfs:/FileStore/Data_Versioning/data_model_v{model_version}.parquet" 229 | concatenated_df.write.format("parquet").save(data_path) 230 | 231 | # Log the DBFS path as an artifact 232 | with open("data_path.txt", "w") as f: 233 | f.write(data_path) 234 | mlflow.log_artifact("data_path.txt") 235 | 236 | # COMMAND ---------- 237 | 238 | # MAGIC %md 239 | # MAGIC ## Model Staging 240 | 241 | # COMMAND ---------- 242 | 243 | def proceed_model_to_staging(): 244 | # Get the latest version of the registered model 245 | client = mlflow.tracking.MlflowClient() 246 | model_version = client.get_latest_versions(model_name, stages=["None"])[0].version 247 | 248 | # Define the endpoint URL 249 | endpoint_url = f"https://{databricks_instance}/api/2.0/mlflow/databricks/model-versions/transition-stage" 250 | 251 | stage = 'Staging' #Define the stage you want your model to transit 252 | comment = "Transitioning to staging environment after performance testing" 253 | headers = { "Authorization": "Bearer " + access_token } 254 | 255 | request_body = { 256 | "version": f"{model_version}", 257 | "name": model_name, 258 | "stage" : stage, #Specifies the environment we want to transit our model 259 | "archive_existing_versions": False, #Specifies whether to archive all current model versions in the target stage. 260 | "comment": comment 261 | } 262 | 263 | # Make the request 264 | response = requests.post(endpoint_url, headers=headers,json=request_body) 265 | 266 | # Check the response status code 267 | if response.status_code == 200: 268 | print("Model version transitioned to staging") 269 | else: 270 | print(f"Error transitioning model version to staging: {response.text}") 271 | 272 | 273 | # COMMAND ---------- 274 | 275 | all_tests_passed = True 276 | # run performance tests here 277 | if all_tests_passed: 278 | # proceed with model staging 279 | proceed_model_to_staging() 280 | else: 281 | print("Model performance tests failed. Model will not be staged.") 282 | 283 | 284 | # COMMAND ---------- 285 | 286 | 287 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Streamlining Energy Consumption Forecasting using MLOps 2 | 3 | This project focuses on streamlining the process of forecasting energy consumption by employing Machine Learning Operations (MLOps). It integrates data engineering, machine learning algorithms, and automation to create a scalable and efficient forecasting system. 
4 | 5 | [![Generic badge](https://img.shields.io/badge/Status-Complete-green.svg)](https://shields.io/) 6 | [![Generic badge](https://img.shields.io/badge/Databricks-Powered-blue.svg)](https://shields.io/) 7 | [![made-with-azure](https://img.shields.io/badge/Made%20with-Azure-1f425f.svg)](https://azure.microsoft.com/) 8 | [![made-with-databricks](https://img.shields.io/badge/Made%20with-Databricks-orange.svg)](https://www.databricks.com/) 9 | 10 | 11 | ## Table of Contents 12 | - [Introduction](#-introduction) 13 | - [Requirements](#️-requirements) 14 | - [Setup & Installation](#️-setup--installation) 15 | - [Aim of the Project](#-aim-of-the-project) 16 | - [Results and Findings](#-results-and-findings) 17 | - [Acknowledgments](#-acknowledgments) 18 | - [Contact](#-contact) 19 | - [Related Publication](#-related-publication) 20 | - [Citation](#-citation) 21 | 22 | ## 📌 Introduction 23 | The core objective of this project is to develop and orchestrate an automated pipeline for forecasting energy consumption across eleven European countries, namely Belgium, Denmark, France, Germany, Greece, Italy, Luxembourg, Netherlands, Spain, Sweden, and Switzerland. The pipeline is specifically tailored for processing hourly energy consumption data. 24 | 25 | This project is fully integrated within the Azure Cloud ecosystem and leverages the power and scalability of the Databricks platform. Utilizing these cutting-edge cloud technologies ensures that the pipeline is not only highly scalable but also incredibly efficient and reliable. 26 | 27 | Forecasting energy consumption is pivotal for European countries, as it plays an instrumental role in ensuring energy sustainability, optimizing power generation and distribution, and facilitating informed decision-making. By producing reliable and timely forecasts, this project empowers energy providers and stakeholders with insights that can lead to cost reductions, enhanced operational efficiencies, and the promotion of sustainable energy practices. 28 | 29 | The end goal is to establish a robust, scalable, and automated solution that provides precise forecasting of energy consumption. Through automating the forecasting process, we aim to keep up with the ever-evolving demands of the energy sector and contribute significantly to Europe’s broader economic and environmental objectives. 30 | 31 | 32 | ## 🛠️ Requirements 33 | 34 | ### Data Source 35 | This project utilizes data from the ENTSO-E Transparency Platform, which provides comprehensive information on the European electricity market. To access the dataset, you will need to create an account on the ENTSO-E Transparency Platform. Once you have an account, you can access and download the dataset required for this project. 36 | 37 | [Create an account on ENTSO-E Transparency Platform](https://keycloak-transparency.entsoe.eu/realms/tp/protocol/openid-connect/auth?response_type=code&client_id=tp-web&redirect_uri=https%3A%2F%2Ftransparency.entsoe.eu%2Fsso%2Flogin&state=7135aea4-5563-4a24-9fae-727dcee13294&login=true&scope=openid) 38 | 39 | ### Libraries and Dependencies 40 | This project is dependent on several libraries and frameworks. It's important to ensure that all of the necessary libraries are installed to be able to run the code seamlessly. 41 | 42 | You can install the required libraries using the 'requirements.txt' file included in the repository. 
Run the following command: 43 | 44 | ``` 45 | cd mlops-energy-forecast-thesis/MLOps Pipeline/Utils 46 | pip install -r requirements.txt 47 | ``` 48 | 49 | ### Azure and Databricks 50 | As the project is fully integrated with the Azure Cloud and utilizes the Databricks platform, you will need to have: 51 | 52 | * An active Azure subscription. 53 | * A Databricks workspace set up within Azure. 54 | 55 | ## ⚙️ Setup & Installation 56 | 57 | Follow these simplified steps to set up the project: 58 | 59 | 1. **Create Accounts**: Sign up for [Azure Cloud](https://azure.microsoft.com/), [Databricks](https://databricks.com/), and [ENTSO-E Transparency Platform](https://transparency.entsoe.eu/). 60 | 61 | 2. **Clone the Repository**: Clone this repository to your machine or Azure virtual machine. 62 | 63 | ```sh 64 | git clone https://github.com/Philippos01/mlops-energy-forecast-thesis.git 65 | ``` 66 | 3. **Install Requirements**: Navigate to the project directory and install the required libraries using the requirements.txt file. 67 | ``` 68 | cd mlops-energy-forecast-thesis/MLOps Pipeline/Utils 69 | pip install -r requirements.txt 70 | ``` 71 | 72 | 4. **Set Up Databricks**: Log in to Databricks, and create a new workspace. Within the workspace, create a new cluster and make sure that it's running. Import the project notebooks into your workspace. 73 | 74 | 5. **Configure Azure**: In your Azure account, create a resource group. Within this resource group, create a Databricks workspace (if you haven't already during the Databricks setup) and configure the necessary resources such as storage accounts, networking, etc. 76 | 76 | 6. **Download and Import Dataset**: Log in to the ENTSO-E Transparency Platform and download the dataset. Import this dataset into Databricks. 77 | 78 | 7. **Run Notebooks**: In Databricks, open the notebooks and attach them to the cluster you created earlier. Run the cells in sequence, making sure to input your API keys when prompted. 79 | 80 | 8. **Monitor with MLflow**: You can keep track of experiments, parameters, metrics, and artifacts using MLflow in Databricks. 81 | 82 | 9. **Deploy the Model**: After training and evaluating the model, follow the instructions in the documentation to deploy it for forecasting. 83 | 84 | 10. **Schedule Notebooks**: Optionally, you can schedule the notebooks to run periodically to automate data retrieval and model training. 85 | 86 | ## 🎯 Aim of the Project 87 | 88 | This project aims to implement a data-driven approach for forecasting energy consumption across 11 European countries (Belgium, Denmark, France, Germany, Greece, Italy, Luxembourg, Netherlands, Spain, Sweden, Switzerland) on an hourly basis using Azure Databricks. The technical steps encompassed in the project are as follows: 89 | 90 | * **Data Acquisition**: Download energy consumption data from the ENTSO-E Transparency Platform, an authoritative source for European energy market data. This project utilizes manual downloads and uploads to Databricks, but this process can be automated for future scalability. 91 | 92 | * **Data Processing and Feature Engineering**: Handle any missing or inconsistent data and engineer new features that might improve the performance of the forecasting models. This involves processing the raw data to format it appropriately for machine learning models. 93 | 94 | * **Model Building**: Develop forecasting models using machine learning algorithms such as XGBoost and LSTM (Long Short-Term Memory networks) to predict energy consumption patterns.
The choice of algorithms is based on their proven performance in time-series forecasting tasks. 95 | 96 | * **Model Evaluation**: Evaluate the performance of the forecasting models using metrics such as Mean Absolute Error (MAE), Mean Squared Error (MSE), Root Mean Squared Error (RMSE), and R-squared (R²). This helps in quantifying how well the models are performing. 97 | 98 | * **Deployment and Monitoring**: Save the chosen model in Feature Store and make it available for inference within Databricks. Incorporate monitoring tools to track the model’s performance over time and ensure it stays within acceptable limits. This approach facilitates a seamless integration within the Databricks ecosystem, enabling easy access and utilization of the model for forecasting purposes. 99 | 100 | * **Scalability and Performance**: Leverage the Azure cloud and Databricks platform to ensure that the implemented solution can handle large volumes of data efficiently. This enables the project to scale with the addition of new data or expansion to more countries. 101 | 102 | By successfully implementing these technical steps, this project contributes to the larger goal of enabling better energy management and planning through data-driven insights and forecasts. 103 | 104 | For a comprehensive and in-depth analysis of the project's objectives and how it achieves them, please refer to the detailed documentation: 105 | 106 | [📄 Read the Detailed Documentation](./DOCUMENTATION.md) 107 | 108 | ## 📈 Results and Findings 109 | 110 | This section presents the results and findings obtained through the energy consumption forecasting pipeline. The results are categorized into explanatory analysis, average hourly consumption analysis, model comparison, and evaluation metrics for the deployed model. 111 | 112 | ### Explanatory Analysis 113 | 114 | #### Daily Energy Consumption(e.g. Greece) 115 | 116 | Explanatory data analysis is essential for understanding the patterns and trends in the dataset. Below is a plot illustrating daily energy consumption in Greece. The plot reveals seasonality and trends in energy consumption, which are crucial for accurate forecasting. 117 | 118 | ![Greece Daily Energy Consumption](MLOps%20Pipeline/Utils/Images/newplot.png) 119 | *Daily Energy Consumption in Greece.* 120 | 121 | ### Average Hourly Consumption by Country and Hour of Day 122 | 123 | The plot below provides insights into average hourly energy consumption by country and hour of day. This is crucial to understand which countries consume more energy at different times of the day and can guide resource allocation and energy production planning. 124 | 125 | ![Average Hourly Consumption by Country and Hour of Day](MLOps%20Pipeline/Utils/Images/newplot%20(1).png) 126 | *Average Hourly Consumption by Country and Hour of Day.* 127 | 128 | ### Model Comparison: Daily Staging & Production Model Comparison for Greece 129 | 130 | To evaluate and select the best model for forecasting, we compared the daily staging and production models. 
The plot below illustrates how closely each model's predictions match the actual energy consumption data. (For the sake of the example, we illustrate data from Greece for the first week of April.) 131 | 132 | ![Daily Staging & Production Model Comparison for Greece](MLOps%20Pipeline/Utils/Images/newplot%20(5).png) 133 | *Daily Staging & Production Model Comparison for Greece over one week.* 134 | 135 | ### Evaluation Metrics for Deployed Model 136 | 137 | The current deployed model was evaluated based on various metrics such as Mean Squared Error (MSE), Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and R-squared (R²). These metrics provide a quantitative understanding of the model's performance in forecasting energy consumption. 138 | 139 | - **MSE**: 24742781.8 140 | - **MAE**: 1859.5 141 | - **RMSE**: 4974.2 142 | - **R²**: 0.994 143 | - **Training Time**: 134.2 sec 144 | 145 | These findings and insights are instrumental for utility companies, policy-makers, and consumers in making informed decisions regarding energy consumption, production, and resource allocation. 146 | 147 | ## 🙏 Acknowledgments 148 | 149 | This project was conducted as part of my thesis at the Athens University of Economics and Business, Department of Management Science and Technology. 150 | 151 | ## 👥 Contact 152 | 153 | If you have any questions or would like to discuss this project, feel free to reach out: 154 | 155 | - LinkedIn: [LinkedIn](https://www.linkedin.com/in/fpriovolos/) 156 | - Email: filippos.priovolos01@gmail.com 157 | 158 | ## 📝 Related Publication 159 | 160 | This project is also the subject of a research paper that combines a theoretical and empirical approach. The paper dives into the details of the MLOps methodologies, techniques, and analysis involved in forecasting energy consumption with Azure Databricks. 161 | 162 | - **Title**: "Streamlining MLOps for Energy Consumption Forecasting, A Case Study" 163 | - **Authors**: Filippos Priovolos 164 | If you use the content of this repository or the related paper in your research, please consider citing as shown in the citation section. 165 | 166 | 167 | ## 🧾 Citation 168 | 169 | If you use this project in your research or want to refer to it, please attribute it as follows: 170 | 171 | ```bibtex 172 | @misc{author2023energy, 173 | title={Streamlining MLOps for Energy Consumption Forecasting, A Case Study}, 174 | author={Filippos Priovolos}, 175 | year={2023}, 176 | } 177 | ``` 178 | 179 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/00.
Initial Deployment/03.Model Training(Pyspark Edition).py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Configuration 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Initial Deployment" 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ##Load Datasets 13 | 14 | # COMMAND ---------- 15 | 16 | train_start = '2015-01-01' 17 | train_end = '2021-12-31' 18 | test_start = '2022-01-01' 19 | test_end = '2023-01-01' 20 | 21 | # COMMAND ---------- 22 | 23 | # Load Consumption Region Table 24 | consumption_countries_hourly = spark.table(f'{db}.{consumption_countries_hourly}') 25 | 26 | # Update the key column construction in the PySpark code 27 | consumption_countries_hourly = consumption_countries_hourly.withColumn('CONSUMPTION_ID', concat(col('COUNTRY'), lit('_'), col('DATETIME').cast('string'))) 28 | 29 | # Split the labels into training and test 30 | train_labels = consumption_countries_hourly.filter((col('DATETIME') >= train_start) & (col('DATETIME') <= train_end)) 31 | test_labels = consumption_countries_hourly.filter((col('DATETIME') > test_start) & (col('DATETIME') <= test_end)) 32 | #val_labels = consumption_countries_hourly.filter((col('DATETIME') > test_end) & (col('DATETIME') <= validation_end)) 33 | 34 | # Select the required columns 35 | train_labels = train_labels.select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 36 | test_labels = test_labels.select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 37 | #val_labels = val_labels.select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 38 | 39 | # COMMAND ---------- 40 | 41 | display(train_labels) 42 | 43 | # COMMAND ---------- 44 | 45 | def load_data(table_name, labels, lookup_key, ts_lookup_key): 46 | # In the FeatureLookup, if you do not provide the `feature_names` parameter, all features except primary keys are returned 47 | model_feature_lookups = [FeatureLookup(table_name=table_name, lookup_key=lookup_key, timestamp_lookup_key=ts_lookup_key)] 48 | # fs.create_training_set looks up features in model_feature_lookups that match the primary key from inference_data_df 49 | training_set = fs.create_training_set(labels, 50 | model_feature_lookups, 51 | label="HOURLY_CONSUMPTION_MW", 52 | exclude_columns=["CONSUMPTION_ID", "DATETIME"]) 53 | training_df = training_set.load_df() 54 | 55 | return training_set, training_df 56 | 57 | # Cast the 'DATETIME' column to 'TIMESTAMP' data type 58 | train_labels = train_labels.withColumn('DATETIME', col('DATETIME').cast(TimestampType())) 59 | test_labels = test_labels.withColumn('DATETIME', col('DATETIME').cast(TimestampType())) 60 | #val_labels = val_labels.withColumn('DATETIME', col('DATETIME').cast(TimestampType())) 61 | 62 | # Load the data for the training set 63 | training_set, train_df = load_data(f'{db}.hourly_forecasting_features', train_labels, 'CONSUMPTION_ID', 'DATETIME') 64 | 65 | # Load the data for the test set 66 | _, test_df = load_data(f'{db}.hourly_forecasting_features', test_labels, 'CONSUMPTION_ID', 'DATETIME') 67 | 68 | # Load the data for the validation set 69 | #_, val_df = load_data(f'{db}.hourly_forecasting_features', val_labels, 'CONSUMPTION_ID', 'DATETIME') 70 | 71 | 72 | # COMMAND ---------- 73 | 74 | display(train_df) 75 | 76 | # COMMAND ---------- 77 | 78 | concatenated_df = train_df.union(test_df) 79 | display(concatenated_df) 80 | 
81 | # COMMAND ---------- 82 | 83 | # MAGIC %md 84 | # MAGIC ## Train the Machine Learning Pipeline 85 | 86 | # COMMAND ---------- 87 | 88 | # MAGIC %md 89 | # MAGIC ## Train the machine learning pipeline 90 | # MAGIC Now that we have reviewed the data and prepared it as a DataFrame with numeric values, we're ready to train a model to predict future energy consumption. 91 | # MAGIC 92 | # MAGIC MLlib pipelines combine multiple steps into a single workflow, making it easier to iterate as we develop the model. 93 | # MAGIC 94 | # MAGIC In this example, we create a pipeline using the following functions: 95 | # MAGIC 96 | # MAGIC * `VectorAssembler`: Assembles the feature columns into a feature vector. 97 | # MAGIC * `VectorIndexer`: Identifies columns that should be treated as categorical. This is done heuristically, identifying any column with a small number of distinct values as categorical. In this example, all the region columns are considered categorical(2 values) 98 | # MAGIC * `SparkXGBRegressor`: Uses the SparkXGBRegressor estimator to learn how to predict energy consumption from the feature vectors. 99 | # MAGIC * `CrossValidator`: The XGBoost regression algorithm has several hyperparameters. This notebook illustrates how to use hyperparameter tuning in Spark. This capability automatically tests a grid of hyperparameters and chooses the best resulting model. 100 | 101 | # COMMAND ---------- 102 | 103 | # MAGIC %md 104 | # MAGIC * The first step is to create the VectorAssembler and VectorIndexer steps. 105 | 106 | # COMMAND ---------- 107 | 108 | # Remove the target column from the input feature set. 109 | featuresCols = concatenated_df.columns 110 | featuresCols.remove('HOURLY_CONSUMPTION_MW') 111 | 112 | # vectorAssembler combines all feature columns into a single feature vector column, "rawFeatures". 113 | vectorAssembler = VectorAssembler(inputCols=featuresCols, outputCol="rawFeatures") 114 | 115 | # vectorIndexer identifies categorical features and indexes them, and creates a new column "features". 116 | vectorIndexer = VectorIndexer(inputCol="rawFeatures", outputCol="features", maxCategories=3) 117 | 118 | # COMMAND ---------- 119 | 120 | # MAGIC 121 | # MAGIC %md 122 | # MAGIC * Next, define the model. To use distributed training, set num_workers to the number of spark tasks you want to concurrently run during training xgboost model. 123 | 124 | # COMMAND ---------- 125 | 126 | # The next step is to define the model training stage of the pipeline. 127 | # The following command defines a XgboostRegressor model that takes an input column "features" by default and learns to predict the labels in the "cnt" column. 128 | # Set `num_workers` to the number of spark tasks you want to concurrently run during training xgboost model. 129 | xgb_regressor = SparkXGBRegressor(label_col="HOURLY_CONSUMPTION_MW") 130 | 131 | # COMMAND ---------- 132 | 133 | # MAGIC %md 134 | # MAGIC * The third step is to wrap the model you just defined in a CrossValidator stage. CrossValidator calls the XgboostRegressor estimator with different hyperparameter settings. It trains multiple models and selects the best one, based on minimizing a specified metric. In this example, the metric is root mean squared error (RMSE). 
135 | 136 | # COMMAND ---------- 137 | 138 | # Define a grid of hyperparameters to test: 139 | # - maxDepth: maximum depth of each decision tree 140 | # - maxIter: iterations, or the total number of trees 141 | paramGrid = ParamGridBuilder()\ 142 | .addGrid(xgb_regressor.max_depth, [8])\ 143 | .addGrid(xgb_regressor.n_estimators, [200])\ 144 | .addGrid(xgb_regressor.learning_rate, [0.1])\ 145 | .build() 146 | 147 | # Define an evaluation metric. The CrossValidator compares the true labels with predicted values for each combination of parameters, and calculates this value to determine the best model. 148 | evaluator = RegressionEvaluator(metricName="rmse", 149 | labelCol=xgb_regressor.getLabelCol(), 150 | predictionCol=xgb_regressor.getPredictionCol()) 151 | 152 | 153 | 154 | # COMMAND ---------- 155 | 156 | # MAGIC %md 157 | # MAGIC * Create the pipeline 158 | 159 | # COMMAND ---------- 160 | 161 | pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, xgb_regressor]) 162 | 163 | # COMMAND ---------- 164 | 165 | # Declare the CrossValidator, which performs the model tuning. 166 | cv = CrossValidator(estimator=pipeline, evaluator=evaluator, estimatorParamMaps=paramGrid) 167 | 168 | # COMMAND ---------- 169 | 170 | # MAGIC %md 171 | # MAGIC Train the pipeline: 172 | # MAGIC 173 | # MAGIC Now that we have set up the workflow, we can train the pipeline with a single call. 174 | # MAGIC When we call fit(), the pipeline runs feature processing, model tuning, and training and returns a fitted pipeline with the best model it found. This step takes several minutes. 175 | 176 | # COMMAND ---------- 177 | 178 | start_time = time.time() 179 | cvModel = cv.fit(train_df) 180 | end_time = time.time() 181 | # Retrieve best model in the pipeline 182 | xgb_model = cvModel.bestModel.stages[-1] 183 | 184 | # COMMAND ---------- 185 | 186 | # MAGIC %md 187 | # MAGIC Make predictions and evaluate results: 188 | # MAGIC 189 | # MAGIC The final step is to use the fitted model to make predictions on the test dataset and evaluate the model's performance. The model's performance on the test dataset provides an approximation of how it is likely to perform on new data. 190 | # MAGIC 191 | # MAGIC Computing evaluation metrics is important for understanding the quality of predictions, as well as for comparing models and tuning parameters. 192 | 193 | # COMMAND ---------- 194 | 195 | # MAGIC %md 196 | # MAGIC The `transform()` method of the pipeline model applies the full pipeline to the input dataset. The pipeline applies the feature processing steps to the dataset and then uses the fitted Xgboost Regressor model to make predictions. The pipeline returns a DataFrame with a new column predictions. 197 | 198 | # COMMAND ---------- 199 | 200 | predictions = cvModel.transform(test_df) 201 | 202 | # COMMAND ---------- 203 | 204 | # MAGIC %md 205 | # MAGIC A common way to evaluate the performance of a regression model is the calculate the root mean squared error (RMSE). The value is not very informative on its own, but you can use it to compare different models. `CrossValidator` determines the best model by selecting the one that minimizes RMSE. 
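# MAGIC For reference, a minimal statement of the metric being minimized (this is the standard definition, not anything project-specific): for $n$ test observations with actual consumption $y_i$ and predicted consumption $\hat{y}_i$,
# MAGIC $$\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2}$$
# MAGIC Because the errors are squared before averaging, RMSE penalizes large deviations more heavily than MAE, which makes it a reasonable selection criterion when large forecasting errors are particularly costly.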
206 | 207 | # COMMAND ---------- 208 | 209 | display(predictions.select("HOURLY_CONSUMPTION_MW", "prediction", *featuresCols)) 210 | 211 | # COMMAND ---------- 212 | 213 | rmse = evaluator.evaluate(predictions) 214 | print("RMSE on our test set:", rmse) 215 | 216 | # COMMAND ---------- 217 | 218 | display(predictions.select("HOURLY_CONSUMPTION_MW", "prediction")) 219 | 220 | # COMMAND ---------- 221 | 222 | # MAGIC %md 223 | # MAGIC ## Define Metrics/Parameters to be logged 224 | 225 | # COMMAND ---------- 226 | 227 | # Metrcis 228 | mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"}) 229 | mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"}) 230 | rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"}) 231 | r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"}) 232 | 233 | #Hyperparameters 234 | 235 | # Get the index of the best model 236 | best_model_index = cvModel.avgMetrics.index(min(cvModel.avgMetrics)) 237 | 238 | # Get the parameters of the best model 239 | best_model_params = cvModel.getEstimatorParamMaps()[best_model_index] 240 | 241 | # Store the parameters in a dictionary 242 | hyperparameters = {} 243 | 244 | # Loop over the parameters and store them in the dictionary 245 | for param, value in best_model_params.items(): 246 | hyperparameters[param.name] = value 247 | 248 | #Model Training Time 249 | training_time = end_time - start_time 250 | 251 | #Model Training/Testing Data Size 252 | training_size = train_df.count() 253 | testing_size = test_df.count() 254 | 255 | #Current Time 256 | current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) 257 | 258 | #Description 259 | description = "The logged model is an XGBoost regressor that has been trained to predict DAILY_CONSUMPTION_MW based on various input features. It performs well in accurately estimating energy consumption. The model takes into account important factors and patterns present in the data to make reliable predictions. 
It has been fine-tuned using cross-validation and optimized hyperparameters to ensure its effectiveness" 260 | 261 | #Model Tags 262 | tags = { 263 | "model_type": "XGBoost Regressor", 264 | "dataset": "Energy Consumption", 265 | "application": "Energy Management", 266 | "framework": "PySpark" 267 | } 268 | 269 | # COMMAND ---------- 270 | 271 | # MAGIC %md 272 | # MAGIC ### Register Model to MLflow 273 | 274 | # COMMAND ---------- 275 | 276 | with mlflow.start_run(nested=True) as run: 277 | 278 | # Log the model input schema 279 | schema = {"input_schema": list(concatenated_df.columns[:-1]),"output_schema":concatenated_df.columns[-1]} 280 | mlflow.log_dict(schema, "schema.json") 281 | 282 | # Log some tags for the model 283 | mlflow.set_tags(tags) 284 | 285 | # Log some parameters for the model 286 | mlflow.log_dict(hyperparameters, "hyperparams.json") 287 | 288 | # Log the evaluation metrics as metrics 289 | mlflow.log_metric("MAE", mae) 290 | mlflow.log_metric("MSE", mse) 291 | mlflow.log_metric("RMSE", rmse) 292 | mlflow.log_metric("R2", r2) 293 | 294 | #Log the time taken to train as metric 295 | mlflow.log_metric("Training Time(sec)", training_time) 296 | 297 | # Log evaluation metrics as artifact 298 | metrics = {"R2": r2, "MSE": mse, "RMSE": rmse, 'MAE':mae,'Training Time(sec)':training_time} 299 | mlflow.log_dict(metrics, "metrics.json") 300 | 301 | # Log the model description as artifact 302 | mlflow.log_text(description, "description.txt") 303 | 304 | # Log the current timestamp as the code version 305 | mlflow.log_param("code_version", current_time) 306 | 307 | # Log additional important parameters for comparison 308 | mlflow.log_param("n_estimators", hyperparameters["n_estimators"]) 309 | mlflow.log_param("max_depth", hyperparameters["max_depth"]) 310 | mlflow.log_param("learning_rate", hyperparameters["learning_rate"]) 311 | mlflow.log_param("training_data_size", training_size) 312 | mlflow.log_param("testing_data_size", testing_size) 313 | 314 | # Log the model with its signature 315 | mlflow.spark.log_model(xgb_model, artifact_path="model", signature=signature,pip_requirements=pip_requirements) 316 | 317 | # Register the model with its signature 318 | model_uri = f"runs:/{mlflow.active_run().info.run_id}/model" 319 | mlflow.register_model(model_uri=model_uri, name="pyspark_mlflow_model") 320 | 321 | # Get the latest model version(The one that we now registered) 322 | client = MlflowClient() 323 | # Search for all versions of the registered model 324 | versions = client.search_model_versions("name='pyspark_mlflow_model'") 325 | # Sort the versions by creation timestamp in descending order 326 | sorted_versions = sorted(versions, key=lambda v: v.creation_timestamp, reverse=True) 327 | # Get the latest version 328 | latest_version = sorted_versions[0] 329 | # Access the version number 330 | model_version = latest_version.version 331 | 332 | # Save your data to a new DBFS directory for each run 333 | data_path = f"dbfs:/FileStore/Data_Versioning/data_model_v{model_version}.parquet" 334 | concatenated_df.write.format("parquet").save(data_path) 335 | 336 | # Log the DBFS path as an artifact 337 | with open("data_path.txt", "w") as f: 338 | f.write(data_path) 339 | mlflow.log_artifact("data_path.txt") 340 | 341 | # COMMAND ---------- 342 | 343 | 344 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/01.Feature Engineering.py: 
-------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Daily Inference" 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC * Import inference data and convert them to timestamp 8 | 9 | # COMMAND ---------- 10 | 11 | from datetime import datetime 12 | from pyspark.sql.functions import hour, when 13 | 14 | # Convert string dates to timestamps 15 | df = spark.sql(f""" 16 | SELECT 17 | TO_TIMESTAMP('{yesterdate}', 'yyyy-MM-dd') AS yesterdate_ts, 18 | TO_TIMESTAMP('{date}', 'yyyy-MM-dd') AS date_ts 19 | """) 20 | 21 | 22 | 23 | # COMMAND ---------- 24 | 25 | # MAGIC %md 26 | # MAGIC * Define the date to be predicted as date_to_predict, which is to be extracted from the index of the DataFrame df 27 | # MAGIC * Read data from the table "final_consumption_countries_hourly" 28 | # MAGIC * Filter the df_cons DataFrame to retain only rows from the last 720 hours (30 days) prior to the date_to_predict 29 | # MAGIC * Convert the resulting Spark DataFrame df_cons to a Pandas DataFrame 30 | 31 | # COMMAND ---------- 32 | 33 | # Select the date portion of the timestamp and convert it to string format 34 | date_df = df.select(date_format("date_ts", "yyyy-MM-dd").alias("date_string")) 35 | 36 | # Extract the date string from the DataFrame 37 | date_to_predict = date_df.first()["date_string"] 38 | # Read data from the table 39 | df_cons = spark.read.table("final_consumption_countries_hourly") 40 | 41 | # Before filtering 42 | print("Before filtering:") 43 | print(df_cons.select("DATETIME").agg({"DATETIME": "min"}).collect()[0]) 44 | print(df_cons.select("DATETIME").agg({"DATETIME": "max"}).collect()[0]) 45 | 46 | # Filter the data to include only rows from the exact previous month 47 | df_cons = df_cons.filter( 48 | (col("DATETIME") < to_date(lit(date_to_predict))) & 49 | (col("DATETIME") >= add_months(to_date(lit(date_to_predict)), -1)) 50 | ) 51 | 52 | # After filtering 53 | print("After filtering:") 54 | print(df_cons.select("DATETIME").agg({"DATETIME": "min"}).collect()[0]) 55 | print(df_cons.select("DATETIME").agg({"DATETIME": "max"}).collect()[0]) 56 | 57 | # Convert Spark DataFrame to pandas DataFrame 58 | df_cons = df_cons.toPandas() 59 | 60 | # Sort 'final_consumption' dataframe by 'DATETIME' and 'COUNTRY' 61 | df_cons.sort_values(by=['DATETIME', 'COUNTRY'], inplace=True) 62 | 63 | # Display DataFrame 64 | df_cons 65 | 66 | # COMMAND ---------- 67 | 68 | # MAGIC %md 69 | # MAGIC * Print earliest and latest timestamp to validate dates 70 | 71 | # COMMAND ---------- 72 | 73 | # Display min and max 'DATETIME' 74 | print("Earliest timestamp:", df_cons['DATETIME'].min()) 75 | print("Latest timestamp:", df_cons['DATETIME'].max()) 76 | 77 | # COMMAND ---------- 78 | 79 | # MAGIC %md 80 | # MAGIC * Create a Dataframe with 24x11 rows each, timestamp for each country will have value 1, others will have value 0(one-hot-encoding) 81 | 82 | # COMMAND ---------- 83 | 84 | # Create an array with 24 hours 85 | hours = list(range(24)) 86 | 87 | # Create a DataFrame with 24x11 rows, each timestamp for each country will have value 1, others will have value 0 88 | data = [] 89 | for hour in hours: 90 | for country in countries: 91 | timestamp_str = f"{date} {str(hour).zfill(2)}:00:00" 92 | row = [timestamp_str] + [1 if c == country else 0 for c in countries] 93 | data.append(Row(*row)) 94 | 95 | # Define column names 96 | columns 
= ['DATETIME'] + countries 97 | 98 | # Create DataFrame 99 | df = spark.createDataFrame(data, columns) 100 | 101 | # Convert string timestamp to actual timestamp 102 | df = df.withColumn("DATETIME", expr(f"to_timestamp(DATETIME, 'yyyy-MM-dd HH:mm:ss')")) 103 | 104 | df = df.toPandas() 105 | df 106 | 107 | 108 | # COMMAND ---------- 109 | 110 | # MAGIC %md 111 | # MAGIC The given code transforms df from wide to long format by melting it, filters the rows to include only those with a value of 1, and then drops the "VALUE" column. This reshaping makes the one-hot-encoded categorical (country) variables easier to work with in the subsequent steps. 112 | 113 | # COMMAND ---------- 114 | 115 | df_melted = df.melt(id_vars='DATETIME', var_name='COUNTRY', value_name='VALUE') 116 | df_melted = df_melted[df_melted['VALUE'] == 1] 117 | df_melted = df_melted.drop('VALUE', axis=1) 118 | df_melted 119 | 120 | # COMMAND ---------- 121 | 122 | df_combined = pd.concat([df_melted, df_cons], axis=0) 123 | 124 | # sort the resulting dataframe by 'DATETIME' 125 | df_combined = df_combined.sort_values('DATETIME') 126 | df_combined 127 | 128 | # COMMAND ---------- 129 | 130 | # MAGIC %md 131 | # MAGIC * The function create_lag_features takes in a DataFrame df with columns 'COUNTRY', 'DATETIME', and 'HOURLY_CONSUMPTION_MW'. 132 | # MAGIC * It sorts df based on 'COUNTRY' and 'DATETIME'. 133 | # MAGIC * Creates lag features for the previous day, week, and month's consumption. 134 | # MAGIC * Forward fills NaN values in 'HOURLY_CONSUMPTION_MW' for each country. 135 | # MAGIC * Calculates rolling statistics (mean, standard deviation, sum) for the past 24 hours and 7 days. 136 | # MAGIC * Backward fills NaN values in the lag features for each country. 137 | # MAGIC * Returns the modified DataFrame with new lag and rolling features.
138 | 139 | # COMMAND ---------- 140 | 141 | def create_lag_features(df): 142 | """ 143 | Creates lag features from datetime index 144 | """ 145 | df = df.sort_values(['COUNTRY', 'DATETIME']).reset_index(drop=True) # Sort by 'COUNTRY' and 'DATETIME' and reset index 146 | # Group by country and shift to create lagged features 147 | df['PREV_DAY_CONSUMPTION'] = df.groupby('COUNTRY')['HOURLY_CONSUMPTION_MW'].shift(24) 148 | df['PREV_WEEK_CONSUMPTION'] = df.groupby('COUNTRY')['HOURLY_CONSUMPTION_MW'].shift(24 * 7) 149 | df['PREVIOUS_MONTH_CONSUMPTION'] = df.groupby('COUNTRY')['HOURLY_CONSUMPTION_MW'].shift(24 * 30) 150 | 151 | # Forward fill to handle NaN values in HOURLY_CONSUMPTION_MW for rolling window calculations 152 | df['HOURLY_CONSUMPTION_MW'] = df.groupby('COUNTRY')['HOURLY_CONSUMPTION_MW'].fillna(method='ffill') 153 | 154 | # Calculate rolling statistics for each country 155 | df['ROLLING_MEAN_24H'] = df.groupby('COUNTRY')['HOURLY_CONSUMPTION_MW'].rolling(window=24,min_periods=1).mean().reset_index(0,drop=True) 156 | df['ROLLING_STD_24H'] = df.groupby('COUNTRY')['HOURLY_CONSUMPTION_MW'].rolling(window=24,min_periods=1).std().reset_index(0,drop=True) 157 | df['ROLLING_SUM_7D'] = df.groupby('COUNTRY')['HOURLY_CONSUMPTION_MW'].rolling(window=7 * 24, min_periods=1).sum().reset_index(0,drop=True) 158 | 159 | # Backward fill only the rows that end up as null after shifting for each country 160 | df['PREV_DAY_CONSUMPTION'] = df.groupby('COUNTRY')['PREV_DAY_CONSUMPTION'].fillna(method='bfill') 161 | df['PREV_WEEK_CONSUMPTION'] = df.groupby('COUNTRY')['PREV_WEEK_CONSUMPTION'].fillna(method='bfill') 162 | df['PREVIOUS_MONTH_CONSUMPTION'] = df.groupby('COUNTRY')['PREVIOUS_MONTH_CONSUMPTION'].fillna(method='bfill') 163 | 164 | return df 165 | 166 | df_combined = create_lag_features(df_combined) 167 | 168 | 169 | # COMMAND ---------- 170 | 171 | df_combined 172 | 173 | # COMMAND ---------- 174 | 175 | # MAGIC %md 176 | # MAGIC The given code converts a variable to datetime format, extracts rows from a DataFrame based on a specific date, and drops a specified column from the resulting DataFrame. This allows for working with a subset of data for a specific date and removing unnecessary columns for further analysis or processing. 177 | 178 | # COMMAND ---------- 179 | 180 | # Convert your predicting_date to datetime format 181 | date_to_predict = pd.to_datetime(date_to_predict) 182 | 183 | # Extract the date from the 'DATETIME' column, compare it to predicting_date 184 | df_final = df_combined[df_combined['DATETIME'].dt.date == date_to_predict.date()] 185 | df_final.drop(columns=['HOURLY_CONSUMPTION_MW'],inplace=True) 186 | 187 | # COMMAND ---------- 188 | 189 | df_final 190 | 191 | # COMMAND ---------- 192 | 193 | # MAGIC %md 194 | # MAGIC * The function create_time_features takes a DataFrame df with 'DATETIME' as one of its columns. 195 | # MAGIC * Sets 'DATETIME' as the index of the DataFrame. 196 | # MAGIC * Extracts and creates new features such as 'HOUR', 'DAY_OF_WEEK', 'MONTH', 'QUARTER', 'YEAR', 'DAY_OF_YEAR', 'DAY_OF_MONTH', and 'WEEK_OF_YEAR' from the 'DATETIME' index. 197 | # MAGIC * Sorts the DataFrame based on the 'DATETIME' index. 198 | # MAGIC * Returns the modified DataFrame with the new time-related features. 
199 | 200 | # COMMAND ---------- 201 | 202 | def create_time_features(df): 203 | """ 204 | Creates time series features from datetime index 205 | """ 206 | # Ensure 'DATETIME' is the index 207 | df.set_index('DATETIME', inplace=True) 208 | 209 | # Create date-related features 210 | df['HOUR'] = df.index.hour 211 | df['DAY_OF_WEEK'] = df.index.dayofweek 212 | df['MONTH'] = df.index.month 213 | df['QUARTER'] = df.index.quarter 214 | df['YEAR'] = df.index.year 215 | df['DAY_OF_YEAR'] = df.index.dayofyear 216 | df['DAY_OF_MONTH'] = df.index.day 217 | df['WEEK_OF_YEAR'] = df.index.isocalendar().week 218 | 219 | # Sort the DataFrame by the datetime index 220 | df.sort_index(inplace=True) 221 | 222 | return df 223 | 224 | df_final = create_time_features(df_final) 225 | df_final 226 | 227 | # COMMAND ---------- 228 | 229 | df_final 230 | 231 | # COMMAND ---------- 232 | 233 | # MAGIC %md 234 | # MAGIC * The function one_hot_encode takes a DataFrame df with 'COUNTRY' as one of its columns. 235 | # MAGIC * Defines a list of country names to be one-hot encoded. 236 | # MAGIC * Iterates through each country in the list: 237 | # MAGIC * For each country, it creates a new column in the DataFrame, named after the country. 238 | # MAGIC * Each entry in the new column is set to 1 if the 'COUNTRY' column matches the country name, otherwise it's set to 0. 239 | # MAGIC * Returns the modified DataFrame with new one-hot encoded columns for countries. 240 | 241 | # COMMAND ---------- 242 | 243 | def one_hot_encode(df): 244 | countries = ['belgium','denmark','france','germany','greece','italy','luxembourg','netherlands','spain','sweden','switzerland'] 245 | countries.sort() 246 | for country in countries: 247 | df[country] = df.apply(lambda row: 1 if row['COUNTRY'] == country else 0, axis=1) 248 | return df 249 | 250 | df_final = one_hot_encode(df_final) 251 | df_final = df_final.reset_index() 252 | df_final 253 | 254 | # COMMAND ---------- 255 | 256 | df_final.columns 257 | 258 | # COMMAND ---------- 259 | 260 | df_final[df_final['COUNTRY']=="greece"] 261 | 262 | # COMMAND ---------- 263 | 264 | # MAGIC %md 265 | # MAGIC * The code converts a Pandas DataFrame (df_final) to a Spark DataFrame (spark_df). 266 | # MAGIC * The Spark DataFrame is saved as a table in Databricks using the name specified in table_name. 267 | # MAGIC * A Delta table is created based on the Spark DataFrame. 268 | # MAGIC * The purpose is to store the data in a table format in Databricks, facilitating further analysis and querying using Spark SQL. 269 | 270 | # COMMAND ---------- 271 | 272 | spark_df = spark.createDataFrame(df_final) 273 | # Save the Spark DataFrame as a table in Databricks 274 | table_name = 'inferenece_features' 275 | spark_df.createOrReplaceTempView(table_name) 276 | spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} USING delta AS SELECT * FROM {table_name}") 277 | 278 | # COMMAND ---------- 279 | 280 | df_final.columns 281 | 282 | # COMMAND ---------- 283 | 284 | # MAGIC %md 285 | # MAGIC * Create a Spark DataFrame: The Pandas DataFrame df_final is converted to a Spark DataFrame using the spark.createDataFrame() function. This conversion allows for working with the DataFrame using Spark's distributed computing capabilities. 286 | # MAGIC * Create a temporary view: The Spark DataFrame df_final_spark is assigned as a temporary view named 'df_final' using the createOrReplaceTempView() function. 
This creates a temporary view of the DataFrame within the Spark session, enabling the execution of SQL queries and operations on the DataFrame. 287 | 288 | # COMMAND ---------- 289 | 290 | df_final_spark=spark.createDataFrame(df_final) 291 | df_final_spark.createOrReplaceTempView('df_final') 292 | 293 | # COMMAND ---------- 294 | 295 | # MAGIC %md 296 | # MAGIC * SQL Query: It executes an SQL query to select all columns from the 'df_final' table/view. 297 | # MAGIC * Add CONSUMPTION_ID column: Using the withColumn() function, a new column named 'CONSUMPTION_ID'(Primary Key) is added to the DataFrame. The values in this column are created by concatenating 'COUNTRY' and 'DATETIME' columns with an underscore ('_') separator. 298 | # MAGIC * Convert DATETIME column: Using the withColumn() function again, the 'DATETIME' column is converted to a timestamp data type by casting it with the CAST() function. 299 | # MAGIC * Drop COUNTRY column: The 'COUNTRY' column is dropped from the DataFrame using the drop() function. 300 | # MAGIC * Create temporary view: Finally, the modified DataFrame is used to create a new temporary view named 'daily_features'. 301 | 302 | # COMMAND ---------- 303 | 304 | import pyspark.sql.functions as f 305 | 306 | spark.sql('select * from df_final') \ 307 | .withColumn('CONSUMPTION_ID', f.expr('concat_ws("_", COUNTRY, DATETIME)')) \ 308 | .withColumn('DATETIME', f.expr('CAST(DATETIME AS timestamp)')) \ 309 | .drop('COUNTRY').createOrReplaceTempView('daily_features') 310 | 311 | # COMMAND ---------- 312 | 313 | # MAGIC %md 314 | # MAGIC * The variable columns is assigned the list of column names from a table named daily_features retrieved through a Spark SQL query. 315 | # MAGIC * feature_columns is created by taking all column names in columns except for 'DATETIME' and 'CONSUMPTION_ID'. 316 | # MAGIC * update_columns is a string created by joining elements from feature_columns with ' = B.' prefixed to each element and separated by commas. * This could be used in an SQL UPDATE statement. 317 | # MAGIC * insert_columns is a string created by joining 'B.' prefixed elements from feature_columns, separated by commas. This could be used in an SQL INSERT statement. 318 | 319 | # COMMAND ---------- 320 | 321 | columns = spark.sql('select * from daily_features').columns 322 | feature_columns = [column for column in columns if column not in ('DATETIME', 'CONSUMPTION_ID')] 323 | update_columns = ', '.join([f'{column} = B.{column}' for column in feature_columns]) 324 | insert_columns = ', '.join([f'B.{column}' for column in feature_columns]) 325 | 326 | # COMMAND ---------- 327 | 328 | # MAGIC %md 329 | # MAGIC * The query is merging data from a table named daily_features (aliased as B) into another table called hourly_forecasting_features (aliased as A). 330 | # MAGIC * The merge is based on the condition that the 'DATETIME' and 'CONSUMPTION_ID' columns in both tables must be equal (A.DATETIME = B.DATETIME AND A.CONSUMPTION_ID = B.CONSUMPTION_ID). 331 | # MAGIC * If there is a match between the records in tables A and B (based on 'DATETIME' and 'CONSUMPTION_ID'), then the corresponding records in table A are updated with the values from table B. The columns to be updated are defined by the string update_columns, which was created earlier to have the form column1 = B.column1, column2 = B.column2, .... 332 | # MAGIC * If there is no match between the records in table A and B, then a new record is inserted into table A with values from table B. 
The columns that will be inserted are 'DATETIME', 'CONSUMPTION_ID', and the additional feature columns. The columns to be inserted are defined in the format (column1, column2, ...) and the values to be inserted are in the format (B.column1, B.column2, ...). 333 | 334 | # COMMAND ---------- 335 | 336 | spark.sql(f""" 337 | MERGE INTO hourly_forecasting_features A 338 | USING daily_features B 339 | ON A.DATETIME = B.DATETIME AND A.CONSUMPTION_ID = B.CONSUMPTION_ID 340 | WHEN MATCHED THEN 341 | UPDATE SET 342 | {update_columns} 343 | WHEN NOT MATCHED 344 | THEN INSERT ( 345 | DATETIME, 346 | CONSUMPTION_ID, 347 | {', '.join(feature_columns)} 348 | ) VALUES ( 349 | B.DATETIME, 350 | B.CONSUMPTION_ID, 351 | {insert_columns} 352 | ) 353 | """) 354 | 355 | # COMMAND ---------- 356 | 357 | 358 | -------------------------------------------------------------------------------- /DOCUMENTATION.md: -------------------------------------------------------------------------------- 1 | # Project Documentation: Streamlining Energy Consumption Forecasting using MLOps 2 | 3 | ![Databricks Badge](https://img.shields.io/badge/Databricks-FF3621?style=for-the-badge&logo=databricks&logoColor=white) 4 | ![Azure Badge](https://img.shields.io/badge/Microsoft_Azure-0089D6?style=for-the-badge&logo=microsoft-azure&logoColor=white) 5 | ![MLflow Badge](https://img.shields.io/badge/MLflow-FF3621?style=for-the-badge&logo=mlflow&logoColor=white) 6 | ![Python Badge](https://img.shields.io/badge/Python-3776AB?style=for-the-badge&logo=python&logoColor=white) 7 | 8 | 9 | ## 📊 Data Source 10 | 11 | The data used in this project is sourced from the [ENTSO-E Transparency Platform](https://transparency.entsoe.eu/). ENTSO-E stands for European Network of Transmission System Operators for Electricity. 12 | It is an organization that brings together 42 electricity transmission system operators (TSOs) from 35 countries across Europe. ENTSO-E plays a crucial role in coordinating TSOs and facilitating the European electricity market. 13 | The platform provides authoritative, comprehensive, and transparent energy market data for European countries. 14 | 15 | For this project, hourly energy consumption data from 2015 to 2022 has been used. Additionally, some new data from 2023 has also been retrieved for testing purposes. The dataset includes hourly energy consumption data across 11 selected European countries (Belgium, Denmark, France, Germany, Greece, Italy, Luxembourg, Netherlands, Spain, Sweden, Switzerland). 16 | 17 | 18 | # MLOps Pipeline 19 | 20 | ## 🛠️ Data Engineering 21 | 22 | ### Data Ingestion: 23 | In the data ingestion stage, raw energy consumption data is collected from the ENTSO-E Transparency Platform. The data is downloaded and uploaded into Azure Databricks for processing. 24 | It is important to ensure that data ingestion is efficient and reliable, as it forms the foundation for the subsequent steps in the pipeline. 25 | 26 | ### Data Transformation: 27 | After ingestion, the data undergoes various transformations to convert it into a format that is suitable for analysis and modeling. 28 | This includes converting timestamps to a standardized format, aggregating data, and reshaping datasets. 29 | In this project, hourly energy consumption data is used; however, it is possible to aggregate this data for different time windows (e.g., daily) based on the requirements. 30 | 31 | ### Data Quality Checks: 32 | 33 | Quality checks are essential to ensure the integrity and completeness of the data. 
This includes handling missing values, identifying and rectifying any inconsistencies in the data, and ensuring that it meets the required standards for analysis. 34 | 35 | In this project, the primary tool utilized for this purpose is [Great Expectations](https://greatexpectations.io/), an open-source library for setting, validating, and documenting data expectations. 36 | 37 | Great Expectations was instrumental in defining expectations for data, which serve as automated assertions about data quality that are easy to implement and maintain. If any data does not meet these predefined expectations, the system alerts us, thereby ensuring that any decision made based on the data is as accurate as possible. 38 | 39 | For an example of the data quality reports produced by Great Expectations in this project, see the links below: 40 | 41 | #### Links to Data Quality report files: 42 | 43 | - [**Data Quality Expectations**](https://philippos01.github.io/mlops-energy-forecast-thesis/MLOps%20Pipeline/Utils/Great%20Expectations/my_expectation_suite.html) 44 | - [**Data Quality Validation**](https://philippos01.github.io/mlops-energy-forecast-thesis/MLOps%20Pipeline/Utils/Great%20Expectations/fde64798683368bcaf8fe113b0dd4b14.html) 45 | 46 | 47 | ## 🚀 Initial Deployment 48 | 49 | This section describes the critical steps undertaken during the initial deployment phase of the MLOps pipeline. The pipeline consists of an exploratory data analysis, feature engineering, model training, and unit testing. 50 | 51 | ### 🕵️‍♂️ Exploratory Data Analysis (EDA): 52 | Before diving into model training, it is essential to understand the characteristics of the data. Exploratory Data Analysis (EDA) involves summarizing the main features of the data, usually with visual methods. Through EDA, we can begin to uncover patterns, spot anomalies, and frame hypotheses for testing. 53 | 54 | - **Univariate Analysis**: Involves the examination of single features or variables. For this project, the Univariate Analysis includes: 55 | * Distribution of records across years, months, days, and hours. 56 | * Frequency of records for each country. 57 | 58 | - **Bivariate Analysis**: Investigates the relationship between two features or variables. In this project, the Bivariate Analysis includes: 59 | * Average hourly consumption per country. 60 | * Monthly consumption trends per country. 61 | 62 | - **Visualizations**: Creating graphical representations of the data. For this project, specific visualizations include: 63 | * A heatmap for Average Hourly Consumption by Country and Hour of Day to observe patterns in energy consumption. 64 | * Decomposition plots for each country to examine original, trend, seasonality, and residuals in the time series data. 65 | 66 | ### 🧪 Feature Engineering: 67 | After understanding the data through EDA, the next step is to prepare it for modeling. Feature engineering includes creating new features, transforming existing ones, and encoding categorical variables. 68 | 69 | - **One-Hot Encoding of Countries**: This involves converting the categorical 'country' feature into numerical format, where each country is represented as a binary vector. 70 | 71 | - **Feature Creation**: Generating new features that might improve the model's performance. For example, creating time-based features like the day of the week, month, year. 72 | 73 | - **Primary Key Creation**: Creating a unique identifier for each record. This is essential for indexing and retrieving records efficiently from the database. 
74 | 75 | - **Saving Features to Databricks Feature Store**: After engineering, features are saved in Databricks Feature Store, which acts as a centralized repository for feature data, ensuring consistency across different models and deployments. 76 | 77 | ### 🤖 Model Training: 78 | With the features prepared, we now proceed to model training. This step involves selecting an algorithm, training the model, and evaluating its performance. 79 | 80 | - **Data Loading from Feature Store**: The features engineered previously are loaded from Databricks Feature Store. 81 | 82 | - **Data Splitting**: The dataset is split into training and testing sets by ensuring the continuity of the data to correctly evaluate the model's performance on unseen data. 83 | 84 | - **Model Creation and Training**: The algorithm is selected, and the model is trained using the training dataset. 85 | 86 | - **Logging to Feature Store**: The trained model, along with its metrics and parameters and artifacts is logged in the Databricks Feature Store for versioning and reproducibility. 87 | 88 | ### 🧪 Unit Testing: 89 | After the model is trained, it undergoes unit testing to ensure that it meets the required performance benchmarks. 90 | 91 | - **Performance Testing**: The model is subjected to a set of tests to evaluate its performance. Metrics such as Mean Absolute Error (MAE) and Root Mean Squared Error (RMSE) are used. 92 | 93 | - **Proceed to Staging Environment**: If the model passes performance tests, it is moved to a staging environment. This stage closely resembles the production environment and is used for final testing before the model is deployed for real-world use. 94 | 95 | 96 | ## 🔄 Daily Inference 97 | 98 | This subsection outlines the daily inference procedure which is a crucial aspect of the MLOps pipeline. It ensures that the model continues to provide value by making predictions on new data. 99 | The daily inference procedure comprises three key steps: feature engineering on new data, the inference process itself, and monitoring the predictions. 100 | 101 | ### 🧪 Feature Engineering on New Data: 102 | To make predictions on new data, it's important to transform the data in a manner consistent with the training data. This involves applying the same transformations and encodings that were done during the initial deployment phase. 103 | 104 | - **Data Transformation**: The new data is transformed to ensure it's in a compatible format for the model to make predictions. This includes handling any missing values, encoding categorical variables, and creating new features. 105 | 106 | - **Saving Transformed Data**: Once the data is transformed, it's saved in a structured format that is easily retrievable. This structured data will be used for making predictions. 107 | 108 | ### 🎯 Daily Inference Procedure: 109 | This is the process where the model uses the transformed new data to make predictions. These predictions can be used for various applications such as forecasting energy consumption. 110 | 111 | - **Retrieving New Data**: The transformed new data is retrieved from the database. 112 | 113 | - **Batch Scoring**: The model, using a batch scoring function in the feature store, makes predictions on the new data. Batch scoring is efficient for making predictions on large datasets. 114 | 115 | - **Saving Predictions**: The predictions made by the model are saved back to the database. This data can be retrieved later for analysis and reporting. 
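To make the batch-scoring step concrete, below is a minimal sketch of how the three steps above can be wired together with the Databricks Feature Store client. It assumes the model was logged through the Feature Store so that feature lookups resolve automatically; the table names `inference_features_view` and `daily_predictions`, the key columns shown, and the `Production` stage are illustrative placeholders rather than the exact identifiers used in the notebooks, and only the registered model name `pyspark_mlflow_model` comes from this repository.

```python
from databricks.feature_store import FeatureStoreClient

fs = FeatureStoreClient()

# 1. Retrieve the transformed new data (feature rows for the day to predict).
#    `spark` is the SparkSession that Databricks provides in every notebook.
batch_df = spark.table("inference_features_view")  # placeholder table name

# 2. Batch scoring: score_batch joins the stored features for each key and
#    appends a 'prediction' column with the forecasted consumption.
predictions_df = fs.score_batch(
    "models:/pyspark_mlflow_model/Production",  # registered model URI; stage assumed
    batch_df,
)

# 3. Save the predictions back to a Delta table for later monitoring and reporting.
(predictions_df
    .select("CONSUMPTION_ID", "DATETIME", "prediction")  # key/column names assumed
    .write.format("delta")
    .mode("append")
    .saveAsTable("daily_predictions"))  # placeholder table name
```

Scoring through the Feature Store rather than calling the model directly keeps the feature-lookup logic identical between training and inference, which is the main reason the pipeline logs features centrally.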
116 | 117 | ### 📊 Daily Monitoring Procedure: 118 | After the predictions are made and saved, it is critical to monitor how the model is performing on new data. This involves evaluating predictions and creating visualizations. 119 | 120 | - **Retrieving Predicted Data**: The data that has been predicted by the model is retrieved from the database. 121 | 122 | - **Evaluating Predictions**: The predictions are evaluated through various metrics to understand how well the model is performing. 123 | 124 | - **Creating Visualizations**: Visualizations such as graphs and charts are created to help interpret the predictions. This can include trend analysis and the distribution of predictions over time. 125 | 126 | - **Reporting**: The results from the evaluation and visualizations are documented and reported. This reporting can be used for decision-making and planning. 127 | 128 | A systematic daily inference procedure ensures that the model remains functional and valuable in a real-world setting while its performance is constantly monitored. 129 | 130 | ## 🔄 Model Retraining 131 | 132 | One of the key aspects of maintaining a robust MLOps pipeline is to ensure that the deployed models remain efficient and accurate over time. The Model Retraining subsection focuses on the automated retraining of models on a regular basis, using the latest data. 133 | 134 | ### 🗓️ Scheduled Retraining: 135 | Model retraining is scheduled to occur automatically at regular intervals (every 1, 3, or 6 months). This is essential as data patterns may evolve, and the model needs to adapt to these changes to maintain its accuracy and relevance. 136 | 137 | - **Data Preparation**: The data saved during daily inference, which includes the predicted values and their corresponding actual values, is used for retraining. This dataset is accumulated over the chosen interval (1, 3, or 6 months). 138 | 139 | - **Retraining Process**: The model is retrained using the accumulated data. This ensures that the model learns from the most recent data patterns and adapts its parameters accordingly. 140 | 141 | ### 📈 Performance Evaluation: 142 | After retraining, it is imperative to evaluate the model's performance to determine whether the predictions have improved. 143 | 144 | - **Tracking Progress**: The performance of the models over time is tracked. This includes monitoring the number of training and retraining runs and how the metrics evolve with each iteration. 145 | 146 | - **Comparative Analysis**: The retrained model, which is initially in the staging environment, is compared against the current production model. The evaluation metrics of both models are analyzed to determine whether the retrained model shows improved performance. 147 | 148 | - **Model Promotion**: If the retrained model in the staging environment outperforms the current production model, it is promoted to replace the production model (a condensed sketch follows this list). The model that was in production is archived for record-keeping. 149 | 150 | - **Documentation**: All the steps, decisions, and metrics are documented for future reference and transparency. 151 | 152 | By continuously monitoring and retraining the model, this process ensures that the model remains adaptive to changing data patterns and provides the most accurate and efficient predictions possible.
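The promotion logic above can be sketched with the MLflow client as follows. This is only an illustration under simplifying assumptions: the registered model name is a placeholder, the comparison reuses the MAE logged at training time rather than the SMAPE computed on freshly scored data in this project's Performance Evaluation notebook, and the stage transition is done directly instead of through a Databricks transition request.

```python
# Minimal sketch of staging-vs-production comparison and promotion via MLflow.
# The model name is a placeholder; the project itself compares SMAPE on fresh
# batch scores and routes the promotion through a transition request.
from mlflow.tracking import MlflowClient

client = MlflowClient()
model_name = "daily_consumption_forecast"  # placeholder registered-model name

def latest_version(stage: str):
    """Return the newest model version in the given registry stage, or None."""
    versions = client.get_latest_versions(model_name, stages=[stage])
    return versions[0] if versions else None

def logged_mae(version) -> float:
    """Read the MAE metric logged by the training run that produced this version."""
    return client.get_run(version.run_id).data.metrics["MAE"]

staging = latest_version("Staging")
production = latest_version("Production")

if staging and production and logged_mae(staging) < logged_mae(production):
    # Promote the retrained model and archive the version it replaces.
    client.transition_model_version_stage(
        name=model_name,
        version=staging.version,
        stage="Production",
        archive_existing_versions=True,
    )
    print(f"Promoted version {staging.version} of '{model_name}' to Production.")
else:
    print("Keeping the current Production model.")
```

In this repository, the full comparison and the transition request are implemented in `06.Performance Evaluation.py`, reproduced further below.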
153 | 154 | 155 | ## 🚀 Deployment Strategy 156 | 157 | In the context of MLOps, deployment is a critical phase where the machine learning model is integrated into a production environment, making it accessible for real-world forecasting. A robust deployment strategy ensures that the model is reliable, scalable, and efficient. 158 | 159 | ### MLflow and Databricks Integration: 160 | - After training, the model is logged through the Databricks Feature Store and registered in MLflow, a platform that manages the ML lifecycle, including experimentation, reproducibility, and deployment. 161 | 162 | - MLflow is natively integrated within the Databricks workspace. This seamless integration is crucial as it allows for efficient management and tracking of models within a familiar ecosystem. 163 | 164 | ### Scalability and Performance: 165 | - Databricks, known for its high-performance analytics engine, is particularly suited for handling large datasets and complex computations. By deploying the model within Databricks, we leverage its ability to scale effortlessly to meet data and computational demands. 166 | 167 | # 🔄 Workflow Overview 168 | 169 | In this project, we have established three main workflows that are integral to the systematic functioning and updating of the energy consumption forecasting system: 170 | 171 | ## 1. Initial Deployment / Redeployment 172 | 173 | This workflow encompasses all the steps necessary for the initial deployment of the model, as well as any subsequent redeployments. It includes data engineering, exploratory data analysis, feature engineering, model training, and performance evaluation. This workflow is initiated manually and ensures that the model is properly set up and integrated into the Azure Databricks and MLflow ecosystem. 174 | 175 | ## 2. Daily Inference 176 | 177 | The Daily Inference workflow is automated and triggered every day. Its purpose is to forecast the energy consumption for the next day. This workflow starts by retrieving new data from the database and processing it to be compatible with the model. Through the batch scoring function of the Feature Store, predictions are generated and subsequently saved back into the database for further analysis and utilization. 178 | 179 | ## 3. Model Retraining 180 | 181 | The Model Retraining workflow is designed to ensure that the forecasting model remains up-to-date and incorporates the latest data for higher accuracy. This workflow is automatically triggered every three months. During this process, the model is retrained using newly collected data that has been saved during the Daily Inference workflow. After the retraining process, the model's performance is evaluated and compared to the current production model. If the retrained model exhibits improved performance, it replaces the existing production model, which is then archived. 182 | 183 | These workflows are designed to work seamlessly together to provide an efficient, scalable, and up-to-date energy consumption forecasting system. Through automation and systematic processes, this setup ensures accuracy and sustainability in forecasting energy consumption across multiple European countries. 184 | 185 | 186 | ## Overall Architecture 187 | ![MLOps Architecture](MLOps%20Pipeline/Utils/Images/MLOps%20Architecture%20(1).png) 188 | ## 🎉 Conclusion 189 | 190 | This project efficiently addresses the challenge of forecasting energy consumption across multiple European countries.
By employing Azure Databricks and MLflow, it leverages a powerful and scalable environment for data processing and model deployment. Continuous monitoring and automatic retraining ensure that the model remains accurate and up-to-date. 191 | This solution offers immense value to utilities and grid operators in optimizing energy management and planning. 192 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/00. Initial Deployment/Model Training(LSTM).py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Initial Deployment" 3 | 4 | # COMMAND ---------- 5 | 6 | from pyspark import SparkContext, SparkConf 7 | from pyspark.sql import SQLContext 8 | from pyspark.ml.feature import StringIndexer, StandardScaler, VectorAssembler, MinMaxScaler 9 | from pyspark.sql.functions import rand 10 | from pyspark.mllib.evaluation import MulticlassMetrics 11 | import tensorflow as tf 12 | from tensorflow.keras.layers import Dense 13 | from tensorflow.keras.models import Sequential 14 | # initialize SparkSession 15 | 16 | # COMMAND ---------- 17 | 18 | # Load Consumption Region Table 19 | consumption_countries_hourly = spark.table('df_dev.final_consumption_countries_hourly') 20 | 21 | # Update the key column construction in the PySpark code 22 | consumption_countries_hourly = consumption_countries_hourly.withColumn('CONSUMPTION_ID', concat(col('COUNTRY'), lit('_'), col('DATETIME').cast('string'))) 23 | 24 | # Split the labels into training and test 25 | train_labels = consumption_countries_hourly.filter((col('DATETIME') >= train_start) & (col('DATETIME') <= train_end)) 26 | test_labels = consumption_countries_hourly.filter((col('DATETIME') > train_end) & (col('DATETIME') <= test_end)) 27 | val_labels = consumption_countries_hourly.filter((col('DATETIME') > test_end) & (col('DATETIME') <= validation_end)) 28 | 29 | # Select the required columns 30 | train_labels = train_labels.select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 31 | test_labels = test_labels.select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 32 | val_labels = val_labels.select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 33 | 34 | # COMMAND ---------- 35 | 36 | 37 | def load_data(table_name, labels, lookup_key, ts_lookup_key): 38 | # In the FeatureLookup, if you do not provide the `feature_names` parameter, all features except primary keys are returned 39 | model_feature_lookups = [FeatureLookup(table_name=table_name, lookup_key=lookup_key, timestamp_lookup_key=ts_lookup_key)] 40 | # fs.create_training_set looks up features in model_feature_lookups that match the primary key from inference_data_df 41 | training_set = fs.create_training_set(labels, 42 | model_feature_lookups, 43 | label="HOURLY_CONSUMPTION_MW", 44 | exclude_columns=["CONSUMPTION_ID", "DATETIME"]) 45 | training_df = training_set.load_df() 46 | 47 | return training_set, training_df 48 | 49 | # Cast the 'DATETIME' column to 'TIMESTAMP' data type 50 | train_labels = train_labels.withColumn('DATETIME', col('DATETIME').cast(TimestampType())) 51 | test_labels = test_labels.withColumn('DATETIME', col('DATETIME').cast(TimestampType())) 52 | val_labels = val_labels.withColumn('DATETIME', col('DATETIME').cast(TimestampType())) 53 | 54 | # Load the data for the training set 55 | training_set, 
train_df = load_data(f'{db}.hourly_forecasting_features', train_labels, 'CONSUMPTION_ID', 'DATETIME') 56 | 57 | # Load the data for the test set 58 | _, test_df = load_data(f'{db}.hourly_forecasting_features', test_labels, 'CONSUMPTION_ID', 'DATETIME') 59 | 60 | # Load the data for the validation set 61 | _, val_df = load_data(f'{db}.hourly_forecasting_features', val_labels, 'CONSUMPTION_ID', 'DATETIME') 62 | 63 | 64 | # COMMAND ---------- 65 | 66 | display(train_df) 67 | 68 | # COMMAND ---------- 69 | 70 | train_df 71 | 72 | # COMMAND ---------- 73 | 74 | # MAGIC %md 75 | # MAGIC Define the features and label columns: We first need to specify which columns in the dataframe are features and which column is the label 76 | 77 | # COMMAND ---------- 78 | 79 | featuresCols = train_df.columns[:-1] 80 | target_names = [train_df.columns[-1]] 81 | 82 | # COMMAND ---------- 83 | 84 | # MAGIC %md 85 | # MAGIC Create VectorAssembler and MinMaxScaler objects: VectorAssembler combines the specified feature columns into a single vector column. MinMaxScaler normalizes these feature vectors to be in the range [0, 1]. 86 | 87 | # COMMAND ---------- 88 | 89 | # Assuming you have loaded your dataset into a DataFrame called 'data' 90 | # Assuming the label column name is 'label' 91 | 92 | # Extract the labels from the DataFrame 93 | labels = train_df.select('HOURLY_CONSUMPTION_MW').rdd.flatMap(lambda x: x).collect() 94 | 95 | # Find the minimum and maximum values of the labels 96 | min_label = min(labels) 97 | max_label = max(labels) 98 | 99 | 100 | # COMMAND ---------- 101 | 102 | from pyspark.ml.feature import VectorAssembler, MinMaxScaler 103 | 104 | vectorAssembler = VectorAssembler(inputCols=featuresCols, outputCol='assembled_features') 105 | # Create a separate MinMaxScaler for features 106 | scaler_features = MinMaxScaler(min=0.0, max=1.0, inputCol='assembled_features', outputCol='features') 107 | vectorAssemblerLabel = VectorAssembler(inputCols=target_names, outputCol='label') 108 | scaler_label = MinMaxScaler(min=0.0, max=1.0, inputCol='label', outputCol='scaled_label') 109 | 110 | # COMMAND ---------- 111 | 112 | # MAGIC %md 113 | # MAGIC Create a pipeline of transformations: The pipeline includes vector assembly and scaling stages. 
114 | 115 | # COMMAND ---------- 116 | 117 | from pyspark.ml import Pipeline 118 | 119 | stages = [vectorAssembler,scaler_features,vectorAssemblerLabel,scaler_label] 120 | pipeline = Pipeline(stages=stages) 121 | 122 | # COMMAND ---------- 123 | 124 | # MAGIC %md 125 | # MAGIC Apply the transformations to each DataFrame: 126 | 127 | # COMMAND ---------- 128 | 129 | # Fit the pipeline to the training data 130 | pipeline_model = pipeline.fit(train_df) 131 | 132 | # Transform each DataFrame 133 | train_transformed = pipeline_model.transform(train_df) 134 | val_transformed = pipeline_model.transform(val_df) 135 | test_transformed = pipeline_model.transform(test_df) 136 | 137 | 138 | # COMMAND ---------- 139 | 140 | # Save the fitted pipeline for later use 141 | pipeline_model.write().overwrite().save("/dbfs/FileStore/Fitted_Pipeline") 142 | 143 | # COMMAND ---------- 144 | 145 | train_transformed.show(truncate=False, vertical=True, n=1) 146 | 147 | # COMMAND ---------- 148 | 149 | # MAGIC %md 150 | # MAGIC Convert to Pandas DataFrames 151 | 152 | # COMMAND ---------- 153 | 154 | # Convert to pandas 155 | train_pd = train_transformed.toPandas() 156 | val_pd = val_transformed.toPandas() 157 | test_pd = test_transformed.toPandas() 158 | 159 | # COMMAND ---------- 160 | 161 | # MAGIC %md 162 | # MAGIC Exctract features and labels 163 | 164 | # COMMAND ---------- 165 | 166 | train_pd 167 | 168 | # COMMAND ---------- 169 | 170 | # Extract features and labels 171 | import numpy as np 172 | X_train = np.array(train_pd['features'].to_list()) 173 | y_train = np.array(train_pd['scaled_label'].to_list()) 174 | 175 | X_val = np.array(val_pd['features'].to_list()) 176 | y_val = np.array(val_pd['scaled_label'].to_list()) 177 | 178 | X_test = np.array(test_pd['features'].to_list()) 179 | y_test = np.array(test_pd['scaled_label'].to_list()) 180 | 181 | # COMMAND ---------- 182 | 183 | y_train[0:1] 184 | 185 | # COMMAND ---------- 186 | 187 | X_train[0:1] 188 | 189 | # COMMAND ---------- 190 | 191 | # MAGIC %md 192 | # MAGIC Reshape data for LSTM: 193 | 194 | # COMMAND ---------- 195 | 196 | # Reshape for LSTM 197 | X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1)) 198 | X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1)) 199 | X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1)) 200 | 201 | # COMMAND ---------- 202 | 203 | y_train.shape 204 | 205 | # COMMAND ---------- 206 | 207 | X_train.shape 208 | 209 | # COMMAND ---------- 210 | 211 | # MAGIC %md 212 | # MAGIC Define and compile LSTM model: 213 | 214 | # COMMAND ---------- 215 | 216 | from tensorflow.keras.models import Sequential 217 | from tensorflow.keras.layers import LSTM, Dense 218 | from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint 219 | import os 220 | 221 | experiment_log_dir = f"/dbfs/{user}/tb" 222 | checkpoint_path = f"/dbfs/{user}/keras_checkpoint_weights_day_ckpt" 223 | os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True) 224 | 225 | epochs = 100 226 | early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=10, restore_best_weights = True) 227 | tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=experiment_log_dir) 228 | model_checkpoint = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=True) 229 | 230 | # COMMAND ---------- 231 | 232 | model = Sequential() 233 | model.add(LSTM(100, activation='tanh', return_sequences=True, input_shape=(X_train.shape[1], 1))) 234 | model.add(LSTM(64, activation='tanh', return_sequences=False)) 235 | 
model.add(Dense(1)) 236 | model.compile(optimizer='adam', loss='mse',metrics=['mae']) 237 | 238 | # COMMAND ---------- 239 | 240 | start_time = time.time() 241 | history = model.fit(X_train, y_train, validation_data = (X_val , y_val), epochs=epochs, callbacks=[tensorboard_callback, model_checkpoint, early_stopping],verbose=1) 242 | end_time = time.time() 243 | 244 | # COMMAND ---------- 245 | 246 | # Validate the model 247 | val_loss = model.evaluate(X_val, y_val) 248 | 249 | # Test the model 250 | test_loss = model.evaluate(X_test, y_test) 251 | 252 | # COMMAND ---------- 253 | 254 | y_pred = model.predict(X_test) 255 | 256 | # COMMAND ---------- 257 | 258 | import numpy as np 259 | 260 | # Assuming you have loaded the predicted scaled labels into a variable called 'y_pred' 261 | 262 | # Define the minimum and maximum values for the labels 263 | min_label = 201.0 264 | max_label = 324310.0 265 | 266 | # Compute the scaled label range 267 | scaled_label_range = max_label - min_label 268 | 269 | # Perform inverse scaling on the predicted labels 270 | y_pred_original = (y_pred * scaled_label_range) + min_label 271 | y_test_original = (y_test * scaled_label_range) + min_label 272 | 273 | 274 | # COMMAND ---------- 275 | 276 | y_pred.flatten() 277 | 278 | # COMMAND ---------- 279 | 280 | # create a dataframe 281 | compare_df = pd.DataFrame({'Actual': y_test_original.flatten(), 'Predicted': y_pred_original.flatten()}) 282 | compare_df 283 | 284 | # COMMAND ---------- 285 | 286 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 287 | import numpy as np 288 | 289 | # Generate predictions 290 | y_pred = model.predict(X_test) 291 | 292 | # Flatten y_test and y_pred to 1D arrays (this may not be necessary depending on the shape of your arrays) 293 | y_test_flat = y_test.flatten() 294 | y_pred_flat = y_pred.flatten() 295 | 296 | # Compute RMSE 297 | rmse = np.sqrt(mean_squared_error(y_test_flat, y_pred_flat)) 298 | print("Root Mean Square Error: ", rmse) 299 | 300 | # Compute MAE 301 | mae = mean_absolute_error(y_test_flat, y_pred_flat) 302 | print("Mean Absolute Error: ", mae) 303 | 304 | # Compute R2 Score 305 | r2 = r2_score(y_test_flat, y_pred_flat) 306 | print("R-squared: ", r2) 307 | 308 | 309 | # COMMAND ---------- 310 | 311 | # Since your output might be multi-dimensional, you might want to select a specific dimension for plotting 312 | # Here's an example for the first dimension 313 | dim = 0 314 | y_test_dim = y_test[:, dim] 315 | y_test_pred_dim = y_pred[:, dim] 316 | 317 | # Create a new figure 318 | plt.figure(figsize=(10, 6)) 319 | 320 | # Plot the actual values 321 | plt.plot(y_test_dim, 'b-', label='actual') 322 | 323 | # Plot the predicted values 324 | plt.plot(y_test_pred_dim, 'r-', label='predicted') 325 | 326 | # Create the legend 327 | plt.legend() 328 | 329 | # Show the plot 330 | plt.show() 331 | 332 | # COMMAND ---------- 333 | 334 | # MAGIC %md 335 | # MAGIC ## Define Metrics/Parameters to be logged 336 | 337 | # COMMAND ---------- 338 | 339 | # Metrcis 340 | mse = mean_squared_error(y_test_flat, y_pred_flat) 341 | mae = mean_absolute_error(y_test_flat, y_pred_flat) 342 | rmse = np.sqrt(mse) # or mse**(0.5) 343 | r2 = r2_score(y_test_flat, y_pred_flat) 344 | 345 | #Hyperparameters 346 | hyperparameters = { 347 | "epochs": epochs, 348 | "batch_size": 21088, # if you defined a batch size 349 | "early_stopping_patience": early_stopping.patience, 350 | "optimizer": str(type(model.optimizer).__name__), 351 | "loss_function": model.loss.__name__ if 
callable(model.loss) else str(model.loss), 352 | "first_layer_units": model.layers[0].units, 353 | "first_layer_activation": model.layers[0].activation.__name__ if callable(model.layers[0].activation) else str(model.layers[0].activation), 354 | "second_layer_units": model.layers[1].units, 355 | "second_layer_activation": model.layers[1].activation.__name__ if callable(model.layers[1].activation) else str(model.layers[1].activation), 356 | "min_label" : min_label, 357 | "max_label" : max_label, 358 | "training_size":len(X_train), 359 | "training_range": { 360 | 'start': '2015-01-01', 361 | 'end': '2021-12-31' 362 | }, 363 | "testing_size":len(X_test), 364 | "testing_range":{ 365 | 'start':'2022-01-01', 366 | 'end':'2022-09-30' 367 | }, 368 | "validation_size" : len(X_val), 369 | "validation_range":{ 370 | 'start':'2022-10-01', 371 | 'end':'2023-01-01' 372 | } 373 | 374 | } 375 | 376 | #Model Training Time 377 | training_time = end_time - start_time 378 | 379 | #Current Time 380 | current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) 381 | 382 | # Description 383 | description = "The logged model is an LSTM-based recurrent neural network that has been trained to predict DAILY_CONSUMPTION_MW based on various input features. It leverages the temporal dependencies present in the data, making it suitable for energy consumption prediction. The model has been fine-tuned with the optimal number of epochs and other hyperparameters to ensure its effectiveness." 384 | 385 | # Model Tags 386 | tags = { 387 | "model_type": "RNN LSTM", 388 | "dataset": "Energy Consumption", 389 | "application": "Energy Management", 390 | "framework": "TensorFlow/Keras" 391 | } 392 | 393 | 394 | # COMMAND ---------- 395 | 396 | # MAGIC %md 397 | # MAGIC ## Register Model to Mlflow 398 | 399 | # COMMAND ---------- 400 | 401 | from mlflow.models.signature import infer_signature 402 | signature = infer_signature(X_train, model.predict(X_train)) 403 | 404 | # COMMAND ---------- 405 | 406 | model_name = 'lstm_model' 407 | 408 | # COMMAND ---------- 409 | 410 | with mlflow.start_run(nested=True) as run: 411 | 412 | # Log the model input schema 413 | schema = {"input_schema": list(train_df.columns[:-1]),"output_schema":train_df.columns[-1]} 414 | mlflow.log_dict(schema, "schema.json") 415 | 416 | # Log some tags for the model 417 | mlflow.set_tags(tags) 418 | 419 | # Log some parameters for the model 420 | mlflow.log_dict(hyperparameters, "hyperparams.json") 421 | 422 | # Log the evaluation metrics as metrics 423 | mlflow.log_metric("MAE", mae) 424 | mlflow.log_metric("MSE", mse) 425 | mlflow.log_metric("RMSE", rmse) 426 | mlflow.log_metric("R2", r2) 427 | 428 | #Log the time taken to train as metric 429 | mlflow.log_metric("Training Time(sec)", training_time) 430 | 431 | # Log evaluation metrics as artifact 432 | metrics = {"R2": r2, "MSE": mse, "RMSE": rmse, 'MAE':mae} 433 | mlflow.log_dict(metrics, "metrics.json") 434 | 435 | # Log the model description as artifact 436 | mlflow.log_text(description, "description.txt") 437 | 438 | # Log the current timestamp as the code version 439 | mlflow.log_param("code_version", current_time) 440 | 441 | # Log all hyperparameters 442 | mlflow.log_params(hyperparameters) 443 | 444 | fs.log_model( 445 | model=model, 446 | artifact_path=f"{model_name}_artifact_path", 447 | flavor=mlflow.tensorflow, 448 | training_set = training_set , 449 | registered_model_name = model_name 450 | ) 451 | 452 | # COMMAND ---------- 453 | 454 | with mlflow.start_run(nested=True) as run: 455 | 456 | # 
Log the model input schema 457 | schema = {"input_schema": list(train_df.columns[:-1]),"output_schema":train_df.columns[-1]} 458 | mlflow.log_dict(schema, "schema.json") 459 | 460 | # Log some tags for the model 461 | mlflow.set_tags(tags) 462 | 463 | # Log some parameters for the model 464 | mlflow.log_dict(hyperparameters, "hyperparams.json") 465 | 466 | # Log the evaluation metrics as metrics 467 | mlflow.log_metric("MAE", mae) 468 | mlflow.log_metric("MSE", mse) 469 | mlflow.log_metric("RMSE", rmse) 470 | mlflow.log_metric("R2", r2) 471 | 472 | #Log the time taken to train as metric 473 | #mlflow.log_metric("Training Time(sec)", training_time) 474 | 475 | # Log evaluation metrics as artifact 476 | metrics = {"R2": r2, "MSE": mse, "RMSE": rmse, 'MAE':mae,'Training Time(sec)':training_time} 477 | mlflow.log_dict(metrics, "metrics.json") 478 | 479 | # Log the model description as artifact 480 | mlflow.log_text(description, "description.txt") 481 | 482 | # Log the current timestamp as the code version 483 | mlflow.log_param("code_version", current_time) 484 | 485 | # Log all hyperparameters 486 | mlflow.log_params(hyperparameters) 487 | 488 | # Log the model with its signature 489 | mlflow.keras.log_model(model,artifact_path="model", signature=signature) 490 | 491 | # Register the model with its signature 492 | model_uri = f"runs:/{mlflow.active_run().info.run_id}/model" 493 | mlflow.register_model(model_uri=model_uri, name="lstm_model") 494 | 495 | # Get the latest model version(The one that we now registered) 496 | client = MlflowClient() 497 | # Search for all versions of the registered model 498 | versions = client.search_model_versions("name='lstm_model'") 499 | # Sort the versions by creation timestamp in descending order 500 | sorted_versions = sorted(versions, key=lambda v: v.creation_timestamp, reverse=True) 501 | # Get the latest version 502 | latest_version = sorted_versions[0] 503 | # Access the version number 504 | model_version = latest_version.version 505 | 506 | # Save your data to a new DBFS directory for each run 507 | data_path = f"dbfs:/FileStore/Data_Versioning/data_model_v{model_version}.parquet" 508 | train_df.write.format("parquet").save(data_path) 509 | 510 | # Log the DBFS path as an artifact 511 | with open("data_path.txt", "w") as f: 512 | f.write(data_path) 513 | mlflow.log_artifact("data_path.txt") 514 | 515 | # COMMAND ---------- 516 | 517 | 518 | -------------------------------------------------------------------------------- /MLOps Pipeline/ML Engineering/Demand Forecasting Daily/06.Performance Evaluation.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Configuration 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %run "/Repos/filippos.priovolos01@gmail.com/mlops-energy-forecast-thesis/MLOps Thesis Pipeline/Workflow Config/Initial Deployment" 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ## Configuration 13 | 14 | # COMMAND ---------- 15 | 16 | date_object = datetime.strptime(train_end, '%Y-%m-%d') 17 | new_train_end = (date_object + relativedelta(months=3)).strftime('%Y-%m-%d') 18 | date_object = datetime.strptime(test_start, '%Y-%m-%d') 19 | new_test_start = (date_object + relativedelta(months=3)).strftime('%Y-%m-%d') 20 | new_test_end = '2023-01-01' 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md 25 | # MAGIC ## Load all the Current Data 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %md 30 | # MAGIC * Load energy consumption data 
from a database into a Pandas DataFrame. 31 | # MAGIC * Create a new column CONSUMPTION_ID by concatenating country codes with the date-time information. 32 | # MAGIC * Convert the DATETIME column to a proper datetime data type for time-based operations. 33 | # MAGIC * Define test labels based on date-time ranges. 34 | # MAGIC * Convert the test labels back into Spark DataFrames and select only the CONSUMPTION_ID, DATETIME, and HOURLY_CONSUMPTION_MW columns for further processing 35 | 36 | # COMMAND ---------- 37 | 38 | # Load Consumption Region Table 39 | consumption_countries_hourly = spark.table('df_dev.final_consumption_countries_hourly').toPandas() 40 | consumption_countries_hourly['CONSUMPTION_ID'] = consumption_countries_hourly.COUNTRY + '_' + consumption_countries_hourly.DATETIME.astype(str) 41 | consumption_countries_hourly['DATETIME'] = pd.to_datetime(consumption_countries_hourly['DATETIME']) 42 | test_labels = consumption_countries_hourly.loc[(consumption_countries_hourly.DATETIME > new_test_start) & (consumption_countries_hourly.DATETIME <= new_test_end)] 43 | test_labels = spark.createDataFrame(test_labels).select("CONSUMPTION_ID", "DATETIME", "HOURLY_CONSUMPTION_MW") 44 | 45 | # COMMAND ---------- 46 | 47 | # MAGIC %md 48 | # MAGIC ## Get Initial Deployment Training Runs Based on Experiment ID 49 | 50 | # COMMAND ---------- 51 | 52 | # MAGIC %md 53 | # MAGIC This code snippet uses the mlflow library to search for experiment runs based on a specific experiment ID (experiment_id_training), orders them by the Mean Absolute Error (MAE) metric, and stores the results in a DataFrame called runs_training. It then displays the first 5 rows of this DataFrame. 54 | 55 | # COMMAND ---------- 56 | 57 | runs_training = mlflow.search_runs(experiment_ids=experiment_id_training, 58 | order_by=['metrics.MAE']) 59 | runs_training.head(5) 60 | 61 | # COMMAND ---------- 62 | 63 | # MAGIC %md 64 | # MAGIC ## Find Best Runs of Past Month for the Initial Model Training 65 | 66 | # COMMAND ---------- 67 | 68 | #earliest_start_time = (datetime.now() - timedelta(days=14)).strftime('%Y-%m-%d') 69 | #recent_runs = runs_training[runs_training.start_time >= earliest_start_time] 70 | runs_training = runs_training.assign(Run_Date=runs_training.start_time.dt.floor(freq='D')) 71 | 72 | # Filter the rows to only include those with non-null values in the "metrics.MAE" column 73 | runs_training = runs_training[runs_training['metrics.MAE'].notna()] 74 | #print("Length of recent_runs before filtering: ", len(runs_training)) 75 | #print("Length of recent_runs after filtering: ", len(runs_training)) 76 | 77 | best_runs_per_day_idx = runs_training.groupby(['Run_Date'])['metrics.MAE'].idxmin() 78 | best_runs = runs_training.loc[best_runs_per_day_idx] 79 | 80 | # Select the required columns for display 81 | metrics_columns = ['Run_Date', 'metrics.MAE', 'metrics.Training Time(sec)', 'metrics.RMSE', 'metrics.R2', 'metrics.MSE'] 82 | display(best_runs[metrics_columns]) 83 | 84 | # COMMAND ---------- 85 | 86 | # MAGIC %md 87 | # MAGIC ## Get Retraining Best Runs Based on Experiment Id 88 | 89 | # COMMAND ---------- 90 | 91 | runs_retraining = mlflow.search_runs(experiment_ids=experiment_id_retraining, 92 | order_by=['metrics.MAE']) 93 | runs_retraining.head(5) 94 | 95 | # COMMAND ---------- 96 | 97 | #earliest_start_time = (datetime.now() - timedelta(days=14)).strftime('%Y-%m-%d') 98 | #recent_runs = runs_retraining[runs_retraining.start_time >= earliest_start_time] 99 | runs_retraining = 
runs_retraining.assign(Run_Date=runs_retraining.start_time.dt.floor(freq='D')) 100 | 101 | # Filter the rows to only include those with non-null values in the "metrics.MAE" column 102 | runs_retraining = runs_retraining[runs_retraining['metrics.MAE'].notna()] 103 | #print("Length of recent_runs before filtering: ", len(runs_retraining)) 104 | #print("Length of recent_runs after filtering: ", len(recent_runs)) 105 | 106 | best_runs_per_day_idx = runs_retraining.groupby(['Run_Date'])['metrics.MAE'].idxmin() 107 | best_runs = runs_retraining.loc[best_runs_per_day_idx] 108 | 109 | # Select the required columns for display 110 | metrics_columns = ['Run_Date', 'metrics.MAE', 'metrics.Training Time(sec)', 'metrics.RMSE', 'metrics.R2', 'metrics.MSE'] 111 | display(best_runs[metrics_columns]) 112 | 113 | 114 | # COMMAND ---------- 115 | 116 | # MAGIC %md 117 | # MAGIC ## Find Number of Initial Training Runs for Past Month 118 | 119 | # COMMAND ---------- 120 | 121 | # MAGIC %md 122 | # MAGIC * Calculates the date 30 days ago. 123 | # MAGIC * Filters experiment runs from the last 30 days. 124 | # MAGIC * Adds a column representing the date of each run. 125 | # MAGIC * Groups runs by date and counts the number of runs per day. 126 | # MAGIC * Formats the date for display. 127 | # MAGIC * Renames a column for clarity. 128 | # MAGIC * Displays a DataFrame showing the number of experiment runs for each day over the last 30 days. 129 | 130 | # COMMAND ---------- 131 | 132 | earliest_start_time = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d') 133 | recent_runs = runs_training[runs_training.start_time >= earliest_start_time] 134 | 135 | recent_runs['Run Date'] = recent_runs.start_time.dt.floor(freq='D') 136 | 137 | runs_per_day = recent_runs.groupby( 138 | ['Run Date'] 139 | ).count()[['run_id']].reset_index() 140 | runs_per_day['Run Date'] = runs_per_day['Run Date'].dt.strftime('%Y-%m-%d') 141 | runs_per_day.rename({ 'run_id': 'Number of Runs' }, axis='columns', inplace=True) 142 | 143 | display(runs_per_day) 144 | 145 | # COMMAND ---------- 146 | 147 | # MAGIC %md 148 | # MAGIC ## Find Number of Retraining Runs for Past Month 149 | 150 | # COMMAND ---------- 151 | 152 | # MAGIC %md 153 | # MAGIC * Calculates the date 30 days ago. 154 | # MAGIC * Filters experiment runs from the last 30 days. 155 | # MAGIC * Adds a column representing the date of each run. 156 | # MAGIC * Groups runs by date and counts the number of runs per day. 157 | # MAGIC * Formats the date for display. 158 | # MAGIC * Renames a column for clarity. 159 | # MAGIC * Displays a DataFrame showing the number of experiment runs for each day over the last 30 days. 
160 | 161 | # COMMAND ---------- 162 | 163 | earliest_start_time = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d') 164 | recent_runs = runs_retraining[runs_retraining.start_time >= earliest_start_time] 165 | 166 | recent_runs['Run Date'] = recent_runs.start_time.dt.floor(freq='D') 167 | 168 | runs_per_day = recent_runs.groupby( 169 | ['Run Date'] 170 | ).count()[['run_id']].reset_index() 171 | runs_per_day['Run Date'] = runs_per_day['Run Date'].dt.strftime('%Y-%m-%d') 172 | runs_per_day.rename({ 'run_id': 'Number of Runs' }, axis='columns', inplace=True) 173 | 174 | display(runs_per_day) 175 | 176 | # COMMAND ---------- 177 | 178 | # MAGIC %md 179 | # MAGIC ## Model Comparison (Staging - Production) 180 | 181 | # COMMAND ---------- 182 | 183 | # MAGIC %md 184 | # MAGIC ### Request for Latest Models of Each Environment 185 | 186 | # COMMAND ---------- 187 | 188 | # MAGIC %md 189 | # MAGIC * Sets up HTTP headers with an authorization token and query parameters with a model name for an API request. 190 | # MAGIC * Sends a GET request to the MLflow REST API of a Databricks instance to retrieve all versions of a registered machine learning model. 191 | # MAGIC * Checks if the API response was successful. If not, an exception is raised with an error message containing the status code and error message from the API response. 192 | # MAGIC * If the API response was successful, it extracts the registered model details from the response JSON. 193 | # MAGIC * Prints the list of model versions in a JSON formatted output. 194 | 195 | # COMMAND ---------- 196 | 197 | # Set the headers and query parameters for the request 198 | headers = {"Authorization": f"Bearer {access_token}"} 199 | params = {"name": model_name} 200 | 201 | # Send the GET request to the MLflow REST API to retrieve all versions of the model 202 | response = requests.get(f"https://{databricks_instance}/api/2.0/preview/mlflow/registered-models/get", headers=headers, params=params) 203 | 204 | # Check if the response was successful 205 | if response.status_code != 200: 206 | raise Exception(f"Failed to retrieve registered models. Status code: {response.status_code}. 
Error message: {response.json()['error_code']}: {response.json()['message']}") 207 | 208 | model_versions = response.json()['registered_model'] 209 | 210 | # Print the list of model versions 211 | print(json.dumps(model_versions, indent=2)) 212 | 213 | 214 | # COMMAND ---------- 215 | 216 | # MAGIC %md 217 | # MAGIC ## Retrieve Latest Staging & Production Models 218 | 219 | # COMMAND ---------- 220 | 221 | prod_uri = None 222 | staging_uri = None 223 | 224 | for i in range(len(model_versions['latest_versions'])): 225 | if model_versions['latest_versions'][i]['current_stage'] == 'Production': 226 | prod_uri = f"models:/{model_name}/{model_versions['latest_versions'][i]['version']}" 227 | elif model_versions['latest_versions'][i]['current_stage'] == 'Staging': 228 | staging_uri = f"models:/{model_name}/{model_versions['latest_versions'][i]['version']}" 229 | 230 | if prod_uri is None: 231 | print('No model versions found in production') 232 | else: 233 | print(f'Latest production model version: {prod_uri}') 234 | 235 | if staging_uri is None: 236 | print('No model versions found in staging') 237 | else: 238 | print(f'Latest staging model version: {staging_uri}') 239 | 240 | # COMMAND ---------- 241 | 242 | # MAGIC %md 243 | # MAGIC ### Perform Batch Score to the Models 244 | 245 | # COMMAND ---------- 246 | 247 | # MAGIC %md 248 | # MAGIC * Perform batch scoring for the latest deployed staging & production model 249 | 250 | # COMMAND ---------- 251 | 252 | # Check if both the production and staging URIs have been retrieved 253 | if not prod_uri: 254 | raise Exception("Failed to retrieve the production model URI.") 255 | if not staging_uri: 256 | raise Exception("Failed to retrieve the staging model URI.") 257 | 258 | # Score the test dataset using the production and staging models 259 | staging_scores = fs.score_batch(staging_uri, test_labels, result_type='float') 260 | prod_scores = fs.score_batch(prod_uri, test_labels, result_type='float') 261 | 262 | prod_scores = prod_scores.withColumnRenamed("prediction", "prod_prediction") 263 | staging_scores = staging_scores.withColumnRenamed("prediction", "staging_prediction") 264 | 265 | display(prod_scores) 266 | display(staging_scores) 267 | 268 | # COMMAND ---------- 269 | 270 | # MAGIC %md 271 | # MAGIC ### Join Staging & Production Dataframes 272 | 273 | # COMMAND ---------- 274 | 275 | # Join the two dataframes on the `consumption_id` column, keeping all columns from `staging_df` and only the `prod_prediction` column from `prod_df` 276 | merged_df = staging_scores.join(prod_scores.select('consumption_id', 'prod_prediction'), 'consumption_id', 'inner').select(staging_scores.columns + [col('prod_prediction')]) 277 | 278 | # Define the column expression to extract the correct region 279 | country_col_expr = ( 280 | concat(*[ 281 | when(col(country) == 1, country).otherwise("") 282 | for country in countries 283 | ]) 284 | ) 285 | 286 | # Add a new column to the DataFrame with the concatenated region name 287 | merged_df = merged_df.withColumn("COUNTRY", country_col_expr) 288 | 289 | display(merged_df) 290 | 291 | # COMMAND ---------- 292 | 293 | from pyspark.sql.functions import year, month, col, concat, when,weekofyear 294 | # Filter to keep only the data for April 2022 295 | filtered_df = merged_df.filter((year(col('DATETIME')) == 2022) & (month(col('DATETIME')) == 4) & (weekofyear(col('DATETIME')) == 14)) 296 | 297 | # Display the filtered DataFrame 298 | display(filtered_df) 299 | 300 | # COMMAND ---------- 301 | 302 | 
display(merged_df.filter(col('Country') == 'greece')) 303 | 304 | # COMMAND ---------- 305 | 306 | # MAGIC %md 307 | # MAGIC ## Compare Staging vs Production 308 | 309 | # COMMAND ---------- 310 | 311 | # MAGIC %md 312 | # MAGIC A function named calculate_smape is defined. This function takes three arguments: 313 | # MAGIC * df: A DataFrame that contains the prediction and actual values. 314 | # MAGIC * prediction_col: The name of the column that contains the predicted values. 315 | # MAGIC * actual_col: The name of the column that contains the actual values. 316 | # MAGIC 317 | # MAGIC The function computes the SMAPE based on these input values. The SMAPE is calculated as the mean absolute difference between the predicted and actual values, divided by the average of the absolute predicted and actual values, all multiplied by 100. 318 | # MAGIC 319 | # MAGIC 1. The calculate_smape function is used to calculate the SMAPE for the staging and production models, using the respective prediction and actual values. 320 | # MAGIC 1. Based on the calculated SMAPE values, the code determines which model (staging or production) is better. The model with the lower SMAPE is considered the better one since a lower SMAPE indicates a better fit of the model. 321 | 322 | # COMMAND ---------- 323 | 324 | def calculate_smape(df, prediction_col, actual_col): 325 | from pyspark.sql.functions import abs 326 | # Calculate SMAPE using PySpark functions 327 | diff = col(prediction_col) - col(actual_col) 328 | denominator = (abs(col(prediction_col)) + abs(col(actual_col))) / 2 329 | smape = df.select(mean((abs(diff) / denominator) * 100).alias("SMAPE")).collect()[0]["SMAPE"] 330 | 331 | return smape 332 | 333 | # Calculate SMAPE for staging predictions 334 | staging_smape = calculate_smape(staging_scores, 'staging_prediction', 'HOURLY_CONSUMPTION_MW') 335 | print(f"Staging Model SMAPE: {staging_smape}%") 336 | 337 | # Calculate SMAPE for production predictions 338 | prod_smape = calculate_smape(prod_scores, 'prod_prediction', 'HOURLY_CONSUMPTION_MW') 339 | print(f"Production Model SMAPE: {prod_smape}%") 340 | 341 | # Determine which model is better based on SMAPE 342 | if staging_smape < prod_smape: 343 | print(f"Staging Model is better with a SMAPE of {staging_smape:.2f}%.") 344 | best_model = staging_uri 345 | else: 346 | print(f"Production Model is better with a SMAPE of {prod_smape:.2f}%.") 347 | best_model = prod_uri 348 | 349 | # Print the URI of the best model 350 | print(best_model) 351 | 352 | 353 | # COMMAND ---------- 354 | 355 | # MAGIC %md 356 | # MAGIC ## Transit the Best Model to Production Stage 357 | 358 | # COMMAND ---------- 359 | 360 | # MAGIC %md 361 | # MAGIC * The function initializes the MlflowClient and assigns it to the variable client. 362 | # MAGIC * It retrieves the latest version of the registered model in the Staging stage by calling the get_latest_versions method of the MlflowClient and assigns it to the variable model_version. 363 | # MAGIC * It defines the endpoint URL for sending the transition request to the Databricks MLflow API. The URL is assigned to the variable endpoint_url. 364 | # MAGIC * The stage to which the model should be transitioned is defined as 'Production'. Additionally, a comment for the transition request is set. 365 | # MAGIC * It sets the request headers to include the authorization token. 
366 | # MAGIC * It constructs the request body, which includes the version of the model to be transitioned, the model name, the desired stage, a flag indicating whether to archive existing versions in the target stage, the comment, and a flag to indicate that this is a transition request. 367 | # MAGIC * It sends a POST request to the API endpoint with the defined headers and request body. 368 | # MAGIC * Finally, it checks the status code of the response. If the status code is 200, it prints a message indicating that the model transition request was sent successfully. Otherwise, it prints an error message with the response text. 369 | 370 | # COMMAND ---------- 371 | 372 | def request_model_transition_to_production(): 373 | 374 | # Get the latest version of the registered model in the Staging stage 375 | client = mlflow.tracking.MlflowClient() 376 | model_version = client.get_latest_versions(model_name, stages=["Staging"])[0].version 377 | 378 | # Define the endpoint URL 379 | endpoint_url = f"https://{databricks_instance}/api/2.0/mlflow/transition-requests/create" 380 | 381 | stage = 'Production' #Define the stage you want your model to transit 382 | comment = "Requesting transition to Production environment after comparing models" 383 | headers = { "Authorization": "Bearer " + access_token } 384 | 385 | request_body = { 386 | "version": f"{model_version}", 387 | "name": model_name, 388 | "stage" : stage, #Specifies the environment we want to transit our model 389 | "archive_existing_versions": True, #Specifies whether to archive all current model versions in the target stage. 390 | "comment": comment, 391 | "request_transition": True 392 | } 393 | print(model_version,model_name) 394 | # Make the request 395 | response = requests.post(endpoint_url, headers=headers,json=request_body) 396 | 397 | # Check the response status code 398 | if response.status_code == 200: 399 | print("Model version transition request sent") 400 | else: 401 | print(f"Error sending transition request: {response.text}") 402 | 403 | 404 | # COMMAND ---------- 405 | 406 | # MAGIC %md 407 | # MAGIC * Initializes an MLflow Client by assigning it to the variable client. The MLflow Client provides a programmatic way to interact with an MLflow tracking server. 408 | # MAGIC * Extracts the model_name and model_version from the best_model string, which presumably holds a URI for the model. It does so by splitting the string and accessing the relevant parts. 409 | # MAGIC * Queries the current stage (e.g., Staging, Production) of the model version using the get_model_version method of the MLflow Client. It assigns this stage to the variable best_model_stage. 410 | # MAGIC * Checks if the current stage of the best model is not 'Production'. If it isn't, it calls the previously defined function request_model_transition_to_production to request transitioning this model to the Production stage. 411 | # MAGIC * If the best model is already in the Production stage, it prints "Best model is already in Production". 
412 | 413 | # COMMAND ---------- 414 | 415 | client = mlflow.tracking.MlflowClient() 416 | model_name = best_model.split('/')[1] 417 | model_version = best_model.split('/')[-1] 418 | best_model_stage = client.get_model_version(name=model_name, version=model_version).current_stage 419 | if best_model_stage != 'Production': 420 | # transit model to production 421 | request_model_transition_to_production() 422 | else: 423 | print("Best model is already in Production") 424 | 425 | 426 | # COMMAND ---------- 427 | 428 | 429 | --------------------------------------------------------------------------------