├── .DS_Store ├── Images ├── .DS_Store ├── image.png ├── image1.png ├── image10.png ├── image11.png ├── image12.png ├── image13.png ├── image14.png ├── image15.png ├── image16.png ├── image17.png ├── image2.png ├── image3.png ├── image4.png ├── image5.png ├── image6.png ├── image7.png ├── image9.png ├── ELT Diagram.png ├── architecture.png ├── Load-forecast-viz.png ├── team6_pyspark_code.png ├── team6_cloud_functions.png ├── weather-and-spp-over-time.png ├── team6_cloud_scheduler_image.png ├── team6_bucket_storage_structure.png └── Energy_Generation_and_Load_Consumption_Over_Time.png ├── Queries ├── .DS_Store ├── How does energy consumption vary by time of day_.sql ├── Average Energy Consumption by Month.sql └── Percentage Distribution of Each Energy.sql ├── Cloud Functions ├── requirements.txt ├── ercot_fm_latest_csv.py ├── ercot_load_forecast_csv.py ├── ercot_spp_csv.py ├── ercot_load_historical_6m.py ├── open_weather_live_data.py ├── ercot_load_latest_csv.py └── historicalHourlyWeather.py ├── PySpark Scripts ├── mergeHistoricalWeather.py ├── pyspark_ercot_load_latest_BQ_archive_csv.py ├── ercot_pyspark_load_historical_BQ_archive_csv.py ├── pyspark_ercot_load_forecast_BQ_archive_csv.py ├── test_pyspark_merge_spp_weather.py └── pyspark_ercot_merge_fm_load_latest_BQ_archive_csv.py └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/.DS_Store -------------------------------------------------------------------------------- /Images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/.DS_Store -------------------------------------------------------------------------------- /Images/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image.png -------------------------------------------------------------------------------- /Images/image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image1.png -------------------------------------------------------------------------------- /Images/image10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image10.png -------------------------------------------------------------------------------- /Images/image11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image11.png -------------------------------------------------------------------------------- /Images/image12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image12.png -------------------------------------------------------------------------------- /Images/image13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image13.png -------------------------------------------------------------------------------- /Images/image14.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image14.png -------------------------------------------------------------------------------- /Images/image15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image15.png -------------------------------------------------------------------------------- /Images/image16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image16.png -------------------------------------------------------------------------------- /Images/image17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image17.png -------------------------------------------------------------------------------- /Images/image2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image2.png -------------------------------------------------------------------------------- /Images/image3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image3.png -------------------------------------------------------------------------------- /Images/image4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image4.png -------------------------------------------------------------------------------- /Images/image5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image5.png -------------------------------------------------------------------------------- /Images/image6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image6.png -------------------------------------------------------------------------------- /Images/image7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image7.png -------------------------------------------------------------------------------- /Images/image9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/image9.png -------------------------------------------------------------------------------- /Queries/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Queries/.DS_Store -------------------------------------------------------------------------------- /Images/ELT Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/ELT Diagram.png -------------------------------------------------------------------------------- /Images/architecture.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/architecture.png -------------------------------------------------------------------------------- /Images/Load-forecast-viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/Load-forecast-viz.png -------------------------------------------------------------------------------- /Images/team6_pyspark_code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/team6_pyspark_code.png -------------------------------------------------------------------------------- /Images/team6_cloud_functions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/team6_cloud_functions.png -------------------------------------------------------------------------------- /Images/weather-and-spp-over-time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/weather-and-spp-over-time.png -------------------------------------------------------------------------------- /Images/team6_cloud_scheduler_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/team6_cloud_scheduler_image.png -------------------------------------------------------------------------------- /Images/team6_bucket_storage_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/team6_bucket_storage_structure.png -------------------------------------------------------------------------------- /Cloud Functions/requirements.txt: -------------------------------------------------------------------------------- 1 | functions-framework==3.* 2 | gridstatus >= 0.27.0 3 | pandas >= 2.2.1 4 | google-cloud-storage>=1.44.0 5 | google-cloud-bigquery -------------------------------------------------------------------------------- /Images/Energy_Generation_and_Load_Consumption_Over_Time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishsalunkhe/energydatalake/main/Images/Energy_Generation_and_Load_Consumption_Over_Time.png -------------------------------------------------------------------------------- /Queries/How does energy consumption vary by time of day_.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | EXTRACT(HOUR FROM INTERVAL_START) AS hour_of_day, 3 | AVG(load) AS average_load 4 | FROM 5 | ercot_merged.ercot_fm_load_merged 6 | GROUP BY 7 | hour_of_day 8 | ORDER BY 9 | hour_of_day; 10 | -------------------------------------------------------------------------------- /Queries/Average Energy Consumption by Month.sql: -------------------------------------------------------------------------------- 1 | -- Understanding Seasonal Variations in Electricity Demand: 2 | -- How does Ercot's electricity demand vary throughout the year? 
3 | SELECT 4 | EXTRACT(MONTH FROM INTERVAL_START) AS month, 5 | AVG(load) AS average_load 6 | FROM 7 | ercot_merged.ercot_fm_load_merged 8 | GROUP BY 9 | month 10 | ORDER BY 11 | month; 12 | -------------------------------------------------------------------------------- /Queries/Percentage Distribution of Each Energy.sql: -------------------------------------------------------------------------------- 1 | -- How does each energy source contribute to the overall energy mix, expressed as a percentage? 2 | -- This query calculates the percentage contribution of each energy source to the total energy mix, rounded to two decimal places. 3 | 4 | 5 | SELECT 6 | ROUND(SUM(coal_and_lignite) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS coal_and_lignite_percent, 7 | ROUND(SUM(hydro) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS hydro_percent, 8 | ROUND(SUM(nuclear) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS nuclear_percent, 9 | ROUND(SUM(power_storage) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS power_storage_percent, 10 | ROUND(SUM(solar) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS solar_percent, 11 | ROUND(SUM(wind) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS wind_percent, 12 | ROUND(SUM(natural_gas) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS natural_gas_percent, 13 | ROUND(SUM(other) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS other_percent 14 | FROM 15 | `ercot_merged.ercot_fm_load_merged`; 16 | 17 | -------------------------------------------------------------------------------- /Cloud Functions/ercot_fm_latest_csv.py: -------------------------------------------------------------------------------- 1 | import functions_framework 2 | import gridstatus 3 | import pandas as pd 4 | import datetime 5 | from google.cloud import storage 6 | import logging 7 | 8 | # Developer: Ushashri 9 | # Purpose: This script defines an HTTP Cloud Function to fetch ERCOT fuel mix data, 10 | # create a CSV file, and store it in a designated cloud bucket. 11 | 12 | # Configure logging 13 | logging.basicConfig(level=logging.INFO) 14 | 15 | @functions_framework.http 16 | def ercot_fm_latest_csv(request): 17 | """HTTP Cloud Function. 18 | This function fetches the ERCOT fuel mix data and creates a CSV 19 | file in the designated cloud bucket.""" 20 | try: 21 | iso = gridstatus.Ercot() 22 | 23 | df = iso.get_fuel_mix(date='latest') 24 | 25 | timestamp = datetime.datetime.now() 26 | # Updated filename format with underscores and dashes 27 | filename = f"ercot_fm_{timestamp.strftime('%Y-%m-%d_%H-%M-%S')}.csv" 28 | 29 | bucket_name = "ercot_test" 30 | folder_name = "ercot_fm_csv/fm_latest" 31 | destination_blob_name = f"{folder_name}/{filename}" 32 | 33 | upload_blob(bucket_name, df.to_csv(index=False), destination_blob_name) 34 | 35 | return f"{filename} stored to Cloud Storage Bucket" 36 | except Exception as e: 37 | logging.error(f"An error occurred: {str(e)}") 38 | return "An error occurred while processing the request." 
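# A hedged sketch, assuming the Flask-based functions-framework runtime imported above:
# an HTTP function may return a (body, status) tuple, so the except branch in the handler
# could surface failures as HTTP 500 instead of a 200 response carrying an error string.
# This helper is illustrative only and is not called by the handler above.
def error_response(message, status=500):
    """Log an error and return it with an explicit HTTP status code."""
    logging.error(message)
    return message, status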
39 | 40 | 41 | def upload_blob(bucket_name, data, destination_blob_name): 42 | """Uploads data to a Google Cloud Storage bucket.""" 43 | try: 44 | storage_client = storage.Client() 45 | bucket = storage_client.bucket(bucket_name) 46 | blob = bucket.blob(destination_blob_name) 47 | blob.upload_from_string(data) 48 | logging.info(f"File {destination_blob_name} uploaded successfully to Cloud Storage.") 49 | except Exception as e: 50 | logging.error(f"An error occurred while uploading file to Cloud Storage: {str(e)}") 51 | -------------------------------------------------------------------------------- /Cloud Functions/ercot_load_forecast_csv.py: -------------------------------------------------------------------------------- 1 | import functions_framework 2 | import gridstatus 3 | import pandas as pd 4 | import datetime 5 | import logging 6 | from google.cloud import storage 7 | 8 | # Developer: Shashank 9 | # Purpose: This script defines an HTTP Cloud Function to fetch ERCOT load forecast data 10 | # and store it as a CSV file in a designated cloud bucket. 11 | 12 | # Configure logging 13 | logging.basicConfig(level=logging.INFO) 14 | 15 | 16 | @functions_framework.http 17 | def ercot_load_forecast_csv(request): 18 | """HTTP Cloud Function. 19 | This function fetches the ERCOT load forecast data and creates a CSV 20 | file in the designated cloud bucket. 21 | """ 22 | try: 23 | iso = gridstatus.Ercot() 24 | 25 | # Fetch ERCOT load forecast data 26 | df = iso.get_load_forecast(date='today') 27 | 28 | timestamp = datetime.datetime.now() 29 | # Updated filename format with underscores and dashes 30 | filename = f"ercot_load_forecast_{timestamp.strftime('%Y-%m-%d_%H-%M-%S')}.csv" 31 | 32 | bucket_name = "ercot_test" 33 | folder_name = "ercot_load_forecast_csv" 34 | destination_blob_name = f"{folder_name}/{filename}" 35 | 36 | # Upload data to Cloud Storage 37 | upload_blob(bucket_name, df.to_csv(index=False), destination_blob_name) 38 | 39 | logging.info(f"{filename} stored to Cloud Storage Bucket") 40 | return f"{filename} stored to Cloud Storage Bucket" 41 | except Exception as e: 42 | logging.error(f"An error occurred: {str(e)}") 43 | return "An error occurred" 44 | 45 | 46 | def upload_blob(bucket_name, data, destination_blob_name): 47 | """Uploads data to a Google Cloud Storage bucket.""" 48 | try: 49 | storage_client = storage.Client() 50 | bucket = storage_client.bucket(bucket_name) 51 | blob = bucket.blob(destination_blob_name) 52 | blob.upload_from_string(data) 53 | except Exception as e: 54 | logging.error(f"Error uploading blob: {str(e)}") 55 | raise e # Re-raise the exception for proper handling by the caller -------------------------------------------------------------------------------- /Cloud Functions/ercot_spp_csv.py: -------------------------------------------------------------------------------- 1 | import functions_framework 2 | import gridstatus 3 | import json 4 | import datetime 5 | from google.cloud import storage 6 | import logging 7 | 8 | # Developer: Bingqi 9 | # Purpose: This script defines an HTTP Cloud Function to fetch ERCOT SPP data and store it as a CSV file 10 | # in a designated cloud bucket. 11 | 12 | # Initialize logging 13 | logging.basicConfig(level=logging.INFO) 14 | 15 | @functions_framework.http 16 | def ercot_spp_csv(request): 17 | """HTTP Cloud Function. 18 | This function fetches the ERCOT SPP data and creates a CSV 19 | file in the designated cloud bucket. 
20 | """ 21 | try: 22 | iso = gridstatus.Ercot() 23 | 24 | # Fetch ERCOT SPP data 25 | df = iso.get_spp(date="latest", market="REAL_TIME_15_MIN", location_type="Load Zone") 26 | 27 | # Convert Timestamp columns to string format 28 | df['Time'] = df['Time'].astype(str) 29 | df['Interval Start'] = df['Interval Start'].astype(str) 30 | df['Interval End'] = df['Interval End'].astype(str) 31 | 32 | csv_data = df.to_dict(orient="records") 33 | timestamp = datetime.datetime.now() 34 | filename = f"ercot_spp_{timestamp.strftime('%Y-%m-%d_%H-%M-%S')}.csv" 35 | 36 | bucket_name = "ercot_test" 37 | folder_name = "ercot_spp_csv/spp_latest" 38 | destination_blob_name = f"{folder_name}/{filename}" 39 | 40 | # Upload DataFrame to Cloud Storage 41 | upload_blob(bucket_name, df.to_csv(index=False), destination_blob_name) 42 | 43 | return f"{filename} stored to Cloud Storage Bucket" 44 | 45 | except Exception as e: 46 | logging.error(f"An error occurred: {str(e)}") 47 | return "An error occurred while processing the request." 48 | 49 | 50 | def upload_blob(bucket_name, data, destination_blob_name): 51 | """Uploads data to a Google Cloud Storage bucket.""" 52 | try: 53 | storage_client = storage.Client() 54 | bucket = storage_client.bucket(bucket_name) 55 | blob = bucket.blob(destination_blob_name) 56 | blob.upload_from_string(data) 57 | logging.info(f"File {destination_blob_name} uploaded successfully to Cloud Storage.") 58 | except Exception as e: 59 | logging.error(f"An error occurred while uploading file to Cloud Storage: {str(e)}") -------------------------------------------------------------------------------- /Cloud Functions/ercot_load_historical_6m.py: -------------------------------------------------------------------------------- 1 | import functions_framework 2 | import gridstatus 3 | import pandas as pd 4 | import datetime 5 | import logging 6 | from google.cloud import storage 7 | 8 | # Developer: Shashank 9 | # Purpose: This script defines an HTTP Cloud Function to fetch ERCOT load data for the last 6 months 10 | # (from 10th Nov 2023 to 10th May 2024) and store it as a CSV file in a designated cloud bucket. 11 | 12 | # Initialize logging 13 | logging.basicConfig(level=logging.INFO) 14 | 15 | @functions_framework.http 16 | def ercot_load_historical_6m(request): 17 | """HTTP Cloud Function. 18 | This function fetches the ERCOT load data from the last 6 months (10th Nov 2023 to 10th May 2024) 19 | and creates a CSV file in the designated cloud bucket. 20 | """ 21 | try: 22 | iso = gridstatus.Ercot() 23 | 24 | # Retrieve ERCOT load data for the last 6 months 25 | df = iso.get_load(start='2023-11-10', end='2024-05-10') 26 | 27 | # Check if DataFrame is empty 28 | if df.empty: 29 | logging.warning("No data found for the specified date range.") 30 | return "No data found for the specified date range." 31 | 32 | timestamp = datetime.datetime.now() 33 | # Updated filename format with underscores and dashes 34 | filename = f"ercot_load_6m_{timestamp.strftime('%Y-%m-%d_%H-%M-%S')}.csv" 35 | 36 | bucket_name = "ercot_test" 37 | folder_name = "ercot_load_csv/load_historical" 38 | destination_blob_name = f"{folder_name}/{filename}" 39 | 40 | # Upload DataFrame to Cloud Storage 41 | upload_blob(bucket_name, df.to_csv(index=False), destination_blob_name) 42 | 43 | return f"{filename} stored to Cloud Storage Bucket" 44 | 45 | except Exception as e: 46 | logging.error(f"An error occurred: {str(e)}") 47 | return "An error occurred while processing the request."
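# A hedged sketch only: the handler above pins start='2023-11-10' and end='2024-05-10'.
# If this backfill were ever re-run, a rolling "last ~6 months" window could be computed
# from the current date instead (182 days is used here as an approximation of 6 months);
# the existing datetime import is enough for this. Not wired into the handler.
def rolling_six_month_range():
    """Return (start, end) ISO date strings covering roughly the previous six months."""
    end_date = datetime.date.today()
    start_date = end_date - datetime.timedelta(days=182)
    return start_date.isoformat(), end_date.isoformat()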
48 | 49 | 50 | def upload_blob(bucket_name, data, destination_blob_name): 51 | """Uploads data to a Google Cloud Storage bucket.""" 52 | try: 53 | storage_client = storage.Client() 54 | bucket = storage_client.bucket(bucket_name) 55 | blob = bucket.blob(destination_blob_name) 56 | blob.upload_from_string(data) 57 | logging.info(f"File {destination_blob_name} uploaded successfully to Cloud Storage.") 58 | except Exception as e: 59 | logging.error(f"An error occurred while uploading file to Cloud Storage: {str(e)}") -------------------------------------------------------------------------------- /Cloud Functions/open_weather_live_data.py: -------------------------------------------------------------------------------- 1 | import functions_framework 2 | import os 3 | import requests 4 | import pandas as pd 5 | import datetime 6 | from google.cloud import storage 7 | import pytz 8 | import logging 9 | 10 | # Developer: Aditya 11 | # Purpose: This script defines an HTTP Cloud Function to fetch weather data from the OpenWeather API, 12 | # format it into a DataFrame, and store it as a CSV file in a designated cloud bucket. 13 | 14 | # Constants 15 | API_KEY = "8c6d96932235d74117595f9e8423547b" 16 | cities_data = { 17 | "LZ_HOUSTON": {"latitude": 29.763, "longitude": -95.363}, 18 | "LZ_WEST": {"latitude": 32.452, "longitude": -99.718}, 19 | "LZ_SOUTH": {"latitude": 27.801, "longitude": -97.396}, 20 | "LZ_NORTH": {"latitude": 33.578, "longitude": -101.855}, 21 | } 22 | 23 | # Configure logging 24 | logging.basicConfig(level=logging.INFO) 25 | 26 | @functions_framework.http 27 | def fetch_and_store_weather_data(request): 28 | """HTTP Cloud Function. 29 | This function fetches weather data from the OpenWeather API 30 | and stores it in a CSV file in a cloud bucket.""" 31 | try: 32 | weather_data = fetch_weather_data() 33 | df = pd.DataFrame(weather_data) 34 | 35 | timestamp = datetime.datetime.now() 36 | filename = f"weather_live_{timestamp.strftime('%Y-%m-%d_%H-%M-%S')}.csv" 37 | 38 | bucket_name = "openweather_live_data" 39 | folder_name = "quarter_hourly_weather_data" 40 | destination_blob_name = f"{folder_name}/{filename}" 41 | 42 | upload_blob(bucket_name, df.to_csv(index=False), destination_blob_name) 43 | 44 | return f"Weather data uploaded as {filename}" 45 | except Exception as e: 46 | logging.error(f"An error occurred: {str(e)}") 47 | return "An error occurred while processing the request." 
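# A hedged sketch, not a change to the constants above: the OpenWeather key is currently
# hardcoded in API_KEY. Since `os` is already imported in this file, the key could instead
# be read from an environment variable configured on the Cloud Function; OPENWEATHER_API_KEY
# is an assumed variable name, not something defined elsewhere in this repo.
def get_openweather_api_key():
    """Prefer an environment-supplied key, falling back to the module-level constant."""
    return os.environ.get("OPENWEATHER_API_KEY", API_KEY)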
48 | 49 | 50 | def fetch_weather_data(): 51 | """Fetch weather data for predefined cities.""" 52 | weather_records = [] 53 | for zone, coords in cities_data.items(): 54 | try: 55 | url = f"https://api.openweathermap.org/data/2.5/weather?lat={coords['latitude']}&lon={coords['longitude']}&units=imperial&appid={API_KEY}" 56 | response = requests.get(url) 57 | weather_data = response.json() 58 | weather_records.append({ 59 | "Location": zone, 60 | "Temperature": weather_data["main"]["temp"], 61 | "Temp_min": weather_data["main"]["temp_min"], 62 | "Temp_max": weather_data["main"]["temp_max"], 63 | "Pressure": weather_data["main"]["pressure"], 64 | "Humidity": weather_data["main"]["humidity"], 65 | "Wind Speed": weather_data["wind"]["speed"], 66 | "Date": datetime.datetime.utcfromtimestamp(weather_data['dt']).replace(tzinfo=pytz.utc).astimezone(pytz.timezone('America/Chicago')) 67 | }) 68 | except Exception as e: 69 | logging.error(f"Error fetching weather data for {zone}: {str(e)}") 70 | return weather_records 71 | 72 | 73 | def upload_blob(bucket_name, data, destination_blob_name): 74 | """Uploads data to a Google Cloud Storage bucket.""" 75 | try: 76 | storage_client = storage.Client() 77 | bucket = storage_client.bucket(bucket_name) 78 | blob = bucket.blob(destination_blob_name) 79 | blob.upload_from_string(data) 80 | except Exception as e: 81 | logging.error(f"Error uploading blob: {str(e)}") 82 | raise e # Re-raise the exception for proper handling by the caller -------------------------------------------------------------------------------- /PySpark Scripts/mergeHistoricalWeather.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from google.cloud import storage 3 | from pyspark.sql import SparkSession 4 | from google.cloud import bigquery 5 | # Initialize logging 6 | logging.basicConfig(level=logging.INFO) 7 | 8 | # Initialize SparkSession 9 | spark = SparkSession.builder \ 10 | .appName("HistoricalWeatherDataETL") \ 11 | .config("spark.jars.packages", "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.22.0") \ 12 | .getOrCreate() 13 | 14 | # Define cloud storage paths 15 | source_folder_path = "gs://openmeteo-weather/hourly-historical-weather-data" 16 | destination_folder_path = "gs://openmeteo-weather/merged-historical-weather-data" 17 | temporary_gcs_bucket = "ercot_test" 18 | 19 | # Create a Cloud Storage client 20 | storage_client = storage.Client() 21 | 22 | # Function to check if a folder in Cloud Storage contains any files 23 | def check_folder_has_files(bucket_name, folder_path): 24 | bucket = storage_client.bucket(bucket_name) 25 | blobs = bucket.list_blobs(prefix=folder_path) 26 | return any(blobs) 27 | 28 | # Check if source folder has files 29 | if not check_folder_has_files("openmeteo-weather", "hourly-historical-weather-data"): 30 | logging.info("No files found in the source folder. 
Exiting...") 31 | # Add any additional handling or return statement if needed 32 | else: 33 | # Read and merge CSV files into one dataframe 34 | dfs = [] 35 | blobs = storage_client.list_blobs("openmeteo-weather", prefix="hourly-historical-weather-data/") 36 | for blob in blobs: 37 | if blob.name.endswith('.csv'): 38 | df = spark.read.option("header", "true").csv(f"gs://{blob.bucket.name}/{blob.name}") 39 | dfs.append(df) 40 | 41 | # Combine all DataFrames into one 42 | merged_df = dfs[0] 43 | for df in dfs[1:]: 44 | merged_df = merged_df.union(df) 45 | 46 | # Data cleaning 47 | 48 | # Assuming data cleaning steps here 49 | # For demonstration, let's assume we drop null values 50 | merged_df = merged_df.dropna() 51 | 52 | # Convert "date" column to timestamp format with timezone offset (-05:00) 53 | merged_df = merged_df.withColumn("date", merged_df["date"].cast("timestamp")) 54 | 55 | # Display unique zones and their count 56 | zone_counts = merged_df.groupBy("zone").count() 57 | logging.info("Unique zones and their count:") 58 | zone_counts.show() 59 | 60 | # Write merged DataFrame to destination folder 61 | logging.info("Writing merged DataFrame to destination folder...") 62 | merged_df.write \ 63 | .format("csv") \ 64 | .option("header", "true") \ 65 | .mode("overwrite") \ 66 | .save(destination_folder_path) 67 | 68 | logging.info("Merged DataFrame successfully written to destination folder.") 69 | 70 | # Define BigQuery table schema 71 | schema = [ 72 | bigquery.SchemaField("latitude", "FLOAT"), 73 | bigquery.SchemaField("longitude", "FLOAT"), 74 | bigquery.SchemaField("date", "TIMESTAMP"), 75 | bigquery.SchemaField("temperature_2m", "FLOAT"), 76 | bigquery.SchemaField("relative_humidity_2m", "FLOAT"), 77 | bigquery.SchemaField("dew_point_2m", "FLOAT"), 78 | bigquery.SchemaField("precipitation", "FLOAT"), 79 | bigquery.SchemaField("rain", "FLOAT"), 80 | bigquery.SchemaField("snowfall", "FLOAT"), 81 | bigquery.SchemaField("cloud_cover", "FLOAT"), 82 | bigquery.SchemaField("cloud_cover_low", "FLOAT"), 83 | bigquery.SchemaField("cloud_cover_mid", "FLOAT"), 84 | bigquery.SchemaField("cloud_cover_high", "FLOAT"), 85 | bigquery.SchemaField("wind_speed_10m", "FLOAT"), 86 | bigquery.SchemaField("wind_speed_100m", "FLOAT"), 87 | bigquery.SchemaField("wind_direction_10m", "FLOAT"), 88 | bigquery.SchemaField("wind_direction_100m", "FLOAT"), 89 | bigquery.SchemaField("wind_gusts_10m", "FLOAT"), 90 | bigquery.SchemaField("zone", "STRING"), # Add zone field to schema 91 | ] 92 | 93 | # Define BigQuery table ID 94 | project_id = "driven-stage-365620" 95 | dataset_id = "ercot_merged" 96 | table_id = "historical_weather_data" 97 | 98 | # Write merged DataFrame to BigQuery 99 | logging.info("Writing merged DataFrame to BigQuery...") 100 | merged_df.write \ 101 | .format("bigquery") \ 102 | .option("temporaryGcsBucket", temporary_gcs_bucket) \ 103 | .option("table", f"{project_id}.{dataset_id}.{table_id}") \ 104 | .mode("overwrite") \ 105 | .save() 106 | 107 | logging.info("Merged DataFrame successfully written to BigQuery.") 108 | 109 | # Stop SparkSession 110 | spark.stop() -------------------------------------------------------------------------------- /PySpark Scripts/pyspark_ercot_load_latest_BQ_archive_csv.py: -------------------------------------------------------------------------------- 1 | #This code reads csv files from 1 folder in cloud storage, merges them, transforms datatypes, updates big query table, archives the files and deletes from source folder 2 | from pyspark.sql import 
SparkSession 3 | from pyspark.sql.functions import col,sum 4 | from google.cloud import storage 5 | from pyspark.sql.functions import col, to_timestamp 6 | from pyspark.sql.types import FloatType, DecimalType 7 | import logging 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | # Spark session 11 | spark = SparkSession.builder \ 12 | .appName("ercotLoadLatestApp") \ 13 | .config("spark.jars.packages", "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.22.0") \ 14 | .getOrCreate() 15 | 16 | # Cloud storage paths 17 | folder_path = "gs://ercot_test/ercot_load_csv/load_latest" 18 | 19 | destination_table = "driven-stage-365620.ercot_merged.ercot_load_latest" 20 | temporary_gcs_bucket = "ercot_test" 21 | archive_folder_name = "ercot_test/ercot_archive_csv" 22 | 23 | # Create a Cloud Storage client 24 | storage_client = storage.Client() 25 | 26 | # Define a function to check if a folder in Cloud Storage contains any files 27 | def check_folder_has_files(bucket_name, folder_path): 28 | bucket = storage_client.bucket(bucket_name) 29 | blobs = bucket.list_blobs(prefix=folder_path) 30 | return any(blobs) 31 | 32 | if not check_folder_has_files("ercot_test", "ercot_load_csv/load_latest"): 33 | logging.info("No files found in the primary folder. Exiting...") 34 | # Add any additional handling or return statement if needed 35 | else: 36 | # Proceed with reading and cleaning dataframes 37 | df_load_latest_clean = spark.read.option("header", "true").csv(folder_path) 38 | 39 | 40 | #Column name changes 41 | df_load_latest_clean = df_load_latest_clean.select([col(c).alias(c.replace(' ', '_').lower()) for c in df_load_latest_clean.columns]) 42 | 43 | #Logging-1 44 | logging.info(f"Number of rows in the dataframe before datatype transformation: {df_load_latest_clean.count()}") 45 | 46 | #Printing the combined df before transformation 47 | print("Schema of the dataframe:") 48 | df_load_latest_clean.printSchema() 49 | 50 | df_load_latest_clean.show(5) 51 | 52 | null_counts = df_load_latest_clean.select([sum(col(c).isNull().cast('int')).alias(c) for c in df_load_latest_clean.columns]) 53 | #Logging-2 54 | logging.info(f"Null counts : {null_counts}") 55 | 56 | #datatype Transformations and checking for nulls 57 | # Converting 'time' columns to timestamp 58 | df_load_latest_clean = df_load_latest_clean.withColumn('time', to_timestamp(col('time'))) 59 | df_load_latest_clean = df_load_latest_clean.withColumn('interval_start', to_timestamp(col('interval_start'))) 60 | df_load_latest_clean = df_load_latest_clean.withColumn('interval_end', to_timestamp(col('interval_end'))) 61 | 62 | # Converting other columns to float and handling null values 63 | float_columns = ['load'] 64 | decimal_type = DecimalType(10, 2) 65 | for col_name in float_columns: 66 | df_load_latest_clean = df_load_latest_clean.withColumn(col_name, col(col_name).cast(decimal_type)) 67 | 68 | #Dropping Null values 69 | df_load_latest_clean = df_load_latest_clean.na.drop() 70 | 71 | # Print schema of transformed dataframe 72 | print("Schema of the cleaned dataframe:") 73 | df_load_latest_clean.printSchema() 74 | df_load_latest_clean.show(5) 75 | # Writing DF to BigQuery 76 | df_load_latest_clean.write.format('bigquery') \ 77 | .option('table', destination_table) \ 78 | .option('temporaryGcsBucket', temporary_gcs_bucket) \ 79 | .mode('append') \ 80 | .save() 81 | #Reinstate this part of the code after datatype transformation works correctly in big query 82 | # Creating cloud storage client 83 | storage_client = storage.Client() 84 | 85 | 
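# A hedged alternative to the archiving loop that follows: instead of downloading each
# object as a string and re-uploading it, google-cloud-storage's Bucket.copy_blob performs
# a server-side copy, so large CSVs never pass through this driver process. Sketch only;
# the existing loop below is left unchanged.
def archive_blob_server_side(source_bucket, destination_bucket, blob,
                             source_prefix, destination_prefix):
    """Copy one object into the archive prefix server-side, then delete the original."""
    new_name = blob.name.replace(source_prefix, destination_prefix, 1)
    source_bucket.copy_blob(blob, destination_bucket, new_name)
    blob.delete()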
# Defining source and destination folders for archiving 86 | archive_configurations = [ 87 | 88 | { 89 | "source_bucket_name": "ercot_test", 90 | "source_folder_name": "ercot_load_csv/load_latest", 91 | "destination_bucket_name": "ercot_test", 92 | "destination_folder_name": "ercot_archive_csv" 93 | } 94 | ] 95 | 96 | # Looping through archive config 97 | for config in archive_configurations: 98 | 99 | source_bucket = storage_client.bucket(config["source_bucket_name"]) 100 | destination_bucket = storage_client.bucket(config["destination_bucket_name"]) 101 | 102 | # Getting all the present files in the source folder 103 | blobs = source_bucket.list_blobs(prefix=config["source_folder_name"]) 104 | 105 | # Iterating and archiving 106 | for blob in blobs: 107 | # Creating destination blob 108 | if not blob.name.endswith('/'): 109 | destination_blob_name = blob.name.replace(config["source_folder_name"], config["destination_folder_name"], 1) 110 | 111 | # Copying files to destination blob 112 | source_blob = source_bucket.blob(blob.name) 113 | destination_blob = destination_bucket.blob(destination_blob_name) 114 | destination_blob.upload_from_string(source_blob.download_as_string()) 115 | 116 | # Deleting copied file at source folder 117 | source_blob.delete() 118 | 119 | 120 | #end 121 | spark.stop() -------------------------------------------------------------------------------- /PySpark Scripts/ercot_pyspark_load_historical_BQ_archive_csv.py: -------------------------------------------------------------------------------- 1 | #This code reads csv files from 1 folder in cloud storage, merges them, transforms datatypes, updates big query table, archives the files and deletes from source folder 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.functions import col,sum 4 | from google.cloud import storage 5 | from pyspark.sql.functions import col, to_timestamp 6 | from pyspark.sql.types import FloatType, DecimalType 7 | import logging 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | # Spark session 11 | spark = SparkSession.builder \ 12 | .appName("ercotLoadHistoricalApp") \ 13 | .config("spark.jars.packages", "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.22.0") \ 14 | .getOrCreate() 15 | 16 | # Cloud storage paths 17 | folder_path = "gs://ercot_test/ercot_load_csv/load_historical" 18 | 19 | destination_table = "driven-stage-365620.ercot_merged.ercot_load_historical" 20 | temporary_gcs_bucket = "ercot_test" 21 | archive_folder_name = "ercot_test/ercot_archive_csv" 22 | 23 | storage_client = storage.Client() 24 | 25 | def check_folder_has_files(bucket_name, folder_path): 26 | bucket = storage_client.bucket(bucket_name) 27 | blobs = bucket.list_blobs(prefix=folder_path) 28 | return any(blobs) 29 | 30 | if not check_folder_has_files("ercot_test", "ercot_load_csv/load_historical"): 31 | logging.info("No files found in the primary folder. 
Exiting...") 32 | # Add any additional handling or return statement if needed 33 | else: 34 | # Proceed with reading and cleaning dataframes 35 | df_load_historical = spark.read.option("header", "true").csv(folder_path) 36 | # Reading and cleaning dataframes 37 | 38 | #Column name changes 39 | df_load_historical_clean = df_load_historical.select([col(c).alias(c.replace(' ', '_').lower()) for c in df_load_historical.columns]) 40 | 41 | #Logging-1 42 | logging.info(f"Number of rows in the dataframe before datatype transformation: {df_load_historical_clean.count()}") 43 | 44 | #Printing the combined df before transformation 45 | print("Schema of the dataframe:") 46 | df_load_historical_clean.printSchema() 47 | 48 | df_load_historical_clean.show(5) 49 | 50 | null_counts = df_load_historical_clean.select([sum(col(c).isNull().cast('int')).alias(c) for c in df_load_historical_clean.columns]) 51 | #Logging-2 52 | logging.info(f"Null counts : {null_counts}") 53 | 54 | #datatype Transformations and checking for nulls 55 | # Converting 'time' columns to timestamp 56 | df_load_historical_clean = df_load_historical_clean.withColumn('time', to_timestamp(col('time'))) 57 | df_load_historical_clean = df_load_historical_clean.withColumn('interval_start', to_timestamp(col('interval_start'))) 58 | df_load_historical_clean = df_load_historical_clean.withColumn('interval_end', to_timestamp(col('interval_end'))) 59 | 60 | # Converting other columns to float and handling null values 61 | float_columns = ['load'] 62 | decimal_type = DecimalType(10, 2) 63 | for col_name in float_columns: 64 | df_load_historical_clean = df_load_historical_clean.withColumn(col_name, col(col_name).cast(decimal_type)) 65 | 66 | #Dropping Null values 67 | df_load_historical_clean = df_load_historical_clean.na.drop() 68 | # Drop duplicate rows 69 | df_load_historical_clean = df_load_historical_clean.dropDuplicates() 70 | 71 | # Print schema of transformed dataframe 72 | print("Schema of the cleaned dataframe:") 73 | df_load_historical_clean.printSchema() 74 | df_load_historical_clean.show(5) 75 | # Writing DF to BigQuery 76 | df_load_historical_clean.write.format('bigquery') \ 77 | .option('table', destination_table) \ 78 | .option('temporaryGcsBucket', temporary_gcs_bucket) \ 79 | .mode('append') \ 80 | .save() 81 | #Reinstate this part of the code after datatype transformation works correctly in big query 82 | # Creating cloud storage client 83 | storage_client = storage.Client() 84 | 85 | # Defining source and destination folders for archiving 86 | archive_configurations = [ 87 | 88 | { 89 | "source_bucket_name": "ercot_test", 90 | "source_folder_name": "ercot_load_csv/load_historical", 91 | "destination_bucket_name": "ercot_test", 92 | "destination_folder_name": "ercot_archive_csv" 93 | } 94 | ] 95 | 96 | # Looping through archive config 97 | for config in archive_configurations: 98 | 99 | source_bucket = storage_client.bucket(config["source_bucket_name"]) 100 | destination_bucket = storage_client.bucket(config["destination_bucket_name"]) 101 | 102 | # Getting all the present files in the source folder 103 | blobs = source_bucket.list_blobs(prefix=config["source_folder_name"]) 104 | 105 | # Iterating and archiving 106 | for blob in blobs: 107 | # Creating destination blob 108 | if not blob.name.endswith('/'): 109 | destination_blob_name = blob.name.replace(config["source_folder_name"], config["destination_folder_name"], 1) 110 | 111 | # Copying files to destination blob 112 | source_blob = source_bucket.blob(blob.name) 113 | 
destination_blob = destination_bucket.blob(destination_blob_name) 114 | destination_blob.upload_from_string(source_blob.download_as_string()) 115 | 116 | # Deleting copied file at source folder 117 | source_blob.delete() 118 | 119 | 120 | #end 121 | spark.stop() -------------------------------------------------------------------------------- /PySpark Scripts/pyspark_ercot_load_forecast_BQ_archive_csv.py: -------------------------------------------------------------------------------- 1 | #This code reads csv files from 1 folder in cloud storage, merges them, transforms datatypes, updates big query table, archives the files and deletes from source folder 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.functions import col,sum, round 4 | from google.cloud import storage 5 | from pyspark.sql.functions import col, to_timestamp 6 | from pyspark.sql.types import FloatType, DecimalType 7 | import logging 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | # Spark session 11 | spark = SparkSession.builder \ 12 | .appName("ercotLoadForecastApp") \ 13 | .config("spark.jars.packages", "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.22.0") \ 14 | .getOrCreate() 15 | 16 | #print("Hello world!") 17 | 18 | # Cloud storage paths 19 | folder_path = "gs://ercot_test/ercot_load_forecast_csv" 20 | 21 | destination_table = "driven-stage-365620.ercot_merged.ercot_load_forecast" 22 | temporary_gcs_bucket = "ercot_test" 23 | archive_folder_name = "ercot_archive_csv" 24 | 25 | storage_client = storage.Client() 26 | def check_folder_has_files(bucket_name, folder_path): 27 | bucket = storage_client.bucket(bucket_name) 28 | blobs = bucket.list_blobs(prefix=folder_path) 29 | return any(blobs) 30 | 31 | if not check_folder_has_files("ercot_test", "ercot_load_forecast_csv"): 32 | logging.info("No files found in the primary folder. 
Exiting...") 33 | # Add any additional handling or return statement if needed 34 | else: 35 | # Proceed with reading and cleaning dataframes 36 | df_load_forecast = spark.read.option("header", "true").csv(folder_path) 37 | # Reading and cleaning dataframes 38 | 39 | print("Schema of the dataframe:") 40 | df_load_forecast.printSchema() 41 | df_load_forecast.show(5) 42 | 43 | #Column name changes 44 | df_load_forecast_clean = df_load_forecast.select([col(c).alias(c.replace(' ', '_').lower()) for c in df_load_forecast.columns]) 45 | 46 | #Logging-1 47 | logging.info(f"Number of rows in the dataframe before datatype transformation: {df_load_forecast_clean.count()}") 48 | 49 | #Printing the combined df before transformation 50 | print("Schema of the dataframe:") 51 | df_load_forecast_clean.printSchema() 52 | #df_load_forecast_clean.show(5) 53 | 54 | null_counts = df_load_forecast_clean.select([sum(col(c).isNull().cast('int')).alias(c) for c in df_load_forecast_clean.columns]) 55 | #Logging-2 56 | logging.info(f"Null counts : {null_counts}") 57 | 58 | #datatype Transformations and checking for nulls 59 | # Converting 'time' columns to timestamp 60 | df_load_forecast_clean = df_load_forecast_clean.withColumn('time', to_timestamp(col('time'))) 61 | df_load_forecast_clean = df_load_forecast_clean.withColumn('interval_start', to_timestamp(col('interval_start'))) 62 | df_load_forecast_clean = df_load_forecast_clean.withColumn('interval_end', to_timestamp(col('interval_end'))) 63 | df_load_forecast_clean = df_load_forecast_clean.withColumn('publish_time', to_timestamp(col('publish_time'))) 64 | df_load_forecast_clean.show(5) 65 | # Converting other columns to float and handling null values 66 | float_columns = ['north', 'south', 'west','houston', 'system_total'] 67 | decimal_type = DecimalType(10, 2) 68 | for col_name in float_columns: 69 | df_load_forecast_clean = df_load_forecast_clean.withColumn(col_name, col(col_name).cast(decimal_type)) 70 | 71 | #Dropping null values, examination of csvs shows no nulls 72 | df_load_forecast_clean = df_load_forecast_clean.na.drop() 73 | # Drop duplicate rows 74 | df_load_forecast_clean = df_load_forecast_clean.dropDuplicates() 75 | # Print schema of transformed dataframe 76 | print("Schema of the cleaned dataframe:") 77 | df_load_forecast_clean.printSchema() 78 | df_load_forecast_clean.show(5) 79 | 80 | try: 81 | # Writing DF to BigQuery 82 | df_load_forecast_clean.write.format('bigquery') \ 83 | .option('table', destination_table) \ 84 | .option('temporaryGcsBucket', temporary_gcs_bucket) \ 85 | .mode('append') \ 86 | .save() 87 | logging.info("Data written to BigQuery successfully.") 88 | except Exception as e: 89 | logging.error(f"Error writing to BigQuery: {str(e)}") 90 | 91 | 92 | #Reinstate this part of the code after datatype transformation works correctly in big query 93 | # Creating cloud storage client 94 | storage_client = storage.Client() 95 | 96 | # Defining source and destination folders for archiving 97 | archive_configurations = [ 98 | 99 | { 100 | "source_bucket_name": "ercot_test", 101 | "source_folder_name": "ercot_load_forecast_csv", 102 | "destination_bucket_name": "ercot_test", 103 | "destination_folder_name": "ercot_archive_csv" 104 | } 105 | ] 106 | 107 | # Looping through archive config 108 | for config in archive_configurations: 109 | 110 | source_bucket = storage_client.bucket(config["source_bucket_name"]) 111 | destination_bucket = storage_client.bucket(config["destination_bucket_name"]) 112 | 113 | # Getting all the present files in 
the source folder 114 | blobs = source_bucket.list_blobs(prefix=config["source_folder_name"]) 115 | 116 | # Iterating and archiving 117 | for blob in blobs: 118 | # Creating destination blob 119 | if not blob.name.endswith('/'): 120 | destination_blob_name = blob.name.replace(config["source_folder_name"], config["destination_folder_name"], 1) 121 | 122 | # Copying files to destination blob 123 | source_blob = source_bucket.blob(blob.name) 124 | destination_blob = destination_bucket.blob(destination_blob_name) 125 | destination_blob.upload_from_string(source_blob.download_as_string()) 126 | 127 | # Deleting copied file at source folder 128 | source_blob.delete() 129 | 130 | 131 | #end 132 | spark.stop() -------------------------------------------------------------------------------- /PySpark Scripts/test_pyspark_merge_spp_weather.py: -------------------------------------------------------------------------------- 1 | #This code reads csv files from 2 folders in cloud storage, merges them, transforms datatypes, updates big query table, archives the files and deletes from source folder 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.functions import col 4 | from google.cloud import storage 5 | from pyspark.sql.functions import col, to_timestamp 6 | import pandas as pd 7 | from pyspark.sql.functions import col, unix_timestamp, expr 8 | import logging 9 | 10 | logging.basicConfig(level=logging.INFO) 11 | 12 | spark = SparkSession.builder \ 13 | .appName("spp_weather") \ 14 | .config("spark.jars.packages", "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.22.0") \ 15 | .getOrCreate() 16 | 17 | primary_folder_path = "gs://ercot_test/ercot_spp_csv/spp_latest" 18 | secondary_folder_path = "gs://openweather_live_data/quarter_hourly_weather_data" 19 | destination_table = "driven-stage-365620.ercot_merged.ercot_spp_weather_merged" 20 | temporary_gcs_bucket = "ercot_test" 21 | 22 | # Create a Cloud Storage client 23 | storage_client = storage.Client() 24 | 25 | # Define a function to check if a folder in Cloud Storage contains any files 26 | def check_folder_has_files(bucket_name, folder_path): 27 | bucket = storage_client.bucket(bucket_name) 28 | blobs = bucket.list_blobs(prefix=folder_path) 29 | return any(blobs) 30 | 31 | # Check if primary folder has files 32 | if not check_folder_has_files("ercot_test", "ercot_spp_csv/spp_latest"): 33 | logging.info("No files found in the primary folder. Exiting...") 34 | # Add any additional handling or return statement if needed 35 | else: 36 | # Proceed with reading and cleaning dataframes 37 | prices_df = spark.read.option("header", "true").csv(primary_folder_path) 38 | # Check if secondary folder has files 39 | if not check_folder_has_files("openweather_live_data", "quarter_hourly_weather_data"): 40 | logging.info("No files found in the secondary folder. 
Exiting...") 41 | # Add any additional handling or return statement if needed 42 | else: 43 | # Proceed with reading and cleaning dataframes 44 | weather_df = spark.read.option("header", "true").csv(secondary_folder_path) 45 | 46 | # Reading and cleaning dataframes 47 | 48 | # Convert Date and Time into timestamp format considering the timezone offset included in your data (-05:00) 49 | weather_df = weather_df.withColumn("Timestamp", to_timestamp(col("Date"), "yyyy-MM-dd HH:mm:ssXXX")) 50 | prices_df = prices_df.withColumn("Interval Start", to_timestamp(col("Interval Start"), "yyyy-MM-dd HH:mm:ssXXX")) 51 | prices_df = prices_df.withColumn("Interval End", to_timestamp(col("Interval End"), "yyyy-MM-dd HH:mm:ssXXX")) 52 | 53 | 54 | joined_df = weather_df.alias("weather").join( 55 | prices_df.alias("prices"), 56 | (col("weather.Location") == col("prices.Location")) & # Spatial alignment 57 | (col("weather.Timestamp").between(col("prices.Interval Start"), col("prices.Interval End"))), # Temporal alignment 58 | "inner" 59 | ) 60 | 61 | 62 | final_df = joined_df.select( 63 | col("weather.Location"), 64 | col("weather.Temperature").cast("float").alias("Temperature"), 65 | col("weather.Temp_min").cast("float").alias("Temp_min"), 66 | col("weather.Temp_max").cast("float").alias("Temp_max"), 67 | col("weather.Pressure").cast("float").alias("Pressure"), 68 | col("weather.Humidity").cast("float").alias("Humidity"), 69 | col("weather.Wind Speed").cast("float").alias("Wind_Speed"), # Note: Adjusted column name 70 | col("weather.Timestamp").alias("Weather_Timestamp"), 71 | col("prices.SPP").cast("float").alias("SPP"), 72 | to_timestamp(col("prices.Time")).alias("Price_Time"), # Convert to datetime 73 | col("prices.Interval Start").alias("Price_Interval_Start"), 74 | col("prices.Interval End").alias("Price_Interval_End") 75 | ) 76 | 77 | 78 | final_df = final_df.dropDuplicates() 79 | print("Weather DataFrame:") 80 | weather_df.select("Location", "Timestamp").show(truncate=False) 81 | 82 | print("Prices DataFrame:") 83 | prices_df.select("Location", "Interval Start", "Interval End").show(truncate=False) 84 | 85 | # Show the results to verify the join 86 | final_df.show(truncate=False) 87 | 88 | # Write the final DataFrame to BigQuery 89 | final_df.write.format("bigquery") \ 90 | .option("temporaryGcsBucket", temporary_gcs_bucket) \ 91 | .option("table", destination_table) \ 92 | .mode("append") \ 93 | .save() 94 | 95 | storage_client = storage.Client() 96 | 97 | # Defining source and destination folders for archiving 98 | archive_configurations = [ 99 | { 100 | "source_bucket_name": "ercot_test", 101 | "source_folder_name": "ercot_spp_csv/spp_latest", 102 | "destination_bucket_name": "ercot_test", 103 | "destination_folder_name": "ercot_archive_csv" 104 | }, 105 | { 106 | "source_bucket_name": "openweather_live_data", 107 | "source_folder_name": "quarter_hourly_weather_data", 108 | "destination_bucket_name": "openweather_live_data", 109 | "destination_folder_name": "openweather_archive" 110 | } 111 | ] 112 | 113 | # Looping through archive config 114 | for config in archive_configurations: 115 | 116 | source_bucket = storage_client.bucket(config["source_bucket_name"]) 117 | destination_bucket = storage_client.bucket(config["destination_bucket_name"]) 118 | 119 | # Getting all the present files in the source folder 120 | blobs = source_bucket.list_blobs(prefix=config["source_folder_name"]) 121 | 122 | # Iterating and archiving 123 | for blob in blobs: 124 | # Creating destination blob 125 | if not 
blob.name.endswith('/'): 126 | destination_blob_name = blob.name.replace(config["source_folder_name"], config["destination_folder_name"], 1) 127 | 128 | # Copying files to destination blob 129 | source_blob = source_bucket.blob(blob.name) 130 | destination_blob = destination_bucket.blob(destination_blob_name) 131 | destination_blob.upload_from_string(source_blob.download_as_string()) 132 | 133 | # Deleting copied file at source folder 134 | source_blob.delete() 135 | 136 | 137 | 138 | #end 139 | spark.stop() -------------------------------------------------------------------------------- /Cloud Functions/ercot_load_latest_csv.py: -------------------------------------------------------------------------------- 1 | import functions_framework 2 | import openmeteo_requests 3 | import pandas as pd 4 | from google.cloud import storage 5 | from retry_requests import retry 6 | import requests_cache 7 | import logging 8 | import datetime 9 | 10 | # Developer: Ashish 11 | # Purpose: This script defines an HTTP Cloud Function to fetch historical hourly weather data for cities, 12 | # store it as CSV files in a designated cloud bucket, and upload them to Google Cloud Storage. 13 | 14 | # Configure logging 15 | logging.basicConfig(level=logging.INFO) 16 | 17 | # Constants 18 | START_DATE = "2023-11-10" 19 | END_DATE = "2024-05-10" 20 | BUCKET_NAME = "openmeteo-weather" 21 | HISTORICAL_DATA_FOLDER = "hourly-historical-weather-data" 22 | 23 | # Zone data with names, latitudes, and longitudes 24 | zones_data = { 25 | "LZ_HOUSTON": {"latitude": 29.763, "longitude": -95.363}, 26 | "LZ_WEST": {"latitude": 32.452, "longitude": -99.718}, 27 | "LZ_SOUTH": {"latitude": 27.801, "longitude": -97.396}, 28 | "LZ_NORTH": {"latitude": 33.578, "longitude": -101.855}, 29 | } 30 | 31 | 32 | # Setup the Open-Meteo API client with cache and retry on error 33 | cache_session = requests_cache.CachedSession('.cache', expire_after=3600) 34 | retry_session = retry(cache_session, retries=5, backoff_factor=0.2) 35 | openmeteo = openmeteo_requests.Client(session=retry_session) 36 | 37 | 38 | def validate_zone_data(zone_data): 39 | """Validate zone data to ensure it contains latitude and longitude. 40 | 41 | Args: 42 | zone_data (dict): The data for a zone including latitude and longitude. 43 | 44 | Returns: 45 | bool: True if the zone data is valid, False otherwise. 46 | """ 47 | return all(key in zone_data for key in ["latitude", "longitude"]) 48 | 49 | 50 | def store_hourly_data_for_zone(zone, zone_data): 51 | """Store hourly historical weather data for a given zone. 52 | 53 | Args: 54 | zone (str): The name of the zone. 55 | zone_data (dict): The data for the zone including latitude and longitude. 56 | 57 | Returns: 58 | str: The location of the stored data in Cloud Storage, or None if an error occurred. 59 | """ 60 | try: 61 | # Make sure all required weather variables are listed here 62 | url = "https://archive-api.open-meteo.com/v1/archive" 63 | params = { 64 | "latitude": zone_data["latitude"], 65 | "longitude": zone_data["longitude"], 66 | "start_date": START_DATE, 67 | "end_date": END_DATE, 68 | "hourly": ["temperature_2m", "relative_humidity_2m", "dew_point_2m", "precipitation", 69 | "rain", "snowfall", "cloud_cover", "cloud_cover_low", "cloud_cover_mid", 70 | "cloud_cover_high", "wind_speed_10m", "wind_speed_100m", "wind_direction_10m", 71 | "wind_direction_100m", "wind_gusts_10m"] 72 | } 73 | responses = openmeteo.weather_api(url, params=params) 74 | 75 | # Process first location. 
Add a for-loop for multiple locations or weather models 76 | response = responses[0] 77 | hourly = response.Hourly() 78 | 79 | # Create a dictionary to store hourly data 80 | hourly_data = { 81 | "zone": zone, 82 | "latitude": zone_data["latitude"], 83 | "longitude": zone_data["longitude"], 84 | "date": pd.date_range( 85 | start=pd.to_datetime(hourly.Time(), unit="s", utc=True), 86 | end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True), 87 | freq=pd.Timedelta(seconds=hourly.Interval()), 88 | inclusive="left" 89 | ) 90 | } 91 | 92 | # Extract data for each variable 93 | for i, variable_name in enumerate(params["hourly"]): 94 | values = hourly.Variables(i).ValuesAsNumpy() 95 | hourly_data[variable_name] = values 96 | 97 | # Create DataFrame from the hourly data dictionary 98 | hourly_dataframe = pd.DataFrame(data=hourly_data) 99 | 100 | # Convert DataFrame to CSV format 101 | csv_data = hourly_dataframe.to_csv(index=False).encode() 102 | 103 | # Define blob name 104 | blob_name = f"{HISTORICAL_DATA_FOLDER}/{zone}_hourly_weather_data.csv" 105 | 106 | # Upload CSV data to Google Cloud Storage 107 | upload_blob(BUCKET_NAME, csv_data, blob_name) 108 | 109 | location = f"gs://{BUCKET_NAME}/{blob_name}" 110 | logging.info(f"Historical hourly weather data for {zone} stored in Cloud Storage at {location}") 111 | return location 112 | except Exception as e: 113 | logging.error(f"Error processing data for {zone}: {str(e)}") 114 | return None 115 | 116 | 117 | def upload_blob(bucket_name, data, blob_name): 118 | """Uploads data to a Google Cloud Storage bucket. 119 | 120 | Args: 121 | bucket_name (str): The name of the Cloud Storage bucket. 122 | data (bytes): The data to upload. 123 | blob_name (str): The name of the blob in the bucket. 124 | """ 125 | storage_client = storage.Client() 126 | bucket = storage_client.bucket(bucket_name) 127 | blob = bucket.blob(blob_name) 128 | blob.upload_from_string(data) 129 | 130 | 131 | @functions_framework.http 132 | def store_hourly_hist_weather(request): 133 | """HTTP Cloud Function to store historical hourly weather data for cities. 134 | 135 | This function retrieves historical hourly weather data for cities 136 | and stores it as a CSV file in Google Cloud Storage. 137 | 138 | Returns: 139 | str: A message indicating the success or failure of the operation. 140 | """ 141 | try: 142 | locations = [] 143 | for zone, zone_data in zones_data.items(): 144 | if validate_zone_data(zone_data): 145 | location = store_hourly_data_for_zone(zone, zone_data) 146 | if location: 147 | locations.append(location) 148 | else: 149 | logging.error(f"Invalid data for zone: {zone}") 150 | 151 | if locations: 152 | return f"Historical hourly weather data stored in Cloud Storage. Locations: {', '.join(locations)}" 153 | else: 154 | return "No data stored in Cloud Storage. Check logs for details." 
155 | except Exception as e: 156 | logging.error(f"An error occurred: {str(e)}") 157 | return "An error occurred" 158 | 159 | 160 | if __name__ == '__main__': 161 | store_hourly_hist_weather(None) -------------------------------------------------------------------------------- /Cloud Functions/historicalHourlyWeather.py: -------------------------------------------------------------------------------- 1 | import functions_framework 2 | import openmeteo_requests 3 | import pandas as pd 4 | from google.cloud import storage 5 | from retry_requests import retry 6 | import requests_cache 7 | import logging 8 | import datetime 9 | 10 | # Developer: Ashish 11 | # Purpose: This script defines an HTTP Cloud Function to fetch historical hourly weather data for cities, 12 | # store it as CSV files in a designated cloud bucket, and upload them to Google Cloud Storage. 13 | 14 | # Configure logging 15 | logging.basicConfig(level=logging.INFO) 16 | 17 | # Constants 18 | START_DATE = "2023-11-10" 19 | END_DATE = "2024-05-10" 20 | BUCKET_NAME = "openmeteo-weather" 21 | HISTORICAL_DATA_FOLDER = "hourly-historical-weather-data" 22 | 23 | # Zone data with names, latitudes, and longitudes 24 | zones_data = { 25 | "LZ_HOUSTON": {"latitude": 29.763, "longitude": -95.363}, 26 | "LZ_WEST": {"latitude": 32.452, "longitude": -99.718}, 27 | "LZ_SOUTH": {"latitude": 27.801, "longitude": -97.396}, 28 | "LZ_NORTH": {"latitude": 33.578, "longitude": -101.855}, 29 | } 30 | 31 | 32 | # Setup the Open-Meteo API client with cache and retry on error 33 | cache_session = requests_cache.CachedSession('.cache', expire_after=3600) 34 | retry_session = retry(cache_session, retries=5, backoff_factor=0.2) 35 | openmeteo = openmeteo_requests.Client(session=retry_session) 36 | 37 | 38 | def validate_zone_data(zone_data): 39 | """Validate zone data to ensure it contains latitude and longitude. 40 | 41 | Args: 42 | zone_data (dict): The data for a zone including latitude and longitude. 43 | 44 | Returns: 45 | bool: True if the zone data is valid, False otherwise. 46 | """ 47 | return all(key in zone_data for key in ["latitude", "longitude"]) 48 | 49 | 50 | def store_hourly_data_for_zone(zone, zone_data): 51 | """Store hourly historical weather data for a given zone. 52 | 53 | Args: 54 | zone (str): The name of the zone. 55 | zone_data (dict): The data for the zone including latitude and longitude. 56 | 57 | Returns: 58 | str: The location of the stored data in Cloud Storage, or None if an error occurred. 59 | """ 60 | try: 61 | # Make sure all required weather variables are listed here 62 | url = "https://archive-api.open-meteo.com/v1/archive" 63 | params = { 64 | "latitude": zone_data["latitude"], 65 | "longitude": zone_data["longitude"], 66 | "start_date": START_DATE, 67 | "end_date": END_DATE, 68 | "hourly": ["temperature_2m", "relative_humidity_2m", "dew_point_2m", "precipitation", 69 | "rain", "snowfall", "cloud_cover", "cloud_cover_low", "cloud_cover_mid", 70 | "cloud_cover_high", "wind_speed_10m", "wind_speed_100m", "wind_direction_10m", 71 | "wind_direction_100m", "wind_gusts_10m"] 72 | } 73 | responses = openmeteo.weather_api(url, params=params) 74 | 75 | # Process first location. 
Add a for-loop for multiple locations or weather models 76 | response = responses[0] 77 | hourly = response.Hourly() 78 | 79 | # Create a dictionary to store hourly data 80 | hourly_data = { 81 | "zone": zone, 82 | "latitude": zone_data["latitude"], 83 | "longitude": zone_data["longitude"], 84 | "date": pd.date_range( 85 | start=pd.to_datetime(hourly.Time(), unit="s", utc=True), 86 | end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True), 87 | freq=pd.Timedelta(seconds=hourly.Interval()), 88 | inclusive="left" 89 | ) 90 | } 91 | 92 | # Extract data for each variable 93 | for i, variable_name in enumerate(params["hourly"]): 94 | values = hourly.Variables(i).ValuesAsNumpy() 95 | hourly_data[variable_name] = values 96 | 97 | # Create DataFrame from the hourly data dictionary 98 | hourly_dataframe = pd.DataFrame(data=hourly_data) 99 | 100 | # Convert DataFrame to CSV format 101 | csv_data = hourly_dataframe.to_csv(index=False).encode() 102 | 103 | # Define blob name 104 | blob_name = f"{HISTORICAL_DATA_FOLDER}/{zone}_hourly_weather_data.csv" 105 | 106 | # Upload CSV data to Google Cloud Storage 107 | upload_blob(BUCKET_NAME, csv_data, blob_name) 108 | 109 | location = f"gs://{BUCKET_NAME}/{blob_name}" 110 | logging.info(f"Historical hourly weather data for {zone} stored in Cloud Storage at {location}") 111 | return location 112 | except Exception as e: 113 | logging.error(f"Error processing data for {zone}: {str(e)}") 114 | return None 115 | 116 | 117 | def upload_blob(bucket_name, data, blob_name): 118 | """Uploads data to a Google Cloud Storage bucket. 119 | 120 | Args: 121 | bucket_name (str): The name of the Cloud Storage bucket. 122 | data (bytes): The data to upload. 123 | blob_name (str): The name of the blob in the bucket. 124 | """ 125 | storage_client = storage.Client() 126 | bucket = storage_client.bucket(bucket_name) 127 | blob = bucket.blob(blob_name) 128 | blob.upload_from_string(data) 129 | 130 | 131 | @functions_framework.http 132 | def store_hourly_hist_weather(request): 133 | """HTTP Cloud Function to store historical hourly weather data for cities. 134 | 135 | This function retrieves historical hourly weather data for cities 136 | and stores it as a CSV file in Google Cloud Storage. 137 | 138 | Returns: 139 | str: A message indicating the success or failure of the operation. 140 | """ 141 | try: 142 | locations = [] 143 | for zone, zone_data in zones_data.items(): 144 | if validate_zone_data(zone_data): 145 | location = store_hourly_data_for_zone(zone, zone_data) 146 | if location: 147 | locations.append(location) 148 | else: 149 | logging.error(f"Invalid data for zone: {zone}") 150 | 151 | if locations: 152 | return f"Historical hourly weather data stored in Cloud Storage. Locations: {', '.join(locations)}" 153 | else: 154 | return "No data stored in Cloud Storage. Check logs for details." 
155 | except Exception as e: 156 | logging.error(f"An error occurred: {str(e)}") 157 | return "An error occurred" 158 | 159 | 160 | if __name__ == '__main__': 161 | store_hourly_hist_weather(None) -------------------------------------------------------------------------------- /PySpark Scripts/pyspark_ercot_merge_fm_load_latest_BQ_archive_csv.py: -------------------------------------------------------------------------------- 1 | #This code reads csv files from 2 folders in cloud storage, merges them, transforms datatypes, updates big query table, archives the files and deletes from source folder 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.functions import col,sum 4 | from google.cloud import storage 5 | from pyspark.sql.functions import col, to_timestamp 6 | from pyspark.sql.types import FloatType, DecimalType 7 | import logging 8 | import pandas as pd 9 | 10 | logging.basicConfig(level=logging.INFO) 11 | # Spark session 12 | spark = SparkSession.builder \ 13 | .appName("ercotFuelApp") \ 14 | .config("spark.jars.packages", "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.22.0") \ 15 | .getOrCreate() 16 | 17 | # Cloud storage paths 18 | primary_folder_path = "gs://ercot_test/ercot_fm_csv/fm_latest" 19 | secondary_folder_path = "gs://ercot_test/ercot_load_csv/load_latest" 20 | destination_table = "driven-stage-365620.ercot_merged.ercot_fm_load_merged" 21 | temporary_gcs_bucket = "ercot_test" 22 | archive_folder_name = "ercot_test/ercot_archive_csv" 23 | 24 | 25 | # Create a Cloud Storage client 26 | storage_client = storage.Client() 27 | 28 | # Define a function to check if a folder in Cloud Storage contains any files 29 | def check_folder_has_files(bucket_name, folder_path): 30 | bucket = storage_client.bucket(bucket_name) 31 | blobs = bucket.list_blobs(prefix=folder_path) 32 | return any(blobs) 33 | 34 | # Check if primary folder has files 35 | if not check_folder_has_files("ercot_test", "ercot_fm_csv/fm_latest"): 36 | logging.info("No files found in the primary folder. Exiting...") 37 | # Add any additional handling or return statement if needed 38 | else: 39 | # Proceed with reading and cleaning dataframes 40 | df_primary = spark.read.option("header", "true").csv(primary_folder_path) 41 | 42 | # Check if secondary folder has files 43 | if not check_folder_has_files("ercot_test", "ercot_load_csv/load_latest"): 44 | logging.info("No files found in the secondary folder. 
Exiting...") 45 | # Add any additional handling or return statement if needed 46 | else: 47 | # Proceed with reading and cleaning dataframes 48 | df_secondary = spark.read.option("header", "true").csv(secondary_folder_path) 49 | # Reading and cleaning dataframes 50 | 51 | logging.info(f"{df_primary.count()}") 52 | logging.info(f"{df_secondary.count()}") 53 | 54 | # #Column name changes 55 | # df_primary_clean = df_primary.select([col(c).alias(c.replace(' ', '_').lower()) for c in df_primary.columns]) 56 | # df_secondary_clean = df_secondary.select([col(c).alias(c.replace(' ', '_').lower()) for c in df_secondary.columns]) 57 | 58 | # Reading and cleaning dataframes using Pandas 59 | df_primary_pd = df_primary.toPandas() 60 | df_secondary_pd = df_secondary.toPandas() 61 | 62 | #Column name changes in Pandas DataFrames 63 | df_primary_clean = df_primary_pd.rename(columns=lambda x: x.replace(' ', '_').lower()) 64 | df_secondary_clean = df_secondary_pd.rename(columns=lambda x: x.replace(' ', '_').lower()) 65 | 66 | # Convert 'time' column to datetime 67 | df_primary_clean['time'] = pd.to_datetime(df_primary_clean['time'], utc=True) 68 | df_secondary_clean['time'] = pd.to_datetime(df_secondary_clean['time'], utc=True) 69 | 70 | df_primary_clean.sort_values(by='time', inplace=True) 71 | df_secondary_clean.sort_values(by='time', inplace=True) 72 | 73 | 74 | # Outer join of above dataframes 75 | primary_key = "time" 76 | #df_merged = df_primary_clean.join(df_secondary_clean, primary_key, "outer") 77 | df_merged = pd.merge_asof(df_primary_clean, df_secondary_clean, on = ["time"]) 78 | df_merged.dropna(inplace=True) 79 | df_merged = spark.createDataFrame(df_merged) 80 | #Logging-1 81 | logging.info(f"Number of rows in merged dataframe before datatype transformation: {df_merged.count()}") 82 | 83 | #Printing merged_df before transformation 84 | df_merged.show(5) 85 | 86 | null_counts = df_merged.select([sum(col(c).isNull().cast('int')).alias(c) for c in df_merged.columns]) 87 | logging.info(f"Null counts : {null_counts}") 88 | 89 | #datatype Transformations and checking for nulls 90 | # Converting 'time' columns to timestamp 91 | df_merged = df_merged.withColumn('time', to_timestamp(col('time'))) 92 | df_merged = df_merged.withColumn('interval_start', to_timestamp(col('interval_start'))) 93 | df_merged = df_merged.withColumn('interval_end', to_timestamp(col('interval_end'))) 94 | 95 | # Converting other columns to float and handling null values 96 | float_columns = ['coal_and_lignite', 'hydro', 'nuclear', 'power_storage', 'solar', 'wind', 'natural_gas', 'other', 'load'] 97 | decimal_type = DecimalType(10, 2) 98 | for col_name in float_columns: 99 | df_merged = df_merged.withColumn(col_name, col(col_name).cast(decimal_type)) 100 | 101 | 102 | #Dropping rows with null values 103 | df_merged = df_merged.na.drop() 104 | 105 | # Drop duplicate rows 106 | df_merged = df_merged.dropDuplicates() 107 | 108 | logging.info(f"Number of rows in merged dataframe after dropping duplicates: {df_merged.count()}") 109 | 110 | # Print schema of merged dataframe 111 | print("Schema of merged dataframe:") 112 | df_merged.printSchema() 113 | df_merged.show(5) 114 | # Writing DF to BigQuery 115 | df_merged.write.format('bigquery') \ 116 | .option('table', destination_table) \ 117 | .option('temporaryGcsBucket', temporary_gcs_bucket) \ 118 | .mode('append') \ 119 | .save() 120 | #Reinstate this part of the code after datatype transformation works correctly in big query 121 | # Creating cloud storage client 122 | 
storage_client = storage.Client() 123 | 124 | # Defining source and destination folders for archiving 125 | archive_configurations = [ 126 | { 127 | "source_bucket_name": "ercot_test", 128 | "source_folder_name": "ercot_fm_csv/fm_latest", 129 | "destination_bucket_name": "ercot_test", 130 | "destination_folder_name": "ercot_archive_csv" 131 | }, 132 | { 133 | "source_bucket_name": "ercot_test", 134 | "source_folder_name": "ercot_load_csv/load_latest", 135 | "destination_bucket_name": "ercot_test", 136 | "destination_folder_name": "ercot_archive_csv" 137 | } 138 | ] 139 | 140 | # Looping through archive config 141 | for config in archive_configurations: 142 | 143 | source_bucket = storage_client.bucket(config["source_bucket_name"]) 144 | destination_bucket = storage_client.bucket(config["destination_bucket_name"]) 145 | 146 | # Getting all the present files in the source folder 147 | blobs = source_bucket.list_blobs(prefix=config["source_folder_name"]) 148 | 149 | # Iterating and archiving 150 | for blob in blobs: 151 | # Creating destination blob 152 | if not blob.name.endswith('/'): 153 | destination_blob_name = blob.name.replace(config["source_folder_name"], config["destination_folder_name"], 1) 154 | 155 | # Copying files to destination blob 156 | source_blob = source_bucket.blob(blob.name) 157 | destination_blob = destination_bucket.blob(destination_blob_name) 158 | destination_blob.upload_from_string(source_blob.download_as_string()) 159 | 160 | # Deleting copied file at source folder 161 | source_blob.delete() 162 | 163 | 164 | #end 165 | spark.stop() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Grid Status: ERCOT - Energy Data Lake 2 | 3 | ## Summary 4 | 5 | Our data pipeline ingests data from 5 APIs and loads it as-is into cloud storage. Our cloud functions fetch the data from the relevant APIs. Our workflows execute the relevant PySpark jobs, which pick up all the files in cloud storage, cast the data to the correct data types, remove nulls and duplicates, merge data where required, populate the corresponding BigQuery tables, and then move the files to an archive folder. All of these actions are orchestrated with Cloud Scheduler. 6 | 7 | 8 | ## Electric Reliability Council of Texas (ERCOT) 9 | 10 | The Electric Reliability Council of Texas (ERCOT) manages the flow of electric power to more than 26 million Texas customers -- representing about 90 percent of the state’s electric load. As the independent system operator for the region, ERCOT schedules power on an electric grid that connects more than 54,100 miles of transmission lines and 1,250 generation units, including Private Use Networks. It also performs financial settlement for the competitive wholesale bulk-power market and administers retail switching for 8 million premises in competitive choice areas. 11 | 12 | ### ercot_fuel_mix: 13 | This API provides insights into the energy sources utilized for electricity generation within the ERCOT region. 14 | 15 | ### ercot_load_forecast & ercot_load: 16 | These APIs offer data on both forecasted and actual electricity demand across various regions within ERCOT. 17 | 18 | ### ercot_spp: 19 | This API provides information on energy prices within the ERCOT markets, broken down by load zone.
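For orientation, here is a minimal sketch of how these datasets can be pulled with the `gridstatus` Python client that our cloud functions build on. The `"latest"`/`"today"` date arguments and the local CSV file names below are illustrative assumptions, not an excerpt from our functions:

```python
import gridstatus

# ERCOT ISO client from the gridstatus library (the same client our cloud functions use)
iso = gridstatus.Ercot()

# Fuel mix: generation by source (ercot_fuel_mix)
fuel_mix_df = iso.get_fuel_mix("latest")

# Actual load and the zonal load forecast (ercot_load, ercot_load_forecast)
load_df = iso.get_load("today")
load_forecast_df = iso.get_load_forecast("today")

# Each call returns a pandas DataFrame; persisting it as CSV mirrors what the pipeline does
for name, df in [("fuel_mix", fuel_mix_df),
                 ("load", load_df),
                 ("load_forecast", load_forecast_df)]:
    df.to_csv(f"{name}_latest.csv", index=False)  # illustrative local file names
```

Because each call returns a pandas DataFrame, the pipeline can land the raw pulls directly as CSV files in cloud storage without any extra parsing.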
20 | 21 | **Documentation of All Endpoints:** 22 | 23 | * https://www.ercot.com/services/mdt/data-portal 24 | * https://docs.gridstatus.io/en/latest/index.html 25 | * https://www.gridstatus.io/datasets 26 | * https://apiexplorer.ercot.com/ 27 | 28 | ## OpenMeteo Data: 29 | 30 | This API provides an extensive range of weather data points that can be correlated with energy consumption patterns: 31 | 32 | ### Temperature_2m: 33 | Directly impacts heating and cooling requirements, thus influencing energy demand throughout different seasons. 34 | 35 | ### Precipitation: 36 | Heavy rainfall or snowfall can disrupt energy production from sources like solar or wind, affecting overall energy availability and grid stability. 37 | 38 | ### Wind Speed: 39 | Wind speed and direction play a significant role in wind power generation, making this data crucial for assessing the impact of weather on renewable energy production. 40 | 41 | ### Cloud Cover: 42 | Cloud cover directly affects solar energy production by obstructing sunlight. Monitoring cloud cover data helps in understanding the variability in solar energy generation and its impact on the energy grid. 43 | 44 | ### Example Endpoint 45 | 46 | https://archive-api.open-meteo.com/v1/archive?latitude=52.52&longitude=13.41&start_date=2024-04-29&end_date=2024-05-13&hourly=temperature_2m 47 | 48 | ## OpenWeather Current Weather Data API 49 | 50 | This API provides access to current weather data for any location; minute-by-minute forecasts for 1 hour and hourly forecasts for 48 hours are available separately through OpenWeather's One Call API. 51 | 52 | Endpoint: https://api.openweathermap.org/data/2.5/weather 53 | 54 | 55 | **Parameters:** 56 | - `lat` (required): Latitude of the location. If you need geocoding services, you can use the OpenWeather Geocoding API. 57 | - `lon` (required): Longitude of the location. If you need geocoding services, you can use the OpenWeather Geocoding API. 58 | - `appid` (required): Your unique API key, which can be found on your OpenWeather account page under the "API key" tab. 59 | - `mode` (optional): Response format. Possible values are `xml` and `html`. If not specified, JSON format is used by default. 60 | - `units` (optional): Units of measurement. Possible values are `standard`, `metric`, and `imperial`. If not specified, standard units are used by default. 61 | - `lang` (optional): Language parameter to specify the output language. 62 | 63 | **Example Request:** 64 | https://api.openweathermap.org/data/2.5/weather?lat={lat}&lon={lon}&appid={API key} 65 | 66 | 67 | ## ELT Diagram 68 | 69 | ![ELT Diagram](Images/architecture.png) 70 | 71 | ## Step 1: Storage Buckets 72 | 73 | We created Storage Buckets to hold incoming data. Our data storage bucket is named ercot_test. It is structured to hold incoming CSV files in their respective folders. The structure is shown in the image below. 74 | 75 | ![Storage Buckets](Images/team6_bucket_storage_structure.png) 76 | 77 | ## Step 2: Cloud Functions 78 | 79 | We created cloud functions that access data and save it to the respective folders.
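As a rough illustration of that pattern (not one of the deployed functions), a hypothetical HTTP Cloud Function that fetches one dataset and lands it in a bucket could look like the sketch below. The function name and target folder are placeholders, while the `functions_framework` entry point and the `upload_from_string` call mirror the cloud function code included in this repository:

```python
import functions_framework
import gridstatus
from google.cloud import storage

# Illustrative names; the deployed functions write into the ercot_test bucket
# folders shown in the storage structure above.
BUCKET_NAME = "ercot_test"
FOLDER_NAME = "ercot_fm_csv/fm_latest"


@functions_framework.http
def store_fuel_mix_csv(request):
    """Hypothetical HTTP Cloud Function: fetch the latest ERCOT fuel mix and
    land it as a CSV blob in Cloud Storage."""
    df = gridstatus.Ercot().get_fuel_mix("latest")

    # Serialize to CSV and upload with the same upload_from_string pattern
    # used by the cloud functions in this repository.
    blob_name = f"{FOLDER_NAME}/fuel_mix_latest.csv"
    storage.Client().bucket(BUCKET_NAME).blob(blob_name).upload_from_string(
        df.to_csv(index=False)
    )
    return f"Stored gs://{BUCKET_NAME}/{blob_name}"
```

The weather functions shown above follow this same fetch, serialize, upload shape, with caching, retries, and logging layered on top.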
80 | 81 | ![Cloud Functions](Images/team6_cloud_functions.png) 82 | 83 | ### Our Cloud Functions 84 | 85 | We have 5 cloud functions that pull in data by applying the following 4 methods to the gridstatus ISO object: 86 | `iso = gridstatus.Ercot()` 87 | * get_fuel_mix() 88 | * get_load() (source for 2 functions: one for load_latest and one for load_historical) 89 | * get_load_forecast() 90 | * Weather 91 | 92 | Additionally, we created two more cloud functions that load historical data for up to the past 6 months: 93 | * Ercot_load_historical_6m: This cloud function fetches the load information from ERCOT for the last 6 months and stores it in a bucket. 94 | * Store_hourly_hist_weather_6m: Similarly, this cloud function fetches the historical weather data for the last 6 months and stores it in a bucket. 95 | 96 | ![Cloud Functions](Images/team6_cloud_functions.png) 97 | 98 | ## Step 3: Spark Jobs 99 | 100 | We created Spark jobs that took the CSV files and applied transformations such as: 101 | 102 | * Reading all the existing files in the folder and merging them into a single dataframe 103 | * Transforming column names by replacing spaces with ‘_’ so they load cleanly into BigQuery 104 | * The schema of the CSV files was interpreted as strings by PySpark, so we applied the relevant transformations to correct the schema 105 | * We rounded floating-point values to 2 decimal places 106 | * We dropped rows containing nulls 107 | * We iterated over all the CSV files in the source buckets, copying each one into an archive folder before removing them from their original locations. 108 | * Basic logging is also implemented to capture information such as the dataframe's shape and the presence of null values. These logging statements are helpful for debugging, monitoring the execution flow, and understanding the data being transformed 109 | * For the one pipeline that merges fuel mix and load data, we matched the data on the ‘Time’ column and kept only non-null rows 110 | 111 | We saved the Spark job code to our code storage bucket named store_dataproc_files. It contains a code_notebooks folder, which in turn holds the final_code folder with Pyspark_code and cloud_functions subfolders. Its structure is as follows: 112 | 113 | ![Spark Jobs](Images/image7.png) 114 | 115 | ## Step 4: Dataproc Workflows 116 | 117 | We then created Dataproc Workflows to define the following distinct pipelines: 118 | 119 | * eda-wf-ercot-load-forecast-13: Executes the PySpark job that reads all the files in the ercot_load_forecast_csv folder, merges them into a single dataframe, applies transformations, updates BigQuery, and then moves all the files to an archive. 120 | * ercot-load-hist-13: Executes the PySpark job that reads all the files in the ercot_load_csv/fm_historical folder, merges them into a single dataframe, applies transformations, updates BigQuery, and then moves all the files to an archive. 121 | * ercot-merge-fm-load-13: Executes the PySpark job that reads all the files in the ercot_load_csv/load_latest and ercot_fm_csv/fm_latest folders, merges them into a single dataframe, applies transformations, updates BigQuery, and then moves all the files to an archive. 122 | * Eda-wf-ercot-load-latest: Executes the PySpark job that reads all the files in the ercot_load_csv/load_latest folder, merges them into a single dataframe, applies transformations, updates BigQuery, and then moves all the files to an archive.
123 | * Workflow-spp-weather-merge: Executes the PySpark job that reads all the files in the spp_latest folder, which contains the settlement point prices (SPP) for the 4 zones, and the quarter_hourly_weather_data folder, which contains the weather data for the 4 zones. It then cleans and joins the two datasets on the location column (zones) and the time column. 124 | * mergeHistoricalWeather.py: This PySpark job merges all the historical weather data fetched from the OpenMeteo API. 125 | * pyspark_ercot_load_latest_BQ_archive_csv: Executes the PySpark job that reads all CSV files from the load_latest folder, merges them, transforms data types, updates the BigQuery table, archives the files, and deletes them from the source folder. 126 | 127 | ![Dataproc Workflows](Images/image4.png) 128 | 129 | ## BigQuery Data Pipeline with Cloud Scheduler 130 | 131 | This section describes the strategy for ensuring only new files are processed by BigQuery and the Cloud Scheduler configuration for running the pipeline. 132 | 133 | ### Strategy for New File Processing 134 | 135 | Instead of fetching specific files, the process involves: 136 | 137 | 1. Processing all files present in the designated bucket. 138 | 2. Moving processed files to an archive folder upon completion. 139 | 140 | ### Cloud Schedulers and Tasks 141 | 142 | Cloud Schedulers are configured using cron jobs to trigger Cloud Functions and Dataproc cluster jobs at regular intervals. 143 | 144 | **List of Schedulers:** 145 | 146 | | Scheduler Name | Task Performed | Frequency | 147 | |-------------------------------|--------------------------------------------------------------|----------------------------------------------------| 148 | | `eda-wf-ercot-load-forecast` | Executes ercot load historical workflow | Every 12 hours, at 00:00 (daily) | 149 | | `eda-wf-ercot-load-hist` | Executes ercot-load-hist-wf workflow | Every 12 hours, at 00:00 (daily) | 150 | | `eda-wf-ercot-merge-fm-load` | Executes ercot merge fuel mix and load latest workflow | Every 12 hours, at 00:00 (daily) | 151 | | `ercot-fm-latest` | Executes ercot_fm_latest_csv cloud function | Every hour, at 15 past the hour (daily) | 152 | | `ercot-load-historical` | Executes ercot_load_historical_csv cloud function | Every hour, at 00:00 (daily) | 153 | | `ercot-load-latest` | Executes ercot_load_latest_csv cloud function | Every hour, at 15 past the hour (daily) | 154 | | `ercot_load_forecast` | Executes ercot_load_forecast_csv cloud function | Every hour, at 15 past the hour (daily) | 155 | | `Quarter_hourly_spp_csv` | Executes ercot_spp_csv function | Every 15 minutes (hourly, daily) | 156 | | `Quater_hourly_weather` | Executes open_weather_live_data function | Every 15 minutes (hourly) | 157 | | `Sparkjob-spp-weather-merge` | Executes Spark job to merge SPP and weather data | Every 12 hours | 158 | 159 | ![Schedulers](Images/team6_cloud_scheduler_image.png) 160 | 161 | ### Data Storage in BigQuery 162 | 163 | After processing, data is loaded into BigQuery tables: 164 | 165 | 166 | ![Data Storage](Images/image2.png) 167 | 168 | * `Ercot_fm_load_merged` 169 | ![Data Storage](Images/image5.png) 170 | ![Data Storage](Images/image15.png) 171 | 172 | * `Ercot_load_forecast` 173 | ![Data Storage](Images/image16.png) 174 | ![Data Storage](Images/image1.png) 175 | 176 | * `Ercot_load_historical` 177 | ![Data Storage](Images/image3.png) 178 | ![Data Storage](Images/image11.png) 179 | 180 | * `Ercot_load_latest` 181 | ![Data Storage](Images/image9.png) 182 | ![Data Storage](Images/image6.png) 183 | 184 | *
`Ercot_spp_weather_merged` 185 | ![Data Storage](Images/image.png) 186 | ![Data Storage](Images/image17.png) 187 | 188 | 189 | 190 | ### Data Analysis with Looker 191 | 192 | Queries are used to fetch data from BigQuery for business analysis. Visualizations are then created using Google Looker. 193 | 194 | 195 | ![Data Storage](Images/Energy_Generation_and_Load_Consumption_Over_Time.png) 196 | It appears that generation is highest during the middle of the day, which is likely due to solar power generation. Consumption is highest in the morning and evening, when people are typically awake and using appliances. Overall, nuclear and natural gas appear to follow a predictable pattern, while solar and wind are more erratic. 197 | 198 | ![Data Storage](Images/Load-forecast-viz.png) 199 | The chart illustrates the forecasted electrical load spanning three days across five locations: Houston, North, South, West, and the system total. It visually demonstrates a trend where the total energy demand increases throughout the day, peaking during daytime hours and tapering off during the night. This trend aligns with typical human activity patterns. 200 | 201 | ![Data Storage](Images/weather-and-spp-over-time.png) 202 | The graph depicts the Settlement Point Price (SPP) alongside weather conditions (Humidity, Temperature, and Wind Speed) over the course of a day. 203 | It appears that there is not a clear correlation between the weather conditions and the price of energy. However, it is important to note that this data is only for one day, so it may not be representative of a typical day or season. 204 | 205 | 206 | ### Queries 207 | 208 | Business Question 1: 209 | 210 | "What is the average energy consumption per month?" 211 | 212 | Explanation: 213 | 214 | "This query calculates the average energy consumption for each month by extracting the month from the timestamp INTERVAL_START and averaging the load values for each month. This analysis provides insights into the seasonal variation in energy consumption, which can be valuable for resource planning and demand forecasting." 215 | 216 | Query: 217 | SELECT 218 | EXTRACT(MONTH FROM INTERVAL_START) AS month, 219 | AVG(load) AS average_load 220 | FROM 221 | ercot_merged.ercot_fm_load_merged 222 | GROUP BY 223 | month 224 | ORDER BY 225 | month; 226 | 227 | Business Question 2: 228 | 229 | "How does energy consumption vary throughout the day?" 230 | 231 | Explanation: 232 | 233 | "This query calculates the average energy consumption for each hour of the day by extracting the hour component from the timestamp INTERVAL_START and averaging the load values for each hour. Analyzing energy consumption patterns throughout the day can help identify peak usage hours, which are critical for grid management and capacity planning." 234 | 235 | SELECT 236 | EXTRACT(HOUR FROM INTERVAL_START) AS hour_of_day, 237 | AVG(load) AS average_load 238 | FROM 239 | ercot_merged.ercot_fm_load_merged 240 | GROUP BY 241 | hour_of_day 242 | ORDER BY 243 | hour_of_day; 244 | 245 | 246 | Business Question 3: 247 | 248 | "What is the percentage distribution of different energy sources in the overall energy mix?" 249 | 250 | Explanation: 251 | 252 | "This query calculates the percentage contribution of each energy source to the total energy mix. It divides the sum of each energy source by the sum of all energy sources (coal_and_lignite, hydro, nuclear, power_storage, solar, wind, natural_gas, and other), and then multiplies by 100 to get the percentage.
Understanding the distribution of energy sources helps in assessing the reliance on different energy types and planning for a more diversified energy portfolio." 253 | 254 | SELECT 255 | ROUND(SUM(coal_and_lignite) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS coal_and_lignite_percent, 256 | ROUND(SUM(hydro) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS hydro_percent, 257 | ROUND(SUM(nuclear) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS nuclear_percent, 258 | ROUND(SUM(power_storage) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS power_storage_percent, 259 | ROUND(SUM(solar) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS solar_percent, 260 | ROUND(SUM(wind) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS wind_percent, 261 | ROUND(SUM(natural_gas) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS natural_gas_percent, 262 | ROUND(SUM(other) / SUM(coal_and_lignite + hydro + nuclear + power_storage + solar + wind + natural_gas + other) * 100, 2) AS other_percent 263 | FROM 264 | `ercot_merged.ercot_fm_load_merged`; 265 | 266 | 267 | 268 | 269 | Business Question 4: 270 | 271 | "How do weather conditions affect electricity prices?" 272 | 273 | 274 | Explanation: 275 | 276 | "This query calculates the average electricity prices (SPP) along with the associated weather conditions such as temperature, humidity, and wind speed. By examining the relationship between weather parameters and electricity prices, we can identify correlations and potential factors influencing price fluctuations." 277 | 278 | Query: 279 | 280 | SELECT 281 | ROUND(AVG(SPP), 2) AS average_price, 282 | Temperature, 283 | Humidity, 284 | Wind_Speed 285 | FROM 286 | `driven-stage-365620.ercot_merged.ercot_spp_weather_merged` 287 | GROUP BY 288 | Temperature, Humidity, Wind_Speed 289 | ORDER BY 290 | average_price DESC; 291 | 292 | 293 | ## References 294 | 295 | * Electric Reliability Council of Texas. (n.d.). About ERCOT. Retrieved February 25, 2024, from https://www.ercot.com/about 296 | * Gridstatus. (n.d.). ERCOT Grid Status. Retrieved February 25, 2024, from https://www.gridstatus.io/live/ercot 297 | * Gridstatus. (n.d.). Gridstatus Documentation. Retrieved February 25, 2024, from https://docs.gridstatus.io/en/latest/index.html 298 | * OpenWeather. (n.d.). OpenWeatherMap API Documentation. Retrieved February 25, 2024, from https://openweathermap.org/api 299 | * Open-Meteo. (n.d.). Open-Meteo API Documentation. Retrieved May 19, 2024, from https://open-meteo.com/en/docs 300 | --------------------------------------------------------------------------------
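As a closing usage note for the Queries section above, here is a minimal, hypothetical sketch of running one of these queries from Python with the `google-cloud-bigquery` client. It assumes Application Default Credentials for the `driven-stage-365620` project referenced earlier, and `to_dataframe()` additionally requires the `db-dtypes` package:

```python
from google.cloud import bigquery

# Assumes Application Default Credentials for the project referenced above
client = bigquery.Client(project="driven-stage-365620")

# Business Question 2: average load by hour of day
sql = """
SELECT
  EXTRACT(HOUR FROM INTERVAL_START) AS hour_of_day,
  AVG(load) AS average_load
FROM
  ercot_merged.ercot_fm_load_merged
GROUP BY
  hour_of_day
ORDER BY
  hour_of_day
"""

# Run the query and pull the result into a pandas DataFrame for further analysis
df = client.query(sql).to_dataframe()
print(df.head())
```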