├── src
│   ├── .gitkeep
│   ├── __init__.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   └── prepare_data.py
│   ├── engine
│   │   ├── __init__.py
│   │   ├── predictor.py
│   │   └── pytorch_trainer.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── load_checkpoint.py
│   │   └── lstm_model.py
│   └── utils
│       ├── __init__.py
│       ├── metrics.py
│       ├── load_checkpoint.py
│       ├── mlflow_logger.py
│       └── training.py
├── notebooks
│   ├── .gitkeep
│   └── weather-prediction.ipynb
├── results
│   ├── .gitkeep
│   ├── mae.png
│   ├── mse.png
│   ├── training_metrics.png
│   └── training_report.txt
├── .dvc
│   ├── .gitignore
│   └── config
├── requirements.txt
├── .gitignore
├── best_model.pt.dvc
├── dataset.csv.dvc
├── processed_dataset.csv.dvc
├── .dvcignore
├── config
│   ├── predict.yaml
│   ├── mmscaler_values.yaml
│   ├── pt_training.yaml
│   └── prepare_data.json
├── predict.py
└── README.md

--------------------------------------------------------------------------------
/src/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/notebooks/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/results/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/data/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/engine/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.dvc/.gitignore:
--------------------------------------------------------------------------------
/config.local
/tmp
/cache

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sagnik1511/samay_yantra/HEAD/requirements.txt

--------------------------------------------------------------------------------
/results/mae.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sagnik1511/samay_yantra/HEAD/results/mae.png

--------------------------------------------------------------------------------
/results/mse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sagnik1511/samay_yantra/HEAD/results/mse.png

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
venv
.idea/
__pycache__/
mlruns/
dataset.csv
processed_dataset.csv
best_model.pt
--------------------------------------------------------------------------------
/best_model.pt.dvc:
--------------------------------------------------------------------------------
outs:
- md5: 1886f4f99f525d703077729a9a07c8ce
  size: 6072
  path: best_model.pt

--------------------------------------------------------------------------------
/dataset.csv.dvc:
--------------------------------------------------------------------------------
outs:
- md5: cedea3f0adbde8941aa869c5d011f1f9
  size: 120605973
  path: dataset.csv

--------------------------------------------------------------------------------
/results/training_metrics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sagnik1511/samay_yantra/HEAD/results/training_metrics.png

--------------------------------------------------------------------------------
/.dvc/config:
--------------------------------------------------------------------------------
[core]
    remote = storage
['remote "storage"']
    url = gdrive://1OFD5KiaSD2GPs3VR-8fHB0dSnRfXVyNF

--------------------------------------------------------------------------------
/processed_dataset.csv.dvc:
--------------------------------------------------------------------------------
outs:
- md5: 8a1a8b779d09205871e67ccc4092033e
  size: 171641999
  path: processed_dataset.csv

--------------------------------------------------------------------------------
/.dvcignore:
--------------------------------------------------------------------------------
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore

--------------------------------------------------------------------------------
/src/utils/metrics.py:
--------------------------------------------------------------------------------
from sklearn.metrics import mean_absolute_error


# Registry of extra metrics computed during training/validation.
# (MSE is already covered by the training loss itself.)
metrics_ = {
    "mae": mean_absolute_error
}

--------------------------------------------------------------------------------
/config/predict.yaml:
--------------------------------------------------------------------------------
dataset:
  path: "processed_dataset.csv"
  seq_length: 20
model:
  num_classes: 10
  input_size: 10
  hidden_size: 2
  num_layers: 1
  checkpoint_path: "best_model.pt"

--------------------------------------------------------------------------------
/config/mmscaler_values.yaml:
--------------------------------------------------------------------------------
max_values:
- 1020.07
- 37.28
- 100.0
- 28.32
- 18.13
- 28.82
- 1393.54
- 16.83
- 14.0
- 1219.32
min_values:
- 913.6
- -23.01
- 12.95
- 0.79
- 0.5
- 0.8
- 1059.45
- -0.39
- 0.0
- 0.0

--------------------------------------------------------------------------------
/src/utils/load_checkpoint.py:
--------------------------------------------------------------------------------
import torch

from src.models.lstm_model import LSTM


def load_best_model(model_config):
    # Reload the checkpoint written by save_best_model during training.
    path = "best_model.pt"
    chkp = torch.load(path)
    model = LSTM(**model_config)
    model.load_state_dict(chkp["model"])

    return model

--------------------------------------------------------------------------------
/config/pt_training.yaml:
--------------------------------------------------------------------------------
dataset:
  path: "processed_dataset.csv"
  batch_size: 5000
  seq_length: 20
  split_ratio: 0.8
model:
  num_classes: 10
  input_size: 10
  hidden_size: 2
  num_layers: 1
optimizer:
  lr: 0.0005
training_hp:
  num_epochs: 10
  log_index: 10

--------------------------------------------------------------------------------
/src/models/load_checkpoint.py:
--------------------------------------------------------------------------------
import torch

from src.models.lstm_model import LSTM


def load_model(path, model_param, device="cpu"):
    chkp = torch.load(path, map_location=device)
    # Drop the checkpoint path entry without mutating the caller's config dict.
    model_param = {k: v for k, v in model_param.items() if k != "checkpoint_path"}
    model = LSTM(**model_param)
    model.load_state_dict(chkp["model"])

    return model

--------------------------------------------------------------------------------
/notebooks/weather-prediction.ipynb:
--------------------------------------------------------------------------------
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"print(\"NOOBS code in Jupyter Notebook!\")","metadata":{},"execution_count":null,"outputs":[]}]}

--------------------------------------------------------------------------------
/src/utils/mlflow_logger.py:
--------------------------------------------------------------------------------
import mlflow
from urllib.parse import urlparse


def log_pt_models(model, hparams, results):
    with mlflow.start_run():
        # Log the dataset hyperparameters individually and the rest grouped.
        for key in ["batch_size", "seq_length", "split_ratio"]:
            mlflow.log_param(key, hparams["dataset"][key])
        for key in ["model", "optimizer", "training_hp"]:
            mlflow.log_params(hparams[key])
        mlflow.log_metrics(results)

        # The model registry is unavailable with a local file store.
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        if tracking_url_type_store != "file":
            mlflow.pytorch.log_model(model, "model", registered_model_name="LSTM_Model")
        else:
            mlflow.pytorch.log_model(model, "model")

--------------------------------------------------------------------------------
/config/prepare_data.json:
--------------------------------------------------------------------------------
{
    "data_path" : {
        "input" : "dataset.csv",
        "output" : "processed_dataset.csv",
        "mmscaler_values" : "config/mmscaler_values.yaml"
    },
    "imp_features" : [
        "Date Time",
        "p (mbar)",
        "T (degC)",
        "rh (%)",
        "VPact (mbar)",
        "sh (g/kg)",
        "H2OC (mmol/mol)",
        "rho (g/m**3)",
        "wv (m/s)",
        "rain (mm)",
        "SWDR (W/m**2)"
    ],
    "renamed_cols" : [
        "time",
        "pressure",
        "temperature",
        "relative_humidity",
        "vapour_pressure",
        "specific_humidity",
        "water_vap_concentration",
        "airtight",
        "wind_speed",
        "rain",
        "SWDR"
    ],
    "interpolation_method" : "slinear"
}

--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
from src.engine.predictor import fetch_index, predict_results
from src.models.load_checkpoint import load_model
import yaml


def predict(user_input, config_path):

    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    print(config)
    model = load_model(config["model"]["checkpoint_path"], config["model"])
    index = fetch_index(user_input, config["dataset"]["seq_length"], config["dataset"]["path"])
    print(f"index : {index}")
    ans = predict_results(config["dataset"]["path"], index, model, config["dataset"]["seq_length"])

    return ans


if __name__ == "__main__":
    user_inp = "2021-02-01 08:50:00"
    ans = predict(user_inp, "config/predict.yaml")
    print(ans)

--------------------------------------------------------------------------------
/src/data/dataset.py:
--------------------------------------------------------------------------------
from torch.utils.data import Dataset, DataLoader


class WeatherDataset(Dataset):
    """Sliding-window dataset: X is a seq_length window of feature rows,
    y is the feature row immediately after the window."""

    def __init__(self, meta_df, seq_length):
        self.meta_df = meta_df
        self.seq_length = seq_length

    def __len__(self):
        return len(self.meta_df) - self.seq_length - 1

    def __getitem__(self, index):
        # Column 0 holds the timestamp, so features start at column 1.
        X = self.meta_df.iloc[index: index + self.seq_length, 1:].to_numpy().astype("float32")
        y = self.meta_df.iloc[index + self.seq_length, 1:].to_numpy().astype("float32")

        return X, y


def split_data(dataframe, split_ratio=0.8):
    # Chronological split: no shuffling before the cut, to avoid leakage.
    df = dataframe.copy()
    split_index = int(split_ratio * len(df))
    train_df = df.iloc[:split_index]
    val_df = df.iloc[split_index:]

    return train_df, val_df


def create_loaders(dataframe, split_ratio, batch_size, seq_length):
    train_set, val_set = split_data(dataframe=dataframe, split_ratio=split_ratio)
    train_ds = WeatherDataset(train_set, seq_length)
    val_ds = WeatherDataset(val_set, seq_length)

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=True)

    return train_dl, val_dl

--------------------------------------------------------------------------------
/src/models/lstm_model.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn


class LSTM(nn.Module):

    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()

        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, device):
        # Plain tensors replace the deprecated torch.autograd.Variable wrapper.
        h_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        _, (h_out, _) = self.lstm(x, (h_0, c_0))
        # Assumes num_layers == 1, so this reshapes to (batch, hidden_size).
        h_out = h_out.view(-1, self.hidden_size)
        out = self.sigmoid(self.fc(h_out))
        return out


def test():
    b, s, c = 4, 12, 10
    rand_value = torch.rand(b, s, c)

    model = LSTM(num_classes=c, input_size=c, hidden_size=2, num_layers=1)
    op = model(rand_value, "cpu")  # forward() takes an explicit device argument
    print(op.shape)


if __name__ == "__main__":
    test()
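To make the shapes concrete, here is a small self-contained sketch (toy data, not part of the repository; run it from the repo root) of what `create_loaders` and `LSTM.forward` exchange: each sample is a `(seq_length, 10)` window of scaled features and the target is the next row.

```python
import pandas as pd
import torch
from src.data.dataset import create_loaders
from src.models.lstm_model import LSTM

# Toy stand-in for processed_dataset.csv: one time column plus 10 scaled features.
df = pd.DataFrame({"time": pd.date_range("2020-01-01", periods=200, freq="10min")})
for i in range(10):
    df[f"feature_{i}"] = torch.rand(200).numpy()

train_dl, val_dl = create_loaders(df, split_ratio=0.8, batch_size=8, seq_length=20)
x, y = next(iter(train_dl))
print(x.shape, y.shape)  # torch.Size([8, 20, 10]) torch.Size([8, 10])

model = LSTM(num_classes=10, input_size=10, hidden_size=2, num_layers=1)
print(model(x, "cpu").shape)  # torch.Size([8, 10]): one prediction per window
```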
--------------------------------------------------------------------------------
/src/engine/predictor.py:
--------------------------------------------------------------------------------
import pandas as pd
from torch import from_numpy
import yaml
from datetime import datetime
import numpy as np
from tqdm import tqdm

# processed_dataset.csv ends at "2021-01-01 00:10:00" at row index 893646;
# records are spaced 10 minutes (600 seconds) apart.
MAX_RANGE = "2021-01-01 00:10:00"
FIN_INDEX = 893646


def fetch_index(given_time, seq_length, csv_path):
    # Earliest timestamp that still has a full look-back window behind it.
    min_range = pd.read_csv(csv_path, usecols=[0],
                            skiprows=lambda x: x not in [seq_length, seq_length + 1]).to_numpy().reshape(-1)[0]

    # ISO-formatted timestamps compare correctly as plain strings.
    if given_time < min_range:
        raise ValueError(f"Requested time {given_time} predates the first predictable record {min_range}.")
    if MAX_RANGE > given_time:
        # The timestamp lies inside the dataset: look up its row index directly.
        timestamps = pd.read_csv(csv_path, usecols=["time"])
        index = timestamps[timestamps["time"] == given_time].index[0]
    else:
        # The timestamp lies in the future: extrapolate in 10-minute steps.
        fmt = '%Y-%m-%d %H:%M:%S'
        t1 = datetime.strptime(MAX_RANGE, fmt)
        t2 = datetime.strptime(given_time, fmt)
        diff = (t2 - t1).total_seconds() // 600
        index = int(FIN_INDEX + diff)

    return index


def load_mmscaler_values():
    with open("config/mmscaler_values.yaml", "r") as f:
        scaler_values = yaml.safe_load(f)
    return scaler_values


def predict_single_record(model, record_arr, device="cpu"):
    record_arr = from_numpy(record_arr).unsqueeze(0)  # add a batch dimension
    op = model(record_arr, device).squeeze(0)
    if op.device != "cpu":
        op = op.cpu()
    op = op.detach().numpy().reshape(-1)
    return op.tolist()


def anti_transform(op_arr):
    # Invert the min-max scaling applied in prepare_data.py.
    scaler_dict = load_mmscaler_values()
    for index in range(len(op_arr)):
        op_arr[index] = (op_arr[index] * (scaler_dict["max_values"][index]
                         - scaler_dict["min_values"][index])) + scaler_dict["min_values"][index]
    ans = [round(el, 3) for el in op_arr.tolist()]
    return ans


def predict_results(csv_path, index, model, seq_len):
    if index < FIN_INDEX:
        # The record already exists: read that single row (file row index + 1,
        # since file row 0 is the header) and unscale it.
        df = pd.read_csv(csv_path, skiprows=lambda x: x not in [index + 1], header=None)
        df = df.to_numpy()[:, 1:].reshape(-1)
        ans = anti_transform(df)
    else:
        # Roll the model forward one step at a time from the last seq_len records.
        num_repeats = index - FIN_INDEX
        data = pd.read_csv(csv_path,
                           skiprows=lambda x: x not in [i for i in range(FIN_INDEX - seq_len, FIN_INDEX)], header=None)
        data = data.to_numpy()[:, 1:]
        temp_data = data.astype("float32")
        print(f"Predicting the future for {num_repeats} iterations")
        for _ in tqdm(range(num_repeats)):
            op = predict_single_record(model, temp_data)
            temp_data = temp_data[1:, :].tolist()  # drop the oldest record ...
            temp_data.append(op)                   # ... and append the new prediction
            temp_data = np.array(temp_data).astype("float32")

        ans = anti_transform(temp_data[-1, :])

    return ans
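To see what `fetch_index` computes for a future timestamp, here is a worked example using the constants above (pure standard-library arithmetic; the query timestamp is the one `predict.py` uses):

```python
from datetime import datetime

fmt = "%Y-%m-%d %H:%M:%S"
t1 = datetime.strptime("2021-01-01 00:10:00", fmt)  # last record in the dataset
t2 = datetime.strptime("2021-02-01 08:50:00", fmt)  # example query from predict.py

steps = int((t2 - t1).total_seconds() // 600)  # 10-minute steps into the future
print(steps)           # 4516
print(893646 + steps)  # 898162: the virtual row index fetch_index returns
```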
--------------------------------------------------------------------------------
/src/data/prepare_data.py:
--------------------------------------------------------------------------------
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import json
import yaml


def fetch_imp_features(dataframe, job_config):
    fet_cols = job_config["imp_features"]
    renamed_cols = job_config["renamed_cols"]
    assert len(fet_cols) == len(renamed_cols), "Size mismatch, cannot rename columns."
    df = dataframe.copy()
    df = df[fet_cols]
    df.columns = renamed_cols

    return df


def process_outliers_to_nan(dataframe):
    # -9999.0 / -9999.99 are the station's missing-value sentinels;
    # 28.49 is treated as an outlier reading for wind_speed.
    df = dataframe.copy()
    df.replace(-9999.0, np.nan, inplace=True)
    df.replace(-9999.990, np.nan, inplace=True)
    df["wind_speed"] = df["wind_speed"].replace(28.4900, np.nan)

    return df


def perform_interpolation(dataframe, job_config):

    method = job_config["interpolation_method"]
    df = dataframe.copy()
    for fet in df.columns[1:]:
        df[fet] = df[fet].interpolate(method=method)

    return df


def perform_scaling(dataframe):
    # Scale every feature to [0, 1] and record each feature's min/max so
    # predictions can later be mapped back to physical units.
    df = dataframe.copy()
    mm_scalers = [MinMaxScaler() for _ in range(len(df.columns) - 1)]
    mm_scaler_values = {
        "min_values": [],
        "max_values": []
    }
    for index, fet in enumerate(df.columns[1:]):
        df[fet] = mm_scalers[index].fit_transform(df[fet].to_numpy().reshape(-1, 1)).reshape(-1)
        mm_scaler_values["min_values"].append(mm_scalers[index].data_min_.tolist()[0])
        mm_scaler_values["max_values"].append(mm_scalers[index].data_max_.tolist()[0])

    return df, mm_scaler_values


def prepare_dataset(job_config):

    ip_path = job_config["data_path"]["input"]
    op_path = job_config["data_path"]["output"]
    mmscaler_conf_path = job_config["data_path"]["mmscaler_values"]
    assert os.path.isfile(ip_path), "Source data not found ..."
    dataframe = pd.read_csv(ip_path, parse_dates=["Date Time"])
    print("Data Loaded ...")
    df = dataframe.copy()
    df = fetch_imp_features(dataframe=df, job_config=job_config)
    print("Important features filtered ...")
    df = process_outliers_to_nan(dataframe=df)
    df.drop_duplicates("time", inplace=True)
    df = perform_interpolation(dataframe=df, job_config=job_config)
    print("Performed interpolation ...")
    df, scaler_values = perform_scaling(dataframe=df)
    print("Performed scaling ...")
    with open(mmscaler_conf_path, "w") as f:
        yaml.dump(scaler_values, f)
    print("Scaler values stored ...")
    df.to_csv(op_path, index=False)
    print(f"Processed data stored to {op_path}")


def main():
    job_config_path = os.path.join(os.getcwd(), "config", "prepare_data.json")
    with open(job_config_path, "r") as f:
        job_config = json.load(f)
    print(job_config)
    prepare_dataset(job_config=job_config)


if __name__ == "__main__":
    main()
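The scaler values written above are exactly what `anti_transform` in `src/engine/predictor.py` uses to undo the scaling. A tiny round-trip sketch with the first feature's recorded bounds (pressure, from `config/mmscaler_values.yaml`; the raw reading is an arbitrary example):

```python
# Pressure (first feature) bounds recorded in config/mmscaler_values.yaml.
p_min, p_max = 913.6, 1020.07

raw = 1000.0                                 # a pressure reading in mbar
scaled = (raw - p_min) / (p_max - p_min)     # what MinMaxScaler produces
restored = scaled * (p_max - p_min) + p_min  # what anti_transform computes
print(round(scaled, 4), round(restored, 3))  # 0.8115 1000.0
```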
--------------------------------------------------------------------------------
/src/utils/training.py:
--------------------------------------------------------------------------------
import pandas as pd
from src.models.lstm_model import LSTM
from src.data.dataset import create_loaders
import matplotlib.pyplot as plt
import torch


def init_objects(job_config):
    # Build everything the trainer needs from the YAML job config.
    dataframe = pd.read_csv(job_config["dataset"]["path"])
    train_dl, val_dl = create_loaders(dataframe=dataframe,
                                      split_ratio=job_config["dataset"]["split_ratio"],
                                      batch_size=job_config["dataset"]["batch_size"],
                                      seq_length=job_config["dataset"]["seq_length"])
    print("Dataloaders Generated...")
    model = LSTM(num_classes=job_config["model"]["num_classes"],
                 input_size=job_config["model"]["input_size"],
                 hidden_size=job_config["model"]["hidden_size"],
                 num_layers=job_config["model"]["num_layers"])
    print("Model Generated...")
    print(model)
    optim = torch.optim.Adam(params=model.parameters(),
                             lr=job_config["optimizer"]["lr"])
    print("Optimizer Generated...")
    loss_fn = torch.nn.MSELoss()

    return (train_dl, val_dl), model, optim, loss_fn


def save_best_model_on_loss(curr_losses, best_losses, model, optim, track_on="validation"):
    assert track_on in ["training", "validation"]
    curr_train_loss, curr_val_loss = curr_losses
    best_train_loss, best_val_loss = best_losses
    if track_on == "training":
        flag = save_best_model(curr_loss=curr_train_loss,
                               best_loss=best_train_loss,
                               model=model,
                               optim=optim)
    else:
        flag = save_best_model(curr_loss=curr_val_loss,
                               best_loss=best_val_loss,
                               model=model,
                               optim=optim)
    if flag:
        return curr_losses
    else:
        return best_losses


def save_best_model(curr_loss, best_loss, model, optim):
    # Persist the model and optimizer state whenever the tracked loss improves.
    if curr_loss <= best_loss:
        weights = {
            "model": model.state_dict(),
            "optim": optim.state_dict()
        }
        torch.save(weights, "best_model.pt")
        print("Model Updated...")
        print(f"Current best loss : {'%.6f' % curr_loss}")
        return True
    else:
        print("Model didn't update...")
        print(f"Current best loss : {'%.6f' % best_loss}")
        return False


def save_training_curve(train_dict, val_dict):
    for name in val_dict.keys():
        plt.figure(figsize=(20, 6))
        plt.plot(train_dict[name], label=f"train_{name}")
        plt.plot(val_dict[name], label=f"val_{name}")
        plt.legend()
        plt.savefig(f"results/{name}.png")
        plt.close()


def update_metric_dict(true, pred, metric_dict, res_dict):
    # Metrics are computed on CPU numpy arrays, flattened across the batch.
    if true.device != "cpu":
        true = true.cpu()
        pred = pred.cpu()
    true = true.detach().numpy().reshape(-1)
    pred = pred.detach().numpy().reshape(-1)
    for name, metric in metric_dict.items():
        res_dict[name].append(metric(true, pred))

    return res_dict
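For clarity, a short sketch of the checkpoint contract: `save_best_model` above writes a dict holding both state dicts, and the two loader helpers read back the `"model"` entry. The config values below are the repo's defaults; paths and hyperparameters are otherwise arbitrary.

```python
import torch
from src.models.lstm_model import LSTM

model_config = {"num_classes": 10, "input_size": 10, "hidden_size": 2, "num_layers": 1}
model = LSTM(**model_config)
optim = torch.optim.Adam(model.parameters(), lr=0.0005)

# What save_best_model writes ...
torch.save({"model": model.state_dict(), "optim": optim.state_dict()}, "best_model.pt")

# ... and what load_best_model / load_model read back.
chkp = torch.load("best_model.pt", map_location="cpu")
restored = LSTM(**model_config)
restored.load_state_dict(chkp["model"])
```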
--------------------------------------------------------------------------------
/src/engine/pytorch_trainer.py:
--------------------------------------------------------------------------------
import yaml
import torch
import numpy as np
from src.utils.training import (init_objects,
                                save_best_model_on_loss,
                                save_training_curve,
                                update_metric_dict)
from src.utils.mlflow_logger import log_pt_models
from src.utils.load_checkpoint import load_best_model
from src.utils.metrics import metrics_


def trainer(model, train_dl, val_dl, loss_fn, optim, job_config):
    training_hp = job_config["training_hp"]
    epochs = training_hp["num_epochs"]
    log_index = training_hp["log_index"]
    best_train_loss = torch.inf
    best_val_loss = torch.inf
    train_loss_array, val_loss_array = [], []
    train_metric_results = {k: [] for k, _ in metrics_.items()}
    val_metric_results = {k: [] for k, _ in metrics_.items()}
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device found : {device}")
    model.to(device)
    print('Model loaded on device...')
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1} :")
        train_epoch_metric_results = {k: [] for k, _ in metrics_.items()}
        val_epoch_metric_results = {k: [] for k, _ in metrics_.items()}
        train_loss = val_loss = 0.0
        model.train()
        for index, (x, y) in enumerate(train_dl):
            if device != "cpu":
                x = x.cuda()
                y = y.cuda()
            op = model(x, device)
            train_epoch_metric_results = update_metric_dict(y, op, metrics_, train_epoch_metric_results)
            curr_loss = loss_fn(op, y)
            if index % log_index == 0:
                print(f"Step {index} Loss : {'%.6f' % curr_loss.item()}")
            train_loss += curr_loss.item()
            optim.zero_grad()  # clear gradients left over from the previous step
            curr_loss.backward()
            optim.step()
        model.eval()
        with torch.no_grad():  # no gradients needed during validation
            for x, y in val_dl:
                if device != "cpu":
                    x = x.cuda()
                    y = y.cuda()
                op = model(x, device)
                val_epoch_metric_results = update_metric_dict(y, op, metrics_, val_epoch_metric_results)
                curr_loss = loss_fn(op, y)
                val_loss += curr_loss.item()
        print(f"Train Loss : {'%.6f' % train_loss} || Validation Loss : {'%.6f' % val_loss}")
        # Average the per-batch metrics over the epoch for both splits.
        train_res = {k: np.mean(v) for k, v in train_epoch_metric_results.items()}
        val_res = {k: np.mean(v) for k, v in val_epoch_metric_results.items()}
        print(f"Train Metric Results : {train_res}")
        print(f"Validation Metric Results : {val_res}")
        for tot, epo in zip([train_metric_results, val_metric_results],
                            [train_res, val_res]):
            for name, val in epo.items():
                tot[name].append(val)
        train_loss_array.append(train_loss)
        val_loss_array.append(val_loss)
        best_train_loss, best_val_loss = save_best_model_on_loss(curr_losses=(train_loss, val_loss),
                                                                 best_losses=(best_train_loss, best_val_loss),
                                                                 model=model,
                                                                 optim=optim)
        print("\n")
    results = {
        "training_mse": best_train_loss,
        "validation_mse": best_val_loss
    }
    best_model = load_best_model(job_config["model"])
    log_pt_models(model=best_model,
                  hparams=job_config,
                  results=results)
    train_metric_results["mse"] = train_loss_array
    val_metric_results["mse"] = val_loss_array
    save_training_curve(train_metric_results, val_metric_results)
    print("Training Completed...")


def main():
    job_config_path = "config/pt_training.yaml"
    with open(job_config_path, "r") as f:
        job_config = yaml.safe_load(f)
    print("Configuration Loaded...")
    print(job_config)
    (train_dl, val_dl), lstm_model, optim, loss_fn = init_objects(job_config)
    trainer(model=lstm_model,
            train_dl=train_dl,
            val_dl=val_dl,
            loss_fn=loss_fn,
            optim=optim,
            job_config=job_config)


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# SamayYantra

Weather prediction for Beutenberg using time-series forecasting with deep learning.

## GOALS of the Project

In ancient times, people predicted the coming days by watching the sky and sensing its humidity among other factors, and many of those predictions proved true to a surprisingly high accuracy.
Nowadays we can measure so many weather factors that prediction has become too complex for the human brain to handle, and deep learning has stepped in to bring ease to this domain.
Time-series forecasting with deep neural networks changes the frontiers: we can now predict/forecast innumerable future attributes based on historical data, and beyond the predictions themselves, the outputs can feed more advanced analysis and research.
The secondary goal of the project is to apply cutting-edge MLOps to an actual real-world problem.
The tertiary goal is to implement the project in traditional Python OOP scripts rather than a Jupyter Notebook, so that it matches real-world ML codebases and we can learn accordingly.

## Technology

1. PyTorch
2. Scikit-Learn
3. Pandas
4. NumPy
5. MLflow
# Data Collection Process :

The raw data was recorded by the weather station of the [Max Planck Institute for Biogeochemistry](https://www.bgc-jena.mpg.de/wetter/) in Jena, Germany.
The Jena weather dataset is made up of many different quantities (such as air temperature, atmospheric pressure, humidity, wind direction, and so on) recorded every 10 minutes over several years. This dataset covers January 1st, 2004 to December 31st, 2020.
The actual data used here is a copy published for academic purposes as a Kaggle dataset: [kaggle/Weather Station Beutenberg Dataset](https://www.kaggle.com/datasets/mnassrib/jena-weather-dataset).
The primary data is stored as a single *dataset.csv* file, which is later processed into *processed_dataset.csv* for training.

### Special Note :
The data is versioned with **DVC (Data Version Control)**, so the repository can be
used flexibly without committing the data straight into the repo; instead it is fetched from any remote source, e.g. **AWS S3**, **GDRIVE**, etc.
In this case, the data is stored on GDRIVE (see the configuration sketch just after the directory tree below).


# Directory Structure :

The repository follows a strict data-science project structure.

    .
    └── root/
        ├── .dvc/
        ├── config/
        ├── mlruns/
        ├── notebooks/
        ├── results/
        └── src/
            ├── data
            ├── engine
            ├── models
            └── utils
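As a rough sketch of how this setup works (the folder ID below is the one committed in `.dvc/config`; the `remote add`/`add`/`push` steps are only for whoever maintains the data, while consumers only need `dvc pull`):

```shell
# One-time remote setup (already committed in .dvc/config for this repo):
dvc remote add -d storage gdrive://1OFD5KiaSD2GPs3VR-8fHB0dSnRfXVyNF

# Maintainer side: track the raw data and upload it to the remote.
dvc add dataset.csv
dvc push

# Consumer side: fetch everything tracked by the *.dvc files.
dvc pull
```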

# Installation and Usage :

### Installation

1. Create a virtual environment: [Tutorial](https://docs.python.org/3/library/venv.html)
2. Clone the repository by running this command:
```shell
git clone https://github.com/sagnik1511/samay_yantra.git
```
3. Open the directory in a terminal (e.g. *cmd*).
4. Run this command in the terminal to install the dependencies:
```shell
pip install -r requirements.txt
```
5. Installing requirements.txt may fail if your MS Visual C++ Build Tools are outdated. [This video](https://www.youtube.com/watch?v=rcI1_e38BWs) shows how to fix that.

# Approach :
1. Go to the root directory using the `cd` command.
2. The first step is to download the actual data into the project. Copy and run this command:
```shell
dvc pull
```
3. If you want to run the training process, simply change the configuration in `config/pt_training.yaml`, then run this command. Keep in mind that you have to stay at the root directory:
```shell
python -m src.engine.pytorch_trainer
```

4. To generate a forecast with the trained model, use `predict.py` (see the sketch below). Further usage will be updated soon...

# Results:
You can visit the [results](https://github.com/sagnik1511/samay_yantra/tree/main/results) directory, where all the runs are stored. Currently, for privacy reasons, the MLflow runs are not shared here.
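As a minimal sketch of step 4 above: `predict.py` hardcodes its inputs in its `__main__` block, but the `predict` function can also be called directly. The timestamp below is the example already used in the script, and the model and data must be present locally (run `dvc pull` first).

```python
from predict import predict

# Timestamps inside the dataset are looked up directly; future timestamps
# are forecast by rolling the model forward in 10-minute steps.
ans = predict("2021-02-01 08:50:00", "config/predict.yaml")
print(ans)  # ten unscaled feature values (pressure, temperature, ...)
```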
Thanks for visiting :D

Do STAR the repository if you find it useful!
93 | 94 | 95 | -------------------------------------------------------------------------------- /results/training_report.txt: -------------------------------------------------------------------------------- 1 | Configuration Loaded... 2 | {'dataset': {'path': 'processed_dataset.csv', 'batch_size': 5000, 'seq_length': 12, 'split_ratio': 0.8}, 'model': {'num_classes': 10, 'input_size': 10, 'hidden_size': 2, 'num_layers': 3 | 1}, 'optimizer': {'lr': 0.0005}, 'training_hp': {'num_epochs': 10, 'log_index': 10}} 4 | Dataloaders Generated... 5 | Model Generated... 6 | LSTM( 7 | (lstm): LSTM(10, 2, batch_first=True) 8 | (fc): Linear(in_features=2, out_features=10, bias=True) 9 | ) 10 | Optimizer Generated... 11 | Device found : cuda 12 | Model loaded on device... 13 | Epoch 1 : 14 | Step 0 Loss : 0.400730 15 | Step 10 Loss : 0.390075 16 | Step 20 Loss : 0.379931 17 | Step 30 Loss : 0.367938 18 | Step 40 Loss : 0.354751 19 | Step 50 Loss : 0.342859 20 | Step 60 Loss : 0.329298 21 | Step 70 Loss : 0.316875 22 | Step 80 Loss : 0.305422 23 | Step 90 Loss : 0.294833 24 | Step 100 Loss : 0.284875 25 | Step 110 Loss : 0.275529 26 | Step 120 Loss : 0.266353 27 | Step 130 Loss : 0.258478 28 | Step 140 Loss : 0.249010 29 | Train Loss : 45.733553 || Validation Loss : 8.782230 30 | Model Updated... 31 | Current best loss : 8.782230 32 | 33 | 34 | Epoch 2 : 35 | Step 0 Loss : 0.246973 36 | Step 10 Loss : 0.239489 37 | Step 20 Loss : 0.231310 38 | Step 30 Loss : 0.222712 39 | Step 40 Loss : 0.215650 40 | Step 50 Loss : 0.208018 41 | Step 60 Loss : 0.201227 42 | Step 70 Loss : 0.193382 43 | Step 80 Loss : 0.184516 44 | Step 90 Loss : 0.177906 45 | Step 100 Loss : 0.171014 46 | Step 110 Loss : 0.163382 47 | Step 120 Loss : 0.157045 48 | Step 130 Loss : 0.150653 49 | Step 140 Loss : 0.144826 50 | Train Loss : 27.613929 || Validation Loss : 5.089219 51 | Model Updated... 52 | Current best loss : 5.089219 53 | 54 | 55 | Epoch 3 : 56 | Step 0 Loss : 0.142830 57 | Step 10 Loss : 0.136966 58 | Step 20 Loss : 0.131699 59 | Step 30 Loss : 0.126804 60 | Step 40 Loss : 0.122074 61 | Step 50 Loss : 0.117513 62 | Step 60 Loss : 0.113231 63 | Step 70 Loss : 0.108528 64 | Step 80 Loss : 0.104816 65 | Step 90 Loss : 0.101312 66 | Step 100 Loss : 0.096990 67 | Step 110 Loss : 0.094033 68 | Step 120 Loss : 0.089982 69 | Step 130 Loss : 0.086228 70 | Step 140 Loss : 0.082436 71 | Train Loss : 15.721028 || Validation Loss : 2.940303 72 | Model Updated... 73 | Current best loss : 2.940303 74 | 75 | 76 | Epoch 4 : 77 | Step 0 Loss : 0.082047 78 | Step 10 Loss : 0.078933 79 | Step 20 Loss : 0.075038 80 | Step 30 Loss : 0.072082 81 | Step 40 Loss : 0.069719 82 | Step 50 Loss : 0.066482 83 | Step 60 Loss : 0.064385 84 | Step 70 Loss : 0.062030 85 | Step 80 Loss : 0.060018 86 | Step 90 Loss : 0.058719 87 | Step 100 Loss : 0.057207 88 | Step 110 Loss : 0.055559 89 | Step 120 Loss : 0.054028 90 | Step 130 Loss : 0.052832 91 | Step 140 Loss : 0.051676 92 | Train Loss : 9.110056 || Validation Loss : 1.936779 93 | Model Updated... 
94 | Current best loss : 1.936779 95 | 96 | 97 | Epoch 5 : 98 | Step 0 Loss : 0.052366 99 | Step 10 Loss : 0.051080 100 | Step 20 Loss : 0.050515 101 | Step 30 Loss : 0.049729 102 | Step 40 Loss : 0.048244 103 | Step 50 Loss : 0.047954 104 | Step 60 Loss : 0.045566 105 | Step 70 Loss : 0.045765 106 | Step 80 Loss : 0.044726 107 | Step 90 Loss : 0.044683 108 | Step 100 Loss : 0.043754 109 | Step 110 Loss : 0.042881 110 | Step 120 Loss : 0.042125 111 | Step 130 Loss : 0.041396 112 | Step 140 Loss : 0.040874 113 | Train Loss : 6.564872 || Validation Loss : 1.578639 114 | Model Updated... 115 | Current best loss : 1.578639 116 | 117 | 118 | Epoch 6 : 119 | Step 0 Loss : 0.040972 120 | Step 10 Loss : 0.039978 121 | Step 20 Loss : 0.040800 122 | Step 30 Loss : 0.039807 123 | Step 40 Loss : 0.039929 124 | Step 50 Loss : 0.039873 125 | Step 60 Loss : 0.040065 126 | Step 70 Loss : 0.039826 127 | Step 80 Loss : 0.039482 128 | Step 90 Loss : 0.040380 129 | Step 100 Loss : 0.039739 130 | Step 110 Loss : 0.040256 131 | Step 120 Loss : 0.040269 132 | Step 130 Loss : 0.039535 133 | Step 140 Loss : 0.039283 134 | Train Loss : 5.723184 || Validation Loss : 1.546589 135 | Model Updated... 136 | Current best loss : 1.546589 137 | 138 | 139 | Epoch 7 : 140 | Step 0 Loss : 0.039914 141 | Step 10 Loss : 0.039406 142 | Step 20 Loss : 0.039742 143 | Step 30 Loss : 0.040374 144 | Step 40 Loss : 0.039554 145 | Step 50 Loss : 0.040020 146 | Step 60 Loss : 0.039124 147 | Step 70 Loss : 0.039243 148 | Step 80 Loss : 0.039483 149 | Step 90 Loss : 0.039072 150 | Step 100 Loss : 0.038611 151 | Step 110 Loss : 0.038497 152 | Step 120 Loss : 0.038803 153 | Step 130 Loss : 0.038428 154 | Step 140 Loss : 0.037840 155 | Train Loss : 5.607280 || Validation Loss : 1.460559 156 | Model Updated... 157 | Current best loss : 1.460559 158 | 159 | 160 | Epoch 8 : 161 | Step 0 Loss : 0.036983 162 | Step 10 Loss : 0.037550 163 | Step 20 Loss : 0.037424 164 | Step 30 Loss : 0.037318 165 | Step 40 Loss : 0.037119 166 | Step 50 Loss : 0.037172 167 | Step 60 Loss : 0.036731 168 | Step 70 Loss : 0.036120 169 | Step 80 Loss : 0.035058 170 | Step 90 Loss : 0.035926 171 | Step 100 Loss : 0.035322 172 | Step 110 Loss : 0.035502 173 | Step 120 Loss : 0.035048 174 | Step 130 Loss : 0.034890 175 | Step 140 Loss : 0.034707 176 | Train Loss : 5.159610 || Validation Loss : 1.327795 177 | Model Updated... 178 | Current best loss : 1.327795 179 | 180 | 181 | Epoch 9 : 182 | Step 0 Loss : 0.034557 183 | Step 10 Loss : 0.034898 184 | Step 20 Loss : 0.034155 185 | Step 30 Loss : 0.033752 186 | Step 40 Loss : 0.033506 187 | Step 50 Loss : 0.034096 188 | Step 60 Loss : 0.033892 189 | Step 70 Loss : 0.033634 190 | Step 80 Loss : 0.033149 191 | Step 90 Loss : 0.033251 192 | Step 100 Loss : 0.033261 193 | Step 110 Loss : 0.032598 194 | Step 120 Loss : 0.033227 195 | Step 130 Loss : 0.032869 196 | Step 140 Loss : 0.032903 197 | Train Loss : 4.798131 || Validation Loss : 1.240079 198 | Model Updated... 
199 | Current best loss : 1.240079 200 | 201 | 202 | Epoch 10 : 203 | Step 0 Loss : 0.033113 204 | Step 10 Loss : 0.032516 205 | Step 20 Loss : 0.032675 206 | Step 30 Loss : 0.032951 207 | Step 40 Loss : 0.033049 208 | Step 50 Loss : 0.032906 209 | Step 60 Loss : 0.032652 210 | Step 70 Loss : 0.033128 211 | Step 80 Loss : 0.032546 212 | Step 90 Loss : 0.032942 213 | Step 100 Loss : 0.033307 214 | Step 110 Loss : 0.033222 215 | Step 120 Loss : 0.033647 216 | Step 130 Loss : 0.033756 217 | Step 140 Loss : 0.034032 218 | Train Loss : 4.724269 || Validation Loss : 1.260879 219 | Model didn't Updated... 220 | Current best loss : 1.240079 221 | 222 | 223 | 2022/06/13 19:36:28 WARNING mlflow.utils.requirements_utils: Found torch version (1.11.0+cu113) contains a local version label (+cu113). MLflow logged a pip requirement for this packag 224 | e as 'torch==1.11.0' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requi 225 | rements`. 226 | 2022/06/13 19:36:34 WARNING mlflow.utils.requirements_utils: Found torch version (1.11.0+cu113) contains a local version label (+cu113). MLflow logged a pip requirement for this packag 227 | e as 'torch==1.11.0' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requi 228 | rements`. 229 | Training Completed... --------------------------------------------------------------------------------