├── src
│   ├── .gitkeep
│   ├── __init__.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   └── prepare_data.py
│   ├── engine
│   │   ├── __init__.py
│   │   ├── predictor.py
│   │   └── pytorch_trainer.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── load_checkpoint.py
│   │   └── lstm_model.py
│   └── utils
│       ├── __init__.py
│       ├── metrics.py
│       ├── load_checkpoint.py
│       ├── mlflow_logger.py
│       └── training.py
├── notebooks
│   ├── .gitkeep
│   └── weather-prediction.ipynb
├── results
│   ├── .gitkeep
│   ├── mae.png
│   ├── mse.png
│   ├── training_metrics.png
│   └── training_report.txt
├── .dvc
│   ├── .gitignore
│   └── config
├── requirements.txt
├── .gitignore
├── best_model.pt.dvc
├── dataset.csv.dvc
├── processed_dataset.csv.dvc
├── .dvcignore
├── config
│   ├── predict.yaml
│   ├── mmscaler_values.yaml
│   ├── pt_training.yaml
│   └── prepare_data.json
├── predict.py
└── README.md

--------------------------------------------------------------------------------
/src/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/notebooks/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/results/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/data/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/engine/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.dvc/.gitignore:
--------------------------------------------------------------------------------
/config.local
/tmp
/cache

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sagnik1511/samay_yantra/HEAD/requirements.txt

--------------------------------------------------------------------------------
/results/mae.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sagnik1511/samay_yantra/HEAD/results/mae.png

--------------------------------------------------------------------------------
/results/mse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sagnik1511/samay_yantra/HEAD/results/mse.png

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
venv
.idea/
__pycache__/
mlruns/
dataset.csv
processed_dataset.csv
best_model.pt
--------------------------------------------------------------------------------
/best_model.pt.dvc:
--------------------------------------------------------------------------------
outs:
- md5: 1886f4f99f525d703077729a9a07c8ce
  size: 6072
  path: best_model.pt

--------------------------------------------------------------------------------
/dataset.csv.dvc:
--------------------------------------------------------------------------------
outs:
- md5: cedea3f0adbde8941aa869c5d011f1f9
  size: 120605973
  path: dataset.csv

--------------------------------------------------------------------------------
/results/training_metrics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sagnik1511/samay_yantra/HEAD/results/training_metrics.png

--------------------------------------------------------------------------------
/.dvc/config:
--------------------------------------------------------------------------------
[core]
    remote = storage
['remote "storage"']
    url = gdrive://1OFD5KiaSD2GPs3VR-8fHB0dSnRfXVyNF

--------------------------------------------------------------------------------
/processed_dataset.csv.dvc:
--------------------------------------------------------------------------------
outs:
- md5: 8a1a8b779d09205871e67ccc4092033e
  size: 171641999
  path: processed_dataset.csv

--------------------------------------------------------------------------------
/.dvcignore:
--------------------------------------------------------------------------------
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore

--------------------------------------------------------------------------------
/src/utils/metrics.py:
--------------------------------------------------------------------------------
from sklearn.metrics import mean_absolute_error

# Metrics tracked alongside the MSE training loss; add entries here to
# report additional metrics during training and validation.
metrics_ = {
    "mae": mean_absolute_error
}

--------------------------------------------------------------------------------
/config/predict.yaml:
--------------------------------------------------------------------------------
dataset:
  path: "processed_dataset.csv"
  seq_length: 20
model:
  num_classes: 10
  input_size: 10
  hidden_size: 2
  num_layers: 1
  checkpoint_path: "best_model.pt"

--------------------------------------------------------------------------------
/config/mmscaler_values.yaml:
--------------------------------------------------------------------------------
max_values:
- 1020.07
- 37.28
- 100.0
- 28.32
- 18.13
- 28.82
- 1393.54
- 16.83
- 14.0
- 1219.32
min_values:
- 913.6
- -23.01
- 12.95
- 0.79
- 0.5
- 0.8
- 1059.45
- -0.39
- 0.0
- 0.0

--------------------------------------------------------------------------------
/src/utils/load_checkpoint.py:
--------------------------------------------------------------------------------
import torch
from src.models.lstm_model import LSTM


def load_best_model(model_config):
    path = "best_model.pt"
    # map_location keeps loading safe on CPU-only machines.
    chkp = torch.load(path, map_location="cpu")
    model = LSTM(**model_config)
    model.load_state_dict(chkp["model"])

    return model
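
--------------------------------------------------------------------------------
Example: inverting the min-max scaling (illustrative; not a repo file)
--------------------------------------------------------------------------------
The per-column extrema stored in config/mmscaler_values.yaml are enough to undo
the scaling without refitting a scaler. A minimal sketch of that inverse
transform, mirroring what anti_transform in src/engine/predictor.py does below;
inverse_scale is a hypothetical helper, and the sample values are illustrative:

import yaml

with open("config/mmscaler_values.yaml") as f:
    scaler = yaml.safe_load(f)

def inverse_scale(row):
    """Map model outputs in [0, 1] back to physical units, column by column."""
    return [
        v * (mx - mn) + mn
        for v, mn, mx in zip(row, scaler["min_values"], scaler["max_values"])
    ]

# e.g. a scaled pressure of 0.5 maps back to 913.6 + 0.5 * (1020.07 - 913.6)
print(inverse_scale([0.5] * 10)[0])  # ~966.835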
--------------------------------------------------------------------------------
/config/pt_training.yaml:
--------------------------------------------------------------------------------
dataset:
  path: "processed_dataset.csv"
  batch_size: 5000
  seq_length: 20
  split_ratio: 0.8
model:
  num_classes: 10
  input_size: 10
  hidden_size: 2
  num_layers: 1
optimizer:
  lr: 0.0005
training_hp:
  num_epochs: 10
  log_index: 10

--------------------------------------------------------------------------------
/src/models/load_checkpoint.py:
--------------------------------------------------------------------------------
import torch
from src.models.lstm_model import LSTM


def load_model(path, model_param, device="cpu"):
    chkp = torch.load(path, map_location=device)
    # "checkpoint_path" sits in the same config block but is not an LSTM argument.
    del model_param["checkpoint_path"]
    model = LSTM(**model_param)
    model.load_state_dict(chkp["model"])

    return model

--------------------------------------------------------------------------------
/notebooks/weather-prediction.ipynb:
--------------------------------------------------------------------------------
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"print(\"NOOBS code in Jupyter Notebook!\")","metadata":{},"execution_count":null,"outputs":[]}]}

--------------------------------------------------------------------------------
/src/utils/mlflow_logger.py:
--------------------------------------------------------------------------------
import mlflow
from urllib.parse import urlparse


def log_pt_models(model, hparams, results):
    with mlflow.start_run():
        for key in ["batch_size", "seq_length", "split_ratio"]:
            mlflow.log_param(key, hparams["dataset"][key])
        for key in ["model", "optimizer", "training_hp"]:
            mlflow.log_params(hparams[key])
        mlflow.log_metrics(results)

        # The model registry is unavailable when tracking to a local file store.
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        if tracking_url_type_store != "file":
            mlflow.pytorch.log_model(model, "model", registered_model_name="LSTM_Model")
        else:
            mlflow.pytorch.log_model(model, "model")

--------------------------------------------------------------------------------
/config/prepare_data.json:
--------------------------------------------------------------------------------
{
    "data_path": {
        "input": "dataset.csv",
        "output": "processed_dataset.csv",
        "mmscaler_values": "config/mmscaler_values.yaml"
    },
    "imp_features": [
        "Date Time",
        "p (mbar)",
        "T (degC)",
        "rh (%)",
        "VPact (mbar)",
        "sh (g/kg)",
        "H2OC (mmol/mol)",
        "rho (g/m**3)",
        "wv (m/s)",
        "rain (mm)",
        "SWDR (W/m**2)"
    ],
    "renamed_cols": [
        "time",
        "pressure",
        "temperature",
        "relative_humidity",
        "vapour_pressure",
        "specific_humidity",
        "water_vap_concentration",
        "airtight",
        "wind_speed",
        "rain",
        "SWDR"
    ],
    "interpolation_method": "slinear"
}
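
--------------------------------------------------------------------------------
Example: config-driven column selection and renaming (illustrative; not a repo file)
--------------------------------------------------------------------------------
A minimal sketch of how the imp_features/renamed_cols pairing in this config is
applied; the real work happens in fetch_imp_features in src/data/prepare_data.py
further down. The two-column frame here is a toy, not the actual dataset:

import pandas as pd

job_config = {
    "imp_features": ["Date Time", "T (degC)"],
    "renamed_cols": ["time", "temperature"],
}

df = pd.DataFrame({"Date Time": ["2021-02-01 08:50:00"],
                   "T (degC)": [4.2],
                   "unused": [0]})

# Keep only the configured columns, then rename them positionally.
df = df[job_config["imp_features"]]
df.columns = job_config["renamed_cols"]
print(df)  # columns: time, temperature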
--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
from src.engine.predictor import fetch_index, predict_results
from src.models.load_checkpoint import load_model
import yaml


def predict(user_input, config_path):

    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    print(config)
    model = load_model(config["model"]["checkpoint_path"], config["model"])
    index = fetch_index(user_input, config["dataset"]["seq_length"], config["dataset"]["path"])
    print(f"index : {index}")
    ans = predict_results(config["dataset"]["path"], index, model, config["dataset"]["seq_length"])

    return ans


if __name__ == "__main__":
    user_inp = "2021-02-01 08:50:00"
    ans = predict(user_inp, "config/predict.yaml")
    print(ans)

--------------------------------------------------------------------------------
/src/data/dataset.py:
--------------------------------------------------------------------------------
from torch.utils.data import Dataset, DataLoader


class WeatherDataset(Dataset):

    def __init__(self, meta_df, seq_length):
        self.meta_df = meta_df
        self.seq_length = seq_length

    def __len__(self):
        return len(self.meta_df) - self.seq_length - 1

    def __getitem__(self, index):
        # X: seq_length consecutive rows of features; y: the row that follows.
        X = self.meta_df.iloc[index: index + self.seq_length, 1:].to_numpy().astype("float32")
        y = self.meta_df.iloc[index + self.seq_length, 1:].to_numpy().astype("float32")

        return X, y


def split_data(dataframe, split_ratio=0.8):
    df = dataframe.copy()
    split_index = int(split_ratio * len(df))
    train_df = df.iloc[:split_index]
    val_df = df.iloc[split_index:]

    return train_df, val_df


def create_loaders(dataframe, split_ratio, batch_size, seq_length):
    train_set, val_set = split_data(dataframe=dataframe, split_ratio=split_ratio)
    train_ds = WeatherDataset(train_set, seq_length)
    val_ds = WeatherDataset(val_set, seq_length)

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=True)

    return train_dl, val_dl

--------------------------------------------------------------------------------
/src/models/lstm_model.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn


class LSTM(nn.Module):

    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()

        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, device):
        h_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        _, (h_out, _) = self.lstm(x, (h_0, c_0))
        # h_out: (num_layers, batch, hidden_size) -> (batch, hidden_size) for num_layers=1.
        h_out = h_out.view(-1, self.hidden_size)
        out = self.sigmoid(self.fc(h_out))
        return out


def test():
    b, s, c = 4, 12, 10
    rand_value = torch.rand(b, s, c)

    model = LSTM(num_classes=c, input_size=c, hidden_size=2, num_layers=1)
    op = model(rand_value, "cpu")
    print(op.shape)


if __name__ == "__main__":
    test()
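
--------------------------------------------------------------------------------
Example: tracing tensor shapes through the LSTM (illustrative; not a repo file)
--------------------------------------------------------------------------------
To make the shapes concrete, a small sketch tracing one batch through the model;
it mirrors the test() helper above with the required device argument, and the
dimensions follow the configs (seq_length=20, 10 features):

import torch
from src.models.lstm_model import LSTM

model = LSTM(num_classes=10, input_size=10, hidden_size=2, num_layers=1)

x = torch.rand(4, 20, 10)   # (batch, seq_length, features)
out = model(x, "cpu")
# nn.LSTM returns h_out of shape (num_layers, batch, hidden_size) = (1, 4, 2);
# view(-1, hidden_size) gives (4, 2) and the fc layer maps that to (4, 10).
print(out.shape)            # torch.Size([4, 10])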
--------------------------------------------------------------------------------
/src/engine/predictor.py:
--------------------------------------------------------------------------------
import pandas as pd
from torch import from_numpy
import yaml
from datetime import datetime
import numpy as np
from tqdm import tqdm


def fetch_index(given_time, seq_length, csv_path):
    # Earliest timestamp that has a full seq_length history behind it.
    min_range = pd.read_csv(csv_path, usecols=[0],
                            skiprows=lambda x: x not in [seq_length, seq_length + 1]).to_numpy().reshape(-1)[0]
    # Last timestamp present in the processed CSV, and its row index.
    max_range = "2021-01-01 00:10:00"
    fin_index = 893646

    if given_time < min_range:
        raise ValueError(f"Requested time {given_time} predates the usable data range (starts at {min_range}).")
    else:
        if max_range > given_time:
            timestamps = pd.read_csv(csv_path, usecols=["time"])
            index = timestamps[timestamps["time"] == given_time].index[0]
        else:
            # Future timestamp: count 10-minute steps past the end of the data.
            fmt = '%Y-%m-%d %H:%M:%S'
            t1 = datetime.strptime(max_range, fmt)
            t2 = datetime.strptime(given_time, fmt)
            diff = (t2 - t1).total_seconds() // 600
            index = int(fin_index + diff)

    return index


def load_mmscaler_values():
    with open("config/mmscaler_values.yaml", "r") as f:
        scaler_values = yaml.safe_load(f)
    return scaler_values


def predict_single_record(model, record_arr, device="cpu"):
    record_arr = from_numpy(record_arr).unsqueeze(0)
    op = model(record_arr, device).squeeze(0)
    if op.device != "cpu":
        op = op.cpu()
    op = op.detach().numpy().reshape(-1)
    return op.tolist()


def anti_transform(op_arr):
    # Undo the per-column min-max scaling using the stored extrema.
    scaler_dict = load_mmscaler_values()
    for index in range(len(op_arr)):
        op_arr[index] = (op_arr[index] * (scaler_dict["max_values"][index]
                         - scaler_dict["min_values"][index])) + scaler_dict["min_values"][index]
    ans = [round(el, 3) for el in op_arr.tolist()]
    return ans


def predict_results(csv_path, index, model, seq_len):
    if index < 893646:
        # The timestamp exists on disk: read that single row directly.
        df = pd.read_csv(csv_path, skiprows=lambda x: x not in [index + 1], header=None)
        df = df.to_numpy()[:, 1:].reshape(-1)
        ans = anti_transform(df)
    else:
        # Future timestamp: roll the model forward autoregressively,
        # feeding each prediction back into the input window.
        num_repeats = index - 893646
        data = pd.read_csv(csv_path,
                           skiprows=lambda x: x not in [i for i in range(893646 - seq_len, 893646)], header=None)
        data = data.to_numpy()[:, 1:]
        temp_data = data.astype("float32")
        print(f"Predicting future for {num_repeats} iterations")
        for _ in tqdm(range(num_repeats)):
            op = predict_single_record(model, temp_data)
            temp_data = temp_data[1:, :].tolist()
            temp_data.append(op)
            temp_data = np.array(temp_data).astype("float32")

        ans = anti_transform(temp_data[-1, :])

    return ans
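
--------------------------------------------------------------------------------
Example: the future-index arithmetic (illustrative; not a repo file)
--------------------------------------------------------------------------------
The index arithmetic assumes one record every 10 minutes (600 s) and that row
893646 corresponds to "2021-01-01 00:10:00", the last timestamp in the processed
CSV. A worked example of the computation fetch_index performs for the default
request in predict.py:

from datetime import datetime

fmt = "%Y-%m-%d %H:%M:%S"
t1 = datetime.strptime("2021-01-01 00:10:00", fmt)   # last on-disk timestamp
t2 = datetime.strptime("2021-02-01 08:50:00", fmt)   # requested future time

steps = (t2 - t1).total_seconds() // 600             # 10-minute steps ahead
print(int(893646 + steps))                           # 893646 + 4516 = 898162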
--------------------------------------------------------------------------------
/src/data/prepare_data.py:
--------------------------------------------------------------------------------
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import json
import yaml


def fetch_imp_features(dataframe, job_config):
    fet_cols = job_config["imp_features"]
    renamed_cols = job_config["renamed_cols"]
    assert len(fet_cols) == len(renamed_cols), "Size mismatch, cannot rename columns."
    df = dataframe.copy()
    df = df[fet_cols]
    df.columns = renamed_cols

    return df


def process_outliers_to_nan(dataframe):
    # -9999.0 / -9999.99 are the dataset's missing-value sentinels.
    df = dataframe.copy()
    df.replace(-9999.0, np.nan, inplace=True)
    df.replace(-9999.990, np.nan, inplace=True)
    df["wind_speed"] = df["wind_speed"].replace(28.4900, np.nan)

    return df


def perform_interpolation(dataframe, job_config):

    method = job_config["interpolation_method"]
    df = dataframe.copy()
    for fet in df.columns[1:]:
        df[fet] = df[fet].interpolate(method=method)

    return df


def perform_scaling(dataframe):
    df = dataframe.copy()
    mm_scalers = [MinMaxScaler() for _ in range(len(df.columns) - 1)]
    mm_scaler_values = {
        "min_values": [],
        "max_values": []
    }
    for index, fet in enumerate(df.columns[1:]):
        df[fet] = mm_scalers[index].fit_transform(df[fet].to_numpy().reshape(-1, 1)).reshape(-1)
        mm_scaler_values["min_values"].append(mm_scalers[index].data_min_.tolist()[0])
        mm_scaler_values["max_values"].append(mm_scalers[index].data_max_.tolist()[0])

    return df, mm_scaler_values


def prepare_dataset(job_config):

    ip_path = job_config["data_path"]["input"]
    op_path = job_config["data_path"]["output"]
    mmscaler_conf_path = job_config["data_path"]["mmscaler_values"]
    assert os.path.isfile(ip_path), "Source data not found ..."
    dataframe = pd.read_csv(ip_path, parse_dates=["Date Time"])
    print("Data Loaded ...")
    df = dataframe.copy()
    df = fetch_imp_features(dataframe=df, job_config=job_config)
    print("Important features filtered ...")
    df = process_outliers_to_nan(dataframe=df)
    df.drop_duplicates("time", inplace=True)
    df = perform_interpolation(dataframe=df, job_config=job_config)
    print("Performed interpolation ...")
    df, scaler_values = perform_scaling(dataframe=df)
    print("Performed scaling ...")
    with open(mmscaler_conf_path, "w") as f:
        yaml.dump(scaler_values, f)
    print("Scaler values stored ...")
    df.to_csv(op_path, index=False)
    print(f"Processed data stored to {op_path}")


def main():
    job_config_path = os.path.join(os.getcwd(), "config", "prepare_data.json")
    with open(job_config_path, "r") as f:
        job_config = json.load(f)
    print(job_config)
    prepare_dataset(job_config=job_config)


if __name__ == "__main__":
    main()
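
--------------------------------------------------------------------------------
Example: what "slinear" interpolation does (illustrative; not a repo file)
--------------------------------------------------------------------------------
For intuition, a small sketch of how interpolate(method="slinear") fills the
NaNs introduced by the sentinel replacement; the toy series is invented, and
"slinear" requires SciPy:

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, np.nan, 4.0])
# "slinear" delegates to SciPy's first-order spline; on an evenly spaced
# index it behaves like straight linear interpolation.
print(s.interpolate(method="slinear").tolist())   # [1.0, 2.0, 3.0, 4.0]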
--------------------------------------------------------------------------------
/src/utils/training.py:
--------------------------------------------------------------------------------
import pandas as pd
from src.models.lstm_model import LSTM
from src.data.dataset import create_loaders
import matplotlib.pyplot as plt
import torch


def init_objects(job_config):
    dataframe = pd.read_csv(job_config["dataset"]["path"])
    train_dl, val_dl = create_loaders(dataframe=dataframe,
                                      split_ratio=job_config["dataset"]["split_ratio"],
                                      batch_size=job_config["dataset"]["batch_size"],
                                      seq_length=job_config["dataset"]["seq_length"])
    print("Dataloaders Generated...")
    model = LSTM(num_classes=job_config["model"]["num_classes"],
                 input_size=job_config["model"]["input_size"],
                 hidden_size=job_config["model"]["hidden_size"],
                 num_layers=job_config["model"]["num_layers"])
    print("Model Generated...")
    print(model)
    optim = torch.optim.Adam(params=model.parameters(),
                             lr=job_config["optimizer"]["lr"])
    print("Optimizer Generated...")
    loss_fn = torch.nn.MSELoss()

    return (train_dl, val_dl), model, optim, loss_fn


def save_best_model_on_loss(curr_losses, best_losses, model, optim, track_on="validation"):
    assert track_on in ["training", "validation"]
    curr_train_loss, curr_val_loss = curr_losses
    best_train_loss, best_val_loss = best_losses
    if track_on == "training":
        flag = save_best_model(curr_loss=curr_train_loss,
                               best_loss=best_train_loss,
                               model=model,
                               optim=optim)
    else:
        flag = save_best_model(curr_loss=curr_val_loss,
                               best_loss=best_val_loss,
                               model=model,
                               optim=optim)
    if flag:
        return curr_losses
    else:
        return best_losses


def save_best_model(curr_loss, best_loss, model, optim):
    if curr_loss <= best_loss:
        weights = {
            "model": model.state_dict(),
            "optim": optim.state_dict()
        }
        torch.save(weights, "best_model.pt")
        print("Model Updated...")
        print(f"Current best loss : {'%.6f' % curr_loss}")
        return True
    else:
        print("Model not updated...")
        print(f"Current best loss : {'%.6f' % best_loss}")
        return False


def save_training_curve(train_dict, val_dict):
    for name in val_dict.keys():
        plt.figure(figsize=(20, 6))
        plt.plot(train_dict[name], label=f"train_{name}")
        plt.plot(val_dict[name], label=f"val_{name}")
        plt.legend()
        plt.savefig(f"results/{name}.png")
        plt.close()


def update_metric_dict(true, pred, metric_dict, res_dict):
    if true.device != "cpu":
        true = true.cpu()
        pred = pred.cpu()
    true = true.detach().numpy().reshape(-1)
    pred = pred.detach().numpy().reshape(-1)
    for name, metric in metric_dict.items():
        res_dict[name].append(metric(true, pred))

    return res_dict
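
--------------------------------------------------------------------------------
Example: checkpoint round trip (illustrative; not a repo file)
--------------------------------------------------------------------------------
The checkpoint written by save_best_model is a plain dict holding the "model"
and "optim" state dicts, which is exactly what the two loaders expect. A hedged
round-trip sketch under those assumptions:

import torch
from src.models.lstm_model import LSTM

model = LSTM(num_classes=10, input_size=10, hidden_size=2, num_layers=1)
optim = torch.optim.Adam(model.parameters(), lr=0.0005)

# Save in the same {"model": ..., "optim": ...} layout used by save_best_model.
torch.save({"model": model.state_dict(), "optim": optim.state_dict()}, "best_model.pt")

# Restore later; map_location keeps this safe on CPU-only machines.
chkp = torch.load("best_model.pt", map_location="cpu")
model.load_state_dict(chkp["model"])
optim.load_state_dict(chkp["optim"])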
--------------------------------------------------------------------------------
/src/engine/pytorch_trainer.py:
--------------------------------------------------------------------------------
import yaml
import torch
import numpy as np
from src.utils.training import (init_objects,
                                save_best_model_on_loss,
                                save_training_curve,
                                update_metric_dict)
from src.utils.mlflow_logger import log_pt_models
from src.utils.load_checkpoint import load_best_model
from src.utils.metrics import metrics_


def trainer(model, train_dl, val_dl, loss_fn, optim, job_config):
    training_hp = job_config["training_hp"]
    epochs = training_hp["num_epochs"]
    log_index = training_hp["log_index"]
    best_train_loss = torch.inf
    best_val_loss = torch.inf
    train_loss_array, val_loss_array = [], []
    train_metric_results = {k: [] for k, _ in metrics_.items()}
    val_metric_results = {k: [] for k, _ in metrics_.items()}
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device found : {device}")
    model.to(device)
    print('Model loaded on device...')
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1} :")
        train_epoch_metric_results = {k: [] for k, _ in metrics_.items()}
        val_epoch_metric_results = {k: [] for k, _ in metrics_.items()}
        train_loss = val_loss = 0.0
        model.train()
        for index, (x, y) in enumerate(train_dl):
            if device != "cpu":
                x = x.cuda()
                y = y.cuda()
            op = model(x, device)
            train_epoch_metric_results = update_metric_dict(y, op, metrics_, train_epoch_metric_results)
            curr_loss = loss_fn(op, y)
            if index % log_index == 0:
                print(f"Step {index} Loss : {'%.6f' % curr_loss.item()}")
            train_loss += curr_loss.item()
            # Clear stale gradients before backprop so steps don't accumulate.
            optim.zero_grad()
            curr_loss.backward()
            optim.step()
        model.eval()
        with torch.no_grad():
            for x, y in val_dl:
                if device != "cpu":
                    x = x.cuda()
                    y = y.cuda()
                op = model(x, device)
                val_epoch_metric_results = update_metric_dict(y, op, metrics_, val_epoch_metric_results)
                curr_loss = loss_fn(op, y)
                val_loss += curr_loss.item()
        print(f"Train Loss : {'%.6f' % train_loss} || Validation Loss : {'%.6f' % val_loss}")
        train_res = {k: np.mean(v) for k, v in train_epoch_metric_results.items()}
        val_res = {k: np.mean(v) for k, v in val_epoch_metric_results.items()}
        print(f"Train Metric Results : {train_res}")
        print(f"Validation Metric Results : {val_res}")
        for tot, epo in zip([train_metric_results, val_metric_results],
                            [train_res, val_res]):
            for name, val in epo.items():
                tot[name].append(val)
        train_loss_array.append(train_loss)
        val_loss_array.append(val_loss)
        best_train_loss, best_val_loss = save_best_model_on_loss(curr_losses=(train_loss, val_loss),
                                                                 best_losses=(best_train_loss, best_val_loss),
                                                                 model=model,
                                                                 optim=optim)
        print("\n")
    results = {
        "training_mse": best_train_loss,
        "validation_mse": best_val_loss
    }
    best_model = load_best_model(job_config["model"])
    log_pt_models(model=best_model,
                  hparams=job_config,
                  results=results)
    train_metric_results["mse"] = train_loss_array
    val_metric_results["mse"] = val_loss_array
    save_training_curve(train_metric_results, val_metric_results)
    print("Training Completed...")


def main():
    job_config_path = "config/pt_training.yaml"
    with open(job_config_path, "r") as f:
        job_config = yaml.safe_load(f)
    print("Configuration Loaded...")
    print(job_config)
    (train_dl, val_dl), lstm_model, optim, loss_fn = init_objects(job_config)
    trainer(model=lstm_model,
            train_dl=train_dl,
            val_dl=val_dl,
            loss_fn=loss_fn,
            optim=optim,
            job_config=job_config)


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------