├── .gitignore ├── requirements.txt ├── src ├── models │ ├── models_classes.py │ ├── model_callbacks.py │ ├── lobcast_model.py │ ├── utils_models.py │ ├── mlp │ │ └── mlp.py │ ├── binctabl │ │ ├── binctabl.py │ │ └── base.py │ ├── cnn1 │ │ └── cnn1.py │ └── cnn2 │ │ └── cnn2.py ├── batch_experiments │ └── setup01.py ├── run_batch.py ├── run.py ├── metrics │ ├── metrics_log.py │ ├── metrics_learning.py │ └── report.py ├── data_preprocessing │ ├── dataModule.py │ ├── utils_dataset.py │ └── FI │ │ └── FIDataBuilder.py ├── hyper_parameters.py ├── constants.py ├── utils │ ├── ultils_run.py │ ├── utils_generic.py │ └── util_training.py ├── settings.py └── lobcast.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | /thrash/ 3 | /data/ 4 | /data/experiments/ 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argparse==1.1 2 | backtesting==0.3.3 3 | matplotlib==3.5.0 4 | numpy==1.18.5 5 | pandas==1.2.4 6 | plotly==4.14.3 7 | psutil==5.9.4 8 | pytorch_lightning==1.8.6 9 | scikit_learn==0.24.2 10 | seaborn==0.12.2 11 | torch==1.13.1 12 | tqdm==4.64.1 13 | wandb -------------------------------------------------------------------------------- /src/models/models_classes.py: -------------------------------------------------------------------------------- 1 | 2 | # MODELS 3 | import src.models.mlp.mlp as mlp 4 | import src.models.cnn1.cnn1 as cnn1 5 | import src.models.cnn2.cnn2 as cnn2 6 | import src.models.binctabl.binctabl as binctabl 7 | 8 | from enum import Enum 9 | 10 | 11 | class Models(Enum): 12 | MLP = mlp.MLP_lm 13 | CNN1 = cnn1.CNN_lm 14 | CNN2 = cnn2.CNN2_ml 15 | BINCTABL = binctabl.BinCTABL_ml 16 | # add new modules here 17 | -------------------------------------------------------------------------------- /src/models/model_callbacks.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | import src.constants as cst 3 | 4 | 5 | def callback_save_model(path, fname_root, metric, top_k=3): 6 | check_point_callback = pl.callbacks.ModelCheckpoint( 7 | monitor=metric, 8 | verbose=True, 9 | save_top_k=top_k, 10 | mode='max', 11 | dirpath=path, 12 | filename=fname_root + '_{epoch}-{' + metric + ':.2f}' 13 | ) 14 | return check_point_callback 15 | 16 | 17 | # TODO avoid early stopping 18 | -------------------------------------------------------------------------------- /src/models/lobcast_model.py: -------------------------------------------------------------------------------- 1 | 2 | import pytorch_lightning as pl 3 | from src.hyper_parameters import HPTunable 4 | 5 | 6 | class LOBCAST_module: 7 | def __init__(self, model, tunable_parameters=None): 8 | self.model = model 9 | self.tunable_parameters = tunable_parameters if tunable_parameters is not None else HPTunable() 10 | self.name = model.__class__.__name__ 11 | self.line_color = "red" 12 | self.line_shape = "-" 13 | 14 | 15 | class LOBCAST_model(pl.LightningModule): 16 | def __init__(self, input_dim, output_dim): 17 | super().__init__() 18 | self.input_dim = input_dim 19 | self.output_dim = output_dim 20 | -------------------------------------------------------------------------------- /src/batch_experiments/setup01.py: -------------------------------------------------------------------------------- 1 | 2 | from src.settings import SettingsExp 3 | import src.constants as 
cst 4 | 5 | # cartesian product of the tests 6 | INDEPENDENT_VARIABLES = { 7 | SettingsExp.SEED: [0], 8 | SettingsExp.PREDICTION_MODEL: [cst.Models.CNN1, cst.Models.CNN2], 9 | SettingsExp.PREDICTION_HORIZON_FUTURE: [10, 5], 10 | SettingsExp.PREDICTION_HORIZON_PAST: [1], 11 | SettingsExp.OBSERVATION_PERIOD: [100] 12 | } 13 | 14 | # no entries here = full cartesian product of INDEPENDENT_VARIABLES 15 | # an entry k: v keeps variable k fixed to value v while the other variables vary 16 | INDEPENDENT_VARIABLES_CONSTRAINTS = { 17 | SettingsExp.PREDICTION_MODEL: cst.Models.CNN1, # when the other variables vary, PREDICTION_MODEL = CNN1 18 | SettingsExp.PREDICTION_HORIZON_FUTURE: 5 19 | } 20 | -------------------------------------------------------------------------------- /src/run_batch.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import wandb 4 | from src.lobcast import LOBCAST 5 | from src.utils.ultils_run import grid_search_configurations, ExecutionPlan, wandb_init 6 | from src.settings import SettingsExp 7 | import src.constants as cst 8 | 9 | from src.batch_experiments import setup01 10 | from src.run import run_simulation 11 | 12 | 13 | def main(): 14 | sim = LOBCAST() 15 | 16 | # for multiple experiments 17 | ep = ExecutionPlan(setup01.INDEPENDENT_VARIABLES, 18 | setup01.INDEPENDENT_VARIABLES_CONSTRAINTS) 19 | 20 | setting_confs = ep.configurations() 21 | 22 | print("Running the following configurations:") 23 | print(setting_confs) 24 | 25 | for setting_conf in setting_confs: 26 | sim.update_settings(setting_conf) 27 | run_simulation(sim) 28 | print("done:", setting_conf) 29 | 30 | 31 | if __name__ == '__main__': 32 | main() 33 | 34 | 35 | # python -m src.run_batch 36 | -------------------------------------------------------------------------------- /src/models/utils_models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import src.constants as cst 4 | from src.utils.util_training import LOBCAST_NNEngine 5 | 6 | 7 | # MODELS 8 | from src.utils.utils_generic import get_class_arguments 9 | 10 | 11 | def get_tuned_parameters(sim, params): 12 | values = [sim.HP_TUNED.__getattribute__(p) for p in params] 13 | return values 14 | 15 | 16 | def pick_model(sim, data_module): 17 | loss_weights = None 18 | 19 | num_features = data_module.x_shape 20 | num_classes = data_module.num_classes 21 | 22 | args = get_class_arguments(sim.SETTINGS.PREDICTION_MODEL.value.model)[2:] 23 | args_values = get_tuned_parameters(sim, args) 24 | neural_architecture = sim.SETTINGS.PREDICTION_MODEL.value.model(num_features, num_classes, *args_values) 25 | 26 | engine = LOBCAST_NNEngine( 27 | neural_architecture, 28 | loss_weights, 29 | hps=sim.HP_TUNED, 30 | metrics_log=sim.METRICS, 31 | wandb_log=sim.WANDB_INSTANCE, 32 | ).to(sim.SETTINGS.DEVICE) 33 | 34 | return engine 35 | -------------------------------------------------------------------------------- /src/run.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import src.constants as cst 4 | import wandb 5 | from src.lobcast import LOBCAST 6 | from src.utils.ultils_run import grid_search_configurations, wandb_init 7 | from src.settings import SettingsExp 8 | 9 | 10 | def run_simulation(sim): 11 | if not sim.SETTINGS.IS_WANDB: 12 | # generates runs based on a grid search of the hyper params 13 | hparams_configs = grid_search_configurations(sim.HP_TUNABLE.__dict__) 14 | for hparams_config in hparams_configs: 15 | 
sim.update_hyper_parameters(hparams_config) 16 | sim.end_setup() 17 | sim.run() 18 | sim.evaluate() 19 | sim.close() 20 | else: 21 | # hyper params search is handled by wandb 22 | sweep_id, wandb_lunch = wandb_init(sim) 23 | wandb.agent(sweep_id, function=lambda: wandb_lunch(sim)) 24 | 25 | 26 | def main(): 27 | sim = LOBCAST() 28 | 29 | setting_conf = sim.parse_cl_arguments() 30 | sim.update_settings(setting_conf) 31 | run_simulation(sim) 32 | 33 | 34 | if __name__ == '__main__': 35 | main() 36 | 37 | 38 | # python -m src.run --PREDICTION_MODEL MLP 39 | -------------------------------------------------------------------------------- /src/metrics/metrics_log.py: -------------------------------------------------------------------------------- 1 | 2 | from src.utils.utils_generic import write_json, is_jsonable 3 | from collections import defaultdict 4 | 5 | 6 | class Metrics: 7 | def __init__(self, path, fname_root): 8 | self.metrics = defaultdict(dict) # dict logged every X epochs 9 | self.path = path 10 | self.fname_root = fname_root 11 | self.is_best_model = False 12 | 13 | def add_metric(self, epoch, dataset_type, eval_dict): 14 | self.metrics[dataset_type][epoch] = eval_dict 15 | 16 | def reset_stats(self): 17 | self.metrics = defaultdict(dict) 18 | 19 | def dump_info(self, settings, h_parameters): 20 | print("Dumping config at", self.path) 21 | merged = {**settings, **h_parameters} 22 | merged = {k: (v if is_jsonable(v) else str(v)) for k, v in merged.items()} # make string unserializable vals 23 | write_json(merged, self.path + self.fname_root + "_" + "config.json") 24 | return merged 25 | 26 | def dump_metrics(self, fname): 27 | print("Dumping metrics at", self.path) 28 | write_json(self.metrics, self.path + self.fname_root + "_" + fname) 29 | return self.metrics 30 | -------------------------------------------------------------------------------- /src/data_preprocessing/dataModule.py: -------------------------------------------------------------------------------- 1 | 2 | import pytorch_lightning as pl 3 | from torch.utils.data import DataLoader 4 | import src.constants as cst 5 | 6 | 7 | class DataModule(pl.LightningDataModule): 8 | """ Splits the datasets in TRAIN, VALIDATION_MODEL, TEST. 
""" 9 | 10 | def __init__(self, train_set, val_set, test_set, batch_size, device, is_shuffle_train=True): 11 | super().__init__() 12 | 13 | self.train_set = train_set 14 | self.val_set = val_set 15 | self.test_set = test_set 16 | 17 | self.batch_size = batch_size 18 | self.is_shuffle_train = is_shuffle_train 19 | 20 | self.x_shape = self.test_set.x_shape 21 | self.num_classes = cst.NUM_CLASSES 22 | self.pin_memory = True if device == 'cuda' else False 23 | 24 | def setup(self, stage=None): 25 | pass 26 | 27 | def train_dataloader(self): 28 | return DataLoader(self.train_set, batch_size=self.batch_size, shuffle=self.is_shuffle_train, pin_memory=self.pin_memory, drop_last=False) 29 | 30 | def val_dataloader(self): 31 | return DataLoader(self.val_set, batch_size=self.batch_size, shuffle=False, pin_memory=self.pin_memory, drop_last=False) 32 | 33 | def test_dataloader(self): 34 | return DataLoader(self.test_set, batch_size=self.batch_size, shuffle=False, pin_memory=self.pin_memory, drop_last=False) 35 | -------------------------------------------------------------------------------- /src/hyper_parameters.py: -------------------------------------------------------------------------------- 1 | 2 | from src.utils.utils_generic import dict_to_string 3 | 4 | 5 | class Hyperparameters: 6 | def add_hyperparameters(self, params: dict): 7 | for key, value in params.items(): 8 | self.__setattr__(key, value) 9 | 10 | def add_hyperparameter(self, key, value): 11 | self.__setattr__(key, value) 12 | 13 | def __repr__(self): 14 | return dict_to_string(self.__dict__) 15 | 16 | 17 | class HPTuned(Hyperparameters): 18 | """ Tuned hyperparameters of the models. Hyperparameters are assigned with their chosen value 19 | by an external scheduler (e.g. wandb grid search).""" 20 | 21 | def update_hyperparameter(self, hp, value): 22 | try: 23 | self.__getattribute__(hp) 24 | self.__setattr__(hp, value) 25 | 26 | except AttributeError: 27 | raise AttributeError(f"This class has no {hp} to set.") 28 | 29 | 30 | class HPTunable(Hyperparameters): 31 | """ Tunable hyperparameters of the models. Contains the domains of hyperparameters exploration. 
""" 32 | def __init__(self): 33 | self.BATCH_SIZE = {"values": [32, 64]} # {"min": 0.0001, "max": 0.1} or {"values": [11]} 34 | self.LEARNING_RATE = {"values": [0.0001, 0.001, 0.01]} # {"min": 0.0001, "max": 0.1} # {"min": 0.0001, "max": 0.1} 35 | self.OPTIMIZER = {"values": ["SGD"]} 36 | -------------------------------------------------------------------------------- /src/models/mlp/mlp.py: -------------------------------------------------------------------------------- 1 | 2 | # Using Deep Learning to Detect Price Change Indications in Financial Markets 3 | # Source: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8081663 4 | 5 | from torch import nn 6 | from src.models.lobcast_model import LOBCAST_model, LOBCAST_module 7 | from src.hyper_parameters import HPTunable 8 | 9 | 10 | class MLP(LOBCAST_model): 11 | def __init__( 12 | self, 13 | input_dim, 14 | output_dim, 15 | hidden_layer_dim, 16 | p_dropout 17 | ): 18 | super().__init__(input_dim, output_dim) 19 | 20 | flat_dims = self.input_dim[0] * self.input_dim[1] 21 | self.linear1 = nn.Linear(flat_dims, hidden_layer_dim) 22 | self.leakyReLU = nn.LeakyReLU() 23 | self.dropout = nn.Dropout(p=p_dropout) 24 | self.linear2 = nn.Linear(hidden_layer_dim, self.output_dim) 25 | 26 | def forward(self, x): 27 | # [batch_size x 40 x observation_length] 28 | x = x.view(x.size(0), -1).float() 29 | out = self.linear1(x) 30 | out = self.leakyReLU(out) 31 | out = self.dropout(out) 32 | out = self.linear2(out) 33 | return out 34 | 35 | 36 | class HP(HPTunable): 37 | def __init__(self): 38 | super().__init__() 39 | self.hidden_layer_dim = {"values": [128]} 40 | self.p_dropout = {"values": [.1, .5]} 41 | 42 | 43 | MLP_lm = LOBCAST_module(MLP, HP()) 44 | -------------------------------------------------------------------------------- /src/data_preprocessing/utils_dataset.py: -------------------------------------------------------------------------------- 1 | import src.constants as cst 2 | from src.data_preprocessing.dataModule import DataModule 3 | from src.data_preprocessing.FI.FIDataBuilder import FIDataset 4 | 5 | 6 | def prepare_data_fi(sim): 7 | fi_train, fi_val, fi_test = None, None, None 8 | 9 | if not sim.SETTINGS.IS_TEST_ONLY: 10 | fi_train = FIDataset( 11 | cst.DATASET_FI, 12 | dataset_type=cst.DatasetType.TRAIN, 13 | horizon=sim.SETTINGS.PREDICTION_HORIZON_FUTURE, 14 | observation_length=sim.SETTINGS.OBSERVATION_PERIOD, 15 | train_val_split=sim.SETTINGS.TRAIN_SET_PORTION, 16 | n_trends=sim.SETTINGS.N_TRENDS 17 | ) 18 | 19 | fi_val = FIDataset( 20 | cst.DATASET_FI, 21 | dataset_type=cst.DatasetType.VALIDATION, 22 | horizon=sim.SETTINGS.PREDICTION_HORIZON_FUTURE, 23 | observation_length=sim.SETTINGS.OBSERVATION_PERIOD, 24 | train_val_split=sim.SETTINGS.TRAIN_SET_PORTION, 25 | n_trends=sim.SETTINGS.N_TRENDS 26 | ) 27 | 28 | fi_test = FIDataset( 29 | cst.DATASET_FI, 30 | dataset_type=cst.DatasetType.TEST, 31 | observation_length=sim.SETTINGS.OBSERVATION_PERIOD, 32 | horizon=sim.SETTINGS.PREDICTION_HORIZON_FUTURE, 33 | train_val_split=sim.SETTINGS.TRAIN_SET_PORTION, 34 | n_trends=sim.SETTINGS.N_TRENDS 35 | ) 36 | 37 | fi_dm = DataModule( 38 | fi_train, fi_val, fi_test, 39 | sim.HP_TUNED.BATCH_SIZE, 40 | sim.SETTINGS.DEVICE, 41 | sim.SETTINGS.IS_SHUFFLE_TRAIN_SET 42 | ) 43 | return fi_dm 44 | 45 | 46 | def pick_dataset(sim): 47 | if sim.SETTINGS.DATASET_NAME == cst.DatasetFamily.FI: 48 | return prepare_data_fi(sim) 49 | else: 50 | raise ValueError(f"Unhandled dataset name: {sim.SETTINGS}") 51 | 
-------------------------------------------------------------------------------- /src/metrics/metrics_learning.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from sklearn.metrics import classification_report 4 | from sklearn.metrics import matthews_corrcoef 5 | from sklearn.metrics import cohen_kappa_score 6 | from sklearn.metrics import confusion_matrix 7 | 8 | import src.constants as cst 9 | import numpy as np 10 | 11 | 12 | def compute_metrics(truth, prediction, loss_vals): 13 | truth = torch.Tensor(truth) 14 | prediction = torch.Tensor(prediction) 15 | 16 | cr = classification_report(truth, prediction, output_dict=True, zero_division=0) 17 | accuracy = cr['accuracy'] # MICRO-F1 18 | 19 | f1score = cr['macro avg']['f1-score'] # MACRO-F1 20 | precision = cr['macro avg']['precision'] # MACRO-PRECISION 21 | recall = cr['macro avg']['recall'] # MACRO-RECALL 22 | 23 | f1score_w = cr['weighted avg']['f1-score'] # WEIGHTED-F1 24 | precision_w = cr['weighted avg']['precision'] # WEIGHTED-PRECISION 25 | recall_w = cr['weighted avg']['recall'] # WEIGHTED-RECALL 26 | 27 | mcc = matthews_corrcoef(truth, prediction) 28 | cok = cohen_kappa_score(truth, prediction) 29 | 30 | # y_actu = pd.Series(truth, name='actual') 31 | # y_pred = pd.Series(prediction, name='predicted') 32 | mat_confusion = confusion_matrix(truth, prediction) 33 | 34 | val_dict = { 35 | cst.Metrics.F1.value: float(f1score), 36 | cst.Metrics.F1_W.value: float(f1score_w), 37 | cst.Metrics.PRECISION.value: float(precision), 38 | cst.Metrics.PRECISION_W.value: float(precision_w), 39 | cst.Metrics.RECALL.value: float(recall), 40 | cst.Metrics.RECALL_W.value: float(recall_w), 41 | cst.Metrics.ACCURACY.value: float(accuracy), 42 | cst.Metrics.MCC.value: float(mcc), 43 | cst.Metrics.COK.value: float(cok), 44 | cst.Metrics.LOSS.value: float(np.sum(loss_vals)), 45 | cst.Metrics.CM.value: mat_confusion.tolist() 46 | } 47 | return val_dict 48 | -------------------------------------------------------------------------------- /src/models/binctabl/binctabl.py: -------------------------------------------------------------------------------- 1 | 2 | import torch.nn as nn 3 | import torch 4 | from src.models.binctabl.base import BiN, BL_layer, TABL_layer 5 | 6 | from src.models.lobcast_model import LOBCAST_model, LOBCAST_module 7 | from src.hyper_parameters import HPTunable 8 | 9 | 10 | class BinCTABL(LOBCAST_model): 11 | def __init__(self, 12 | input_dim, 13 | output_dim, 14 | d2, d1, t1, t2, d3, t3, d4, t4): 15 | super().__init__(input_dim, output_dim) 16 | 17 | self.BiN = BiN(d2, d1, t1, t2) 18 | self.BL = BL_layer(d2, d1, t1, t2) 19 | self.BL2 = BL_layer(d3, d2, t2, t3) 20 | self.TABL = TABL_layer(d4, d3, t3, t4) 21 | self.dropout = nn.Dropout(0.1) 22 | 23 | def forward(self, x): 24 | # first of all we pass the input to the BiN layer, then we use the C(TABL) architecture 25 | x = torch.permute(x, (0, 2, 1)) 26 | 27 | x = self.BiN(x) 28 | 29 | self.max_norm_(self.BL.W1.data) 30 | self.max_norm_(self.BL.W2.data) 31 | x = self.BL(x) 32 | x = self.dropout(x) 33 | 34 | self.max_norm_(self.BL2.W1.data) 35 | self.max_norm_(self.BL2.W2.data) 36 | x = self.BL2(x) 37 | x = self.dropout(x) 38 | 39 | self.max_norm_(self.TABL.W1.data) 40 | self.max_norm_(self.TABL.W.data) 41 | self.max_norm_(self.TABL.W2.data) 42 | x = self.TABL(x) 43 | x = torch.squeeze(x) 44 | x = torch.softmax(x, 1) 45 | return x 46 | 47 | def max_norm_(self, w): 48 | with torch.no_grad(): 49 | if (torch.linalg.matrix_norm(w) > 
10.0): 50 | norm = torch.linalg.matrix_norm(w) 51 | desired = torch.clamp(norm, min=0.0, max=10.0) 52 | w *= (desired / (1e-8 + norm)) 53 | 54 | 55 | class HP(HPTunable): 56 | def __init__(self): 57 | super().__init__() 58 | self.d1 = {"values": [40]} 59 | self.d2 = {"values": [60]} 60 | self.d3 = {"values": [120]} 61 | self.d4 = {"values": [3]} 62 | 63 | self.t1 = {"values": [10]} 64 | self.t2 = {"values": [10]} 65 | self.t3 = {"values": [5]} 66 | self.t4 = {"values": [1]} 67 | 68 | 69 | BinCTABL_ml = LOBCAST_module(BinCTABL, HP()) 70 | -------------------------------------------------------------------------------- /src/metrics/report.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import src.constants as cst 3 | from src.utils.utils_generic import read_json 4 | 5 | 6 | saved_metrics = [ 7 | cst.Metrics.F1.value, 8 | cst.Metrics.F1_W.value, 9 | cst.Metrics.PRECISION.value, 10 | cst.Metrics.PRECISION_W.value, 11 | cst.Metrics.RECALL.value, 12 | cst.Metrics.RECALL_W.value, 13 | cst.Metrics.ACCURACY.value, 14 | cst.Metrics.MCC.value, 15 | cst.Metrics.COK.value, 16 | cst.Metrics.LOSS.value, 17 | ] 18 | 19 | 20 | def plot_metric_training(json_data_path, metric, pdf): 21 | json_data = read_json(json_data_path) 22 | 23 | # Extract data 24 | data_train = json_data[cst.ModelSteps.TRAINING.value] 25 | epochs_train = sorted(map(int, data_train.keys())) 26 | metric_values_train = [data_train[str(epoch)][metric] for epoch in epochs_train] 27 | 28 | data_val = json_data[cst.ModelSteps.VALIDATION.value] 29 | epochs_val = sorted(map(int, data_val.keys())) 30 | metric_values_val = [data_val[str(epoch)][metric] for epoch in epochs_val] 31 | 32 | # Plotting 33 | plt.figure(figsize=(5, 5)) 34 | plt.plot(epochs_train, metric_values_train, label=cst.ModelSteps.TRAINING.value, marker='.') 35 | plt.plot(epochs_val, metric_values_val, label=cst.ModelSteps.VALIDATION.value, marker='.') 36 | 37 | plt.title(f'{metric.capitalize()} vs. Epochs') 38 | plt.xlabel('Epochs') 39 | plt.ylabel(metric.capitalize()) 40 | plt.legend() 41 | plt.grid(True, alpha=0.2) 42 | 43 | if metric not in [cst.Metrics.LOSS.value, cst.Metrics.CM.value]: 44 | plt.ylim(-0.05, 1.05) 45 | 46 | plt.tight_layout() 47 | pdf.savefig(plt.gcf()) 48 | plt.close() 49 | 50 | 51 | def plot_metric_best(json_data_path, metric, pdf): 52 | json_data = read_json(json_data_path) 53 | 54 | # Extract data 55 | data_test = json_data[cst.ModelSteps.TESTING.value] 56 | epochs_test = sorted(map(int, data_test.keys())) 57 | metric_values_test = [data_test[str(epoch)][metric] for epoch in epochs_test] 58 | 59 | data_val = json_data["validation"] 60 | epochs_val = sorted(map(int, data_val.keys())) 61 | metric_values_val = [data_val[str(epoch)][metric] for epoch in epochs_val] 62 | 63 | # Plotting 64 | plt.figure(figsize=(5, 5)) 65 | plt.bar([cst.ModelSteps.TESTING.value, cst.ModelSteps.VALIDATION.value], metric_values_test + metric_values_val, color=['blue', 'green']) 66 | 67 | plt.title(f'{metric.capitalize()} vs. 
Epochs') 68 | plt.xlabel('Epochs') 69 | plt.ylabel(metric.capitalize()) 70 | plt.grid(True, alpha=0.2) 71 | 72 | if metric not in [cst.Metrics.LOSS.value, cst.Metrics.CM.value]: 73 | plt.ylim(-0.05, 1.05) 74 | 75 | plt.tight_layout() 76 | pdf.savefig(plt.gcf()) 77 | plt.close() -------------------------------------------------------------------------------- /src/constants.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class Optimizers(Enum): 5 | ADAM = "Adam" 6 | RMSPROP = "RMSprop" 7 | SGD = "SGD" 8 | 9 | 10 | class Metrics(Enum): 11 | LOSS = 'loss' 12 | CM = 'cm' 13 | F1 = 'f1' 14 | F1_W = 'f1_w' 15 | 16 | PRECISION = 'precision' 17 | PRECISION_W = 'precision_w' 18 | 19 | RECALL = 'recall' 20 | RECALL_W = 'recall_w' 21 | 22 | ACCURACY = 'accuracy' 23 | MCC = 'mcc' 24 | COK = 'cohen-k' 25 | 26 | 27 | class ModelSteps(Enum): 28 | TRAINING = "training" 29 | VALIDATION = "validation" # final validation 30 | TESTING = "testing" 31 | 32 | 33 | VALIDATION_METRIC = "{}_{}".format(ModelSteps.VALIDATION.value, Metrics.F1.value) 34 | 35 | 36 | class NormalizationType(Enum): 37 | Z_SCORE = 0 38 | DYNAMIC = 1 39 | NONE = 2 40 | MINMAX = 3 41 | DECPRE = 4 42 | 43 | 44 | class FI_Horizons(Enum): 45 | K1 = 1 46 | K2 = 2 47 | K3 = 3 48 | K5 = 5 49 | K10 = 10 50 | 51 | 52 | class Predictions(Enum): 53 | DOWNWARD = 0 54 | STATIONARY = 1 55 | UPWARD = 2 56 | 57 | 58 | from src.models.models_classes import * 59 | # to use in the future 60 | 61 | 62 | class DatasetFamily(str, Enum): 63 | FI = "FI" 64 | LOB = "Lobster" 65 | META = "Meta" 66 | 67 | 68 | HORIZONS_MAPPINGS_FI = { 69 | 1: -5, 70 | 2: -4, 71 | 3: -3, 72 | 5: -2, 73 | 10: -1 74 | } 75 | 76 | HORIZONS_MAPPINGS_LOBSTER = { 77 | 10: -5, 78 | 20: -4, 79 | 30: -3, 80 | 50: -2, 81 | 100: -1 82 | } 83 | 84 | 85 | class OrderEvent(Enum): 86 | """ The possible kind of orders in the lob """ 87 | SUBMISSION = 1 88 | CANCELLATION = 2 89 | DELETION = 3 90 | EXECUTION = 4 91 | HIDDEN_EXECUTION = 5 92 | CROSS_TRADE = 6 93 | TRADING_HALT = 7 94 | OTHER = 8 95 | 96 | 97 | class DatasetType(Enum): 98 | TRAIN = "train" 99 | TEST = "test" 100 | VALIDATION = "val" 101 | 102 | 103 | DOWNLOAD_FI_COMMAND = ("wget --content-disposition \"https://download.fairdata.fi:443/download?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3MTEyMzAxODksImRhdGFzZXQiOiI3M2ViNDhkNy00ZGJjLTRhMTAtYTUyYS1kYTc0NWI0N2E2NDkiLCJwYWNrYWdlIjoiNzNlYjQ4ZDctNGRiYy00YTEwLWE1MmEtZGE3NDViNDdhNjQ5X2JoeXV4aWZqLnppcCIsImdlbmVyYXRlZF9ieSI6IjlmZGRmZmVlLWY4ZDItNDZkNS1hZmIwLWQyOTM0NzdlZjg2ZiIsInJhbmRvbV9zYWx0IjoiYjVkYzQxOTAifQ.bgDP51aFumRtPMbJUtUcjhpnu-O6nI6OYZlDbc3lrfQ\"") 104 | 105 | 106 | class ExpIndependentVariables(Enum): 107 | MODEL = 'model' 108 | K_FI = 'k' 109 | FORWARD_WIN = 'fw' 110 | BACKWARD_WIN = 'bw' 111 | 112 | 113 | N_LOB_LEVELS = 10 114 | NUM_CLASSES = 3 115 | 116 | PROJECT_NAME = "LOBCAST" 117 | VERSION = 2.0 118 | 119 | PROJECT_NAME_VERSION = f"{PROJECT_NAME}-v{VERSION}" 120 | DIR_EXPERIMENTS = f"data/experiments/{PROJECT_NAME_VERSION}" 121 | DIR_SAVED_MODEL = f"data/saved_models/{PROJECT_NAME_VERSION}" 122 | DATASET_FI = "data/datasets/FI-2010/BenchmarkDatasets/" 123 | 124 | METRICS_RUNNING_FILE_NAME = "metrics_train.json" 125 | METRICS_BEST_FILE_NAME = "metrics_best.json" 126 | WANDB_SWEEP_MAX_RUNS = 20 127 | 128 | 129 | class UnitHorizon(Enum): 130 | SECONDS = "seconds" 131 | HOURS = "hours" 132 | MINUTES = "minutes" 133 | DAYS = "days" 134 | EVENTS = "events" 135 | 
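136 | 137 | # Illustrative note on the horizon mappings above: FI-2010 stores the labels for horizons 138 | # k in {1, 2, 3, 5, 10} in the last five rows of its data matrix, so for example 139 | # HORIZONS_MAPPINGS_FI[5] # -> -2, the second-to-last row, read by FIDataset.__prepare_y as data[HORIZONS_MAPPINGS_FI[horizon], :]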
-------------------------------------------------------------------------------- /src/models/cnn1/cnn1.py: -------------------------------------------------------------------------------- 1 | # Forecasting Stock Prices from the Limit Order Book using Convolutional Neural Networks 2 | # Source: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8010701 3 | 4 | import pytorch_lightning as pl 5 | from torch import nn 6 | 7 | import src.models.lobcast_model 8 | from src.models.lobcast_model import LOBCAST_model, LOBCAST_module 9 | from src.hyper_parameters import HPTunable 10 | 11 | 12 | class CNN1(LOBCAST_model): 13 | 14 | def __init__(self, input_dim, output_dim): 15 | super().__init__(input_dim, output_dim) 16 | 17 | n_features = input_dim[1] 18 | 19 | # Convolution 1 20 | self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(4, n_features), padding=(3, 0), dilation=(2, 1)) 21 | self.relu1 = nn.LeakyReLU() 22 | 23 | # Convolution 2 24 | self.conv2 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=(4,)) 25 | self.relu2 = nn.LeakyReLU() 26 | 27 | # Max pool 1 28 | self.maxpool1 = nn.MaxPool1d(kernel_size=2) 29 | 30 | # Convolution 3 31 | self.conv3 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=(3,), padding=2) 32 | self.relu3 = nn.LeakyReLU() 33 | 34 | # Convolution 4 35 | self.conv4 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=(3,), padding=2) 36 | self.relu4 = nn.LeakyReLU() 37 | 38 | # Max pool 2 39 | self.maxpool2 = nn.MaxPool1d(kernel_size=2) 40 | 41 | # Fully connected 1 42 | self.fc1 = nn.Linear(26*32, 32) 43 | self.relu5 = nn.LeakyReLU() 44 | 45 | # Fully connected 2 46 | self.fc2 = nn.Linear(32, output_dim) 47 | 48 | def forward(self, x): 49 | # Adding the channel dimension 50 | x = x[:, None, :] # x.shape = [batch_size, 1, 100, 40] 51 | 52 | # print('x.shape:', x.shape) 53 | 54 | # Convolution 1 55 | out = self.conv1(x) 56 | out = self.relu1(out) 57 | out = out.reshape(out.shape[0], out.shape[1], -1) 58 | # print('After convolution1:', out.shape) 59 | 60 | # Convolution 2 61 | out = self.conv2(out) 62 | out = self.relu2(out) 63 | # print('After convolution2:', out.shape) 64 | 65 | # Max pool 1 66 | out = self.maxpool1(out) 67 | # print('After maxpool1:', out.shape) 68 | 69 | # Convolution 3 70 | out = self.conv3(out) 71 | out = self.relu3(out) 72 | # print('After convolution3:', out.shape) 73 | 74 | # Convolution 4 75 | out = self.conv4(out) 76 | out = self.relu4(out) 77 | # print('After convolution4:', out.shape) 78 | 79 | # Max pool 2 80 | out = self.maxpool2(out) 81 | # print('After maxcpool2:', out.shape) 82 | 83 | # flatten 84 | out = out.view(out.size(0), -1) 85 | # print('After flatten:', out.shape) 86 | 87 | # Linear function 1 88 | out = self.fc1(out) 89 | out = self.relu5(out) 90 | # print('After linear1:', out.shape) 91 | 92 | # Linear function (readout) 93 | out = self.fc2(out) 94 | # print('After linear2:', out.shape) 95 | 96 | return out 97 | 98 | 99 | CNN_lm = LOBCAST_module(CNN1) 100 | -------------------------------------------------------------------------------- /src/utils/ultils_run.py: -------------------------------------------------------------------------------- 1 | import wandb 2 | import src.constants as cst 3 | import itertools 4 | 5 | 6 | def wandb_init(sim): 7 | def wandb_lunch(sim): # runs multiple instances 8 | with wandb.init() as wandb_instance: 9 | sim.update_hyper_parameters(wandb_instance.config) 10 | sim.end_setup(wandb_instance) 11 | 12 | wandb_instance.log({k: str(v) for k, v in 
sim.SETTINGS.__dict__.items()}) 13 | sim.run() 14 | sim.evaluate() 15 | sim.close() 16 | 17 | sweep_id = wandb.sweep(project=cst.PROJECT_NAME_VERSION, sweep={ 18 | 'method': sim.SETTINGS.WANDB_SWEEP_METHOD, 19 | "metric": {"goal": "maximize", "name": cst.VALIDATION_METRIC}, 20 | 'parameters': sim.HP_TUNABLE.__dict__, 21 | 'description': str(sim.SETTINGS) + str(sim.HP_TUNABLE), 22 | }) 23 | return sweep_id, wandb_lunch 24 | 25 | 26 | def grid_search_configurations(tunable_variables, n_steps=3): 27 | """ Given a set of parameters to tune of the form 28 | 29 | { p1: {"values": [v1, v2, v3]}, 30 | p2: {"max": 1, "min": 0}, ... } 31 | 32 | returns the configurations associated with a grid search in the form: 33 | [ {p1:v1, p2:v1}, {p1:v1, v2}, ... ] 34 | """ 35 | all_domains = [] 36 | for name, domain in tunable_variables.items(): 37 | # continuous variable 38 | if 'min' in domain: 39 | step = (domain['max'] - domain['min']) / n_steps 40 | all_domains += [[domain['min'] + step * i for i in range(n_steps)]] 41 | print(f"Warning! Param {name} domain {domain} was discretized! In {n_steps} steps as {all_domains}.") 42 | 43 | # discrete variable 44 | elif 'values' in domain: 45 | all_domains += [domain['values']] 46 | configurations_tuples = itertools.product(*all_domains) 47 | 48 | # from tuples [(v1, v2, v3)] to [{p1: v1}, ...] 49 | configurations_dicts = [{k: v for k, v in zip(tunable_variables.keys(), selected_values)} for selected_values in configurations_tuples] 50 | return configurations_dicts 51 | 52 | 53 | class ExecutionPlan: 54 | def __init__(self, plan, constraints): 55 | self.plan = plan 56 | self.constraints = constraints 57 | 58 | def configurations(self): 59 | """ 60 | Generate configurations based on the execution plan and constraints. 61 | Returns: list: A list of dictionaries representing configurations for LOBCAST Settings, 62 | where keys are variable names and values are the corresponding values. 
63 | """ 64 | all_domains = [list(dom) for dom in self.plan.values()] 65 | configurations_attempts = list(itertools.product(*all_domains)) 66 | 67 | chosen_configurations = set(configurations_attempts) 68 | if len(self.constraints) > 0: 69 | chosen_configurations = set() 70 | for fixed_var, fixed_value in self.constraints.items(): 71 | for configuration in configurations_attempts: 72 | vf_index = list(self.plan.keys()).index(fixed_var) 73 | if configuration[vf_index] == fixed_value: 74 | chosen_configurations |= {configuration} 75 | 76 | out_con = [] 77 | for co_tup in chosen_configurations: 78 | co_dic = {k.value: co_tup[i] for i, k in enumerate(self.plan.keys())} 79 | out_con.append(co_dic) 80 | return out_con 81 | -------------------------------------------------------------------------------- /src/models/cnn2/cnn2.py: -------------------------------------------------------------------------------- 1 | # Using Deep Learning for price prediction by exploiting stationary limit order book features 2 | # Source: https://www.sciencedirect.com/science/article/pii/S1568494620303410 3 | 4 | import pytorch_lightning as pl 5 | from torch import nn 6 | 7 | from src.models.lobcast_model import LOBCAST_model, LOBCAST_module 8 | from src.hyper_parameters import HPTunable 9 | 10 | 11 | class CNN2(LOBCAST_model): 12 | def __init__(self, input_dim, output_dim): 13 | super().__init__(input_dim, output_dim) 14 | 15 | # Convolution 1 16 | self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(10, 42), padding=(0, 2)) 17 | self.bn1 = nn.BatchNorm2d(16) 18 | self.prelu1 = nn.PReLU() 19 | 20 | # Convolution 2 21 | self.conv2 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=(10,)) # 3 22 | self.bn2 = nn.BatchNorm1d(16) 23 | self.prelu2 = nn.PReLU() 24 | 25 | # Convolution 3 26 | self.conv3 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=(8,)) # 1 27 | self.bn3 = nn.BatchNorm1d(32) 28 | self.prelu3 = nn.PReLU() 29 | 30 | # Convolution 4 31 | self.conv4 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=(6,)) # 1 32 | self.bn4 = nn.BatchNorm1d(32) 33 | self.prelu4 = nn.PReLU() 34 | 35 | # Convolution 5 36 | self.conv5 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=(4,)) # 1 37 | self.bn5 = nn.BatchNorm1d(32) 38 | self.prelu5 = nn.PReLU() 39 | 40 | # Fully connected 1 41 | self.fc1 = nn.Linear(249 * 32, 32) 42 | self.prelu6 = nn.PReLU() 43 | 44 | # Fully connected 2 45 | self.fc2 = nn.Linear(32, output_dim) 46 | 47 | def forward(self, x): 48 | # Adding the channel dimension 49 | x = x[:, None, :] # x.shape = [batch_size, 1, 100, 40] 50 | 51 | # print('x.shape:', x.shape) 52 | 53 | # Convolution 1 54 | out = self.conv1(x) 55 | # print('After convolution1:', out.shape) 56 | 57 | out = self.bn1(out) 58 | # print('After bn1:', out.shape) 59 | 60 | out = self.prelu1(out) 61 | out = out.reshape(out.shape[0], out.shape[1], -1) 62 | # print('After prelu1:', out.shape) 63 | 64 | # Convolution 2 65 | out = self.conv2(out) 66 | out = self.bn2(out) 67 | out = self.prelu2(out) 68 | # print('After convolution2, bn2, prelu2:', out.shape) 69 | 70 | # Convolution 3 71 | out = self.conv3(out) 72 | out = self.bn3(out) 73 | out = self.prelu3(out) 74 | # print('After convolution3, bn3, prelu3:', out.shape) 75 | 76 | # Convolution 4 77 | out = self.conv4(out) 78 | out = self.bn4(out) 79 | out = self.prelu4(out) 80 | # print('After convolution4, bn4, prelu4:', out.shape) 81 | 82 | # Convolution 5 83 | out = self.conv5(out) 84 | out = self.bn5(out) 85 | out = self.prelu5(out) 86 | # 
print('After convolution5, bn5, prelu5:', out.shape) 87 | 88 | # flatten 89 | out = out.view(out.size(0), -1) 90 | # print('After flatten:', out.shape) 91 | 92 | # Linear function 1 93 | out = self.fc1(out) 94 | out = self.prelu6(out) 95 | # print('After fc1:', out.shape) 96 | 97 | # Linear function (readout) 98 | out = self.fc2(out) 99 | # print('After fc2:', out.shape) 100 | 101 | return out 102 | 103 | 104 | CNN2_ml = LOBCAST_module(CNN2, tunable_parameters=None) 105 | -------------------------------------------------------------------------------- /src/settings.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import numpy as np 4 | import torch 5 | import src.constants as cst 6 | import multiprocessing 7 | 8 | import src.settings 9 | 10 | np.set_printoptions(suppress=True) 11 | from src.utils.utils_generic import dict_to_string 12 | from enum import Enum 13 | 14 | 15 | class SettingsExp(Enum): 16 | SEED = "SEED" 17 | PREDICTION_MODEL = "PREDICTION_MODEL" 18 | PREDICTION_HORIZON_FUTURE = "PREDICTION_HORIZON_FUTURE" 19 | PREDICTION_HORIZON_PAST = "PREDICTION_HORIZON_PAST" 20 | OBSERVATION_PERIOD = "OBSERVATION_PERIOD" 21 | 22 | 23 | class Settings: 24 | """ A class with all the settings of the simulations. Settings are set at runtime from command line. """ 25 | def __init__(self): 26 | 27 | self.SEED: int = 0 28 | """ The random seed of the simulation. """ 29 | 30 | self.DATASET_NAME: cst.DatasetFamily = cst.DatasetFamily.FI 31 | """ Name of the dataset to run tests on. """ 32 | 33 | self.N_TRENDS = 3 34 | """ The number of trends to use for predictions. """ 35 | 36 | self.PREDICTION_MODEL = cst.Models.MLP 37 | self.PREDICTION_HORIZON_UNIT: cst.UnitHorizon = cst.UnitHorizon.EVENTS 38 | """ The time unit for time series discretization. """ 39 | 40 | self.PREDICTION_HORIZON_FUTURE: int = 5 41 | self.PREDICTION_HORIZON_PAST: int = 1 42 | self.OBSERVATION_PERIOD: int = 100 43 | self.IS_SHUFFLE_TRAIN_SET = True 44 | 45 | self.EPOCHS_UB = 30 46 | """ The number of training epochs. """ 47 | 48 | self.TRAIN_SET_PORTION = .8 49 | self.VALIDATION_EVERY = 1 50 | 51 | self.IS_TEST_ONLY = False 52 | """ Whether or not to run the simulation in test mode. If True, no train or validation are performed. """ 53 | 54 | self.TEST_MODEL_PATH: str = "data/saved_models/LOBCAST-(15-03-2024_20-23-49)/epoch=2-validation_f1=0.27.ckpt" 55 | """ The path to the model to test. """ 56 | 57 | self.DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' 58 | self.N_GPUs = None if self.DEVICE == 'cpu' else torch.cuda.device_count() 59 | self.N_CPUs = multiprocessing.cpu_count() 60 | 61 | self.DIR_EXPERIMENTS = "" 62 | self.IS_WANDB = True 63 | self.WANDB_SWEEP_METHOD = 'grid' 64 | """ Whether or not to use wandb. """ 65 | 66 | self.IS_SANITY_CHECK = False 67 | """ Whether or not to use sanity checks. """ 68 | 69 | def check_parameters_validity(self): 70 | """ Checks if the parameters set at runtime are valid. 
""" 71 | CONSTRAINTS = [] 72 | c1 = (not self.IS_TEST_ONLY or os.path.exists(self.TEST_MODEL_PATH), "If IS_TEST_ONLY, then test model should exist.") 73 | 74 | c2 = (not self.DATASET_NAME == cst.DatasetFamily.FI or self.PREDICTION_HORIZON_UNIT == cst.UnitHorizon.EVENTS, 75 | f"FI-2010 Dataset can handle only event based granularity, {self.PREDICTION_HORIZON_UNIT} given.") 76 | 77 | c3 = (not self.DATASET_NAME == cst.DatasetFamily.FI or self.PREDICTION_HORIZON_PAST == 1, 78 | f"FI-2010 Dataset can handle only 1 event in the past horizon, {self.PREDICTION_HORIZON_PAST} given.") 79 | 80 | c4 = (not self.DATASET_NAME == cst.DatasetFamily.FI or self.PREDICTION_HORIZON_FUTURE in [1, 2, 3, 5, 10], 81 | f"FI-2010 Dataset can handle only {1, 2, 3, 5, 10} events in the future horizon, {self.PREDICTION_HORIZON_FUTURE} given.") 82 | 83 | c5 = (not self.DATASET_NAME == cst.DatasetFamily.FI or self.N_TRENDS == 3, 84 | f"FI-2010 Dataset can handle only 3 trends, {self.N_TRENDS} given.") 85 | 86 | c6 = (not self.PREDICTION_MODEL == cst.Models.BINCTABL or self.OBSERVATION_PERIOD == 10, 87 | f"At the moment, BINCTABL only allows OBSERVATION_PERIOD = 10, {self.OBSERVATION_PERIOD} given.") 88 | 89 | CONSTRAINTS += [c1, c2, c3, c4, c5, c6] 90 | for constrain, description in CONSTRAINTS: 91 | if not constrain: 92 | raise ValueError(f"Constraint not met! {description} Check your parameters.") 93 | 94 | def __repr__(self): 95 | return dict_to_string(self.__dict__) 96 | -------------------------------------------------------------------------------- /src/utils/utils_generic.py: -------------------------------------------------------------------------------- 1 | 2 | import pickle 3 | import os 4 | import json 5 | import platform, socket, re, uuid, psutil, logging 6 | import matplotlib.pyplot as plt 7 | 8 | import inspect 9 | 10 | 11 | def get_class_arguments(obj): 12 | signature = inspect.signature(obj) 13 | parameters = signature.parameters 14 | arguments = [param.name for param in parameters.values()] # list of arguments 15 | return arguments 16 | 17 | import src.constants as cst 18 | 19 | 20 | def read_data(fname): 21 | with open(fname, 'rb') as handle: 22 | out_df = pickle.load(handle) 23 | return out_df 24 | 25 | 26 | def write_data(data, path, fname): 27 | with open(path + fname, 'wb') as handle: 28 | os.makedirs(path, exist_ok=True) 29 | pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL) 30 | 31 | 32 | def write_json(msg, fname): 33 | with open(fname, 'w') as fp: 34 | json.dump(msg, fp) 35 | 36 | 37 | def read_json(fname): 38 | data = None 39 | if os.path.exists(fname): 40 | with open(fname, 'r') as fp: 41 | data = json.load(fp) 42 | else: 43 | print("File", fname, "does not exist.") 44 | return data 45 | 46 | 47 | def is_jsonable(x): 48 | try: 49 | json.dumps(x) 50 | return True 51 | except (TypeError, OverflowError): 52 | return False 53 | 54 | 55 | def get_sys_info(): 56 | info = dict() 57 | info['platform'] = platform.system() 58 | info['platform-release'] = platform.release() 59 | info['platform-version'] = platform.version() 60 | info['architecture'] = platform.machine() 61 | info['hostname'] = socket.gethostname() 62 | info['ip-address'] = socket.gethostbyname(socket.gethostname()) 63 | info['mac-address'] = ':'.join(re.findall('..', '%012x' % uuid.getnode())) 64 | info['processor'] = platform.processor() 65 | info['ram'] = str(round(psutil.virtual_memory().total / (1024.0 **3)))+" GB" 66 | print(info) 67 | 68 | 69 | def get_sys_mac(): 70 | return ':'.join(re.findall('..', '%012x' % 
uuid.getnode())) 71 | 72 | 73 | def get_index_from_window(config): 74 | if config.DATASET_NAME == cst.DatasetFamily.FI: 75 | return cst.HORIZONS_MAPPINGS_FI[config.HYPER_PARAMETERS[cst.LearningHyperParameter.FI_HORIZON]] 76 | elif config.DATASET_NAME == cst.DatasetFamily.LOB: 77 | return cst.HORIZONS_MAPPINGS_LOBSTER[config.HYPER_PARAMETERS[cst.LearningHyperParameter.FORWARD_WINDOW.value]] 78 | 79 | 80 | def sample_color(index, cmap='tab10'): 81 | # 1. Choose your desired colormap 82 | cmap = plt.get_cmap(cmap) 83 | 84 | # 2. Segmenting the whole range (from 0 to 1) of the color map into multiple segments 85 | colors = [cmap(x) for x in range(cmap.N)] 86 | assert index < cmap.N 87 | 88 | # 3. Color the i-th line with the i-th color, i.e. slicedCM[i] 89 | color = colors[index] 90 | return color 91 | 92 | 93 | def sample_marker(index): 94 | MARKERS = ["s", "p", "P", "*", "h", "H", "+", "x", "X", "D", "d", "|", "_", 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ".", ",", "o", "v", "^", "<", ">", "1", "2", "3", "4", "8"] 95 | return MARKERS[index] 96 | 97 | 98 | def sample_pattern(index): 99 | MARKERS = ['/', '\\', '|', '-', '+', 'x', 'o', 'O', '.', '*'] + ['/o', '\\|', '|*', '-\\', '+o', 'x*', 'o-', 'O|', 'O.', '*-'] 100 | return MARKERS[index] 101 | 102 | 103 | def sample_line(index): 104 | MARKERS = ['-', '--', '-.', ':', 'None', ' ', '', 'solid', 'dashed', 'dashdot', 'dotted', 'loosely dotted', 'densely dotted', 'loosely dashed', 'densely dashed', 'loosely dashdotted', 'densely dashdotted', 'loosely dashdotdotted', 'dashdotdotted', 'densely dashdotdotted'] 105 | return MARKERS[index] 106 | 107 | 108 | def make_dir(path): 109 | if not os.path.exists(path): 110 | os.makedirs(path) 111 | 112 | 113 | def dict_to_string(dictionary): 114 | rep = "" 115 | for key, value in dictionary.items(): 116 | rep += f"{key}: {value}\n" 117 | return rep 118 | 119 | 120 | def str_to_bool(string: str): 121 | string = string.lower() 122 | return string in ['true', '1', 't', 'y', 'yes'] 123 | 124 | 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /src/utils/util_training.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import pytorch_lightning as pl 4 | 5 | import torch 6 | import numpy as np 7 | import torch.nn as nn 8 | import src.constants as cst 9 | from src.metrics.metrics_learning import compute_metrics 10 | 11 | 12 | class LOBCAST_NNEngine(pl.LightningModule): 13 | def __init__(self, neural_architecture, loss_weights, hps, metrics_log, wandb_log): 14 | super().__init__() 15 | self.neural_architecture = neural_architecture 16 | self.loss_weights = loss_weights 17 | self.hps = hps 18 | self.metrics_log = metrics_log 19 | self.wandb_log = wandb_log 20 | 21 | def log_wandb(self, metrics): 22 | if self.wandb_log: 23 | self.wandb_log.log(metrics) 24 | 25 | def forward(self, batch): 26 | # time x features - 40 x 100 in general 27 | out = self.neural_architecture(batch) 28 | logits = nn.Softmax(dim=1)(out) # todo check if within model 29 | return out, logits 30 | 31 | def training_step(self, batch, batch_idx): 32 | prediction_ind, y, loss_val, logits = self.make_predictions(batch) 33 | return {"loss": loss_val, "other": (prediction_ind, y, loss_val, logits)} 34 | 35 | def validation_step(self, batch, batch_idx): 36 | prediction_ind, y, loss_val, logits = self.make_predictions(batch) 37 | return prediction_ind, y, loss_val, logits 38 | 39 | def test_step(self, batch, batch_idx): 40 | prediction_ind, y, 
loss_val, logits = self.make_predictions(batch) 41 | return prediction_ind, y, loss_val, logits 42 | 43 | def make_predictions(self, batch): 44 | x, y = batch 45 | out, logits = self(x) 46 | loss_val = nn.CrossEntropyLoss(self.loss_weights)(out, y) 47 | 48 | # deriving prediction from softmax probs 49 | prediction_ind = torch.argmax(logits, dim=1) # B 50 | return prediction_ind, y, loss_val, logits 51 | 52 | def predict_step(self, batch, batch_idx, dataloader_idx=0): 53 | x, _ = batch 54 | t0 = time.time() 55 | self(x) 56 | torch.cuda.current_stream().synchronize() 57 | t1 = time.time() 58 | elapsed = t1 - t0 59 | print("Inference for the model:", elapsed, "ms") 60 | return elapsed 61 | 62 | def evaluate_classifier(self, stp_type, step_outputs): 63 | preds, truths, loss_vals, logits = self.__get_prediction_vectors(step_outputs) 64 | eval_dict = compute_metrics(truths, preds, loss_vals) 65 | 66 | var_name = "{}_{}".format(stp_type, cst.Metrics.LOSS.value) 67 | self.log(var_name, eval_dict[cst.Metrics.LOSS.value], prog_bar=True) 68 | 69 | var_name = "{}_{}".format(stp_type, cst.Metrics.F1.value) 70 | self.log(var_name, eval_dict[cst.Metrics.F1.value], prog_bar=True) 71 | 72 | path = cst.METRICS_BEST_FILE_NAME if self.metrics_log.is_best_model else cst.METRICS_RUNNING_FILE_NAME 73 | 74 | print("\n") 75 | print(f"END epoch {self.current_epoch} ({stp_type})") 76 | print("Logging stats...") 77 | self.metrics_log.add_metric(self.current_epoch, stp_type, eval_dict) 78 | self.metrics_log.dump_metrics(path) 79 | self.log_wandb({f"{stp_type}_{k}": v for k, v in eval_dict.items()}) 80 | print("Done.") 81 | 82 | def training_epoch_end(self, training_step_outputs): 83 | training_step_outputs = [batch["other"] for batch in training_step_outputs] 84 | self.evaluate_classifier(cst.ModelSteps.TRAINING.value, training_step_outputs) 85 | 86 | def validation_epoch_end(self, validation_step_outputs): 87 | self.evaluate_classifier(cst.ModelSteps.VALIDATION.value, validation_step_outputs) 88 | 89 | def test_epoch_end(self, test_step_outputs): 90 | self.evaluate_classifier(cst.ModelSteps.TESTING.value, test_step_outputs) 91 | 92 | def __get_prediction_vectors(self, model_output): 93 | """ Accumulates the models output after each validation and testing epoch end. 
""" 94 | 95 | preds, truths, losses, logits = [], [], [], [] 96 | for preds_b, y_b, loss_val, logits_b in model_output: 97 | preds += preds_b.tolist() 98 | truths += y_b.tolist() 99 | logits += logits_b.tolist() 100 | losses += [loss_val.item()] # loss is single per batch 101 | 102 | preds = np.array(preds) 103 | truths = np.array(truths) 104 | logits = np.array(logits) 105 | losses = np.array(losses) 106 | 107 | return preds, truths, losses, logits 108 | 109 | def configure_optimizers(self): 110 | if self.hps.OPTIMIZER == "SGD": 111 | return torch.optim.SGD(self.parameters(), lr=self.hps.LEARNING_RATE) 112 | elif self.hps.OPTIMIZER == "ADAM": 113 | return torch.optim.Adam(self.parameters(), lr=self.hps.LEARNING_RATE) 114 | elif self.hps.OPTIMIZER == "RMSPROP": 115 | return torch.optim.RMSprop(self.parameters(), lr=self.hps.LEARNING_RATE) 116 | -------------------------------------------------------------------------------- /src/models/binctabl/base.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class BiN(nn.Module): 6 | def __init__(self, d2, d1, t1, t2): 7 | super().__init__() 8 | self.t1 = t1 9 | self.d1 = d1 10 | self.t2 = t2 11 | self.d2 = d2 12 | 13 | bias1 = torch.Tensor(t1, 1) 14 | self.B1 = nn.Parameter(bias1) 15 | nn.init.constant_(self.B1, 0) 16 | 17 | l1 = torch.Tensor(t1, 1) 18 | self.l1 = nn.Parameter(l1) 19 | nn.init.xavier_normal_(self.l1) 20 | 21 | bias2 = torch.Tensor(d1, 1) 22 | self.B2 = nn.Parameter(bias2) 23 | nn.init.constant_(self.B2, 0) 24 | 25 | l2 = torch.Tensor(d1, 1) 26 | self.l2 = nn.Parameter(l2) 27 | nn.init.xavier_normal_(self.l2) 28 | 29 | y1 = torch.Tensor(1, ) 30 | self.y1 = nn.Parameter(y1) 31 | nn.init.constant_(self.y1, 0.5) 32 | 33 | y2 = torch.Tensor(1, ) 34 | self.y2 = nn.Parameter(y2) 35 | nn.init.constant_(self.y2, 0.5) 36 | 37 | def forward(self, x): 38 | 39 | # if the two scalars are negative then we setting them to 0 40 | if (self.y1[0] < 0): 41 | y1 = torch.cuda.FloatTensor(1, ) 42 | self.y1 = nn.Parameter(y1) 43 | nn.init.constant_(self.y1, 0.01) 44 | 45 | if (self.y2[0] < 0): 46 | y2 = torch.cuda.FloatTensor(1, ) 47 | self.y2 = nn.Parameter(y2) 48 | nn.init.constant_(self.y2, 0.01) 49 | 50 | # normalization along the temporal dimensione 51 | T2 = torch.ones([self.t1, 1]) 52 | x2 = torch.mean(x, dim=2) 53 | x2 = torch.reshape(x2, (x2.shape[0], x2.shape[1], 1)) 54 | 55 | std = torch.std(x, dim=2) 56 | std = torch.reshape(std, (std.shape[0], std.shape[1], 1)) 57 | # it can be possible that the std of some temporal slices is 0, and this produces inf values, so we have to set them to one 58 | std[std < 1e-4] = 1 59 | 60 | diff = x - (x2 @ (T2.T)) 61 | Z2 = diff / (std @ (T2.T)) 62 | 63 | X2 = self.l2 @ T2.T 64 | X2 = X2 * Z2 65 | X2 = X2 + (self.B2 @ T2.T) 66 | 67 | # normalization along the feature dimension 68 | T1 = torch.ones([self.d1, 1]) 69 | x1 = torch.mean(x, dim=1) 70 | x1 = torch.reshape(x1, (x1.shape[0], x1.shape[1], 1)) 71 | 72 | std = torch.std(x, dim=1) 73 | std = torch.reshape(std, (std.shape[0], std.shape[1], 1)) 74 | 75 | op1 = x1 @ T1.T 76 | op1 = torch.permute(op1, (0, 2, 1)) 77 | 78 | op2 = std @ T1.T 79 | op2 = torch.permute(op2, (0, 2, 1)) 80 | 81 | z1 = (x - op1) / (op2) 82 | X1 = (T1 @ self.l1.T) 83 | X1 = X1 * z1 84 | X1 = X1 + (T1 @ self.B1.T) 85 | 86 | # weighing the imporance of temporal and feature normalization 87 | x = self.y1 * X1 + self.y2 * X2 88 | 89 | return x 90 | 91 | 92 | class TABL_layer(nn.Module): 93 | def 
__init__(self, d2, d1, t1, t2): 94 | super().__init__() 95 | self.t1 = t1 96 | 97 | weight = torch.Tensor(d2, d1) 98 | self.W1 = nn.Parameter(weight) 99 | nn.init.kaiming_uniform_(self.W1, nonlinearity='relu') 100 | 101 | weight2 = torch.Tensor(t1, t1) 102 | self.W = nn.Parameter(weight2) 103 | nn.init.constant_(self.W, 1 / t1) 104 | 105 | weight3 = torch.Tensor(t1, t2) 106 | self.W2 = nn.Parameter(weight3) 107 | nn.init.kaiming_uniform_(self.W2, nonlinearity='relu') 108 | 109 | bias1 = torch.Tensor(d2, t2) 110 | self.B = nn.Parameter(bias1) 111 | nn.init.constant_(self.B, 0) 112 | 113 | l = torch.Tensor(1, ) 114 | self.l = nn.Parameter(l) 115 | nn.init.constant_(self.l, 0.5) 116 | 117 | self.activation = nn.ReLU() 118 | 119 | def forward(self, X): 120 | 121 | # maintaining the weight parameter between 0 and 1. 122 | if (self.l[0] < 0): 123 | l = torch.Tensor(1, ) 124 | self.l = nn.Parameter(l) 125 | nn.init.constant_(self.l, 0.0) 126 | 127 | if (self.l[0] > 1): 128 | l = torch.Tensor(1, ) 129 | self.l = nn.Parameter(l) 130 | nn.init.constant_(self.l, 1.0) 131 | 132 | # modelling the dependence along the first mode of X while keeping the temporal order intact (7) 133 | X = self.W1 @ X 134 | 135 | # enforcing constant (1) on the diagonal 136 | W = self.W - self.W * torch.eye(self.t1, dtype=torch.float32) + torch.eye(self.t1, dtype=torch.float32) / self.t1 137 | 138 | # attention, the aim of the second step is to learn how important the temporal instances are to each other (8) 139 | E = X @ W 140 | 141 | # computing the attention mask (9) 142 | A = torch.softmax(E, dim=-1) 143 | 144 | # applying a soft attention mechanism (10) 145 | # he attention mask A obtained from the third step is used to zero out the effect of unimportant elements 146 | X = self.l[0] * (X) + (1.0 - self.l[0]) * X * A 147 | 148 | # the final step of the proposed layer estimates the temporal mapping W2, after the bias shift (11) 149 | y = X @ self.W2 + self.B 150 | return y 151 | 152 | 153 | class BL_layer(nn.Module): 154 | def __init__(self, d2, d1, t1, t2): 155 | super().__init__() 156 | weight1 = torch.Tensor(d2, d1) 157 | self.W1 = nn.Parameter(weight1) 158 | nn.init.kaiming_uniform_(self.W1, nonlinearity='relu') 159 | 160 | weight2 = torch.Tensor(t1, t2) 161 | self.W2 = nn.Parameter(weight2) 162 | nn.init.kaiming_uniform_(self.W2, nonlinearity='relu') 163 | 164 | bias1 = torch.zeros((d2, t2)) 165 | self.B = nn.Parameter(bias1) 166 | nn.init.constant_(self.B, 0) 167 | 168 | self.activation = nn.ReLU() 169 | 170 | def forward(self, x): 171 | 172 | x = self.activation(self.W1 @ x @ self.W2 + self.B) 173 | 174 | return x 175 | -------------------------------------------------------------------------------- /src/data_preprocessing/FI/FIDataBuilder.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import os.path 3 | from collections import Counter 4 | 5 | import src.constants as cst 6 | import numpy as np 7 | import tqdm 8 | import torch 9 | from pprint import pprint 10 | from torch.utils import data 11 | 12 | 13 | class FIDataset(data.Dataset): 14 | def __init__( 15 | self, 16 | dataset_path, 17 | dataset_type, 18 | horizon, 19 | observation_length, 20 | train_val_split, 21 | n_trends, 22 | auction=False, 23 | normalization_type=cst.NormalizationType.Z_SCORE, 24 | ): 25 | assert horizon in [1, 2, 3, 5, 10] 26 | 27 | self.fi_data_dir = dataset_path 28 | self.dataset_type = dataset_type 29 | self.train_val_split = train_val_split 30 | self.auction = auction 31 | 
self.normalization_type = normalization_type 32 | self.horizon = horizon 33 | self.observation_length = observation_length 34 | self.num_classes = n_trends 35 | 36 | # KEY call, generates the dataset 37 | self.data, self.samples_X, self.samples_y = None, None, None 38 | self.__prepare_dataset() 39 | 40 | _, occs = self.__class_balancing(self.samples_y) 41 | # LOSS_WEIGHTS_DICT = {m: 1e6 for m in cst.Models} 42 | LOSS_WEIGHT = 1e6 43 | self.loss_weights = torch.Tensor(LOSS_WEIGHT / occs) 44 | 45 | self.samples_X = torch.from_numpy(self.samples_X).type(torch.FloatTensor) # torch.Size([203800, 40]) 46 | self.samples_y = torch.from_numpy(self.samples_y).type(torch.LongTensor) # torch.Size([203800]) 47 | self.x_shape = (self.observation_length, self.samples_X.shape[1]) # shape of a single sample 48 | 49 | def __len__(self): 50 | """ Denotes the total number of samples. """ 51 | return self.samples_X.shape[0] - self.observation_length 52 | 53 | def __getitem__(self, index): 54 | """ Generates samples of data. """ 55 | sample = self.samples_X[index: index + self.observation_length], self.samples_y[index + self.observation_length - 1] 56 | return sample 57 | 58 | @staticmethod 59 | def __class_balancing(y): 60 | ys_occurrences = collections.Counter(y) 61 | occs = np.array([ys_occurrences[k] for k in sorted(ys_occurrences)]) 62 | return ys_occurrences, occs 63 | 64 | def __parse_dataset(self): 65 | """ Reads the dataset from the FI files. """ 66 | 67 | AUCTION = 'Auction' if self.auction else 'NoAuction' 68 | N = '1.' if self.normalization_type == cst.NormalizationType.Z_SCORE else '2.' if self.normalization_type == cst.NormalizationType.MINMAX else '3.' 69 | NORMALIZATION = 'Zscore' if self.normalization_type == cst.NormalizationType.Z_SCORE else 'MinMax' if self.normalization_type == cst.NormalizationType.MINMAX else 'DecPre' 70 | DATASET_TYPE = 'Training' if self.dataset_type == cst.DatasetType.TRAIN or self.dataset_type == cst.DatasetType.VALIDATION else 'Testing' 71 | DIR = self.fi_data_dir + \ 72 | "/{}".format(AUCTION) + \ 73 | "/{}{}_{}".format(N, AUCTION, NORMALIZATION) + \ 74 | "/{}_{}_{}".format(AUCTION, NORMALIZATION, DATASET_TYPE) 75 | 76 | NORMALIZATION = 'ZScore' if self.normalization_type == cst.NormalizationType.Z_SCORE else 'MinMax' if self.normalization_type == cst.NormalizationType.MINMAX else 'DecPre' 77 | DATASET_TYPE = 'Train' if self.dataset_type == cst.DatasetType.TRAIN or self.dataset_type == cst.DatasetType.VALIDATION else 'Test' 78 | 79 | F_EXTENSION = '.txt' 80 | 81 | # if it is training time, we open the 7-days training file 82 | # if it is testing time, we open the 3 test files 83 | if self.dataset_type == cst.DatasetType.TRAIN or self.dataset_type == cst.DatasetType.VALIDATION: 84 | 85 | F_NAME = DIR + '/{}_Dst_{}_{}_CF_7'.format(DATASET_TYPE, AUCTION, NORMALIZATION) + F_EXTENSION 86 | 87 | if not os.path.exists(F_NAME): 88 | error = "\n\nFile {} not found! Make sure to follow the following steps.".format(F_NAME) 89 | error += "\n\n (1) Download the dataset in data/datasets, by running:\n{}".format(cst.DOWNLOAD_FI_COMMAND) 90 | error += "\n (2) Unzip the file." 
91 | error += "\n (3) Run: mv data/datasets/published/ data/datasets/FI-2010" 92 | error += "\n (4) Unzip data/datasets/FI-2010/BenchmarkDatasets/BenchmarkDatasets.zip in data/datasets/FI-2010/BenchmarkDatasets" 93 | error += "\n" 94 | raise FileNotFoundError(error) 95 | 96 | out_df = np.loadtxt(F_NAME) 97 | 98 | n_samples_train = int(np.floor(out_df.shape[1] * self.train_val_split)) 99 | if self.dataset_type == cst.DatasetType.TRAIN: 100 | out_df = out_df[:, :n_samples_train] 101 | 102 | elif self.dataset_type == cst.DatasetType.VALIDATION: 103 | out_df = out_df[:, n_samples_train:] 104 | 105 | else: 106 | F_NAMES = [DIR + '/{}_Dst_{}_{}_CF_{}'.format(DATASET_TYPE, AUCTION, NORMALIZATION, i) + F_EXTENSION for i in range(7, 10)] 107 | out_df = np.hstack([np.loadtxt(F_NAME) for F_NAME in F_NAMES]) 108 | 109 | self.data = out_df 110 | 111 | def __prepare_X(self): 112 | """ we only consider the first 40 features, i.e. the 10 levels of the LOB""" 113 | LOB_TEN_LEVEL_FEATURES = 40 114 | self.samples_X = self.data[:LOB_TEN_LEVEL_FEATURES, :].transpose() 115 | 116 | def __prepare_y(self): 117 | """ gets the labels """ 118 | # the last five elements in self.data contain the labels 119 | # they are based on the possible horizon values [1, 2, 3, 5, 10] 120 | self.samples_y = self.data[cst.HORIZONS_MAPPINGS_FI[self.horizon], :] 121 | self.samples_y -= 1 122 | 123 | def __prepare_dataset(self): 124 | """ Crucial call! """ 125 | 126 | self.__parse_dataset() 127 | 128 | self.__prepare_X() 129 | self.__prepare_y() 130 | 131 | print("Dataset type:", self.dataset_type, " - normalization:", self.normalization_type) 132 | occs, occs_vec = self.__class_balancing(self.samples_y) 133 | 134 | perc = ["{}%".format(round(i, 2)) for i in (occs_vec / np.sum(occs_vec)) * 100] 135 | print("Balancing", occs, "=>", perc) 136 | print() 137 | -------------------------------------------------------------------------------- /src/lobcast.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import argparse 4 | import os 5 | from datetime import datetime 6 | from enum import Enum 7 | from matplotlib.backends.backend_pdf import PdfPages 8 | 9 | import numpy as np 10 | from pytorch_lightning import seed_everything 11 | 12 | import src.constants as cst 13 | from src.metrics.metrics_log import Metrics 14 | 15 | np.set_printoptions(suppress=True) 16 | from src.utils.utils_generic import str_to_bool 17 | from src.settings import Settings 18 | from src.hyper_parameters import HPTunable, HPTuned 19 | 20 | from src.models.model_callbacks import callback_save_model 21 | from src.data_preprocessing.utils_dataset import pick_dataset 22 | from src.models.utils_models import pick_model 23 | from pytorch_lightning import Trainer 24 | from src.metrics.report import plot_metric_training, plot_metric_best, saved_metrics 25 | from src.utils.utils_generic import get_class_arguments 26 | 27 | 28 | class LOBCAST: 29 | """ LOBCAST class is responsible to maintain all the information about the current simulation. 30 | Including the simulation settings, tunable hyperparameters of the models. """ 31 | 32 | def __init__(self): 33 | 34 | self.SETTINGS = Settings() # the settings of the simulation 35 | self.HP_TUNABLE = HPTunable() # the hyperparameters to vary and their domains 36 | self.HP_TUNED = HPTuned() # the hyperparameters and their values 37 | 38 | def update_settings(self, setting_params: dict): 39 | """ Updates the settings with the given parameters. 
""" 40 | # settings new settings 41 | for key, value in setting_params.items(): 42 | self.SETTINGS.__setattr__(key, value) 43 | 44 | self.SETTINGS.check_parameters_validity() 45 | self.__init_hyper_parameters() 46 | 47 | if self.SETTINGS.IS_SANITY_CHECK: 48 | self.__sanity_check_settings() 49 | 50 | # at this point parameters are set 51 | print("\nRunning with settings:\n", self.SETTINGS.__dict__) 52 | 53 | def __sanity_check_settings(self): 54 | print("THIS IS A SANITY CHECK RUN.") 55 | self.SETTINGS.EPOCHS_UB = 1 56 | 57 | def update_hyper_parameters(self, tuning_parameters: dict): 58 | """Update the hyperparameters with the given parameters""" 59 | 60 | # coming from wandb or from local grid search 61 | for key, value in tuning_parameters.items(): 62 | self.HP_TUNED.update_hyperparameter(key, value) 63 | 64 | # at this point parameters are set 65 | print("\nRunning with hyper parameters:\n", self.HP_TUNED.__dict__) 66 | 67 | def end_setup(self, wandb_instance=None): 68 | """ Ends the simulation setup based on the chosen settings and parameters. """ 69 | 70 | self.DATE_TIME = datetime.now().strftime("%d-%m-%Y_%H-%M-%S") 71 | dir_detail = "SANITY_CHECK" if self.SETTINGS.IS_SANITY_CHECK else self.DATE_TIME 72 | self.SETTINGS.DIR_EXPERIMENTS = f"{cst.DIR_EXPERIMENTS}-({dir_detail})/" 73 | 74 | self.__seed_everything(self.SETTINGS.SEED) 75 | self.__setup_all_directories(self.SETTINGS) 76 | 77 | self.METRICS = Metrics(self.SETTINGS.DIR_EXPERIMENTS, self.sim_name_format()) 78 | self.METRICS.dump_info(self.SETTINGS.__dict__, self.HP_TUNED.__dict__) 79 | 80 | self.WANDB_INSTANCE = wandb_instance 81 | 82 | def __init_hyper_parameters(self): 83 | """ Init the simulation hyperparameters gathering those from the chosen model, declared by the user. """ 84 | model_arguments = get_class_arguments(self.SETTINGS.PREDICTION_MODEL.value.model)[2:] 85 | model_tunable = self.SETTINGS.PREDICTION_MODEL.value.tunable_parameters 86 | 87 | # checks that HP are meaningful 88 | for param, values in model_tunable.__dict__.items(): 89 | if not (param in model_arguments or param in self.HP_TUNABLE.__dict__): 90 | raise KeyError(f"The declared hyper parameters \'{param}\' of model {self.SETTINGS.PREDICTION_MODEL.name} is never used. Remove it.") 91 | 92 | self.HP_TUNABLE = model_tunable 93 | 94 | # set to default, add the same parameters in the HP_TUNED object 95 | for key, _ in self.HP_TUNABLE.__dict__.items(): 96 | self.HP_TUNED.add_hyperparameter(key, None) 97 | 98 | def __seed_everything(self, seed): 99 | """ Sets the random seed of the whole simulator. """ 100 | seed_everything(seed) 101 | 102 | def sim_name_format(self): 103 | """ The name of the simulation. """ 104 | SIM_NAME = "MOD={}-SEED={}-DS={}-HU={}-HP={}-HF={}-OB={}" 105 | return SIM_NAME.format( 106 | self.SETTINGS.PREDICTION_MODEL.name, 107 | self.SETTINGS.SEED, 108 | self.SETTINGS.DATASET_NAME.value, 109 | self.SETTINGS.PREDICTION_HORIZON_UNIT.name, 110 | self.SETTINGS.PREDICTION_HORIZON_PAST, 111 | self.SETTINGS.PREDICTION_HORIZON_FUTURE, 112 | self.SETTINGS.OBSERVATION_PERIOD, 113 | ) 114 | 115 | def parse_cl_arguments(self): 116 | """ Parses the arguments for the command line. 
""" 117 | parser = argparse.ArgumentParser(description='LOBCAST arguments:') 118 | 119 | # every field in the settings, can be set crom cl 120 | for k, v in self.SETTINGS.__dict__.items(): 121 | var = v.name if isinstance(v, Enum) else v 122 | type_var = str if isinstance(v, Enum) else type(v) 123 | type_var = str_to_bool if type(v) == bool else type_var # to parse bool 124 | parser.add_argument(f'--{k}', default=var, type=type_var) 125 | 126 | args = vars(parser.parse_args()) 127 | 128 | print("Gathering CLI values.") 129 | setting_conf = dict() 130 | # every field in the settings, is set based on the parsed values, enums are parsed by NAME 131 | for k, v in self.SETTINGS.__dict__.items(): 132 | value = v.__class__[args[k]] if isinstance(v, Enum) else args[k] 133 | setting_conf[k] = value 134 | 135 | return setting_conf 136 | 137 | @staticmethod 138 | def __setup_all_directories(settings): 139 | """ Creates the necessary directories for the simulation. """ 140 | # create the paths for the simulation if they do not exist already 141 | paths = ["data", "data/datasets", "data/experiments", settings.DIR_EXPERIMENTS] 142 | for p in paths: 143 | if not os.path.exists(p): 144 | os.makedirs(p) 145 | 146 | def run(self): 147 | """ After having chosen settings and hyperparams, it runs LOBCAST training loop. """ 148 | 149 | data_module = pick_dataset(self) 150 | nets_module = pick_model(self, data_module) 151 | 152 | trainer = Trainer( 153 | accelerator=self.SETTINGS.DEVICE, 154 | devices=self.SETTINGS.N_GPUs, 155 | check_val_every_n_epoch=self.SETTINGS.VALIDATION_EVERY, 156 | max_epochs=self.SETTINGS.EPOCHS_UB, 157 | num_sanity_val_steps=1 if self.SETTINGS.IS_SANITY_CHECK else 0, 158 | callbacks=[ 159 | callback_save_model(self.SETTINGS.DIR_EXPERIMENTS, self.sim_name_format(), cst.VALIDATION_METRIC, top_k=3) 160 | ], 161 | ) 162 | 163 | model_path = self.SETTINGS.TEST_MODEL_PATH if self.SETTINGS.IS_TEST_ONLY else "best" 164 | 165 | if not self.SETTINGS.IS_TEST_ONLY: 166 | trainer.fit(nets_module, data_module) 167 | self.METRICS.reset_stats() 168 | 169 | # this flag is used when running simulation to know if final validation on best model is running 170 | self.METRICS.is_best_model = True 171 | 172 | # best model evaluation starts 173 | trainer.validate(nets_module, data_module, ckpt_path=model_path) 174 | trainer.test(nets_module, data_module, ckpt_path=model_path) 175 | 176 | def evaluate(self): 177 | fnames_root = self.SETTINGS.DIR_EXPERIMENTS + self.sim_name_format() 178 | pdf_best = PdfPages(fnames_root + "_" + 'metrics_best_plots.pdf') 179 | pdf_running = PdfPages(fnames_root + "_" + 'metrics_train_plots.pdf') 180 | 181 | for m in saved_metrics: 182 | plot_metric_best(fnames_root + "_" + cst.METRICS_BEST_FILE_NAME, m, pdf_best) 183 | plot_metric_training(fnames_root + "_" + cst.METRICS_RUNNING_FILE_NAME, m, pdf_running) 184 | 185 | pdf_best.close() 186 | pdf_running.close() 187 | 188 | def close(self): 189 | print('Completed.') 190 | if self.SETTINGS.IS_SANITY_CHECK: 191 | exit("Sanity check passed.") 192 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LOBCAST — Stock Price Trend Forecasting with Python 2 | 3 | ## 📈 LOBCAST 4 | LOBCAST is a Python-based open-source framework developed for stock market trend forecasting using Limit Order Book (LOB) 5 | data. 
The framework enables users to test deep learning models for the task of Stock Price Trend Prediction (SPTP).
6 | It serves as the official repository for the paper titled __LOB-Based Deep Learning Models for Stock Price Trend Prediction:
7 | A Benchmark Study__ [[paper](https://link.springer.com/article/10.1007/s10462-024-10715-4)].
8 | 
9 | The paper formalizes the SPTP task and the structure of LOB data.
10 | The following sections explain how to download LOB data, how to run stock predictions with LOBCAST using your own DL model,
11 | and how to evaluate and compare models.
12 | 
13 | #### About mini-LOBCAST
14 | This main branch represents a newer version of LOBCAST named mini-LOBCAST. It enables benchmarking models on the standard
15 | LOB dataset used in the literature, specifically FI-2010 [[dataset](https://etsin.fairdata.fi/dataset/73eb48d7-4dbc-4a10-a52a-da745b47a649)].
16 | This version will be expanded to include more datasets, with procedures for handling data consistently for benchmarking.
17 | These procedures are already available in the branch v0-LOBCAST, which will be integrated soon. We encourage the use of this version,
18 | while also recommending a glance at the other branch for additional implemented models and functions.
19 | 
20 | ## Installing LOBCAST
21 | 
22 | You can install LOBCAST by cloning the repository and navigating into the directory:
23 | 
24 | ```
25 | git clone https://github.com/matteoprata/LOBCAST.git
26 | cd LOBCAST
27 | ```
28 | 
29 | Install all the required dependencies:
30 | ```
31 | pip install -r requirements.txt
32 | ```
33 | ### Downloading LOB Dataset
34 | To download the FI-2010 Dataset [[dataset](https://etsin.fairdata.fi/dataset/73eb48d7-4dbc-4a10-a52a-da745b47a649)], follow these instructions:
35 | 
36 | 1. Download the dataset into `data/datasets` by running:
37 | ```
38 | mkdir data/datasets
39 | cd data/datasets
40 | wget --content-disposition "https://download.fairdata.fi:443/download?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3MTU1MDU5OTksImRhdGFzZXQiOiI3M2ViNDhkNy00ZGJjLTRhMTAtYTUyYS1kYTc0NWI0N2E2NDkiLCJwYWNrYWdlIjoiNzNlYjQ4ZDctNGRiYy00YTEwLWE1MmEtZGE3NDViNDdhNjQ5X2JoeXV4aWZqLnppcCIsImdlbmVyYXRlZF9ieSI6IjlmZGRmZmVlLWY4ZDItNDZkNS1hZmIwLWQyOTM0NzdlZjg2ZiIsInJhbmRvbV9zYWx0IjoiYjg1ZjNhM2YifQ.NOT94HPMUdwpi6lFsmnRhkToP2FAdmbmoEkhlRNBQGM"
41 | ```
42 | 2. Unzip the file.
43 | 3. Run:
44 | ```
45 | mv data/datasets/published data/datasets/FI-2010
46 | ```
47 | 4. Unzip `data/datasets/FI-2010/BenchmarkDatasets/BenchmarkDatasets.zip` into `data/datasets/FI-2010/BenchmarkDatasets`.
48 | 
49 | Ensure that this path exists to execute LOBCAST on this dataset:
50 | ```
51 | data/datasets/FI-2010/BenchmarkDatasets/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Training/*
52 | ```
53 | 
54 | ### Running
55 | Run LOBCAST locally with an MLP model and the FI-2010 dataset, using the default settings in `src.settings`:
56 | ```
57 | python -m src.run
58 | ```
59 | 
60 | To customize parameters:
61 | ```
62 | python -m src.run --SEED 42 --PREDICTION_MODEL BINCTABL --OBSERVATION_PERIOD 10 --EPOCHS_UB 20 --IS_WANDB 0
63 | ```
64 | This will execute LOBCAST with seed 42 on the FI-2010 dataset, using the BINCTABL model, with an observation period of 10 events, for 20 epochs, running locally (not on WANDB).
65 | 
66 | The `run.py` file allows adjusting the following arguments, which are all attributes of the class `src.settings.Settings`.
67 | ```
68 | LOBCAST
69 | optional arguments:
70 | -h, --help show this help message and exit
71 | --SEED
72 | --DATASET_NAME
73 | --N_TRENDS
74 | --PREDICTION_MODEL
75 | --PREDICTION_HORIZON_UNIT
76 | --PREDICTION_HORIZON_FUTURE
77 | --PREDICTION_HORIZON_PAST
78 | --OBSERVATION_PERIOD
79 | --IS_SHUFFLE_TRAIN_SET
80 | --EPOCHS_UB
81 | --TRAIN_SET_PORTION
82 | --VALIDATION_EVERY
83 | --IS_TEST_ONLY
84 | --TEST_MODEL_PATH
85 | --DEVICE
86 | --N_GPUs
87 | --N_CPUs
88 | --DIR_EXPERIMENTS
89 | --IS_WANDB
90 | --WANDB_SWEEP_METHOD
91 | --IS_SANITY_CHECK
92 | ```
93 | At the end of the execution, JSON files containing all the statistics of the simulation and a PDF showing the performance
94 | of the model will be created at `data/experiments`.
95 | 
96 | ### Settings
97 | To set up a simulation in terms of randomness, choice of dataset, choice of model, observation frame of the models, and
98 | whether to log the metrics locally or on WANDB, LOBCAST allows setting all these parameters by accessing the
99 | `LOBCAST().SETTINGS` object. These parameters are set at the beginning of the simulation and overwritten by the arguments
100 | passed from the command-line interface (CLI).
101 | 
102 | ### Hyperparameters
103 | To find the right learning parameters of the model, hyperparameters can be specified in `src.hyper_parameters.HPTunable`.
104 | By default, it contains the batch size, learning rate, and optimizer, but it can be extended by the user to specify other
105 | parameters. Keep in this class all the hyperparameters common to all the models; model-specific parameters are added as
106 | shown in the _Adding a New Model_ section below.
107 | 
108 | You can specify either the exact values a parameter can take, as ```{'values': [1, 2, 3]}```, or a min-max range, as
109 | ```{'min': 1, 'max': 100}```.
110 | 
111 | 
112 | ### LOBCAST logic
113 | The logic of the simulator can be summarized as follows:
114 | 
115 | 1. Initialize LOBCAST.
116 | 2. Parse settings from the CLI.
117 | 3. Update settings.
118 | 4. Choose hyperparameter configurations.
119 | 5. Run the simulation, including data gathering, model selection, and training loop.
120 | 6. Generate a PDF with evaluation metrics.
121 | 7. Close the simulation.
122 | 
123 | The code below shows the simulation logic in `src.run`:
124 | ```
125 | sim = LOBCAST()
126 | 
127 | setting_conf = sim.parse_cl_arguments()  # parse settings from the CLI
128 | sim.update_settings(setting_conf)        # update the simulation settings
129 | 
130 | hparams_config = grid_search_configurations(sim.HP_TUNABLE.__dict__)[0]  # the first configuration (a dict) of the hyperparameter grid search
131 | sim.update_hyper_parameters(hparams_config)  # update the simulation hyperparameters
132 | sim.end_setup()
133 | 
134 | sim.run()       # run the simulation: data gathering, model selection and training loop
135 | sim.evaluate()  # generate a PDF with the evaluation metrics
136 | sim.close()
137 | ```
138 | 
139 | 
140 | ### Experimental Plans (_optional_)
141 | Running multiple experiments sequentially is facilitated by instantiating an execution plan. As an alternative to `src.run`,
142 | sequential tests can be run from `src.run_batch`.
143 | 
144 | ```
145 | ep = ExecutionPlan(setup01.INDEPENDENT_VARIABLES,
146 |                    setup01.INDEPENDENT_VARIABLES_CONSTRAINTS)
147 | 
148 | setting_confs = ep.configurations()
149 | ```
150 | 
151 | An execution plan is defined in terms of `INDEPENDENT_VARIABLES` and `INDEPENDENT_VARIABLES_CONSTRAINTS`.
152 | These are two dictionaries. The first dictionary represents the variables to vary in a grid search.
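For instance, a batch setup module might contain the two dictionaries sketched below. The values here are purely
illustrative; the keys are members of `src.settings.SettingsExp`, and models are referenced through `src.constants`:

```
from src.settings import SettingsExp
import src.constants as cst

# variables to vary: the execution plan explores their cartesian product
INDEPENDENT_VARIABLES = {
    SettingsExp.SEED: [0, 1],
    SettingsExp.PREDICTION_MODEL: [cst.Models.MLP, cst.Models.BINCTABL],
    SettingsExp.OBSERVATION_PERIOD: [10, 100],
}

# value assigned to a variable whenever it is not the one being varied
INDEPENDENT_VARIABLES_CONSTRAINTS = {
    SettingsExp.OBSERVATION_PERIOD: 100,
}
```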
153 | The `INDEPENDENT_VARIABLES_CONSTRAINTS` dictionary allows defining how a variable should be set when it does not vary,
154 | thus pruning the grid search and eliminating certain configurations. The `setting_confs` contain
155 | the configurations to be passed, iteratively, to `sim.update_settings(setting_conf)`.
156 | 
157 | To run an execution plan with the dictionaries defined in `src.batch_experiments.setup01`, execute:
158 | ```
159 | python -m src.run_batch
160 | ```
161 | 
162 | Procedures for gathering the performance of different models and generating comprehensive benchmarking plots will be added in a future update.
163 | 
164 | ### Adding a New Model
165 | To integrate a new model into LOBCAST, follow these steps:
166 | 
167 | 1. Create the model file: Add a `.py` file in the `src.models` directory. Define your new model class, inheriting from
168 | `src.models.lobcast_model.LOBCAST_model`:
169 | 
170 | ```
171 | class MyNewModel(LOBCAST_model):
172 |     def __init__(self, input_dim, output_dim, param1, param2, param3):
173 |         super().__init__(input_dim, output_dim)
174 |         ...
175 | ```
176 | 
177 | 2. Define hyperparameters: Optionally define the domains of your model parameters by creating a class that inherits from
178 | `src.hyper_parameters.HPTunable`:
179 | 
180 | ```
181 | class HP(HPTunable):
182 |     def __init__(self):
183 |         super().__init__()
184 |         self.param1 = {"values": [16, 32, 64]}
185 |         self.param2 = {"values": [.1, .5]}
186 | ```
187 | 3. Declare the LOBCAST module: Instantiate a `src.models.lobcast_model.LOBCAST_module` to encapsulate the model and its hyperparameters:
188 | ```
189 | mynewmodel_lm = LOBCAST_module(MyNewModel, HP())
190 | ```
191 | 
192 | 4. Declare the model in the Models enumerator: Add your model to the `src.models.models_classes.Models` enumerator:
193 | 
194 | ```
195 | class Models(Enum):
196 |     NEW_MODEL = mynewmodel_lm
197 | ```
198 | 
199 | Now, you can execute the new model using the command:
200 | ```
201 | python -m src.run --SEED 42 --PREDICTION_MODEL NEW_MODEL --IS_WANDB 0
202 | ```
203 | Any undeclared settings will be assigned default values.
204 | 
205 | 
206 | Optionally, enforce constraints on the model settings using `src.settings.Settings.check_parameters_validity`. For example:
207 | 
208 | ```
209 | constraint = (not self.PREDICTION_MODEL == cst.Models.NEW_MODEL or self.OBSERVATION_PERIOD == 10,
210 |               f"At the moment, NEW_MODEL only allows OBSERVATION_PERIOD = 10, {self.OBSERVATION_PERIOD} given.")
211 | ```
212 | Be sure to add this constraint to `CONSTRAINTS` in `src.settings.Settings.check_parameters_validity` so that it is enforced.
213 | 
214 | ### References
215 | Prata, Matteo, et al. __"LOB-based deep learning models for stock price trend prediction: a benchmark study."__ Artificial Intelligence Review 57.5 (2024): 1-45.
216 | 
217 | > _The recent advancements in Deep Learning (DL) research have notably influenced the finance sector. We examine the
218 | > robustness and generalizability of fifteen state-of-the-art DL models focusing on Stock Price Trend Prediction (SPTP)
219 | > based on Limit Order Book (LOB) data. To carry out this study, we developed LOBCAST, an open-source framework that
220 | > incorporates data preprocessing, DL model training, evaluation and profit analysis. Our extensive experiments reveal
221 | > that all models exhibit a significant performance drop when exposed to new data, thereby raising questions about their
222 | > real-world market applicability.
Our work serves as a benchmark, illuminating the potential and the limitations of current 223 | > approaches and providing insight for innovative solutions._ 224 | 225 | Link: https://link.springer.com/article/10.1007/s10462-024-10715-4 226 | 227 | 228 | ### Acknowledgments 229 | LOBCAST was developed by [Matteo Prata](https://github.com/matteoprata), [Giuseppe Masi](https://github.com/giuseppemasi99), [Leonardo Berti](https://github.com/LeonardoBerti00), [Andrea Coletta](https://github.com/Andrea94c), [Irene Cannistraci](https://github.com/icannistraci), Viviana Arrigoni. 230 | --------------------------------------------------------------------------------