├── .gitignore ├── .idea ├── .gitignore ├── LobFrame.iml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── README.md ├── data_processing ├── complete_homological_utils.py ├── data_process.py └── data_process_utils.py ├── loaders └── custom_dataset.py ├── loggers ├── analysis.py └── logger.py ├── main.py ├── models ├── AxialLob │ └── axiallob.py ├── CNN1 │ └── cnn1.py ├── CNN2 │ └── cnn2.py ├── CompleteHCNN │ └── complete_hcnn.py ├── DLA │ └── DLA.py ├── DeepLob │ └── deeplob.py ├── LobTransformer │ └── lobtransformer.py ├── TABL │ ├── bin_nn.py │ ├── bin_tabl.py │ ├── bl_layer.py │ └── tabl_layer.py ├── Transformer │ └── transformer.py └── iTransformer │ └── itransformer.py ├── optimizers ├── executor.py └── lightning_batch_gd.py ├── requirements.txt ├── requirements_mac_os.txt ├── simulator ├── market_sim.py ├── post_trading_analysis.py └── trading_agent.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/pycharm,osx,jupyternotebooks 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=pycharm,osx,jupyternotebooks 3 | 4 | ### JupyterNotebooks ### 5 | # gitignore template for Jupyter Notebooks 6 | # website: http://jupyter.org/ 7 | 8 | .ipynb_checkpoints 9 | */.ipynb_checkpoints/* 10 | 11 | # IPython 12 | profile_default/ 13 | ipython_config.py 14 | 15 | # Remove previous ipynb_checkpoints 16 | # git rm -r .ipynb_checkpoints/ 17 | 18 | ### OSX ### 19 | # General 20 | .DS_Store 21 | .AppleDouble 22 | .LSOverride 23 | 24 | # Icon must end with two \r 25 | Icon 26 | 27 | 28 | # Thumbnails 29 | ._* 30 | 31 | # Files that might appear in the root of a volume 32 | .DocumentRevisions-V100 33 | .fseventsd 34 | .Spotlight-V100 35 | .TemporaryItems 36 | .Trashes 37 | .VolumeIcon.icns 38 | .com.apple.timemachine.donotpresent 39 | 40 | # Directories potentially created on remote AFP share 41 | .AppleDB 42 | .AppleDesktop 43 | Network Trash Folder 44 | Temporary Items 45 | .apdisk 46 | 47 | ### PyCharm ### 48 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 49 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 50 | 51 | # User-specific stuff 52 | .idea/**/workspace.xml 53 | .idea/**/tasks.xml 54 | .idea/**/usage.statistics.xml 55 | .idea/**/dictionaries 56 | .idea/**/shelf 57 | 58 | # AWS User-specific 59 | .idea/**/aws.xml 60 | 61 | # Generated files 62 | .idea/**/contentModel.xml 63 | 64 | # Sensitive or high-churn files 65 | .idea/**/dataSources/ 66 | .idea/**/dataSources.ids 67 | .idea/**/dataSources.local.xml 68 | .idea/**/sqlDataSources.xml 69 | .idea/**/dynamic.xml 70 | .idea/**/uiDesigner.xml 71 | .idea/**/dbnavigator.xml 72 | 73 | # Gradle 74 | .idea/**/gradle.xml 75 | .idea/**/libraries 76 | 77 | # Gradle and Maven with auto-import 78 | # When using Gradle or Maven with auto-import, you should exclude module files, 79 | # since they will be recreated, and may cause churn. Uncomment if using 80 | # auto-import. 
81 | # .idea/artifacts 82 | # .idea/compiler.xml 83 | # .idea/jarRepositories.xml 84 | # .idea/modules.xml 85 | # .idea/*.iml 86 | # .idea/modules 87 | # *.iml 88 | # *.ipr 89 | 90 | # CMake 91 | cmake-build-*/ 92 | 93 | # Mongo Explorer plugin 94 | .idea/**/mongoSettings.xml 95 | 96 | # File-based project format 97 | *.iws 98 | 99 | # IntelliJ 100 | out/ 101 | 102 | # mpeltonen/sbt-idea plugin 103 | .idea_modules/ 104 | 105 | # JIRA plugin 106 | atlassian-ide-plugin.xml 107 | 108 | # Cursive Clojure plugin 109 | .idea/replstate.xml 110 | 111 | # SonarLint plugin 112 | .idea/sonarlint/ 113 | 114 | # Crashlytics plugin (for Android Studio and IntelliJ) 115 | com_crashlytics_export_strings.xml 116 | crashlytics.properties 117 | crashlytics-build.properties 118 | fabric.properties 119 | 120 | # Editor-based Rest Client 121 | .idea/httpRequests 122 | 123 | # Android studio 3.1+ serialized cache file 124 | .idea/caches/build_file_checksums.ser 125 | 126 | ### PyCharm Patch ### 127 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 128 | 129 | # *.iml 130 | # modules.xml 131 | # .idea/misc.xml 132 | # *.ipr 133 | 134 | # Sonarlint plugin 135 | # https://plugins.jetbrains.com/plugin/7973-sonarlint 136 | .idea/**/sonarlint/ 137 | 138 | # SonarQube Plugin 139 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin 140 | .idea/**/sonarIssues.xml 141 | 142 | # Markdown Navigator plugin 143 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced 144 | .idea/**/markdown-navigator.xml 145 | .idea/**/markdown-navigator-enh.xml 146 | .idea/**/markdown-navigator/ 147 | 148 | # Cache file creation bug 149 | # See https://youtrack.jetbrains.com/issue/JBR-2257 150 | .idea/$CACHE_FILE$ 151 | 152 | # CodeStream plugin 153 | # https://plugins.jetbrains.com/plugin/12206-codestream 154 | .idea/codestream.xml 155 | 156 | # Azure Toolkit for IntelliJ plugin 157 | # https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij 158 | .idea/**/azureSettings.xml 159 | 160 | # End of https://www.toptal.com/developers/gitignore/api/pycharm,osx,jupyternotebooks -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/LobFrame.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 137 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# LOBFrame

We release `LOBFrame` (see the two papers [`Deep Limit Order Book Forecasting`](https://arxiv.org/abs/2403.09267) and [`HLOB - Information Persistence and Structure in Limit Order Books`](https://arxiv.org/abs/2405.18938)), a novel, open-source codebase that presents a new way to process large-scale Limit Order Book (LOB) data. The framework integrates the latest insights from scientific research (see [Lucchese et al.](https://www.sciencedirect.com/science/article/pii/S0169207024000062), [Prata et al.](https://arxiv.org/pdf/2308.01915.pdf)) into a cohesive system. Its strength lies in the comprehensive nature of the implemented pipeline, which covers the data transformation and processing stage, an ultra-fast implementation of the training, validation, and testing steps, and the evaluation of a model's outputs through trading simulations. Moreover, it offers flexibility by accommodating the integration of new models, ensuring adaptability to future advancements in the field.

## Introduction

In this tutorial, we show how to replicate the experiments presented in the two papers titled __"Deep Limit Order Book Forecasting: A microstructural guide"__ and __"HLOB - Information Persistence and Structure in Limit Order Books"__.

Before starting, please remember to **ALWAYS CITE OUR WORKS** as follows:

```
@article{briola2024deep,
  title={Deep Limit Order Book Forecasting},
  author={Briola, Antonio and Bartolucci, Silvia and Aste, Tomaso},
  journal={arXiv preprint arXiv:2403.09267},
  year={2024}
}
```

```
@misc{briola2024hlob,
  title={HLOB -- Information Persistence and Structure in Limit Order Books},
  author={Antonio Briola and Silvia Bartolucci and Tomaso Aste},
  year={2024},
  eprint={2405.18938},
  archivePrefix={arXiv},
  primaryClass={q-fin.TR}
}
```

## Pre-requisites

Install the required packages:

```bash
pip3 install -r requirements.txt
```

If you are using macOS, please proceed as follows:

```bash
pip3 install -r requirements_mac_os.txt
```

## Data

All the code in this repository relies on [LOBSTER](https://lobsterdata.com) data. For an overview of their structure, please refer to the official documentation available at the following [link](https://lobsterdata.com/info/DataStructure.php).

## Preliminary operations

Before starting any experiment:
- Open the ```lightning_batch_gd.py``` file and insert the [Weights & Biases](https://wandb.ai/site) project's name and API key (search for TODOs).
- Open the ```utils.py``` file and set the default values of the parameters.

## Usage

To start an experiment from scratch, follow these steps:
- Place the raw data in the `data/nasdaq/raw` folder. The data must be in the LOBSTER format and each sub-folder must be named after the asset's ticker (e.g. AAPL for Apple stock); an example layout is shown below.
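  For reference, a minimal raw-data layout (assuming LOBSTER's standard daily file naming, with one sub-folder per ticker) could look as follows; the ticker, dates, and trailing level count are purely illustrative:

  ```bash
  data/nasdaq/raw
  └── CSCO
      ├── CSCO_2015-01-02_34200000_57600000_message_10.csv
      ├── CSCO_2015-01-02_34200000_57600000_orderbook_10.csv
      ├── CSCO_2015-01-05_34200000_57600000_message_10.csv
      └── CSCO_2015-01-05_34200000_57600000_orderbook_10.csv
  ```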
- Run the following command to pre-process data:
```bash
python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --stages "data_processing"
```
- Run the following command to prepare the torch datasets (this reduces the training time):
```bash
python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --stages "torch_dataset_preparation" --prediction_horizon 10
```
If you also want to perform the backtest stage, run the following command instead:
```bash
python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --stages "torch_dataset_preparation,torch_dataset_preparation_backtest" --prediction_horizon 10
```
- If you plan to use the HLOB model (see the paper titled [`HLOB - Information Persistence and Structure in Limit Order Books`](https://arxiv.org/abs/2405.18938)), it is mandatory to execute the following command:
```bash
python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --stages "complete_homological_structures_preparation"
```
- Run the following command to train the model:
```bash
python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --stages "training"
```
Currently available models are:
  - deeplob
  - transformer
  - itransformer
  - lobtransformer
  - dla
  - cnn1
  - cnn2
  - binbtabl
  - binctabl
  - axiallob
  - hlob
- Run the following command to evaluate the model:
```bash
python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --experiment_id "" --stages "evaluation"
```
- Run the following command to analyze the results:
```bash
python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --experiment_id "" --stages "backtest,post_trading_analysis"
```

Multiple (compatible) stages can be executed at the same time. Consider the following example:
```bash
python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --stages "data_processing,torch_dataset_preparation,torch_dataset_preparation_backtest,training,evaluation,backtest,post_trading_analysis"
```

Each experiment can be resumed and re-run by specifying its ID via the `experiment_id` parameter.

The typical structure of the project folder before an experiment's run is the following:

```bash
.
109 | ├── README.md 110 | ├── data 111 | │   └── nasdaq 112 | │   ├── raw_data 113 | │   ├── 114 | │   └── 115 | │   ├── scaled_data 116 | │   ├── test 117 | │   ├── training 118 | │   └── validation 119 | │   └── unscaled_data 120 | │   ├── test 121 | │   ├── training 122 | │   └── validation 123 | ├── data_processing 124 | │   ├── data_process.py 125 | │   └── data_process_utils.py 126 | │   └── complete_homological_utils.py 127 | ├── loaders 128 | │   └── custom_dataset.py 129 | ├── loggers 130 | │   ├── logger.py 131 | │   └── results 132 | ├── main.py 133 | ├── models 134 | │   ├── AxialLob 135 | │   └── axiallob.py 136 | │   ├── CNN1 137 | │   └── cnn1.py 138 | │   ├── CNN2 139 | │   └── cnn2.py 140 | │   ├── DeepLob 141 | │   └── deeplob.py 142 | │   ├── DLA 143 | │   └── DLA.py 144 | │   ├── iTransformer 145 | │   └── itransformer.py 146 | │   ├── LobTransformer 147 | │   └── lobtransformer.py 148 | │   ├── TABL 149 | │   ├── bin_nn.py 150 | │   ├── bin_tabl.py 151 | │   ├── bl_layer.py 152 | │   └── tabl_layer.py 153 | │   ├── Transformer 154 | │   └── transformer.py 155 | |   ├── CompleteHCNN 156 | │   └── complete_hcnn.py 157 | ├── optimizers 158 | │   ├── executor.py 159 | │   └── lightning_batch_gd.py 160 | ├── requirements.txt 161 | ├── simulator 162 | │   ├── market_sim.py 163 | │   ├── post_trading_analysis.py 164 | │   └── trading_agent.py 165 | ├── torch_datasets 166 | │   └── threshold_1e-05 167 | │   └── batch_size_32 168 | │   └── 10 169 | │   ├── test_dataset.pt 170 | │   ├── test_dataset_backtest.pt 171 | │   ├── training_dataset.pt 172 | │   └── validation_dataset.pt 173 | ├── results 174 | └── utils.py 175 | ``` 176 | 177 | # License 178 | 179 | Copyright 2024 Antonio Briola, Silvia Bartolucci, Tomaso Aste. 180 | 181 | Licensed under the CC BY-NC-ND 4.0 Licence (the "Licence"); you may not use this file except in compliance with the License. You may obtain a copy of the License at: 182 | 183 | ``` 184 | https://creativecommons.org/licenses/by-nc-nd/4.0/ 185 | ``` 186 | 187 | Software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the provided link for the specific language governing permissions and limitations under the License. -------------------------------------------------------------------------------- /data_processing/complete_homological_utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import concurrent.futures 3 | from itertools import chain 4 | 5 | import pandas as pd 6 | import numpy as np 7 | import polars as pl 8 | from typing import * 9 | 10 | import networkx as nx 11 | from fast_tmfg import * 12 | from sklearn.metrics import mutual_info_score 13 | 14 | from utils import get_training_test_stocks_as_string 15 | import matplotlib.pyplot as plt 16 | import seaborn as sns 17 | 18 | import torch 19 | 20 | 21 | def compute_pairwise_mi(df: pd.DataFrame, n_bins: int = 3000) -> pd.DataFrame: 22 | """ 23 | Compute the pairwise mutual information between the columns of a dataframe. 24 | 25 | Parameters 26 | ---------- 27 | df : pandas.Dataframe 28 | The pandas dataframe to compute the pairwise mutual information for. 29 | n_bins: int 30 | The number of bins to use for discretization. 31 | 32 | Returns 33 | ---------- 34 | mi_matrix: pandas.Dataframe 35 | The pairwise mutual information matrix. 
36 | 37 | """ 38 | 39 | shuffled_df = df.sample(frac=1, random_state=1).reset_index(drop=True) # Shuffle the dataset. 40 | sampled_df = shuffled_df.sample(n=len(df), replace=True) # Perform bootstrapping. 41 | df = sampled_df.copy() # Copy the dataset into a variable called 'df'. 42 | df.reset_index(drop=True, inplace=True) # Reset the indices. 43 | del sampled_df # Delete an unused variable. 44 | 45 | flat_series = df.values.flatten() # Flat the df to perform a binning on all the values (not feature-by-feature). 46 | bins = pd.cut(flat_series, bins=n_bins, labels=False, retbins=True) # Perform the binning. 47 | # Apply the binning to each feature of the original dataset. 48 | for column in df.columns: 49 | df[column] = pd.cut(df[column], bins=bins[1], labels=False, include_lowest=True) 50 | del flat_series # Delete an unused variable. 51 | 52 | discretized_df = df.copy() # Copy the dataset into a variable called 'discretized_df'. 53 | del df # Delete an unused variable. 54 | 55 | # Initialize an empty Mutual Information (MI) matrix and fill it with 0s. 56 | n_features = discretized_df.shape[1] 57 | mi_matrix = np.zeros((n_features, n_features)) 58 | 59 | # Compute the pairwise MI and fill the MI matrix consequently. 60 | for i in range(n_features): 61 | for j in range(i, n_features): 62 | mi_value = mutual_info_score( 63 | discretized_df.iloc[:, i], discretized_df.iloc[:, j] 64 | ) 65 | mi_matrix[i, j] = mi_value 66 | mi_matrix[j, i] = mi_value 67 | 68 | mi_matrix = pd.DataFrame(mi_matrix) # Transform the MI matrix into a Pandas dataframe. 69 | return mi_matrix # Return the MI matrix in the form of a Pandas dataframe. 70 | 71 | 72 | def process_file( 73 | file: str, 74 | ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, nx.Graph, nx.Graph, nx.Graph]: 75 | """ 76 | Compute the TMFG for volumes of a given orderbook file. 77 | 78 | Parameters 79 | ---------- 80 | file : str 81 | The path to the file to compute the TMFG for. 82 | 83 | Returns 84 | ---------- 85 | sim_ask : pandas.DataFrame 86 | The pairwise mutual information matrix for the ask volumes. 87 | sim_bid : pandas.DataFrame 88 | The pairwise mutual information matrix for the bid volumes. 89 | sim_all : pandas.DataFrame 90 | The pairwise mutual information matrix for the ask and bid volumes. 91 | net_ask : networkx.Graph 92 | The TMFG for the ask volumes. 93 | net_bid : networkx.Graph 94 | The TMFG for the bid volumes. 95 | net_all : networkx.Graph 96 | The TMFG for the ask and bid volumes. 97 | """ 98 | 99 | print(f"Computing structure for file: {file}...") 100 | # Read the file using polars to accelerate the process. 101 | df = pl.read_csv(file) 102 | df = df.to_pandas() 103 | 104 | # Extract the volumes for the ask and bid sides. 105 | volumes_all = df.iloc[:, 1:41].iloc[:, 1::2] 106 | 107 | # Compute the pairwise mutual information matrices. 108 | sim_all = compute_pairwise_mi(volumes_all) 109 | 110 | # Compute the TMFGs. 111 | model_all = TMFG() 112 | cliques_all, seps_all, adj_matrix_all = model_all.fit_transform( 113 | sim_all, output="weighted_sparse_W_matrix" 114 | ) 115 | 116 | # Convert the adjacency matrices to networkx graphs. 117 | net_all = nx.from_numpy_array(adj_matrix_all) 118 | 119 | return sim_all, net_all, file 120 | 121 | 122 | def mean_tmfg(sm_list: List[pd.DataFrame]) -> pd.DataFrame: 123 | """ 124 | Compute the average similarity matrix for a list of similarity matrices. 
125 | 126 | Parameters 127 | ---------- 128 | sm_list : List[pandas.DataFrame] 129 | The list of similarity matrices to compute the average for. 130 | 131 | Returns 132 | ---------- 133 | average_matrix : pandas.DataFrame 134 | The average similarity matrix. 135 | """ 136 | 137 | # Stack the matrices along a new axis (axis=0) 138 | stacked_matrices = np.stack(sm_list, axis=0) 139 | 140 | # Calculate the entry-wise average along the new axis 141 | average_matrix = np.mean(stacked_matrices, axis=0) 142 | np.fill_diagonal(average_matrix, 0) 143 | 144 | average_matrix = pd.DataFrame(average_matrix) 145 | 146 | ''' 147 | plt.figure(figsize=(10, 8)) # Optional: Adjusts the size of the figure 148 | sns.heatmap(average_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, linewidths=.5) 149 | plt.title("Correlation Matrix Heatmap") 150 | plt.show() 151 | ''' 152 | 153 | return average_matrix 154 | 155 | 156 | def extract_components( 157 | cliques: List[List[int]], separators: List[List[int]], adjacency_matrix: np.ndarray 158 | ) -> Tuple[List[List[int]], List[List[int]], List[List[int]]]: 159 | """ 160 | Given the cliques, separators and adjacency matrix of a TMFG, extract the b-cliques of size 2 (edges), 3 (triangles) and 4 (tetrahedra). 161 | 162 | Parameters 163 | ---------- 164 | cliques : List[int] 165 | The list of cliques of the TMFG. 166 | separators : List[int] 167 | The list of separators of the TMFG. 168 | adjacency_matrix : numpy.ndarray 169 | The adjacency matrix of the TMFG. 170 | 171 | Returns 172 | ---------- 173 | final_b_cliques_4 : List[List[int]] 174 | The final list of tetrahera. 175 | final_b_cliques_3 : List[List[int]] 176 | The final list of triangles. 177 | final_b_cliques_2 : List[List[int]] 178 | The final list of edges. 179 | """ 180 | 181 | # Extract edges. 
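    # Note on the index remapping performed at the end of this function (inferred from the code,
    # not stated explicitly by the authors): the TMFG is built on the 20 volume columns only, so
    # node x corresponds to the volume feature at column 2*x + 1 of the 40-dimensional LOB input,
    # while column 2*x is the matching price. Each clique of nodes is therefore expanded into
    # price/volume column pairs, e.g. the tetrahedron [0, 1, 2, 3] becomes the feature columns
    # [0, 1, 2, 3, 4, 5, 6, 7].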
182 | edges = [] 183 | adjacency_matrix = nx.from_numpy_array(adjacency_matrix) 184 | 185 | for i in nx.enumerate_all_cliques(adjacency_matrix): 186 | if len(i) == 2: 187 | edges.append(sorted(i)) 188 | 189 | b_cliques_4 = [] 190 | b_cliques_3 = [] 191 | b_cliques_2 = [] 192 | 193 | b_cliques_all = nx.enumerate_all_cliques(adjacency_matrix) 194 | 195 | for i in b_cliques_all: 196 | if len(i) == 2: 197 | b_cliques_2.append(sorted(i)) 198 | if len(i) == 3: 199 | b_cliques_3.append(sorted(i)) 200 | if len(i) == 4: 201 | b_cliques_4.append(sorted(i)) 202 | 203 | final_b_cliques_4 = b_cliques_4 204 | 205 | final_b_cliques_3 = b_cliques_3 206 | 207 | final_b_cliques_2 = edges 208 | 209 | final_b_cliques_4 = [[(x * 2) + 1 for x in sublist] for sublist in final_b_cliques_4] 210 | final_b_cliques_4 = [[x, x - 1] for sublist in final_b_cliques_4 for x in sublist] 211 | final_b_cliques_4 = list(chain.from_iterable(final_b_cliques_4)) 212 | final_b_cliques_4 = [final_b_cliques_4[i:i + 8] for i in range(0, len(final_b_cliques_4), 8)] 213 | final_b_cliques_4 = [sorted(sublist) for sublist in final_b_cliques_4] 214 | 215 | final_b_cliques_3 = [[(x * 2) + 1 for x in sublist] for sublist in final_b_cliques_3] 216 | final_b_cliques_3 = [[x, x - 1] for sublist in final_b_cliques_3 for x in sublist] 217 | final_b_cliques_3 = list(chain.from_iterable(final_b_cliques_3)) 218 | final_b_cliques_3 = [final_b_cliques_3[i:i + 6] for i in range(0, len(final_b_cliques_3), 6)] 219 | final_b_cliques_3 = [sorted(sublist) for sublist in final_b_cliques_3] 220 | 221 | final_b_cliques_2 = [[(x * 2) + 1 for x in sublist] for sublist in final_b_cliques_2] 222 | final_b_cliques_2 = [[x, x - 1] for sublist in final_b_cliques_2 for x in sublist] 223 | final_b_cliques_2 = list(chain.from_iterable(final_b_cliques_2)) 224 | final_b_cliques_2 = [final_b_cliques_2[i:i + 4] for i in range(0, len(final_b_cliques_2), 4)] 225 | final_b_cliques_2 = [sorted(sublist) for sublist in final_b_cliques_2] 226 | 227 | return final_b_cliques_4, final_b_cliques_3, final_b_cliques_2 228 | 229 | 230 | def execute_pipeline(file_patterns, general_hyperparameters): 231 | files = [] 232 | for pattern in file_patterns: 233 | files.extend(glob.glob(pattern.format(dataset={general_hyperparameters['dataset']}))) 234 | 235 | max_threads = 5 236 | with concurrent.futures.ThreadPoolExecutor(max_threads) as executor: 237 | results = list(executor.map(process_file, files)) 238 | 239 | nets_all = [] 240 | sm_all = [] 241 | files_all = [] 242 | 243 | for result in results: 244 | sim_all, net_all, file = result 245 | nets_all.append(net_all) 246 | sm_all.append(sim_all) 247 | files_all.append(file) 248 | 249 | del results 250 | 251 | model_all = TMFG() 252 | cliques_all, seps_all, adj_matrix_all = model_all.fit_transform( 253 | mean_tmfg(sm_all), output="weighted_sparse_W_matrix" 254 | ) 255 | 256 | c4, c3, c2 = extract_components(cliques_all, seps_all, adj_matrix_all) 257 | c4 = list(chain.from_iterable(c4)) 258 | c3 = list(chain.from_iterable(c3)) 259 | c2 = list(chain.from_iterable(c2)) 260 | 261 | original_cliques_all = list(chain.from_iterable(cliques_all)) 262 | original_seps_all = list(chain.from_iterable(seps_all)) 263 | 264 | return c4, c3, c2, original_cliques_all, original_seps_all, adj_matrix_all, sm_all, files_all 265 | 266 | 267 | def get_complete_homology( 268 | general_hyperparameters: Dict[str, Any], 269 | model_hyperparameters: Dict[str, Any], 270 | ) -> Dict[str, List[List[int]]]: 271 | """ 272 | Compute the homological structures to be used in 
the HCNN building process. 273 | 274 | Parameters 275 | ---------- 276 | general_hyperparameters : Dict[str, Any] 277 | The general hyperparameters of the experiment. 278 | 279 | Returns 280 | ---------- 281 | homological_structures : Dict[str, List[List[int]]] 282 | """ 283 | 284 | file_patterns_training = [f"./data/{general_hyperparameters['dataset']}/unscaled_data/training/*{element}*.csv" for element in 285 | general_hyperparameters['training_stocks']] 286 | c4_training, c3_training, c2_training, original_cliques_all_training, original_seps_all_training, adj_matrix_all_training, sm_all_training, files_all_training = execute_pipeline( 287 | file_patterns_training, general_hyperparameters) 288 | 289 | file_patterns_validation = [f"./data/{general_hyperparameters['dataset']}/unscaled_data/validation/*{element}*.csv" for element in 290 | general_hyperparameters['training_stocks']] 291 | _, _, _, _, _, adj_matrix_all_validation, sm_all_validation, files_all_validation = execute_pipeline(file_patterns_validation, general_hyperparameters) 292 | 293 | file_patterns_test = [f"./data/{general_hyperparameters['dataset']}/unscaled_data/test/*{element}*.csv" for element in 294 | general_hyperparameters['target_stocks']] 295 | _, _, _, _, _, adj_matrix_all_test, sm_all_test, files_all_test = execute_pipeline(file_patterns_test, general_hyperparameters) 296 | 297 | homological_structures = {"tetrahedra": c4_training, 298 | "triangles": c3_training, 299 | "edges": c2_training, 300 | "original_cliques": original_cliques_all_training, 301 | "original_separators": original_seps_all_training, 302 | "adj_matrix_training": adj_matrix_all_training, 303 | "similarity_matrices_training": sm_all_training, 304 | "files_training": files_all_training, 305 | "adj_matrix_validation": adj_matrix_all_validation, 306 | "similarity_matrices_validation": sm_all_validation, 307 | "files_validation": files_all_validation, 308 | "adj_matrix_test": adj_matrix_all_test, 309 | "similarity_matrices_test": sm_all_test, 310 | "files_test": files_all_test 311 | } 312 | 313 | training_stocks_string, test_stocks_string = get_training_test_stocks_as_string(general_hyperparameters) 314 | print(training_stocks_string, test_stocks_string) 315 | torch.save(homological_structures, 316 | f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{training_stocks_string}_test_{test_stocks_string}/complete_homological_structures.pt") 317 | # torch.save(homological_structures, 318 | # f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/homological_structures_large_tick_stocks.pt") 319 | print('Homological structures have been saved.') 320 | 321 | # get_homology({'dataset': 'nasdaq'}) 322 | -------------------------------------------------------------------------------- /data_processing/data_process.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import glob 3 | import random 4 | import re 5 | from datetime import datetime, time 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | 11 | def process_data( 12 | ticker: str, 13 | input_path: str, 14 | output_path: str, 15 | logs_path: str, 16 | horizons: list[int], 17 | normalization_window: int, 18 | time_index: str = "seconds", 19 | features: str = "orderbooks", 20 | scaling: bool = True, 21 | ) -> None: 22 | """ 23 | Function to pre-process LOBSTER data. 
The data must be stored in the input_path directory as 'daily message LOB' and 'orderbook' files. 24 | 25 | The data are treated in the following way: 26 | - Orderbook's states with crossed quotes are removed. 27 | - Each state in the orderbook is time-stamped, with states occurring at the same time collapsed onto the last occurring state. 28 | - The first and last 10 minutes of market activity (inside usual opening times) are dropped. 29 | - Rolling z-score normalization is applied to the data, i.e. the mean and standard deviation of the previous 5 days is used to normalize current day's data. 30 | Hence, the first 5 days are dropped. 31 | - Smoothed returns at the requested horizons (in orderbook's changes) are returned: 32 | - if smoothing = "horizon": l = (m+ - m)/m, where m+ denotes the mean of the next h mid-prices, m(.) is current mid-price. 33 | - if smoothing = "uniform": l = (m+ - m)/m, where m+ denotes the mean of the k+1 mid-prices centered at m(. + h), m(.) is current mid-price. 34 | 35 | A log file is produced tracking: 36 | - Orderbook's files with problems. 37 | - Message orderbook's files with problems. 38 | - Trading days with unusual opening - closing times. 39 | - Trading days with crossed quotes. 40 | 41 | A statistics.csv file summarizes the following (daily) statistics: 42 | - # Updates (000): the total number of changes in the orderbook file. 43 | - # Trades (000): the total number of trades, computed by counting the number of message book events corresponding to the execution of (possibly hidden) 44 | limit orders ('event_type' 4 or 5 in LOBSTER orderbook's message file). 45 | - # Price Changes (000): the total number of price changes per day. 46 | - # Price (USD): average price on the day, weighted average by time. 47 | - # Spread (bps): average spread on the day, weighted average by time. 48 | - # Volume (USD MM): total volume traded on the day, computed as the sum of the volumes of all the executed trades ('event_type' 4 or 5 in LOBSTER orderbook's message file). 49 | The volume of a single trade is given by size*price. 50 | - # Tick size: the fraction of time that the bid-ask spread is equal to one tick for each stock. 51 | 52 | Args: 53 | ticker (str): The ticker to be considered. 54 | input_path (str): The path where the order book and message book files are stored, order book files have shape (:, 4*levels): 55 | ["ASKp1", "ASKs1", "BIDp1", "BIDs1", ..., "ASKp10", "ASKs10", "BIDp10", "BIDs10"]. 56 | output_path (str): The path where we wish to save the processed datasets. 57 | logs_path (str): The path where we wish to save the logs. 58 | time_index (str): The time-index to use ("seconds" or "datetime"). 59 | horizons (list): Forecasting horizons for labels. 60 | normalization_window (int): Window for rolling z-score normalization. 61 | features (str): Whether to return 'orderbooks' or 'orderflows'. 62 | scaling (bool): Whether to apply rolling z-score normalization. 63 | 64 | Returns: 65 | None. 66 | """ 67 | 68 | csv_file_list = glob.glob( 69 | f"{input_path}/*.csv" 70 | ) # Get the list of all the .csv files in the input_path directory. 71 | 72 | csv_orderbook = [ 73 | name for name in csv_file_list if "orderbook" in name 74 | ] # Get the list of all the orderbook files in the input_path directory. 75 | csv_orderbook.sort() # Sort the list of orderbook files. 76 | csv_message = [ 77 | name for name in csv_file_list if "message" in name 78 | ] # Get the list of all the message files in the input_path directory. 
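    # Note: orderbook and message files are expected to come in per-day pairs that share the same
    # file name apart from the "orderbook"/"message" token (standard LOBSTER naming); the
    # assertions below rely on this one-to-one pairing.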
79 | csv_message.sort() # Sort the list of message files. 80 | 81 | # Check if exactly half of the files are order book and exactly half are messages. 82 | assert len(csv_message) == len(csv_orderbook) 83 | assert len(csv_file_list) == len(csv_message) + len(csv_orderbook) 84 | 85 | print(f"Data preprocessing loop started. SCALING: {str(scaling)}.") 86 | 87 | # Initialize the dataframe containing logs. 88 | logs = [] 89 | df_statistics = pd.DataFrame( 90 | [], 91 | columns=[ 92 | "Updates (000)", 93 | "Trades (000)", 94 | "Price Changes (000)", 95 | "Price (USD)", 96 | "Spread (bps)", 97 | "Volume (USD MM)", 98 | "Tick Size", 99 | ], 100 | dtype=float, 101 | ) 102 | 103 | # Initialize dataframes for dynamic Z-score normalization. 104 | mean_df = pd.DataFrame() 105 | mean2_df = pd.DataFrame() 106 | nsamples_df = pd.DataFrame() 107 | 108 | for orderbook_name in csv_orderbook: 109 | print(orderbook_name) 110 | 111 | # Read orderbook files and keep a record of problematic files. 112 | df_orderbook = None 113 | try: 114 | df_orderbook = pd.read_csv(orderbook_name, header=None) 115 | except: 116 | logs.append(f"{orderbook_name} skipped. Error: failed to read orderbook.") 117 | 118 | levels = int( 119 | df_orderbook.shape[1] / 4 120 | ) # Verify that the number of columns is a multiple of 4. 121 | feature_names_raw = [ 122 | "ASKp", 123 | "ASKs", 124 | "BIDp", 125 | "BIDs", 126 | ] # Define sorted raw features' names. 127 | feature_names = [] 128 | for i in range(1, levels + 1): 129 | for j in range(4): 130 | feature_names += [ 131 | feature_names_raw[j] + str(i) 132 | ] # Add to raw features' names the level number. 133 | df_orderbook.columns = ( 134 | feature_names # Rename the columns of the orderbook dataframe. 135 | ) 136 | 137 | # Divide prices by 10000. 138 | target_columns = [col for col in df_orderbook.columns if "ASKp" in col or "BIDp" in col] 139 | df_orderbook[target_columns] = df_orderbook[target_columns].astype(int) # / 10000 140 | 141 | df_orderbook.insert( 142 | 0, "mid_price", (df_orderbook["ASKp1"] + df_orderbook["BIDp1"]) / 2 143 | ) # Add the mid-price column to the orderbook dataframe. 144 | df_orderbook.mid_price = df_orderbook.mid_price.astype(int) 145 | 146 | # Extract the date from the orderbook file's name. 147 | match = re.findall(r"\d{4}-\d{2}-\d{2}", orderbook_name)[-1] 148 | date = datetime.strptime(match, "%Y-%m-%d") 149 | 150 | # Read message files and keep a record of problematic files. 151 | message_name = orderbook_name.replace("orderbook", "message") 152 | df_message = None 153 | try: 154 | df_message = pd.read_csv( 155 | message_name, usecols=[0, 1, 2, 3, 4, 5], header=None 156 | ) 157 | except: 158 | logs.append(f"{message_name} skipped. Error: failed to read message file.") 159 | 160 | # Check the two dataframes created before have the same length. 161 | assert len(df_message) == len(df_orderbook) 162 | 163 | # Rename the columns of the message dataframe. 164 | df_message.columns = [ 165 | "seconds", 166 | "event_type", 167 | "order ID", 168 | "volume", 169 | "price", 170 | "direction", 171 | ] 172 | 173 | # Remove trading halts. 
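        # The block below assumes LOBSTER's trading-halt convention: an event of type 7 with
        # price -1 marks the start of a halt and an event of type 7 with price 1 marks the
        # resumption of trading; all orderbook updates in between are discarded.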
174 | trading_halts_start = df_message[ 175 | (df_message["event_type"] == 7) & (df_message["price"] == -1) 176 | ].index 177 | trading_halts_end = df_message[ 178 | (df_message["event_type"] == 7) & (df_message["price"] == 1) 179 | ].index 180 | trading_halts_index = np.array([]) 181 | for halt_start, halt_end in zip(trading_halts_start, trading_halts_end): 182 | trading_halts_index = np.append( 183 | trading_halts_index, 184 | df_message.index[ 185 | (df_message.index >= halt_start) & (df_message.index < halt_end) 186 | ], 187 | ) 188 | if len(trading_halts_index) > 0: 189 | for halt_start, halt_end in zip(trading_halts_start, trading_halts_end): 190 | logs.append( 191 | f"Warning: trading halt between {str(df_message.loc[halt_start, 'seconds'])} and {str(df_message.loc[halt_end, 'seconds'])} in {orderbook_name}." 192 | ) 193 | df_orderbook = df_orderbook.drop(trading_halts_index) 194 | df_message = df_message.drop(trading_halts_index) 195 | 196 | # Remove crossed quotes. 197 | crossed_quotes_index = df_orderbook[ 198 | (df_orderbook["BIDp1"] > df_orderbook["ASKp1"]) 199 | ].index 200 | if len(crossed_quotes_index) > 0: 201 | logs.append( 202 | f"Warning: {str(len(crossed_quotes_index))} crossed quotes removed in {orderbook_name}." 203 | ) 204 | df_orderbook = df_orderbook.drop(crossed_quotes_index) 205 | df_message = df_message.drop(crossed_quotes_index) 206 | 207 | # Add the 'seconds since midnight' column to the orderbook from the message book. 208 | df_orderbook.insert(0, "seconds", df_message["seconds"]) 209 | 210 | # One conceptual event (e.g. limit order modification which is implemented as a cancellation followed by an immediate new arrival, 211 | # single market order executing against multiple resting limit orders) may appear as multiple rows in the message file, all with 212 | # the same timestamp. We hence group the order book data by unique timestamps and take the last entry. 213 | df_orderbook = df_orderbook.groupby(["seconds"]).tail(1) 214 | df_message = df_message.groupby(["seconds"]).tail(1) 215 | 216 | # Check market opening times for strange values. 217 | market_open = (int(df_orderbook["seconds"].iloc[0] / 60) / 60) # Open at minute before first transaction. 218 | market_close = (int(df_orderbook["seconds"].iloc[-1] / 60) + 1) / 60 # Close at minute after last transaction. 219 | 220 | if not (market_open == 9.5 and market_close == 16): 221 | logs.append( 222 | f"Warning: unusual opening times in {orderbook_name}: {str(market_open)} - {str(market_close)}." 223 | ) 224 | 225 | if time_index == "seconds": 226 | # Drop values outside of market hours using seconds 227 | df_orderbook = df_orderbook.loc[ 228 | (df_orderbook["seconds"] >= 34200) & (df_orderbook["seconds"] <= 57600) 229 | ] 230 | df_message = df_message.loc[ 231 | (df_message["seconds"] >= 34200) & (df_message["seconds"] <= 57600) 232 | ] 233 | 234 | # Drop first and last 10 minutes of trading using seconds. 235 | market_open_seconds = market_open * 60 * 60 + 10 * 60 236 | market_close_seconds = market_close * 60 * 60 - 10 * 60 237 | df_orderbook = df_orderbook.loc[ 238 | (df_orderbook["seconds"] >= market_open_seconds) 239 | & (df_orderbook["seconds"] <= market_close_seconds) 240 | ] 241 | df_message = df_message.loc[ 242 | (df_message["seconds"] >= market_open_seconds) 243 | & (df_message["seconds"] <= market_close_seconds) 244 | ] 245 | else: 246 | raise Exception("time_index must be seconds.") 247 | 248 | # Save statistical information. 
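        # Units note: LOBSTER prices are expressed in units of 10^-4 USD, hence the divisions by
        # 10 ** 4 below to obtain dollar figures; a best bid-ask spread of 100 price units
        # therefore corresponds to a one-cent (single-tick) spread.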
249 | if len(df_orderbook) > 0: 250 | updates = df_orderbook.shape[0] / 1000 251 | trades = ( 252 | np.sum( 253 | (df_message["event_type"] == 4) | (df_message["event_type"] == 5) 254 | ) 255 | / 1000 256 | ) 257 | price_changes = np.sum(~(np.diff(df_orderbook["mid_price"]) == 0.0)) / 1000 258 | time_deltas = np.append( 259 | np.diff(df_orderbook["seconds"]), 260 | market_close_seconds - df_orderbook["seconds"].iloc[-1], 261 | ) 262 | price = np.average(df_orderbook["mid_price"] / 10 ** 4, weights=time_deltas) 263 | spread = np.average( 264 | (df_orderbook["ASKp1"] - df_orderbook["BIDp1"]) 265 | / df_orderbook["mid_price"] 266 | * 10000, 267 | weights=time_deltas, 268 | ) 269 | volume = ( 270 | np.sum( 271 | df_message.loc[ 272 | (df_message["event_type"] == 4) 273 | | (df_message["event_type"] == 5) 274 | ]["volume"] 275 | * df_message.loc[ 276 | (df_message["event_type"] == 4) 277 | | (df_message["event_type"] == 5) 278 | ]["price"] 279 | / 10 ** 4 280 | ) 281 | / 10 ** 6 282 | ) 283 | tick_size = np.average( 284 | (df_orderbook["ASKp1"] - df_orderbook["BIDp1"]) == 100.0, 285 | weights=time_deltas, 286 | ) 287 | 288 | df_statistics.loc[date] = [ 289 | updates, 290 | trades, 291 | price_changes, 292 | price, 293 | spread, 294 | volume, 295 | tick_size, 296 | ] 297 | 298 | if features == "orderbooks": 299 | pass 300 | elif features == "orderflows": 301 | # Compute bid and ask multilevel orderflow. 302 | ASK_prices = df_orderbook.loc[:, df_orderbook.columns.str.contains("ASKp")] 303 | BID_prices = df_orderbook.loc[:, df_orderbook.columns.str.contains("BIDp")] 304 | ASK_sizes = df_orderbook.loc[:, df_orderbook.columns.str.contains("ASKs")] 305 | BID_sizes = df_orderbook.loc[:, df_orderbook.columns.str.contains("BIDs")] 306 | 307 | ASK_price_changes = ASK_prices.diff().dropna().to_numpy() 308 | BID_price_changes = BID_prices.diff().dropna().to_numpy() 309 | ASK_size_changes = ASK_sizes.diff().dropna().to_numpy() 310 | BID_size_changes = BID_sizes.diff().dropna().to_numpy() 311 | 312 | ASK_sizes = ASK_sizes.to_numpy() 313 | BID_sizes = BID_sizes.to_numpy() 314 | 315 | ASK_OF = ( 316 | (ASK_price_changes > 0.0) * (-ASK_sizes[:-1, :]) 317 | + (ASK_price_changes == 0.0) * ASK_size_changes 318 | + (ASK_price_changes < 0) * ASK_sizes[1:, :] 319 | ) 320 | BID_OF = ( 321 | (BID_price_changes < 0.0) * (-BID_sizes[:-1, :]) 322 | + (BID_price_changes == 0.0) * BID_size_changes 323 | + (BID_price_changes > 0) * BID_sizes[1:, :] 324 | ) 325 | 326 | # Remove all price-volume features and add in orderflow. 327 | df_orderbook = df_orderbook.drop(feature_names, axis=1).iloc[1:, :] 328 | mid_seconds_columns = list(df_orderbook.columns) 329 | feature_names_raw = ["ASK_OF", "BID_OF"] 330 | feature_names = [] 331 | for feature_name in feature_names_raw: 332 | for i in range(1, levels + 1): 333 | feature_names += [feature_name + str(i)] 334 | df_orderbook[feature_names] = np.concatenate([ASK_OF, BID_OF], axis=1) 335 | 336 | # Re-order columns. 337 | feature_names_reordered = [[]] * len(feature_names) 338 | feature_names_reordered[::2] = feature_names[:levels] 339 | feature_names_reordered[1::2] = feature_names[levels:] 340 | feature_names = feature_names_reordered 341 | 342 | df_orderbook = df_orderbook[mid_seconds_columns + feature_names] 343 | else: 344 | raise ValueError("Features must be 'orderbooks' or 'orderflows'.") 345 | 346 | # Dynamic z-score normalization. 
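        # The rolling z-score relies on per-day sufficient statistics: each of the previous
        # `normalization_window` days contributes its sample size n_i, feature mean m_i and mean
        # of squares q_i, and the current day is scaled as z = (x - mu) / sigma with
        # mu = sum(n_i * m_i) / sum(n_i) and sigma = sqrt(sum(n_i * q_i) / sum(n_i) - mu^2).
        # This is why only means, means of squares and sample counts are kept in memory below.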
347 | orderbook_mean_df = pd.DataFrame( 348 | df_orderbook[feature_names].mean().values.reshape(-1, len(feature_names)), 349 | columns=feature_names, 350 | ) 351 | orderbook_mean2_df = pd.DataFrame( 352 | (df_orderbook[feature_names] ** 2) 353 | .mean() 354 | .values.reshape(-1, len(feature_names)), 355 | columns=feature_names, 356 | ) 357 | orderbook_nsamples_df = pd.DataFrame( 358 | np.array([[len(df_orderbook)]] * len(feature_names)).T, 359 | columns=feature_names, 360 | ) 361 | 362 | if len(mean_df) < normalization_window: 363 | logs.append( 364 | f"{orderbook_name} skipped. Initializing rolling z-score normalization." 365 | ) 366 | # Don't save the first days as we don't have enough days to normalize. 367 | mean_df = pd.concat([mean_df, orderbook_mean_df], ignore_index=True) 368 | mean2_df = pd.concat([mean2_df, orderbook_mean2_df], ignore_index=True) 369 | nsamples_df = pd.concat( 370 | [nsamples_df, orderbook_nsamples_df], ignore_index=True 371 | ) 372 | continue 373 | else: 374 | z_mean_df = pd.DataFrame( 375 | (nsamples_df * mean_df).sum(axis=0) / nsamples_df.sum(axis=0) 376 | ).T # Dynamically compute mean. 377 | z_stdev_df = pd.DataFrame( 378 | np.sqrt( 379 | (nsamples_df * mean2_df).sum(axis=0) / nsamples_df.sum(axis=0) 380 | - z_mean_df ** 2 381 | ) 382 | ) # Dynamically compute standard deviation. 383 | 384 | # Broadcast to df_orderbook size. 385 | z_mean_df = z_mean_df.loc[z_mean_df.index.repeat(len(df_orderbook))] 386 | z_stdev_df = z_stdev_df.loc[z_stdev_df.index.repeat(len(df_orderbook))] 387 | z_mean_df.index = df_orderbook.index 388 | z_stdev_df.index = df_orderbook.index 389 | if scaling is True: 390 | df_orderbook[feature_names] = (df_orderbook[feature_names] - z_mean_df) / z_stdev_df # Apply normalization. 391 | 392 | # Roll forward by dropping first rows and adding most recent mean and mean2. 393 | mean_df = mean_df.iloc[1:, :] 394 | mean2_df = mean2_df.iloc[1:, :] 395 | nsamples_df = nsamples_df.iloc[1:, :] 396 | 397 | mean_df = pd.concat([mean_df, orderbook_mean_df], ignore_index=True) 398 | mean2_df = pd.concat([mean2_df, orderbook_mean2_df], ignore_index=True) 399 | nsamples_df = pd.concat( 400 | [nsamples_df, orderbook_nsamples_df], ignore_index=True 401 | ) 402 | 403 | # Create labels with simple delta prices. 404 | rolling_mid = df_orderbook["mid_price"] 405 | rolling_mid = rolling_mid.to_numpy().flatten() 406 | for h in horizons: 407 | delta_ticks = rolling_mid[h:] - df_orderbook["mid_price"][:-h] 408 | df_orderbook[f"Raw_Target_{str(h)}"] = delta_ticks 409 | 410 | # Create labels applying smoothing. 411 | for h in horizons: 412 | rolling_mid_minus = df_orderbook['mid_price'].rolling(window=h, min_periods=h).mean().shift(h) 413 | rolling_mid_plus = df_orderbook["mid_price"].rolling(window=h, min_periods=h).mean().to_numpy().flatten() 414 | smooth_pct_change = rolling_mid_plus - rolling_mid_minus 415 | df_orderbook[f"Smooth_Target_{str(h)}"] = smooth_pct_change 416 | 417 | # Drop the mid-price column and transform seconds column into a readable format. 418 | df_orderbook = df_orderbook.drop(["mid_price"], axis=1) 419 | pattern = r"\d{4}-\d{2}-\d{2}" 420 | match = re.search(pattern, orderbook_name) 421 | date_temp = match.group() 422 | df_orderbook.seconds = df_orderbook.apply( 423 | lambda row: get_datetime_from_seconds(row["seconds"], date_temp), axis=1 424 | ) 425 | 426 | # Drop elements which cannot be used for training. 
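        # The raw and smoothed targets computed above are NaN near the day's boundaries, where the
        # shifted/rolling windows fall outside the available data; dropna() below removes those
        # rows, and duplicated timestamps are collapsed onto their last update.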
427 | df_orderbook = df_orderbook.dropna() 428 | df_orderbook.drop_duplicates(inplace=True, keep='last', subset='seconds') 429 | 430 | # Save processed files. 431 | output_name = f"{output_path}/{ticker}_{features}_{str(date.date())}" 432 | df_orderbook.to_csv(f"{output_name}.csv", header=True, index=False) 433 | 434 | logs.append(f"{orderbook_name} completed.") 435 | 436 | print(f"Data preprocessing loop finished. SCALING: {str(scaling)}.") 437 | 438 | with open(f"{logs_path}/{features}_processing_logs.txt", "w") as f: 439 | for log in logs: 440 | f.write(log + "\n") 441 | 442 | print("Please check processing logs.") 443 | 444 | df_statistics.to_csv( 445 | f"{logs_path}/{features}_statistics.csv", header=True, index=False 446 | ) # Save statistics. 447 | 448 | 449 | def get_datetime_from_seconds(seconds_after_midnight, date_str): 450 | # Convert the date_str to a datetime.date object. 451 | dt_date = datetime.strptime(date_str, "%Y-%m-%d").date() 452 | 453 | # Calculate the time component from seconds_after_midnight. 454 | hours = int(seconds_after_midnight // 3600) 455 | minutes = int((seconds_after_midnight % 3600) // 60) 456 | seconds = int(seconds_after_midnight % 60) 457 | microseconds = int( 458 | (seconds_after_midnight % 1) * 1e6 459 | ) # Convert decimal part to microseconds. 460 | 461 | # Create a datetime.time object for the time component. 462 | dt_time = time(hour=hours, minute=minutes, second=seconds, microsecond=microseconds) 463 | 464 | # Combine the date and time to create the datetime.datetime object. 465 | dt_datetime = datetime.combine(dt_date, dt_time) 466 | 467 | return dt_datetime 468 | -------------------------------------------------------------------------------- /data_processing/data_process_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from data_processing import data_process 4 | 5 | 6 | class DataUtils: 7 | def __init__(self, ticker, dataset, experiment_id, horizons, normalization_window): 8 | self.ticker = ticker # Ticker of the stock to be processed. 9 | self.dataset = dataset # Dataset to be used. 10 | self.experiment_id = experiment_id # Experiment ID. 11 | self.horizons = horizons # Horizons to be used when computing labels. 12 | self.normalization_window = normalization_window # Normalization window to be used when normalizing data. 13 | 14 | self.__raw_data_path = None # Path containing the raw LOB data. 15 | self.__processed_data_path_unscaled_data = ( 16 | None # Path containing the processed but unscaled LOB data. 17 | ) 18 | self.__processed_data_path_scaled_data = ( 19 | None # Path containing the processed and scaled LOB data. 20 | ) 21 | self.__logs_path = None # Path containing the logs of the data processing. 22 | 23 | def __set_raw_data_path(self): 24 | # Set the raw data path according to the dataset. 25 | if self.dataset == "nasdaq": 26 | self.__raw_data_path = f"./data/{self.dataset}/raw/{self.ticker}" 27 | 28 | def __set_processed_data_path_unscaled_data(self): 29 | # Set the path containing the processed but unscaled LOB data according to the dataset. 30 | if self.dataset == "nasdaq": 31 | self.__processed_data_path_unscaled_data = ( 32 | f"./data/{self.dataset}/unscaled_data/{self.ticker}" 33 | ) 34 | 35 | def __set_processed_data_path_scaled_data(self): 36 | # Set the path containing the processed and scaled LOB data according to the dataset. 
37 | if self.dataset == "nasdaq": 38 | self.__processed_data_path_scaled_data = ( 39 | f"./data/{self.dataset}/scaled_data/{self.ticker}" 40 | ) 41 | 42 | def __set_logs_path(self): 43 | # Set the path containing the logs of the data processing according to the experiment ID. 44 | self.__logs_path = ( 45 | f"./loggers/results/{self.experiment_id}/data_processing_logs" 46 | ) 47 | 48 | def generate_data_folders(self): 49 | self.__set_raw_data_path() # Set the raw data path. 50 | self.__set_processed_data_path_unscaled_data() # Set the path containing the processed but unscaled LOB data. 51 | self.__set_processed_data_path_scaled_data() # Set the path containing the processed and scaled LOB data. 52 | self.__set_logs_path() # Set the path containing the logs of the data processing. 53 | 54 | # Create the folder for the processed but unscaled LOB data if it does not exist. 55 | if not os.path.exists(self.__processed_data_path_unscaled_data): 56 | os.makedirs(self.__processed_data_path_unscaled_data) 57 | 58 | # Create the folder for the processed and scaled LOB data if it does not exist. 59 | if not os.path.exists(self.__processed_data_path_scaled_data): 60 | os.makedirs(self.__processed_data_path_scaled_data) 61 | 62 | # Create the folder for the logs of the data processing if it does not exist. 63 | if not os.path.exists(self.__logs_path): 64 | os.makedirs(self.__logs_path) 65 | 66 | # Process the data to obtain scaled and unscaled data. 67 | def process_data(self): 68 | data_process.process_data( 69 | ticker=self.ticker, 70 | input_path=self.__raw_data_path, 71 | output_path=self.__processed_data_path_unscaled_data, 72 | logs_path=self.__logs_path, 73 | horizons=self.horizons, 74 | normalization_window=self.normalization_window, 75 | scaling=False, 76 | ) 77 | 78 | data_process.process_data( 79 | ticker=self.ticker, 80 | input_path=self.__raw_data_path, 81 | output_path=self.__processed_data_path_scaled_data, 82 | logs_path=self.__logs_path, 83 | horizons=self.horizons, 84 | normalization_window=self.normalization_window, 85 | scaling=True, 86 | ) 87 | -------------------------------------------------------------------------------- /loaders/custom_dataset.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import random 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from torch.utils.data import Dataset, DataLoader 8 | import polars as pl 9 | import tqdm 10 | import matplotlib.pyplot as plt 11 | 12 | from utils import detect_changing_points 13 | 14 | 15 | class CustomDataset(Dataset): 16 | def __init__( 17 | self, 18 | dataset, 19 | learning_stage, 20 | window_size, 21 | shuffling_seed, 22 | cache_size, 23 | lighten, 24 | threshold, 25 | all_horizons, 26 | prediction_horizon, 27 | targets_type, 28 | balanced_dataloader=False, 29 | backtest=False, 30 | training_stocks=None, 31 | validation_stocks=None, 32 | target_stocks=None 33 | ): 34 | self.learning_stage = learning_stage # The current learning stage (training, validation or testing). 35 | self.shuffling_seed = ( 36 | shuffling_seed # The seed for the random shuffling of the datasets. 37 | ) 38 | self.balanced_dataloader = balanced_dataloader # Whether to use a balanced dataloader or not. This option is available only for training. 
39 | self.backtest = backtest 40 | self.targets_type = targets_type 41 | 42 | if self.learning_stage == "training": 43 | file_patterns = [f"./data/{dataset}/scaled_data/{self.learning_stage}/{element}_orderbooks*.csv" for element in training_stocks] 44 | self.csv_files = [] 45 | for pattern in file_patterns: 46 | self.csv_files.extend(glob.glob(pattern.format(dataset=dataset, self=self))) 47 | 48 | random.seed(self.shuffling_seed) 49 | random.shuffle(self.csv_files) 50 | else: 51 | # During the validation and testing stages it is fundamental to read the datasets in chronological order. 52 | if self.learning_stage == 'validation': 53 | file_patterns = [f"./data/{dataset}/scaled_data/{self.learning_stage}/{element}_orderbooks*.csv" for element in validation_stocks] 54 | else: 55 | file_patterns = [f"./data/{dataset}/scaled_data/{self.learning_stage}/{element}_orderbooks*.csv" for element in target_stocks] 56 | 57 | self.csv_files = [] 58 | for pattern in file_patterns: 59 | self.csv_files.extend(glob.glob(pattern.format(dataset=dataset, self=self))) 60 | self.csv_files = sorted(self.csv_files) 61 | 62 | self.window_size = window_size # The number of time steps in each window. 63 | self.lighten = lighten # Whether to use the light version of the dataset. 64 | self.threshold = threshold # The threshold for the classification task. 65 | self.prediction_horizon = ( 66 | prediction_horizon # The prediction horizon for the classification task. 67 | ) 68 | self.all_horizons = ( 69 | all_horizons # List of all the possible prediction horizons. 70 | ) 71 | 72 | self.cumulative_lengths = [0] # Store cumulative lengths of datasets. 73 | self.cache_size = cache_size # The number of datasets to cache in memory. 74 | self.cache_data = [ 75 | None 76 | ] * self.cache_size # Initialize a cache with empty slots. 77 | self.cache_indices = [ 78 | None 79 | ] * self.cache_size # Initialize the indices for the cache. 80 | self.current_cache_index = -1 81 | 82 | self.glob_indices = [] 83 | 84 | if self.balanced_dataloader: 85 | print(f"BALANCED dataset construction...") 86 | else: 87 | print(f"UNBALANCED dataset construction...") 88 | for csv_file in tqdm.tqdm(self.csv_files): 89 | df = pd.read_csv(csv_file) 90 | self.cumulative_lengths.append( 91 | self.cumulative_lengths[-1] + len(df) - window_size 92 | ) # Store the lengths of all datasets. 93 | 94 | # If requested by the user, during the training stage, we create balanced dataloaders. 95 | if self.learning_stage == "training" and self.balanced_dataloader is True: 96 | temp_labels = ( 97 | [] 98 | ) # This is a temporary variable which stores the (discretized) labels (i.e. classes) for each sample in each input dataset. 99 | 100 | if self.targets_type == "raw": 101 | labels = df.iloc[:-window_size, :][ 102 | f"Raw_Target_{self.prediction_horizon}" 103 | ] # Extract the raw, continuous labels (i.e. returns) from the current dataset. 104 | else: 105 | labels = df.iloc[:-window_size, :][ 106 | f"Smooth_Target_{self.prediction_horizon}" 107 | ] # Extract the raw, continuous labels (i.e. returns) from the current dataset. 108 | 109 | # For each file, we must know the corresponding index. This is the reason why we access the cumulative lengths list. 110 | for label, index in zip( 111 | labels, 112 | range(self.cumulative_lengths[-2], self.cumulative_lengths[-1]), 113 | ): 114 | # The discretization is performed using the provided threshold. Temporary labels are tuples of the form (class, index). 
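                    # Illustrative example (the threshold shares the units of the chosen target,
                    # i.e. mid-price changes): with threshold = 1.0, a label of +2.5 maps to
                    # class 2 (upward move), -2.5 maps to class 0 (downward move), and 0.3 maps
                    # to class 1 (stationary).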
115 | if label > self.threshold: 116 | temp_labels.append((2, index)) 117 | elif label < -self.threshold: 118 | temp_labels.append((0, index)) 119 | else: 120 | temp_labels.append((1, index)) 121 | 122 | # Group data by class representatives 123 | class_groups = {} 124 | for item in temp_labels: 125 | ( 126 | class_representative, 127 | index, 128 | ) = item # Unpack the tuple (class, index). 129 | 130 | # Understand what is the un-cumulative index of each sample. 131 | corresponding_cumulative_length = detect_changing_points( 132 | index, self.cumulative_lengths 133 | ) 134 | if corresponding_cumulative_length is not None: 135 | # If the current sample does not belong to the first dataset, we must subtract the cumulative length of the previous dataset. 136 | temp_index = index - corresponding_cumulative_length 137 | else: 138 | # If the current sample belongs to the first dataset, we do not need to subtract anything. 139 | temp_index = index 140 | 141 | # Even having a balanced dataloader, labels would be messed up once computing models' inputs. 142 | # Indeed, given an index 'i', the input rows are the ones from 'i' to 'i + window_size' and the label to be used is the one at 'i + window_size'. 143 | # Therefore, we must subtract the window size from the index of each sample. 144 | if temp_index >= self.window_size: 145 | if class_representative in class_groups: 146 | class_groups[class_representative].append( 147 | index - self.window_size 148 | ) 149 | else: 150 | class_groups[class_representative] = [ 151 | index - self.window_size 152 | ] 153 | else: 154 | pass 155 | 156 | # Determine the desired number of samples per class (pseudo-balanced). We use the size of the less represented class. 157 | min_samples_class = min( 158 | len(indices) for indices in class_groups.values() 159 | ) 160 | if min_samples_class > 5000: 161 | min_samples_class = 5000 162 | 163 | # We randomly select indices from each class to create the subsample. 164 | subsample_indices = [] 165 | for class_representative, indices in class_groups.items(): 166 | random.seed(self.shuffling_seed) 167 | subsample_indices.extend(random.sample(indices, min_samples_class)) 168 | 169 | # We store the chosen indices in the 'global_indices_list'. 170 | random.seed(self.shuffling_seed) 171 | random.shuffle(subsample_indices) 172 | self.glob_indices.extend(subsample_indices) 173 | 174 | # If requested by the user, during the training stage, we use all the available samples distributed across input datasets. 175 | if self.learning_stage == "training" and self.balanced_dataloader is False: 176 | temp_labels = ( 177 | [] 178 | ) # This is a temporary variable which stores the (discretized) labels (i.e. classes) for each sample in each input dataset. 179 | 180 | if self.targets_type == "raw": 181 | labels = df.iloc[:-window_size, :][ 182 | f"Raw_Target_{self.prediction_horizon}" 183 | ] # Extract the raw, continuous labels (i.e. returns) from the current dataset. 184 | else: 185 | labels = df.iloc[:-window_size, :][ 186 | f"Smooth_Target_{self.prediction_horizon}" 187 | ] # Extract the raw, continuous labels (i.e. returns) from the current dataset. 188 | 189 | # For each file, we must know the corresponding index. This is the reason why we access the cumulative lengths list. 190 | for label, index in zip( 191 | labels, 192 | range(self.cumulative_lengths[-2], self.cumulative_lengths[-1]), 193 | ): 194 | # The discretization is performed using the provided threshold. Temporary labels are tuples of the form (class, index). 
195 | if label > self.threshold: 196 | temp_labels.append((2, index)) 197 | elif label < -self.threshold: 198 | temp_labels.append((0, index)) 199 | else: 200 | temp_labels.append((1, index)) 201 | 202 | # Group data by class representatives 203 | class_groups = {} 204 | for item in temp_labels: 205 | ( 206 | class_representative, 207 | index, 208 | ) = item # Unpack the tuple (class, index). 209 | 210 | # Understand what is the un-cumulative index of each sample. 211 | corresponding_cumulative_length = detect_changing_points( 212 | index, self.cumulative_lengths 213 | ) 214 | if corresponding_cumulative_length is not None: 215 | # If the current sample does not belong to the first dataset, we must subtract the cumulative length of the previous dataset. 216 | temp_index = index - corresponding_cumulative_length 217 | else: 218 | # If the current sample belongs to the first dataset, we do not need to subtract anything. 219 | temp_index = index 220 | 221 | # Even having a balanced dataloader, labels would be messed up once computing models' inputs. 222 | # Indeed, given an index 'i', the input rows are the ones from 'i' to 'i + window_size' and the label to be used is the one at 'i + window_size'. 223 | # Therefore, we must subtract the window size from the index of each sample. 224 | if temp_index >= self.window_size: 225 | if class_representative in class_groups: 226 | class_groups[class_representative].append( 227 | index - self.window_size 228 | ) 229 | else: 230 | class_groups[class_representative] = [ 231 | index - self.window_size 232 | ] 233 | else: 234 | pass 235 | 236 | # We randomly select indices from each class to create the subsample. 237 | subsample_indices = [] 238 | for class_representative, indices in class_groups.items(): 239 | random.seed(self.shuffling_seed) 240 | subsample_indices.extend(random.sample(indices, int(len(indices) * 0.1))) 241 | 242 | # We store the chosen indices in the 'global_indices_list'. 243 | random.seed(self.shuffling_seed) 244 | random.shuffle(subsample_indices) 245 | self.glob_indices.extend(subsample_indices) 246 | 247 | def __len__(self): 248 | # This is the cumulative length of all input datasets. 249 | return self.cumulative_lengths[-1] 250 | 251 | def cache_dataset(self, dataset_index): 252 | if self.current_cache_index >= 0: 253 | # Remove the least recently used cache entry 254 | self.cache_data[self.current_cache_index] = None 255 | self.cache_indices[self.current_cache_index] = None 256 | 257 | # Select a random cache slot for the new dataset 258 | self.current_cache_index = random.randint(0, self.cache_size - 1) 259 | 260 | # Cache the data from the CSV file 261 | df = pl.read_csv(self.csv_files[dataset_index]).to_pandas() 262 | 263 | self.cache_data[self.current_cache_index] = df.values[:, 1:].astype(np.float32) 264 | self.cache_indices[self.current_cache_index] = dataset_index 265 | 266 | def __getitem__(self, index): 267 | try: 268 | dataset_index = 0 269 | while index >= self.cumulative_lengths[dataset_index + 1]: 270 | dataset_index += 1 271 | 272 | if self.cache_indices[self.current_cache_index] != dataset_index: 273 | # Cache the dataset if it's not already cached. 274 | self.cache_dataset(dataset_index) 275 | 276 | # Retrieve the un-cumulative index of the current sample. 277 | start_index = ( 278 | index 279 | if dataset_index == 0 280 | else index - self.cumulative_lengths[dataset_index] 281 | ) 282 | 283 | if self.lighten: 284 | # If the "lighten" option is enabled, we use only the first 5 levels of the orderbook (i.e. 
4_level_features * 5_levels = 20_orderbook_features). 285 | window_data = self.cache_data[self.current_cache_index][ 286 | start_index: start_index + self.window_size, :20 287 | ] 288 | else: 289 | # If the "lighten" option is not enabled, we use all the 10 levels of the orderbook (i.e. 4_level_features * 10_levels = 40_orderbook_features). 290 | window_data = self.cache_data[self.current_cache_index][ 291 | start_index: start_index + self.window_size, :40 292 | ] 293 | 294 | # Determine the position of the prediction horizon in the list of all horizons. 295 | position = next( 296 | ( 297 | index 298 | for index, value in enumerate(self.all_horizons) 299 | if value == self.prediction_horizon 300 | ), 301 | None, 302 | ) 303 | # Extract the label from the dataset given its position. 304 | label = self.cache_data[self.current_cache_index][ 305 | start_index + self.window_size, 40: 306 | ][position] 307 | # Discretize the label using the provided threshold. 308 | if self.backtest is False: 309 | if label > self.threshold: 310 | label = 2 311 | elif label < -self.threshold: 312 | label = 0 313 | else: 314 | label = 1 315 | 316 | return torch.tensor(window_data).unsqueeze(0), torch.tensor(label) 317 | except Exception as e: 318 | print(f"Exception in DataLoader worker: {e}") 319 | raise e 320 | 321 | 322 | ''' 323 | if __name__ == "__main__": 324 | # Create dataset and DataLoader with random shuffling 325 | dataset = CustomDataset( 326 | dataset="nasdaq", 327 | learning_stage="training", 328 | window_size=100, 329 | shuffling_seed=42, 330 | cache_size=1, 331 | lighten=True, 332 | threshold=32, 333 | targets_type="raw", 334 | all_horizons=[5, 10, 30, 50, 100], 335 | prediction_horizon=100, 336 | balanced_dataloader=False, 337 | training_stocks=["CHTR"], 338 | validation_stocks=["CHTR"], 339 | target_stocks=["CHTR"] 340 | ) 341 | 342 | dataloader = DataLoader( 343 | dataset, batch_size=32, shuffle=False, num_workers=8, drop_last=True, sampler=dataset.glob_indices 344 | ) 345 | 346 | print(len(dataloader)) 347 | 348 | complete_list = [] 349 | # Example usage of the DataLoader 350 | for batch_data, batch_labels in dataloader: 351 | # Train your model using batch_data and batch_labels 352 | # print(batch_labels.tolist()) 353 | complete_list.extend(batch_labels.tolist()) 354 | #print(batch_data.shape, batch_labels.shape) 355 | 356 | plt.hist(complete_list) 357 | plt.show() 358 | ''' -------------------------------------------------------------------------------- /loggers/analysis.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import yaml 4 | 5 | import pandas as pd 6 | import numpy as np 7 | from sklearn.metrics import * 8 | from scipy.stats import skew, kurtosis 9 | from tqdm import tqdm 10 | 11 | import matplotlib.pyplot as plt 12 | from matplotlib import rcParams, cycler 13 | import matplotlib.pyplot as plt 14 | from matplotlib.ticker import AutoMinorLocator 15 | 16 | import warnings 17 | import multiprocessing 18 | import pickle 19 | import gzip 20 | 21 | def find_substrings_in_string(string_list, main_string): 22 | return [s for s in string_list if s in main_string] 23 | 24 | def calculate_log_returns(series, step=50): 25 | return np.log(series / series.shift(step)).dropna().reset_index(drop=True) 26 | 27 | def optimized_rolling_diff(series, window_size): 28 | return series.rolling(window=window_size).apply(lambda x: x.iloc[-1] - x.iloc[0], raw=False).shift(-(window_size - 1)) 29 | 30 | def process_file(f): 31 | df = 
pd.read_csv(f) 32 | 33 | best_ask_price = df.ASKp1 / 10000 34 | best_bid_price = df.BIDp1 / 10000 35 | local_mids = (best_ask_price + best_bid_price) / 2 36 | local_spreads = best_ask_price - best_bid_price 37 | volatility_10 = np.std(calculate_log_returns(local_mids, 10)) 38 | volatility_50 = np.std(calculate_log_returns(local_mids, 50)) 39 | volatility_100 = np.std(calculate_log_returns(local_mids, 100)) 40 | levels_ask_side = ((df.ASKp10 / 10000 - df.ASKp1 / 10000) / 0.01).tolist() 41 | levels_bid_side = ((df.BIDp1 / 10000 - df.BIDp10 / 10000) / 0.01).tolist() 42 | df['seconds'] = pd.to_datetime(df['seconds']) 43 | secs = df['seconds'].astype(int) / 10**9 44 | 45 | seconds_in_horizon_10 = optimized_rolling_diff(secs, 10).dropna().tolist() 46 | seconds_in_horizon_50 = optimized_rolling_diff(secs, 50).dropna().tolist() 47 | seconds_in_horizon_100 = optimized_rolling_diff(secs, 100).dropna().tolist() 48 | 49 | print(f"Finished {f}.") 50 | return { 51 | 'Mids': local_mids.tolist(), 52 | 'Spreads': local_spreads.tolist(), 53 | 'Best_Ask_Volume': df.ASKs1.tolist(), 54 | 'Best_Bid_Volume': df.BIDs1.tolist(), 55 | 'Volatility_10': [volatility_10], 56 | 'Volatility_50': [volatility_50], 57 | 'Volatility_100': [volatility_100], 58 | 'Levels_Ask_Side': levels_ask_side, 59 | 'Levels_Bid_Side': levels_bid_side, 60 | 'Seconds_Horizon_10': seconds_in_horizon_10, 61 | 'Seconds_Horizon_50': seconds_in_horizon_50, 62 | 'Seconds_Horizon_100': seconds_in_horizon_100 63 | } 64 | 65 | def process_stock_files(file_list): 66 | stock_data = { 67 | 'Mids': [], 'Spreads': [], 'Best_Ask_Volume': [], 'Best_Bid_Volume': [], 68 | 'Volatility_10': [], 'Volatility_50': [], 'Volatility_100': [], 69 | 'Levels_Ask_Side': [], 'Levels_Bid_Side': [], 'Seconds_Horizon_10': [], 70 | 'Seconds_Horizon_50': [], 'Seconds_Horizon_100': [] 71 | } 72 | for f in file_list: 73 | file_data = process_file(f) 74 | for key in stock_data: 75 | stock_data[key].extend(file_data[key]) 76 | return stock_data 77 | 78 | def process_stock(s): 79 | files = sorted(glob.glob(f"../data/nasdaq/unscaled_data/{s}/*")) 80 | num_workers = 10 81 | 82 | # Splitting files into chunks for each process 83 | file_chunks = np.array_split(files, num_workers) 84 | 85 | with multiprocessing.Pool(num_workers) as pool: 86 | chunk_results = pool.map(process_stock_files, file_chunks) 87 | 88 | # Aggregating results from all chunks 89 | stock_data = { 90 | 'Mids': [], 'Spreads': [], 'Best_Ask_Volume': [], 'Best_Bid_Volume': [], 91 | 'Volatility_10': [], 'Volatility_50': [], 'Volatility_100': [], 92 | 'Levels_Ask_Side': [], 'Levels_Bid_Side': [], 'Seconds_Horizon_10': [], 93 | 'Seconds_Horizon_50': [], 'Seconds_Horizon_100': [] 94 | } 95 | for chunk in chunk_results: 96 | for key in stock_data: 97 | stock_data[key].extend(chunk[key]) 98 | 99 | return s, stock_data 100 | 101 | if __name__ == "__main__": 102 | stocks = ["BAC", "CHTR", "CSCO", "GOOG", "GS", "IBM", "MCD", "NVDA", "ORCL", "PFE", "PM", "VZ"] #"ABBV", "KO", "AAPL", 103 | 104 | for s in stocks: 105 | stock_dictionary = {} 106 | try: 107 | s, stock_data = process_stock(s) 108 | stock_dictionary[s] = stock_data 109 | print(f"Completed processing for stock: {s}") 110 | except Exception as e: 111 | print(f"Error processing stock: {s} with error {e}") 112 | 113 | with open(f'../statistical_analysis/{s}.pkl', 'wb') as f: 114 | pickle.dump(stock_dictionary, f, protocol=-1) -------------------------------------------------------------------------------- /loggers/logger.py: 
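A minimal usage sketch of the logging helpers defined below (illustrative model and ticker values only; it assumes the script is launched from the repository root so that sys.path[0] resolves to it):

import torch  # not required; shown only if tensors are logged after conversion to plain Python types
from loggers import logger

# Create a fresh experiment folder under loggers/results/ and append a metrics block to its data.yaml.
experiment_id = logger.generate_id(name="deeplob", target_stock="CHTR")
logger.logger(experiment_id, header="validation_metrics", contents={"f1": 0.61, "epoch": 3})
print(logger.find_save_path(experiment_id))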
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | 5 | import yaml 6 | import random 7 | import string 8 | 9 | 10 | def generate_id(name, target_stock): 11 | """ 12 | Generate a unique experiment identifier based on the input `name` and the current timestamp in the format "YYYY-MM-DD_HH_MM_SS". 13 | Create a directory path using this identifier within the 'loggers/results' directory relative to the script's location, and if 14 | it doesn't exist, create it. 15 | 16 | :param name: name of the DL model to be used in the experiment, (str). 17 | :return: experiment_id: unique experiment identifier, (str). 18 | """ 19 | random_string_part = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(7)) 20 | init_time = datetime.now().strftime("%Y-%m-%d_%H_%M_%S") 21 | experiment_id = f"{target_stock}_{name}_{init_time}_{random_string_part}" 22 | 23 | root_path = sys.path[0] 24 | dir_path = f"{root_path}/loggers/results/{experiment_id}" 25 | if not os.path.exists(dir_path): 26 | os.makedirs(dir_path) 27 | 28 | return experiment_id 29 | 30 | 31 | def find_save_path(model_id): 32 | """ 33 | Find the directory path for saving results associated with a given `model_id`. This function constructs a directory path within the 34 | 'loggers/results' directory relative to the script's location. 35 | 36 | :param model_id: model identifier, (str). 37 | :return: directory path, (str). 38 | """ 39 | root_path = sys.path[0] 40 | dir_path = f"{root_path}/loggers/results/{model_id}" 41 | return dir_path 42 | 43 | 44 | def logger(experiment_id, header, contents): 45 | """ 46 | Log experimental results in a YAML file associated with the given `model_id`. If the file already exists, it appends new data to it; 47 | otherwise, it creates a new file. 48 | 49 | :param experiment_id: model identifier, (str). 50 | :param header: header for the data being logged, (str). 51 | :param contents: data to be logged, provided as a dictionary, (dict). 52 | """ 53 | root_path = sys.path[0] 54 | file_path = f"{root_path}/loggers/results/{experiment_id}/data.yaml" 55 | 56 | contents = {header: contents} 57 | 58 | if os.path.exists(file_path): 59 | with open(file_path, "r") as yamlfile: 60 | current_yaml = yaml.safe_load(yamlfile) 61 | current_yaml.update(contents) 62 | else: 63 | current_yaml = contents 64 | with open(file_path, "w") as yamlfile: 65 | yaml.dump(current_yaml, yamlfile) 66 | 67 | 68 | def read_log(model_id, header): 69 | """ 70 | Read and retrieve data from a log file associated with the given `model_id`. 71 | 72 | :param model_id: Model identifier, (str). 73 | :param header: Header of the data to retrieve from the log, (str). 74 | :return: The data associated with the specified header from the log, (any type depending on data stored). 
75 | """ 76 | root_path = sys.path[0] 77 | file_path = f"{root_path}/loggers/results/{model_id}/log.yaml" 78 | 79 | with open(file_path, "r") as yamlfile: 80 | log = yaml.safe_load(yamlfile) 81 | 82 | return log[header] 83 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from data_processing import data_process_utils 2 | from loggers import logger 3 | from optimizers.executor import Executor 4 | from simulator import market_sim, post_trading_analysis 5 | from utils import ( 6 | data_split, 7 | load_yaml, 8 | save_dataset_info, 9 | parse_args, 10 | create_hyperparameters_yaml, 11 | ) 12 | from data_processing.complete_homological_utils import get_complete_homology 13 | 14 | if __name__ == "__main__": 15 | # Parse input arguments. 16 | args = parse_args() 17 | wb_error_detection = False 18 | 19 | if args.experiment_id is None: 20 | # If no experiment ID is passed, generate a new one. 21 | experiment_id = logger.generate_id(args.model, args.target_stocks) 22 | # Create a new configuration file containing the hyperparameters. 23 | create_hyperparameters_yaml(experiment_id, args) 24 | else: 25 | # If an experiment ID is passed, use it. 26 | experiment_id = args.experiment_id 27 | # Replace the hyperparameters file with the new arguments passed as input. 28 | # create_hyperparameters_yaml(experiment_id, args) 29 | 30 | # Load the configuration file containing the hyperparameters. 31 | hyperparameters_path = ( 32 | f"{logger.find_save_path(experiment_id)}/hyperparameters.yaml" 33 | ) 34 | 35 | # Load the configuration file (general hyperparameters). 36 | general_hyperparameters = load_yaml(hyperparameters_path, "general") 37 | # Load the configuration file (model's hyperparameters). 38 | model_hyperparameters = load_yaml(hyperparameters_path, "model") 39 | # Load the configuration file (trading hyperparameters). 40 | trading_hyperparameters = load_yaml(hyperparameters_path, "trading") 41 | 42 | if args.experiment_id is not None: 43 | general_hyperparameters['stages'] = args.stages.split(",") 44 | 45 | # Handle the data processing stage. 46 | if "data_processing" in general_hyperparameters["stages"]: 47 | # Make the list of training stocks a set to avoid duplicates. 48 | training_stocks = set(general_hyperparameters["training_stocks"]) 49 | # Make the list of target stocks a set to avoid duplicates. 50 | target_stocks = set(general_hyperparameters["target_stocks"]) 51 | # Iterate over stocks after performing the union of sets operation (a stock can occur both in training_stocks and target_stocks). 52 | for stock in list(training_stocks.union(target_stocks)): 53 | data_utils = data_process_utils.DataUtils( 54 | ticker=stock, 55 | dataset=general_hyperparameters["dataset"], 56 | experiment_id=experiment_id, 57 | horizons=general_hyperparameters["horizons"], 58 | normalization_window=general_hyperparameters["normalization_window"], 59 | ) 60 | # Generate the data folders. 61 | data_utils.generate_data_folders() 62 | # Transform the data. 63 | data_utils.process_data() 64 | # Split the data into training, validation and test sets. 
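# Note: data_split receives both the training and target stock lists together with the split
# ratios; whether the target stocks' files also enter the training set is controlled by the
# include_target_stock_in_training flag.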
65 | data_split( 66 | dataset=general_hyperparameters["dataset"], 67 | training_stocks=general_hyperparameters["training_stocks"], 68 | target_stock=general_hyperparameters["target_stocks"], 69 | training_ratio=general_hyperparameters["training_ratio"], 70 | validation_ratio=general_hyperparameters["validation_ratio"], 71 | include_target_stock_in_training=general_hyperparameters[ 72 | "include_target_stock_in_training" 73 | ], 74 | ) 75 | 76 | # Instantiate the executor as None. 77 | executor = None 78 | # For 'torch_dataset_preparation' stage, instantiate the executor with proper arguments. 79 | if "torch_dataset_preparation" in general_hyperparameters["stages"]: 80 | executor = Executor( 81 | experiment_id, general_hyperparameters, model_hyperparameters, torch_dataset_preparation=True 82 | ) 83 | 84 | if "torch_dataset_preparation_backtest" in general_hyperparameters["stages"]: 85 | executor = Executor( 86 | experiment_id, general_hyperparameters, model_hyperparameters, torch_dataset_preparation=False, torch_dataset_preparation_backtest=True 87 | ) 88 | 89 | if "complete_homological_structures_preparation" in general_hyperparameters["stages"]: 90 | get_complete_homology(general_hyperparameters=general_hyperparameters, model_hyperparameters=model_hyperparameters) 91 | 92 | # For the 'training' and 'evaluation' stages, instantiate the executor with proper arguments. 93 | if ( 94 | "training" in general_hyperparameters["stages"] 95 | or "evaluation" in general_hyperparameters["stages"] 96 | ): 97 | executor = Executor( 98 | experiment_id, general_hyperparameters, model_hyperparameters 99 | ) 100 | 101 | if "training" in general_hyperparameters["stages"]: 102 | try: 103 | # Keep track of the files used in the training, validation and test sets. 104 | save_dataset_info( 105 | experiment_id=experiment_id, 106 | general_hyperparameters=general_hyperparameters, 107 | ) 108 | # Train the model. 109 | executor.execute_training() 110 | # Clean up the experiment folder from wandb logging files. 111 | executor.logger_clean_up() 112 | except: 113 | print('Exception detected') 114 | wb_error_detection = True 115 | 116 | if "evaluation" in general_hyperparameters["stages"] and wb_error_detection is False: 117 | # Out-of-sample test of the model. 118 | executor.execute_testing() 119 | # Clean up the experiment folder from wandb logging files. 120 | executor.logger_clean_up() 121 | 122 | if "backtest" in general_hyperparameters["stages"]: 123 | # Backtest the model. 124 | market_sim.backtest( 125 | experiment_id=experiment_id, trading_hyperparameters=trading_hyperparameters 126 | ) 127 | 128 | if "post_trading_analysis" in general_hyperparameters["stages"]: 129 | # Perform a post-trading analysis. 
130 | post_trading_analysis.post_trading_analysis( 131 | experiment_id=experiment_id, general_hyperparameters=general_hyperparameters, trading_hyperparameters=trading_hyperparameters, model_hyperparameters=model_hyperparameters 132 | ) 133 | -------------------------------------------------------------------------------- /models/AxialLob/axiallob.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pytorch_lightning as pl 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | def _conv1d1x1(in_channels, out_channels): 8 | return nn.Sequential(nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=1, bias=False), 9 | nn.BatchNorm1d(out_channels)) 10 | 11 | 12 | class GatedAxialAttention(pl.LightningModule): 13 | def __init__(self, in_channels, out_channels, heads, dim, flag): 14 | assert (in_channels % heads == 0) and (out_channels % heads == 0) 15 | super().__init__() 16 | 17 | self.in_channels = in_channels 18 | self.out_channels = out_channels 19 | self.heads = heads 20 | self.dim_head_v = out_channels // heads 21 | self.flag = flag # if flag then we do the attention along width 22 | self.dim = dim 23 | self.dim_head_qk = self.dim_head_v // 2 24 | self.qkv_channels = self.dim_head_v + self.dim_head_qk * 2 25 | 26 | # Multi-head self attention 27 | self.to_qkv = _conv1d1x1(in_channels, self.heads * self.qkv_channels) 28 | self.bn_qkv = nn.BatchNorm1d(self.heads * self.qkv_channels) 29 | self.bn_similarity = nn.BatchNorm2d(heads * 3) 30 | self.bn_output = nn.BatchNorm1d(self.heads * self.qkv_channels) 31 | 32 | # Gating mechanism 33 | self.f_qr = nn.Parameter(torch.tensor(0.1), requires_grad=True) 34 | self.f_kr = nn.Parameter(torch.tensor(0.1), requires_grad=True) 35 | self.f_sve = nn.Parameter(torch.tensor(0.1), requires_grad=True) 36 | self.f_sv = nn.Parameter(torch.tensor(0.5), requires_grad=True) 37 | 38 | # Position embedding 39 | self.relative = nn.Parameter(torch.randn(self.dim_head_v * 2, dim * 2 - 1), requires_grad=True) 40 | query_index = torch.arange(dim).unsqueeze(0) 41 | key_index = torch.arange(dim).unsqueeze(1) 42 | relative_index = key_index - query_index + dim - 1 43 | self.register_buffer('flatten_index', relative_index.view(-1)) 44 | 45 | self.reset_parameters() 46 | # self.print_para() 47 | 48 | def forward(self, x): 49 | 50 | if self.flag: 51 | x = x.permute(0, 2, 1, 3) 52 | else: 53 | x = x.permute(0, 3, 1, 2) # n_instances, W, C, H 54 | N, W, C, H = x.shape 55 | x = x.contiguous().view(N * W, C, H) 56 | 57 | # Transformations 58 | x = self.to_qkv(x) 59 | 60 | qkv = self.bn_qkv(x) 61 | q, k, v = torch.split(qkv.reshape(N * W, self.heads, self.dim_head_v * 2, H), 62 | [self.dim_head_v // 2, self.dim_head_v // 2, self.dim_head_v], dim=2) 63 | 64 | # Calculate position embedding 65 | all_embeddings = torch.index_select(self.relative, 1, self.flatten_index).view(self.dim_head_v * 2, self.dim, 66 | self.dim) 67 | q_embedding, k_embedding, v_embedding = torch.split(all_embeddings, 68 | [self.dim_head_qk, self.dim_head_qk, self.dim_head_v], 69 | dim=0) 70 | qr = torch.einsum('bgci,cij->bgij', q, q_embedding) 71 | kr = torch.einsum('bgci,cij->bgij', k, k_embedding).transpose(2, 3) 72 | qk = torch.einsum('bgci, bgcj->bgij', q, k) 73 | 74 | # multiply by factors 75 | qr = torch.mul(qr, self.f_qr) 76 | kr = torch.mul(kr, self.f_kr) 77 | 78 | stacked_similarity = torch.cat([qk, qr, kr], dim=1) 79 | stacked_similarity = self.bn_similarity(stacked_similarity).view(N * W, 3, self.heads, H, H).sum(dim=1) 80 | # 
stacked_similarity = self.bn_qr(qr) + self.bn_kr(kr) + self.bn_qk(qk) 81 | # (n_instances, heads, H, H, W) 82 | similarity = torch.softmax(stacked_similarity, dim=3) 83 | sv = torch.einsum('bgij,bgcj->bgci', similarity, v) 84 | sve = torch.einsum('bgij,cij->bgci', similarity, v_embedding) 85 | 86 | # multiply by factors 87 | sv = torch.mul(sv, self.f_sv) 88 | sve = torch.mul(sve, self.f_sve) 89 | 90 | stacked_output = torch.cat([sv, sve], dim=-1).view(N * W, self.out_channels * 2, H) 91 | output = self.bn_output(stacked_output).view(N, W, self.out_channels, 2, H).sum(dim=-2) 92 | 93 | if self.flag: 94 | output = output.permute(0, 2, 1, 3) 95 | else: 96 | output = output.permute(0, 2, 3, 1) 97 | 98 | return output 99 | 100 | def reset_parameters(self): 101 | nn.init.normal_(self.relative, 0., math.sqrt(1. / self.dim_head_v)) 102 | 103 | 104 | class AxialLOB(nn.Module): 105 | def __init__(self, W=40, H=100, c_in=32, c_out=32, c_final=4, n_heads=4, pool_kernel=(1, 4), pool_stride=(1, 4)): 106 | super().__init__() 107 | 108 | # channel output of the CNN_in is the channel input for the axial layer 109 | 110 | self.c_in = c_in 111 | self.c_out = c_out 112 | self.c_final = c_final 113 | 114 | self.CNN_in = nn.Conv2d(in_channels=1, out_channels=c_in, kernel_size=1) 115 | self.CNN_out = nn.Conv2d(in_channels=c_out, out_channels=c_final, kernel_size=1) 116 | self.CNN_res2 = nn.Conv2d(in_channels=c_out, out_channels=c_final, kernel_size=1) 117 | self.CNN_res1 = nn.Conv2d(in_channels=1, out_channels=c_out, kernel_size=1) 118 | 119 | self.norm = nn.BatchNorm2d(c_in) 120 | self.res_norm2 = nn.BatchNorm2d(c_final) 121 | self.res_norm1 = nn.BatchNorm2d(c_out) 122 | self.norm2 = nn.BatchNorm2d(c_final) 123 | 124 | self.axial_height_1 = GatedAxialAttention(c_out, c_out, n_heads, H, flag=False) 125 | self.axial_width_1 = GatedAxialAttention(c_out, c_out, n_heads, W, flag=True) 126 | 127 | self.axial_height_2 = GatedAxialAttention(c_out, c_out, n_heads, H, flag=False) 128 | self.axial_width_2 = GatedAxialAttention(c_out, c_out, n_heads, W, flag=True) 129 | 130 | self.activation = nn.ReLU() 131 | self.linear = nn.Linear(4000, 3) 132 | self.pooling = nn.AvgPool2d(kernel_size=pool_kernel, stride=pool_stride) 133 | 134 | def forward(self, x): 135 | # first convolution before the attention 136 | y = self.CNN_in(x) 137 | y = self.norm(y) 138 | y = self.activation(y) 139 | 140 | # attention mechanism through gated multi head axial layer 141 | y = self.axial_width_1(y) 142 | y = self.axial_height_1(y) 143 | 144 | # lower branch 145 | x = self.CNN_res1(x) 146 | x = self.res_norm1(x) 147 | x = self.activation(x) 148 | 149 | # first residual 150 | y = y + x 151 | z = y.detach().clone() 152 | 153 | # second axial layer 154 | y = self.axial_width_2(y) 155 | y = self.axial_height_2(y) 156 | 157 | # second convolution 158 | y = self.CNN_out(y) 159 | y = self.res_norm2(y) 160 | y = self.activation(y) 161 | 162 | # lower branch 163 | z = self.CNN_res2(z) 164 | z = self.norm2(z) 165 | z = self.activation(z) 166 | 167 | # second res connection 168 | y = y + z 169 | 170 | # final part 171 | y = self.pooling(y) 172 | y = torch.flatten(y, 1) 173 | y = self.linear(y) 174 | 175 | return y -------------------------------------------------------------------------------- /models/CNN1/cnn1.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | from torch import nn 3 | 4 | 5 | class CNN1(pl.LightningModule): 6 | def __init__(self, num_features=40, num_classes=3, 
temp=26): 7 | super().__init__() 8 | 9 | # Convolution 1 10 | self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(4, num_features), padding=(3, 0), dilation=(2, 1)) 11 | self.relu1 = nn.LeakyReLU() 12 | 13 | # Convolution 2 14 | self.conv2 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=(4,)) 15 | self.relu2 = nn.LeakyReLU() 16 | 17 | # Max pool 1 18 | self.maxpool1 = nn.MaxPool1d(kernel_size=2) 19 | 20 | # Convolution 3 21 | self.conv3 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=(3,), padding=2) 22 | self.relu3 = nn.LeakyReLU() 23 | 24 | # Convolution 4 25 | self.conv4 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=(3,), padding=2) 26 | self.relu4 = nn.LeakyReLU() 27 | 28 | # Max pool 2 29 | self.maxpool2 = nn.MaxPool1d(kernel_size=2) 30 | 31 | # Fully connected 1 32 | self.fc1 = nn.Linear(temp*32, 32) 33 | self.relu5 = nn.LeakyReLU() 34 | 35 | # Fully connected 2 36 | self.fc2 = nn.Linear(32, num_classes) 37 | 38 | def forward(self, x): 39 | # Convolution 1 40 | out = self.conv1(x) 41 | out = self.relu1(out) 42 | out = out.reshape(out.shape[0], out.shape[1], -1) 43 | # print('After convolution1:', out.shape) 44 | 45 | # Convolution 2 46 | out = self.conv2(out) 47 | out = self.relu2(out) 48 | # print('After convolution2:', out.shape) 49 | 50 | # Max pool 1 51 | out = self.maxpool1(out) 52 | # print('After maxpool1:', out.shape) 53 | 54 | # Convolution 3 55 | out = self.conv3(out) 56 | out = self.relu3(out) 57 | # print('After convolution3:', out.shape) 58 | 59 | # Convolution 4 60 | out = self.conv4(out) 61 | out = self.relu4(out) 62 | # print('After convolution4:', out.shape) 63 | 64 | # Max pool 2 65 | out = self.maxpool2(out) 66 | # print('After maxcpool2:', out.shape) 67 | 68 | # flatten 69 | out = out.view(out.size(0), -1) 70 | # print('After flatten:', out.shape) 71 | 72 | # Linear function 1 73 | out = self.fc1(out) 74 | out = self.relu5(out) 75 | # print('After linear1:', out.shape) 76 | 77 | # Linear function (readout) 78 | out = self.fc2(out) 79 | # print('After linear2:', out.shape) 80 | 81 | return out -------------------------------------------------------------------------------- /models/CNN2/cnn2.py: -------------------------------------------------------------------------------- 1 | # Using Deep Learning for price prediction by exploiting stationary limit order book features 2 | # Source: https://www.sciencedirect.com/science/article/pii/S1568494620303410 3 | 4 | import pytorch_lightning as pl 5 | from torch import nn 6 | 7 | 8 | class CNN2(pl.LightningModule): 9 | def __init__(self, num_features=40, num_classes=3, temp=249): 10 | super().__init__() 11 | 12 | # Convolution 1 13 | self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(10, 42), padding=(0, 2)) 14 | self.bn1 = nn.BatchNorm2d(16) 15 | self.prelu1 = nn.PReLU() 16 | 17 | # Convolution 2 18 | self.conv2 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=(10,)) # 3 19 | self.bn2 = nn.BatchNorm1d(16) 20 | self.prelu2 = nn.PReLU() 21 | 22 | # Convolution 3 23 | self.conv3 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=(8,)) # 1 24 | self.bn3 = nn.BatchNorm1d(32) 25 | self.prelu3 = nn.PReLU() 26 | 27 | # Convolution 4 28 | self.conv4 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=(6,)) # 1 29 | self.bn4 = nn.BatchNorm1d(32) 30 | self.prelu4 = nn.PReLU() 31 | 32 | # Convolution 5 33 | self.conv5 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=(4,)) # 1 34 | self.bn5 = nn.BatchNorm1d(32) 35 | self.prelu5 = nn.PReLU() 36 | 37 
| # Fully connected 1 38 | self.fc1 = nn.Linear(temp*32, 32) 39 | self.prelu6 = nn.PReLU() 40 | 41 | # Fully connected 2 42 | self.fc2 = nn.Linear(32, num_classes) 43 | 44 | def forward(self, x): 45 | # Convolution 1 46 | out = self.conv1(x) 47 | # print('After convolution1:', out.shape) 48 | 49 | out = self.bn1(out) 50 | # print('After bn1:', out.shape) 51 | 52 | out = self.prelu1(out) 53 | out = out.reshape(out.shape[0], out.shape[1], -1) 54 | # print('After prelu1:', out.shape) 55 | 56 | # Convolution 2 57 | out = self.conv2(out) 58 | out = self.bn2(out) 59 | out = self.prelu2(out) 60 | # print('After convolution2, bn2, prelu2:', out.shape) 61 | 62 | # Convolution 3 63 | out = self.conv3(out) 64 | out = self.bn3(out) 65 | out = self.prelu3(out) 66 | # print('After convolution3, bn3, prelu3:', out.shape) 67 | 68 | # Convolution 4 69 | out = self.conv4(out) 70 | out = self.bn4(out) 71 | out = self.prelu4(out) 72 | # print('After convolution4, bn4, prelu4:', out.shape) 73 | 74 | # Convolution 5 75 | out = self.conv5(out) 76 | out = self.bn5(out) 77 | out = self.prelu5(out) 78 | # print('After convolution5, bn5, prelu5:', out.shape) 79 | 80 | # flatten 81 | out = out.view(out.size(0), -1) 82 | # print('After flatten:', out.shape) 83 | 84 | # Linear function 1 85 | out = self.fc1(out) 86 | out = self.prelu6(out) 87 | # print('After fc1:', out.shape) 88 | 89 | # Linear function (readout) 90 | out = self.fc2(out) 91 | # print('After fc2:', out.shape) 92 | 93 | return out -------------------------------------------------------------------------------- /models/CompleteHCNN/complete_hcnn.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class Complete_HCNN(pl.LightningModule): 7 | def __init__(self, lighten, homological_structures): 8 | super().__init__() 9 | self.name = "hcnn" 10 | if lighten: 11 | self.name += "-lighten" 12 | 13 | self.homological_structures = homological_structures 14 | self.tetrahedra = self.homological_structures['tetrahedra'] 15 | self.triangles = self.homological_structures['triangles'] 16 | self.edges = self.homological_structures['edges'] 17 | 18 | # ------------ # 19 | 20 | self.conv1_tetrahedra = nn.Sequential( 21 | nn.Conv2d( 22 | in_channels=1, out_channels=32, kernel_size=(1, 2), stride=(1, 2) 23 | ), 24 | nn.ReLU(), 25 | ) 26 | 27 | self.conv1_triangles = nn.Sequential( 28 | nn.Conv2d( 29 | in_channels=1, out_channels=32, kernel_size=(1, 2), stride=(1, 2) 30 | ), 31 | nn.ReLU(), 32 | ) 33 | 34 | self.conv1_edges = nn.Sequential( 35 | nn.Conv2d( 36 | in_channels=1, out_channels=32, kernel_size=(1, 2), stride=(1, 2) 37 | ), 38 | nn.ReLU(), 39 | ) 40 | 41 | # ------------ # 42 | 43 | self.conv2_tetrahedra = nn.Sequential( 44 | nn.Conv2d( 45 | in_channels=32, out_channels=32, kernel_size=(1, 4), stride=(1, 4) 46 | ), 47 | nn.ReLU(), 48 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)), 49 | nn.ReLU(), 50 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)), 51 | nn.ReLU(), 52 | ) 53 | 54 | self.conv2_triangles = nn.Sequential( 55 | nn.Conv2d( 56 | in_channels=32, out_channels=32, kernel_size=(1, 3), stride=(1, 3) 57 | ), 58 | nn.ReLU(), 59 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)), 60 | nn.ReLU(), 61 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)), 62 | nn.ReLU(), 63 | ) 64 | 65 | self.conv2_edges = nn.Sequential( 66 | nn.Conv2d( 67 | in_channels=32, out_channels=32, 
kernel_size=(1, 2), stride=(1, 2) 68 | ), 69 | nn.ReLU(), 70 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)), 71 | nn.ReLU(), 72 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)), 73 | nn.ReLU(), 74 | ) 75 | 76 | # ------------ # 77 | 78 | self.conv3_tetrahedra = nn.Sequential( 79 | nn.Conv2d( 80 | in_channels=32, out_channels=32, kernel_size=(1, int(len(self.tetrahedra) / 8)) 81 | ), 82 | nn.Dropout(0.35), 83 | nn.ReLU(), 84 | ) 85 | 86 | self.conv3_triangles = nn.Sequential( 87 | nn.Conv2d( 88 | in_channels=32, out_channels=32, kernel_size=(1, int(len(self.triangles) / 6)) 89 | ), 90 | nn.Dropout(0.35), 91 | nn.ReLU(), 92 | ) 93 | 94 | self.conv3_edges = nn.Sequential( 95 | nn.Conv2d( 96 | in_channels=32, out_channels=32, kernel_size=(1, int(len(self.edges) / 4)) 97 | ), 98 | nn.Dropout(0.35), 99 | nn.ReLU(), 100 | ) 101 | 102 | # ------------ # 103 | 104 | self.lstm = nn.LSTM( 105 | input_size=96, hidden_size=32, num_layers=1, batch_first=True 106 | ) 107 | self.fc1 = nn.Linear(32, 3) 108 | 109 | def forward(self, x): 110 | x_tetrahedra = x[:, :, :, self.tetrahedra] 111 | x_triangles = x[:, :, :, self.triangles] 112 | x_edges = x[:, :, :, self.edges] 113 | 114 | x_tetrahedra = self.conv1_tetrahedra(x_tetrahedra) 115 | x_triangles = self.conv1_triangles(x_triangles) 116 | x_edges = self.conv1_edges(x_edges) 117 | 118 | x_tetrahedra = self.conv2_tetrahedra(x_tetrahedra) 119 | x_triangles = self.conv2_triangles(x_triangles) 120 | x_edges = self.conv2_edges(x_edges) 121 | 122 | x_tetrahedra = self.conv3_tetrahedra(x_tetrahedra) 123 | x_triangles = self.conv3_triangles(x_triangles) 124 | x_edges = self.conv3_edges(x_edges) 125 | 126 | x = torch.cat((x_tetrahedra, x_triangles, x_edges), dim=1) 127 | 128 | x = x.permute(0, 2, 1, 3) 129 | x = torch.reshape(x, (-1, x.shape[1], x.shape[2])) 130 | 131 | x, _ = self.lstm(x) 132 | x = x[:, -1, :] 133 | logits = self.fc1(x) 134 | 135 | return logits 136 | -------------------------------------------------------------------------------- /models/DLA/DLA.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | from torch import nn 3 | import torch 4 | 5 | 6 | class DLA(pl.LightningModule): 7 | def __init__(self, lighten, num_snapshots=100, hidden_size=128): 8 | super().__init__() 9 | self.name = "mlp" 10 | num_features = 40 11 | if lighten: 12 | self.name += "-lighten" 13 | num_features = 20 14 | 15 | self.W1 = nn.Linear(num_features, num_features, bias=False) 16 | 17 | self.softmax = nn.Softmax(dim=1) 18 | 19 | self.gru = nn.GRU( 20 | input_size=num_features, 21 | hidden_size=hidden_size, 22 | num_layers=2, 23 | batch_first=True, 24 | dropout=0.5 25 | ) 26 | 27 | self.W2 = nn.Linear(hidden_size, hidden_size, bias=False) 28 | self.W3 = nn.Linear(num_snapshots*hidden_size, 3) 29 | 30 | def forward(self, x): 31 | # x.shape = [batch_size, num_snapshots, num_features] 32 | x = x.squeeze(1) 33 | 34 | X_tilde = self.W1(x) 35 | # alpha.shape = [batch_size, num_snapshots, num_features] 36 | 37 | alpha = self.softmax(X_tilde) 38 | # alpha.shape = [batch_size, num_snapshots, num_features] 39 | 40 | alpha = torch.mean(alpha, dim=2) 41 | # alpha.shape = [batch_size, num_snapshots] 42 | 43 | x_tilde = torch.einsum('ij,ijk->ijk', [alpha, x]) 44 | # x_tilde.shape = [batch_size, num_snapshots, num_features] 45 | 46 | H, _ = self.gru(x_tilde) 47 | # o.shape = [batch_size, num_snapshots, hidden_size] 48 | 49 | H_tilde = self.W2(H) 50 | # o.shape = [batch_size, num_snapshots, 
hidden_size] 51 | 52 | beta = self.softmax(H_tilde) 53 | # o.shape = [batch_size, num_snapshots, hidden_size] 54 | 55 | beta = torch.mean(beta, dim=2) 56 | # beta.shape = [batch_size, num_snapshots] 57 | 58 | h_tilde = torch.einsum('ij,ijk->ijk', [beta, H]) 59 | # h_tilde.shape = [batch_size, num_snapshots, hidden_size] 60 | 61 | h_tilde = torch.flatten(h_tilde, start_dim=1) 62 | # h_tilde.shape = [batch_size, hidden_size*num_snapshots] 63 | 64 | logits = self.W3(h_tilde) 65 | # out.shape = [batch_size, 3] 66 | 67 | return logits -------------------------------------------------------------------------------- /models/DeepLob/deeplob.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class DeepLOB(pl.LightningModule): 7 | def __init__(self, lighten): 8 | super().__init__() 9 | self.name = "deeplob" 10 | if lighten: 11 | self.name += "-lighten" 12 | 13 | # Convolution blocks. 14 | self.conv1 = nn.Sequential( 15 | nn.Conv2d( 16 | in_channels=1, out_channels=32, kernel_size=(1, 2), stride=(1, 2) 17 | ), 18 | nn.LeakyReLU(negative_slope=0.01), 19 | nn.BatchNorm2d(32), 20 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)), 21 | nn.LeakyReLU(negative_slope=0.01), 22 | nn.BatchNorm2d(32), 23 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)), 24 | nn.LeakyReLU(negative_slope=0.01), 25 | nn.BatchNorm2d(32), 26 | ) 27 | self.conv2 = nn.Sequential( 28 | nn.Conv2d( 29 | in_channels=32, out_channels=32, kernel_size=(1, 2), stride=(1, 2) 30 | ), 31 | nn.LeakyReLU(negative_slope=0.01), 32 | nn.BatchNorm2d(32), 33 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)), 34 | nn.LeakyReLU(negative_slope=0.01), 35 | nn.BatchNorm2d(32), 36 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)), 37 | nn.LeakyReLU(negative_slope=0.01), 38 | nn.BatchNorm2d(32), 39 | ) 40 | 41 | if lighten: 42 | conv3_kernel_size = 5 43 | else: 44 | conv3_kernel_size = 10 45 | 46 | self.conv3 = nn.Sequential( 47 | nn.Conv2d( 48 | in_channels=32, out_channels=32, kernel_size=(1, conv3_kernel_size) 49 | ), 50 | nn.LeakyReLU(negative_slope=0.01), 51 | nn.BatchNorm2d(32), 52 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)), 53 | nn.LeakyReLU(negative_slope=0.01), 54 | nn.BatchNorm2d(32), 55 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)), 56 | nn.LeakyReLU(negative_slope=0.01), 57 | nn.BatchNorm2d(32), 58 | ) 59 | 60 | # Inception modules. 
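# Three parallel inception branches follow: (1x1 -> 3x1 conv), (1x1 -> 5x1 conv) and
# (max-pool -> 1x1 conv), each emitting 64 channels; their concatenation yields the
# 192 features per time step that feed the LSTM defined further below.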
61 | self.inp1 = nn.Sequential( 62 | nn.Conv2d( 63 | in_channels=32, out_channels=64, kernel_size=(1, 1), padding="same" 64 | ), 65 | nn.LeakyReLU(negative_slope=0.01), 66 | nn.BatchNorm2d(64), 67 | nn.Conv2d( 68 | in_channels=64, out_channels=64, kernel_size=(3, 1), padding="same" 69 | ), 70 | nn.LeakyReLU(negative_slope=0.01), 71 | nn.BatchNorm2d(64), 72 | ) 73 | self.inp2 = nn.Sequential( 74 | nn.Conv2d( 75 | in_channels=32, out_channels=64, kernel_size=(1, 1), padding="same" 76 | ), 77 | nn.LeakyReLU(negative_slope=0.01), 78 | nn.BatchNorm2d(64), 79 | nn.Conv2d( 80 | in_channels=64, out_channels=64, kernel_size=(5, 1), padding="same" 81 | ), 82 | nn.LeakyReLU(negative_slope=0.01), 83 | nn.BatchNorm2d(64), 84 | ) 85 | self.inp3 = nn.Sequential( 86 | nn.MaxPool2d((3, 1), stride=(1, 1), padding=(1, 0)), 87 | nn.Conv2d( 88 | in_channels=32, out_channels=64, kernel_size=(1, 1), padding="same" 89 | ), 90 | nn.LeakyReLU(negative_slope=0.01), 91 | nn.BatchNorm2d(64), 92 | ) 93 | 94 | # lstm layers 95 | self.lstm = nn.LSTM( 96 | input_size=192, hidden_size=64, num_layers=1, batch_first=True 97 | ) 98 | self.fc1 = nn.Linear(64, 3) 99 | 100 | def forward(self, x): 101 | x = self.conv1(x) 102 | x = self.conv2(x) 103 | x = self.conv3(x) 104 | 105 | x_inp1 = self.inp1(x) 106 | x_inp2 = self.inp2(x) 107 | x_inp3 = self.inp3(x) 108 | 109 | x = torch.cat((x_inp1, x_inp2, x_inp3), dim=1) 110 | 111 | x = x.permute(0, 2, 1, 3) 112 | x = torch.reshape(x, (-1, x.shape[1], x.shape[2])) 113 | 114 | x, _ = self.lstm(x) 115 | x = x[:, -1, :] 116 | logits = self.fc1(x) 117 | 118 | return logits 119 | -------------------------------------------------------------------------------- /models/LobTransformer/lobtransformer.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class LobTransformer(pl.LightningModule): 7 | def __init__(self, lighten): 8 | super().__init__() 9 | self.name = "lobtransformer" 10 | if lighten: 11 | self.name += "-lighten" 12 | 13 | hidden = 32 if not lighten else 16 14 | d_model = hidden * 2 * 3 15 | nhead = 8 if not lighten else 4 16 | num_layers = 2 if not lighten else 1 17 | 18 | # Convolution blocks. 
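# Same convolutional feature extractor as DeepLOB, but parameterized by `hidden`
# (16 when lighten, 32 otherwise); each of the three inception branches below outputs
# hidden * 2 channels, hence d_model = hidden * 2 * 3 for the transformer encoder.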
19 | self.conv1 = nn.Sequential( 20 | nn.Conv2d( 21 | in_channels=1, out_channels=hidden, kernel_size=(1, 2), stride=(1, 2) 22 | ), 23 | nn.LeakyReLU(negative_slope=0.01), 24 | nn.BatchNorm2d(hidden), 25 | nn.Conv2d(in_channels=hidden, out_channels=hidden, kernel_size=(4, 1)), 26 | nn.LeakyReLU(negative_slope=0.01), 27 | nn.BatchNorm2d(hidden), 28 | nn.Conv2d(in_channels=hidden, out_channels=hidden, kernel_size=(4, 1)), 29 | nn.LeakyReLU(negative_slope=0.01), 30 | nn.BatchNorm2d(hidden), 31 | ) 32 | self.conv2 = nn.Sequential( 33 | nn.Conv2d( 34 | in_channels=hidden, out_channels=hidden, kernel_size=(1, 2), stride=(1, 2) 35 | ), 36 | nn.LeakyReLU(negative_slope=0.01), 37 | nn.BatchNorm2d(hidden), 38 | nn.Conv2d(in_channels=hidden, out_channels=hidden, kernel_size=(4, 1)), 39 | nn.LeakyReLU(negative_slope=0.01), 40 | nn.BatchNorm2d(hidden), 41 | nn.Conv2d(in_channels=hidden, out_channels=hidden, kernel_size=(4, 1)), 42 | nn.LeakyReLU(negative_slope=0.01), 43 | nn.BatchNorm2d(hidden), 44 | ) 45 | 46 | if lighten: 47 | conv3_kernel_size = 5 48 | else: 49 | conv3_kernel_size = 10 50 | 51 | self.conv3 = nn.Sequential( 52 | nn.Conv2d( 53 | in_channels=hidden, out_channels=hidden, kernel_size=(1, conv3_kernel_size) 54 | ), 55 | nn.LeakyReLU(negative_slope=0.01), 56 | nn.BatchNorm2d(hidden), 57 | nn.Conv2d(in_channels=hidden, out_channels=hidden, kernel_size=(4, 1)), 58 | nn.LeakyReLU(negative_slope=0.01), 59 | nn.BatchNorm2d(hidden), 60 | nn.Conv2d(in_channels=hidden, out_channels=hidden, kernel_size=(4, 1)), 61 | nn.LeakyReLU(negative_slope=0.01), 62 | nn.BatchNorm2d(hidden), 63 | ) 64 | 65 | # Inception modules. 66 | self.inp1 = nn.Sequential( 67 | nn.Conv2d( 68 | in_channels=hidden, out_channels=hidden*2, kernel_size=(1, 1), padding="same" 69 | ), 70 | nn.LeakyReLU(negative_slope=0.01), 71 | nn.BatchNorm2d(hidden*2), 72 | nn.Conv2d( 73 | in_channels=hidden*2, out_channels=hidden*2, kernel_size=(3, 1), padding="same" 74 | ), 75 | nn.LeakyReLU(negative_slope=0.01), 76 | nn.BatchNorm2d(hidden*2), 77 | ) 78 | self.inp2 = nn.Sequential( 79 | nn.Conv2d( 80 | in_channels=hidden, out_channels=hidden*2, kernel_size=(1, 1), padding="same" 81 | ), 82 | nn.LeakyReLU(negative_slope=0.01), 83 | nn.BatchNorm2d(hidden*2), 84 | nn.Conv2d( 85 | in_channels=hidden*2, out_channels=hidden*2, kernel_size=(5, 1), padding="same" 86 | ), 87 | nn.LeakyReLU(negative_slope=0.01), 88 | nn.BatchNorm2d(hidden*2), 89 | ) 90 | self.inp3 = nn.Sequential( 91 | nn.MaxPool2d((3, 1), stride=(1, 1), padding=(1, 0)), 92 | nn.Conv2d( 93 | in_channels=hidden, out_channels=hidden*2, kernel_size=(1, 1), padding="same" 94 | ), 95 | nn.LeakyReLU(negative_slope=0.01), 96 | nn.BatchNorm2d(hidden*2), 97 | ) 98 | 99 | # transformer 100 | encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True) 101 | self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers) 102 | self.cat_head = nn.Linear(d_model, 3) 103 | 104 | 105 | def forward(self, x): 106 | x = self.conv1(x) 107 | x = self.conv2(x) 108 | x = self.conv3(x) 109 | 110 | x_inp1 = self.inp1(x) 111 | x_inp2 = self.inp2(x) 112 | x_inp3 = self.inp3(x) 113 | 114 | x = torch.cat((x_inp1, x_inp2, x_inp3), dim=1) 115 | 116 | x = x.permute(0, 2, 1, 3) 117 | x = torch.reshape(x, (-1, x.shape[1], x.shape[2])) 118 | 119 | x = self.transformer_encoder(x) 120 | # mean pool 121 | x = torch.mean(x, dim=1) 122 | 123 | logits = self.cat_head(x) 124 | return logits 125 | 
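Both DeepLOB and LobTransformer consume order book windows shaped (batch, 1, window_size, num_features), with window_size = 100 and num_features = 40 (20 in the lighten configuration), and emit three logits per sample. A minimal shape sanity check on random data, assuming the repository root is on PYTHONPATH, might look like this:

import torch

from models.DeepLob.deeplob import DeepLOB
from models.LobTransformer.lobtransformer import LobTransformer

# Dummy batch of 4 windows: 100 order book snapshots x 40 features (10 levels x 4 level features).
x = torch.randn(4, 1, 100, 40)

for model in (DeepLOB(lighten=False), LobTransformer(lighten=False)):
    model.eval()
    with torch.no_grad():
        logits = model(x)
    assert logits.shape == (4, 3)  # one logit per class: down / stationary / up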
-------------------------------------------------------------------------------- /models/TABL/bin_nn.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class BiN(pl.LightningModule): 7 | def __init__(self, d2, d1, t1, t2): 8 | super().__init__() 9 | self.t1 = t1 10 | self.d1 = d1 11 | self.t2 = t2 12 | self.d2 = d2 13 | 14 | bias1 = torch.Tensor(t1, 1) 15 | self.B1 = nn.Parameter(bias1) 16 | nn.init.constant_(self.B1, 0) 17 | 18 | l1 = torch.Tensor(t1, 1) 19 | self.l1 = nn.Parameter(l1) 20 | nn.init.xavier_normal_(self.l1) 21 | 22 | bias2 = torch.Tensor(d1, 1) 23 | self.B2 = nn.Parameter(bias2) 24 | nn.init.constant_(self.B2, 0) 25 | 26 | l2 = torch.Tensor(d1, 1) 27 | self.l2 = nn.Parameter(l2) 28 | nn.init.xavier_normal_(self.l2) 29 | 30 | y1 = torch.Tensor(1, ) 31 | self.y1 = nn.Parameter(y1) 32 | nn.init.constant_(self.y1, 0.5) 33 | 34 | y2 = torch.Tensor(1, ) 35 | self.y2 = nn.Parameter(y2) 36 | nn.init.constant_(self.y2, 0.5) 37 | 38 | def forward(self, x): 39 | 40 | # if the two scalars are negative then we setting them to 0 41 | if (self.y1[0] < 0): 42 | y1 = torch.cuda.FloatTensor(1, ) 43 | self.y1 = nn.Parameter(y1) 44 | nn.init.constant_(self.y1, 0.01) 45 | 46 | if (self.y2[0] < 0): 47 | y2 = torch.cuda.FloatTensor(1, ) 48 | self.y2 = nn.Parameter(y2) 49 | nn.init.constant_(self.y2, 0.01) 50 | 51 | # normalization along the temporal dimensione 52 | T2 = torch.ones([self.t1, 1], device="cuda") 53 | x2 = torch.mean(x, dim=2) 54 | x2 = torch.reshape(x2, (x2.shape[0], x2.shape[1], 1)) 55 | 56 | std = torch.std(x, dim=2) 57 | std = torch.reshape(std, (std.shape[0], std.shape[1], 1)) 58 | # it can be possible that the std of some temporal slices is 0, and this produces inf values, so we have to set them to one 59 | std[std < 1e-4] = 1 60 | 61 | diff = x - (x2 @ (T2.T)) 62 | Z2 = diff / (std @ (T2.T)) 63 | 64 | X2 = self.l2 @ T2.T 65 | X2 = X2 * Z2 66 | X2 = X2 + (self.B2 @ T2.T) 67 | 68 | # normalization along the feature dimension 69 | T1 = torch.ones([self.d1, 1], device="cuda") 70 | x1 = torch.mean(x, dim=1) 71 | x1 = torch.reshape(x1, (x1.shape[0], x1.shape[1], 1)) 72 | 73 | std = torch.std(x, dim=1) 74 | std = torch.reshape(std, (std.shape[0], std.shape[1], 1)) 75 | 76 | op1 = x1 @ T1.T 77 | op1 = torch.permute(op1, (0, 2, 1)) 78 | 79 | op2 = std @ T1.T 80 | op2 = torch.permute(op2, (0, 2, 1)) 81 | 82 | z1 = (x - op1) / (op2) 83 | X1 = (T1 @ self.l1.T) 84 | X1 = X1 * z1 85 | X1 = X1 + (T1 @ self.B1.T) 86 | 87 | # weighing the imporance of temporal and feature normalization 88 | x = self.y1 * X1 + self.y2 * X2 89 | 90 | return x -------------------------------------------------------------------------------- /models/TABL/bin_tabl.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | from torch import nn 3 | import torch 4 | from models.TABL.bin_nn import BiN 5 | from models.TABL.bl_layer import BL_layer 6 | from models.TABL.tabl_layer import TABL_layer 7 | 8 | 9 | class BiN_BTABL(pl.LightningModule): 10 | def __init__(self, d2, d1, t1, t2, d3, t3): 11 | super().__init__() 12 | 13 | self.BiN = BiN(d2, d1, t1, t2) 14 | self.BL = BL_layer(d2, d1, t1, t2) 15 | self.TABL = TABL_layer(d3, d2, t2, t3) 16 | self.dropout = nn.Dropout(0.1) 17 | 18 | def forward(self, x): 19 | x = x.squeeze(1) 20 | # first of all we pass the input to the BiN layer, then we use the B(TABL) architecture 21 | x = 
torch.permute(x, (0, 2, 1)) 22 | 23 | x = self.BiN(x) 24 | 25 | self.max_norm_(self.BL.W1.data) 26 | self.max_norm_(self.BL.W2.data) 27 | x = self.BL(x) 28 | x = self.dropout(x) 29 | 30 | self.max_norm_(self.TABL.W1.data) 31 | self.max_norm_(self.TABL.W.data) 32 | self.max_norm_(self.TABL.W2.data) 33 | x = self.TABL(x) 34 | x = torch.squeeze(x, 2) 35 | return x 36 | 37 | def max_norm_(self, w): 38 | with torch.no_grad(): 39 | if (torch.linalg.matrix_norm(w) > 10.0): 40 | norm = torch.linalg.matrix_norm(w) 41 | desired = torch.clamp(norm, min=0.0, max=10.0) 42 | w *= (desired / (1e-8 + norm)) 43 | 44 | 45 | class BiN_CTABL(pl.LightningModule): 46 | def __init__(self, d2, d1, t1, t2, d3, t3, d4, t4): 47 | super().__init__() 48 | 49 | self.BiN = BiN(d2, d1, t1, t2) 50 | self.BL = BL_layer(d2, d1, t1, t2) 51 | self.BL2 = BL_layer(d3, d2, t2, t3) 52 | self.TABL = TABL_layer(d4, d3, t3, t4) 53 | self.dropout = nn.Dropout(0.1) 54 | 55 | def forward(self, x): 56 | x = x.squeeze(1) 57 | # first of all we pass the input to the BiN layer, then we use the C(TABL) architecture 58 | x = torch.permute(x, (0, 2, 1)) 59 | 60 | x = self.BiN(x) 61 | 62 | self.max_norm_(self.BL.W1.data) 63 | self.max_norm_(self.BL.W2.data) 64 | x = self.BL(x) 65 | x = self.dropout(x) 66 | 67 | self.max_norm_(self.BL2.W1.data) 68 | self.max_norm_(self.BL2.W2.data) 69 | x = self.BL2(x) 70 | x = self.dropout(x) 71 | 72 | self.max_norm_(self.TABL.W1.data) 73 | self.max_norm_(self.TABL.W.data) 74 | self.max_norm_(self.TABL.W2.data) 75 | x = self.TABL(x) 76 | x = torch.squeeze(x, 2) 77 | return x 78 | 79 | def max_norm_(self, w): 80 | with torch.no_grad(): 81 | if (torch.linalg.matrix_norm(w) > 10.0): 82 | norm = torch.linalg.matrix_norm(w) 83 | desired = torch.clamp(norm, min=0.0, max=10.0) 84 | w *= (desired / (1e-8 + norm)) -------------------------------------------------------------------------------- /models/TABL/bl_layer.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | from torch import nn 3 | import torch 4 | 5 | 6 | class BL_layer(pl.LightningModule): 7 | def __init__(self, d2, d1, t1, t2): 8 | super().__init__() 9 | weight1 = torch.Tensor(d2, d1) 10 | self.W1 = nn.Parameter(weight1) 11 | nn.init.kaiming_uniform_(self.W1, nonlinearity='relu') 12 | 13 | weight2 = torch.Tensor(t1, t2) 14 | self.W2 = nn.Parameter(weight2) 15 | nn.init.kaiming_uniform_(self.W2, nonlinearity='relu') 16 | 17 | bias1 = torch.zeros((d2, t2)) 18 | self.B = nn.Parameter(bias1) 19 | nn.init.constant_(self.B, 0) 20 | 21 | self.activation = nn.ReLU() 22 | 23 | def forward(self, x): 24 | 25 | x = self.activation(self.W1 @ x @ self.W2 + self.B) 26 | 27 | return x -------------------------------------------------------------------------------- /models/TABL/tabl_layer.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | import torch 3 | from torch import nn 4 | 5 | 6 | class TABL_layer(pl.LightningModule): 7 | def __init__(self, d2, d1, t1, t2): 8 | super().__init__() 9 | self.t1 = t1 10 | 11 | weight = torch.Tensor(d2, d1) 12 | self.W1 = nn.Parameter(weight) 13 | nn.init.kaiming_uniform_(self.W1, nonlinearity='relu') 14 | 15 | weight2 = torch.Tensor(t1, t1) 16 | self.W = nn.Parameter(weight2) 17 | nn.init.constant_(self.W, 1 / t1) 18 | 19 | weight3 = torch.Tensor(t1, t2) 20 | self.W2 = nn.Parameter(weight3) 21 | nn.init.kaiming_uniform_(self.W2, nonlinearity='relu') 22 | 23 | bias1 = torch.Tensor(d2, t2) 24 | 
self.B = nn.Parameter(bias1) 25 | nn.init.constant_(self.B, 0) 26 | 27 | l = torch.Tensor(1, ) 28 | self.l = nn.Parameter(l) 29 | nn.init.constant_(self.l, 0.5) 30 | 31 | self.activation = nn.ReLU() 32 | 33 | def forward(self, X): 34 | 35 | # maintaining the weight parameter between 0 and 1. 36 | if (self.l[0] < 0): 37 | l = torch.Tensor(1, ) 38 | self.l = nn.Parameter(l) 39 | nn.init.constant_(self.l, 0.0) 40 | 41 | if (self.l[0] > 1): 42 | l = torch.Tensor(1, ) 43 | self.l = nn.Parameter(l) 44 | nn.init.constant_(self.l, 1.0) 45 | 46 | # modelling the dependence along the first mode of X while keeping the temporal order intact (7) 47 | X = self.W1 @ X 48 | 49 | # enforcing constant (1) on the diagonal 50 | W = self.W - self.W * torch.eye(self.t1, dtype=torch.float32, device="cuda") + torch.eye(self.t1, dtype=torch.float32, device="cuda") / self.t1 51 | 52 | # attention, the aim of the second step is to learn how important the temporal instances are to each other (8) 53 | E = X @ W 54 | 55 | # computing the attention mask (9) 56 | A = torch.softmax(E, dim=-1) 57 | 58 | # applying a soft attention mechanism (10) 59 | # he attention mask A obtained from the third step is used to zero out the effect of unimportant elements 60 | X = self.l[0] * (X) + (1.0 - self.l[0]) * X * A 61 | 62 | # the final step of the proposed layer estimates the temporal mapping W2, after the bias shift (11) 63 | y = X @ self.W2 + self.B 64 | return y -------------------------------------------------------------------------------- /models/Transformer/transformer.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import numpy as np 4 | 5 | import pytorch_lightning as pl 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class SinusoidalPositionalEmbedding(nn.Embedding): 11 | """This module produces sinusoidal positional embeddings of any length.""" 12 | 13 | def __init__( 14 | self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None 15 | ) -> None: 16 | super().__init__(num_positions, embedding_dim) 17 | self.weight = self._init_weight(self.weight) 18 | 19 | @staticmethod 20 | def _init_weight(out: nn.Parameter) -> nn.Parameter: 21 | """ 22 | Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in 23 | the 2nd half of the vector. 
[dim // 2:] 24 | """ 25 | n_pos, dim = out.shape 26 | position_enc = np.array( 27 | [ 28 | [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] 29 | for pos in range(n_pos) 30 | ] 31 | ) 32 | out.requires_grad = False # set early to avoid an error in pytorch-1.8+ 33 | sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1 34 | out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) 35 | out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) 36 | out.detach_() 37 | return out 38 | 39 | @torch.no_grad() 40 | def forward( 41 | self, input_ids_shape: torch.Size, past_key_values_length: int = 0 42 | ) -> torch.Tensor: 43 | """`input_ids_shape` is expected to be [bsz x seqlen].""" 44 | _, seq_len = input_ids_shape[:2] 45 | positions = torch.arange( 46 | past_key_values_length, 47 | past_key_values_length + seq_len, 48 | dtype=torch.long, 49 | device=self.weight.device, 50 | ) 51 | return super().forward(positions) 52 | 53 | 54 | class Transformer(pl.LightningModule): 55 | def __init__( 56 | self, 57 | lighten, 58 | dropout: float = 0.1, 59 | activation: str = "relu", 60 | norm_first: bool = False, 61 | ): 62 | super().__init__() 63 | self.name = "transformer" 64 | if lighten: 65 | self.name += "-lighten" 66 | 67 | d_model = 64 if not lighten else 32 68 | dim_feedforward = 256 if not lighten else 128 69 | nhead = 8 if not lighten else 4 70 | num_layers = 2 if not lighten else 1 71 | 72 | self.embed = nn.Linear(40, d_model, bias=False) 73 | 74 | self.embed_positions = SinusoidalPositionalEmbedding(100, d_model) 75 | 76 | layer_norm_eps: float = 1e-5 77 | encoder_layer = nn.TransformerEncoderLayer( 78 | d_model=d_model, 79 | nhead=nhead, 80 | dim_feedforward=dim_feedforward, 81 | dropout=dropout, 82 | activation=activation, 83 | layer_norm_eps=layer_norm_eps, 84 | norm_first=norm_first, 85 | batch_first=True, 86 | ) 87 | encoder_norm = nn.LayerNorm(d_model, eps=layer_norm_eps) 88 | self.transformer_encoder = nn.TransformerEncoder( 89 | encoder_layer, num_layers=num_layers, norm=encoder_norm 90 | ) 91 | self.cat_head = nn.Linear(d_model, 3) 92 | 93 | def forward(self, x): 94 | x = self.embed(x.squeeze(1)) 95 | 96 | embed_pos = self.embed_positions(x.shape) 97 | 98 | # transformer encoder 99 | x = self.transformer_encoder(x + embed_pos) 100 | 101 | # mean pool for classification 102 | x = torch.mean(x, dim=1) 103 | 104 | logits = self.cat_head(x) 105 | return logits 106 | -------------------------------------------------------------------------------- /models/iTransformer/itransformer.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class ITransformer(pl.LightningModule): 7 | def __init__( 8 | self, 9 | lighten, 10 | dropout: float = 0.1, 11 | activation: str = "relu", 12 | norm_first: bool = False, 13 | ): 14 | super().__init__() 15 | self.name = "itransformer" 16 | if lighten: 17 | self.name += "-lighten" 18 | 19 | d_model = 64 if not lighten else 32 20 | dim_feedforward = 256 if not lighten else 128 21 | nhead = 8 if not lighten else 4 22 | num_layers = 2 if not lighten else 1 23 | 24 | self.embed = nn.Linear(100, d_model, bias=False) 25 | layer_norm_eps: float = 1e-5 26 | encoder_layer = nn.TransformerEncoderLayer( 27 | d_model=d_model, 28 | nhead=nhead, 29 | dim_feedforward=dim_feedforward, 30 | dropout=dropout, 31 | activation=activation, 32 | layer_norm_eps=layer_norm_eps, 33 | norm_first=norm_first, 34 | batch_first=True, 35 | ) 
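# "Inverted" tokenization: each of the 40 order book features becomes one token, embedded from its
# 100-step history by the nn.Linear(100, d_model) layer above, so self-attention runs across
# features rather than across time steps (see the permute in forward()).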
36 | encoder_norm = nn.LayerNorm(d_model, eps=layer_norm_eps) 37 | self.transformer_encoder = nn.TransformerEncoder( 38 | encoder_layer, num_layers=num_layers, norm=encoder_norm 39 | ) 40 | self.cat_head = nn.Linear(d_model, 3) 41 | 42 | def forward(self, x): 43 | x = x.squeeze(1) 44 | # transpose 45 | x = x.permute(0, 2, 1) 46 | x = self.embed(x) 47 | 48 | # transformer encoder 49 | x = self.transformer_encoder(x) 50 | 51 | # mean pool for classification 52 | x = torch.mean(x, dim=1) 53 | 54 | logits = self.cat_head(x) 55 | return logits 56 | -------------------------------------------------------------------------------- /optimizers/executor.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from torch.utils.data import DataLoader 3 | import torch 4 | 5 | from loaders.custom_dataset import CustomDataset 6 | from models.DeepLob.deeplob import DeepLOB 7 | from models.iTransformer.itransformer import ITransformer 8 | from models.Transformer.transformer import Transformer 9 | from models.LobTransformer.lobtransformer import LobTransformer 10 | from models.DLA.DLA import DLA 11 | from models.CNN1.cnn1 import CNN1 12 | from models.CNN2.cnn2 import CNN2 13 | from models.AxialLob.axiallob import AxialLOB 14 | from models.TABL.bin_tabl import BiN_BTABL, BiN_CTABL 15 | from models.CompleteHCNN.complete_hcnn import Complete_HCNN 16 | from optimizers.lightning_batch_gd import BatchGDManager 17 | from loggers import logger 18 | from utils import create_tree, get_training_test_stocks_as_string 19 | 20 | 21 | class Executor: 22 | def __init__(self, experiment_id, general_hyperparameters, model_hyperparameters, torch_dataset_preparation=False, torch_dataset_preparation_backtest=False): 23 | self.manager = None 24 | self.model = None 25 | self.experiment_id = experiment_id 26 | self.torch_dataset_preparation = torch_dataset_preparation 27 | self.torch_dataset_preparation_backtest = torch_dataset_preparation_backtest 28 | 29 | self.training_stocks_string, self.test_stocks_string = get_training_test_stocks_as_string(general_hyperparameters) 30 | 31 | if self.torch_dataset_preparation: 32 | create_tree(f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/") 33 | 34 | if general_hyperparameters["model"] == "deeplob": 35 | self.model = DeepLOB(lighten=model_hyperparameters["lighten"]) 36 | elif general_hyperparameters["model"] == "transformer": 37 | self.model = Transformer(lighten=model_hyperparameters["lighten"]) 38 | elif general_hyperparameters["model"] == "itransformer": 39 | self.model = ITransformer(lighten=model_hyperparameters["lighten"]) 40 | elif general_hyperparameters["model"] == "lobtransformer": 41 | self.model = LobTransformer(lighten=model_hyperparameters["lighten"]) 42 | elif general_hyperparameters["model"] == "dla": 43 | self.model = DLA(lighten=model_hyperparameters["lighten"]) 44 | elif general_hyperparameters["model"] == "cnn1": 45 | self.model = CNN1() 46 | elif general_hyperparameters["model"] == "cnn2": 47 | self.model = CNN2() 48 | elif general_hyperparameters["model"] == "binbtabl": 49 | self.model = BiN_BTABL(120, 40, 100, 5, 3, 1) 50 | elif general_hyperparameters["model"] == "binctabl": 51 | self.model = BiN_CTABL(120, 40, 100, 5, 120, 5, 3, 1) 52 | elif general_hyperparameters["model"] == "axiallob": 53 | self.model = AxialLOB() 54 | elif 
general_hyperparameters["model"] == "hlob": 55 | homological_structures = torch.load(f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/complete_homological_structures.pt") 56 | self.model = Complete_HCNN(lighten=model_hyperparameters["lighten"], homological_structures=homological_structures) 57 | 58 | if self.torch_dataset_preparation: 59 | # Prepare the training dataloader. 60 | dataset = CustomDataset( 61 | dataset=general_hyperparameters["dataset"], 62 | learning_stage="training", 63 | window_size=model_hyperparameters["history_length"], 64 | shuffling_seed=model_hyperparameters["shuffling_seed"], 65 | cache_size=1, 66 | lighten=model_hyperparameters["lighten"], 67 | threshold=model_hyperparameters["threshold"], 68 | all_horizons=general_hyperparameters["horizons"], 69 | prediction_horizon=model_hyperparameters["prediction_horizon"], 70 | targets_type=general_hyperparameters["targets_type"], 71 | balanced_dataloader=model_hyperparameters["balanced_sampling"], 72 | training_stocks=general_hyperparameters["training_stocks"], 73 | validation_stocks=general_hyperparameters["target_stocks"], 74 | target_stocks=general_hyperparameters["target_stocks"] 75 | ) 76 | torch.save(dataset, f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/training_dataset.pt") 77 | elif self.torch_dataset_preparation is False and self.torch_dataset_preparation_backtest is False: 78 | dataset = torch.load(f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/training_dataset.pt") 79 | self.train_loader = DataLoader( 80 | dataset, 81 | batch_size=model_hyperparameters["batch_size"], 82 | shuffle=False, 83 | num_workers=model_hyperparameters["num_workers"], 84 | sampler=dataset.glob_indices, 85 | ) 86 | 87 | if self.torch_dataset_preparation: 88 | # Prepare the validation dataloader. 
89 | dataset = CustomDataset( 90 | dataset=general_hyperparameters["dataset"], 91 | learning_stage="validation", 92 | window_size=model_hyperparameters["history_length"], 93 | shuffling_seed=model_hyperparameters["shuffling_seed"], 94 | cache_size=1, 95 | lighten=model_hyperparameters["lighten"], 96 | threshold=model_hyperparameters["threshold"], 97 | all_horizons=general_hyperparameters["horizons"], 98 | targets_type=general_hyperparameters["targets_type"], 99 | prediction_horizon=model_hyperparameters["prediction_horizon"], 100 | training_stocks=general_hyperparameters["training_stocks"], 101 | validation_stocks=general_hyperparameters["target_stocks"], 102 | target_stocks=general_hyperparameters["target_stocks"] 103 | ) 104 | 105 | torch.save(dataset, f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/validation_dataset.pt") 106 | elif self.torch_dataset_preparation is False and self.torch_dataset_preparation_backtest is False: 107 | dataset = torch.load(f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/validation_dataset.pt") 108 | self.val_loader = DataLoader( 109 | dataset, 110 | batch_size=model_hyperparameters["batch_size"], 111 | shuffle=False, 112 | num_workers=model_hyperparameters["num_workers"], 113 | ) 114 | 115 | if self.torch_dataset_preparation is False and self.torch_dataset_preparation_backtest: 116 | dataset = CustomDataset( 117 | dataset=general_hyperparameters["dataset"], 118 | learning_stage="test", 119 | window_size=model_hyperparameters["history_length"], 120 | shuffling_seed=model_hyperparameters["shuffling_seed"], 121 | cache_size=1, 122 | lighten=model_hyperparameters["lighten"], 123 | threshold=model_hyperparameters["threshold"], 124 | all_horizons=general_hyperparameters["horizons"], 125 | targets_type=general_hyperparameters["targets_type"], 126 | prediction_horizon=model_hyperparameters["prediction_horizon"], 127 | backtest=True, 128 | training_stocks=general_hyperparameters["training_stocks"], 129 | validation_stocks=general_hyperparameters["target_stocks"], 130 | target_stocks=general_hyperparameters["target_stocks"] 131 | ) 132 | torch.save(dataset, f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/test_dataset_backtest.pt") 133 | elif self.torch_dataset_preparation and self.torch_dataset_preparation_backtest is False: 134 | dataset = CustomDataset( 135 | dataset=general_hyperparameters["dataset"], 136 | learning_stage="test", 137 | window_size=model_hyperparameters["history_length"], 138 | shuffling_seed=model_hyperparameters["shuffling_seed"], 139 | cache_size=1, 140 | lighten=model_hyperparameters["lighten"], 141 | threshold=model_hyperparameters["threshold"], 142 | all_horizons=general_hyperparameters["horizons"], 143 | targets_type=general_hyperparameters["targets_type"], 144 | prediction_horizon=model_hyperparameters["prediction_horizon"], 145 | training_stocks=general_hyperparameters["training_stocks"], 146 | validation_stocks=general_hyperparameters["target_stocks"], 147 | 
target_stocks=general_hyperparameters["target_stocks"] 148 | ) 149 | torch.save(dataset, f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/test_dataset.pt") 150 | elif self.torch_dataset_preparation is False and self.torch_dataset_preparation_backtest is False: 151 | dataset = torch.load(f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/test_dataset.pt") 152 | self.test_loader = DataLoader( 153 | dataset, 154 | batch_size=model_hyperparameters["batch_size"], 155 | shuffle=False, 156 | num_workers=model_hyperparameters["num_workers"], 157 | ) 158 | 159 | if self.torch_dataset_preparation is False and self.torch_dataset_preparation_backtest is False: 160 | self.manager = BatchGDManager( 161 | experiment_id=experiment_id, 162 | model=self.model, 163 | train_loader=self.train_loader, 164 | val_loader=self.val_loader, 165 | test_loader=self.test_loader, 166 | epochs=model_hyperparameters["epochs"], 167 | learning_rate=model_hyperparameters["learning_rate"], 168 | patience=model_hyperparameters["patience"], 169 | general_hyperparameters=general_hyperparameters, 170 | model_hyperparameters=model_hyperparameters, 171 | ) 172 | 173 | def execute_training(self): 174 | self.manager.train() 175 | 176 | def execute_testing(self): 177 | self.manager.test() 178 | 179 | def logger_clean_up(self): 180 | folder_path = f"{logger.find_save_path(self.experiment_id)}/wandb/" 181 | try: 182 | shutil.rmtree(folder_path) 183 | except: 184 | pass 185 | 186 | 187 | -------------------------------------------------------------------------------- /optimizers/lightning_batch_gd.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | import pickle 4 | import wandb 5 | import shutil 6 | import stat 7 | 8 | import lightning.pytorch as pl 9 | import numpy as np 10 | import torch 11 | from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint 12 | from torch import nn, optim 13 | from torchmetrics import Accuracy, F1Score 14 | from lightning.pytorch.loggers import WandbLogger 15 | 16 | from loggers import logger 17 | from utils import get_best_levels_prices_and_labels, wandb_hyperparameters_saving 18 | import sys 19 | 20 | class LOBLightningModule(pl.LightningModule): 21 | def __init__( 22 | self, 23 | model, 24 | experiment_id, 25 | learning_rate, 26 | general_hyperparameters, 27 | model_hyperparameters, 28 | ): 29 | super().__init__() 30 | self.model = model 31 | self.experiment_id = experiment_id 32 | self.learning_rate = learning_rate 33 | self.general_hyperparameters = general_hyperparameters 34 | self.model_hyperparameters = model_hyperparameters 35 | 36 | self.loss = nn.CrossEntropyLoss() 37 | 38 | self.training_accuracy = Accuracy(task="multiclass", num_classes=3) 39 | self.training_f1 = F1Score(task="multiclass", num_classes=3, average="macro") 40 | self.validation_accuracy = Accuracy(task="multiclass", num_classes=3) 41 | self.validation_f1 = F1Score(task="multiclass", num_classes=3, average="macro") 42 | 43 | self.batch_loss_training = [] 44 | self.batch_accuracies_training = [] 45 | self.batch_f1_scores_training = [] 46 | self.batch_loss_validation = [] 47 | 
self.batch_accuracies_validation = [] 48 | self.batch_f1_scores_validation = [] 49 | self.batch_loss_test = [] 50 | self.test_outputs = [] 51 | self.test_targets = [] 52 | self.test_probs = [] 53 | 54 | self.csv_path = f"{logger.find_save_path(experiment_id)}/metrics.csv" 55 | 56 | def forward(self, x): 57 | return self.model(x) 58 | 59 | def training_step(self, batch, batch_idx): 60 | inputs, targets = batch 61 | logits = self.model(inputs) 62 | loss = self.loss(logits, targets) 63 | outputs = nn.functional.softmax(logits, dim=1) 64 | outputs = torch.argmax(outputs, dim=1) 65 | train_acc = self.training_accuracy(outputs, targets) 66 | train_f1 = self.training_f1(outputs, targets) 67 | 68 | self.batch_loss_training.append(loss.item()) 69 | self.batch_accuracies_training.append(train_acc.item()) 70 | self.batch_f1_scores_training.append(train_f1.item()) 71 | 72 | return loss 73 | 74 | def validation_step(self, batch, batch_idx): 75 | inputs, targets = batch 76 | logits = self.model(inputs) 77 | loss = self.loss(logits, targets) 78 | outputs = nn.functional.softmax(logits, dim=1) 79 | outputs = torch.argmax(outputs, dim=1) 80 | val_acc = self.validation_accuracy(outputs, targets) 81 | val_f1 = self.validation_f1(outputs, targets) 82 | 83 | self.batch_loss_validation.append(loss.item()) 84 | self.batch_accuracies_validation.append(val_acc.item()) 85 | self.batch_f1_scores_validation.append(val_f1.item()) 86 | 87 | return loss 88 | 89 | def test_step(self, batch, batch_idx): 90 | inputs, targets = batch 91 | logits = self.model(inputs) 92 | loss = self.loss(logits, targets) 93 | outputs = nn.functional.softmax(logits, dim=1) 94 | 95 | saving_probs = copy.copy(outputs) 96 | self.test_probs.extend(saving_probs.tolist()) 97 | 98 | outputs = torch.argmax(outputs, dim=1).tolist() 99 | self.test_outputs.extend(outputs) 100 | self.test_targets.extend(targets.tolist()) 101 | 102 | return loss 103 | 104 | def configure_optimizers(self): 105 | optimizer = optim.AdamW( 106 | self.model.parameters(), 107 | lr=self.model_hyperparameters["learning_rate"], 108 | betas=(0.9, 0.95), 109 | weight_decay=1e-1, 110 | ) 111 | return optimizer 112 | 113 | def on_validation_epoch_end(self): 114 | # Calculate the average loss, accuracy and F1 score for the epoch from the batch-level lists filled in the training and validation steps. 115 | avg_loss_training = np.mean( 116 | self.batch_loss_training 117 | ) # Average of the batch-level losses (training). 118 | avg_accuracy_training = np.mean( 119 | self.batch_accuracies_training 120 | ) # Average of the batch-level accuracies (training). 121 | avg_f1_score_training = np.mean( 122 | self.batch_f1_scores_training 123 | ) # Average of the batch-level F1 scores (training). 124 | 125 | avg_loss_validation = np.mean( 126 | self.batch_loss_validation 127 | ) # Average of the batch-level losses (validation). 128 | avg_accuracy_validation = np.mean( 129 | self.batch_accuracies_validation 130 | ) # Average of the batch-level accuracies (validation). 131 | avg_f1_score_validation = np.mean( 132 | self.batch_f1_scores_validation 133 | ) # Average of the batch-level F1 scores (validation).
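# The epoch-level averages computed above are exposed to Lightning through the self.log calls below; "val_loss" in particular is the quantity monitored by the ModelCheckpoint and
# EarlyStopping callbacks configured in BatchGDManager.train(). The batch_* lists are populated in training_step/validation_step and are not reset in this hook, so each average
# covers every batch accumulated since the start of fitting.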
134 | 135 | self.log( 136 | "loss", 137 | torch.tensor(avg_loss_training), 138 | prog_bar=True, 139 | on_step=False, 140 | on_epoch=True, 141 | ) 142 | self.log( 143 | "acc", 144 | torch.tensor(avg_accuracy_training), 145 | prog_bar=True, 146 | on_step=False, 147 | on_epoch=True, 148 | ) 149 | self.log( 150 | "f1", 151 | torch.tensor(avg_f1_score_training), 152 | prog_bar=False, 153 | on_step=False, 154 | on_epoch=True, 155 | ) 156 | self.log( 157 | "val_loss", 158 | torch.tensor(avg_loss_validation), 159 | prog_bar=True, 160 | on_step=False, 161 | on_epoch=True, 162 | ) 163 | self.log( 164 | "val_acc", 165 | torch.tensor(avg_accuracy_validation), 166 | prog_bar=True, 167 | on_step=False, 168 | on_epoch=True, 169 | ) 170 | self.log( 171 | "val_f1", 172 | torch.tensor(avg_f1_score_validation), 173 | prog_bar=False, 174 | on_step=False, 175 | on_epoch=True, 176 | ) 177 | 178 | self.training_accuracy.reset() 179 | self.training_f1.reset() 180 | self.validation_accuracy.reset() 181 | self.validation_f1.reset() 182 | 183 | # Append metrics to the list 184 | metrics_data = [ 185 | avg_loss_training, 186 | avg_accuracy_training, 187 | avg_f1_score_training, 188 | avg_loss_validation, 189 | avg_accuracy_validation, 190 | avg_f1_score_validation, 191 | ] 192 | 193 | # Save metrics to a CSV file 194 | if not os.path.exists(self.csv_path): 195 | header = [ 196 | "Training_Loss", 197 | "Training_Accuracy", 198 | "Training_F1", 199 | "Validation_Loss", 200 | "Validation_Accuracy", 201 | "Validation_F1", 202 | ] 203 | with open(self.csv_path, "w") as file: 204 | file.write(",".join(header) + "\n") 205 | 206 | with open(self.csv_path, "a") as file: 207 | file.write(",".join(map(str, metrics_data)) + "\n") 208 | 209 | def on_test_end(self): 210 | best_levels_prices, sanity_check_labels = get_best_levels_prices_and_labels( 211 | dataset=self.general_hyperparameters["dataset"], 212 | target_stocks=self.general_hyperparameters["target_stocks"], 213 | history_length=self.model_hyperparameters["history_length"], 214 | prediction_horizon=self.model_hyperparameters["prediction_horizon"], 215 | all_horizons=self.general_hyperparameters["horizons"], 216 | threshold=self.model_hyperparameters["threshold"], 217 | ) 218 | with open( 219 | os.path.join(logger.find_save_path(self.experiment_id), "prediction.pkl"), 220 | "wb", 221 | ) as f: 222 | pickle.dump( 223 | [ 224 | best_levels_prices, 225 | sanity_check_labels, 226 | np.array(self.test_targets), 227 | np.array(self.test_outputs), 228 | np.array(self.test_probs), 229 | ], 230 | f, 231 | ) 232 | 233 | 234 | class BatchGDManager: 235 | def __init__( 236 | self, 237 | experiment_id, 238 | model, 239 | train_loader, 240 | val_loader, 241 | test_loader, 242 | epochs, 243 | learning_rate, 244 | patience, 245 | general_hyperparameters, 246 | model_hyperparameters, 247 | ): 248 | self.experiment_id = experiment_id 249 | self.model = model 250 | self.train_loader = train_loader 251 | self.val_loader = val_loader 252 | self.test_loader = test_loader 253 | self.epochs = epochs 254 | self.learning_rate = learning_rate 255 | self.patience = patience 256 | self.general_hyperparameters = general_hyperparameters 257 | self.model_hyperparameters = model_hyperparameters 258 | self.lob_lightning_module = None 259 | self.trainer = None 260 | self.deleted_run = None 261 | 262 | def delete_run(self): 263 | api = wandb.Api() 264 | project_path = "" # TODO: Specify here the name of WB project. 
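# Illustrative placeholder only: wandb.Api().runs() typically takes an "<entity>/<project>" path, so once the TODO above is filled in this would look like, e.g.,
# project_path = "my-team/Limit_Order_Book" ("my-team" is a hypothetical entity; "Limit_Order_Book" matches the project name passed to WandbLogger in train() below).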
265 | runs = api.runs(path=project_path) 266 | print('Deleting runs...') 267 | while len(runs) < 1: 268 | runs = api.runs(path=project_path) 269 | for run in runs: 270 | input_list = run.metadata 271 | if input_list is not None: 272 | input_list = input_list['args'] 273 | result_dict = {input_list[i][2:]: input_list[i + 1] for i in range(0, len(input_list), 2)} 274 | modified_dict = result_dict 275 | if modified_dict['model'] == str(self.general_hyperparameters['model']) and modified_dict['prediction_horizon'] == str(self.model_hyperparameters['prediction_horizon']) and modified_dict['training_stocks'] == str(self.general_hyperparameters['training_stocks'][0]) and modified_dict['target_stocks'] == str(self.general_hyperparameters['target_stocks'][0]): 276 | self.deleted_run = run.name 277 | run.delete() 278 | print(f"Run successfully deleted from WandB: {run.name}.") 279 | 280 | def train(self): 281 | self.lob_lightning_module = LOBLightningModule( 282 | self.model, 283 | experiment_id=self.experiment_id, 284 | learning_rate=self.learning_rate, 285 | general_hyperparameters=self.general_hyperparameters, 286 | model_hyperparameters=self.model_hyperparameters, 287 | ) 288 | 289 | checkpoint_callback = ModelCheckpoint( 290 | monitor="val_loss", 291 | dirpath=logger.find_save_path(self.experiment_id), 292 | filename="best_val_model", 293 | save_top_k=1, 294 | mode="min", 295 | ) 296 | early_stopping_callback = EarlyStopping("val_loss", patience=self.patience, min_delta=0.003) 297 | 298 | os.environ["WANDB_API_KEY"] = "" # TODO: Insert API key 299 | os.environ["WANDB__SERVICE_WAIT"] = "300" 300 | try: 301 | wandb_logger = WandbLogger( 302 | project="Limit_Order_Book", 303 | name=self.experiment_id, 304 | save_dir=logger.find_save_path(self.experiment_id), 305 | ) 306 | wandb_hyperparameters_saving( 307 | wandb_logger=wandb_logger, 308 | general_hyperparameters=self.general_hyperparameters, 309 | model_hyperparameters=self.model_hyperparameters, 310 | ) 311 | self.trainer = pl.Trainer( 312 | max_epochs=self.epochs, 313 | callbacks=[checkpoint_callback, early_stopping_callback], 314 | logger=wandb_logger, 315 | num_sanity_val_steps=0, 316 | ) 317 | self.trainer.fit(self.lob_lightning_module, self.train_loader, self.val_loader) 318 | wandb.finish() 319 | except: 320 | root_path = sys.path[0] 321 | dir_path = f"{root_path}/loggers/results/{self.experiment_id}" 322 | if os.path.exists(dir_path): 323 | shutil.rmtree(dir_path) 324 | print(f"Folder {self.experiment_id} deleted successfully.") 325 | else: 326 | print(f"Unable to delete folder {self.experiment_id}.") 327 | 328 | self.delete_run() 329 | 330 | model = self.general_hyperparameters['model'] 331 | horizon = self.model_hyperparameters['prediction_horizon'] 332 | training_stocks = self.general_hyperparameters['training_stocks'] 333 | target_stocks = self.general_hyperparameters['target_stocks'] 334 | errors_string = f"{model} {horizon} {training_stocks} {target_stocks} {self.deleted_run}\n" 335 | with open("errors.txt", 'r+') as file: 336 | content = file.read() 337 | 338 | # If the string does not exist in the file, append it 339 | if errors_string.strip() not in content: 340 | # Move the cursor to the end of the file before appending 341 | file.write(errors_string) 342 | print("String appended successfully.") 343 | else: 344 | print("String already exists in the file.") 345 | #raise Exception 346 | 347 | def test(self): 348 | if self.trainer is None: 349 | self.lob_lightning_module = LOBLightningModule( 350 | self.model, 351 |
experiment_id=self.experiment_id, 352 | learning_rate=self.learning_rate, 353 | general_hyperparameters=self.general_hyperparameters, 354 | model_hyperparameters=self.model_hyperparameters, 355 | ) 356 | self.trainer = pl.Trainer() 357 | try: 358 | best_model = self.lob_lightning_module.load_from_checkpoint( 359 | checkpoint_path=f"{logger.find_save_path(self.experiment_id)}/best_val_model.ckpt", 360 | model=self.model, 361 | experiment_id=self.experiment_id, 362 | learning_rate=self.learning_rate, 363 | general_hyperparameters=self.general_hyperparameters, 364 | model_hyperparameters=self.model_hyperparameters, 365 | ) 366 | except: 367 | best_model = self.lob_lightning_module.load_from_checkpoint( 368 | checkpoint_path=f"{logger.find_save_path(self.experiment_id)}/best_val_model.ckpt", 369 | map_location=torch.device('cpu'), 370 | model=self.model, 371 | experiment_id=self.experiment_id, 372 | learning_rate=self.learning_rate, 373 | general_hyperparameters=self.general_hyperparameters, 374 | model_hyperparameters=self.model_hyperparameters, 375 | ) 376 | self.trainer.test(best_model, dataloaders=self.test_loader) 377 | else: 378 | best_model_path = ( 379 | f"{logger.find_save_path(self.experiment_id)}/best_val_model.ckpt" 380 | ) 381 | self.trainer.test(ckpt_path=best_model_path, dataloaders=self.test_loader) 382 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.5 2 | aiosignal==1.3.1 3 | annotated-types==0.5.0 4 | anyio==3.7.1 5 | appdirs==1.4.4 6 | arrow==1.2.3 7 | async-timeout==4.0.3 8 | attrs==23.1.0 9 | backoff==2.2.1 10 | beautifulsoup4==4.12.2 11 | blessed==1.20.0 12 | certifi==2023.7.22 13 | charset-normalizer==3.2.0 14 | click==8.1.7 15 | cmake==3.25.0 16 | contourpy==1.1.1 17 | croniter==1.4.1 18 | cycler==0.11.0 19 | dateutils==0.6.12 20 | deepdiff==6.5.0 21 | docker-pycreds==0.4.0 22 | exceptiongroup==1.1.3 23 | fast-tmfg==0.0.8 24 | fastapi==0.103.1 25 | filelock==3.9.0 26 | fonttools==4.42.1 27 | frozenlist==1.4.0 28 | fsspec==2023.9.2 29 | gitdb==4.0.10 30 | GitPython==3.1.37 31 | h11==0.14.0 32 | idna==3.4 33 | importlib-resources==6.1.0 34 | inquirer==3.1.3 35 | itsdangerous==2.1.2 36 | Jinja2==3.1.2 37 | joblib==1.3.2 38 | kiwisolver==1.4.5 39 | lightning==2.0.9 40 | lightning-cloud==0.5.38 41 | lightning-utilities==0.9.0 42 | lit==15.0.7 43 | markdown-it-py==3.0.0 44 | MarkupSafe==2.1.2 45 | matplotlib==3.8.0 46 | mdurl==0.1.2 47 | mpmath==1.3.0 48 | multidict==6.0.4 49 | networkx==3.0 50 | numpy==1.26.0 51 | ordered-set==4.1.0 52 | packaging==23.1 53 | pandas==2.1.1 54 | pathtools==0.1.2 55 | Pillow==10.0.1 56 | polars==0.19.5 57 | protobuf==4.24.3 58 | psutil==5.9.5 59 | pyarrow==13.0.0 60 | pydantic==2.1.1 61 | pydantic_core==2.4.0 62 | Pygments==2.16.1 63 | PyJWT==2.8.0 64 | pyparsing==3.1.1 65 | python-dateutil==2.8.2 66 | python-editor==1.0.4 67 | python-multipart==0.0.6 68 | pytorch-lightning==2.0.9 69 | pytz==2023.3.post1 70 | PyYAML==6.0.1 71 | readchar==4.0.5 72 | requests==2.31.0 73 | rich==13.5.3 74 | scikit-learn==1.3.1 75 | scipy==1.11.3 76 | seaborn==0.13.2 77 | sentry-sdk==1.31.0 78 | setproctitle==1.3.2 79 | six==1.16.0 80 | smmap==5.0.1 81 | sniffio==1.3.0 82 | soupsieve==2.5 83 | starlette==0.27.0 84 | starsessions==1.3.0 85 | sympy==1.12 86 | threadpoolctl==3.2.0 87 | torch==2.0.0+cu118 88 | torchinfo==1.8.0 89 | torchmetrics==1.2.0 90 | tqdm==4.66.1 91 | traitlets==5.10.1 92 | triton==2.0.0 93 | 
typing_extensions==4.8.0 94 | tzdata==2023.3 95 | urllib3==1.26.16 96 | uvicorn==0.23.2 97 | wandb==0.15.11 98 | wcwidth==0.2.6 99 | websocket-client==1.6.3 100 | websockets==11.0.3 101 | yarl==1.9.2 102 | zipp==3.17.0 103 | -------------------------------------------------------------------------------- /requirements_mac_os.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.5 2 | aiosignal==1.3.1 3 | annotated-types==0.5.0 4 | anyio==3.7.1 5 | appdirs==1.4.4 6 | arrow==1.2.3 7 | async-timeout==4.0.3 8 | attrs==23.1.0 9 | backoff==2.2.1 10 | beautifulsoup4==4.12.2 11 | blessed==1.20.0 12 | certifi==2023.7.22 13 | charset-normalizer==3.2.0 14 | click==8.1.7 15 | cmake==3.25.0 16 | contourpy==1.1.1 17 | croniter==1.4.1 18 | cycler==0.11.0 19 | dateutils==0.6.12 20 | deepdiff==6.5.0 21 | docker-pycreds==0.4.0 22 | exceptiongroup==1.1.3 23 | fast-tmfg==0.0.8 24 | fastapi==0.103.1 25 | filelock==3.9.0 26 | fonttools==4.42.1 27 | frozenlist==1.4.0 28 | fsspec==2023.9.2 29 | gitdb==4.0.10 30 | GitPython==3.1.37 31 | h11==0.14.0 32 | idna==3.4 33 | importlib-resources==6.1.0 34 | inquirer==3.1.3 35 | itsdangerous==2.1.2 36 | Jinja2==3.1.2 37 | joblib==1.3.2 38 | kiwisolver==1.4.5 39 | lightning==2.0.9 40 | lightning-cloud==0.5.38 41 | lightning-utilities==0.9.0 42 | lit==15.0.7 43 | markdown-it-py==3.0.0 44 | MarkupSafe==2.1.2 45 | matplotlib==3.8.0 46 | mdurl==0.1.2 47 | mpmath==1.3.0 48 | multidict==6.0.4 49 | networkx==3.0 50 | numpy==1.26.0 51 | ordered-set==4.1.0 52 | packaging==23.1 53 | pandas==2.1.1 54 | pathtools==0.1.2 55 | Pillow==10.0.1 56 | polars==0.19.5 57 | protobuf==4.24.3 58 | psutil==5.9.5 59 | pyarrow==13.0.0 60 | pydantic==2.1.1 61 | pydantic_core==2.4.0 62 | Pygments==2.16.1 63 | PyJWT==2.8.0 64 | pyparsing==3.1.1 65 | python-dateutil==2.8.2 66 | python-editor==1.0.4 67 | python-multipart==0.0.6 68 | pytorch-lightning==2.0.9 69 | pytz==2023.3.post1 70 | PyYAML==6.0.1 71 | readchar==4.0.5 72 | requests==2.31.0 73 | rich==13.5.3 74 | scikit-learn==1.3.1 75 | scipy==1.11.3 76 | seaborn==0.13.2 77 | sentry-sdk==1.31.0 78 | setproctitle==1.3.2 79 | six==1.16.0 80 | smmap==5.0.1 81 | sniffio==1.3.0 82 | soupsieve==2.5 83 | starlette==0.27.0 84 | starsessions==1.3.0 85 | sympy==1.12 86 | threadpoolctl==3.2.0 87 | torch==2.0.0 88 | torchinfo==1.8.0 89 | torchmetrics==1.2.0 90 | tqdm==4.66.1 91 | traitlets==5.10.1 92 | typing_extensions==4.8.0 93 | tzdata==2023.3 94 | urllib3==1.26.16 95 | uvicorn==0.23.2 96 | wandb==0.15.11 97 | wcwidth==0.2.6 98 | websocket-client==1.6.3 99 | websockets==11.0.3 100 | yarl==1.9.2 101 | zipp==3.17.0 102 | -------------------------------------------------------------------------------- /simulator/market_sim.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from tqdm import tqdm 4 | from loggers import logger 5 | from simulator.trading_agent import Trading 6 | from utils import load_yaml 7 | from datetime import timedelta 8 | 9 | 10 | def __get_data__(experiment_id): 11 | path = f"{logger.find_save_path(experiment_id)}/prediction.pkl" 12 | all_prices, sanity_check_labels, all_targets, all_predictions, all_probs = pd.read_pickle(path) 13 | all_prices.reset_index(drop=True, inplace=True) 14 | return all_prices, sanity_check_labels, all_targets.tolist(), all_predictions.tolist(), all_probs 15 | 16 | 17 | def backtest(experiment_id, trading_hyperparameters): 18 | prices, sanity_check_labels, targets, predictions, probs = 
__get_data__(experiment_id) 19 | TradingAgent = Trading(trading_hyperparameters) 20 | 21 | prices["Mid"] = (prices["BIDp1"] + prices["ASKp1"]) / 2 22 | prices["seconds"] = pd.to_datetime(prices["seconds"]) 23 | 24 | prices['Predictions'] = predictions 25 | prices.reset_index(drop=True, inplace=True) 26 | indices_to_delete = prices[prices['Predictions'] == 1].index 27 | prices = prices.drop(indices_to_delete) 28 | mask = (prices['Predictions'] != prices['Predictions'].shift()) | (prices.index == 0) 29 | prices = prices[mask] 30 | prices = prices.reset_index(drop=True) 31 | predictions = prices['Predictions'].tolist() 32 | prices = prices.drop(columns=['Predictions']) 33 | prices.reset_index(drop=True, inplace=True) 34 | 35 | dates = prices['seconds'].dt.date 36 | day_changed_indices = dates.ne(dates.shift()) 37 | new_day_indices = day_changed_indices.index[day_changed_indices].tolist() 38 | end_of_day_indices = [element - 1 for element in new_day_indices] 39 | end_of_day_indices.append(len(prices) - 1) 40 | end_of_day_indices = end_of_day_indices[1:] 41 | 42 | for i in tqdm(range(len(predictions))): 43 | mid_price = prices.at[i, "Mid"] 44 | best_bid_price = prices.at[i, "BIDp1"] 45 | best_ask_price = prices.at[i, "ASKp1"] 46 | timestamp = prices.at[i, "seconds"] 47 | prediction = predictions[i] 48 | probability = np.max(probs[i]) 49 | 50 | if trading_hyperparameters['mid_side_trading'] == 'mid_to_mid': 51 | if i in end_of_day_indices: 52 | if TradingAgent.long_inventory > 0: 53 | TradingAgent.exit_long(mid_price, timestamp) 54 | if TradingAgent.short_inventory > 0: 55 | TradingAgent.exit_short(mid_price, timestamp) 56 | else: 57 | if prediction == 2 and probability >= trading_hyperparameters['probability_threshold']: 58 | if TradingAgent.long_inventory == 0 and TradingAgent.short_inventory == 0: 59 | TradingAgent.long(mid_price, timestamp) 60 | elif TradingAgent.long_inventory == 0 and TradingAgent.short_inventory > 0: 61 | TradingAgent.exit_short(mid_price, timestamp) 62 | TradingAgent.long(mid_price, timestamp) 63 | elif prediction == 0 and probability >= trading_hyperparameters['probability_threshold']: 64 | if TradingAgent.long_inventory == 0 and TradingAgent.short_inventory == 0: 65 | TradingAgent.short(mid_price, timestamp) 66 | elif TradingAgent.short_inventory == 0 and TradingAgent.long_inventory > 0: 67 | TradingAgent.exit_long(mid_price, timestamp) 68 | TradingAgent.short(mid_price, timestamp) 69 | elif trading_hyperparameters['mid_side_trading'] == 'side_market_orders': 70 | if i in end_of_day_indices: 71 | if TradingAgent.long_inventory > 0: 72 | TradingAgent.exit_long(best_bid_price, timestamp) 73 | if TradingAgent.short_inventory > 0: 74 | TradingAgent.exit_short(best_ask_price, timestamp) 75 | else: 76 | if prediction == 2 and probability >= trading_hyperparameters['probability_threshold']: 77 | if TradingAgent.long_inventory == 0 and TradingAgent.short_inventory == 0: 78 | TradingAgent.long(best_ask_price, timestamp) 79 | elif TradingAgent.long_inventory == 0 and TradingAgent.short_inventory > 0: 80 | TradingAgent.exit_short(best_ask_price, timestamp) 81 | TradingAgent.long(best_ask_price, timestamp) 82 | elif prediction == 0 and probability >= trading_hyperparameters['probability_threshold']: 83 | if TradingAgent.long_inventory == 0 and TradingAgent.short_inventory == 0: 84 | TradingAgent.short(best_bid_price, timestamp) 85 | elif TradingAgent.short_inventory == 0 and TradingAgent.long_inventory > 0: 86 | TradingAgent.exit_long(best_bid_price, timestamp) 87 | 
TradingAgent.short(best_bid_price, timestamp) 88 | elif trading_hyperparameters['mid_side_trading'] == 'side_limit_orders': 89 | if i in end_of_day_indices: 90 | if TradingAgent.long_inventory > 0: 91 | TradingAgent.exit_long(best_ask_price, timestamp) 92 | if TradingAgent.short_inventory > 0: 93 | TradingAgent.exit_short(best_bid_price, timestamp) 94 | else: 95 | if prediction == 2 and probability >= trading_hyperparameters['probability_threshold']: 96 | if TradingAgent.long_inventory == 0 and TradingAgent.short_inventory == 0: 97 | TradingAgent.long(best_bid_price, timestamp) 98 | elif TradingAgent.long_inventory == 0 and TradingAgent.short_inventory > 0: 99 | TradingAgent.exit_short(best_bid_price, timestamp) 100 | TradingAgent.long(best_bid_price, timestamp) 101 | elif prediction == 0 and probability >= trading_hyperparameters['probability_threshold']: 102 | if TradingAgent.long_inventory == 0 and TradingAgent.short_inventory == 0: 103 | TradingAgent.short(best_ask_price, timestamp) 104 | elif TradingAgent.short_inventory == 0 and TradingAgent.long_inventory > 0: 105 | TradingAgent.exit_long(best_ask_price, timestamp) 106 | TradingAgent.short(best_ask_price, timestamp) 107 | 108 | trading_history_dataframe = pd.DataFrame(TradingAgent.trading_history) 109 | save_path = f"{logger.find_save_path(experiment_id)}/trading_simulation.pkl" 110 | trading_history_dataframe.to_pickle(save_path) 111 | -------------------------------------------------------------------------------- /simulator/post_trading_analysis.py: -------------------------------------------------------------------------------- 1 | from itertools import cycle 2 | 3 | import matplotlib.pyplot as plt 4 | import pandas as pd 5 | import numpy as np 6 | from sklearn.metrics import * 7 | from sklearn.preprocessing import LabelBinarizer 8 | import torch 9 | from torch.utils.data import DataLoader 10 | from tqdm import tqdm 11 | 12 | from loggers import logger 13 | from utils import get_training_test_stocks_as_string 14 | 15 | 16 | def __get_fees_free_pnl__(trading_simulation): 17 | df = trading_simulation 18 | profit_list = [] 19 | for index, row in df.iterrows(): 20 | profit_no_fees = 0 21 | if row.Type == 'Long': 22 | local_profit = (row.Price_Exit_Long - row.Price_Entry_Long) 23 | profit_no_fees += local_profit 24 | elif row.Type == 'Short': 25 | local_profit = (row.Price_Entry_Short - row.Price_Exit_Short) 26 | profit_no_fees += local_profit 27 | 28 | profit_list.append(profit_no_fees) 29 | return profit_list 30 | 31 | 32 | def __get_pnl_with_fees__(trading_simulation, trading_hyperparameters): 33 | df = trading_simulation 34 | profit_list = [] 35 | for index, row in df.iterrows(): 36 | profit_no_fees = 0 37 | if row.Type == 'Long': 38 | local_profit = (row.Price_Exit_Long - row.Price_Entry_Long) - (row.Price_Exit_Long * trading_hyperparameters['trading_fee']) - (row.Price_Entry_Long * trading_hyperparameters['trading_fee']) 39 | profit_no_fees += local_profit 40 | elif row.Type == 'Short': 41 | local_profit = (row.Price_Entry_Short - row.Price_Exit_Short) - (row.Price_Entry_Short * trading_hyperparameters['trading_fee']) - (row.Price_Exit_Short * trading_hyperparameters['trading_fee']) 42 | profit_no_fees += local_profit 43 | 44 | profit_list.append(profit_no_fees) 45 | return profit_list 46 | 47 | 48 | def __get_long_short_indices__(trading_simulation): 49 | long_indices = [] 50 | short_indices = [] 51 | for index, row in trading_simulation.iterrows(): 52 | if row.Type == 'Long': 53 | 
long_indices.append(pd.to_datetime(row.Entry_Long)) 54 | elif row.Type == 'Short': 55 | short_indices.append(pd.to_datetime(row.Entry_Short)) 56 | 57 | return long_indices, short_indices 58 | 59 | 60 | def post_trading_analysis(experiment_id, general_hyperparameters, trading_hyperparameters, model_hyperparameters): 61 | prediction = pd.read_pickle(f"{logger.find_save_path(experiment_id)}/prediction.pkl") 62 | trading_simulation = pd.read_pickle(f"{logger.find_save_path(experiment_id)}/trading_simulation.pkl") 63 | 64 | training_stocks_string, test_stocks_string = get_training_test_stocks_as_string(general_hyperparameters) 65 | 66 | dataset = torch.load( 67 | f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{training_stocks_string}_test_{test_stocks_string}/{model_hyperparameters['prediction_horizon']}/test_dataset_backtest.pt") 68 | print(f"Reading test (backtest version) dataset...") 69 | test_loader = DataLoader( 70 | dataset, 71 | batch_size=model_hyperparameters["batch_size"], 72 | shuffle=False, 73 | num_workers=model_hyperparameters["num_workers"], 74 | ) 75 | returns_labels_list = [] 76 | for data, labels in tqdm(test_loader): 77 | returns_labels_list.extend(labels.tolist()) 78 | 79 | targets = prediction[2].tolist() 80 | predictions = prediction[3].tolist() 81 | 82 | print(classification_report(targets, predictions)) 83 | 84 | distributions_dataset = pd.DataFrame({"Predictions": predictions, "PCs": returns_labels_list}) 85 | distribution_label_0 = distributions_dataset[distributions_dataset['Predictions'] == 0].PCs 86 | distribution_label_1 = distributions_dataset[distributions_dataset['Predictions'] == 1].PCs 87 | distribution_label_2 = distributions_dataset[distributions_dataset['Predictions'] == 2].PCs 88 | 89 | plt.hist(distribution_label_0, label='Label 0', alpha=0.5, bins=10) 90 | plt.hist(distribution_label_1, label='Label 1', alpha=0.5, bins=10) 91 | plt.hist(distribution_label_2, label='Label 2', alpha=0.5, bins=10) 92 | 93 | plt.title("Predictions' distribution") 94 | plt.xlabel("PCs Values") 95 | plt.ylabel("Frequency") 96 | plt.legend(title="Labels") 97 | plt.show() 98 | 99 | label_binarizer = LabelBinarizer().fit(targets) 100 | y_onehot_test = label_binarizer.transform(targets) 101 | colors = cycle(["aqua", "darkorange", "cornflowerblue"]) 102 | fig, ax = plt.subplots(figsize=(10, 8)) 103 | for class_id, color in zip(range(0, 3), colors): 104 | RocCurveDisplay.from_predictions( 105 | y_onehot_test[:, class_id], 106 | prediction[-1][:, class_id], 107 | name=f"ROC curve for class: {class_id}", 108 | color=color, 109 | ax=ax, 110 | plot_chance_level=(class_id == 2), 111 | ) 112 | 113 | plt.axis("square") 114 | plt.xlabel("False Positive Rate") 115 | plt.ylabel("True Positive Rate") 116 | plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass") 117 | plt.legend() 118 | plt.show() 119 | 120 | print(f"Matthews Correlation Coefficient: {round(matthews_corrcoef(targets, predictions), 2)}") 121 | print(f"Macro-average AUC-ROC (ovr): {round(roc_auc_score(targets, prediction[-1].tolist(), average='macro', multi_class='ovr'), 2)}") 122 | print(f"Macro-average AUC-ROC (ovo): {round(roc_auc_score(targets, prediction[-1].tolist(), average='macro', multi_class='ovo'), 2)}") 123 | print(f"Top-k (with k=2) Accuracy Score: {round(top_k_accuracy_score(targets, prediction[-1], k=2), 2)}") 124 | 125 | fig, axs = plt.subplots(2, 2, figsize=(10, 8)) 126 | for ax in axs.flat: 127 | 
ax.set_yticklabels([]) 128 | ax.set_yticks([]) 129 | ax.set_xticklabels([]) 130 | ax.set_xticks([]) 131 | 132 | # Confusion matrix plot. 133 | cm = confusion_matrix(targets, predictions, labels=[0, 1, 2], normalize='true') 134 | disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1, 2]) 135 | disp.plot(ax=axs[0, 0], cmap='Blues') 136 | axs[0, 0].set_title('Confusion Matrix') 137 | 138 | # P&L distribution plot. 139 | ax = fig.add_subplot(2, 2, 2) 140 | if trading_hyperparameters['simulation_type'] == 'no_fees': 141 | plt.hist(__get_fees_free_pnl__(trading_simulation), bins=30) 142 | elif trading_hyperparameters['simulation_type'] == 'with_fees': 143 | plt.hist(__get_pnl_with_fees__(trading_simulation, trading_hyperparameters), bins=30) 144 | axs[0, 1].set_title('P&L Distribution') 145 | 146 | # P&L cumsum plot. 147 | ax = fig.add_subplot(2, 2, 3) 148 | if trading_hyperparameters['simulation_type'] == 'no_fees': 149 | plt.plot(np.cumsum(__get_fees_free_pnl__(trading_simulation))) 150 | elif trading_hyperparameters['simulation_type'] == 'with_fees': 151 | plt.plot(np.cumsum(__get_pnl_with_fees__(trading_simulation, trading_hyperparameters))) 152 | axs[1, 0].set_title('P&L cumsum') 153 | 154 | # Mid price 155 | df = prediction[0].reset_index(drop=True) 156 | seconds = pd.to_datetime(df.seconds) 157 | mid = (df.BIDp1 + df.ASKp1) / 2 158 | trading_df = pd.DataFrame() 159 | trading_df['seconds'] = seconds 160 | trading_df['mid'] = mid 161 | 162 | long_indices, short_indices = __get_long_short_indices__(trading_simulation) 163 | trading_df.drop_duplicates(inplace=True, keep='first', subset='seconds') 164 | trading_df.set_index('seconds', inplace=True) 165 | 166 | ax = fig.add_subplot(2, 2, 4) 167 | plt.plot(trading_df.mid) 168 | for datetime in long_indices: 169 | y_value = trading_df.loc[datetime, 'mid'] 170 | ax.plot(datetime, y_value, marker='^', color='green', markersize=5) 171 | for datetime in short_indices: 172 | y_value = trading_df.loc[datetime, 'mid'] 173 | ax.plot(datetime, y_value, marker='v', color='red', markersize=5) 174 | 175 | axs[1, 1].set_title('Mid price') 176 | 177 | plt.tight_layout() 178 | plt.show() -------------------------------------------------------------------------------- /simulator/trading_agent.py: -------------------------------------------------------------------------------- 1 | class Trading: 2 | def __init__(self, trading_hyperparameters): 3 | self.long_inventory = 0 4 | self.short_inventory = 0 5 | self.long_price = 0 6 | self.short_price = 0 7 | self.date_time_entry_long = None 8 | self.date_time_exit_long = None 9 | self.date_time_entry_short = None 10 | self.date_time_exit_short = None 11 | self.trading_history = [] 12 | 13 | def long(self, price, datetime=None): 14 | amount = 1 15 | self.long_inventory += amount 16 | self.long_price = price 17 | self.date_time_entry_long = datetime 18 | 19 | def short(self, price, datetime=None): 20 | amount = 1 21 | self.short_inventory += amount 22 | self.short_price = price 23 | self.date_time_entry_short = datetime 24 | 25 | def exit_long(self, price, datetime=None): 26 | self.trading_history.append({'Type': 'Long', 'Entry_Long': self.date_time_entry_long, 'Price_Entry_Long': self.long_price, 27 | 'Exit_Long': datetime, 'Price_Exit_Long': price}) 28 | 29 | self.long_inventory = 0 30 | self.long_price = 0 31 | self.date_time_entry_long = None 32 | 33 | def exit_short(self, price, datetime=None): 34 | self.trading_history.append({'Type': 'Short', 'Entry_Short': self.date_time_entry_short, 
'Price_Entry_Short': self.short_price, 35 | 'Exit_Short': datetime, 'Price_Exit_Short': price}) 36 | 37 | self.short_inventory = 0 38 | self.short_price = 0 39 | self.date_time_entry_short = None 40 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | import argparse 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import yaml 9 | 10 | from loggers import logger 11 | from typing import List, Union, Any 12 | 13 | 14 | def load_yaml(path: str, subsection: str) -> dict[str, Any]: 15 | """ 16 | Load a YAML file. 17 | 18 | Args: 19 | path (str): Path to the YAML file. 20 | subsection (str): Subsection to be considered (i.e. general, model, trading). 21 | 22 | Returns: 23 | A dictionary containing the YAML file. 24 | """ 25 | with open(path) as f: 26 | config = yaml.safe_load(f) 27 | return config[subsection] 28 | 29 | 30 | def data_split( 31 | dataset: str, 32 | training_stocks: list[str], 33 | target_stock: list[str], 34 | training_ratio: float, 35 | validation_ratio: float, 36 | include_target_stock_in_training: bool, 37 | ) -> None: 38 | """ 39 | Split the data into training, validation and test sets based on the training, validation and test ratios. 40 | 41 | Args: 42 | dataset (str): The considered dataset (i.e. nasdaq, lse, ...). 43 | training_stocks (list): The list of stocks to be used for training. 44 | target_stock (list): The list of stocks to be used for validation and test. 45 | training_ratio (float): The ratio of training data. 46 | validation_ratio (float): The ratio of validation data. 47 | include_target_stock_in_training (bool): Including or not the target stock in the training set. 48 | 49 | Returns: 50 | None. 51 | """ 52 | # List of target_stocks contains stocks that must be split into training, validation and test sets. 53 | # If requested, target stocks are removed from the training set in a second stage. 54 | for stock in target_stock: 55 | # Sorted list of scaled data. 56 | files_scaled = sorted(glob.glob(f"./data/{dataset}/scaled_data/{stock}/*.csv")) 57 | # Sorted list of unscaled data. 58 | files_unscaled = sorted( 59 | glob.glob(f"./data/{dataset}/unscaled_data/{stock}/*.csv") 60 | ) 61 | 62 | # Sanity check to make sure that the number of files in the scaled and unscaled folders is the same. 63 | assert len(files_scaled) == len( 64 | files_unscaled 65 | ), "The number of files in the scaled and unscaled folders must be the same." 66 | 67 | # Number of training files (based on training ratio). 68 | num_training_files = int(len(files_scaled) * training_ratio) 69 | # Number of validation files (based on validation ratio). 70 | num_validation_files = int(len(files_scaled) * validation_ratio) 71 | # Number of test files (based on test ratio). 72 | num_test_files = len(files_scaled) - num_training_files - num_validation_files 73 | 74 | # Create the training folder (scaled data) if it does not exist. 75 | if not os.path.exists(f"./data/{dataset}/scaled_data/training"): 76 | os.makedirs(f"./data/{dataset}/scaled_data/training") 77 | # Create the validation folder (scaled data) if it does not exist. 78 | if not os.path.exists(f"./data/{dataset}/scaled_data/validation"): 79 | os.makedirs(f"./data/{dataset}/scaled_data/validation") 80 | # Create the test folder (scaled data) if it does not exist. 
81 | if not os.path.exists(f"./data/{dataset}/scaled_data/test"): 82 | os.makedirs(f"./data/{dataset}/scaled_data/test") 83 | 84 | # Create the training folder (unscaled data) if it does not exist. 85 | if not os.path.exists(f"./data/{dataset}/unscaled_data/training"): 86 | os.makedirs(f"./data/{dataset}/unscaled_data/training") 87 | # Create the validation folder (unscaled data) if it does not exist. 88 | if not os.path.exists(f"./data/{dataset}/unscaled_data/validation"): 89 | os.makedirs(f"./data/{dataset}/unscaled_data/validation") 90 | # Create the test folder (unscaled data) if it does not exist. 91 | if not os.path.exists(f"./data/{dataset}/unscaled_data/test"): 92 | os.makedirs(f"./data/{dataset}/unscaled_data/test") 93 | 94 | # Move the files to the training folder (scaled data). 95 | # If requested, target stocks are removed from the training set. 96 | for i in range(num_training_files): 97 | destination_folder = f"./data/{dataset}/scaled_data/training" 98 | file = files_scaled[i] 99 | if include_target_stock_in_training: 100 | shutil.move(file, destination_folder) 101 | else: 102 | if target_stock not in file: 103 | shutil.move(file, destination_folder) 104 | print(f"{file} --> {destination_folder}") 105 | 106 | # Move the files to the validation folder (scaled data). 107 | for i in range(num_validation_files): 108 | destination_folder = f"./data/{dataset}/scaled_data/validation" 109 | file = files_scaled[i + num_training_files] 110 | shutil.move(file, destination_folder) 111 | print(f"{file} --> {destination_folder}") 112 | 113 | # Move the files to the test folder (scaled data). 114 | for i in range(num_test_files): 115 | destination_folder = f"./data/{dataset}/scaled_data/test" 116 | file = files_scaled[i + num_training_files + num_validation_files] 117 | shutil.move(file, destination_folder) 118 | print(f"{file} --> {destination_folder}") 119 | 120 | # Move the files to the training folder (unscaled data). 121 | # If requested, target stocks are removed from the training set. 122 | for i in range(num_training_files): 123 | destination_folder = f"./data/{dataset}/unscaled_data/training" 124 | file = files_unscaled[i] 125 | if include_target_stock_in_training: 126 | shutil.move(file, destination_folder) 127 | else: 128 | if target_stock not in file: 129 | shutil.move(file, destination_folder) 130 | print(f"{file} --> {destination_folder}") 131 | 132 | # Move the files to the validation folder (unscaled data). 133 | for i in range(num_validation_files): 134 | destination_folder = f"./data/{dataset}/unscaled_data/validation" 135 | file = files_unscaled[i + num_training_files] 136 | shutil.move(file, destination_folder) 137 | print(f"{file} --> {destination_folder}") 138 | 139 | # Move the files to the test folder (unscaled data). 140 | for i in range(num_test_files): 141 | destination_folder = f"./data/{dataset}/unscaled_data/test" 142 | file = files_unscaled[i + num_training_files + num_validation_files] 143 | shutil.move(file, destination_folder) 144 | print(f"{file} --> {destination_folder}") 145 | 146 | # Delete the folders containing the original processed LOB data. 147 | shutil.rmtree(f"./data/{dataset}/scaled_data/{stock}") 148 | shutil.rmtree(f"./data/{dataset}/unscaled_data/{stock}") 149 | 150 | # Until now, only the data belonging to target_stocks have been treated. 151 | # Now, all the other stocks need to be treated. 152 | # Perform the set difference operation between the training_stocks and target_stock sets. 
153 | difference_set = list(set(training_stocks).difference(set(target_stock))) 154 | 155 | # Stocks in difference_set are training-only data. 156 | for stock in difference_set: 157 | # Get the sorted list of scaled LOB files. 158 | files_scaled = sorted(glob.glob(f"./data/{dataset}/scaled_data/{stock}/*.csv")) 159 | # Get the sorted list of unscaled LOB files. 160 | files_unscaled = sorted( 161 | glob.glob(f"./data/{dataset}/unscaled_data/{stock}/*.csv") 162 | ) 163 | 164 | # Sanity check to make sure that the number of files in the scaled and unscaled folders is the same. 165 | assert len(files_scaled) == len( 166 | files_unscaled 167 | ), "The number of files in the scaled and unscaled folders must be the same." 168 | 169 | # Move the files to the training folder (scaled data). 170 | for i in range(len(files_scaled)): 171 | destination_folder = f"./data/{dataset}/scaled_data/training" 172 | file = files_scaled[i] 173 | shutil.move(file, destination_folder) 174 | print(f"{file} --> {destination_folder}") 175 | 176 | # Move the files to the training folder (unscaled data). 177 | for i in range(len(files_unscaled)): 178 | destination_folder = f"./data/{dataset}/unscaled_data/training" 179 | file = files_unscaled[i] 180 | shutil.move(file, destination_folder) 181 | print(f"{file} --> {destination_folder}") 182 | 183 | # Delete the folders containing the original processed LOB data. 184 | shutil.rmtree(f"./data/{dataset}/scaled_data/{stock}") 185 | shutil.rmtree(f"./data/{dataset}/unscaled_data/{stock}") 186 | 187 | # When dealing with multiple stocks, we want to maintain the same number of files for each of them in the training folder. 188 | print("Aligning data...") 189 | target_stock_dates = set() 190 | other_dates = set() 191 | # As a first step, we check the number of representatives of the target_stock in the training folder. 192 | for stock in target_stock: 193 | files = sorted( 194 | glob.glob(f"./data/{dataset}/unscaled_data/training/{stock}_*.csv") 195 | ) 196 | for file in files: 197 | date = file.split("/")[-1].split("_")[-1].split(".")[0] 198 | target_stock_dates.add(date) 199 | # As a second step, we check the number of representatives of the other stocks in the training folder. 200 | # As a third step, we remove redundant files (if any) from both scaled and unscaled data folder. 201 | for stock in training_stocks: 202 | files = sorted( 203 | glob.glob(f"./data/{dataset}/unscaled_data/training/{stock}_*.csv") 204 | ) 205 | for file in files: 206 | date = file.split("/")[-1].split("_")[-1].split(".")[0] 207 | other_dates.add(date) 208 | dates_to_remove = list(other_dates.difference(target_stock_dates)) 209 | for date in dates_to_remove: 210 | files = sorted( 211 | glob.glob(f"./data/{dataset}/unscaled_data/training/*_{date}.csv") 212 | ) 213 | for file in files: 214 | os.remove(file) 215 | files = sorted(glob.glob(f"./data/{dataset}/scaled_data/training/*_{date}.csv")) 216 | for file in files: 217 | os.remove(file) 218 | print("Data aligned.") 219 | 220 | 221 | def save_dataset_info( 222 | experiment_id: str, 223 | general_hyperparameters: dict[str, Any], 224 | ) -> None: 225 | """ 226 | Save all the days used in the training, validation and test sets. 227 | Args: 228 | experiment_id (str): ID of the experiment. 229 | general_hyperparameters (dict): General hyperparameters. 230 | 231 | Returns: 232 | None. 233 | """ 234 | # Access the training data folder and list all the files. 
235 | training_days_temp = glob.glob( 236 | f"./data/{general_hyperparameters['dataset']}/scaled_data/training/*.csv" 237 | ) 238 | # Access the validation data folder and list all the files. 239 | validation_days_temp = glob.glob( 240 | f"./data/{general_hyperparameters['dataset']}/scaled_data/validation/*.csv" 241 | ) 242 | # Access the test data folder and list all the files. 243 | test_days_temp = glob.glob( 244 | f"./data/{general_hyperparameters['dataset']}/scaled_data/test/*.csv" 245 | ) 246 | 247 | training_days = [] 248 | validation_days = [] 249 | test_days = [] 250 | 251 | # Extract the dates from the file names (training data). 252 | for i in training_days_temp: 253 | i = i.split("/")[-1].split("_")[-1] 254 | training_days.append(i) 255 | 256 | # Extract the dates from the file names (validation data). 257 | for i in validation_days_temp: 258 | i = i.split("/")[-1].split("_")[-1] 259 | validation_days.append(i) 260 | 261 | # Extract the dates from the file names (test data). 262 | for i in test_days_temp: 263 | i = i.split("/")[-1].split("_")[-1] 264 | test_days.append(i) 265 | 266 | # Create a dictionary containing the training, validation and test days. 267 | dataset_info = { 268 | "training_days": sorted(set(training_days)), 269 | "validation_days": sorted(set(validation_days)), 270 | "test_days": sorted(set(test_days)), 271 | } 272 | 273 | # Save the dictionary as a YAML file. 274 | logger.logger( 275 | experiment_id=experiment_id, 276 | header="dataset_info", 277 | contents=dataset_info, 278 | ) 279 | 280 | 281 | def get_best_levels_prices_and_labels( 282 | dataset: str, 283 | target_stocks: str, 284 | history_length: int, 285 | all_horizons: list[int], 286 | prediction_horizon: int, 287 | threshold: float, 288 | ) -> tuple[Any, ...]: 289 | """ 290 | Get the best levels (bid and ask) prices and the corresponding discretized labels. 291 | Args: 292 | dataset (str): Name of the dataset to be used (e.g. nasdaq, lse, ...). 293 | history_length (int): Length of the history (each model's sample is a 2D array of shape (, )). 294 | all_horizons (list): List all horizons computed in the preprocessing stage. 295 | prediction_horizon (int): Horizon to be considered. 296 | threshold (float): Threshold to be used to discretize the labels. 297 | 298 | Returns: 299 | A tuple containing the best levels (bid and ask) prices and the corresponding discretized labels. 300 | """ 301 | 302 | # List the test files. 303 | test_files = sorted(glob.glob(f"./data/{dataset}/unscaled_data/test/*{target_stocks[0]}*.csv")) 304 | 305 | best_levels_prices = pd.DataFrame() 306 | 307 | # Get the position of the prediction horizon in the list of all horizons. 308 | position = next( 309 | ( 310 | index 311 | for index, value in enumerate(all_horizons) 312 | if value == prediction_horizon 313 | ), 314 | None, 315 | ) 316 | all_labels_temp = [] 317 | 318 | for file in test_files: 319 | # Load the file. 320 | df = pd.read_csv(file).iloc[history_length:, :] 321 | # Reset the index. 322 | df.reset_index(drop=True, inplace=True) 323 | # Get all the labels. 324 | label_df = df.iloc[:, 41:] 325 | # Get the label corresponding to the prediction horizon. 326 | label = label_df.iloc[:, position] 327 | # Get the best levels (ask and bid) prices and the datetime corresponding to each tick. 328 | best_levels_prices = pd.concat( 329 | [best_levels_prices, df[["seconds", "ASKp1", "BIDp1"]]] 330 | ) 331 | # Append the label to the list of labels. 
332 |         all_labels_temp = all_labels_temp + label.tolist()
333 |
334 |     # Discretize the labels (0: downtrend, 1: no trend, 2: uptrend).
335 |     all_labels = [
336 |         2 if label >= threshold else 0 if label <= -threshold else 1
337 |         for label in all_labels_temp
338 |     ]
339 |
340 |     return best_levels_prices, all_labels
341 |
342 |
343 | def detect_changing_points(
344 |     target: int, cumulative_lengths: list[int]
345 | ) -> Union[int, None]:
346 |     """
347 |     Detect the index at which the file containing the target value starts.
348 |     Args:
349 |         target (int): Target index.
350 |         cumulative_lengths (list): List of cumulative lengths.
351 |
352 |     Returns:
353 |         0 if the target value falls in the first file; otherwise the cumulative length of the preceding files (i.e. the index at which the containing file starts). None if the target exceeds the last cumulative length.
354 |     """
355 |     for i, length in enumerate(cumulative_lengths):
356 |         if target <= length:
357 |             if i == 0:
358 |                 return 0
359 |             else:
360 |                 return cumulative_lengths[i - 1]
361 |     return None
362 |
363 |
364 | def wandb_hyperparameters_saving(
365 |     wandb_logger: Any,
366 |     general_hyperparameters: dict[str, Any],
367 |     model_hyperparameters: dict[str, Any],
368 | ) -> None:
369 |     """
370 |     Save the general/model hyperparameters in the Weights & Biases dashboard.
371 |     Args:
372 |         wandb_logger (any): Wandb logger.
373 |         general_hyperparameters (dict): General hyperparameters.
374 |         model_hyperparameters (dict): Model hyperparameters.
375 |
376 |     Returns:
377 |         None.
378 |     """
379 |     wbl = wandb_logger
380 |     for key in general_hyperparameters:
381 |         wbl.experiment.config[key] = general_hyperparameters[key]
382 |     for key in model_hyperparameters:
383 |         wbl.experiment.config[key] = model_hyperparameters[key]
384 |
385 |
386 | def str2bool(v):
387 |     if isinstance(v, bool):
388 |         return v
389 |     if v.lower() in ('yes', 'true', 't', 'y', '1'):
390 |         return True
391 |     elif v.lower() in ('no', 'false', 'f', 'n', '0'):
392 |         return False
393 |     else:
394 |         raise argparse.ArgumentTypeError('Boolean value expected.')
395 |
396 |
397 | def parse_args() -> Any:
398 |     """
399 |     Parser for input arguments.
400 |
401 |     Returns:
402 |         The parsed arguments.
403 |     """
404 |     parser = argparse.ArgumentParser(description="Hyperparameters acquisition.")
405 |
406 |     parser.add_argument(
407 |         "--experiment_id",
408 |         type=str,
409 |         default=None,
410 |         help="ID of the experiment (if any). This argument is used to resume older experiments or partially re-run experiments.",
411 |     )
412 |
413 |     # General hyperparameters
414 |     parser.add_argument(
415 |         "--dataset",
416 |         type=str,
417 |         default="nasdaq",
418 |         help="The dataset to be used (e.g. nasdaq, lse, ...). Each dataset has a different raw data format which needs to be correctly handled.",
419 |     )
420 |     parser.add_argument(
421 |         "--model",
422 |         type=str,
423 |         default="deeplob",
424 |         help="The model to be used (e.g. deeplob, ...).",
425 |     )
426 |     parser.add_argument(
427 |         "--training_stocks",
428 |         type=str,
429 |         default="XYZ",
430 |         help="Stock(s) to be used for training, expressed as a comma-separated string (e.g. 'CSCO').",
431 |     )
432 |     parser.add_argument(
433 |         "--target_stocks",
434 |         type=str,
435 |         default="XYZ",
436 |         help="The stock to be used in the validation and test sets (it is always unique).",
437 |     )
438 |     parser.add_argument(
439 |         "--normalization_window",
440 |         type=int,
441 |         default=5,
442 |         help="Number of files to be used for rolling data normalization.",
443 |     )
444 |     parser.add_argument(
445 |         "--horizons",
446 |         type=str,
447 |         default="10,50,100",
448 |         help="Horizon(s) to be considered (to be expressed in this format: '10,50,100').",
449 |     )
450 |     parser.add_argument(
451 |         "--training_ratio",
452 |         type=float,
453 |         default=0.6,
454 |         help="Training data proportion."
455 |     )
456 |     parser.add_argument(
457 |         "--validation_ratio",
458 |         type=float,
459 |         default=0.2,
460 |         help="Validation data proportion.",
461 |     )
462 |     parser.add_argument(
463 |         "--test_ratio",
464 |         type=float,
465 |         default=0.2,
466 |         help="Test data proportion."
467 |     )
468 |     parser.add_argument(
469 |         "--stages",
470 |         type=str,
471 |         default="data_processing",
472 |         help="Stage(s) to be run (to be expressed in this format: 'training,evaluation').",
473 |     )  # data_processing | torch_dataset_preparation | torch_dataset_preparation_backtest | complete_homological_structures_preparation | training,evaluation | backtest,post_trading_analysis
474 |     parser.add_argument(
475 |         "--include_target_stock_in_training",
476 |         type=str2bool,
477 |         default=True,
478 |         help="Whether or not to include the target stock in the training set.",
479 |     )
480 |     parser.add_argument(
481 |         "--targets_type",
482 |         type=str,
483 |         default='raw',
484 |         help="Type of targets to be used (i.e. smooth, raw).",
485 |     )
486 |
487 |     # Model hyperparameters
488 |     parser.add_argument(
489 |         "--batch_size",
490 |         type=int,
491 |         default=32,
492 |         help="Batch size."
493 |     )
494 |     parser.add_argument(
495 |         "--epochs",
496 |         type=int,
497 |         default=100,
498 |         help="Maximum number of epochs."
499 |     )
500 |     parser.add_argument(
501 |         "--learning_rate",
502 |         type=float,
503 |         default=6e-5,
504 |         help="Learning rate."
505 |     )
506 |     parser.add_argument(
507 |         "--num_workers",
508 |         type=int,
509 |         default=5,
510 |         help="Number of workers to be used by the dataloader.",
511 |     )
512 |     parser.add_argument(
513 |         "--history_length",
514 |         type=int,
515 |         default=100,
516 |         help="Length of the history to be used (each model's sample is a 2D array of shape (history_length, number of features)).",
517 |     )
518 |     parser.add_argument(
519 |         "--shuffling_seed",
520 |         type=int,
521 |         default=428,
522 |         help="Seed to be used for data shuffling.",
523 |     )
524 |     parser.add_argument(
525 |         "--lighten",
526 |         type=str2bool,
527 |         default=False,
528 |         help="Lighten the model's input (10 -> 5 levels).",
529 |     )
530 |     parser.add_argument(
531 |         "--threshold",
532 |         type=float,
533 |         default=0.0,
534 |         help="Threshold to be used to discretize the labels.",
535 |     )
536 |     parser.add_argument(
537 |         "--prediction_horizon",
538 |         type=int,
539 |         default=10,
540 |         help="Horizon to be considered in the inference stage.",
541 |     )
542 |     parser.add_argument(
543 |         "--balanced_sampling",
544 |         type=str2bool,
545 |         default=True,
546 |         help="Whether or not to use a balanced sampling approach in the training stage.",
547 |     )
548 |     parser.add_argument(
549 |         "--patience",
550 |         type=int,
551 |         default=10,
552 |         help="Patience to be used in the training stage.",
553 |     )
554 |
555 |     # Trading hyperparameters
556 |     parser.add_argument(
557 |         "--initial_cash",
558 |         type=int,
559 |         default=1000,
560 |         help="Initial cash to be used in the trading simulation.",
561 |     )
562 |     parser.add_argument(
563 |         "--trading_fee",
564 |         type=float,
565 |         default=0.0001,
566 |         help="Trading fee to be used in the trading simulation.",
567 |     )
568 |     parser.add_argument(
569 |         "--mid_side_trading",
570 |         type=str,
571 |         default="mid_to_mid",
572 |         help="Trading strategy to be used in the trading simulation.",
573 |     )
574 |     parser.add_argument(
575 |         "--simulation_type",
576 |         type=str,
577 |         default="with_fees",
578 |         help="Whether or not to apply trading fees in the trading simulation.",
579 |     )
580 |     parser.add_argument(
581 |         "--probability_threshold",
582 |         type=float,
583 |         default=0.65,
584 |         help="Threshold used to decide whether to exploit or ignore a signal in the trading simulation.",
585 |     )
586 |
587 |     args = parser.parse_args()
588 |     return args
589 |
590 |
591 | def create_hyperparameters_yaml(experiment_id: str, args: Any) -> None:
592 |     """
593 |     Create and save a YAML file containing the hyperparameters as part of an experiment.
594 |     Args:
595 |         experiment_id (str): ID of the experiment.
596 |         args (any): Stage's arguments.
597 |
598 |     Returns:
599 |         None.
600 |     """
601 |     training_stocks = list(
602 |         args.training_stocks.split(",")
603 |     )  # Parsing of 'training_stocks' input argument.
604 |     target_stocks = list(
605 |         args.target_stocks.split(",")
606 |     )  # Parsing of 'target_stocks' input argument.
607 |     horizons = list(
608 |         map(int, args.horizons.split(","))
609 |     )  # Parsing of 'horizons' input argument.
610 |     stages = list(args.stages.split(","))  # Parsing of 'stages' input argument.
611 |
612 |     # Create a dictionary (YAML structure) containing the hyperparameters.
613 |     data = {
614 |         "general": {
615 |             "dataset": args.dataset,
616 |             "model": args.model,
617 |             "training_stocks": training_stocks,
618 |             "target_stocks": target_stocks,
619 |             "normalization_window": args.normalization_window,
620 |             "horizons": horizons,
621 |             "training_ratio": args.training_ratio,
622 |             "validation_ratio": args.validation_ratio,
623 |             "test_ratio": args.test_ratio,
624 |             "stages": stages,
625 |             "include_target_stock_in_training": args.include_target_stock_in_training,
626 |             "targets_type": args.targets_type,
627 |         },
628 |         "model": {
629 |             "batch_size": args.batch_size,
630 |             "epochs": args.epochs,
631 |             "learning_rate": args.learning_rate,
632 |             "num_workers": args.num_workers,
633 |             "history_length": args.history_length,
634 |             "shuffling_seed": args.shuffling_seed,
635 |             "lighten": args.lighten,
636 |             "threshold": args.threshold,
637 |             "prediction_horizon": args.prediction_horizon,
638 |             "balanced_sampling": args.balanced_sampling,
639 |             "patience": args.patience,
640 |         },
641 |         "trading": {
642 |             "initial_cash": args.initial_cash,
643 |             "trading_fee": args.trading_fee,
644 |             "mid_side_trading": args.mid_side_trading,
645 |             "simulation_type": args.simulation_type,
646 |             "probability_threshold": args.probability_threshold,
647 |         },
648 |     }
649 |
650 |     # Specify the file path where the YAML file will be saved.
651 |     file_path = f"{logger.find_save_path(experiment_id)}/hyperparameters.yaml"
652 |
653 |     # Write the data to the YAML file.
654 |     with open(file_path, "w") as file:
655 |         yaml.dump(data, file)
656 |
657 |
658 | def create_tree(path: str) -> None:
659 |     """
660 |     Create folders recursively.
661 |     Args:
662 |         path (str): Tree of folders to be created.
663 |
664 |     Returns:
665 |         None.
666 |     """
667 |     # Recursively create a tree of folders. If the path already exists, delete it and create a new one.
668 |     if os.path.exists(path):
669 |         shutil.rmtree(path)
670 |     os.makedirs(path)
671 |
672 |
673 | def get_training_test_stocks_as_string(general_hyperparameters):
674 |     training_stocks = general_hyperparameters["training_stocks"]
675 |     general_training_string = ""
676 |     for s in training_stocks:
677 |         general_training_string += s + "_"
678 |     general_training_string = general_training_string[:-1]
679 |
680 |     test_stocks = general_hyperparameters["target_stocks"]
681 |     general_test_string = ""
682 |     for s in test_stocks:
683 |         general_test_string += s + "_"
684 |     general_test_string = general_test_string[:-1]
685 |
686 |     return general_training_string, general_test_string
--------------------------------------------------------------------------------
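Illustrative usage sketch (not part of the repository): a minimal example of how the helpers defined in utils.py above could be combined at the start of a run. It assumes utils.py is importable from the project root, that loggers/logger.py can resolve a save path for the hypothetical experiment id "demo_experiment", and that the command line carries the flags documented in parse_args().

import utils

if __name__ == "__main__":
    # Read the command-line hyperparameters (e.g. --model deeplob --horizons 10,50,100).
    args = utils.parse_args()

    # Persist the 'general', 'model' and 'trading' hyperparameter groups to
    # <experiment save path>/hyperparameters.yaml for the hypothetical experiment id.
    utils.create_hyperparameters_yaml(experiment_id="demo_experiment", args=args)

    # Derive the underscore-joined training/target stock strings from the parsed stock lists.
    stocks = {
        "training_stocks": args.training_stocks.split(","),
        "target_stocks": args.target_stocks.split(","),
    }
    training_str, target_str = utils.get_training_test_stocks_as_string(stocks)
    print(training_str, target_str)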