├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── LobFrame.iml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── README.md
├── data_processing
│   ├── complete_homological_utils.py
│   ├── data_process.py
│   └── data_process_utils.py
├── loaders
│   └── custom_dataset.py
├── loggers
│   ├── analysis.py
│   └── logger.py
├── main.py
├── models
│   ├── AxialLob
│   │   └── axiallob.py
│   ├── CNN1
│   │   └── cnn1.py
│   ├── CNN2
│   │   └── cnn2.py
│   ├── CompleteHCNN
│   │   └── complete_hcnn.py
│   ├── DLA
│   │   └── DLA.py
│   ├── DeepLob
│   │   └── deeplob.py
│   ├── LobTransformer
│   │   └── lobtransformer.py
│   ├── TABL
│   │   ├── bin_nn.py
│   │   ├── bin_tabl.py
│   │   ├── bl_layer.py
│   │   └── tabl_layer.py
│   ├── Transformer
│   │   └── transformer.py
│   └── iTransformer
│       └── itransformer.py
├── optimizers
│   ├── executor.py
│   └── lightning_batch_gd.py
├── requirements.txt
├── requirements_mac_os.txt
├── simulator
│   ├── market_sim.py
│   ├── post_trading_analysis.py
│   └── trading_agent.py
└── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.toptal.com/developers/gitignore/api/pycharm,osx,jupyternotebooks
2 | # Edit at https://www.toptal.com/developers/gitignore?templates=pycharm,osx,jupyternotebooks
3 |
4 | ### JupyterNotebooks ###
5 | # gitignore template for Jupyter Notebooks
6 | # website: http://jupyter.org/
7 |
8 | .ipynb_checkpoints
9 | */.ipynb_checkpoints/*
10 |
11 | # IPython
12 | profile_default/
13 | ipython_config.py
14 |
15 | # Remove previous ipynb_checkpoints
16 | # git rm -r .ipynb_checkpoints/
17 |
18 | ### OSX ###
19 | # General
20 | .DS_Store
21 | .AppleDouble
22 | .LSOverride
23 |
24 | # Icon must end with two \r
25 | Icon
26 |
27 |
28 | # Thumbnails
29 | ._*
30 |
31 | # Files that might appear in the root of a volume
32 | .DocumentRevisions-V100
33 | .fseventsd
34 | .Spotlight-V100
35 | .TemporaryItems
36 | .Trashes
37 | .VolumeIcon.icns
38 | .com.apple.timemachine.donotpresent
39 |
40 | # Directories potentially created on remote AFP share
41 | .AppleDB
42 | .AppleDesktop
43 | Network Trash Folder
44 | Temporary Items
45 | .apdisk
46 |
47 | ### PyCharm ###
48 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
49 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
50 |
51 | # User-specific stuff
52 | .idea/**/workspace.xml
53 | .idea/**/tasks.xml
54 | .idea/**/usage.statistics.xml
55 | .idea/**/dictionaries
56 | .idea/**/shelf
57 |
58 | # AWS User-specific
59 | .idea/**/aws.xml
60 |
61 | # Generated files
62 | .idea/**/contentModel.xml
63 |
64 | # Sensitive or high-churn files
65 | .idea/**/dataSources/
66 | .idea/**/dataSources.ids
67 | .idea/**/dataSources.local.xml
68 | .idea/**/sqlDataSources.xml
69 | .idea/**/dynamic.xml
70 | .idea/**/uiDesigner.xml
71 | .idea/**/dbnavigator.xml
72 |
73 | # Gradle
74 | .idea/**/gradle.xml
75 | .idea/**/libraries
76 |
77 | # Gradle and Maven with auto-import
78 | # When using Gradle or Maven with auto-import, you should exclude module files,
79 | # since they will be recreated, and may cause churn. Uncomment if using
80 | # auto-import.
81 | # .idea/artifacts
82 | # .idea/compiler.xml
83 | # .idea/jarRepositories.xml
84 | # .idea/modules.xml
85 | # .idea/*.iml
86 | # .idea/modules
87 | # *.iml
88 | # *.ipr
89 |
90 | # CMake
91 | cmake-build-*/
92 |
93 | # Mongo Explorer plugin
94 | .idea/**/mongoSettings.xml
95 |
96 | # File-based project format
97 | *.iws
98 |
99 | # IntelliJ
100 | out/
101 |
102 | # mpeltonen/sbt-idea plugin
103 | .idea_modules/
104 |
105 | # JIRA plugin
106 | atlassian-ide-plugin.xml
107 |
108 | # Cursive Clojure plugin
109 | .idea/replstate.xml
110 |
111 | # SonarLint plugin
112 | .idea/sonarlint/
113 |
114 | # Crashlytics plugin (for Android Studio and IntelliJ)
115 | com_crashlytics_export_strings.xml
116 | crashlytics.properties
117 | crashlytics-build.properties
118 | fabric.properties
119 |
120 | # Editor-based Rest Client
121 | .idea/httpRequests
122 |
123 | # Android studio 3.1+ serialized cache file
124 | .idea/caches/build_file_checksums.ser
125 |
126 | ### PyCharm Patch ###
127 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
128 |
129 | # *.iml
130 | # modules.xml
131 | # .idea/misc.xml
132 | # *.ipr
133 |
134 | # Sonarlint plugin
135 | # https://plugins.jetbrains.com/plugin/7973-sonarlint
136 | .idea/**/sonarlint/
137 |
138 | # SonarQube Plugin
139 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
140 | .idea/**/sonarIssues.xml
141 |
142 | # Markdown Navigator plugin
143 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
144 | .idea/**/markdown-navigator.xml
145 | .idea/**/markdown-navigator-enh.xml
146 | .idea/**/markdown-navigator/
147 |
148 | # Cache file creation bug
149 | # See https://youtrack.jetbrains.com/issue/JBR-2257
150 | .idea/$CACHE_FILE$
151 |
152 | # CodeStream plugin
153 | # https://plugins.jetbrains.com/plugin/12206-codestream
154 | .idea/codestream.xml
155 |
156 | # Azure Toolkit for IntelliJ plugin
157 | # https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
158 | .idea/**/azureSettings.xml
159 |
160 | # End of https://www.toptal.com/developers/gitignore/api/pycharm,osx,jupyternotebooks
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/.idea/LobFrame.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LOBFrame
2 |
3 | We release `LOBFrame` (see the two papers [`Deep Limit Order Book Forecasting`](https://arxiv.org/abs/2403.09267) and [`HLOB - Information Persistence and Structure in Limit Order Books`](https://arxiv.org/abs/2405.18938)), a novel, open-source code base which presents a renewed way to process large-scale Limit Order Book (LOB) data. This framework integrates all the latest cutting-edge insights from scientific research (see [Lucchese et al.](https://www.sciencedirect.com/science/article/pii/S0169207024000062), [Prata et al.](https://arxiv.org/pdf/2308.01915.pdf)) into a cohesive system. Its strength lies in the comprehensive nature of the implemented pipeline, which includes the data transformation and processing stage, an ultra-fast implementation of the training, validation, and testing steps, as well as the evaluation of the quality of a model's outputs through trading simulations. Moreover, it offers flexibility by accommodating the integration of new models, ensuring adaptability to future advancements in the field.
4 |
5 | ## Introduction
6 |
7 | In this tutorial, we show how to replicate the experiments presented in the two papers titled __"Deep Limit Order Book Forecasting: A microstructural guide"__ and __"HLOB - Information Persistence and Structure in Limit Order Books"__.
8 |
9 | Before starting, please remember to **ALWAYS CITE OUR WORKS** as follows:
10 |
11 | ```
12 | @article{briola2024deep,
13 | title={Deep Limit Order Book Forecasting},
14 | author={Briola, Antonio and Bartolucci, Silvia and Aste, Tomaso},
15 | journal={arXiv preprint arXiv:2403.09267},
16 | year={2024}
17 | }
18 | ```
19 |
20 | ```
21 | @misc{briola2024hlob,
22 | title={HLOB -- Information Persistence and Structure in Limit Order Books},
23 | author={Antonio Briola and Silvia Bartolucci and Tomaso Aste},
24 | year={2024},
25 | eprint={2405.18938},
26 | archivePrefix={arXiv},
27 | primaryClass={q-fin.TR}
28 | }
29 | ```
30 |
31 | ## Pre-requisites
32 |
33 | Install the required packages:
34 |
35 | ```bash
36 | pip3 install -r requirements.txt
37 | ```
38 |
39 | If you are using macOS, please proceed as follows:
40 |
41 | ```bash
42 | pip3 install -r requirements_mac_os.txt
43 | ```
44 |
45 | ## Data
46 | All the code in this repository exploits [LOBSTER](https://lobsterdata.com) data. For an overview of their structure, please refer
47 | to the official documentation available at the following [link](https://lobsterdata.com/info/DataStructure.php).
48 |
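For reference, a raw LOBSTER order book file can be loaded as follows. This is a minimal sketch: the file name and number of levels are illustrative, and the column layout follows the convention assumed in `data_processing/data_process.py` (`["ASKp1", "ASKs1", "BIDp1", "BIDs1", ...]`).

```python
import pandas as pd

# Placeholder file name and level count; LOBSTER order book files ship without a header row.
levels = 10
columns = [f"{side}{i}" for i in range(1, levels + 1) for side in ("ASKp", "ASKs", "BIDp", "BIDs")]
book = pd.read_csv("CSCO_2015-01-02_34200000_57600000_orderbook_10.csv", header=None, names=columns)
print(book.head())
```
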
49 | ## Preliminary operations
50 | Before starting any experiment:
51 | - Open the `lightning_batch_gd.py` file and insert the [Weights & Biases](https://wandb.ai/site) project's name and API key (search for TODOs).
52 | - Open the `utils.py` file and set the default values of the parameters.
53 |
54 | ## Usage
55 | To start an experiment from scratch, you need to follow these steps:
56 | - Place the raw data in the `data/nasdaq/raw` folder. The data must be in the LOBSTER format and each folder must be named with the asset's name (e.g. AAPL for Apple stock).
57 | - Run the following command to pre-process data:
58 | ```bash
59 | python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --stages "data_processing"
60 | ```
61 | - Run the following command to prepare the torch datasets (this reduces the training time):
62 | ```bash
63 | python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --stages "torch_dataset_preparation" --prediction_horizon 10
64 | ```
65 | If you are also interested in performing the backtest stage, run the following command instead:
66 | ```bash
67 | python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --stages "torch_dataset_preparation,torch_dataset_preparation_backtest" --prediction_horizon 10
68 | ```
69 | - If you are planning to use the HLOB model (see the paper titled [`HLOB - Information Persistence and Structure in Limit Order Books`](https://arxiv.org/abs/2405.18938)), it is mandatory to execute the following command:
70 | ```bash
71 | python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --stages "complete_homological_structures_preparation"
72 | ```
73 | - Run the following command to train the model:
74 | ```bash
75 | python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --stages "training"
76 | ```
77 | Currently available models are:
78 | - deeplob
79 | - transformer
80 | - itransformer
81 | - lobtransformer
82 | - dla
83 | - cnn1
84 | - cnn2
85 | - binbtabl
86 | - binctabl
87 | - axiallob
88 | - hlob
89 | - Run the following command to evaluate the model:
90 | ```bash
91 | python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --experiment_id "" --stages "evaluation"
92 | ```
93 | - Run the following command to analyze the results:
94 | ```bash
95 | python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --experiment_id "" --stages "backtest,post_trading_analysis"
96 | ```
97 |
98 | Multiple (compatible) stages can be executed at the same time. Consider the following example:
99 | ```bash
100 | python3 main.py --training_stocks "CSCO" --target_stocks "CSCO" --stages "data_processing,torch_dataset_preparation,torch_dataset_preparation_backtest,training,evaluation,backtest,post_trading_analysis"
101 | ```
102 |
103 | Each experiment can be resumed and re-run by specifying its ID in the `experiment_id` parameter.
104 |
105 | We now provide the typical structure of a folder before an experiment's run:
106 |
107 | ```bash
108 | .
109 | ├── README.md
110 | ├── data
111 | │   └── nasdaq
112 | │       ├── raw_data
113 | │       │   ├──
114 | │       │   └──
115 | │       ├── scaled_data
116 | │       │   ├── test
117 | │       │   ├── training
118 | │       │   └── validation
119 | │       └── unscaled_data
120 | │           ├── test
121 | │           ├── training
122 | │           └── validation
123 | ├── data_processing
124 | │   ├── data_process.py
125 | │   ├── data_process_utils.py
126 | │   └── complete_homological_utils.py
127 | ├── loaders
128 | │   └── custom_dataset.py
129 | ├── loggers
130 | │   ├── logger.py
131 | │   └── results
132 | ├── main.py
133 | ├── models
134 | │   ├── AxialLob
135 | │   │   └── axiallob.py
136 | │   ├── CNN1
137 | │   │   └── cnn1.py
138 | │   ├── CNN2
139 | │   │   └── cnn2.py
140 | │   ├── DeepLob
141 | │   │   └── deeplob.py
142 | │   ├── DLA
143 | │   │   └── DLA.py
144 | │   ├── iTransformer
145 | │   │   └── itransformer.py
146 | │   ├── LobTransformer
147 | │   │   └── lobtransformer.py
148 | │   ├── TABL
149 | │   │   ├── bin_nn.py
150 | │   │   ├── bin_tabl.py
151 | │   │   ├── bl_layer.py
152 | │   │   └── tabl_layer.py
153 | │   ├── Transformer
154 | │   │   └── transformer.py
155 | │   └── CompleteHCNN
156 | │       └── complete_hcnn.py
157 | ├── optimizers
158 | │   ├── executor.py
159 | │   └── lightning_batch_gd.py
160 | ├── requirements.txt
161 | ├── simulator
162 | │   ├── market_sim.py
163 | │   ├── post_trading_analysis.py
164 | │   └── trading_agent.py
165 | ├── torch_datasets
166 | │   └── threshold_1e-05
167 | │       └── batch_size_32
168 | │           └── 10
169 | │               ├── test_dataset.pt
170 | │               ├── test_dataset_backtest.pt
171 | │               ├── training_dataset.pt
172 | │               └── validation_dataset.pt
173 | ├── results
174 | └── utils.py
175 | ```
176 |
177 | ## License
178 |
179 | Copyright 2024 Antonio Briola, Silvia Bartolucci, Tomaso Aste.
180 |
181 | Licensed under the CC BY-NC-ND 4.0 License (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at:
182 |
183 | ```
184 | https://creativecommons.org/licenses/by-nc-nd/4.0/
185 | ```
186 |
187 | Software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the provided link for the specific language governing permissions and limitations under the License.
--------------------------------------------------------------------------------
/data_processing/complete_homological_utils.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import concurrent.futures
3 | from itertools import chain
4 |
5 | import pandas as pd
6 | import numpy as np
7 | import polars as pl
8 | from typing import *
9 |
10 | import networkx as nx
11 | from fast_tmfg import *
12 | from sklearn.metrics import mutual_info_score
13 |
14 | from utils import get_training_test_stocks_as_string
15 | import matplotlib.pyplot as plt
16 | import seaborn as sns
17 |
18 | import torch
19 |
20 |
21 | def compute_pairwise_mi(df: pd.DataFrame, n_bins: int = 3000) -> pd.DataFrame:
22 | """
23 | Compute the pairwise mutual information between the columns of a dataframe.
24 |
25 | Parameters
26 | ----------
27 | df : pandas.Dataframe
28 | The pandas dataframe to compute the pairwise mutual information for.
29 | n_bins: int
30 | The number of bins to use for discretization.
31 |
32 | Returns
33 | ----------
34 | mi_matrix: pandas.Dataframe
35 | The pairwise mutual information matrix.
36 |
37 | """
38 |
39 | shuffled_df = df.sample(frac=1, random_state=1).reset_index(drop=True) # Shuffle the dataset.
40 | sampled_df = shuffled_df.sample(n=len(df), replace=True) # Perform bootstrapping.
41 | df = sampled_df.copy() # Copy the dataset into a variable called 'df'.
42 | df.reset_index(drop=True, inplace=True) # Reset the indices.
43 | del sampled_df # Delete an unused variable.
44 |
45 | flat_series = df.values.flatten() # Flatten the df to perform the binning on all the values (not feature-by-feature).
46 | bins = pd.cut(flat_series, bins=n_bins, labels=False, retbins=True) # Perform the binning.
47 | # Apply the binning to each feature of the original dataset.
48 | for column in df.columns:
49 | df[column] = pd.cut(df[column], bins=bins[1], labels=False, include_lowest=True)
50 | del flat_series # Delete an unused variable.
51 |
52 | discretized_df = df.copy() # Copy the dataset into a variable called 'discretized_df'.
53 | del df # Delete an unused variable.
54 |
55 | # Initialize an empty Mutual Information (MI) matrix and fill it with 0s.
56 | n_features = discretized_df.shape[1]
57 | mi_matrix = np.zeros((n_features, n_features))
58 |
59 | # Compute the pairwise MI and fill the MI matrix consequently.
60 | for i in range(n_features):
61 | for j in range(i, n_features):
62 | mi_value = mutual_info_score(
63 | discretized_df.iloc[:, i], discretized_df.iloc[:, j]
64 | )
65 | mi_matrix[i, j] = mi_value
66 | mi_matrix[j, i] = mi_value
67 |
68 | mi_matrix = pd.DataFrame(mi_matrix) # Transform the MI matrix into a Pandas dataframe.
69 | return mi_matrix # Return the MI matrix in the form of a Pandas dataframe.
70 |
71 |
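# Minimal usage sketch (illustrative only): the toy column names, sizes and bin count below
# are arbitrary and are not taken from the repository configuration.
def _demo_compute_pairwise_mi() -> pd.DataFrame:
    rng = np.random.default_rng(0)
    toy_volumes = pd.DataFrame(
        rng.integers(1, 500, size=(1000, 4)),
        columns=["ASKs1", "BIDs1", "ASKs2", "BIDs2"],
    )
    # Use far fewer bins than the default 3000 because the toy sample is tiny.
    return compute_pairwise_mi(toy_volumes, n_bins=50)

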
72 | def process_file(
73 | file: str,
74 | ) -> Tuple[pd.DataFrame, nx.Graph, str]:
75 | """
76 | Compute the TMFG for volumes of a given orderbook file.
77 |
78 | Parameters
79 | ----------
80 | file : str
81 | The path to the file to compute the TMFG for.
82 |
83 | Returns
84 | ----------
85 | sim_all : pandas.DataFrame
86 | The pairwise mutual information matrix for the ask and bid volumes.
87 | net_all : networkx.Graph
88 | The TMFG for the ask and bid volumes.
89 | file : str
90 | The path of the processed orderbook file.
97 | """
98 |
99 | print(f"Computing structure for file: {file}...")
100 | # Read the file using polars to accelerate the process.
101 | df = pl.read_csv(file)
102 | df = df.to_pandas()
103 |
104 | # Extract the volumes for the ask and bid sides.
105 | volumes_all = df.iloc[:, 1:41].iloc[:, 1::2]
106 |
107 | # Compute the pairwise mutual information matrices.
108 | sim_all = compute_pairwise_mi(volumes_all)
109 |
110 | # Compute the TMFGs.
111 | model_all = TMFG()
112 | cliques_all, seps_all, adj_matrix_all = model_all.fit_transform(
113 | sim_all, output="weighted_sparse_W_matrix"
114 | )
115 |
116 | # Convert the adjacency matrices to networkx graphs.
117 | net_all = nx.from_numpy_array(adj_matrix_all)
118 |
119 | return sim_all, net_all, file
120 |
121 |
122 | def mean_tmfg(sm_list: List[pd.DataFrame]) -> pd.DataFrame:
123 | """
124 | Compute the average similarity matrix for a list of similarity matrices.
125 |
126 | Parameters
127 | ----------
128 | sm_list : List[pandas.DataFrame]
129 | The list of similarity matrices to compute the average for.
130 |
131 | Returns
132 | ----------
133 | average_matrix : pandas.DataFrame
134 | The average similarity matrix.
135 | """
136 |
137 | # Stack the matrices along a new axis (axis=0)
138 | stacked_matrices = np.stack(sm_list, axis=0)
139 |
140 | # Calculate the entry-wise average along the new axis
141 | average_matrix = np.mean(stacked_matrices, axis=0)
142 | np.fill_diagonal(average_matrix, 0)
143 |
144 | average_matrix = pd.DataFrame(average_matrix)
145 |
146 | '''
147 | plt.figure(figsize=(10, 8)) # Optional: Adjusts the size of the figure
148 | sns.heatmap(average_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, linewidths=.5)
149 | plt.title("Correlation Matrix Heatmap")
150 | plt.show()
151 | '''
152 |
153 | return average_matrix
154 |
155 |
156 | def extract_components(
157 | cliques: List[List[int]], separators: List[List[int]], adjacency_matrix: np.ndarray
158 | ) -> Tuple[List[List[int]], List[List[int]], List[List[int]]]:
159 | """
160 | Given the cliques, separators and adjacency matrix of a TMFG, extract the b-cliques of size 2 (edges), 3 (triangles) and 4 (tetrahedra).
161 |
162 | Parameters
163 | ----------
164 | cliques : List[List[int]]
165 | The list of cliques of the TMFG.
166 | separators : List[List[int]]
167 | The list of separators of the TMFG.
168 | adjacency_matrix : numpy.ndarray
169 | The adjacency matrix of the TMFG.
170 |
171 | Returns
172 | ----------
173 | final_b_cliques_4 : List[List[int]]
174 | The final list of tetrahedra.
175 | final_b_cliques_3 : List[List[int]]
176 | The final list of triangles.
177 | final_b_cliques_2 : List[List[int]]
178 | The final list of edges.
179 | """
180 |
181 | # Extract edges.
182 | edges = []
183 | adjacency_matrix = nx.from_numpy_array(adjacency_matrix)
184 |
185 | for i in nx.enumerate_all_cliques(adjacency_matrix):
186 | if len(i) == 2:
187 | edges.append(sorted(i))
188 |
189 | b_cliques_4 = []
190 | b_cliques_3 = []
191 | b_cliques_2 = []
192 |
193 | b_cliques_all = nx.enumerate_all_cliques(adjacency_matrix)
194 |
195 | for i in b_cliques_all:
196 | if len(i) == 2:
197 | b_cliques_2.append(sorted(i))
198 | if len(i) == 3:
199 | b_cliques_3.append(sorted(i))
200 | if len(i) == 4:
201 | b_cliques_4.append(sorted(i))
202 |
203 | final_b_cliques_4 = b_cliques_4
204 |
205 | final_b_cliques_3 = b_cliques_3
206 |
207 | final_b_cliques_2 = edges
208 |
209 | final_b_cliques_4 = [[(x * 2) + 1 for x in sublist] for sublist in final_b_cliques_4]
210 | final_b_cliques_4 = [[x, x - 1] for sublist in final_b_cliques_4 for x in sublist]
211 | final_b_cliques_4 = list(chain.from_iterable(final_b_cliques_4))
212 | final_b_cliques_4 = [final_b_cliques_4[i:i + 8] for i in range(0, len(final_b_cliques_4), 8)]
213 | final_b_cliques_4 = [sorted(sublist) for sublist in final_b_cliques_4]
214 |
215 | final_b_cliques_3 = [[(x * 2) + 1 for x in sublist] for sublist in final_b_cliques_3]
216 | final_b_cliques_3 = [[x, x - 1] for sublist in final_b_cliques_3 for x in sublist]
217 | final_b_cliques_3 = list(chain.from_iterable(final_b_cliques_3))
218 | final_b_cliques_3 = [final_b_cliques_3[i:i + 6] for i in range(0, len(final_b_cliques_3), 6)]
219 | final_b_cliques_3 = [sorted(sublist) for sublist in final_b_cliques_3]
220 |
221 | final_b_cliques_2 = [[(x * 2) + 1 for x in sublist] for sublist in final_b_cliques_2]
222 | final_b_cliques_2 = [[x, x - 1] for sublist in final_b_cliques_2 for x in sublist]
223 | final_b_cliques_2 = list(chain.from_iterable(final_b_cliques_2))
224 | final_b_cliques_2 = [final_b_cliques_2[i:i + 4] for i in range(0, len(final_b_cliques_2), 4)]
225 | final_b_cliques_2 = [sorted(sublist) for sublist in final_b_cliques_2]
226 |
227 | return final_b_cliques_4, final_b_cliques_3, final_b_cliques_2
228 |
229 |
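# Worked example (illustrative): in the 40-column order book layout
# (ASKp1, ASKs1, BIDp1, BIDs1, ...), volume series x of the similarity matrix maps back to
# the raw feature columns 2 * x (price) and 2 * x + 1 (size), which is what the index
# transformations above reconstruct. For instance, the TMFG edge [0, 3] expands to the
# column indices [0, 1, 6, 7].
def _demo_edge_index_mapping() -> List[int]:
    edge = [0, 3]  # First and fourth volume series of the similarity matrix.
    expanded = sorted(chain.from_iterable([(x * 2) + 1, x * 2] for x in edge))
    assert expanded == [0, 1, 6, 7]
    return expanded

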
230 | def execute_pipeline(file_patterns, general_hyperparameters):
231 | files = []
232 | for pattern in file_patterns:
233 | files.extend(glob.glob(pattern.format(dataset=general_hyperparameters['dataset'])))
234 |
235 | max_threads = 5
236 | with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
237 | results = list(executor.map(process_file, files))
238 |
239 | nets_all = []
240 | sm_all = []
241 | files_all = []
242 |
243 | for result in results:
244 | sim_all, net_all, file = result
245 | nets_all.append(net_all)
246 | sm_all.append(sim_all)
247 | files_all.append(file)
248 |
249 | del results
250 |
251 | model_all = TMFG()
252 | cliques_all, seps_all, adj_matrix_all = model_all.fit_transform(
253 | mean_tmfg(sm_all), output="weighted_sparse_W_matrix"
254 | )
255 |
256 | c4, c3, c2 = extract_components(cliques_all, seps_all, adj_matrix_all)
257 | c4 = list(chain.from_iterable(c4))
258 | c3 = list(chain.from_iterable(c3))
259 | c2 = list(chain.from_iterable(c2))
260 |
261 | original_cliques_all = list(chain.from_iterable(cliques_all))
262 | original_seps_all = list(chain.from_iterable(seps_all))
263 |
264 | return c4, c3, c2, original_cliques_all, original_seps_all, adj_matrix_all, sm_all, files_all
265 |
266 |
267 | def get_complete_homology(
268 | general_hyperparameters: Dict[str, Any],
269 | model_hyperparameters: Dict[str, Any],
270 | ) -> None:
271 | """
272 | Compute the homological structures to be used in the HCNN building process.
273 |
274 | Parameters
275 | ----------
276 | general_hyperparameters : Dict[str, Any]
277 | The general hyperparameters of the experiment.
278 |
279 | Returns
280 | ----------
281 | None. The homological structures dictionary is saved to disk rather than returned.
282 | """
283 |
284 | file_patterns_training = [f"./data/{general_hyperparameters['dataset']}/unscaled_data/training/*{element}*.csv" for element in
285 | general_hyperparameters['training_stocks']]
286 | c4_training, c3_training, c2_training, original_cliques_all_training, original_seps_all_training, adj_matrix_all_training, sm_all_training, files_all_training = execute_pipeline(
287 | file_patterns_training, general_hyperparameters)
288 |
289 | file_patterns_validation = [f"./data/{general_hyperparameters['dataset']}/unscaled_data/validation/*{element}*.csv" for element in
290 | general_hyperparameters['training_stocks']]
291 | _, _, _, _, _, adj_matrix_all_validation, sm_all_validation, files_all_validation = execute_pipeline(file_patterns_validation, general_hyperparameters)
292 |
293 | file_patterns_test = [f"./data/{general_hyperparameters['dataset']}/unscaled_data/test/*{element}*.csv" for element in
294 | general_hyperparameters['target_stocks']]
295 | _, _, _, _, _, adj_matrix_all_test, sm_all_test, files_all_test = execute_pipeline(file_patterns_test, general_hyperparameters)
296 |
297 | homological_structures = {"tetrahedra": c4_training,
298 | "triangles": c3_training,
299 | "edges": c2_training,
300 | "original_cliques": original_cliques_all_training,
301 | "original_separators": original_seps_all_training,
302 | "adj_matrix_training": adj_matrix_all_training,
303 | "similarity_matrices_training": sm_all_training,
304 | "files_training": files_all_training,
305 | "adj_matrix_validation": adj_matrix_all_validation,
306 | "similarity_matrices_validation": sm_all_validation,
307 | "files_validation": files_all_validation,
308 | "adj_matrix_test": adj_matrix_all_test,
309 | "similarity_matrices_test": sm_all_test,
310 | "files_test": files_all_test
311 | }
312 |
313 | training_stocks_string, test_stocks_string = get_training_test_stocks_as_string(general_hyperparameters)
314 | print(training_stocks_string, test_stocks_string)
315 | torch.save(homological_structures,
316 | f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{training_stocks_string}_test_{test_stocks_string}/complete_homological_structures.pt")
317 | # torch.save(homological_structures,
318 | # f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/homological_structures_large_tick_stocks.pt")
319 | print('Homological structures have been saved.')
320 |
321 | # get_homology({'dataset': 'nasdaq'})
322 |
--------------------------------------------------------------------------------
/data_processing/data_process.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import glob
3 | import random
4 | import re
5 | from datetime import datetime, time
6 |
7 | import numpy as np
8 | import pandas as pd
9 |
10 |
11 | def process_data(
12 | ticker: str,
13 | input_path: str,
14 | output_path: str,
15 | logs_path: str,
16 | horizons: list[int],
17 | normalization_window: int,
18 | time_index: str = "seconds",
19 | features: str = "orderbooks",
20 | scaling: bool = True,
21 | ) -> None:
22 | """
23 | Function to pre-process LOBSTER data. The data must be stored in the input_path directory as 'daily message LOB' and 'orderbook' files.
24 |
25 | The data are treated in the following way:
26 | - Orderbook's states with crossed quotes are removed.
27 | - Each state in the orderbook is time-stamped, with states occurring at the same time collapsed onto the last occurring state.
28 | - The first and last 10 minutes of market activity (inside usual opening times) are dropped.
29 | - Rolling z-score normalization is applied to the data, i.e. the mean and standard deviation of the previous `normalization_window` days are used to normalize the current day's data.
30 | Hence, the first `normalization_window` days are dropped.
31 | - Smoothed returns at the requested horizons (in orderbook's changes) are returned:
32 | - if smoothing = "horizon": l = (m+ - m)/m, where m+ denotes the mean of the next h mid-prices, m(.) is current mid-price.
33 | - if smoothing = "uniform": l = (m+ - m)/m, where m+ denotes the mean of the k+1 mid-prices centered at m(. + h), m(.) is current mid-price.
34 |
35 | A log file is produced tracking:
36 | - Orderbook's files with problems.
37 | - Message orderbook's files with problems.
38 | - Trading days with unusual opening - closing times.
39 | - Trading days with crossed quotes.
40 |
41 | A statistics.csv file summarizes the following (daily) statistics:
42 | - # Updates (000): the total number of changes in the orderbook file.
43 | - # Trades (000): the total number of trades, computed by counting the number of message book events corresponding to the execution of (possibly hidden)
44 | limit orders ('event_type' 4 or 5 in LOBSTER orderbook's message file).
45 | - # Price Changes (000): the total number of price changes per day.
46 | - # Price (USD): average price on the day, weighted average by time.
47 | - # Spread (bps): average spread on the day, weighted average by time.
48 | - # Volume (USD MM): total volume traded on the day, computed as the sum of the volumes of all the executed trades ('event_type' 4 or 5 in LOBSTER orderbook's message file).
49 | The volume of a single trade is given by size*price.
50 | - # Tick size: the fraction of time that the bid-ask spread is equal to one tick for each stock.
51 |
52 | Args:
53 | ticker (str): The ticker to be considered.
54 | input_path (str): The path where the order book and message book files are stored, order book files have shape (:, 4*levels):
55 | ["ASKp1", "ASKs1", "BIDp1", "BIDs1", ..., "ASKp10", "ASKs10", "BIDp10", "BIDs10"].
56 | output_path (str): The path where we wish to save the processed datasets.
57 | logs_path (str): The path where we wish to save the logs.
58 | time_index (str): The time-index to use ("seconds" or "datetime").
59 | horizons (list): Forecasting horizons for labels.
60 | normalization_window (int): Window for rolling z-score normalization.
61 | features (str): Whether to return 'orderbooks' or 'orderflows'.
62 | scaling (bool): Whether to apply rolling z-score normalization.
63 |
64 | Returns:
65 | None.
66 | """
67 |
68 | csv_file_list = glob.glob(
69 | f"{input_path}/*.csv"
70 | ) # Get the list of all the .csv files in the input_path directory.
71 |
72 | csv_orderbook = [
73 | name for name in csv_file_list if "orderbook" in name
74 | ] # Get the list of all the orderbook files in the input_path directory.
75 | csv_orderbook.sort() # Sort the list of orderbook files.
76 | csv_message = [
77 | name for name in csv_file_list if "message" in name
78 | ] # Get the list of all the message files in the input_path directory.
79 | csv_message.sort() # Sort the list of message files.
80 |
81 | # Check if exactly half of the files are order book and exactly half are messages.
82 | assert len(csv_message) == len(csv_orderbook)
83 | assert len(csv_file_list) == len(csv_message) + len(csv_orderbook)
84 |
85 | print(f"Data preprocessing loop started. SCALING: {str(scaling)}.")
86 |
87 | # Initialize the dataframe containing logs.
88 | logs = []
89 | df_statistics = pd.DataFrame(
90 | [],
91 | columns=[
92 | "Updates (000)",
93 | "Trades (000)",
94 | "Price Changes (000)",
95 | "Price (USD)",
96 | "Spread (bps)",
97 | "Volume (USD MM)",
98 | "Tick Size",
99 | ],
100 | dtype=float,
101 | )
102 |
103 | # Initialize dataframes for dynamic Z-score normalization.
104 | mean_df = pd.DataFrame()
105 | mean2_df = pd.DataFrame()
106 | nsamples_df = pd.DataFrame()
107 |
108 | for orderbook_name in csv_orderbook:
109 | print(orderbook_name)
110 |
111 | # Read orderbook files and keep a record of problematic files.
112 | df_orderbook = None
113 | try:
114 | df_orderbook = pd.read_csv(orderbook_name, header=None)
115 | except:
116 | logs.append(f"{orderbook_name} skipped. Error: failed to read orderbook.")
117 |
118 | levels = int(
119 | df_orderbook.shape[1] / 4
120 | ) # Infer the number of LOB levels (the columns come in groups of 4: ask/bid price and size).
121 | feature_names_raw = [
122 | "ASKp",
123 | "ASKs",
124 | "BIDp",
125 | "BIDs",
126 | ] # Define sorted raw features' names.
127 | feature_names = []
128 | for i in range(1, levels + 1):
129 | for j in range(4):
130 | feature_names += [
131 | feature_names_raw[j] + str(i)
132 | ] # Add to raw features' names the level number.
133 | df_orderbook.columns = (
134 | feature_names # Rename the columns of the orderbook dataframe.
135 | )
136 |
137 | # Keep price columns as integers (the division by 10000 is commented out below).
138 | target_columns = [col for col in df_orderbook.columns if "ASKp" in col or "BIDp" in col]
139 | df_orderbook[target_columns] = df_orderbook[target_columns].astype(int) # / 10000
140 |
141 | df_orderbook.insert(
142 | 0, "mid_price", (df_orderbook["ASKp1"] + df_orderbook["BIDp1"]) / 2
143 | ) # Add the mid-price column to the orderbook dataframe.
144 | df_orderbook.mid_price = df_orderbook.mid_price.astype(int)
145 |
146 | # Extract the date from the orderbook file's name.
147 | match = re.findall(r"\d{4}-\d{2}-\d{2}", orderbook_name)[-1]
148 | date = datetime.strptime(match, "%Y-%m-%d")
149 |
150 | # Read message files and keep a record of problematic files.
151 | message_name = orderbook_name.replace("orderbook", "message")
152 | df_message = None
153 | try:
154 | df_message = pd.read_csv(
155 | message_name, usecols=[0, 1, 2, 3, 4, 5], header=None
156 | )
157 | except:
158 | logs.append(f"{message_name} skipped. Error: failed to read message file.")
159 |
160 | # Check the two dataframes created before have the same length.
161 | assert len(df_message) == len(df_orderbook)
162 |
163 | # Rename the columns of the message dataframe.
164 | df_message.columns = [
165 | "seconds",
166 | "event_type",
167 | "order ID",
168 | "volume",
169 | "price",
170 | "direction",
171 | ]
172 |
173 | # Remove trading halts.
174 | trading_halts_start = df_message[
175 | (df_message["event_type"] == 7) & (df_message["price"] == -1)
176 | ].index
177 | trading_halts_end = df_message[
178 | (df_message["event_type"] == 7) & (df_message["price"] == 1)
179 | ].index
180 | trading_halts_index = np.array([])
181 | for halt_start, halt_end in zip(trading_halts_start, trading_halts_end):
182 | trading_halts_index = np.append(
183 | trading_halts_index,
184 | df_message.index[
185 | (df_message.index >= halt_start) & (df_message.index < halt_end)
186 | ],
187 | )
188 | if len(trading_halts_index) > 0:
189 | for halt_start, halt_end in zip(trading_halts_start, trading_halts_end):
190 | logs.append(
191 | f"Warning: trading halt between {str(df_message.loc[halt_start, 'seconds'])} and {str(df_message.loc[halt_end, 'seconds'])} in {orderbook_name}."
192 | )
193 | df_orderbook = df_orderbook.drop(trading_halts_index)
194 | df_message = df_message.drop(trading_halts_index)
195 |
196 | # Remove crossed quotes.
197 | crossed_quotes_index = df_orderbook[
198 | (df_orderbook["BIDp1"] > df_orderbook["ASKp1"])
199 | ].index
200 | if len(crossed_quotes_index) > 0:
201 | logs.append(
202 | f"Warning: {str(len(crossed_quotes_index))} crossed quotes removed in {orderbook_name}."
203 | )
204 | df_orderbook = df_orderbook.drop(crossed_quotes_index)
205 | df_message = df_message.drop(crossed_quotes_index)
206 |
207 | # Add the 'seconds since midnight' column to the orderbook from the message book.
208 | df_orderbook.insert(0, "seconds", df_message["seconds"])
209 |
210 | # One conceptual event (e.g. limit order modification which is implemented as a cancellation followed by an immediate new arrival,
211 | # single market order executing against multiple resting limit orders) may appear as multiple rows in the message file, all with
212 | # the same timestamp. We hence group the order book data by unique timestamps and take the last entry.
213 | df_orderbook = df_orderbook.groupby(["seconds"]).tail(1)
214 | df_message = df_message.groupby(["seconds"]).tail(1)
215 |
216 | # Check market opening times for strange values.
217 | market_open = (int(df_orderbook["seconds"].iloc[0] / 60) / 60) # Open at minute before first transaction.
218 | market_close = (int(df_orderbook["seconds"].iloc[-1] / 60) + 1) / 60 # Close at minute after last transaction.
219 |
220 | if not (market_open == 9.5 and market_close == 16):
221 | logs.append(
222 | f"Warning: unusual opening times in {orderbook_name}: {str(market_open)} - {str(market_close)}."
223 | )
224 |
225 | if time_index == "seconds":
226 | # Drop values outside of market hours using seconds
227 | df_orderbook = df_orderbook.loc[
228 | (df_orderbook["seconds"] >= 34200) & (df_orderbook["seconds"] <= 57600)
229 | ]
230 | df_message = df_message.loc[
231 | (df_message["seconds"] >= 34200) & (df_message["seconds"] <= 57600)
232 | ]
233 |
234 | # Drop first and last 10 minutes of trading using seconds.
235 | market_open_seconds = market_open * 60 * 60 + 10 * 60
236 | market_close_seconds = market_close * 60 * 60 - 10 * 60
237 | df_orderbook = df_orderbook.loc[
238 | (df_orderbook["seconds"] >= market_open_seconds)
239 | & (df_orderbook["seconds"] <= market_close_seconds)
240 | ]
241 | df_message = df_message.loc[
242 | (df_message["seconds"] >= market_open_seconds)
243 | & (df_message["seconds"] <= market_close_seconds)
244 | ]
245 | else:
246 | raise Exception("time_index must be seconds.")
247 |
248 | # Save statistical information.
249 | if len(df_orderbook) > 0:
250 | updates = df_orderbook.shape[0] / 1000
251 | trades = (
252 | np.sum(
253 | (df_message["event_type"] == 4) | (df_message["event_type"] == 5)
254 | )
255 | / 1000
256 | )
257 | price_changes = np.sum(~(np.diff(df_orderbook["mid_price"]) == 0.0)) / 1000
258 | time_deltas = np.append(
259 | np.diff(df_orderbook["seconds"]),
260 | market_close_seconds - df_orderbook["seconds"].iloc[-1],
261 | )
262 | price = np.average(df_orderbook["mid_price"] / 10 ** 4, weights=time_deltas)
263 | spread = np.average(
264 | (df_orderbook["ASKp1"] - df_orderbook["BIDp1"])
265 | / df_orderbook["mid_price"]
266 | * 10000,
267 | weights=time_deltas,
268 | )
269 | volume = (
270 | np.sum(
271 | df_message.loc[
272 | (df_message["event_type"] == 4)
273 | | (df_message["event_type"] == 5)
274 | ]["volume"]
275 | * df_message.loc[
276 | (df_message["event_type"] == 4)
277 | | (df_message["event_type"] == 5)
278 | ]["price"]
279 | / 10 ** 4
280 | )
281 | / 10 ** 6
282 | )
283 | tick_size = np.average(
284 | (df_orderbook["ASKp1"] - df_orderbook["BIDp1"]) == 100.0,
285 | weights=time_deltas,
286 | )
287 |
288 | df_statistics.loc[date] = [
289 | updates,
290 | trades,
291 | price_changes,
292 | price,
293 | spread,
294 | volume,
295 | tick_size,
296 | ]
297 |
298 | if features == "orderbooks":
299 | pass
300 | elif features == "orderflows":
301 | # Compute bid and ask multilevel orderflow.
302 | ASK_prices = df_orderbook.loc[:, df_orderbook.columns.str.contains("ASKp")]
303 | BID_prices = df_orderbook.loc[:, df_orderbook.columns.str.contains("BIDp")]
304 | ASK_sizes = df_orderbook.loc[:, df_orderbook.columns.str.contains("ASKs")]
305 | BID_sizes = df_orderbook.loc[:, df_orderbook.columns.str.contains("BIDs")]
306 |
307 | ASK_price_changes = ASK_prices.diff().dropna().to_numpy()
308 | BID_price_changes = BID_prices.diff().dropna().to_numpy()
309 | ASK_size_changes = ASK_sizes.diff().dropna().to_numpy()
310 | BID_size_changes = BID_sizes.diff().dropna().to_numpy()
311 |
312 | ASK_sizes = ASK_sizes.to_numpy()
313 | BID_sizes = BID_sizes.to_numpy()
314 |
315 | ASK_OF = (
316 | (ASK_price_changes > 0.0) * (-ASK_sizes[:-1, :])
317 | + (ASK_price_changes == 0.0) * ASK_size_changes
318 | + (ASK_price_changes < 0) * ASK_sizes[1:, :]
319 | )
320 | BID_OF = (
321 | (BID_price_changes < 0.0) * (-BID_sizes[:-1, :])
322 | + (BID_price_changes == 0.0) * BID_size_changes
323 | + (BID_price_changes > 0) * BID_sizes[1:, :]
324 | )
325 |
326 | # Remove all price-volume features and add in orderflow.
327 | df_orderbook = df_orderbook.drop(feature_names, axis=1).iloc[1:, :]
328 | mid_seconds_columns = list(df_orderbook.columns)
329 | feature_names_raw = ["ASK_OF", "BID_OF"]
330 | feature_names = []
331 | for feature_name in feature_names_raw:
332 | for i in range(1, levels + 1):
333 | feature_names += [feature_name + str(i)]
334 | df_orderbook[feature_names] = np.concatenate([ASK_OF, BID_OF], axis=1)
335 |
336 | # Re-order columns.
337 | feature_names_reordered = [[]] * len(feature_names)
338 | feature_names_reordered[::2] = feature_names[:levels]
339 | feature_names_reordered[1::2] = feature_names[levels:]
340 | feature_names = feature_names_reordered
341 |
342 | df_orderbook = df_orderbook[mid_seconds_columns + feature_names]
343 | else:
344 | raise ValueError("Features must be 'orderbooks' or 'orderflows'.")
345 |
346 | # Dynamic z-score normalization.
347 | orderbook_mean_df = pd.DataFrame(
348 | df_orderbook[feature_names].mean().values.reshape(-1, len(feature_names)),
349 | columns=feature_names,
350 | )
351 | orderbook_mean2_df = pd.DataFrame(
352 | (df_orderbook[feature_names] ** 2)
353 | .mean()
354 | .values.reshape(-1, len(feature_names)),
355 | columns=feature_names,
356 | )
357 | orderbook_nsamples_df = pd.DataFrame(
358 | np.array([[len(df_orderbook)]] * len(feature_names)).T,
359 | columns=feature_names,
360 | )
361 |
362 | if len(mean_df) < normalization_window:
363 | logs.append(
364 | f"{orderbook_name} skipped. Initializing rolling z-score normalization."
365 | )
366 | # Don't save the first days as we don't have enough days to normalize.
367 | mean_df = pd.concat([mean_df, orderbook_mean_df], ignore_index=True)
368 | mean2_df = pd.concat([mean2_df, orderbook_mean2_df], ignore_index=True)
369 | nsamples_df = pd.concat(
370 | [nsamples_df, orderbook_nsamples_df], ignore_index=True
371 | )
372 | continue
373 | else:
374 | z_mean_df = pd.DataFrame(
375 | (nsamples_df * mean_df).sum(axis=0) / nsamples_df.sum(axis=0)
376 | ).T # Dynamically compute mean.
377 | z_stdev_df = pd.DataFrame(
378 | np.sqrt(
379 | (nsamples_df * mean2_df).sum(axis=0) / nsamples_df.sum(axis=0)
380 | - z_mean_df ** 2
381 | )
382 | ) # Dynamically compute standard deviation.
383 |
384 | # Broadcast to df_orderbook size.
385 | z_mean_df = z_mean_df.loc[z_mean_df.index.repeat(len(df_orderbook))]
386 | z_stdev_df = z_stdev_df.loc[z_stdev_df.index.repeat(len(df_orderbook))]
387 | z_mean_df.index = df_orderbook.index
388 | z_stdev_df.index = df_orderbook.index
389 | if scaling is True:
390 | df_orderbook[feature_names] = (df_orderbook[feature_names] - z_mean_df) / z_stdev_df # Apply normalization.
391 |
392 | # Roll forward by dropping first rows and adding most recent mean and mean2.
393 | mean_df = mean_df.iloc[1:, :]
394 | mean2_df = mean2_df.iloc[1:, :]
395 | nsamples_df = nsamples_df.iloc[1:, :]
396 |
397 | mean_df = pd.concat([mean_df, orderbook_mean_df], ignore_index=True)
398 | mean2_df = pd.concat([mean2_df, orderbook_mean2_df], ignore_index=True)
399 | nsamples_df = pd.concat(
400 | [nsamples_df, orderbook_nsamples_df], ignore_index=True
401 | )
402 |
403 | # Create labels with simple delta prices.
404 | rolling_mid = df_orderbook["mid_price"]
405 | rolling_mid = rolling_mid.to_numpy().flatten()
406 | for h in horizons:
407 | delta_ticks = rolling_mid[h:] - df_orderbook["mid_price"][:-h]
408 | df_orderbook[f"Raw_Target_{str(h)}"] = delta_ticks
409 |
410 | # Create labels applying smoothing.
411 | for h in horizons:
412 | rolling_mid_minus = df_orderbook['mid_price'].rolling(window=h, min_periods=h).mean().shift(h)
413 | rolling_mid_plus = df_orderbook["mid_price"].rolling(window=h, min_periods=h).mean().to_numpy().flatten()
414 | smooth_pct_change = rolling_mid_plus - rolling_mid_minus
415 | df_orderbook[f"Smooth_Target_{str(h)}"] = smooth_pct_change
416 |
417 | # Drop the mid-price column and transform seconds column into a readable format.
418 | df_orderbook = df_orderbook.drop(["mid_price"], axis=1)
419 | pattern = r"\d{4}-\d{2}-\d{2}"
420 | match = re.search(pattern, orderbook_name)
421 | date_temp = match.group()
422 | df_orderbook.seconds = df_orderbook.apply(
423 | lambda row: get_datetime_from_seconds(row["seconds"], date_temp), axis=1
424 | )
425 |
426 | # Drop elements which cannot be used for training.
427 | df_orderbook = df_orderbook.dropna()
428 | df_orderbook.drop_duplicates(inplace=True, keep='last', subset='seconds')
429 |
430 | # Save processed files.
431 | output_name = f"{output_path}/{ticker}_{features}_{str(date.date())}"
432 | df_orderbook.to_csv(f"{output_name}.csv", header=True, index=False)
433 |
434 | logs.append(f"{orderbook_name} completed.")
435 |
436 | print(f"Data preprocessing loop finished. SCALING: {str(scaling)}.")
437 |
438 | with open(f"{logs_path}/{features}_processing_logs.txt", "w") as f:
439 | for log in logs:
440 | f.write(log + "\n")
441 |
442 | print("Please check processing logs.")
443 |
444 | df_statistics.to_csv(
445 | f"{logs_path}/{features}_statistics.csv", header=True, index=False
446 | ) # Save statistics.
447 |
448 |
449 | def get_datetime_from_seconds(seconds_after_midnight, date_str):
450 | # Convert the date_str to a datetime.date object.
451 | dt_date = datetime.strptime(date_str, "%Y-%m-%d").date()
452 |
453 | # Calculate the time component from seconds_after_midnight.
454 | hours = int(seconds_after_midnight // 3600)
455 | minutes = int((seconds_after_midnight % 3600) // 60)
456 | seconds = int(seconds_after_midnight % 60)
457 | microseconds = int(
458 | (seconds_after_midnight % 1) * 1e6
459 | ) # Convert decimal part to microseconds.
460 |
461 | # Create a datetime.time object for the time component.
462 | dt_time = time(hour=hours, minute=minutes, second=seconds, microsecond=microseconds)
463 |
464 | # Combine the date and time to create the datetime.datetime object.
465 | dt_datetime = datetime.combine(dt_date, dt_time)
466 |
467 | return dt_datetime
468 |
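# Minimal sketch (illustrative only) of the pooled statistics used for the rolling z-score
# normalization above: the mean and standard deviation applied to the current day are obtained
# from the per-day means, squared means and sample counts of the previous `normalization_window`
# days. The toy numbers below are arbitrary.
def _demo_rolling_zscore_stats() -> tuple:
    mean_df = pd.DataFrame({"ASKp1": [100.0, 102.0, 101.0]})  # Daily means.
    mean2_df = pd.DataFrame({"ASKp1": [10050.0, 10450.0, 10240.0]})  # Daily means of the squares.
    nsamples_df = pd.DataFrame({"ASKp1": [1000, 1200, 1100]})  # Daily sample counts.
    z_mean = (nsamples_df * mean_df).sum(axis=0) / nsamples_df.sum(axis=0)
    z_stdev = np.sqrt((nsamples_df * mean2_df).sum(axis=0) / nsamples_df.sum(axis=0) - z_mean ** 2)
    return z_mean, z_stdev  # Pandas Series, one entry per feature.
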
--------------------------------------------------------------------------------
/data_processing/data_process_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from data_processing import data_process
4 |
5 |
6 | class DataUtils:
7 | def __init__(self, ticker, dataset, experiment_id, horizons, normalization_window):
8 | self.ticker = ticker # Ticker of the stock to be processed.
9 | self.dataset = dataset # Dataset to be used.
10 | self.experiment_id = experiment_id # Experiment ID.
11 | self.horizons = horizons # Horizons to be used when computing labels.
12 | self.normalization_window = normalization_window # Normalization window to be used when normalizing data.
13 |
14 | self.__raw_data_path = None # Path containing the raw LOB data.
15 | self.__processed_data_path_unscaled_data = (
16 | None # Path containing the processed but unscaled LOB data.
17 | )
18 | self.__processed_data_path_scaled_data = (
19 | None # Path containing the processed and scaled LOB data.
20 | )
21 | self.__logs_path = None # Path containing the logs of the data processing.
22 |
23 | def __set_raw_data_path(self):
24 | # Set the raw data path according to the dataset.
25 | if self.dataset == "nasdaq":
26 | self.__raw_data_path = f"./data/{self.dataset}/raw/{self.ticker}"
27 |
28 | def __set_processed_data_path_unscaled_data(self):
29 | # Set the path containing the processed but unscaled LOB data according to the dataset.
30 | if self.dataset == "nasdaq":
31 | self.__processed_data_path_unscaled_data = (
32 | f"./data/{self.dataset}/unscaled_data/{self.ticker}"
33 | )
34 |
35 | def __set_processed_data_path_scaled_data(self):
36 | # Set the path containing the processed and scaled LOB data according to the dataset.
37 | if self.dataset == "nasdaq":
38 | self.__processed_data_path_scaled_data = (
39 | f"./data/{self.dataset}/scaled_data/{self.ticker}"
40 | )
41 |
42 | def __set_logs_path(self):
43 | # Set the path containing the logs of the data processing according to the experiment ID.
44 | self.__logs_path = (
45 | f"./loggers/results/{self.experiment_id}/data_processing_logs"
46 | )
47 |
48 | def generate_data_folders(self):
49 | self.__set_raw_data_path() # Set the raw data path.
50 | self.__set_processed_data_path_unscaled_data() # Set the path containing the processed but unscaled LOB data.
51 | self.__set_processed_data_path_scaled_data() # Set the path containing the processed and scaled LOB data.
52 | self.__set_logs_path() # Set the path containing the logs of the data processing.
53 |
54 | # Create the folder for the processed but unscaled LOB data if it does not exist.
55 | if not os.path.exists(self.__processed_data_path_unscaled_data):
56 | os.makedirs(self.__processed_data_path_unscaled_data)
57 |
58 | # Create the folder for the processed and scaled LOB data if it does not exist.
59 | if not os.path.exists(self.__processed_data_path_scaled_data):
60 | os.makedirs(self.__processed_data_path_scaled_data)
61 |
62 | # Create the folder for the logs of the data processing if it does not exist.
63 | if not os.path.exists(self.__logs_path):
64 | os.makedirs(self.__logs_path)
65 |
66 | # Process the data to obtain scaled and unscaled data.
67 | def process_data(self):
68 | data_process.process_data(
69 | ticker=self.ticker,
70 | input_path=self.__raw_data_path,
71 | output_path=self.__processed_data_path_unscaled_data,
72 | logs_path=self.__logs_path,
73 | horizons=self.horizons,
74 | normalization_window=self.normalization_window,
75 | scaling=False,
76 | )
77 |
78 | data_process.process_data(
79 | ticker=self.ticker,
80 | input_path=self.__raw_data_path,
81 | output_path=self.__processed_data_path_scaled_data,
82 | logs_path=self.__logs_path,
83 | horizons=self.horizons,
84 | normalization_window=self.normalization_window,
85 | scaling=True,
86 | )
87 |
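# Minimal usage sketch (illustrative only): the ticker, experiment ID, horizons and
# normalization window below are placeholders, and raw LOBSTER files are expected under
# ./data/nasdaq/raw/CSCO for the processing step to produce any output.
if __name__ == "__main__":
    demo_utils = DataUtils(
        ticker="CSCO",
        dataset="nasdaq",
        experiment_id="demo_experiment",
        horizons=[10, 50, 100],
        normalization_window=5,
    )
    demo_utils.generate_data_folders()  # Create the unscaled/scaled data and logs folders.
    demo_utils.process_data()  # Run the LOBSTER pre-processing twice (unscaled, then scaled).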
--------------------------------------------------------------------------------
/loaders/custom_dataset.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import random
3 |
4 | import numpy as np
5 | import pandas as pd
6 | import torch
7 | from torch.utils.data import Dataset, DataLoader
8 | import polars as pl
9 | import tqdm
10 | import matplotlib.pyplot as plt
11 |
12 | from utils import detect_changing_points
13 |
14 |
15 | class CustomDataset(Dataset):
16 | def __init__(
17 | self,
18 | dataset,
19 | learning_stage,
20 | window_size,
21 | shuffling_seed,
22 | cache_size,
23 | lighten,
24 | threshold,
25 | all_horizons,
26 | prediction_horizon,
27 | targets_type,
28 | balanced_dataloader=False,
29 | backtest=False,
30 | training_stocks=None,
31 | validation_stocks=None,
32 | target_stocks=None
33 | ):
34 | self.learning_stage = learning_stage # The current learning stage (training, validation or testing).
35 | self.shuffling_seed = (
36 | shuffling_seed # The seed for the random shuffling of the datasets.
37 | )
38 | self.balanced_dataloader = balanced_dataloader # Whether to use a balanced dataloader or not. This option is available only for training.
39 | self.backtest = backtest
40 | self.targets_type = targets_type
41 |
42 | if self.learning_stage == "training":
43 | file_patterns = [f"./data/{dataset}/scaled_data/{self.learning_stage}/{element}_orderbooks*.csv" for element in training_stocks]
44 | self.csv_files = []
45 | for pattern in file_patterns:
46 | self.csv_files.extend(glob.glob(pattern.format(dataset=dataset, self=self)))
47 |
48 | random.seed(self.shuffling_seed)
49 | random.shuffle(self.csv_files)
50 | else:
51 | # During the validation and testing stages it is fundamental to read the datasets in chronological order.
52 | if self.learning_stage == 'validation':
53 | file_patterns = [f"./data/{dataset}/scaled_data/{self.learning_stage}/{element}_orderbooks*.csv" for element in validation_stocks]
54 | else:
55 | file_patterns = [f"./data/{dataset}/scaled_data/{self.learning_stage}/{element}_orderbooks*.csv" for element in target_stocks]
56 |
57 | self.csv_files = []
58 | for pattern in file_patterns:
59 | self.csv_files.extend(glob.glob(pattern.format(dataset=dataset, self=self)))
60 | self.csv_files = sorted(self.csv_files)
61 |
62 | self.window_size = window_size # The number of time steps in each window.
63 | self.lighten = lighten # Whether to use the light version of the dataset.
64 | self.threshold = threshold # The threshold for the classification task.
65 | self.prediction_horizon = (
66 | prediction_horizon # The prediction horizon for the classification task.
67 | )
68 | self.all_horizons = (
69 | all_horizons # List of all the possible prediction horizons.
70 | )
71 |
72 | self.cumulative_lengths = [0] # Store cumulative lengths of datasets.
73 | self.cache_size = cache_size # The number of datasets to cache in memory.
74 | self.cache_data = [
75 | None
76 | ] * self.cache_size # Initialize a cache with empty slots.
77 | self.cache_indices = [
78 | None
79 | ] * self.cache_size # Initialize the indices for the cache.
80 | self.current_cache_index = -1
81 |
82 | self.glob_indices = []
83 |
84 | if self.balanced_dataloader:
85 | print(f"BALANCED dataset construction...")
86 | else:
87 | print(f"UNBALANCED dataset construction...")
88 | for csv_file in tqdm.tqdm(self.csv_files):
89 | df = pd.read_csv(csv_file)
90 | self.cumulative_lengths.append(
91 | self.cumulative_lengths[-1] + len(df) - window_size
92 | ) # Store the lengths of all datasets.
93 |
94 | # If requested by the user, during the training stage, we create balanced dataloaders.
95 | if self.learning_stage == "training" and self.balanced_dataloader is True:
96 | temp_labels = (
97 | []
98 | ) # This is a temporary variable which stores the (discretized) labels (i.e. classes) for each sample in each input dataset.
99 |
100 | if self.targets_type == "raw":
101 | labels = df.iloc[:-window_size, :][
102 | f"Raw_Target_{self.prediction_horizon}"
103 | ] # Extract the raw, continuous labels (i.e. returns) from the current dataset.
104 | else:
105 | labels = df.iloc[:-window_size, :][
106 | f"Smooth_Target_{self.prediction_horizon}"
107 | ] # Extract the smoothed, continuous labels (i.e. returns) from the current dataset.
108 |
109 | # For each file, we must know the corresponding index. This is the reason why we access the cumulative lengths list.
110 | for label, index in zip(
111 | labels,
112 | range(self.cumulative_lengths[-2], self.cumulative_lengths[-1]),
113 | ):
114 | # The discretization is performed using the provided threshold. Temporary labels are tuples of the form (class, index).
115 | if label > self.threshold:
116 | temp_labels.append((2, index))
117 | elif label < -self.threshold:
118 | temp_labels.append((0, index))
119 | else:
120 | temp_labels.append((1, index))
121 |
122 | # Group data by class representatives
123 | class_groups = {}
124 | for item in temp_labels:
125 | (
126 | class_representative,
127 | index,
128 | ) = item # Unpack the tuple (class, index).
129 |
130 | # Understand what is the un-cumulative index of each sample.
131 | corresponding_cumulative_length = detect_changing_points(
132 | index, self.cumulative_lengths
133 | )
134 | if corresponding_cumulative_length is not None:
135 | # If the current sample does not belong to the first dataset, we must subtract the cumulative length of the previous dataset.
136 | temp_index = index - corresponding_cumulative_length
137 | else:
138 | # If the current sample belongs to the first dataset, we do not need to subtract anything.
139 | temp_index = index
140 |
141 | # Even with a balanced dataloader, labels would end up misaligned when the models' inputs are built.
142 | # Indeed, given an index 'i', the input rows are the ones from 'i' to 'i + window_size' and the label to be used is the one at 'i + window_size'.
143 | # Therefore, we must subtract the window size from the index of each sample.
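# Worked example (illustrative figures): with window_size = 100, the label stored at
# global index 300 is consumed by the input window starting at index 300 - 100 = 200,
# so 200 is the index recorded for the sampler.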
144 | if temp_index >= self.window_size:
145 | if class_representative in class_groups:
146 | class_groups[class_representative].append(
147 | index - self.window_size
148 | )
149 | else:
150 | class_groups[class_representative] = [
151 | index - self.window_size
152 | ]
153 | else:
154 | pass
155 |
156 | # Determine the desired number of samples per class (pseudo-balanced): the size of the least represented class, capped at 5000 below.
157 | min_samples_class = min(
158 | len(indices) for indices in class_groups.values()
159 | )
160 | if min_samples_class > 5000:
161 | min_samples_class = 5000
162 |
163 | # We randomly select indices from each class to create the subsample.
164 | subsample_indices = []
165 | for class_representative, indices in class_groups.items():
166 | random.seed(self.shuffling_seed)
167 | subsample_indices.extend(random.sample(indices, min_samples_class))
168 |
169 | # We store the chosen indices in the 'global_indices_list'.
170 | random.seed(self.shuffling_seed)
171 | random.shuffle(subsample_indices)
172 | self.glob_indices.extend(subsample_indices)
173 |
174 | # If the user does not request balancing, during the training stage we randomly subsample 10% of each class from every input dataset.
175 | if self.learning_stage == "training" and self.balanced_dataloader is False:
176 | temp_labels = (
177 | []
178 | ) # This is a temporary variable which stores the (discretized) labels (i.e. classes) for each sample in each input dataset.
179 |
180 | if self.targets_type == "raw":
181 | labels = df.iloc[:-window_size, :][
182 | f"Raw_Target_{self.prediction_horizon}"
183 | ] # Extract the raw, continuous labels (i.e. returns) from the current dataset.
184 | else:
185 | labels = df.iloc[:-window_size, :][
186 | f"Smooth_Target_{self.prediction_horizon}"
187 | ] # Extract the smoothed, continuous labels (i.e. smoothed returns) from the current dataset.
188 |
189 | # For each file, we must know the corresponding index. This is the reason why we access the cumulative lengths list.
190 | for label, index in zip(
191 | labels,
192 | range(self.cumulative_lengths[-2], self.cumulative_lengths[-1]),
193 | ):
194 | # The discretization is performed using the provided threshold. Temporary labels are tuples of the form (class, index).
195 | if label > self.threshold:
196 | temp_labels.append((2, index))
197 | elif label < -self.threshold:
198 | temp_labels.append((0, index))
199 | else:
200 | temp_labels.append((1, index))
201 |
202 | # Group data by class representatives
203 | class_groups = {}
204 | for item in temp_labels:
205 | (
206 | class_representative,
207 | index,
208 | ) = item # Unpack the tuple (class, index).
209 |
210 | # Determine the un-cumulative (per-file) index of each sample.
211 | corresponding_cumulative_length = detect_changing_points(
212 | index, self.cumulative_lengths
213 | )
214 | if corresponding_cumulative_length is not None:
215 | # If the current sample does not belong to the first dataset, we must subtract the cumulative length of the previous dataset.
216 | temp_index = index - corresponding_cumulative_length
217 | else:
218 | # If the current sample belongs to the first dataset, we do not need to subtract anything.
219 | temp_index = index
220 |
221 | # Even without a balanced dataloader, labels would become misaligned when building the models' inputs.
222 | # Indeed, given an index 'i', the input rows are those from 'i' to 'i + window_size' and the label used is the one at 'i + window_size'.
223 | # Therefore, we subtract the window size from the index of each sample.
224 | if temp_index >= self.window_size:
225 | if class_representative in class_groups:
226 | class_groups[class_representative].append(
227 | index - self.window_size
228 | )
229 | else:
230 | class_groups[class_representative] = [
231 | index - self.window_size
232 | ]
233 | else:
234 | pass
235 |
236 | # We randomly select indices from each class to create the subsample.
237 | subsample_indices = []
238 | for class_representative, indices in class_groups.items():
239 | random.seed(self.shuffling_seed)
240 | subsample_indices.extend(random.sample(indices, int(len(indices) * 0.1)))
241 |
242 | # We store the chosen indices in the 'global_indices_list'.
243 | random.seed(self.shuffling_seed)
244 | random.shuffle(subsample_indices)
245 | self.glob_indices.extend(subsample_indices)
246 |
247 | def __len__(self):
248 | # This is the cumulative length of all input datasets.
249 | return self.cumulative_lengths[-1]
250 |
251 | def cache_dataset(self, dataset_index):
252 | if self.current_cache_index >= 0:
253 | # Evict the currently cached dataset before loading a new one.
254 | self.cache_data[self.current_cache_index] = None
255 | self.cache_indices[self.current_cache_index] = None
256 |
257 | # Select a random cache slot for the new dataset
258 | self.current_cache_index = random.randint(0, self.cache_size - 1)
259 |
260 | # Cache the data from the CSV file
261 | df = pl.read_csv(self.csv_files[dataset_index]).to_pandas()
262 |
263 | self.cache_data[self.current_cache_index] = df.values[:, 1:].astype(np.float32)
264 | self.cache_indices[self.current_cache_index] = dataset_index
265 |
266 | def __getitem__(self, index):
267 | try:
268 | dataset_index = 0
269 | while index >= self.cumulative_lengths[dataset_index + 1]:
270 | dataset_index += 1
271 |
272 | if self.cache_indices[self.current_cache_index] != dataset_index:
273 | # Cache the dataset if it's not already cached.
274 | self.cache_dataset(dataset_index)
275 |
276 | # Retrieve the un-cumulative index of the current sample.
277 | start_index = (
278 | index
279 | if dataset_index == 0
280 | else index - self.cumulative_lengths[dataset_index]
281 | )
282 |
283 | if self.lighten:
284 | # If the "lighten" option is enabled, we use only the first 5 levels of the orderbook (i.e. 4_level_features * 5_levels = 20_orderbook_features).
285 | window_data = self.cache_data[self.current_cache_index][
286 | start_index: start_index + self.window_size, :20
287 | ]
288 | else:
289 | # If the "lighten" option is not enabled, we use all the 10 levels of the orderbook (i.e. 4_level_features * 10_levels = 40_orderbook_features).
290 | window_data = self.cache_data[self.current_cache_index][
291 | start_index: start_index + self.window_size, :40
292 | ]
293 |
294 | # Determine the position of the prediction horizon in the list of all horizons.
295 | position = next(
296 | (
297 | horizon_index
298 | for horizon_index, value in enumerate(self.all_horizons)
299 | if value == self.prediction_horizon
300 | ),
301 | None,
302 | )
303 | # Extract the label from the dataset given its position.
304 | label = self.cache_data[self.current_cache_index][
305 | start_index + self.window_size, 40:
306 | ][position]
307 | # Discretize the label using the provided threshold.
308 | if self.backtest is False:
309 | if label > self.threshold:
310 | label = 2
311 | elif label < -self.threshold:
312 | label = 0
313 | else:
314 | label = 1
315 |
316 | return torch.tensor(window_data).unsqueeze(0), torch.tensor(label)
317 | except Exception as e:
318 | print(f"Exception in DataLoader worker: {e}")
319 | raise e
320 |
321 |
322 | '''
323 | if __name__ == "__main__":
324 | # Create dataset and DataLoader with random shuffling
325 | dataset = CustomDataset(
326 | dataset="nasdaq",
327 | learning_stage="training",
328 | window_size=100,
329 | shuffling_seed=42,
330 | cache_size=1,
331 | lighten=True,
332 | threshold=32,
333 | targets_type="raw",
334 | all_horizons=[5, 10, 30, 50, 100],
335 | prediction_horizon=100,
336 | balanced_dataloader=False,
337 | training_stocks=["CHTR"],
338 | validation_stocks=["CHTR"],
339 | target_stocks=["CHTR"]
340 | )
341 |
342 | dataloader = DataLoader(
343 | dataset, batch_size=32, shuffle=False, num_workers=8, drop_last=True, sampler=dataset.glob_indices
344 | )
345 |
346 | print(len(dataloader))
347 |
348 | complete_list = []
349 | # Example usage of the DataLoader
350 | for batch_data, batch_labels in dataloader:
351 | # Train your model using batch_data and batch_labels
352 | # print(batch_labels.tolist())
353 | complete_list.extend(batch_labels.tolist())
354 | #print(batch_data.shape, batch_labels.shape)
355 |
356 | plt.hist(complete_list)
357 | plt.show()
358 | '''
--------------------------------------------------------------------------------
/loggers/analysis.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import yaml
4 |
5 | import pandas as pd
6 | import numpy as np
7 | from sklearn.metrics import *
8 | from scipy.stats import skew, kurtosis
9 | from tqdm import tqdm
10 |
11 | import matplotlib.pyplot as plt
12 | from matplotlib import rcParams, cycler
13 | from matplotlib.ticker import AutoMinorLocator
14 |
15 |
16 | import warnings
17 | import multiprocessing
18 | import pickle
19 | import gzip
20 |
21 | def find_substrings_in_string(string_list, main_string):
22 | return [s for s in string_list if s in main_string]
23 |
24 | def calculate_log_returns(series, step=50):
25 | return np.log(series / series.shift(step)).dropna().reset_index(drop=True)
26 |
27 | def optimized_rolling_diff(series, window_size):
28 | return series.rolling(window=window_size).apply(lambda x: x.iloc[-1] - x.iloc[0], raw=False).shift(-(window_size - 1))
29 |
30 | def process_file(f):
31 | df = pd.read_csv(f)
32 |
33 | best_ask_price = df.ASKp1 / 10000
34 | best_bid_price = df.BIDp1 / 10000
35 | local_mids = (best_ask_price + best_bid_price) / 2
36 | local_spreads = best_ask_price - best_bid_price
37 | volatility_10 = np.std(calculate_log_returns(local_mids, 10))
38 | volatility_50 = np.std(calculate_log_returns(local_mids, 50))
39 | volatility_100 = np.std(calculate_log_returns(local_mids, 100))
40 | levels_ask_side = ((df.ASKp10 / 10000 - df.ASKp1 / 10000) / 0.01).tolist()
41 | levels_bid_side = ((df.BIDp1 / 10000 - df.BIDp10 / 10000) / 0.01).tolist()
42 | df['seconds'] = pd.to_datetime(df['seconds'])
43 | secs = df['seconds'].astype('int64') / 10**9  # Convert nanoseconds since epoch to seconds.
44 |
45 | seconds_in_horizon_10 = optimized_rolling_diff(secs, 10).dropna().tolist()
46 | seconds_in_horizon_50 = optimized_rolling_diff(secs, 50).dropna().tolist()
47 | seconds_in_horizon_100 = optimized_rolling_diff(secs, 100).dropna().tolist()
48 |
49 | print(f"Finished {f}.")
50 | return {
51 | 'Mids': local_mids.tolist(),
52 | 'Spreads': local_spreads.tolist(),
53 | 'Best_Ask_Volume': df.ASKs1.tolist(),
54 | 'Best_Bid_Volume': df.BIDs1.tolist(),
55 | 'Volatility_10': [volatility_10],
56 | 'Volatility_50': [volatility_50],
57 | 'Volatility_100': [volatility_100],
58 | 'Levels_Ask_Side': levels_ask_side,
59 | 'Levels_Bid_Side': levels_bid_side,
60 | 'Seconds_Horizon_10': seconds_in_horizon_10,
61 | 'Seconds_Horizon_50': seconds_in_horizon_50,
62 | 'Seconds_Horizon_100': seconds_in_horizon_100
63 | }
64 |
65 | def process_stock_files(file_list):
66 | stock_data = {
67 | 'Mids': [], 'Spreads': [], 'Best_Ask_Volume': [], 'Best_Bid_Volume': [],
68 | 'Volatility_10': [], 'Volatility_50': [], 'Volatility_100': [],
69 | 'Levels_Ask_Side': [], 'Levels_Bid_Side': [], 'Seconds_Horizon_10': [],
70 | 'Seconds_Horizon_50': [], 'Seconds_Horizon_100': []
71 | }
72 | for f in file_list:
73 | file_data = process_file(f)
74 | for key in stock_data:
75 | stock_data[key].extend(file_data[key])
76 | return stock_data
77 |
78 | def process_stock(s):
79 | files = sorted(glob.glob(f"../data/nasdaq/unscaled_data/{s}/*"))
80 | num_workers = 10
81 |
82 | # Splitting files into chunks for each process
83 | file_chunks = np.array_split(files, num_workers)
84 |
85 | with multiprocessing.Pool(num_workers) as pool:
86 | chunk_results = pool.map(process_stock_files, file_chunks)
87 |
88 | # Aggregating results from all chunks
89 | stock_data = {
90 | 'Mids': [], 'Spreads': [], 'Best_Ask_Volume': [], 'Best_Bid_Volume': [],
91 | 'Volatility_10': [], 'Volatility_50': [], 'Volatility_100': [],
92 | 'Levels_Ask_Side': [], 'Levels_Bid_Side': [], 'Seconds_Horizon_10': [],
93 | 'Seconds_Horizon_50': [], 'Seconds_Horizon_100': []
94 | }
95 | for chunk in chunk_results:
96 | for key in stock_data:
97 | stock_data[key].extend(chunk[key])
98 |
99 | return s, stock_data
100 |
101 | if __name__ == "__main__":
102 | stocks = ["BAC", "CHTR", "CSCO", "GOOG", "GS", "IBM", "MCD", "NVDA", "ORCL", "PFE", "PM", "VZ"] #"ABBV", "KO", "AAPL",
103 |
104 | for s in stocks:
105 | stock_dictionary = {}
106 | try:
107 | s, stock_data = process_stock(s)
108 | stock_dictionary[s] = stock_data
109 | print(f"Completed processing for stock: {s}")
110 | except Exception as e:
111 | print(f"Error processing stock: {s} with error {e}")
112 |
113 | with open(f'../statistical_analysis/{s}.pkl', 'wb') as f:
114 | pickle.dump(stock_dictionary, f, protocol=-1)
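115 |
116 | # NOTE: minimal sketch of how the dumped statistics could be inspected later (editor's illustration, not part of
117 | # the original script); the ticker and the chosen key are only examples.
118 | '''
119 | with open('../statistical_analysis/CHTR.pkl', 'rb') as f:
120 |     stock_dictionary = pickle.load(f)
121 | spreads = np.array(stock_dictionary['CHTR']['Spreads'])
122 | print(f"CHTR average quoted spread: {spreads.mean():.4f}")
123 | '''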
--------------------------------------------------------------------------------
/loggers/logger.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from datetime import datetime
4 |
5 | import yaml
6 | import random
7 | import string
8 |
9 |
10 | def generate_id(name, target_stock):
11 | """
12 | Generate a unique experiment identifier based on the input `name` and the current timestamp in the format "YYYY-MM-DD_HH_MM_SS".
13 | Create a directory path using this identifier within the 'loggers/results' directory relative to the script's location, and if
14 | it doesn't exist, create it.
15 |
16 | :param name: name of the DL model to be used in the experiment, (str).
17 | :return: experiment_id: unique experiment identifier, (str).
18 | """
19 | random_string_part = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(7))
20 | init_time = datetime.now().strftime("%Y-%m-%d_%H_%M_%S")
21 | experiment_id = f"{target_stock}_{name}_{init_time}_{random_string_part}"
22 |
23 | root_path = sys.path[0]
24 | dir_path = f"{root_path}/loggers/results/{experiment_id}"
25 | if not os.path.exists(dir_path):
26 | os.makedirs(dir_path)
27 |
28 | return experiment_id
29 |
30 |
31 | def find_save_path(model_id):
32 | """
33 | Find the directory path for saving results associated with a given `model_id`. This function constructs a directory path within the
34 | 'loggers/results' directory relative to the script's location.
35 |
36 | :param model_id: model identifier, (str).
37 | :return: directory path, (str).
38 | """
39 | root_path = sys.path[0]
40 | dir_path = f"{root_path}/loggers/results/{model_id}"
41 | return dir_path
42 |
43 |
44 | def logger(experiment_id, header, contents):
45 | """
46 | Log experimental results in a YAML file associated with the given `experiment_id`. If the file already exists, new data is appended to it;
47 | otherwise, a new file is created.
48 |
49 | :param experiment_id: experiment identifier, (str).
50 | :param header: header for the data being logged, (str).
51 | :param contents: data to be logged, provided as a dictionary, (dict).
52 | """
53 | root_path = sys.path[0]
54 | file_path = f"{root_path}/loggers/results/{experiment_id}/data.yaml"
55 |
56 | contents = {header: contents}
57 |
58 | if os.path.exists(file_path):
59 | with open(file_path, "r") as yamlfile:
60 | current_yaml = yaml.safe_load(yamlfile)
61 | current_yaml.update(contents)
62 | else:
63 | current_yaml = contents
64 | with open(file_path, "w") as yamlfile:
65 | yaml.dump(current_yaml, yamlfile)
66 |
67 |
68 | def read_log(model_id, header):
69 | """
70 | Read and retrieve data from a log file associated with the given `model_id`.
71 |
72 | :param model_id: Model identifier, (str).
73 | :param header: Header of the data to retrieve from the log, (str).
74 | :return: The data associated with the specified header from the log, (any type depending on data stored).
75 | """
76 | root_path = sys.path[0]
77 | file_path = f"{root_path}/loggers/results/{model_id}/log.yaml"
78 |
79 | with open(file_path, "r") as yamlfile:
80 | log = yaml.safe_load(yamlfile)
81 |
82 | return log[header]
83 |
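84 | # NOTE: minimal usage sketch (editor's illustration, not part of the original module); the header and contents
85 | # passed to logger() are made-up examples.
86 | '''
87 | if __name__ == "__main__":
88 |     experiment_id = generate_id("deeplob", "CHTR")
89 |     logger(experiment_id, "validation_metrics", {"epoch": 1, "f1_score": 0.62})
90 |     print(find_save_path(experiment_id))
91 | '''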
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from data_processing import data_process_utils
2 | from loggers import logger
3 | from optimizers.executor import Executor
4 | from simulator import market_sim, post_trading_analysis
5 | from utils import (
6 | data_split,
7 | load_yaml,
8 | save_dataset_info,
9 | parse_args,
10 | create_hyperparameters_yaml,
11 | )
12 | from data_processing.complete_homological_utils import get_complete_homology
13 |
14 | if __name__ == "__main__":
15 | # Parse input arguments.
16 | args = parse_args()
17 | wb_error_detection = False
18 |
19 | if args.experiment_id is None:
20 | # If no experiment ID is passed, generate a new one.
21 | experiment_id = logger.generate_id(args.model, args.target_stocks)
22 | # Create a new configuration file containing the hyperparameters.
23 | create_hyperparameters_yaml(experiment_id, args)
24 | else:
25 | # If an experiment ID is passed, use it.
26 | experiment_id = args.experiment_id
27 | # Optionally, replace the hyperparameters file with the new arguments passed as input (disabled by default).
28 | # create_hyperparameters_yaml(experiment_id, args)
29 |
30 | # Load the configuration file containing the hyperparameters.
31 | hyperparameters_path = (
32 | f"{logger.find_save_path(experiment_id)}/hyperparameters.yaml"
33 | )
34 |
35 | # Load the configuration file (general hyperparameters).
36 | general_hyperparameters = load_yaml(hyperparameters_path, "general")
37 | # Load the configuration file (model's hyperparameters).
38 | model_hyperparameters = load_yaml(hyperparameters_path, "model")
39 | # Load the configuration file (trading hyperparameters).
40 | trading_hyperparameters = load_yaml(hyperparameters_path, "trading")
41 |
42 | if args.experiment_id is not None:
43 | general_hyperparameters['stages'] = args.stages.split(",")
44 |
45 | # Handle the data processing stage.
46 | if "data_processing" in general_hyperparameters["stages"]:
47 | # Make the list of training stocks a set to avoid duplicates.
48 | training_stocks = set(general_hyperparameters["training_stocks"])
49 | # Make the list of target stocks a set to avoid duplicates.
50 | target_stocks = set(general_hyperparameters["target_stocks"])
51 | # Iterate over stocks after performing the union of sets operation (a stock can occur both in training_stocks and target_stocks).
52 | for stock in list(training_stocks.union(target_stocks)):
53 | data_utils = data_process_utils.DataUtils(
54 | ticker=stock,
55 | dataset=general_hyperparameters["dataset"],
56 | experiment_id=experiment_id,
57 | horizons=general_hyperparameters["horizons"],
58 | normalization_window=general_hyperparameters["normalization_window"],
59 | )
60 | # Generate the data folders.
61 | data_utils.generate_data_folders()
62 | # Transform the data.
63 | data_utils.process_data()
64 | # Split the data into training, validation and test sets.
65 | data_split(
66 | dataset=general_hyperparameters["dataset"],
67 | training_stocks=general_hyperparameters["training_stocks"],
68 | target_stock=general_hyperparameters["target_stocks"],
69 | training_ratio=general_hyperparameters["training_ratio"],
70 | validation_ratio=general_hyperparameters["validation_ratio"],
71 | include_target_stock_in_training=general_hyperparameters[
72 | "include_target_stock_in_training"
73 | ],
74 | )
75 |
76 | # Instantiate the executor as None.
77 | executor = None
78 | # For 'torch_dataset_preparation' stage, instantiate the executor with proper arguments.
79 | if "torch_dataset_preparation" in general_hyperparameters["stages"]:
80 | executor = Executor(
81 | experiment_id, general_hyperparameters, model_hyperparameters, torch_dataset_preparation=True
82 | )
83 |
84 | if "torch_dataset_preparation_backtest" in general_hyperparameters["stages"]:
85 | executor = Executor(
86 | experiment_id, general_hyperparameters, model_hyperparameters, torch_dataset_preparation=False, torch_dataset_preparation_backtest=True
87 | )
88 |
89 | if "complete_homological_structures_preparation" in general_hyperparameters["stages"]:
90 | get_complete_homology(general_hyperparameters=general_hyperparameters, model_hyperparameters=model_hyperparameters)
91 |
92 | # For the 'training' and 'evaluation' stages, instantiate the executor with proper arguments.
93 | if (
94 | "training" in general_hyperparameters["stages"]
95 | or "evaluation" in general_hyperparameters["stages"]
96 | ):
97 | executor = Executor(
98 | experiment_id, general_hyperparameters, model_hyperparameters
99 | )
100 |
101 | if "training" in general_hyperparameters["stages"]:
102 | try:
103 | # Keep track of the files used in the training, validation and test sets.
104 | save_dataset_info(
105 | experiment_id=experiment_id,
106 | general_hyperparameters=general_hyperparameters,
107 | )
108 | # Train the model.
109 | executor.execute_training()
110 | # Clean up the experiment folder from wandb logging files.
111 | executor.logger_clean_up()
112 | except Exception as e:
113 | print(f'Exception detected during training: {e}')
114 | wb_error_detection = True
115 |
116 | if "evaluation" in general_hyperparameters["stages"] and wb_error_detection is False:
117 | # Out-of-sample test of the model.
118 | executor.execute_testing()
119 | # Clean up the experiment folder from wandb logging files.
120 | executor.logger_clean_up()
121 |
122 | if "backtest" in general_hyperparameters["stages"]:
123 | # Backtest the model.
124 | market_sim.backtest(
125 | experiment_id=experiment_id, trading_hyperparameters=trading_hyperparameters
126 | )
127 |
128 | if "post_trading_analysis" in general_hyperparameters["stages"]:
129 | # Perform a post-trading analysis.
130 | post_trading_analysis.post_trading_analysis(
131 | experiment_id=experiment_id, general_hyperparameters=general_hyperparameters, trading_hyperparameters=trading_hyperparameters, model_hyperparameters=model_hyperparameters
132 | )
133 |
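134 | # NOTE (editor's sketch): the hyperparameters.yaml loaded above is expected to expose three top-level sections,
135 | # read via load_yaml(path, "general"), load_yaml(path, "model") and load_yaml(path, "trading"). The keys below are
136 | # the ones accessed in this file; the values are purely illustrative.
137 | #
138 | # general:
139 | #   stages: [data_processing, torch_dataset_preparation, training, evaluation, backtest, post_trading_analysis]
140 | #   dataset: nasdaq
141 | #   training_stocks: [CHTR]
142 | #   target_stocks: [CHTR]
143 | #   horizons: [5, 10, 30, 50, 100]
144 | #   normalization_window: 5
145 | #   training_ratio: 0.6
146 | #   validation_ratio: 0.2
147 | #   include_target_stock_in_training: true
148 | # model:
149 | #   ...
150 | # trading:
151 | #   ...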
--------------------------------------------------------------------------------
/models/AxialLob/axiallob.py:
--------------------------------------------------------------------------------
1 | import math
2 | import pytorch_lightning as pl
3 | import torch
4 | import torch.nn as nn
5 |
6 |
7 | def _conv1d1x1(in_channels, out_channels):
8 | return nn.Sequential(nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=1, bias=False),
9 | nn.BatchNorm1d(out_channels))
10 |
11 |
12 | class GatedAxialAttention(pl.LightningModule):
13 | def __init__(self, in_channels, out_channels, heads, dim, flag):
14 | assert (in_channels % heads == 0) and (out_channels % heads == 0)
15 | super().__init__()
16 |
17 | self.in_channels = in_channels
18 | self.out_channels = out_channels
19 | self.heads = heads
20 | self.dim_head_v = out_channels // heads
21 | self.flag = flag # If flag is True, attention is computed along the width dimension; otherwise along the height.
22 | self.dim = dim
23 | self.dim_head_qk = self.dim_head_v // 2
24 | self.qkv_channels = self.dim_head_v + self.dim_head_qk * 2
25 |
26 | # Multi-head self attention
27 | self.to_qkv = _conv1d1x1(in_channels, self.heads * self.qkv_channels)
28 | self.bn_qkv = nn.BatchNorm1d(self.heads * self.qkv_channels)
29 | self.bn_similarity = nn.BatchNorm2d(heads * 3)
30 | self.bn_output = nn.BatchNorm1d(self.heads * self.qkv_channels)
31 |
32 | # Gating mechanism
33 | self.f_qr = nn.Parameter(torch.tensor(0.1), requires_grad=True)
34 | self.f_kr = nn.Parameter(torch.tensor(0.1), requires_grad=True)
35 | self.f_sve = nn.Parameter(torch.tensor(0.1), requires_grad=True)
36 | self.f_sv = nn.Parameter(torch.tensor(0.5), requires_grad=True)
37 |
38 | # Position embedding
39 | self.relative = nn.Parameter(torch.randn(self.dim_head_v * 2, dim * 2 - 1), requires_grad=True)
40 | query_index = torch.arange(dim).unsqueeze(0)
41 | key_index = torch.arange(dim).unsqueeze(1)
42 | relative_index = key_index - query_index + dim - 1
43 | self.register_buffer('flatten_index', relative_index.view(-1))
44 |
45 | self.reset_parameters()
46 | # self.print_para()
47 |
48 | def forward(self, x):
49 |
50 | if self.flag:
51 | x = x.permute(0, 2, 1, 3)
52 | else:
53 | x = x.permute(0, 3, 1, 2) # n_instances, W, C, H
54 | N, W, C, H = x.shape
55 | x = x.contiguous().view(N * W, C, H)
56 |
57 | # Transformations
58 | x = self.to_qkv(x)
59 |
60 | qkv = self.bn_qkv(x)
61 | q, k, v = torch.split(qkv.reshape(N * W, self.heads, self.dim_head_v * 2, H),
62 | [self.dim_head_v // 2, self.dim_head_v // 2, self.dim_head_v], dim=2)
63 |
64 | # Calculate position embedding
65 | all_embeddings = torch.index_select(self.relative, 1, self.flatten_index).view(self.dim_head_v * 2, self.dim,
66 | self.dim)
67 | q_embedding, k_embedding, v_embedding = torch.split(all_embeddings,
68 | [self.dim_head_qk, self.dim_head_qk, self.dim_head_v],
69 | dim=0)
70 | qr = torch.einsum('bgci,cij->bgij', q, q_embedding)
71 | kr = torch.einsum('bgci,cij->bgij', k, k_embedding).transpose(2, 3)
72 | qk = torch.einsum('bgci, bgcj->bgij', q, k)
73 |
74 | # multiply by factors
75 | qr = torch.mul(qr, self.f_qr)
76 | kr = torch.mul(kr, self.f_kr)
77 |
78 | stacked_similarity = torch.cat([qk, qr, kr], dim=1)
79 | stacked_similarity = self.bn_similarity(stacked_similarity).view(N * W, 3, self.heads, H, H).sum(dim=1)
80 | # stacked_similarity = self.bn_qr(qr) + self.bn_kr(kr) + self.bn_qk(qk)
81 | # (n_instances, heads, H, H, W)
82 | similarity = torch.softmax(stacked_similarity, dim=3)
83 | sv = torch.einsum('bgij,bgcj->bgci', similarity, v)
84 | sve = torch.einsum('bgij,cij->bgci', similarity, v_embedding)
85 |
86 | # multiply by factors
87 | sv = torch.mul(sv, self.f_sv)
88 | sve = torch.mul(sve, self.f_sve)
89 |
90 | stacked_output = torch.cat([sv, sve], dim=-1).view(N * W, self.out_channels * 2, H)
91 | output = self.bn_output(stacked_output).view(N, W, self.out_channels, 2, H).sum(dim=-2)
92 |
93 | if self.flag:
94 | output = output.permute(0, 2, 1, 3)
95 | else:
96 | output = output.permute(0, 2, 3, 1)
97 |
98 | return output
99 |
100 | def reset_parameters(self):
101 | nn.init.normal_(self.relative, 0., math.sqrt(1. / self.dim_head_v))
102 |
103 |
104 | class AxialLOB(nn.Module):
105 | def __init__(self, W=40, H=100, c_in=32, c_out=32, c_final=4, n_heads=4, pool_kernel=(1, 4), pool_stride=(1, 4)):
106 | super().__init__()
107 |
108 | # channel output of the CNN_in is the channel input for the axial layer
109 |
110 | self.c_in = c_in
111 | self.c_out = c_out
112 | self.c_final = c_final
113 |
114 | self.CNN_in = nn.Conv2d(in_channels=1, out_channels=c_in, kernel_size=1)
115 | self.CNN_out = nn.Conv2d(in_channels=c_out, out_channels=c_final, kernel_size=1)
116 | self.CNN_res2 = nn.Conv2d(in_channels=c_out, out_channels=c_final, kernel_size=1)
117 | self.CNN_res1 = nn.Conv2d(in_channels=1, out_channels=c_out, kernel_size=1)
118 |
119 | self.norm = nn.BatchNorm2d(c_in)
120 | self.res_norm2 = nn.BatchNorm2d(c_final)
121 | self.res_norm1 = nn.BatchNorm2d(c_out)
122 | self.norm2 = nn.BatchNorm2d(c_final)
123 |
124 | self.axial_height_1 = GatedAxialAttention(c_out, c_out, n_heads, H, flag=False)
125 | self.axial_width_1 = GatedAxialAttention(c_out, c_out, n_heads, W, flag=True)
126 |
127 | self.axial_height_2 = GatedAxialAttention(c_out, c_out, n_heads, H, flag=False)
128 | self.axial_width_2 = GatedAxialAttention(c_out, c_out, n_heads, W, flag=True)
129 |
130 | self.activation = nn.ReLU()
131 | self.linear = nn.Linear(4000, 3)
132 | self.pooling = nn.AvgPool2d(kernel_size=pool_kernel, stride=pool_stride)
133 |
134 | def forward(self, x):
135 | # first convolution before the attention
136 | y = self.CNN_in(x)
137 | y = self.norm(y)
138 | y = self.activation(y)
139 |
140 | # attention mechanism through gated multi head axial layer
141 | y = self.axial_width_1(y)
142 | y = self.axial_height_1(y)
143 |
144 | # lower branch
145 | x = self.CNN_res1(x)
146 | x = self.res_norm1(x)
147 | x = self.activation(x)
148 |
149 | # first residual
150 | y = y + x
151 | z = y.detach().clone()
152 |
153 | # second axial layer
154 | y = self.axial_width_2(y)
155 | y = self.axial_height_2(y)
156 |
157 | # second convolution
158 | y = self.CNN_out(y)
159 | y = self.res_norm2(y)
160 | y = self.activation(y)
161 |
162 | # lower branch
163 | z = self.CNN_res2(z)
164 | z = self.norm2(z)
165 | z = self.activation(z)
166 |
167 | # second res connection
168 | y = y + z
169 |
170 | # final part
171 | y = self.pooling(y)
172 | y = torch.flatten(y, 1)
173 | y = self.linear(y)
174 |
175 | return y
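176 |
177 |
178 | # NOTE: minimal shape-check sketch (editor's illustration, not part of the original file); the default W=40, H=100
179 | # matches a 100-snapshot window of a 10-level order book.
180 | '''
181 | if __name__ == "__main__":
182 |     model = AxialLOB(W=40, H=100)
183 |     x = torch.randn(8, 1, 100, 40)  # (batch, channel, window_size, orderbook_features)
184 |     print(model(x).shape)  # expected: torch.Size([8, 3])
185 | '''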
--------------------------------------------------------------------------------
/models/CNN1/cnn1.py:
--------------------------------------------------------------------------------
1 | import pytorch_lightning as pl
2 | from torch import nn
3 |
4 |
5 | class CNN1(pl.LightningModule):
6 | def __init__(self, num_features=40, num_classes=3, temp=26):
7 | super().__init__()
8 |
9 | # Convolution 1
10 | self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(4, num_features), padding=(3, 0), dilation=(2, 1))
11 | self.relu1 = nn.LeakyReLU()
12 |
13 | # Convolution 2
14 | self.conv2 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=(4,))
15 | self.relu2 = nn.LeakyReLU()
16 |
17 | # Max pool 1
18 | self.maxpool1 = nn.MaxPool1d(kernel_size=2)
19 |
20 | # Convolution 3
21 | self.conv3 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=(3,), padding=2)
22 | self.relu3 = nn.LeakyReLU()
23 |
24 | # Convolution 4
25 | self.conv4 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=(3,), padding=2)
26 | self.relu4 = nn.LeakyReLU()
27 |
28 | # Max pool 2
29 | self.maxpool2 = nn.MaxPool1d(kernel_size=2)
30 |
31 | # Fully connected 1
32 | self.fc1 = nn.Linear(temp*32, 32)
33 | self.relu5 = nn.LeakyReLU()
34 |
35 | # Fully connected 2
36 | self.fc2 = nn.Linear(32, num_classes)
37 |
38 | def forward(self, x):
39 | # Convolution 1
40 | out = self.conv1(x)
41 | out = self.relu1(out)
42 | out = out.reshape(out.shape[0], out.shape[1], -1)
43 | # print('After convolution1:', out.shape)
44 |
45 | # Convolution 2
46 | out = self.conv2(out)
47 | out = self.relu2(out)
48 | # print('After convolution2:', out.shape)
49 |
50 | # Max pool 1
51 | out = self.maxpool1(out)
52 | # print('After maxpool1:', out.shape)
53 |
54 | # Convolution 3
55 | out = self.conv3(out)
56 | out = self.relu3(out)
57 | # print('After convolution3:', out.shape)
58 |
59 | # Convolution 4
60 | out = self.conv4(out)
61 | out = self.relu4(out)
62 | # print('After convolution4:', out.shape)
63 |
64 | # Max pool 2
65 | out = self.maxpool2(out)
66 | # print('After maxpool2:', out.shape)
67 |
68 | # flatten
69 | out = out.view(out.size(0), -1)
70 | # print('After flatten:', out.shape)
71 |
72 | # Linear function 1
73 | out = self.fc1(out)
74 | out = self.relu5(out)
75 | # print('After linear1:', out.shape)
76 |
77 | # Linear function (readout)
78 | out = self.fc2(out)
79 | # print('After linear2:', out.shape)
80 |
81 | return out
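82 |
83 |
84 | # NOTE: minimal shape-check sketch (editor's illustration, not part of the original file); the default temp=26
85 | # corresponds to a 100x40 input window.
86 | '''
87 | if __name__ == "__main__":
88 |     import torch
89 |     model = CNN1(num_features=40, num_classes=3, temp=26)
90 |     x = torch.randn(8, 1, 100, 40)  # (batch, channel, window_size, orderbook_features)
91 |     print(model(x).shape)  # expected: torch.Size([8, 3])
92 | '''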
--------------------------------------------------------------------------------
/models/CNN2/cnn2.py:
--------------------------------------------------------------------------------
1 | # Using Deep Learning for price prediction by exploiting stationary limit order book features
2 | # Source: https://www.sciencedirect.com/science/article/pii/S1568494620303410
3 |
4 | import pytorch_lightning as pl
5 | from torch import nn
6 |
7 |
8 | class CNN2(pl.LightningModule):
9 | def __init__(self, num_features=40, num_classes=3, temp=249):
10 | super().__init__()
11 |
12 | # Convolution 1
13 | self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(10, 42), padding=(0, 2))
14 | self.bn1 = nn.BatchNorm2d(16)
15 | self.prelu1 = nn.PReLU()
16 |
17 | # Convolution 2
18 | self.conv2 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=(10,)) # 3
19 | self.bn2 = nn.BatchNorm1d(16)
20 | self.prelu2 = nn.PReLU()
21 |
22 | # Convolution 3
23 | self.conv3 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=(8,)) # 1
24 | self.bn3 = nn.BatchNorm1d(32)
25 | self.prelu3 = nn.PReLU()
26 |
27 | # Convolution 4
28 | self.conv4 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=(6,)) # 1
29 | self.bn4 = nn.BatchNorm1d(32)
30 | self.prelu4 = nn.PReLU()
31 |
32 | # Convolution 5
33 | self.conv5 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=(4,)) # 1
34 | self.bn5 = nn.BatchNorm1d(32)
35 | self.prelu5 = nn.PReLU()
36 |
37 | # Fully connected 1
38 | self.fc1 = nn.Linear(temp*32, 32)
39 | self.prelu6 = nn.PReLU()
40 |
41 | # Fully connected 2
42 | self.fc2 = nn.Linear(32, num_classes)
43 |
44 | def forward(self, x):
45 | # Convolution 1
46 | out = self.conv1(x)
47 | # print('After convolution1:', out.shape)
48 |
49 | out = self.bn1(out)
50 | # print('After bn1:', out.shape)
51 |
52 | out = self.prelu1(out)
53 | out = out.reshape(out.shape[0], out.shape[1], -1)
54 | # print('After prelu1:', out.shape)
55 |
56 | # Convolution 2
57 | out = self.conv2(out)
58 | out = self.bn2(out)
59 | out = self.prelu2(out)
60 | # print('After convolution2, bn2, prelu2:', out.shape)
61 |
62 | # Convolution 3
63 | out = self.conv3(out)
64 | out = self.bn3(out)
65 | out = self.prelu3(out)
66 | # print('After convolution3, bn3, prelu3:', out.shape)
67 |
68 | # Convolution 4
69 | out = self.conv4(out)
70 | out = self.bn4(out)
71 | out = self.prelu4(out)
72 | # print('After convolution4, bn4, prelu4:', out.shape)
73 |
74 | # Convolution 5
75 | out = self.conv5(out)
76 | out = self.bn5(out)
77 | out = self.prelu5(out)
78 | # print('After convolution5, bn5, prelu5:', out.shape)
79 |
80 | # flatten
81 | out = out.view(out.size(0), -1)
82 | # print('After flatten:', out.shape)
83 |
84 | # Linear function 1
85 | out = self.fc1(out)
86 | out = self.prelu6(out)
87 | # print('After fc1:', out.shape)
88 |
89 | # Linear function (readout)
90 | out = self.fc2(out)
91 | # print('After fc2:', out.shape)
92 |
93 | return out
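94 |
95 |
96 | # NOTE: minimal shape-check sketch (editor's illustration, not part of the original file); the default temp=249
97 | # corresponds to a 100x40 input window.
98 | '''
99 | if __name__ == "__main__":
100 |     import torch
101 |     model = CNN2(num_features=40, num_classes=3, temp=249)
102 |     x = torch.randn(8, 1, 100, 40)  # (batch, channel, window_size, orderbook_features)
103 |     print(model(x).shape)  # expected: torch.Size([8, 3])
104 | '''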
--------------------------------------------------------------------------------
/models/CompleteHCNN/complete_hcnn.py:
--------------------------------------------------------------------------------
1 | import pytorch_lightning as pl
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class Complete_HCNN(pl.LightningModule):
7 | def __init__(self, lighten, homological_structures):
8 | super().__init__()
9 | self.name = "hcnn"
10 | if lighten:
11 | self.name += "-lighten"
12 |
13 | self.homological_structures = homological_structures
14 | self.tetrahedra = self.homological_structures['tetrahedra']
15 | self.triangles = self.homological_structures['triangles']
16 | self.edges = self.homological_structures['edges']
17 |
18 | # ------------ #
19 |
20 | self.conv1_tetrahedra = nn.Sequential(
21 | nn.Conv2d(
22 | in_channels=1, out_channels=32, kernel_size=(1, 2), stride=(1, 2)
23 | ),
24 | nn.ReLU(),
25 | )
26 |
27 | self.conv1_triangles = nn.Sequential(
28 | nn.Conv2d(
29 | in_channels=1, out_channels=32, kernel_size=(1, 2), stride=(1, 2)
30 | ),
31 | nn.ReLU(),
32 | )
33 |
34 | self.conv1_edges = nn.Sequential(
35 | nn.Conv2d(
36 | in_channels=1, out_channels=32, kernel_size=(1, 2), stride=(1, 2)
37 | ),
38 | nn.ReLU(),
39 | )
40 |
41 | # ------------ #
42 |
43 | self.conv2_tetrahedra = nn.Sequential(
44 | nn.Conv2d(
45 | in_channels=32, out_channels=32, kernel_size=(1, 4), stride=(1, 4)
46 | ),
47 | nn.ReLU(),
48 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
49 | nn.ReLU(),
50 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
51 | nn.ReLU(),
52 | )
53 |
54 | self.conv2_triangles = nn.Sequential(
55 | nn.Conv2d(
56 | in_channels=32, out_channels=32, kernel_size=(1, 3), stride=(1, 3)
57 | ),
58 | nn.ReLU(),
59 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
60 | nn.ReLU(),
61 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
62 | nn.ReLU(),
63 | )
64 |
65 | self.conv2_edges = nn.Sequential(
66 | nn.Conv2d(
67 | in_channels=32, out_channels=32, kernel_size=(1, 2), stride=(1, 2)
68 | ),
69 | nn.ReLU(),
70 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
71 | nn.ReLU(),
72 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
73 | nn.ReLU(),
74 | )
75 |
76 | # ------------ #
77 |
78 | self.conv3_tetrahedra = nn.Sequential(
79 | nn.Conv2d(
80 | in_channels=32, out_channels=32, kernel_size=(1, int(len(self.tetrahedra) / 8))
81 | ),
82 | nn.Dropout(0.35),
83 | nn.ReLU(),
84 | )
85 |
86 | self.conv3_triangles = nn.Sequential(
87 | nn.Conv2d(
88 | in_channels=32, out_channels=32, kernel_size=(1, int(len(self.triangles) / 6))
89 | ),
90 | nn.Dropout(0.35),
91 | nn.ReLU(),
92 | )
93 |
94 | self.conv3_edges = nn.Sequential(
95 | nn.Conv2d(
96 | in_channels=32, out_channels=32, kernel_size=(1, int(len(self.edges) / 4))
97 | ),
98 | nn.Dropout(0.35),
99 | nn.ReLU(),
100 | )
101 |
102 | # ------------ #
103 |
104 | self.lstm = nn.LSTM(
105 | input_size=96, hidden_size=32, num_layers=1, batch_first=True
106 | )
107 | self.fc1 = nn.Linear(32, 3)
108 |
109 | def forward(self, x):
110 | x_tetrahedra = x[:, :, :, self.tetrahedra]
111 | x_triangles = x[:, :, :, self.triangles]
112 | x_edges = x[:, :, :, self.edges]
113 |
114 | x_tetrahedra = self.conv1_tetrahedra(x_tetrahedra)
115 | x_triangles = self.conv1_triangles(x_triangles)
116 | x_edges = self.conv1_edges(x_edges)
117 |
118 | x_tetrahedra = self.conv2_tetrahedra(x_tetrahedra)
119 | x_triangles = self.conv2_triangles(x_triangles)
120 | x_edges = self.conv2_edges(x_edges)
121 |
122 | x_tetrahedra = self.conv3_tetrahedra(x_tetrahedra)
123 | x_triangles = self.conv3_triangles(x_triangles)
124 | x_edges = self.conv3_edges(x_edges)
125 |
126 | x = torch.cat((x_tetrahedra, x_triangles, x_edges), dim=1)
127 |
128 | x = x.permute(0, 2, 1, 3)
129 | x = torch.reshape(x, (-1, x.shape[1], x.shape[2]))
130 |
131 | x, _ = self.lstm(x)
132 | x = x[:, -1, :]
133 | logits = self.fc1(x)
134 |
135 | return logits
136 |
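137 | # NOTE: minimal shape-check sketch (editor's illustration, not part of the original file). The real homological
138 | # structures come from data_processing.complete_homological_utils.get_complete_homology; the dummy index lists
139 | # below only respect the divisibility assumed by the conv3 blocks (tetrahedra by 8, triangles by 6, edges by 4).
140 | '''
141 | if __name__ == "__main__":
142 |     dummy_structures = {"tetrahedra": list(range(16)), "triangles": list(range(12)), "edges": list(range(8))}
143 |     model = Complete_HCNN(lighten=False, homological_structures=dummy_structures)
144 |     x = torch.randn(8, 1, 100, 40)  # (batch, channel, window_size, orderbook_features)
145 |     print(model(x).shape)  # expected: torch.Size([8, 3])
146 | '''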
--------------------------------------------------------------------------------
/models/DLA/DLA.py:
--------------------------------------------------------------------------------
1 | import pytorch_lightning as pl
2 | from torch import nn
3 | import torch
4 |
5 |
6 | class DLA(pl.LightningModule):
7 | def __init__(self, lighten, num_snapshots=100, hidden_size=128):
8 | super().__init__()
9 | self.name = "mlp"
10 | num_features = 40
11 | if lighten:
12 | self.name += "-lighten"
13 | num_features = 20
14 |
15 | self.W1 = nn.Linear(num_features, num_features, bias=False)
16 |
17 | self.softmax = nn.Softmax(dim=1)
18 |
19 | self.gru = nn.GRU(
20 | input_size=num_features,
21 | hidden_size=hidden_size,
22 | num_layers=2,
23 | batch_first=True,
24 | dropout=0.5
25 | )
26 |
27 | self.W2 = nn.Linear(hidden_size, hidden_size, bias=False)
28 | self.W3 = nn.Linear(num_snapshots*hidden_size, 3)
29 |
30 | def forward(self, x):
31 | # x.shape = [batch_size, num_snapshots, num_features]
32 | x = x.squeeze(1)
33 |
34 | X_tilde = self.W1(x)
35 | # X_tilde.shape = [batch_size, num_snapshots, num_features]
36 |
37 | alpha = self.softmax(X_tilde)
38 | # alpha.shape = [batch_size, num_snapshots, num_features]
39 |
40 | alpha = torch.mean(alpha, dim=2)
41 | # alpha.shape = [batch_size, num_snapshots]
42 |
43 | x_tilde = torch.einsum('ij,ijk->ijk', [alpha, x])
44 | # x_tilde.shape = [batch_size, num_snapshots, num_features]
45 |
46 | H, _ = self.gru(x_tilde)
47 | # H.shape = [batch_size, num_snapshots, hidden_size]
48 |
49 | H_tilde = self.W2(H)
50 | # H_tilde.shape = [batch_size, num_snapshots, hidden_size]
51 |
52 | beta = self.softmax(H_tilde)
53 | # beta.shape = [batch_size, num_snapshots, hidden_size]
54 |
55 | beta = torch.mean(beta, dim=2)
56 | # beta.shape = [batch_size, num_snapshots]
57 |
58 | h_tilde = torch.einsum('ij,ijk->ijk', [beta, H])
59 | # h_tilde.shape = [batch_size, num_snapshots, hidden_size]
60 |
61 | h_tilde = torch.flatten(h_tilde, start_dim=1)
62 | # h_tilde.shape = [batch_size, hidden_size*num_snapshots]
63 |
64 | logits = self.W3(h_tilde)
65 | # logits.shape = [batch_size, 3]
66 |
67 | return logits
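68 |
69 |
70 | # NOTE: minimal shape-check sketch (editor's illustration, not part of the original file).
71 | '''
72 | if __name__ == "__main__":
73 |     model = DLA(lighten=False, num_snapshots=100, hidden_size=128)
74 |     x = torch.randn(8, 1, 100, 40)  # (batch, channel, window_size, orderbook_features)
75 |     print(model(x).shape)  # expected: torch.Size([8, 3])
76 | '''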
--------------------------------------------------------------------------------
/models/DeepLob/deeplob.py:
--------------------------------------------------------------------------------
1 | import pytorch_lightning as pl
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class DeepLOB(pl.LightningModule):
7 | def __init__(self, lighten):
8 | super().__init__()
9 | self.name = "deeplob"
10 | if lighten:
11 | self.name += "-lighten"
12 |
13 | # Convolution blocks.
14 | self.conv1 = nn.Sequential(
15 | nn.Conv2d(
16 | in_channels=1, out_channels=32, kernel_size=(1, 2), stride=(1, 2)
17 | ),
18 | nn.LeakyReLU(negative_slope=0.01),
19 | nn.BatchNorm2d(32),
20 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
21 | nn.LeakyReLU(negative_slope=0.01),
22 | nn.BatchNorm2d(32),
23 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
24 | nn.LeakyReLU(negative_slope=0.01),
25 | nn.BatchNorm2d(32),
26 | )
27 | self.conv2 = nn.Sequential(
28 | nn.Conv2d(
29 | in_channels=32, out_channels=32, kernel_size=(1, 2), stride=(1, 2)
30 | ),
31 | nn.LeakyReLU(negative_slope=0.01),
32 | nn.BatchNorm2d(32),
33 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
34 | nn.LeakyReLU(negative_slope=0.01),
35 | nn.BatchNorm2d(32),
36 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
37 | nn.LeakyReLU(negative_slope=0.01),
38 | nn.BatchNorm2d(32),
39 | )
40 |
41 | if lighten:
42 | conv3_kernel_size = 5
43 | else:
44 | conv3_kernel_size = 10
45 |
46 | self.conv3 = nn.Sequential(
47 | nn.Conv2d(
48 | in_channels=32, out_channels=32, kernel_size=(1, conv3_kernel_size)
49 | ),
50 | nn.LeakyReLU(negative_slope=0.01),
51 | nn.BatchNorm2d(32),
52 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
53 | nn.LeakyReLU(negative_slope=0.01),
54 | nn.BatchNorm2d(32),
55 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
56 | nn.LeakyReLU(negative_slope=0.01),
57 | nn.BatchNorm2d(32),
58 | )
59 |
60 | # Inception modules.
61 | self.inp1 = nn.Sequential(
62 | nn.Conv2d(
63 | in_channels=32, out_channels=64, kernel_size=(1, 1), padding="same"
64 | ),
65 | nn.LeakyReLU(negative_slope=0.01),
66 | nn.BatchNorm2d(64),
67 | nn.Conv2d(
68 | in_channels=64, out_channels=64, kernel_size=(3, 1), padding="same"
69 | ),
70 | nn.LeakyReLU(negative_slope=0.01),
71 | nn.BatchNorm2d(64),
72 | )
73 | self.inp2 = nn.Sequential(
74 | nn.Conv2d(
75 | in_channels=32, out_channels=64, kernel_size=(1, 1), padding="same"
76 | ),
77 | nn.LeakyReLU(negative_slope=0.01),
78 | nn.BatchNorm2d(64),
79 | nn.Conv2d(
80 | in_channels=64, out_channels=64, kernel_size=(5, 1), padding="same"
81 | ),
82 | nn.LeakyReLU(negative_slope=0.01),
83 | nn.BatchNorm2d(64),
84 | )
85 | self.inp3 = nn.Sequential(
86 | nn.MaxPool2d((3, 1), stride=(1, 1), padding=(1, 0)),
87 | nn.Conv2d(
88 | in_channels=32, out_channels=64, kernel_size=(1, 1), padding="same"
89 | ),
90 | nn.LeakyReLU(negative_slope=0.01),
91 | nn.BatchNorm2d(64),
92 | )
93 |
94 | # lstm layers
95 | self.lstm = nn.LSTM(
96 | input_size=192, hidden_size=64, num_layers=1, batch_first=True
97 | )
98 | self.fc1 = nn.Linear(64, 3)
99 |
100 | def forward(self, x):
101 | x = self.conv1(x)
102 | x = self.conv2(x)
103 | x = self.conv3(x)
104 |
105 | x_inp1 = self.inp1(x)
106 | x_inp2 = self.inp2(x)
107 | x_inp3 = self.inp3(x)
108 |
109 | x = torch.cat((x_inp1, x_inp2, x_inp3), dim=1)
110 |
111 | x = x.permute(0, 2, 1, 3)
112 | x = torch.reshape(x, (-1, x.shape[1], x.shape[2]))
113 |
114 | x, _ = self.lstm(x)
115 | x = x[:, -1, :]
116 | logits = self.fc1(x)
117 |
118 | return logits
119 |
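120 | # NOTE: minimal shape-check sketch (editor's illustration, not part of the original file); with lighten=True the
121 | # model expects 20 order-book features instead of 40.
122 | '''
123 | if __name__ == "__main__":
124 |     model = DeepLOB(lighten=False)
125 |     x = torch.randn(8, 1, 100, 40)  # (batch, channel, window_size, orderbook_features)
126 |     print(model(x).shape)  # expected: torch.Size([8, 3])
127 | '''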
--------------------------------------------------------------------------------
/models/LobTransformer/lobtransformer.py:
--------------------------------------------------------------------------------
1 | import pytorch_lightning as pl
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class LobTransformer(pl.LightningModule):
7 | def __init__(self, lighten):
8 | super().__init__()
9 | self.name = "lobtransformer"
10 | if lighten:
11 | self.name += "-lighten"
12 |
13 | hidden = 32 if not lighten else 16
14 | d_model = hidden * 2 * 3
15 | nhead = 8 if not lighten else 4
16 | num_layers = 2 if not lighten else 1
17 |
18 | # Convolution blocks.
19 | self.conv1 = nn.Sequential(
20 | nn.Conv2d(
21 | in_channels=1, out_channels=hidden, kernel_size=(1, 2), stride=(1, 2)
22 | ),
23 | nn.LeakyReLU(negative_slope=0.01),
24 | nn.BatchNorm2d(hidden),
25 | nn.Conv2d(in_channels=hidden, out_channels=hidden, kernel_size=(4, 1)),
26 | nn.LeakyReLU(negative_slope=0.01),
27 | nn.BatchNorm2d(hidden),
28 | nn.Conv2d(in_channels=hidden, out_channels=hidden, kernel_size=(4, 1)),
29 | nn.LeakyReLU(negative_slope=0.01),
30 | nn.BatchNorm2d(hidden),
31 | )
32 | self.conv2 = nn.Sequential(
33 | nn.Conv2d(
34 | in_channels=hidden, out_channels=hidden, kernel_size=(1, 2), stride=(1, 2)
35 | ),
36 | nn.LeakyReLU(negative_slope=0.01),
37 | nn.BatchNorm2d(hidden),
38 | nn.Conv2d(in_channels=hidden, out_channels=hidden, kernel_size=(4, 1)),
39 | nn.LeakyReLU(negative_slope=0.01),
40 | nn.BatchNorm2d(hidden),
41 | nn.Conv2d(in_channels=hidden, out_channels=hidden, kernel_size=(4, 1)),
42 | nn.LeakyReLU(negative_slope=0.01),
43 | nn.BatchNorm2d(hidden),
44 | )
45 |
46 | if lighten:
47 | conv3_kernel_size = 5
48 | else:
49 | conv3_kernel_size = 10
50 |
51 | self.conv3 = nn.Sequential(
52 | nn.Conv2d(
53 | in_channels=hidden, out_channels=hidden, kernel_size=(1, conv3_kernel_size)
54 | ),
55 | nn.LeakyReLU(negative_slope=0.01),
56 | nn.BatchNorm2d(hidden),
57 | nn.Conv2d(in_channels=hidden, out_channels=hidden, kernel_size=(4, 1)),
58 | nn.LeakyReLU(negative_slope=0.01),
59 | nn.BatchNorm2d(hidden),
60 | nn.Conv2d(in_channels=hidden, out_channels=hidden, kernel_size=(4, 1)),
61 | nn.LeakyReLU(negative_slope=0.01),
62 | nn.BatchNorm2d(hidden),
63 | )
64 |
65 | # Inception modules.
66 | self.inp1 = nn.Sequential(
67 | nn.Conv2d(
68 | in_channels=hidden, out_channels=hidden*2, kernel_size=(1, 1), padding="same"
69 | ),
70 | nn.LeakyReLU(negative_slope=0.01),
71 | nn.BatchNorm2d(hidden*2),
72 | nn.Conv2d(
73 | in_channels=hidden*2, out_channels=hidden*2, kernel_size=(3, 1), padding="same"
74 | ),
75 | nn.LeakyReLU(negative_slope=0.01),
76 | nn.BatchNorm2d(hidden*2),
77 | )
78 | self.inp2 = nn.Sequential(
79 | nn.Conv2d(
80 | in_channels=hidden, out_channels=hidden*2, kernel_size=(1, 1), padding="same"
81 | ),
82 | nn.LeakyReLU(negative_slope=0.01),
83 | nn.BatchNorm2d(hidden*2),
84 | nn.Conv2d(
85 | in_channels=hidden*2, out_channels=hidden*2, kernel_size=(5, 1), padding="same"
86 | ),
87 | nn.LeakyReLU(negative_slope=0.01),
88 | nn.BatchNorm2d(hidden*2),
89 | )
90 | self.inp3 = nn.Sequential(
91 | nn.MaxPool2d((3, 1), stride=(1, 1), padding=(1, 0)),
92 | nn.Conv2d(
93 | in_channels=hidden, out_channels=hidden*2, kernel_size=(1, 1), padding="same"
94 | ),
95 | nn.LeakyReLU(negative_slope=0.01),
96 | nn.BatchNorm2d(hidden*2),
97 | )
98 |
99 | # transformer
100 | encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
101 | self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
102 | self.cat_head = nn.Linear(d_model, 3)
103 |
104 |
105 | def forward(self, x):
106 | x = self.conv1(x)
107 | x = self.conv2(x)
108 | x = self.conv3(x)
109 |
110 | x_inp1 = self.inp1(x)
111 | x_inp2 = self.inp2(x)
112 | x_inp3 = self.inp3(x)
113 |
114 | x = torch.cat((x_inp1, x_inp2, x_inp3), dim=1)
115 |
116 | x = x.permute(0, 2, 1, 3)
117 | x = torch.reshape(x, (-1, x.shape[1], x.shape[2]))
118 |
119 | x = self.transformer_encoder(x)
120 | # mean pool
121 | x = torch.mean(x, dim=1)
122 |
123 | logits = self.cat_head(x)
124 | return logits
125 |
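126 | # NOTE: minimal shape-check sketch (editor's illustration, not part of the original file); with lighten=True the
127 | # model expects 20 order-book features instead of 40.
128 | '''
129 | if __name__ == "__main__":
130 |     model = LobTransformer(lighten=False)
131 |     x = torch.randn(8, 1, 100, 40)  # (batch, channel, window_size, orderbook_features)
132 |     print(model(x).shape)  # expected: torch.Size([8, 3])
133 | '''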
--------------------------------------------------------------------------------
/models/TABL/bin_nn.py:
--------------------------------------------------------------------------------
1 | import pytorch_lightning as pl
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class BiN(pl.LightningModule):
7 | def __init__(self, d2, d1, t1, t2):
8 | super().__init__()
9 | self.t1 = t1
10 | self.d1 = d1
11 | self.t2 = t2
12 | self.d2 = d2
13 |
14 | bias1 = torch.Tensor(t1, 1)
15 | self.B1 = nn.Parameter(bias1)
16 | nn.init.constant_(self.B1, 0)
17 |
18 | l1 = torch.Tensor(t1, 1)
19 | self.l1 = nn.Parameter(l1)
20 | nn.init.xavier_normal_(self.l1)
21 |
22 | bias2 = torch.Tensor(d1, 1)
23 | self.B2 = nn.Parameter(bias2)
24 | nn.init.constant_(self.B2, 0)
25 |
26 | l2 = torch.Tensor(d1, 1)
27 | self.l2 = nn.Parameter(l2)
28 | nn.init.xavier_normal_(self.l2)
29 |
30 | y1 = torch.Tensor(1, )
31 | self.y1 = nn.Parameter(y1)
32 | nn.init.constant_(self.y1, 0.5)
33 |
34 | y2 = torch.Tensor(1, )
35 | self.y2 = nn.Parameter(y2)
36 | nn.init.constant_(self.y2, 0.5)
37 |
38 | def forward(self, x):
39 |
40 | # If either of the two mixing scalars becomes negative, we reset it to a small positive value (0.01).
41 | if (self.y1[0] < 0):
42 | y1 = torch.empty(1, device=x.device)
43 | self.y1 = nn.Parameter(y1)
44 | nn.init.constant_(self.y1, 0.01)
45 |
46 | if (self.y2[0] < 0):
47 | y2 = torch.empty(1, device=x.device)
48 | self.y2 = nn.Parameter(y2)
49 | nn.init.constant_(self.y2, 0.01)
50 |
51 | # normalization along the temporal dimension
52 | T2 = torch.ones([self.t1, 1], device=x.device)
53 | x2 = torch.mean(x, dim=2)
54 | x2 = torch.reshape(x2, (x2.shape[0], x2.shape[1], 1))
55 |
56 | std = torch.std(x, dim=2)
57 | std = torch.reshape(std, (std.shape[0], std.shape[1], 1))
58 | # The std of some temporal slices can be 0, which would produce inf values, so we set those entries to one.
59 | std[std < 1e-4] = 1
60 |
61 | diff = x - (x2 @ (T2.T))
62 | Z2 = diff / (std @ (T2.T))
63 |
64 | X2 = self.l2 @ T2.T
65 | X2 = X2 * Z2
66 | X2 = X2 + (self.B2 @ T2.T)
67 |
68 | # normalization along the feature dimension
69 | T1 = torch.ones([self.d1, 1], device=x.device)
70 | x1 = torch.mean(x, dim=1)
71 | x1 = torch.reshape(x1, (x1.shape[0], x1.shape[1], 1))
72 |
73 | std = torch.std(x, dim=1)
74 | std = torch.reshape(std, (std.shape[0], std.shape[1], 1))
75 |
76 | op1 = x1 @ T1.T
77 | op1 = torch.permute(op1, (0, 2, 1))
78 |
79 | op2 = std @ T1.T
80 | op2 = torch.permute(op2, (0, 2, 1))
81 |
82 | z1 = (x - op1) / (op2)
83 | X1 = (T1 @ self.l1.T)
84 | X1 = X1 * z1
85 | X1 = X1 + (T1 @ self.B1.T)
86 |
87 | # Weighting the importance of the temporal and feature normalizations.
88 | x = self.y1 * X1 + self.y2 * X2
89 |
90 | return x
--------------------------------------------------------------------------------
/models/TABL/bin_tabl.py:
--------------------------------------------------------------------------------
1 | import pytorch_lightning as pl
2 | from torch import nn
3 | import torch
4 | from models.TABL.bin_nn import BiN
5 | from models.TABL.bl_layer import BL_layer
6 | from models.TABL.tabl_layer import TABL_layer
7 |
8 |
9 | class BiN_BTABL(pl.LightningModule):
10 | def __init__(self, d2, d1, t1, t2, d3, t3):
11 | super().__init__()
12 |
13 | self.BiN = BiN(d2, d1, t1, t2)
14 | self.BL = BL_layer(d2, d1, t1, t2)
15 | self.TABL = TABL_layer(d3, d2, t2, t3)
16 | self.dropout = nn.Dropout(0.1)
17 |
18 | def forward(self, x):
19 | x = x.squeeze(1)
20 | # first of all we pass the input to the BiN layer, then we use the B(TABL) architecture
21 | x = torch.permute(x, (0, 2, 1))
22 |
23 | x = self.BiN(x)
24 |
25 | self.max_norm_(self.BL.W1.data)
26 | self.max_norm_(self.BL.W2.data)
27 | x = self.BL(x)
28 | x = self.dropout(x)
29 |
30 | self.max_norm_(self.TABL.W1.data)
31 | self.max_norm_(self.TABL.W.data)
32 | self.max_norm_(self.TABL.W2.data)
33 | x = self.TABL(x)
34 | x = torch.squeeze(x, 2)
35 | return x
36 |
37 | def max_norm_(self, w):
38 | with torch.no_grad():
39 | if (torch.linalg.matrix_norm(w) > 10.0):
40 | norm = torch.linalg.matrix_norm(w)
41 | desired = torch.clamp(norm, min=0.0, max=10.0)
42 | w *= (desired / (1e-8 + norm))
43 |
44 |
45 | class BiN_CTABL(pl.LightningModule):
46 | def __init__(self, d2, d1, t1, t2, d3, t3, d4, t4):
47 | super().__init__()
48 |
49 | self.BiN = BiN(d2, d1, t1, t2)
50 | self.BL = BL_layer(d2, d1, t1, t2)
51 | self.BL2 = BL_layer(d3, d2, t2, t3)
52 | self.TABL = TABL_layer(d4, d3, t3, t4)
53 | self.dropout = nn.Dropout(0.1)
54 |
55 | def forward(self, x):
56 | x = x.squeeze(1)
57 | # first of all we pass the input to the BiN layer, then we use the C(TABL) architecture
58 | x = torch.permute(x, (0, 2, 1))
59 |
60 | x = self.BiN(x)
61 |
62 | self.max_norm_(self.BL.W1.data)
63 | self.max_norm_(self.BL.W2.data)
64 | x = self.BL(x)
65 | x = self.dropout(x)
66 |
67 | self.max_norm_(self.BL2.W1.data)
68 | self.max_norm_(self.BL2.W2.data)
69 | x = self.BL2(x)
70 | x = self.dropout(x)
71 |
72 | self.max_norm_(self.TABL.W1.data)
73 | self.max_norm_(self.TABL.W.data)
74 | self.max_norm_(self.TABL.W2.data)
75 | x = self.TABL(x)
76 | x = torch.squeeze(x, 2)
77 | return x
78 |
79 | def max_norm_(self, w):
80 | with torch.no_grad():
81 | if (torch.linalg.matrix_norm(w) > 10.0):
82 | norm = torch.linalg.matrix_norm(w)
83 | desired = torch.clamp(norm, min=0.0, max=10.0)
84 | w *= (desired / (1e-8 + norm))
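85 |
86 |
87 | # NOTE: minimal usage sketch (editor's illustration, not part of the original file). The C(TABL) dimensions below
88 | # follow the original TABL paper and are illustrative only; the original layers allocate some buffers on "cuda",
89 | # so a GPU is assumed to be available.
90 | '''
91 | if __name__ == "__main__":
92 |     model = BiN_CTABL(60, 40, 100, 10, 120, 5, 3, 1).cuda()
93 |     x = torch.randn(8, 1, 100, 40, device="cuda")  # (batch, channel, window_size, orderbook_features)
94 |     print(model(x).shape)  # expected: torch.Size([8, 3])
95 | '''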
--------------------------------------------------------------------------------
/models/TABL/bl_layer.py:
--------------------------------------------------------------------------------
1 | import pytorch_lightning as pl
2 | from torch import nn
3 | import torch
4 |
5 |
6 | class BL_layer(pl.LightningModule):
7 | def __init__(self, d2, d1, t1, t2):
8 | super().__init__()
9 | weight1 = torch.Tensor(d2, d1)
10 | self.W1 = nn.Parameter(weight1)
11 | nn.init.kaiming_uniform_(self.W1, nonlinearity='relu')
12 |
13 | weight2 = torch.Tensor(t1, t2)
14 | self.W2 = nn.Parameter(weight2)
15 | nn.init.kaiming_uniform_(self.W2, nonlinearity='relu')
16 |
17 | bias1 = torch.zeros((d2, t2))
18 | self.B = nn.Parameter(bias1)
19 | nn.init.constant_(self.B, 0)
20 |
21 | self.activation = nn.ReLU()
22 |
23 | def forward(self, x):
24 |
25 | x = self.activation(self.W1 @ x @ self.W2 + self.B)
26 |
27 | return x
--------------------------------------------------------------------------------
/models/TABL/tabl_layer.py:
--------------------------------------------------------------------------------
1 | import pytorch_lightning as pl
2 | import torch
3 | from torch import nn
4 |
5 |
6 | class TABL_layer(pl.LightningModule):
7 | def __init__(self, d2, d1, t1, t2):
8 | super().__init__()
9 | self.t1 = t1
10 |
11 | weight = torch.Tensor(d2, d1)
12 | self.W1 = nn.Parameter(weight)
13 | nn.init.kaiming_uniform_(self.W1, nonlinearity='relu')
14 |
15 | weight2 = torch.Tensor(t1, t1)
16 | self.W = nn.Parameter(weight2)
17 | nn.init.constant_(self.W, 1 / t1)
18 |
19 | weight3 = torch.Tensor(t1, t2)
20 | self.W2 = nn.Parameter(weight3)
21 | nn.init.kaiming_uniform_(self.W2, nonlinearity='relu')
22 |
23 | bias1 = torch.Tensor(d2, t2)
24 | self.B = nn.Parameter(bias1)
25 | nn.init.constant_(self.B, 0)
26 |
27 | l = torch.Tensor(1, )
28 | self.l = nn.Parameter(l)
29 | nn.init.constant_(self.l, 0.5)
30 |
31 | self.activation = nn.ReLU()
32 |
33 | def forward(self, X):
34 |
35 | # maintaining the weight parameter between 0 and 1.
36 | if (self.l[0] < 0):
37 | l = torch.Tensor(1, )
38 | self.l = nn.Parameter(l)
39 | nn.init.constant_(self.l, 0.0)
40 |
41 | if (self.l[0] > 1):
42 | l = torch.Tensor(1, )
43 | self.l = nn.Parameter(l)
44 | nn.init.constant_(self.l, 1.0)
45 |
46 | # modelling the dependence along the first mode of X while keeping the temporal order intact (7)
47 | X = self.W1 @ X
48 |
49 | # enforcing a constant (1/t1) on the diagonal of W
50 | W = self.W - self.W * torch.eye(self.t1, dtype=torch.float32, device=X.device) + torch.eye(self.t1, dtype=torch.float32, device=X.device) / self.t1
51 |
52 | # attention, the aim of the second step is to learn how important the temporal instances are to each other (8)
53 | E = X @ W
54 |
55 | # computing the attention mask (9)
56 | A = torch.softmax(E, dim=-1)
57 |
58 | # applying a soft attention mechanism (10)
59 | # The attention mask A obtained from the third step is used to zero out the effect of unimportant elements.
60 | X = self.l[0] * (X) + (1.0 - self.l[0]) * X * A
61 |
62 | # the final step of the proposed layer estimates the temporal mapping W2, after the bias shift (11)
63 | y = X @ self.W2 + self.B
64 | return y
--------------------------------------------------------------------------------
/models/Transformer/transformer.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | import numpy as np
4 |
5 | import pytorch_lightning as pl
6 | import torch
7 | import torch.nn as nn
8 |
9 |
10 | class SinusoidalPositionalEmbedding(nn.Embedding):
11 | """This module produces sinusoidal positional embeddings of any length."""
12 |
13 | def __init__(
14 | self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None
15 | ) -> None:
16 | super().__init__(num_positions, embedding_dim)
17 | self.weight = self._init_weight(self.weight)
18 |
19 | @staticmethod
20 | def _init_weight(out: nn.Parameter) -> nn.Parameter:
21 | """
22 | Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
23 | the 2nd half of the vector. [dim // 2:]
24 | """
25 | n_pos, dim = out.shape
26 | position_enc = np.array(
27 | [
28 | [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
29 | for pos in range(n_pos)
30 | ]
31 | )
32 | out.requires_grad = False # set early to avoid an error in pytorch-1.8+
33 | sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1
34 | out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
35 | out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
36 | out.detach_()
37 | return out
38 |
39 | @torch.no_grad()
40 | def forward(
41 | self, input_ids_shape: torch.Size, past_key_values_length: int = 0
42 | ) -> torch.Tensor:
43 | """`input_ids_shape` is expected to be [bsz x seqlen]."""
44 | _, seq_len = input_ids_shape[:2]
45 | positions = torch.arange(
46 | past_key_values_length,
47 | past_key_values_length + seq_len,
48 | dtype=torch.long,
49 | device=self.weight.device,
50 | )
51 | return super().forward(positions)
52 |
53 |
54 | class Transformer(pl.LightningModule):
55 | def __init__(
56 | self,
57 | lighten,
58 | dropout: float = 0.1,
59 | activation: str = "relu",
60 | norm_first: bool = False,
61 | ):
62 | super().__init__()
63 | self.name = "transformer"
64 | if lighten:
65 | self.name += "-lighten"
66 |
67 | d_model = 64 if not lighten else 32
68 | dim_feedforward = 256 if not lighten else 128
69 | nhead = 8 if not lighten else 4
70 | num_layers = 2 if not lighten else 1
71 |
72 | self.embed = nn.Linear(40, d_model, bias=False)
73 |
74 | self.embed_positions = SinusoidalPositionalEmbedding(100, d_model)
75 |
76 | layer_norm_eps: float = 1e-5
77 | encoder_layer = nn.TransformerEncoderLayer(
78 | d_model=d_model,
79 | nhead=nhead,
80 | dim_feedforward=dim_feedforward,
81 | dropout=dropout,
82 | activation=activation,
83 | layer_norm_eps=layer_norm_eps,
84 | norm_first=norm_first,
85 | batch_first=True,
86 | )
87 | encoder_norm = nn.LayerNorm(d_model, eps=layer_norm_eps)
88 | self.transformer_encoder = nn.TransformerEncoder(
89 | encoder_layer, num_layers=num_layers, norm=encoder_norm
90 | )
91 | self.cat_head = nn.Linear(d_model, 3)
92 |
93 | def forward(self, x):
94 | x = self.embed(x.squeeze(1))
95 |
96 | embed_pos = self.embed_positions(x.shape)
97 |
98 | # transformer encoder
99 | x = self.transformer_encoder(x + embed_pos)
100 |
101 | # mean pool for classification
102 | x = torch.mean(x, dim=1)
103 |
104 | logits = self.cat_head(x)
105 | return logits
106 |
--------------------------------------------------------------------------------
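
A quick smoke test for the encoder above, assuming the repository root is on the Python path and the dependencies from requirements.txt are installed. The (batch, 1, 100, 40) layout is inferred from nn.Linear(40, d_model), SinusoidalPositionalEmbedding(100, d_model) and the squeeze(1) in forward(); treat it as an assumption rather than a documented contract.

import torch
from models.Transformer.transformer import Transformer

model = Transformer(lighten=True)   # 32-dim model, 4 heads, 1 encoder layer
x = torch.randn(4, 1, 100, 40)      # hypothetical batch of 100-step, 40-feature LOB windows
logits = model(x)
print(logits.shape)                 # torch.Size([4, 3]), three-class directional logits
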
/models/iTransformer/itransformer.py:
--------------------------------------------------------------------------------
1 | import pytorch_lightning as pl
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class ITransformer(pl.LightningModule):
7 | def __init__(
8 | self,
9 | lighten,
10 | dropout: float = 0.1,
11 | activation: str = "relu",
12 | norm_first: bool = False,
13 | ):
14 | super().__init__()
15 | self.name = "itransformer"
16 | if lighten:
17 | self.name += "-lighten"
18 |
19 | d_model = 64 if not lighten else 32
20 | dim_feedforward = 256 if not lighten else 128
21 | nhead = 8 if not lighten else 4
22 | num_layers = 2 if not lighten else 1
23 |
24 | self.embed = nn.Linear(100, d_model, bias=False)
25 | layer_norm_eps: float = 1e-5
26 | encoder_layer = nn.TransformerEncoderLayer(
27 | d_model=d_model,
28 | nhead=nhead,
29 | dim_feedforward=dim_feedforward,
30 | dropout=dropout,
31 | activation=activation,
32 | layer_norm_eps=layer_norm_eps,
33 | norm_first=norm_first,
34 | batch_first=True,
35 | )
36 | encoder_norm = nn.LayerNorm(d_model, eps=layer_norm_eps)
37 | self.transformer_encoder = nn.TransformerEncoder(
38 | encoder_layer, num_layers=num_layers, norm=encoder_norm
39 | )
40 | self.cat_head = nn.Linear(d_model, 3)
41 |
42 | def forward(self, x):
43 | x = x.squeeze(1)
44 | # transpose
45 | x = x.permute(0, 2, 1)
46 | x = self.embed(x)
47 |
48 | # transformer encoder
49 | x = self.transformer_encoder(x)
50 |
51 | # mean pool for classification
52 | x = torch.mean(x, dim=1)
53 |
54 | logits = self.cat_head(x)
55 | return logits
56 |
--------------------------------------------------------------------------------
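
The only structural difference from the vanilla Transformer above is the inverted tokenization: after the squeeze and permute, each of the 40 LOB features becomes a token whose 100 time steps are embedded. A short shape trace (plain torch, hypothetical sizes) makes the permute explicit.

import torch

x = torch.randn(4, 1, 100, 40)       # assumed (batch, channel, time, features) window
x = x.squeeze(1).permute(0, 2, 1)    # -> (4, 40, 100): 40 tokens, each a length-100 series
embed = torch.nn.Linear(100, 32, bias=False)
tokens = embed(x)                    # -> (4, 40, 32), then encoder + mean pooling over tokens
print(tokens.shape)
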
/optimizers/executor.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from torch.utils.data import DataLoader
3 | import torch
4 |
5 | from loaders.custom_dataset import CustomDataset
6 | from models.DeepLob.deeplob import DeepLOB
7 | from models.iTransformer.itransformer import ITransformer
8 | from models.Transformer.transformer import Transformer
9 | from models.LobTransformer.lobtransformer import LobTransformer
10 | from models.DLA.DLA import DLA
11 | from models.CNN1.cnn1 import CNN1
12 | from models.CNN2.cnn2 import CNN2
13 | from models.AxialLob.axiallob import AxialLOB
14 | from models.TABL.bin_tabl import BiN_BTABL, BiN_CTABL
15 | from models.CompleteHCNN.complete_hcnn import Complete_HCNN
16 | from optimizers.lightning_batch_gd import BatchGDManager
17 | from loggers import logger
18 | from utils import create_tree, get_training_test_stocks_as_string
19 |
20 |
21 | class Executor:
22 | def __init__(self, experiment_id, general_hyperparameters, model_hyperparameters, torch_dataset_preparation=False, torch_dataset_preparation_backtest=False):
23 | self.manager = None
24 | self.model = None
25 | self.experiment_id = experiment_id
26 | self.torch_dataset_preparation = torch_dataset_preparation
27 | self.torch_dataset_preparation_backtest = torch_dataset_preparation_backtest
28 |
29 | self.training_stocks_string, self.test_stocks_string = get_training_test_stocks_as_string(general_hyperparameters)
30 |
31 | if self.torch_dataset_preparation:
32 | create_tree(f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/")
33 |
34 | if general_hyperparameters["model"] == "deeplob":
35 | self.model = DeepLOB(lighten=model_hyperparameters["lighten"])
36 | elif general_hyperparameters["model"] == "transformer":
37 | self.model = Transformer(lighten=model_hyperparameters["lighten"])
38 | elif general_hyperparameters["model"] == "itransformer":
39 | self.model = ITransformer(lighten=model_hyperparameters["lighten"])
40 | elif general_hyperparameters["model"] == "lobtransformer":
41 | self.model = LobTransformer(lighten=model_hyperparameters["lighten"])
42 | elif general_hyperparameters["model"] == "dla":
43 | self.model = DLA(lighten=model_hyperparameters["lighten"])
44 | elif general_hyperparameters["model"] == "cnn1":
45 | self.model = CNN1()
46 | elif general_hyperparameters["model"] == "cnn2":
47 | self.model = CNN2()
48 | elif general_hyperparameters["model"] == "binbtabl":
49 | self.model = BiN_BTABL(120, 40, 100, 5, 3, 1)
50 | elif general_hyperparameters["model"] == "binctabl":
51 | self.model = BiN_CTABL(120, 40, 100, 5, 120, 5, 3, 1)
52 | elif general_hyperparameters["model"] == "axiallob":
53 | self.model = AxialLOB()
54 | elif general_hyperparameters["model"] == "hlob":
55 | homological_structures = torch.load(f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/complete_homological_structures.pt")
56 | self.model = Complete_HCNN(lighten=model_hyperparameters["lighten"], homological_structures=homological_structures)
57 |
58 | if self.torch_dataset_preparation:
59 | # Prepare the training dataloader.
60 | dataset = CustomDataset(
61 | dataset=general_hyperparameters["dataset"],
62 | learning_stage="training",
63 | window_size=model_hyperparameters["history_length"],
64 | shuffling_seed=model_hyperparameters["shuffling_seed"],
65 | cache_size=1,
66 | lighten=model_hyperparameters["lighten"],
67 | threshold=model_hyperparameters["threshold"],
68 | all_horizons=general_hyperparameters["horizons"],
69 | prediction_horizon=model_hyperparameters["prediction_horizon"],
70 | targets_type=general_hyperparameters["targets_type"],
71 | balanced_dataloader=model_hyperparameters["balanced_sampling"],
72 | training_stocks=general_hyperparameters["training_stocks"],
73 | validation_stocks=general_hyperparameters["target_stocks"],
74 | target_stocks=general_hyperparameters["target_stocks"]
75 | )
76 | torch.save(dataset, f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/training_dataset.pt")
77 | elif self.torch_dataset_preparation is False and self.torch_dataset_preparation_backtest is False:
78 | dataset = torch.load(f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/training_dataset.pt")
79 | self.train_loader = DataLoader(
80 | dataset,
81 | batch_size=model_hyperparameters["batch_size"],
82 | shuffle=False,
83 | num_workers=model_hyperparameters["num_workers"],
84 | sampler=dataset.glob_indices,
85 | )
86 |
87 | if self.torch_dataset_preparation:
88 | # Prepare the validation dataloader.
89 | dataset = CustomDataset(
90 | dataset=general_hyperparameters["dataset"],
91 | learning_stage="validation",
92 | window_size=model_hyperparameters["history_length"],
93 | shuffling_seed=model_hyperparameters["shuffling_seed"],
94 | cache_size=1,
95 | lighten=model_hyperparameters["lighten"],
96 | threshold=model_hyperparameters["threshold"],
97 | all_horizons=general_hyperparameters["horizons"],
98 | targets_type=general_hyperparameters["targets_type"],
99 | prediction_horizon=model_hyperparameters["prediction_horizon"],
100 | training_stocks=general_hyperparameters["training_stocks"],
101 | validation_stocks=general_hyperparameters["target_stocks"],
102 | target_stocks=general_hyperparameters["target_stocks"]
103 | )
104 |
105 | torch.save(dataset, f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/validation_dataset.pt")
106 | elif self.torch_dataset_preparation is False and self.torch_dataset_preparation_backtest is False:
107 | dataset = torch.load(f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/validation_dataset.pt")
108 | self.val_loader = DataLoader(
109 | dataset,
110 | batch_size=model_hyperparameters["batch_size"],
111 | shuffle=False,
112 | num_workers=model_hyperparameters["num_workers"],
113 | )
114 |
115 | if self.torch_dataset_preparation is False and self.torch_dataset_preparation_backtest:
116 | dataset = CustomDataset(
117 | dataset=general_hyperparameters["dataset"],
118 | learning_stage="test",
119 | window_size=model_hyperparameters["history_length"],
120 | shuffling_seed=model_hyperparameters["shuffling_seed"],
121 | cache_size=1,
122 | lighten=model_hyperparameters["lighten"],
123 | threshold=model_hyperparameters["threshold"],
124 | all_horizons=general_hyperparameters["horizons"],
125 | targets_type=general_hyperparameters["targets_type"],
126 | prediction_horizon=model_hyperparameters["prediction_horizon"],
127 | backtest=True,
128 | training_stocks=general_hyperparameters["training_stocks"],
129 | validation_stocks=general_hyperparameters["target_stocks"],
130 | target_stocks=general_hyperparameters["target_stocks"]
131 | )
132 | torch.save(dataset, f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/test_dataset_backtest.pt")
133 | elif self.torch_dataset_preparation and self.torch_dataset_preparation_backtest is False:
134 | dataset = CustomDataset(
135 | dataset=general_hyperparameters["dataset"],
136 | learning_stage="test",
137 | window_size=model_hyperparameters["history_length"],
138 | shuffling_seed=model_hyperparameters["shuffling_seed"],
139 | cache_size=1,
140 | lighten=model_hyperparameters["lighten"],
141 | threshold=model_hyperparameters["threshold"],
142 | all_horizons=general_hyperparameters["horizons"],
143 | targets_type=general_hyperparameters["targets_type"],
144 | prediction_horizon=model_hyperparameters["prediction_horizon"],
145 | training_stocks=general_hyperparameters["training_stocks"],
146 | validation_stocks=general_hyperparameters["target_stocks"],
147 | target_stocks=general_hyperparameters["target_stocks"]
148 | )
149 | torch.save(dataset, f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/test_dataset.pt")
150 | elif self.torch_dataset_preparation is False and self.torch_dataset_preparation_backtest is False:
151 | dataset = torch.load(f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{self.training_stocks_string}_test_{self.test_stocks_string}/{model_hyperparameters['prediction_horizon']}/test_dataset.pt")
152 | self.test_loader = DataLoader(
153 | dataset,
154 | batch_size=model_hyperparameters["batch_size"],
155 | shuffle=False,
156 | num_workers=model_hyperparameters["num_workers"],
157 | )
158 |
159 | if self.torch_dataset_preparation is False and self.torch_dataset_preparation_backtest is False:
160 | self.manager = BatchGDManager(
161 | experiment_id=experiment_id,
162 | model=self.model,
163 | train_loader=self.train_loader,
164 | val_loader=self.val_loader,
165 | test_loader=self.test_loader,
166 | epochs=model_hyperparameters["epochs"],
167 | learning_rate=model_hyperparameters["learning_rate"],
168 | patience=model_hyperparameters["patience"],
169 | general_hyperparameters=general_hyperparameters,
170 | model_hyperparameters=model_hyperparameters,
171 | )
172 |
173 | def execute_training(self):
174 | self.manager.train()
175 |
176 | def execute_testing(self):
177 | self.manager.test()
178 |
179 | def logger_clean_up(self):
180 | folder_path = f"{logger.find_save_path(self.experiment_id)}/wandb/"
181 | try:
182 | shutil.rmtree(folder_path)
183 |         except OSError:
184 | pass
185 |
186 |
187 |
--------------------------------------------------------------------------------
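
For reference, a hypothetical pair of hyperparameter dictionaries covering the keys Executor reads above; the values are illustrative only, and in practice they come from the YAML configuration loaded in main.py.

general_hyperparameters = {
    "model": "deeplob",
    "dataset": "nasdaq",
    "horizons": [10, 50, 100],
    "targets_type": "raw",              # illustrative value
    "training_stocks": ["CSCO"],
    "target_stocks": ["CSCO"],
}
model_hyperparameters = {
    "lighten": True,
    "threshold": 0.002,
    "batch_size": 32,
    "prediction_horizon": 10,
    "history_length": 100,
    "shuffling_seed": 42,
    "balanced_sampling": True,
    "num_workers": 4,
    "epochs": 20,
    "learning_rate": 1e-4,
    "patience": 5,
}

# First pass: cache the torch datasets; later passes load them and train/test.
# executor = Executor("experiment_000", general_hyperparameters, model_hyperparameters,
#                     torch_dataset_preparation=True)

Note that with torch_dataset_preparation=True the constructor only builds and saves the datasets; the dataloaders and the BatchGDManager are created only when both preparation flags are False, so training requires a second instantiation in that mode.
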
/optimizers/lightning_batch_gd.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import os
3 | import pickle
4 | import wandb
5 | import shutil
6 | import stat
7 |
8 | import lightning.pytorch as pl
9 | import numpy as np
10 | import torch
11 | from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
12 | from torch import nn, optim
13 | from torchmetrics import Accuracy, F1Score
14 | from lightning.pytorch.loggers import WandbLogger
15 |
16 | from loggers import logger
17 | from utils import get_best_levels_prices_and_labels, wandb_hyperparameters_saving
18 | import sys
19 |
20 | class LOBLightningModule(pl.LightningModule):
21 | def __init__(
22 | self,
23 | model,
24 | experiment_id,
25 | learning_rate,
26 | general_hyperparameters,
27 | model_hyperparameters,
28 | ):
29 | super().__init__()
30 | self.model = model
31 | self.experiment_id = experiment_id
32 | self.learning_rate = learning_rate
33 | self.general_hyperparameters = general_hyperparameters
34 | self.model_hyperparameters = model_hyperparameters
35 |
36 | self.loss = nn.CrossEntropyLoss()
37 |
38 | self.training_accuracy = Accuracy(task="multiclass", num_classes=3)
39 | self.training_f1 = F1Score(task="multiclass", num_classes=3, average="macro")
40 | self.validation_accuracy = Accuracy(task="multiclass", num_classes=3)
41 | self.validation_f1 = F1Score(task="multiclass", num_classes=3, average="macro")
42 |
43 | self.batch_loss_training = []
44 | self.batch_accuracies_training = []
45 | self.batch_f1_scores_training = []
46 | self.batch_loss_validation = []
47 | self.batch_accuracies_validation = []
48 | self.batch_f1_scores_validation = []
49 | self.batch_loss_test = []
50 | self.test_outputs = []
51 | self.test_targets = []
52 | self.test_probs = []
53 |
54 | self.csv_path = f"{logger.find_save_path(experiment_id)}/metrics.csv"
55 |
56 | def forward(self, x):
57 | return self.model(x)
58 |
59 | def training_step(self, batch, batch_idx):
60 | inputs, targets = batch
61 | logits = self.model(inputs)
62 | loss = self.loss(logits, targets)
63 | outputs = nn.functional.softmax(logits, dim=1)
64 | outputs = torch.argmax(outputs, dim=1)
65 | train_acc = self.training_accuracy(outputs, targets)
66 | train_f1 = self.training_f1(outputs, targets)
67 |
68 | self.batch_loss_training.append(loss.item())
69 | self.batch_accuracies_training.append(train_acc.item())
70 | self.batch_f1_scores_training.append(train_f1.item())
71 |
72 | return loss
73 |
74 | def validation_step(self, batch, batch_idx):
75 | inputs, targets = batch
76 | logits = self.model(inputs)
77 | loss = self.loss(logits, targets)
78 | outputs = nn.functional.softmax(logits, dim=1)
79 | outputs = torch.argmax(outputs, dim=1)
80 | val_acc = self.validation_accuracy(outputs, targets)
81 | val_f1 = self.validation_f1(outputs, targets)
82 |
83 | self.batch_loss_validation.append(loss.item())
84 | self.batch_accuracies_validation.append(val_acc.item())
85 | self.batch_f1_scores_validation.append(val_f1.item())
86 |
87 | return loss
88 |
89 | def test_step(self, batch, batch_idx):
90 | inputs, targets = batch
91 | logits = self.model(inputs)
92 | loss = self.loss(logits, targets)
93 | outputs = nn.functional.softmax(logits, dim=1)
94 |
95 | saving_probs = copy.copy(outputs)
96 | self.test_probs.extend(saving_probs.tolist())
97 |
98 | outputs = torch.argmax(outputs, dim=1).tolist()
99 | self.test_outputs.extend(outputs)
100 | self.test_targets.extend(targets.tolist())
101 |
102 | return loss
103 |
104 | def configure_optimizers(self):
105 | optimizer = optim.AdamW(
106 | self.model.parameters(),
107 | lr=self.model_hyperparameters["learning_rate"],
108 | betas=(0.9, 0.95),
109 | weight_decay=1e-1,
110 | )
111 | return optimizer
112 |
113 | def on_validation_epoch_end(self):
114 | # Calculate the average accuracy, F1, and MCC for the epoch (assuming you have them stored in a list).
115 | avg_loss_training = np.mean(
116 | self.batch_loss_training
117 | ) # Average of the batch-level losses (training).
118 | avg_accuracy_training = np.mean(
119 | self.batch_accuracies_training
120 | ) # Average of the batch-level accuracies (training).
121 | avg_f1_score_training = np.mean(
122 | self.batch_f1_scores_training
123 | ) # Average of the batch-level F1 scores (training).
124 |
125 | avg_loss_validation = np.mean(
126 | self.batch_loss_validation
127 | ) # Replace with your batch-level loss list (validation).
128 | avg_accuracy_validation = np.mean(
129 | self.batch_accuracies_validation
130 | ) # Replace with your batch-level accuracy list (validation).
131 | avg_f1_score_validation = np.mean(
132 | self.batch_f1_scores_validation
133 | ) # Replace with your batch-level F1 score list (validation).
134 |
135 | self.log(
136 | "loss",
137 | torch.tensor(avg_loss_training),
138 | prog_bar=True,
139 | on_step=False,
140 | on_epoch=True,
141 | )
142 | self.log(
143 | "acc",
144 | torch.tensor(avg_accuracy_training),
145 | prog_bar=True,
146 | on_step=False,
147 | on_epoch=True,
148 | )
149 | self.log(
150 | "f1",
151 | torch.tensor(avg_f1_score_training),
152 | prog_bar=False,
153 | on_step=False,
154 | on_epoch=True,
155 | )
156 | self.log(
157 | "val_loss",
158 | torch.tensor(avg_loss_validation),
159 | prog_bar=True,
160 | on_step=False,
161 | on_epoch=True,
162 | )
163 | self.log(
164 | "val_acc",
165 | torch.tensor(avg_accuracy_validation),
166 | prog_bar=True,
167 | on_step=False,
168 | on_epoch=True,
169 | )
170 | self.log(
171 | "val_f1",
172 | torch.tensor(avg_f1_score_validation),
173 | prog_bar=False,
174 | on_step=False,
175 | on_epoch=True,
176 | )
177 |
178 | self.training_accuracy.reset()
179 | self.training_f1.reset()
180 | self.validation_accuracy.reset()
181 | self.validation_f1.reset()
182 |
183 | # Append metrics to the list
184 | metrics_data = [
185 | avg_loss_training,
186 | avg_accuracy_training,
187 | avg_f1_score_training,
188 | avg_loss_validation,
189 | avg_accuracy_validation,
190 | avg_f1_score_validation,
191 | ]
192 |
193 | # Save metrics to a CSV file
194 | if not os.path.exists(self.csv_path):
195 | header = [
196 | "Training_Loss",
197 | "Training_Accuracy",
198 | "Training_F1",
199 | "Validation_Loss",
200 | "Validation_Accuracy",
201 | "Validation_F1",
202 | ]
203 | with open(self.csv_path, "w") as file:
204 | file.write(",".join(header) + "\n")
205 |
206 | with open(self.csv_path, "a") as file:
207 | file.write(",".join(map(str, metrics_data)) + "\n")
208 |
209 | def on_test_end(self):
210 | best_levels_prices, sanity_check_labels = get_best_levels_prices_and_labels(
211 | dataset=self.general_hyperparameters["dataset"],
212 | target_stocks=self.general_hyperparameters["target_stocks"],
213 | history_length=self.model_hyperparameters["history_length"],
214 | prediction_horizon=self.model_hyperparameters["prediction_horizon"],
215 | all_horizons=self.general_hyperparameters["horizons"],
216 | threshold=self.model_hyperparameters["threshold"],
217 | )
218 | with open(
219 | os.path.join(logger.find_save_path(self.experiment_id), "prediction.pkl"),
220 | "wb",
221 | ) as f:
222 | pickle.dump(
223 | [
224 | best_levels_prices,
225 | sanity_check_labels,
226 | np.array(self.test_targets),
227 | np.array(self.test_outputs),
228 | np.array(self.test_probs),
229 | ],
230 | f,
231 | )
232 |
233 |
234 | class BatchGDManager:
235 | def __init__(
236 | self,
237 | experiment_id,
238 | model,
239 | train_loader,
240 | val_loader,
241 | test_loader,
242 | epochs,
243 | learning_rate,
244 | patience,
245 | general_hyperparameters,
246 | model_hyperparameters,
247 | ):
248 | self.experiment_id = experiment_id
249 | self.model = model
250 | self.train_loader = train_loader
251 | self.val_loader = val_loader
252 | self.test_loader = test_loader
253 | self.epochs = epochs
254 | self.learning_rate = learning_rate
255 | self.patience = patience
256 | self.general_hyperparameters = general_hyperparameters
257 | self.model_hyperparameters = model_hyperparameters
258 | self.lob_lightning_module = None
259 | self.trainer = None
260 | self.deleted_run = None
261 |
262 | def delete_run(self):
263 | api = wandb.Api()
264 |         project_path = "" # TODO: Specify here the name of the WandB project.
265 | runs = api.runs(path=project_path)
266 | print('Deleting runs...')
267 | while len(runs) < 1:
268 | runs = api.runs(path=project_path)
269 | for run in runs:
270 | input_list = run.metadata
271 | if input_list is not None:
272 | input_list = input_list['args']
273 | result_dict = {input_list[i][2:]: input_list[i + 1] for i in range(0, len(input_list), 2)}
274 | modified_dict = result_dict
275 | if modified_dict['model'] == str(self.general_hyperparameters['model']) and modified_dict['prediction_horizon'] == str(self.model_hyperparameters['prediction_horizon']) and modified_dict['training_stocks'] == str(self.general_hyperparameters['training_stocks'][0]) and modified_dict['target_stocks'] == str(self.general_hyperparameters['target_stocks'][0]):
276 | self.deleted_run = run.name
277 | run.delete()
278 |                 print(f"Run successfully deleted from WandB: {run.name}.")
279 |
280 | def train(self):
281 | self.lob_lightning_module = LOBLightningModule(
282 | self.model,
283 | experiment_id=self.experiment_id,
284 | learning_rate=self.learning_rate,
285 | general_hyperparameters=self.general_hyperparameters,
286 | model_hyperparameters=self.model_hyperparameters,
287 | )
288 |
289 | checkpoint_callback = ModelCheckpoint(
290 | monitor="val_loss",
291 | dirpath=logger.find_save_path(self.experiment_id),
292 | filename="best_val_model",
293 | save_top_k=1,
294 | mode="min",
295 | )
296 | early_stopping_callback = EarlyStopping("val_loss", patience=self.patience, min_delta=0.003)
297 |
298 | os.environ["WANDB_API_KEY"] = "" # TODO: Insert API key
299 | os.environ["WANDB__SERVICE_WAIT"] = "300"
300 | try:
301 | wandb_logger = WandbLogger(
302 | project="Limit_Order_Book",
303 | name=self.experiment_id,
304 | save_dir=logger.find_save_path(self.experiment_id),
305 | )
306 | wandb_hyperparameters_saving(
307 | wandb_logger=wandb_logger,
308 | general_hyperparameters=self.general_hyperparameters,
309 | model_hyperparameters=self.model_hyperparameters,
310 | )
311 | self.trainer = pl.Trainer(
312 | max_epochs=self.epochs,
313 | callbacks=[checkpoint_callback, early_stopping_callback],
314 | logger=wandb_logger,
315 | num_sanity_val_steps=0,
316 | )
317 | self.trainer.fit(self.lob_lightning_module, self.train_loader, self.val_loader)
318 | wandb.finish()
319 |         except Exception:
320 | root_path = sys.path[0]
321 | dir_path = f"{root_path}/loggers/results/{self.experiment_id}"
322 | if os.path.exists(dir_path):
323 | shutil.rmtree(dir_path)
324 | print(f"Folder {self.experiment_id} deleted successfully.")
325 | else:
326 | print(f"Unable to delete folder {self.experiment_id}.")
327 |
328 | self.delete_run()
329 |
330 | model = self.general_hyperparameters['model']
331 | horizon = self.model_hyperparameters['prediction_horizon']
332 | training_stocks = self.general_hyperparameters['training_stocks']
333 | target_stocks = self.general_hyperparameters['target_stocks']
334 | errors_string = f"{model} {horizon} {training_stocks} {target_stocks} {self.deleted_run}\n"
335 |             with open("errors.txt", 'a+') as file:
336 |                 file.seek(0)
337 |                 content = file.read()
338 |                 # If the string does not exist in the file, append it.
339 |                 if errors_string.strip() not in content:
340 |                     # After read(), the cursor is already at the end of the file, so write() appends.
341 |                     file.write(errors_string)
342 |                     print("String appended successfully.")
343 |                 else:
344 |                     print("String already exists in the file.")
345 | #raise Exception
346 |
347 | def test(self):
348 | if self.trainer is None:
349 | self.lob_lightning_module = LOBLightningModule(
350 | self.model,
351 | experiment_id=self.experiment_id,
352 | learning_rate=self.learning_rate,
353 | general_hyperparameters=self.general_hyperparameters,
354 | model_hyperparameters=self.model_hyperparameters,
355 | )
356 | self.trainer = pl.Trainer()
357 | try:
358 | best_model = self.lob_lightning_module.load_from_checkpoint(
359 | checkpoint_path=f"{logger.find_save_path(self.experiment_id)}/best_val_model.ckpt",
360 | model=self.model,
361 | experiment_id=self.experiment_id,
362 | learning_rate=self.learning_rate,
363 | general_hyperparameters=self.general_hyperparameters,
364 | model_hyperparameters=self.model_hyperparameters,
365 | )
366 |             except Exception:
367 | best_model = self.lob_lightning_module.load_from_checkpoint(
368 | checkpoint_path=f"{logger.find_save_path(self.experiment_id)}/best_val_model.ckpt",
369 | map_location=torch.device('cpu'),
370 | model=self.model,
371 | experiment_id=self.experiment_id,
372 | learning_rate=self.learning_rate,
373 | general_hyperparameters=self.general_hyperparameters,
374 | model_hyperparameters=self.model_hyperparameters,
375 | )
376 | self.trainer.test(best_model, dataloaders=self.test_loader)
377 | else:
378 | best_model_path = (
379 | f"{logger.find_save_path(self.experiment_id)}/best_val_model.ckpt"
380 | )
381 | self.trainer.test(ckpt_path=best_model_path, dataloaders=self.test_loader)
382 |
--------------------------------------------------------------------------------
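
on_validation_epoch_end appends one row per epoch to metrics.csv with the header shown above. Below is a small optional helper (not part of the repository) for inspecting those curves after a run; experiment_dir stands for whatever logger.find_save_path(experiment_id) returns.

import pandas as pd
import matplotlib.pyplot as plt


def plot_metrics(experiment_dir: str) -> None:
    # Column names match the header written in on_validation_epoch_end.
    metrics = pd.read_csv(f"{experiment_dir}/metrics.csv")
    metrics[["Training_Loss", "Validation_Loss"]].plot(xlabel="epoch", ylabel="cross-entropy loss")
    plt.show()
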
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.8.5
2 | aiosignal==1.3.1
3 | annotated-types==0.5.0
4 | anyio==3.7.1
5 | appdirs==1.4.4
6 | arrow==1.2.3
7 | async-timeout==4.0.3
8 | attrs==23.1.0
9 | backoff==2.2.1
10 | beautifulsoup4==4.12.2
11 | blessed==1.20.0
12 | certifi==2023.7.22
13 | charset-normalizer==3.2.0
14 | click==8.1.7
15 | cmake==3.25.0
16 | contourpy==1.1.1
17 | croniter==1.4.1
18 | cycler==0.11.0
19 | dateutils==0.6.12
20 | deepdiff==6.5.0
21 | docker-pycreds==0.4.0
22 | exceptiongroup==1.1.3
23 | fast-tmfg==0.0.8
24 | fastapi==0.103.1
25 | filelock==3.9.0
26 | fonttools==4.42.1
27 | frozenlist==1.4.0
28 | fsspec==2023.9.2
29 | gitdb==4.0.10
30 | GitPython==3.1.37
31 | h11==0.14.0
32 | idna==3.4
33 | importlib-resources==6.1.0
34 | inquirer==3.1.3
35 | itsdangerous==2.1.2
36 | Jinja2==3.1.2
37 | joblib==1.3.2
38 | kiwisolver==1.4.5
39 | lightning==2.0.9
40 | lightning-cloud==0.5.38
41 | lightning-utilities==0.9.0
42 | lit==15.0.7
43 | markdown-it-py==3.0.0
44 | MarkupSafe==2.1.2
45 | matplotlib==3.8.0
46 | mdurl==0.1.2
47 | mpmath==1.3.0
48 | multidict==6.0.4
49 | networkx==3.0
50 | numpy==1.26.0
51 | ordered-set==4.1.0
52 | packaging==23.1
53 | pandas==2.1.1
54 | pathtools==0.1.2
55 | Pillow==10.0.1
56 | polars==0.19.5
57 | protobuf==4.24.3
58 | psutil==5.9.5
59 | pyarrow==13.0.0
60 | pydantic==2.1.1
61 | pydantic_core==2.4.0
62 | Pygments==2.16.1
63 | PyJWT==2.8.0
64 | pyparsing==3.1.1
65 | python-dateutil==2.8.2
66 | python-editor==1.0.4
67 | python-multipart==0.0.6
68 | pytorch-lightning==2.0.9
69 | pytz==2023.3.post1
70 | PyYAML==6.0.1
71 | readchar==4.0.5
72 | requests==2.31.0
73 | rich==13.5.3
74 | scikit-learn==1.3.1
75 | scipy==1.11.3
76 | seaborn==0.13.2
77 | sentry-sdk==1.31.0
78 | setproctitle==1.3.2
79 | six==1.16.0
80 | smmap==5.0.1
81 | sniffio==1.3.0
82 | soupsieve==2.5
83 | starlette==0.27.0
84 | starsessions==1.3.0
85 | sympy==1.12
86 | threadpoolctl==3.2.0
87 | torch==2.0.0+cu118
88 | torchinfo==1.8.0
89 | torchmetrics==1.2.0
90 | tqdm==4.66.1
91 | traitlets==5.10.1
92 | triton==2.0.0
93 | typing_extensions==4.8.0
94 | tzdata==2023.3
95 | urllib3==1.26.16
96 | uvicorn==0.23.2
97 | wandb==0.15.11
98 | wcwidth==0.2.6
99 | websocket-client==1.6.3
100 | websockets==11.0.3
101 | yarl==1.9.2
102 | zipp==3.17.0
103 |
--------------------------------------------------------------------------------
/requirements_mac_os.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.8.5
2 | aiosignal==1.3.1
3 | annotated-types==0.5.0
4 | anyio==3.7.1
5 | appdirs==1.4.4
6 | arrow==1.2.3
7 | async-timeout==4.0.3
8 | attrs==23.1.0
9 | backoff==2.2.1
10 | beautifulsoup4==4.12.2
11 | blessed==1.20.0
12 | certifi==2023.7.22
13 | charset-normalizer==3.2.0
14 | click==8.1.7
15 | cmake==3.25.0
16 | contourpy==1.1.1
17 | croniter==1.4.1
18 | cycler==0.11.0
19 | dateutils==0.6.12
20 | deepdiff==6.5.0
21 | docker-pycreds==0.4.0
22 | exceptiongroup==1.1.3
23 | fast-tmfg==0.0.8
24 | fastapi==0.103.1
25 | filelock==3.9.0
26 | fonttools==4.42.1
27 | frozenlist==1.4.0
28 | fsspec==2023.9.2
29 | gitdb==4.0.10
30 | GitPython==3.1.37
31 | h11==0.14.0
32 | idna==3.4
33 | importlib-resources==6.1.0
34 | inquirer==3.1.3
35 | itsdangerous==2.1.2
36 | Jinja2==3.1.2
37 | joblib==1.3.2
38 | kiwisolver==1.4.5
39 | lightning==2.0.9
40 | lightning-cloud==0.5.38
41 | lightning-utilities==0.9.0
42 | lit==15.0.7
43 | markdown-it-py==3.0.0
44 | MarkupSafe==2.1.2
45 | matplotlib==3.8.0
46 | mdurl==0.1.2
47 | mpmath==1.3.0
48 | multidict==6.0.4
49 | networkx==3.0
50 | numpy==1.26.0
51 | ordered-set==4.1.0
52 | packaging==23.1
53 | pandas==2.1.1
54 | pathtools==0.1.2
55 | Pillow==10.0.1
56 | polars==0.19.5
57 | protobuf==4.24.3
58 | psutil==5.9.5
59 | pyarrow==13.0.0
60 | pydantic==2.1.1
61 | pydantic_core==2.4.0
62 | Pygments==2.16.1
63 | PyJWT==2.8.0
64 | pyparsing==3.1.1
65 | python-dateutil==2.8.2
66 | python-editor==1.0.4
67 | python-multipart==0.0.6
68 | pytorch-lightning==2.0.9
69 | pytz==2023.3.post1
70 | PyYAML==6.0.1
71 | readchar==4.0.5
72 | requests==2.31.0
73 | rich==13.5.3
74 | scikit-learn==1.3.1
75 | scipy==1.11.3
76 | seaborn==0.13.2
77 | sentry-sdk==1.31.0
78 | setproctitle==1.3.2
79 | six==1.16.0
80 | smmap==5.0.1
81 | sniffio==1.3.0
82 | soupsieve==2.5
83 | starlette==0.27.0
84 | starsessions==1.3.0
85 | sympy==1.12
86 | threadpoolctl==3.2.0
87 | torch==2.0.0
88 | torchinfo==1.8.0
89 | torchmetrics==1.2.0
90 | tqdm==4.66.1
91 | traitlets==5.10.1
92 | typing_extensions==4.8.0
93 | tzdata==2023.3
94 | urllib3==1.26.16
95 | uvicorn==0.23.2
96 | wandb==0.15.11
97 | wcwidth==0.2.6
98 | websocket-client==1.6.3
99 | websockets==11.0.3
100 | yarl==1.9.2
101 | zipp==3.17.0
102 |
--------------------------------------------------------------------------------
/simulator/market_sim.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from tqdm import tqdm
4 | from loggers import logger
5 | from simulator.trading_agent import Trading
6 | from utils import load_yaml
7 | from datetime import timedelta
8 |
9 |
10 | def __get_data__(experiment_id):
11 | path = f"{logger.find_save_path(experiment_id)}/prediction.pkl"
12 | all_prices, sanity_check_labels, all_targets, all_predictions, all_probs = pd.read_pickle(path)
13 | all_prices.reset_index(drop=True, inplace=True)
14 | return all_prices, sanity_check_labels, all_targets.tolist(), all_predictions.tolist(), all_probs
15 |
16 |
17 | def backtest(experiment_id, trading_hyperparameters):
18 | prices, sanity_check_labels, targets, predictions, probs = __get_data__(experiment_id)
19 | TradingAgent = Trading(trading_hyperparameters)
20 |
21 | prices["Mid"] = (prices["BIDp1"] + prices["ASKp1"]) / 2
22 | prices["seconds"] = pd.to_datetime(prices["seconds"])
23 |
24 |     prices['Predictions'] = predictions
25 |     prices.reset_index(drop=True, inplace=True)
26 |     indices_to_delete = prices[prices['Predictions'] == 1].index
27 |     prices = prices.drop(indices_to_delete)
28 |     mask = (prices['Predictions'] != prices['Predictions'].shift()) | (prices.index == 0)
29 |     prices = prices[mask]
30 |     # Keep the class probabilities aligned with the rows retained by the filtering above.
31 |     probs = np.asarray(probs)[prices.index]
32 |     predictions = prices['Predictions'].tolist()
33 |     prices = prices.drop(columns=['Predictions']).reset_index(drop=True)
34 |
35 | dates = prices['seconds'].dt.date
36 | day_changed_indices = dates.ne(dates.shift())
37 | new_day_indices = day_changed_indices.index[day_changed_indices].tolist()
38 | end_of_day_indices = [element - 1 for element in new_day_indices]
39 | end_of_day_indices.append(len(prices) - 1)
40 | end_of_day_indices = end_of_day_indices[1:]
41 |
42 | for i in tqdm(range(len(predictions))):
43 | mid_price = prices.at[i, "Mid"]
44 | best_bid_price = prices.at[i, "BIDp1"]
45 | best_ask_price = prices.at[i, "ASKp1"]
46 | timestamp = prices.at[i, "seconds"]
47 | prediction = predictions[i]
48 | probability = np.max(probs[i])
49 |
50 | if trading_hyperparameters['mid_side_trading'] == 'mid_to_mid':
51 | if i in end_of_day_indices:
52 | if TradingAgent.long_inventory > 0:
53 | TradingAgent.exit_long(mid_price, timestamp)
54 | if TradingAgent.short_inventory > 0:
55 | TradingAgent.exit_short(mid_price, timestamp)
56 | else:
57 | if prediction == 2 and probability >= trading_hyperparameters['probability_threshold']:
58 | if TradingAgent.long_inventory == 0 and TradingAgent.short_inventory == 0:
59 | TradingAgent.long(mid_price, timestamp)
60 | elif TradingAgent.long_inventory == 0 and TradingAgent.short_inventory > 0:
61 | TradingAgent.exit_short(mid_price, timestamp)
62 | TradingAgent.long(mid_price, timestamp)
63 | elif prediction == 0 and probability >= trading_hyperparameters['probability_threshold']:
64 | if TradingAgent.long_inventory == 0 and TradingAgent.short_inventory == 0:
65 | TradingAgent.short(mid_price, timestamp)
66 | elif TradingAgent.short_inventory == 0 and TradingAgent.long_inventory > 0:
67 | TradingAgent.exit_long(mid_price, timestamp)
68 | TradingAgent.short(mid_price, timestamp)
69 | elif trading_hyperparameters['mid_side_trading'] == 'side_market_orders':
70 | if i in end_of_day_indices:
71 | if TradingAgent.long_inventory > 0:
72 | TradingAgent.exit_long(best_bid_price, timestamp)
73 | if TradingAgent.short_inventory > 0:
74 | TradingAgent.exit_short(best_ask_price, timestamp)
75 | else:
76 | if prediction == 2 and probability >= trading_hyperparameters['probability_threshold']:
77 | if TradingAgent.long_inventory == 0 and TradingAgent.short_inventory == 0:
78 | TradingAgent.long(best_ask_price, timestamp)
79 | elif TradingAgent.long_inventory == 0 and TradingAgent.short_inventory > 0:
80 | TradingAgent.exit_short(best_ask_price, timestamp)
81 | TradingAgent.long(best_ask_price, timestamp)
82 | elif prediction == 0 and probability >= trading_hyperparameters['probability_threshold']:
83 | if TradingAgent.long_inventory == 0 and TradingAgent.short_inventory == 0:
84 | TradingAgent.short(best_bid_price, timestamp)
85 | elif TradingAgent.short_inventory == 0 and TradingAgent.long_inventory > 0:
86 | TradingAgent.exit_long(best_bid_price, timestamp)
87 | TradingAgent.short(best_bid_price, timestamp)
88 | elif trading_hyperparameters['mid_side_trading'] == 'side_limit_orders':
89 | if i in end_of_day_indices:
90 | if TradingAgent.long_inventory > 0:
91 | TradingAgent.exit_long(best_ask_price, timestamp)
92 | if TradingAgent.short_inventory > 0:
93 | TradingAgent.exit_short(best_bid_price, timestamp)
94 | else:
95 | if prediction == 2 and probability >= trading_hyperparameters['probability_threshold']:
96 | if TradingAgent.long_inventory == 0 and TradingAgent.short_inventory == 0:
97 | TradingAgent.long(best_bid_price, timestamp)
98 | elif TradingAgent.long_inventory == 0 and TradingAgent.short_inventory > 0:
99 | TradingAgent.exit_short(best_bid_price, timestamp)
100 | TradingAgent.long(best_bid_price, timestamp)
101 | elif prediction == 0 and probability >= trading_hyperparameters['probability_threshold']:
102 | if TradingAgent.long_inventory == 0 and TradingAgent.short_inventory == 0:
103 | TradingAgent.short(best_ask_price, timestamp)
104 | elif TradingAgent.short_inventory == 0 and TradingAgent.long_inventory > 0:
105 | TradingAgent.exit_long(best_ask_price, timestamp)
106 | TradingAgent.short(best_ask_price, timestamp)
107 |
108 | trading_history_dataframe = pd.DataFrame(TradingAgent.trading_history)
109 | save_path = f"{logger.find_save_path(experiment_id)}/trading_simulation.pkl"
110 | trading_history_dataframe.to_pickle(save_path)
111 |
--------------------------------------------------------------------------------
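
backtest() expects a trading_hyperparameters dictionary; the keys below are the ones read in this file and in simulator/post_trading_analysis.py, with illustrative values (the real ones come from the project's YAML configuration).

trading_hyperparameters = {
    "mid_side_trading": "mid_to_mid",   # or "side_market_orders" / "side_limit_orders"
    "probability_threshold": 0.65,      # minimum softmax confidence required to act on a signal
    "trading_fee": 0.0001,              # per-leg proportional fee (used by post_trading_analysis)
    "simulation_type": "with_fees",     # or "no_fees": selects which P&L curve is plotted
}
# backtest(experiment_id, trading_hyperparameters)
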
/simulator/post_trading_analysis.py:
--------------------------------------------------------------------------------
1 | from itertools import cycle
2 |
3 | import matplotlib.pyplot as plt
4 | import pandas as pd
5 | import numpy as np
6 | from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, matthews_corrcoef, roc_auc_score, RocCurveDisplay, top_k_accuracy_score
7 | from sklearn.preprocessing import LabelBinarizer
8 | import torch
9 | from torch.utils.data import DataLoader
10 | from tqdm import tqdm
11 |
12 | from loggers import logger
13 | from utils import get_training_test_stocks_as_string
14 |
15 |
16 | def __get_fees_free_pnl__(trading_simulation):
17 | df = trading_simulation
18 | profit_list = []
19 | for index, row in df.iterrows():
20 | profit_no_fees = 0
21 | if row.Type == 'Long':
22 | local_profit = (row.Price_Exit_Long - row.Price_Entry_Long)
23 | profit_no_fees += local_profit
24 | elif row.Type == 'Short':
25 | local_profit = (row.Price_Entry_Short - row.Price_Exit_Short)
26 | profit_no_fees += local_profit
27 |
28 | profit_list.append(profit_no_fees)
29 | return profit_list
30 |
31 |
32 | def __get_pnl_with_fees__(trading_simulation, trading_hyperparameters):
33 | df = trading_simulation
34 | profit_list = []
35 | for index, row in df.iterrows():
36 |         profit_with_fees = 0
37 |         if row.Type == 'Long':
38 |             local_profit = (row.Price_Exit_Long - row.Price_Entry_Long) - (row.Price_Exit_Long * trading_hyperparameters['trading_fee']) - (row.Price_Entry_Long * trading_hyperparameters['trading_fee'])
39 |             profit_with_fees += local_profit
40 |         elif row.Type == 'Short':
41 |             local_profit = (row.Price_Entry_Short - row.Price_Exit_Short) - (row.Price_Entry_Short * trading_hyperparameters['trading_fee']) - (row.Price_Exit_Short * trading_hyperparameters['trading_fee'])
42 |             profit_with_fees += local_profit
43 | 
44 |         profit_list.append(profit_with_fees)
45 | return profit_list
46 |
47 |
48 | def __get_long_short_indices__(trading_simulation):
49 | long_indices = []
50 | short_indices = []
51 | for index, row in trading_simulation.iterrows():
52 | if row.Type == 'Long':
53 | long_indices.append(pd.to_datetime(row.Entry_Long))
54 | elif row.Type == 'Short':
55 | short_indices.append(pd.to_datetime(row.Entry_Short))
56 |
57 | return long_indices, short_indices
58 |
59 |
60 | def post_trading_analysis(experiment_id, general_hyperparameters, trading_hyperparameters, model_hyperparameters):
61 | prediction = pd.read_pickle(f"{logger.find_save_path(experiment_id)}/prediction.pkl")
62 | trading_simulation = pd.read_pickle(f"{logger.find_save_path(experiment_id)}/trading_simulation.pkl")
63 |
64 | training_stocks_string, test_stocks_string = get_training_test_stocks_as_string(general_hyperparameters)
65 |
66 | dataset = torch.load(
67 | f"./torch_datasets/threshold_{model_hyperparameters['threshold']}/batch_size_{model_hyperparameters['batch_size']}/training_{training_stocks_string}_test_{test_stocks_string}/{model_hyperparameters['prediction_horizon']}/test_dataset_backtest.pt")
68 | print(f"Reading test (backtest version) dataset...")
69 | test_loader = DataLoader(
70 | dataset,
71 | batch_size=model_hyperparameters["batch_size"],
72 | shuffle=False,
73 | num_workers=model_hyperparameters["num_workers"],
74 | )
75 | returns_labels_list = []
76 | for data, labels in tqdm(test_loader):
77 | returns_labels_list.extend(labels.tolist())
78 |
79 | targets = prediction[2].tolist()
80 | predictions = prediction[3].tolist()
81 |
82 | print(classification_report(targets, predictions))
83 |
84 | distributions_dataset = pd.DataFrame({"Predictions": predictions, "PCs": returns_labels_list})
85 | distribution_label_0 = distributions_dataset[distributions_dataset['Predictions'] == 0].PCs
86 | distribution_label_1 = distributions_dataset[distributions_dataset['Predictions'] == 1].PCs
87 | distribution_label_2 = distributions_dataset[distributions_dataset['Predictions'] == 2].PCs
88 |
89 | plt.hist(distribution_label_0, label='Label 0', alpha=0.5, bins=10)
90 | plt.hist(distribution_label_1, label='Label 1', alpha=0.5, bins=10)
91 | plt.hist(distribution_label_2, label='Label 2', alpha=0.5, bins=10)
92 |
93 | plt.title("Predictions' distribution")
94 | plt.xlabel("PCs Values")
95 | plt.ylabel("Frequency")
96 | plt.legend(title="Labels")
97 | plt.show()
98 |
99 | label_binarizer = LabelBinarizer().fit(targets)
100 | y_onehot_test = label_binarizer.transform(targets)
101 | colors = cycle(["aqua", "darkorange", "cornflowerblue"])
102 | fig, ax = plt.subplots(figsize=(10, 8))
103 | for class_id, color in zip(range(0, 3), colors):
104 | RocCurveDisplay.from_predictions(
105 | y_onehot_test[:, class_id],
106 | prediction[-1][:, class_id],
107 | name=f"ROC curve for class: {class_id}",
108 | color=color,
109 | ax=ax,
110 | plot_chance_level=(class_id == 2),
111 | )
112 |
113 | plt.axis("square")
114 | plt.xlabel("False Positive Rate")
115 | plt.ylabel("True Positive Rate")
116 | plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
117 | plt.legend()
118 | plt.show()
119 |
120 | print(f"Matthews Correlation Coefficient: {round(matthews_corrcoef(targets, predictions), 2)}")
121 | print(f"Macro-average AUC-ROC (ovr): {round(roc_auc_score(targets, prediction[-1].tolist(), average='macro', multi_class='ovr'), 2)}")
122 | print(f"Macro-average AUC-ROC (ovo): {round(roc_auc_score(targets, prediction[-1].tolist(), average='macro', multi_class='ovo'), 2)}")
123 | print(f"Top-k (with k=2) Accuracy Score: {round(top_k_accuracy_score(targets, prediction[-1], k=2), 2)}")
124 |
125 | fig, axs = plt.subplots(2, 2, figsize=(10, 8))
126 | for ax in axs.flat:
127 | ax.set_yticklabels([])
128 | ax.set_yticks([])
129 | ax.set_xticklabels([])
130 | ax.set_xticks([])
131 |
132 | # Confusion matrix plot.
133 | cm = confusion_matrix(targets, predictions, labels=[0, 1, 2], normalize='true')
134 | disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1, 2])
135 | disp.plot(ax=axs[0, 0], cmap='Blues')
136 | axs[0, 0].set_title('Confusion Matrix')
137 |
138 | # P&L distribution plot.
139 | ax = fig.add_subplot(2, 2, 2)
140 | if trading_hyperparameters['simulation_type'] == 'no_fees':
141 | plt.hist(__get_fees_free_pnl__(trading_simulation), bins=30)
142 | elif trading_hyperparameters['simulation_type'] == 'with_fees':
143 | plt.hist(__get_pnl_with_fees__(trading_simulation, trading_hyperparameters), bins=30)
144 | axs[0, 1].set_title('P&L Distribution')
145 |
146 | # P&L cumsum plot.
147 | ax = fig.add_subplot(2, 2, 3)
148 | if trading_hyperparameters['simulation_type'] == 'no_fees':
149 | plt.plot(np.cumsum(__get_fees_free_pnl__(trading_simulation)))
150 | elif trading_hyperparameters['simulation_type'] == 'with_fees':
151 | plt.plot(np.cumsum(__get_pnl_with_fees__(trading_simulation, trading_hyperparameters)))
152 | axs[1, 0].set_title('P&L cumsum')
153 |
154 | # Mid price
155 | df = prediction[0].reset_index(drop=True)
156 | seconds = pd.to_datetime(df.seconds)
157 | mid = (df.BIDp1 + df.ASKp1) / 2
158 | trading_df = pd.DataFrame()
159 | trading_df['seconds'] = seconds
160 | trading_df['mid'] = mid
161 |
162 | long_indices, short_indices = __get_long_short_indices__(trading_simulation)
163 | trading_df.drop_duplicates(inplace=True, keep='first', subset='seconds')
164 | trading_df.set_index('seconds', inplace=True)
165 |
166 | ax = fig.add_subplot(2, 2, 4)
167 | plt.plot(trading_df.mid)
168 | for datetime in long_indices:
169 | y_value = trading_df.loc[datetime, 'mid']
170 | ax.plot(datetime, y_value, marker='^', color='green', markersize=5)
171 | for datetime in short_indices:
172 | y_value = trading_df.loc[datetime, 'mid']
173 | ax.plot(datetime, y_value, marker='v', color='red', markersize=5)
174 |
175 | axs[1, 1].set_title('Mid price')
176 |
177 | plt.tight_layout()
178 | plt.show()
--------------------------------------------------------------------------------
/simulator/trading_agent.py:
--------------------------------------------------------------------------------
1 | class Trading:
2 | def __init__(self, trading_hyperparameters):
3 | self.long_inventory = 0
4 | self.short_inventory = 0
5 | self.long_price = 0
6 | self.short_price = 0
7 | self.date_time_entry_long = None
8 | self.date_time_exit_long = None
9 | self.date_time_entry_short = None
10 | self.date_time_exit_short = None
11 | self.trading_history = []
12 |
13 | def long(self, price, datetime=None):
14 | amount = 1
15 | self.long_inventory += amount
16 | self.long_price = price
17 | self.date_time_entry_long = datetime
18 |
19 | def short(self, price, datetime=None):
20 | amount = 1
21 | self.short_inventory += amount
22 | self.short_price = price
23 | self.date_time_entry_short = datetime
24 |
25 | def exit_long(self, price, datetime=None):
26 | self.trading_history.append({'Type': 'Long', 'Entry_Long': self.date_time_entry_long, 'Price_Entry_Long': self.long_price,
27 | 'Exit_Long': datetime, 'Price_Exit_Long': price})
28 |
29 | self.long_inventory = 0
30 | self.long_price = 0
31 | self.date_time_entry_long = None
32 |
33 | def exit_short(self, price, datetime=None):
34 | self.trading_history.append({'Type': 'Short', 'Entry_Short': self.date_time_entry_short, 'Price_Entry_Short': self.short_price,
35 | 'Exit_Short': datetime, 'Price_Exit_Short': price})
36 |
37 | self.short_inventory = 0
38 | self.short_price = 0
39 | self.date_time_entry_short = None
40 |
--------------------------------------------------------------------------------
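
A minimal round trip with the agent above, with made-up prices and timestamps: one long entered at 100.0 and closed at 100.5 appends a single record to trading_history, which market_sim.py later turns into a DataFrame.

import pandas as pd
from simulator.trading_agent import Trading

agent = Trading(trading_hyperparameters={})   # the constructor does not use the dictionary's contents
agent.long(100.0, datetime=pd.Timestamp("2023-01-02 10:00:00"))
agent.exit_long(100.5, datetime=pd.Timestamp("2023-01-02 10:05:00"))

# One row with columns Type, Entry_Long, Price_Entry_Long, Exit_Long, Price_Exit_Long.
print(pd.DataFrame(agent.trading_history))
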
/utils.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import shutil
4 | import argparse
5 |
6 | import pandas as pd
7 | import numpy as np
8 | import yaml
9 |
10 | from loggers import logger
11 | from typing import List, Union, Any
12 |
13 |
14 | def load_yaml(path: str, subsection: str) -> dict[str, Any]:
15 | """
16 | Load a YAML file.
17 |
18 | Args:
19 | path (str): Path to the YAML file.
20 | subsection (str): Subsection to be considered (i.e. general, model, trading).
21 |
22 | Returns:
23 | A dictionary containing the YAML file.
24 | """
25 | with open(path) as f:
26 | config = yaml.safe_load(f)
27 | return config[subsection]
28 |
29 |
30 | def data_split(
31 | dataset: str,
32 | training_stocks: list[str],
33 | target_stock: list[str],
34 | training_ratio: float,
35 | validation_ratio: float,
36 | include_target_stock_in_training: bool,
37 | ) -> None:
38 | """
39 | Split the data into training, validation and test sets based on the training, validation and test ratios.
40 |
41 | Args:
42 | dataset (str): The considered dataset (i.e. nasdaq, lse, ...).
43 | training_stocks (list): The list of stocks to be used for training.
44 | target_stock (list): The list of stocks to be used for validation and test.
45 | training_ratio (float): The ratio of training data.
46 | validation_ratio (float): The ratio of validation data.
47 | include_target_stock_in_training (bool): Including or not the target stock in the training set.
48 |
49 | Returns:
50 | None.
51 | """
52 | # List of target_stocks contains stocks that must be split into training, validation and test sets.
53 | # If requested, target stocks are removed from the training set in a second stage.
54 | for stock in target_stock:
55 | # Sorted list of scaled data.
56 | files_scaled = sorted(glob.glob(f"./data/{dataset}/scaled_data/{stock}/*.csv"))
57 | # Sorted list of unscaled data.
58 | files_unscaled = sorted(
59 | glob.glob(f"./data/{dataset}/unscaled_data/{stock}/*.csv")
60 | )
61 |
62 | # Sanity check to make sure that the number of files in the scaled and unscaled folders is the same.
63 | assert len(files_scaled) == len(
64 | files_unscaled
65 | ), "The number of files in the scaled and unscaled folders must be the same."
66 |
67 | # Number of training files (based on training ratio).
68 | num_training_files = int(len(files_scaled) * training_ratio)
69 | # Number of validation files (based on validation ratio).
70 | num_validation_files = int(len(files_scaled) * validation_ratio)
71 | # Number of test files (based on test ratio).
72 | num_test_files = len(files_scaled) - num_training_files - num_validation_files
73 |
74 | # Create the training folder (scaled data) if it does not exist.
75 | if not os.path.exists(f"./data/{dataset}/scaled_data/training"):
76 | os.makedirs(f"./data/{dataset}/scaled_data/training")
77 | # Create the validation folder (scaled data) if it does not exist.
78 | if not os.path.exists(f"./data/{dataset}/scaled_data/validation"):
79 | os.makedirs(f"./data/{dataset}/scaled_data/validation")
80 | # Create the test folder (scaled data) if it does not exist.
81 | if not os.path.exists(f"./data/{dataset}/scaled_data/test"):
82 | os.makedirs(f"./data/{dataset}/scaled_data/test")
83 |
84 | # Create the training folder (unscaled data) if it does not exist.
85 | if not os.path.exists(f"./data/{dataset}/unscaled_data/training"):
86 | os.makedirs(f"./data/{dataset}/unscaled_data/training")
87 | # Create the validation folder (unscaled data) if it does not exist.
88 | if not os.path.exists(f"./data/{dataset}/unscaled_data/validation"):
89 | os.makedirs(f"./data/{dataset}/unscaled_data/validation")
90 | # Create the test folder (unscaled data) if it does not exist.
91 | if not os.path.exists(f"./data/{dataset}/unscaled_data/test"):
92 | os.makedirs(f"./data/{dataset}/unscaled_data/test")
93 |
94 | # Move the files to the training folder (scaled data).
95 | # If requested, target stocks are removed from the training set.
96 | for i in range(num_training_files):
97 | destination_folder = f"./data/{dataset}/scaled_data/training"
98 | file = files_scaled[i]
99 | if include_target_stock_in_training:
100 | shutil.move(file, destination_folder)
101 | else:
102 |                 if stock not in file:
103 | shutil.move(file, destination_folder)
104 | print(f"{file} --> {destination_folder}")
105 |
106 | # Move the files to the validation folder (scaled data).
107 | for i in range(num_validation_files):
108 | destination_folder = f"./data/{dataset}/scaled_data/validation"
109 | file = files_scaled[i + num_training_files]
110 | shutil.move(file, destination_folder)
111 | print(f"{file} --> {destination_folder}")
112 |
113 | # Move the files to the test folder (scaled data).
114 | for i in range(num_test_files):
115 | destination_folder = f"./data/{dataset}/scaled_data/test"
116 | file = files_scaled[i + num_training_files + num_validation_files]
117 | shutil.move(file, destination_folder)
118 | print(f"{file} --> {destination_folder}")
119 |
120 | # Move the files to the training folder (unscaled data).
121 | # If requested, target stocks are removed from the training set.
122 | for i in range(num_training_files):
123 | destination_folder = f"./data/{dataset}/unscaled_data/training"
124 | file = files_unscaled[i]
125 | if include_target_stock_in_training:
126 | shutil.move(file, destination_folder)
127 | else:
128 |                 if stock not in file:
129 | shutil.move(file, destination_folder)
130 | print(f"{file} --> {destination_folder}")
131 |
132 | # Move the files to the validation folder (unscaled data).
133 | for i in range(num_validation_files):
134 | destination_folder = f"./data/{dataset}/unscaled_data/validation"
135 | file = files_unscaled[i + num_training_files]
136 | shutil.move(file, destination_folder)
137 | print(f"{file} --> {destination_folder}")
138 |
139 | # Move the files to the test folder (unscaled data).
140 | for i in range(num_test_files):
141 | destination_folder = f"./data/{dataset}/unscaled_data/test"
142 | file = files_unscaled[i + num_training_files + num_validation_files]
143 | shutil.move(file, destination_folder)
144 | print(f"{file} --> {destination_folder}")
145 |
146 | # Delete the folders containing the original processed LOB data.
147 | shutil.rmtree(f"./data/{dataset}/scaled_data/{stock}")
148 | shutil.rmtree(f"./data/{dataset}/unscaled_data/{stock}")
149 |
150 | # Until now, only the data belonging to target_stocks have been treated.
151 | # Now, all the other stocks need to be treated.
152 | # Perform the set difference operation between the training_stocks and target_stock sets.
153 | difference_set = list(set(training_stocks).difference(set(target_stock)))
154 |
155 | # Stocks in difference_set are training-only data.
156 | for stock in difference_set:
157 | # Get the sorted list of scaled LOB files.
158 | files_scaled = sorted(glob.glob(f"./data/{dataset}/scaled_data/{stock}/*.csv"))
159 | # Get the sorted list of unscaled LOB files.
160 | files_unscaled = sorted(
161 | glob.glob(f"./data/{dataset}/unscaled_data/{stock}/*.csv")
162 | )
163 |
164 | # Sanity check to make sure that the number of files in the scaled and unscaled folders is the same.
165 | assert len(files_scaled) == len(
166 | files_unscaled
167 | ), "The number of files in the scaled and unscaled folders must be the same."
168 |
169 | # Move the files to the training folder (scaled data).
170 | for i in range(len(files_scaled)):
171 | destination_folder = f"./data/{dataset}/scaled_data/training"
172 | file = files_scaled[i]
173 | shutil.move(file, destination_folder)
174 | print(f"{file} --> {destination_folder}")
175 |
176 | # Move the files to the training folder (unscaled data).
177 | for i in range(len(files_unscaled)):
178 | destination_folder = f"./data/{dataset}/unscaled_data/training"
179 | file = files_unscaled[i]
180 | shutil.move(file, destination_folder)
181 | print(f"{file} --> {destination_folder}")
182 |
183 | # Delete the folders containing the original processed LOB data.
184 | shutil.rmtree(f"./data/{dataset}/scaled_data/{stock}")
185 | shutil.rmtree(f"./data/{dataset}/unscaled_data/{stock}")
186 |
187 | # When dealing with multiple stocks, we want to maintain the same number of files for each of them in the training folder.
188 | print("Aligning data...")
189 | target_stock_dates = set()
190 | other_dates = set()
191 | # As a first step, we check the number of representatives of the target_stock in the training folder.
192 | for stock in target_stock:
193 | files = sorted(
194 | glob.glob(f"./data/{dataset}/unscaled_data/training/{stock}_*.csv")
195 | )
196 | for file in files:
197 | date = file.split("/")[-1].split("_")[-1].split(".")[0]
198 | target_stock_dates.add(date)
199 |     # As a second step, we collect the dates covered by all the training stocks in the training folder.
200 |     # As a third step, we remove the files (if any) whose dates are not covered by the target stock(s) from both the scaled and unscaled data folders.
201 | for stock in training_stocks:
202 | files = sorted(
203 | glob.glob(f"./data/{dataset}/unscaled_data/training/{stock}_*.csv")
204 | )
205 | for file in files:
206 | date = file.split("/")[-1].split("_")[-1].split(".")[0]
207 | other_dates.add(date)
208 | dates_to_remove = list(other_dates.difference(target_stock_dates))
209 | for date in dates_to_remove:
210 | files = sorted(
211 | glob.glob(f"./data/{dataset}/unscaled_data/training/*_{date}.csv")
212 | )
213 | for file in files:
214 | os.remove(file)
215 | files = sorted(glob.glob(f"./data/{dataset}/scaled_data/training/*_{date}.csv"))
216 | for file in files:
217 | os.remove(file)
218 | print("Data aligned.")
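219 |     # Illustration of the alignment step above (the dates are hypothetical): if the target stock has
220 |     # training files only for dates {D1, D2} while another training stock also has a file for D3,
221 |     # the D3 files are removed from both the scaled and unscaled training folders, so every stock
222 |     # ends up covering the same dates.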
219 |
220 |
221 | def save_dataset_info(
222 | experiment_id: str,
223 | general_hyperparameters: dict[str, Any],
224 | ) -> None:
225 | """
226 | Save all the days used in the training, validation and test sets.
227 | Args:
228 | experiment_id (str): ID of the experiment.
229 | general_hyperparameters (dict): General hyperparameters.
230 |
231 | Returns:
232 | None.
233 | """
234 | # Access the training data folder and list all the files.
235 | training_days_temp = glob.glob(
236 | f"./data/{general_hyperparameters['dataset']}/scaled_data/training/*.csv"
237 | )
238 | # Access the validation data folder and list all the files.
239 | validation_days_temp = glob.glob(
240 | f"./data/{general_hyperparameters['dataset']}/scaled_data/validation/*.csv"
241 | )
242 | # Access the test data folder and list all the files.
243 | test_days_temp = glob.glob(
244 | f"./data/{general_hyperparameters['dataset']}/scaled_data/test/*.csv"
245 | )
246 |
247 | training_days = []
248 | validation_days = []
249 | test_days = []
250 |
251 | # Extract the dates from the file names (training data).
252 | for i in training_days_temp:
253 | i = i.split("/")[-1].split("_")[-1]
254 | training_days.append(i)
255 |
256 | # Extract the dates from the file names (validation data).
257 | for i in validation_days_temp:
258 | i = i.split("/")[-1].split("_")[-1]
259 | validation_days.append(i)
260 |
261 | # Extract the dates from the file names (test data).
262 | for i in test_days_temp:
263 | i = i.split("/")[-1].split("_")[-1]
264 | test_days.append(i)
265 |
266 | # Create a dictionary containing the training, validation and test days.
267 | dataset_info = {
268 | "training_days": sorted(set(training_days)),
269 | "validation_days": sorted(set(validation_days)),
270 | "test_days": sorted(set(test_days)),
271 | }
272 |
273 | # Save the dictionary as a YAML file.
274 | logger.logger(
275 | experiment_id=experiment_id,
276 | header="dataset_info",
277 | contents=dataset_info,
278 | )
279 |
280 |
281 | def get_best_levels_prices_and_labels(
282 | dataset: str,
283 |     target_stocks: list[str],
284 | history_length: int,
285 | all_horizons: list[int],
286 | prediction_horizon: int,
287 | threshold: float,
288 | ) -> tuple[Any, ...]:
289 | """
290 | Get the best levels (bid and ask) prices and the corresponding discretized labels.
291 | Args:
292 |         dataset (str): Name of the dataset to be used (e.g. nasdaq, lse, ...).
293 |         target_stocks (list): Target stock(s); the first one is used to select the test files.
294 |         history_length (int): Length of the history (each model's sample is a 2D array of shape (history_length, number of features)).
295 |         all_horizons (list): List of all the horizons computed in the preprocessing stage.
295 | prediction_horizon (int): Horizon to be considered.
296 | threshold (float): Threshold to be used to discretize the labels.
297 |
298 | Returns:
299 | A tuple containing the best levels (bid and ask) prices and the corresponding discretized labels.
300 | """
301 |
302 | # List the test files.
303 | test_files = sorted(glob.glob(f"./data/{dataset}/unscaled_data/test/*{target_stocks[0]}*.csv"))
304 |
305 | best_levels_prices = pd.DataFrame()
306 |
307 | # Get the position of the prediction horizon in the list of all horizons.
308 | position = next(
309 | (
310 | index
311 | for index, value in enumerate(all_horizons)
312 | if value == prediction_horizon
313 | ),
314 | None,
315 | )
316 | all_labels_temp = []
317 |
318 | for file in test_files:
319 | # Load the file.
320 | df = pd.read_csv(file).iloc[history_length:, :]
321 | # Reset the index.
322 | df.reset_index(drop=True, inplace=True)
323 | # Get all the labels.
324 | label_df = df.iloc[:, 41:]
325 | # Get the label corresponding to the prediction horizon.
326 | label = label_df.iloc[:, position]
327 | # Get the best levels (ask and bid) prices and the datetime corresponding to each tick.
328 | best_levels_prices = pd.concat(
329 | [best_levels_prices, df[["seconds", "ASKp1", "BIDp1"]]]
330 | )
331 | # Append the label to the list of labels.
332 | all_labels_temp = all_labels_temp + label.tolist()
333 |
334 | # Discretize the labels (0: downtrend, 1: no trend, 2: uptrend).
335 | all_labels = [
336 | 2 if label >= threshold else 0 if label <= -threshold else 1
337 | for label in all_labels_temp
338 | ]
339 |
340 | return best_levels_prices, all_labels
341 |
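342 | # A small worked example of the label discretization above (the threshold and raw labels are
343 | # hypothetical): with threshold=0.002, a raw label of 0.003 maps to 2 (uptrend), -0.0005 maps
344 | # to 1 (no trend), and -0.01 maps to 0 (downtrend).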
342 |
343 | def detect_changing_points(
344 | target: int, cumulative_lengths: list[int]
345 | ) -> Union[int, None]:
346 | """
347 |     Detect the starting offset of the file containing the target index (i.e. the cumulative length of the preceding files).
348 | Args:
349 | target (int): Target index.
350 | cumulative_lengths (list): List of cumulative lengths.
351 |
352 | Returns:
353 |         0 if the target index falls in the first file; otherwise, the cumulative length of the files preceding the one containing the target index. None if the target exceeds every cumulative length.
354 | """
355 | for i, length in enumerate(cumulative_lengths):
356 | if target <= length:
357 | if i == 0:
358 | return 0
359 | else:
360 | return cumulative_lengths[i - 1]
361 | return None
362 |
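363 | # A quick usage sketch (the lengths are hypothetical): with cumulative_lengths = [100, 250, 400],
364 | # detect_changing_points(50, cumulative_lengths) returns 0 (the target falls in the first file),
365 | # while detect_changing_points(120, cumulative_lengths) returns 100, i.e. the cumulative length
366 | # of the files preceding the one that contains index 120.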
363 |
364 | def wandb_hyperparameters_saving(
365 | wandb_logger: Any,
366 | general_hyperparameters: dict[str, Any],
367 | model_hyperparameters: dict[str, Any],
368 | ) -> None:
369 | """
370 | Save the general/model hyperparameters in the Weights & Biases dashboard.
371 | Args:
372 | wandb_logger (any): Wandb logger.
373 | general_hyperparameters (dict): General hyperparameters.
374 | model_hyperparameters (dict): Model hyperparameters.
375 |
376 | Returns:
377 | None.
378 | """
379 | wbl = wandb_logger
380 | for key in general_hyperparameters:
381 | wbl.experiment.config[key] = general_hyperparameters[key]
382 | for key in model_hyperparameters:
383 | wbl.experiment.config[key] = model_hyperparameters[key]
384 |
385 |
386 | def str2bool(v):
387 |     """Convert a command-line value (string or bool) to a bool, raising an argparse error for unrecognized values."""
388 |     if isinstance(v, bool):
389 |         return v
390 |     if v.lower() in ('yes', 'true', 't', 'y', '1'):
391 |         return True
392 |     elif v.lower() in ('no', 'false', 'f', 'n', '0'):
393 |         return False
394 |     else:
395 |         raise argparse.ArgumentTypeError('Boolean value expected.')
395 |
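396 | # Usage sketch (the flag name is hypothetical): argparse's type=bool would treat any non-empty
397 | # string as True, so the boolean options below are declared with type=str2bool instead, e.g.:
398 | #   parser.add_argument("--my_flag", type=str2bool, default=False)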
396 |
397 | def parse_args() -> Any:
398 | """
399 | Parser for input arguments.
400 |
401 | Returns:
402 | The parsed arguments.
403 | """
404 | parser = argparse.ArgumentParser(description="Hyperparameters acquisition.")
405 |
406 | parser.add_argument(
407 | "--experiment_id",
408 | type=str,
409 | default=None,
410 | help="ID of the experiment (if any). This argument is used to resume older experiments or partially re-run experiments.",
411 | )
412 |
413 | # General hyperparameters
414 | parser.add_argument(
415 | "--dataset",
416 | type=str,
417 | default="nasdaq",
418 | help="The dataset to be used (e.g. nasdaq, lse, ...). Each dataset has a different raw data format which needs to be correctly handled.",
419 | )
420 | parser.add_argument(
421 | "--model",
422 | type=str,
423 | default="deeplob",
424 | help="The model to be used (e.g. deeplob, ...).",
425 | )
426 | parser.add_argument(
427 | "--training_stocks",
428 | type=str,
429 | default="XYZ",
430 |         help="Stock(s) to be used for training; multiple stocks can be passed as a comma-separated string (e.g. 'CSCO').",
431 | )
432 | parser.add_argument(
433 | "--target_stocks",
434 | type=str,
435 | default="XYZ",
436 |         help="The stock to be used in the validation and test sets (it is always unique).",
437 | )
438 | parser.add_argument(
439 | "--normalization_window",
440 | type=int,
441 | default=5,
442 | help="Number of files to be used for rolling data normalization.",
443 | )
444 | parser.add_argument(
445 | "--horizons",
446 | type=str,
447 | default="10,50,100",
448 | help="Horizon(s) to be considered (to be expressed in this format: '10,50,100').",
449 | )
450 | parser.add_argument(
451 | "--training_ratio",
452 | type=float,
453 | default=0.6,
454 | help="Training data proportion."
455 | )
456 | parser.add_argument(
457 | "--validation_ratio",
458 | type=float,
459 | default=0.2,
460 | help="Validation data proportion.",
461 | )
462 | parser.add_argument(
463 | "--test_ratio",
464 | type=float,
465 | default=0.2,
466 | help="Test data proportion."
467 | )
468 | parser.add_argument(
469 | "--stages",
470 | type=str,
471 | default="data_processing",
472 | help="Stage(s) to be run (to be expressed in this format: 'training,evaluation').",
473 | ) # data_processing | torch_dataset_preparation | torch_dataset_preparation_backtest | complete_homological_structures_preparation | training,evaluation | backtest,post_trading_analysis
474 | parser.add_argument(
475 | "--include_target_stock_in_training",
476 | type=str2bool,
477 | default=True,
478 |         help="Whether or not to include the target stock in the training set.",
479 | )
480 | parser.add_argument(
481 | "--targets_type",
482 | type=str,
483 | default='raw',
484 | help="Type of targets to be used (i.e. smooth, raw).",
485 | )
486 |
487 | # Model hyperparameters
488 | parser.add_argument(
489 | "--batch_size",
490 | type=int,
491 | default=32,
492 | help="Batch size."
493 | )
494 | parser.add_argument(
495 | "--epochs",
496 | type=int,
497 | default=100,
498 | help="Maximum number of epochs."
499 | )
500 | parser.add_argument(
501 | "--learning_rate",
502 | type=float,
503 | default=6e-5,
504 | help="Learning rate."
505 | )
506 | parser.add_argument(
507 | "--num_workers",
508 | type=int,
509 | default=5,
510 | help="Number of workers to be used by the dataloader.",
511 | )
512 | parser.add_argument(
513 | "--history_length",
514 | type=int,
515 | default=100,
516 |         help="Length of the history to be used (each model's sample is a 2D array of shape (history_length, number of features)).",
517 | )
518 | parser.add_argument(
519 | "--shuffling_seed",
520 | type=int,
521 | default=428,
522 | help="Seed to be used for data shuffling.",
523 | )
524 | parser.add_argument(
525 | "--lighten",
526 | type=str2bool,
527 | default=False,
528 | help="Lighten the model's input (10 -> 5 levels).",
529 | )
530 | parser.add_argument(
531 | "--threshold",
532 | type=float,
533 | default=0.0,
534 | help="Threshold to be used to discretize the labels.",
535 | )
536 | parser.add_argument(
537 | "--prediction_horizon",
538 | type=int,
539 | default=10,
540 | help="Horizon to be considered in the inference stage.",
541 | )
542 | parser.add_argument(
543 | "--balanced_sampling",
544 | type=str2bool,
545 | default=True,
546 |         help="Whether or not to use a balanced sampling approach in the training stage.",
547 | )
548 | parser.add_argument(
549 | "--patience",
550 | type=int,
551 | default=10,
552 | help="Patience to be used in the training stage.",
553 | )
554 |
555 | # Trading hyperparameters
556 | parser.add_argument(
557 | "--initial_cash",
558 | type=int,
559 | default=1000,
560 | help="Initial cash to be used in the trading simulation.",
561 | )
562 | parser.add_argument(
563 | "--trading_fee",
564 | type=float,
565 | default=0.0001,
566 | help="Trading fee to be used in the trading simulation.",
567 | )
568 | parser.add_argument(
569 | "--mid_side_trading",
570 | type=str,
571 | default="mid_to_mid",
572 | help="Trading strategy to be used in the trading simulation.",
573 | )
574 | parser.add_argument(
575 | "--simulation_type",
576 | type=str,
577 | default="with_fees",
578 |         help="Whether or not to apply trading fees in the trading simulation.",
579 | )
580 | parser.add_argument(
581 | "--probability_threshold",
582 | type=float,
583 | default=0.65,
584 | help="Threshold used to decide if exploiting or ignoring a signal in the trading simulation.",
585 | )
586 |
587 | args = parser.parse_args()
588 | return args
589 |
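590 | # Example invocation (a sketch; it assumes main.py is the entry point that calls parse_args, and
591 | # 'CSCO' is used purely as an illustrative ticker):
592 | #   python main.py --dataset nasdaq --model deeplob --training_stocks CSCO \
593 | #       --target_stocks CSCO --horizons 10,50,100 --stages training,evaluation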
590 |
591 | def create_hyperparameters_yaml(experiment_id: str, args: Any) -> None:
592 | """
593 | Create and save a YAML file containing the hyperparameters as part of an experiment.
594 | Args:
595 | experiment_id (str): ID of the experiment.
596 |         args (any): Parsed command-line arguments.
597 |
598 | Returns:
599 | None.
600 | """
601 | training_stocks = list(
602 | args.training_stocks.split(",")
603 | ) # Parsing of 'training_stocks' input argument.
604 | target_stocks = list(
605 | args.target_stocks.split(",")
606 | ) # Parsing of 'target_stocks' input argument.
607 | horizons = list(
608 | map(int, args.horizons.split(","))
609 | ) # Parsing of 'horizons' input argument.
610 | stages = list(args.stages.split(",")) # Parsing of 'stages' input argument.
611 |
612 | # Create a dictionary (YAML structure) containing the hyperparameters.
613 | data = {
614 | "general": {
615 | "dataset": args.dataset,
616 | "model": args.model,
617 | "training_stocks": training_stocks,
618 | "target_stocks": target_stocks,
619 | "normalization_window": args.normalization_window,
620 | "horizons": horizons,
621 | "training_ratio": args.training_ratio,
622 | "validation_ratio": args.validation_ratio,
623 | "test_ratio": args.test_ratio,
624 | "stages": stages,
625 | "include_target_stock_in_training": args.include_target_stock_in_training,
626 | "targets_type": args.targets_type,
627 | },
628 | "model": {
629 | "batch_size": args.batch_size,
630 | "epochs": args.epochs,
631 | "learning_rate": args.learning_rate,
632 | "num_workers": args.num_workers,
633 | "history_length": args.history_length,
634 | "shuffling_seed": args.shuffling_seed,
635 | "lighten": args.lighten,
636 | "threshold": args.threshold,
637 | "prediction_horizon": args.prediction_horizon,
638 | "balanced_sampling": args.balanced_sampling,
639 | "patience": args.patience,
640 | },
641 | "trading": {
642 | "initial_cash": args.initial_cash,
643 | "trading_fee": args.trading_fee,
644 | "mid_side_trading": args.mid_side_trading,
645 | "simulation_type": args.simulation_type,
646 | "probability_threshold": args.probability_threshold,
647 | },
648 | }
649 |
650 | # Specify the file path where saving the YAML file.
651 | file_path = f"{logger.find_save_path(experiment_id)}/hyperparameters.yaml"
652 |
653 | # Write the data to the YAML file.
654 | with open(file_path, "w") as file:
655 | yaml.dump(data, file)
656 |
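657 | # For reference, the saved hyperparameters.yaml mirrors the dictionary above (a schematic sketch;
658 | # yaml.dump decides the exact ordering and formatting, and the values shown are the parser defaults):
659 | #   general:
660 | #     dataset: nasdaq
661 | #     model: deeplob
662 | #     ...
663 | #   model:
664 | #     batch_size: 32
665 | #     ...
666 | #   trading:
667 | #     initial_cash: 1000
668 | #     ...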
657 |
658 | def create_tree(path: str) -> None:
659 | """
660 | Create folders recursively.
661 | Args:
662 | path (str): Tree of folders to be created.
663 |
664 | Returns:
665 | None.
666 | """
667 | # Recursively create a tree of folders. If the path already exists, delete it and create a new one.
668 | if os.path.exists(path):
669 | shutil.rmtree(path)
670 | os.makedirs(path)
671 |
672 |
673 | def get_training_test_stocks_as_string(general_hyperparameters):
674 |     """Return the training and target stocks as underscore-joined strings."""
675 |     general_training_string = "_".join(general_hyperparameters["training_stocks"])
676 |     general_test_string = "_".join(general_hyperparameters["target_stocks"])
677 |     return general_training_string, general_test_string
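678 | 
679 | # A quick example (the tickers are hypothetical): with training_stocks = ["CSCO", "INTC"] and
680 | # target_stocks = ["CSCO"], the function returns ("CSCO_INTC", "CSCO").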
--------------------------------------------------------------------------------