├── torchNDF
│   ├── __init__.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── torch_utils.py
│   │   ├── datahandler.py
│   │   └── pandas_utils.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── data
│   │   │   ├── __init__.py
│   │   │   ├── torch_utils.py
│   │   │   ├── datahandler.py
│   │   │   └── pandas_utils.py
│   │   ├── modules.py
│   │   ├── recurrent_wrapper.py
│   │   ├── mlp.py
│   │   └── wattnet.py
│   ├── script_utils.py
│   ├── metrics.py
│   ├── utils.py
│   └── vis.py
├── fig
│   └── WATTNet.JPG
├── setup.py
├── LICENSE
├── .gitignore
├── README.md
└── scripts
    ├── features.py
    ├── dtcc_records_collector.py
    └── spot_rates_collector.py
/torchNDF/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/torchNDF/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/torchNDF/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/torchNDF/models/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/fig/WATTNet.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zymrael/wattnet-fx-trading/HEAD/fig/WATTNet.JPG
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(name='torchNDF',
4 | version='0.1.0',
5 | description='PyTorch for NDF trading',
6 | url='https://github.com/Zymrael/NDF-IL',
7 | author='Michael Poli',
8 | author_email='',
9 | license='MIT',
10 | packages=find_packages(),
11 | zip_safe=False)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Zymrael
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/torchNDF/data/torch_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.utils.data import DataLoader, Dataset
4 | from ..utils import seq_normalization
5 |
6 | class Hook():
7 | """Generic hook for nn.Modules with input/output"""
8 | def __init__(self, module, backward=False):
9 | if backward==False:
10 | self.hook = module.register_forward_hook(self.hook_fn)
11 | else:
12 | self.hook = module.register_backward_hook(self.hook_fn)
13 | def hook_fn(self, module, input, output):
14 | self.input = input
15 | self.output = output
16 | def close(self):
17 | self.hook.remove()
18 |
19 | class RegularFinancialData(Dataset):
20 | """Class of torch datasets for loading the financial data. When using with dataloaders,
21 | set shuffle=False to sample via shifting (consecutive samples share `seq_len - 1` data points).
22 | Especially useful for forecasting validation dataloaders.
23 | Data shape is:
24 | `(n_samples, seq_len, n_feat)` = `(N, H, C)`
25 | """
26 | def __init__(self, dataset, forecast_length):
27 | """
28 | Args:
29 | dataset: the torch dataset
30 | """
31 | self._data = dataset
32 | self.fl = forecast_length
33 |
34 | def __len__(self):
35 | """ Returns length of the dataset. Length calculated as len(data) - forecast length
36 | to avoid sampling time series shorter than forecast length. """
37 | return len(self._data) - self.fl
38 |
39 | def __getitem__(self, idx):
40 | batch = self._data[idx:idx + self.fl, :]
41 | return batch, idx
42 |
43 |
44 | def ensemble_predictions(ensemble:list, data:torch.Tensor, device:torch.device, mode:str='min', softmax=True):
45 | """Returns predictions of a voting or min ensemble of models. Can be used for single models
46 | if length of ensemble is 1"""
47 | preds = torch.LongTensor([]).to(device)
48 | for model in ensemble:
49 | yhat = model(data)
50 | if softmax: yhat = nn.Softmax(1)(yhat)
51 | _, pred = torch.max(yhat, 1)
52 | preds = torch.cat((preds, pred.unsqueeze(0)))
53 | if mode=='min': predictions, _ = preds.cpu().min(0)
54 | else: predictions, _ = torch.mode(preds.cpu(), 0)
55 | return predictions
56 |
--------------------------------------------------------------------------------
/torchNDF/models/data/torch_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.utils.data import DataLoader, Dataset
4 | from ..utils import seq_normalization
5 |
6 | class Hook():
7 | """Generic hook for nn.Modules with input/output"""
8 | def __init__(self, module, backward=False):
9 | if backward==False:
10 | self.hook = module.register_forward_hook(self.hook_fn)
11 | else:
12 | self.hook = module.register_backward_hook(self.hook_fn)
13 | def hook_fn(self, module, input, output):
14 | self.input = input
15 | self.output = output
16 | def close(self):
17 | self.hook.remove()
18 |
19 | class RegularFinancialData(Dataset):
20 | """Class of torch datasets for loading the financial data. When using with dataloaders,
21 | set shuffle=False to sample via shifting (consecutive samples share `seq_len - 1` data points).
22 | Especially useful for forecasting validation dataloaders.
23 | Data shape is:
24 | `(n_samples, seq_len, n_feat)` = `(N, H, C)`
25 | """
26 | def __init__(self, dataset, forecast_length):
27 | """
28 | Args:
29 | dataset: the torch dataset
30 | """
31 | self._data = dataset
32 | self.fl = forecast_length
33 |
34 | def __len__(self):
35 | """ Returns length of the dataset. Length calculated as len(data) - forecast length
36 | to avoid sampling time series shorter than forecast length. """
37 | return len(self._data) - self.fl
38 |
39 | def __getitem__(self, idx):
40 | batch = self._data[idx:idx + self.fl, :]
41 | return batch, idx
42 |
43 |
44 | def ensemble_predictions(ensemble:list, data:torch.Tensor, device:torch.device, mode:str='min', softmax=True):
45 | """Returns predictions of a voting or min ensemble of models. Can be used for single models
46 | if length of ensemble is 1"""
47 | preds = torch.LongTensor([]).to(device)
48 | for model in ensemble:
49 | yhat = model(data)
50 | if softmax: yhat = nn.Softmax(1)(yhat)
51 | _, pred = torch.max(yhat, 1)
52 | preds = torch.cat((preds, pred.unsqueeze(0)))
53 | if mode=='min': predictions, _ = preds.cpu().min(0)
54 | else: predictions, _ = torch.mode(preds.cpu(), 0)
55 | return predictions
56 |
--------------------------------------------------------------------------------
/torchNDF/models/modules.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 |
11 |
12 | class AttentionBlock(nn.Module):
13 | def __init__(self, in_channels: int, key_size: int, value_size: int):
14 | """Attention block without masking"""
15 | super(AttentionBlock, self).__init__()
16 | self.linear_query = nn.Linear(key_size, key_size)
17 | self.linear_keys = nn.Linear(key_size, key_size)
18 | self.linear_values = nn.Linear(value_size, value_size)
19 | self.sqrt_key_size = math.sqrt(key_size)
20 |
21 |     def forward(self, x_in, alpha_check: bool = False):
22 | bs = x_in.size(0)
23 | w_dim = x_in.size(1)
24 | x_orig = x_in
25 |
26 | x_in = x_in.reshape((bs, -1))
27 | keys = self.linear_keys(x_in)
28 | keys = keys.reshape((bs, w_dim, -1)) # `N, W, key_size`
29 |
30 | query = self.linear_query(x_in)
31 | query = query.reshape((bs, w_dim, -1)) # `N, W, key_size`
32 |
33 | values = self.linear_values(x_in)
34 | values = values.reshape((bs, w_dim, -1)) # `N, W, value_size`
35 |
36 | alphas = torch.bmm(query, torch.transpose(keys, 1, 2)) # `N, W, W`
37 | alphas = F.softmax(alphas / self.sqrt_key_size, dim=1) # `N, W, W`
38 | res = torch.bmm(alphas, values) # `N, W, value_size`
39 | res = torch.sigmoid(res)
40 | if alpha_check: return alphas
41 | return res + x_orig
42 |
43 | class GatedBlock(nn.Module):
44 | def __init__(self, dilation: int, w_dim: int):
45 | """Gated block with sigmoid/tanh gates."""
46 | super().__init__()
47 | self.dilation = dilation
48 | self.tanh_conv = nn.Conv2d(w_dim, w_dim,
49 | kernel_size=(2, 1), dilation=(dilation, 1), groups=w_dim)
50 | self.sigmoid_conv = nn.Conv2d(w_dim, w_dim,
51 | kernel_size=(2, 1), dilation=(dilation, 1), groups=w_dim)
52 | self.out_conv = nn.Conv2d(w_dim, w_dim,
53 | kernel_size=1, groups=w_dim)
54 |
55 | def forward(self, x_in):
56 | x_tanh, x_sigmoid = self.tanh_conv(x_in), self.sigmoid_conv(x_in)
57 | x_gate = torch.tanh(x_tanh) * torch.sigmoid(x_sigmoid)
58 | x_out = self.out_conv(x_gate + x_in[:, :, :x_gate.size(2), :])
59 | return x_out
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # wattnet-fx-trading
2 | WATTNet: Learning to Trade FX with Hierarchical Spatio-Temporal Representations of Highly Multivariate Time Series:
3 | [paper](https://arxiv.org/abs/1909.10801), [article](https://medium.com/neuri-ai/wattnet-learning-to-trade-fx-with-hierarchical-spatio-temporal-representations-of-highly-bbd0f02c812f)
4 |
5 |
6 |
7 |
8 |
9 | > Finance is a particularly challenging application area for deep learning models due to low noise-to-signal ratio, non-stationarity, and partial observability. Non-deliverable-forwards (NDF), a derivatives contract used in foreign exchange (FX) trading, presents additional difficulty in the form of long-term planning required for an effective selection of start and end date of the contract. In this work, we focus on tackling the problem of NDF tenor selection by leveraging high-dimensional sequential data consisting of spot rates, technical indicators and expert tenor patterns. To this end, we construct a dataset from the Depository Trust & Clearing Corporation (DTCC) NDF data that includes a comprehensive list of NDF volumes and daily spot rates for 64 FX pairs. We introduce WaveATTentionNet (WATTNet), a novel temporal convolution (TCN) model for spatio-temporal modeling of highly multivariate time series, and validate it across NDF markets with varying degrees of dissimilarity between the training and test periods in terms of volatility and general market regimes. The proposed method achieves a significant positive return on investment (ROI) in all NDF markets under analysis, outperforming recurrent and classical baselines by a wide margin. Finally, we propose two orthogonal interpretability approaches to verify noise stability and detect the driving factors of the learned tenor selection strategy.
10 |
11 | ## Installation
12 |
13 | ``` pip install git+https://github.com/Zymrael/wattnet-fx-trading ```
14 |
15 | ## Code availability
16 |
17 | The repo contains PyTorch code for training and evaluating the models, as well as a series of in-depth data exploration tutorial notebooks that document the project step-by-step.
18 | Two versions of each notebook are provided: a `compact` version containing all the necessary code, and an `extended` version with additional plots, exploratory analysis and discussion. All data preprocessing scripts are available. A minimal usage sketch is included at the end of this README.
19 |
20 | ## Issues
21 | For clarifications or inquiries about the code, contact michael [at] neuri.ai
22 | For any request, feel free to open a thread under `Issues`.
23 |
24 | If you find our work interesting or useful, consider citing it:
25 | ```
26 | @article{poli2019wattnet,
27 | title={WATTNet: Learning to Trade FX via Hierarchical Spatio-Temporal Representation of Highly Multivariate Time Series},
28 | author={Poli, Michael and Park, Jinkyoo and Ilievski, Ilija},
29 | journal={arXiv preprint arXiv:1909.10801},
30 | year={2019}
31 | }
32 | ```
33 |
34 |
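35 | ## Minimal usage sketch
36 |
37 | The snippet below is an illustrative sketch, not a reference implementation: the tensor shapes, split
38 | percentages and hyperparameters are placeholder values rather than settings from the paper. It shows how
39 | `FXDataHandler` (`torchNDF/data/datahandler.py`) and the recurrent baseline wrapper
40 | (`torchNDF/models/recurrent_wrapper.py`) can be instantiated on dummy data. Note that dependencies
41 | (`torch`, `pandas`, `numpy`, `natsort`, `statsmodels`, `matplotlib`, `scikit-learn`) are not declared in
42 | `setup.py` and need to be installed separately.
43 |
44 | ```python
45 | import numpy as np
46 | import pandas as pd
47 | import torch
48 |
49 | from torchNDF.data.datahandler import FXDataHandler
50 | from torchNDF.models.recurrent_wrapper import RecurrentWrapper
51 |
52 | # data handling on a dummy spot-rate table: 500 days x 8 currencies
53 | handler = FXDataHandler()  # or FXDataHandler(fpath='path/to/rates.pkl')
54 | handler.assign_data(pd.DataFrame(np.random.rand(500, 8)))
55 | handler.normalize(window=30)                 # rolling z-score normalization
56 | handler.split_train_val_test([70, 15, 15])   # percentage splits must sum to 100
57 | handler.to_tensor(torch.device('cpu'))
58 | train, val, test = handler.datasets
59 |
60 | # recurrent baseline on a toy batch: 32 samples, 20 time steps, 145 features
61 | x = torch.randn(32, 20, 145)
62 | model = RecurrentWrapper(seq_len=20, rec_cell_type='GRU', in_dim=145, latent_dim=90,
63 |                          n_recurrent_layers=2, mlp_in_dim=90, mlp_out_dim=19)
64 | logits = model(x)  # shape (32, 19): one logit per output class (default `mlp_out_dim`)
65 | ```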
--------------------------------------------------------------------------------
/torchNDF/script_utils.py:
--------------------------------------------------------------------------------
1 | """Utils exclusively utilized by data collection and preprocessing scripts"""
2 |
3 | import re
4 | import pandas as pd
5 |
6 | def remove_dissemination_id_changes(dataframe:pd.DataFrame):
7 | """Drops rows in pandas.DataFrame with updated DISSEMINATION_ID information"""
8 | n_corrections = len(dataframe[dataframe['ACTION'] == 'CORRECT'])
9 | n_cancels = len(dataframe[dataframe['ACTION'] == 'CANCEL'])
10 | to_drop = []
11 | print(f'There have been {n_cancels} cancels and '
12 | f'{n_corrections} corrections in dissemination IDs')
13 | for row_idx, row in dataframe.iterrows():
14 | if row['ACTION'] in ['CORRECT', 'CANCEL']:
15 | o_id = row['ORIGINAL_DISSEMINATION_ID']
16 | o_id = int(o_id)
17 | if o_id in dataframe.index:
18 | to_drop.append(o_id)
19 | if len(to_drop) > 0:
20 | dataframe = dataframe.drop(to_drop, axis=0)
21 | return dataframe
22 |
23 | def to_int(series:pd.Series):
24 | """Transform values in pandas.Series into a valid format for int conversion
25 | NaN values are replaced by 0. Removes [,.+]. Trailing decimals are removed."""
26 | series.fillna(0, inplace=True)
27 | series = series.astype(str).apply(lambda x:
28 | re.sub(r'[.]+\d+$', '', x))
29 | series = series.astype(str).str.replace(r'[,.+]', '')
30 | series = series.astype(int)
31 | return series
32 |
33 | def augment_with_pluses(dataframe:pd.DataFrame, usd_is_1:pd.Series, usd_is_2:pd.Series):
34 | """Augment DataFrame with bool feature flagging whether currency amount strings contain '+'"""
35 | find_plus = lambda elem: str(elem).find('+')
36 | plus_1 = dataframe['ROUNDED_NOTIONAL_AMOUNT_1'].astype(str).apply(find_plus) != -1
37 | plus_2 = dataframe['ROUNDED_NOTIONAL_AMOUNT_2'].astype(str).apply(find_plus) != -1
38 | dataframe.loc[:, 'PLUS_USD'] = (usd_is_1 & plus_1) | (usd_is_2 & plus_2)
39 | dataframe.loc[:, 'PLUS_CCY'] = (usd_is_2 & plus_1) | (usd_is_1 & plus_2)
40 |
41 |
42 | def amounts_to_ndf_rate(dataframe:pd.DataFrame, usd_is_1:pd.Series, usd_is_2:pd.Series) -> None:
43 | """Computes NDF rates from notional amounts and augments `dataframe` with an NDF rate column"""
44 | dataframe.loc[usd_is_1, 'CURRENCY'] = dataframe[usd_is_1]['NOTIONAL_CURRENCY_2']
45 | dataframe.loc[usd_is_2, 'CURRENCY'] = dataframe[usd_is_2]['NOTIONAL_CURRENCY_1']
46 |
47 | dataframe.loc[usd_is_1, 'USD_AMOUNT'] = dataframe['ROUNDED_NOTIONAL_AMOUNT_1']
48 | dataframe.loc[usd_is_2, 'USD_AMOUNT'] = dataframe['ROUNDED_NOTIONAL_AMOUNT_2']
49 | dataframe.loc[usd_is_2, 'CCY_AMOUNT'] = dataframe['ROUNDED_NOTIONAL_AMOUNT_1']
50 | dataframe.loc[usd_is_1, 'CCY_AMOUNT'] = dataframe['ROUNDED_NOTIONAL_AMOUNT_2']
51 |
52 | dataframe.loc[:, 'NDF_RATE'] = dataframe['CCY_AMOUNT'] / dataframe['USD_AMOUNT']
53 |
54 | def split_timestamp(dataframe:pd.DataFrame, colname:str) -> None:
55 | """Splits timestamp pd.Series into a time feature and a date feature."""
56 | dataframe[colname] = pd.to_datetime(dataframe[colname])
57 | date, time = dataframe[colname].dt.date, dataframe[colname].dt.time
58 | dataframe[f'{colname}_TIME'] = pd.to_datetime(time)
59 | dataframe[f'{colname}_DATE'] = date
60 |
61 |
62 | def augment_with_delta(dataframe:pd.DataFrame, feature_1:str, feature_2:str, new_name:str) -> None:
63 | """ Augments pandas.DataFrame with column counting number of days between two
64 | datetime.date features
65 |
66 | Args:
67 | dataframe (pandas.DataFrame)
68 | feature_1 (str): start date feature for difference calculation
69 | feature_2 (str): end date feature for difference calculation
70 | new_name (str): new feature name
71 | """
72 | delta = dataframe[feature_1] - dataframe[feature_2]
73 | delta_days = delta.apply(lambda x: x.days)
74 | dataframe[new_name] = delta_days
--------------------------------------------------------------------------------
/torchNDF/models/recurrent_wrapper.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | from ..utils import expert_guided_loss
5 | from ..metrics import cross_entropy_accuracy
6 | from .mlp import MLP
7 |
8 |
9 | class RecurrentWrapper(nn.Module):
10 | """Wrapper for recurrent models (GRU - LSTM)"""
11 | def __init__(self,
12 | seq_len: int,
13 | rec_cell_type: str,
14 | in_dim: int,
15 | latent_dim: int,
16 | n_recurrent_layers: int,
17 | mlp_in_dim: int=90,
18 | mlp_out_dim: int=19,
19 | mlp_layers=[128, 128],
20 |                  dropout_prob: float=0.2
21 | ):
22 |
23 | super().__init__()
24 | self.seq_len = seq_len
25 | self.rec_cell_type = rec_cell_type
26 | self._set_recurrent_layers(in_dim, latent_dim, n_recurrent_layers, dropout_prob)
27 | self.MLP = MLP(mlp_in_dim, mlp_out_dim, mlp_layers, drop_probability=dropout_prob, \
28 | hidden_activation='leaky_relu', out_softmax=False)
29 |
30 | def _set_recurrent_layers(self, in_dim, ld, nl, dp):
31 | if self.rec_cell_type == 'LSTM':
32 | self.recurrent_layers = nn.LSTM(in_dim, ld, num_layers=nl, dropout=dp)
33 | elif self.rec_cell_type == 'GRU':
34 | self.recurrent_layers = nn.GRU(in_dim, ld, num_layers=nl, dropout=dp)
35 | else:
36 |             raise ValueError(f'{self.rec_cell_type} not supported')
37 |
38 | def forward(self, x_in):
39 | x_in = x_in.transpose(0, 1)
40 | x_in, _ = self.recurrent_layers(x_in)
41 | # last latent
42 | x_in = x_in[-1, :, :]
43 | x_out = self.MLP(x_in)
44 | return x_out
45 |
46 | def fit(self,
47 | epochs,
48 | trainloader,
49 | valloader,
50 | opt,
51 | sched,
52 | device,
53 | log_interval=10000,
54 | dropout_schedule=None,
55 | early_stop_loss=2.,
56 | dropout_interval=1000,
57 | tr_returns=None,
58 | val_returns=None
59 | ):
60 | if valloader:
61 | x_val, val_idx = next(iter(valloader))
62 | x_val_clip = x_val[:, :-1, :].to(device)
63 | y_val = val_returns.argmax(0)
64 | val_opt_rets, _ = val_returns.max(0)
65 | val_opt_rets = val_opt_rets.sum().item()
66 |
67 | for e in range(epochs):
68 | drop_idx, run_rets, run_loss, run_opt_rets = 0, 0., 0., 0.
69 | iterator = iter(trainloader)
70 | sched.step()
71 | for i in range(len(iterator)):
72 | opt.zero_grad()
73 | x, idx = next(iterator)
74 | x = x.to(device)
75 | x_clip = x[:, :-1, :]
76 | yhat = self(x_clip)
77 | loss, iter_rets, iter_opt_rets = expert_guided_loss(yhat, tr_returns, idx)
78 | run_loss += loss.item()
79 | # early stopping check:
80 | if run_loss / (i + 1) < early_stop_loss:
81 | print(f'Early stopping...')
82 | return None
83 |
84 | run_rets += iter_rets.item()
85 | run_opt_rets += iter_opt_rets.item()
86 | loss.backward()
87 | opt.step()
88 |
89 | if i % log_interval == 0:
90 |
91 | print(f'Epoch: {e}')
92 | print(f'Training Loss: {np.round(run_loss / (i + 1), 2)}')
93 | print(f'Avg train returns: {np.round(run_rets / (i + 1), 2)}')
94 | print(f'Avg train optimal returns: {np.round(run_opt_rets / (i + 1), 2)} \n')
95 | if valloader:
96 | yhat = self(x_val_clip)
97 | val_loss = nn.CrossEntropyLoss()(yhat, y_val).item()
98 | probs = nn.Softmax(1)(yhat)
99 | val_acc = cross_entropy_accuracy(probs, y_val).item()
100 | val_act = probs.argmax(1)
101 | val_mod_rets = (val_returns[val_act, val_idx]).sum().item()
102 |
103 | print(f'Validation Loss: {np.round(val_loss, 2)}')
104 | print(f'Validation Accuracy: {np.round(val_acc, 2)}')
105 | print(f'Avg val returns: {np.round(val_mod_rets, 2)}')
106 | print(f'Avg val optimal returns: {np.round(val_opt_rets, 2)} \n')
107 |             if e % dropout_interval == 0 and dropout_schedule:
108 | drop_idx += 1
109 | if drop_idx < len(dropout_schedule):
110 | self.MLP.drop_probability = dropout_schedule[drop_idx]
111 |                     self.recurrent_layers.dropout = dropout_schedule[drop_idx]
--------------------------------------------------------------------------------
/scripts/features.py:
--------------------------------------------------------------------------------
1 | """Contains constants required for DTCC and FX spot rate download/preprocessing.
2 | Column names and FX pair lists"""
3 |
4 | ALL_COLUMNS_DTCC = ['DISSEMINATION_ID', 'ORIGINAL_DISSEMINATION_ID', 'ACTION',
5 | 'EXECUTION_TIMESTAMP', 'CLEARED', 'INDICATION_OF_COLLATERALIZATION',
6 | 'INDICATION_OF_END_USER_EXCEPTION', 'INDICATION_OF_OTHER_PRICE_AFFECTING_TERM',
7 | 'BLOCK_TRADES_AND_LARGE_NOTIONAL_OFF-FACILITY_SWAPS', 'EXECUTION_VENUE',
8 | 'EFFECTIVE_DATE', 'END_DATE', 'DAY_COUNT_CONVENTION', 'SETTLEMENT_CURRENCY',
9 | 'ASSET_CLASS', 'SUB-ASSET_CLASS_FOR_OTHER_COMMODITY', 'TAXONOMY',
10 | 'PRICE_FORMING_CONTINUATION_DATA', 'UNDERLYING_ASSET_1', 'UNDERLYING_ASSET_2',
11 | 'PRICE_NOTATION_TYPE', 'PRICE_NOTATION', 'ADDITIONAL_PRICE_NOTATION_TYPE',
12 | 'ADDITIONAL_PRICE_NOTATION', 'NOTIONAL_CURRENCY_1', 'NOTIONAL_CURRENCY_2',
13 | 'ROUNDED_NOTIONAL_AMOUNT_1', 'ROUNDED_NOTIONAL_AMOUNT_2', 'PAYMENT_FREQUENCY_1',
14 | 'PAYMENT_FREQUENCY_2', 'RESET_FREQUENCY_1', 'RESET_FREQUENCY_2', 'PRICE_NOTATION2_TYPE',
15 | 'PRICE_NOTATION2', 'PRICE_NOTATION3_TYPE', 'PRICE_NOTATION3'
16 | ]
17 |
18 | NDF_COLUMNS_DTCC = ['DISSEMINATION_ID', 'ORIGINAL_DISSEMINATION_ID', 'ACTION', 'EXECUTION_TIMESTAMP',
19 | 'INDICATION_OF_OTHER_PRICE_AFFECTING_TERM',
20 | 'BLOCK_TRADES_AND_LARGE_NOTIONAL_OFF-FACILITY_SWAPS',
21 | 'EFFECTIVE_DATE', 'END_DATE', 'SETTLEMENT_CURRENCY',
22 | 'NOTIONAL_CURRENCY_1', 'NOTIONAL_CURRENCY_2',
23 | 'ROUNDED_NOTIONAL_AMOUNT_1', 'ROUNDED_NOTIONAL_AMOUNT_2'
24 | ]
25 |
26 | NDF_BOOL_COLUMNS_DTCC = ['INDICATION_OF_OTHER_PRICE_AFFECTING_TERM',
27 | 'BLOCK_TRADES_AND_LARGE_NOTIONAL_OFF-FACILITY_SWAPS']
28 |
29 | NDF_CONT_COLUMNS_DTCC = ['ORIGINAL_DISSEMINATION_ID', 'ROUNDED_NOTIONAL_AMOUNT_1',
30 | 'ROUNDED_NOTIONAL_AMOUNT_2']
31 |
32 | NDF_CATEGORY_COLUMNS_DTCC = ['SETTLEMENT_CURRENCY', 'NOTIONAL_CURRENCY_1', 'NOTIONAL_CURRENCY_2']
33 | NDF_DATE_COLUMNS_DTCC = ['EFFECTIVE_DATE', 'END_DATE']
34 | NDF_TIMESTAMP_COLUMNS_DTCC = ['EXECUTION_TIMESTAMP']
35 | NDF_CURRENCIES_DTCC = ['KRW', 'TWD', 'MYR', 'IDR', 'PHP', 'CNY', 'INR', 'USD']
36 |
37 | FEATURES_DTCC = {'all': ALL_COLUMNS_DTCC,
38 | 'all_ndf': NDF_COLUMNS_DTCC,
39 | 'continuous': NDF_CONT_COLUMNS_DTCC,
40 | 'bool': NDF_BOOL_COLUMNS_DTCC,
41 | 'timestamp': NDF_TIMESTAMP_COLUMNS_DTCC,
42 | 'date': NDF_DATE_COLUMNS_DTCC,
43 | 'category': NDF_CATEGORY_COLUMNS_DTCC,
44 | 'currencies': NDF_CURRENCIES_DTCC
45 | }
46 |
47 |
48 | COLUMNS_SPOT = ['Date', 'USDKRW', 'USDTWD', 'USDMYR', 'USDIDR', 'USDPHP', 'USDCNY',
49 | 'USDCNH', 'USDINR']
50 | DATE_COLUMNS_SPOT = ['Date']
51 | CONT_COLUMNS_SPOT = ['USDKRW', 'USDTWD', 'USDMYR', 'USDIDR', 'USDPHP', 'USDCNY',
52 | 'USDCNH', 'USDINR']
53 |
54 | FEATURES_SPOT = {'all': COLUMNS_SPOT,
55 | 'continuous': CONT_COLUMNS_SPOT,
56 | 'datetime': DATE_COLUMNS_SPOT
57 | }
58 |
59 | ALL_COLUMNS_OANDA_SPOT = ['PAIR_CODE', 'FREQUENCY', 'OCCURRED_AT',
60 | 'OPEN_BID', 'OPEN_ASK', 'HIGH_BID', 'HIGH_ASK', 'LOW_BID', 'LOW_ASK', 'CLOSE_BID',
61 | 'CLOSE_ASK', 'TOTAL_TICKS']
62 |
63 | CONT_COLUMNS_OANDA_SPOT = ['OPEN_BID', 'OPEN_ASK', 'HIGH_BID', 'HIGH_ASK', 'LOW_BID',
64 | 'LOW_ASK', 'CLOSE_BID', 'CLOSE_ASK', 'TOTAL_TICKS']
65 |
66 | TIMESTAMP_COLUMNS_OANDA_SPOT = ['OCCURRED_AT']
67 |
68 | CURRENCIES_OANDA_SPOT = ['EUR_USD', 'GBP_USD', 'USD_CAD', 'USD_CHF', 'USD_JPY', 'EUR_GBP', 'EUR_CHF',
69 | 'AUD_USD', 'EUR_JPY', 'GBP_JPY', 'EUR_AUD', 'EUR_CZK', 'EUR_HUF', 'EUR_NZD',
70 | 'EUR_SEK', 'EUR_SGD', 'EUR_CAD', 'EUR_DKK', 'EUR_NOK', 'EUR_PLN', 'EUR_TRY',
71 | 'EUR_ZAR', 'USD_CNH', 'USD_DKK', 'USD_HUF', 'USD_MXN', 'USD_PLN', 'USD_SEK',
72 | 'USD_THB', 'USD_ZAR', 'USD_CZK', 'USD_HKD', 'USD_INR', 'USD_NOK', 'USD_SAR',
73 | 'USD_SGD', 'USD_TRY', 'GBP_AUD', 'GBP_CHF', 'GBP_ZAR', 'GBP_SGD', 'AUD_JPY',
74 | 'AUD_SGD', 'CAD_JPY', 'CHF_JPY', 'NZD_CAD', 'NZD_USD', 'SGD_JPY', 'ZAR_JPY',
75 | 'GBP_CAD', 'GBP_NZD', 'GBP_PLN', 'AUD_CAD', 'AUD_NZD', 'CAD_CHF', 'CAD_SGD',
76 | 'CHF_ZAR', 'NZD_JPY', 'NZD_SGD', 'TRY_JPY'
77 | ]
78 |
79 | FEATURES_OANDA_SPOT = {'all': ALL_COLUMNS_OANDA_SPOT,
80 | 'continuous': CONT_COLUMNS_OANDA_SPOT,
81 | 'timestamp': TIMESTAMP_COLUMNS_OANDA_SPOT,
82 | 'currencies': CURRENCIES_OANDA_SPOT
83 | }
84 |
--------------------------------------------------------------------------------
/torchNDF/data/datahandler.py:
--------------------------------------------------------------------------------
1 | import natsort
2 | import torch
3 | import pandas as pd
4 | from .pandas_utils import mu_law_encode
5 | from ..utils import one_hot
6 |
7 |
8 | class FXDataHandler:
9 | def __init__(self,
10 | fpath=None):
11 | if fpath:
12 | self.data = pd.read_pickle(f'{fpath}')
13 | self._is_split = False
14 | self._is_quantized = False
15 | self._is_tensor = False
16 |
17 | def assign_data(self, data):
18 | self.data = data
19 |
20 | def split_train_val_test(self, percentage_split):
21 | assert sum(percentage_split) == 100, f'Percentage splits sum {sum(percentage_split)} not equal to 100'
22 | split_d = []
23 | run_idx = 0
24 | for p in percentage_split:
25 | idx = p * len(self.data) // 100
26 | split_d.append(self.data.iloc[run_idx: run_idx + idx])
27 | run_idx += idx
28 | self.train = split_d[0]
29 | self.val = split_d[1]
30 | self.test = split_d[2]
31 | self._is_split = True
32 |
33 | def normalize(self, window=30):
34 | assert not self._is_split and not self._is_tensor, 'normalize before splitting and tensor transforms'
35 | if not window:
36 | self.data = (self.data - self.data.mean()) / self.data.std()
37 | else:
38 | self.data = (self.data - self.data.rolling(window, min_periods=1).mean()) / \
39 | self.data.rolling(window, min_periods=1).std()
40 | self.data = self.data.dropna()
41 |
42 | def to_percentage_change(self, multiplier=1, diff_degree=1):
43 | self.data = multiplier * self.data.pct_change(diff_degree).dropna()
44 | if self._is_split:
45 | self.train = multiplier * self.train.pct_change(diff_degree).dropna()
46 | self.val = multiplier * self.val.pct_change(diff_degree).dropna()
47 | self.test = multiplier * self.test.pct_change(diff_degree).dropna()
48 |
49 | def encode(self,
50 | scheme='mu',
51 | n_bins=4,
52 | shift=True,
53 | norm=False,
54 | clip=True):
55 |
56 | if scheme == 'mu':
57 | half = n_bins // 2
58 | self.data = mu_law_encode(self.data, n_bins, norm)
59 | if self._is_split:
60 | self.train = mu_law_encode(self.train, n_bins, norm)
61 | self.val = mu_law_encode(self.val, n_bins, norm)
62 | self.test = mu_law_encode(self.test, n_bins, norm)
63 | if shift:
64 | self.data -= half
65 | if self._is_split:
66 | self.train -= half
67 | self.val -= half
68 | self.test -= half
69 | if clip:
70 | self._clip(n_bins, shift)
71 | self._is_quantized = True
72 |
73 | else:
74 | raise NotImplementedError
75 |
76 | def _clip(self, n_bins, shift):
77 | half = n_bins // 2
78 | if shift:
79 | self.data.clip(-half, half - 1, inplace=True)
80 | if self._is_split:
81 | self.train.clip(-half, half - 1, inplace=True)
82 | self.val.clip(-half, half - 1, inplace=True)
83 | self.test.clip(-half, half - 1, inplace=True)
84 | else:
85 | self.data.clip(0, n_bins - 1, inplace=True)
86 | if self._is_split:
87 | self.train.clip(0, n_bins - 1, inplace=True)
88 | self.val.clip(0, n_bins - 1, inplace=True)
89 | self.test.clip(0, n_bins - 1, inplace=True)
90 |
91 | def to_tensor(self, device):
92 | if self._is_tensor: return
93 | if self._is_quantized:
94 | self.data = torch.IntTensor(self.data.values).to(device)
95 | if self._is_split:
96 | self.train = torch.IntTensor(self.train.values).to(device)
97 | self.val = torch.IntTensor(self.val.values).to(device)
98 | self.test = torch.IntTensor(self.test.values).to(device)
99 | else:
100 | self.data = torch.FloatTensor(self.data.values).to(device)
101 | if self._is_split:
102 | self.train = torch.FloatTensor(self.train.values).to(device)
103 | self.val = torch.FloatTensor(self.val.values).to(device)
104 | self.test = torch.FloatTensor(self.test.values).to(device)
105 | self._is_tensor = True
106 |
107 | def one_hot_transform(self, n_bins):
108 | assert self._is_tensor, f'one_hot only implemented for torch Tensors'
109 | self.data = one_hot(self.data, dim=n_bins)
110 | if self._is_split:
111 | self.train = one_hot(self.train, dim=n_bins)
112 | self.val = one_hot(self.val, dim=n_bins)
113 | self.test = one_hot(self.test, dim=n_bins)
114 |
115 | @property
116 | def datasets(self):
117 | if self._is_split:
118 | return self.train, self.val, self.test
119 | else:
120 | return self.data
121 |
122 |     def rolling_serve(self, index:int, window:int, stride:int):
123 |         # minimal sketch (the original body was an unfinished stub): yield rolling
124 |         # windows of `self.data` starting at `index`, advancing by `stride` each step
125 |         for start in range(index, len(self.data) - window + 1, stride):
126 |             yield self.data[start:start + window]
--------------------------------------------------------------------------------
/torchNDF/models/data/datahandler.py:
--------------------------------------------------------------------------------
1 | import natsort
2 | import torch
3 | import pandas as pd
4 | from .pandas_utils import mu_law_encode
5 | from ..utils import one_hot
6 |
7 |
8 | class FXDataHandler:
9 | def __init__(self,
10 | fpath=None):
11 | if fpath:
12 | self.data = pd.read_pickle(f'{fpath}')
13 | self._is_split = False
14 | self._is_quantized = False
15 | self._is_tensor = False
16 |
17 | def assign_data(self, data):
18 | self.data = data
19 |
20 | def split_train_val_test(self, percentage_split):
21 | assert sum(percentage_split) == 100, f'Percentage splits sum {sum(percentage_split)} not equal to 100'
22 | split_d = []
23 | run_idx = 0
24 | for p in percentage_split:
25 | idx = p * len(self.data) // 100
26 | split_d.append(self.data.iloc[run_idx: run_idx + idx])
27 | run_idx += idx
28 | self.train = split_d[0]
29 | self.val = split_d[1]
30 | self.test = split_d[2]
31 | self._is_split = True
32 |
33 | def normalize(self, window=30):
34 | assert not self._is_split and not self._is_tensor, 'normalize before splitting and tensor transforms'
35 | if not window:
36 | self.data = (self.data - self.data.mean()) / self.data.std()
37 | else:
38 | self.data = (self.data - self.data.rolling(window, min_periods=1).mean()) / \
39 | self.data.rolling(window, min_periods=1).std()
40 | self.data = self.data.dropna()
41 |
42 | def to_percentage_change(self, multiplier=1, diff_degree=1):
43 | self.data = multiplier * self.data.pct_change(diff_degree).dropna()
44 | if self._is_split:
45 | self.train = multiplier * self.train.pct_change(diff_degree).dropna()
46 | self.val = multiplier * self.val.pct_change(diff_degree).dropna()
47 | self.test = multiplier * self.test.pct_change(diff_degree).dropna()
48 |
49 | def encode(self,
50 | scheme='mu',
51 | n_bins=4,
52 | shift=True,
53 | norm=False,
54 | clip=True):
55 |
56 | if scheme == 'mu':
57 | half = n_bins // 2
58 | self.data = mu_law_encode(self.data, n_bins, norm)
59 | if self._is_split:
60 | self.train = mu_law_encode(self.train, n_bins, norm)
61 | self.val = mu_law_encode(self.val, n_bins, norm)
62 | self.test = mu_law_encode(self.test, n_bins, norm)
63 | if shift:
64 | self.data -= half
65 | if self._is_split:
66 | self.train -= half
67 | self.val -= half
68 | self.test -= half
69 | if clip:
70 | self._clip(n_bins, shift)
71 | self._is_quantized = True
72 |
73 | else:
74 | raise NotImplementedError
75 |
76 | def _clip(self, n_bins, shift):
77 | half = n_bins // 2
78 | if shift:
79 | self.data.clip(-half, half - 1, inplace=True)
80 | if self._is_split:
81 | self.train.clip(-half, half - 1, inplace=True)
82 | self.val.clip(-half, half - 1, inplace=True)
83 | self.test.clip(-half, half - 1, inplace=True)
84 | else:
85 | self.data.clip(0, n_bins - 1, inplace=True)
86 | if self._is_split:
87 | self.train.clip(0, n_bins - 1, inplace=True)
88 | self.val.clip(0, n_bins - 1, inplace=True)
89 | self.test.clip(0, n_bins - 1, inplace=True)
90 |
91 | def to_tensor(self, device):
92 | if self._is_tensor: return
93 | if self._is_quantized:
94 | self.data = torch.IntTensor(self.data.values).to(device)
95 | if self._is_split:
96 | self.train = torch.IntTensor(self.train.values).to(device)
97 | self.val = torch.IntTensor(self.val.values).to(device)
98 | self.test = torch.IntTensor(self.test.values).to(device)
99 | else:
100 | self.data = torch.FloatTensor(self.data.values).to(device)
101 | if self._is_split:
102 | self.train = torch.FloatTensor(self.train.values).to(device)
103 | self.val = torch.FloatTensor(self.val.values).to(device)
104 | self.test = torch.FloatTensor(self.test.values).to(device)
105 | self._is_tensor = True
106 |
107 | def one_hot_transform(self, n_bins):
108 | assert self._is_tensor, f'one_hot only implemented for torch Tensors'
109 | self.data = one_hot(self.data, dim=n_bins)
110 | if self._is_split:
111 | self.train = one_hot(self.train, dim=n_bins)
112 | self.val = one_hot(self.val, dim=n_bins)
113 | self.test = one_hot(self.test, dim=n_bins)
114 |
115 | @property
116 | def datasets(self):
117 | if self._is_split:
118 | return self.train, self.val, self.test
119 | else:
120 | return self.data
121 |
122 |     def rolling_serve(self, index:int, window:int, stride:int):
123 |         # minimal sketch (the original body was an unfinished stub): yield rolling
124 |         # windows of `self.data` starting at `index`, advancing by `stride` each step
125 |         for start in range(index, len(self.data) - window + 1, stride):
126 |             yield self.data[start:start + window]
--------------------------------------------------------------------------------
/torchNDF/metrics.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 | from torch.utils.data import DataLoader
6 | from .utils import seq_normalization
7 |
8 | def cross_entropy_accuracy(preds, targets):
9 | """Accuracy for regular CrossEntropyLoss"""
10 | _, preds = torch.max(preds, dim=1)
11 | acc = 100*(preds == targets).float().mean()
12 | return acc
13 |
14 | def weighted_cross_entropy_accuracy(preds:torch.Tensor, targets:torch.Tensor, weights:torch.Tensor):
15 | """Accuracy function for unbalanced classes"""
16 | preds, targets, weights = preds.cpu(), targets.cpu(), weights.cpu()
17 | _, preds = torch.max(preds, dim=1)
18 | weighted_preds = copy.deepcopy(preds).to(dtype=torch.float)
19 | weighted_targets = copy.deepcopy(targets).to(dtype=torch.float)
20 | weighted_preds.apply_(lambda x: weights[int(x)])
21 | weighted_targets.apply_(lambda x: weights[int(x)])
22 | are_equal = (preds == targets).to(dtype=torch.float)
23 | acc = 100 * torch.sum(weighted_preds * are_equal)/torch.sum(weighted_targets)
24 | return acc
25 |
26 | class BaselinePerfContainer:
27 | """Simple container of performance metrics"""
28 | def __init__(self):
29 | self.returns = 0
30 | self.return_list = []
31 | self.volatility = 0.
32 | self.activity = 0
33 | self.days_held = 0
34 | self.pos_acc = 0.
35 | self.opt_acc = 0.
36 | def __repr__(self):
37 | return f'Returns {round(self.returns, 2)}\n' + \
38 | f'Standard dev of returns {round(self.volatility, 4)}\n' + \
39 | f'Cumulative sum of tenors {self.days_held}\n' + \
40 | f'Number of Buys {self.activity}\n' + \
41 | f'Positive return accuracy {round(self.pos_acc, 2)}\n' + \
42 | f'Optimal return accuracy {round(self.opt_acc, 2)}\n'
43 |
44 | def positive_return_accuracy(preds:np.ndarray, returns:np.ndarray):
45 | """Computes accuracy against positive return tenor actions"""
46 | count = 0
47 | for i in range(len(returns[0])):
48 | if returns[int(preds[i]),i] >= 0:
49 | count += 1
50 | return 100*count/len(returns[0])
51 |
52 | def returns_and_activity(returns: torch.Tensor, predictions: torch.Tensor = None, baseline: str = None,
53 | input_data: torch.Tensor = None, confidence_multipliers: torch.Tensor = None,
54 | frictional_check: bool = False, frictional_act: torch.Tensor = None):
55 | """Calculate trading returns of model given `predictions` or returns of one of the following baselines:
56 | `random`: uniform random action
57 | `expert`: expert positive return trades computed from `input_data` labels
58 | confidence_multipliers, if given as input, are used to weigh model returns"""
59 | options = ['last_seen', 'expert', 'random', None]
60 | assert (predictions is not None) | (baseline is not None), \
61 | 'If not using a standard model please choose a baseline'
62 | assert baseline in options, f'{baseline} not supported'
63 |
64 | if baseline == 'random':
65 | # random action each step
66 | predictions = np.random.choice(np.arange(0, 91), len(input_data), replace=True)
67 | predictions = torch.Tensor(predictions).to(dtype=torch.long)
68 |
69 | container = BaselinePerfContainer()
70 | for idx, act in enumerate(predictions):
71 | # if action not `Hold`
72 | if act != 0:
73 | act = int(act)
74 | if frictional_check:
75 | if frictional_act[idx] != act: continue
76 | if confidence_multipliers is not None:
77 | container.return_list.append(confidence_multipliers[idx].item() * returns[act, idx])
78 | else:
79 | container.return_list.append(returns[act, idx])
80 | container.days_held += act
81 | else:
82 | container.return_list.append(0.)
83 | container.return_list = np.array(container.return_list)
84 |
85 | container.activity = torch.numel(predictions.nonzero())
86 | container.volatility = container.return_list.std()
87 | container.returns = container.return_list.sum()
88 | _, optimal_actions = returns.max(0)
89 | container.opt_acc = 100 * (predictions == optimal_actions).to(dtype=torch.float).mean().item()
90 | container.pos_acc = positive_return_accuracy(predictions, returns)
91 | return container
92 |
93 | def ensemble_single_model_returns(ensemble:list, dataloader:DataLoader, returns:torch.Tensor, device:torch.device):
94 | """Returns for each model in an ensemble"""
95 | data, _ = next(iter(dataloader))
96 | performances = []
97 | for model in ensemble:
98 | data[:, :-1, :-1] = seq_normalization(data[:, :-1, :-1])
99 | yhat = model(data[:, :-1, :-1].to(device))
100 | probs = nn.Softmax(1)(yhat)
101 | _, predictions = torch.max(probs, 1)
102 | return_model = 0
103 | for idx, el in enumerate(predictions):
104 | # if action not `Hold`
105 | if el != 0:
106 | return_model += returns[el, idx]
107 | performances.append(return_model)
108 | return performances
--------------------------------------------------------------------------------
/torchNDF/models/mlp.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class MLP(torch.nn.Module):
5 | def __init__(self,
6 | input_dimension,
7 | output_dimension,
8 | num_neurons=[128, 128, 128],
9 | input_normalization=0,
10 | hidden_activation='relu',
11 | out_activation=None,
12 | out_softmax=False,
13 | drop_probability=0.0,
14 | init='kaiming_normal'):
15 | """
16 | :num_neurons: number of neurons for each layer
17 | :out_activation: output layer's activation unit
18 | :input_normalization: input normalization behavior flag
19 | 0: Do not normalize, 1: Batch normalization, 2: Layer normalization
20 | :hidden_activation: hidden layer activation units. supports 'relu','SELU','leaky_relu','sigmoid', 'tanh'
21 | :init: hidden layer initialization. supports 'kaiming_normal'
22 | """
23 |
24 | super().__init__()
25 | self.layers = torch.nn.ModuleList()
26 | self.input_dimension = input_dimension
27 | self.output_dimension = output_dimension
28 | self.out_activation = out_activation
29 | self.out_softmax = out_softmax
30 | self.input_normalization = input_normalization
31 | self.hidden_activation = hidden_activation
32 | self.drop_probability = drop_probability
33 | self.init = init
34 |
35 | # infer normalization layers
36 | if self.input_normalization == 0:
37 | pass
38 | else:
39 | if self.input_normalization == 1:
40 | norm_layer = torch.nn.BatchNorm1d(self.input_dimension)
41 | elif self.input_normalization == 2:
42 | norm_layer = torch.nn.LayerNorm(self.input_dimension)
43 | self.layers.append(norm_layer)
44 |
45 | self.layers.append(torch.nn.Linear(self.input_dimension, num_neurons[0])) # input -> hidden 1
46 | for i, num_neuron in enumerate(num_neurons[:-1]):
47 | hidden_layer = torch.nn.Linear(num_neuron, num_neurons[i + 1])
48 | self.apply_weight_init(hidden_layer, self.init)
49 | self.layers.append(hidden_layer)
50 | last_layer = torch.nn.Linear(num_neurons[-1], self.output_dimension)
51 | self.apply_weight_init(last_layer, self.init)
52 | self.layers.append(last_layer) # hidden_n -> output
53 |
54 | def forward(self, x):
55 |         if self.input_normalization != 0: # the first layer is a normalization layer
56 | out = self.layers[0](x)
57 | for layer in self.layers[1:-1]: # Linear layer starts from layers[1]
58 | out = layer(out)
59 | if self.drop_probability > 0.0:
60 | out = self.infer_dropout(self.drop_probability)(out) # Apply dropout
61 | out = self.infer_activation(self.hidden_activation)(out)
62 |
63 | out = self.layers[-1](out) # The last linear layer
64 | if self.out_activation is None:
65 | pass
66 | else:
67 | out = self.infer_activation(self.out_activation)(out)
68 | else:
69 | out = x
70 | for layer in self.layers[:-1]:
71 | out = layer(out)
72 | if self.drop_probability > 0.0:
73 | out = self.infer_dropout(self.drop_probability)(out)
74 | out = self.infer_activation(self.hidden_activation)(out)
75 |
76 | out = self.layers[-1](out)
77 | # infer output activation units
78 | if self.out_activation is None:
79 | pass
80 | else:
81 | out = self.infer_activation(self.out_activation)(out)
82 | if self.out_softmax:
83 |             out = nn.Softmax(dim=1)(out)
84 | return out
85 |
86 | def apply_weight_init(self, tensor, init_method=None):
87 | if init_method is None:
88 | pass # do not apply weight init
89 | elif init_method == "normal":
90 | torch.nn.init.normal_(tensor.weight, std=0.3)
91 | torch.nn.init.constant_(tensor.bias, 0.0)
92 | elif init_method == "kaiming_normal":
93 | torch.nn.init.kaiming_normal_(tensor.weight, nonlinearity=self.hidden_activation)
94 | torch.nn.init.constant_(tensor.bias, 0.0)
95 |
96 | def infer_activation(self, activation):
97 | if activation == 'relu':
98 | ret = torch.nn.ReLU()
99 | elif activation == 'sigmoid':
100 | ret = torch.nn.Sigmoid()
101 | elif activation == 'SELU':
102 | ret = torch.nn.SELU()
103 | elif activation == 'leaky_relu':
104 | ret = torch.nn.LeakyReLU()
105 | elif activation == 'tanh':
106 | ret = torch.nn.Tanh()
107 | elif activation == 'prelu':
108 | ret = torch.nn.PReLU()
109 | else:
110 | raise RuntimeError("Given {} activation is not supported".format(self.out_activation))
111 | return ret
112 |
113 | @staticmethod
114 | def infer_dropout(p):
115 | if p >= 0.0:
116 | ret = torch.nn.Dropout(p=p)
117 | return ret
--------------------------------------------------------------------------------
/torchNDF/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | def seq_normalization(x:torch.Tensor, col_idx:int=145):
6 | """Normalizes a Tensor across its temporal (1) dimension `N. H, W` up to `col_idx`
7 | in the `W` dimension"""
8 | epsilon = 1e-10
9 | m_x = x[:,:,:col_idx]
10 | x[:,:,:col_idx] = (m_x - m_x.mean(1).unsqueeze(1))/(m_x.std(1).unsqueeze(1)+epsilon)
11 | return x
12 |
13 | def masked_temperature_softmax(x:torch.Tensor, idx:int, T:float=0.01):
14 | """Temperature scaled softmax on torch.Tensor masked by indices `idx`"""
15 | return torch.exp(x[idx, :] / T) / torch.exp(x[idx, :] / T).sum()
16 |
17 | def scaled_cross_entropy(preds:torch.Tensor, labels, order_penalty=0.2, margin=2, margin_penalty=0.1):
18 | """
19 |     Computes 2D soft cross entropy loss with asymmetric scaling between `preds` and `labels`.
20 |
21 | Args:
22 | preds (torch.Tensor): shape `N, C, H, W`
23 | labels (torch.Tensor): shape `N, H, W`
24 |         order_penalty (float): percentage penalty applied to predictions with an index greater than the
25 |         ground truth label
26 |         margin (int): maximum distance without penalty between ground truth index and prediction index
27 |         margin_penalty (float): percentage penalty applied to predictions with an index outside `margin`
28 |
29 | Returns:
30 | loss (torch.Tensor):
31 | """
32 | loss = 0
33 | # loop through samples
34 | for i in range(preds.size(0)):
35 | # loop through `H` dim
36 | for j in range(preds.size(2)):
37 | # loop through `W` dim
38 | for k in range(preds.size(3)):
39 | # weight vector of length num. classes `C`
40 | w = preds.new_ones(preds.size(1))
41 | positive_label = labels[i, j, k].data
42 | w[positive_label:] += order_penalty
43 | if positive_label > margin:
44 | w[:positive_label - margin] += margin_penalty
45 | if positive_label < preds.size(1) - margin:
46 |                     w[positive_label + margin:] += margin_penalty
47 | loss += F.cross_entropy(preds[None, i, :, j, k], labels[None, i, j, k,], weight=w)
48 | loss /= torch.numel(labels)
49 | return loss
50 |
51 | def expert_guided_loss(yhat: torch.Tensor, returns: torch.Tensor, index: torch.Tensor):
52 | """Compute CrossEntropyLoss between `yhat` and optimal return actions given `returns`"""
53 | probs = nn.Softmax(1)(yhat)
54 | action = torch.argmax(probs, 1)
55 | model_return = returns[action, index].sum(0)
56 | optimal_action = returns[:, index].argmax(0)
57 | optimal_return, _ = (returns[:, index].max(0))
58 | optimal_return = optimal_return.sum()
59 | loss = nn.CrossEntropyLoss()(yhat, optimal_action)
60 | return loss, model_return, optimal_return
61 |
62 | def risk_penalty(x:torch.Tensor, returns:torch.Tensor):
63 | """Linearly scaling penalty for long tenor returns"""
64 | x = torch.sign(x) * (torch.abs(x) * torch.linspace(1, 0.5, returns.size(0)).unsqueeze(1).type_as(x))
65 | return x
66 |
67 | def expert_risk_aware_loss(yhat: torch.Tensor, returns: torch.Tensor, index: torch.Tensor):
68 | """Compute CrossEntropyLoss between `yhat` and optimal return actions given `returns`"""
69 | probs = nn.Softmax(1)(yhat)
70 | action = torch.argmax(probs, 1)
71 | model_return = returns[action, index].sum(0)
72 | mask = (returns[:, index] > 0)
73 | optimal_action = mask.argmax(0)
74 | optimal_return, _ = (returns[optimal_action, index].max(0))
75 | optimal_return = optimal_return.sum()
76 | loss = nn.CrossEntropyLoss()(yhat, optimal_action)
77 | return loss, model_return, optimal_return
78 |
79 | def probabilistic_expert_guided_loss(yhat:torch.Tensor, returns:torch.Tensor, index:torch.Tensor):
80 | """Compute CrossEntropyLoss between `yhat` and optimal return actions given `returns`"""
81 | action = torch.argmax(yhat, 1)
82 | model_return = returns[action, index].sum(0)
83 | optimal_distrib = masked_temperature_softmax(returns.transpose(0, 1), index) # returns: `W, H` -> `H, W`
84 | expert_sampled_action = torch.distributions.Multinomial(1, probs=optimal_distrib).sample()
85 | optimal_action = expert_sampled_action.argmax(0)
86 | optimal_return, _ = (returns[:, index].max(0))
87 | optimal_return = optimal_return.sum()
88 | loss = nn.CrossEntropyLoss()(yhat, optimal_action)
89 | return loss, model_return, optimal_return
90 |
91 | def one_hot(input_data:torch.Tensor, dim:int):
92 | """ Turns input_data of shape `H, W` with integer entries into a tensor of shape `H, C, W` where
93 | `C` is the one-hot encoding dimension. """
94 | res = []
95 | n_channels = input_data.size(1)
96 | offset = input_data.min()
97 | length = dim
98 | for channel_idx in range(n_channels):
99 | channel_one_hot = []
100 | channel = input_data[:, channel_idx]
101 | for entry in channel:
102 | one_hot_x = torch.zeros(length)
103 | one_hot_x[entry+offset] = 1
104 | channel_one_hot.append(one_hot_x)
105 | channel_one_hot = torch.cat(channel_one_hot)
106 | channel_one_hot = channel_one_hot.reshape(-1, length)
107 | res.append(channel_one_hot.unsqueeze(2))
108 | res = torch.cat(res, dim=2)
109 | return res
--------------------------------------------------------------------------------
/torchNDF/vis.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from sklearn.decomposition import PCA
5 |
6 | def plot_cumsum_explained_variance(pca:PCA):
7 | """Plots cumulative sum of explained variance as a function of the number
8 | of PCA factors """
9 | c = pca.explained_variance_ratio_
10 | plt.plot(np.cumsum(c))
11 | plt.ylabel('explained variance')
12 | plt.xlabel('number factors')
13 |
14 | def plot_trade_mode_embeddings(embeddings:torch.Tensor, label_idxs:torch.Tensor, dims_to_plot:int=10,
15 | save_fpath:str=None):
16 | """Plots first `dims_to_plot` of embeddings labelled by model tenor actions"""
17 | plt.figure(figsize=(30,75))
18 | # action ranges (by tenor groups): 0 -> Hold, 1:91 -> Buy
19 | action_ranges = [slice(0,1), slice(10,30), slice(30,60), slice(60,91)]
20 | for dim in range(dims_to_plot):
21 | plt.subplot(10,3,1+dim)
22 | for i in range(4):
23 | action_idxs = np.concatenate(label_idxs[action_ranges[i]])
24 | plt.scatter(embeddings[action_idxs, dim],
25 | embeddings[action_idxs,dim+1], s=12.8, alpha=0.6);
26 | plt.xlabel(f'Embedding dimension: {dim}')
27 | plt.ylabel(f'Embedding dimension: {dim+1}')
28 | plt.legend(['Hold', 'Buy_short', 'Buy_med', 'Buy_long'])
29 | if save_fpath: plt.savefig(f'{save_fpath}.jpg', dpi=200)
30 |
31 | def plot_activity(predictions:torch.Tensor, model_name:str, curr_name:str, expert_labels:np.ndarray, returns:np.ndarray,
32 | save_fpath:str=None):
33 | """Plots trading activity of experts and model"""
34 | n_preds = len(predictions)
35 |     assert n_preds == len(expert_labels) == len(returns[0]), 'Number of predictions has to match the number ' + \
36 |                                                               'of expert labels and size(1) of the return gradient'
37 | plt.figure(figsize=(20, 20))
38 | ax = plt.subplot(211)
39 | ax.scatter(list(range(n_preds)),
40 | expert_labels, alpha=1, color='r', edgecolor='black' , s=0.7);
41 | im = ax.matshow(returns, alpha=0.6, cmap='RdYlGn', aspect='auto', origin='lower', extent=[0,n_preds,-0.1,90.1],
42 | vmin=-0.08, vmax=0.08)
43 | plt.colorbar(im)
44 | ax.set_yticks(np.arange(0, 90, 10))
45 | ax.set_yticklabels(['Hold', 'Buy<10d', 'Buy<20d', 'Buy<30d', 'Buy<40d', 'Buy<50d', 'Buy<60d', \
46 | 'Buy<70d', 'Buy<80d', 'Buy<90d'])
47 | ax.set_xlabel('Trading Days')
48 | plt.title(f'DTCC (oracle) `{curr_name}`, background: {curr_name} return gradient, 1 day tenor (bottom) to 90 days (top)')
49 |
50 | ax = plt.subplot(212)
51 | ax.scatter(list(range(n_preds)),
52 | predictions.cpu().numpy(), alpha=1, color='y', edgecolor='black', s=0.7)
53 | im = ax.matshow(returns, alpha=0.6, cmap='RdYlGn', aspect='auto', origin='lower', extent=[0,n_preds,-0.1,90.1],
54 | vmin=-0.08, vmax=0.08)
55 | plt.colorbar(im)
56 | ax.set_yticks(np.arange(0, 90, 10))
57 | ax.set_yticklabels(['Hold', 'Buy<10d', 'Buy<20d', 'Buy<30d', 'Buy<40d', 'Buy<50d', 'Buy<60d', \
58 | 'Buy<70d', 'Buy<80d', 'Buy<90d'])
59 | ax.set_xlabel('Trading Days')
60 | plt.title(f'{model_name} on `{curr_name}`, background: {curr_name} return gradient, 1 day tenor (bottom) to 90 days (top)')
61 | if save_fpath: plt.savefig(f'{save_fpath}.jpg', dpi=200)
62 |
63 | def plot_explaining_currency(gradients: torch.Tensor, sequence_idxs: torch.Tensor,
64 | spots: np.array, volatility: np.array, mode: str = 'min',
65 | n_explaining_currencies: int = 6):
66 | plt.figure(figsize=(30, 10))
67 |
68 |     # `N, H, W`: obtain indices of minimum or maximum gradients w.r.t. input spot rates.
69 |     # NOTE: `TECH_START_IDX` (first technical-indicator column) and `corr` (per-currency
70 |     # correlations) are assumed to be defined by the calling notebook/script.
71 | # average across sequence and batch samples
72 | gradients = gradients.mean(0).mean(1).cpu().numpy()
73 |
74 | # `N`
75 | # sort indices by gradient, then slice bottom `n_explaining_currencies`
76 | # or top `n_explaining_currencies` (at the start of the sorted arr)
77 | if mode == 'min':
78 | idx = np.argsort(gradients[:TECH_START_IDX])[:n_explaining_currencies]
79 | elif mode == 'max':
80 | idx = np.argsort(gradients[:TECH_START_IDX])[-n_explaining_currencies:]
81 | elif mode == 'abs':
82 | gradients = abs(gradients)
83 | idx = np.argsort(gradients[:TECH_START_IDX])[-n_explaining_currencies:]
84 | for i in range(n_explaining_currencies):
85 | plt.subplot(2, 3, 1 + i)
86 | plt.xticks([])
87 | plt.yticks([])
88 | curr_idx = idx[i].item()
89 | curr_n = spots.columns[curr_idx]
90 | # single currency time series
91 | ts = spots.iloc[:, curr_idx]
92 | plt.title(rf'{curr_n} $\rho$: {corr[i]}', fontsize=30)
93 | ts.plot()
94 | plt.pcolor(spots.index, [ts.min(), ts.max() + 0.5],
95 | abs(volatility.iloc[:, curr_idx]).values[np.newaxis], alpha=0.3, cmap='Greens')
96 | cb = plt.colorbar()
97 | cb.ax.tick_params(labelsize=20)
98 | for el in sequence_idxs:
99 | el = el.cpu().item()
100 | # plot dot where tenor action is taken
101 | plt.scatter(spots.index[30 + el], ts[30 + el], s=20, color='black', zorder=4, marker='p')
102 | ts[el:30 + el].plot(color='red', alpha=0.4)
--------------------------------------------------------------------------------
/torchNDF/data/pandas_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import datetime
4 | from statsmodels.tsa.arima_model import ARIMA
5 | import logging
6 |
7 |
8 | def spotrate_lookup(dataframe: pd.DataFrame, date: datetime.date):
9 | """ Returns the row in dataframe with index closest to `date`"""
10 | try:
11 | sliced_dataframe = dataframe[dataframe.index.year == date.year]
12 | seq = np.array(abs(sliced_dataframe.index.day - date.day) +
13 | 30 * (abs((sliced_dataframe.index.month - date.month))))
14 | idx = seq.argmin()
15 | res = sliced_dataframe.iloc[idx]
16 | except KeyError:
17 | logging.info(f'{date.year} not present in dataframe')
18 | sliced_dataframe = dataframe[str(dataframe.index.year[-1])]
19 | res = sliced_dataframe.iloc[-1]
20 | return res
21 |
22 |
23 | def augment_with_spot_rate(dataframe: pd.DataFrame, spot_dataframe: pd.DataFrame) -> None:
24 | """ Augments dataframe (in-place) with spot rate feature. dataframe['END_DATE'] used
25 | to find most recent spot rate information available for currency `dataframe['CURRENCY']`
26 | in spot_dataframe """
27 | spot_col = []
28 | date, curr = None, None
29 | for _, row in dataframe.iterrows():
30 | try:
31 | date = pd.to_datetime(row['END_DATE']).date()
32 | spot_rates = spotrate_lookup(spot_dataframe, date)
33 | if spot_rates.any():
34 | curr = row['CURRENCY']
35 | spot_data = spot_rates.drop([c for c in spot_rates.index \
36 | if not str(curr) in c])
37 | spot_col.append(spot_data.item())
38 |
39 | except ValueError:
40 | logging.debug(f'Incorrect currency or date request to the spot rate dataframe: {curr}, {date}')
41 | spot_col.append(0)
42 | spot_data = pd.Series(spot_col)
43 | spot_data.index = dataframe.index
44 | dataframe['SPOT_RATE'] = spot_data
45 |
46 |
47 | def mu_law_encode(signal: pd.DataFrame, quantization_channels: int, norm: bool = True):
48 | """Mu-law encode"""
49 | mu = quantization_channels - 1
50 | if norm: signal = 2 * (signal - signal.min()) / (signal.max() - signal.min()) - 1
51 | magnitude = np.log1p(mu * np.abs(signal)) / np.log1p(mu)
52 | signal = np.sign(signal) * magnitude
53 | signal = (signal + 1) / 2 * mu + 0.5
54 | quantized_signal = signal.astype(np.int16)
55 | return quantized_signal
56 |
57 |
58 | def as_dateindex_filled(dataframe: pd.DataFrame):
59 | """Fills out a DataFrame adding rows start and end index (daily frequency)"""
60 | original_columns = dataframe.columns
61 | dataframe_new = pd.DataFrame({'date': pd.date_range(dataframe.index.min(),
62 | dataframe.index.max(), freq='D')})
63 | dataframe_new.index = dataframe_new['date']
64 | dataframe_new.drop(columns=['date'], inplace=True)
65 | for col in dataframe:
66 | dataframe_new[col] = 0.
67 | dataframe = dataframe_new.join(dataframe, lsuffix='', rsuffix='_ff')
68 | dataframe.drop(columns=original_columns, inplace=True)
69 | dataframe.ffill(inplace=True)
70 | return dataframe
71 |
72 |
73 | def n_step_returns(dataframe_no_weekends: pd.DataFrame, dataframe_weekends: pd.DataFrame, steps:int):
74 | """Computes `steps` day returns for each row in `dataframe_no_weekends` using data from
 75 |     `dataframe_weekends`"""
76 | returns = []
77 | for date, _ in dataframe_no_weekends.iterrows():
78 | start_price = dataframe_no_weekends.loc[date]
79 | date = date + datetime.timedelta(days=steps)
80 | if date < dataframe_weekends.index.max():
81 | end_price = dataframe_weekends.loc[date.date()]
82 | returns.append((end_price - start_price)/start_price)
83 | return np.array(returns)
84 |
85 |
86 | def arima_forecasts(series: pd.Series, split: float = 0.5):
87 | """1-step ARIMA forecast for `series`"""
88 | train_idx = int(len(series) * split)
89 | train, test = series[:train_idx], series[train_idx:]
90 | history = list(train)
91 | predictions = []
92 | for t in range(len(test)):
93 | model = ARIMA(history, order=(4, 1, 0))
94 | model_fit = model.fit(disp=0)
95 | pred = model_fit.forecast()[0]
96 | predictions.append(pred)
97 | obs = test[t]
98 | history.append(obs)
99 | # look-ahead by 1 with full history
100 | model = ARIMA(history, order=(4, 1, 0))
101 | model_fit = model.fit(disp=0)
102 | pred = model_fit.forecast()[0]
103 | predictions.append(pred)
104 | # skip first train datapoint (shift right by 1)
105 | arima_feature = np.concatenate((train[1:], np.array(predictions).flatten()))
106 | return arima_feature
107 |
108 |
109 | def augment_with_arima_features(dataframe: pd.DataFrame, trval_split: float = 0.5, column_idx: int = 7) -> None:
110 | """Augments `dataframe` with ARIMA 1-step forecast features for columns up to `column_idx`"""
111 | static_columns = dataframe.columns
112 | for column in static_columns[:column_idx]:
113 | arima_feature = arima_forecasts(dataframe[column].values, split=trval_split)
114 | dataframe[f'ARIMA_{column}'] = arima_feature
115 |
116 |
117 | def augment_with_technical_indicators(dataframe: pd.DataFrame, column_idx: int = 7):
118 | for col in dataframe.columns[:column_idx]:
119 | dataframe[f'{col}_ma7'] = dataframe[col].rolling(window=7).mean()
120 | dataframe[f'{col}_ma21'] = dataframe[col].rolling(window=21).mean()
121 | dataframe[f'{col}_26ema'] = dataframe[col].ewm(span=26).mean()
122 | dataframe[f'{col}_12ema'] = dataframe[col].ewm(span=12).mean()
123 | dataframe[f'{col}_MACD'] = (dataframe[f'{col}_12ema'] - dataframe[f'{col}_26ema'])
124 | dataframe[f'{col}_20sd'] = dataframe[col].rolling(window=20).std()
125 | dataframe[f'{col}_upper_band'] = dataframe[f'{col}_ma21'] + (dataframe[f'{col}_20sd'] * 2)
126 | dataframe[f'{col}_lower_band'] = dataframe[f'{col}_ma21'] - (dataframe[f'{col}_20sd'] * 2)
127 |
128 |
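129 | # Illustrative only: a small, self-contained usage sketch (not part of the original module) of
130 | # `mu_law_encode`. With 256 quantization channels the signal is min-max scaled to [-1, 1],
131 | # companded as sign(x) * log1p(mu * |x|) / log1p(mu) with mu = 255, and mapped to integer bins
132 | # in [0, 255]. The toy series below is an assumption for demonstration.
133 | if __name__ == '__main__':
134 |     toy_rates = pd.Series([1.10, 1.12, 1.09, 1.15, 1.20, 1.05])
135 |     encoded = mu_law_encode(toy_rates, quantization_channels=256)
136 |     print(encoded.tolist())   # six integers in [0, 255]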
--------------------------------------------------------------------------------
/torchNDF/models/data/pandas_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import datetime
4 | from statsmodels.tsa.arima_model import ARIMA
5 | import logging
6 |
7 |
8 | def spotrate_lookup(dataframe: pd.DataFrame, date: datetime.date):
9 | """ Returns the row in dataframe with index closest to `date`"""
10 | try:
11 | sliced_dataframe = dataframe[dataframe.index.year == date.year]
12 | seq = np.array(abs(sliced_dataframe.index.day - date.day) +
13 | 30 * (abs((sliced_dataframe.index.month - date.month))))
14 | idx = seq.argmin()
15 | res = sliced_dataframe.iloc[idx]
16 | except KeyError:
17 | logging.info(f'{date.year} not present in dataframe')
18 | sliced_dataframe = dataframe[str(dataframe.index.year[-1])]
19 | res = sliced_dataframe.iloc[-1]
20 | return res
21 |
22 |
23 | def augment_with_spot_rate(dataframe: pd.DataFrame, spot_dataframe: pd.DataFrame) -> None:
24 | """ Augments dataframe (in-place) with spot rate feature. dataframe['END_DATE'] used
 25 |     to find the closest available spot rate for currency `dataframe['CURRENCY']`
26 | in spot_dataframe """
27 | spot_col = []
28 | date, curr = None, None
29 | for _, row in dataframe.iterrows():
30 | try:
31 | date = pd.to_datetime(row['END_DATE']).date()
32 | spot_rates = spotrate_lookup(spot_dataframe, date)
33 | if spot_rates.any():
34 | curr = row['CURRENCY']
35 | spot_data = spot_rates.drop([c for c in spot_rates.index \
36 | if not str(curr) in c])
37 | spot_col.append(spot_data.item())
38 |
39 | except ValueError:
40 | logging.debug(f'Incorrect currency or date request to the spot rate dataframe: {curr}, {date}')
41 | spot_col.append(0)
42 | spot_data = pd.Series(spot_col)
43 | spot_data.index = dataframe.index
44 | dataframe['SPOT_RATE'] = spot_data
45 |
46 |
47 | def mu_law_encode(signal: pd.DataFrame, quantization_channels: int, norm: bool = True):
48 | """Mu-law encode"""
49 | mu = quantization_channels - 1
50 | if norm: signal = 2 * (signal - signal.min()) / (signal.max() - signal.min()) - 1
51 | magnitude = np.log1p(mu * np.abs(signal)) / np.log1p(mu)
52 | signal = np.sign(signal) * magnitude
53 | signal = (signal + 1) / 2 * mu + 0.5
54 | quantized_signal = signal.astype(np.int16)
55 | return quantized_signal
56 |
57 |
58 | def as_dateindex_filled(dataframe: pd.DataFrame):
59 | """Fills out a DataFrame adding rows start and end index (daily frequency)"""
60 | original_columns = dataframe.columns
61 | dataframe_new = pd.DataFrame({'date': pd.date_range(dataframe.index.min(),
62 | dataframe.index.max(), freq='D')})
63 | dataframe_new.index = dataframe_new['date']
64 | dataframe_new.drop(columns=['date'], inplace=True)
65 | for col in dataframe:
66 | dataframe_new[col] = 0.
67 | dataframe = dataframe_new.join(dataframe, lsuffix='', rsuffix='_ff')
68 | dataframe.drop(columns=original_columns, inplace=True)
69 | dataframe.ffill(inplace=True)
70 | return dataframe
71 |
72 |
73 | def n_step_returns(dataframe_no_weekends: pd.DataFrame, dataframe_weekends: pd.DataFrame, steps:int):
74 | """Computes `steps` day returns for each row in `dataframe_no_weekends` using data from
 75 |     `dataframe_weekends`"""
76 | returns = []
77 | for date, _ in dataframe_no_weekends.iterrows():
78 | start_price = dataframe_no_weekends.loc[date]
79 | date = date + datetime.timedelta(days=steps)
80 | if date < dataframe_weekends.index.max():
81 | end_price = dataframe_weekends.loc[date.date()]
82 | returns.append((end_price - start_price)/start_price)
83 | return np.array(returns)
84 |
85 |
86 | def arima_forecasts(series: pd.Series, split: float = 0.5):
87 | """1-step ARIMA forecast for `series`"""
88 | train_idx = int(len(series) * split)
89 | train, test = series[:train_idx], series[train_idx:]
90 | history = list(train)
91 | predictions = []
92 | for t in range(len(test)):
93 | model = ARIMA(history, order=(4, 1, 0))
94 | model_fit = model.fit(disp=0)
95 | pred = model_fit.forecast()[0]
96 | predictions.append(pred)
97 | obs = test[t]
98 | history.append(obs)
99 | # look-ahead by 1 with full history
100 | model = ARIMA(history, order=(4, 1, 0))
101 | model_fit = model.fit(disp=0)
102 | pred = model_fit.forecast()[0]
103 | predictions.append(pred)
104 | # skip first train datapoint (shift right by 1)
105 | arima_feature = np.concatenate((train[1:], np.array(predictions).flatten()))
106 | return arima_feature
107 |
108 |
109 | def augment_with_arima_features(dataframe: pd.DataFrame, trval_split: float = 0.5, column_idx: int = 7) -> None:
110 | """Augments `dataframe` with ARIMA 1-step forecast features for columns up to `column_idx`"""
111 | static_columns = dataframe.columns
112 | for column in static_columns[:column_idx]:
113 | arima_feature = arima_forecasts(dataframe[column].values, split=trval_split)
114 | dataframe[f'ARIMA_{column}'] = arima_feature
115 |
116 |
117 | def augment_with_technical_indicators(dataframe: pd.DataFrame, column_idx: int = 7):
118 | for col in dataframe.columns[:column_idx]:
119 | dataframe[f'{col}_ma7'] = dataframe[col].rolling(window=7).mean()
120 | dataframe[f'{col}_ma21'] = dataframe[col].rolling(window=21).mean()
121 | dataframe[f'{col}_26ema'] = dataframe[col].ewm(span=26).mean()
122 | dataframe[f'{col}_12ema'] = dataframe[col].ewm(span=12).mean()
123 | dataframe[f'{col}_MACD'] = (dataframe[f'{col}_12ema'] - dataframe[f'{col}_26ema'])
124 | dataframe[f'{col}_20sd'] = dataframe[col].rolling(window=20).std()
125 | dataframe[f'{col}_upper_band'] = dataframe[f'{col}_ma21'] + (dataframe[f'{col}_20sd'] * 2)
126 | dataframe[f'{col}_lower_band'] = dataframe[f'{col}_ma21'] - (dataframe[f'{col}_20sd'] * 2)
127 |
128 |
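129 | # Illustrative only: a small, self-contained usage sketch (not part of the original module) of
130 | # `as_dateindex_filled`. A sparse, date-indexed frame is expanded to a daily index and forward-
131 | # filled; the filled columns come back with an `_ff` suffix. The toy frame below is an
132 | # assumption for demonstration.
133 | if __name__ == '__main__':
134 |     sparse = pd.DataFrame({'USDKRW': [1130.0, 1142.0]},
135 |                           index=pd.to_datetime(['2019-05-01', '2019-05-06']))
136 |     print(as_dateindex_filled(sparse)['USDKRW_ff'])   # six daily rows, forward-filled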
--------------------------------------------------------------------------------
/scripts/dtcc_records_collector.py:
--------------------------------------------------------------------------------
1 | """ Script to collect and preprocess DTCC NDF records"""
2 |
3 | import argparse
4 | import logging
5 | import logging.config
6 | from pathlib import Path
7 | import pandas as pd
8 | import features
9 | from torchNDF.script_utils import *
10 | from torchNDF.data.pandas_utils import *
11 |
12 | def process_chunk(chunk_df, spot_df):
13 | """Preprocess a DataFrame chunk"""
14 | timestamp_features = features.NDF_TIMESTAMP_COLUMNS_DTCC
15 |
16 | logging.info('Dropping rows with duplicate DISSEMINATION_ID')
17 | chunk_df = chunk_df[~chunk_df.index.duplicated(keep='first')]
18 |
19 | chunk_df.fillna(0, inplace=True)
20 |
21 | usd_is_1 = chunk_df['NOTIONAL_CURRENCY_1'] == 'USD'
22 | usd_is_2 = chunk_df['NOTIONAL_CURRENCY_2'] == 'USD'
23 |
24 | logging.debug('numrows before usd drop: %d', len(chunk_df))
25 | chunk_df = chunk_df[usd_is_1 | usd_is_2]
26 | logging.debug('numrows after usd drop: %d', len(chunk_df))
27 |
28 | logging.info('Dropping currencies not being considered')
29 | relevant_ccy_1 = chunk_df['NOTIONAL_CURRENCY_1'].isin(features.NDF_CURRENCIES_DTCC)
30 | relevant_ccy_2 = chunk_df['NOTIONAL_CURRENCY_2'].isin(features.NDF_CURRENCIES_DTCC)
31 | chunk_df = chunk_df[relevant_ccy_1 & relevant_ccy_2]
32 | logging.debug('numrows after curr mask: %d', len(chunk_df))
33 |
34 | logging.info('Adding pluses bool feature')
35 | augment_with_pluses(chunk_df, usd_is_1, usd_is_2)
36 | logging.debug('numrows after plus feat: %d', len(chunk_df))
37 |
38 | logging.info('Setting correct types for various features...')
39 | for feature in features.NDF_BOOL_COLUMNS_DTCC:
 40 |         chunk_df[feature] = chunk_df[feature].astype('bool')
41 | for feature in features.NDF_CONT_COLUMNS_DTCC:
42 | chunk_df[feature] = to_int(chunk_df[feature])
43 | for feature in features.NDF_CATEGORY_COLUMNS_DTCC:
 44 |         chunk_df[feature] = chunk_df[feature].astype('category')
45 | for feature in features.NDF_DATE_COLUMNS_DTCC:
46 | chunk_df[feature] = pd.to_datetime(chunk_df[feature], errors='coerce')
47 | for feature in features.NDF_TIMESTAMP_COLUMNS_DTCC:
48 | chunk_df[feature] = pd.to_datetime(chunk_df[feature], errors='coerce')
49 |
50 | logging.info('Adding ndf rate')
51 | amounts_to_ndf_rate(chunk_df, usd_is_1, usd_is_2)
52 | logging.debug('numrows after ndf rate: %d', len(chunk_df))
53 |
54 | logging.info('Removing outdated trade information based on DISSEMINATION_IDs...')
55 | chunk_df = remove_dissemination_id_changes(chunk_df)
56 | logging.debug('numrows after diss_id removal: %d', len(chunk_df))
57 |
58 | logging.info('Adding spot rate feature')
59 | augment_with_spot_rate(chunk_df, spot_df)
60 | logging.debug('numrows after spot rate: %d', len(chunk_df))
61 |
62 | logging.info('Adding term length feature')
63 | augment_with_delta(chunk_df, 'END_DATE', 'EFFECTIVE_DATE', 'TERM_LENGTH')
64 |
65 | logging.info('Splitting timestamp columns into date and time features...')
66 | for timestamp in timestamp_features:
67 | split_timestamp(chunk_df, timestamp)
68 | logging.debug('numrows after timestamp split: %d', len(chunk_df))
69 |
70 |
71 | cols_out = ['INDICATION_OF_OTHER_PRICE_AFFECTING_TERM', 'BLOCK_TRADES_AND_LARGE_NOTIONAL_OFF-FACILITY_SWAPS']
72 | cols_in = ['PRICE_AFFECTING_TERM', 'OFF_FACILITY_SWAPS']
73 | chunk_df = chunk_df.rename(index=str, columns=dict(zip(cols_out, cols_in)))
74 |
75 | chunk_df = chunk_df.drop(columns=['ORIGINAL_DISSEMINATION_ID', 'ACTION',
76 | 'ROUNDED_NOTIONAL_AMOUNT_1', 'ROUNDED_NOTIONAL_AMOUNT_2',
77 | 'NOTIONAL_CURRENCY_1', 'NOTIONAL_CURRENCY_2',
78 | 'SETTLEMENT_CURRENCY'
79 | ]
80 | )
81 |
82 | return chunk_df
83 |
84 |
85 | def main(path, name, mode, chunksize):
86 |
87 | format_str = '%(asctime)s | %(name)-10s | %(funcName)s | %(message)s'
88 | logging.Formatter(format_str)
89 | logging.basicConfig(level=logging.INFO, format=format_str)
90 |
91 | save_path = Path('pickle_data')
92 | if not save_path.exists():
93 | save_path.mkdir()
94 |
95 | columns = features.NDF_COLUMNS_DTCC
96 |
97 | columns_spot = features.COLUMNS_SPOT
98 | spot_dataframe = pd.read_csv(f'{path}/fx.csv', usecols=columns_spot,
99 | infer_datetime_format=True, index_col=columns_spot[0])
100 | spot_dataframe.index = pd.to_datetime(spot_dataframe.index)
101 |
102 | if mode=='m':
103 | logging.info('Reading and merging CSV files...')
104 | ### TO DO ###
105 | dataframe = utils.merge_from_folder(path, columns)
106 | dataframe = process_chunk(dataframe, spot_dataframe)
107 |
108 | logging.info('Saving processed pd.DataFrame...')
109 | dataframe.to_pickle('pickle_data/all_data_processed.pickle')
110 |
111 | if mode== 'r':
112 | logging.info('Chunking big csv file...')
113 | count = 1
114 |
115 | n_rows = sum(1 for row in open(f'{path}/{name}.csv', 'r'))
116 | logging.info('There are %d rows in the file', n_rows)
117 |
118 |         for chunk_df in pd.read_csv(f'{path}/{name}.csv', index_col=columns[0],
119 | parse_dates=True, infer_datetime_format=True, usecols=columns,
120 | chunksize=chunksize):
121 |
122 | logging.info('Processing chunk: %d', count)
123 | chunk_df = process_chunk(chunk_df, spot_dataframe)
124 |
125 | logging.info('Saving processed pd.DataFrame...')
126 | chunk_df.to_pickle(f'pickle_data/slice_{count}.pickle')
127 |
128 | logging.info('Progress: %f', round(100*(round(count*chunksize)/n_rows), 4))
129 | count += 1
130 |
131 | logging.info('Preprocessing complete')
132 |
133 |
134 | def get_args_parser():
135 |     parser = argparse.ArgumentParser(description='Collect and preprocess DTCC NDF records from CSV files (read a single big file or merge a folder)')
136 |
137 | parser.add_argument('--path', type=str, default='.',
138 | help='Path containing CSV files to merge')
139 |
140 | parser.add_argument('--name', type=str, default='all_data',
141 | help='Name of single BIG csv file')
142 |
143 | parser.add_argument('--mode', type=str, default='r',
144 | choices = ['r', 'm'],
145 | help='data script mode: *r* for read (single BIG file) *m* for merge and read')
146 |
147 | parser.add_argument('--chunksize', type=int, default=1000000,
148 | help='Size (rows) of each csv chunk. Limit to 10**6 to avoid memory issues')
149 | return parser
150 |
151 |
152 | def parse_run():
153 | parser = get_args_parser()
154 | args = parser.parse_args()
155 | main(**vars(args))
156 |
157 |
158 | if __name__ == '__main__':
159 | parse_run()
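160 |
161 |
162 | # Illustrative only: a minimal, self-contained sketch (not part of the original script) of the
163 | # chunked-CSV pattern used by `main` in mode 'r'. The in-memory CSV and chunk size below are
164 | # assumptions for demonstration; the real script reads `{path}/{name}.csv` and pickles each
165 | # processed chunk under pickle_data/.
166 | def _chunked_read_demo():
167 |     import io
168 |     raw = io.StringIO('DISSEMINATION_ID,NOTIONAL_CURRENCY_1\n1,USD\n2,KRW\n3,USD\n4,BRL\n')
169 |     for i, chunk in enumerate(pd.read_csv(raw, index_col='DISSEMINATION_ID', chunksize=2), start=1):
170 |         # in the real pipeline each chunk would go through process_chunk(...) and to_pickle(...)
171 |         print(f'chunk {i}: {len(chunk)} rows')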
--------------------------------------------------------------------------------
/torchNDF/models/wattnet.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | from torch.utils.data import DataLoader
5 | from .mlp import MLP
6 | from .modules import *
7 | from ..utils import expert_guided_loss
8 | from ..metrics import cross_entropy_accuracy
9 |
10 |
11 | class WATTNet(nn.Module):
12 | def __init__(self, in_dim: int = 1132, out_dim: int = 91, w_dim: int = 128, emb_dim: int = 8,
13 | dilation_depth: int = 4, dropout_prob: float = 0.2, n_repeat: int = 2):
14 | """
15 | Args:
16 | w_dim: spatial compression dimension carried out by a 2-layer MLP.
17 | When more memory/data is available, increasing w_dim can yield better performance
18 | emb_dim: embedding dimension of scalar values for each of the `w_dim` left after compression.
19 | Higher embedding dimension increases accuracy of the spatial attention module at the cost
20 | of increased memory requirement. BEWARE: w_dim * emb_dim > 1e4 can get *VERY* costly in terms
21 | of GPU memory, especially with big batches.
22 | dilation_depth: number of temporal-spatial blocks. Dilation for temporal dilated convolution is doubled
23 | each time.
 24 |             n_repeat: number of repeats of the `dilation_depth` temporal-spatial layers. Useful to increase model depth
25 | with short sequences without running into situations where the dilated kernel becomes wider than the
26 | sequence itself.
27 | """
28 | super().__init__()
29 | self.w_dim = w_dim
30 | self.emb_dim = emb_dim
31 | self.dilation_depth = dilation_depth
32 | self.n_layers = dilation_depth * n_repeat
33 | self.dilations = [2 ** i for i in range(1, dilation_depth + 1)] * n_repeat
34 |
35 | ltransf_dim = w_dim * emb_dim
36 | self.attblocks = nn.ModuleList([AttentionBlock(in_channels=w_dim,
37 | key_size=ltransf_dim,
38 | value_size=ltransf_dim)
39 | for _ in self.dilations])
40 |
41 | self.resblocks = nn.ModuleList([GatedBlock(dilation=d, w_dim=w_dim)
42 | for d in self.dilations])
43 |
44 | self.emb_conv = nn.Conv2d(1, emb_dim, kernel_size=1)
45 | self.dec_conv = nn.Conv2d(w_dim, w_dim, kernel_size=(1, emb_dim), groups=w_dim)
46 |
47 | # feature compression: when more memory/data is available, increasing w_dim can yield
48 | # better performance
49 | self.preMLP = MLP(in_dim, w_dim, out_softmax=False)
50 |
51 | # post fully-connected head not always necessary. When sequence length perfectly aligns
 52 |         # with the number of time points lost to dilation (i.e. a single latent is output by the
 53 |         # alternating TCN and attention modules), that single latent can be used directly
54 | self.postMLP = MLP(5 * w_dim, out_dim, [512], \
55 | out_softmax=False, drop_probability=dropout_prob)
56 |
57 | def forward(self, x_in):
58 | """
59 | Args:
60 | x_in: 'N, C, H, W' where `N` is the batch dimension, `C` the one-hot
61 | embedding dimension, `H` is the temporal dimension, `W` is the
 62 |                 second dimension of the timeseries (e.g. time series for different FX pairs)
 63 |         Returns: a tensor of shape `N, out_dim` with unnormalized class scores (logits)
64 | """
65 | x_in = self.preMLP(x_in.squeeze(1))
66 | x_in = x_in.unsqueeze(1)
67 |
68 | if self.emb_dim > 1: x_in = self.emb_conv(x_in)
69 |
70 | # swap `W` dim to channel dimension for grouped convolutions
71 | # `N, W, H, C`
72 | x_in = x_in.transpose(1, 3)
73 |
74 | skip_connections = []
75 | for i in range(len(self.resblocks)):
76 | x_in = self.resblocks[i](x_in)
77 | x_att_list = []
78 | # slicing across `H`, temporal dimension
79 | for k in range(x_in.size(2)):
80 | # `C` embedding message passing using self-att
81 | x_att = self.attblocks[i](x_in[:, :, k, :])
82 | # `N, W, C` -> `N, W, 1, C`
83 | x_att = x_att.unsqueeze(2)
84 | x_att_list.append(x_att)
85 | # `N, W, 1, C` -> `N, W, H, C`
86 | x_in = torch.cat(x_att_list, dim=2)
87 | # `N, W, H, C` -> `N, W, H, 1`
88 | if self.emb_dim > 1: x_in = self.dec_conv(x_in)
89 | # `N, W, H, 1` -> `N, 1, H, W`
90 | x_out = x_in.transpose(1, 3)
91 | # `N, 1, H, W` -> `N, H, W`
92 | x_out = x_out[:, 0, :, :]
93 |
94 | x_out = x_out.reshape(x_out.size(0), -1)
95 | x_out = self.postMLP(x_out)
96 | return x_out
97 |
98 | def fit(self,
99 | epochs,
100 | trainloader,
101 | valloader,
102 | opt,
103 | sched,
104 | device,
105 | log_interval=10000,
106 | dropout_schedule=None,
107 | dropout_interval=1000,
108 | early_stop_loss=2.,
109 | tr_returns=None,
110 | val_returns=None
111 | ):
112 |
113 | if valloader:
114 | x_val, val_idx = next(iter(valloader))
115 | x_val_clip = x_val[:, :-1, :].unsqueeze(1).to(device)
116 | y_val = val_returns.argmax(0)
117 | val_opt_rets, _ = val_returns.max(0)
118 | val_opt_rets = val_opt_rets.sum().item()
119 |         drop_idx = 0
120 |         for e in range(epochs):
121 |             run_loss, run_rets, run_opt_rets = 0., 0., 0.
122 | iterator = iter(trainloader)
123 | if sched: sched.step()
124 | for i in range(len(iterator)):
125 | opt.zero_grad()
126 | x, idx = next(iterator)
127 | x = x.to(device)
128 | x_clip = x[:, :-1, :].unsqueeze(1)
129 | yhat = self(x_clip)
130 | loss, iter_rets, iter_opt_rets = expert_guided_loss(yhat, tr_returns, idx)
131 | run_loss += loss.item()
132 |                 # stop early once the running average training loss falls below `early_stop_loss`
133 |                 if run_loss / (i + 1) < early_stop_loss:
134 |                     print('Early stopping...')
135 | return None
136 |
137 | run_rets += iter_rets.item()
138 | run_opt_rets += iter_opt_rets.item()
139 | loss.backward()
140 | opt.step()
141 |
142 | if i % log_interval == 0:
143 | print(f'Epoch: {e}')
144 | print(f'Training Loss: {np.round(run_loss / (i + 1), 2)}')
145 | print(f'Avg train returns: {np.round(run_rets / (i + 1), 2)}')
146 | print(f'Avg train optimal returns: {np.round(run_opt_rets / (i + 1), 2)} \n')
147 |
148 | if valloader:
149 | yhat = self(x_val_clip)
150 | val_loss = nn.CrossEntropyLoss()(yhat, y_val).item()
151 | probs = nn.Softmax(1)(yhat)
152 | val_act = probs.argmax(1)
153 | val_mod_rets = (val_returns[val_act, val_idx]).sum().item()
154 |
155 | val_acc = cross_entropy_accuracy(probs, y_val).item()
156 |
157 | print(f'Validation Loss: {np.round(val_loss, 2)}')
158 | print(f'Validation Accuracy: {np.round(val_acc, 2)} %')
159 | print(f'Avg val returns: {np.round(val_mod_rets, 2)}')
160 | print(f'Avg val optimal returns: {np.round(val_opt_rets, 2)} \n')
161 |
162 |             if dropout_schedule and e > 0 and e % dropout_interval == 0:
163 | drop_idx += 1
164 | if drop_idx < len(dropout_schedule):
165 | self.preMLP.drop_probability = dropout_schedule[drop_idx]
166 | self.postMLP.drop_probability = dropout_schedule[drop_idx]
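167 |
168 |
169 | # Illustrative only: a small sketch (not part of the original module) of the dilation schedule
170 | # built in `__init__`. Under the *assumption* of kernel-size-2, unpadded dilated convolutions
171 | # (the details live in GatedBlock), each block would shorten the temporal dimension by its
172 | # dilation, which helps reason about the `5 * w_dim` flattened input expected by `postMLP`.
173 | if __name__ == '__main__':
174 |     dilation_depth, n_repeat = 4, 2   # the defaults used above
175 |     dilations = [2 ** i for i in range(1, dilation_depth + 1)] * n_repeat
176 |     print(dilations)        # [2, 4, 8, 16, 2, 4, 8, 16]
177 |     print(sum(dilations))   # 60 time steps consumed under the kernel-size-2 assumption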
--------------------------------------------------------------------------------
/scripts/spot_rates_collector.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from datetime import datetime, timedelta
3 | import errno
4 | import logging
5 | import os
6 | from pathlib import Path
7 | import re
8 | import sys
9 | import time
10 |
11 | from oandapyV20 import API
12 | import oandapyV20.endpoints.instruments as instruments
13 | from oandapyV20.exceptions import V20Error
14 | import pandas as pd
15 | import pickle
16 | import pytz
17 |
18 | log = logging.getLogger(__name__)
19 | format_str = '%(asctime)s | %(name)-10s | %(funcName)s | %(message)s'
20 | logging.Formatter(format_str)
 21 | logging.basicConfig(level=logging.INFO, format=format_str)
22 |
23 | class OandaRunner():
 24 |     '''Historical data fetcher for Oanda FX data. Connects to the Oanda API and keeps
 25 |     data cache files under `data_path` up to date.
26 |
27 | Args:
28 | instruments (list): instruments to fetch (strings containing base currency and
29 | quote currency delimited by a underscore).
30 | dt_from (str): start date (format: "YYYY-MM-DD").
 31 |         dt_to (str): end date (format: "YYYY-MM-DD"). Defaults to None (fetch
32 | data up to the most recent rounded granularity).
33 | fields (str): one of ['M', 'B', 'A', 'BA', 'MBA'] ([M]id, [B]id, [A]sk,
 34 |             Bid and Ask ('BA'), or Mid, Bid and Ask ('MBA')).
35 | granularity (str): data granularity ([S]econd, [M]inute, [H]our, [D]ay, [W]eek or
36 | [MO]nth).
37 | frequency (int): data frequency (e.g. 1 if one minute).
38 | timeout (int): seconds for API request timeout
 39 |         n_bars (int): number of bars per request (max 5000, default 500; not
 40 |             needed if start and end date are provided).
41 | keep_updated (bool): whether to fetch history once or keep updating.
 42 |             When True, new data is checked for every 60 seconds (`check_frequency`);
 43 |             there is no separate update-frequency argument.
 44 |         data_path (str): data folder name (created relative to the script run path).
45 | '''
46 | def __init__(
47 | self,
 48 |         instruments=('EUR_USD',),
49 | dt_from='2019-05-15',
50 | dt_to=None,
51 | fields='BA',
52 | granularity='M',
53 | frequency=1,
54 | timeout=10,
55 | n_bars=5000,
56 | keep_updated=True,
57 | data_path='_data',
58 | ):
59 | self.data_path = Path(data_path)
60 |
61 | if not os.path.exists(data_path):
62 | log.info(f'creating data path: {data_path}')
63 | os.makedirs(data_path)
64 |
65 | self.instruments = instruments
66 | if len(dt_from) != 10:
67 | raise ValueError('length of date should be 10 (e.g. "2019-05-15")')
68 | else:
69 | dt_from = dt_from + 'T00:00:00+0000'
70 |
71 | self.dt_from = self._check_date(dt_from)
72 |
73 | if dt_to:
74 | dt_to = dt_to + 'T00:00:00+0000'
75 | self.dt_to = self._check_date(dt_to)
76 | self.dt_to = self._round_date(
77 | date=dt_to, granularity=granularity, frequency=frequency)
78 | else:
79 | self.dt_to = None
80 | self.dt_end = self._round_date(
81 | date=None, granularity=granularity, frequency=frequency, out_string=False)
82 |
83 | self.fields = fields
84 | self.granularity = granularity
85 | self.frequency = frequency
86 | self.timeout = timeout
87 | self.n_bars = n_bars
88 | self.keep_updated = keep_updated
89 |
90 | self.check_frequency = 60
91 |
92 | def run(self):
93 | api = self._start_session(timeout=self.timeout)
94 | to_check = True
95 | try:
96 | if self.instruments:
97 | while to_check:
98 | dt_now = self._round_date(
99 | date=None, granularity=self.granularity, frequency=self.frequency,
100 | out_string=False)
101 | log.info(f'fetching data up to {dt_now}')
102 |
103 | for i in self.instruments:
104 | file_path = self._name_file(date=dt_now, instrument=i)
105 | is_file = os.path.isfile(file_path)
106 |
107 | if not is_file:
108 | log.info(f'{i}: data cache not available, will create {file_path}')
109 |
110 | data = None
111 | dt_last = self._round_date(
112 | date=self.dt_from, granularity=self.granularity,
113 | frequency=self.frequency, out_string=False)
114 |
115 | while dt_last < dt_now:
116 | data = self._append_data(
117 | data=data, api=api, instrument=i, dt_from=dt_last,
118 | dt_to=dt_now)
119 | #dt_last = pd.to_datetime(data.index[-1]).tz_localize(pytz.utc)
120 | dt_last = pd.to_datetime(data.index[-1]).tz_convert(pytz.utc)
121 | file_path = self._name_file(date=self.dt_end, instrument=i)
122 | self._save_data(data=data, file_path=file_path)
123 | else:
124 | log.info(f'{i}: data cache available, loading {file_path}')
125 |
126 | data = self._load_data(file_path=file_path)
127 | dt_last = self._round_date(
128 | date=data.index[-1], granularity=self.granularity,
129 | frequency=self.frequency, out_string=False)
130 |
131 | is_update_time = self._check_update(
132 | from_time=dt_last, to_time=dt_now)
133 |
134 | if is_update_time:
135 | data = self._append_data(
136 | data=data, api=api, instrument=i, dt_from=dt_last,
137 | dt_to=dt_now)
138 | self._save_data(data=data, file_path=file_path)
139 |
140 | if self.keep_updated:
141 | log.info(f'next data check in {self.check_frequency} seconds')
142 | time.sleep(self.check_frequency)
143 |
144 | else:
145 | to_check = False
146 | log.info('data fetching complete')
147 | sys.exit()
148 |
149 | else:
150 | raise ValueError('no instrument provided')
151 |
152 | except ConnectionError as e:
153 | log.error(f"ConnectionError: {e}")
154 | time.sleep(3)
155 |
156 | except V20Error as v20e:
157 | log.info(f"ERROR {v20e.code}: {v20e.msg}")
158 |
159 | except ValueError as e:
160 | log.info(f"{e}")
161 |
162 | except Exception as e:
163 | log.info(f"Unkown error: {e}")
164 |
165 | def _append_data(self, data, api, dt_from, instrument, dt_to=None):
166 | log.info(f'{instrument}: requesting historical bars from {dt_from}')
167 |
168 | new_data = self._request_data(api=api, instrument=instrument, dt_from=dt_from)
169 | new_data = self._format_data(data=new_data)
170 |
171 | if dt_to:
172 |             # subset to avoid downloading more data for some instruments (if we passed the
173 | # granularity/frequency)
174 | #new_last = pd.to_datetime(new_data.index[-1]).tz_localize(pytz.utc)
175 | new_last = pd.to_datetime(new_data.index[-1]).tz_convert(pytz.utc)
176 | if new_last >= dt_to:
177 | end_idx = new_data.index.get_loc(dt_to.strftime("%Y-%m-%dT%H:%M:%S.%f000Z"))
178 | new_data = new_data.iloc[:end_idx + 1]
179 |
180 | data = pd.concat([data, new_data])
181 | log.info(f'new data tail: {data.tail()}')
182 | log.info(f'data fetched up to {data.index[-1]}')
183 |
184 | return data
185 |
186 | def _check_update(self, from_time, to_time):
187 | if self.granularity == 'M':
188 | from_check = from_time.minute
189 | to_check = to_time.minute
190 |
191 | elif self.granularity == 'H':
192 | from_check = from_time.hour
193 | to_check = to_time.hour
194 |
195 | else:
196 | raise NotImplementedError(f'granularity {self.granularity} not supported')
197 |
198 | out = to_check != from_check
199 |
200 | return out
201 |
202 | def _name_file(self, date, instrument):
203 | date = date.strftime('%Y%m%d')
204 | file_name = '_'.join([
205 | instrument, date, self.fields, str(self.frequency) + self.granularity + '.pickle'])
206 |
207 | file_path = self.data_path / file_name
208 |
209 | return file_path
210 |
211 | def _request_data(self, api, instrument, dt_from):
212 | if not isinstance(dt_from, str):
213 | dt_from = dt_from.strftime("%Y-%m-%dT%H:%M:%S+0000")
214 |
215 | if self.granularity == 'MO':
216 | granularity = 'M'
217 |
218 | else:
219 | granularity = self.granularity + str(self.frequency)
220 |
221 | self.params = self._parametrize(
222 | fields=self.fields,
223 | granularity=granularity,
224 | n_bars=self.n_bars,
225 | dt_from=dt_from,
226 | dt_to=self.dt_to,
227 | )
228 |
229 | req = instruments.InstrumentsCandles(instrument=instrument, params=self.params)
230 | out = api.request(req)
231 |
232 | return out
233 |
234 | def _format_data(self, data):
235 | col_from = ['bid.o', 'ask.o', 'bid.h', 'ask.h', 'bid.l', 'ask.l', 'bid.c', 'ask.c']
236 | col_to = [
237 | 'Open Bid', 'Open Ask', 'High Bid', 'High Ask', 'Low Bid', 'Low Ask',
238 | 'Close Bid', 'Close Ask'
239 | ]
240 | data = pd.io.json.json_normalize(data['candles'])
241 | data = data.set_index('time')
242 | data = data.loc[:, col_from]
243 | data.columns = col_to
244 |
245 | return data
246 |
247 | def _check_date(self, date):
248 | date_format = r'(\d\d\d\d[-]\d\d[-]\d\dT\d\d[:]\d\d[:]\d\d[+]\d\d\d\d)'
249 |
250 | if date:
251 | correct_format = re.match(date_format, date)
252 |
253 | if not correct_format:
254 | raise ValueError(
255 | f'incorrect date format (require: "YYYY-MM-DDTHH:MM:SS+0000"): {date}')
256 |
257 | else:
258 | raise ValueError('date not provided')
259 |
260 | return date
261 |
262 | def _start_session(self, timeout):
263 | token = 'INSERT YOUR TOKEN HERE'
264 |
265 | api_params = {}
266 | api_params['timeout'] = timeout
267 |
268 | api = API(
269 | access_token=token,
270 | environment="practice",
271 | request_params=api_params,
272 | )
273 |
274 | return api
275 |
276 | def _parametrize(self, fields, granularity, n_bars, dt_from, dt_to):
277 | req_params = {}
278 | req_params["granularity"] = granularity
279 | req_params["from"] = dt_from
280 | req_params["price"] = fields
281 |
282 | if n_bars:
283 | req_params["count"] = n_bars
284 |
285 | if dt_to:
286 | req_params["to"] = dt_to
287 |
288 | return req_params
289 |
290 | def _round_date(self, date, granularity, frequency, out_string=True):
291 | if date:
292 | #date = pd.to_datetime(date).tz_localize(pytz.utc)
293 | date = pd.to_datetime(date).tz_convert(pytz.utc)
294 |
295 | else:
296 | date = datetime.now(pytz.utc)
297 |
298 | if granularity == 'M':
299 | to_round = date.minute % frequency
300 | dt_excess = timedelta(
301 | hours=0, minutes=to_round, seconds=date.second,
302 | microseconds=date.microsecond)
303 |
304 |
305 | elif granularity == 'H':
306 | to_round = date.hour % frequency
307 | dt_excess = timedelta(
308 | hours=to_round, minutes=date.minute, seconds=date.second,
309 | microseconds=date.microsecond)
310 |
311 | else:
312 | raise NotImplementedError(f'rounding not implemented for {granularity} granularity')
313 |
314 | dt_round = date - dt_excess
315 | if out_string:
316 | out = dt_round.strftime("%Y-%m-%dT%H:%M:%S+0000")
317 |
318 | else:
319 | out = dt_round
320 |
321 | return out
322 |
323 | def _save_data(self, data, file_path):
324 | '''Save retrieved data to local.
325 | '''
326 | with open(file_path, "wb") as output_file:
327 | pickle.dump(data, output_file)
328 | log.info(f'data cached in {file_path}')
329 | log.info(f'new tail of data:\n{data.tail()}')
330 |
331 | def _load_data(self, file_path):
332 | '''Load cached data, if any.
333 | '''
334 | if not os.path.isfile(file_path):
335 | raise IOError(f'no such file: {file_path}')
336 |
337 | with open(file_path, "rb") as input_file:
338 | log.info(f'data loaded from {file_path}')
339 | out = pickle.load(input_file)
340 |
341 | return out
342 |
343 | def _remove_data(self, file_path):
344 | '''Remove data, if any.
345 | '''
346 | try:
347 | os.remove(file_path)
348 | log.info(f'data removed from {file_path}')
349 |
350 | except OSError as e:
351 | # errno.ENOENT is "no such file or directory"
352 | if e.errno != errno.ENOENT:
353 | raise
354 |
355 |
356 | if __name__ == '__main__':
357 | example_text = '''Examples of use:
358 |     python scripts/spot_rates_collector.py
359 |     --ids EUR_USD USD_JPY
360 |     --start 2019-05-19
361 |     --fields BA
362 |     --granularity M
363 | --frequency 1
364 | --live
365 | '''
366 |
367 | parser = argparse.ArgumentParser(
368 | description='Oanda data fetcher.',
369 | epilog=example_text,
370 | formatter_class=argparse.RawDescriptionHelpFormatter)
371 |
372 | parser.add_argument(
373 | '--ids',
374 | type=str,
375 | dest='instruments',
376 | nargs='+',
377 | help='specify instruments to fetch')
378 |
379 | parser.add_argument(
380 | '--start',
381 | type=str,
382 | dest='from_dt',
383 | help='start date')
384 |
385 | parser.add_argument(
386 | '--end',
387 | type=str,
388 | dest='to_dt',
389 | default=None,
390 | help='end date')
391 |
392 | parser.add_argument(
393 | '--fields',
394 | type=str,
395 | dest='fields',
396 | default='BA',
397 | choices=['B', 'A', 'BA', 'MBA'],
398 | help='fields requested (mid, bid, ask or their combination)')
399 |
400 | parser.add_argument(
401 | '--granularity',
402 | type=str,
403 | dest='granularity',
404 |         choices=['S', 'M', 'H', 'D', 'W', 'MO'],
405 | help='data granularity')
406 |
407 | parser.add_argument(
408 | '--frequency',
409 | type=int,
410 | dest='frequency',
411 | help='data frequency')
412 |
413 | parser.add_argument(
414 | '--path',
415 | type=str,
416 | dest='data_path',
417 | default='_data',
418 | help='path to data folder from script run path')
419 |
420 | parser.add_argument(
421 | '--live',
422 | dest='keep_updated',
423 | action='store_true',
424 | help='whether to keep data updated at each granularity/frequency interval')
425 |
426 | args = parser.parse_args()
427 |
428 | oanda = OandaRunner(
429 | instruments=args.instruments,
430 | dt_from=args.from_dt,
431 | dt_to=args.to_dt,
432 | fields=args.fields,
433 | granularity=args.granularity,
434 | frequency=args.frequency,
435 | keep_updated=args.keep_updated,
436 | data_path=args.data_path,
437 | )
438 |
439 | try:
440 | oanda.run()
441 | except (KeyboardInterrupt, SystemExit):
442 | log.info('Exit on KeyboardInterrupt or SystemExit')
443 | sys.exit()
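444 |
445 |
446 | # Illustrative only: a minimal sketch (not part of the original script) of the flooring performed
447 | # by `OandaRunner._round_date` for minute granularity, expressed with pandas' Timestamp.floor as
448 | # an equivalent idiom. The timestamp and frequency below are assumptions for demonstration.
449 | def _round_date_demo():
450 |     ts = pd.Timestamp('2019-05-15 13:07:42.500', tz=pytz.utc)
451 |     frequency = 5                                   # hypothetical 5-minute bars
452 |     floored = ts.floor(f'{frequency}min')           # 2019-05-15 13:05:00+00:00
453 |     print(floored.strftime("%Y-%m-%dT%H:%M:%S+0000"))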
--------------------------------------------------------------------------------