├── README.md
├── config
│   ├── data_loader.json
│   └── signal.json
├── hft
│   ├── backtester.py
│   ├── data_loader.py
│   ├── signal_utils.py
│   └── utils.py
├── research
│   ├── backtest.py
│   ├── eda.py
│   ├── hmm.py
│   ├── price_dynamics_eda.py
│   ├── signal_cache.py
│   └── signal_research.py
└── tests
    ├── scratch_pad.py
    ├── test_backtester.py
    └── test_signal_utils.py

/README.md:
--------------------------------------------------------------------------------
1 | # hft
2 | High Frequency Trading Strategies
--------------------------------------------------------------------------------
/config/data_loader.json:
--------------------------------------------------------------------------------
1 | {
2 |   "columns": ["date", "time", "price", "qty", "volume", "open_interest",
3 |               "b1", "b1_size", "b2", "b2_size", "b3", "b3_size",
4 |               "s1", "s1_size", "s2", "s2_size", "s3", "s3_size", "side"],
5 |   "columns_to_drop": ["b2", "b2_size", "b3", "b3_size", "s2", "s2_size", "s3", "s3_size"],
6 |   "encoding": "gb18030"
7 | }
--------------------------------------------------------------------------------
/config/signal.json:
--------------------------------------------------------------------------------
1 | {
2 |   "n_flexible_half_seconds": 2
3 | }
--------------------------------------------------------------------------------
/hft/backtester.py:
--------------------------------------------------------------------------------
1 | """
2 | Backtest Strategy
3 | """
4 | 
5 | import os
6 | import logging
7 | import pickle
8 | import numpy as np
9 | import pandas as pd
10 | from sklearn import linear_model
11 | 
12 | import hft.utils as utils
13 | import hft.signal_utils as signal
14 | 
15 | logger = logging.getLogger(__name__)
16 | 
17 | 
18 | def select_feature(train, config):
19 |     """Select features to fit model
20 | 
21 |     :param train: pandas data frame
22 |     :param config: dictionary, config parameters
23 |     :return: list of strings, column names
24 |     """
25 |     y_column = utils.get_moving_column_name(config['response_column'], 0, config['holding_period'])
26 |     selected_features = []
27 |     for feature in config['feature_column']:
28 |         logger.debug('Computing correlation of %s and %s', feature, config['response_column'])
29 |         winsorize_option = {'x_prob': config['feature_winsorize_prob'][feature],
30 |                             'x_bound': config['feature_winsorize_bound'][feature],
31 |                             'y_prob': config['response_winsorize_prob'],
32 |                             'y_bound': config['response_winsorize_bound']
33 |                             }
34 |         corr_mat = signal.xy_corr(train, config['feature_freq'], feature, config['response_column'], winsorize_option)
35 |         correlation = corr_mat.loc[y_column]
36 |         selected_features.append(correlation.idxmax())  # label (column name) of the highest correlation
37 |     return selected_features
38 | 
39 | 
40 | def fit(train, features, config):
41 |     """Fit linear model using features
42 | 
43 |     :param train: pandas data frame, must contain columns in features
44 |     :param features: list of column names
45 |     :param config: dictionary, config parameters
46 |     :return: tuple of (fitted sklearn model, dict of fit statistics)
47 |     """
48 |     y_column = utils.get_moving_column_name(config['response_column'], 0, config['holding_period'])
49 |     regr_data = train[features+[y_column]].dropna()
50 | 
51 |     # data processing
52 |     for feature in features:
53 |         raw_feature = utils.get_raw_column_name(feature)
54 |         regr_data[feature] = utils.winsorize(regr_data[feature], config['feature_winsorize_prob'][raw_feature],
55 |                                              config['feature_winsorize_bound'][raw_feature])
56 |     regr_data[y_column] = utils.winsorize(regr_data[y_column], config['response_winsorize_prob'],
57 | 
config['response_winsorize_bound']) 58 | x = regr_data[features].values 59 | y = regr_data[y_column].values 60 | regr = linear_model.LinearRegression(fit_intercept=False) 61 | regr.fit(x, y) 62 | n = len(y) 63 | p = len(features) + regr.fit_intercept 64 | mse = np.sum((regr.predict(x) - y) ** 2) / (n-p) 65 | se = np.sqrt(np.diagonal(mse * np.linalg.inv(np.dot(x.T, x)))) 66 | stats = {'rsq': regr.score(x, y), 67 | 'beta': regr.coef_, 68 | 'tstat': regr.coef_ / se, 69 | 'mse': mse, 70 | 'df_1': p-1, 71 | 'df_2': n-p} 72 | return regr, stats 73 | 74 | 75 | def backtest(px, config): 76 | logger.info('Start backtesting') 77 | dates = list(set(px.date)) 78 | dates.sort() 79 | y_name = utils.get_moving_column_name(config['response_column'], 0, config['holding_period']) 80 | btdf = pd.DataFrame() 81 | columns = ['dt', 'date', 'time', 'price', 'qty', 'volume', 'open_interest', 82 | 'b1', 'b1_size', 's1', 's1_size', 'mid', 'second'] 83 | fitting_stats = pd.DataFrame(columns=['date', 'rsq', 'beta', 'tstat', 'mse', 'pred_rsq', 'pred_mse']) 84 | for i in range(config['training_period'], len(dates)): 85 | date = dates[i] 86 | logger.info('Backtesting on %s', date) 87 | logger.debug('Selecting feature') 88 | train = px[(px.date >= dates[i-config['training_period']]) & (px.date < date)].copy() 89 | features = select_feature(train, config) 90 | logger.debug('Fitting model') 91 | model, stats = fit(train, features, config) 92 | stats['date'] = date 93 | logger.debug('Predicting future return') 94 | px_i = px.loc[px.date == date, columns + features + [y_name]].copy() 95 | x_new = px_i[features] 96 | x_new = x_new.fillna(x_new.median()) 97 | y_new = px_i[y_name].values 98 | alpha = model.predict(X=x_new) 99 | px_i['alpha'] = alpha 100 | pred_rsq = pd.DataFrame({'alpha': alpha, 'y_new': y_new}).corr().iloc[0, 1] 101 | pred_resid = alpha - y_new 102 | pred_mse = np.nanmean(pred_resid ** 2) 103 | stats['pred_rsq'] = pred_rsq 104 | stats['pred_mse'] = pred_mse 105 | fitting_stats = fitting_stats.append(stats, ignore_index=True) 106 | btdf = btdf.append(px_i) 107 | logger.info('Finish backtesting') 108 | return btdf, fitting_stats 109 | 110 | 111 | def trade(btdf, config): 112 | logger.info('Making trading decision') 113 | btdf['trade'] = 0 114 | btdf.loc[btdf.alpha > config['trade_trigger_threshold'][1], 'trade'] = 1 115 | btdf.loc[btdf.alpha < config['trade_trigger_threshold'][0], 'trade'] = -1 116 | btdf.loc[btdf.second > config['end_second'], 'trade'] = 0 117 | btdf.loc[btdf.second < config['start_second'], 'trade'] = 0 118 | return btdf 119 | 120 | 121 | def get_fixed_period_close_second(btdf, config): 122 | btdf['close_second'] = btdf.second + config['holding_period'] 123 | dates = list(set(btdf.date)) 124 | dates.sort() 125 | matched_close_second = [] 126 | for date in dates: 127 | bti = btdf[btdf.date == date] 128 | close_index = np.searchsorted(bti.second, bti.close_second) 129 | close_index[close_index == len(close_index)] = len(close_index) - 1 130 | matched_close_second_i = bti.second.values[close_index].tolist() 131 | matched_close_second.extend(matched_close_second_i) 132 | return matched_close_second 133 | 134 | 135 | def dynamic_hold(bti, config, i): 136 | tick_change = (bti.mid - bti.mid[i]) / config['tick_size'] 137 | cond = ((tick_change >= config['unwinding_tick_move_upper_bound']) | 138 | (tick_change <= config['unwinding_tick_move_lower_bound'])) & (tick_change.index > i) 139 | idx = cond.index[cond] 140 | idx = idx[0] if len(idx) > 0 else bti.index[-1] 141 | return idx 142 | 143 | 144 | def 
get_dynamic_period_close_second(btdf, config): 145 | dates = list(set(btdf.date)) 146 | dates.sort() 147 | matched_close_second = [] 148 | for date in dates: 149 | logger.debug('Getting dynamic holding end time on %s', date) 150 | bti = btdf[btdf.date == date] 151 | close_index = [np.nan if bti.trade[i] == 0 else dynamic_hold(bti, config, i) for i in bti.index] 152 | matched_close_second_i = bti.second[close_index].tolist() 153 | matched_close_second.extend(matched_close_second_i) 154 | return matched_close_second 155 | 156 | 157 | def pnl(btdf, config): 158 | logger.info('Computing PnL...') 159 | if config['use_mid']: 160 | btdf['open_price'] = btdf.mid 161 | else: 162 | btdf['open_price'] = (btdf.trade > 0) * btdf.s1 + (btdf.trade < 0) * btdf.b1 163 | if config['dynamic_unwinding']: 164 | btdf['matched_close_second'] = get_dynamic_period_close_second(btdf, config) 165 | else: 166 | btdf['matched_close_second'] = get_fixed_period_close_second(btdf, config) 167 | dummy_bt = btdf[['date', 'second', 'b1', 's1', 'mid']].copy() 168 | dummy_bt.columns = ['date', 'matched_close_second', 'close_b1', 'close_s1', 'close_mid'] 169 | btdf = utils.left_join(btdf, dummy_bt, ['date', 'matched_close_second']) 170 | if config['use_mid']: 171 | btdf['close_price'] = btdf.close_mid 172 | else: 173 | btdf['close_price'] = (btdf.trade > 0) * btdf.close_b1 + (btdf.trade < 0) * btdf.close_s1 174 | btdf['pnl'] = btdf.trade * (btdf.close_price - btdf.open_price) 175 | btdf['transaction_fee'] = config['transaction_fee'] * np.abs(btdf.trade) * (btdf.open_price + btdf.close_price) 176 | btdf['net_pnl'] = btdf['pnl'] - btdf['transaction_fee'] 177 | logger.info('Finished PnL calculation') 178 | return btdf 179 | 180 | 181 | def save(btdf, config): 182 | file_path = os.path.join(config['data_path'], 'backtest', config['name']) 183 | if not os.path.exists(file_path): 184 | os.makedirs(file_path) 185 | bt_file = os.path.join(file_path, 'backtest.pkl') 186 | logger.info('Saving backtesting result to %s', bt_file) 187 | btdf.to_pickle(bt_file) 188 | config_file = os.path.join(file_path, 'config.pkl') 189 | logger.info('Saving config file to %s', config_file) 190 | with open(config_file, 'wb') as cf: 191 | pickle.dump(config, cf) 192 | return 193 | 194 | 195 | def daily_summary(btdf): 196 | trades = btdf[btdf.trade != 0] 197 | f = {'pnl': 'sum', 'transaction_fee': 'sum', 'net_pnl': 'sum'} 198 | daily = trades.groupby('date').agg(f) 199 | daily['n_trades'] = trades.groupby('date').size() 200 | return daily 201 | 202 | 203 | def summary(btdf, config): 204 | trades = btdf[btdf.trade != 0] 205 | res = dict() 206 | res['training_period'] = config['training_period'] 207 | res['trade_trigger_threshold'] = config['trade_trigger_threshold'][1] 208 | res['holding_period'] = config['holding_period'] 209 | res['use_mid'] = config['use_mid'] 210 | res['unwinding_tick_move_upper_bound'] = config['unwinding_tick_move_upper_bound'] 211 | res['unwinding_tick_move_lower_bound'] = config['unwinding_tick_move_lower_bound'] 212 | 213 | res['n_trades'] = trades.shape[0] 214 | res['n_trading_days'] = len(set(trades.date)) 215 | res['n_trades_per_day'] = utils.safe_divide(res['n_trades'], res['n_trading_days']) 216 | 217 | res['winning_rate'] = sum(trades.pnl > 0) / trades.shape[0] 218 | res['losing_rate'] = sum(trades.pnl < 0) / trades.shape[0] 219 | res['net_winning_rate'] = sum(trades.net_pnl > 0) / trades.shape[0] 220 | res['net_losing_rate'] = sum(trades.net_pnl < 0) / trades.shape[0] 221 | 222 | res['total_pnl'] = trades.pnl.sum() 223 | 
res['total_net_pnl'] = trades.net_pnl.sum()
224 | 
225 |     res['avg_pnl_per_trade'] = trades.pnl.mean()
226 |     res['avg_net_pnl_per_trade'] = trades.net_pnl.mean()
227 |     res['med_pnl_per_trade'] = trades.pnl.median()
228 |     res['med_net_pnl_per_trade'] = trades.net_pnl.median()
229 | 
230 |     res['avg_pnl_per_winning_trade'] = trades[trades.pnl > 0].pnl.mean()
231 |     res['avg_pnl_per_losing_trade'] = trades[trades.pnl < 0].pnl.mean()
232 |     res['avg_net_pnl_per_winning_trade'] = trades[trades.net_pnl > 0].net_pnl.mean()
233 |     res['avg_net_pnl_per_losing_trade'] = trades[trades.net_pnl < 0].net_pnl.mean()
234 | 
235 |     res['avg_net_pnl_per_day'] = utils.safe_divide(res['total_net_pnl'], res['n_trading_days'])
236 |     res['avg_pnl_per_day'] = utils.safe_divide(res['total_pnl'], res['n_trading_days'])
237 | 
238 |     res['std_pnl_per_trade'] = trades.pnl.std()
239 |     res['std_net_pnl_per_trade'] = trades.net_pnl.std()
240 | 
241 |     res['corr_alpha_pnl'] = np.corrcoef(trades.alpha, trades.pnl)[0, 1]
242 |     res['corr_alpha_net_pnl'] = np.corrcoef(trades.alpha, trades.net_pnl)[0, 1]  # correlation of alpha with net pnl
243 | 
244 |     return pd.Series(res, name='value')
245 | 
--------------------------------------------------------------------------------
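The standard errors in fit() above are computed by hand as sqrt(diag(mse * (X'X)^-1)). A quick self-contained check of that formula against statsmodels on synthetic data (every name below is illustrative, not part of the repo):

import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(0)
x = rng.randn(500, 2)
y = x @ np.array([0.5, -0.2]) + rng.randn(500)
est = sm.OLS(y, x).fit()                                  # no constant, like fit_intercept=False
mse = np.sum((est.fittedvalues - y) ** 2) / (500 - 2)     # n - p, with p = 2 features
se = np.sqrt(np.diagonal(mse * np.linalg.inv(x.T @ x)))
assert np.allclose(se, est.bse)                           # matches statsmodels' standard errors
assert np.allclose(est.params / se, est.tvalues)          # same t-stats as stats['tstat']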
/hft/data_loader.py:
--------------------------------------------------------------------------------
1 | """
2 | Data Loading Functions
3 | """
4 | 
5 | import os
6 | import logging
7 | import json
8 | import pandas as pd
9 | from datetime import datetime
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | DATA_PATH = os.path.join(os.environ['HOME'], 'hft', 'data', 'SpRawFutureTick')
14 | with open(os.path.join('config', 'data_loader.json')) as data_config_file:
15 |     data_config = json.load(data_config_file)
16 | COLUMNS = data_config['columns']
17 | COLUMNS_TO_DROP = data_config['columns_to_drop']
18 | ENCODING = data_config['encoding']
19 | 
20 | 
21 | def get_dates():
22 |     return os.listdir(DATA_PATH)
23 | 
24 | 
25 | def get_filenames(product, yyyymmdd):
26 |     contract_month = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
27 |     filenames = [os.path.join(DATA_PATH, yyyymmdd, product + x + '_' + yyyymmdd + '.csv') for x in contract_month]
28 |     filenames = [x for x in filenames if os.path.isfile(x)]
29 |     return filenames
30 | 
31 | 
32 | def process_raw_table(px):
33 |     px.columns = COLUMNS
34 |     px['spread'] = px['s1'] - px['b1']
35 |     px['mid'] = 0.5 * (px['b1']+px['s1'])
36 |     px['return'] = (px['mid'] - px['mid'].shift(1)) / px['mid'].shift(1)
37 |     px['dt'] = px['date'] + ' ' + px['time']
38 |     px['dt'] = [datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in px['dt']]
39 |     px.set_index('dt', inplace=True)
40 |     px['second'] = (px.index.hour-9)*3600 + px.index.minute*60 + px.index.second
41 |     half_second_index = px.second == px.second.shift(-1)
42 |     px.loc[half_second_index, 'second'] = px.loc[half_second_index, 'second'] - 0.5
43 |     px.drop(COLUMNS_TO_DROP, axis=1, inplace=True)
44 |     return px
45 | 
46 | 
47 | def load_contract(product, yyyymmdd, contract_month):
48 |     logger.debug('Loading %s-%s data on %s', product, contract_month, yyyymmdd)
49 |     filename = os.path.join(DATA_PATH, yyyymmdd, product + contract_month + '_' + yyyymmdd + '.csv')
50 |     px = pd.read_csv(filename, encoding=ENCODING)
51 |     px = process_raw_table(px)
52 |     return px
53 | 
54 | 
55 | def load_active_contract(product, yyyymmdd):
56 |     logger.debug('Loading %s active contract data on %s', product, yyyymmdd)
57 |     filenames = get_filenames(product, yyyymmdd)
58 |     if len(filenames) == 0:
59 |         logger.warning('Cannot find files of %s on %s', product, yyyymmdd)
60 |         return pd.DataFrame()
61 |     px_list = [pd.read_csv(x, encoding=ENCODING) for x in filenames]
62 |     total_qty = [x.iloc[-1]['总量'] for x in px_list]  # '总量' is the cumulative volume column in the raw file
63 |     px = px_list[total_qty.index(max(total_qty))]  # select the contract with max qty
64 |     px = process_raw_table(px)
65 |     return px
66 | 
67 | 
68 | def load_active_contract_multiple_dates(product, dates):
69 |     logger.info('Loading %s active contract data from %s to %s', product, dates[0], dates[-1])
70 |     px_list = [load_active_contract(product, x) for x in dates]
71 |     px = pd.concat(px_list)
72 |     return px
73 | 
--------------------------------------------------------------------------------
/hft/signal_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility Functions of Constructing Signals for Research Purpose
3 | """
4 | 
5 | import os
6 | import json
7 | import logging
8 | import numpy as np
9 | import pandas as pd
10 | import pylab
11 | import matplotlib.pyplot as plt
12 | from sklearn import linear_model
13 | from scipy.stats.mstats import winsorize
14 | import scipy.stats as stats
15 | import statsmodels.api as sm
16 | 
17 | import hft.utils as utils
18 | 
19 | logger = logging.getLogger(__name__)
20 | 
21 | # signal construction
22 | # -------------------
23 | 
24 | with open(os.path.join('config', 'signal.json')) as signal_config_file:
25 |     signal_config = json.load(signal_config_file)
26 | # N_FLEXIBLE_HALF_SECONDS = signal_config['n_flexible_half_seconds']
27 | 
28 | 
29 | def order_imbalance_ratio(px, backward_seconds, forward_seconds, index_series):
30 |     """Order Imbalance Ratio
31 |     Reference: Cartea, Donnelly and Jaimungal (2015)
32 |     """
33 |     px['order_imbalance_ratio'] = (px['b1_size']-px['s1_size']) / (px['b1_size']+px['s1_size'])
34 |     px = utils.moving_operate(px, 'order_imbalance_ratio', np.mean, backward_seconds, forward_seconds, index_series)
35 |     return px
36 | 
37 | 
38 | def get_order_imbalance_column_name(conservative):
39 |     return ('conservative_' if conservative else '') + 'order_flow_imbalance'
40 | 
41 | 
42 | def single_order_imbalance(px, conservative=False):
43 |     """Order Flow Imbalance
44 |     Reference: [1] Rama Cont, Kukanov and Stoikov (2011)
45 |                [2] D Shen (2015)
46 | 
47 |     :param px: pandas data frame
48 |     :param conservative: logical, if True use definition in [1], otherwise use definition in [2]
49 |     :return: pandas data frame with OFI column appended
50 |     """
51 |     px['delta_b1'] = (px.b1 >= px.b1.shift(1)) * px.b1_size - (px.b1 <= px.b1.shift(1)) * px.b1_size.shift(1)
52 |     px['delta_s1'] = (px.s1 <= px.s1.shift(1)) * px.s1_size - (px.s1 >= px.s1.shift(1)) * px.s1_size.shift(1)
53 |     if conservative:
54 |         px.loc[px.b1 < px.b1.shift(1), 'delta_b1'] = 0.0
55 |         px.loc[px.s1 > px.s1.shift(1), 'delta_s1'] = 0.0
56 |     col_name = get_order_imbalance_column_name(conservative)
57 |     px[col_name] = px['delta_b1'] - px['delta_s1']
58 |     px.drop(['delta_b1', 'delta_s1'], axis=1, inplace=True)
59 |     return px
60 | 
61 | 
62 | def order_flow_imbalance(px, backward_seconds, forward_seconds, index_series, conservative=False):
63 |     col_name = get_order_imbalance_column_name(conservative)
64 |     if col_name not in px.columns:
65 |         px = single_order_imbalance(px, conservative)
66 |     px = utils.moving_operate(px, col_name, sum, backward_seconds, forward_seconds, index_series)
67 |     return px
68 | 
69 | 
70 | def period_return(price_series):
71 |     price_array = np.array(price_series)
72 |     return np.nan if len(price_array) == 1 else (price_array[-1] - price_array[0]) 
/ price_array[0] 73 | 74 | 75 | def period_tick_move(price_series, tick_size): 76 | price_array = np.array(price_series) 77 | return np.nan if len(price_array) == 1 else (price_array[-1] - price_array[0]) / tick_size 78 | 79 | 80 | def period_mid_move(px, backward_seconds, forward_seconds, tick_size, index_series): 81 | """ 82 | Compute period price move, price tick move and return 83 | """ 84 | px = utils.moving_operate(px, 'mid', lambda x: period_tick_move(x, tick_size), 85 | backward_seconds, forward_seconds, index_series, 'tick_move') 86 | px = utils.moving_operate(px, 'mid', period_return, backward_seconds, forward_seconds, index_series, 'return') 87 | return px 88 | 89 | 90 | def signal_on_multiple_dates(pxall, func): 91 | """Compute signal over multiple days 92 | 93 | :param pxall: pandas data frame, price data 94 | :param func: function to compute one signal 95 | :return: pandas data frame with signal column appended 96 | """ 97 | dates = sorted(list(set(pxall.date))) 98 | logger.info('Computing signal from %s to %s', dates[0], dates[-1]) 99 | px_list = [func(pxall[pxall.date == x].copy()) for x in dates] 100 | px_enrich = pd.concat(px_list) 101 | return px_enrich 102 | 103 | 104 | # signal research / backtesting 105 | # ----------------------------- 106 | 107 | 108 | def plot_two_hist(px, column, freq1, freq2): 109 | column1 = utils.get_moving_column_name(column, freq1, 0) 110 | column2 = utils.get_moving_column_name(column, freq2, 0) 111 | plt.subplot(1, 2, 1) 112 | px[column1].hist(bins=100) 113 | plt.xlabel(column1) 114 | plt.subplot(1, 2, 2) 115 | px[column2].hist(bins=100) 116 | plt.xlabel(column2) 117 | return 118 | 119 | 120 | def scatter_plot(px, x_column, x_backward, x_forward, y_column, y_backward, y_forward): 121 | x_column_name = utils.get_moving_column_name(x_column, x_backward, x_forward) 122 | y_column_name = utils.get_moving_column_name(y_column, y_backward, y_forward) 123 | regr_data = px[[x_column_name, y_column_name]].dropna() 124 | x = regr_data[[x_column_name]].values 125 | y = regr_data[y_column_name].values 126 | regr = linear_model.LinearRegression() 127 | regr.fit(x, y) 128 | print('Coefficients: \n', regr.coef_) 129 | print('R-square: %f' % regr.score(x, y)) 130 | plt.scatter(x, y, marker='o', s=0.1) 131 | plt.plot(x, regr.predict(x), color='red', linewidth=1) 132 | plt.xlabel(x_column_name) 133 | plt.ylabel(y_column_name) 134 | plt.show() 135 | return 136 | 137 | 138 | def plot_two_scatter(px, x_column, y_column, x_b1, x_f1, y_b1, y_f1, x_b2, x_f2, y_b2, y_f2): 139 | plt.subplot(1, 2, 1) 140 | scatter_plot(px, x_column, x_b1, x_f1, y_column, y_b1, y_f1) 141 | plt.subplot(1, 2, 2) 142 | scatter_plot(px, x_column, x_b2, x_f2, y_column, y_b2, y_f2) 143 | return 144 | 145 | 146 | def xy_corr(px, second_list, x_raw_column, y_raw_column='tick_move', winsorize_option=None): 147 | px_new = px.copy() 148 | x_column = [utils.get_moving_column_name(x_raw_column, x, 0) for x in second_list] 149 | y_column = [utils.get_moving_column_name(y_raw_column, 0, x) for x in second_list] 150 | if winsorize_option is not None: 151 | for col in x_column: 152 | px_new[col] = utils.winsorize(px_new[col], winsorize_option['x_prob'], winsorize_option['x_bound']) 153 | for col in y_column: 154 | px_new[col] = utils.winsorize(px_new[col], winsorize_option['y_prob'], winsorize_option['y_bound']) 155 | big_corr = px_new[x_column + y_column].corr() 156 | corr_mat = big_corr.loc[y_column, x_column] 157 | return corr_mat 158 | 159 | 160 | def xx_corr(px, second_list, column_name, 
row_name): 161 | column_names = [utils.get_moving_column_name(column_name, x, 0) for x in second_list] 162 | row_names = [utils.get_moving_column_name(row_name, x, 0) for x in second_list] 163 | big_corr = px[column_names + row_names].corr() 164 | corr_mat = big_corr.loc[row_names, column_names] 165 | return corr_mat 166 | 167 | 168 | def reg(px, freq_oir, freq_ofi, freq_xreturn, freq_yreturn, show_plot=True, show_inference=True): 169 | oir_column_name = utils.get_moving_column_name('order_imbalance_ratio', freq_oir, 0) 170 | ofi_column_name = utils.get_moving_column_name('order_flow_imbalance', freq_ofi, 0) 171 | xreturn_column_name = utils.get_moving_column_name('tick_move', freq_xreturn, 0) 172 | yreturn_column_name = utils.get_moving_column_name('tick_move', 0, freq_yreturn) 173 | regr_data = px[[oir_column_name, ofi_column_name, xreturn_column_name, yreturn_column_name]].dropna() 174 | regr_data[ofi_column_name] = winsorize(regr_data[ofi_column_name], (0.005, 0.005)) 175 | # regr_data[xreturn_column_name] = winsorize(regr_data[xreturn_column_name], (0.005, 0.005)) 176 | # regr_data[yreturn_column_name] = winsorize(regr_data[yreturn_column_name], (0.005, 0.005)) 177 | x = regr_data[[oir_column_name, ofi_column_name, xreturn_column_name]].values 178 | y = regr_data[yreturn_column_name].values 179 | regr = linear_model.LinearRegression() 180 | regr.fit(x, y) 181 | yhat = regr.predict(x) 182 | resids = yhat - y 183 | if show_plot: 184 | # regression line 185 | plt.figure(1) 186 | plt.scatter(yhat, y, marker='o', s=0.1) 187 | plt.plot(yhat, yhat, color='red', linewidth=1) 188 | plt.xlabel('Fitted ' + yreturn_column_name) 189 | plt.ylabel('Observed ' + yreturn_column_name) 190 | plt.show() 191 | # residual histogram 192 | plt.figure(2) 193 | plt.hist(resids, bins=40) 194 | plt.title('Histogram of residuals') 195 | # residual qq plot 196 | plt.figure(3) 197 | stats.probplot(resids, dist="norm", plot=pylab) 198 | plt.title('QQ plot of residuals') 199 | if show_inference: 200 | x2 = sm.add_constant(x) 201 | est = sm.OLS(y, x2) 202 | est2 = est.fit() 203 | print(est2.summary()) 204 | return {'r-square': regr.score(x, y), 'beta': regr.coef_, 'residuals': resids} 205 | -------------------------------------------------------------------------------- /hft/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions 3 | """ 4 | 5 | import logging 6 | import numpy as np 7 | import pandas as pd 8 | from scipy.stats import mstats 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | # array/atom manipulation 13 | # ----------------------- 14 | 15 | 16 | def safe_divide(a, b): 17 | return np.nan if b == 0 else a/b 18 | 19 | 20 | def winsorize(array, prob, bound): 21 | """Winsorize an array based on both prob and bound 22 | 23 | :param array: array-like 24 | :param prob: list of len=2, tail probabilities, example: [0.01, 0.01] 25 | :param bound: list of len=2, tail bounds, example: [-100, 100] 26 | :return: array-like, winsorized array 27 | """ 28 | winsorized_array = pd.Series(mstats.winsorize(array, tuple(prob))) 29 | winsorized_array[winsorized_array > bound[1]] = bound[1] 30 | winsorized_array[winsorized_array < bound[0]] = bound[0] 31 | return winsorized_array.values 32 | 33 | 34 | def get_period_px(px, period): 35 | """Return period price (minutely, five-minutely etc) 36 | 37 | :param px: original px data frame, must have column mid 38 | :param period: integer, in seconds, to get minutely return, set period=60 39 | :return: numpy array 
of period prices 40 | """ 41 | period_px = px.groupby(pd.cut(px['second'], np.arange(0, 21600, period))).last() 42 | period_px = period_px[~period_px.time.isin(['10:30:00', '13:30:00'])] 43 | period_px = period_px[~np.isnan(period_px.second)] 44 | return period_px 45 | 46 | 47 | # table aggregation 48 | # ----------------- 49 | 50 | 51 | def aggregate(pxall, group, funs, rename_dict=None): 52 | daily_agg_px = pxall.groupby(['date', group]).agg(funs) 53 | daily_agg_px.rename(columns=rename_dict, inplace=True) 54 | daily_agg_px['n_trades'] = pxall.groupby(['date', group]).size() 55 | agg_px = daily_agg_px.reset_index().groupby(group).median() 56 | return agg_px 57 | 58 | 59 | def left_join(df1, df2, key_column): 60 | """Left join two pandas data frames. Always replace columns in df1 if also presented in df2 61 | 62 | :param df1: pandas data frame 63 | :param df2: pandas data frame 64 | :param key_column: list of strings or string 65 | :return: pandas data frame 66 | """ 67 | df1 = df1[list(key_column) + df1.columns.difference(df2.columns).tolist()] 68 | df = pd.merge(df1, df2, on=key_column, how='left') 69 | return df 70 | 71 | 72 | # compute a new column based on a period of data 73 | # ---------------------------------------------- 74 | 75 | 76 | def get_moving_column_name(column, backward_seconds, forward_seconds): 77 | return column + '_' + str(backward_seconds) + '_' + str(forward_seconds) 78 | 79 | 80 | def get_raw_column_name(moving_column_name): 81 | words = moving_column_name.split('_') 82 | return '_'.join(words[:(len(words)-2)]) 83 | 84 | 85 | def get_index_within_period(second, backward_seconds, forward_seconds, px=None): 86 | logger.info('Getting index within (%s, %s) seconds', str(backward_seconds), str(forward_seconds)) 87 | forward_second = second + forward_seconds 88 | backward_second = second - backward_seconds 89 | index_series = [second.index[(second.between(backward_second[i], forward_second[i])).values] for i in second.index] 90 | idx_col = get_moving_column_name('index_within_period', backward_seconds, forward_seconds) 91 | if px is not None: 92 | px[idx_col] = index_series 93 | logger.info('Finished getting index within (%s, %s) seconds', str(backward_seconds), str(forward_seconds)) 94 | return pd.Series(index_series, index=second.index, name=idx_col) 95 | 96 | 97 | def get_index_multiple_dates(pxall, backward_seconds, forward_seconds): 98 | dates = sorted(list(set(pxall.date))) 99 | logger.info('Getting index from %s to %s', dates[0], dates[-1]) 100 | index_list = [get_index_within_period(pxall.loc[pxall.date == x, 'second'], backward_seconds, forward_seconds) 101 | for x in dates] 102 | index_series = pd.concat(index_list) 103 | return index_series 104 | 105 | 106 | def moving_operate(px, column_name, func, backward_seconds, forward_seconds, index_series, new_column_name=None): 107 | """Compute the moving operation of a column 108 | 109 | :param px: pandas data frame, need to have column column 110 | :type px: pandas data frame 111 | :param forward_seconds: int, number of seconds going forward 112 | :param backward_seconds: int, number of seconds going backward 113 | :param column_name: string, column name 114 | :param func: function, could be average, sum or any user-defined operations 115 | :param new_column_name: string, new column name 116 | :param index_series: pandas series, index of prevailing observations 117 | :return: pandas data frame 118 | """ 119 | if new_column_name is None: 120 | new_column_name = column_name 121 | new_column_name = 
get_moving_column_name(new_column_name, backward_seconds, forward_seconds)
122 |     logger.info('Computing moving operation')
123 |     index_series = index_series[px.index]
124 |     px[new_column_name] = [func(px.loc[idx, column_name]) for idx in index_series]
125 |     logger.info('Finish computing moving operation')
126 |     return px
127 | 
--------------------------------------------------------------------------------
/research/backtest.py:
--------------------------------------------------------------------------------
1 | """
2 | Back test
3 | """
4 | 
5 | import os
6 | import json
7 | import logging
8 | import numpy as np
9 | import pandas as pd
10 | 
11 | import hft.backtester as bt
12 | 
13 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
14 | 
15 | hft_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft')
16 | data_path = os.path.join(hft_path, 'data')
17 | research_path = os.path.join(hft_path, 'research')
18 | 
19 | # load enriched data
20 | # ------------------
21 | 
22 | product = 'cu'  # switch between cu and zn
23 | with open(os.path.join(data_path, 'ticksize.json')) as ticksize_file:
24 |     ticksize_json = json.load(ticksize_file)
25 | 
26 | px = pd.read_pickle(os.path.join(data_path, product+'_enriched.pkl'))
27 | if product == 'zn':
28 |     # remove unusual day for zn
29 |     px20131031 = px[px.date == '2013-10-31']
30 |     px = px[px.date != '2013-10-31']
31 | 
32 | # configuration
33 | # -------------
34 | 
35 | config = dict()
36 | 
37 | # general configuration
38 | config['name'] = product + '_1'
39 | config['data_path'] = data_path
40 | config['start_date'] = '2013-10-05'
41 | config['tick_size'] = ticksize_json[product]
42 | 
43 | # model specifics
44 | config['training_period'] = 1  # days
45 | config['feature_column'] = ['order_imbalance_ratio', 'order_flow_imbalance', 'tick_move']
46 | config['feature_freq'] = [1, 2, 5, 10, 20, 30, 60, 120, 180, 300]
47 | config['feature_winsorize_prob'] = {'order_imbalance_ratio': [0.0, 0.0],
48 |                                     'order_flow_imbalance': [0.005, 0.005],
49 |                                     'tick_move': [0, 0]}
50 | config['feature_winsorize_bound'] = {'order_imbalance_ratio': [-np.inf, np.inf],
51 |                                      'order_flow_imbalance': [-np.inf, np.inf],
52 |                                      'tick_move': [-10, 10]}
53 | config['response_column'] = 'tick_move'
54 | config['response_winsorize_prob'] = [0, 0]
55 | config['response_winsorize_bound'] = [-5, 5]
56 | 
57 | # open/close/hold condition
58 | config['holding_period'] = 120  # seconds
59 | config['dynamic_unwinding'] = True
60 | config['unwinding_tick_move_upper_bound'] = 3
61 | config['unwinding_tick_move_lower_bound'] = -3
62 | config['trade_trigger_threshold'] = [-1.5, 1.5]
63 | config['start_second'] = 120
64 | config['end_second'] = 21420
65 | 
66 | # pnl
67 | config['use_mid'] = False  # if False, use touch price
68 | config['transaction_fee'] = 0.0001  # 1 bps transaction fee
69 | 
70 | # backtesting
71 | # -----------
72 | 
73 | btdf, fitting_stats = bt.backtest(px, config)  # backtest() returns (btdf, fitting_stats)
74 | btdf = bt.trade(btdf, config)
75 | btdf = bt.pnl(btdf, config)
76 | # bt.save(btdf, config)
77 | 
78 | trades = btdf[btdf.trade != 0]
79 | bt.summary(btdf, config)
80 | bt.daily_summary(btdf)
81 | trades.pnl.hist(bins=30)
82 | 
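# A quick sanity check that fits here (a sketch, using only columns that
# bt.daily_summary() already returns): plot cumulative daily net pnl to
# eyeball drawdowns before running the grid searches below.
daily = bt.daily_summary(btdf)
daily['net_pnl'].cumsum().plot(title='cumulative net pnl')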
83 | # pnl vs threshold - fixed period
84 | # -------------------------------
85 | 
86 | training_periods = [1, 5]
87 | holding_periods = [20, 30, 60, 120, 180, 300]
88 | thresholds = [0.5, 1.0, 1.5, 2.0]
89 | file_path = os.path.join(data_path, 'backtest', product + '_by_hldg_thld')
90 | res_table = pd.DataFrame()
91 | 
92 | for training_period in training_periods:
93 |     print('############################################')
94 |     print('########## training_period = ' + str(training_period) + ' ##########')
95 |     config['training_period'] = training_period
96 |     for use_mid in [True, False]:
97 |         print('############################################')
98 |         print('########## use_mid = ' + str(use_mid) + ' ##########')
99 |         config['use_mid'] = use_mid
100 |         for hldg in holding_periods:
101 |             print('Computing pnl for holding_period = ' + str(hldg))
102 |             by_thld_table = pd.DataFrame()
103 |             config['holding_period'] = hldg
104 |             btdf, _ = bt.backtest(px, config)
105 |             for thld in thresholds:
106 |                 config['trade_trigger_threshold'] = [-thld, thld]
107 |                 btdf = bt.trade(btdf, config)
108 |                 btdf = bt.pnl(btdf, config)
109 |                 by_thld_table[str(thld)] = bt.summary(btdf, config)
110 |             by_thld_table = by_thld_table.transpose()
111 |             res_table = res_table.append(by_thld_table)
112 |             # file_name = os.path.join(file_path, product + '_' + str(hldg) + '.csv')
113 |             # by_thld_table.to_csv(file_name)
114 | 
115 | file_name = os.path.join(file_path, product + '.csv')
116 | res_table.to_csv(file_name, index=False)
117 | 
118 | # pnl vs threshold - dynamic holding
119 | # ----------------------------------
120 | 
121 | training_periods = [1, 5]
122 | thresholds = [0.5, 1.0, 1.5]
123 | holding_periods = [60, 120, 300]
124 | unwinding_upper_bounds = [3, 3, 5, 5]
125 | unwinding_lower_bounds = [-3, -2, -5, -3]
126 | file_path = os.path.join(data_path, 'backtest')
127 | res_table = pd.DataFrame()
128 | fitting_stats = pd.DataFrame()
129 | 
130 | for training_period in training_periods:
131 |     print('############################################')
132 |     print('########## training_period = ' + str(training_period) + ' ##########')
133 |     config['training_period'] = training_period
134 |     for hldg in holding_periods:
135 |         print('############################################')
136 |         print('########## holding_period = ' + str(hldg) + ' ##########')
137 |         config['holding_period'] = hldg
138 |         by_thld_table = pd.DataFrame()
139 |         btdf, stats = bt.backtest(px, config)
140 |         fitting_stats = fitting_stats.append(stats)
141 |         for i_unwinding in range(len(unwinding_lower_bounds)):
142 |             print('Unwinding upper bound = ' + str(unwinding_upper_bounds[i_unwinding]))
143 |             config['unwinding_tick_move_upper_bound'] = unwinding_upper_bounds[i_unwinding]
144 |             config['unwinding_tick_move_lower_bound'] = unwinding_lower_bounds[i_unwinding]
145 |             for thld in thresholds:
146 |                 config['trade_trigger_threshold'] = [-thld, thld]
147 |                 btdf = bt.trade(btdf, config)
148 |                 for use_mid in [True, False]:
149 |                     config['use_mid'] = use_mid
150 |                     btdf = bt.pnl(btdf, config)
151 |                     by_thld_table = bt.summary(btdf, config)
152 |                     res_table = res_table.append(by_thld_table, ignore_index=True)
153 |                     # file_name = os.path.join(file_path, product + '_' + str(hldg) + '.csv')
154 |                     # by_thld_table.to_csv(file_name)
155 | 
156 | res_file_name = os.path.join(file_path, product + '_dynamic_holding.csv')
157 | fit_file_name = os.path.join(file_path, product + '_dynamic_holding_fitting.pkl')
158 | res_table.to_csv(res_file_name, index=False)
159 | fitting_stats.to_pickle(fit_file_name)
160 | 
161 | # examine why the pnl is positive
162 | # -------------------------------
163 | 
164 | config = dict()
165 | 
166 | # general configuration
167 | config['name'] = product + '_1'
168 | config['data_path'] = data_path
169 | config['start_date'] = '2013-10-05'
170 | config['tick_size'] = ticksize_json[product]
171 | 
172 | # model specifics
173 | config['training_period'] = 1  # 
days 174 | config['feature_column'] = ['order_imbalance_ratio', 'order_flow_imbalance', 'tick_move'] 175 | config['feature_freq'] = [1, 2, 5, 10, 20, 30, 60, 120, 180, 300] 176 | config['feature_winsorize_prob'] = {'order_imbalance_ratio': [0.0, 0.0], 177 | 'order_flow_imbalance': [0.005, 0.005], 178 | 'tick_move': [0, 0]} 179 | config['feature_winsorize_bound'] = {'order_imbalance_ratio': [-np.inf, np.inf], 180 | 'order_flow_imbalance': [-np.inf, np.inf], 181 | 'tick_move': [-10, 10]} 182 | config['response_column'] = 'tick_move' 183 | config['response_winsorize_prob'] = [0, 0] 184 | config['response_winsorize_bound'] = [-5, 5] 185 | 186 | # open/close/hold condition 187 | config['holding_period'] = 60 # seconds 188 | config['dynamic_unwinding'] = True 189 | config['unwinding_tick_move_upper_bound'] = 5 190 | config['unwinding_tick_move_lower_bound'] = -5 191 | config['trade_trigger_threshold'] = [-1.5, 1.5] 192 | config['start_second'] = 120 193 | config['end_second'] = 21420 194 | 195 | # pnl 196 | config['use_mid'] = True # if False, use touch price 197 | config['transaction_fee'] = 0.0001 # 1 bps transaction fee 198 | 199 | # backtesting 200 | 201 | btdf, stats = bt.backtest(px, config) 202 | btdf = bt.trade(btdf, config) 203 | btdf = bt.pnl(btdf, config) 204 | trades = btdf[btdf.trade != 0] 205 | bt.summary(btdf, config) 206 | bt.daily_summary(btdf) 207 | t = trades[trades.date == '2013-12-26'] 208 | -------------------------------------------------------------------------------- /research/eda.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | import matplotlib.pylab as plt 4 | 5 | import hft.data_loader as dl 6 | import hft.utils as utils 7 | 8 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s') 9 | 10 | # some random day 11 | # --------------- 12 | 13 | product = 'cu' # switch between cu and zn 14 | yyyymmdd = '20131015' 15 | px = dl.load_active_contract(product, yyyymmdd) 16 | px.price.plot() 17 | px[['price', 'mid']].plot() 18 | 19 | # overall eda 20 | # ----------- 21 | 22 | dates = dl.get_dates() 23 | pxall = dl.load_active_contract_multiple_dates(product, dates) 24 | 25 | # daily aggregate 26 | # we are more interested in intraday behavior 27 | 28 | daily_funs = {'price': 'last', 'volume': 'last', 'open_interest': 'sum', 'spread': 'mean', 'mid': 'mean', 29 | 'return': lambda x: np.nansum(x * x)} 30 | daily_px = pxall.groupby('date').agg(daily_funs) 31 | daily_px.rename(columns={'return': 'realized_vol'}, inplace=True) 32 | daily_px.volume.plot(title='volume') 33 | daily_px.price.plot(title='close') 34 | daily_px.spread.plot(title='avg spread') 35 | daily_px.mid.plot(title='avg mid px') 36 | daily_px.realized_vol.plot(title='realized volatility') 37 | daily_px[['price', 'mid']].plot() 38 | plt.plot(daily_px.price, daily_px.volume, 'o') 39 | plt.plot(daily_px.volume, daily_px.spread, 'o') 40 | 41 | # intraday 42 | 43 | pxall['hour'] = pxall.index.hour 44 | pxall['minute'] = pxall.index.minute + 60 * pxall.index.hour 45 | funs = {'mid': np.mean, 'qty': np.sum, 'spread': np.mean, 'open_interest': np.sum, 46 | 'b1_size': np.mean, 's1_size': np.mean, 'return': lambda x: np.nansum(x*x)} 47 | rename_dict = {'return': 'realized_vol'} 48 | 49 | hourly_px = utils.aggregate(pxall, 'hour', funs, rename_dict) 50 | 51 | minutely_px = utils.aggregate(pxall, 'minute', funs, rename_dict) 52 | minutely_px['n_trades'].plot(title='# trades') 53 | 
minutely_px['qty'].plot(title='volume')
54 | minutely_px['spread'].plot(title='spread')
55 | minutely_px['realized_vol'].plot(title='realized volatility')
56 | minutely_px[['b1_size', 's1_size']].plot()
57 | (minutely_px.b1_size - minutely_px.s1_size).plot(title='b1_size - s1_size')
58 | plt.plot(minutely_px.spread, minutely_px.realized_vol, 'o')
--------------------------------------------------------------------------------
/research/hmm.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import logging
4 | import numpy as np
5 | import pandas as pd
6 | from hmmlearn import hmm
7 | import matplotlib.pyplot as plt
8 | from datetime import datetime
9 | 
10 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
11 | 
12 | hft_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft')
13 | data_path = os.path.join(hft_path, 'data')
14 | research_path = os.path.join(hft_path, 'research')
15 | 
16 | # load enriched data
17 | # ------------------
18 | 
19 | product = 'cu'  # switch between cu and zn
20 | with open(os.path.join(data_path, 'ticksize.json')) as ticksize_file:
21 |     ticksize_json = json.load(ticksize_file)
22 | px = pd.read_pickle(os.path.join(data_path, product+'.pkl'))
23 | 
24 | if product == 'zn':
25 |     # remove unusual day for zn
26 |     px20131031 = px[px.date == '2013-10-31']
27 |     px = px[px.date != '2013-10-31']
28 | 
29 | dates = list(set(px.date.tolist()))
30 | dates.sort()
31 | n_dates = len(dates)
32 | format_dates = [datetime.strptime(x, '%Y-%m-%d') for x in dates]
33 | 
34 | # test hmm!
35 | # ---------
36 | 
37 | date = '2013-10-09'
38 | dailyPx = px[['date', 'mid']][px.date == date]
39 | dailyPx['tick_move'] = (dailyPx['mid']-dailyPx['mid'].shift(1)) / ticksize_json[product]
40 | dailyPx.mid.plot()
41 | dailyPx.tick_move.plot()
42 | 
43 | x = dailyPx[dailyPx.date == date].tick_move.values[1:]
44 | x = np.clip(x, -3, 3)  # clip both tails into [-3, 3]; assigning 3 to abs(x) > 3 would flip large down-moves to +3
45 | x = x.reshape(x.size, 1)
46 | model = hmm.GaussianHMM(n_components=5, covariance_type='diag', n_iter=50)
47 | model.fit(x)
48 | 
49 | # hmm parameter by date
50 | # ---------------------
51 | 
52 | n_comp = 3
53 | transmat = np.repeat(np.nan, n_comp*n_comp*n_dates).reshape(n_dates, n_comp, n_comp)
54 | emission_mean = np.repeat(np.nan, n_comp*n_dates).reshape(n_dates, n_comp)
55 | emission_std = np.repeat(np.nan, n_comp*n_dates).reshape(n_dates, n_comp)
56 | starting_prob = np.repeat(np.nan, n_comp*n_dates).reshape(n_dates, n_comp)
57 | 
58 | for i, date in enumerate(dates):
59 |     print('Fit HMM on ' + date)
60 |     dailyPx = px[['date', 'mid']][px.date == date]
61 |     dailyPx['tick_move'] = (dailyPx['mid'] - dailyPx['mid'].shift(1)) / ticksize_json[product]
62 |     x = dailyPx[dailyPx.date == date].tick_move.values[1:]
63 |     x = np.clip(x, -3, 3)  # clip both tails, as above
64 |     x = x.reshape(x.size, 1)
65 |     model = hmm.GaussianHMM(n_components=n_comp, n_iter=50)
66 |     model.fit(x)
67 |     index = np.argsort(model.means_.reshape(n_comp))  # sort states based on means
68 |     transmat[i, :, :] = model.transmat_[np.ix_(index, index)]  # np.ix_ reorders both rows and columns of the matrix
69 |     emission_mean[i, :] = model.means_.reshape(n_comp)[index]
70 |     emission_std[i, :] = np.sqrt(model.covars_).reshape(n_comp)[index]
71 |     starting_prob[i, :] = model.startprob_[index]
72 | 
73 | plt.plot(format_dates, emission_mean[:, 0], 'r')
74 | plt.plot(format_dates, emission_mean[:, 1], 'b')
75 | plt.plot(format_dates, emission_mean[:, 2], 'g')
76 | plt.show()
77 | 
78 | plt.plot(format_dates, emission_std[:, 0], 'r')
79 | plt.plot(format_dates, emission_std[:, 1], 'b')
80 | plt.plot(format_dates, emission_std[:, 2], 'g')
81 | plt.show()
82 | 
83 | plt.plot(format_dates, starting_prob[:, 0], 'r')
84 | plt.plot(format_dates, starting_prob[:, 1], 'b')
85 | plt.plot(format_dates, starting_prob[:, 2], 'g')
86 | plt.show()
87 | 
88 | plt.plot(format_dates, transmat[:, 0, 0], 'r')
89 | plt.plot(format_dates, transmat[:, 0, 1], 'b')
90 | plt.plot(format_dates, transmat[:, 0, 2], 'g')
91 | plt.show()
92 | 
93 | plt.plot(format_dates, transmat[:, 1, 0], 'r')
94 | plt.plot(format_dates, transmat[:, 1, 1], 'b')
95 | plt.plot(format_dates, transmat[:, 1, 2], 'g')
96 | plt.show()
97 | 
98 | plt.plot(format_dates, transmat[:, 2, 0], 'r')
99 | plt.plot(format_dates, transmat[:, 2, 1], 'b')
100 | plt.plot(format_dates, transmat[:, 2, 2], 'g')
101 | plt.show()
102 | 
--------------------------------------------------------------------------------
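Beyond the raw transition matrices plotted above, one compact per-day summary is the stationary state distribution implied by each day's chain. A self-contained sketch, assuming `model` and `x` as fitted in the loop above (the variable names here are illustrative):

import numpy as np

# pi solves pi @ P = pi with P = model.transmat_ (row-stochastic), i.e. pi is the
# eigenvector of P.T for eigenvalue 1, which is the largest eigenvalue
evals, evecs = np.linalg.eig(model.transmat_.T)
pi = np.real(evecs[:, np.argmax(np.real(evals))])
pi = pi / pi.sum()           # normalize; the Perron eigenvector has entries of one sign
states = model.predict(x)    # Viterbi-decoded state path for the same day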
/research/price_dynamics_eda.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import json
4 | from datetime import datetime
5 | import numpy as np
6 | import pandas as pd
7 | import statsmodels.tsa.stattools as sm
8 | from sklearn import linear_model
9 | 
10 | import hft.utils as utils
11 | 
12 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
13 | 
14 | 
15 | # load data
16 | # ---------
17 | 
18 | hft_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft')
19 | data_path = os.path.join(hft_path, 'data')
20 | research_path = os.path.join(hft_path, 'research')
21 | 
22 | product = 'cu'  # switch between cu and zn
23 | with open(os.path.join(data_path, 'ticksize.json')) as ticksize_file:
24 |     ticksize_json = json.load(ticksize_file)
25 | tick_size = ticksize_json[product]
26 | px = pd.read_pickle(os.path.join(data_path, product+'.pkl'))
27 | 
28 | if product == 'zn':
29 |     # remove unusual day for zn
30 |     px20131031 = px[px.date == '2013-10-31']
31 |     px = px[px.date != '2013-10-31']
32 | 
33 | dates = list(set(px.date.tolist()))
34 | dates.sort()
35 | n_dates = len(dates)
36 | format_dates = [datetime.strptime(x, '%Y-%m-%d') for x in dates]
37 | 
38 | # save minutely price
39 | # -------------------
40 | 
41 | period = 60
42 | mpx = pd.DataFrame()
43 | for date in dates:
44 |     print('Gathering prices on ' + date)
45 |     dailyPx = px[px.date == date]
46 |     dailyPx = utils.get_period_px(dailyPx, period)
47 |     mpx = mpx.append(dailyPx)
48 | mpx = mpx[['date', 'second', 'mid', 'b1', 's1', 'b1_size', 's1_size', 'spread', 'price', 'qty', 'volume', 'open_interest']]
49 | mpx.reset_index(drop=True, inplace=True)
50 | mpx.to_csv('mpx.csv', index=False)
51 | 
52 | # daily statistics
53 | # ----------------
54 | 
55 | df = pd.DataFrame()
56 | df['date'] = dates
57 | df['win'] = np.repeat(np.nan, n_dates)
58 | df['draw'] = np.repeat(np.nan, n_dates)
59 | df['lose'] = np.repeat(np.nan, n_dates)
60 | df['pnl'] = np.repeat(np.nan, n_dates)
61 | df['rv'] = np.repeat(np.nan, n_dates)
62 | df['hml'] = np.repeat(np.nan, n_dates)
63 | df['break1'] = np.repeat(np.nan, n_dates)
64 | df['break2'] = np.repeat(np.nan, n_dates)
65 | df['mid_acf_1'] = np.repeat(np.nan, n_dates)
66 | df['mid_acf_2'] = np.repeat(np.nan, n_dates)
67 | df['mid_pacf_1'] = np.repeat(np.nan, n_dates)
68 | df['mid_pacf_2'] = np.repeat(np.nan, n_dates)
69 | df['diff_acf_1'] = np.repeat(np.nan, n_dates)
70 | df['diff_acf_2'] = np.repeat(np.nan, n_dates)
71 | df['diff_pacf_1'] = np.repeat(np.nan, n_dates)
72 | df['diff_pacf_2'] = np.repeat(np.nan, n_dates)
73 | 
74 | for date in dates:
75 | 
print('Compute statistics on date ' + date) 76 | 77 | dailyPx = px.loc[px.date == date, 'mid'] 78 | seconds = px.loc[px.date == date, 'second'] 79 | mid_diff = (dailyPx - dailyPx.shift(1)).values / tick_size 80 | mid_diff = mid_diff[1:] 81 | win = np.sum(mid_diff > 0) / len(mid_diff) 82 | lose = np.sum(mid_diff < 0) / len(mid_diff) 83 | draw = np.sum(mid_diff == 0) / len(mid_diff) 84 | pnl = np.sum(mid_diff) 85 | hml = (dailyPx.max() - dailyPx.min()) / tick_size 86 | df.loc[df.date == date, 'win'] = win 87 | df.loc[df.date == date, 'draw'] = draw 88 | df.loc[df.date == date, 'lose'] = lose 89 | df.loc[df.date == date, 'pnl'] = pnl 90 | df.loc[df.date == date, 'rv'] = np.sqrt(np.nansum(mid_diff * mid_diff) / len(mid_diff)) 91 | df.loc[df.date == date, 'hml'] = hml 92 | 93 | break1 = dailyPx[seconds >= 5400].values[0] - dailyPx[seconds <= 4500].values[-1] 94 | break2 = dailyPx[seconds >= 16200].values[0] - dailyPx[seconds <= 9000].values[-1] 95 | df.loc[df.date == date, 'break1'] = break1 / tick_size 96 | df.loc[df.date == date, 'break2'] = break2 / tick_size 97 | 98 | mid_acf = sm.acf(dailyPx.values, nlags=2) 99 | mid_pacf = sm.pacf(dailyPx.values, nlags=2) 100 | diff_acf = sm.acf(mid_diff, nlags=2) 101 | diff_pacf = sm.pacf(mid_diff, nlags=2) 102 | df.loc[df.date == date, 'mid_acf_1'] = mid_acf[1] 103 | df.loc[df.date == date, 'mid_acf_2'] = mid_acf[2] 104 | df.loc[df.date == date, 'mid_pacf_1'] = mid_pacf[1] 105 | df.loc[df.date == date, 'mid_pacf_2'] = mid_pacf[2] 106 | df.loc[df.date == date, 'diff_acf_1'] = diff_acf[1] 107 | df.loc[df.date == date, 'diff_acf_2'] = diff_acf[2] 108 | df.loc[df.date == date, 'diff_pacf_1'] = diff_pacf[1] 109 | df.loc[df.date == date, 'diff_pacf_2'] = diff_pacf[2] 110 | 111 | df['format_date'] = format_dates 112 | df.set_index('format_date', inplace=True) 113 | 114 | df[['win', 'draw', 'lose']].plot() 115 | df[['win', 'lose']].plot() 116 | (df.win - df.lose).plot() 117 | df.pnl.plot() 118 | df[['pnl', 'hml']].plot() 119 | df.rv.plot() 120 | df.hml.plot() 121 | df.break1.hist() 122 | df.break2.hist() 123 | 124 | df.plot.scatter(x='win', y='lose') 125 | df.plot.scatter(x='win', y='pnl') 126 | df.plot.scatter(x='hml', y='lose') 127 | 128 | # ACF and PACF of mid and mid_diff 129 | df[['mid_acf_1', 'mid_acf_2', 'mid_pacf_1', 'mid_pacf_2']].plot() 130 | df[['diff_acf_1', 'diff_acf_2', 'diff_pacf_1', 'diff_pacf_2']].plot() 131 | 132 | 133 | # same statistics different sample freq 134 | # ------------------------------------- 135 | 136 | period = 60 137 | df = pd.DataFrame() 138 | df['date'] = dates 139 | df['win'] = np.repeat(np.nan, n_dates) 140 | df['draw'] = np.repeat(np.nan, n_dates) 141 | df['lose'] = np.repeat(np.nan, n_dates) 142 | df['pnl'] = np.repeat(np.nan, n_dates) 143 | df['rv'] = np.repeat(np.nan, n_dates) 144 | df['hml'] = np.repeat(np.nan, n_dates) 145 | df['break1'] = np.repeat(np.nan, n_dates) 146 | df['break2'] = np.repeat(np.nan, n_dates) 147 | df['mid_acf_1'] = np.repeat(np.nan, n_dates) 148 | df['mid_acf_2'] = np.repeat(np.nan, n_dates) 149 | df['mid_pacf_1'] = np.repeat(np.nan, n_dates) 150 | df['mid_pacf_2'] = np.repeat(np.nan, n_dates) 151 | df['diff_acf_1'] = np.repeat(np.nan, n_dates) 152 | df['diff_acf_2'] = np.repeat(np.nan, n_dates) 153 | df['diff_pacf_1'] = np.repeat(np.nan, n_dates) 154 | df['diff_pacf_2'] = np.repeat(np.nan, n_dates) 155 | 156 | for date in dates: 157 | print('Compute statistics on date ' + date) 158 | 159 | dailyPx = px[px.date == date] 160 | dailyPx = utils.get_period_px(dailyPx, period) 161 | seconds = 
dailyPx.second 162 | prices = dailyPx.mid 163 | mid_diff = (prices - prices.shift(1)).values / tick_size 164 | mid_diff = mid_diff[1:] 165 | win = np.sum(mid_diff > 0) / len(mid_diff) 166 | lose = np.sum(mid_diff < 0) / len(mid_diff) 167 | draw = np.sum(mid_diff == 0) / len(mid_diff) 168 | pnl = np.sum(mid_diff) 169 | hml = (prices.max() - prices.min()) / tick_size 170 | df.loc[df.date == date, 'win'] = win 171 | df.loc[df.date == date, 'draw'] = draw 172 | df.loc[df.date == date, 'lose'] = lose 173 | df.loc[df.date == date, 'pnl'] = pnl 174 | df.loc[df.date == date, 'rv'] = np.sqrt(np.nansum(mid_diff * mid_diff) / len(mid_diff)) 175 | df.loc[df.date == date, 'hml'] = hml 176 | 177 | break1 = prices[seconds >= 5400].values[0] - prices[seconds <= 4500].values[-1] 178 | break2 = prices[seconds >= 16200].values[0] - prices[seconds <= 9000].values[-1] 179 | df.loc[df.date == date, 'break1'] = break1 / tick_size 180 | df.loc[df.date == date, 'break2'] = break2 / tick_size 181 | 182 | mid_acf = sm.acf(prices.values, nlags=2) 183 | mid_pacf = sm.pacf(prices.values, nlags=2) 184 | diff_acf = sm.acf(mid_diff, nlags=2) 185 | diff_pacf = sm.pacf(mid_diff, nlags=2) 186 | df.loc[df.date == date, 'mid_acf_1'] = mid_acf[1] 187 | df.loc[df.date == date, 'mid_acf_2'] = mid_acf[2] 188 | df.loc[df.date == date, 'mid_pacf_1'] = mid_pacf[1] 189 | df.loc[df.date == date, 'mid_pacf_2'] = mid_pacf[2] 190 | df.loc[df.date == date, 'diff_acf_1'] = diff_acf[1] 191 | df.loc[df.date == date, 'diff_acf_2'] = diff_acf[2] 192 | df.loc[df.date == date, 'diff_pacf_1'] = diff_pacf[1] 193 | df.loc[df.date == date, 'diff_pacf_2'] = diff_pacf[2] 194 | 195 | df['format_date'] = format_dates 196 | df.set_index('format_date', inplace=True) 197 | 198 | df[['win', 'draw', 'lose']].plot() 199 | df[['win', 'lose']].plot() 200 | (df.win - df.lose).plot() 201 | df.pnl.plot() 202 | df[['pnl', 'hml']].plot() 203 | df.rv.plot() 204 | df.break1.hist() 205 | df.break2.hist() 206 | 207 | df.plot.scatter(x='win', y='lose') 208 | df.plot.scatter(x='win', y='pnl') 209 | df.plot.scatter(x='hml', y='lose') 210 | 211 | # ACF and PACF of mid and mid_diff 212 | df[['mid_acf_1', 'mid_acf_2', 'mid_pacf_1', 'mid_pacf_2']].plot() 213 | df[['diff_acf_1', 'diff_acf_2', 'diff_pacf_1', 'diff_pacf_2']].plot() 214 | 215 | 216 | # fit daily O-U process 217 | # --------------------- 218 | 219 | period = 60 220 | df = pd.DataFrame() 221 | df['date'] = dates 222 | df['b0'] = np.repeat(np.nan, n_dates) 223 | df['b1'] = np.repeat(np.nan, n_dates) 224 | df['mse'] = np.repeat(np.nan, n_dates) 225 | df['rsq'] = np.repeat(np.nan, n_dates) 226 | df['s0'] = np.repeat(np.nan, n_dates) 227 | df['s1'] = np.repeat(np.nan, n_dates) 228 | df['t0'] = np.repeat(np.nan, n_dates) 229 | df['t1'] = np.repeat(np.nan, n_dates) 230 | df['kappa'] = np.repeat(np.nan, n_dates) 231 | df['m'] = np.repeat(np.nan, n_dates) 232 | df['sigma'] = np.repeat(np.nan, n_dates) 233 | 234 | for date in dates: 235 | print('Fitting O-U process on date ' + date) 236 | 237 | dailyPx = px[px.date == date] 238 | dailyPx = utils.get_period_px(dailyPx, period) 239 | seconds = dailyPx.second 240 | prices = dailyPx.mid.values 241 | 242 | y = prices[1:] 243 | n = len(y) 244 | x = prices[:-1].reshape(n, 1) 245 | regr = linear_model.LinearRegression() 246 | regr.fit(x, y) 247 | 248 | b0 = regr.intercept_ 249 | b1 = regr.coef_.item() 250 | df.loc[df.date == date, 'b0'] = b0 251 | df.loc[df.date == date, 'b1'] = b1 252 | mse = np.sum((regr.predict(x) - y) ** 2) / (n-2) 253 | df.loc[df.date == date, 'mse'] = mse 254 | 
df.loc[df.date == date, 'rsq'] = regr.score(x, y)
255 |     ssq = np.sum((x - np.mean(x)) ** 2)
256 |     s1 = np.sqrt(mse / ssq)
257 |     s0 = np.sqrt(mse / ssq * np.mean(x ** 2))
258 |     df.loc[df.date == date, 's0'] = s0
259 |     df.loc[df.date == date, 's1'] = s1
260 |     df.loc[df.date == date, 't0'] = b0 / s0
261 |     df.loc[df.date == date, 't1'] = b1 / s1
262 | 
263 |     kappa = -np.log(b1) / period
264 |     df.loc[df.date == date, 'kappa'] = kappa
265 |     df.loc[df.date == date, 'm'] = b0 / (1 - b1)
266 |     df.loc[df.date == date, 'sigma'] = np.sqrt(mse * 2 * kappa / (1 - b1**2))
267 | 
268 | df['format_date'] = format_dates
269 | df.set_index('format_date', inplace=True)
270 | 
271 | df[['rsq']].plot()
272 | df[['mse']].plot()
273 | df[['b0']].plot()
274 | df[['b1']].plot()
275 | df[['t0', 't1']].plot()
276 | 
277 | df.kappa.plot()
278 | df.m.plot()
279 | df.sigma.plot()
280 | 
281 | # aggregate all the tick move and fit OU process
282 | # ----------------------------------------------
283 | 
284 | period = 60
285 | price_delta = []
286 | for date in dates:
287 |     print('Gathering prices on ' + date)
288 |     dailyPx = px[px.date == date]
289 |     dailyPx = utils.get_period_px(dailyPx, period)
290 |     prices = dailyPx.mid.values
291 |     delta = (prices[1:] - prices[:-1]) / tick_size
292 |     price_delta += list(delta)
293 | 
294 | price_delta = np.array(price_delta)
295 | prices = np.cumsum(price_delta)
296 | y = prices[1:]
297 | n = len(y)
298 | x = prices[:-1].reshape(n, 1)
299 | regr = linear_model.LinearRegression()
300 | regr.fit(x, y)
301 | b0 = regr.intercept_
302 | b1 = regr.coef_.item()
303 | mse = np.sum((regr.predict(x) - y) ** 2) / (n-2)
304 | rsq = regr.score(x, y)
305 | ssq = np.sum((x - np.mean(x)) ** 2)
306 | s1 = np.sqrt(mse/ssq)
307 | s0 = np.sqrt(mse / ssq * np.mean(x ** 2))
308 | t0 = b0 / s0
309 | t1 = b1 / s1
310 | 
311 | kappa = -np.log(b1) / period
312 | m = b0 / (1 - b1)
313 | sigma = np.sqrt(mse * 2 * kappa / (1 - b1 ** 2))
314 | 
--------------------------------------------------------------------------------
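For reference, the AR(1)-to-OU mapping used in both fits above: regressing p_{t+1} = b0 + b1 * p_t + eps at sampling interval Delta (= `period` seconds), and matching the exact discretization of the Ornstein-Uhlenbeck process dp_t = kappa * (m - p_t) dt + sigma dW_t, gives

    b1 = exp(-kappa * Delta)                        =>  kappa = -log(b1) / Delta
    b0 = m * (1 - b1)                               =>  m = b0 / (1 - b1)
    Var(eps) = sigma^2 * (1 - b1^2) / (2 * kappa)   =>  sigma = sqrt(mse * 2 * kappa / (1 - b1^2))

which is exactly how `kappa`, `m` and `sigma` are computed from the regression output, with `mse` standing in for Var(eps).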
/research/signal_cache.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import logging
4 | import pandas as pd
5 | 
6 | # import hft.data_loader as dl
7 | import hft.utils as utils
8 | import hft.signal_utils as signal
9 | 
10 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
11 | 
12 | data_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft', 'data')
13 | index_folder = os.path.join(data_path, 'index')
14 | 
15 | # load raw data
16 | # -------------
17 | 
18 | product = 'cu'  # switch between cu and zn
19 | with open(os.path.join(os.environ['HOME'], 'hft', 'ticksize.json')) as ticksize_file:
20 |     ticksize_json = json.load(ticksize_file)
21 | tick_size = ticksize_json[product]
22 | 
23 | # load raw data
24 | # dates = dl.get_dates()
25 | # pxall = dl.load_active_contract_multiple_dates(product, dates)
26 | # pxall.to_pickle(os.path.join(os.environ['HOME'], 'hft', product+'.pkl'))
27 | pxall = pd.read_pickle(os.path.join(data_path, product+'_20.pkl'))
28 | 
29 | # cache index
30 | # -----------
31 | 
32 | px = pxall.copy().reset_index()
33 | # second_list = [1, 2, 5, 10, 20]
34 | second_list = [30, 60, 120, 180, 300]
35 | 
36 | for sec in second_list:
37 |     print('----------- sec = ' + str(sec) + ' -------------')
38 |     backward_index = utils.get_index_multiple_dates(px, sec, 0)
39 |     file_name = os.path.join(index_folder, product+'_index_'+str(sec)+'_0.pkl')
40 |     print('Saving backward index to ' + file_name)
41 |     backward_index.to_pickle(file_name)
42 | 
43 |     forward_index = utils.get_index_multiple_dates(px, 0, sec)
44 |     file_name = os.path.join(index_folder, product+'_index_0_'+str(sec)+'.pkl')
45 |     print('Saving forward index to ' + file_name)
46 |     forward_index.to_pickle(file_name)
47 | 
48 | # compute signal
49 | # --------------
50 | 
51 | # px = pxall[pxall.date.isin(['2013-10-08', '2013-10-09'])].copy().reset_index()
52 | px = pxall.copy().reset_index()
53 | # second_list = [1, 2, 5, 10, 20]
54 | second_list = [30, 60, 120, 180, 300]
55 | 
56 | for sec in second_list:
57 |     print('----------- sec = ' + str(sec) + ' -------------')
58 | 
59 |     # backward metrics
60 |     filename = os.path.join(index_folder, product+'_index_'+str(sec)+'_0.pkl')
61 |     backward_index = pd.read_pickle(filename)
62 |     px = signal.signal_on_multiple_dates(px, lambda x: signal.order_imbalance_ratio(x, sec, 0, backward_index))
63 |     px = signal.signal_on_multiple_dates(px, lambda x: signal.order_flow_imbalance(x, sec, 0, backward_index, False))
64 |     px = signal.signal_on_multiple_dates(px, lambda x: signal.order_flow_imbalance(x, sec, 0, backward_index, True))
65 |     px = signal.signal_on_multiple_dates(px, lambda x: signal.period_mid_move(x, sec, 0, tick_size, backward_index))
66 | 
67 |     # forward metrics
68 |     filename = os.path.join(index_folder, product + '_index_0_' + str(sec) + '.pkl')
69 |     forward_index = pd.read_pickle(filename)
70 |     px = signal.signal_on_multiple_dates(px, lambda x: signal.period_mid_move(x, 0, sec, tick_size, forward_index))
71 | 
72 |     # save to file
73 |     filename = os.path.join(data_path, product+'_'+str(sec)+'.pkl')
74 |     print('Saving enriched data to ' + filename)  # this file holds the signal-enriched px, not an index
75 |     px.to_pickle(filename)
--------------------------------------------------------------------------------
/research/signal_research.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import logging
4 | import numpy as np
5 | import pandas as pd
6 | 
7 | 
8 | import hft.utils as utils
9 | import hft.signal_utils as signal
10 | 
11 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
12 | 
13 | hft_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft')
14 | data_path = os.path.join(hft_path, 'data')
15 | research_path = os.path.join(hft_path, 'research')
16 | 
17 | # load enriched data
18 | # ------------------
19 | 
20 | product = 'zn'  # switch between cu and zn
21 | with open(os.path.join(data_path, 'ticksize.json')) as ticksize_file:
22 |     ticksize_json = json.load(ticksize_file)
23 | tick_size = ticksize_json[product]
24 | 
25 | px = pd.read_pickle(os.path.join(data_path, product+'_enriched.pkl'))
26 | 
27 | px20131031 = px[px.date == '2013-10-31']
28 | # px = px[np.isnan(px.tick_move_1_0) | (np.abs(px.tick_move_1_0) <= 5)]
29 | px = px[px.date != '2013-10-31']
30 | 
31 | # signal and return distribution
32 | # ------------------------------
33 | 
34 | px[['order_flow_imbalance_1_0', 'order_flow_imbalance_2_0', 'order_flow_imbalance_5_0', 'order_flow_imbalance_10_0',
35 |     'order_flow_imbalance_20_0', 'order_flow_imbalance_30_0', 'order_flow_imbalance_60_0',
36 |     'order_flow_imbalance_120_0', 'order_flow_imbalance_180_0', 'order_flow_imbalance_300_0']].describe()
37 | 
38 | px[['order_imbalance_ratio_1_0', 'order_imbalance_ratio_2_0', 'order_imbalance_ratio_5_0', 'order_imbalance_ratio_10_0',
39 |     'order_imbalance_ratio_20_0', 'order_imbalance_ratio_30_0', 'order_imbalance_ratio_60_0',
40 | 
--------------------------------------------------------------------------------
/research/signal_research.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import logging
4 | import numpy as np
5 | import pandas as pd
6 | 
7 | 
8 | import hft.utils as utils
9 | import hft.signal_utils as signal
10 | 
11 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
12 | 
13 | hft_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft')
14 | data_path = os.path.join(hft_path, 'data')
15 | research_path = os.path.join(hft_path, 'research')
16 | 
17 | # load enriched data
18 | # ------------------
19 | 
20 | product = 'zn'  # switch between cu and zn
21 | with open(os.path.join(data_path, 'ticksize.json')) as ticksize_file:
22 |     ticksize_json = json.load(ticksize_file)
23 | tick_size = ticksize_json[product]
24 | 
25 | px = pd.read_pickle(os.path.join(data_path, product+'_enriched.pkl'))
26 | 
27 | px20131031 = px[px.date == '2013-10-31']
28 | # px = px[np.isnan(px.tick_move_1_0) | (np.abs(px.tick_move_1_0) <= 5)]
29 | px = px[px.date != '2013-10-31']
30 | 
31 | # signal and return distribution
32 | # ------------------------------
33 | 
34 | px[['order_flow_imbalance_1_0', 'order_flow_imbalance_2_0', 'order_flow_imbalance_5_0', 'order_flow_imbalance_10_0',
35 |     'order_flow_imbalance_20_0', 'order_flow_imbalance_30_0', 'order_flow_imbalance_60_0',
36 |     'order_flow_imbalance_120_0', 'order_flow_imbalance_180_0', 'order_flow_imbalance_300_0']].describe()
37 | 
38 | px[['order_imbalance_ratio_1_0', 'order_imbalance_ratio_2_0', 'order_imbalance_ratio_5_0', 'order_imbalance_ratio_10_0',
39 |     'order_imbalance_ratio_20_0', 'order_imbalance_ratio_30_0', 'order_imbalance_ratio_60_0',
40 |     'order_imbalance_ratio_120_0', 'order_imbalance_ratio_180_0', 'order_imbalance_ratio_300_0']].describe()
41 | 
42 | px[['tick_move_1_0', 'tick_move_2_0', 'tick_move_5_0', 'tick_move_10_0', 'tick_move_20_0', 'tick_move_30_0',
43 |     'tick_move_60_0', 'tick_move_120_0', 'tick_move_180_0', 'tick_move_300_0']].describe()
44 | 
45 | px[['tick_move_5_0', 'tick_move_0_10', 'tick_move_0_20']].describe()
46 | 
47 | signal.plot_two_hist(px, 'order_flow_imbalance', 60, 300)
48 | signal.plot_two_hist(px, 'order_imbalance_ratio', 60, 300)
49 | signal.plot_two_hist(px, 'tick_move', 60, 300)
50 | 
51 | px.groupby(np.abs(px.tick_move_0_10)).size()
52 | 
53 | print(sum(px.tick_move_0_10 == 0) / sum(~np.isnan(px.tick_move_0_10)))  # % no move
54 | print(sum(np.abs(px.tick_move_0_10) >= 1) / sum(~np.isnan(px.tick_move_0_10)))  # % at least 1-tick move
55 | print(sum(np.abs(px.tick_move_0_10) >= 2) / sum(~np.isnan(px.tick_move_0_10)))  # % at least 2-tick move
56 | 
57 | print(sum(px.tick_move_0_20 == 0) / sum(~np.isnan(px.tick_move_0_20)))  # % no move
58 | print(sum(np.abs(px.tick_move_0_20) >= 1) / sum(~np.isnan(px.tick_move_0_20)))  # % at least 1-tick move
59 | print(sum(np.abs(px.tick_move_0_20) >= 2) / sum(~np.isnan(px.tick_move_0_20)))  # % at least 2-tick move
60 | 
61 | # scatter plot
62 | # ------------
63 | 
64 | # forward return by signal
65 | signal.plot_two_scatter(px, 'order_imbalance_ratio', 'tick_move', 1, 0, 0, 1, 5, 0, 0, 5)
66 | signal.plot_two_scatter(px, 'order_flow_imbalance', 'tick_move', 60, 0, 0, 60, 300, 0, 0, 300)
67 | signal.plot_two_scatter(px, 'tick_move', 'tick_move', 5, 0, 0, 5, 60, 0, 0, 60)
68 | 
69 | # signal by signal
70 | signal.plot_two_scatter(px, 'order_imbalance_ratio', 'tick_move', 1, 0, 1, 0, 5, 0, 5, 0)
71 | signal.plot_two_scatter(px, 'order_flow_imbalance', 'tick_move', 60, 0, 60, 0, 300, 0, 300, 0)
72 | signal.plot_two_scatter(px, 'order_flow_imbalance', 'order_imbalance_ratio', 60, 0, 60, 0, 300, 0, 300, 0)
73 | 
74 | # correlations
75 | # ------------
76 | 
77 | 
78 | second_list = [1, 2, 5, 10, 20, 30, 60, 120, 180, 300]
79 | for sec in second_list:
80 |     px = px[(np.abs(px[utils.get_moving_column_name('tick_move', 0, sec)]) <= 10) | np.isnan(px.tick_move_1_0)]
81 |     px = px[(np.abs(px[utils.get_moving_column_name('tick_move', sec, 0)]) <= 10) | np.isnan(px.tick_move_1_0)]
82 | 
83 | oir_corr = signal.xy_corr(px, second_list, 'order_imbalance_ratio')
84 | ofi_corr = signal.xy_corr(px, second_list, 'order_flow_imbalance')
85 | autocorr = signal.xy_corr(px, second_list, 'tick_move')
86 | oir_corr.to_csv(os.path.join(research_path, 'oir_corr.csv'))
87 | ofi_corr.to_csv(os.path.join(research_path, 'ofi_corr.csv'))
88 | autocorr.to_csv(os.path.join(research_path, 'autocorr.csv'))
89 | 
90 | oir_ofi = signal.xx_corr(px, second_list, 'order_imbalance_ratio', 'order_flow_imbalance')
91 | oir_return = signal.xx_corr(px, second_list, 'order_imbalance_ratio', 'tick_move')
92 | ofi_return = signal.xx_corr(px, second_list, 'order_flow_imbalance', 'tick_move')
93 | oir_ofi.to_csv(os.path.join(research_path, 'oir_ofi_corr.csv'))
94 | oir_return.to_csv(os.path.join(research_path, 'oir_return_corr.csv'))
95 | ofi_return.to_csv(os.path.join(research_path, 'ofi_return_corr.csv'))
96 | 
97 | # multivariate regression
98 | # -----------------------
99 | 
100 | freq_oir = 1
101 | freq_ofi = 5
102 | freq_xreturn = 2
103 | freq_yreturn = 10
104 | 
105 | res = signal.reg(px, freq_oir, freq_ofi, freq_xreturn, freq_yreturn, True)
106 | 
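The final call fits the 10-second forward tick move on a 1-second OIR, a 5-second OFI and a 2-second trailing return. A rough standalone equivalent built directly on the enriched columns, assuming the naming convention sketched earlier (signal.reg itself lives in hft/signal_utils.py and may differ, e.g. in winsorization or intercept handling):

from sklearn import linear_model

def multivariate_alpha_fit(px, freq_oir=1, freq_ofi=5, freq_xreturn=2, freq_yreturn=10):
    """Regress the forward tick move on OIR, OFI and the trailing return."""
    x_cols = ['order_imbalance_ratio_%d_0' % freq_oir,
              'order_flow_imbalance_%d_0' % freq_ofi,
              'tick_move_%d_0' % freq_xreturn]
    y_col = 'tick_move_0_%d' % freq_yreturn
    data = px[x_cols + [y_col]].dropna()  # drop rows where any window is undefined
    regr = linear_model.LinearRegression()
    regr.fit(data[x_cols].values, data[y_col].values)
    return regr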
--------------------------------------------------------------------------------
/tests/scratch_pad.py:
--------------------------------------------------------------------------------
1 | 
2 | import os
3 | import json
4 | import logging
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | 
8 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
9 | 
10 | hft_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft')
11 | data_path = os.path.join(hft_path, 'data')
12 | research_path = os.path.join(hft_path, 'research')
13 | 
14 | # load enriched data
15 | # ------------------
16 | 
17 | product = 'cu'  # switch between cu and zn
18 | with open(os.path.join(data_path, 'ticksize.json')) as ticksize_file:
19 |     ticksize_json = json.load(ticksize_file)
20 | 
21 | px = pd.read_pickle(os.path.join(data_path, product+'_enriched.pkl'))
22 | px20131031 = px[px.date == '2013-10-31']
23 | px = px[px.date != '2013-10-31']
24 | 
25 | daily_move = px.groupby('date')[['mid']].transform(pd.Series.diff)
26 | daily_move['tick_move'] = daily_move['mid'] / ticksize_json[product]
27 | daily_move['date'] = px['date']
28 | daily_move[daily_move.date == '2013-10-08'].tick_move.plot()
--------------------------------------------------------------------------------
/tests/test_backtester.py:
--------------------------------------------------------------------------------
1 | """
2 | Back test
3 | """
4 | 
5 | import os
6 | import json
7 | import logging
8 | import numpy as np
9 | import pandas as pd
10 | 
11 | import hft.backtester as bt
12 | 
13 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
14 | 
15 | hft_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft')
16 | data_path = os.path.join(hft_path, 'data')
17 | research_path = os.path.join(hft_path, 'research')
18 | 
19 | # load enriched data
20 | # ------------------
21 | 
22 | product = 'zn'  # switch between cu and zn
23 | with open(os.path.join(data_path, 'ticksize.json')) as ticksize_file:
24 |     ticksize_json = json.load(ticksize_file)
25 | 
26 | px = pd.read_pickle(os.path.join(data_path, product+'_enriched.pkl'))
27 | px20131031 = px[px.date == '2013-10-31']
28 | px = px[px.date != '2013-10-31']
29 | 
30 | # configuration
31 | # -------------
32 | 
33 | config = dict()
34 | 
35 | # general configuration
36 | config['name'] = product + '_1'
37 | config['data_path'] = data_path
38 | config['start_date'] = '2013-10-05'
39 | 
40 | # model specifics
41 | config['training_period'] = 21  # days
42 | config['feature_column'] = ['order_imbalance_ratio', 'order_flow_imbalance', 'tick_move']
43 | config['feature_freq'] = [1, 2, 5, 10, 20, 30, 60, 120, 180, 300]
44 | config['feature_winsorize_prob'] = {'order_imbalance_ratio': [0.0, 0.0],
45 |                                     'order_flow_imbalance': [0.005, 0.005],
46 |                                     'tick_move': [0, 0]}
47 | config['feature_winsorize_bound'] = {'order_imbalance_ratio': [-np.inf, np.inf],
48 |                                      'order_flow_imbalance': [-np.inf, np.inf],
49 |                                      'tick_move': [-10, 10]}
50 | config['response_column'] = 'tick_move'
51 | config['response_winsorize_prob'] = [0, 0]
52 | config['response_winsorize_bound'] = [-5, 5]
53 | 
54 | # open/close/hold condition
55 | config['holding_period'] = 10  # seconds
56 | config['trade_trigger_threshold'] = [-0.4, 0.4]
57 | config['start_second'] = 180
58 | config['end_second'] = 21420
59 | 
60 | # pnl
61 | config['use_mid'] = False  # if False, use touch price
62 | config['transaction_fee'] = 0.0001  # 1 bp transaction fee
63 | 
64 | # backtesting
65 | # -----------
66 | 
67 | backtest = bt.backtest(px, config)
68 | backtest = bt.trade(backtest, config)
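# bt.pnl and bt.save are defined in hft/backtester.py and are not shown in this
# excerpt. As a hedged illustration only: with config['use_mid'] = False the pnl
# is presumably marked against the touch (buy at s1, sell at b1) rather than the
# mid, and config['transaction_fee'] = 0.0001 charges 1 bp of notional per leg.
# A hypothetical round-trip pnl for one unit, not the repo's actual API:
def illustrative_unit_pnl(open_px, close_px, side, fee_rate=0.0001):
    """side is +1 for a long opened at open_px, -1 for a short."""
    gross = side * (close_px - open_px)
    fee = fee_rate * (open_px + close_px)  # 1 bp of notional on each leg
    return gross - fee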
69 | backtest = bt.pnl(backtest, config)
70 | bt.save(backtest, config)
--------------------------------------------------------------------------------
/tests/test_signal_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | import hft.data_loader as dl
4 | import hft.signal_utils as signal
5 | 
6 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
7 | 
8 | product = 'cu'  # switch between cu and zn
9 | tick_size = 10 if product == 'cu' else 5
10 | seconds = 10
11 | return_col = 'return' + str(seconds)
12 | return_cutoff = 6
13 | 
14 | # test on a single date
15 | px = dl.load_active_contract(product, '20131231')
16 | px = signal.order_flow_imbalance(px, 1, 0)
17 | px = signal.order_imbalance_ratio(px, 1, 0)
18 | px = signal.period_mid_move(px, 5, 0, tick_size)
19 | px = signal.period_mid_move(px, 0, 10, tick_size)
--------------------------------------------------------------------------------
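For reference when reading the test above: order_imbalance_ratio and order_flow_imbalance are implemented in hft/signal_utils.py, which is not part of this listing. Under the common order-book definitions, and using the b1/b1_size/s1/s1_size columns declared in config/data_loader.json, the instantaneous quantities would look roughly as follows; treat this as a sketch of intent, since the repo's versions additionally aggregate over the backward/forward windows passed in above.

import pandas as pd

def instantaneous_oir(px):
    """Order imbalance ratio at the touch, bounded in [-1, 1]."""
    return (px['b1_size'] - px['s1_size']) / (px['b1_size'] + px['s1_size'])

def instantaneous_ofi(px):
    """Order flow imbalance in the spirit of Cont, Kukanov and Stoikov (2014).

    Bid-side depth counts positively when the bid price does not fall;
    ask-side depth enters symmetrically with the opposite sign.
    """
    bid_px, bid_sz = px['b1'], px['b1_size']
    ask_px, ask_sz = px['s1'], px['s1_size']
    return (bid_sz.where(bid_px >= bid_px.shift(), 0.0)
            - bid_sz.shift().where(bid_px <= bid_px.shift(), 0.0)
            - ask_sz.where(ask_px <= ask_px.shift(), 0.0)
            + ask_sz.shift().where(ask_px >= ask_px.shift(), 0.0))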