├── README.md
├── config
│   ├── data_loader.json
│   └── signal.json
├── hft
│   ├── backtester.py
│   ├── data_loader.py
│   ├── signal_utils.py
│   └── utils.py
├── research
│   ├── backtest.py
│   ├── eda.py
│   ├── hmm.py
│   ├── price_dynamics_eda.py
│   ├── signal_cache.py
│   └── signal_research.py
└── tests
    ├── scratch_pad.py
    ├── test_backtester.py
    └── test_signal_utils.py

/README.md:
--------------------------------------------------------------------------------
1 | # hft
2 | High Frequency Trading Strategies
--------------------------------------------------------------------------------
/config/data_loader.json:
--------------------------------------------------------------------------------
1 | {
2 |   "columns": ["date", "time", "price", "qty", "volume", "open_interest",
3 |               "b1", "b1_size", "b2", "b2_size", "b3", "b3_size",
4 |               "s1", "s1_size", "s2", "s2_size", "s3", "s3_size", "side"],
5 |   "columns_to_drop": ["b2", "b2_size", "b3", "b3_size", "s2", "s2_size", "s3", "s3_size"],
6 |   "encoding": "gb18030"
7 | }
--------------------------------------------------------------------------------
/config/signal.json:
--------------------------------------------------------------------------------
1 | {
2 |   "n_flexible_half_seconds": 2
3 | }
--------------------------------------------------------------------------------
/hft/backtester.py:
--------------------------------------------------------------------------------
1 | """
2 | Backtest Strategy
3 | """
4 | 
5 | import os
6 | import logging
7 | import pickle
8 | import numpy as np
9 | import pandas as pd
10 | from sklearn import linear_model
11 | 
12 | import hft.utils as utils
13 | import hft.signal_utils as signal
14 | 
15 | logger = logging.getLogger(__name__)
16 | 
17 | 
18 | def select_feature(train, config):
19 |     """Select features to fit model
20 | 
21 |     :param train: pandas data frame
22 |     :param config: dictionary, config parameters
23 |     :return: list of strings, column names
24 |     """
25 |     y_column = utils.get_moving_column_name(config['response_column'], 0, config['holding_period'])
26 |     selected_features = []
27 |     for feature in config['feature_column']:
28 |         logger.debug('Computing correlation of %s and %s', feature, config['response_column'])
29 |         winsorize_option = {'x_prob': config['feature_winsorize_prob'][feature],
30 |                             'x_bound': config['feature_winsorize_bound'][feature],
31 |                             'y_prob': config['response_winsorize_prob'],
32 |                             'y_bound': config['response_winsorize_bound']
33 |                             }
34 |         corr_mat = signal.xy_corr(train, config['feature_freq'], feature, config['response_column'], winsorize_option)
35 |         correlation = corr_mat.loc[y_column]
36 |         selected_features.append(correlation.idxmax())  # label (column name) of the highest correlation
37 |     return selected_features
38 | 
39 | 
40 | def fit(train, features, config):
41 |     """Fit linear model using features
42 | 
43 |     :param train: pandas data frame, must contain columns in features
44 |     :param features: list of column names
45 |     :param config: dictionary, config parameters
46 |     :return: tuple of (fitted sklearn model, dict of fit statistics)
47 |     """
48 |     y_column = utils.get_moving_column_name(config['response_column'], 0, config['holding_period'])
49 |     regr_data = train[features+[y_column]].dropna()
50 | 
51 |     # data processing
52 |     for feature in features:
53 |         raw_feature = utils.get_raw_column_name(feature)
54 |         regr_data[feature] = utils.winsorize(regr_data[feature], config['feature_winsorize_prob'][raw_feature],
55 |                                              config['feature_winsorize_bound'][raw_feature])
56 |     regr_data[y_column] = utils.winsorize(regr_data[y_column], config['response_winsorize_prob'],
57 | 
config['response_winsorize_bound']) 58 | x = regr_data[features].values 59 | y = regr_data[y_column].values 60 | regr = linear_model.LinearRegression(fit_intercept=False) 61 | regr.fit(x, y) 62 | n = len(y) 63 | p = len(features) + regr.fit_intercept 64 | mse = np.sum((regr.predict(x) - y) ** 2) / (n-p) 65 | se = np.sqrt(np.diagonal(mse * np.linalg.inv(np.dot(x.T, x)))) 66 | stats = {'rsq': regr.score(x, y), 67 | 'beta': regr.coef_, 68 | 'tstat': regr.coef_ / se, 69 | 'mse': mse, 70 | 'df_1': p-1, 71 | 'df_2': n-p} 72 | return regr, stats 73 | 74 | 75 | def backtest(px, config): 76 | logger.info('Start backtesting') 77 | dates = list(set(px.date)) 78 | dates.sort() 79 | y_name = utils.get_moving_column_name(config['response_column'], 0, config['holding_period']) 80 | btdf = pd.DataFrame() 81 | columns = ['dt', 'date', 'time', 'price', 'qty', 'volume', 'open_interest', 82 | 'b1', 'b1_size', 's1', 's1_size', 'mid', 'second'] 83 | fitting_stats = pd.DataFrame(columns=['date', 'rsq', 'beta', 'tstat', 'mse', 'pred_rsq', 'pred_mse']) 84 | for i in range(config['training_period'], len(dates)): 85 | date = dates[i] 86 | logger.info('Backtesting on %s', date) 87 | logger.debug('Selecting feature') 88 | train = px[(px.date >= dates[i-config['training_period']]) & (px.date < date)].copy() 89 | features = select_feature(train, config) 90 | logger.debug('Fitting model') 91 | model, stats = fit(train, features, config) 92 | stats['date'] = date 93 | logger.debug('Predicting future return') 94 | px_i = px.loc[px.date == date, columns + features + [y_name]].copy() 95 | x_new = px_i[features] 96 | x_new = x_new.fillna(x_new.median()) 97 | y_new = px_i[y_name].values 98 | alpha = model.predict(X=x_new) 99 | px_i['alpha'] = alpha 100 | pred_rsq = pd.DataFrame({'alpha': alpha, 'y_new': y_new}).corr().iloc[0, 1] 101 | pred_resid = alpha - y_new 102 | pred_mse = np.nanmean(pred_resid ** 2) 103 | stats['pred_rsq'] = pred_rsq 104 | stats['pred_mse'] = pred_mse 105 | fitting_stats = fitting_stats.append(stats, ignore_index=True) 106 | btdf = btdf.append(px_i) 107 | logger.info('Finish backtesting') 108 | return btdf, fitting_stats 109 | 110 | 111 | def trade(btdf, config): 112 | logger.info('Making trading decision') 113 | btdf['trade'] = 0 114 | btdf.loc[btdf.alpha > config['trade_trigger_threshold'][1], 'trade'] = 1 115 | btdf.loc[btdf.alpha < config['trade_trigger_threshold'][0], 'trade'] = -1 116 | btdf.loc[btdf.second > config['end_second'], 'trade'] = 0 117 | btdf.loc[btdf.second < config['start_second'], 'trade'] = 0 118 | return btdf 119 | 120 | 121 | def get_fixed_period_close_second(btdf, config): 122 | btdf['close_second'] = btdf.second + config['holding_period'] 123 | dates = list(set(btdf.date)) 124 | dates.sort() 125 | matched_close_second = [] 126 | for date in dates: 127 | bti = btdf[btdf.date == date] 128 | close_index = np.searchsorted(bti.second, bti.close_second) 129 | close_index[close_index == len(close_index)] = len(close_index) - 1 130 | matched_close_second_i = bti.second.values[close_index].tolist() 131 | matched_close_second.extend(matched_close_second_i) 132 | return matched_close_second 133 | 134 | 135 | def dynamic_hold(bti, config, i): 136 | tick_change = (bti.mid - bti.mid[i]) / config['tick_size'] 137 | cond = ((tick_change >= config['unwinding_tick_move_upper_bound']) | 138 | (tick_change <= config['unwinding_tick_move_lower_bound'])) & (tick_change.index > i) 139 | idx = cond.index[cond] 140 | idx = idx[0] if len(idx) > 0 else bti.index[-1] 141 | return idx 142 | 143 | 144 | def 
get_dynamic_period_close_second(btdf, config): 145 | dates = list(set(btdf.date)) 146 | dates.sort() 147 | matched_close_second = [] 148 | for date in dates: 149 | logger.debug('Getting dynamic holding end time on %s', date) 150 | bti = btdf[btdf.date == date] 151 | close_index = [np.nan if bti.trade[i] == 0 else dynamic_hold(bti, config, i) for i in bti.index] 152 | matched_close_second_i = bti.second[close_index].tolist() 153 | matched_close_second.extend(matched_close_second_i) 154 | return matched_close_second 155 | 156 | 157 | def pnl(btdf, config): 158 | logger.info('Computing PnL...') 159 | if config['use_mid']: 160 | btdf['open_price'] = btdf.mid 161 | else: 162 | btdf['open_price'] = (btdf.trade > 0) * btdf.s1 + (btdf.trade < 0) * btdf.b1 163 | if config['dynamic_unwinding']: 164 | btdf['matched_close_second'] = get_dynamic_period_close_second(btdf, config) 165 | else: 166 | btdf['matched_close_second'] = get_fixed_period_close_second(btdf, config) 167 | dummy_bt = btdf[['date', 'second', 'b1', 's1', 'mid']].copy() 168 | dummy_bt.columns = ['date', 'matched_close_second', 'close_b1', 'close_s1', 'close_mid'] 169 | btdf = utils.left_join(btdf, dummy_bt, ['date', 'matched_close_second']) 170 | if config['use_mid']: 171 | btdf['close_price'] = btdf.close_mid 172 | else: 173 | btdf['close_price'] = (btdf.trade > 0) * btdf.close_b1 + (btdf.trade < 0) * btdf.close_s1 174 | btdf['pnl'] = btdf.trade * (btdf.close_price - btdf.open_price) 175 | btdf['transaction_fee'] = config['transaction_fee'] * np.abs(btdf.trade) * (btdf.open_price + btdf.close_price) 176 | btdf['net_pnl'] = btdf['pnl'] - btdf['transaction_fee'] 177 | logger.info('Finished PnL calculation') 178 | return btdf 179 | 180 | 181 | def save(btdf, config): 182 | file_path = os.path.join(config['data_path'], 'backtest', config['name']) 183 | if not os.path.exists(file_path): 184 | os.makedirs(file_path) 185 | bt_file = os.path.join(file_path, 'backtest.pkl') 186 | logger.info('Saving backtesting result to %s', bt_file) 187 | btdf.to_pickle(bt_file) 188 | config_file = os.path.join(file_path, 'config.pkl') 189 | logger.info('Saving config file to %s', config_file) 190 | with open(config_file, 'wb') as cf: 191 | pickle.dump(config, cf) 192 | return 193 | 194 | 195 | def daily_summary(btdf): 196 | trades = btdf[btdf.trade != 0] 197 | f = {'pnl': 'sum', 'transaction_fee': 'sum', 'net_pnl': 'sum'} 198 | daily = trades.groupby('date').agg(f) 199 | daily['n_trades'] = trades.groupby('date').size() 200 | return daily 201 | 202 | 203 | def summary(btdf, config): 204 | trades = btdf[btdf.trade != 0] 205 | res = dict() 206 | res['training_period'] = config['training_period'] 207 | res['trade_trigger_threshold'] = config['trade_trigger_threshold'][1] 208 | res['holding_period'] = config['holding_period'] 209 | res['use_mid'] = config['use_mid'] 210 | res['unwinding_tick_move_upper_bound'] = config['unwinding_tick_move_upper_bound'] 211 | res['unwinding_tick_move_lower_bound'] = config['unwinding_tick_move_lower_bound'] 212 | 213 | res['n_trades'] = trades.shape[0] 214 | res['n_trading_days'] = len(set(trades.date)) 215 | res['n_trades_per_day'] = utils.safe_divide(res['n_trades'], res['n_trading_days']) 216 | 217 | res['winning_rate'] = sum(trades.pnl > 0) / trades.shape[0] 218 | res['losing_rate'] = sum(trades.pnl < 0) / trades.shape[0] 219 | res['net_winning_rate'] = sum(trades.net_pnl > 0) / trades.shape[0] 220 | res['net_losing_rate'] = sum(trades.net_pnl < 0) / trades.shape[0] 221 | 222 | res['total_pnl'] = trades.pnl.sum() 223 | 
res['total_net_pnl'] = trades.net_pnl.sum()
224 | 
225 |     res['avg_pnl_per_trade'] = trades.pnl.mean()
226 |     res['avg_net_pnl_per_trade'] = trades.net_pnl.mean()
227 |     res['med_pnl_per_trade'] = trades.pnl.median()
228 |     res['med_net_pnl_per_trade'] = trades.net_pnl.median()
229 | 
230 |     res['avg_pnl_per_winning_trade'] = trades[trades.pnl > 0].pnl.mean()
231 |     res['avg_pnl_per_losing_trade'] = trades[trades.pnl < 0].pnl.mean()
232 |     res['avg_net_pnl_per_winning_trade'] = trades[trades.net_pnl > 0].net_pnl.mean()
233 |     res['avg_net_pnl_per_losing_trade'] = trades[trades.net_pnl < 0].net_pnl.mean()
234 | 
235 |     res['avg_net_pnl_per_day'] = utils.safe_divide(res['total_net_pnl'], res['n_trading_days'])
236 |     res['avg_pnl_per_day'] = utils.safe_divide(res['total_pnl'], res['n_trading_days'])
237 | 
238 |     res['std_pnl_per_trade'] = trades.pnl.std()
239 |     res['std_net_pnl_per_trade'] = trades.net_pnl.std()
240 | 
241 |     res['corr_alpha_pnl'] = np.corrcoef(trades.alpha, trades.pnl)[0, 1]
242 |     res['corr_alpha_net_pnl'] = np.corrcoef(trades.alpha, trades.net_pnl)[0, 1]  # correlation of alpha with net pnl
243 | 
244 |     return pd.Series(res, name='value')
245 | 
--------------------------------------------------------------------------------
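The standard errors in fit() above are computed by hand as sqrt(diag(mse * (X'X)^-1)). A quick self-contained check of that formula against statsmodels on synthetic data (every name below is illustrative, not part of the repo):

import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(0)
x = rng.randn(500, 2)
y = x @ np.array([0.5, -0.2]) + rng.randn(500)
est = sm.OLS(y, x).fit()                                  # no constant, like fit_intercept=False
mse = np.sum((est.fittedvalues - y) ** 2) / (500 - 2)     # n - p, with p = 2 features
se = np.sqrt(np.diagonal(mse * np.linalg.inv(x.T @ x)))
assert np.allclose(se, est.bse)                           # matches statsmodels' standard errors
assert np.allclose(est.params / se, est.tvalues)          # same t-stats as stats['tstat']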
/hft/data_loader.py:
--------------------------------------------------------------------------------
1 | """
2 | Data Loading Functions
3 | """
4 | 
5 | import os
6 | import logging
7 | import json
8 | import pandas as pd
9 | from datetime import datetime
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | DATA_PATH = os.path.join(os.environ['HOME'], 'hft', 'data', 'SpRawFutureTick')
14 | with open(os.path.join('config', 'data_loader.json')) as data_config_file:
15 |     data_config = json.load(data_config_file)
16 | COLUMNS = data_config['columns']
17 | COLUMNS_TO_DROP = data_config['columns_to_drop']
18 | ENCODING = data_config['encoding']
19 | 
20 | 
21 | def get_dates():
22 |     return os.listdir(DATA_PATH)
23 | 
24 | 
25 | def get_filenames(product, yyyymmdd):
26 |     contract_month = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
27 |     filenames = [os.path.join(DATA_PATH, yyyymmdd, product + x + '_' + yyyymmdd + '.csv') for x in contract_month]
28 |     filenames = [x for x in filenames if os.path.isfile(x)]
29 |     return filenames
30 | 
31 | 
32 | def process_raw_table(px):
33 |     px.columns = COLUMNS
34 |     px['spread'] = px['s1'] - px['b1']
35 |     px['mid'] = 0.5 * (px['b1']+px['s1'])
36 |     px['return'] = (px['mid'] - px['mid'].shift(1)) / px['mid'].shift(1)
37 |     px['dt'] = px['date'] + ' ' + px['time']
38 |     px['dt'] = [datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in px['dt']]
39 |     px.set_index('dt', inplace=True)
40 |     px['second'] = (px.index.hour-9)*3600 + px.index.minute*60 + px.index.second
41 |     half_second_index = px.second == px.second.shift(-1)
42 |     px.loc[half_second_index, 'second'] = px.loc[half_second_index, 'second'] - 0.5
43 |     px.drop(COLUMNS_TO_DROP, axis=1, inplace=True)
44 |     return px
45 | 
46 | 
47 | def load_contract(product, yyyymmdd, contract_month):
48 |     logger.debug('Loading %s-%s data on %s', product, contract_month, yyyymmdd)
49 |     filename = os.path.join(DATA_PATH, yyyymmdd, product + contract_month + '_' + yyyymmdd + '.csv')
50 |     px = pd.read_csv(filename, encoding=ENCODING)
51 |     px = process_raw_table(px)
52 |     return px
53 | 
54 | 
55 | def load_active_contract(product, yyyymmdd):
56 |     logger.debug('Loading %s active contract data on %s', product, yyyymmdd)
57 |     filenames = get_filenames(product, yyyymmdd)
58 |     if len(filenames) == 0:
59 |         logger.warning('Cannot find files of %s on %s', product, yyyymmdd)
60 |         return pd.DataFrame()
61 |     px_list = [pd.read_csv(x, encoding=ENCODING) for x in filenames]
62 |     total_qty = [x.iloc[-1]['总量'] for x in px_list]  # '总量' is the cumulative volume column in the raw file
63 |     px = px_list[total_qty.index(max(total_qty))]  # select the contract with max qty
64 |     px = process_raw_table(px)
65 |     return px
66 | 
67 | 
68 | def load_active_contract_multiple_dates(product, dates):
69 |     logger.info('Loading %s active contract data from %s to %s', product, dates[0], dates[-1])
70 |     px_list = [load_active_contract(product, x) for x in dates]
71 |     px = pd.concat(px_list)
72 |     return px
73 | 
--------------------------------------------------------------------------------
/hft/signal_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility Functions of Constructing Signals for Research Purpose
3 | """
4 | 
5 | import os
6 | import json
7 | import logging
8 | import numpy as np
9 | import pandas as pd
10 | import pylab
11 | import matplotlib.pyplot as plt
12 | from sklearn import linear_model
13 | from scipy.stats.mstats import winsorize
14 | import scipy.stats as stats
15 | import statsmodels.api as sm
16 | 
17 | import hft.utils as utils
18 | 
19 | logger = logging.getLogger(__name__)
20 | 
21 | # signal construction
22 | # -------------------
23 | 
24 | with open(os.path.join('config', 'signal.json')) as signal_config_file:
25 |     signal_config = json.load(signal_config_file)
26 | # N_FLEXIBLE_HALF_SECONDS = signal_config['n_flexible_half_seconds']
27 | 
28 | 
29 | def order_imbalance_ratio(px, backward_seconds, forward_seconds, index_series):
30 |     """Order Imbalance Ratio
31 |     Reference: Cartea, Donnelly and Jaimungal (2015)
32 |     """
33 |     px['order_imbalance_ratio'] = (px['b1_size']-px['s1_size']) / (px['b1_size']+px['s1_size'])
34 |     px = utils.moving_operate(px, 'order_imbalance_ratio', np.mean, backward_seconds, forward_seconds, index_series)
35 |     return px
36 | 
37 | 
38 | def get_order_imbalance_column_name(conservative):
39 |     return ('conservative_' if conservative else '') + 'order_flow_imbalance'
40 | 
41 | 
42 | def single_order_imbalance(px, conservative=False):
43 |     """Order Flow Imbalance
44 |     Reference: [1] Rama Cont, Kukanov and Stoikov (2011)
45 |                [2] D Shen (2015)
46 | 
47 |     :param px: pandas data frame
48 |     :param conservative: logical, if True use definition in [1], otherwise use definition in [2]
49 |     :return: pandas data frame with OFI column appended
50 |     """
51 |     px['delta_b1'] = (px.b1 >= px.b1.shift(1)) * px.b1_size - (px.b1 <= px.b1.shift(1)) * px.b1_size.shift(1)
52 |     px['delta_s1'] = (px.s1 <= px.s1.shift(1)) * px.s1_size - (px.s1 >= px.s1.shift(1)) * px.s1_size.shift(1)
53 |     if conservative:
54 |         px.loc[px.b1 < px.b1.shift(1), 'delta_b1'] = 0.0
55 |         px.loc[px.s1 > px.s1.shift(1), 'delta_s1'] = 0.0
56 |     col_name = get_order_imbalance_column_name(conservative)
57 |     px[col_name] = px['delta_b1'] - px['delta_s1']
58 |     px.drop(['delta_b1', 'delta_s1'], axis=1, inplace=True)
59 |     return px
60 | 
61 | 
62 | def order_flow_imbalance(px, backward_seconds, forward_seconds, index_series, conservative=False):
63 |     col_name = get_order_imbalance_column_name(conservative)
64 |     if col_name not in px.columns:
65 |         px = single_order_imbalance(px, conservative)
66 |     px = utils.moving_operate(px, col_name, sum, backward_seconds, forward_seconds, index_series)
67 |     return px
68 | 
69 | 
70 | def period_return(price_series):
71 |     price_array = np.array(price_series)
72 |     return np.nan if len(price_array) == 1 else (price_array[-1] - price_array[0]) 
/ price_array[0] 73 | 74 | 75 | def period_tick_move(price_series, tick_size): 76 | price_array = np.array(price_series) 77 | return np.nan if len(price_array) == 1 else (price_array[-1] - price_array[0]) / tick_size 78 | 79 | 80 | def period_mid_move(px, backward_seconds, forward_seconds, tick_size, index_series): 81 | """ 82 | Compute period price move, price tick move and return 83 | """ 84 | px = utils.moving_operate(px, 'mid', lambda x: period_tick_move(x, tick_size), 85 | backward_seconds, forward_seconds, index_series, 'tick_move') 86 | px = utils.moving_operate(px, 'mid', period_return, backward_seconds, forward_seconds, index_series, 'return') 87 | return px 88 | 89 | 90 | def signal_on_multiple_dates(pxall, func): 91 | """Compute signal over multiple days 92 | 93 | :param pxall: pandas data frame, price data 94 | :param func: function to compute one signal 95 | :return: pandas data frame with signal column appended 96 | """ 97 | dates = sorted(list(set(pxall.date))) 98 | logger.info('Computing signal from %s to %s', dates[0], dates[-1]) 99 | px_list = [func(pxall[pxall.date == x].copy()) for x in dates] 100 | px_enrich = pd.concat(px_list) 101 | return px_enrich 102 | 103 | 104 | # signal research / backtesting 105 | # ----------------------------- 106 | 107 | 108 | def plot_two_hist(px, column, freq1, freq2): 109 | column1 = utils.get_moving_column_name(column, freq1, 0) 110 | column2 = utils.get_moving_column_name(column, freq2, 0) 111 | plt.subplot(1, 2, 1) 112 | px[column1].hist(bins=100) 113 | plt.xlabel(column1) 114 | plt.subplot(1, 2, 2) 115 | px[column2].hist(bins=100) 116 | plt.xlabel(column2) 117 | return 118 | 119 | 120 | def scatter_plot(px, x_column, x_backward, x_forward, y_column, y_backward, y_forward): 121 | x_column_name = utils.get_moving_column_name(x_column, x_backward, x_forward) 122 | y_column_name = utils.get_moving_column_name(y_column, y_backward, y_forward) 123 | regr_data = px[[x_column_name, y_column_name]].dropna() 124 | x = regr_data[[x_column_name]].values 125 | y = regr_data[y_column_name].values 126 | regr = linear_model.LinearRegression() 127 | regr.fit(x, y) 128 | print('Coefficients: \n', regr.coef_) 129 | print('R-square: %f' % regr.score(x, y)) 130 | plt.scatter(x, y, marker='o', s=0.1) 131 | plt.plot(x, regr.predict(x), color='red', linewidth=1) 132 | plt.xlabel(x_column_name) 133 | plt.ylabel(y_column_name) 134 | plt.show() 135 | return 136 | 137 | 138 | def plot_two_scatter(px, x_column, y_column, x_b1, x_f1, y_b1, y_f1, x_b2, x_f2, y_b2, y_f2): 139 | plt.subplot(1, 2, 1) 140 | scatter_plot(px, x_column, x_b1, x_f1, y_column, y_b1, y_f1) 141 | plt.subplot(1, 2, 2) 142 | scatter_plot(px, x_column, x_b2, x_f2, y_column, y_b2, y_f2) 143 | return 144 | 145 | 146 | def xy_corr(px, second_list, x_raw_column, y_raw_column='tick_move', winsorize_option=None): 147 | px_new = px.copy() 148 | x_column = [utils.get_moving_column_name(x_raw_column, x, 0) for x in second_list] 149 | y_column = [utils.get_moving_column_name(y_raw_column, 0, x) for x in second_list] 150 | if winsorize_option is not None: 151 | for col in x_column: 152 | px_new[col] = utils.winsorize(px_new[col], winsorize_option['x_prob'], winsorize_option['x_bound']) 153 | for col in y_column: 154 | px_new[col] = utils.winsorize(px_new[col], winsorize_option['y_prob'], winsorize_option['y_bound']) 155 | big_corr = px_new[x_column + y_column].corr() 156 | corr_mat = big_corr.loc[y_column, x_column] 157 | return corr_mat 158 | 159 | 160 | def xx_corr(px, second_list, column_name, 
row_name): 161 | column_names = [utils.get_moving_column_name(column_name, x, 0) for x in second_list] 162 | row_names = [utils.get_moving_column_name(row_name, x, 0) for x in second_list] 163 | big_corr = px[column_names + row_names].corr() 164 | corr_mat = big_corr.loc[row_names, column_names] 165 | return corr_mat 166 | 167 | 168 | def reg(px, freq_oir, freq_ofi, freq_xreturn, freq_yreturn, show_plot=True, show_inference=True): 169 | oir_column_name = utils.get_moving_column_name('order_imbalance_ratio', freq_oir, 0) 170 | ofi_column_name = utils.get_moving_column_name('order_flow_imbalance', freq_ofi, 0) 171 | xreturn_column_name = utils.get_moving_column_name('tick_move', freq_xreturn, 0) 172 | yreturn_column_name = utils.get_moving_column_name('tick_move', 0, freq_yreturn) 173 | regr_data = px[[oir_column_name, ofi_column_name, xreturn_column_name, yreturn_column_name]].dropna() 174 | regr_data[ofi_column_name] = winsorize(regr_data[ofi_column_name], (0.005, 0.005)) 175 | # regr_data[xreturn_column_name] = winsorize(regr_data[xreturn_column_name], (0.005, 0.005)) 176 | # regr_data[yreturn_column_name] = winsorize(regr_data[yreturn_column_name], (0.005, 0.005)) 177 | x = regr_data[[oir_column_name, ofi_column_name, xreturn_column_name]].values 178 | y = regr_data[yreturn_column_name].values 179 | regr = linear_model.LinearRegression() 180 | regr.fit(x, y) 181 | yhat = regr.predict(x) 182 | resids = yhat - y 183 | if show_plot: 184 | # regression line 185 | plt.figure(1) 186 | plt.scatter(yhat, y, marker='o', s=0.1) 187 | plt.plot(yhat, yhat, color='red', linewidth=1) 188 | plt.xlabel('Fitted ' + yreturn_column_name) 189 | plt.ylabel('Observed ' + yreturn_column_name) 190 | plt.show() 191 | # residual histogram 192 | plt.figure(2) 193 | plt.hist(resids, bins=40) 194 | plt.title('Histogram of residuals') 195 | # residual qq plot 196 | plt.figure(3) 197 | stats.probplot(resids, dist="norm", plot=pylab) 198 | plt.title('QQ plot of residuals') 199 | if show_inference: 200 | x2 = sm.add_constant(x) 201 | est = sm.OLS(y, x2) 202 | est2 = est.fit() 203 | print(est2.summary()) 204 | return {'r-square': regr.score(x, y), 'beta': regr.coef_, 'residuals': resids} 205 | -------------------------------------------------------------------------------- /hft/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions 3 | """ 4 | 5 | import logging 6 | import numpy as np 7 | import pandas as pd 8 | from scipy.stats import mstats 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | # array/atom manipulation 13 | # ----------------------- 14 | 15 | 16 | def safe_divide(a, b): 17 | return np.nan if b == 0 else a/b 18 | 19 | 20 | def winsorize(array, prob, bound): 21 | """Winsorize an array based on both prob and bound 22 | 23 | :param array: array-like 24 | :param prob: list of len=2, tail probabilities, example: [0.01, 0.01] 25 | :param bound: list of len=2, tail bounds, example: [-100, 100] 26 | :return: array-like, winsorized array 27 | """ 28 | winsorized_array = pd.Series(mstats.winsorize(array, tuple(prob))) 29 | winsorized_array[winsorized_array > bound[1]] = bound[1] 30 | winsorized_array[winsorized_array < bound[0]] = bound[0] 31 | return winsorized_array.values 32 | 33 | 34 | def get_period_px(px, period): 35 | """Return period price (minutely, five-minutely etc) 36 | 37 | :param px: original px data frame, must have column mid 38 | :param period: integer, in seconds, to get minutely return, set period=60 39 | :return: numpy array 
of period prices 40 | """ 41 | period_px = px.groupby(pd.cut(px['second'], np.arange(0, 21600, period))).last() 42 | period_px = period_px[~period_px.time.isin(['10:30:00', '13:30:00'])] 43 | period_px = period_px[~np.isnan(period_px.second)] 44 | return period_px 45 | 46 | 47 | # table aggregation 48 | # ----------------- 49 | 50 | 51 | def aggregate(pxall, group, funs, rename_dict=None): 52 | daily_agg_px = pxall.groupby(['date', group]).agg(funs) 53 | daily_agg_px.rename(columns=rename_dict, inplace=True) 54 | daily_agg_px['n_trades'] = pxall.groupby(['date', group]).size() 55 | agg_px = daily_agg_px.reset_index().groupby(group).median() 56 | return agg_px 57 | 58 | 59 | def left_join(df1, df2, key_column): 60 | """Left join two pandas data frames. Always replace columns in df1 if also presented in df2 61 | 62 | :param df1: pandas data frame 63 | :param df2: pandas data frame 64 | :param key_column: list of strings or string 65 | :return: pandas data frame 66 | """ 67 | df1 = df1[list(key_column) + df1.columns.difference(df2.columns).tolist()] 68 | df = pd.merge(df1, df2, on=key_column, how='left') 69 | return df 70 | 71 | 72 | # compute a new column based on a period of data 73 | # ---------------------------------------------- 74 | 75 | 76 | def get_moving_column_name(column, backward_seconds, forward_seconds): 77 | return column + '_' + str(backward_seconds) + '_' + str(forward_seconds) 78 | 79 | 80 | def get_raw_column_name(moving_column_name): 81 | words = moving_column_name.split('_') 82 | return '_'.join(words[:(len(words)-2)]) 83 | 84 | 85 | def get_index_within_period(second, backward_seconds, forward_seconds, px=None): 86 | logger.info('Getting index within (%s, %s) seconds', str(backward_seconds), str(forward_seconds)) 87 | forward_second = second + forward_seconds 88 | backward_second = second - backward_seconds 89 | index_series = [second.index[(second.between(backward_second[i], forward_second[i])).values] for i in second.index] 90 | idx_col = get_moving_column_name('index_within_period', backward_seconds, forward_seconds) 91 | if px is not None: 92 | px[idx_col] = index_series 93 | logger.info('Finished getting index within (%s, %s) seconds', str(backward_seconds), str(forward_seconds)) 94 | return pd.Series(index_series, index=second.index, name=idx_col) 95 | 96 | 97 | def get_index_multiple_dates(pxall, backward_seconds, forward_seconds): 98 | dates = sorted(list(set(pxall.date))) 99 | logger.info('Getting index from %s to %s', dates[0], dates[-1]) 100 | index_list = [get_index_within_period(pxall.loc[pxall.date == x, 'second'], backward_seconds, forward_seconds) 101 | for x in dates] 102 | index_series = pd.concat(index_list) 103 | return index_series 104 | 105 | 106 | def moving_operate(px, column_name, func, backward_seconds, forward_seconds, index_series, new_column_name=None): 107 | """Compute the moving operation of a column 108 | 109 | :param px: pandas data frame, need to have column column 110 | :type px: pandas data frame 111 | :param forward_seconds: int, number of seconds going forward 112 | :param backward_seconds: int, number of seconds going backward 113 | :param column_name: string, column name 114 | :param func: function, could be average, sum or any user-defined operations 115 | :param new_column_name: string, new column name 116 | :param index_series: pandas series, index of prevailing observations 117 | :return: pandas data frame 118 | """ 119 | if new_column_name is None: 120 | new_column_name = column_name 121 | new_column_name = 
get_moving_column_name(new_column_name, backward_seconds, forward_seconds)
122 |     logger.info('Computing moving operation')
123 |     index_series = index_series[px.index]
124 |     px[new_column_name] = [func(px.loc[idx, column_name]) for idx in index_series]
125 |     logger.info('Finish computing moving operation')
126 |     return px
127 | 
--------------------------------------------------------------------------------
/research/backtest.py:
--------------------------------------------------------------------------------
1 | """
2 | Back test
3 | """
4 | 
5 | import os
6 | import json
7 | import logging
8 | import numpy as np
9 | import pandas as pd
10 | 
11 | import hft.backtester as bt
12 | 
13 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
14 | 
15 | hft_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft')
16 | data_path = os.path.join(hft_path, 'data')
17 | research_path = os.path.join(hft_path, 'research')
18 | 
19 | # load enriched data
20 | # ------------------
21 | 
22 | product = 'cu'  # switch between cu and zn
23 | with open(os.path.join(data_path, 'ticksize.json')) as ticksize_file:
24 |     ticksize_json = json.load(ticksize_file)
25 | 
26 | px = pd.read_pickle(os.path.join(data_path, product+'_enriched.pkl'))
27 | if product == 'zn':
28 |     # remove unusual day for zn
29 |     px20131031 = px[px.date == '2013-10-31']
30 |     px = px[px.date != '2013-10-31']
31 | 
32 | # configuration
33 | # -------------
34 | 
35 | config = dict()
36 | 
37 | # general configuration
38 | config['name'] = product + '_1'
39 | config['data_path'] = data_path
40 | config['start_date'] = '2013-10-05'
41 | config['tick_size'] = ticksize_json[product]
42 | 
43 | # model specifics
44 | config['training_period'] = 1  # days
45 | config['feature_column'] = ['order_imbalance_ratio', 'order_flow_imbalance', 'tick_move']
46 | config['feature_freq'] = [1, 2, 5, 10, 20, 30, 60, 120, 180, 300]
47 | config['feature_winsorize_prob'] = {'order_imbalance_ratio': [0.0, 0.0],
48 |                                     'order_flow_imbalance': [0.005, 0.005],
49 |                                     'tick_move': [0, 0]}
50 | config['feature_winsorize_bound'] = {'order_imbalance_ratio': [-np.inf, np.inf],
51 |                                      'order_flow_imbalance': [-np.inf, np.inf],
52 |                                      'tick_move': [-10, 10]}
53 | config['response_column'] = 'tick_move'
54 | config['response_winsorize_prob'] = [0, 0]
55 | config['response_winsorize_bound'] = [-5, 5]
56 | 
57 | # open/close/hold condition
58 | config['holding_period'] = 120  # seconds
59 | config['dynamic_unwinding'] = True
60 | config['unwinding_tick_move_upper_bound'] = 3
61 | config['unwinding_tick_move_lower_bound'] = -3
62 | config['trade_trigger_threshold'] = [-1.5, 1.5]
63 | config['start_second'] = 120
64 | config['end_second'] = 21420
65 | 
66 | # pnl
67 | config['use_mid'] = False  # if False, use touch price
68 | config['transaction_fee'] = 0.0001  # 1 bps transaction fee
69 | 
70 | # backtesting
71 | # -----------
72 | 
73 | btdf, fitting_stats = bt.backtest(px, config)  # backtest() returns (btdf, fitting_stats)
74 | btdf = bt.trade(btdf, config)
75 | btdf = bt.pnl(btdf, config)
76 | # bt.save(btdf, config)
77 | 
78 | trades = btdf[btdf.trade != 0]
79 | bt.summary(btdf, config)
80 | bt.daily_summary(btdf)
81 | trades.pnl.hist(bins=30)
82 | 
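# A quick sanity check that fits here (a sketch, using only columns that
# bt.daily_summary() already returns): plot cumulative daily net pnl to
# eyeball drawdowns before running the grid searches below.
daily = bt.daily_summary(btdf)
daily['net_pnl'].cumsum().plot(title='cumulative net pnl')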
83 | # pnl vs threshold - fixed period
84 | # -------------------------------
85 | 
86 | training_periods = [1, 5]
87 | holding_periods = [20, 30, 60, 120, 180, 300]
88 | thresholds = [0.5, 1.0, 1.5, 2.0]
89 | file_path = os.path.join(data_path, 'backtest', product + '_by_hldg_thld')
90 | res_table = pd.DataFrame()
91 | 
92 | for training_period in training_periods:
93 |     print('############################################')
94 |     print('########## training_period = ' + str(training_period) + ' ##########')
95 |     config['training_period'] = training_period
96 |     for use_mid in [True, False]:
97 |         print('############################################')
98 |         print('########## use_mid = ' + str(use_mid) + ' ##########')
99 |         config['use_mid'] = use_mid
100 |         for hldg in holding_periods:
101 |             print('Computing pnl for holding_period = ' + str(hldg))
102 |             by_thld_table = pd.DataFrame()
103 |             config['holding_period'] = hldg
104 |             btdf, _ = bt.backtest(px, config)
105 |             for thld in thresholds:
106 |                 config['trade_trigger_threshold'] = [-thld, thld]
107 |                 btdf = bt.trade(btdf, config)
108 |                 btdf = bt.pnl(btdf, config)
109 |                 by_thld_table[str(thld)] = bt.summary(btdf, config)
110 |             by_thld_table = by_thld_table.transpose()
111 |             res_table = res_table.append(by_thld_table)
112 |             # file_name = os.path.join(file_path, product + '_' + str(hldg) + '.csv')
113 |             # by_thld_table.to_csv(file_name)
114 | 
115 | file_name = os.path.join(file_path, product + '.csv')
116 | res_table.to_csv(file_name, index=False)
117 | 
118 | # pnl vs threshold - dynamic holding
119 | # ----------------------------------
120 | 
121 | training_periods = [1, 5]
122 | thresholds = [0.5, 1.0, 1.5]
123 | holding_periods = [60, 120, 300]
124 | unwinding_upper_bounds = [3, 3, 5, 5]
125 | unwinding_lower_bounds = [-3, -2, -5, -3]
126 | file_path = os.path.join(data_path, 'backtest')
127 | res_table = pd.DataFrame()
128 | fitting_stats = pd.DataFrame()
129 | 
130 | for training_period in training_periods:
131 |     print('############################################')
132 |     print('########## training_period = ' + str(training_period) + ' ##########')
133 |     config['training_period'] = training_period
134 |     for hldg in holding_periods:
135 |         print('############################################')
136 |         print('########## holding_period = ' + str(hldg) + ' ##########')
137 |         config['holding_period'] = hldg
138 |         by_thld_table = pd.DataFrame()
139 |         btdf, stats = bt.backtest(px, config)
140 |         fitting_stats = fitting_stats.append(stats)
141 |         for i_unwinding in range(len(unwinding_lower_bounds)):
142 |             print('Unwinding upper bound = ' + str(unwinding_upper_bounds[i_unwinding]))
143 |             config['unwinding_tick_move_upper_bound'] = unwinding_upper_bounds[i_unwinding]
144 |             config['unwinding_tick_move_lower_bound'] = unwinding_lower_bounds[i_unwinding]
145 |             for thld in thresholds:
146 |                 config['trade_trigger_threshold'] = [-thld, thld]
147 |                 btdf = bt.trade(btdf, config)
148 |                 for use_mid in [True, False]:
149 |                     config['use_mid'] = use_mid
150 |                     btdf = bt.pnl(btdf, config)
151 |                     by_thld_table = bt.summary(btdf, config)
152 |                     res_table = res_table.append(by_thld_table, ignore_index=True)
153 |                     # file_name = os.path.join(file_path, product + '_' + str(hldg) + '.csv')
154 |                     # by_thld_table.to_csv(file_name)
155 | 
156 | res_file_name = os.path.join(file_path, product + '_dynamic_holding.csv')
157 | fit_file_name = os.path.join(file_path, product + '_dynamic_holding_fitting.pkl')
158 | res_table.to_csv(res_file_name, index=False)
159 | fitting_stats.to_pickle(fit_file_name)
160 | 
161 | # examine why the pnl is positive
162 | # -------------------------------
163 | 
164 | config = dict()
165 | 
166 | # general configuration
167 | config['name'] = product + '_1'
168 | config['data_path'] = data_path
169 | config['start_date'] = '2013-10-05'
170 | config['tick_size'] = ticksize_json[product]
171 | 
172 | # model specifics
173 | config['training_period'] = 1  # 
days 174 | config['feature_column'] = ['order_imbalance_ratio', 'order_flow_imbalance', 'tick_move'] 175 | config['feature_freq'] = [1, 2, 5, 10, 20, 30, 60, 120, 180, 300] 176 | config['feature_winsorize_prob'] = {'order_imbalance_ratio': [0.0, 0.0], 177 | 'order_flow_imbalance': [0.005, 0.005], 178 | 'tick_move': [0, 0]} 179 | config['feature_winsorize_bound'] = {'order_imbalance_ratio': [-np.inf, np.inf], 180 | 'order_flow_imbalance': [-np.inf, np.inf], 181 | 'tick_move': [-10, 10]} 182 | config['response_column'] = 'tick_move' 183 | config['response_winsorize_prob'] = [0, 0] 184 | config['response_winsorize_bound'] = [-5, 5] 185 | 186 | # open/close/hold condition 187 | config['holding_period'] = 60 # seconds 188 | config['dynamic_unwinding'] = True 189 | config['unwinding_tick_move_upper_bound'] = 5 190 | config['unwinding_tick_move_lower_bound'] = -5 191 | config['trade_trigger_threshold'] = [-1.5, 1.5] 192 | config['start_second'] = 120 193 | config['end_second'] = 21420 194 | 195 | # pnl 196 | config['use_mid'] = True # if False, use touch price 197 | config['transaction_fee'] = 0.0001 # 1 bps transaction fee 198 | 199 | # backtesting 200 | 201 | btdf, stats = bt.backtest(px, config) 202 | btdf = bt.trade(btdf, config) 203 | btdf = bt.pnl(btdf, config) 204 | trades = btdf[btdf.trade != 0] 205 | bt.summary(btdf, config) 206 | bt.daily_summary(btdf) 207 | t = trades[trades.date == '2013-12-26'] 208 | -------------------------------------------------------------------------------- /research/eda.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | import matplotlib.pylab as plt 4 | 5 | import hft.data_loader as dl 6 | import hft.utils as utils 7 | 8 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s') 9 | 10 | # some random day 11 | # --------------- 12 | 13 | product = 'cu' # switch between cu and zn 14 | yyyymmdd = '20131015' 15 | px = dl.load_active_contract(product, yyyymmdd) 16 | px.price.plot() 17 | px[['price', 'mid']].plot() 18 | 19 | # overall eda 20 | # ----------- 21 | 22 | dates = dl.get_dates() 23 | pxall = dl.load_active_contract_multiple_dates(product, dates) 24 | 25 | # daily aggregate 26 | # we are more interested in intraday behavior 27 | 28 | daily_funs = {'price': 'last', 'volume': 'last', 'open_interest': 'sum', 'spread': 'mean', 'mid': 'mean', 29 | 'return': lambda x: np.nansum(x * x)} 30 | daily_px = pxall.groupby('date').agg(daily_funs) 31 | daily_px.rename(columns={'return': 'realized_vol'}, inplace=True) 32 | daily_px.volume.plot(title='volume') 33 | daily_px.price.plot(title='close') 34 | daily_px.spread.plot(title='avg spread') 35 | daily_px.mid.plot(title='avg mid px') 36 | daily_px.realized_vol.plot(title='realized volatility') 37 | daily_px[['price', 'mid']].plot() 38 | plt.plot(daily_px.price, daily_px.volume, 'o') 39 | plt.plot(daily_px.volume, daily_px.spread, 'o') 40 | 41 | # intraday 42 | 43 | pxall['hour'] = pxall.index.hour 44 | pxall['minute'] = pxall.index.minute + 60 * pxall.index.hour 45 | funs = {'mid': np.mean, 'qty': np.sum, 'spread': np.mean, 'open_interest': np.sum, 46 | 'b1_size': np.mean, 's1_size': np.mean, 'return': lambda x: np.nansum(x*x)} 47 | rename_dict = {'return': 'realized_vol'} 48 | 49 | hourly_px = utils.aggregate(pxall, 'hour', funs, rename_dict) 50 | 51 | minutely_px = utils.aggregate(pxall, 'minute', funs, rename_dict) 52 | minutely_px['n_trades'].plot(title='# trades') 53 | 
minutely_px['qty'].plot(title='volume')
54 | minutely_px['spread'].plot(title='spread')
55 | minutely_px['realized_vol'].plot(title='realized volatility')
56 | minutely_px[['b1_size', 's1_size']].plot()
57 | (minutely_px.b1_size - minutely_px.s1_size).plot(title='b1_size - s1_size')
58 | plt.plot(minutely_px.spread, minutely_px.realized_vol, 'o')
--------------------------------------------------------------------------------
/research/hmm.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import logging
4 | import numpy as np
5 | import pandas as pd
6 | from hmmlearn import hmm
7 | import matplotlib.pyplot as plt
8 | from datetime import datetime
9 | 
10 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
11 | 
12 | hft_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft')
13 | data_path = os.path.join(hft_path, 'data')
14 | research_path = os.path.join(hft_path, 'research')
15 | 
16 | # load enriched data
17 | # ------------------
18 | 
19 | product = 'cu'  # switch between cu and zn
20 | with open(os.path.join(data_path, 'ticksize.json')) as ticksize_file:
21 |     ticksize_json = json.load(ticksize_file)
22 | px = pd.read_pickle(os.path.join(data_path, product+'.pkl'))
23 | 
24 | if product == 'zn':
25 |     # remove unusual day for zn
26 |     px20131031 = px[px.date == '2013-10-31']
27 |     px = px[px.date != '2013-10-31']
28 | 
29 | dates = list(set(px.date.tolist()))
30 | dates.sort()
31 | n_dates = len(dates)
32 | format_dates = [datetime.strptime(x, '%Y-%m-%d') for x in dates]
33 | 
34 | # test hmm!
35 | # ---------
36 | 
37 | date = '2013-10-09'
38 | dailyPx = px[['date', 'mid']][px.date == date]
39 | dailyPx['tick_move'] = (dailyPx['mid']-dailyPx['mid'].shift(1)) / ticksize_json[product]
40 | dailyPx.mid.plot()
41 | dailyPx.tick_move.plot()
42 | 
43 | x = dailyPx[dailyPx.date == date].tick_move.values[1:]
44 | x = np.clip(x, -3, 3)  # clip both tails into [-3, 3]; assigning 3 to abs(x) > 3 would flip large down-moves to +3
45 | x = x.reshape(x.size, 1)
46 | model = hmm.GaussianHMM(n_components=5, covariance_type='diag', n_iter=50)
47 | model.fit(x)
48 | 
49 | # hmm parameter by date
50 | # ---------------------
51 | 
52 | n_comp = 3
53 | transmat = np.repeat(np.nan, n_comp*n_comp*n_dates).reshape(n_dates, n_comp, n_comp)
54 | emission_mean = np.repeat(np.nan, n_comp*n_dates).reshape(n_dates, n_comp)
55 | emission_std = np.repeat(np.nan, n_comp*n_dates).reshape(n_dates, n_comp)
56 | starting_prob = np.repeat(np.nan, n_comp*n_dates).reshape(n_dates, n_comp)
57 | 
58 | for i, date in enumerate(dates):
59 |     print('Fit HMM on ' + date)
60 |     dailyPx = px[['date', 'mid']][px.date == date]
61 |     dailyPx['tick_move'] = (dailyPx['mid'] - dailyPx['mid'].shift(1)) / ticksize_json[product]
62 |     x = dailyPx[dailyPx.date == date].tick_move.values[1:]
63 |     x = np.clip(x, -3, 3)  # clip both tails, as above
64 |     x = x.reshape(x.size, 1)
65 |     model = hmm.GaussianHMM(n_components=n_comp, n_iter=50)
66 |     model.fit(x)
67 |     index = np.argsort(model.means_.reshape(n_comp))  # sort states based on means
68 |     transmat[i, :, :] = model.transmat_[np.ix_(index, index)]  # np.ix_ reorders both rows and columns of the matrix
69 |     emission_mean[i, :] = model.means_.reshape(n_comp)[index]
70 |     emission_std[i, :] = np.sqrt(model.covars_).reshape(n_comp)[index]
71 |     starting_prob[i, :] = model.startprob_[index]
72 | 
73 | plt.plot(format_dates, emission_mean[:, 0], 'r')
74 | plt.plot(format_dates, emission_mean[:, 1], 'b')
75 | plt.plot(format_dates, emission_mean[:, 2], 'g')
76 | plt.show()
77 | 
78 | plt.plot(format_dates, emission_std[:, 0], 'r')
79 | plt.plot(format_dates, emission_std[:, 1], 'b')
80 | plt.plot(format_dates, emission_std[:, 2], 'g')
81 | plt.show()
82 | 
83 | plt.plot(format_dates, starting_prob[:, 0], 'r')
84 | plt.plot(format_dates, starting_prob[:, 1], 'b')
85 | plt.plot(format_dates, starting_prob[:, 2], 'g')
86 | plt.show()
87 | 
88 | plt.plot(format_dates, transmat[:, 0, 0], 'r')
89 | plt.plot(format_dates, transmat[:, 0, 1], 'b')
90 | plt.plot(format_dates, transmat[:, 0, 2], 'g')
91 | plt.show()
92 | 
93 | plt.plot(format_dates, transmat[:, 1, 0], 'r')
94 | plt.plot(format_dates, transmat[:, 1, 1], 'b')
95 | plt.plot(format_dates, transmat[:, 1, 2], 'g')
96 | plt.show()
97 | 
98 | plt.plot(format_dates, transmat[:, 2, 0], 'r')
99 | plt.plot(format_dates, transmat[:, 2, 1], 'b')
100 | plt.plot(format_dates, transmat[:, 2, 2], 'g')
101 | plt.show()
102 | 
--------------------------------------------------------------------------------
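Beyond the raw transition matrices plotted above, one compact per-day summary is the stationary state distribution implied by each day's chain. A self-contained sketch, assuming `model` and `x` as fitted in the loop above (the variable names here are illustrative):

import numpy as np

# pi solves pi @ P = pi with P = model.transmat_ (row-stochastic), i.e. pi is the
# eigenvector of P.T for eigenvalue 1, which is the largest eigenvalue
evals, evecs = np.linalg.eig(model.transmat_.T)
pi = np.real(evecs[:, np.argmax(np.real(evals))])
pi = pi / pi.sum()           # normalize; the Perron eigenvector has entries of one sign
states = model.predict(x)    # Viterbi-decoded state path for the same day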
/research/price_dynamics_eda.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import json
4 | from datetime import datetime
5 | import numpy as np
6 | import pandas as pd
7 | import statsmodels.tsa.stattools as sm
8 | from sklearn import linear_model
9 | 
10 | import hft.utils as utils
11 | 
12 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
13 | 
14 | 
15 | # load data
16 | # ---------
17 | 
18 | hft_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft')
19 | data_path = os.path.join(hft_path, 'data')
20 | research_path = os.path.join(hft_path, 'research')
21 | 
22 | product = 'cu'  # switch between cu and zn
23 | with open(os.path.join(data_path, 'ticksize.json')) as ticksize_file:
24 |     ticksize_json = json.load(ticksize_file)
25 | tick_size = ticksize_json[product]
26 | px = pd.read_pickle(os.path.join(data_path, product+'.pkl'))
27 | 
28 | if product == 'zn':
29 |     # remove unusual day for zn
30 |     px20131031 = px[px.date == '2013-10-31']
31 |     px = px[px.date != '2013-10-31']
32 | 
33 | dates = list(set(px.date.tolist()))
34 | dates.sort()
35 | n_dates = len(dates)
36 | format_dates = [datetime.strptime(x, '%Y-%m-%d') for x in dates]
37 | 
38 | # save minutely price
39 | # -------------------
40 | 
41 | period = 60
42 | mpx = pd.DataFrame()
43 | for date in dates:
44 |     print('Gathering prices on ' + date)
45 |     dailyPx = px[px.date == date]
46 |     dailyPx = utils.get_period_px(dailyPx, period)
47 |     mpx = mpx.append(dailyPx)
48 | mpx = mpx[['date', 'second', 'mid', 'b1', 's1', 'b1_size', 's1_size', 'spread', 'price', 'qty', 'volume', 'open_interest']]
49 | mpx.reset_index(drop=True, inplace=True)
50 | mpx.to_csv('mpx.csv', index=False)
51 | 
52 | # daily statistics
53 | # ----------------
54 | 
55 | df = pd.DataFrame()
56 | df['date'] = dates
57 | df['win'] = np.repeat(np.nan, n_dates)
58 | df['draw'] = np.repeat(np.nan, n_dates)
59 | df['lose'] = np.repeat(np.nan, n_dates)
60 | df['pnl'] = np.repeat(np.nan, n_dates)
61 | df['rv'] = np.repeat(np.nan, n_dates)
62 | df['hml'] = np.repeat(np.nan, n_dates)
63 | df['break1'] = np.repeat(np.nan, n_dates)
64 | df['break2'] = np.repeat(np.nan, n_dates)
65 | df['mid_acf_1'] = np.repeat(np.nan, n_dates)
66 | df['mid_acf_2'] = np.repeat(np.nan, n_dates)
67 | df['mid_pacf_1'] = np.repeat(np.nan, n_dates)
68 | df['mid_pacf_2'] = np.repeat(np.nan, n_dates)
69 | df['diff_acf_1'] = np.repeat(np.nan, n_dates)
70 | df['diff_acf_2'] = np.repeat(np.nan, n_dates)
71 | df['diff_pacf_1'] = np.repeat(np.nan, n_dates)
72 | df['diff_pacf_2'] = np.repeat(np.nan, n_dates)
73 | 
74 | for date in dates:
75 | 
print('Compute statistics on date ' + date) 76 | 77 | dailyPx = px.loc[px.date == date, 'mid'] 78 | seconds = px.loc[px.date == date, 'second'] 79 | mid_diff = (dailyPx - dailyPx.shift(1)).values / tick_size 80 | mid_diff = mid_diff[1:] 81 | win = np.sum(mid_diff > 0) / len(mid_diff) 82 | lose = np.sum(mid_diff < 0) / len(mid_diff) 83 | draw = np.sum(mid_diff == 0) / len(mid_diff) 84 | pnl = np.sum(mid_diff) 85 | hml = (dailyPx.max() - dailyPx.min()) / tick_size 86 | df.loc[df.date == date, 'win'] = win 87 | df.loc[df.date == date, 'draw'] = draw 88 | df.loc[df.date == date, 'lose'] = lose 89 | df.loc[df.date == date, 'pnl'] = pnl 90 | df.loc[df.date == date, 'rv'] = np.sqrt(np.nansum(mid_diff * mid_diff) / len(mid_diff)) 91 | df.loc[df.date == date, 'hml'] = hml 92 | 93 | break1 = dailyPx[seconds >= 5400].values[0] - dailyPx[seconds <= 4500].values[-1] 94 | break2 = dailyPx[seconds >= 16200].values[0] - dailyPx[seconds <= 9000].values[-1] 95 | df.loc[df.date == date, 'break1'] = break1 / tick_size 96 | df.loc[df.date == date, 'break2'] = break2 / tick_size 97 | 98 | mid_acf = sm.acf(dailyPx.values, nlags=2) 99 | mid_pacf = sm.pacf(dailyPx.values, nlags=2) 100 | diff_acf = sm.acf(mid_diff, nlags=2) 101 | diff_pacf = sm.pacf(mid_diff, nlags=2) 102 | df.loc[df.date == date, 'mid_acf_1'] = mid_acf[1] 103 | df.loc[df.date == date, 'mid_acf_2'] = mid_acf[2] 104 | df.loc[df.date == date, 'mid_pacf_1'] = mid_pacf[1] 105 | df.loc[df.date == date, 'mid_pacf_2'] = mid_pacf[2] 106 | df.loc[df.date == date, 'diff_acf_1'] = diff_acf[1] 107 | df.loc[df.date == date, 'diff_acf_2'] = diff_acf[2] 108 | df.loc[df.date == date, 'diff_pacf_1'] = diff_pacf[1] 109 | df.loc[df.date == date, 'diff_pacf_2'] = diff_pacf[2] 110 | 111 | df['format_date'] = format_dates 112 | df.set_index('format_date', inplace=True) 113 | 114 | df[['win', 'draw', 'lose']].plot() 115 | df[['win', 'lose']].plot() 116 | (df.win - df.lose).plot() 117 | df.pnl.plot() 118 | df[['pnl', 'hml']].plot() 119 | df.rv.plot() 120 | df.hml.plot() 121 | df.break1.hist() 122 | df.break2.hist() 123 | 124 | df.plot.scatter(x='win', y='lose') 125 | df.plot.scatter(x='win', y='pnl') 126 | df.plot.scatter(x='hml', y='lose') 127 | 128 | # ACF and PACF of mid and mid_diff 129 | df[['mid_acf_1', 'mid_acf_2', 'mid_pacf_1', 'mid_pacf_2']].plot() 130 | df[['diff_acf_1', 'diff_acf_2', 'diff_pacf_1', 'diff_pacf_2']].plot() 131 | 132 | 133 | # same statistics different sample freq 134 | # ------------------------------------- 135 | 136 | period = 60 137 | df = pd.DataFrame() 138 | df['date'] = dates 139 | df['win'] = np.repeat(np.nan, n_dates) 140 | df['draw'] = np.repeat(np.nan, n_dates) 141 | df['lose'] = np.repeat(np.nan, n_dates) 142 | df['pnl'] = np.repeat(np.nan, n_dates) 143 | df['rv'] = np.repeat(np.nan, n_dates) 144 | df['hml'] = np.repeat(np.nan, n_dates) 145 | df['break1'] = np.repeat(np.nan, n_dates) 146 | df['break2'] = np.repeat(np.nan, n_dates) 147 | df['mid_acf_1'] = np.repeat(np.nan, n_dates) 148 | df['mid_acf_2'] = np.repeat(np.nan, n_dates) 149 | df['mid_pacf_1'] = np.repeat(np.nan, n_dates) 150 | df['mid_pacf_2'] = np.repeat(np.nan, n_dates) 151 | df['diff_acf_1'] = np.repeat(np.nan, n_dates) 152 | df['diff_acf_2'] = np.repeat(np.nan, n_dates) 153 | df['diff_pacf_1'] = np.repeat(np.nan, n_dates) 154 | df['diff_pacf_2'] = np.repeat(np.nan, n_dates) 155 | 156 | for date in dates: 157 | print('Compute statistics on date ' + date) 158 | 159 | dailyPx = px[px.date == date] 160 | dailyPx = utils.get_period_px(dailyPx, period) 161 | seconds = 
dailyPx.second 162 | prices = dailyPx.mid 163 | mid_diff = (prices - prices.shift(1)).values / tick_size 164 | mid_diff = mid_diff[1:] 165 | win = np.sum(mid_diff > 0) / len(mid_diff) 166 | lose = np.sum(mid_diff < 0) / len(mid_diff) 167 | draw = np.sum(mid_diff == 0) / len(mid_diff) 168 | pnl = np.sum(mid_diff) 169 | hml = (prices.max() - prices.min()) / tick_size 170 | df.loc[df.date == date, 'win'] = win 171 | df.loc[df.date == date, 'draw'] = draw 172 | df.loc[df.date == date, 'lose'] = lose 173 | df.loc[df.date == date, 'pnl'] = pnl 174 | df.loc[df.date == date, 'rv'] = np.sqrt(np.nansum(mid_diff * mid_diff) / len(mid_diff)) 175 | df.loc[df.date == date, 'hml'] = hml 176 | 177 | break1 = prices[seconds >= 5400].values[0] - prices[seconds <= 4500].values[-1] 178 | break2 = prices[seconds >= 16200].values[0] - prices[seconds <= 9000].values[-1] 179 | df.loc[df.date == date, 'break1'] = break1 / tick_size 180 | df.loc[df.date == date, 'break2'] = break2 / tick_size 181 | 182 | mid_acf = sm.acf(prices.values, nlags=2) 183 | mid_pacf = sm.pacf(prices.values, nlags=2) 184 | diff_acf = sm.acf(mid_diff, nlags=2) 185 | diff_pacf = sm.pacf(mid_diff, nlags=2) 186 | df.loc[df.date == date, 'mid_acf_1'] = mid_acf[1] 187 | df.loc[df.date == date, 'mid_acf_2'] = mid_acf[2] 188 | df.loc[df.date == date, 'mid_pacf_1'] = mid_pacf[1] 189 | df.loc[df.date == date, 'mid_pacf_2'] = mid_pacf[2] 190 | df.loc[df.date == date, 'diff_acf_1'] = diff_acf[1] 191 | df.loc[df.date == date, 'diff_acf_2'] = diff_acf[2] 192 | df.loc[df.date == date, 'diff_pacf_1'] = diff_pacf[1] 193 | df.loc[df.date == date, 'diff_pacf_2'] = diff_pacf[2] 194 | 195 | df['format_date'] = format_dates 196 | df.set_index('format_date', inplace=True) 197 | 198 | df[['win', 'draw', 'lose']].plot() 199 | df[['win', 'lose']].plot() 200 | (df.win - df.lose).plot() 201 | df.pnl.plot() 202 | df[['pnl', 'hml']].plot() 203 | df.rv.plot() 204 | df.break1.hist() 205 | df.break2.hist() 206 | 207 | df.plot.scatter(x='win', y='lose') 208 | df.plot.scatter(x='win', y='pnl') 209 | df.plot.scatter(x='hml', y='lose') 210 | 211 | # ACF and PACF of mid and mid_diff 212 | df[['mid_acf_1', 'mid_acf_2', 'mid_pacf_1', 'mid_pacf_2']].plot() 213 | df[['diff_acf_1', 'diff_acf_2', 'diff_pacf_1', 'diff_pacf_2']].plot() 214 | 215 | 216 | # fit daily O-U process 217 | # --------------------- 218 | 219 | period = 60 220 | df = pd.DataFrame() 221 | df['date'] = dates 222 | df['b0'] = np.repeat(np.nan, n_dates) 223 | df['b1'] = np.repeat(np.nan, n_dates) 224 | df['mse'] = np.repeat(np.nan, n_dates) 225 | df['rsq'] = np.repeat(np.nan, n_dates) 226 | df['s0'] = np.repeat(np.nan, n_dates) 227 | df['s1'] = np.repeat(np.nan, n_dates) 228 | df['t0'] = np.repeat(np.nan, n_dates) 229 | df['t1'] = np.repeat(np.nan, n_dates) 230 | df['kappa'] = np.repeat(np.nan, n_dates) 231 | df['m'] = np.repeat(np.nan, n_dates) 232 | df['sigma'] = np.repeat(np.nan, n_dates) 233 | 234 | for date in dates: 235 | print('Fitting O-U process on date ' + date) 236 | 237 | dailyPx = px[px.date == date] 238 | dailyPx = utils.get_period_px(dailyPx, period) 239 | seconds = dailyPx.second 240 | prices = dailyPx.mid.values 241 | 242 | y = prices[1:] 243 | n = len(y) 244 | x = prices[:-1].reshape(n, 1) 245 | regr = linear_model.LinearRegression() 246 | regr.fit(x, y) 247 | 248 | b0 = regr.intercept_ 249 | b1 = regr.coef_.item() 250 | df.loc[df.date == date, 'b0'] = b0 251 | df.loc[df.date == date, 'b1'] = b1 252 | mse = np.sum((regr.predict(x) - y) ** 2) / (n-2) 253 | df.loc[df.date == date, 'mse'] = mse 254 | 
df.loc[df.date == date, 'rsq'] = regr.score(x, y)
255 |     ssq = np.sum((x - np.mean(x)) ** 2)
256 |     s1 = np.sqrt(mse / ssq)
257 |     s0 = np.sqrt(mse / ssq * np.mean(x ** 2))
258 |     df.loc[df.date == date, 's0'] = s0
259 |     df.loc[df.date == date, 's1'] = s1
260 |     df.loc[df.date == date, 't0'] = b0 / s0
261 |     df.loc[df.date == date, 't1'] = b1 / s1
262 | 
263 |     kappa = -np.log(b1) / period
264 |     df.loc[df.date == date, 'kappa'] = kappa
265 |     df.loc[df.date == date, 'm'] = b0 / (1 - b1)
266 |     df.loc[df.date == date, 'sigma'] = np.sqrt(mse * 2 * kappa / (1 - b1**2))
267 | 
268 | df['format_date'] = format_dates
269 | df.set_index('format_date', inplace=True)
270 | 
271 | df[['rsq']].plot()
272 | df[['mse']].plot()
273 | df[['b0']].plot()
274 | df[['b1']].plot()
275 | df[['t0', 't1']].plot()
276 | 
277 | df.kappa.plot()
278 | df.m.plot()
279 | df.sigma.plot()
280 | 
281 | # aggregate all the tick move and fit OU process
282 | # ----------------------------------------------
283 | 
284 | period = 60
285 | price_delta = []
286 | for date in dates:
287 |     print('Gathering prices on ' + date)
288 |     dailyPx = px[px.date == date]
289 |     dailyPx = utils.get_period_px(dailyPx, period)
290 |     prices = dailyPx.mid.values
291 |     delta = (prices[1:] - prices[:-1]) / tick_size
292 |     price_delta += list(delta)
293 | 
294 | price_delta = np.array(price_delta)
295 | prices = np.cumsum(price_delta)
296 | y = prices[1:]
297 | n = len(y)
298 | x = prices[:-1].reshape(n, 1)
299 | regr = linear_model.LinearRegression()
300 | regr.fit(x, y)
301 | b0 = regr.intercept_
302 | b1 = regr.coef_.item()
303 | mse = np.sum((regr.predict(x) - y) ** 2) / (n-2)
304 | rsq = regr.score(x, y)
305 | ssq = np.sum((x - np.mean(x)) ** 2)
306 | s1 = np.sqrt(mse/ssq)
307 | s0 = np.sqrt(mse / ssq * np.mean(x ** 2))
308 | t0 = b0 / s0
309 | t1 = b1 / s1
310 | 
311 | kappa = -np.log(b1) / period
312 | m = b0 / (1 - b1)
313 | sigma = np.sqrt(mse * 2 * kappa / (1 - b1 ** 2))
314 | 
--------------------------------------------------------------------------------
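For reference, the AR(1)-to-OU mapping used in both fits above: regressing p_{t+1} = b0 + b1 * p_t + eps at sampling interval Delta (= `period` seconds), and matching the exact discretization of the Ornstein-Uhlenbeck process dp_t = kappa * (m - p_t) dt + sigma dW_t, gives

    b1 = exp(-kappa * Delta)                        =>  kappa = -log(b1) / Delta
    b0 = m * (1 - b1)                               =>  m = b0 / (1 - b1)
    Var(eps) = sigma^2 * (1 - b1^2) / (2 * kappa)   =>  sigma = sqrt(mse * 2 * kappa / (1 - b1^2))

which is exactly how `kappa`, `m` and `sigma` are computed from the regression output, with `mse` standing in for Var(eps).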
/research/signal_cache.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import logging
4 | import pandas as pd
5 | 
6 | # import hft.data_loader as dl
7 | import hft.utils as utils
8 | import hft.signal_utils as signal
9 | 
10 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
11 | 
12 | data_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft', 'data')
13 | index_folder = os.path.join(data_path, 'index')
14 | 
15 | # load raw data
16 | # -------------
17 | 
18 | product = 'cu'  # switch between cu and zn
19 | with open(os.path.join(os.environ['HOME'], 'hft', 'ticksize.json')) as ticksize_file:
20 |     ticksize_json = json.load(ticksize_file)
21 | tick_size = ticksize_json[product]
22 | 
23 | # load raw data
24 | # dates = dl.get_dates()
25 | # pxall = dl.load_active_contract_multiple_dates(product, dates)
26 | # pxall.to_pickle(os.path.join(os.environ['HOME'], 'hft', product+'.pkl'))
27 | pxall = pd.read_pickle(os.path.join(data_path, product+'_20.pkl'))
28 | 
29 | # cache index
30 | # -----------
31 | 
32 | px = pxall.copy().reset_index()
33 | # second_list = [1, 2, 5, 10, 20]
34 | second_list = [30, 60, 120, 180, 300]
35 | 
36 | for sec in second_list:
37 |     print('----------- sec = ' + str(sec) + ' -------------')
38 |     backward_index = utils.get_index_multiple_dates(px, sec, 0)
39 |     file_name = os.path.join(index_folder, product+'_index_'+str(sec)+'_0.pkl')
40 |     print('Saving backward index to ' + file_name)
41 |     backward_index.to_pickle(file_name)
42 | 
43 |     forward_index = utils.get_index_multiple_dates(px, 0, sec)
44 |     file_name = os.path.join(index_folder, product+'_index_0_'+str(sec)+'.pkl')
45 |     print('Saving forward index to ' + file_name)
46 |     forward_index.to_pickle(file_name)
47 | 
48 | # compute signal
49 | # --------------
50 | 
51 | # px = pxall[pxall.date.isin(['2013-10-08', '2013-10-09'])].copy().reset_index()
52 | px = pxall.copy().reset_index()
53 | # second_list = [1, 2, 5, 10, 20]
54 | second_list = [30, 60, 120, 180, 300]
55 | 
56 | for sec in second_list:
57 |     print('----------- sec = ' + str(sec) + ' -------------')
58 | 
59 |     # backward metrics
60 |     filename = os.path.join(index_folder, product+'_index_'+str(sec)+'_0.pkl')
61 |     backward_index = pd.read_pickle(filename)
62 |     px = signal.signal_on_multiple_dates(px, lambda x: signal.order_imbalance_ratio(x, sec, 0, backward_index))
63 |     px = signal.signal_on_multiple_dates(px, lambda x: signal.order_flow_imbalance(x, sec, 0, backward_index, False))
64 |     px = signal.signal_on_multiple_dates(px, lambda x: signal.order_flow_imbalance(x, sec, 0, backward_index, True))
65 |     px = signal.signal_on_multiple_dates(px, lambda x: signal.period_mid_move(x, sec, 0, tick_size, backward_index))
66 | 
67 |     # forward metrics
68 |     filename = os.path.join(index_folder, product + '_index_0_' + str(sec) + '.pkl')
69 |     forward_index = pd.read_pickle(filename)
70 |     px = signal.signal_on_multiple_dates(px, lambda x: signal.period_mid_move(x, 0, sec, tick_size, forward_index))
71 | 
72 |     # save to file
73 |     filename = os.path.join(data_path, product+'_'+str(sec)+'.pkl')
74 |     print('Saving enriched data to ' + filename)  # this file holds the signal-enriched px, not an index
75 |     px.to_pickle(filename)
--------------------------------------------------------------------------------
/research/signal_research.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import logging
4 | import numpy as np
5 | import pandas as pd
6 | 
7 | 
8 | import hft.utils as utils
9 | import hft.signal_utils as signal
10 | 
11 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
12 | 
13 | hft_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft')
14 | data_path = os.path.join(hft_path, 'data')
15 | research_path = os.path.join(hft_path, 'research')
16 | 
17 | # load enriched data
18 | # ------------------
19 | 
20 | product = 'zn'  # switch between cu and zn
21 | with open(os.path.join(data_path, 'ticksize.json')) as ticksize_file:
22 |     ticksize_json = json.load(ticksize_file)
23 | tick_size = ticksize_json[product]
24 | 
25 | px = pd.read_pickle(os.path.join(data_path, product+'_enriched.pkl'))
26 | 
27 | px20131031 = px[px.date == '2013-10-31']
28 | # px = px[np.isnan(px.tick_move_1_0) | (np.abs(px.tick_move_1_0) <= 5)]
29 | px = px[px.date != '2013-10-31']
30 | 
31 | # signal and return distribution
32 | # ------------------------------
33 | 
34 | px[['order_flow_imbalance_1_0', 'order_flow_imbalance_2_0', 'order_flow_imbalance_5_0', 'order_flow_imbalance_10_0',
35 |     'order_flow_imbalance_20_0', 'order_flow_imbalance_30_0', 'order_flow_imbalance_60_0',
36 |     'order_flow_imbalance_120_0', 'order_flow_imbalance_180_0', 'order_flow_imbalance_300_0']].describe()
37 | 
38 | px[['order_imbalance_ratio_1_0', 'order_imbalance_ratio_2_0', 'order_imbalance_ratio_5_0', 'order_imbalance_ratio_10_0',
39 |     'order_imbalance_ratio_20_0', 'order_imbalance_ratio_30_0', 'order_imbalance_ratio_60_0',
40 | 
--------------------------------------------------------------------------------
/research/signal_research.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import logging
4 | import numpy as np
5 | import pandas as pd
6 | 
7 | 
8 | import hft.utils as utils
9 | import hft.signal_utils as signal
10 | 
11 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
12 | 
13 | hft_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft')
14 | data_path = os.path.join(hft_path, 'data')
15 | research_path = os.path.join(hft_path, 'research')
16 | 
17 | # load enriched data
18 | # ------------------
19 | 
20 | product = 'zn'  # switch between cu and zn
21 | with open(os.path.join(data_path, 'ticksize.json')) as ticksize_file:
22 |     ticksize_json = json.load(ticksize_file)
23 | tick_size = ticksize_json[product]
24 | 
25 | px = pd.read_pickle(os.path.join(data_path, product+'_enriched.pkl'))
26 | 
27 | px20131031 = px[px.date == '2013-10-31']
28 | # px = px[np.isnan(px.tick_move_1_0) | (np.abs(px.tick_move_1_0) <= 5)]
29 | px = px[px.date != '2013-10-31']
30 | 
31 | # signal and return distribution
32 | # ------------------------------
33 | 
34 | px[['order_flow_imbalance_1_0', 'order_flow_imbalance_2_0', 'order_flow_imbalance_5_0', 'order_flow_imbalance_10_0',
35 |     'order_flow_imbalance_20_0', 'order_flow_imbalance_30_0', 'order_flow_imbalance_60_0',
36 |     'order_flow_imbalance_120_0', 'order_flow_imbalance_180_0', 'order_flow_imbalance_300_0']].describe()
37 | 
38 | px[['order_imbalance_ratio_1_0', 'order_imbalance_ratio_2_0', 'order_imbalance_ratio_5_0', 'order_imbalance_ratio_10_0',
39 |     'order_imbalance_ratio_20_0', 'order_imbalance_ratio_30_0', 'order_imbalance_ratio_60_0',
40 |     'order_imbalance_ratio_120_0', 'order_imbalance_ratio_180_0', 'order_imbalance_ratio_300_0']].describe()
41 | 
42 | px[['tick_move_1_0', 'tick_move_2_0', 'tick_move_5_0', 'tick_move_10_0', 'tick_move_20_0', 'tick_move_30_0',
43 |     'tick_move_60_0', 'tick_move_120_0', 'tick_move_180_0', 'tick_move_300_0']].describe()
44 | 
45 | px[['tick_move_5_0', 'tick_move_0_10', 'tick_move_0_20']].describe()
46 | 
47 | signal.plot_two_hist(px, 'order_flow_imbalance', 60, 300)
48 | signal.plot_two_hist(px, 'order_imbalance_ratio', 60, 300)
49 | signal.plot_two_hist(px, 'tick_move', 60, 300)
50 | 
51 | px.groupby(np.abs(px.tick_move_0_10)).size()
52 | 
53 | print(sum(px.tick_move_0_10 == 0) / sum(~np.isnan(px.tick_move_0_10)))  # % no move
54 | print(sum(np.abs(px.tick_move_0_10) >= 1) / sum(~np.isnan(px.tick_move_0_10)))  # % at least 1-tick move
55 | print(sum(np.abs(px.tick_move_0_10) >= 2) / sum(~np.isnan(px.tick_move_0_10)))  # % at least 2-tick move
56 | 
57 | print(sum(px.tick_move_0_20 == 0) / sum(~np.isnan(px.tick_move_0_20)))  # % no move
58 | print(sum(np.abs(px.tick_move_0_20) >= 1) / sum(~np.isnan(px.tick_move_0_20)))  # % at least 1-tick move
59 | print(sum(np.abs(px.tick_move_0_20) >= 2) / sum(~np.isnan(px.tick_move_0_20)))  # % at least 2-tick move
60 | 
61 | # scatter plot
62 | # ------------
63 | 
64 | # forward return by signal
65 | signal.plot_two_scatter(px, 'order_imbalance_ratio', 'tick_move', 1, 0, 0, 1, 5, 0, 0, 5)
66 | signal.plot_two_scatter(px, 'order_flow_imbalance', 'tick_move', 60, 0, 0, 60, 300, 0, 0, 300)
67 | signal.plot_two_scatter(px, 'tick_move', 'tick_move', 5, 0, 0, 5, 60, 0, 0, 60)
68 | 
69 | # signal by signal
70 | signal.plot_two_scatter(px, 'order_imbalance_ratio', 'tick_move', 1, 0, 1, 0, 5, 0, 5, 0)
71 | signal.plot_two_scatter(px, 'order_flow_imbalance', 'tick_move', 60, 0, 60, 0, 300, 0, 300, 0)
72 | signal.plot_two_scatter(px, 'order_flow_imbalance', 'order_imbalance_ratio', 60, 0, 60, 0, 300, 0, 300, 0)
73 | 
74 | # correlations
75 | # ------------
76 | 
77 | 
78 | second_list = [1, 2, 5, 10, 20, 30, 60, 120, 180, 300]
79 | for sec in second_list:
80 |     px = px[(np.abs(px[utils.get_moving_column_name('tick_move', 0, sec)]) <= 10) | np.isnan(px.tick_move_1_0)]
81 |     px = px[(np.abs(px[utils.get_moving_column_name('tick_move', sec, 0)]) <= 10) | np.isnan(px.tick_move_1_0)]
82 | 
83 | oir_corr = signal.xy_corr(px, second_list, 'order_imbalance_ratio')
84 | ofi_corr = signal.xy_corr(px, second_list, 'order_flow_imbalance')
85 | autocorr = signal.xy_corr(px, second_list, 'tick_move')
86 | oir_corr.to_csv(os.path.join(research_path, 'oir_corr.csv'))
87 | ofi_corr.to_csv(os.path.join(research_path, 'ofi_corr.csv'))
88 | autocorr.to_csv(os.path.join(research_path, 'autocorr.csv'))
89 | 
90 | oir_ofi = signal.xx_corr(px, second_list, 'order_imbalance_ratio', 'order_flow_imbalance')
91 | oir_return = signal.xx_corr(px, second_list, 'order_imbalance_ratio', 'tick_move')
92 | ofi_return = signal.xx_corr(px, second_list, 'order_flow_imbalance', 'tick_move')
93 | oir_ofi.to_csv(os.path.join(research_path, 'oir_ofi_corr.csv'))
94 | oir_return.to_csv(os.path.join(research_path, 'oir_return_corr.csv'))
95 | ofi_return.to_csv(os.path.join(research_path, 'ofi_return_corr.csv'))
96 | 
97 | # multivariate regression
98 | # -----------------------
99 | 
100 | freq_oir = 1
101 | freq_ofi = 5
102 | freq_xreturn = 2
103 | freq_yreturn = 10
104 | 
105 | res = signal.reg(px, freq_oir, freq_ofi, freq_xreturn, freq_yreturn, True)
106 | 
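The final call fits the 10-second forward tick move on a 1-second OIR, a 5-second OFI and a 2-second trailing return. A rough standalone equivalent built directly on the enriched columns, assuming the naming convention sketched earlier (signal.reg itself lives in hft/signal_utils.py and may differ, e.g. in winsorization or intercept handling):

from sklearn import linear_model

def multivariate_alpha_fit(px, freq_oir=1, freq_ofi=5, freq_xreturn=2, freq_yreturn=10):
    """Regress the forward tick move on OIR, OFI and the trailing return."""
    x_cols = ['order_imbalance_ratio_%d_0' % freq_oir,
              'order_flow_imbalance_%d_0' % freq_ofi,
              'tick_move_%d_0' % freq_xreturn]
    y_col = 'tick_move_0_%d' % freq_yreturn
    data = px[x_cols + [y_col]].dropna()  # drop rows where any window is undefined
    regr = linear_model.LinearRegression()
    regr.fit(data[x_cols].values, data[y_col].values)
    return regr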
--------------------------------------------------------------------------------
/tests/scratch_pad.py:
--------------------------------------------------------------------------------
1 | 
2 | import os
3 | import json
4 | import logging
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | 
8 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
9 | 
10 | hft_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft')
11 | data_path = os.path.join(hft_path, 'data')
12 | research_path = os.path.join(hft_path, 'research')
13 | 
14 | # load enriched data
15 | # ------------------
16 | 
17 | product = 'cu'  # switch between cu and zn
18 | with open(os.path.join(data_path, 'ticksize.json')) as ticksize_file:
19 |     ticksize_json = json.load(ticksize_file)
20 | 
21 | px = pd.read_pickle(os.path.join(data_path, product+'_enriched.pkl'))
22 | px20131031 = px[px.date == '2013-10-31']
23 | px = px[px.date != '2013-10-31']
24 | 
25 | daily_move = px.groupby('date')[['mid']].transform(pd.Series.diff)
26 | daily_move['tick_move'] = daily_move['mid'] / ticksize_json[product]
27 | daily_move['date'] = px['date']
28 | daily_move[daily_move.date == '2013-10-08'].tick_move.plot()
--------------------------------------------------------------------------------
/tests/test_backtester.py:
--------------------------------------------------------------------------------
1 | """
2 | Back test
3 | """
4 | 
5 | import os
6 | import json
7 | import logging
8 | import numpy as np
9 | import pandas as pd
10 | 
11 | import hft.backtester as bt
12 | 
13 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
14 | 
15 | hft_path = os.path.join(os.environ['HOME'], 'dropbox', 'hft')
16 | data_path = os.path.join(hft_path, 'data')
17 | research_path = os.path.join(hft_path, 'research')
18 | 
19 | # load enriched data
20 | # ------------------
21 | 
22 | product = 'zn'  # switch between cu and zn
23 | with open(os.path.join(data_path, 'ticksize.json')) as ticksize_file:
24 |     ticksize_json = json.load(ticksize_file)
25 | 
26 | px = pd.read_pickle(os.path.join(data_path, product+'_enriched.pkl'))
27 | px20131031 = px[px.date == '2013-10-31']
28 | px = px[px.date != '2013-10-31']
29 | 
30 | # configuration
31 | # -------------
32 | 
33 | config = dict()
34 | 
35 | # general configuration
36 | config['name'] = product + '_1'
37 | config['data_path'] = data_path
38 | config['start_date'] = '2013-10-05'
39 | 
40 | # model specifics
41 | config['training_period'] = 21  # days
42 | config['feature_column'] = ['order_imbalance_ratio', 'order_flow_imbalance', 'tick_move']
43 | config['feature_freq'] = [1, 2, 5, 10, 20, 30, 60, 120, 180, 300]
44 | config['feature_winsorize_prob'] = {'order_imbalance_ratio': [0.0, 0.0],
45 |                                     'order_flow_imbalance': [0.005, 0.005],
46 |                                     'tick_move': [0, 0]}
47 | config['feature_winsorize_bound'] = {'order_imbalance_ratio': [-np.inf, np.inf],
48 |                                      'order_flow_imbalance': [-np.inf, np.inf],
49 |                                      'tick_move': [-10, 10]}
50 | config['response_column'] = 'tick_move'
51 | config['response_winsorize_prob'] = [0, 0]
52 | config['response_winsorize_bound'] = [-5, 5]
53 | 
54 | # open/close/hold condition
55 | config['holding_period'] = 10  # seconds
56 | config['trade_trigger_threshold'] = [-0.4, 0.4]
57 | config['start_second'] = 180
58 | config['end_second'] = 21420
59 | 
60 | # pnl
61 | config['use_mid'] = False  # if False, use touch price
62 | config['transaction_fee'] = 0.0001  # 1 bp transaction fee
63 | 
64 | # backtesting
65 | # -----------
66 | 
67 | backtest = bt.backtest(px, config)
68 | backtest = bt.trade(backtest, config)
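# bt.pnl and bt.save are defined in hft/backtester.py and are not shown in this
# excerpt. As a hedged illustration only: with config['use_mid'] = False the pnl
# is presumably marked against the touch (buy at s1, sell at b1) rather than the
# mid, and config['transaction_fee'] = 0.0001 charges 1 bp of notional per leg.
# A hypothetical round-trip pnl for one unit, not the repo's actual API:
def illustrative_unit_pnl(open_px, close_px, side, fee_rate=0.0001):
    """side is +1 for a long opened at open_px, -1 for a short."""
    gross = side * (close_px - open_px)
    fee = fee_rate * (open_px + close_px)  # 1 bp of notional on each leg
    return gross - fee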
69 | backtest = bt.pnl(backtest, config)
70 | bt.save(backtest, config)
--------------------------------------------------------------------------------
/tests/test_signal_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | import hft.data_loader as dl
4 | import hft.signal_utils as signal
5 | 
6 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')
7 | 
8 | product = 'cu'  # switch between cu and zn
9 | tick_size = 10 if product == 'cu' else 5
10 | seconds = 10
11 | return_col = 'return' + str(seconds)
12 | return_cutoff = 6
13 | 
14 | # test on a single date
15 | px = dl.load_active_contract(product, '20131231')
16 | px = signal.order_flow_imbalance(px, 1, 0)
17 | px = signal.order_imbalance_ratio(px, 1, 0)
18 | px = signal.period_mid_move(px, 5, 0, tick_size)
19 | px = signal.period_mid_move(px, 0, 10, tick_size)
--------------------------------------------------------------------------------
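For reference when reading the test above: order_imbalance_ratio and order_flow_imbalance are implemented in hft/signal_utils.py, which is not part of this listing. Under the common order-book definitions, and using the b1/b1_size/s1/s1_size columns declared in config/data_loader.json, the instantaneous quantities would look roughly as follows; treat this as a sketch of intent, since the repo's versions additionally aggregate over the backward/forward windows passed in above.

import pandas as pd

def instantaneous_oir(px):
    """Order imbalance ratio at the touch, bounded in [-1, 1]."""
    return (px['b1_size'] - px['s1_size']) / (px['b1_size'] + px['s1_size'])

def instantaneous_ofi(px):
    """Order flow imbalance in the spirit of Cont, Kukanov and Stoikov (2014).

    Bid-side depth counts positively when the bid price does not fall;
    ask-side depth enters symmetrically with the opposite sign.
    """
    bid_px, bid_sz = px['b1'], px['b1_size']
    ask_px, ask_sz = px['s1'], px['s1_size']
    return (bid_sz.where(bid_px >= bid_px.shift(), 0.0)
            - bid_sz.shift().where(bid_px <= bid_px.shift(), 0.0)
            - ask_sz.where(ask_px <= ask_px.shift(), 0.0)
            + ask_sz.shift().where(ask_px >= ask_px.shift(), 0.0))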