├── paper.pdf
├── README.md
├── unpack.py
├── clean_data.py
├── models.py
└── itch.py

/paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vnatesh/VWAP-Prediction/HEAD/paper.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# VWAP-Prediction

This is a research project in which we designed several algorithms for volume-weighted average price (VWAP) prediction. Our dataset consists of millisecond-level limit-order books for multiple stocks. Random forest and logistic regression were used to classify VWAP direction (up or down), while PCA and random forest were used for feature selection. Both least absolute shrinkage and selection operator (LASSO) regression and a long short-term memory (LSTM) recurrent neural network were used to predict the real-valued VWAP. See paper.pdf for more details.
--------------------------------------------------------------------------------
/unpack.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import itch
import h5py
import sys
import time as timer


DATE_list = ['20181105', '20181106', '20181107', '20181108', '20181109',
             '20181112', '20181113', '20181114', '20181115', '20181116',
             '20181119', '20181120', '20181121', '20181123', '20181126',
             '20181127', '20181128', '20181129', '20181130', '20181203',
             '20181204']
# stock = 'AAPL'
stock = 'GS'

fout = h5py.File('gs_tick_data.hdf5', 'a')
LEVELS = 10  # depth of the order book to track

for DATE in DATE_list:
    print(DATE)
    start = timer.time()

    df = pd.read_csv("~/Downloads/" + DATE + "_" + stock + ".csv")
    df = df.drop('MPID', axis=1)
    df = df.drop('X', axis=1)
    df_v = df.values

    orderpool = itch.Orderpool()
    book = itch.Book(LEVELS)
    messagedata = []
    bookdata = []

    for i in range(len(df_v)):
        line = df_v[i]
        message_type = line[3]
        message = itch.get_message(line, message_type)

        # complete message...
        if message_type in ('E', 'C', 'F', 'D'):
            orderpool.complete_message(message)

        # update orderpool and book...
        if message_type in ('B', 'S', 'E', 'C', 'F', 'D'):
            orderpool.update(message)
            book.update(message)

        # update messagedata...
        messagedata.append(message.values())

        # update bookdata...
        # skip snapshots whose price/volume levels are all zero
        book_v = book.values()
        if np.any(book_v[1:]):
            bookdata.append(book_v)

    # messagedata to HDF5...
    messagedata = np.asarray(messagedata)
    group = 'messages'
    itch.writedata(messagedata, fout, group, stock, DATE)

    # bookdata to HDF5...
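    # (note) itch.writedata appears to store each array under
    # /<group>/<stock>/<date> in the HDF5 file; clean_data.py reads these
    # back via paths like '/messages/GS/20181105' and '/orderbooks/GS/20181105'.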
    bookdata = np.asarray(bookdata)
    group = 'orderbooks'
    itch.writedata(bookdata, fout, group, stock, DATE)

    stop = timer.time()

    # OUTPUT #
    print('Elapsed time:', stop - start, 'sec')

fout.close()
--------------------------------------------------------------------------------
/clean_data.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt

def load_messages(path, name, date):
    data = h5py.File(path, 'r')
    messages = data['/messages/' + name + '/' + date]
    mdata = messages[:, :]
    t, n = mdata.shape
    data.close()
    mcolumns = ['msec',
                'type',
                'buysell',
                'price',
                'shares',
                'refno']
    mout = pd.DataFrame(mdata, index=np.arange(0, t), columns=mcolumns)
    # mout["time"] = pd.to_datetime(mout["msec"], unit='ms', origin=pd.Timestamp(date))
    # mout = mout[['time', 'msec', 'type', 'buysell', 'price', 'shares', 'refno']]
    return mout


def load_books(path, name, date):
    data = h5py.File(path, 'r')
    orderbooks = data['/orderbooks/' + name + '/' + date]
    mdata = orderbooks[:, :]
    t, n = mdata.shape
    data.close()
    # column names
    time = ['msec']
    bid_price = ['bp' + str(i) for i in range(1, 11)]
    ask_price = ['ap' + str(i) for i in range(1, 11)]
    bid_volume = ['bv' + str(i) for i in range(1, 11)]
    ask_volume = ['av' + str(i) for i in range(1, 11)]
    mcolumns = time + bid_price + ask_price + bid_volume + ask_volume
    mout = pd.DataFrame(mdata, index=np.arange(0, t), columns=mcolumns)
    return mout

def vwap_series(df, tinterval):
    df['sec'] = df['msec'] / 1000
    vwap_list = []
    df_v = df.values
    time = 34200  # 09:30:00 in seconds after midnight
    temp = []
    for i in range(len(df_v)):
        if df_v[i][6] >= time and df_v[i][6] < time + tinterval:
            temp.append([df_v[i][3], df_v[i][4]])
        if df_v[i][6] >= time + tinterval:
            time = time + tinterval
            vol_time_price = [x[0] * x[1] for x in temp]
            if sum([x[1] for x in temp]) != 0:
                vwap_list.append(sum(vol_time_price) / sum([x[1] for x in temp]))
                temp = []
            else:
                vwap_list.append(np.nan)
                temp = []
        if i == len(df_v) - 1:
            # multiply volume by price for each row in the 10s interval
            vol_time_price = [x[0] * x[1] for x in temp]
            if sum([x[1] for x in temp]) != 0:
                # sum all the vol*p and divide by total volume to get vwap
                vwap_list.append(sum(vol_time_price) / sum([x[1] for x in temp]))
            else:
                vwap_list.append(np.nan)
    return vwap_list


DATE_list = ['20181105', '20181106', '20181107', '20181108', '20181109',
             '20181112', '20181113', '20181114', '20181115', '20181116',
             '20181119', '20181120', '20181121', '20181126',
             '20181127', '20181128', '20181129', '20181130', '20181203',
             '20181204']

df_mult_date = pd.DataFrame()

for DATE in DATE_list:

    # Goldman Sachs message data
    # df = load_messages('gs_tick_data.hdf5', 'GS', DATE)
    # Apple message data
    df = load_messages('/Volumes/easystore/FML_project/aapl_tick_data.hdf5', 'AAPL', DATE)
    # restrict to regular trading hours, 09:30:00-16:00:00
    df = df[(df['msec'] >= 34200000) & (df['msec'] <= 57600000)]
    ex = df[df['type'].isin([2, 4, 6, 7])]
    ex = ex.reset_index(drop=True)
    ex['price'] = ex['price'] / 10000  # ITCH prices carry four implied decimals

    # computing 10 second vwap series
    vwap = vwap_series(ex, 10)
    sec = list(range(34210, 57601, 10))
    vwap_df = pd.DataFrame()
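    # (note) vwap holds one value per 10 s bucket over 09:30-16:00, so it
    # lines up with the sec grid built above (34210..57600, the bucket end times).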
    vwap_df['vwap'] = vwap
    vwap_df['sec'] = sec
    vwap_df['msec'] = vwap_df['sec'] * 1000
    vwap_df["time"] = pd.to_datetime(vwap_df["msec"], unit='ms', origin=pd.Timestamp(DATE))
    vwap_df = vwap_df.dropna()
    # vwap_df.plot(x='time', y='vwap')
    # plt.show()

    # Goldman Sachs tick data
    # book = load_books('gs_tick_data.hdf5', 'GS', DATE)
    # Apple tick data
    book = load_books('/Volumes/easystore/FML_project/aapl_tick_data.hdf5', 'AAPL', DATE)
    book = book[(book['msec'] >= 34200000) & (book['msec'] <= 57600000)]
    book = book.reset_index(drop=True)
    book = book[['msec', 'bp1', 'bp2', 'bp3', 'bp4', 'bp5',
                 'ap1', 'ap2', 'ap3', 'ap4', 'ap5',
                 'bv1', 'bv2', 'bv3', 'bv4', 'bv5',
                 'av1', 'av2', 'av3', 'av4', 'av5']]
    book['origin'] = 1

    msec = [x * 1000 for x in list(range(34200, 57600, 10))]
    mcolumns = ['msec', 'bp1', 'bp2', 'bp3', 'bp4', 'bp5',
                'ap1', 'ap2', 'ap3', 'ap4', 'ap5',
                'bv1', 'bv2', 'bv3', 'bv4', 'bv5',
                'av1', 'av2', 'av3', 'av4', 'av5', 'origin']
    a = np.empty((len(msec), 22))
    a[:] = np.nan
    insert_book = pd.DataFrame(a, index=np.arange(0, len(msec)), columns=mcolumns)
    insert_book['msec'] = msec
    insert_book['origin'] = 0

    # merge real snapshots with the empty 10 s grid rows
    frames = [book, insert_book]
    y = pd.concat(frames, ignore_index=True)
    y = y.sort_values(by=['msec'])
    y = y.fillna(method='ffill')  # forward-fill the latest book state into grid rows
    # pull out the grid rows, which now carry the prevailing book state
    new_book = y[y['origin'] == 0]
    new_book = new_book.dropna()
    new_book = new_book.drop(columns=['origin'])

    d = vwap_df.join(new_book.set_index('msec'), on='msec')
    d = d.dropna()
    d = d.drop(columns=['time', 'sec'])

    cols = ['bp1', 'bp2', 'bp3', 'bp4', 'bp5', 'ap1', 'ap2', 'ap3', 'ap4', 'ap5',
            'bv1', 'bv2', 'bv3', 'bv4', 'bv5', 'av1', 'av2', 'av3', 'av4', 'av5']
    for col in cols:
        d['delta_' + col] = d[col].diff(1)

    d['mean_volumn_diff'] = (d['bv1'] + d['bv2'] + d['bv3'] + d['bv4'] + d['bv5']) / 5 \
                            - (d['av1'] + d['av2'] + d['av3'] + d['av4'] + d['av5']) / 5
    d['spread'] = d['ap1'] - d['bp1']
    d['vol_unb1'] = (d['bv1'] - d['av1']) / d['bv1']
    d['vol_unb2'] = (d['bv2'] - d['av2']) / d['bv2']
    d['vol_unb3'] = (d['bv3'] - d['av3']) / d['bv3']
    d['vol_unb4'] = (d['bv4'] - d['av4']) / d['bv4']
    d['vol_unb5'] = (d['bv5'] - d['av5']) / d['bv5']

    d_v = d.values
    mom_b = [np.nan, np.nan, np.nan, np.nan, np.nan]
    volat_b = [np.nan, np.nan, np.nan, np.nan, np.nan]
    mom_a = [np.nan, np.nan, np.nan, np.nan, np.nan]
    volat_a = [np.nan, np.nan, np.nan, np.nan, np.nan]

    # momentum and volatility of the level-1 quotes over the past 5 intervals
    for i in range(5, len(d_v)):
        bp_past5 = np.asarray([d_v[i-1][2]/10000, d_v[i-2][2]/10000, d_v[i-3][2]/10000,
                               d_v[i-4][2]/10000, d_v[i-5][2]/10000])
        ap_past5 = np.asarray([d_v[i-1][7]/10000, d_v[i-2][7]/10000, d_v[i-3][7]/10000,
                               d_v[i-4][7]/10000, d_v[i-5][7]/10000])
        mom_b.append((d_v[i][2] - d_v[i-5][2]) / d_v[i-5][2])
        volat_b.append(bp_past5.std())
        mom_a.append((d_v[i][7] - d_v[i-5][7]) / d_v[i-5][7])
        volat_a.append(ap_past5.std())
    d['mom_bp1'] = mom_b
    d['mom_ap1'] = mom_a
    d['vola_bp1'] = volat_b
    d['vola_ap1'] = volat_a

    label1 = []
    label2 = []
    for i in range(len(d_v) - 1):
        if d_v[i+1][0] > d_v[i][0]:
            label1.append(1)
        if d_v[i+1][0]=quantile[i] and x
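
# For reference: vwap_series above computes, for each 10 s interval,
# sum(price*shares)/sum(shares). A compact pandas equivalent (illustrative
# sketch, not part of the repo; assumes the 'msec', 'price', 'shares'
# columns produced by load_messages):
def vwap_series_pandas(ex, tinterval=10, start=34200, end=57600):
    # bucket each execution by its time in seconds after midnight
    bucket = ((ex['msec'] / 1000 - start) // tinterval).astype(int)
    pv = (ex['price'] * ex['shares']).groupby(bucket).sum()  # price*volume per bucket
    v = ex['shares'].groupby(bucket).sum()                   # total volume per bucket
    # one value per bucket over 09:30-16:00; empty buckets come out as NaN
    return (pv / v).reindex(range((end - start) // tinterval)).values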