├── .gitignore
├── README.md
├── models
│   ├── Experiments2_VanillaRNN.py
│   ├── __init__.py
│   ├── attention_decoder.py
│   ├── data_cleaning.py
│   ├── data_partitioning.py
│   ├── lstm_attention_v0.py
│   ├── lstm_attention_v1.py
│   ├── lstm_v01.py
│   ├── lstm_v02.py
│   ├── lstm_v02_analysis.py
│   ├── lstm_v03.py
│   ├── lstm_v03_analysis.py
│   ├── lstm_v04.py
│   ├── lstm_v04_analysis.py
│   ├── lstm_v05.py
│   └── lstm_v05_analysis.py
├── notebooks
│   ├── __init__.py
│   ├── avis-kernel.ipynb
│   ├── data_cleaning.ipynb
│   ├── exploration-filter-non-continuous-news.ipynb
│   ├── exploration-filter-non-continuous-stocks.ipynb
│   ├── se_kernel_v0.ipynb
│   └── se_kernel_v1.py
├── report
│   ├── Diagram.png
│   ├── LSTMAgrid.png
│   ├── LSTMgrid.png
│   ├── Shuffling.png
│   ├── Stocks.png
│   ├── lstm_att_v0_ts_1_drop_04_cells_64.png
│   ├── lstm_att_v0_ts_5_drop_0_cells_64.png
│   ├── lstm_plot1.png
│   ├── lstm_plot2.png
│   ├── main.bbl
│   ├── main.pdf
│   ├── main.tex
│   ├── nicefrac.sty
│   ├── nips_2016.sty
│   ├── printlen.sty
│   ├── ref.bib
│   └── temp
└── utils
    └── utils.py

/.gitignore:
--------------------------------------------------------------------------------
1 | data/*
2 | .ipynb*
3 | .DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LSTM-Attention
2 | 
3 | A Comparison of LSTMs and Attention Mechanisms for Forecasting Financial Time Series - [read it here](https://github.com/PsiPhiTheta/LSTM-Attention/tree/master/report/main.pdf).
4 | 
--------------------------------------------------------------------------------
/models/Experiments2_VanillaRNN.py:
--------------------------------------------------------------------------------
1 | # Vanilla RNN
2 | 
3 | # This is a rudimentary vanilla RNN with a placeholder structure. It cannot be
4 | # tested yet (the cleaned data from Antoine is still pending), so it will most
5 | # likely need tweaking once the data is available. It assumes single-step-ahead
6 | # prediction using 10 days of data as 'features'; the output is a predicted
7 | # value whose magnitude reflects the confidence that the asset goes up (and
8 | # vice versa). Further details are in the project journal (Google Doc).
9 | 
10 | # Since I am new to Keras, this follows the tutorial at
11 | # machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
12 | # Full Keras documentation can be found here: https://keras.io/layers/recurrent/
13 | 
14 | # 1. Import dependencies
15 | import numpy as np
16 | import matplotlib.pyplot as plt
17 | from math import sqrt
18 | from sklearn.metrics import mean_squared_error
19 | from keras.models import Sequential
20 | from keras.layers import Dense
21 | from keras.layers import LSTM
22 | 
23 | # 2. Functions
24 | def antoineData():
25 |     # Antoine's script will go here; preliminary data will be 'assetCode',
26 |     # 'time', 'volume', 'open', 'returnsOpenPrevMktres1',
27 |     # 'returnsOpenPrevMktres10', 'returnsOpenNextMktres10',
28 |     # 'sentimentNegative', 'sentimentNeutral'
29 |     return None, None, None, None  # placeholder until the data pipeline is wired in
30 | 
31 | # 3. Import data
32 | x_train, y_train, x_test, y_test = antoineData()
33 | 
34 | # 4. Build model from Keras
35 | model = Sequential() # Sequential model is a linear stack of layers
36 | model.add(LSTM(50, input_shape=(x_train.shape[1], x_train.shape[2]))) # adds LSTM layer
37 | model.add(Dense(1)) # adds a dense layer
38 | model.compile(loss='mae', optimizer='adam') # sets the loss as mean absolute error and the optimiser as Adam
39 | 
40 | # 5.
Fit RNN 41 | history = model.fit(x_train, y_train, epochs=50, batch_size=72, validation_data=(x_test, y_test), verbose=2, shuffle=False) # fits 42 | 43 | # 6. Plot history 44 | plt.plot(history.history['loss'], label='train') 45 | plt.plot(history.history['val_loss'], label='test') 46 | plt.legend() 47 | plt.show() 48 | 49 | # make a prediction 50 | y_hat = model.predict(x_test) 51 | # calculate the error (can modify this for accuracy instead if needed using skl) 52 | RMSE = sqrt(mean_squared_error(y_test, y_hat)) -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/models/__init__.py -------------------------------------------------------------------------------- /models/attention_decoder.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from keras import backend as K 3 | from keras import regularizers, constraints, initializers, activations 4 | from keras.layers.recurrent import Recurrent 5 | from keras.engine import InputSpec 6 | 7 | 8 | tfPrint = lambda d, T: tf.Print(input_=T, data=[T, tf.shape(T)], message=d) 9 | 10 | 11 | def time_distributed_dense(x, w, b=None, dropout=None, 12 | input_dim=None, output_dim=None, timesteps=None): 13 | '''Apply y.w + b for every temporal slice y of x. 14 | ''' 15 | if not input_dim: 16 | # won't work with TensorFlow 17 | input_dim = K.shape(x)[2] 18 | if not timesteps: 19 | # won't work with TensorFlow 20 | timesteps = K.shape(x)[1] 21 | if not output_dim: 22 | # won't work with TensorFlow 23 | output_dim = K.shape(w)[1] 24 | 25 | if dropout: 26 | # apply the same dropout pattern at every timestep 27 | ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim))) 28 | dropout_matrix = K.dropout(ones, dropout) 29 | expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps) 30 | x *= expanded_dropout_matrix 31 | 32 | # collapse time dimension and batch dimension together 33 | x = K.reshape(x, (-1, input_dim)) 34 | 35 | x = K.dot(x, w) 36 | if b: 37 | x = x + b 38 | # reshape to 3D tensor 39 | x = K.reshape(x, (-1, timesteps, output_dim)) 40 | return x 41 | 42 | class AttentionDecoder(Recurrent): 43 | 44 | def __init__(self, units, output_dim, 45 | activation='tanh', 46 | return_probabilities=False, 47 | name='AttentionDecoder', 48 | kernel_initializer='glorot_uniform', 49 | recurrent_initializer='orthogonal', 50 | bias_initializer='zeros', 51 | kernel_regularizer=None, 52 | bias_regularizer=None, 53 | activity_regularizer=None, 54 | kernel_constraint=None, 55 | bias_constraint=None, 56 | **kwargs): 57 | """ 58 | Implements an AttentionDecoder that takes in a sequence encoded by an 59 | encoder and outputs the decoded states 60 | :param units: dimension of the hidden state and the attention matrices 61 | :param output_dim: the number of labels in the output space 62 | 63 | references: 64 | Bahdanau, Dzmitry, Kyunghyun Cho, and Yoshua Bengio. 65 | "Neural machine translation by jointly learning to align and translate." 66 | arXiv preprint arXiv:1409.0473 (2014). 
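        In this implementation (see `build` and `step`), the attention
        weights over the encoded sequence h_1..h_T and the context vector
        at decoder step t are

            e_tj = V_a^T tanh(W_a s_(t-1) + U_a h_j + b_a)
            a_tj = exp(e_tj) / sum_k exp(e_tk)
            c_t  = sum_j a_tj h_j

        where s_(t-1) is the previous decoder state; c_t then drives a
        GRU-style (reset/update gate) recurrence that produces the output.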
67 | """ 68 | self.units = units 69 | self.output_dim = output_dim 70 | self.return_probabilities = return_probabilities 71 | self.activation = activations.get(activation) 72 | self.kernel_initializer = initializers.get(kernel_initializer) 73 | self.recurrent_initializer = initializers.get(recurrent_initializer) 74 | self.bias_initializer = initializers.get(bias_initializer) 75 | 76 | self.kernel_regularizer = regularizers.get(kernel_regularizer) 77 | self.recurrent_regularizer = regularizers.get(kernel_regularizer) 78 | self.bias_regularizer = regularizers.get(bias_regularizer) 79 | self.activity_regularizer = regularizers.get(activity_regularizer) 80 | 81 | self.kernel_constraint = constraints.get(kernel_constraint) 82 | self.recurrent_constraint = constraints.get(kernel_constraint) 83 | self.bias_constraint = constraints.get(bias_constraint) 84 | 85 | super(AttentionDecoder, self).__init__(**kwargs) 86 | self.name = name 87 | self.return_sequences = True # must return sequences 88 | 89 | def build(self, input_shape): 90 | """ 91 | See Appendix 2 of Bahdanau 2014, arXiv:1409.0473 92 | for model details that correspond to the matrices here. 93 | """ 94 | 95 | self.batch_size, self.timesteps, self.input_dim = input_shape 96 | 97 | if self.stateful: 98 | super(AttentionDecoder, self).reset_states() 99 | 100 | self.states = [None, None] # y, s 101 | 102 | """ 103 | Matrices for creating the context vector 104 | """ 105 | 106 | self.V_a = self.add_weight(shape=(self.units,), 107 | name='V_a', 108 | initializer=self.kernel_initializer, 109 | regularizer=self.kernel_regularizer, 110 | constraint=self.kernel_constraint) 111 | self.W_a = self.add_weight(shape=(self.units, self.units), 112 | name='W_a', 113 | initializer=self.kernel_initializer, 114 | regularizer=self.kernel_regularizer, 115 | constraint=self.kernel_constraint) 116 | self.U_a = self.add_weight(shape=(self.input_dim, self.units), 117 | name='U_a', 118 | initializer=self.kernel_initializer, 119 | regularizer=self.kernel_regularizer, 120 | constraint=self.kernel_constraint) 121 | self.b_a = self.add_weight(shape=(self.units,), 122 | name='b_a', 123 | initializer=self.bias_initializer, 124 | regularizer=self.bias_regularizer, 125 | constraint=self.bias_constraint) 126 | """ 127 | Matrices for the r (reset) gate 128 | """ 129 | self.C_r = self.add_weight(shape=(self.input_dim, self.units), 130 | name='C_r', 131 | initializer=self.recurrent_initializer, 132 | regularizer=self.recurrent_regularizer, 133 | constraint=self.recurrent_constraint) 134 | self.U_r = self.add_weight(shape=(self.units, self.units), 135 | name='U_r', 136 | initializer=self.recurrent_initializer, 137 | regularizer=self.recurrent_regularizer, 138 | constraint=self.recurrent_constraint) 139 | self.W_r = self.add_weight(shape=(self.output_dim, self.units), 140 | name='W_r', 141 | initializer=self.recurrent_initializer, 142 | regularizer=self.recurrent_regularizer, 143 | constraint=self.recurrent_constraint) 144 | self.b_r = self.add_weight(shape=(self.units, ), 145 | name='b_r', 146 | initializer=self.bias_initializer, 147 | regularizer=self.bias_regularizer, 148 | constraint=self.bias_constraint) 149 | 150 | """ 151 | Matrices for the z (update) gate 152 | """ 153 | self.C_z = self.add_weight(shape=(self.input_dim, self.units), 154 | name='C_z', 155 | initializer=self.recurrent_initializer, 156 | regularizer=self.recurrent_regularizer, 157 | constraint=self.recurrent_constraint) 158 | self.U_z = self.add_weight(shape=(self.units, self.units), 159 | name='U_z', 160 | 
initializer=self.recurrent_initializer, 161 | regularizer=self.recurrent_regularizer, 162 | constraint=self.recurrent_constraint) 163 | self.W_z = self.add_weight(shape=(self.output_dim, self.units), 164 | name='W_z', 165 | initializer=self.recurrent_initializer, 166 | regularizer=self.recurrent_regularizer, 167 | constraint=self.recurrent_constraint) 168 | self.b_z = self.add_weight(shape=(self.units, ), 169 | name='b_z', 170 | initializer=self.bias_initializer, 171 | regularizer=self.bias_regularizer, 172 | constraint=self.bias_constraint) 173 | """ 174 | Matrices for the proposal 175 | """ 176 | self.C_p = self.add_weight(shape=(self.input_dim, self.units), 177 | name='C_p', 178 | initializer=self.recurrent_initializer, 179 | regularizer=self.recurrent_regularizer, 180 | constraint=self.recurrent_constraint) 181 | self.U_p = self.add_weight(shape=(self.units, self.units), 182 | name='U_p', 183 | initializer=self.recurrent_initializer, 184 | regularizer=self.recurrent_regularizer, 185 | constraint=self.recurrent_constraint) 186 | self.W_p = self.add_weight(shape=(self.output_dim, self.units), 187 | name='W_p', 188 | initializer=self.recurrent_initializer, 189 | regularizer=self.recurrent_regularizer, 190 | constraint=self.recurrent_constraint) 191 | self.b_p = self.add_weight(shape=(self.units, ), 192 | name='b_p', 193 | initializer=self.bias_initializer, 194 | regularizer=self.bias_regularizer, 195 | constraint=self.bias_constraint) 196 | """ 197 | Matrices for making the final prediction vector 198 | """ 199 | self.C_o = self.add_weight(shape=(self.input_dim, self.output_dim), 200 | name='C_o', 201 | initializer=self.recurrent_initializer, 202 | regularizer=self.recurrent_regularizer, 203 | constraint=self.recurrent_constraint) 204 | self.U_o = self.add_weight(shape=(self.units, self.output_dim), 205 | name='U_o', 206 | initializer=self.recurrent_initializer, 207 | regularizer=self.recurrent_regularizer, 208 | constraint=self.recurrent_constraint) 209 | self.W_o = self.add_weight(shape=(self.output_dim, self.output_dim), 210 | name='W_o', 211 | initializer=self.recurrent_initializer, 212 | regularizer=self.recurrent_regularizer, 213 | constraint=self.recurrent_constraint) 214 | self.b_o = self.add_weight(shape=(self.output_dim, ), 215 | name='b_o', 216 | initializer=self.bias_initializer, 217 | regularizer=self.bias_regularizer, 218 | constraint=self.bias_constraint) 219 | 220 | # For creating the initial state: 221 | self.W_s = self.add_weight(shape=(self.input_dim, self.units), 222 | name='W_s', 223 | initializer=self.recurrent_initializer, 224 | regularizer=self.recurrent_regularizer, 225 | constraint=self.recurrent_constraint) 226 | 227 | self.input_spec = [ 228 | InputSpec(shape=(self.batch_size, self.timesteps, self.input_dim))] 229 | self.built = True 230 | 231 | def call(self, x): 232 | # store the whole sequence so we can "attend" to it at each timestep 233 | self.x_seq = x 234 | 235 | # apply the a dense layer over the time dimension of the sequence 236 | # do it here because it doesn't depend on any previous steps 237 | # thefore we can save computation time: 238 | self._uxpb = time_distributed_dense(self.x_seq, self.U_a, b=self.b_a, 239 | input_dim=self.input_dim, 240 | timesteps=self.timesteps, 241 | output_dim=self.units) 242 | 243 | return super(AttentionDecoder, self).call(x) 244 | 245 | def get_initial_state(self, inputs): 246 | # apply the matrix on the first time step to get the initial s0. 
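        # (This mirrors Bahdanau et al. (2014), where the initial decoder state
        # is s_0 = tanh(W_s h_1); here the first timestep of the input sequence
        # plays the role of h_1.)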
247 | s0 = activations.tanh(K.dot(inputs[:, 0], self.W_s)) 248 | 249 | # from keras.layers.recurrent to initialize a vector of (batchsize, 250 | # output_dim) 251 | y0 = K.zeros_like(inputs) # (samples, timesteps, input_dims) 252 | y0 = K.sum(y0, axis=(1, 2)) # (samples, ) 253 | y0 = K.expand_dims(y0) # (samples, 1) 254 | y0 = K.tile(y0, [1, self.output_dim]) 255 | 256 | return [y0, s0] 257 | 258 | def step(self, x, states): 259 | 260 | ytm, stm = states 261 | 262 | # repeat the hidden state to the length of the sequence 263 | _stm = K.repeat(stm, self.timesteps) 264 | 265 | # now multiplty the weight matrix with the repeated hidden state 266 | _Wxstm = K.dot(_stm, self.W_a) 267 | 268 | # calculate the attention probabilities 269 | # this relates how much other timesteps contributed to this one. 270 | et = K.dot(activations.tanh(_Wxstm + self._uxpb), 271 | K.expand_dims(self.V_a)) 272 | at = K.exp(et) 273 | at_sum = K.sum(at, axis=1) 274 | at_sum_repeated = K.repeat(at_sum, self.timesteps) 275 | at /= at_sum_repeated # vector of size (batchsize, timesteps, 1) 276 | 277 | # calculate the context vector 278 | context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1) 279 | # ~~~> calculate new hidden state 280 | # first calculate the "r" gate: 281 | 282 | rt = activations.sigmoid( 283 | K.dot(ytm, self.W_r) 284 | + K.dot(stm, self.U_r) 285 | + K.dot(context, self.C_r) 286 | + self.b_r) 287 | 288 | # now calculate the "z" gate 289 | zt = activations.sigmoid( 290 | K.dot(ytm, self.W_z) 291 | + K.dot(stm, self.U_z) 292 | + K.dot(context, self.C_z) 293 | + self.b_z) 294 | 295 | # calculate the proposal hidden state: 296 | s_tp = activations.tanh( 297 | K.dot(ytm, self.W_p) 298 | + K.dot((rt * stm), self.U_p) 299 | + K.dot(context, self.C_p) 300 | + self.b_p) 301 | 302 | # new hidden state: 303 | st = (1-zt)*stm + zt * s_tp 304 | 305 | yt = activations.softmax( 306 | K.dot(ytm, self.W_o) 307 | + K.dot(stm, self.U_o) 308 | + K.dot(context, self.C_o) 309 | + self.b_o) 310 | 311 | if self.return_probabilities: 312 | return at, [yt, st] 313 | else: 314 | return yt, [yt, st] 315 | 316 | def compute_output_shape(self, input_shape): 317 | """ 318 | For Keras internal compatability checking 319 | """ 320 | if self.return_probabilities: 321 | return (None, self.timesteps, self.timesteps) 322 | else: 323 | return (None, self.timesteps, self.output_dim) 324 | 325 | def get_config(self): 326 | """ 327 | For rebuilding models on load time. 328 | """ 329 | config = { 330 | 'output_dim': self.output_dim, 331 | 'units': self.units, 332 | 'return_probabilities': self.return_probabilities 333 | } 334 | base_config = super(AttentionDecoder, self).get_config() 335 | return dict(list(base_config.items()) + list(config.items())) 336 | -------------------------------------------------------------------------------- /models/data_cleaning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from itertools import chain 5 | 6 | 7 | MARKET_DATA_PATH = './data/raw/market_train_df.csv' 8 | NEWS_DATA_PATH = './data/raw/news_train_df.csv' 9 | 10 | 11 | def clean_market_data(market_df, train=True): 12 | '''Clean and preprocess the market data for training or testing. 13 | 14 | Parameters 15 | ---------- 16 | market_df : dataframe 17 | See https://www.kaggle.com/c/two-sigma-financial-news/data for full 18 | description of the dataframe. 
19 | train : bool 20 | When true, adds the target variable to the dataframe. 21 | 22 | Returns 23 | ------- 24 | dataframe 25 | Cleaned market data. 26 | 27 | ''' 28 | # Select wanted columns 29 | if train: 30 | cols = ['assetCode', 'time', 'volume', 'open', 'returnsOpenPrevMktres1', 31 | 'returnsOpenPrevMktres10', 'returnsOpenNextMktres10'] 32 | else: 33 | cols = ['assetCode', 'time', 'volume', 'open', 'returnsOpenPrevMktres1', 34 | 'returnsOpenPrevMktres10'] 35 | market_df = market_df.loc[:,cols] 36 | 37 | # Drop NA 38 | market_df.dropna(inplace=True) 39 | 40 | # Filter out stocks that cover the full time series 41 | series_len = market_df.time.nunique() 42 | market_df = market_df.groupby('assetCode') .filter(lambda x: len(x) == series_len) 43 | assert (market_df.groupby('assetCode').size() == series_len).all() 44 | 45 | # Normalize time 46 | market_df.loc[:, 'time'] = pd.to_datetime(market_df.time).dt.normalize() 47 | 48 | return market_df 49 | 50 | 51 | 52 | def clean_news_data(news_df): 53 | '''Clean and preprocess the news data for training or testing. 54 | 55 | Parameters 56 | ---------- 57 | news_df : dataframe 58 | See https://www.kaggle.com/c/two-sigma-financial-news/data for full 59 | description of the dataframe. 60 | 61 | Returns 62 | ------- 63 | dataframe 64 | Cleaned news data. 65 | 66 | ''' 67 | # Select columns and drop NA 68 | cols = ['time','assetCodes', 'sentimentNegative', 'sentimentNeutral', 69 | 'sentimentPositive', 'urgency', 'provider', 'bodySize', 'relevance'] 70 | news_df = news_df.loc[:,cols] 71 | news_df.dropna(inplace=True) 72 | 73 | # Normalize time 74 | news_df.loc[:, 'time'] = pd.to_datetime(news_df.time).dt.normalize() 75 | 76 | # assetCodes from String to List 77 | news_df['assetCodes'] = news_df['assetCodes'].str.findall(f"'([\w\./]+)'") 78 | 79 | # Explode news on assetCodes 80 | assetCodes_expanded = list(chain(*news_df['assetCodes'])) 81 | assetCodes_index = news_df.index.repeat(news_df['assetCodes'].apply(len)) 82 | assert len(assetCodes_expanded) == len(assetCodes_index) 83 | 84 | assetCodes_df = pd.DataFrame({'index': assetCodes_index, 'assetCode': assetCodes_expanded}) 85 | news_df_exploded = news_df.merge(assetCodes_df, 'right', right_on='index', left_index=True, validate='1:m') 86 | news_df_exploded.drop(['assetCodes', 'index'], 1, inplace=True) 87 | 88 | # Compute means for same date and assetCode 89 | news_agg_dict = { 90 | 'sentimentNegative':'mean', 91 | 'sentimentNeutral':'mean', 92 | 'sentimentPositive':'mean', 93 | 'urgency':'mean', 94 | 'bodySize':'mean', 95 | 'relevance':'mean' 96 | } 97 | news_df_agg = news_df_exploded.groupby(['time', 'assetCode'], as_index=False).agg(news_agg_dict) 98 | 99 | # Add provider information 100 | idx = news_df_exploded.groupby(['time', 'assetCode'])['urgency'].transform(max) == news_df_exploded['urgency'] 101 | news_df_exploded_2 = news_df_exploded[idx][['time', 'assetCode', 'provider']].drop_duplicates(['time', 'assetCode']) 102 | news_df_agg = news_df_agg.merge(news_df_exploded_2, 'left', ['time', 'assetCode']) 103 | 104 | # One-hot encoding provider 105 | ohe_provider = pd.get_dummies(news_df_agg['provider']) 106 | news_df_agg = pd.concat([news_df_agg, ohe_provider], axis=1).drop(['provider'], axis=1) 107 | 108 | return news_df_agg 109 | 110 | 111 | 112 | def clean_data(market_df, news_df, train=True): 113 | '''Clean and preprocess the news and market data for training then merge 114 | them, to create a train set or test set. 
115 | 
116 |     Parameters
117 |     ----------
118 |     market_df : dataframe
119 |         See https://www.kaggle.com/c/two-sigma-financial-news/data for full
120 |         description of the dataframe.
121 |     news_df : dataframe
122 |         See https://www.kaggle.com/c/two-sigma-financial-news/data for full
123 |         description of the dataframe.
124 |     train : bool
125 |         When true, creates both the input features and the target dataframes.
126 | 
127 |     Returns
128 |     -------
129 |     dataframe
130 |         Cleaned data ready to be fed to the model. Returns both the input and
131 |         the target dataframes when train=True.
132 | 
133 |     '''
134 |     cleaned_market_df = clean_market_data(market_df, train)
135 |     cleaned_news_df = clean_news_data(news_df)
136 | 
137 |     # Merge the news data onto the market data
138 |     df_merged = cleaned_market_df.merge(cleaned_news_df, 'left', ['time', 'assetCode'])
139 | 
140 |     if train:
141 |         y = df_merged['returnsOpenNextMktres10']
142 |         X = df_merged.drop(['returnsOpenNextMktres10'], axis=1)
143 |         return X, y
144 |     else:
145 |         return df_merged
146 | 
147 | 
148 | def extract_asset(X_train, y_train, assetCode):
149 |     '''Extracts the training data for a particular asset.
150 | 
151 |     Parameters
152 |     ----------
153 |     X_train : dataframe
154 |         Dataframe containing all the assets' training data.
155 |     y_train : dataframe
156 |         Dataframe containing all the assets' labels.
157 |     assetCode : String
158 |         Asset code of the asset to be extracted.
159 | 
160 |     Returns
161 |     -------
162 |     dataframe
163 |         Dataframe containing data for only the chosen assetCode.
164 |     dataframe
165 |         Dataframe containing labels for only the chosen assetCode.
166 | 
167 |     '''
168 |     X_train_asset = X_train[X_train['assetCode'] == assetCode]
169 |     y_train_asset = X_train.join(y_train)
170 |     y_train_asset = y_train_asset[y_train_asset['assetCode'] == assetCode]
171 |     y_train_asset = y_train_asset.T.tail(1).T  # keep only the (last) label column
172 | 
173 |     return X_train_asset.copy(), y_train_asset.copy()
174 | 
175 | 
176 | def generate_cleaned_filtered_data(market_data_path, news_data_path,
177 |                                    save_path, assetCodes):
178 |     ''' Imports the raw data, cleans and filters it and then saves it.
179 | 
180 |     Parameters
181 |     ----------
182 |     market_data_path : String
183 |         The path to the raw market data.
184 |     news_data_path : String
185 |         The path to the raw news data.
186 |     save_path : String
187 |         The path where to save the cleaned and filtered data.
188 |     assetCodes : List of Strings
189 |         The asset codes to keep in the filtered dataset.
190 | 
191 |     '''
192 |     print('Reading CSV files...')
193 |     market_train_df = pd.read_csv(market_data_path)
194 |     news_train_df = pd.read_csv(news_data_path)
195 | 
196 |     print('Cleaning data...')
197 |     X_train, y_train = clean_data(market_train_df, news_train_df)
198 | 
199 |     # e.g. assetCodes = ['INTC.O', 'WFC.N', 'AMZN.O', 'A.N', 'BHE.N']
200 |     print('Extracting assets {}...'.format(assetCodes))
201 |     X_train_asset = X_train[X_train['assetCode'].isin(assetCodes)]
202 |     cleaned_filtered_data = X_train_asset.join(y_train)
203 | 
204 |     print('Saving cleaned and filtered data to {}.'.format(save_path))
205 |     cleaned_filtered_data.to_csv(save_path)
206 |     print('It can now be retrieved using get_cleaned_filtered_data()')
207 | 
208 | 
209 | def get_cleaned_filtered_data(path):
210 |     ''' Fetches the data from the CSV file generated by
211 |     generate_cleaned_filtered_data.
212 | 
213 |     Parameters
214 |     ----------
215 |     path : String
216 |         The path to the cleaned and filtered data.
217 | 
218 |     Returns
219 |     -------
220 |     dataframe
221 |         Dataframe containing the features (X).
222 | dataframe 223 | Dataframe containing the label (y). 224 | ''' 225 | 226 | df = pd.read_csv(path) 227 | y = df['returnsOpenNextMktres10'] 228 | X = df.drop(['returnsOpenNextMktres10'], axis=1) 229 | return X, y 230 | 231 | 232 | if __name__ == '__main__': 233 | pass -------------------------------------------------------------------------------- /models/data_partitioning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | 6 | def validate_df(X, y, sort_column='time'): 7 | ''' Validate the dataset 8 | 9 | Parameters 10 | ---------- 11 | X : dataframe 12 | The data. 13 | y : dataframe 14 | The labels. 15 | sort_column : String 16 | Column on which the data should be sorted. Defaults to 'time'. 17 | 18 | Returns 19 | ------- 20 | X_train : list of dataframe 21 | A list containing the training sets. 22 | y_train : list of dataframe 23 | A list containing the training labels sets. 24 | X_val : list of dataframe 25 | A list containing the validation sets. 26 | y_val : list of dataframe 27 | A list containing the validation labels sets. 28 | X_test : dataframe 29 | The test set 30 | y_test : dataframe 31 | The test set 32 | 33 | ''' 34 | if len(X) != len(y): 35 | raise Exception('X and y should have the same length: len(X) is {}, \ 36 | len(y) is {}'.format(len(X), len(y))) 37 | 38 | if sort_column not in X.columns: 39 | raise Exception('X should have a column named {}'.format(sort_column)) 40 | 41 | if sort_column not in y.columns: 42 | raise Exception('y should have a column named {}'.format(sort_column)) 43 | 44 | return X.sort_values(by=[sort_column]), y.sort_values(by=[sort_column]) 45 | 46 | 47 | def split_fixed_origin(X, train_size): 48 | ''' Generator that yields training and validation sets according to the 49 | fixed-origin evaluation strategy. 50 | 51 | Fixed-origin evaluation is typically applied during forecasting 52 | competitions. A forecast for each value present in the test set is computed 53 | using only the training set. The forecast origin is fixed to the last point 54 | in the training set. So, for each horizon only one forecast can be computed. 55 | Obvious drawbacks of this type of evaluation are, that characteristics of 56 | the forecast origin might heavily influence evaluation results, and, as only 57 | one forecast per horizon is present, averaging is not possible within one 58 | series and one horizon (Bergmeir & Benitez, 2012). 59 | 60 | Parameters 61 | ---------- 62 | X : dataframe 63 | The data to be split. 64 | train_ratio : int 65 | The size of the training set. 66 | 67 | Returns 68 | ------- 69 | dataframe 70 | The training set. 71 | dataframe 72 | The validation set. 73 | 74 | ''' 75 | yield np.split(X, [train_size]) 76 | 77 | 78 | def split_rolling_origin_recal(X, initial_train_size, rolling_size): 79 | ''' Generator that yields training and validation sets according to the 80 | rolling-origin-recalibration evaluation strategy. 81 | 82 | Within rolling-origin-recalibration evaluation, forecasts for a fixed 83 | horizon are performed by sequentially moving values from the test set to the 84 | training set, and changing the forecast origin accordingly. For each 85 | forecast, the model is recalibrated using all available data in the training 86 | set, which often means a complete retraining of the model 87 | (Bergmeir & Benitez, 2012). 88 | 89 | Parameters 90 | ---------- 91 | X : dataframe 92 | The data to be split. 
93 | initial_train_size : int 94 | The initial size of the training set. 95 | rolling_size : int 96 | The number of elements that are moved from the validation set to the 97 | training set at each iteration. 98 | 99 | Returns 100 | ------- 101 | dataframe 102 | The training set. 103 | dataframe 104 | The validation set. 105 | 106 | ''' 107 | pointer = initial_train_size 108 | while pointer < len(X): 109 | yield X[:pointer], X[pointer:] 110 | pointer += rolling_size 111 | 112 | 113 | def split_rolling_origin_update(X, train_size, val_size): 114 | ''' Generator that yields a training and a validation sets according to the 115 | rolling_origin_update strategy. Essentially, this is the same as 116 | split_rolling but the model should not be recalibrated but simply updated 117 | after each subsequent iteration. 118 | 119 | After the first iteration which 120 | 121 | Rolling-origin-update evaluation is probably the normal use case of most 122 | applications. Forecasts are computed in analogy to rolling-origin- 123 | recalibration evaluation, but values from the test set are not moved to the 124 | training set, and no model recalibration is performed. Instead, past values 125 | from the test set are used merely to update the input information of the 126 | model. Both types of rolling-origin evaluation are often referred to as 127 | n-step-ahead evaluation, with n being the forecast horizon used during the 128 | evaluation. Tashman [47] argues that model recalibration probably yields 129 | better results than updating. But recalibration may be computationally 130 | expensive, and within a real-world application, the model typically will be 131 | built once by experts, and later it will be used with updated information as 132 | new values are available, but it will certainly not be rebuilt. 133 | (Bergmeir & Benitez, 2012). 134 | 135 | Parameters 136 | ---------- 137 | X : dataframe 138 | The data to be split. 139 | train_window_size : int 140 | The number of data points to be included in the training set window. 141 | val_window_size : int 142 | The number of data points to be included in the validation set window. 143 | 144 | Returns 145 | ------- 146 | dataframe 147 | The training set followed by one new observation at a time. 148 | dataframe 149 | The validation set followed by an empty dataframe after the first 150 | iteration. 151 | 152 | ''' 153 | yield (X[:train_size], 154 | X[train_size:]) 155 | 156 | while train_size < len(X): 157 | yield X[train_size:train_size+1], pd.DataFrame() 158 | train_size += 1 159 | 160 | 161 | def split_rolling_window(X, train_size, val_size, shift): 162 | ''' Generator that yields training and validation sets according to the 163 | rolling-window evaluation strategy. 164 | 165 | Rolling-window evaluation is similar to rolling-origin evaluation, but 166 | the amount of data used for training is kept constant, so that as new data 167 | is available, old data from the beginning of the series is discarded. 168 | Rolling-window evaluation is only applicable if the model is rebuilt in 169 | every window, and has merely theoretical statistical advantages, that might 170 | be noted in practice only if old values tend to disturb model generation 171 | (Bergmeir & Benitez, 2012). 172 | 173 | Parameters 174 | ---------- 175 | X : dataframe 176 | The data to be split. 177 | train_window_size : int 178 | The number of data points to be included in the training set window. 
179 | val_window_size : int 180 | The number of data points to be included in the validation set window. 181 | shift : int 182 | By how many data points do the windows shift after each iteration. 183 | 184 | Returns 185 | ------- 186 | dataframe 187 | The training set. 188 | dataframe 189 | The validation set. 190 | 191 | ''' 192 | 193 | pointer = 0 194 | while pointer + train_size + val_size <= len(X): 195 | yield (X[pointer:pointer+train_size], 196 | X[pointer+train_size:pointer+train_size+val_size]) 197 | pointer += shift 198 | 199 | 200 | if __name__ == '__main__': 201 | 202 | 203 | # Test split_data_ordered 204 | X = pd.DataFrame(np.random.randint(0, 100, size=(101, 2)), 205 | columns=list('AB')) 206 | y = pd.DataFrame(np.random.randint(0, 2, size=(101, 1)), 207 | columns=['target']) 208 | time = range(0, 101) 209 | X['time'] = time 210 | y['time'] = time 211 | 212 | # Unit tests setup 213 | df = pd.DataFrame({'A':range(10)}) 214 | 215 | 216 | # Unit tests for split_fixed_origin 217 | print('split_fixed_origin tests') 218 | print('------------------------') 219 | for i, j in split_fixed_origin(df, 6): 220 | print(i.values.reshape(1,-1)) 221 | print(j.values.reshape(1,-1)) 222 | print() 223 | 224 | # Unit tests for split_rolling_origin_recal 225 | print('split_rolling_origin_recal tests') 226 | print('--------------------------------') 227 | len_i = 4 228 | len_j = 6 229 | for i, j in split_rolling_origin_recal(df, 4, 2): 230 | assert len(i) == len_i and len(j) == len_j 231 | assert len(i) != 0 and len(j) != 0 232 | len_i += 2 233 | len_j -= 2 234 | print(i.values.reshape(1,-1)) 235 | print(j.values.reshape(1,-1)) 236 | print() 237 | 238 | # Unit tests for split_rolling_origin_update 239 | print('split_rolling_origin_update tests') 240 | print('---------------------------------') 241 | for i, j in split_rolling_origin_update(df, 4, 2): 242 | print(i.values.reshape(1,-1)) 243 | print(j.values.reshape(1,-1)) 244 | print() 245 | 246 | # Unit tests for split_rolling_window 247 | print('split_rolling_window tests') 248 | print('--------------------------') 249 | for i, j in split_rolling_window(df, 4, 2, 2): 250 | print(i.values.reshape(1,-1)) 251 | print(j.values.reshape(1,-1)) 252 | print() 253 | 254 | -------------------------------------------------------------------------------- /models/lstm_attention_v0.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import os 4 | import pandas as pd 5 | import glob 6 | 7 | sys.path.append('../') 8 | from models.data_cleaning import clean_market_data, clean_news_data 9 | 10 | # Import libraries used for lstm 11 | from keras.models import Sequential 12 | from keras.layers import Input, Dense, multiply, Dot, Concatenate 13 | from keras.layers.core import * 14 | from keras.layers import LSTM 15 | from keras.models import * 16 | 17 | INPUT_DIM = 43 18 | TIME_STEPS = 1 19 | # if True, the attention vector is shared across the input_dimensions where the attention is applied. 20 | SINGLE_ATTENTION_VECTOR = False 21 | APPLY_ATTENTION_BEFORE_LSTM = False 22 | assetcode_list = ["AMZN.O"] 23 | 24 | MARKET_CLEAN_PATH = 'data/processed/market_cleaned_df.csv' 25 | NEWS_CLEAN_PATH = 'data/processed/news_cleaned_df.csv' 26 | 27 | 28 | def get_activations(model, inputs, print_shape_only=False, layer_name=None): 29 | # Documentation is available online on Github at the address below. 
30 | # From: https://github.com/philipperemy/keras-visualize-activations 31 | print('----- activations -----') 32 | activations = [] 33 | inp = model.input 34 | if layer_name is None: 35 | outputs = [layer.output for layer in model.layers] 36 | else: 37 | outputs = [layer.output for layer in model.layers if layer.name == layer_name] # all layer outputs 38 | funcs = [K.function([inp] + [K.learning_phase()], [out]) for out in outputs] # evaluation functions 39 | layer_outputs = [func([inputs, 1.])[0] for func in funcs] 40 | for layer_activations in layer_outputs: 41 | activations.append(layer_activations) 42 | if print_shape_only: 43 | print(layer_activations.shape) 44 | else: 45 | print(layer_activations) 46 | return activations 47 | 48 | 49 | def attention_3d_block(inputs): 50 | # inputs.shape = (batch_size, time_steps, input_dim) 51 | input_dim = int(inputs.shape[2]) 52 | a = Permute((2, 1))(inputs) 53 | a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what. 54 | a = Dense(TIME_STEPS, activation='softmax')(a) 55 | if SINGLE_ATTENTION_VECTOR: 56 | a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a) 57 | a = RepeatVector(input_dim)(a) 58 | a_probs = Permute((2, 1), name='attention_vec')(a) 59 | output_attention_mul = multiply([inputs, a_probs], name='attention_mul') 60 | return output_attention_mul 61 | 62 | 63 | def model_attention_applied_after_lstm(): 64 | inputs = Input(shape=(TIME_STEPS, INPUT_DIM,)) 65 | lstm_units = 50 66 | lstm_out = LSTM(lstm_units, return_sequences=True)(inputs) 67 | attention_mul = attention_3d_block(lstm_out) 68 | attention_mul = Flatten()(attention_mul) 69 | output = Dense(1, activation='sigmoid')(attention_mul) 70 | model = Model(input=[inputs], output=output) 71 | return model 72 | 73 | 74 | def model_attention_applied_before_lstm(): 75 | inputs = Input(shape=(TIME_STEPS, INPUT_DIM,)) 76 | attention_mul = attention_3d_block(inputs) 77 | lstm_units = 32 78 | attention_mul = LSTM(lstm_units, return_sequences=False)(attention_mul) 79 | output = Dense(1, activation='sigmoid')(attention_mul) 80 | model = Model(input=[inputs], output=output) 81 | return model 82 | 83 | 84 | def extract_stock(df, assetCode, split=False): 85 | '''Extracts the training data for a particular asset 86 | 87 | Parameters 88 | ---------- 89 | X_train : pandas dataframe containing all the assets' training data 90 | y_train : pandas dataframe containing all the assets' labels 91 | assetCode: asset code of asset to be extracted, in a list 92 | 93 | Returns 94 | ------- 95 | X_train_asset : pandas dataframe containing data for only the chosen assetCode 96 | y_train_asset : pandas dataframe containing label for only the chosen assetCode 97 | ''' 98 | df_asset = df[df['assetCode'].isin(assetCode)] 99 | if split: 100 | y = df_asset['returnsOpenNextMktres10'] 101 | X = df_asset.drop(['returnsOpenNextMktres10'], axis=1) 102 | return X, y 103 | 104 | return df_asset 105 | 106 | 107 | if __name__ == '__main__': 108 | 109 | df_market = pd.read_csv(MARKET_CLEAN_PATH) 110 | df_news = pd.read_csv(NEWS_CLEAN_PATH) 111 | 112 | df_merged = df_market.merge(df_news, 'left', ['time', 'assetCode']) 113 | df_merged = df_merged.sort_values(['time', 'assetCode'], ascending=[True, True]) 114 | 115 | df_merged = extract_stock(df_merged, assetcode_list) 116 | # taking 80%, 10%, 10% for train, val, test sets 117 | df_train = df_merged[:522*1990] 118 | df_val = df_merged[522*1990:522*(1990+249)] 119 | df_test = df_merged[522*(1990+249):] 120 | 121 | # 
create the different data sets
122 |     y_train = df_train['returnsOpenNextMktres10']
123 |     X_train = df_train.drop(['returnsOpenNextMktres10'], axis=1)
124 | 
125 |     y_val = df_val['returnsOpenNextMktres10']
126 |     X_val = df_val.drop(['returnsOpenNextMktres10'], axis=1)
127 | 
128 |     y_test = df_test['returnsOpenNextMktres10']
129 |     X_test = df_test.drop(['returnsOpenNextMktres10'], axis=1)
130 | 
131 |     X_train_ar = X_train.drop(['assetCode', "time"], axis=1).values
132 |     X_train_ar = X_train_ar.reshape(X_train_ar.shape[0], 1, X_train_ar.shape[1])
133 | 
134 |     X_val_ar = X_val.drop(['assetCode', "time"], axis=1).values
135 |     X_val_ar = X_val_ar.reshape(X_val_ar.shape[0], 1, X_val_ar.shape[1])
136 | 
137 |     X_test_ar = X_test.drop(['assetCode', "time"], axis=1).values
138 |     X_test_ar = X_test_ar.reshape(X_test_ar.shape[0], 1, X_test_ar.shape[1])
139 | 
140 |     #y_train_ar = y_train.values.reshape((1990, 522))
141 |     #y_val_ar = y_val.values.reshape((int(len(y_val)/522), 522))
142 |     #y_test_ar = y_test.values.reshape((int(len(y_test)/522), 522))
143 | 
144 |     # 4. Build model from Keras
145 |     N = 300000  # unused
146 | 
147 |     if APPLY_ATTENTION_BEFORE_LSTM:
148 |         m = model_attention_applied_before_lstm()
149 |     else:
150 |         m = model_attention_applied_after_lstm()
151 | 
152 |     m.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
153 |     print(m.summary())
154 | 
155 |     m.fit(X_train_ar, y_train, epochs=3, batch_size=64, validation_data=(X_val_ar, y_val), verbose=1)
156 | 
157 |     attention_vectors = []
158 |     for i in range(300):
159 |         idx = np.random.randint(len(X_test_ar))  # draw one test sample (the external get_data_recurrent() helper is not available here)
160 |         attention_vector = np.mean(get_activations(m,
161 |                                                     X_test_ar[idx:idx + 1],
162 |                                                     print_shape_only=True,
163 |                                                     layer_name='attention_vec')[0], axis=2).squeeze()
164 |         #print('attention =', attention_vector)
165 |         assert abs(np.sum(attention_vector) - 1.0) < 1e-5
166 |         attention_vectors.append(attention_vector)
167 | 
168 |     attention_vector_final = np.mean(np.array(attention_vectors), axis=0)
169 |     # plot part.
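    # Note: with TIME_STEPS = 1 the softmax in attention_3d_block is taken over
    # a single timestep, so every attention weight is exactly 1.0; the bar plot
    # below only becomes informative when the script is run with a real
    # look-back window (TIME_STEPS > 1).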
170 | import matplotlib.pyplot as plt 171 | import pandas as pd 172 | 173 | pd.DataFrame(attention_vector_final, columns=['attention (%)']).plot(kind='bar', 174 | title='Attention Mechanism as ' 175 | 'a function of input' 176 | ' dimensions.') 177 | plt.show() 178 | -------------------------------------------------------------------------------- /models/lstm_attention_v1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import os 4 | import pandas as pd 5 | import glob 6 | import matplotlib.pyplot as plt 7 | import pickle 8 | import tensorflow as tf 9 | 10 | from models.data_cleaning import generate_cleaned_filtered_data 11 | from models.attention_decoder import AttentionDecoder 12 | from models.data_partitioning import validate_df 13 | from models.data_partitioning import split_fixed_origin 14 | from keras.models import Sequential 15 | from keras.layers import Input, Dense 16 | from keras.layers import LSTM 17 | from keras.layers import TimeDistributed 18 | from keras.layers import RepeatVector 19 | from keras import backend as K 20 | 21 | from sklearn.model_selection import train_test_split 22 | from sklearn.preprocessing import MinMaxScaler 23 | 24 | from IPython.display import SVG 25 | from keras.utils.vis_utils import model_to_dot 26 | 27 | test_frac = 0.1 # fraction of the whole data 28 | train_frac = 0.8 # fraction of the remaining data 29 | 30 | cleaned_data_path = './data/processed/df_merged.csv' 31 | 32 | ASSETS = ['INTC.O', 'WFC.N', 'AMZN.O', 'A.N', 'BHE.N'] 33 | 34 | 35 | def top_down_acc(y_true, y_pred): 36 | return K.abs(K.sign(y_true) + K.sign(y_pred)) / 2 37 | 38 | 39 | def time_lag_data(X, y, n_in=1, n_out=1): 40 | n_features = X.shape[1] 41 | feature_names = X.columns 42 | 43 | # Define column names 44 | names = list() 45 | for i in range(n_in): 46 | names += [('%s(t-%d)' % (feature_names[j], -(i+1-n_in))) for j in range(n_features)] 47 | 48 | x_list = [] 49 | # input sequence (t-n, ... t-1) 50 | for i in range(X.shape[0]-n_in-n_out+2): 51 | rows_x = [] 52 | for _, row in X[i:i+n_in].iterrows(): 53 | rows_x += row.tolist() 54 | x_list.append(rows_x) 55 | 56 | X_time = pd.DataFrame(x_list, columns=names) 57 | # forecast sequence (t, t+1, ... t+n) 58 | cols = list() 59 | for i in range(0, n_out): 60 | if i == 0: 61 | cols += [('%s(t)' % ('returnsOpenNextMktres10'))] 62 | else: 63 | cols += [('%s(t+%d)' % ('returnsOpenNextMktres10', i))] 64 | # put it all together 65 | 66 | y_list = [] 67 | # input sequence (t-n, ... 
t-1) 68 | for i in range(n_in-1, X.shape[0]-n_out+1): 69 | y_list.append(y[i:i+n_out].tolist()) 70 | 71 | y_time = pd.DataFrame(y_list, columns=cols) 72 | 73 | return X_time, y_time 74 | 75 | 76 | df = pd.read_csv(cleaned_data_path) 77 | df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'time'], inplace=True, axis=1) 78 | 79 | # For loop for assets 80 | asset = 'BHE.N' 81 | df = df[df['assetCode'] == asset] 82 | df.drop(['assetCode'], axis=1, inplace=True) 83 | 84 | split = len(df) - round(test_frac*len(df)) 85 | df_test = df[split:] 86 | df_tv = df[:split] 87 | 88 | # For loop for different splitting techniques 89 | df_train, df_val = train_test_split(df_tv, 90 | train_size=train_frac, 91 | shuffle=False) 92 | 93 | y_train = df_train['returnsOpenNextMktres10'] 94 | y_train_init = y_train.reset_index(drop=True) 95 | X_train = df_train.drop(['returnsOpenNextMktres10'], axis=1) 96 | X_train_init = X_train.reset_index(drop=True) 97 | print('The train data size is : ', X_train.shape, y_train.shape) 98 | 99 | y_val = df_val['returnsOpenNextMktres10'] 100 | y_val_init = y_val.reset_index(drop=True) 101 | X_val = df_val.drop(['returnsOpenNextMktres10'], axis=1) 102 | X_val_init = X_val.reset_index(drop=True) 103 | print('The validation data size is : ', X_val.shape, y_val.shape) 104 | 105 | y_test = df_test['returnsOpenNextMktres10'] 106 | y_test_init = y_test.reset_index(drop=True) 107 | X_test = df_test.drop(['returnsOpenNextMktres10'], axis=1) 108 | X_test_init = X_test.reset_index(drop=True) 109 | print('The test data size is : ', X_test.shape, y_test.shape) 110 | 111 | # Hyperparameter tuning 112 | # lag (1, 5, 15, 30, 60, 90), dropout (LSTM) (0, 0.05, 0.4), cells (16, 32, 64) 113 | n_features = 40 114 | n_timesteps_out = 1 115 | n_epochs = 25 116 | 117 | # LSTM + EncoderDecoder 118 | for n_timesteps_in in [1]: 119 | for dropout in [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]: 120 | for cells in [16]: 121 | 122 | X_train, y_train = time_lag_data(X_train_init, y_train_init, 123 | n_in=n_timesteps_in, 124 | n_out=n_timesteps_out) 125 | print('The train data size is : ', X_train.shape, y_train.shape) 126 | 127 | X_val, y_val = time_lag_data(X_val_init, y_val_init, 128 | n_in=n_timesteps_in, 129 | n_out=n_timesteps_out) 130 | print('The val data size is : ', X_val.shape, y_val.shape) 131 | 132 | scaler = MinMaxScaler((-1, 1), False) 133 | X_train = scaler.fit_transform(X_train) 134 | X_val = scaler.transform(X_val) 135 | 136 | # Reshape the datasets 137 | X_train = X_train.reshape((len(X_train), n_timesteps_in, n_features)) 138 | y_train = y_train.values.reshape((len(y_train), n_timesteps_out, 1)) 139 | 140 | X_val = X_val.reshape((len(X_val), n_timesteps_in, n_features)) 141 | y_val = y_val.values.reshape((len(y_val), n_timesteps_out, 1)) 142 | 143 | 144 | # Model with Encoder/Decoder 145 | model = Sequential() 146 | model.add(LSTM(cells, dropout=dropout, 147 | input_shape=(n_timesteps_in, n_features))) 148 | model.add(RepeatVector(n_timesteps_out)) 149 | model.add(LSTM(cells, dropout=dropout, return_sequences=True)) 150 | model.add(TimeDistributed(Dense(1, activation='tanh'))) 151 | model.compile(loss='mean_squared_error', optimizer='adam', 152 | metrics=[top_down_acc]) 153 | model.summary() 154 | history = model.fit(X_train, 155 | y_train, 156 | epochs=n_epochs, 157 | validation_data=(X_val, y_val), 158 | shuffle=False) 159 | 160 | with open('history_ed_v0_ts_{}_drop_{}_cells_{}'.format(str(n_timesteps_in), 161 | str(dropout), 162 | str(cells)), 'wb') as file_hs: 163 | pickle.dump(history.history, 
file_hs) 164 | 165 | # plot training history 166 | fig = plt.figure() 167 | ax = fig.add_subplot(1, 1, 1) 168 | ax.plot(history.history['loss']) 169 | ax.plot(history.history['val_loss']) 170 | ax.set_xlim([0, 125]) 171 | ax.set_ylim([0, 0.01]) 172 | # plt.plot(history.history['top_down_acc']) 173 | 174 | ax.set_xlabel('Epoch') 175 | ax.set_ylabel('Mean Absolute Error Loss') 176 | ax.set_title('Loss Over Time') 177 | ax.legend(['Train','Val']) 178 | # plt.legend(['Train','Val', 'Top Down Accuracy']) 179 | fig.savefig('lstm_ed_v0_ts_{}_drop_{}_cells_{}.png'.format(str(n_timesteps_in), 180 | str(dropout), 181 | str(cells))) 182 | # LSTM + Attention 183 | for n_timesteps_in in [1, 5, 15, 30, 60, 90]: 184 | for dropout in [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]: 185 | for cells in [16, 32, 64]: 186 | 187 | n_timesteps_out = n_timesteps_in 188 | 189 | X_train, y_train = time_lag_data(X_train_init, y_train_init, 190 | n_in=n_timesteps_in, 191 | n_out=n_timesteps_out) 192 | print('The train data size is : ', X_train.shape, y_train.shape) 193 | 194 | X_val, y_val = time_lag_data(X_val_init, y_val_init, 195 | n_in=n_timesteps_in, 196 | n_out=n_timesteps_out) 197 | print('The val data size is : ', X_val.shape, y_val.shape) 198 | 199 | X_test, y_test = time_lag_data(X_test_init, y_test_init, 200 | n_in=n_timesteps_in, 201 | n_out=n_timesteps_out) 202 | print('The test data size is : ', X_test.shape, y_test.shape) 203 | 204 | scaler = MinMaxScaler((-1, 1), False) 205 | X_train = scaler.fit_transform(X_train) 206 | X_val = scaler.transform(X_val) 207 | X_test = scaler.transform(X_test) 208 | 209 | # Reshape the datasets 210 | X_train = X_train.reshape((len(X_train), n_timesteps_in, n_features)) 211 | y_train = y_train.values.reshape((len(y_train), n_timesteps_out, 1)) 212 | 213 | X_val = X_val.reshape((len(X_val), n_timesteps_in, n_features)) 214 | y_val = y_val.values.reshape((len(y_val), n_timesteps_out, 1)) 215 | 216 | X_test = X_test.reshape((len(X_test), n_timesteps_in, n_features)) 217 | y_test = y_test.values.reshape((len(y_test), n_timesteps_out, 1)) 218 | 219 | model_at = Sequential() 220 | model_at.add(LSTM(cells, input_shape=(n_timesteps_in, n_features), 221 | return_sequences=True)) 222 | model_at.add(AttentionDecoder(cells, n_features)) 223 | model_at.add(Dense(1, activation='tanh')) 224 | model_at.compile(loss='mean_squared_error', optimizer='adam', 225 | metrics=[top_down_acc]) 226 | model_at.summary() 227 | history = model_at.fit(X_train, 228 | y_train, 229 | epochs=n_epochs, 230 | validation_data=(X_val, y_val), 231 | shuffle=False) 232 | 233 | with open('results_final/history_att_v0_ts_{}_drop_{}_cells_{}'.format(str(n_timesteps_in), 234 | str(dropout), 235 | str(cells)), 'wb') as file_hs: 236 | pickle.dump(history.history, file_hs) 237 | 238 | prediction = model_at.predict(X_test) 239 | top_down_accuracy = sum(top_down_acc(p[0], np.float32(t[0])) for p, t in zip(prediction[:,0], y_test[:,0]))/len(y_test) 240 | 241 | with tf.Session() as sess: 242 | top_down_accuracy = sess.run(top_down_accuracy) 243 | # plot training history 244 | fig = plt.figure() 245 | ax = fig.add_subplot(1, 1, 1) 246 | ax.plot(history.history['loss']) 247 | ax.plot(history.history['val_loss']) 248 | ax.set_xlim([0, 40]) 249 | ax.set_ylim([0, 0.01]) 250 | # plt.plot(history.history['top_down_acc']) 251 | 252 | ax.set_xlabel('Epoch') 253 | ax.set_ylabel('Mean Absolute Error Loss') 254 | print(min(history.history['val_loss'])) 255 | ax.set_title('Loss Over Time') 256 | print('Predicted Top-Down Accuracy : 
{}'.format(str(top_down_accuracy))) 257 | ax.legend(['Train','Val']) 258 | # plt.legend(['Train','Val', 'Top Down Accuracy']) 259 | fig.savefig('results_final/lstm_att_v0_ts_{}_drop_{}_cells_{}.png'.format(str(n_timesteps_in), 260 | str(dropout), 261 | str(cells))) 262 | -------------------------------------------------------------------------------- /models/lstm_v01.py: -------------------------------------------------------------------------------- 1 | from data_partitioning import validate_df 2 | from data_partitioning import split_fixed_origin 3 | from data_cleaning import get_cleaned_filtered_data, extract_asset 4 | 5 | from keras.models import Sequential 6 | from keras.layers import Dense, LSTM 7 | from keras.utils import plot_model 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import pandas as pd 12 | 13 | DRY_RUN = True # if True, will only run for one asset with fixed origin strategy 14 | 15 | ASSETS = ['INTC.O', 'WFC.N', 'AMZN.O', 'A.N', 'BHE.N'] 16 | DATA_PATH = './data/processed/cleaned_filtered_data.csv' 17 | 18 | 19 | test_frac = 0.1 # fraction of the whole data 20 | train_frac = 0.8 # fraction of the remaining data 21 | latent_dim = 50 # LSTM hidden units 22 | batch_size = 1 23 | look_back = 30 24 | 25 | 26 | def create_dataset(X, look_back=1): 27 | cols = list() 28 | for i in range(look_back, 0, -1): 29 | cols.append(X.shift(i)) 30 | 31 | return pd.concat(cols, axis=1) 32 | 33 | if __name__ == '__main__': 34 | X, y = get_cleaned_filtered_data(DATA_PATH) 35 | 36 | 37 | for asset in ASSETS: 38 | X, y = extract_asset(X, y, asset) 39 | X['y'] = y 40 | 41 | # Isolating the test set 42 | split = len(X) - round(test_frac*len(X)) 43 | X_test = X[split:] 44 | y_test = X_test['y'] 45 | X_test = X_test.drop(['y'], axis=1) 46 | X = X[:split] 47 | 48 | # Training and validating the model using fixed origin 49 | train_size = round(train_frac * len(X)) 50 | 51 | for X_train, X_val in split_fixed_origin(X, train_size): 52 | y_train = X_train['y'] 53 | X_train = X_train.drop(['y'], axis=1) 54 | y_val = X_val['y'] 55 | X_val = X_val.drop(['y'], axis=1) 56 | 57 | # fill nan ad drop the asset code and time 58 | drop_col = ['Unnamed: 0', 'assetCode', 'time'] 59 | X_train.fillna(0, inplace=True) 60 | X_val.fillna(0, inplace=True) 61 | X_train.drop(drop_col, axis=1, inplace=True) 62 | X_val.drop(drop_col, axis=1, inplace=True) 63 | 64 | # Create the sets according to the look_back range 65 | X_train = create_dataset(X_train, look_back) 66 | 67 | # input dimensionality 68 | data_dim = X_train.shape[-1] 69 | 70 | # Reshape input to 3 dimensions (batch_size, timesteps, data_dim) 71 | X_train = X_train.reshape((batch_size, X_train.shape[0], data_dim)) 72 | X_val = X_val.reshape((batch_size, X_val.shape[0], data_dim)) 73 | y_train = y_train.reshape((batch_size, -1, 1)) 74 | y_val = y_val.reshape((batch_size, -1, 1)) 75 | 76 | # Expected input shape: (batch_size, timesteps, data_dim) 77 | model = Sequential() 78 | model.add(LSTM(latent_dim, input_dim=data_dim, 79 | return_sequences=True)) 80 | model.add(Dense(1)) 81 | model.compile(loss='mse', optimizer='adam') 82 | history = model.fit(X_train, y_train, validation_data=(X_val, y_val), 83 | epochs=60, batch_size=batch_size) 84 | 85 | # plot training history 86 | plt.plot(history.history['loss']) 87 | plt.plot(history.history['val_loss']) 88 | 89 | plt.xlabel('Epoch') 90 | plt.ylabel('Mean Absolute Error Loss') 91 | plt.title('Loss Over Time') 92 | plt.legend(['Train','Val']) 93 | 94 | if DRY_RUN: 95 | break; 96 | 97 | 
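# --- Added note (illustration only, not used by the script above) -----------
# create_dataset(X, look_back) concatenates the `look_back` previous rows of X
# side by side, so row t holds the features from t-look_back, ..., t-1:
#
#     demo = pd.DataFrame({'a': [1, 2, 3, 4]})
#     create_dataset(demo, look_back=2)
#     #      a    a
#     # 0  NaN  NaN
#     # 1  NaN  1.0
#     # 2  1.0  2.0
#     # 3  2.0  3.0
#
# The first `look_back` rows are therefore NaN and should be filled or dropped
# before reshaping the data for the LSTM in the training loop above.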
-------------------------------------------------------------------------------- /models/lstm_v02.py: -------------------------------------------------------------------------------- 1 | from data_partitioning import validate_df 2 | from data_partitioning import split_fixed_origin 3 | from data_cleaning import get_cleaned_filtered_data, extract_asset 4 | 5 | from keras.models import Sequential 6 | from keras.layers import Dense, LSTM 7 | from keras.utils import plot_model 8 | from keras.callbacks import ModelCheckpoint 9 | from keras import backend as K 10 | 11 | 12 | from sklearn.preprocessing import MinMaxScaler 13 | from sklearn.metrics import roc_auc_score 14 | 15 | from itertools import product 16 | import matplotlib.pyplot as plt 17 | import numpy as np 18 | import pandas as pd 19 | import pickle 20 | 21 | 22 | DRY_RUN = False 23 | DUMP_HISTORY = True 24 | 25 | ASSETS = ['INTC.O', 'WFC.N', 'AMZN.O', 'A.N', 'BHE.N'] 26 | DATA_PATH = './data/processed/cleaned_filtered_data.csv' 27 | HISTORY_TOP_PATH = './data/history/' 28 | 29 | test_frac = 0.1 # fraction of the whole data 30 | train_frac = 0.8 # fraction of the remaining data 31 | n_epochs = 200 32 | 33 | lstm_sizes = [16, 32, 64] 34 | lags = [1, 5, 15, 30, 60, 90] 35 | dropouts = [0.0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40] 36 | 37 | 38 | def add_lag(df, lag=1): 39 | cols = [df] 40 | for i in range(lag, 0, -1): 41 | cols.append(df.shift(i)) 42 | return pd.concat(cols, axis=1).dropna() 43 | 44 | 45 | def top_down_acc(y_true, y_pred): 46 | return K.abs(K.sign(y_true) + K.sign(y_pred)) / 2 47 | 48 | 49 | def create_model(lstm_size, dropout, lag, n_features): 50 | model = Sequential() 51 | model.add(LSTM(lstm_size, dropout=dropout, 52 | input_shape=(lag+1, n_features))) 53 | model.add(Dense(1, activation='tanh')) 54 | model.compile(loss='mse', optimizer='adam', 55 | metrics=[top_down_acc]) 56 | return model 57 | 58 | 59 | if __name__ == '__main__': 60 | 61 | # Fetch the data from the saved csv 62 | X_clean, y_clean = get_cleaned_filtered_data(DATA_PATH) 63 | 64 | for asset, lstm_size, lag, dropout in product( 65 | ASSETS, lstm_sizes, lags, dropouts): 66 | 67 | # Extract the asset and perform some cleaning 68 | X, y = extract_asset(X_clean, y_clean, asset) 69 | cols = ['Unnamed: 0', 'assetCode', 'time'] 70 | X.drop(cols, axis=1, inplace=True) 71 | X.fillna(-1, inplace=True) # Making sure unknown values are obvious 72 | n_features = X.shape[1] 73 | 74 | # Merge the labels and the features into one dataset 75 | df = X 76 | df['y'] = y 77 | 78 | # Isolating the test set 79 | split = len(df) - round(test_frac*len(df)) 80 | df_test = df[split:] 81 | df = df[:split] 82 | 83 | # Some user feedback 84 | print('\nTraining with\n\tlstm size: {}\n\tlag: {}\n\tdropout: {}\n' 85 | .format(lstm_size, lag, dropout)) 86 | 87 | # Add the lag features 88 | df_lag = add_lag(df.drop(['y'], axis=1), lag) 89 | df_lag['y'] = df['y'] 90 | 91 | # Train and evaluate using fixed origin 92 | train_size = round(train_frac * len(df_lag)) 93 | for df_train, df_val in split_fixed_origin(df_lag, train_size): 94 | y_train = df_train['y'] 95 | X_train = df_train.drop(['y'], axis=1) 96 | y_val = df_val['y'] 97 | X_val = df_val.drop(['y'], axis=1) 98 | 99 | # Scale the data 100 | scaler = MinMaxScaler((-1, 1), False) 101 | scaler.fit_transform(X_train) 102 | scaler.transform(X_val) 103 | 104 | # Reshape input data according to Keras documentation 105 | # (batch_size, timesteps, input_dim) 106 | X_train = X_train.values.reshape((-1, lag+1, n_features)) 107 | X_val = 
X_val.values.reshape((-1, lag+1, n_features)) 108 | 109 | # Create the model 110 | # Input shape expected (timesteps, input_dim) 111 | model = create_model(lstm_size, dropout, lag, n_features) 112 | 113 | # Fit the model 114 | checkpoint_name = ('best-lstm-{{epoch:03d}}-{{val_loss:.4f}}-{}-{}-' 115 | '{}-{}.hdf5').format(asset, lstm_size, lag, int(dropout*100)) 116 | checkpoint = ModelCheckpoint( 117 | './data/models/' + checkpoint_name, 118 | monitor='val_loss', 119 | save_best_only=True) 120 | history = model.fit(X_train, 121 | y_train, 122 | epochs=n_epochs, 123 | validation_data=(X_val, y_val), 124 | shuffle=False, 125 | callbacks=[checkpoint]) 126 | 127 | # Dumpm the history to a pickle file 128 | if DUMP_HISTORY: 129 | path = HISTORY_TOP_PATH + 'lstm-{}-{}-{}-{}.pickle'.format( 130 | asset, lstm_size, lag, int(dropout*100)) 131 | with open(path, 'wb') as f: 132 | pickle.dump(history.history, f) 133 | 134 | if DRY_RUN: 135 | break -------------------------------------------------------------------------------- /models/lstm_v02_analysis.py: -------------------------------------------------------------------------------- 1 | from os import listdir 2 | from os.path import isfile, join 3 | import pandas as pd 4 | import pickle 5 | import re 6 | 7 | from sklearn.preprocessing import MinMaxScaler 8 | 9 | from lstm_v02 import create_model, add_lag 10 | from data_partitioning import split_fixed_origin 11 | from data_cleaning import get_cleaned_filtered_data, extract_asset 12 | 13 | 14 | DATA_PATH = './data/processed/cleaned_filtered_data.csv' 15 | 16 | test_frac = 0.1 17 | train_frac = 0.8 18 | 19 | 20 | asset = 'INTC.O' 21 | val_loss = 0.0016 22 | epoch = 7 23 | lstm_size = 32 24 | lag = 60 25 | dropout = 40 26 | 27 | 28 | def get_saved_model_path(root, val_loss, epoch, asset, lstm_size, lag, dropout): 29 | 30 | # Generate file path from parameters 31 | return root + 'best-lstm-{:03}-{}-{}-{}-{}-{}.hdf5'.format( 32 | epoch, val_loss, asset, lstm_size, lag, dropout) 33 | 34 | 35 | if __name__ == '__main__': 36 | 37 | # Fetch the data 38 | X_clean, y_clean = get_cleaned_filtered_data(DATA_PATH) 39 | X, y = extract_asset(X_clean, y_clean, asset) 40 | cols = ['Unnamed: 0', 'assetCode', 'time'] 41 | X.drop(cols, axis=1, inplace=True) 42 | X.fillna(-1, inplace=True) 43 | n_features = X.shape[1] 44 | 45 | # Split the data 46 | df = X 47 | df['y'] = y 48 | split = len(df) - round(test_frac*len(df)) 49 | df_test = df[split:] 50 | df = df[:split] 51 | 52 | print(len(df_test)) 53 | 54 | # Add the lag features 55 | df_lag = add_lag(df.drop(['y'], axis=1), lag) 56 | df_lag['y'] = df['y'] 57 | df_test_lag = add_lag(df_test.drop(['y'], axis=1), lag) 58 | df_test_lag['y'] = df_test['y'] 59 | 60 | X_test = df_test_lag.drop(['y'], axis=1) 61 | y_test = df_test_lag['y'] 62 | 63 | train_size = round(train_frac * len(df_lag)) 64 | for df_train, df_val in split_fixed_origin(df_lag, train_size): 65 | X_train = df_train.drop(['y'], axis=1) 66 | 67 | # Scale the data 68 | scaler = MinMaxScaler((-1, 1), False) 69 | scaler.fit_transform(X_train) 70 | scaler.transform(X_test) 71 | 72 | # Reshape to keras input shape 73 | X_test = X_test.values.reshape((-1, lag+1, n_features)) 74 | 75 | # Create the model from saved weights 76 | weights_path = get_saved_model_path( 77 | './data/models/', val_loss, epoch, asset, lstm_size, lag, dropout) 78 | model = create_model(lstm_size, dropout, lag, n_features) 79 | model.load_weights(weights_path) 80 | 81 | # Test and print the results 82 | scores = model.evaluate(X_test, y_test, 
verbose=0) 83 | print('\n{} : {}\n{} : {}'.format( 84 | model.metrics_names[0], scores[0], model.metrics_names[1], scores[1])) -------------------------------------------------------------------------------- /models/lstm_v03.py: -------------------------------------------------------------------------------- 1 | ''' In this version, I train the model on INTC.O using the hyper parameters 2 | found with the version 02 but using the rolling window splitting strategy. 3 | ''' 4 | 5 | from data_partitioning import split_rolling_window 6 | from data_cleaning import get_cleaned_filtered_data, extract_asset 7 | 8 | from keras.models import Sequential 9 | from keras.layers import Dense, LSTM 10 | from keras.utils import plot_model 11 | from keras.callbacks import ModelCheckpoint 12 | from keras import backend as K 13 | 14 | 15 | from sklearn.preprocessing import MinMaxScaler 16 | from sklearn.metrics import roc_auc_score 17 | 18 | from itertools import product 19 | import matplotlib.pyplot as plt 20 | import numpy as np 21 | import pandas as pd 22 | import pickle 23 | 24 | 25 | DRY_RUN = False 26 | DUMP_HISTORY = True 27 | 28 | DATA_PATH = './data/processed/cleaned_filtered_data.csv' 29 | HISTORY_TOP_PATH = './data/history/' 30 | 31 | test_frac = 0.1 # fraction of the whole data used for test set 32 | n_epochs = 1 # Number of pass over the data when training 33 | 34 | # Params for rolling window (fraction of the remaining data) 35 | train_frac = 0.2 36 | val_frac = 0.1 37 | shift = 15 38 | 39 | asset = 'INTC.O' 40 | lstm_size = 64 41 | lag = 15 42 | dropout = 0.10 43 | 44 | 45 | def add_lag(df, lag=1): 46 | cols = [df] 47 | for i in range(lag, 0, -1): 48 | cols.append(df.shift(i)) 49 | return pd.concat(cols, axis=1).dropna() 50 | 51 | 52 | def top_down_acc(y_true, y_pred): 53 | return K.abs(K.sign(y_true) + K.sign(y_pred)) / 2 54 | 55 | 56 | def create_model(lstm_size, dropout, lag, n_features): 57 | model = Sequential() 58 | model.add(LSTM(lstm_size, dropout=dropout, 59 | input_shape=(lag+1, n_features))) 60 | model.add(Dense(1, activation='tanh')) 61 | model.compile(loss='mse', optimizer='adam', 62 | metrics=[top_down_acc]) 63 | return model 64 | 65 | 66 | def get_df(test_frac, asset): 67 | 68 | # Fetch the data from the saved csv 69 | X_clean, y_clean = get_cleaned_filtered_data(DATA_PATH) 70 | 71 | # Extract the asset and perform some cleaning 72 | df, y = extract_asset(X_clean, y_clean, asset) 73 | cols = ['Unnamed: 0', 'assetCode', 'time'] 74 | df.drop(cols, axis=1, inplace=True) 75 | df.fillna(-1, inplace=True) # Making sure unknown values are obvious 76 | n_features = df.shape[1] 77 | 78 | # Merge the labels and the features into one dataset 79 | df['y'] = y 80 | 81 | # Add the lag features 82 | df_lag = add_lag(df.drop(['y'], axis=1), lag) 83 | df_lag = df_lag.assign(y=df['y']) 84 | total_len = len(df_lag) 85 | 86 | # Isolating the test set 87 | split = len(df_lag) - round(test_frac*len(df_lag)) 88 | df_lag_test = df_lag[split:] 89 | df_lag = df_lag[:split] 90 | 91 | # Scale the data 92 | scaler = MinMaxScaler((-1, 1), False) 93 | 94 | temp_y = df_lag['y'] 95 | df_lag.drop('y', axis=1, inplace=True) 96 | scaler.fit_transform(df_lag) 97 | df_lag['y'] = temp_y 98 | 99 | temp_y = df_lag_test['y'] 100 | df_lag_test.drop('y', axis=1, inplace=True) 101 | scaler.transform(df_lag_test) 102 | df_lag_test['y'] = temp_y 103 | 104 | assert total_len == len(df_lag) + len(df_lag_test) 105 | 106 | return df_lag, df_lag_test, n_features 107 | 108 | 109 | if __name__ == '__main__': 110 | 111 | df_lag, _, 
n_features = get_df(test_frac, asset) 112 | 113 | 114 | # Create the model 115 | # Input shape expected (timesteps, input_dim) 116 | model = create_model(lstm_size, dropout, lag, n_features) 117 | 118 | # Train and evaluate using rolling window 119 | train_size = round(train_frac * len(df_lag)) 120 | val_size = round(val_frac * len(df_lag)) 121 | count = -1 122 | for df_train, df_val in split_rolling_window(df_lag, train_size, 123 | val_size, shift): 124 | count += 1 125 | y_train = df_train['y'] 126 | X_train = df_train.drop(['y'], axis=1) 127 | y_val = df_val['y'] 128 | X_val = df_val.drop(['y'], axis=1) 129 | 130 | # Reshape input data according to Keras documentation 131 | # (batch_size, timesteps, input_dim) 132 | X_train = X_train.values.reshape((-1, lag+1, n_features)) 133 | X_val = X_val.values.reshape((-1, lag+1, n_features)) 134 | 135 | # Fit the model 136 | checkpoint_name = ('best-lstm-{:03d}-{}-{}-{}-{}.hdf5').format( 137 | count, asset, lstm_size, lag, int(dropout*100)) 138 | checkpoint = ModelCheckpoint( 139 | './data/models/rollingwindow/' + checkpoint_name, 140 | monitor='val_loss', 141 | save_best_only=True) 142 | history = model.fit(X_train, 143 | y_train, 144 | epochs=n_epochs, 145 | validation_data=(X_val, y_val), 146 | shuffle=False, 147 | callbacks=[checkpoint]) 148 | 149 | # Dumpm the history to a pickle file 150 | if DUMP_HISTORY: 151 | path = (HISTORY_TOP_PATH + 'rollingwindow/lstm.{:03d}-{}-{}-{}-{}' 152 | '.pickle'.format(count, asset, lstm_size, lag, int(dropout*100))) 153 | with open(path, 'wb') as f: 154 | pickle.dump(history.history, f) 155 | 156 | if DRY_RUN: 157 | break -------------------------------------------------------------------------------- /models/lstm_v03_analysis.py: -------------------------------------------------------------------------------- 1 | from os import listdir 2 | import matplotlib.pyplot as plt 3 | import pickle 4 | 5 | from lstm_v03 import create_model, get_df 6 | 7 | 8 | def concat_history(): 9 | path = './data/history/rollingwindow' 10 | keys = ['val_loss', 'val_top_down_acc', 'loss', 'top_down_acc'] 11 | 12 | hist_list = listdir(path) 13 | history = {key: [] for key in keys} 14 | 15 | for hist_name in hist_list: 16 | with open(path + '/' + hist_name, 'rb') as f: 17 | hist = pickle.load(f) 18 | 19 | for key in keys: 20 | history[key] += hist[key] 21 | 22 | return history 23 | 24 | 25 | def plot_train_loss(history, ylim=(0, 0.03)): 26 | plt.ylim(ylim) 27 | 28 | plt.plot(history['loss']) 29 | plt.plot(history['val_loss']) 30 | 31 | plt.xlabel('Epoch') 32 | plt.ylabel('Mean Absolute Error Loss') 33 | plt.title('Training Loss') 34 | plt.legend(['Train','Val']) 35 | plt.show() 36 | 37 | 38 | def perform_tests(): 39 | 40 | test_frac = 0.1 41 | 42 | asset = 'INTC.O' 43 | lstm_size = 64 44 | lag = 15 45 | dropout = 0.1 46 | 47 | path = './data/models/rollingwindow' 48 | models = listdir(path) 49 | 50 | df_lag, df_lag_test, n_features = get_df(test_frac, asset) 51 | X_test = df_lag_test.drop('y', axis=1) 52 | y_test = df_lag_test['y'] 53 | 54 | # Reshape input data according to Keras documentation 55 | # (batch_size, timesteps, input_dim) 56 | X_test = X_test.values.reshape((-1, lag+1, n_features)) 57 | 58 | model = create_model(lstm_size, dropout, lag, n_features) 59 | 60 | f = open('data/lstm_rollingwindow.csv', 'w+') 61 | f.write(model.metrics_names[0] + ',' + model.metrics_names[1] + '\n') 62 | 63 | for model_name in models: 64 | model.load_weights(path + '/' + model_name) 65 | scores = model.evaluate(X_test, y_test, verbose=0) 66 | 
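# scores[0] is the MSE test loss and scores[1] the top-down (direction-of-move)
# accuracy for this rolling-window checkpoint; one CSV row is written per checkpoint.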
f.write('{},{}\n'.format(scores[0], scores[1])) 67 | 68 | f.close() 69 | 70 | 71 | if __name__ == '__main__': 72 | # history = concat_history() 73 | # plot_train_loss(history) 74 | 75 | perform_tests() -------------------------------------------------------------------------------- /models/lstm_v04.py: -------------------------------------------------------------------------------- 1 | ''' In this version, I train the model on INTC.O using the hyper parameters 2 | found withthe version 02 but using the rolling origin recalibration splitting 3 | strategy. 4 | ''' 5 | 6 | import pickle 7 | from keras.callbacks import ModelCheckpoint 8 | 9 | from data_partitioning import split_rolling_origin_recal 10 | from data_cleaning import get_cleaned_filtered_data, extract_asset 11 | 12 | from lstm_v03 import add_lag, top_down_acc, create_model, get_df 13 | 14 | 15 | DATA_PATH = './data/processed/cleaned_filtered_data.csv' 16 | HISTORY_PATH = './data/history/lstm_recal/' 17 | CHECKPOINT_PATH = './data/models/lstm_recal/' 18 | 19 | test_frac = 0.1 20 | n_epochs = 1 21 | 22 | init_train_frac = 0.1 23 | rolling_size = 10 24 | 25 | asset = 'INTC.O' 26 | lstm_size = 64 27 | lag = 15 28 | dropout = 0.10 29 | 30 | 31 | if __name__ == '__main__': 32 | 33 | # Get the cleaned and processed data 34 | df_lag, _, n_features = get_df(test_frac, asset) 35 | 36 | # Instantiate the model 37 | model = create_model(lstm_size, dropout, lag, n_features) 38 | 39 | # Train and evaluate using the rolling origin recalibration strategy 40 | init_train_size = round(init_train_frac * len(df_lag)) 41 | count = -1 42 | for df_train, df_val in split_rolling_origin_recal(df_lag, 43 | init_train_size, rolling_size): 44 | count += 1 45 | y_train = df_train['y'] 46 | X_train = df_train.drop(['y'], axis=1) 47 | y_val = df_val['y'] 48 | X_val = df_val.drop(['y'], axis=1) 49 | 50 | # Reshape to match Keras input shape (batch_size, timsteps, input_dim) 51 | X_train = X_train.values.reshape((-1, lag+1, n_features)) 52 | X_val = X_val.values.reshape((-1, lag+1, n_features)) 53 | 54 | # Fit the model 55 | checkpoint_name = ('best-lstm-{:03d}-{}-{}-{}-{}.hdf5').format( 56 | count, asset, lstm_size, lag, int(dropout*100)) 57 | checkpoint = ModelCheckpoint( 58 | CHECKPOINT_PATH + checkpoint_name, 59 | monitor='val_loss', 60 | save_best_only=True) 61 | 62 | history = model.fit(X_train, 63 | y_train, 64 | epochs=n_epochs, 65 | validation_data=(X_val, y_val), 66 | shuffle=False, 67 | callbacks=[checkpoint]) 68 | 69 | # Dumpm the history to a pickle file 70 | path = (HISTORY_PATH + 'lstm.{:03d}-{}-{}-{}-{}.pickle'.format( 71 | count, asset, lstm_size, lag, int(dropout*100))) 72 | with open(path, 'wb') as f: 73 | pickle.dump(history.history, f) 74 | 75 | 76 | -------------------------------------------------------------------------------- /models/lstm_v04_analysis.py: -------------------------------------------------------------------------------- 1 | from os import listdir 2 | import matplotlib.pyplot as plt 3 | import pickle 4 | 5 | from lstm_v03 import create_model, get_df 6 | 7 | def concat_history(): 8 | path = './data/history/lstm_recal/' 9 | keys = ['val_loss', 'val_top_down_acc', 'loss', 'top_down_acc'] 10 | 11 | hist_list = listdir(path) 12 | history = {key: [] for key in keys} 13 | 14 | for hist_name in hist_list: 15 | with open(path + hist_name, 'rb') as f: 16 | hist = pickle.load(f) 17 | 18 | for key in keys: 19 | history[key] += hist[key] 20 | 21 | return history 22 | 23 | 24 | def plot_train_loss(history, ylim=(0, 0.03)): 25 | 
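# Note: create_model (imported from lstm_v03) compiles the network with an MSE
# loss, so the curves plotted below are mean squared error even though the y-axis
# label further down reads "Mean Absolute Error Loss".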
plt.ylim(ylim) 26 | 27 | plt.plot(history['loss']) 28 | plt.plot(history['val_loss']) 29 | 30 | plt.xlabel('Epoch') 31 | plt.ylabel('Mean Absolute Error Loss') 32 | plt.title('Training Loss') 33 | plt.legend(['Train','Val']) 34 | plt.show() 35 | 36 | 37 | def perform_tests(): 38 | 39 | test_frac = 0.1 40 | 41 | asset = 'INTC.O' 42 | lstm_size = 64 43 | lag = 15 44 | dropout = 0.1 45 | 46 | path = './data/models/lstm_recal' 47 | models = listdir(path) 48 | 49 | df_lag, df_lag_test, n_features = get_df(test_frac, asset) 50 | X_test = df_lag_test.drop('y', axis=1) 51 | y_test = df_lag_test['y'] 52 | 53 | # Reshape input data according to Keras documentation 54 | # (batch_size, timesteps, input_dim) 55 | X_test = X_test.values.reshape((-1, lag+1, n_features)) 56 | 57 | model = create_model(lstm_size, dropout, lag, n_features) 58 | 59 | f = open('data/lstm_recalibration.csv', 'w+') 60 | f.write(model.metrics_names[0] + ',' + model.metrics_names[1] + '\n') 61 | 62 | for model_name in models: 63 | model.load_weights(path + '/' + model_name) 64 | scores = model.evaluate(X_test, y_test, verbose=0) 65 | f.write('{},{}\n'.format(scores[0], scores[1])) 66 | 67 | f.close() 68 | 69 | 70 | if __name__ == '__main__': 71 | # history = concat_history() 72 | # plot_train_loss(history) 73 | 74 | perform_tests() -------------------------------------------------------------------------------- /models/lstm_v05.py: -------------------------------------------------------------------------------- 1 | ''' In this version, I train the model on INTC.O using the hyper parameters 2 | found withthe version 02 but using the rolling origin update splitting strategy. 3 | ''' 4 | 5 | import pickle 6 | from keras.callbacks import ModelCheckpoint 7 | 8 | from data_partitioning import split_fixed_origin 9 | from data_cleaning import get_cleaned_filtered_data, extract_asset 10 | 11 | from lstm_v03 import add_lag, top_down_acc, create_model, get_df 12 | 13 | 14 | DATA_PATH = './data/processed/cleaned_filtered_data.csv' 15 | HISTORY_PATH = './data/history/{}/' 16 | CHECKPOINT_PATH = './data/models/{}/' 17 | 18 | ASSETS = ['WFC.N', 'AMZN.O', 'A.N', 'BHE.N'] 19 | 20 | test_frac = 0.1 21 | train_frac = 0.8 22 | n_epochs = 50 23 | 24 | lstm_size = 64 25 | lag = 15 26 | dropout = 0.1 27 | 28 | 29 | if __name__ == '__main__': 30 | 31 | for asset in ASSETS : 32 | 33 | # Get the cleaned and processed data 34 | df_lag, _, n_features = get_df(test_frac, asset) 35 | 36 | # Instantiate the model 37 | model = create_model(lstm_size, dropout, lag, n_features) 38 | 39 | # Train and evaluate the model 40 | train_size = round(train_frac * len(df_lag)) 41 | for df_train, df_val in split_fixed_origin(df_lag, train_size): 42 | y_train = df_train['y'] 43 | X_train = df_train.drop(['y'], axis=1) 44 | y_val = df_val['y'] 45 | X_val = df_val.drop(['y'], axis=1) 46 | 47 | # Reshape to match Keras input shape (batch_size, timsteps, input_dim) 48 | X_train = X_train.values.reshape((-1, lag+1, n_features)) 49 | X_val = X_val.values.reshape((-1, lag+1, n_features)) 50 | 51 | # Some user feedback 52 | print('\nFitting model for {}\n'.format(asset)) 53 | 54 | # Fit the model 55 | checkpoint_name = ('best-lstm-{{epoch:03d}}-{{val_loss:.4f}}-{}-' 56 | '{}-{}-{}.hdf5').format(asset, lstm_size, lag, int(dropout*100)) 57 | checkpoint = ModelCheckpoint( 58 | CHECKPOINT_PATH.format(asset) + checkpoint_name, 59 | monitor='val_loss', 60 | save_best_only=True) 61 | 62 | history = model.fit(X_train, 63 | y_train, 64 | epochs=n_epochs, 65 | validation_data=(X_val, y_val), 
66 | shuffle=False, 67 | callbacks=[checkpoint]) 68 | 69 | # Dumpm the history to a pickle file 70 | path = (HISTORY_PATH.format(asset) + 'lstm-{}-{}-{}-{}.pickle' 71 | .format(asset, lstm_size, lag, int(dropout*100))) 72 | with open(path, 'wb') as f: 73 | pickle.dump(history.history, f) 74 | 75 | 76 | -------------------------------------------------------------------------------- /models/lstm_v05_analysis.py: -------------------------------------------------------------------------------- 1 | from os import listdir 2 | import pandas as pd 3 | 4 | from lstm_v03 import create_model, get_df 5 | 6 | 7 | HISTORY_PATH = './data/history/{}/' 8 | CHECKPOINT_PATH = './data/models/{}/' 9 | ASSETS = ['WFC.N', 'AMZN.O', 'A.N', 'BHE.N'] 10 | 11 | def models_to_csv(): 12 | for asset in ASSETS: 13 | models = [f for f in listdir(CHECKPOINT_PATH.format(asset))] 14 | models = pd.DataFrame(models) 15 | models = models[0].str[:-5] 16 | models = models.str.split('-', expand=True) 17 | models = models.drop([0, 1], axis=1) 18 | models.columns = [ 19 | 'epoch', 'val_loss', 'asset', 'lstm_size', 'lag', 'dropout'] 20 | 21 | # Cast to numeric 22 | models['epoch'] = pd.to_numeric(models['epoch']) 23 | models['val_loss'] = pd.to_numeric(models['val_loss']) 24 | models['lstm_size'] = pd.to_numeric(models['lstm_size']) 25 | models['lag'] = pd.to_numeric(models['lag']) 26 | models['dropout'] = pd.to_numeric(models['dropout']) 27 | 28 | # Write to csv file 29 | models.to_csv('./data/lstm-{}-results.csv'.format(asset)) 30 | 31 | 32 | def perform_test_best_model(): 33 | test_frac = 0.1 34 | 35 | lstm_size = 64 36 | lag = 15 37 | dropout = 0.1 38 | 39 | for asset in ASSETS: 40 | 41 | print(asset) 42 | 43 | df_lag, df_lag_test, n_features = get_df(test_frac, asset) 44 | X_test = df_lag_test.drop('y', axis=1) 45 | y_test = df_lag_test['y'] 46 | 47 | X_test = X_test.values.reshape((-1, lag+1, n_features)) 48 | 49 | w = [f for f in listdir(CHECKPOINT_PATH.format(asset))][-1] 50 | model = create_model(lstm_size, dropout, lag, n_features) 51 | model.load_weights(CHECKPOINT_PATH.format(asset) + w) 52 | scores = model.evaluate(X_test, y_test, verbose=0) 53 | print(scores[0], scores[1]) 54 | 55 | -------------------------------------------------------------------------------- /notebooks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/notebooks/__init__.py -------------------------------------------------------------------------------- /notebooks/avis-kernel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": { 5 | "_uuid": "5ffb21374c7cf4b98e7239045ef9bf312effee25" 6 | }, 7 | "cell_type": "markdown", 8 | "source": "# Vanilla Net" 9 | }, 10 | { 11 | "metadata": { 12 | "trusted": true, 13 | "_uuid": "c9fd41029d6cfca6e9bae3f1bfd557a679eda5ec" 14 | }, 15 | "cell_type": "code", 16 | "source": "import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom itertools import chain\n\n%matplotlib inline\n\nREDUCED = False # Reduce the data size for development and testing", 17 | "execution_count": 11, 18 | "outputs": [] 19 | }, 20 | { 21 | "metadata": { 22 | "trusted": true, 23 | "_uuid": "4fdb018eaba527ddc1dff59ae86845dabfbee52d" 24 | }, 25 | "cell_type": "code", 26 | "source": "def clean_train_data(news_df, market_df):\n '''Clean and preprocess the news and market data for training.\n \n Parameters\n 
----------\n news_df : dataframe\n See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.\n market_df : dataframe\n See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.\n \n Returns\n -------\n dataframe \n Cleaned data ready to be fed to the model.\n \n '''\n # assetCode, time, volume, open, returnsOpenPrevMktres1, \n # returnsOpenPrevMkres10, returnsOpenNextMktres10\n # sentimentNegative, sentimentNeutral, sentimentPositive\n cols = ['assetCode', 'time', 'volume', 'open', 'returnsOpenPrevMktres1', \n 'returnsOpenPrevMkres10', 'returnsOpenNextMktres10']\n cleaned_df = market_df.loc[cols]\n \n return None", 27 | "execution_count": 3, 28 | "outputs": [] 29 | }, 30 | { 31 | "metadata": { 32 | "trusted": true, 33 | "_uuid": "54214ee5c758e6f8f22637e8725d15ffc360c266" 34 | }, 35 | "cell_type": "code", 36 | "source": "#TODO: Add cleaned data specifications\n#TODO: Define Returns\ndef train_model(train_df):\n '''Train the model using the given trianing data.\n \n Parameters\n ----------\n train_data : dataframe\n Cleaned data. (Specifications)\n \n Returns\n -------\n\n '''\n \n return None", 37 | "execution_count": 4, 38 | "outputs": [] 39 | }, 40 | { 41 | "metadata": { 42 | "_uuid": "33186c3231b06ced0157278e9f5ed8f4f9c84192" 43 | }, 44 | "cell_type": "markdown", 45 | "source": "## Get competition environment" 46 | }, 47 | { 48 | "metadata": { 49 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 50 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 51 | "trusted": true 52 | }, 53 | "cell_type": "code", 54 | "source": "from kaggle.competitions import twosigmanews\nenv = twosigmanews.make_env()", 55 | "execution_count": 5, 56 | "outputs": [ 57 | { 58 | "output_type": "stream", 59 | "text": "Loading the data... 
This could take a minute.\nDone!\n", 60 | "name": "stdout" 61 | } 62 | ] 63 | }, 64 | { 65 | "metadata": { 66 | "_uuid": "d14d4ae98c62668ab6ff1b1aa98168a204031571" 67 | }, 68 | "cell_type": "markdown", 69 | "source": "## Get training data" 70 | }, 71 | { 72 | "metadata": { 73 | "trusted": true, 74 | "_uuid": "c20fa6deeac9d374c98774abd90bdc76b023ee63" 75 | }, 76 | "cell_type": "code", 77 | "source": "(market_train_df, news_train_df) = env.get_training_data()\n\nif REDUCED:\n market_train_df = market_train_df.tail(100_000)\n news_train_df = news_train_df.tail(300_000)", 78 | "execution_count": 7, 79 | "outputs": [] 80 | }, 81 | { 82 | "metadata": { 83 | "_uuid": "38a6ee0f4f565b35466396bd071ff6369a94a75c" 84 | }, 85 | "cell_type": "markdown", 86 | "source": "## Preprocess and clean the data" 87 | }, 88 | { 89 | "metadata": { 90 | "trusted": true, 91 | "_uuid": "1aef352177a2d14af19de1cb128a1142d75721cd" 92 | }, 93 | "cell_type": "code", 94 | "source": "# Select columns and drop NA\ncols = ['assetCode', 'time', 'volume', 'open', 'returnsOpenPrevMktres1', \n 'returnsOpenPrevMktres10', 'returnsOpenNextMktres10']\nmarket_train_df = market_train_df.loc[:,cols]\nmarket_train_df.dropna(inplace=True)", 95 | "execution_count": 9, 96 | "outputs": [] 97 | }, 98 | { 99 | "metadata": { 100 | "trusted": true, 101 | "_uuid": "de7bbe376af84a62b32dbfe0f595368c8aa3d69a" 102 | }, 103 | "cell_type": "code", 104 | "source": "# Select columns and drop NA\ncols = ['time','assetCodes', 'sentimentNegative', 'sentimentNeutral', 'sentimentPositive']\nnews_train_df = news_train_df.loc[:,cols]\nnews_train_df.dropna(inplace=True)", 105 | "execution_count": 10, 106 | "outputs": [] 107 | }, 108 | { 109 | "metadata": { 110 | "trusted": true, 111 | "_uuid": "4c2a68bb7f16ee1cafc39179199d90b7f7d97a5a", 112 | "scrolled": false 113 | }, 114 | "cell_type": "code", 115 | "source": "# Normalize time\nmarket_train_df.loc[:, 'time'] = market_train_df.time.dt.normalize()\nnews_train_df.loc[:, 'time'] = news_train_df.time.dt.normalize()\n\n# assetCodes from String to List\nnews_train_df['assetCodes'] = news_train_df['assetCodes'].str.findall(f\"'([\\w\\./]+)'\")", 116 | "execution_count": 14, 117 | "outputs": [] 118 | }, 119 | { 120 | "metadata": { 121 | "trusted": true, 122 | "_uuid": "6fb4e18645fd024edd29f4e32d2e02e5b848e4c7" 123 | }, 124 | "cell_type": "code", 125 | "source": "# Explode news on assetCodes\nassetCodes_expanded = list(chain(*news_train_df['assetCodes']))\nassetCodes_index = news_train_df.index.repeat(news_train_df['assetCodes'].apply(len))\n\nassert len(assetCodes_expanded) == len(assetCodes_index)", 126 | "execution_count": 39, 127 | "outputs": [] 128 | }, 129 | { 130 | "metadata": { 131 | "trusted": true, 132 | "_uuid": "a4094e3fd134232f335d4792ba04fb7e5d407cc6" 133 | }, 134 | "cell_type": "code", 135 | "source": "assetCodes_df = pd.DataFrame({'index': assetCodes_index, 'assetCode': assetCodes_expanded})\nnews_train_df_exploded = news_train_df.merge(assetCodes_df, 'right', right_on='index', left_index=True, validate='1:m')\nnews_train_df_exploded.drop(['assetCodes', 'index'], 1, inplace=True)", 136 | "execution_count": 57, 137 | "outputs": [] 138 | }, 139 | { 140 | "metadata": { 141 | "trusted": true, 142 | "_uuid": "336cb9b8df3a7e56c9d315e2a94f7abdd2bee28c" 143 | }, 144 | "cell_type": "code", 145 | "source": "# Compute means for same date and assetCode\nnews_agg_dict = {\n 'sentimentNegative':'mean'\n ,'sentimentNeutral':'mean'\n ,'sentimentPositive':'mean'\n}\nnews_train_df_agg = 
news_train_df_exploded.groupby(['time', 'assetCode'], as_index=False).agg(news_agg_dict)", 146 | "execution_count": 75, 147 | "outputs": [] 148 | }, 149 | { 150 | "metadata": { 151 | "trusted": true, 152 | "scrolled": true, 153 | "_uuid": "9ed3b0db1d9ac57e09d4818ce439e85a00d705bd" 154 | }, 155 | "cell_type": "code", 156 | "source": "# Merge on market data\nX = market_train_df.merge(news_train_df_agg, 'left', ['time', 'assetCode'])", 157 | "execution_count": 77, 158 | "outputs": [] 159 | }, 160 | { 161 | "metadata": { 162 | "_uuid": "7f27c9b0c0b1e255935bc432d2454a36928d2b53" 163 | }, 164 | "cell_type": "markdown", 165 | "source": "## Train the model" 166 | }, 167 | { 168 | "metadata": { 169 | "trusted": true, 170 | "_uuid": "85e6235365c34283e32d0e0484f2874a14ebd092" 171 | }, 172 | "cell_type": "code", 173 | "source": "train_model(train_df)", 174 | "execution_count": null, 175 | "outputs": [] 176 | }, 177 | { 178 | "metadata": { 179 | "_uuid": "763d8d5693ecb9156dc48a613b05ad28292b7d87" 180 | }, 181 | "cell_type": "markdown", 182 | "source": "## Make predictions on test data" 183 | }, 184 | { 185 | "metadata": { 186 | "trusted": true, 187 | "_uuid": "724c38149860c8e9058474ac9045c2301e8a20da" 188 | }, 189 | "cell_type": "code", 190 | "source": "days = env.get_prediction_days()", 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "metadata": { 196 | "trusted": true, 197 | "_uuid": "a3f2197ed790f1aff1356a6954575fde976a4935" 198 | }, 199 | "cell_type": "code", 200 | "source": "import numpy as np\ndef make_random_predictions(predictions_df):\n predictions_df.confidenceValue = 2.0 * np.random.rand(len(predictions_df)) - 1.0", 201 | "execution_count": null, 202 | "outputs": [] 203 | }, 204 | { 205 | "metadata": { 206 | "trusted": true, 207 | "_uuid": "ef60bc52a8a228e5a2ce18e4bd416f1f1f25aeae" 208 | }, 209 | "cell_type": "code", 210 | "source": "for (market_obs_df, news_obs_df, predictions_template_df) in days:\n make_random_predictions(predictions_template_df)\n env.predict(predictions_template_df)\nprint('Done!')", 211 | "execution_count": null, 212 | "outputs": [] 213 | }, 214 | { 215 | "metadata": { 216 | "trusted": true, 217 | "_uuid": "2c8ed34ffb2c47c6e124530ec798c0b4eb01ddd5" 218 | }, 219 | "cell_type": "code", 220 | "source": "env.write_submission_file()", 221 | "execution_count": null, 222 | "outputs": [] 223 | } 224 | ], 225 | "metadata": { 226 | "kernelspec": { 227 | "display_name": "Python 3", 228 | "language": "python", 229 | "name": "python3" 230 | }, 231 | "language_info": { 232 | "name": "python", 233 | "version": "3.6.6", 234 | "mimetype": "text/x-python", 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 3 238 | }, 239 | "pygments_lexer": "ipython3", 240 | "nbconvert_exporter": "python", 241 | "file_extension": ".py" 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 1 246 | } -------------------------------------------------------------------------------- /notebooks/exploration-filter-non-continuous-news.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Goal \n", 8 | "Filter non-continuous stocks after merging with the news data." 
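# A minimal sketch of the continuity check this notebook works toward, assuming a
# merged dataframe with 'time' and 'assetCode' columns as used elsewhere in this
# repository; the function and variable names below are illustrative only.
import pandas as pd

def keep_continuous_assets(df: pd.DataFrame) -> pd.DataFrame:
    # An asset is "continuous" when it contributes one row for every distinct date.
    n_days = df['time'].nunique()
    return df.groupby('assetCode').filter(lambda g: len(g) == n_days)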
9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 20, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import sys\n", 18 | "sys.path.append(r'../models/')\n", 19 | "\n", 20 | "import pandas as pd\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "from data_cleaning import MARKET_DATA_PATH, NEWS_DATA_PATH, clean_market_data, clean_news_data, clean_data\n", 23 | "\n", 24 | "%matplotlib inline" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 4, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "market_train_df = pd.read_csv(MARKET_DATA_PATH)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 5, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "clean_market_df = clean_market_data(market_train_df)\n", 43 | "del market_train_df" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 6, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "news_train_df = pd.read_csv(NEWS_DATA_PATH)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 8, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "clean_news_df = clean_news_data(news_train_df)\n", 62 | "del news_train_df" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 9, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/html": [ 73 | "
\n", 74 | "\n", 87 | "\n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | "
timeassetCodesentimentNegativesentimentNeutralsentimentPositiveurgencybodySizerelevanceACNACT...ONEPNWPRNRNSROMRTRSSEHKSETSSNTEN
02007-01-010857.DE0.5007390.4193270.0799343.01438.00.23570200...0000010000
12007-01-010857.F0.5007390.4193270.0799343.01438.00.23570200...0000010000
22007-01-010857.HK0.5007390.4193270.0799343.01438.00.23570200...0000010000
32007-01-016758.T0.1467650.3923520.4608833.02742.00.20412400...0000010000
42007-01-01BHP.AX0.1306770.4654330.4038913.09674.00.17817400...0000010000
\n", 237 | "

5 rows × 38 columns

\n", 238 | "
" 239 | ], 240 | "text/plain": [ 241 | " time assetCode sentimentNegative sentimentNeutral \\\n", 242 | "0 2007-01-01 0857.DE 0.500739 0.419327 \n", 243 | "1 2007-01-01 0857.F 0.500739 0.419327 \n", 244 | "2 2007-01-01 0857.HK 0.500739 0.419327 \n", 245 | "3 2007-01-01 6758.T 0.146765 0.392352 \n", 246 | "4 2007-01-01 BHP.AX 0.130677 0.465433 \n", 247 | "\n", 248 | " sentimentPositive urgency bodySize relevance ACN ACT ... ONE PNW \\\n", 249 | "0 0.079934 3.0 1438.0 0.235702 0 0 ... 0 0 \n", 250 | "1 0.079934 3.0 1438.0 0.235702 0 0 ... 0 0 \n", 251 | "2 0.079934 3.0 1438.0 0.235702 0 0 ... 0 0 \n", 252 | "3 0.460883 3.0 2742.0 0.204124 0 0 ... 0 0 \n", 253 | "4 0.403891 3.0 9674.0 0.178174 0 0 ... 0 0 \n", 254 | "\n", 255 | " PRN RNS ROM RTRS SEHK SET SSN TEN \n", 256 | "0 0 0 0 1 0 0 0 0 \n", 257 | "1 0 0 0 1 0 0 0 0 \n", 258 | "2 0 0 0 1 0 0 0 0 \n", 259 | "3 0 0 0 1 0 0 0 0 \n", 260 | "4 0 0 0 1 0 0 0 0 \n", 261 | "\n", 262 | "[5 rows x 38 columns]" 263 | ] 264 | }, 265 | "execution_count": 9, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "clean_news_df.head()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 10, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "3652" 283 | ] 284 | }, 285 | "execution_count": 10, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "clean_news_df.time.nunique()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "This number is larger than for the market data (2488). Probably because the weekends are included?" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 11, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "0" 310 | ] 311 | }, 312 | "execution_count": 11, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "sizes = clean_news_df.groupby('assetCode').size()\n", 319 | "sel = sizes == clean_news_df.time.nunique()\n", 320 | "sum(sel)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "This is what I expected: it is very unlikely that news are published for a company every single day for 10+ years. Will probably have to engineer a feature taking the average of the different metrics over some period and add a feature of the number of articles included in the average (or the sum of their importance/urgency).\n", 328 | "\n", 329 | "But first, let's look at the distributions to have a better idea of the extend of the problem." 
330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 21, 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "data": { 339 | "text/plain": [ 340 | "(arrays of histogram bin counts and bin edges, running from 1.0 to 3090.0, omitted for brevity)" 470 | ] 471 | }, 472 | "execution_count": 21, 473 | "metadata": {}, 474 | "output_type": "execute_result" 475 | }, 476 | { 477 | "data": { 478 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYAAAAD8CAYAAAB+UHOxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAEYZJREFUeJzt3W2MXFd9x/Hvv3ESHhU7ycpKbavrFAsUVS1YVjACoSqGkIeqTqWAgipiUVeWSmihaVUWITW0fWOqlpRIKMjgtE4VQWigilWnpa4ThPoihg2EkMQNXoLBtpx4IYmhRRRS/n0xZ8lkmdldz92dp/P9SKO599wz956zdzy/uefeO47MRJJUn18adAMkSYNhAEhSpQwASaqUASBJlTIAJKlSBoAkVcoAkKRKGQCSVCkDQJIqtWrQDVjIxRdfnJOTk4NuhiSNlIceeuh7mTmxWL2hDoDJyUmmp6cH3QxJGikR8Z2l1HMISJIqZQBIUqUMAEmqlAEgSZUyACSpUgaAJFXKAJCkShkAklQpA0CSKjXWATA5dWDQTZCkoTXWASBJ6s4AkKRKGQCSVCkDQJIqtWgARMQdEXE6Ih5tK7swIg5GxNHyvKaUR0TcFhEzEfFIRGxue82OUv9oROxYme5IkpZqKUcA/wBcNa9sCjiUmZuAQ2Ue4GpgU3nsAm6HVmAAtwCvBy4HbpkLDUnSYCwaAJn5JeCZecXbgX1leh9wXVv5ndnyILA6Ii4B3gYczMxnMvNZ4CC/GCqSpD7q9RzA2sw8VaafAtaW6XXA8bZ6J0pZt/JfEBG7ImI6IqZnZ2d7bJ4kaTGNTwJnZgK5DG2ZW9+ezNySmVsmJhb9Ly0lST3qNQCeLkM7lOfTpfwksKGt3vpS1q1ckjQgvQbAfmDuSp4dwL1t5TeWq4G2AmfKUNEXgCsjYk05+XtlKZMkDciqxSpExKeB3wQujogTtK7m2Q18NiJ2At8B3lGq3wdcA8wAPwLeDZCZz0TEXwFfKfX+MjPnn1iWJPXRogGQme/ssmhbh7oJ3NRlPXcAd5xV6yRJK8Y7gSWpUgaAJFXKAJCkShkAklQpA0CSKmUASFKlDABJqpQBIEmVMgAkqVIGgCRVygCQpEoZAJJUKQNAkiplAEhSpQwASaqUASBJlTIAJKlSBoAkVcoAkKRKGQCSVCkDQJIqZQBIUqUMAEmqlAEgSZUyACSpUgaAJFXKAJCkShkAklQpA0CSKmUASFKlGgVARPxxRDwWEY9GxKcj4iURsTEiDkfETETcHRHnlbrnl/mZsnxyOTogSepNzwEQEeuAPwK2ZOavAecANwAfAW7NzFcBzwI7y0t2As+W8ltLPUnSgDQdAloFvDQiVgEvA04BVwD3lOX7gOvK9PYyT1m+LSKi4fYlST3qOQAy8yTwN8B3aX3wnwEeAp7LzOdLtRPAujK9DjheXvt8qX9Rr9uXJDXTZAhoDa1v9RuBXwZeDlzVtEERsSsipiNienZ2tunqJEldNBkCegvw7cyczcyfAp8H3gisLkNCAOuBk2X6JLABoCy/APj+/JVm5p7M3JKZWyYmJho0T5K0kCYB8F1ga0S8rIzlbwMeBx4Ari91dgD3lun9ZZ6y/P7MzAbblyQ10OQcwGFaJ3O/CnyjrGsP8AHg5oiYoTXGv7e8ZC9wUSm/GZhq0G5JUkOrFq/SXWbeAtwyr/hJ4PIOdX8MvL3J9iRJy8c7gSWpUgaAJFXKAJCkShkAklQpA0CSKmUASFKlDABJqpQBIEmVMgAkqVIGgCRVygCQpEoZAJJUKQNAkiplAEhSpQwASaqUASBJlTIAJKlSBoAkVcoAkKRKGQCSVCkDQJIqZQBIUqUMAEmq1NgHwOTUgUE3QZKG0tgHgCSpMwNAkiplAEhSpQwASaqUASBJlTIAJKlSBoAkVapRAETE6oi4JyL+KyKORMQbIuLCiDgYEUfL85pSNyLitoiYiYhHImLz8nRBktSLpkcAHwP+LTNfA/wGcASYAg5l5ibgUJkHuBrYVB67gNsbbluS1EDPARARFwBvBvYCZOZPMvM5YDuwr1TbB1xXprcDd2bLg8DqiLik55ZLkhppcgSwEZgF/j4ivhYRn4qIlwNrM/NUqfMUsLZMrwOOt73+RCmTJA1AkwBYBWwGbs/M1wH/wwvDPQBkZgJ5NiuNiF0RMR0R07Ozsw2aJ0laSJMAOAGcyMzDZf4eWoHw9NzQTnk+XZafBDa0vX59KXuRzNyTmVsyc8vExESD5kmSFtJzAGTmU8DxiHh1KdoGPA7sB3aUsh3AvWV6P3BjuRpoK3CmbahIktRnqxq+/g+BuyLiPOBJ4N20QuWzEbET+A7wjlL3PuAaYAb4UakrSRqQRgGQmQ8DWzos2tahbgI3NdleryanDnBs97WD2LQkDa3q7gT2P4iRpJbqAkCS1GIASFKlDABJqpQBIEmVqioAPAEsSS+oJgD88JekF6smACRJL2YASFKlDABJqpQBIEmVqjIAPCEsSZUGgCTJAJCkahkAklQpA0CSKmUASFKlDABJqpQBIEmVMgAkqVIGgCRVqtoA8G5gSbWrNgAkqXYGgCRVygCQpEoZAJJUKQNAkiplAEhSpaoOgMmpA14OKqlaVQeAJNXMAJCkSjUOgIg4JyK+FhH/UuY3RsThiJiJiLsj4rxSfn6ZnynLJ5tuW5LUu+U4AngfcKRt/iPArZn5KuBZYGcp3wk8W8pvLfUkSQPSKAAiYj1wLfCpMh/AFcA9pco+4Loyvb3MU5ZvK/UlSQPQ9Ajg74A/A35W5i8CnsvM58v8CWBdmV4HHAcoy8+U+i8SEbsiYjoipmdnZxs2T5LUTc8BEBG/BZzOzIeWsT1k5p7M3JKZWyYmJpZz1ZKkNk2OAN4I/HZEHAM+Q2vo52PA6ohYVeqsB06W6ZPABoCy/ALg+w22v2y8F0BSjXoOgMz8YGauz8xJ4Abg/sz8XeAB4PpSbQdwb5neX+Ypy+/PzOx1+5KkZlbiPoAPADdHxAytMf69pXwvcFEpvxmYWoFtS5KWaNXiVRaXmV8EvlimnwQu71Dnx8Dbl2N7kqTmvBNYkiplAEhSpQwASaqUATCPl4RKqoUBIEmVMgAKv/lLqo0BIEmVMgAkqVIGgCRVygBo43kASTUxADowCCTVwACQpEoZAJJUKQNAkiplAEhSpQwASaqUASBJlTIAFuEloZLGlQEgSZUyALrwm7+kcWcALIFhIGkcGQCSVCkDQJIqZQBIUqUMgAU49i9pnBkAklQpA0CSKmUASFKlDABJqpQBcBY8KSxpnPQcABGxISIeiIjHI+KxiHhfKb8wIg5GxNHyvKaUR0TcFhEzEfFIRGxerk5Iks5ekyOA54E/yczLgK3ATRFxGTAFHMrMTcChMg9wNbCpPHYBtzfYtiSpoZ4DIDNPZeZXy/QPgSPAOmA7sK9U2wdcV6a3A3dmy4PA6oi4pOeW95nDP5LGzbKcA4iISeB1wGFgbWaeKoueAtaW6XXA8baXnShlkqQBaBwAEfEK4HPA+zPzB+
3LMjOBPMv17YqI6YiYnp2dbdo8SVIXjQIgIs6l9eF/V2Z+vhQ/PTe0U55Pl/KTwIa2l68vZS+SmXsyc0tmbpmYmGjSvBXhUJCkcdHkKqAA9gJHMvOjbYv2AzvK9A7g3rbyG8vVQFuBM21DRZKkPmtyBPBG4F3AFRHxcHlcA+wG3hoRR4G3lHmA+4AngRngk8B7Gmx7oDwKkDQOVvX6wsz8TyC6LN7WoX4CN/W6vWE0OXWAY7uvHXQzJKkn3gnckEcDkkaVASBJlTIAeuQ3f0mjzgCQpEoZAJJUKQNAkiplAEhSpQyAZeAJYUmjyACQpEoZAMuk01GARwaShpkBsIwMAUmjxABYZpNTB/zQlzQSDABJqpQBIEmVMgAkqVIGgCRVygBYIYtdEeSJYkmDZgBIUqUMAEmqlAEgSZUyAPqg/eYwx/4lDQsDYAAMAUnDwACQpEqtGnQDatbpSODY7msH0BJJNfIIYEQ4bCRpuRkAI8AbyCStBANgyMz/sF/sA99AkNQrA2AINf1QNxQkLYUBMKR6+ea/0FCRoSBpPgNghMy/mazTzWV+0EtaKgNgTDT54O/2WsNEGm99vw8gIq4CPgacA3wqM3f3uw21mJw6sOT7CuY+7I/tvrbjEcX89bTX8d4FaTT19QggIs4BPg5cDVwGvDMiLutnG2rT6aqi+Y9u9TuVdzv30O2KpaVcybQUSzlKWep2Fqq32LmVYTUKbewn/x5L0+8hoMuBmcx8MjN/AnwG2N7nNlRnuf4xLOUH7RYKl07t6RZCndbRab7bOhYqX8qyTu3sNN1PTS4LHsaLAsY9bHvty3J9aVqKfg8BrQOOt82fAF7f5zZoABb7ADqbb+VNvsEvNJR1Nuvp9rr5Q2jd6syto73+3HT78k7t7jZc1758/jbmv67TfKdtdlrX/HYvtI5O21uord2Wd2vH/G13W08ni62nU/1u8/P/Lt3Wtdi2+h2AkZn921jE9cBVmfn7Zf5dwOsz871tdXYBu8rsq4EnetzcxcD3GjR3WIxDP+zD8BiHftiHxf1KZk4sVqnfRwAngQ1t8+tL2c9l5h5gT9MNRcR0Zm5pup5BG4d+2IfhMQ79sA/Lp9/nAL4CbIqIjRFxHnADsL/PbZAk0ecjgMx8PiLeC3yB1mWgd2TmY/1sgySppe/3AWTmfcB9fdhU42GkITEO/bAPw2Mc+mEflklfTwJLkoaHPwUhSZUaywCIiKsi4omImImIqUG3ZyERcSwivhERD0fEdCm7MCIORsTR8rymlEdE3Fb69UhEbB5gu++IiNMR8Whb2Vm3OyJ2lPpHI2LHEPThwxFxsuyPhyPimrZlHyx9eCIi3tZWPrD3W0RsiIgHIuLxiHgsIt5XykdmXyzQh5HZFxHxkoj4ckR8vfThL0r5xog4XNpzd7n4hYg4v8zPlOWTi/VtRWTmWD1onVz+FnApcB7wdeCyQbdrgfYeAy6eV/bXwFSZngI+UqavAf4VCGArcHiA7X4zsBl4tNd2AxcCT5bnNWV6zYD78GHgTzvUvay8l84HNpb32DmDfr8BlwCby/QrgW+Wto7MvligDyOzL8rf8xVl+lzgcPn7fha4oZR/AviDMv0e4BNl+gbg7oX6tlLtHscjgHH4uYntwL4yvQ+4rq38zmx5EFgdEZcMooGZ+SXgmXnFZ9vutwEHM/OZzHwWOAhctfKtb+nSh262A5/JzP/NzG8DM7TeawN9v2Xmqcz8apn+IXCE1h33I7MvFuhDN0O3L8rf87/L7LnlkcAVwD2lfP5+mNs/9wDbIiLo3rcVMY4B0OnnJhZ6Mw1aAv8eEQ9F6y5ogLWZeapMPwWsLdPD3rezbfew9ue9ZXjkjrmhE0agD2UY4XW0vn2O5L6Y1wcYoX0REedExMPAaVoB+i3gucx8vkN7ft7WsvwMcBF97sM4BsCoeVNmbqb1C6k3RcSb2xdm67hw5C7VGtV2A7cDvwq8FjgF/O1gm7M0EfEK4HPA+zPzB+3LRmVfdOjDSO2LzPy/zHwtrV84uBx4zYCbtKhxDIBFf25imGTmyfJ8GvhnWm+cp+eGdsrz6VJ92Pt2tu0euv5k5tPlH/LPgE/ywuH30PYhIs6l9cF5V2Z+vhSP1L7o1IdR3BcAmfkc8ADwBlpDbHP3W7W35+dtLcsvAL5Pn/swjgEwMj83EREvj4hXzk0DVwKP0mrv3FUYO4B7y/R+4MZyJcdW4EzbYf4wONt2fwG4MiLWlMP7K0vZwMw7p/I7tPYHtPpwQ7l6YyOwCfgyA36/lXHjvcCRzPxo26KR2Rfd+jBK+yIiJiJidZl+KfBWWucyHgCuL9Xm74e5/XM9cH85UuvWt5WxkmfGB/WgdaXDN2mNwX1o0O1ZoJ2X0jrj/3Xgsbm20hoLPAQcBf4DuDBfuNLg46Vf3wC2DLDtn6Z1WP5TWuOUO3tpN/B7tE50zQDvHoI+/GNp4yO0/jFe0lb/Q6UPTwBXD8P7DXgTreGdR4CHy+OaUdoXC/RhZPYF8OvA10pbHwX+vJRfSusDfAb4J+D8Uv6SMj9Tll+6WN9W4uGdwJJUqXEcApIkLYEBIEmVMgAkqVIGgCRVygCQpEoZAJJUKQNAkiplAEhSpf4fa1NpNaCVd4QAAAAASUVORK5CYII=\n", 480 | "text/plain": [ 481 | "
" 482 | ] 483 | }, 484 | "metadata": { 485 | "needs_background": "light" 486 | }, 487 | "output_type": "display_data" 488 | } 489 | ], 490 | "source": [ 491 | "plt.hist(sizes, bins=int((sizes.max() - sizes.min())/10))" 492 | ] 493 | }, 494 | { 495 | "cell_type": "markdown", 496 | "metadata": {}, 497 | "source": [ 498 | "We see thatmost stocks have very little news coverage. \n", 499 | "One possible solution would be to bin on a 10 day basis.\n", 500 | "Another possible solution would be to simply fill the blanks with 0s. This solution feels like the easiest." 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [] 509 | } 510 | ], 511 | "metadata": { 512 | "kernelspec": { 513 | "display_name": "Python 3", 514 | "language": "python", 515 | "name": "python3" 516 | }, 517 | "language_info": { 518 | "codemirror_mode": { 519 | "name": "ipython", 520 | "version": 3 521 | }, 522 | "file_extension": ".py", 523 | "mimetype": "text/x-python", 524 | "name": "python", 525 | "nbconvert_exporter": "python", 526 | "pygments_lexer": "ipython3", 527 | "version": "3.6.5" 528 | } 529 | }, 530 | "nbformat": 4, 531 | "nbformat_minor": 2 532 | } 533 | -------------------------------------------------------------------------------- /notebooks/exploration-filter-non-continuous-stocks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Goal \n", 8 | "Filter non-continuous stocks. That is, filter out stocks that do not span the entire time series." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import sys\n", 18 | "sys.path.append(r'../models/')\n", 19 | "\n", 20 | "import pandas as pd\n", 21 | "from data_cleaning import MARKET_DATA_PATH, NEWS_DATA_PATH, clean_market_data" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "market_train_df = pd.read_csv(MARKET_DATA_PATH)\n", 31 | "# news_train_df = pd.read_csv(NEWS_DATA_PATH)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "clean_market_df = clean_market_data(market_train_df)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/html": [ 51 | "
\n", 52 | "\n", 65 | "\n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | "
assetCodetimevolumeopenreturnsOpenPrevMktres1returnsOpenPrevMktres10returnsOpenNextMktres10
14290A.N2007-02-154095135.032.990-0.0015720.007461-0.029993
14291AAI.N2007-02-151378650.011.650-0.001498-0.010884-0.013111
14292AAP.N2007-02-153884400.037.000-0.019388-0.026448-0.028244
14293AAPL.O2007-02-1512997017.085.3100.000738-0.044809-0.014505
14294ABB.N2007-02-1510168100.018.245-0.026040-0.0109270.017172
\n", 131 | "
" 132 | ], 133 | "text/plain": [ 134 | " assetCode time volume open returnsOpenPrevMktres1 \\\n", 135 | "14290 A.N 2007-02-15 4095135.0 32.990 -0.001572 \n", 136 | "14291 AAI.N 2007-02-15 1378650.0 11.650 -0.001498 \n", 137 | "14292 AAP.N 2007-02-15 3884400.0 37.000 -0.019388 \n", 138 | "14293 AAPL.O 2007-02-15 12997017.0 85.310 0.000738 \n", 139 | "14294 ABB.N 2007-02-15 10168100.0 18.245 -0.026040 \n", 140 | "\n", 141 | " returnsOpenPrevMktres10 returnsOpenNextMktres10 \n", 142 | "14290 0.007461 -0.029993 \n", 143 | "14291 -0.010884 -0.013111 \n", 144 | "14292 -0.026448 -0.028244 \n", 145 | "14293 -0.044809 -0.014505 \n", 146 | "14294 -0.010927 0.017172 " 147 | ] 148 | }, 149 | "execution_count": 5, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "clean_market_df.head(5)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 16, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "2488" 167 | ] 168 | }, 169 | "execution_count": 16, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "clean_market_df.time.nunique()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 10, 181 | "metadata": { 182 | "scrolled": true 183 | }, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "assetCode\n", 189 | "A.N 2488\n", 190 | "AAI.N 879\n", 191 | "AAL.O 772\n", 192 | "AAMRQ.OB 70\n", 193 | "AAN.N 1486\n", 194 | "dtype: int64" 195 | ] 196 | }, 197 | "execution_count": 10, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "clean_market_df.groupby('assetCode').size().head()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 19, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "522" 215 | ] 216 | }, 217 | "execution_count": 19, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "sel = clean_market_df.groupby('assetCode').size() == clean_market_df.time.nunique()\n", 224 | "sum(sel)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "522 stocks span the entire time series. 
We want to filter only those out.\n", 232 | "\n", 233 | "---" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 21, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "series_len = clean_market_df.time.nunique()\n", 243 | "clean_market_df = clean_market_df.groupby('assetCode').filter(lambda x: len(x) == series_len)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 25, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "# Confirm that we are left only with the ones covering the whole series\n", 253 | "assert (clean_market_df.groupby('assetCode').size() == series_len).all()" 254 | ] 255 | } 256 | ], 257 | "metadata": { 258 | "kernelspec": { 259 | "display_name": "Python 3", 260 | "language": "python", 261 | "name": "python3" 262 | }, 263 | "language_info": { 264 | "codemirror_mode": { 265 | "name": "ipython", 266 | "version": 3 267 | }, 268 | "file_extension": ".py", 269 | "mimetype": "text/x-python", 270 | "name": "python", 271 | "nbconvert_exporter": "python", 272 | "pygments_lexer": "ipython3", 273 | "version": "3.6.5" 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 2 278 | } 279 | -------------------------------------------------------------------------------- /notebooks/se_kernel_v0.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "from itertools import chain\n", 13 | "\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "REDUCED = True # Reduce the data size for development and testing" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from kaggle.competitions import twosigmanews\n", 33 | "env = twosigmanews.make_env()" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "(market_train_df, news_train_df) = env.get_training_data()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "if REDUCED:\n", 52 | " market_train_df = market_train_df.tail(10000)\n", 53 | " news_train_df = news_train_df.tail(50000)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "def clean_market_data(market_df, train=True):\n", 63 | " '''Clean and preprocess the market data for training or testing.\n", 64 | " \n", 65 | " Parameters\n", 66 | " ----------\n", 67 | " market_df : dataframe\n", 68 | " See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.\n", 69 | " train : bool\n", 70 | " When true, adds the target variable to the dataframe.\n", 71 | " \n", 72 | " Returns\n", 73 | " -------\n", 74 | " dataframe \n", 75 | " Cleaned market data.\n", 76 | " \n", 77 | " '''\n", 78 | " # Select columns and drop NA\n", 79 | " if train:\n", 80 | " cols = ['assetCode', 'time', 'volume', 'open', 'returnsOpenPrevMktres1',\n", 81 | " 'returnsOpenPrevMktres10', 'returnsOpenNextMktres10']\n", 82 | " else:\n", 83 | " cols = ['assetCode', 
'time', 'volume', 'open', 'returnsOpenPrevMktres1',\n", 84 | " 'returnsOpenPrevMktres10']\n", 85 | " market_df = market_df.loc[:,cols]\n", 86 | " market_df.dropna(inplace=True)\n", 87 | " \n", 88 | " # Normalize time\n", 89 | " market_df.loc[:, 'time'] = market_df.time.dt.normalize()\n", 90 | " \n", 91 | " return market_df" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "def clean_news_data(news_df, extra_features= False):\n", 101 | " '''Clean and preprocess the news data for training or testing.\n", 102 | " \n", 103 | " Parameters\n", 104 | " ----------\n", 105 | " news_df : dataframe\n", 106 | " See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.\n", 107 | " extra_features : bool\n", 108 | " When true, adds extra columns that SE added ('urgency', 'provider', 'bodySize', 'relevance').\n", 109 | " \n", 110 | " Returns\n", 111 | " -------\n", 112 | " dataframe \n", 113 | " Cleaned news data.\n", 114 | " \n", 115 | " '''\n", 116 | " # Select columns and drop NA\n", 117 | " if extra_features:\n", 118 | " cols = ['time','assetCodes', 'sentimentNegative', 'sentimentNeutral', 'sentimentPositive',\n", 119 | " 'urgency', 'provider', 'bodySize', 'relevance']\n", 120 | " else:\n", 121 | " cols = ['time','assetCodes', 'sentimentNegative', 'sentimentNeutral', 'sentimentPositive']\n", 122 | " news_df = news_df.loc[:,cols]\n", 123 | " news_df.dropna(inplace=True)\n", 124 | " \n", 125 | " # Normalize time\n", 126 | " news_df.loc[:, 'time'] = news_df.time.dt.normalize()\n", 127 | " \n", 128 | " # assetCodes from String to List\n", 129 | " news_df['assetCodes'] = news_df['assetCodes'].str.findall(f\"'([\\w\\./]+)'\")\n", 130 | " \n", 131 | " # Explode news on assetCodes\n", 132 | " assetCodes_expanded = list(chain(*news_df['assetCodes']))\n", 133 | " assetCodes_index = news_df.index.repeat(news_df['assetCodes'].apply(len))\n", 134 | "\n", 135 | " assert len(assetCodes_expanded) == len(assetCodes_index)\n", 136 | " \n", 137 | " assetCodes_df = pd.DataFrame({'index': assetCodes_index, 'assetCode': assetCodes_expanded})\n", 138 | " news_df_exploded = news_df.merge(assetCodes_df, 'right', right_on='index', left_index=True, validate='1:m')\n", 139 | " news_df_exploded.drop(['assetCodes', 'index'], 1, inplace=True)\n", 140 | "\n", 141 | " if extra_features:\n", 142 | " # Compute means for same date and assetCode\n", 143 | " news_agg_dict = {\n", 144 | " 'sentimentNegative':'mean',\n", 145 | " 'sentimentNeutral':'mean',\n", 146 | " 'sentimentPositive':'mean',\n", 147 | " 'urgency':'mean',\n", 148 | " 'bodySize':'mean',\n", 149 | " 'relevance':'mean'\n", 150 | " }\n", 151 | " news_df_agg = news_df_exploded.groupby(['time', 'assetCode'], as_index=False).agg(news_agg_dict)\n", 152 | " \n", 153 | " # Add provider information\n", 154 | " idx = news_df_exploded.groupby(['time', 'assetCode'])['urgency'].transform(max) == news_df_exploded['urgency']\n", 155 | " news_df_exploded_2 = news_df_exploded[idx][['time', 'assetCode', 'provider']].drop_duplicates(['time', 'assetCode'])\n", 156 | " news_df_agg = news_df_agg.merge(news_df_exploded_2, 'left', ['time', 'assetCode'])\n", 157 | " \n", 158 | " # One-hot encoding provider\n", 159 | " ohe_provider = pd.get_dummies(news_df_agg['provider'])\n", 160 | " news_df_agg = pd.concat([news_df_agg, ohe_provider], axis=1).drop(['provider'], axis=1)\n", 161 | " \n", 162 | " else:\n", 163 | " # Compute means for same date and assetCode\n", 164 | " 
news_agg_dict = {\n", 165 | " 'sentimentNegative':'mean',\n", 166 | " 'sentimentNeutral':'mean',\n", 167 | " 'sentimentPositive':'mean'\n", 168 | " }\n", 169 | " news_df_agg = news_df_exploded.groupby(['time', 'assetCode'], as_index=False).agg(news_agg_dict)\n", 170 | " \n", 171 | " return news_df_agg" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "def clean_data(market_df, news_df, train=True, extra_features=False):\n", 181 | " '''Clean and preprocess the news and market data for training then merge them, to create a train set or test set.\n", 182 | " \n", 183 | " Parameters\n", 184 | " ----------\n", 185 | " market_df : dataframe\n", 186 | " See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.\n", 187 | " news_df : dataframe\n", 188 | " See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.\n", 189 | " train : bool\n", 190 | " When true, creates both the input features and the target dataframes.\n", 191 | " extra_features : bool\n", 192 | " When true, adds extra columns that SE added ('urgency', 'provider', 'bodySize', 'relevance').\n", 193 | " \n", 194 | " Returns\n", 195 | " -------\n", 196 | " dataframe \n", 197 | " Cleaned data ready to be fed to the model. Returns both the input and the target dataframes when train=True.\n", 198 | " \n", 199 | " '''\n", 200 | " cleaned_market_df = clean_market_data(market_df, train)\n", 201 | " cleaned_news_df = clean_news_data(news_df, extra_features)\n", 202 | " \n", 203 | " # Merge on market data\n", 204 | " df_merged = cleaned_market_df.merge(cleaned_news_df, 'inner', ['time', 'assetCode'])\n", 205 | " \n", 206 | " if train:\n", 207 | " y = df_merged['returnsOpenNextMktres10']\n", 208 | " X = df_merged.drop(['returnsOpenNextMktres10'], axis=1)\n", 209 | " return X, y\n", 210 | " else:\n", 211 | " return df_merged" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "#Final dataframes for training\n", 221 | "X_train, y_train = clean_data(market_train_df, news_train_df, extra_features=True)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [] 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "Python 3", 235 | "language": "python", 236 | "name": "python3" 237 | }, 238 | "language_info": { 239 | "codemirror_mode": { 240 | "name": "ipython", 241 | "version": 3 242 | }, 243 | "file_extension": ".py", 244 | "mimetype": "text/x-python", 245 | "name": "python", 246 | "nbconvert_exporter": "python", 247 | "pygments_lexer": "ipython3", 248 | "version": "3.6.5" 249 | } 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 2 253 | } 254 | -------------------------------------------------------------------------------- /notebooks/se_kernel_v1.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pandas as pd 4 | import glob 5 | 6 | sys.path.append('../') 7 | from models.data_cleaning import clean_market_data, clean_news_data 8 | 9 | # Import libraries used for lstm 10 | from keras.models import Sequential 11 | from keras.layers import Dense 12 | from keras.layers import LSTM 13 | 14 | # Define some global variables 15 | MARKET_DATA_PATH = '../data/raw/market_train_df.csv' 16 | 
NEWS_DATA_PATH = '../data/raw/news_train_df.csv' 17 | MERGED_PATH = '../data/processed/df_merged.csv' 18 | 19 | MARKET_CLEAN_PATH = '../data/processed/market_cleaned_df.csv' 20 | NEWS_CLEAN_CHUNK_PATH = '../data/processed/news_cleaned_df_' 21 | NEWS_CLEAN_PATH = '../data/processed/news_cleaned_df.csv' 22 | 23 | MARKET_CONTINOUS_PATH = '../data/processed/market_continuous_df.csv' 24 | NEWS_CONTINUOUS_PATH = '../data/processed/news_continuous_df.csv' 25 | 26 | 27 | def get_continuous_df(market_data_path, news_data_path, merged_path, 28 | market_clean_path=MARKET_CLEAN_PATH, 29 | news_clean_chunk_path=NEWS_CLEAN_CHUNK_PATH, 30 | news_clean_path=NEWS_CLEAN_PATH, 31 | market_continuous_path=MARKET_CONTINOUS_PATH, 32 | news_continuous_path=NEWS_CONTINUOUS_PATH): 33 | """ 34 | Cleans and filters the datasets to only select assets with 35 | continuous information 36 | """ 37 | market_train_df = pd.read_csv(market_data_path) 38 | cleaned_market_df = clean_market_data(market_train_df) 39 | print('market data was cleaned') 40 | cleaned_market_df.to_csv(market_clean_path) 41 | print('cleaned market data was saved') 42 | # save memory usage 43 | del market_train_df 44 | 45 | series_len = cleaned_market_df.time.nunique() 46 | cleaned_market_df = cleaned_market_df.groupby('assetCode').filter(lambda x: len(x) == series_len) 47 | cleaned_market_df = cleaned_market_df.reset_index(drop=True) 48 | print('market data was filtered') 49 | cleaned_market_df.to_csv(market_continuous_path) 50 | print('filtered market data was saved') 51 | 52 | c = 0 53 | for news_chunk in pd.read_csv(news_data_path, chunksize=100000): 54 | print('news chunk_number ' + str(c)) 55 | news_cleaned = clean_news_data(news_chunk) 56 | news_cleaned.to_csv(news_clean_chunk_path + str(c) + '.csv') 57 | print('news chunk number ' + str(c) + ' saved') 58 | c += 1 59 | 60 | news_files = glob.glob(news_clean_chunk_path + "*") 61 | cleaned_news_df = pd.concat((pd.read_csv(f, header=0) for f in news_files)) 62 | print('cleaned news data concatenated') 63 | cleaned_news_df.to_csv(news_clean_path) 64 | print('cleaned news data was saved') 65 | 66 | assetcodes = cleaned_market_df['assetCode'].tolist() 67 | news_continuous_df = cleaned_news_df[cleaned_news_df['assetCode'].isin(assetcodes)] 68 | news_continuous_df.loc[:, 'time'] = pd.to_datetime(news_continuous_df.time).dt.normalize() 69 | news_continuous_df.to_csv(news_continuous_path) 70 | print('filtered news data was saved') 71 | df_merged = cleaned_market_df.merge(news_continuous_df.drop_duplicates(subset=['time', 'assetCode']), 'left', ['time', 'assetCode']) 72 | 73 | print('filling missing values and saving the merged dataset') 74 | df_merged = df_merged.fillna(-1) 75 | df_merged.to_csv(merged_path) 76 | 77 | # return the final merged dataset 78 | return df_merged 79 | 80 | 81 | if __name__ == '__main__': 82 | 83 | if os.path.exists(MERGED_PATH): 84 | df_merged = pd.read_csv(MERGED_PATH) 85 | else: 86 | df_merged = get_continuous_df(MARKET_DATA_PATH, 87 | NEWS_DATA_PATH, 88 | MERGED_PATH) 89 | 90 | df_merged = df_merged.sort_values(['time', 'assetCode'], ascending=[True, True]) 91 | 92 | # taking 80%, 10%, 10% for train, val, test sets 93 | df_train = df_merged[:522*1990] 94 | df_val = df_merged[522*1990:522*(1990+249)] 95 | df_test = df_merged[522*(1990+249):] 96 | 97 | # create the different data sets 98 | y_train = df_train['returnsOpenNextMktres10'] 99 | X_train = df_train.drop(['returnsOpenNextMktres10'], axis=1) 100 | 101 | y_val = df_val['returnsOpenNextMktres10'] 102 | X_val = 
df_val.drop(['returnsOpenNextMktres10'], axis=1) 103 | 104 | y_test = df_test['returnsOpenNextMktres10'] 105 | X_test = df_test.drop(['returnsOpenNextMktres10'], axis=1) 106 | 107 | X_train_ar = X_train.drop(['Unnamed: 0', 'assetCode', "time"], axis=1).as_matrix() 108 | X_train_ar = X_train_ar.reshape(int(X_train_ar.shape[0]/522), 1, 522*X_train_ar.shape[1]) 109 | 110 | X_val_ar = X_val.drop(['Unnamed: 0', 'assetCode', "time"], axis=1).as_matrix() 111 | X_val_ar = X_val_ar.reshape(int(X_val_ar.shape[0]/522), 1, 522*X_val_ar.shape[1]) 112 | 113 | X_test_ar = X_test.drop(['Unnamed: 0', 'assetCode', "time"], axis=1).as_matrix() 114 | X_test_ar = X_test_ar.reshape(int(X_test_ar.shape[0]/522), 1, 522*X_test_ar.shape[1]) 115 | 116 | y_train_ar = y_train.values.reshape((1990, 522)) 117 | y_val_ar = y_val.values.reshape((int(len(y_val)/522), 522)) 118 | y_test_ar = y_test.values.reshape((int(len(y_test)/522), 522)) 119 | 120 | # 4. Build Keras model 121 | model = Sequential() 122 | model.add(LSTM(50, input_shape=(1, 41*522))) # adds LSTM layer 123 | model.add(Dense(522)) # adds a dense layer 124 | model.compile(loss='mae', optimizer='adam') # TODO: change the loss 125 | 126 | # 5. Fit RNN 127 | model.fit(X_train_ar, y_train_ar, epochs=3, batch_size=1, 128 | validation_data=(X_val_ar, y_val_ar), verbose=1, shuffle=False) 129 | 130 | model.save('vanilla_lstm_20181117.hdf5') 131 | print('model saved.') 132 | -------------------------------------------------------------------------------- /report/Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/Diagram.png -------------------------------------------------------------------------------- /report/LSTMAgrid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/LSTMAgrid.png -------------------------------------------------------------------------------- /report/LSTMgrid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/LSTMgrid.png -------------------------------------------------------------------------------- /report/Shuffling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/Shuffling.png -------------------------------------------------------------------------------- /report/Stocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/Stocks.png -------------------------------------------------------------------------------- /report/lstm_att_v0_ts_1_drop_04_cells_64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/lstm_att_v0_ts_1_drop_04_cells_64.png -------------------------------------------------------------------------------- /report/lstm_att_v0_ts_5_drop_0_cells_64.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/lstm_att_v0_ts_5_drop_0_cells_64.png -------------------------------------------------------------------------------- /report/lstm_plot1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/lstm_plot1.png -------------------------------------------------------------------------------- /report/lstm_plot2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/lstm_plot2.png -------------------------------------------------------------------------------- /report/main.bbl: -------------------------------------------------------------------------------- 1 | \begin{thebibliography}{10} 2 | 3 | \bibitem{bahdanau2014neural} 4 | Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 5 | \newblock Neural machine translation by jointly learning to align and 6 | translate. 7 | \newblock {\em arXiv preprint arXiv:1409.0473}, 2014. 8 | 9 | \bibitem{bao2017deep} 10 | Wei Bao, Jun Yue, and Yulei Rao. 11 | \newblock A deep learning framework for financial time series using stacked 12 | autoencoders and long-short term memory. 13 | \newblock {\em PloS one}, 12(7):e0180944, 2017. 14 | 15 | \bibitem{bergmeir2012use} 16 | Christoph Bergmeir and Jos{\'e}~M Ben{\'\i}tez. 17 | \newblock On the use of cross-validation for time series predictor evaluation. 18 | \newblock {\em Information Sciences}, 191:192--213, 2012. 19 | 20 | \bibitem{bollerslev1986generalized} 21 | Tim Bollerslev. 22 | \newblock Generalized autoregressive conditional heteroskedasticity. 23 | \newblock {\em Journal of econometrics}, 31(3):307--327, 1986. 24 | 25 | \bibitem{burg1968new} 26 | John~Parker Burg. 27 | \newblock A new analysis technique for time series data. 28 | \newblock {\em Paper presented at NATO Advanced Study Institute on Signal 29 | Processing, Enschede, Netherlands, 1968}, 1968. 30 | 31 | \bibitem{chandra2012cooperative} 32 | Rohitash Chandra and Mengjie Zhang. 33 | \newblock Cooperative coevolution of elman recurrent neural networks for 34 | chaotic time series prediction. 35 | \newblock {\em Neurocomputing}, 86:116--123, 2012. 36 | 37 | \bibitem{chen1989representations} 38 | Sheng Chen and Steve~A Billings. 39 | \newblock Representations of non-linear systems: the narmax model. 40 | \newblock {\em International Journal of Control}, 49(3):1013--1032, 1989. 41 | 42 | \bibitem{engle1982autoregressive} 43 | Robert~F Engle. 44 | \newblock Autoregressive conditional heteroscedasticity with estimates of the 45 | variance of united kingdom inflation. 46 | \newblock {\em Econometrica: Journal of the Econometric Society}, pages 47 | 987--1007, 1982. 48 | 49 | \bibitem{engle1993measuring} 50 | Robert~F Engle and Victor~K Ng. 51 | \newblock Measuring and testing the impact of news on volatility. 52 | \newblock {\em The journal of finance}, 48(5):1749--1778, 1993. 53 | 54 | \bibitem{firat2016multi} 55 | Orhan Firat, Kyunghyun Cho, and Yoshua Bengio. 56 | \newblock Multi-way, multilingual neural machine translation with a shared 57 | attention mechanism. 58 | \newblock {\em arXiv preprint arXiv:1601.01073}, 2016. 59 | 60 | \bibitem{graves2014neural} 61 | Alex Graves, Greg Wayne, and Ivo Danihelka. 62 | \newblock Neural turing machines. 
63 | \newblock {\em arXiv preprint arXiv:1410.5401}, 2014. 64 | 65 | \bibitem{hamilton1994time} 66 | James~Douglas Hamilton. 67 | \newblock {\em Time series analysis}, volume~2. 68 | \newblock Princeton university press Princeton, NJ, 1994. 69 | 70 | \bibitem{hamzaoui2016glosten} 71 | Nessrine Hamzaoui and Boutheina Regaieg. 72 | \newblock The glosten-jagannathan-runkle-generalized autoregressive conditional 73 | heteroscedastic approach to investigating the foreign exchange forward 74 | premium volatility. 75 | \newblock {\em International Journal of Economics and Financial Issues}, 76 | 6(4):1608--1615, 2016. 77 | 78 | \bibitem{hentschel1995all} 79 | Ludger Hentschel et~al. 80 | \newblock All in the family: Nesting symmetric and asymmetric garch models. 81 | \newblock {\em Journal of Financial Economics}, 39(1):71--104, 1995. 82 | 83 | \bibitem{hochreiter1997long} 84 | Sepp Hochreiter and J{\"u}rgen Schmidhuber. 85 | \newblock Long short-term memory. 86 | \newblock {\em Neural computation}, 9(8):1735--1780, 1997. 87 | 88 | \bibitem{hollis2018deep} 89 | Thomas Hollis. 90 | \newblock Deep learning algorithms applied to blockchain-based financial time 91 | series. 92 | \newblock 2018. 93 | 94 | \bibitem{kaggle2017twosigma} 95 | Kaggle. 96 | \newblock Two sigma: Using news to predict stock movements. 97 | \newblock \url{https://www.kaggle.com/c/two-sigma-financial-news}. 98 | \newblock Accessed: 2018-09-30. 99 | 100 | \bibitem{kim2003financial} 101 | Kyoung-jae Kim. 102 | \newblock Financial time series forecasting using support vector machines. 103 | \newblock {\em Neurocomputing}, 55(1-2):307--319, 2003. 104 | 105 | \bibitem{kingma2014adam} 106 | Diederik~P Kingma and Jimmy Ba. 107 | \newblock Adam: A method for stochastic optimization. 108 | \newblock {\em arXiv preprint arXiv:1412.6980}, 2014. 109 | 110 | \bibitem{kohonen1982self} 111 | Teuvo Kohonen. 112 | \newblock Self-organized formation of topologically correct feature maps. 113 | \newblock {\em Biological cybernetics}, 43(1):59--69, 1982. 114 | 115 | \bibitem{koskela1998time} 116 | Timo Koskela, Markus Varsta, Jukka Heikkonen, and Kimmo Kaski. 117 | \newblock Time series prediction using recurrent som with local linear models. 118 | \newblock {\em Int. J. of Knowledge-Based Intelligent Engineering Systems}, 119 | 2(1):60--68, 1998. 120 | 121 | \bibitem{krizhevsky2012imagenet} 122 | Alex Krizhevsky, Ilya Sutskever, and Geoffrey~E Hinton. 123 | \newblock Imagenet classification with deep convolutional neural networks. 124 | \newblock In {\em Advances in neural information processing systems}, pages 125 | 1097--1105, 2012. 126 | 127 | \bibitem{kuremoto2014time} 128 | Takashi Kuremoto, Shinsuke Kimura, Kunikazu Kobayashi, and Masanao Obayashi. 129 | \newblock Time series forecasting using a deep belief network with restricted 130 | boltzmann machines. 131 | \newblock {\em Neurocomputing}, 137:47--56, 2014. 132 | 133 | \bibitem{lin2009short} 134 | Xiaowei Lin, Zehong Yang, and Yixu Song. 135 | \newblock Short-term stock price prediction based on echo state networks. 136 | \newblock {\em Expert systems with applications}, 36(3):7313--7317, 2009. 137 | 138 | \bibitem{malkiel1970efficient} 139 | Burton~G Malkiel and Eugene~F Fama. 140 | \newblock Efficient capital markets: A review of theory and empirical work. 141 | \newblock {\em The journal of Finance}, 25(2):383--417, 1970. 142 | 143 | \bibitem{murphy1999technical} 144 | John~J Murphy. 
145 | \newblock {\em Technical analysis of the financial markets: A comprehensive 146 | guide to trading methods and applications}. 147 | \newblock Penguin, 1999. 148 | 149 | \bibitem{pierre1998estimating} 150 | Eileen F~St Pierre. 151 | \newblock Estimating egarch-m models: Science or art? 152 | \newblock {\em The Quarterly Review of Economics and Finance}, 38(2):167--180, 153 | 1998. 154 | 155 | \bibitem{vaswani2017attention} 156 | Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, 157 | Aidan~N Gomez, {\L}ukasz Kaiser, and Illia Polosukhin. 158 | \newblock Attention is all you need. 159 | \newblock In {\em Advances in Neural Information Processing Systems}, pages 160 | 5998--6008, 2017. 161 | 162 | \bibitem{walker1931periodicity} 163 | Gilbert~Thomas Walker. 164 | \newblock On periodicity in series of related terms. 165 | \newblock {\em Proceedings of the Royal Society of London. Series A, Containing 166 | Papers of a Mathematical and Physical Character}, 131(818):518--532, 1931. 167 | 168 | \bibitem{xu2015show} 169 | Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan 170 | Salakhudinov, Rich Zemel, and Yoshua Bengio. 171 | \newblock Show, attend and tell: Neural image caption generation with visual 172 | attention. 173 | \newblock In {\em International conference on machine learning}, pages 174 | 2048--2057, 2015. 175 | 176 | \bibitem{zhang2003time} 177 | G~Peter Zhang. 178 | \newblock Time series forecasting using a hybrid arima and neural network 179 | model. 180 | \newblock {\em Neurocomputing}, 50:159--175, 2003. 181 | 182 | \end{thebibliography} 183 | -------------------------------------------------------------------------------- /report/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/main.pdf -------------------------------------------------------------------------------- /report/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | 3 | % if you need to pass options to natbib, use, e.g.: 4 | % \PassOptionsToPackage{numbers, compress}{natbib} 5 | % before loading nips_2016 6 | % 7 | % to avoid loading the natbib package, add option nonatbib: 8 | %\usepackage[no]{nips_2016} 9 | 10 | %\usepackage{nips_2016} 11 | 12 | % to compile a camera-ready version, add the [final] option, e.g.: 13 | \usepackage[final]{nips_2016} 14 | \usepackage{natbib} 15 | \setcitestyle{numbers} 16 | \usepackage[utf8]{inputenc} % allow utf-8 input 17 | \usepackage[T1]{fontenc} % use 8-bit T1 fonts 18 | \usepackage{hyperref} % hyperlinks 19 | \usepackage{url} % simple URL typesetting 20 | \usepackage{booktabs} % professional-quality tables 21 | \usepackage{amsfonts} % blackboard math symbols 22 | \usepackage{nicefrac} % compact symbols for 1/2, etc. 23 | \usepackage{microtype} % microtypography 24 | \usepackage{indentfirst} 25 | \usepackage{amsmath} 26 | \usepackage{fancyhdr} 27 | \usepackage{graphicx} 28 | \usepackage{printlen} 29 | \usepackage{color} 30 | %\usepackage{lmodern} 31 | 32 | \fancypagestyle{equalc}{\fancyhf{}\renewcommand{\headrulewidth}{0pt}\fancyfoot[R]{* indicates equal contribution}} 33 | 34 | \title{A Comparison of LSTMs and Attention Mechanisms for Forecasting Financial Time Series} 35 | 36 | \author{ 37 | S.E. 
Yi* \\ 38 | Department of Computer Science\\ 39 | University of Toronto\\ 40 | Toronto, ON M5S 3H7 \\ 41 | \texttt{seungeunyi@cs.toronto.edu} \\ 42 | \And 43 | A. Viscardi* \\ 44 | Department of Computer Science\\ 45 | University of Toronto\\ 46 | Toronto, ON M5S 3H7 \\ 47 | \texttt{avis@cs.toronto.edu} \\ 48 | \And 49 | T. Hollis* \\ 50 | Department of Computer Science\\ 51 | University of Toronto\\ 52 | Toronto, ON M5S 3H7 \\ 53 | \texttt{thollis@cs.toronto.edu} \\ 54 | } 55 | 56 | \begin{document} 57 | 58 | \maketitle 59 | 60 | \begin{abstract} 61 | 62 | While LSTMs show increasingly promising results for forecasting Financial Time Series (FTS), this paper seeks to assess if attention mechanisms can further improve performance. The hypothesis is that attention can help prevent long-term dependencies experienced by LSTM models. To test this hypothesis, the main contribution of this paper is the implementation of an LSTM with attention. Both the benchmark LSTM and the LSTM with attention were compared and both achieved reasonable performances of up to 60\% on five stocks from Kaggle's Two Sigma dataset. This comparative analysis demonstrates that an LSTM with attention can indeed outperform standalone LSTMs but further investigation is required as issues do arise with such model architectures. 63 | 64 | \end{abstract} 65 | 66 | \thispagestyle{equalc} 67 | 68 | \section{Introduction} 69 | 70 | Financial Time Series (FTS) modelling is a practice with a long history which first revolutionised algorithmic trading in the early 1970s. The analysis of FTS was divided into two categories: fundamental analysis and technical analysis. Fundamental analysis is the study of a stock or currency’s price based on economic factors. These include the overall state of the business or economy, revenue, interest rates and others. On the other hand, technical analysis, as defined by J. Murphy in \cite{murphy1999technical}, is the study of market action to forecast future trends. This is achieved through the analysis of shorter term financial data, primarily price and volume. Both fundamental and technical analysis are put into question by the Efficient Market Hypothesis (EMH). The EMH, highly disputed since its initial publication in 1970, hypothesizes that stock prices are ultimately unpredictable \cite{malkiel1970efficient}. This has not stopped research attempting to model FTS through the use of linear, non-linear and ML-based models, as mentioned hereafter. Together these approaches form the main subcategories of existing solutions in FTS analysis. 71 | 72 | In parallel, the origins of attention mechanisms initially came into prominence in the field of Computer Vision (CV). These new models were originally loosely inspired by human visual attention mechanisms present in a region of the prefrontal cortex known as the inferior frontal junction. Applications were later leveraged to tackle issues with context-dependent inference in Natural Language Processing (NLP). In both cases, the core principle of attention is achieved by setting attention weights to assign more or less of the algorithm’s finite attention on different subsets of the input feature space. In CV, this corresponds to focussing on particular features of the input image while in NLP this represents focus on particular words in the input sentence. In NLP, this attention mechanism allows inference to be made about an entire sentence while remaining sensitive to its context. 
This remains to this day a particularly challenging task for long sentences. 73 | 74 | In this paper we propose a novel approach to FTS forecasting by combining these two fields of research. The idea is to leverage the developments in attention mechanisms to improve the performance of promising LSTM RNN architectures currently in use for FTS forecasting. The main contribution in this paper is the implementation of an LSTM that uses attention for parsing both news headlines and financial data. The performance of this model is then compared with that of a regular LSTM without attention. This performance is then evaluated using five stocks from Kaggle's Two Sigma dataset \cite{kaggle2017twosigma} and using various methods of data preprocessing \cite{bergmeir2012use}. 75 | 76 | The ultimate goals of FTS forecasting, among others, is to help solve the problem of volatility in speculative markets and to help foresee large financial events such as the 2009 financial crisis to ensure better economic preparation. 77 | 78 | \section{Related Work} 79 | 80 | As discussed in \cite{hollis2018deep}, the most rudimentary approach to modelling FTS is by assuming that they follow the random walk model. The random walk model can be simply expressed as a sum of a series of independent random variables \cite{hamilton1994time}. By weighing these, the first Auto-Regressive (AR) model was developed. A set of equations were developed by U. Yule and G. Walker in \cite{walker1931periodicity} to provide quantitative methods for estimating parameters in AR models. This work was subsequently expanded upon by J. P. Burg in \cite{burg1968new} who provided an alternative approach albeit with different stability properties. These AR models are often accompanied by another type of linear model, the Moving Average (MA) model which gives the Auto-Regressive Moving Average (ARMA) model. However, a fundamental limitation of AR, MA and ARMA models is that they all assume the process being modelled is stationary. Stationarity is a property of processes whereby the probability distribution remains constant over time, thus variance also remains constant. Indeed, this assumption is significant as FTS are often non-stationary processes. Therefore, this model’s accuracy will suffer, highlighting the need to take this problem of stationarity into consideration. This is done by generalising the ARMA model into the Autoregressive Integrated Moving Average (ARIMA) model \cite{hamilton1994time}. The ARIMA model solves the issue of non-stationarity by exploiting the concept of returns (or degrees of differencing). Non-stationary time series can therefore be made stationary by differencing. The aforementioned linear models all suffer from the assumption that FTS are homoscedastic processes. This is indeed often a poor assumption to make, as shown in \cite{engle1982autoregressive} by R.F. Engle. In \cite{engle1982autoregressive}, Engle states that by using a more sophisticated model such as the Auto-Regressive Conditional Heteroscedasticity (ARCH) model, the homoscedastic assumption can be avoided. This ARCH model was later described by Bollerslev in \cite{bollerslev1986generalized} as a special case of a more generalised model called the Generalised Auto-Regressive Conditional Heteroscedasticity (GARCH) model. Many more variants of the GARCH model have been published since its original publication in 1986. 
These include NAGARCH (nonlinear asymmetric GARCH) \cite{engle1993measuring}, EGARCH (exponential GARCH) \cite{pierre1998estimating}, GJR-GARCH (Glosten-Jagannathan-Runkle GARCH) \cite{hamzaoui2016glosten} and many others. These GARCH derivatives are often nested under Hentschel’s fGARCH (Family GARCH) model \cite{hentschel1995all} but these all lie outside the scope of this paper. In the same time as the ARCH and GARCH models, J. Leontaris and S. A. Billings published an alternative in \cite{chen1989representations} known as is the Nonlinear Autoregressive Moving Average model with exogenous inputs (NARMAX). This work, building on their own previous work on ARMAX models, demonstrated that NARMAX models can successfully be applied to model complex time series. More information on these models can be found in \cite{hollis2018deep}, including equations and further explanation. 81 | 82 | These state space and stochastic models were however quickly overwhelmed by advances in Machine Learning. A wave of ML approaches to modelling FTS severely disrupted the field of algorithmic trading via stochastic modelling in the last two decades. One of the earliest approaches to FTS forecasting using ML built on work from Kohonen in \cite{kohonen1982self}. In \cite{kohonen1982self}, Kohonen introduced the idea of Self-Organising Maps (SOM) which were subsequently successfully applied to FTS forecasting \cite{koskela1998time}. In 2003, still in the early days of ML for FTS predictions, SVMs (both linear and non-linear) were shown by Kim in \cite{kim2003financial} to be of significant predictive capabilities for FTS. In parallel, Zhang showed in his 2003 paper \cite{zhang2003time} that, by combining Artificial Neural Networks (ANNs) with the aforementioned ARIMA model, promising FTS forecasting can be achieved. 83 | 84 | Nevertheless, it was only by benefitting from the Neural Network boom of 2012 brought on by the AlexNet work of Krizhevsky, Sutskever and Hinton on the ImageNet competition \cite{krizhevsky2012imagenet}, that ANNs became some of the most mainstream methods for FTS forecasting, in particular with the rise of RNNs \cite{chandra2012cooperative}. However, another type of neural network that has also been widely lauded for its performance in FTS forecasting is the Echo State Network (ESN). Indeed, Lin et. al showed in \cite{lin2009short} that ESNs combined with Principal Component Analysis (PCA) can sometimes exceed or at least match the performance of conventional RNNs while decreasing the computational costs. This is due to the techniques of Reservoir Computing introduced in ESNs. In short, ESNs can bypass the issue of the vanishing gradient problem and long computational times, present in conventional RNNs, by creating a large `reservoir' of sparsely connected neurons. The connections are assigned at random and weights within the reservoir do not get conventionally trained, reducing computational time and allowing the network to echo past states (emulating the `memory' of RNNs). 85 | 86 | Another alternative to RNNs is the Deep Belief Network (DBN). Hinton and Salakhutdinov's DBNs are a type of probabilistic generative neural network composed of layers of Restricted Boltzmann Machines (RBMs). These have also successfully been leveraged for accurate time series prediction \cite{kuremoto2014time}. 87 | 88 | In the modern day however, LSTM RNNs remain amongst some of the most popular and promising models for predicting FTS. 
While LSTMs originally came to light in the seminal 1997 paper \cite{hochreiter1997long} by Hochreiter and Schmidhuber, they only recently rose to prominence for FTS forecasting. Amongst the most successful LSTM implementations, the pioneering paper by Bao et al. in \cite{bao2017deep} implements a variety of the most modern LSTM architectures coupled with autoencoders and applies them to stock data. The work presented here extends and builds on the insight of this paper by exploring the impact of leveraging attention models for sentiment analysis built on top of LSTMs. 89 | 90 | However, a word of caution is worth mentioning here. It is true that academic publications in the field of FTS forecasting are often misleading. Indeed, many of the most performant models are developed by private companies and kept away from the public, with the utmost secrecy for competitive reasons. Academia seems to be struggling to shed light on the most modern techniques which is one of the prime motivations for the investigation presented hereafter. In addition, many FTS forecasting papers tend to inflate their performance for recognition and overfit their models due to the heavy use simulators. Many of the performances claimed in these papers are difficult to replicate as they fail to generalise for future changes in the particular FTS being forecast. 91 | 92 | Having reviewed the field of FTS forecasting, in order to better situate our paper amongst existing literature, it is important to now cover a brief history of attention mechanisms. 93 | 94 | Many early prominent papers using attention mechanisms in NLP initially used the term “alignment” to refer to attention in the context neural machine translation. One of the most foundational of these is the 2014 Bahdanau, Cho and Bengio collaboration in \cite{bahdanau2014neural}. In \cite{bahdanau2014neural}, Bahdanau shows that a single neural network can be jointly tuned to maximise translation performance using attention-based encoder-decoders. In parallel, Graves, Wayne and Danihelka showed in \cite{graves2014neural} that neural networks (in particular LSTMs) can be improved by coupling them to attention processes. While the application in \cite{graves2014neural} was helping Neural Turing Machines infer copying, sorting and recall, subsequent CV and NLP applications of the very same concept were greatly inspired by techniques presented in this paper \cite{xu2015show}. Building on the work above as well as on their own work, in 2015 Firat, Cho and Bengio in \cite{firat2016multi} had a breakthrough in multilingual Neural Machine Translation (NMT). They used single attention mechanisms shared across language pairs to outperform existing NMT benchmarks. 95 | 96 | Since then, a whole flurry of different types of attention have been developed including but not limited to self-attention, shared attention, local attention, hybrid attention and multi-headed attention. Indeed, the multi-headed attention model presented in the Transformer \cite{vaswani2017attention} by Google Brain and UofT alumni has been widely lauded as a promising novel architecture for dealing with long range dependencies without the issues of LSTMs in NMT. For this reason, that paper will serve as inspiration for our investigation on the potential of attention in FTS forecasting. 
97 | 98 | From the history of attention mechanisms presented here we can indeed see that attention mechanisms are currently accepted as a very promising approach to many problems, especially in the field of machine translation. However, we have yet to see if they present any promise in sentiment analysis for FTS forecasting which is the purpose of the work presented here. Intuitively, since attention mechanisms are designed to help address the issue of context dependency this suggests promising potential for sentiment analysis in FTS forecasting. Indeed, most sentiment analysis algorithms would fail to consider the impact of such context for long lengths. A naive example of this, such as “I am a compulsive liar but this company is fantastic, the stock is destined to rise and I don’t understand why more people have not invested yet”, would be wrongly detected as a positive sentiment by most algorithms. The following investigation aims to confirm or refute the hypothesis that LSTM performance can be improved with attention. 99 | 100 | \section{Model Architecture} 101 | 102 | The two models investigated in this paper are a vanilla LSTM (as a benchmark) and an LSTM with an attention mechanism. A diagram of these models is shown in figure 1 below. 103 | 104 | \begin{figure}[!h] 105 | \includegraphics[width=375pt]{Diagram.png} 106 | \caption{System diagram of the LSTM and LSTM with attention} 107 | \end{figure} 108 | 109 | Both the LSTM model and the LSTM with attention model used in this paper are implemented with mean squared error loss using an Adam optimiser \cite{kingma2014adam}. It is nonetheless important to cover the mathematical foundations of these models before comparing their performances. 110 | 111 | In encoder-decoder RNNs, the encoder reads an input sequence $\textbf{x} = (x_1,$ ... $, x_{t-1})$ into a vector $\textbf{c} = (c_1,$ ... $, c_{t-1})$, as shown in figure 1. A common approach and the one we will be following in this paper is to use an RNN such that \cite{bahdanau2014neural}: 112 | \begin{align} 113 | h_t = f(x_t, h_{t-1}) 114 | \end{align} 115 | and 116 | \begin{align} 117 | \textbf{c} = q(\{h_0, \dots , h_{T_x}\}) 118 | \end{align} 119 | where $h_t$ is a hidden state at time $t$, $\textbf{c}$ is the context vector generated by the hidden states and $f$ and $q$ are non-linear functions. 120 | 121 | Subsequently to equations 1 and 2, prediction is done at the decoder by defining a probability over the translation $\textbf{y}$ through the following decomposition \cite{bahdanau2014neural}: 122 | \begin{align} 123 | p(\textbf{y}) = \prod_{t=1}^T p(y_t | \{y_1, \dots , y_{t-1}\}, c) 124 | \end{align} 125 | where $\textbf{y} = (y_1, \dots , y_{T_y})$. For RNNs, each conditional probability is modelled as: 126 | \begin{align} 127 | p(y_t | \{y_1, \dots , y_{t-1}\}, c) = g(y_{t-1},s_t,c) 128 | \end{align} 129 | where $s_t$ is the RNN's hidden state and $g$ is a nonlinear function that outputs the probability of $y_t$. 130 | 131 | In attention, a particular context $c_i$ depends on a sequence of annotations ($h_1, \dots , h_{T_x}$) to which an encoder maps the input sequence. While each annotation $h_i$ contains information about the input sequence, we want to focus attention on a particular part of the input. 
Thus the context vector is computed as a weighted sum as follows \cite{bahdanau2014neural}: 132 | \begin{align} 133 | c_i = \sum_{j=1}^{T_x} \alpha_{ij}h_j 134 | \end{align} 135 | where each context weight $\alpha_{ij}$ for each annotation $h_j$ is calculated as follows: 136 | \begin{align} 137 | \alpha_{ij} = \frac{\exp(e_{ij})}{\sum_{k=1}^{T_x} \exp(e_{ik})} 138 | \end{align} 139 | where 140 | \begin{align} 141 | e_{ik} = a(s_{i-1},h_j) 142 | \end{align} 143 | is known as the alignment model. Alignment models score how well matched the input at position $j$ and output at position $i$ are. 144 | 145 | Equations 1 through 6 are used for the LSTM and attention mechanisms while Adam is used as the optimiser and Mean Square Error (MSE) is used as the loss function. Mathematically, MSE loss is as follows: 146 | \begin{align} 147 | MSE = \frac{1}{n} \sum_{t=1}^{n} (y-t)^2 148 | \end{align} 149 | 150 | \section{Comparison of LSTM and LSTM with attention} 151 | 152 | We can now move onto the implementation of the baseline LSTM. Both this baseline LSTM and the LSTM with attention were implemented within the Kaggle kernel environment in Python using the Keras library. It is worth noting here that special care was taken for sensible crosschecks. For example, the Kaggle kernel environment was set up to block the use of `future' data when training, preventing look-ahead bias. Indeed, look-ahead bias is a significant source of malpractice in the field of FTS so was worthy of extra consideration here. In addition, all implementations closely followed existing literature from the related work discussed in section 2. 153 | 154 | To gain a good benchmark for the performance of an LSTM on forecasting stock prices from the Two Sigma dataset, we first consider a subset of stocks. This subset of stocks contains three very large companies (Intel, Wells Fargo, Amazon), one SME (Agilent Technologies) and one smaller company (Benchmark Electronics). These stocks were carefully chosen to have a wide variety of market cap, volatility and overall trend. Larger market cap stocks tend to be less volatile compared to smaller stocks. In addition, tech companies (like Amazon) tend to be less affected by the 2009 crash than finance companies (like Wells Fargo). These stocks and their volatility are shown in figure 2 as follows. 155 | \begin{figure}[!h] 156 | \includegraphics[width=395pt]{Stocks.png} 157 | \caption{Price and volatility of small stocks (Agilent, Benchmark Electronics - top) and large stocks (Intel, Wells Fargo, Amazon - bottom)} 158 | \end{figure} 159 | 160 | Figure 2 clearly shows one of the main challenges of FTS forecasting which is the change of volatility over time. Indeed, looking at the volatility of Amazon, one notices a significant increase in volatility with increasing size of the company. This is the main justification behind picking a diverse set of stocks that are bound by different statistical properties. 161 | 162 | It is worth showing some loss curves for the benchmark LSTM in order to examine how well the model generalises to the validation data with increasing numbers of epochs. This is shown in Figure 3 as follows. 163 | 164 | \begin{figure}[!h] 165 | \includegraphics[width=195pt]{lstm_plot1.png} 166 | \includegraphics[width=195pt]{lstm_plot2.png} 167 | \caption{Typical LSTM loss for training and validation data per epoch} 168 | \end{figure} 169 | 170 | Figure 3 shows the typical LSTM training and validation losses through epochs. 
It can be noted that loss decreases as expected and begins to overfit after a certain number of epochs. It is worth noting one particularity from this plot which is that it shows training loss greater than validation loss for early epochs. While this may seem unusual, it is a documented artifact of using Keras. Indeed, the training loss output is the average loss over the batch while the validation loss is the final loss. It is therefore expected that the first few iterations show a higher training loss than validation loss. This is because the very first training iterations will have a higher loss than the final iteration of the validation loss. 171 | 172 | In order to tune this benchmark model, a grid search is undertaken. The LSTM performance during this grid search hyperparameter tuning, evaluated on the validation set, can be seen by the following 3D loss plots. 173 | 174 | \begin{figure}[!h] 175 | \includegraphics[width=395pt]{LSTMgrid.png} 176 | \caption{LSTM grid search loss plots for 3 different sizes} 177 | \end{figure} 178 | 179 | These plots in figure 4 show the performance impact of dropout and lag for three different LSTM sizes. Indeed, it seems the LSTM with size 64 performs best on average. In addition, dropout is clearly a useful regulariser for all three LSTM sizes, particularly for values of 0.2 and higher as it leads to lower losses. From this hyperparameter tuning, the best chosen configuration was with $size=64$, $lag=15$ and $dropout=0.1$. This resulted in a loss of 0.000805 and an up-down accuracy of 0.572 (or 57\%). Indeed, this consistently outperforms random guessing, which would have a performance of 50\%, and is in line with top of the range algorithms which usually achieve around 60\% up-down accuracy. 180 | 181 | Once the hyperparameters are tuned, a common FTS investigation worth pursuing is that of dataset shuffling techniques. In FTS, the choice of which piece of data to use as the validation set is not trivial. Indeed, there exist a myriad of ways of doing this which must be carefully considered before comparing this LSTM to an LSTM with attention. The three methods investigated in this paper are visualised in figure 5. 182 | 183 | \begin{figure}[!h] 184 | \begin{center} 185 | \includegraphics[width=395pt]{Shuffling.png} 186 | \end{center} 187 | \caption{Shuffling techniques visualised} 188 | \end{figure} 189 | 190 | The fixed origin method is the most naive and common method used. Given a certain split size, the start of the data is the training set and the end is the validation set. However, this is a particularly rudimentary method to choose, especially for a high-growth stock like Amazon. The reason why this is the case is that the Amazon's stock price starts off with low volatility and, as the stock grows, experiences increasingly volatile behaviour. We would therefore be training a model on low volatility dynamics and expect it to deal with unseen high volatility dynamics for its predictions. This has indeed shown itself to be difficult and come at a cost in performance for these types of stocks as we will see in table 1. Therefore our benchmark for validation loss and performance may be misleading if we only consider this. However, for stocks like Intel that are more constant in their volatility, this method is reasonable. 
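To make this concrete, the listing below is a minimal Keras sketch of the benchmark setup just described: the grid-searched LSTM configuration ($size=64$, $lag=15$ and $dropout=0.1$) trained with MSE loss and the Adam optimiser, together with a naive fixed origin split. It is an illustrative reconstruction rather than the exact experimental code; in particular, the per-day feature count and the 80/20 split fraction are assumptions.

\begin{verbatim}
# Illustrative sketch only -- not the exact experimental code.
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

LAG = 15          # tuned lag (timesteps per training window)
N_FEATURES = 41   # assumed per-day feature count after preprocessing

model = Sequential()
model.add(LSTM(64, input_shape=(LAG, N_FEATURES)))  # tuned size = 64 cells
model.add(Dropout(0.1))                             # tuned dropout = 0.1
model.add(Dense(1, activation='tanh'))              # confidence output in [-1, 1]
model.compile(loss='mse', optimizer='adam')

def fixed_origin_split(X, y, train_frac=0.8):
    # Fixed origin: the earliest fraction of the series trains the model,
    # the remainder validates it; no shuffling, so no look-ahead bias.
    cut = int(len(X) * train_frac)
    return X[:cut], y[:cut], X[cut:], y[cut:]
\end{verbatim}

The same skeleton can be reused for the rolling origin recalibration and rolling window evaluations discussed next, by replacing the split function with the corresponding resampling scheme.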
191 | 192 | The rolling origin recalibration method is slightly less vulnerable than fixed origin as it allows the validation loss to be computed by taking the average of various different splits of the data to avoid running into unrepresentative issues with high volatility timeframes. 193 | 194 | Finally, the rolling window method is usually one of the most useful methods as it is particularly used for FTS algorithms being run for long timeframes. Indeed, this model outputs the average validation error of multiple rolling windows of data. This means the final values we get are more representative of recent model performance, as we are less biased by strong or poor performance in the distant past. 195 | 196 | It is now important to show the model performance of our tuned benchmark LSTM using these different shuffling techniques. Using regular fixed origin our tuned LSTM achieved a loss of 0.000805 and an up-down accuracy of 57\%. Let's compare these values with top six values of rolling window (RW) and rolling origin recalibration (ROR) in table 1. 197 | 198 | \begin{center} 199 | \begin{tabular}{||c c c c||} 200 | \hline 201 | Loss (RW) & Accuracy (RW) & Loss (ROR) & Accuracy (ROR) \\ [0.5ex] 202 | \hline\hline 203 | 0.000692 & 0.538 & 0.000810 & 0.555 \\ 204 | \hline 205 | 0.000693 & 0.530 & 0.000825 & 0.571 \\ 206 | \hline 207 | 0.000725 & 0.575 & 0.000978 & 0.607 \\ 208 | \hline 209 | 0.000755 & 0.551 & 0.000989 & 0.563 \\ 210 | \hline 211 | 0.000780 & 0.514 & 0.001001 & 0.579 \\ 212 | \hline 213 | 0.000788 & 0.583 & 0.001014 & 0.538 \\ 214 | \hline 215 | \end{tabular} 216 | \end{center} 217 | \begin{center} 218 | Table 1: Performance comparison of shuffling methods 219 | \end{center} 220 | 221 | What table 1 shows is that both RW and ROR describe very slightly better performances (58\% and 60\%) than that of the simple fixed origin method. This suggests that for stocks like Amazon, using these shuffling methods would be inevitable. 222 | 223 | Now that reasonable benchmarks have been ascertained, we can compare them to the performance of our LSTM with attention. The LSTM models with attention generated here had a strong tendency to overfit the training data for multiple epochs as shown in figure 6. 224 | 225 | \begin{figure}[!h] 226 | \includegraphics[width=195pt]{lstm_att_v0_ts_1_drop_04_cells_64.png} 227 | \includegraphics[width=195pt]{lstm_att_v0_ts_5_drop_0_cells_64.png} 228 | \caption{Typical LSTM+A loss for training and validation data per epoch} 229 | \end{figure} 230 | 231 | As shown in the loss curves of figure 6, we can see that performance at train time continually decreases while validation performance rises suggesting overfitting. Due to the high number of weights used in our implementation of the attention mechanism (with 16 cells, there are around 10 000 parameters), there are more parameters than the number of data points themselves, which inevitably leads to overfitting. Due to a large amount of parameters compared to the number of our data points, the loss values are very low and the training loss does not fluctuate a lot. This explains how the validation loss seems to increase immediately after one or two epochs when looking at the graphs. From this behaviour, we decided to limit the number of epochs for the LSTMs with attention (grid search over 25 epochs, compared to 100 epochs with LSTMs only). In order to help alleviate some of the overfitting errors and lack of generalisation, a hyperparameter grid search was undertaken to tune the model’s hyperparameters. 
In particular, dropout and varying the amount of lag for given LSTM sizes were investigated, as done for the LSTM without attention. Loss plots are shown in figure 7 below. 232 | 233 | \begin{figure}[!h] 234 | \includegraphics[width=395pt]{LSTMAgrid.png} 235 | \caption{LSTM+A grid search loss plots for 3 different sizes} 236 | \end{figure} 237 | 238 | The LSTM with attention achieves a globally lower loss than the LSTM without attention but is sensitive to different hyperparameters. Indeed, in the case of the LSTM with attention, the amount of lag is a significant factor in decreasing loss. This can be seen in figure 7 as lags larger than 30 drastically improve loss. It could be hypothesized that this is due to the unique nature of attention that can focus on relevant information from the past while the LSTM without attention is incapable of doing this due to its long term dependencies. This seems to support our original hypothesis on the impact of using attention on LSTMs for FTS. From this hyperparameter tuning, the best chosen configuration was with $size=16$, $lag=60$ and $dropout=0.05$. This resulted in a loss of 0.001511 and an up-down accuracy of 0.588 (or 59\%). Again, this is in line with top of the range algorithms and is slightly higher than that of the LSTM without attention (58\% accuracy). 239 | 240 | It is now worth comparing the performance, loss and accuracy, of both the tuned LSTM and the tuned LSTM with attention across all five stocks. The optimal hyperparameters for both the LSTM and LSTM with attention were set as detailed previously and the following performances were observed. 241 | 242 | \begin{center} 243 | \begin{tabular}{||c c c c c c||} 244 | \hline 245 | & Intel & Wells Fargo & Amazon & Agilent & BE \\ [0.5ex] 246 | \hline\hline 247 | LSTM (loss) & 0.000805 & 0.001200 & 0.002804 & 0.001073 & 0.002300 \\ 248 | \hline 249 | LSTM (accuracy) & 0.573 & 0.457 & 0.490 & 0.457 & 0.470 \\ 250 | \hline 251 | LSTM+A (loss) & 0.001511 & 0.000357 & 0.003168 & 0.000955 & 0.001787\\ 252 | \hline 253 | LSTM+A (accuracy) & 0.588 & 0.603 & 0.328 & 0.443 & 0.493\\ 254 | \hline 255 | \end{tabular} 256 | \end{center} 257 | \begin{center} 258 | Table 2: LSTM and LSTM with attention performance comparison 259 | \end{center} 260 | 261 | From table 2 above, we can observe multiple interesting dynamics at play. The first being that the optimal parameters chosen by grid search with the Intel stock fail to generalise to other stocks for the LSTM and the LSTM with attention. While this was somewhat expected it is interesting nonetheless to observe this limitation. Further time would allow the tuning of hyperparameters for each stock but this is outside the scope of this paper and will be discussed in the following section. In addition, it seems that the LSTM with attention has a higher variability in its performances. This makes sense as we know that the model is very complex in terms of parameter number with respect to the data input. Overall in this comparison, the LSTM with attention has outperformed the regular LSTM. Indeed, it does seem like we can tentatively confirm our original hypothesis albeit further work is certainly required. In addition, the attention architecture used in this paper does involve certain limitations and has a caveat of inherent complexity. 262 | 263 | \section{Limitations and Future Work} 264 | 265 | One particular limitation of this model is that we only considered the first prediction in our accuracy score. 
However, the model described in this paper is capable of sequence-to-sequence (seq2seq) forecasting, which allows an entire sequence to be predicted without the uncontrollable error growth experienced by iterative methods. This potential for seq2seq forecasting is a key advantage of the models used here and constitutes a major avenue for future work and research.
266 | 
267 | Another limitation of this investigation concerns how confidence is encoded and used. There are various ways to encode it; the approach used here was to have the models output a value between -1 and 1 via a $\tanh$ function. However, this confidence output was not taken into account in the up-down accuracy reported above. A simple way to exploit it would be to only execute trades whose confidence exceeds a certain threshold; this has been leveraged successfully for many FTS forecasting models, but it remains a non-trivial task. The higher the threshold for executing a trade, the higher the overall up-down accuracy, but also the fewer trades are executed. There is therefore a major trade-off, as near-perfect accuracy on a handful of trades may be less profitable than merely excellent accuracy on many trades.
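To make this trade-off concrete, the short sketch below filters trades by a confidence threshold; the confidences and returns are synthetic placeholders, not our model's output, and the thresholds are illustrative.
\begin{verbatim}
# Illustrative only: synthetic confidences and returns, not model output.
import numpy as np

rng = np.random.RandomState(0)
true_returns = rng.randn(1000) * 0.01                        # synthetic daily returns
confidence = np.tanh(100 * true_returns + rng.randn(1000))   # noisy tanh score in [-1, 1]

for threshold in [0.0, 0.2, 0.4, 0.6, 0.8]:
    traded = np.abs(confidence) > threshold                  # only trade confident calls
    if traded.sum() == 0:
        print('threshold %.1f: no trades' % threshold)
        continue
    correct = np.sign(confidence[traded]) == np.sign(true_returns[traded])
    print('threshold %.1f: %4d trades, up-down accuracy %.3f'
          % (threshold, traded.sum(), correct.mean()))
\end{verbatim}
With this construction the accuracy rises as the threshold increases while the number of executed trades falls, which is exactly the trade-off described above.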
268 | 
269 | One major possible extension to the work presented here would be the use of Bayesian optimisation, a technique commonly used in financial time series forecasting for hyperparameter tuning. It was not applied in this report due to time constraints, but it would be essential for a more detailed comparison of the potential of attention mechanisms in LSTMs for FTS forecasting, since, as this paper shows, hyperparameters have different sensitivities in the benchmark LSTM and in the LSTM with attention. Future work would consist of choosing an appropriate surrogate model and acquisition function (such as expected improvement) in order to explore the hyperparameter space with a sensible explore-exploit trade-off. This remains an open problem in the field of FTS forecasting.
270 | 
271 | Another possible extension would be to investigate how the LSTM with attention performs against other benchmark models for FTS forecasting. One particularly interesting comparison would be with Temporal Convolutional Networks (TCNs), which are currently showing promising performance relative to RNNs in NLP applications.
272 | 
273 | \section{Conclusions}
274 | 
275 | This paper has demonstrated the performance of a benchmark LSTM. To avoid relying on a single number as a benchmark, a wide range of experiments was conducted, including data shuffling methods such as rolling window and rolling origin recalibration, which showed the impact of volatility on estimates of model performance. Ultimately, the benchmark LSTM consistently performed in the 58\% range, in line with the best models currently available, which typically reach around 60\%.
276 | 
277 | In addition, an LSTM with attention was successfully implemented and leveraged for FTS forecasting. While novel, this work closely followed methods used in other FTS papers and in papers using attention for NLP. The LSTM with attention achieved performances of around 60\% and above, albeit with higher variability than the benchmark LSTM.
278 | 
279 | The final comparison of the LSTM with attention supports the investigated hypothesis that adding attention can improve the performance of existing LSTMs for FTS. A slight improvement was highlighted in the final comparison table, although both models need to be re-tuned between stocks. A theoretical explanation for why this is the case was suggested, developed and tested.
280 | 
281 | Finally, further work on this topic has been suggested and the main model limitations have been discussed.
282 | 
283 | %\pagebreak
284 | 
285 | \bibliographystyle{plain}
286 | \bibliography{ref}
287 | 
288 | \end{document}
--------------------------------------------------------------------------------
/report/nicefrac.sty:
--------------------------------------------------------------------------------
1 | %%
2 | %% This is file `nicefrac.sty',
3 | %% generated with the docstrip utility.
4 | %%
5 | %% The original source files were:
6 | %%
7 | %% units.dtx (with options: `nicefrac')
8 | %%
9 | %% LaTeX package for typesetting nice fractions
10 | %%
11 | %% Copyright (C) 1998 Axel Reichert
12 | %% See the files README and COPYING.
13 | %%
14 | %% \CharacterTable
15 | %% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z
16 | %% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z
17 | %% Digits \0\1\2\3\4\5\6\7\8\9
18 | %% Exclamation \! Double quote \" Hash (number) \#
19 | %% Dollar \$ Percent \% Ampersand \&
20 | %% Acute accent \' Left paren \( Right paren \)
21 | %% Asterisk \* Plus \+ Comma \,
22 | %% Minus \- Point \. Solidus \/
23 | %% Colon \: Semicolon \; Less than \<
24 | %% Equals \= Greater than \> Question mark \?
25 | %% Commercial at \@ Left bracket \[ Backslash \\ 26 | %% Right bracket \] Circumflex \^ Underscore \_ 27 | %% Grave accent \` Left brace \{ Vertical bar \| 28 | %% Right brace \} Tilde \~} 29 | \NeedsTeXFormat{LaTeX2e}[1995/12/01] 30 | \ProvidesPackage{nicefrac}[1998/08/04 v0.9b Nice fractions] 31 | \newlength{\L@UnitsRaiseDisplaystyle} 32 | \newlength{\L@UnitsRaiseTextstyle} 33 | \newlength{\L@UnitsRaiseScriptstyle} 34 | \RequirePackage{ifthen} 35 | \DeclareRobustCommand*{\@UnitsNiceFrac}[3][]{% 36 | \ifthenelse{\boolean{mmode}}{% 37 | \settoheight{\L@UnitsRaiseDisplaystyle}{% 38 | \ensuremath{\displaystyle#1{M}}% 39 | }% 40 | \settoheight{\L@UnitsRaiseTextstyle}{% 41 | \ensuremath{\textstyle#1{M}}% 42 | }% 43 | \settoheight{\L@UnitsRaiseScriptstyle}{% 44 | \ensuremath{\scriptstyle#1{M}}% 45 | }% 46 | \settoheight{\@tempdima}{% 47 | \ensuremath{\scriptscriptstyle#1{M}}% 48 | }% 49 | \addtolength{\L@UnitsRaiseDisplaystyle}{% 50 | -\L@UnitsRaiseScriptstyle% 51 | }% 52 | \addtolength{\L@UnitsRaiseTextstyle}{% 53 | -\L@UnitsRaiseScriptstyle% 54 | }% 55 | \addtolength{\L@UnitsRaiseScriptstyle}{-\@tempdima}% 56 | \mathchoice 57 | {% 58 | \raisebox{\L@UnitsRaiseDisplaystyle}{% 59 | \ensuremath{\scriptstyle#1{#2}}% 60 | }% 61 | }% 62 | {% 63 | \raisebox{\L@UnitsRaiseTextstyle}{% 64 | \ensuremath{\scriptstyle#1{#2}}% 65 | }% 66 | }% 67 | {% 68 | \raisebox{\L@UnitsRaiseScriptstyle}{% 69 | \ensuremath{\scriptscriptstyle#1{#2}}% 70 | }% 71 | }% 72 | {% 73 | \raisebox{\L@UnitsRaiseScriptstyle}{% 74 | \ensuremath{\scriptscriptstyle#1{#2}}% 75 | }% 76 | }% 77 | \mkern-2mu/\mkern-1mu% 78 | \bgroup 79 | \mathchoice 80 | {\scriptstyle}% 81 | {\scriptstyle}% 82 | {\scriptscriptstyle}% 83 | {\scriptscriptstyle}% 84 | #1{#3}% 85 | \egroup 86 | }% 87 | {% 88 | \settoheight{\L@UnitsRaiseTextstyle}{#1{M}}% 89 | \settoheight{\@tempdima}{% 90 | \ensuremath{% 91 | \mbox{\fontsize\sf@size\z@\selectfont#1{M}}% 92 | }% 93 | }% 94 | \addtolength{\L@UnitsRaiseTextstyle}{-\@tempdima}% 95 | \raisebox{\L@UnitsRaiseTextstyle}{% 96 | \ensuremath{% 97 | \mbox{\fontsize\sf@size\z@\selectfont#1{#2}}% 98 | }% 99 | }% 100 | \ensuremath{\mkern-2mu}/\ensuremath{\mkern-1mu}% 101 | \ensuremath{% 102 | \mbox{\fontsize\sf@size\z@\selectfont#1{#3}}% 103 | }% 104 | }% 105 | } 106 | \DeclareRobustCommand*{\@UnitsUglyFrac}[3][]{% 107 | \ifthenelse{\boolean{mmode}}{% 108 | \frac{#1{#2}}{#1{#3}}% 109 | }% 110 | {% 111 | #1{#2}/#1{#3}% 112 | \PackageWarning{nicefrac}{% 113 | You used \protect\nicefrac\space or 114 | \protect\unitfrac\space in text mode\MessageBreak 115 | and specified the ``ugly'' option.\MessageBreak 116 | The fraction may be ambiguous or wrong.\MessageBreak 117 | Please make sure the denominator is 118 | correct.\MessageBreak 119 | If it is, you can safely ignore\MessageBreak 120 | this warning 121 | }% 122 | }% 123 | } 124 | \DeclareOption{nice}{% 125 | \DeclareRobustCommand*{\nicefrac}{\@UnitsNiceFrac}% 126 | } 127 | \DeclareOption{ugly}{% 128 | \DeclareRobustCommand*{\nicefrac}{\@UnitsUglyFrac}% 129 | } 130 | \ExecuteOptions{nice} 131 | \ProcessOptions* 132 | \endinput 133 | %% 134 | %% End of file `nicefrac.sty'. 
-------------------------------------------------------------------------------- /report/nips_2016.sty: -------------------------------------------------------------------------------- 1 | % partial rewrite of the LaTeX2e package for submissions to the 2 | % Conference on Neural Information Processing Systems (NIPS): 3 | % 4 | % - uses more LaTeX conventions 5 | % - line numbers at submission time replaced with aligned numbers from 6 | % lineno package 7 | % - \nipsfinalcopy replaced with [final] package option 8 | % - automatically loads times package for authors 9 | % - loads natbib automatically; this can be suppressed with the 10 | % [nonatbib] package option 11 | % - adds foot line to first page identifying the conference 12 | % 13 | % Roman Garnett (garnett@wustl.edu) and the many authors of 14 | % nips15submit_e.sty, including MK and drstrip@sandia 15 | % 16 | % last revision: August 2016 17 | 18 | \NeedsTeXFormat{LaTeX2e} 19 | \ProvidesPackage{nips_2016}[2016/08/08 NIPS 2016 submission/camera-ready style file] 20 | 21 | % declare final option, which creates camera-ready copy 22 | \newif\if@nipsfinal\@nipsfinalfalse 23 | \DeclareOption{final}{ 24 | \@nipsfinaltrue 25 | } 26 | 27 | % declare nonatbib option, which does not load natbib in case of 28 | % package clash (users can pass options to natbib via 29 | % \PassOptionsToPackage) 30 | \newif\if@natbib\@natbibtrue 31 | \DeclareOption{nonatbib}{ 32 | \@natbibfalse 33 | } 34 | 35 | \ProcessOptions\relax 36 | 37 | % fonts 38 | \renewcommand{\rmdefault}{ptm} 39 | \renewcommand{\sfdefault}{phv} 40 | 41 | % change this every year for notice string at bottom 42 | \newcommand{\@nipsordinal}{30th} 43 | \newcommand{\@nipsyear}{2016} 44 | \newcommand{\@nipslocation}{Barcelona, Spain} 45 | 46 | % handle tweaks for camera-ready copy vs. submission copy 47 | \if@nipsfinal 48 | \newcommand{\@noticestring}{% 49 | \@nipsordinal\/ Conference on Neural Information Processing Systems 50 | (NIPS \@nipsyear), \@nipslocation.% 51 | } 52 | \else 53 | \newcommand{\@noticestring}{% 54 | Submitted to \@nipsordinal\/ Conference on Neural Information 55 | Processing Systems (NIPS \@nipsyear). 
Do not distribute.% 56 | } 57 | 58 | % line numbers for submission 59 | \RequirePackage{lineno} 60 | \linenumbers 61 | 62 | % fix incompatibilities between lineno and amsmath, if required, by 63 | % transparently wrapping linenomath environments around amsmath 64 | % environments 65 | \AtBeginDocument{% 66 | \@ifpackageloaded{amsmath}{% 67 | \newcommand*\patchAmsMathEnvironmentForLineno[1]{% 68 | \expandafter\let\csname old#1\expandafter\endcsname\csname #1\endcsname 69 | \expandafter\let\csname oldend#1\expandafter\endcsname\csname end#1\endcsname 70 | \renewenvironment{#1}% 71 | {\linenomath\csname old#1\endcsname}% 72 | {\csname oldend#1\endcsname\endlinenomath}% 73 | }% 74 | \newcommand*\patchBothAmsMathEnvironmentsForLineno[1]{% 75 | \patchAmsMathEnvironmentForLineno{#1}% 76 | \patchAmsMathEnvironmentForLineno{#1*}% 77 | }% 78 | \patchBothAmsMathEnvironmentsForLineno{equation}% 79 | \patchBothAmsMathEnvironmentsForLineno{align}% 80 | \patchBothAmsMathEnvironmentsForLineno{flalign}% 81 | \patchBothAmsMathEnvironmentsForLineno{alignat}% 82 | \patchBothAmsMathEnvironmentsForLineno{gather}% 83 | \patchBothAmsMathEnvironmentsForLineno{multline}% 84 | }{} 85 | } 86 | \fi 87 | 88 | % load natbib unless told otherwise 89 | \if@natbib 90 | \RequirePackage{natbib} 91 | \fi 92 | 93 | % set page geometry 94 | \usepackage[verbose=true,letterpaper]{geometry} 95 | \AtBeginDocument{ 96 | \newgeometry{ 97 | textheight=9in, 98 | textwidth=5.5in, 99 | top=1in, 100 | headheight=12pt, 101 | headsep=25pt, 102 | footskip=30pt 103 | } 104 | \@ifpackageloaded{fullpage} 105 | {\PackageWarning{nips_2016}{fullpage package not allowed! Overwriting formatting.}} 106 | {} 107 | } 108 | 109 | \widowpenalty=10000 110 | \clubpenalty=10000 111 | \flushbottom 112 | \sloppy 113 | 114 | % font sizes with reduced leading 115 | \renewcommand{\normalsize}{% 116 | \@setfontsize\normalsize\@xpt\@xipt 117 | \abovedisplayskip 7\p@ \@plus 2\p@ \@minus 5\p@ 118 | \abovedisplayshortskip \z@ \@plus 3\p@ 119 | \belowdisplayskip \abovedisplayskip 120 | \belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@ 121 | } 122 | \normalsize 123 | \renewcommand{\small}{% 124 | \@setfontsize\small\@ixpt\@xpt 125 | \abovedisplayskip 6\p@ \@plus 1.5\p@ \@minus 4\p@ 126 | \abovedisplayshortskip \z@ \@plus 2\p@ 127 | \belowdisplayskip \abovedisplayskip 128 | \belowdisplayshortskip 3\p@ \@plus 2\p@ \@minus 2\p@ 129 | } 130 | \renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt} 131 | \renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt} 132 | \renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt} 133 | \renewcommand{\large}{\@setfontsize\large\@xiipt{14}} 134 | \renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}} 135 | \renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}} 136 | \renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}} 137 | \renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}} 138 | 139 | % sections with less space 140 | \providecommand{\section}{} 141 | \renewcommand{\section}{% 142 | \@startsection{section}{1}{\z@}% 143 | {-2.0ex \@plus -0.5ex \@minus -0.2ex}% 144 | { 1.5ex \@plus 0.3ex \@minus 0.2ex}% 145 | {\large\bf\raggedright}% 146 | } 147 | \providecommand{\subsection}{} 148 | \renewcommand{\subsection}{% 149 | \@startsection{subsection}{2}{\z@}% 150 | {-1.8ex \@plus -0.5ex \@minus -0.2ex}% 151 | { 0.8ex \@plus 0.2ex}% 152 | {\normalsize\bf\raggedright}% 153 | } 154 | \providecommand{\subsubsection}{} 155 | \renewcommand{\subsubsection}{% 156 | \@startsection{subsubsection}{3}{\z@}% 157 | {-1.5ex \@plus 
-0.5ex \@minus -0.2ex}% 158 | { 0.5ex \@plus 0.2ex}% 159 | {\normalsize\bf\raggedright}% 160 | } 161 | \providecommand{\paragraph}{} 162 | \renewcommand{\paragraph}{% 163 | \@startsection{paragraph}{4}{\z@}% 164 | {1.5ex \@plus 0.5ex \@minus 0.2ex}% 165 | {-1em}% 166 | {\normalsize\bf}% 167 | } 168 | \providecommand{\subparagraph}{} 169 | \renewcommand{\subparagraph}{% 170 | \@startsection{subparagraph}{5}{\z@}% 171 | {1.5ex \@plus 0.5ex \@minus 0.2ex}% 172 | {-1em}% 173 | {\normalsize\bf}% 174 | } 175 | \providecommand{\subsubsubsection}{} 176 | \renewcommand{\subsubsubsection}{% 177 | \vskip5pt{\noindent\normalsize\rm\raggedright}% 178 | } 179 | 180 | % float placement 181 | \renewcommand{\topfraction }{0.85} 182 | \renewcommand{\bottomfraction }{0.4} 183 | \renewcommand{\textfraction }{0.1} 184 | \renewcommand{\floatpagefraction}{0.7} 185 | 186 | \newlength{\@nipsabovecaptionskip}\setlength{\@nipsabovecaptionskip}{7\p@} 187 | \newlength{\@nipsbelowcaptionskip}\setlength{\@nipsbelowcaptionskip}{\z@} 188 | 189 | \setlength{\abovecaptionskip}{\@nipsabovecaptionskip} 190 | \setlength{\belowcaptionskip}{\@nipsbelowcaptionskip} 191 | 192 | % swap above/belowcaptionskip lengths for tables 193 | \renewenvironment{table} 194 | {\setlength{\abovecaptionskip}{\@nipsbelowcaptionskip}% 195 | \setlength{\belowcaptionskip}{\@nipsabovecaptionskip}% 196 | \@float{table}} 197 | {\end@float} 198 | 199 | % footnote formatting 200 | \setlength{\footnotesep }{6.65\p@} 201 | \setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@} 202 | \renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@} 203 | \setcounter{footnote}{0} 204 | 205 | % paragraph formatting 206 | \setlength{\parindent}{\z@} 207 | \setlength{\parskip }{5.5\p@} 208 | 209 | % list formatting 210 | \setlength{\topsep }{4\p@ \@plus 1\p@ \@minus 2\p@} 211 | \setlength{\partopsep }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@} 212 | \setlength{\itemsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} 213 | \setlength{\parsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} 214 | \setlength{\leftmargin }{3pc} 215 | \setlength{\leftmargini }{\leftmargin} 216 | \setlength{\leftmarginii }{2em} 217 | \setlength{\leftmarginiii}{1.5em} 218 | \setlength{\leftmarginiv }{1.0em} 219 | \setlength{\leftmarginv }{0.5em} 220 | \def\@listi {\leftmargin\leftmargini} 221 | \def\@listii {\leftmargin\leftmarginii 222 | \labelwidth\leftmarginii 223 | \advance\labelwidth-\labelsep 224 | \topsep 2\p@ \@plus 1\p@ \@minus 0.5\p@ 225 | \parsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ 226 | \itemsep \parsep} 227 | \def\@listiii{\leftmargin\leftmarginiii 228 | \labelwidth\leftmarginiii 229 | \advance\labelwidth-\labelsep 230 | \topsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ 231 | \parsep \z@ 232 | \partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@ 233 | \itemsep \topsep} 234 | \def\@listiv {\leftmargin\leftmarginiv 235 | \labelwidth\leftmarginiv 236 | \advance\labelwidth-\labelsep} 237 | \def\@listv {\leftmargin\leftmarginv 238 | \labelwidth\leftmarginv 239 | \advance\labelwidth-\labelsep} 240 | \def\@listvi {\leftmargin\leftmarginvi 241 | \labelwidth\leftmarginvi 242 | \advance\labelwidth-\labelsep} 243 | 244 | % create title 245 | \providecommand{\maketitle}{} 246 | \renewcommand{\maketitle}{% 247 | \par 248 | \begingroup 249 | \renewcommand{\thefootnote}{\fnsymbol{footnote}} 250 | % for perfect author name centering 251 | \renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}} 252 | % The footnote-mark was overlapping the footnote-text, 253 | % added the following to fix this problem (MK) 254 | 
\long\def\@makefntext##1{% 255 | \parindent 1em\noindent 256 | \hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1 257 | } 258 | \thispagestyle{empty} 259 | \@maketitle 260 | \@thanks 261 | \@notice 262 | \endgroup 263 | \let\maketitle\relax 264 | \let\thanks\relax 265 | } 266 | 267 | % rules for title box at top of first page 268 | \newcommand{\@toptitlebar}{ 269 | \hrule height 4\p@ 270 | \vskip 0.25in 271 | \vskip -\parskip% 272 | } 273 | \newcommand{\@bottomtitlebar}{ 274 | \vskip 0.29in 275 | \vskip -\parskip 276 | \hrule height 1\p@ 277 | \vskip 0.09in% 278 | } 279 | 280 | % create title (includes both anonymized and non-anonymized versions) 281 | \providecommand{\@maketitle}{} 282 | \renewcommand{\@maketitle}{% 283 | \vbox{% 284 | \hsize\textwidth 285 | \linewidth\hsize 286 | \vskip 0.1in 287 | \@toptitlebar 288 | \centering 289 | {\LARGE\bf \@title\par} 290 | \@bottomtitlebar 291 | \if@nipsfinal 292 | \def\And{% 293 | \end{tabular}\hfil\linebreak[0]\hfil% 294 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% 295 | } 296 | \def\AND{% 297 | \end{tabular}\hfil\linebreak[4]\hfil% 298 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% 299 | } 300 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}% 301 | \else 302 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@} 303 | Anonymous Author(s) \\ 304 | Affiliation \\ 305 | Address \\ 306 | \texttt{email} \\ 307 | \end{tabular}% 308 | \fi 309 | \vskip 0.3in \@minus 0.1in 310 | } 311 | } 312 | 313 | % add conference notice to bottom of first page 314 | \newcommand{\ftype@noticebox}{8} 315 | \newcommand{\@notice}{% 316 | % give a bit of extra room back to authors on first page 317 | \enlargethispage{2\baselineskip}% 318 | \@float{noticebox}[b]% 319 | %UNCOMMENT THIS LINE FOR FOOTNOTE \footnotesize\@noticestring% 320 | \end@float% 321 | } 322 | 323 | % abstract styling 324 | \renewenvironment{abstract}% 325 | {% 326 | \vskip 0.075in% 327 | \centerline% 328 | {\large\bf Abstract}% 329 | \vspace{0.5ex}% 330 | \begin{quote}% 331 | } 332 | { 333 | \par% 334 | \end{quote}% 335 | \vskip 1ex% 336 | } 337 | 338 | \endinput -------------------------------------------------------------------------------- /report/printlen.sty: -------------------------------------------------------------------------------- 1 | % printlen.sty Print lengths in a variety of units 2 | % 3 | % Author: Peter Wilson, Herries Press 4 | % Maintainer: Will Robertson (will dot robertson at latex-project dot org) 5 | % Copyright 2001 Peter R. Wilson 6 | % Released under the LaTeX Project Public License 7 | % 8 | % Extensions courtesy of Harald Harders (h.harders@tu-bs.de) 9 | % 10 | % Usage instructions are at the end of this file. 11 | % 12 | \NeedsTeXFormat{LaTeX2e} 13 | \ProvidesPackage{printlen}[2009/09/03 v1.1a print lengths with units] 14 | % 15 | % \uselengthunit{} sets \l@nunits to the value of 16 | % and \l@nunitperpt to the number of in 1pt. 
17 | \newcommand{\uselengthunit}[1]{% 18 | \def\l@nunitperpt{1.0}\def\l@nunits{pt}% 19 | \def\l@nta{#1}\def\l@ntb{pt}% 20 | \ifx \l@nta\l@ntb 21 | \def\l@nunitperpt{1.0}\def\l@nunits{pt}% 22 | \else 23 | \def\l@ntb{pc}% 24 | \ifx \l@nta\l@ntb 25 | \def\l@nunitperpt{0.083333}\def\l@nunits{pc}% 26 | \else 27 | \def\l@ntb{in}% 28 | \ifx \l@nta\l@ntb 29 | \def\l@nunitperpt{0.013837}\def\l@nunits{in}% 30 | \else 31 | \def\l@ntb{mm}% 32 | \ifx \l@nta\l@ntb 33 | \def\l@nunitperpt{0.351459}\def\l@nunits{mm}% 34 | \else 35 | \def\l@ntb{cm}% 36 | \ifx \l@nta\l@ntb 37 | \def\l@nunitperpt{0.0351459}\def\l@nunits{cm}% 38 | \else 39 | \def\l@ntb{bp}% 40 | \ifx \l@nta\l@ntb 41 | \def\l@nunitperpt{0.996264}\def\l@nunits{bp}% 42 | \else 43 | \def\l@ntb{dd}% 44 | \ifx \l@nta\l@ntb 45 | \def\l@nunitperpt{0.9345718}\def\l@nunits{dd}% 46 | \else 47 | \def\l@ntb{cc}% 48 | \ifx \l@nta\l@ntb 49 | \def\l@nunitperpt{0.0778809}\def\l@nunits{cc}% 50 | \else 51 | \def\l@ntb{PT}% 52 | \ifx \l@nta\l@ntb 53 | \def\l@nunitperpt{1.0}\def\l@nunits{PT}% 54 | \fi 55 | \fi 56 | \fi 57 | \fi 58 | \fi 59 | \fi 60 | \fi 61 | \fi 62 | \fi 63 | } 64 | \uselengthunit{pt} 65 | 66 | % \printlength{} prints the value of in the units set 67 | % by \uselengthunit. 68 | \newcommand{\printlength}[1]{% 69 | \def\l@nta{pt}\ifx\l@nta\l@nunits\the#1\else 70 | \def\l@nta{PT}% 71 | \@tempdimc=\l@nunitperpt #1\relax\strip@pt\@tempdimc 72 | \unitspace\relax\ifmmode 73 | \mathrm{\ifx\l@nta\l@nunits pt\else\l@nunits\fi}% 74 | \else 75 | \ifx\l@nta\l@nunits pt\else\l@nunits\fi 76 | \fi\fi} 77 | 78 | % \rndprintlength{} prints the rounded value of in 79 | % the units set by \uselengthunit. Contributed by Harald Harders. 80 | \def\@round#1.#2\@empty{#1}% 81 | \newcommand{\rndprintlength}[1]{% 82 | \def\l@nta{pt}\ifx\l@nta\l@nunits\the#1\else 83 | \def\l@nta{PT}% 84 | \setlength{\@tempdimc}{\l@nunitperpt #1}% 85 | \addtolength{\@tempdimc}{0.5pt}% 86 | \edef\@@round{\strip@pt\@tempdimc}% 87 | \expandafter\@round\@@round.\@empty 88 | \unitspace\relax\ifmmode 89 | \mathrm{\ifx\l@nta\l@nunits pt\else\l@nunits\fi}% 90 | \else 91 | \ifx\l@nta\l@nunits pt\else\l@nunits\fi 92 | \fi\fi} 93 | 94 | % Small space. Contributed by Harald Harders. 95 | \newcommand{\unitspace}{\,} 96 | 97 | \endinput 98 | 99 | % USAGE: 100 | % 101 | % \printlength{} prints the value of a LaTeX length in the 102 | % units specified by \uselengthunit{}, where may be any TeX 103 | % length unit except for scaled point. That is, may be any of: 104 | % pt, pc, in, mm, cm, bp, dd or cc. When pt is set the printed length 105 | % value will include any stretch or shrink values, otherwise these 106 | % are not printed. The argument may also be PT, in which case 107 | % length values will be printed in pt units but without any stretch 108 | % or shrink values. An unknown value for is treated as though it 109 | % had been specified as pt. 110 | % The unit is separated from the number using the command 111 | % \unitspace which is set to \, by default. In math mode the units are 112 | % printed upright. 113 | % \rndprintlength{} prints the rounded value of a LaTeX length. 114 | % Use PT instead of pt for rounded points if there are stretch or 115 | % shrink values. 116 | % 117 | % The initial setting is \uselengthunit{pt} 118 | % 119 | % Example: 120 | % 121 | % The \verb|\textwidth| is \printlength{\textwidth} which is also 122 | % \uselengthunit{in}\printlength{\textwidth} and 123 | % \uselengthunit{mm}\printlength{\textwidth}. 
124 | % 125 | % 126 | % CHANGE HISTORY 127 | % 128 | % Version 1.1a (2009/09/03) 129 | % - New maintainer (Will Robertson) 130 | % 131 | % Version 1.1 (2001/12/09) 132 | % - Print rounded values 133 | % - Space between value and units 134 | % 135 | % Version 1.0 (2001/11/03) 136 | % - First public release 137 | % -------------------------------------------------------------------------------- /report/ref.bib: -------------------------------------------------------------------------------- 1 | @book{murphy1999technical, 2 | title={Technical analysis of the financial markets: A comprehensive guide to trading methods and applications}, 3 | author={Murphy, John J}, 4 | year={1999}, 5 | publisher={Penguin} 6 | } 7 | 8 | @article{hollis2018deep, 9 | title={Deep Learning Algorithms Applied to Blockchain-Based Financial Time Series}, 10 | author={Hollis, Thomas}, 11 | year={2018} 12 | } 13 | 14 | @misc{kaggle2017twosigma, 15 | author={Kaggle}, 16 | title={Two Sigma: Using News to Predict Stock Movements}, 17 | howpublished={\url{https://www.kaggle.com/c/two-sigma-financial-news}}, 18 | note = {Accessed: 2018-09-30} 19 | } 20 | 21 | @misc{NYSE2007NYSE, 22 | author={NYSE}, 23 | title={NYSE Group Equities Streamlining}, 24 | howpublished={\url{https://www.nyse.com/publicdocs/nyse/markets/nyse/CCG_Notification_Update1.pdf}}, 25 | note = {Accessed: 2018-09-30} 26 | } 27 | 28 | @article{gamble2009british, 29 | title={British politics and the financial crisis}, 30 | author={Gamble, Andrew}, 31 | journal={British Politics}, 32 | volume={4}, 33 | number={4}, 34 | pages={450--462}, 35 | year={2009}, 36 | publisher={Springer} 37 | } 38 | 39 | @article{malkiel1970efficient, 40 | title={Efficient capital markets: A review of theory and empirical work}, 41 | author={Malkiel, Burton G and Fama, Eugene F}, 42 | journal={The journal of Finance}, 43 | volume={25}, 44 | number={2}, 45 | pages={383--417}, 46 | year={1970}, 47 | publisher={Wiley Online Library} 48 | } 49 | 50 | @book{hamilton1994time, 51 | title={Time series analysis}, 52 | author={Hamilton, James Douglas}, 53 | volume={2}, 54 | year={1994}, 55 | publisher={Princeton university press Princeton, NJ} 56 | } 57 | 58 | @article{walker1931periodicity, 59 | title={On periodicity in series of related terms}, 60 | author={Walker, Gilbert Thomas}, 61 | journal={Proceedings of the Royal Society of London. 
Series A, Containing Papers of a Mathematical and Physical Character}, 62 | volume={131}, 63 | number={818}, 64 | pages={518--532}, 65 | year={1931}, 66 | publisher={The Royal Society London} 67 | } 68 | 69 | @article{burg1968new, 70 | title={A new analysis technique for time series data}, 71 | author={Burg, John Parker}, 72 | journal={Paper presented at NATO Advanced Study Institute on Signal Processing, Enschede, Netherlands, 1968}, 73 | year={1968} 74 | } 75 | 76 | @article{engle1982autoregressive, 77 | title={Autoregressive conditional heteroscedasticity with estimates of the variance of United Kingdom inflation}, 78 | author={Engle, Robert F}, 79 | journal={Econometrica: Journal of the Econometric Society}, 80 | pages={987--1007}, 81 | year={1982}, 82 | publisher={JSTOR} 83 | } 84 | 85 | @article{bollerslev1986generalized, 86 | title={Generalized autoregressive conditional heteroskedasticity}, 87 | author={Bollerslev, Tim}, 88 | journal={Journal of econometrics}, 89 | volume={31}, 90 | number={3}, 91 | pages={307--327}, 92 | year={1986}, 93 | publisher={Elsevier} 94 | } 95 | 96 | @article{engle1993measuring, 97 | title={Measuring and testing the impact of news on volatility}, 98 | author={Engle, Robert F and Ng, Victor K}, 99 | journal={The journal of finance}, 100 | volume={48}, 101 | number={5}, 102 | pages={1749--1778}, 103 | year={1993}, 104 | publisher={Wiley Online Library} 105 | } 106 | 107 | @article{pierre1998estimating, 108 | title={Estimating EGARCH-M models: Science or art?}, 109 | author={Pierre, Eileen F St}, 110 | journal={The Quarterly Review of Economics and Finance}, 111 | volume={38}, 112 | number={2}, 113 | pages={167--180}, 114 | year={1998}, 115 | publisher={Elsevier} 116 | } 117 | 118 | @article{hamzaoui2016glosten, 119 | title={The Glosten-Jagannathan-Runkle-Generalized Autoregressive Conditional Heteroscedastic approach to investigating the foreign exchange forward premium volatility}, 120 | author={Hamzaoui, Nessrine and Regaieg, Boutheina}, 121 | journal={International Journal of Economics and Financial Issues}, 122 | volume={6}, 123 | number={4}, 124 | pages={1608--1615}, 125 | year={2016} 126 | } 127 | 128 | @article{hentschel1995all, 129 | title={All in the family: Nesting symmetric and asymmetric GARCH models}, 130 | author={Hentschel, Ludger and others}, 131 | journal={Journal of Financial Economics}, 132 | volume={39}, 133 | number={1}, 134 | pages={71--104}, 135 | year={1995} 136 | } 137 | 138 | @article{chen1989representations, 139 | title={Representations of non-linear systems: the NARMAX model}, 140 | author={Chen, Sheng and Billings, Steve A}, 141 | journal={International Journal of Control}, 142 | volume={49}, 143 | number={3}, 144 | pages={1013--1032}, 145 | year={1989}, 146 | publisher={Taylor \& Francis} 147 | } 148 | 149 | @article{kohonen1982self, 150 | title={Self-organized formation of topologically correct feature maps}, 151 | author={Kohonen, Teuvo}, 152 | journal={Biological cybernetics}, 153 | volume={43}, 154 | number={1}, 155 | pages={59--69}, 156 | year={1982}, 157 | publisher={Springer} 158 | } 159 | 160 | @article{koskela1998time, 161 | title={Time series prediction using recurrent SOM with local linear models}, 162 | author={Koskela, Timo and Varsta, Markus and Heikkonen, Jukka and Kaski, Kimmo}, 163 | journal={Int. J. 
of Knowledge-Based Intelligent Engineering Systems}, 164 | volume={2}, 165 | number={1}, 166 | pages={60--68}, 167 | year={1998} 168 | } 169 | 170 | @article{kim2003financial, 171 | title={Financial time series forecasting using support vector machines}, 172 | author={Kim, Kyoung-jae}, 173 | journal={Neurocomputing}, 174 | volume={55}, 175 | number={1-2}, 176 | pages={307--319}, 177 | year={2003}, 178 | publisher={Elsevier} 179 | } 180 | 181 | @article{zhang2003time, 182 | title={Time series forecasting using a hybrid ARIMA and neural network model}, 183 | author={Zhang, G Peter}, 184 | journal={Neurocomputing}, 185 | volume={50}, 186 | pages={159--175}, 187 | year={2003}, 188 | publisher={Elsevier} 189 | } 190 | 191 | @inproceedings{krizhevsky2012imagenet, 192 | title={Imagenet classification with deep convolutional neural networks}, 193 | author={Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E}, 194 | booktitle={Advances in neural information processing systems}, 195 | pages={1097--1105}, 196 | year={2012} 197 | } 198 | 199 | @article{chandra2012cooperative, 200 | title={Cooperative coevolution of Elman recurrent neural networks for chaotic time series prediction}, 201 | author={Chandra, Rohitash and Zhang, Mengjie}, 202 | journal={Neurocomputing}, 203 | volume={86}, 204 | pages={116--123}, 205 | year={2012}, 206 | publisher={Elsevier} 207 | } 208 | 209 | @article{lin2009short, 210 | title={Short-term stock price prediction based on echo state networks}, 211 | author={Lin, Xiaowei and Yang, Zehong and Song, Yixu}, 212 | journal={Expert systems with applications}, 213 | volume={36}, 214 | number={3}, 215 | pages={7313--7317}, 216 | year={2009}, 217 | publisher={Elsevier} 218 | } 219 | 220 | @article{kuremoto2014time, 221 | title={Time series forecasting using a deep belief network with restricted Boltzmann machines}, 222 | author={Kuremoto, Takashi and Kimura, Shinsuke and Kobayashi, Kunikazu and Obayashi, Masanao}, 223 | journal={Neurocomputing}, 224 | volume={137}, 225 | pages={47--56}, 226 | year={2014}, 227 | publisher={Elsevier} 228 | } 229 | 230 | @article{hochreiter1997long, 231 | title={Long short-term memory}, 232 | author={Hochreiter, Sepp and Schmidhuber, J{\"u}rgen}, 233 | journal={Neural computation}, 234 | volume={9}, 235 | number={8}, 236 | pages={1735--1780}, 237 | year={1997}, 238 | publisher={MIT Press} 239 | } 240 | 241 | @article{bao2017deep, 242 | title={A deep learning framework for financial time series using stacked autoencoders and long-short term memory}, 243 | author={Bao, Wei and Yue, Jun and Rao, Yulei}, 244 | journal={PloS one}, 245 | volume={12}, 246 | number={7}, 247 | pages={e0180944}, 248 | year={2017}, 249 | publisher={Public Library of Science} 250 | } 251 | 252 | @inproceedings{vaswani2017attention, 253 | title={Attention is all you need}, 254 | author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, 255 | booktitle={Advances in Neural Information Processing Systems}, 256 | pages={5998--6008}, 257 | year={2017} 258 | } 259 | 260 | @article{bahdanau2014neural, 261 | title={Neural machine translation by jointly learning to align and translate}, 262 | author={Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua}, 263 | journal={arXiv preprint arXiv:1409.0473}, 264 | year={2014} 265 | } 266 | 267 | @article{graves2014neural, 268 | title={Neural turing machines}, 269 | author={Graves, Alex and Wayne, Greg and Danihelka, Ivo}, 270 | 
journal={arXiv preprint arXiv:1410.5401}, 271 | year={2014} 272 | } 273 | 274 | @inproceedings{xu2015show, 275 | title={Show, attend and tell: Neural image caption generation with visual attention}, 276 | author={Xu, Kelvin and Ba, Jimmy and Kiros, Ryan and Cho, Kyunghyun and Courville, Aaron and Salakhudinov, Ruslan and Zemel, Rich and Bengio, Yoshua}, 277 | booktitle={International conference on machine learning}, 278 | pages={2048--2057}, 279 | year={2015} 280 | } 281 | 282 | @article{firat2016multi, 283 | title={Multi-way, multilingual neural machine translation with a shared attention mechanism}, 284 | author={Firat, Orhan and Cho, Kyunghyun and Bengio, Yoshua}, 285 | journal={arXiv preprint arXiv:1601.01073}, 286 | year={2016} 287 | } 288 | 289 | @article{peters2018deep, 290 | title={Deep contextualized word representations}, 291 | author={Peters, Matthew E and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke}, 292 | journal={arXiv preprint arXiv:1802.05365}, 293 | year={2018} 294 | } 295 | 296 | @article{bergmeir2012use, 297 | title={On the use of cross-validation for time series predictor evaluation}, 298 | author={Bergmeir, Christoph and Ben{\'\i}tez, Jos{\'e} M}, 299 | journal={Information Sciences}, 300 | volume={191}, 301 | pages={192--213}, 302 | year={2012}, 303 | publisher={Elsevier} 304 | } 305 | 306 | @article{kingma2014adam, 307 | title={Adam: A method for stochastic optimization}, 308 | author={Kingma, Diederik P and Ba, Jimmy}, 309 | journal={arXiv preprint arXiv:1412.6980}, 310 | year={2014} 311 | } -------------------------------------------------------------------------------- /report/temp: -------------------------------------------------------------------------------- 1 | temp 2 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | from os import listdir 6 | from os.path import isfile, join 7 | import pandas as pd 8 | import pickle 9 | 10 | 11 | def EDA(): 12 | '''Prints a brief overview of the data. 13 | 14 | Parameters 15 | ---------- 16 | none 17 | 18 | Returns 19 | ------- 20 | none 21 | 22 | ''' 23 | print(X_train.shape, y_train.shape) 24 | print(y_train.head()) 25 | print(X_train.head()) 26 | print(X_train.info()) 27 | print(X_train.describe()) 28 | 29 | 30 | def plot_asset(market, assetCode): 31 | '''Plots an asset's price, volatility and voume. 32 | 33 | Parameters 34 | ---------- 35 | market_df : dataframe 36 | See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe. 37 | assetCode : string 38 | The asset code of the instrument you want to plot 39 | 40 | Returns 41 | ------- 42 | none 43 | 44 | ''' 45 | # Set plot style 46 | plt.style.use('seaborn') 47 | 48 | # Fetch the asset from data 49 | ass_market = market[market['assetCode'] == assetCode] 50 | ass_market.index = ass_market.time 51 | 52 | # Setup 3 subplots 53 | f, axs = plt.subplots(2,1, sharex=True, figsize=(12,8)) 54 | 55 | # Subplot 1. Close price 56 | ass_market.close.plot(ax=axs[0], color='black') 57 | axs[0].set_ylabel("Price") 58 | 59 | # Subplot 2. 
Volatility 60 | volat_df = (ass_market.close - ass_market.open) 61 | (ass_market.close - ass_market.open).plot(color='darkred', ax = axs[1]) 62 | axs[1].set_ylabel("Volatility") 63 | 64 | # Show all subplots with label 65 | f.suptitle("Asset: %s" % assetCode, fontsize=22) 66 | plt.tight_layout() 67 | plt.subplots_adjust(top=0.93) 68 | plt.show() 69 | 70 | 71 | def plot_chosen_assets(): 72 | '''Prints a group of select stocks, their price and their volatility. 73 | 74 | Parameters 75 | ---------- 76 | none 77 | 78 | Returns 79 | ------- 80 | none 81 | 82 | ''' 83 | # Huge stocks (market cap 200BN - 1000BN) 84 | #plot_asset(market_train_df, "GOOGL.O") #nonsense data? 85 | #plot_asset(market_train_df, "AAPL.O") #randomly crashes from 2013-2015? 86 | #plot_asset(market_train_df, "FB.O") #Facebook: correct, verified, unpredictable volatility 87 | plot_asset(market_train_df, "INTC.O") #Intel: correct, verified, fair constant volatility 88 | plot_asset(market_train_df, "WFC.N") #Wells Fargo: correct, verified, crash volatility 89 | plot_asset(market_train_df, "AMZN.O") #Amazon: correct, verified, increasing volatility 90 | 91 | # SMEs (5-20Bn MC) 92 | #plot_asset(market_train_df, "ADI.N") #Analogue Devices (32Bn MC): kinda correct (one weird correction), verified 93 | #plot_asset(market_train_df, "NATI.O") #NI (6Bn MC): kinda correct (one weird correction in middle), verified 94 | plot_asset(market_train_df, "A.N") #Agilent Tech (20Bn MC): kinda correct (one weird correction toward end), verified 95 | 96 | # Small stocks (MC < 1Bn) 97 | #plot_asset(market_train_df, "ANDE.O") #Andersons (900M MC): unverified, high vol 98 | #plot_asset(market_train_df, "ALO.N") #Alio Gold (90M MC): unverified, low vol 99 | plot_asset(market_train_df, "BHE.N") #Benchmark Electronics (1Bn MC): verified, low vol 100 | 101 | 102 | def get_models_list(asset): 103 | 104 | # Import the list of models from the directory into a dataframe 105 | models_path = './data/models' 106 | models = [f for f in listdir(models_path) if isfile(join(models_path, f))] 107 | models = pd.DataFrame(models) 108 | 109 | # Strip the file extension 110 | models = models[0].str[:-5] 111 | 112 | # Split the string in multiple columns 113 | models = models.str.split('-', expand=True) 114 | 115 | # Remove the 'best-lstm' prefix columns 116 | models = models.drop([0 ,1], axis=1) 117 | 118 | # Set column names 119 | models.columns = ['epoch', 'val_loss', 'asset', 120 | 'lstm_size', 'lag', 'dropout'] 121 | 122 | # Cast to numeric 123 | models['epoch'] = pd.to_numeric(models['epoch']) 124 | models['val_loss'] = pd.to_numeric(models['val_loss']) 125 | models['lstm_size'] = pd.to_numeric(models['lstm_size']) 126 | models['lag'] = pd.to_numeric(models['lag']) 127 | models['dropout'] = pd.to_numeric(models['dropout']) 128 | 129 | # Filter for the asset 130 | models = models[models['asset'] == asset] 131 | 132 | return models 133 | 134 | 135 | def plot_train_loss(history, ylim=(0, 0.01), xlim=(0, 50)): 136 | plt.ylim(ylim) 137 | plt.xlim(xlim) 138 | 139 | plt.plot(history['loss']) 140 | plt.plot(history['val_loss']) 141 | 142 | plt.xlabel('Epoch') 143 | plt.ylabel('Mean Absolute Error Loss') 144 | plt.title('Training Loss') 145 | plt.legend(['Train','Val']) 146 | plt.show() 147 | 148 | 149 | def get_history_from_file(file): 150 | with open(file, 'rb') as f: 151 | return pickle.load(f) 152 | 153 | 154 | def get_history_from_params(path, asset, lstm_size, lag, dropout): 155 | path = '{}/lstm-{}-{}-{}-{}.pickle'.format( 156 | path, asset, lstm_size, lag, 
dropout) 157 | with open(path, 'rb') as f: 158 | return pickle.load(f) 159 | 160 | if __name__ == '__main__': 161 | hist = get_history_from_params( 162 | './data/history/fixedpoint', 163 | 'INTC.O', 32, 30, 15) 164 | plot_train_loss(hist, xlim=(20, 100)) 165 | --------------------------------------------------------------------------------