├── .gitignore
├── README.md
├── models
│   ├── Experiments2_VanillaRNN.py
│   ├── __init__.py
│   ├── attention_decoder.py
│   ├── data_cleaning.py
│   ├── data_partitioning.py
│   ├── lstm_attention_v0.py
│   ├── lstm_attention_v1.py
│   ├── lstm_v01.py
│   ├── lstm_v02.py
│   ├── lstm_v02_analysis.py
│   ├── lstm_v03.py
│   ├── lstm_v03_analysis.py
│   ├── lstm_v04.py
│   ├── lstm_v04_analysis.py
│   ├── lstm_v05.py
│   └── lstm_v05_analysis.py
├── notebooks
│   ├── __init__.py
│   ├── avis-kernel.ipynb
│   ├── data_cleaning.ipynb
│   ├── exploration-filter-non-continuous-news.ipynb
│   ├── exploration-filter-non-continuous-stocks.ipynb
│   ├── se_kernel_v0.ipynb
│   └── se_kernel_v1.py
├── report
│   ├── Diagram.png
│   ├── LSTMAgrid.png
│   ├── LSTMgrid.png
│   ├── Shuffling.png
│   ├── Stocks.png
│   ├── lstm_att_v0_ts_1_drop_04_cells_64.png
│   ├── lstm_att_v0_ts_5_drop_0_cells_64.png
│   ├── lstm_plot1.png
│   ├── lstm_plot2.png
│   ├── main.bbl
│   ├── main.pdf
│   ├── main.tex
│   ├── nicefrac.sty
│   ├── nips_2016.sty
│   ├── printlen.sty
│   ├── ref.bib
│   └── temp
└── utils
    └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | data/*
2 | .ipynb*
3 | .DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LSTM-Attention
2 |
3 | A Comparison of LSTMs and Attention Mechanisms for Forecasting Financial Time Series - [read it here](https://github.com/PsiPhiTheta/LSTM-Attention/tree/master/report/main.pdf).
4 |
--------------------------------------------------------------------------------
/models/Experiments2_VanillaRNN.py:
--------------------------------------------------------------------------------
1 | # Vanilla RNN
2 |
3 | # This is a vanilla, rudimentary RNN with a provisional structure that
4 | # cannot yet be tested (waiting on data from Antoine) and will most likely
5 | # need tweaking once data is available. Assumptions: single-step-ahead
6 | # forecasting using 10 days of data as 'features'; the output is a predicted
7 | # value, i.e. the higher the predicted value, the higher the confidence that
8 | # the asset goes up, and vice versa. Further details in my journal Google Doc.
9 |
10 | # Since I have no knowledge of Keras, this follows the tutorial on
11 | # machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
12 | # Full Keras documentation can be found here: https://keras.io/layers/recurrent/
13 |
14 | # 1. Import dependencies
15 | import numpy as np
16 | import matplotlib.pyplot as plt
17 | from math import sqrt
18 | from sklearn.metrics import mean_squared_error
19 | from keras.models import Sequential
20 | from keras.layers import Dense
21 | from keras.layers import LSTM
22 |
23 | # 2. Functions
24 | def antoineData():
25 | # Antoine's script will go here, prelim data will be 'assetCode',
26 | # 'time', 'volume', 'open', 'returnsOpenPrevMktres1',
27 | # 'returnsOpenPrevMkres10', 'returnsOpenNextMktres10',
28 | # 'sentimentNegative', 'sentimentNeutral'
29 | return 0
30 |
31 | # 3. Import data
32 | x_train, y_train, x_test, y_test = antoineData()
33 |
34 | # 4. Build model from Keras
35 | model = Sequential() # Sequential model is a linear stack of layers
36 | model.add(LSTM(50, input_shape=(x_train.shape[1], x_train.shape[2]))) # adds LSTM layer
37 | model.add(Dense(1)) # adds a dense layer
38 | model.compile(loss='mae', optimizer='adam') # sets the loss as mean absolute error and the optimiser as ADAM
39 |
40 | # 5. Fit RNN
41 | history = model.fit(x_train, y_train, epochs=50, batch_size=72, validation_data=(x_test, y_test), verbose=2, shuffle=False) # fits
42 |
43 | # 6. Plot history
44 | plt.plot(history.history['loss'], label='train')
45 | plt.plot(history.history['val_loss'], label='test')
46 | plt.legend()
47 | plt.show()
48 |
49 | # make a prediction
50 | y_hat = model.predict(x_test)
51 | # calculate the error (can modify this for accuracy instead if needed using skl)
52 | RMSE = sqrt(mean_squared_error(y_test, y_hat))
--------------------------------------------------------------------------------
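Note: the header comment above describes feeding 10 days of data as 'features' for single-step-ahead prediction, but antoineData() is still a stub, so the windowing step never appears in the file. Below is a minimal sketch of how such windows could be built, assuming the data eventually arrives as a 2-D array of shape (days, features); the helper name make_windows is illustrative and not part of the repository.

import numpy as np

def make_windows(data, targets, window=10):
    """Slice a (days, features) array into overlapping windows of `window` days."""
    X, y = [], []
    for start in range(len(data) - window):
        X.append(data[start:start + window])   # `window` consecutive days of features
        y.append(targets[start + window])      # target for the following day
    return np.array(X), np.array(y)

# Toy example: 100 days, 8 features
X, y = make_windows(np.random.rand(100, 8), np.random.rand(100), window=10)
print(X.shape, y.shape)  # (90, 10, 8) (90,) -- ready for LSTM(input_shape=(10, 8))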
/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/models/__init__.py
--------------------------------------------------------------------------------
/models/attention_decoder.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from keras import backend as K
3 | from keras import regularizers, constraints, initializers, activations
4 | from keras.layers.recurrent import Recurrent
5 | from keras.engine import InputSpec
6 |
7 |
8 | tfPrint = lambda d, T: tf.Print(input_=T, data=[T, tf.shape(T)], message=d)
9 |
10 |
11 | def time_distributed_dense(x, w, b=None, dropout=None,
12 | input_dim=None, output_dim=None, timesteps=None):
13 | '''Apply y.w + b for every temporal slice y of x.
14 | '''
15 | if not input_dim:
16 | # won't work with TensorFlow
17 | input_dim = K.shape(x)[2]
18 | if not timesteps:
19 | # won't work with TensorFlow
20 | timesteps = K.shape(x)[1]
21 | if not output_dim:
22 | # won't work with TensorFlow
23 | output_dim = K.shape(w)[1]
24 |
25 | if dropout:
26 | # apply the same dropout pattern at every timestep
27 | ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
28 | dropout_matrix = K.dropout(ones, dropout)
29 | expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
30 | x *= expanded_dropout_matrix
31 |
32 | # collapse time dimension and batch dimension together
33 | x = K.reshape(x, (-1, input_dim))
34 |
35 | x = K.dot(x, w)
36 |     if b is not None:
37 | x = x + b
38 | # reshape to 3D tensor
39 | x = K.reshape(x, (-1, timesteps, output_dim))
40 | return x
41 |
42 | class AttentionDecoder(Recurrent):
43 |
44 | def __init__(self, units, output_dim,
45 | activation='tanh',
46 | return_probabilities=False,
47 | name='AttentionDecoder',
48 | kernel_initializer='glorot_uniform',
49 | recurrent_initializer='orthogonal',
50 | bias_initializer='zeros',
51 | kernel_regularizer=None,
52 | bias_regularizer=None,
53 | activity_regularizer=None,
54 | kernel_constraint=None,
55 | bias_constraint=None,
56 | **kwargs):
57 | """
58 | Implements an AttentionDecoder that takes in a sequence encoded by an
59 | encoder and outputs the decoded states
60 | :param units: dimension of the hidden state and the attention matrices
61 | :param output_dim: the number of labels in the output space
62 |
63 | references:
64 | Bahdanau, Dzmitry, Kyunghyun Cho, and Yoshua Bengio.
65 | "Neural machine translation by jointly learning to align and translate."
66 | arXiv preprint arXiv:1409.0473 (2014).
67 | """
68 | self.units = units
69 | self.output_dim = output_dim
70 | self.return_probabilities = return_probabilities
71 | self.activation = activations.get(activation)
72 | self.kernel_initializer = initializers.get(kernel_initializer)
73 | self.recurrent_initializer = initializers.get(recurrent_initializer)
74 | self.bias_initializer = initializers.get(bias_initializer)
75 |
76 | self.kernel_regularizer = regularizers.get(kernel_regularizer)
77 | self.recurrent_regularizer = regularizers.get(kernel_regularizer)
78 | self.bias_regularizer = regularizers.get(bias_regularizer)
79 | self.activity_regularizer = regularizers.get(activity_regularizer)
80 |
81 | self.kernel_constraint = constraints.get(kernel_constraint)
82 | self.recurrent_constraint = constraints.get(kernel_constraint)
83 | self.bias_constraint = constraints.get(bias_constraint)
84 |
85 | super(AttentionDecoder, self).__init__(**kwargs)
86 | self.name = name
87 | self.return_sequences = True # must return sequences
88 |
89 | def build(self, input_shape):
90 | """
91 | See Appendix 2 of Bahdanau 2014, arXiv:1409.0473
92 | for model details that correspond to the matrices here.
93 | """
94 |
95 | self.batch_size, self.timesteps, self.input_dim = input_shape
96 |
97 | if self.stateful:
98 | super(AttentionDecoder, self).reset_states()
99 |
100 | self.states = [None, None] # y, s
101 |
102 | """
103 | Matrices for creating the context vector
104 | """
105 |
106 | self.V_a = self.add_weight(shape=(self.units,),
107 | name='V_a',
108 | initializer=self.kernel_initializer,
109 | regularizer=self.kernel_regularizer,
110 | constraint=self.kernel_constraint)
111 | self.W_a = self.add_weight(shape=(self.units, self.units),
112 | name='W_a',
113 | initializer=self.kernel_initializer,
114 | regularizer=self.kernel_regularizer,
115 | constraint=self.kernel_constraint)
116 | self.U_a = self.add_weight(shape=(self.input_dim, self.units),
117 | name='U_a',
118 | initializer=self.kernel_initializer,
119 | regularizer=self.kernel_regularizer,
120 | constraint=self.kernel_constraint)
121 | self.b_a = self.add_weight(shape=(self.units,),
122 | name='b_a',
123 | initializer=self.bias_initializer,
124 | regularizer=self.bias_regularizer,
125 | constraint=self.bias_constraint)
126 | """
127 | Matrices for the r (reset) gate
128 | """
129 | self.C_r = self.add_weight(shape=(self.input_dim, self.units),
130 | name='C_r',
131 | initializer=self.recurrent_initializer,
132 | regularizer=self.recurrent_regularizer,
133 | constraint=self.recurrent_constraint)
134 | self.U_r = self.add_weight(shape=(self.units, self.units),
135 | name='U_r',
136 | initializer=self.recurrent_initializer,
137 | regularizer=self.recurrent_regularizer,
138 | constraint=self.recurrent_constraint)
139 | self.W_r = self.add_weight(shape=(self.output_dim, self.units),
140 | name='W_r',
141 | initializer=self.recurrent_initializer,
142 | regularizer=self.recurrent_regularizer,
143 | constraint=self.recurrent_constraint)
144 | self.b_r = self.add_weight(shape=(self.units, ),
145 | name='b_r',
146 | initializer=self.bias_initializer,
147 | regularizer=self.bias_regularizer,
148 | constraint=self.bias_constraint)
149 |
150 | """
151 | Matrices for the z (update) gate
152 | """
153 | self.C_z = self.add_weight(shape=(self.input_dim, self.units),
154 | name='C_z',
155 | initializer=self.recurrent_initializer,
156 | regularizer=self.recurrent_regularizer,
157 | constraint=self.recurrent_constraint)
158 | self.U_z = self.add_weight(shape=(self.units, self.units),
159 | name='U_z',
160 | initializer=self.recurrent_initializer,
161 | regularizer=self.recurrent_regularizer,
162 | constraint=self.recurrent_constraint)
163 | self.W_z = self.add_weight(shape=(self.output_dim, self.units),
164 | name='W_z',
165 | initializer=self.recurrent_initializer,
166 | regularizer=self.recurrent_regularizer,
167 | constraint=self.recurrent_constraint)
168 | self.b_z = self.add_weight(shape=(self.units, ),
169 | name='b_z',
170 | initializer=self.bias_initializer,
171 | regularizer=self.bias_regularizer,
172 | constraint=self.bias_constraint)
173 | """
174 | Matrices for the proposal
175 | """
176 | self.C_p = self.add_weight(shape=(self.input_dim, self.units),
177 | name='C_p',
178 | initializer=self.recurrent_initializer,
179 | regularizer=self.recurrent_regularizer,
180 | constraint=self.recurrent_constraint)
181 | self.U_p = self.add_weight(shape=(self.units, self.units),
182 | name='U_p',
183 | initializer=self.recurrent_initializer,
184 | regularizer=self.recurrent_regularizer,
185 | constraint=self.recurrent_constraint)
186 | self.W_p = self.add_weight(shape=(self.output_dim, self.units),
187 | name='W_p',
188 | initializer=self.recurrent_initializer,
189 | regularizer=self.recurrent_regularizer,
190 | constraint=self.recurrent_constraint)
191 | self.b_p = self.add_weight(shape=(self.units, ),
192 | name='b_p',
193 | initializer=self.bias_initializer,
194 | regularizer=self.bias_regularizer,
195 | constraint=self.bias_constraint)
196 | """
197 | Matrices for making the final prediction vector
198 | """
199 | self.C_o = self.add_weight(shape=(self.input_dim, self.output_dim),
200 | name='C_o',
201 | initializer=self.recurrent_initializer,
202 | regularizer=self.recurrent_regularizer,
203 | constraint=self.recurrent_constraint)
204 | self.U_o = self.add_weight(shape=(self.units, self.output_dim),
205 | name='U_o',
206 | initializer=self.recurrent_initializer,
207 | regularizer=self.recurrent_regularizer,
208 | constraint=self.recurrent_constraint)
209 | self.W_o = self.add_weight(shape=(self.output_dim, self.output_dim),
210 | name='W_o',
211 | initializer=self.recurrent_initializer,
212 | regularizer=self.recurrent_regularizer,
213 | constraint=self.recurrent_constraint)
214 | self.b_o = self.add_weight(shape=(self.output_dim, ),
215 | name='b_o',
216 | initializer=self.bias_initializer,
217 | regularizer=self.bias_regularizer,
218 | constraint=self.bias_constraint)
219 |
220 | # For creating the initial state:
221 | self.W_s = self.add_weight(shape=(self.input_dim, self.units),
222 | name='W_s',
223 | initializer=self.recurrent_initializer,
224 | regularizer=self.recurrent_regularizer,
225 | constraint=self.recurrent_constraint)
226 |
227 | self.input_spec = [
228 | InputSpec(shape=(self.batch_size, self.timesteps, self.input_dim))]
229 | self.built = True
230 |
231 | def call(self, x):
232 | # store the whole sequence so we can "attend" to it at each timestep
233 | self.x_seq = x
234 |
235 |         # apply a dense layer over the time dimension of the sequence
236 |         # do it here because it doesn't depend on any previous steps,
237 |         # therefore we can save computation time:
238 | self._uxpb = time_distributed_dense(self.x_seq, self.U_a, b=self.b_a,
239 | input_dim=self.input_dim,
240 | timesteps=self.timesteps,
241 | output_dim=self.units)
242 |
243 | return super(AttentionDecoder, self).call(x)
244 |
245 | def get_initial_state(self, inputs):
246 | # apply the matrix on the first time step to get the initial s0.
247 | s0 = activations.tanh(K.dot(inputs[:, 0], self.W_s))
248 |
249 | # from keras.layers.recurrent to initialize a vector of (batchsize,
250 | # output_dim)
251 | y0 = K.zeros_like(inputs) # (samples, timesteps, input_dims)
252 | y0 = K.sum(y0, axis=(1, 2)) # (samples, )
253 | y0 = K.expand_dims(y0) # (samples, 1)
254 | y0 = K.tile(y0, [1, self.output_dim])
255 |
256 | return [y0, s0]
257 |
258 | def step(self, x, states):
259 |
260 | ytm, stm = states
261 |
262 | # repeat the hidden state to the length of the sequence
263 | _stm = K.repeat(stm, self.timesteps)
264 |
265 |         # now multiply the weight matrix with the repeated hidden state
266 | _Wxstm = K.dot(_stm, self.W_a)
267 |
268 | # calculate the attention probabilities
269 | # this relates how much other timesteps contributed to this one.
270 | et = K.dot(activations.tanh(_Wxstm + self._uxpb),
271 | K.expand_dims(self.V_a))
272 | at = K.exp(et)
273 | at_sum = K.sum(at, axis=1)
274 | at_sum_repeated = K.repeat(at_sum, self.timesteps)
275 | at /= at_sum_repeated # vector of size (batchsize, timesteps, 1)
276 |
277 | # calculate the context vector
278 | context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)
279 | # ~~~> calculate new hidden state
280 | # first calculate the "r" gate:
281 |
282 | rt = activations.sigmoid(
283 | K.dot(ytm, self.W_r)
284 | + K.dot(stm, self.U_r)
285 | + K.dot(context, self.C_r)
286 | + self.b_r)
287 |
288 | # now calculate the "z" gate
289 | zt = activations.sigmoid(
290 | K.dot(ytm, self.W_z)
291 | + K.dot(stm, self.U_z)
292 | + K.dot(context, self.C_z)
293 | + self.b_z)
294 |
295 | # calculate the proposal hidden state:
296 | s_tp = activations.tanh(
297 | K.dot(ytm, self.W_p)
298 | + K.dot((rt * stm), self.U_p)
299 | + K.dot(context, self.C_p)
300 | + self.b_p)
301 |
302 | # new hidden state:
303 | st = (1-zt)*stm + zt * s_tp
304 |
305 | yt = activations.softmax(
306 | K.dot(ytm, self.W_o)
307 | + K.dot(stm, self.U_o)
308 | + K.dot(context, self.C_o)
309 | + self.b_o)
310 |
311 | if self.return_probabilities:
312 | return at, [yt, st]
313 | else:
314 | return yt, [yt, st]
315 |
316 | def compute_output_shape(self, input_shape):
317 | """
318 |         For Keras internal compatibility checking
319 | """
320 | if self.return_probabilities:
321 | return (None, self.timesteps, self.timesteps)
322 | else:
323 | return (None, self.timesteps, self.output_dim)
324 |
325 | def get_config(self):
326 | """
327 | For rebuilding models on load time.
328 | """
329 | config = {
330 | 'output_dim': self.output_dim,
331 | 'units': self.units,
332 | 'return_probabilities': self.return_probabilities
333 | }
334 | base_config = super(AttentionDecoder, self).get_config()
335 | return dict(list(base_config.items()) + list(config.items()))
336 |
--------------------------------------------------------------------------------
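Note: the class docstring above states that AttentionDecoder consumes a sequence produced by an encoder and emits a decoded sequence. Below is a minimal usage sketch, assuming the Keras 2.0.x / TensorFlow 1.x stack this layer is written against and purely illustrative dimensions; the actual wiring used in this project is in models/lstm_attention_v1.py.

import numpy as np
from keras.models import Sequential
from keras.layers import LSTM
from models.attention_decoder import AttentionDecoder

timesteps, n_features = 5, 40  # illustrative sizes only

model = Sequential()
# Encoder: return the full hidden-state sequence so the decoder can attend over it
model.add(LSTM(64, input_shape=(timesteps, n_features), return_sequences=True))
# Decoder: 64 attention/hidden units, n_features outputs per timestep
model.add(AttentionDecoder(64, n_features))
model.compile(loss='mse', optimizer='adam')

X = np.random.rand(32, timesteps, n_features)  # (batch, timesteps, features)
print(model.predict(X).shape)                  # (32, 5, 40)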
/models/data_cleaning.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | from itertools import chain
5 |
6 |
7 | MARKET_DATA_PATH = './data/raw/market_train_df.csv'
8 | NEWS_DATA_PATH = './data/raw/news_train_df.csv'
9 |
10 |
11 | def clean_market_data(market_df, train=True):
12 | '''Clean and preprocess the market data for training or testing.
13 |
14 | Parameters
15 | ----------
16 | market_df : dataframe
17 | See https://www.kaggle.com/c/two-sigma-financial-news/data for full
18 | description of the dataframe.
19 | train : bool
20 | When true, adds the target variable to the dataframe.
21 |
22 | Returns
23 | -------
24 | dataframe
25 | Cleaned market data.
26 |
27 | '''
28 | # Select wanted columns
29 | if train:
30 | cols = ['assetCode', 'time', 'volume', 'open', 'returnsOpenPrevMktres1',
31 | 'returnsOpenPrevMktres10', 'returnsOpenNextMktres10']
32 | else:
33 | cols = ['assetCode', 'time', 'volume', 'open', 'returnsOpenPrevMktres1',
34 | 'returnsOpenPrevMktres10']
35 | market_df = market_df.loc[:,cols]
36 |
37 | # Drop NA
38 | market_df.dropna(inplace=True)
39 |
40 |     # Keep only stocks that cover the full time series
41 |     series_len = market_df.time.nunique()
42 |     market_df = market_df.groupby('assetCode').filter(lambda x: len(x) == series_len)
43 | assert (market_df.groupby('assetCode').size() == series_len).all()
44 |
45 | # Normalize time
46 | market_df.loc[:, 'time'] = pd.to_datetime(market_df.time).dt.normalize()
47 |
48 | return market_df
49 |
50 |
51 |
52 | def clean_news_data(news_df):
53 | '''Clean and preprocess the news data for training or testing.
54 |
55 | Parameters
56 | ----------
57 | news_df : dataframe
58 | See https://www.kaggle.com/c/two-sigma-financial-news/data for full
59 | description of the dataframe.
60 |
61 | Returns
62 | -------
63 | dataframe
64 | Cleaned news data.
65 |
66 | '''
67 | # Select columns and drop NA
68 | cols = ['time','assetCodes', 'sentimentNegative', 'sentimentNeutral',
69 | 'sentimentPositive', 'urgency', 'provider', 'bodySize', 'relevance']
70 | news_df = news_df.loc[:,cols]
71 | news_df.dropna(inplace=True)
72 |
73 | # Normalize time
74 | news_df.loc[:, 'time'] = pd.to_datetime(news_df.time).dt.normalize()
75 |
76 | # assetCodes from String to List
77 |     news_df['assetCodes'] = news_df['assetCodes'].str.findall(r"'([\w\./]+)'")
78 |
79 | # Explode news on assetCodes
80 | assetCodes_expanded = list(chain(*news_df['assetCodes']))
81 | assetCodes_index = news_df.index.repeat(news_df['assetCodes'].apply(len))
82 | assert len(assetCodes_expanded) == len(assetCodes_index)
83 |
84 | assetCodes_df = pd.DataFrame({'index': assetCodes_index, 'assetCode': assetCodes_expanded})
85 | news_df_exploded = news_df.merge(assetCodes_df, 'right', right_on='index', left_index=True, validate='1:m')
86 | news_df_exploded.drop(['assetCodes', 'index'], 1, inplace=True)
87 |
88 | # Compute means for same date and assetCode
89 | news_agg_dict = {
90 | 'sentimentNegative':'mean',
91 | 'sentimentNeutral':'mean',
92 | 'sentimentPositive':'mean',
93 | 'urgency':'mean',
94 | 'bodySize':'mean',
95 | 'relevance':'mean'
96 | }
97 | news_df_agg = news_df_exploded.groupby(['time', 'assetCode'], as_index=False).agg(news_agg_dict)
98 |
99 | # Add provider information
100 | idx = news_df_exploded.groupby(['time', 'assetCode'])['urgency'].transform(max) == news_df_exploded['urgency']
101 | news_df_exploded_2 = news_df_exploded[idx][['time', 'assetCode', 'provider']].drop_duplicates(['time', 'assetCode'])
102 | news_df_agg = news_df_agg.merge(news_df_exploded_2, 'left', ['time', 'assetCode'])
103 |
104 | # One-hot encoding provider
105 | ohe_provider = pd.get_dummies(news_df_agg['provider'])
106 | news_df_agg = pd.concat([news_df_agg, ohe_provider], axis=1).drop(['provider'], axis=1)
107 |
108 | return news_df_agg
109 |
110 |
111 |
112 | def clean_data(market_df, news_df, train=True):
113 | '''Clean and preprocess the news and market data for training then merge
114 | them, to create a train set or test set.
115 |
116 | Parameters
117 | ----------
118 | market_df : dataframe
119 | See https://www.kaggle.com/c/two-sigma-financial-news/data for full
120 | description of the dataframe.
121 | news_df : dataframe
122 | See https://www.kaggle.com/c/two-sigma-financial-news/data for full
123 | description of the dataframe.
124 | train : bool
125 | When true, creates both the input features and the target dataframes.
126 |
127 | Returns
128 | -------
129 | dataframe
130 | Cleaned data ready to be fed to the model. Returns both the input and
131 | the target dataframes when train=True.
132 |
133 | '''
134 | cleaned_market_df = clean_market_data(market_df, train)
135 | cleaned_news_df = clean_news_data(news_df)
136 |
137 | # Merge on market data
138 | df_merged = cleaned_market_df.merge(cleaned_news_df, 'left', ['time', 'assetCode'])
139 |
140 | if train:
141 | y = df_merged['returnsOpenNextMktres10']
142 | X = df_merged.drop(['returnsOpenNextMktres10'], axis=1)
143 | return X, y
144 | else:
145 | return df_merged
146 |
147 |
148 | def extract_asset(X_train, y_train, assetCode):
149 | '''Extracts the training data for a particular asset
150 |
151 | Parameters
152 | ----------
153 | X_train : dataframe
154 | Dataframe containing all the assets' training data.
155 | y_train : dataframe
156 | Dataframe containing all the assets' labels.
157 | assetCode : String.
158 | Asset code of asset to be extracted.
159 |
160 | Returns
161 | -------
162 | dataframe
163 | Dataframe containing data for only the chosen assetCode.
164 | dataframe
165 | Dataframe containing label for only the chosen assetCode
166 |
167 | '''
168 | X_train_asset = X_train[X_train['assetCode']==assetCode]
169 | y_train_asset = X_train.join(y_train)
170 | y_train_asset = y_train_asset[y_train_asset['assetCode']==assetCode]
171 | y_train_asset = y_train_asset.T.tail(1).T
172 |
173 | return X_train_asset.copy(), y_train_asset.copy()
174 |
175 |
176 | def generate_cleaned_filtered_data(market_data_path, news_data_path,
177 | save_path, assetCodes):
178 | ''' Imports the raw data, cleans and filters it and then saves it.
179 |
180 | Parameters
181 | ----------
182 | market_data_path : String
183 | The path to the raw market data.
184 | news_data_path : String
185 | The path to the raw news data.
186 | save_path : String
187 | The path where to save the cleaned and filtered data.
188 |     assetCodes : List of Strings
189 |         The asset codes to keep in the dataset.
190 |
191 | '''
192 |     print('Reading CSV files...')
193 |     market_train_df = pd.read_csv(market_data_path)
194 |     news_train_df = pd.read_csv(news_data_path)
195 |
196 |     print('Cleaning data...')
197 |     X_train, y_train = clean_data(market_train_df, news_train_df)
198 |
199 |     # Keep only the requested assets
200 |     print('Extracting assets {}...'.format(assetCodes))
201 |     X_train_asset = X_train[X_train['assetCode'].isin(assetCodes)]
202 |     cleaned_filtered_data = X_train_asset.join(y_train)
203 |
204 |     print('Saving cleaned and filtered data to {}.'.format(save_path))
205 |     cleaned_filtered_data.to_csv(save_path)
206 |     print('It can now be retrieved using get_cleaned_filtered_data()')
207 |
208 |
209 | def get_cleaned_filtered_data(path):
210 | ''' Fetches the data from the CSV file generated by
211 |     generate_cleaned_filtered_data.
212 |
213 | Parameters
214 | ----------
215 | path : String
216 | The path to the cleaned and filtered data.
217 |
218 | Returns
219 | -------
220 | dataframe
221 | Dataframe containing the features (X).
222 | dataframe
223 | Dataframe containing the label (y).
224 | '''
225 |
226 | df = pd.read_csv(path)
227 | y = df['returnsOpenNextMktres10']
228 | X = df.drop(['returnsOpenNextMktres10'], axis=1)
229 | return X, y
230 |
231 |
232 | if __name__ == '__main__':
233 | pass
--------------------------------------------------------------------------------
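Note: a short usage sketch of the two entry points above, assuming the raw Kaggle CSVs live under ./data/raw/ and that ./data/processed/ exists; the paths and the asset list mirror the constants used elsewhere in this repository.

from models.data_cleaning import (generate_cleaned_filtered_data,
                                  get_cleaned_filtered_data)

ASSETS = ['INTC.O', 'WFC.N', 'AMZN.O', 'A.N', 'BHE.N']
SAVE_PATH = './data/processed/cleaned_filtered_data.csv'

# One-off: read, clean and merge the raw CSVs, keep only ASSETS, write to disk
generate_cleaned_filtered_data('./data/raw/market_train_df.csv',
                               './data/raw/news_train_df.csv',
                               SAVE_PATH, ASSETS)

# Later: reload the cleaned subset as features X and target y
X, y = get_cleaned_filtered_data(SAVE_PATH)
print(X.shape, y.shape)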
/models/data_partitioning.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 |
5 |
6 | def validate_df(X, y, sort_column='time'):
7 | ''' Validate the dataset
8 |
9 | Parameters
10 | ----------
11 | X : dataframe
12 | The data.
13 | y : dataframe
14 | The labels.
15 | sort_column : String
16 | Column on which the data should be sorted. Defaults to 'time'.
17 |
18 |     Returns
19 |     -------
20 |     X : dataframe
21 |         The data, sorted by sort_column.
22 |     y : dataframe
23 |         The labels, sorted by sort_column.
24 |
25 |     Raises
26 |     ------
27 |     Exception
28 |         If X and y do not have the same length.
29 |     Exception
30 |         If X or y does not contain a column named sort_column.
31 |
32 |
33 | '''
34 | if len(X) != len(y):
35 | raise Exception('X and y should have the same length: len(X) is {}, \
36 | len(y) is {}'.format(len(X), len(y)))
37 |
38 | if sort_column not in X.columns:
39 | raise Exception('X should have a column named {}'.format(sort_column))
40 |
41 | if sort_column not in y.columns:
42 | raise Exception('y should have a column named {}'.format(sort_column))
43 |
44 | return X.sort_values(by=[sort_column]), y.sort_values(by=[sort_column])
45 |
46 |
47 | def split_fixed_origin(X, train_size):
48 | ''' Generator that yields training and validation sets according to the
49 | fixed-origin evaluation strategy.
50 |
51 | Fixed-origin evaluation is typically applied during forecasting
52 | competitions. A forecast for each value present in the test set is computed
53 | using only the training set. The forecast origin is fixed to the last point
54 | in the training set. So, for each horizon only one forecast can be computed.
55 | Obvious drawbacks of this type of evaluation are, that characteristics of
56 | the forecast origin might heavily influence evaluation results, and, as only
57 | one forecast per horizon is present, averaging is not possible within one
58 | series and one horizon (Bergmeir & Benitez, 2012).
59 |
60 | Parameters
61 | ----------
62 | X : dataframe
63 | The data to be split.
64 |     train_size : int
65 |         The number of data points in the training set.
66 |
67 | Returns
68 | -------
69 | dataframe
70 | The training set.
71 | dataframe
72 | The validation set.
73 |
74 | '''
75 | yield np.split(X, [train_size])
76 |
77 |
78 | def split_rolling_origin_recal(X, initial_train_size, rolling_size):
79 | ''' Generator that yields training and validation sets according to the
80 | rolling-origin-recalibration evaluation strategy.
81 |
82 | Within rolling-origin-recalibration evaluation, forecasts for a fixed
83 | horizon are performed by sequentially moving values from the test set to the
84 | training set, and changing the forecast origin accordingly. For each
85 | forecast, the model is recalibrated using all available data in the training
86 | set, which often means a complete retraining of the model
87 | (Bergmeir & Benitez, 2012).
88 |
89 | Parameters
90 | ----------
91 | X : dataframe
92 | The data to be split.
93 | initial_train_size : int
94 | The initial size of the training set.
95 | rolling_size : int
96 | The number of elements that are moved from the validation set to the
97 | training set at each iteration.
98 |
99 | Returns
100 | -------
101 | dataframe
102 | The training set.
103 | dataframe
104 | The validation set.
105 |
106 | '''
107 | pointer = initial_train_size
108 | while pointer < len(X):
109 | yield X[:pointer], X[pointer:]
110 | pointer += rolling_size
111 |
112 |
113 | def split_rolling_origin_update(X, train_size, val_size):
114 |     ''' Generator that yields a training set and a validation set according to
115 |     the rolling-origin-update strategy. Essentially, this is the same as
116 |     split_rolling_origin_recal, except that the model is not recalibrated but
117 |     simply updated after each subsequent iteration. After the first iteration,
118 |     only one new observation at a time is yielded, together with an empty
119 |     validation dataframe.
120 |
121 | Rolling-origin-update evaluation is probably the normal use case of most
122 | applications. Forecasts are computed in analogy to rolling-origin-
123 | recalibration evaluation, but values from the test set are not moved to the
124 | training set, and no model recalibration is performed. Instead, past values
125 | from the test set are used merely to update the input information of the
126 | model. Both types of rolling-origin evaluation are often referred to as
127 | n-step-ahead evaluation, with n being the forecast horizon used during the
128 | evaluation. Tashman [47] argues that model recalibration probably yields
129 | better results than updating. But recalibration may be computationally
130 | expensive, and within a real-world application, the model typically will be
131 | built once by experts, and later it will be used with updated information as
132 | new values are available, but it will certainly not be rebuilt.
133 | (Bergmeir & Benitez, 2012).
134 |
135 | Parameters
136 | ----------
137 | X : dataframe
138 | The data to be split.
139 |     train_size : int
140 |         The number of data points to be included in the initial training set.
141 |     val_size : int
142 |         Currently unused; the initial validation set contains all remaining data points.
143 |
144 | Returns
145 | -------
146 | dataframe
147 | The training set followed by one new observation at a time.
148 | dataframe
149 | The validation set followed by an empty dataframe after the first
150 | iteration.
151 |
152 | '''
153 | yield (X[:train_size],
154 | X[train_size:])
155 |
156 | while train_size < len(X):
157 | yield X[train_size:train_size+1], pd.DataFrame()
158 | train_size += 1
159 |
160 |
161 | def split_rolling_window(X, train_size, val_size, shift):
162 | ''' Generator that yields training and validation sets according to the
163 | rolling-window evaluation strategy.
164 |
165 | Rolling-window evaluation is similar to rolling-origin evaluation, but
166 | the amount of data used for training is kept constant, so that as new data
167 | is available, old data from the beginning of the series is discarded.
168 | Rolling-window evaluation is only applicable if the model is rebuilt in
169 | every window, and has merely theoretical statistical advantages, that might
170 | be noted in practice only if old values tend to disturb model generation
171 | (Bergmeir & Benitez, 2012).
172 |
173 | Parameters
174 | ----------
175 | X : dataframe
176 | The data to be split.
177 |     train_size : int
178 |         The number of data points to be included in the training set window.
179 |     val_size : int
180 |         The number of data points to be included in the validation set window.
181 |     shift : int
182 |         The number of data points by which the windows shift after each iteration.
183 |
184 | Returns
185 | -------
186 | dataframe
187 | The training set.
188 | dataframe
189 | The validation set.
190 |
191 | '''
192 |
193 | pointer = 0
194 | while pointer + train_size + val_size <= len(X):
195 | yield (X[pointer:pointer+train_size],
196 | X[pointer+train_size:pointer+train_size+val_size])
197 | pointer += shift
198 |
199 |
200 | if __name__ == '__main__':
201 |
202 |
203 |     # Sample data for validate_df (currently unused by the tests below)
204 | X = pd.DataFrame(np.random.randint(0, 100, size=(101, 2)),
205 | columns=list('AB'))
206 | y = pd.DataFrame(np.random.randint(0, 2, size=(101, 1)),
207 | columns=['target'])
208 | time = range(0, 101)
209 | X['time'] = time
210 | y['time'] = time
211 |
212 | # Unit tests setup
213 | df = pd.DataFrame({'A':range(10)})
214 |
215 |
216 | # Unit tests for split_fixed_origin
217 | print('split_fixed_origin tests')
218 | print('------------------------')
219 | for i, j in split_fixed_origin(df, 6):
220 | print(i.values.reshape(1,-1))
221 | print(j.values.reshape(1,-1))
222 | print()
223 |
224 | # Unit tests for split_rolling_origin_recal
225 | print('split_rolling_origin_recal tests')
226 | print('--------------------------------')
227 | len_i = 4
228 | len_j = 6
229 | for i, j in split_rolling_origin_recal(df, 4, 2):
230 | assert len(i) == len_i and len(j) == len_j
231 | assert len(i) != 0 and len(j) != 0
232 | len_i += 2
233 | len_j -= 2
234 | print(i.values.reshape(1,-1))
235 | print(j.values.reshape(1,-1))
236 | print()
237 |
238 | # Unit tests for split_rolling_origin_update
239 | print('split_rolling_origin_update tests')
240 | print('---------------------------------')
241 | for i, j in split_rolling_origin_update(df, 4, 2):
242 | print(i.values.reshape(1,-1))
243 | print(j.values.reshape(1,-1))
244 | print()
245 |
246 | # Unit tests for split_rolling_window
247 | print('split_rolling_window tests')
248 | print('--------------------------')
249 | for i, j in split_rolling_window(df, 4, 2, 2):
250 | print(i.values.reshape(1,-1))
251 | print(j.values.reshape(1,-1))
252 | print()
253 |
254 |
--------------------------------------------------------------------------------
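Note: as a worked check of the split_rolling_window test above (the same ten-row dataframe, train_size=4, val_size=2, shift=2), the generator yields three window pairs before the data runs out.

import pandas as pd
from models.data_partitioning import split_rolling_window

df = pd.DataFrame({'A': range(10)})
for train, val in split_rolling_window(df, train_size=4, val_size=2, shift=2):
    print(train['A'].tolist(), val['A'].tolist())
# [0, 1, 2, 3] [4, 5]
# [2, 3, 4, 5] [6, 7]
# [4, 5, 6, 7] [8, 9]  -- a fourth window would need 12 rows, so iteration stops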
/models/lstm_attention_v0.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys
3 | import os
4 | import pandas as pd
5 | import glob
6 |
7 | sys.path.append('../')
8 | from models.data_cleaning import clean_market_data, clean_news_data
9 |
10 | # Import libraries used for lstm
11 | from keras.models import Sequential
12 | from keras.layers import Input, Dense, multiply, Dot, Concatenate
13 | from keras.layers.core import *
14 | from keras.layers import LSTM
15 | from keras.models import *
16 |
17 | INPUT_DIM = 43
18 | TIME_STEPS = 1
19 | # if True, the attention vector is shared across the input_dimensions where the attention is applied.
20 | SINGLE_ATTENTION_VECTOR = False
21 | APPLY_ATTENTION_BEFORE_LSTM = False
22 | assetcode_list = ["AMZN.O"]
23 |
24 | MARKET_CLEAN_PATH = 'data/processed/market_cleaned_df.csv'
25 | NEWS_CLEAN_PATH = 'data/processed/news_cleaned_df.csv'
26 |
27 |
28 | def get_activations(model, inputs, print_shape_only=False, layer_name=None):
29 | # Documentation is available online on Github at the address below.
30 | # From: https://github.com/philipperemy/keras-visualize-activations
31 | print('----- activations -----')
32 | activations = []
33 | inp = model.input
34 | if layer_name is None:
35 | outputs = [layer.output for layer in model.layers]
36 | else:
37 | outputs = [layer.output for layer in model.layers if layer.name == layer_name] # all layer outputs
38 | funcs = [K.function([inp] + [K.learning_phase()], [out]) for out in outputs] # evaluation functions
39 | layer_outputs = [func([inputs, 1.])[0] for func in funcs]
40 | for layer_activations in layer_outputs:
41 | activations.append(layer_activations)
42 | if print_shape_only:
43 | print(layer_activations.shape)
44 | else:
45 | print(layer_activations)
46 | return activations
47 |
48 |
49 | def attention_3d_block(inputs):
50 | # inputs.shape = (batch_size, time_steps, input_dim)
51 | input_dim = int(inputs.shape[2])
52 | a = Permute((2, 1))(inputs)
53 | a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what.
54 | a = Dense(TIME_STEPS, activation='softmax')(a)
55 | if SINGLE_ATTENTION_VECTOR:
56 | a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
57 | a = RepeatVector(input_dim)(a)
58 | a_probs = Permute((2, 1), name='attention_vec')(a)
59 | output_attention_mul = multiply([inputs, a_probs], name='attention_mul')
60 | return output_attention_mul
61 |
62 |
63 | def model_attention_applied_after_lstm():
64 | inputs = Input(shape=(TIME_STEPS, INPUT_DIM,))
65 | lstm_units = 50
66 | lstm_out = LSTM(lstm_units, return_sequences=True)(inputs)
67 | attention_mul = attention_3d_block(lstm_out)
68 | attention_mul = Flatten()(attention_mul)
69 | output = Dense(1, activation='sigmoid')(attention_mul)
70 |     model = Model(inputs=[inputs], outputs=output)
71 | return model
72 |
73 |
74 | def model_attention_applied_before_lstm():
75 | inputs = Input(shape=(TIME_STEPS, INPUT_DIM,))
76 | attention_mul = attention_3d_block(inputs)
77 | lstm_units = 32
78 | attention_mul = LSTM(lstm_units, return_sequences=False)(attention_mul)
79 | output = Dense(1, activation='sigmoid')(attention_mul)
80 |     model = Model(inputs=[inputs], outputs=output)
81 | return model
82 |
83 |
84 | def extract_stock(df, assetCode, split=False):
85 | '''Extracts the training data for a particular asset
86 |
87 |     Parameters
88 |     ----------
89 |     df : pandas dataframe containing all the assets' data
90 |     assetCode : list of asset codes of the assets to be extracted
91 |     split : when True, also split the result into features X and target y
92 |
93 |     Returns
94 |     -------
95 |     df_asset : pandas dataframe containing data for only the chosen asset codes,
96 |         or the pair (X, y) when split=True
97 |     '''
98 | df_asset = df[df['assetCode'].isin(assetCode)]
99 | if split:
100 | y = df_asset['returnsOpenNextMktres10']
101 | X = df_asset.drop(['returnsOpenNextMktres10'], axis=1)
102 | return X, y
103 |
104 | return df_asset
105 |
106 |
107 | if __name__ == '__main__':
108 |
109 | df_market = pd.read_csv(MARKET_CLEAN_PATH)
110 | df_news = pd.read_csv(NEWS_CLEAN_PATH)
111 |
112 | df_merged = df_market.merge(df_news, 'left', ['time', 'assetCode'])
113 | df_merged = df_merged.sort_values(['time', 'assetCode'], ascending=[True, True])
114 |
115 | df_merged = extract_stock(df_merged, assetcode_list)
116 | # taking 80%, 10%, 10% for train, val, test sets
117 | df_train = df_merged[:522*1990]
118 | df_val = df_merged[522*1990:522*(1990+249)]
119 | df_test = df_merged[522*(1990+249):]
120 |
121 | # create the different data sets
122 | y_train = df_train['returnsOpenNextMktres10']
123 | X_train = df_train.drop(['returnsOpenNextMktres10'], axis=1)
124 |
125 | y_val = df_val['returnsOpenNextMktres10']
126 | X_val = df_val.drop(['returnsOpenNextMktres10'], axis=1)
127 |
128 | y_test = df_test['returnsOpenNextMktres10']
129 | X_test = df_test.drop(['returnsOpenNextMktres10'], axis=1)
130 |
131 |     X_train_ar = X_train.drop(['assetCode', "time"], axis=1).values
132 |     X_train_ar = X_train_ar.reshape(X_train_ar.shape[0], 1, X_train_ar.shape[1])
133 |
134 |     X_val_ar = X_val.drop(['assetCode', "time"], axis=1).values
135 |     X_val_ar = X_val_ar.reshape(X_val_ar.shape[0], 1, X_val_ar.shape[1])
136 |
137 |     X_test_ar = X_test.drop(['assetCode', "time"], axis=1).values
138 |     X_test_ar = X_test_ar.reshape(X_test_ar.shape[0], 1, X_test_ar.shape[1])
139 |
140 | #y_train_ar = y_train.values.reshape((1990, 522))
141 | #y_val_ar = y_val.values.reshape((int(len(y_val)/522), 522))
142 | #y_test_ar = y_test.values.reshape((int(len(y_test)/522), 522))
143 |
144 | # 4. Build model from Keras
145 | N = 300000
146 |
147 | if APPLY_ATTENTION_BEFORE_LSTM:
148 | m = model_attention_applied_before_lstm()
149 | else:
150 | m = model_attention_applied_after_lstm()
151 |
152 | m.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
153 | print(m.summary())
154 |
155 | m.fit(X_train_ar, y_train, epochs=3, batch_size=64, validation_data=(X_val_ar, y_val), verbose=1)
156 |
157 | attention_vectors = []
158 |     for i in range(300):
159 |         # draw one random test sample (the tutorial's get_data_recurrent is not available here)
160 |         sample = X_test_ar[np.random.randint(len(X_test_ar))][np.newaxis, :, :]
161 |         attention_vector = np.mean(get_activations(m, sample,
162 |                                                     print_shape_only=True,
163 |                                                     layer_name='attention_vec')[0], axis=2).squeeze()
164 | #print('attention =', attention_vector)
165 | assert (np.sum(attention_vector) - 1.0) < 1e-5
166 | attention_vectors.append(attention_vector)
167 |
168 | attention_vector_final = np.mean(np.array(attention_vectors), axis=0)
169 | # plot part.
170 | import matplotlib.pyplot as plt
171 | import pandas as pd
172 |
173 | pd.DataFrame(attention_vector_final, columns=['attention (%)']).plot(kind='bar',
174 | title='Attention Mechanism as '
175 | 'a function of input'
176 | ' dimensions.')
177 | plt.show()
178 |
--------------------------------------------------------------------------------
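Note: for the shape bookkeeping in attention_3d_block above, the trace below (descriptive comments only) follows a tensor through the block; `dims` is 43 raw features when attention is applied before the LSTM, or 50 LSTM units when it is applied after, as in the default configuration.

# inputs                       -> (batch, TIME_STEPS, dims)
# Permute((2, 1)) + Reshape    -> (batch, dims, TIME_STEPS)
# Dense(TIME_STEPS, softmax)   -> (batch, dims, TIME_STEPS)   # attention weights over the timesteps
# Permute((2, 1))              -> (batch, TIME_STEPS, dims)   # layer named 'attention_vec'
# multiply([inputs, weights])  -> (batch, TIME_STEPS, dims)   # inputs re-weighted by attention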
/models/lstm_attention_v1.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys
3 | import os
4 | import pandas as pd
5 | import glob
6 | import matplotlib.pyplot as plt
7 | import pickle
8 | import tensorflow as tf
9 |
10 | from models.data_cleaning import generate_cleaned_filtered_data
11 | from models.attention_decoder import AttentionDecoder
12 | from models.data_partitioning import validate_df
13 | from models.data_partitioning import split_fixed_origin
14 | from keras.models import Sequential
15 | from keras.layers import Input, Dense
16 | from keras.layers import LSTM
17 | from keras.layers import TimeDistributed
18 | from keras.layers import RepeatVector
19 | from keras import backend as K
20 |
21 | from sklearn.model_selection import train_test_split
22 | from sklearn.preprocessing import MinMaxScaler
23 |
24 | from IPython.display import SVG
25 | from keras.utils.vis_utils import model_to_dot
26 |
27 | test_frac = 0.1 # fraction of the whole data
28 | train_frac = 0.8 # fraction of the remaining data
29 |
30 | cleaned_data_path = './data/processed/df_merged.csv'
31 |
32 | ASSETS = ['INTC.O', 'WFC.N', 'AMZN.O', 'A.N', 'BHE.N']
33 |
34 |
35 | def top_down_acc(y_true, y_pred):
36 | return K.abs(K.sign(y_true) + K.sign(y_pred)) / 2
37 |
38 |
39 | def time_lag_data(X, y, n_in=1, n_out=1):
40 | n_features = X.shape[1]
41 | feature_names = X.columns
42 |
43 | # Define column names
44 | names = list()
45 | for i in range(n_in):
46 | names += [('%s(t-%d)' % (feature_names[j], -(i+1-n_in))) for j in range(n_features)]
47 |
48 | x_list = []
49 | # input sequence (t-n, ... t-1)
50 | for i in range(X.shape[0]-n_in-n_out+2):
51 | rows_x = []
52 | for _, row in X[i:i+n_in].iterrows():
53 | rows_x += row.tolist()
54 | x_list.append(rows_x)
55 |
56 | X_time = pd.DataFrame(x_list, columns=names)
57 | # forecast sequence (t, t+1, ... t+n)
58 | cols = list()
59 | for i in range(0, n_out):
60 | if i == 0:
61 | cols += [('%s(t)' % ('returnsOpenNextMktres10'))]
62 | else:
63 | cols += [('%s(t+%d)' % ('returnsOpenNextMktres10', i))]
64 | # put it all together
65 |
66 | y_list = []
67 | # input sequence (t-n, ... t-1)
68 | for i in range(n_in-1, X.shape[0]-n_out+1):
69 | y_list.append(y[i:i+n_out].tolist())
70 |
71 | y_time = pd.DataFrame(y_list, columns=cols)
72 |
73 | return X_time, y_time
74 |
75 |
76 | df = pd.read_csv(cleaned_data_path)
77 | df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'time'], inplace=True, axis=1)
78 |
79 | # For loop for assets
80 | asset = 'BHE.N'
81 | df = df[df['assetCode'] == asset]
82 | df.drop(['assetCode'], axis=1, inplace=True)
83 |
84 | split = len(df) - round(test_frac*len(df))
85 | df_test = df[split:]
86 | df_tv = df[:split]
87 |
88 | # For loop for different splitting techniques
89 | df_train, df_val = train_test_split(df_tv,
90 | train_size=train_frac,
91 | shuffle=False)
92 |
93 | y_train = df_train['returnsOpenNextMktres10']
94 | y_train_init = y_train.reset_index(drop=True)
95 | X_train = df_train.drop(['returnsOpenNextMktres10'], axis=1)
96 | X_train_init = X_train.reset_index(drop=True)
97 | print('The train data size is : ', X_train.shape, y_train.shape)
98 |
99 | y_val = df_val['returnsOpenNextMktres10']
100 | y_val_init = y_val.reset_index(drop=True)
101 | X_val = df_val.drop(['returnsOpenNextMktres10'], axis=1)
102 | X_val_init = X_val.reset_index(drop=True)
103 | print('The validation data size is : ', X_val.shape, y_val.shape)
104 |
105 | y_test = df_test['returnsOpenNextMktres10']
106 | y_test_init = y_test.reset_index(drop=True)
107 | X_test = df_test.drop(['returnsOpenNextMktres10'], axis=1)
108 | X_test_init = X_test.reset_index(drop=True)
109 | print('The test data size is : ', X_test.shape, y_test.shape)
110 |
111 | # Hyperparameter tuning
112 | # lag (1, 5, 15, 30, 60, 90), dropout (LSTM) (0, 0.05, 0.4), cells (16, 32, 64)
113 | n_features = 40
114 | n_timesteps_out = 1
115 | n_epochs = 25
116 |
117 | # LSTM + EncoderDecoder
118 | for n_timesteps_in in [1]:
119 | for dropout in [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]:
120 | for cells in [16]:
121 |
122 | X_train, y_train = time_lag_data(X_train_init, y_train_init,
123 | n_in=n_timesteps_in,
124 | n_out=n_timesteps_out)
125 | print('The train data size is : ', X_train.shape, y_train.shape)
126 |
127 | X_val, y_val = time_lag_data(X_val_init, y_val_init,
128 | n_in=n_timesteps_in,
129 | n_out=n_timesteps_out)
130 | print('The val data size is : ', X_val.shape, y_val.shape)
131 |
132 | scaler = MinMaxScaler((-1, 1), False)
133 | X_train = scaler.fit_transform(X_train)
134 | X_val = scaler.transform(X_val)
135 |
136 | # Reshape the datasets
137 | X_train = X_train.reshape((len(X_train), n_timesteps_in, n_features))
138 | y_train = y_train.values.reshape((len(y_train), n_timesteps_out, 1))
139 |
140 | X_val = X_val.reshape((len(X_val), n_timesteps_in, n_features))
141 | y_val = y_val.values.reshape((len(y_val), n_timesteps_out, 1))
142 |
143 |
144 | # Model with Encoder/Decoder
145 | model = Sequential()
146 | model.add(LSTM(cells, dropout=dropout,
147 | input_shape=(n_timesteps_in, n_features)))
148 | model.add(RepeatVector(n_timesteps_out))
149 | model.add(LSTM(cells, dropout=dropout, return_sequences=True))
150 | model.add(TimeDistributed(Dense(1, activation='tanh')))
151 | model.compile(loss='mean_squared_error', optimizer='adam',
152 | metrics=[top_down_acc])
153 | model.summary()
154 | history = model.fit(X_train,
155 | y_train,
156 | epochs=n_epochs,
157 | validation_data=(X_val, y_val),
158 | shuffle=False)
159 |
160 | with open('history_ed_v0_ts_{}_drop_{}_cells_{}'.format(str(n_timesteps_in),
161 | str(dropout),
162 | str(cells)), 'wb') as file_hs:
163 | pickle.dump(history.history, file_hs)
164 |
165 | # plot training history
166 | fig = plt.figure()
167 | ax = fig.add_subplot(1, 1, 1)
168 | ax.plot(history.history['loss'])
169 | ax.plot(history.history['val_loss'])
170 | ax.set_xlim([0, 125])
171 | ax.set_ylim([0, 0.01])
172 | # plt.plot(history.history['top_down_acc'])
173 |
174 | ax.set_xlabel('Epoch')
175 |             ax.set_ylabel('Mean Squared Error Loss')
176 | ax.set_title('Loss Over Time')
177 | ax.legend(['Train','Val'])
178 | # plt.legend(['Train','Val', 'Top Down Accuracy'])
179 | fig.savefig('lstm_ed_v0_ts_{}_drop_{}_cells_{}.png'.format(str(n_timesteps_in),
180 | str(dropout),
181 | str(cells)))
182 | # LSTM + Attention
183 | for n_timesteps_in in [1, 5, 15, 30, 60, 90]:
184 | for dropout in [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]:
185 | for cells in [16, 32, 64]:
186 |
187 | n_timesteps_out = n_timesteps_in
188 |
189 | X_train, y_train = time_lag_data(X_train_init, y_train_init,
190 | n_in=n_timesteps_in,
191 | n_out=n_timesteps_out)
192 | print('The train data size is : ', X_train.shape, y_train.shape)
193 |
194 | X_val, y_val = time_lag_data(X_val_init, y_val_init,
195 | n_in=n_timesteps_in,
196 | n_out=n_timesteps_out)
197 | print('The val data size is : ', X_val.shape, y_val.shape)
198 |
199 | X_test, y_test = time_lag_data(X_test_init, y_test_init,
200 | n_in=n_timesteps_in,
201 | n_out=n_timesteps_out)
202 | print('The test data size is : ', X_test.shape, y_test.shape)
203 |
204 | scaler = MinMaxScaler((-1, 1), False)
205 | X_train = scaler.fit_transform(X_train)
206 | X_val = scaler.transform(X_val)
207 | X_test = scaler.transform(X_test)
208 |
209 | # Reshape the datasets
210 | X_train = X_train.reshape((len(X_train), n_timesteps_in, n_features))
211 | y_train = y_train.values.reshape((len(y_train), n_timesteps_out, 1))
212 |
213 | X_val = X_val.reshape((len(X_val), n_timesteps_in, n_features))
214 | y_val = y_val.values.reshape((len(y_val), n_timesteps_out, 1))
215 |
216 | X_test = X_test.reshape((len(X_test), n_timesteps_in, n_features))
217 | y_test = y_test.values.reshape((len(y_test), n_timesteps_out, 1))
218 |
219 | model_at = Sequential()
220 | model_at.add(LSTM(cells, input_shape=(n_timesteps_in, n_features),
221 | return_sequences=True))
222 | model_at.add(AttentionDecoder(cells, n_features))
223 | model_at.add(Dense(1, activation='tanh'))
224 | model_at.compile(loss='mean_squared_error', optimizer='adam',
225 | metrics=[top_down_acc])
226 | model_at.summary()
227 | history = model_at.fit(X_train,
228 | y_train,
229 | epochs=n_epochs,
230 | validation_data=(X_val, y_val),
231 | shuffle=False)
232 |
233 | with open('results_final/history_att_v0_ts_{}_drop_{}_cells_{}'.format(str(n_timesteps_in),
234 | str(dropout),
235 | str(cells)), 'wb') as file_hs:
236 | pickle.dump(history.history, file_hs)
237 |
238 | prediction = model_at.predict(X_test)
239 | top_down_accuracy = sum(top_down_acc(p[0], np.float32(t[0])) for p, t in zip(prediction[:,0], y_test[:,0]))/len(y_test)
240 |
241 | with tf.Session() as sess:
242 | top_down_accuracy = sess.run(top_down_accuracy)
243 | # plot training history
244 | fig = plt.figure()
245 | ax = fig.add_subplot(1, 1, 1)
246 | ax.plot(history.history['loss'])
247 | ax.plot(history.history['val_loss'])
248 | ax.set_xlim([0, 40])
249 | ax.set_ylim([0, 0.01])
250 | # plt.plot(history.history['top_down_acc'])
251 |
252 | ax.set_xlabel('Epoch')
253 |             ax.set_ylabel('Mean Squared Error Loss')
254 | print(min(history.history['val_loss']))
255 | ax.set_title('Loss Over Time')
256 | print('Predicted Top-Down Accuracy : {}'.format(str(top_down_accuracy)))
257 | ax.legend(['Train','Val'])
258 | # plt.legend(['Train','Val', 'Top Down Accuracy'])
259 | fig.savefig('results_final/lstm_att_v0_ts_{}_drop_{}_cells_{}.png'.format(str(n_timesteps_in),
260 | str(dropout),
261 | str(cells)))
262 |
--------------------------------------------------------------------------------
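Note: top_down_acc above is a directional-accuracy score: it returns 1 when the prediction and the target share the same non-zero sign, 0 when their signs disagree, and 0.5 when exactly one of them is zero. The shapes produced by time_lag_data can be checked on a toy frame; a small worked example, assuming the time_lag_data function defined above is in scope:

import numpy as np
import pandas as pd

X = pd.DataFrame(np.arange(15).reshape(5, 3), columns=['a', 'b', 'c'])
y = pd.Series(np.arange(5.0), name='returnsOpenNextMktres10')

X_time, y_time = time_lag_data(X, y, n_in=2, n_out=1)
print(X_time.shape)  # (4, 6): each row holds the 3 features at t-1 and at t
print(y_time.shape)  # (4, 1): the single-step-ahead target
# X_time is later reshaped to (samples, n_timesteps_in, n_features) before the LSTM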
/models/lstm_v01.py:
--------------------------------------------------------------------------------
1 | from data_partitioning import validate_df
2 | from data_partitioning import split_fixed_origin
3 | from data_cleaning import get_cleaned_filtered_data, extract_asset
4 |
5 | from keras.models import Sequential
6 | from keras.layers import Dense, LSTM
7 | from keras.utils import plot_model
8 |
9 | import matplotlib.pyplot as plt
10 | import numpy as np
11 | import pandas as pd
12 |
13 | DRY_RUN = True # if True, will only run for one asset with fixed origin strategy
14 |
15 | ASSETS = ['INTC.O', 'WFC.N', 'AMZN.O', 'A.N', 'BHE.N']
16 | DATA_PATH = './data/processed/cleaned_filtered_data.csv'
17 |
18 |
19 | test_frac = 0.1 # fraction of the whole data
20 | train_frac = 0.8 # fraction of the remaining data
21 | latent_dim = 50 # LSTM hidden units
22 | batch_size = 1
23 | look_back = 30
24 |
25 |
26 | def create_dataset(X, look_back=1):
27 | cols = list()
28 | for i in range(look_back, 0, -1):
29 | cols.append(X.shift(i))
30 |
31 | return pd.concat(cols, axis=1)
32 |
33 | if __name__ == '__main__':
34 |     X_all, y_all = get_cleaned_filtered_data(DATA_PATH)
35 |
36 |
37 | for asset in ASSETS:
38 |         X, y = extract_asset(X_all, y_all, asset)
39 | X['y'] = y
40 |
41 | # Isolating the test set
42 | split = len(X) - round(test_frac*len(X))
43 | X_test = X[split:]
44 | y_test = X_test['y']
45 | X_test = X_test.drop(['y'], axis=1)
46 | X = X[:split]
47 |
48 | # Training and validating the model using fixed origin
49 | train_size = round(train_frac * len(X))
50 |
51 | for X_train, X_val in split_fixed_origin(X, train_size):
52 | y_train = X_train['y']
53 | X_train = X_train.drop(['y'], axis=1)
54 | y_val = X_val['y']
55 | X_val = X_val.drop(['y'], axis=1)
56 |
57 |             # fill NaN and drop the asset code and time columns
58 | drop_col = ['Unnamed: 0', 'assetCode', 'time']
59 | X_train.fillna(0, inplace=True)
60 | X_val.fillna(0, inplace=True)
61 | X_train.drop(drop_col, axis=1, inplace=True)
62 | X_val.drop(drop_col, axis=1, inplace=True)
63 |
64 | # Create the sets according to the look_back range
65 | X_train = create_dataset(X_train, look_back)
66 |
67 | # input dimensionality
68 | data_dim = X_train.shape[-1]
69 |
70 | # Reshape input to 3 dimensions (batch_size, timesteps, data_dim)
71 |             X_train = X_train.values.reshape((batch_size, X_train.shape[0], data_dim))
72 |             X_val = X_val.values.reshape((batch_size, X_val.shape[0], data_dim))
73 |             y_train = y_train.values.reshape((batch_size, -1, 1))
74 |             y_val = y_val.values.reshape((batch_size, -1, 1))
75 |
76 | # Expected input shape: (batch_size, timesteps, data_dim)
77 | model = Sequential()
78 | model.add(LSTM(latent_dim, input_dim=data_dim,
79 | return_sequences=True))
80 | model.add(Dense(1))
81 | model.compile(loss='mse', optimizer='adam')
82 | history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
83 | epochs=60, batch_size=batch_size)
84 |
85 | # plot training history
86 | plt.plot(history.history['loss'])
87 | plt.plot(history.history['val_loss'])
88 |
89 | plt.xlabel('Epoch')
90 |             plt.ylabel('Mean Squared Error Loss')
91 | plt.title('Loss Over Time')
92 | plt.legend(['Train','Val'])
93 |
94 | if DRY_RUN:
95 |                 break
96 |
97 |
--------------------------------------------------------------------------------
/models/lstm_v02.py:
--------------------------------------------------------------------------------
1 | from data_partitioning import validate_df
2 | from data_partitioning import split_fixed_origin
3 | from data_cleaning import get_cleaned_filtered_data, extract_asset
4 |
5 | from keras.models import Sequential
6 | from keras.layers import Dense, LSTM
7 | from keras.utils import plot_model
8 | from keras.callbacks import ModelCheckpoint
9 | from keras import backend as K
10 |
11 |
12 | from sklearn.preprocessing import MinMaxScaler
13 | from sklearn.metrics import roc_auc_score
14 |
15 | from itertools import product
16 | import matplotlib.pyplot as plt
17 | import numpy as np
18 | import pandas as pd
19 | import pickle
20 |
21 |
22 | DRY_RUN = False
23 | DUMP_HISTORY = True
24 |
25 | ASSETS = ['INTC.O', 'WFC.N', 'AMZN.O', 'A.N', 'BHE.N']
26 | DATA_PATH = './data/processed/cleaned_filtered_data.csv'
27 | HISTORY_TOP_PATH = './data/history/'
28 |
29 | test_frac = 0.1 # fraction of the whole data
30 | train_frac = 0.8 # fraction of the remaining data
31 | n_epochs = 200
32 |
33 | lstm_sizes = [16, 32, 64]
34 | lags = [1, 5, 15, 30, 60, 90]
35 | dropouts = [0.0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40]
36 |
37 |
38 | def add_lag(df, lag=1):
39 | cols = [df]
40 | for i in range(lag, 0, -1):
41 | cols.append(df.shift(i))
42 | return pd.concat(cols, axis=1).dropna()
43 |
44 |
45 | def top_down_acc(y_true, y_pred):
46 | return K.abs(K.sign(y_true) + K.sign(y_pred)) / 2
47 |
48 |
49 | def create_model(lstm_size, dropout, lag, n_features):
50 | model = Sequential()
51 | model.add(LSTM(lstm_size, dropout=dropout,
52 | input_shape=(lag+1, n_features)))
53 | model.add(Dense(1, activation='tanh'))
54 | model.compile(loss='mse', optimizer='adam',
55 | metrics=[top_down_acc])
56 | return model
57 |
58 |
59 | if __name__ == '__main__':
60 |
61 | # Fetch the data from the saved csv
62 | X_clean, y_clean = get_cleaned_filtered_data(DATA_PATH)
63 |
64 | for asset, lstm_size, lag, dropout in product(
65 | ASSETS, lstm_sizes, lags, dropouts):
66 |
67 | # Extract the asset and perform some cleaning
68 | X, y = extract_asset(X_clean, y_clean, asset)
69 | cols = ['Unnamed: 0', 'assetCode', 'time']
70 | X.drop(cols, axis=1, inplace=True)
71 | X.fillna(-1, inplace=True) # Making sure unknown values are obvious
72 | n_features = X.shape[1]
73 |
74 | # Merge the labels and the features into one dataset
75 | df = X
76 | df['y'] = y
77 |
78 | # Isolating the test set
79 | split = len(df) - round(test_frac*len(df))
80 | df_test = df[split:]
81 | df = df[:split]
82 |
83 | # Some user feedback
84 | print('\nTraining with\n\tlstm size: {}\n\tlag: {}\n\tdropout: {}\n'
85 | .format(lstm_size, lag, dropout))
86 |
87 | # Add the lag features
88 | df_lag = add_lag(df.drop(['y'], axis=1), lag)
89 | df_lag['y'] = df['y']
90 |
91 | # Train and evaluate using fixed origin
92 | train_size = round(train_frac * len(df_lag))
93 | for df_train, df_val in split_fixed_origin(df_lag, train_size):
94 | y_train = df_train['y']
95 | X_train = df_train.drop(['y'], axis=1)
96 | y_val = df_val['y']
97 | X_val = df_val.drop(['y'], axis=1)
98 |
99 | # Scale the data
100 | scaler = MinMaxScaler((-1, 1), False)
101 | scaler.fit_transform(X_train)
102 | scaler.transform(X_val)
103 |
104 | # Reshape input data according to Keras documentation
105 | # (batch_size, timesteps, input_dim)
106 | X_train = X_train.values.reshape((-1, lag+1, n_features))
107 | X_val = X_val.values.reshape((-1, lag+1, n_features))
108 |
109 | # Create the model
110 | # Input shape expected (timesteps, input_dim)
111 | model = create_model(lstm_size, dropout, lag, n_features)
112 |
113 | # Fit the model
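            # The doubled braces keep '{epoch:03d}' and '{val_loss:.4f}' literal so that
            # ModelCheckpoint fills them in at save time; the asset and hyperparameters
            # are formatted here.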
114 | checkpoint_name = ('best-lstm-{{epoch:03d}}-{{val_loss:.4f}}-{}-{}-'
115 | '{}-{}.hdf5').format(asset, lstm_size, lag, int(dropout*100))
116 | checkpoint = ModelCheckpoint(
117 | './data/models/' + checkpoint_name,
118 | monitor='val_loss',
119 | save_best_only=True)
120 | history = model.fit(X_train,
121 | y_train,
122 | epochs=n_epochs,
123 | validation_data=(X_val, y_val),
124 | shuffle=False,
125 | callbacks=[checkpoint])
126 |
127 |             # Dump the history to a pickle file
128 | if DUMP_HISTORY:
129 | path = HISTORY_TOP_PATH + 'lstm-{}-{}-{}-{}.pickle'.format(
130 | asset, lstm_size, lag, int(dropout*100))
131 | with open(path, 'wb') as f:
132 | pickle.dump(history.history, f)
133 |
134 | if DRY_RUN:
135 | break
--------------------------------------------------------------------------------
/models/lstm_v02_analysis.py:
--------------------------------------------------------------------------------
1 | from os import listdir
2 | from os.path import isfile, join
3 | import pandas as pd
4 | import pickle
5 | import re
6 |
7 | from sklearn.preprocessing import MinMaxScaler
8 |
9 | from lstm_v02 import create_model, add_lag
10 | from data_partitioning import split_fixed_origin
11 | from data_cleaning import get_cleaned_filtered_data, extract_asset
12 |
13 |
14 | DATA_PATH = './data/processed/cleaned_filtered_data.csv'
15 |
16 | test_frac = 0.1
17 | train_frac = 0.8
18 |
19 |
20 | asset = 'INTC.O'
21 | val_loss = 0.0016
22 | epoch = 7
23 | lstm_size = 32
24 | lag = 60
25 | dropout = 40
26 |
27 |
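# Rebuild the checkpoint file name written by lstm_v02.py, e.g.
# 'best-lstm-007-0.0016-INTC.O-32-60-40.hdf5' for the parameters above.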
28 | def get_saved_model_path(root, val_loss, epoch, asset, lstm_size, lag, dropout):
29 |
30 | # Generate file path from parameters
31 | return root + 'best-lstm-{:03}-{}-{}-{}-{}-{}.hdf5'.format(
32 | epoch, val_loss, asset, lstm_size, lag, dropout)
33 |
34 |
35 | if __name__ == '__main__':
36 |
37 | # Fetch the data
38 | X_clean, y_clean = get_cleaned_filtered_data(DATA_PATH)
39 | X, y = extract_asset(X_clean, y_clean, asset)
40 | cols = ['Unnamed: 0', 'assetCode', 'time']
41 | X.drop(cols, axis=1, inplace=True)
42 | X.fillna(-1, inplace=True)
43 | n_features = X.shape[1]
44 |
45 | # Split the data
46 | df = X
47 | df['y'] = y
48 | split = len(df) - round(test_frac*len(df))
49 | df_test = df[split:]
50 | df = df[:split]
51 |
52 | print(len(df_test))
53 |
54 | # Add the lag features
55 | df_lag = add_lag(df.drop(['y'], axis=1), lag)
56 | df_lag['y'] = df['y']
57 | df_test_lag = add_lag(df_test.drop(['y'], axis=1), lag)
58 | df_test_lag['y'] = df_test['y']
59 |
60 | X_test = df_test_lag.drop(['y'], axis=1)
61 | y_test = df_test_lag['y']
62 |
63 | train_size = round(train_frac * len(df_lag))
64 | for df_train, df_val in split_fixed_origin(df_lag, train_size):
65 | X_train = df_train.drop(['y'], axis=1)
66 |
67 | # Scale the data
68 |         scaler = MinMaxScaler((-1, 1), copy=False)
69 |         X_train[:] = scaler.fit_transform(X_train)  # assign back so the scaling is actually applied
70 |         X_test[:] = scaler.transform(X_test)
71 |
72 | # Reshape to keras input shape
73 | X_test = X_test.values.reshape((-1, lag+1, n_features))
74 |
75 | # Create the model from saved weights
76 | weights_path = get_saved_model_path(
77 | './data/models/', val_loss, epoch, asset, lstm_size, lag, dropout)
78 |         model = create_model(lstm_size, dropout / 100, lag, n_features)  # dropout above is a percentage
79 | model.load_weights(weights_path)
80 |
81 | # Test and print the results
82 | scores = model.evaluate(X_test, y_test, verbose=0)
83 | print('\n{} : {}\n{} : {}'.format(
84 | model.metrics_names[0], scores[0], model.metrics_names[1], scores[1]))
--------------------------------------------------------------------------------
/models/lstm_v03.py:
--------------------------------------------------------------------------------
1 | ''' In this version, I train the model on INTC.O using the hyperparameters found in
2 | version 02, but with the rolling window splitting strategy (sketched below the constants).
3 | '''
4 |
5 | from data_partitioning import split_rolling_window
6 | from data_cleaning import get_cleaned_filtered_data, extract_asset
7 |
8 | from keras.models import Sequential
9 | from keras.layers import Dense, LSTM
10 | from keras.utils import plot_model
11 | from keras.callbacks import ModelCheckpoint
12 | from keras import backend as K
13 |
14 |
15 | from sklearn.preprocessing import MinMaxScaler
16 | from sklearn.metrics import roc_auc_score
17 |
18 | from itertools import product
19 | import matplotlib.pyplot as plt
20 | import numpy as np
21 | import pandas as pd
22 | import pickle
23 |
24 |
25 | DRY_RUN = False
26 | DUMP_HISTORY = True
27 |
28 | DATA_PATH = './data/processed/cleaned_filtered_data.csv'
29 | HISTORY_TOP_PATH = './data/history/'
30 |
31 | test_frac = 0.1 # fraction of the whole data used for test set
32 | n_epochs = 1 # Number of passes over the data when training
33 |
34 | # Params for rolling window (fraction of the remaining data)
35 | train_frac = 0.2
36 | val_frac = 0.1
37 | shift = 15
38 |
39 | asset = 'INTC.O'
40 | lstm_size = 64
41 | lag = 15
42 | dropout = 0.10
43 |
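# Illustrative sketch only: split_rolling_window is imported from data_partitioning.py;
# this unused helper just documents the windowing behaviour assumed here -- fixed-size
# train/val windows advanced by `shift` rows on every iteration.
def _rolling_window_sketch(df, train_size, val_size, shift):
    start = 0
    while start + train_size + val_size <= len(df):
        yield (df.iloc[start:start + train_size],
               df.iloc[start + train_size:start + train_size + val_size])
        start += shift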
44 |
45 | def add_lag(df, lag=1):
46 | cols = [df]
47 | for i in range(lag, 0, -1):
48 | cols.append(df.shift(i))
49 | return pd.concat(cols, axis=1).dropna()
50 |
51 |
52 | def top_down_acc(y_true, y_pred):
53 | return K.abs(K.sign(y_true) + K.sign(y_pred)) / 2
54 |
55 |
56 | def create_model(lstm_size, dropout, lag, n_features):
57 | model = Sequential()
58 | model.add(LSTM(lstm_size, dropout=dropout,
59 | input_shape=(lag+1, n_features)))
60 | model.add(Dense(1, activation='tanh'))
61 | model.compile(loss='mse', optimizer='adam',
62 | metrics=[top_down_acc])
63 | return model
64 |
65 |
66 | def get_df(test_frac, asset):
67 |
68 | # Fetch the data from the saved csv
69 | X_clean, y_clean = get_cleaned_filtered_data(DATA_PATH)
70 |
71 | # Extract the asset and perform some cleaning
72 | df, y = extract_asset(X_clean, y_clean, asset)
73 | cols = ['Unnamed: 0', 'assetCode', 'time']
74 | df.drop(cols, axis=1, inplace=True)
75 | df.fillna(-1, inplace=True) # Making sure unknown values are obvious
76 | n_features = df.shape[1]
77 |
78 | # Merge the labels and the features into one dataset
79 | df['y'] = y
80 |
81 | # Add the lag features
82 | df_lag = add_lag(df.drop(['y'], axis=1), lag)
83 | df_lag = df_lag.assign(y=df['y'])
84 | total_len = len(df_lag)
85 |
86 | # Isolating the test set
87 | split = len(df_lag) - round(test_frac*len(df_lag))
88 | df_lag_test = df_lag[split:]
89 | df_lag = df_lag[:split]
90 |
91 | # Scale the data
92 |     scaler = MinMaxScaler((-1, 1), copy=False)
93 |
94 | temp_y = df_lag['y']
95 | df_lag.drop('y', axis=1, inplace=True)
96 |     df_lag[:] = scaler.fit_transform(df_lag)  # assign back so the scaling is actually applied
97 | df_lag['y'] = temp_y
98 |
99 | temp_y = df_lag_test['y']
100 | df_lag_test.drop('y', axis=1, inplace=True)
101 |     df_lag_test[:] = scaler.transform(df_lag_test)
102 | df_lag_test['y'] = temp_y
103 |
104 | assert total_len == len(df_lag) + len(df_lag_test)
105 |
106 | return df_lag, df_lag_test, n_features
107 |
108 |
109 | if __name__ == '__main__':
110 |
111 | df_lag, _, n_features = get_df(test_frac, asset)
112 |
113 |
114 | # Create the model
115 | # Input shape expected (timesteps, input_dim)
116 | model = create_model(lstm_size, dropout, lag, n_features)
117 |
118 | # Train and evaluate using rolling window
119 | train_size = round(train_frac * len(df_lag))
120 | val_size = round(val_frac * len(df_lag))
121 | count = -1
122 | for df_train, df_val in split_rolling_window(df_lag, train_size,
123 | val_size, shift):
124 | count += 1
125 | y_train = df_train['y']
126 | X_train = df_train.drop(['y'], axis=1)
127 | y_val = df_val['y']
128 | X_val = df_val.drop(['y'], axis=1)
129 |
130 | # Reshape input data according to Keras documentation
131 | # (batch_size, timesteps, input_dim)
132 | X_train = X_train.values.reshape((-1, lag+1, n_features))
133 | X_val = X_val.values.reshape((-1, lag+1, n_features))
134 |
135 | # Fit the model
136 | checkpoint_name = ('best-lstm-{:03d}-{}-{}-{}-{}.hdf5').format(
137 | count, asset, lstm_size, lag, int(dropout*100))
138 | checkpoint = ModelCheckpoint(
139 | './data/models/rollingwindow/' + checkpoint_name,
140 | monitor='val_loss',
141 | save_best_only=True)
142 | history = model.fit(X_train,
143 | y_train,
144 | epochs=n_epochs,
145 | validation_data=(X_val, y_val),
146 | shuffle=False,
147 | callbacks=[checkpoint])
148 |
149 |         # Dump the history to a pickle file
150 | if DUMP_HISTORY:
151 | path = (HISTORY_TOP_PATH + 'rollingwindow/lstm.{:03d}-{}-{}-{}-{}'
152 | '.pickle'.format(count, asset, lstm_size, lag, int(dropout*100)))
153 | with open(path, 'wb') as f:
154 | pickle.dump(history.history, f)
155 |
156 | if DRY_RUN:
157 | break
--------------------------------------------------------------------------------
/models/lstm_v03_analysis.py:
--------------------------------------------------------------------------------
1 | from os import listdir
2 | import matplotlib.pyplot as plt
3 | import pickle
4 |
5 | from lstm_v03 import create_model, get_df
6 |
7 |
8 | def concat_history():
9 | path = './data/history/rollingwindow'
10 | keys = ['val_loss', 'val_top_down_acc', 'loss', 'top_down_acc']
11 |
12 |     hist_list = sorted(listdir(path))  # concatenate the windows in order
13 | history = {key: [] for key in keys}
14 |
15 | for hist_name in hist_list:
16 | with open(path + '/' + hist_name, 'rb') as f:
17 | hist = pickle.load(f)
18 |
19 | for key in keys:
20 | history[key] += hist[key]
21 |
22 | return history
23 |
24 |
25 | def plot_train_loss(history, ylim=(0, 0.03)):
26 | plt.ylim(ylim)
27 |
28 | plt.plot(history['loss'])
29 | plt.plot(history['val_loss'])
30 |
31 | plt.xlabel('Epoch')
32 |     plt.ylabel('Mean Squared Error Loss')
33 | plt.title('Training Loss')
34 | plt.legend(['Train','Val'])
35 | plt.show()
36 |
37 |
38 | def perform_tests():
39 |
40 | test_frac = 0.1
41 |
42 | asset = 'INTC.O'
43 | lstm_size = 64
44 | lag = 15
45 | dropout = 0.1
46 |
47 | path = './data/models/rollingwindow'
48 |     models = sorted(listdir(path))  # evaluate the checkpoints in window order
49 |
50 | df_lag, df_lag_test, n_features = get_df(test_frac, asset)
51 | X_test = df_lag_test.drop('y', axis=1)
52 | y_test = df_lag_test['y']
53 |
54 | # Reshape input data according to Keras documentation
55 | # (batch_size, timesteps, input_dim)
56 | X_test = X_test.values.reshape((-1, lag+1, n_features))
57 |
58 | model = create_model(lstm_size, dropout, lag, n_features)
59 |
60 | f = open('data/lstm_rollingwindow.csv', 'w+')
61 | f.write(model.metrics_names[0] + ',' + model.metrics_names[1] + '\n')
62 |
63 | for model_name in models:
64 | model.load_weights(path + '/' + model_name)
65 | scores = model.evaluate(X_test, y_test, verbose=0)
66 | f.write('{},{}\n'.format(scores[0], scores[1]))
67 |
68 | f.close()
69 |
70 |
71 | if __name__ == '__main__':
72 | # history = concat_history()
73 | # plot_train_loss(history)
74 |
75 | perform_tests()
--------------------------------------------------------------------------------
/models/lstm_v04.py:
--------------------------------------------------------------------------------
1 | ''' In this version, I train the model on INTC.O using the hyperparameters found in
2 | version 02, but with the rolling origin recalibration splitting strategy (sketched
3 | below the constants).
4 | '''
5 |
6 | import pickle
7 | from keras.callbacks import ModelCheckpoint
8 |
9 | from data_partitioning import split_rolling_origin_recal
10 | from data_cleaning import get_cleaned_filtered_data, extract_asset
11 |
12 | from lstm_v03 import add_lag, top_down_acc, create_model, get_df
13 |
14 |
15 | DATA_PATH = './data/processed/cleaned_filtered_data.csv'
16 | HISTORY_PATH = './data/history/lstm_recal/'
17 | CHECKPOINT_PATH = './data/models/lstm_recal/'
18 |
19 | test_frac = 0.1
20 | n_epochs = 1
21 |
22 | init_train_frac = 0.1
23 | rolling_size = 10
24 |
25 | asset = 'INTC.O'
26 | lstm_size = 64
27 | lag = 15
28 | dropout = 0.10
29 |
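# Illustrative sketch only: split_rolling_origin_recal is imported from
# data_partitioning.py; this unused helper just documents the behaviour assumed here --
# the training window grows from init_train_size by rolling_size rows at each step,
# and the next rolling_size rows serve as the validation window.
def _rolling_origin_recal_sketch(df, init_train_size, rolling_size):
    end = init_train_size
    while end + rolling_size <= len(df):
        yield df.iloc[:end], df.iloc[end:end + rolling_size]
        end += rolling_size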
30 |
31 | if __name__ == '__main__':
32 |
33 | # Get the cleaned and processed data
34 | df_lag, _, n_features = get_df(test_frac, asset)
35 |
36 | # Instantiate the model
37 | model = create_model(lstm_size, dropout, lag, n_features)
38 |
39 | # Train and evaluate using the rolling origin recalibration strategy
40 | init_train_size = round(init_train_frac * len(df_lag))
41 | count = -1
42 | for df_train, df_val in split_rolling_origin_recal(df_lag,
43 | init_train_size, rolling_size):
44 | count += 1
45 | y_train = df_train['y']
46 | X_train = df_train.drop(['y'], axis=1)
47 | y_val = df_val['y']
48 | X_val = df_val.drop(['y'], axis=1)
49 |
50 |         # Reshape to match Keras input shape (batch_size, timesteps, input_dim)
51 | X_train = X_train.values.reshape((-1, lag+1, n_features))
52 | X_val = X_val.values.reshape((-1, lag+1, n_features))
53 |
54 | # Fit the model
55 | checkpoint_name = ('best-lstm-{:03d}-{}-{}-{}-{}.hdf5').format(
56 | count, asset, lstm_size, lag, int(dropout*100))
57 | checkpoint = ModelCheckpoint(
58 | CHECKPOINT_PATH + checkpoint_name,
59 | monitor='val_loss',
60 | save_best_only=True)
61 |
62 | history = model.fit(X_train,
63 | y_train,
64 | epochs=n_epochs,
65 | validation_data=(X_val, y_val),
66 | shuffle=False,
67 | callbacks=[checkpoint])
68 |
69 |         # Dump the history to a pickle file
70 | path = (HISTORY_PATH + 'lstm.{:03d}-{}-{}-{}-{}.pickle'.format(
71 | count, asset, lstm_size, lag, int(dropout*100)))
72 | with open(path, 'wb') as f:
73 | pickle.dump(history.history, f)
74 |
75 |
76 |
--------------------------------------------------------------------------------
/models/lstm_v04_analysis.py:
--------------------------------------------------------------------------------
1 | from os import listdir
2 | import matplotlib.pyplot as plt
3 | import pickle
4 |
5 | from lstm_v03 import create_model, get_df
6 |
7 | def concat_history():
8 | path = './data/history/lstm_recal/'
9 | keys = ['val_loss', 'val_top_down_acc', 'loss', 'top_down_acc']
10 |
11 |     hist_list = sorted(listdir(path))  # concatenate the windows in order
12 | history = {key: [] for key in keys}
13 |
14 | for hist_name in hist_list:
15 | with open(path + hist_name, 'rb') as f:
16 | hist = pickle.load(f)
17 |
18 | for key in keys:
19 | history[key] += hist[key]
20 |
21 | return history
22 |
23 |
24 | def plot_train_loss(history, ylim=(0, 0.03)):
25 | plt.ylim(ylim)
26 |
27 | plt.plot(history['loss'])
28 | plt.plot(history['val_loss'])
29 |
30 | plt.xlabel('Epoch')
31 |     plt.ylabel('Mean Squared Error Loss')
32 | plt.title('Training Loss')
33 | plt.legend(['Train','Val'])
34 | plt.show()
35 |
36 |
37 | def perform_tests():
38 |
39 | test_frac = 0.1
40 |
41 | asset = 'INTC.O'
42 | lstm_size = 64
43 | lag = 15
44 | dropout = 0.1
45 |
46 | path = './data/models/lstm_recal'
47 |     models = sorted(listdir(path))  # evaluate the checkpoints in window order
48 |
49 | df_lag, df_lag_test, n_features = get_df(test_frac, asset)
50 | X_test = df_lag_test.drop('y', axis=1)
51 | y_test = df_lag_test['y']
52 |
53 | # Reshape input data according to Keras documentation
54 | # (batch_size, timesteps, input_dim)
55 | X_test = X_test.values.reshape((-1, lag+1, n_features))
56 |
57 | model = create_model(lstm_size, dropout, lag, n_features)
58 |
59 | f = open('data/lstm_recalibration.csv', 'w+')
60 | f.write(model.metrics_names[0] + ',' + model.metrics_names[1] + '\n')
61 |
62 | for model_name in models:
63 | model.load_weights(path + '/' + model_name)
64 | scores = model.evaluate(X_test, y_test, verbose=0)
65 | f.write('{},{}\n'.format(scores[0], scores[1]))
66 |
67 | f.close()
68 |
69 |
70 | if __name__ == '__main__':
71 | # history = concat_history()
72 | # plot_train_loss(history)
73 |
74 | perform_tests()
--------------------------------------------------------------------------------
/models/lstm_v05.py:
--------------------------------------------------------------------------------
1 | ''' In this version, I train one model per remaining asset (WFC.N, AMZN.O, A.N and
2 | BHE.N) using the hyperparameters found in version 02 and the fixed origin splitting strategy.
3 | '''
4 |
5 | import pickle
6 | from keras.callbacks import ModelCheckpoint
7 |
8 | from data_partitioning import split_fixed_origin
9 | from data_cleaning import get_cleaned_filtered_data, extract_asset
10 |
11 | from lstm_v03 import add_lag, top_down_acc, create_model, get_df
12 |
13 |
14 | DATA_PATH = './data/processed/cleaned_filtered_data.csv'
15 | HISTORY_PATH = './data/history/{}/'
16 | CHECKPOINT_PATH = './data/models/{}/'
17 |
18 | ASSETS = ['WFC.N', 'AMZN.O', 'A.N', 'BHE.N']
19 |
20 | test_frac = 0.1
21 | train_frac = 0.8
22 | n_epochs = 50
23 |
24 | lstm_size = 64
25 | lag = 15
26 | dropout = 0.1
27 |
28 |
29 | if __name__ == '__main__':
30 |
31 |     for asset in ASSETS:
32 |
33 | # Get the cleaned and processed data
34 | df_lag, _, n_features = get_df(test_frac, asset)
35 |
36 | # Instantiate the model
37 | model = create_model(lstm_size, dropout, lag, n_features)
38 |
39 | # Train and evaluate the model
40 | train_size = round(train_frac * len(df_lag))
41 | for df_train, df_val in split_fixed_origin(df_lag, train_size):
42 | y_train = df_train['y']
43 | X_train = df_train.drop(['y'], axis=1)
44 | y_val = df_val['y']
45 | X_val = df_val.drop(['y'], axis=1)
46 |
47 |             # Reshape to match Keras input shape (batch_size, timesteps, input_dim)
48 | X_train = X_train.values.reshape((-1, lag+1, n_features))
49 | X_val = X_val.values.reshape((-1, lag+1, n_features))
50 |
51 | # Some user feedback
52 | print('\nFitting model for {}\n'.format(asset))
53 |
54 | # Fit the model
55 | checkpoint_name = ('best-lstm-{{epoch:03d}}-{{val_loss:.4f}}-{}-'
56 | '{}-{}-{}.hdf5').format(asset, lstm_size, lag, int(dropout*100))
57 | checkpoint = ModelCheckpoint(
58 | CHECKPOINT_PATH.format(asset) + checkpoint_name,
59 | monitor='val_loss',
60 | save_best_only=True)
61 |
62 | history = model.fit(X_train,
63 | y_train,
64 | epochs=n_epochs,
65 | validation_data=(X_val, y_val),
66 | shuffle=False,
67 | callbacks=[checkpoint])
68 |
69 |             # Dump the history to a pickle file
70 | path = (HISTORY_PATH.format(asset) + 'lstm-{}-{}-{}-{}.pickle'
71 | .format(asset, lstm_size, lag, int(dropout*100)))
72 | with open(path, 'wb') as f:
73 | pickle.dump(history.history, f)
74 |
75 |
76 |
--------------------------------------------------------------------------------
/models/lstm_v05_analysis.py:
--------------------------------------------------------------------------------
1 | from os import listdir
2 | import pandas as pd
3 |
4 | from lstm_v03 import create_model, get_df
5 |
6 |
7 | HISTORY_PATH = './data/history/{}/'
8 | CHECKPOINT_PATH = './data/models/{}/'
9 | ASSETS = ['WFC.N', 'AMZN.O', 'A.N', 'BHE.N']
10 |
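# Parse the checkpoint file names written by lstm_v05.py
# ('best-lstm-{epoch}-{val_loss}-{asset}-{lstm_size}-{lag}-{dropout}.hdf5')
# into a tidy table of results, one csv file per asset.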
11 | def models_to_csv():
12 | for asset in ASSETS:
13 | models = [f for f in listdir(CHECKPOINT_PATH.format(asset))]
14 | models = pd.DataFrame(models)
15 | models = models[0].str[:-5]
16 | models = models.str.split('-', expand=True)
17 | models = models.drop([0, 1], axis=1)
18 | models.columns = [
19 | 'epoch', 'val_loss', 'asset', 'lstm_size', 'lag', 'dropout']
20 |
21 | # Cast to numeric
22 | models['epoch'] = pd.to_numeric(models['epoch'])
23 | models['val_loss'] = pd.to_numeric(models['val_loss'])
24 | models['lstm_size'] = pd.to_numeric(models['lstm_size'])
25 | models['lag'] = pd.to_numeric(models['lag'])
26 | models['dropout'] = pd.to_numeric(models['dropout'])
27 |
28 | # Write to csv file
29 | models.to_csv('./data/lstm-{}-results.csv'.format(asset))
30 |
31 |
32 | def perform_test_best_model():
33 | test_frac = 0.1
34 |
35 | lstm_size = 64
36 | lag = 15
37 | dropout = 0.1
38 |
39 | for asset in ASSETS:
40 |
41 | print(asset)
42 |
43 | df_lag, df_lag_test, n_features = get_df(test_frac, asset)
44 | X_test = df_lag_test.drop('y', axis=1)
45 | y_test = df_lag_test['y']
46 |
47 | X_test = X_test.values.reshape((-1, lag+1, n_features))
48 |
49 |         w = sorted(listdir(CHECKPOINT_PATH.format(asset)))[-1]  # highest-epoch (best) checkpoint
50 | model = create_model(lstm_size, dropout, lag, n_features)
51 | model.load_weights(CHECKPOINT_PATH.format(asset) + w)
52 | scores = model.evaluate(X_test, y_test, verbose=0)
53 | print(scores[0], scores[1])
54 |
55 |
--------------------------------------------------------------------------------
/notebooks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/notebooks/__init__.py
--------------------------------------------------------------------------------
/notebooks/avis-kernel.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {
5 | "_uuid": "5ffb21374c7cf4b98e7239045ef9bf312effee25"
6 | },
7 | "cell_type": "markdown",
8 | "source": "# Vanilla Net"
9 | },
10 | {
11 | "metadata": {
12 | "trusted": true,
13 | "_uuid": "c9fd41029d6cfca6e9bae3f1bfd557a679eda5ec"
14 | },
15 | "cell_type": "code",
16 | "source": "import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom itertools import chain\n\n%matplotlib inline\n\nREDUCED = False # Reduce the data size for development and testing",
17 | "execution_count": 11,
18 | "outputs": []
19 | },
20 | {
21 | "metadata": {
22 | "trusted": true,
23 | "_uuid": "4fdb018eaba527ddc1dff59ae86845dabfbee52d"
24 | },
25 | "cell_type": "code",
26 | "source": "def clean_train_data(news_df, market_df):\n '''Clean and preprocess the news and market data for training.\n \n Parameters\n ----------\n news_df : dataframe\n See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.\n market_df : dataframe\n See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.\n \n Returns\n -------\n dataframe \n Cleaned data ready to be fed to the model.\n \n '''\n # assetCode, time, volume, open, returnsOpenPrevMktres1, \n # returnsOpenPrevMkres10, returnsOpenNextMktres10\n # sentimentNegative, sentimentNeutral, sentimentPositive\n cols = ['assetCode', 'time', 'volume', 'open', 'returnsOpenPrevMktres1', \n 'returnsOpenPrevMkres10', 'returnsOpenNextMktres10']\n cleaned_df = market_df.loc[cols]\n \n return None",
27 | "execution_count": 3,
28 | "outputs": []
29 | },
30 | {
31 | "metadata": {
32 | "trusted": true,
33 | "_uuid": "54214ee5c758e6f8f22637e8725d15ffc360c266"
34 | },
35 | "cell_type": "code",
36 |    "source": "#TODO: Add cleaned data specifications\n#TODO: Define Returns\ndef train_model(train_df):\n    '''Train the model using the given training data.\n    \n    Parameters\n    ----------\n    train_df : dataframe\n        Cleaned data. (Specifications)\n    \n    Returns\n    -------\n\n    '''\n    \n    return None",
37 | "execution_count": 4,
38 | "outputs": []
39 | },
40 | {
41 | "metadata": {
42 | "_uuid": "33186c3231b06ced0157278e9f5ed8f4f9c84192"
43 | },
44 | "cell_type": "markdown",
45 | "source": "## Get competition environment"
46 | },
47 | {
48 | "metadata": {
49 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
50 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
51 | "trusted": true
52 | },
53 | "cell_type": "code",
54 | "source": "from kaggle.competitions import twosigmanews\nenv = twosigmanews.make_env()",
55 | "execution_count": 5,
56 | "outputs": [
57 | {
58 | "output_type": "stream",
59 | "text": "Loading the data... This could take a minute.\nDone!\n",
60 | "name": "stdout"
61 | }
62 | ]
63 | },
64 | {
65 | "metadata": {
66 | "_uuid": "d14d4ae98c62668ab6ff1b1aa98168a204031571"
67 | },
68 | "cell_type": "markdown",
69 | "source": "## Get training data"
70 | },
71 | {
72 | "metadata": {
73 | "trusted": true,
74 | "_uuid": "c20fa6deeac9d374c98774abd90bdc76b023ee63"
75 | },
76 | "cell_type": "code",
77 | "source": "(market_train_df, news_train_df) = env.get_training_data()\n\nif REDUCED:\n market_train_df = market_train_df.tail(100_000)\n news_train_df = news_train_df.tail(300_000)",
78 | "execution_count": 7,
79 | "outputs": []
80 | },
81 | {
82 | "metadata": {
83 | "_uuid": "38a6ee0f4f565b35466396bd071ff6369a94a75c"
84 | },
85 | "cell_type": "markdown",
86 | "source": "## Preprocess and clean the data"
87 | },
88 | {
89 | "metadata": {
90 | "trusted": true,
91 | "_uuid": "1aef352177a2d14af19de1cb128a1142d75721cd"
92 | },
93 | "cell_type": "code",
94 | "source": "# Select columns and drop NA\ncols = ['assetCode', 'time', 'volume', 'open', 'returnsOpenPrevMktres1', \n 'returnsOpenPrevMktres10', 'returnsOpenNextMktres10']\nmarket_train_df = market_train_df.loc[:,cols]\nmarket_train_df.dropna(inplace=True)",
95 | "execution_count": 9,
96 | "outputs": []
97 | },
98 | {
99 | "metadata": {
100 | "trusted": true,
101 | "_uuid": "de7bbe376af84a62b32dbfe0f595368c8aa3d69a"
102 | },
103 | "cell_type": "code",
104 | "source": "# Select columns and drop NA\ncols = ['time','assetCodes', 'sentimentNegative', 'sentimentNeutral', 'sentimentPositive']\nnews_train_df = news_train_df.loc[:,cols]\nnews_train_df.dropna(inplace=True)",
105 | "execution_count": 10,
106 | "outputs": []
107 | },
108 | {
109 | "metadata": {
110 | "trusted": true,
111 | "_uuid": "4c2a68bb7f16ee1cafc39179199d90b7f7d97a5a",
112 | "scrolled": false
113 | },
114 | "cell_type": "code",
115 | "source": "# Normalize time\nmarket_train_df.loc[:, 'time'] = market_train_df.time.dt.normalize()\nnews_train_df.loc[:, 'time'] = news_train_df.time.dt.normalize()\n\n# assetCodes from String to List\nnews_train_df['assetCodes'] = news_train_df['assetCodes'].str.findall(f\"'([\\w\\./]+)'\")",
116 | "execution_count": 14,
117 | "outputs": []
118 | },
119 | {
120 | "metadata": {
121 | "trusted": true,
122 | "_uuid": "6fb4e18645fd024edd29f4e32d2e02e5b848e4c7"
123 | },
124 | "cell_type": "code",
125 | "source": "# Explode news on assetCodes\nassetCodes_expanded = list(chain(*news_train_df['assetCodes']))\nassetCodes_index = news_train_df.index.repeat(news_train_df['assetCodes'].apply(len))\n\nassert len(assetCodes_expanded) == len(assetCodes_index)",
126 | "execution_count": 39,
127 | "outputs": []
128 | },
129 | {
130 | "metadata": {
131 | "trusted": true,
132 | "_uuid": "a4094e3fd134232f335d4792ba04fb7e5d407cc6"
133 | },
134 | "cell_type": "code",
135 | "source": "assetCodes_df = pd.DataFrame({'index': assetCodes_index, 'assetCode': assetCodes_expanded})\nnews_train_df_exploded = news_train_df.merge(assetCodes_df, 'right', right_on='index', left_index=True, validate='1:m')\nnews_train_df_exploded.drop(['assetCodes', 'index'], 1, inplace=True)",
136 | "execution_count": 57,
137 | "outputs": []
138 | },
139 | {
140 | "metadata": {
141 | "trusted": true,
142 | "_uuid": "336cb9b8df3a7e56c9d315e2a94f7abdd2bee28c"
143 | },
144 | "cell_type": "code",
145 | "source": "# Compute means for same date and assetCode\nnews_agg_dict = {\n 'sentimentNegative':'mean'\n ,'sentimentNeutral':'mean'\n ,'sentimentPositive':'mean'\n}\nnews_train_df_agg = news_train_df_exploded.groupby(['time', 'assetCode'], as_index=False).agg(news_agg_dict)",
146 | "execution_count": 75,
147 | "outputs": []
148 | },
149 | {
150 | "metadata": {
151 | "trusted": true,
152 | "scrolled": true,
153 | "_uuid": "9ed3b0db1d9ac57e09d4818ce439e85a00d705bd"
154 | },
155 | "cell_type": "code",
156 | "source": "# Merge on market data\nX = market_train_df.merge(news_train_df_agg, 'left', ['time', 'assetCode'])",
157 | "execution_count": 77,
158 | "outputs": []
159 | },
160 | {
161 | "metadata": {
162 | "_uuid": "7f27c9b0c0b1e255935bc432d2454a36928d2b53"
163 | },
164 | "cell_type": "markdown",
165 | "source": "## Train the model"
166 | },
167 | {
168 | "metadata": {
169 | "trusted": true,
170 | "_uuid": "85e6235365c34283e32d0e0484f2874a14ebd092"
171 | },
172 | "cell_type": "code",
173 | "source": "train_model(train_df)",
174 | "execution_count": null,
175 | "outputs": []
176 | },
177 | {
178 | "metadata": {
179 | "_uuid": "763d8d5693ecb9156dc48a613b05ad28292b7d87"
180 | },
181 | "cell_type": "markdown",
182 | "source": "## Make predictions on test data"
183 | },
184 | {
185 | "metadata": {
186 | "trusted": true,
187 | "_uuid": "724c38149860c8e9058474ac9045c2301e8a20da"
188 | },
189 | "cell_type": "code",
190 | "source": "days = env.get_prediction_days()",
191 | "execution_count": null,
192 | "outputs": []
193 | },
194 | {
195 | "metadata": {
196 | "trusted": true,
197 | "_uuid": "a3f2197ed790f1aff1356a6954575fde976a4935"
198 | },
199 | "cell_type": "code",
200 | "source": "import numpy as np\ndef make_random_predictions(predictions_df):\n predictions_df.confidenceValue = 2.0 * np.random.rand(len(predictions_df)) - 1.0",
201 | "execution_count": null,
202 | "outputs": []
203 | },
204 | {
205 | "metadata": {
206 | "trusted": true,
207 | "_uuid": "ef60bc52a8a228e5a2ce18e4bd416f1f1f25aeae"
208 | },
209 | "cell_type": "code",
210 | "source": "for (market_obs_df, news_obs_df, predictions_template_df) in days:\n make_random_predictions(predictions_template_df)\n env.predict(predictions_template_df)\nprint('Done!')",
211 | "execution_count": null,
212 | "outputs": []
213 | },
214 | {
215 | "metadata": {
216 | "trusted": true,
217 | "_uuid": "2c8ed34ffb2c47c6e124530ec798c0b4eb01ddd5"
218 | },
219 | "cell_type": "code",
220 | "source": "env.write_submission_file()",
221 | "execution_count": null,
222 | "outputs": []
223 | }
224 | ],
225 | "metadata": {
226 | "kernelspec": {
227 | "display_name": "Python 3",
228 | "language": "python",
229 | "name": "python3"
230 | },
231 | "language_info": {
232 | "name": "python",
233 | "version": "3.6.6",
234 | "mimetype": "text/x-python",
235 | "codemirror_mode": {
236 | "name": "ipython",
237 | "version": 3
238 | },
239 | "pygments_lexer": "ipython3",
240 | "nbconvert_exporter": "python",
241 | "file_extension": ".py"
242 | }
243 | },
244 | "nbformat": 4,
245 | "nbformat_minor": 1
246 | }
--------------------------------------------------------------------------------
/notebooks/exploration-filter-non-continuous-news.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Goal \n",
8 | "Filter non-continuous stocks after merging with the news data."
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 20,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import sys\n",
18 | "sys.path.append(r'../models/')\n",
19 | "\n",
20 | "import pandas as pd\n",
21 | "import matplotlib.pyplot as plt\n",
22 | "from data_cleaning import MARKET_DATA_PATH, NEWS_DATA_PATH, clean_market_data, clean_news_data, clean_data\n",
23 | "\n",
24 | "%matplotlib inline"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 4,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "market_train_df = pd.read_csv(MARKET_DATA_PATH)"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 5,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "clean_market_df = clean_market_data(market_train_df)\n",
43 | "del market_train_df"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 6,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "news_train_df = pd.read_csv(NEWS_DATA_PATH)"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 8,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "clean_news_df = clean_news_data(news_train_df)\n",
62 | "del news_train_df"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 9,
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "data": {
240 | "text/plain": [
241 | " time assetCode sentimentNegative sentimentNeutral \\\n",
242 | "0 2007-01-01 0857.DE 0.500739 0.419327 \n",
243 | "1 2007-01-01 0857.F 0.500739 0.419327 \n",
244 | "2 2007-01-01 0857.HK 0.500739 0.419327 \n",
245 | "3 2007-01-01 6758.T 0.146765 0.392352 \n",
246 | "4 2007-01-01 BHP.AX 0.130677 0.465433 \n",
247 | "\n",
248 | " sentimentPositive urgency bodySize relevance ACN ACT ... ONE PNW \\\n",
249 | "0 0.079934 3.0 1438.0 0.235702 0 0 ... 0 0 \n",
250 | "1 0.079934 3.0 1438.0 0.235702 0 0 ... 0 0 \n",
251 | "2 0.079934 3.0 1438.0 0.235702 0 0 ... 0 0 \n",
252 | "3 0.460883 3.0 2742.0 0.204124 0 0 ... 0 0 \n",
253 | "4 0.403891 3.0 9674.0 0.178174 0 0 ... 0 0 \n",
254 | "\n",
255 | " PRN RNS ROM RTRS SEHK SET SSN TEN \n",
256 | "0 0 0 0 1 0 0 0 0 \n",
257 | "1 0 0 0 1 0 0 0 0 \n",
258 | "2 0 0 0 1 0 0 0 0 \n",
259 | "3 0 0 0 1 0 0 0 0 \n",
260 | "4 0 0 0 1 0 0 0 0 \n",
261 | "\n",
262 | "[5 rows x 38 columns]"
263 | ]
264 | },
265 | "execution_count": 9,
266 | "metadata": {},
267 | "output_type": "execute_result"
268 | }
269 | ],
270 | "source": [
271 | "clean_news_df.head()"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": 10,
277 | "metadata": {},
278 | "outputs": [
279 | {
280 | "data": {
281 | "text/plain": [
282 | "3652"
283 | ]
284 | },
285 | "execution_count": 10,
286 | "metadata": {},
287 | "output_type": "execute_result"
288 | }
289 | ],
290 | "source": [
291 | "clean_news_df.time.nunique()"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 | "This number is larger than for the market data (2488). Probably because the weekends are included?"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 11,
304 | "metadata": {},
305 | "outputs": [
306 | {
307 | "data": {
308 | "text/plain": [
309 | "0"
310 | ]
311 | },
312 | "execution_count": 11,
313 | "metadata": {},
314 | "output_type": "execute_result"
315 | }
316 | ],
317 | "source": [
318 | "sizes = clean_news_df.groupby('assetCode').size()\n",
319 | "sel = sizes == clean_news_df.time.nunique()\n",
320 | "sum(sel)"
321 | ]
322 | },
323 | {
324 | "cell_type": "markdown",
325 | "metadata": {},
326 | "source": [
327 | "This is what I expected: it is very unlikely that news are published for a company every single day for 10+ years. Will probably have to engineer a feature taking the average of the different metrics over some period and add a feature of the number of articles included in the average (or the sum of their importance/urgency).\n",
328 | "\n",
329 |     "But first, let's look at the distributions to have a better idea of the extent of the problem."
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 21,
335 | "metadata": {},
336 | "outputs": [
337 | {
338 | "data": {
339 | "text/plain": [
340 | "(array([1.044e+03, 6.610e+02, 5.780e+02, 5.620e+02, 5.430e+02, 5.500e+02,\n",
341 | " 5.750e+02, 5.160e+02, 4.690e+02, 4.340e+02, 4.340e+02, 3.920e+02,\n",
342 | " 3.480e+02, 3.280e+02, 3.570e+02, 3.240e+02, 2.740e+02, 2.490e+02,\n",
343 | " 2.840e+02, 2.730e+02, 2.130e+02, 2.260e+02, 1.940e+02, 1.850e+02,\n",
344 | " 1.640e+02, 1.810e+02, 1.770e+02, 1.410e+02, 1.290e+02, 1.250e+02,\n",
345 | " 1.210e+02, 1.190e+02, 9.400e+01, 1.310e+02, 1.250e+02, 7.400e+01,\n",
346 | " 7.500e+01, 9.300e+01, 7.400e+01, 6.900e+01, 9.400e+01, 6.000e+01,\n",
347 | " 6.000e+01, 5.700e+01, 6.200e+01, 5.700e+01, 4.400e+01, 4.000e+01,\n",
348 | " 3.200e+01, 4.200e+01, 4.500e+01, 4.800e+01, 5.500e+01, 2.900e+01,\n",
349 | " 4.700e+01, 3.000e+01, 3.300e+01, 2.800e+01, 4.400e+01, 2.700e+01,\n",
350 | " 2.300e+01, 3.500e+01, 3.000e+01, 3.700e+01, 3.200e+01, 2.100e+01,\n",
351 | " 1.300e+01, 8.000e+00, 3.200e+01, 1.500e+01, 1.600e+01, 1.500e+01,\n",
352 | " 1.500e+01, 1.500e+01, 1.600e+01, 2.300e+01, 2.100e+01, 1.700e+01,\n",
353 | " 1.400e+01, 1.700e+01, 1.900e+01, 2.300e+01, 2.100e+01, 1.800e+01,\n",
354 | " 1.400e+01, 1.100e+01, 1.200e+01, 1.000e+01, 1.800e+01, 1.700e+01,\n",
355 | " 1.800e+01, 2.000e+01, 1.200e+01, 1.200e+01, 1.200e+01, 1.000e+01,\n",
356 | " 1.000e+01, 8.000e+00, 9.000e+00, 1.400e+01, 1.200e+01, 3.000e+00,\n",
357 | " 6.000e+00, 6.000e+00, 8.000e+00, 3.000e+00, 6.000e+00, 1.000e+01,\n",
358 | " 9.000e+00, 4.000e+00, 4.000e+00, 5.000e+00, 7.000e+00, 7.000e+00,\n",
359 | " 7.000e+00, 9.000e+00, 3.000e+00, 1.400e+01, 9.000e+00, 9.000e+00,\n",
360 | " 4.000e+00, 2.000e+00, 9.000e+00, 1.500e+01, 9.000e+00, 8.000e+00,\n",
361 | " 1.200e+01, 2.000e+00, 4.000e+00, 3.000e+00, 7.000e+00, 1.000e+01,\n",
362 | " 2.000e+00, 3.000e+00, 4.000e+00, 0.000e+00, 1.300e+01, 9.000e+00,\n",
363 | " 7.000e+00, 2.000e+00, 3.000e+00, 6.000e+00, 5.000e+00, 1.200e+01,\n",
364 | " 8.000e+00, 3.000e+00, 1.000e+01, 0.000e+00, 1.500e+01, 9.000e+00,\n",
365 | " 6.000e+00, 5.000e+00, 9.000e+00, 4.000e+00, 6.000e+00, 5.000e+00,\n",
366 | " 3.000e+00, 2.000e+00, 5.000e+00, 1.000e+00, 4.000e+00, 4.000e+00,\n",
367 | " 5.000e+00, 6.000e+00, 3.000e+00, 3.000e+00, 6.000e+00, 3.000e+00,\n",
368 | " 1.000e+00, 1.000e+00, 5.000e+00, 1.000e+00, 6.000e+00, 3.000e+00,\n",
369 | " 3.000e+00, 0.000e+00, 6.000e+00, 1.000e+00, 1.000e+00, 5.000e+00,\n",
370 | " 3.000e+00, 2.000e+00, 0.000e+00, 7.000e+00, 7.000e+00, 2.000e+00,\n",
371 | " 3.000e+00, 0.000e+00, 3.000e+00, 5.000e+00, 1.300e+01, 0.000e+00,\n",
372 | " 7.000e+00, 2.000e+00, 1.000e+00, 1.000e+00, 6.000e+00, 0.000e+00,\n",
373 | " 8.000e+00, 4.000e+00, 2.000e+00, 0.000e+00, 8.000e+00, 2.000e+00,\n",
374 | " 4.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 4.000e+00, 3.000e+00,\n",
375 | " 3.000e+00, 1.000e+00, 2.000e+00, 0.000e+00, 0.000e+00, 4.000e+00,\n",
376 | " 1.000e+00, 2.000e+00, 0.000e+00, 5.000e+00, 5.000e+00, 2.000e+00,\n",
377 | " 0.000e+00, 0.000e+00, 0.000e+00, 4.000e+00, 5.000e+00, 4.000e+00,\n",
378 | " 2.000e+00, 1.000e+00, 0.000e+00, 2.000e+00, 3.000e+00, 3.000e+00,\n",
379 | " 5.000e+00, 7.000e+00, 2.000e+00, 1.000e+00, 0.000e+00, 4.000e+00,\n",
380 | " 8.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 2.000e+00, 0.000e+00,\n",
381 | " 6.000e+00, 0.000e+00, 0.000e+00, 5.000e+00, 0.000e+00, 2.000e+00,\n",
382 | " 2.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 7.000e+00,\n",
383 | " 3.000e+00, 1.400e+01, 1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00,\n",
384 | " 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 2.000e+00, 0.000e+00,\n",
385 | " 6.000e+00, 2.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n",
386 | " 0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00, 3.000e+00, 4.000e+00,\n",
387 | " 8.000e+00, 0.000e+00, 2.000e+00, 2.000e+00, 4.000e+00, 1.000e+00,\n",
388 | " 3.000e+00, 0.000e+00, 3.000e+00, 0.000e+00, 7.000e+00, 0.000e+00,\n",
389 | " 0.000e+00, 0.000e+00, 0.000e+00, 4.000e+00, 6.000e+00, 0.000e+00,\n",
390 | " 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 5.000e+00,\n",
391 | " 7.000e+00, 9.000e+00]),\n",
392 | " array([1.00000000e+00, 1.10292208e+01, 2.10584416e+01, 3.10876623e+01,\n",
393 | " 4.11168831e+01, 5.11461039e+01, 6.11753247e+01, 7.12045455e+01,\n",
394 | " 8.12337662e+01, 9.12629870e+01, 1.01292208e+02, 1.11321429e+02,\n",
395 | " 1.21350649e+02, 1.31379870e+02, 1.41409091e+02, 1.51438312e+02,\n",
396 | " 1.61467532e+02, 1.71496753e+02, 1.81525974e+02, 1.91555195e+02,\n",
397 | " 2.01584416e+02, 2.11613636e+02, 2.21642857e+02, 2.31672078e+02,\n",
398 | " 2.41701299e+02, 2.51730519e+02, 2.61759740e+02, 2.71788961e+02,\n",
399 | " 2.81818182e+02, 2.91847403e+02, 3.01876623e+02, 3.11905844e+02,\n",
400 | " 3.21935065e+02, 3.31964286e+02, 3.41993506e+02, 3.52022727e+02,\n",
401 | " 3.62051948e+02, 3.72081169e+02, 3.82110390e+02, 3.92139610e+02,\n",
402 | " 4.02168831e+02, 4.12198052e+02, 4.22227273e+02, 4.32256494e+02,\n",
403 | " 4.42285714e+02, 4.52314935e+02, 4.62344156e+02, 4.72373377e+02,\n",
404 | " 4.82402597e+02, 4.92431818e+02, 5.02461039e+02, 5.12490260e+02,\n",
405 | " 5.22519481e+02, 5.32548701e+02, 5.42577922e+02, 5.52607143e+02,\n",
406 | " 5.62636364e+02, 5.72665584e+02, 5.82694805e+02, 5.92724026e+02,\n",
407 | " 6.02753247e+02, 6.12782468e+02, 6.22811688e+02, 6.32840909e+02,\n",
408 | " 6.42870130e+02, 6.52899351e+02, 6.62928571e+02, 6.72957792e+02,\n",
409 | " 6.82987013e+02, 6.93016234e+02, 7.03045455e+02, 7.13074675e+02,\n",
410 | " 7.23103896e+02, 7.33133117e+02, 7.43162338e+02, 7.53191558e+02,\n",
411 | " 7.63220779e+02, 7.73250000e+02, 7.83279221e+02, 7.93308442e+02,\n",
412 | " 8.03337662e+02, 8.13366883e+02, 8.23396104e+02, 8.33425325e+02,\n",
413 | " 8.43454545e+02, 8.53483766e+02, 8.63512987e+02, 8.73542208e+02,\n",
414 | " 8.83571429e+02, 8.93600649e+02, 9.03629870e+02, 9.13659091e+02,\n",
415 | " 9.23688312e+02, 9.33717532e+02, 9.43746753e+02, 9.53775974e+02,\n",
416 | " 9.63805195e+02, 9.73834416e+02, 9.83863636e+02, 9.93892857e+02,\n",
417 | " 1.00392208e+03, 1.01395130e+03, 1.02398052e+03, 1.03400974e+03,\n",
418 | " 1.04403896e+03, 1.05406818e+03, 1.06409740e+03, 1.07412662e+03,\n",
419 | " 1.08415584e+03, 1.09418506e+03, 1.10421429e+03, 1.11424351e+03,\n",
420 | " 1.12427273e+03, 1.13430195e+03, 1.14433117e+03, 1.15436039e+03,\n",
421 | " 1.16438961e+03, 1.17441883e+03, 1.18444805e+03, 1.19447727e+03,\n",
422 | " 1.20450649e+03, 1.21453571e+03, 1.22456494e+03, 1.23459416e+03,\n",
423 | " 1.24462338e+03, 1.25465260e+03, 1.26468182e+03, 1.27471104e+03,\n",
424 | " 1.28474026e+03, 1.29476948e+03, 1.30479870e+03, 1.31482792e+03,\n",
425 | " 1.32485714e+03, 1.33488636e+03, 1.34491558e+03, 1.35494481e+03,\n",
426 | " 1.36497403e+03, 1.37500325e+03, 1.38503247e+03, 1.39506169e+03,\n",
427 | " 1.40509091e+03, 1.41512013e+03, 1.42514935e+03, 1.43517857e+03,\n",
428 | " 1.44520779e+03, 1.45523701e+03, 1.46526623e+03, 1.47529545e+03,\n",
429 | " 1.48532468e+03, 1.49535390e+03, 1.50538312e+03, 1.51541234e+03,\n",
430 | " 1.52544156e+03, 1.53547078e+03, 1.54550000e+03, 1.55552922e+03,\n",
431 | " 1.56555844e+03, 1.57558766e+03, 1.58561688e+03, 1.59564610e+03,\n",
432 | " 1.60567532e+03, 1.61570455e+03, 1.62573377e+03, 1.63576299e+03,\n",
433 | " 1.64579221e+03, 1.65582143e+03, 1.66585065e+03, 1.67587987e+03,\n",
434 | " 1.68590909e+03, 1.69593831e+03, 1.70596753e+03, 1.71599675e+03,\n",
435 | " 1.72602597e+03, 1.73605519e+03, 1.74608442e+03, 1.75611364e+03,\n",
436 | " 1.76614286e+03, 1.77617208e+03, 1.78620130e+03, 1.79623052e+03,\n",
437 | " 1.80625974e+03, 1.81628896e+03, 1.82631818e+03, 1.83634740e+03,\n",
438 | " 1.84637662e+03, 1.85640584e+03, 1.86643506e+03, 1.87646429e+03,\n",
439 | " 1.88649351e+03, 1.89652273e+03, 1.90655195e+03, 1.91658117e+03,\n",
440 | " 1.92661039e+03, 1.93663961e+03, 1.94666883e+03, 1.95669805e+03,\n",
441 | " 1.96672727e+03, 1.97675649e+03, 1.98678571e+03, 1.99681494e+03,\n",
442 | " 2.00684416e+03, 2.01687338e+03, 2.02690260e+03, 2.03693182e+03,\n",
443 | " 2.04696104e+03, 2.05699026e+03, 2.06701948e+03, 2.07704870e+03,\n",
444 | " 2.08707792e+03, 2.09710714e+03, 2.10713636e+03, 2.11716558e+03,\n",
445 | " 2.12719481e+03, 2.13722403e+03, 2.14725325e+03, 2.15728247e+03,\n",
446 | " 2.16731169e+03, 2.17734091e+03, 2.18737013e+03, 2.19739935e+03,\n",
447 | " 2.20742857e+03, 2.21745779e+03, 2.22748701e+03, 2.23751623e+03,\n",
448 | " 2.24754545e+03, 2.25757468e+03, 2.26760390e+03, 2.27763312e+03,\n",
449 | " 2.28766234e+03, 2.29769156e+03, 2.30772078e+03, 2.31775000e+03,\n",
450 | " 2.32777922e+03, 2.33780844e+03, 2.34783766e+03, 2.35786688e+03,\n",
451 | " 2.36789610e+03, 2.37792532e+03, 2.38795455e+03, 2.39798377e+03,\n",
452 | " 2.40801299e+03, 2.41804221e+03, 2.42807143e+03, 2.43810065e+03,\n",
453 | " 2.44812987e+03, 2.45815909e+03, 2.46818831e+03, 2.47821753e+03,\n",
454 | " 2.48824675e+03, 2.49827597e+03, 2.50830519e+03, 2.51833442e+03,\n",
455 | " 2.52836364e+03, 2.53839286e+03, 2.54842208e+03, 2.55845130e+03,\n",
456 | " 2.56848052e+03, 2.57850974e+03, 2.58853896e+03, 2.59856818e+03,\n",
457 | " 2.60859740e+03, 2.61862662e+03, 2.62865584e+03, 2.63868506e+03,\n",
458 | " 2.64871429e+03, 2.65874351e+03, 2.66877273e+03, 2.67880195e+03,\n",
459 | " 2.68883117e+03, 2.69886039e+03, 2.70888961e+03, 2.71891883e+03,\n",
460 | " 2.72894805e+03, 2.73897727e+03, 2.74900649e+03, 2.75903571e+03,\n",
461 | " 2.76906494e+03, 2.77909416e+03, 2.78912338e+03, 2.79915260e+03,\n",
462 | " 2.80918182e+03, 2.81921104e+03, 2.82924026e+03, 2.83926948e+03,\n",
463 | " 2.84929870e+03, 2.85932792e+03, 2.86935714e+03, 2.87938636e+03,\n",
464 | " 2.88941558e+03, 2.89944481e+03, 2.90947403e+03, 2.91950325e+03,\n",
465 | " 2.92953247e+03, 2.93956169e+03, 2.94959091e+03, 2.95962013e+03,\n",
466 | " 2.96964935e+03, 2.97967857e+03, 2.98970779e+03, 2.99973701e+03,\n",
467 | " 3.00976623e+03, 3.01979545e+03, 3.02982468e+03, 3.03985390e+03,\n",
468 | " 3.04988312e+03, 3.05991234e+03, 3.06994156e+03, 3.07997078e+03,\n",
469 | " 3.09000000e+03]),\n",
470 | " )"
471 | ]
472 | },
473 | "execution_count": 21,
474 | "metadata": {},
475 | "output_type": "execute_result"
476 | },
477 | {
478 | "data": {
479 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD8CAYAAAB+UHOxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAEYZJREFUeJzt3W2MXFd9x/Hvv3ESHhU7ycpKbavrFAsUVS1YVjACoSqGkIeqTqWAgipiUVeWSmihaVUWITW0fWOqlpRIKMjgtE4VQWigilWnpa4ThPoihg2EkMQNXoLBtpx4IYmhRRRS/n0xZ8lkmdldz92dp/P9SKO599wz956zdzy/uefeO47MRJJUn18adAMkSYNhAEhSpQwASaqUASBJlTIAJKlSBoAkVcoAkKRKGQCSVCkDQJIqtWrQDVjIxRdfnJOTk4NuhiSNlIceeuh7mTmxWL2hDoDJyUmmp6cH3QxJGikR8Z2l1HMISJIqZQBIUqUMAEmqlAEgSZUyACSpUgaAJFXKAJCkShkAklQpA0CSKjXWATA5dWDQTZCkoTXWASBJ6s4AkKRKGQCSVCkDQJIqtWgARMQdEXE6Ih5tK7swIg5GxNHyvKaUR0TcFhEzEfFIRGxue82OUv9oROxYme5IkpZqKUcA/wBcNa9sCjiUmZuAQ2Ue4GpgU3nsAm6HVmAAtwCvBy4HbpkLDUnSYCwaAJn5JeCZecXbgX1leh9wXVv5ndnyILA6Ii4B3gYczMxnMvNZ4CC/GCqSpD7q9RzA2sw8VaafAtaW6XXA8bZ6J0pZt/JfEBG7ImI6IqZnZ2d7bJ4kaTGNTwJnZgK5DG2ZW9+ezNySmVsmJhb9Ly0lST3qNQCeLkM7lOfTpfwksKGt3vpS1q1ckjQgvQbAfmDuSp4dwL1t5TeWq4G2AmfKUNEXgCsjYk05+XtlKZMkDciqxSpExKeB3wQujogTtK7m2Q18NiJ2At8B3lGq3wdcA8wAPwLeDZCZz0TEXwFfKfX+MjPnn1iWJPXRogGQme/ssmhbh7oJ3NRlPXcAd5xV6yRJK8Y7gSWpUgaAJFXKAJCkShkAklQpA0CSKmUASFKlDABJqpQBIEmVMgAkqVIGgCRVygCQpEoZAJJUKQNAkiplAEhSpQwASaqUASBJlTIAJKlSBoAkVcoAkKRKGQCSVCkDQJIqZQBIUqUMAEmqlAEgSZUyACSpUgaAJFXKAJCkShkAklQpA0CSKmUASFKlGgVARPxxRDwWEY9GxKcj4iURsTEiDkfETETcHRHnlbrnl/mZsnxyOTogSepNzwEQEeuAPwK2ZOavAecANwAfAW7NzFcBzwI7y0t2As+W8ltLPUnSgDQdAloFvDQiVgEvA04BVwD3lOX7gOvK9PYyT1m+LSKi4fYlST3qOQAy8yTwN8B3aX3wnwEeAp7LzOdLtRPAujK9DjheXvt8qX9Rr9uXJDXTZAhoDa1v9RuBXwZeDlzVtEERsSsipiNienZ2tunqJEldNBkCegvw7cyczcyfAp8H3gisLkNCAOuBk2X6JLABoCy/APj+/JVm5p7M3JKZWyYmJho0T5K0kCYB8F1ga0S8rIzlbwMeBx4Ari91dgD3lun9ZZ6y/P7MzAbblyQ10OQcwGFaJ3O/CnyjrGsP8AHg5oiYoTXGv7e8ZC9wUSm/GZhq0G5JUkOrFq/SXWbeAtwyr/hJ4PIOdX8MvL3J9iRJy8c7gSWpUgaAJFXKAJCkShkAklQpA0CSKmUASFKlDABJqpQBIEmVMgAkqVIGgCRVygCQpEoZAJJUKQNAkiplAEhSpQwASaqUASBJlTIAJKlSBoAkVcoAkKRKGQCSVCkDQJIqZQBIUqUMAEmq1NgHwOTUgUE3QZKG0tgHgCSpMwNAkiplAEhSpQwASaqUASBJlTIAJKlSBoAkVapRAETE6oi4JyL+KyKORMQbIuLCiDgYEUfL85pSNyLitoiYiYhHImLz8nRBktSLpkcAHwP+LTNfA/wGcASYAg5l5ibgUJkHuBrYVB67gNsbbluS1EDPARARFwBvBvYCZOZPMvM5YDuwr1TbB1xXprcDd2bLg8DqiLik55ZLkhppcgSwEZgF/j4ivhYRn4qIlwNrM/NUqfMUsLZMrwOOt73+RCmTJA1AkwBYBWwGbs/M1wH/wwvDPQBkZgJ5NiuNiF0RMR0R07Ozsw2aJ0laSJMAOAGcyMzDZf4eWoHw9NzQTnk+XZafBDa0vX59KXuRzNyTmVsyc8vExESD5kmSFtJzAGTmU8DxiHh1KdoGPA7sB3aUsh3AvWV6P3BjuRpoK3CmbahIktRnqxq+/g+BuyLiPOBJ4N20QuWzEbET+A7wjlL3PuAaYAb4UakrSRqQRgGQmQ8DWzos2tahbgI3NdleryanDnBs97WD2LQkDa3q7gT2P4iRpJbqAkCS1GIASFKlDABJqpQBIEmVqioAPAEsSS+oJgD88JekF6smACRJL2YASFKlDABJqpQBIEmVqjIAPCEsSZUGgCTJAJCkahkAklQpA0CSKmUASFKlDABJqpQBIEmVMgAkqVIGgCRVqtoA8G5gSbWrNgAkqXYGgCRVygCQpEoZAJJUKQNAkiplAEhSpaoOgMmpA14OKqlaVQeAJNXMAJCkSjUOgIg4JyK+FhH/UuY3RsThiJiJiLsj4rxSfn6ZnynLJ5tuW5LUu+U4AngfcKRt/iPArZn5KuBZYGcp3wk8W8pvLfUkSQPSKAAiYj1wLfCpMh/AFcA9pco+4Loyvb3MU5ZvK/UlSQPQ9Ajg74A/A35W5i8CnsvM58v8CWBdmV4HHAcoy8+U+i8SEbsiYjoipmdnZxs2T5LUTc8BEBG/BZzOzIeWsT1k5p7M3JKZWyYmJpZz1ZKkNk2OAN4I/HZEHAM+Q2vo52PA6ohYVeqsB06W6ZPABoCy/ALg+w22v2y8F0BSjXoOgMz8YGauz8xJ4Abg/sz8XeAB4PpSbQdwb5neX+Ypy+/PzOx1+5KkZlbiPoAPADdHxAytMf69pXwvcFEpvxmYWoFtS5KWaNXiVRaXmV8EvlimnwQu71Dnx8Dbl2N7kqTmvBNYkiplAEhSpQwASaqUATCPl4RKqoUBIEmVMgAKv/lLqo0BIEmVMgAkqVIGgCRVygBo43kASTUxADowCCTVwACQpEoZAJJUKQNAkiplAEhSpQwASaqUASBJlTIAFuEloZLGlQEgSZUyALrwm7+kcWcALIFhIGkcGQCSVCkDQJIqZQBIUqUMgAU49i9pnBkAklQpA0CSKmUASFKlDABJqpQBcBY8KSxpnPQcABGxISIeiIjHI+KxiHhfKb8wIg5GxNHyvKaUR0TcFhEzEfFIRGxerk5Iks5ekyOA54E/yczLgK3ATRFxGTAFHMrMTcChMg9wNbCpPHYBtzfYtiSpoZ4DIDNPZeZXy/QPgSPAOmA7sK9U2wdcV6a3A3dmy4PA6oi4pOeW95nDP5LGzbKcA4iISeB1wGFgbWaeKoueAtaW6XXA8baXnShlkqQ
BaBwAEfEK4HPA+zPzB+3LMjOBPMv17YqI6YiYnp2dbdo8SVIXjQIgIs6l9eF/V2Z+vhQ/PTe0U55Pl/KTwIa2l68vZS+SmXsyc0tmbpmYmGjSvBXhUJCkcdHkKqAA9gJHMvOjbYv2AzvK9A7g3rbyG8vVQFuBM21DRZKkPmtyBPBG4F3AFRHxcHlcA+wG3hoRR4G3lHmA+4AngRngk8B7Gmx7oDwKkDQOVvX6wsz8TyC6LN7WoX4CN/W6vWE0OXWAY7uvHXQzJKkn3gnckEcDkkaVASBJlTIAeuQ3f0mjzgCQpEoZAJJUKQNAkiplAEhSpQyAZeAJYUmjyACQpEoZAMuk01GARwaShpkBsIwMAUmjxABYZpNTB/zQlzQSDABJqpQBIEmVMgAkqVIGgCRVygBYIYtdEeSJYkmDZgBIUqUMAEmqlAEgSZUyAPqg/eYwx/4lDQsDYAAMAUnDwACQpEqtGnQDatbpSODY7msH0BJJNfIIYEQ4bCRpuRkAI8AbyCStBANgyMz/sF/sA99AkNQrA2AINf1QNxQkLYUBMKR6+ea/0FCRoSBpPgNghMy/mazTzWV+0EtaKgNgTDT54O/2WsNEGm99vw8gIq4CPgacA3wqM3f3uw21mJw6sOT7CuY+7I/tvrbjEcX89bTX8d4FaTT19QggIs4BPg5cDVwGvDMiLutnG2rT6aqi+Y9u9TuVdzv30O2KpaVcybQUSzlKWep2Fqq32LmVYTUKbewn/x5L0+8hoMuBmcx8MjN/AnwG2N7nNlRnuf4xLOUH7RYKl07t6RZCndbRab7bOhYqX8qyTu3sNN1PTS4LHsaLAsY9bHvty3J9aVqKfg8BrQOOt82fAF7f5zZoABb7ADqbb+VNvsEvNJR1Nuvp9rr5Q2jd6syto73+3HT78k7t7jZc1758/jbmv67TfKdtdlrX/HYvtI5O21uord2Wd2vH/G13W08ni62nU/1u8/P/Lt3Wtdi2+h2AkZn921jE9cBVmfn7Zf5dwOsz871tdXYBu8rsq4EnetzcxcD3GjR3WIxDP+zD8BiHftiHxf1KZk4sVqnfRwAngQ1t8+tL2c9l5h5gT9MNRcR0Zm5pup5BG4d+2IfhMQ79sA/Lp9/nAL4CbIqIjRFxHnADsL/PbZAk0ecjgMx8PiLeC3yB1mWgd2TmY/1sgySppe/3AWTmfcB9fdhU42GkITEO/bAPw2Mc+mEflklfTwJLkoaHPwUhSZUaywCIiKsi4omImImIqUG3ZyERcSwivhERD0fEdCm7MCIORsTR8rymlEdE3Fb69UhEbB5gu++IiNMR8Whb2Vm3OyJ2lPpHI2LHEPThwxFxsuyPhyPimrZlHyx9eCIi3tZWPrD3W0RsiIgHIuLxiHgsIt5XykdmXyzQh5HZFxHxkoj4ckR8vfThL0r5xog4XNpzd7n4hYg4v8zPlOWTi/VtRWTmWD1onVz+FnApcB7wdeCyQbdrgfYeAy6eV/bXwFSZngI+UqavAf4VCGArcHiA7X4zsBl4tNd2AxcCT5bnNWV6zYD78GHgTzvUvay8l84HNpb32DmDfr8BlwCby/QrgW+Wto7MvligDyOzL8rf8xVl+lzgcPn7fha4oZR/AviDMv0e4BNl+gbg7oX6tlLtHscjgHH4uYntwL4yvQ+4rq38zmx5EFgdEZcMooGZ+SXgmXnFZ9vutwEHM/OZzHwWOAhctfKtb+nSh262A5/JzP/NzG8DM7TeawN9v2Xmqcz8apn+IXCE1h33I7MvFuhDN0O3L8rf87/L7LnlkcAVwD2lfP5+mNs/9wDbIiLo3rcVMY4B0OnnJhZ6Mw1aAv8eEQ9F6y5ogLWZeapMPwWsLdPD3rezbfew9ue9ZXjkjrmhE0agD2UY4XW0vn2O5L6Y1wcYoX0REedExMPAaVoB+i3gucx8vkN7ft7WsvwMcBF97sM4BsCoeVNmbqb1C6k3RcSb2xdm67hw5C7VGtV2A7cDvwq8FjgF/O1gm7M0EfEK4HPA+zPzB+3LRmVfdOjDSO2LzPy/zHwtrV84uBx4zYCbtKhxDIBFf25imGTmyfJ8GvhnWm+cp+eGdsrz6VJ92Pt2tu0euv5k5tPlH/LPgE/ywuH30PYhIs6l9cF5V2Z+vhSP1L7o1IdR3BcAmfkc8ADwBlpDbHP3W7W35+dtLcsvAL5Pn/swjgEwMj83EREvj4hXzk0DVwKP0mrv3FUYO4B7y/R+4MZyJcdW4EzbYf4wONt2fwG4MiLWlMP7K0vZwMw7p/I7tPYHtPpwQ7l6YyOwCfgyA36/lXHjvcCRzPxo26KR2Rfd+jBK+yIiJiJidZl+KfBWWucyHgCuL9Xm74e5/XM9cH85UuvWt5WxkmfGB/WgdaXDN2mNwX1o0O1ZoJ2X0jrj/3Xgsbm20hoLPAQcBf4DuDBfuNLg46Vf3wC2DLDtn6Z1WP5TWuOUO3tpN/B7tE50zQDvHoI+/GNp4yO0/jFe0lb/Q6UPTwBXD8P7DXgTreGdR4CHy+OaUdoXC/RhZPYF8OvA10pbHwX+vJRfSusDfAb4J+D8Uv6SMj9Tll+6WN9W4uGdwJJUqXEcApIkLYEBIEmVMgAkqVIGgCRVygCQpEoZAJJUKQNAkiplAEhSpf4fa1NpNaCVd4QAAAAASUVORK5CYII=\n",
480 | "text/plain": [
481 | ""
482 | ]
483 | },
484 | "metadata": {
485 | "needs_background": "light"
486 | },
487 | "output_type": "display_data"
488 | }
489 | ],
490 | "source": [
491 | "plt.hist(sizes, bins=int((sizes.max() - sizes.min())/10))"
492 | ]
493 | },
494 | {
495 | "cell_type": "markdown",
496 | "metadata": {},
497 | "source": [
498 |     "We see that most stocks have very little news coverage. \n",
499 | "One possible solution would be to bin on a 10 day basis.\n",
500 | "Another possible solution would be to simply fill the blanks with 0s. This solution feels like the easiest."
501 | ]
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": null,
506 | "metadata": {},
507 | "outputs": [],
508 | "source": []
509 | }
510 | ],
511 | "metadata": {
512 | "kernelspec": {
513 | "display_name": "Python 3",
514 | "language": "python",
515 | "name": "python3"
516 | },
517 | "language_info": {
518 | "codemirror_mode": {
519 | "name": "ipython",
520 | "version": 3
521 | },
522 | "file_extension": ".py",
523 | "mimetype": "text/x-python",
524 | "name": "python",
525 | "nbconvert_exporter": "python",
526 | "pygments_lexer": "ipython3",
527 | "version": "3.6.5"
528 | }
529 | },
530 | "nbformat": 4,
531 | "nbformat_minor": 2
532 | }
533 |
--------------------------------------------------------------------------------
/notebooks/exploration-filter-non-continuous-stocks.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Goal \n",
8 | "Filter non-continuous stocks. That is, filter out stocks that do not span the entire time series."
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import sys\n",
18 | "sys.path.append(r'../models/')\n",
19 | "\n",
20 | "import pandas as pd\n",
21 | "from data_cleaning import MARKET_DATA_PATH, NEWS_DATA_PATH, clean_market_data"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "market_train_df = pd.read_csv(MARKET_DATA_PATH)\n",
31 | "# news_train_df = pd.read_csv(NEWS_DATA_PATH)"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 3,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "clean_market_df = clean_market_data(market_train_df)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 5,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "data": {
133 | "text/plain": [
134 | " assetCode time volume open returnsOpenPrevMktres1 \\\n",
135 | "14290 A.N 2007-02-15 4095135.0 32.990 -0.001572 \n",
136 | "14291 AAI.N 2007-02-15 1378650.0 11.650 -0.001498 \n",
137 | "14292 AAP.N 2007-02-15 3884400.0 37.000 -0.019388 \n",
138 | "14293 AAPL.O 2007-02-15 12997017.0 85.310 0.000738 \n",
139 | "14294 ABB.N 2007-02-15 10168100.0 18.245 -0.026040 \n",
140 | "\n",
141 | " returnsOpenPrevMktres10 returnsOpenNextMktres10 \n",
142 | "14290 0.007461 -0.029993 \n",
143 | "14291 -0.010884 -0.013111 \n",
144 | "14292 -0.026448 -0.028244 \n",
145 | "14293 -0.044809 -0.014505 \n",
146 | "14294 -0.010927 0.017172 "
147 | ]
148 | },
149 | "execution_count": 5,
150 | "metadata": {},
151 | "output_type": "execute_result"
152 | }
153 | ],
154 | "source": [
155 | "clean_market_df.head(5)"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 16,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "data": {
165 | "text/plain": [
166 | "2488"
167 | ]
168 | },
169 | "execution_count": 16,
170 | "metadata": {},
171 | "output_type": "execute_result"
172 | }
173 | ],
174 | "source": [
175 | "clean_market_df.time.nunique()"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 10,
181 | "metadata": {
182 | "scrolled": true
183 | },
184 | "outputs": [
185 | {
186 | "data": {
187 | "text/plain": [
188 | "assetCode\n",
189 | "A.N 2488\n",
190 | "AAI.N 879\n",
191 | "AAL.O 772\n",
192 | "AAMRQ.OB 70\n",
193 | "AAN.N 1486\n",
194 | "dtype: int64"
195 | ]
196 | },
197 | "execution_count": 10,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | }
201 | ],
202 | "source": [
203 | "clean_market_df.groupby('assetCode').size().head()"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 19,
209 | "metadata": {},
210 | "outputs": [
211 | {
212 | "data": {
213 | "text/plain": [
214 | "522"
215 | ]
216 | },
217 | "execution_count": 19,
218 | "metadata": {},
219 | "output_type": "execute_result"
220 | }
221 | ],
222 | "source": [
223 | "sel = clean_market_df.groupby('assetCode').size() == clean_market_df.time.nunique()\n",
224 | "sum(sel)"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "522 stocks span the entire time series. We want to filter only those out.\n",
232 | "\n",
233 | "---"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 21,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "series_len = clean_market_df.time.nunique()\n",
243 | "clean_market_df = clean_market_df.groupby('assetCode').filter(lambda x: len(x) == series_len)"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 25,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "# Confirm that we are left only with the ones covering the whole series\n",
253 | "assert (clean_market_df.groupby('assetCode').size() == series_len).all()"
254 | ]
255 | }
256 | ],
257 | "metadata": {
258 | "kernelspec": {
259 | "display_name": "Python 3",
260 | "language": "python",
261 | "name": "python3"
262 | },
263 | "language_info": {
264 | "codemirror_mode": {
265 | "name": "ipython",
266 | "version": 3
267 | },
268 | "file_extension": ".py",
269 | "mimetype": "text/x-python",
270 | "name": "python",
271 | "nbconvert_exporter": "python",
272 | "pygments_lexer": "ipython3",
273 | "version": "3.6.5"
274 | }
275 | },
276 | "nbformat": 4,
277 | "nbformat_minor": 2
278 | }
279 |
--------------------------------------------------------------------------------
/notebooks/se_kernel_v0.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pandas as pd\n",
11 | "import matplotlib.pyplot as plt\n",
12 | "from itertools import chain\n",
13 | "\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "REDUCED = True # Reduce the data size for development and testing"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from kaggle.competitions import twosigmanews\n",
33 | "env = twosigmanews.make_env()"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "(market_train_df, news_train_df) = env.get_training_data()"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "if REDUCED:\n",
52 | " market_train_df = market_train_df.tail(10000)\n",
53 | " news_train_df = news_train_df.tail(50000)"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "def clean_market_data(market_df, train=True):\n",
63 | " '''Clean and preprocess the market data for training or testing.\n",
64 | " \n",
65 | " Parameters\n",
66 | " ----------\n",
67 | " market_df : dataframe\n",
68 | " See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.\n",
69 | " train : bool\n",
70 | " When true, adds the target variable to the dataframe.\n",
71 | " \n",
72 | " Returns\n",
73 | " -------\n",
74 | " dataframe \n",
75 | " Cleaned market data.\n",
76 | " \n",
77 | " '''\n",
78 | " # Select columns and drop NA\n",
79 | " if train:\n",
80 | " cols = ['assetCode', 'time', 'volume', 'open', 'returnsOpenPrevMktres1',\n",
81 | " 'returnsOpenPrevMktres10', 'returnsOpenNextMktres10']\n",
82 | " else:\n",
83 | " cols = ['assetCode', 'time', 'volume', 'open', 'returnsOpenPrevMktres1',\n",
84 | " 'returnsOpenPrevMktres10']\n",
85 | " market_df = market_df.loc[:,cols]\n",
86 | " market_df.dropna(inplace=True)\n",
87 | " \n",
88 | " # Normalize time\n",
89 | " market_df.loc[:, 'time'] = market_df.time.dt.normalize()\n",
90 | " \n",
91 | " return market_df"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "def clean_news_data(news_df, extra_features= False):\n",
101 | " '''Clean and preprocess the news data for training or testing.\n",
102 | " \n",
103 | " Parameters\n",
104 | " ----------\n",
105 | " news_df : dataframe\n",
106 | " See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.\n",
107 | " extra_features : bool\n",
108 | " When true, adds extra columns that SE added ('urgency', 'provider', 'bodySize', 'relevance').\n",
109 | " \n",
110 | " Returns\n",
111 | " -------\n",
112 | " dataframe \n",
113 | " Cleaned news data.\n",
114 | " \n",
115 | " '''\n",
116 | " # Select columns and drop NA\n",
117 | " if extra_features:\n",
118 | " cols = ['time','assetCodes', 'sentimentNegative', 'sentimentNeutral', 'sentimentPositive',\n",
119 | " 'urgency', 'provider', 'bodySize', 'relevance']\n",
120 | " else:\n",
121 | " cols = ['time','assetCodes', 'sentimentNegative', 'sentimentNeutral', 'sentimentPositive']\n",
122 | " news_df = news_df.loc[:,cols]\n",
123 | " news_df.dropna(inplace=True)\n",
124 | " \n",
125 | " # Normalize time\n",
126 | " news_df.loc[:, 'time'] = news_df.time.dt.normalize()\n",
127 | " \n",
128 | " # assetCodes from String to List\n",
129 | " news_df['assetCodes'] = news_df['assetCodes'].str.findall(f\"'([\\w\\./]+)'\")\n",
130 | " \n",
131 | " # Explode news on assetCodes\n",
132 | " assetCodes_expanded = list(chain(*news_df['assetCodes']))\n",
133 | " assetCodes_index = news_df.index.repeat(news_df['assetCodes'].apply(len))\n",
134 | "\n",
135 | " assert len(assetCodes_expanded) == len(assetCodes_index)\n",
136 | " \n",
137 | " assetCodes_df = pd.DataFrame({'index': assetCodes_index, 'assetCode': assetCodes_expanded})\n",
138 | " news_df_exploded = news_df.merge(assetCodes_df, 'right', right_on='index', left_index=True, validate='1:m')\n",
139 | " news_df_exploded.drop(['assetCodes', 'index'], 1, inplace=True)\n",
140 | "\n",
141 | " if extra_features:\n",
142 | " # Compute means for same date and assetCode\n",
143 | " news_agg_dict = {\n",
144 | " 'sentimentNegative':'mean',\n",
145 | " 'sentimentNeutral':'mean',\n",
146 | " 'sentimentPositive':'mean',\n",
147 | " 'urgency':'mean',\n",
148 | " 'bodySize':'mean',\n",
149 | " 'relevance':'mean'\n",
150 | " }\n",
151 | " news_df_agg = news_df_exploded.groupby(['time', 'assetCode'], as_index=False).agg(news_agg_dict)\n",
152 | " \n",
153 | " # Add provider information\n",
154 | " idx = news_df_exploded.groupby(['time', 'assetCode'])['urgency'].transform(max) == news_df_exploded['urgency']\n",
155 | " news_df_exploded_2 = news_df_exploded[idx][['time', 'assetCode', 'provider']].drop_duplicates(['time', 'assetCode'])\n",
156 | " news_df_agg = news_df_agg.merge(news_df_exploded_2, 'left', ['time', 'assetCode'])\n",
157 | " \n",
158 | " # One-hot encoding provider\n",
159 | " ohe_provider = pd.get_dummies(news_df_agg['provider'])\n",
160 | " news_df_agg = pd.concat([news_df_agg, ohe_provider], axis=1).drop(['provider'], axis=1)\n",
161 | " \n",
162 | " else:\n",
163 | " # Compute means for same date and assetCode\n",
164 | " news_agg_dict = {\n",
165 | " 'sentimentNegative':'mean',\n",
166 | " 'sentimentNeutral':'mean',\n",
167 | " 'sentimentPositive':'mean'\n",
168 | " }\n",
169 | " news_df_agg = news_df_exploded.groupby(['time', 'assetCode'], as_index=False).agg(news_agg_dict)\n",
170 | " \n",
171 | " return news_df_agg"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "def clean_data(market_df, news_df, train=True, extra_features=False):\n",
181 | " '''Clean and preprocess the news and market data for training then merge them, to create a train set or test set.\n",
182 | " \n",
183 | " Parameters\n",
184 | " ----------\n",
185 | " market_df : dataframe\n",
186 | " See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.\n",
187 | " news_df : dataframe\n",
188 | " See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.\n",
189 | " train : bool\n",
190 | " When true, creates both the input features and the target dataframes.\n",
191 | " extra_features : bool\n",
192 | " When true, adds extra columns that SE added ('urgency', 'provider', 'bodySize', 'relevance').\n",
193 | " \n",
194 | " Returns\n",
195 | " -------\n",
196 | " dataframe \n",
197 | " Cleaned data ready to be fed to the model. Returns both the input and the target dataframes when train=True.\n",
198 | " \n",
199 | " '''\n",
200 | " cleaned_market_df = clean_market_data(market_df, train)\n",
201 | " cleaned_news_df = clean_news_data(news_df, extra_features)\n",
202 | " \n",
203 | " # Merge on market data\n",
204 | " df_merged = cleaned_market_df.merge(cleaned_news_df, 'inner', ['time', 'assetCode'])\n",
205 | " \n",
206 | " if train:\n",
207 | " y = df_merged['returnsOpenNextMktres10']\n",
208 | " X = df_merged.drop(['returnsOpenNextMktres10'], axis=1)\n",
209 | " return X, y\n",
210 | " else:\n",
211 | " return df_merged"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "#Final dataframes for training\n",
221 | "X_train, y_train = clean_data(market_train_df, news_train_df, extra_features=True)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {},
228 | "outputs": [],
229 | "source": []
230 | }
231 | ],
232 | "metadata": {
233 | "kernelspec": {
234 | "display_name": "Python 3",
235 | "language": "python",
236 | "name": "python3"
237 | },
238 | "language_info": {
239 | "codemirror_mode": {
240 | "name": "ipython",
241 | "version": 3
242 | },
243 | "file_extension": ".py",
244 | "mimetype": "text/x-python",
245 | "name": "python",
246 | "nbconvert_exporter": "python",
247 | "pygments_lexer": "ipython3",
248 | "version": "3.6.5"
249 | }
250 | },
251 | "nbformat": 4,
252 | "nbformat_minor": 2
253 | }
254 |
--------------------------------------------------------------------------------
/notebooks/se_kernel_v1.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import pandas as pd
4 | import glob
5 |
6 | sys.path.append('../')
7 | from models.data_cleaning import clean_market_data, clean_news_data
8 |
9 | # Import libraries used for lstm
10 | from keras.models import Sequential
11 | from keras.layers import Dense
12 | from keras.layers import LSTM
13 |
14 | # Define some global variables
15 | MARKET_DATA_PATH = '../data/raw/market_train_df.csv'
16 | NEWS_DATA_PATH = '../data/raw/news_train_df.csv'
17 | MERGED_PATH = '../data/processed/df_merged.csv'
18 |
19 | MARKET_CLEAN_PATH = '../data/processed/market_cleaned_df.csv'
20 | NEWS_CLEAN_CHUNK_PATH = '../data/processed/news_cleaned_df_'
21 | NEWS_CLEAN_PATH = '../data/processed/news_cleaned_df.csv'
22 |
23 | MARKET_CONTINUOUS_PATH = '../data/processed/market_continuous_df.csv'
24 | NEWS_CONTINUOUS_PATH = '../data/processed/news_continuous_df.csv'
25 |
26 |
27 | def get_continuous_df(market_data_path, news_data_path, merged_path,
28 | market_clean_path=MARKET_CLEAN_PATH,
29 | news_clean_chunk_path=NEWS_CLEAN_CHUNK_PATH,
30 | news_clean_path=NEWS_CLEAN_PATH,
31 |                       market_continuous_path=MARKET_CONTINUOUS_PATH,
32 | news_continuous_path=NEWS_CONTINUOUS_PATH):
33 | """
34 | Cleans and filters the datasets to only select assets with
35 | continuous information
36 | """
37 | market_train_df = pd.read_csv(market_data_path)
38 | cleaned_market_df = clean_market_data(market_train_df)
39 | print('market data was cleaned')
40 | cleaned_market_df.to_csv(market_clean_path)
41 | print('cleaned market data was saved')
42 | # save memory usage
43 | del market_train_df
44 |
45 | series_len = cleaned_market_df.time.nunique()
46 | cleaned_market_df = cleaned_market_df.groupby('assetCode').filter(lambda x: len(x) == series_len)
47 | cleaned_market_df = cleaned_market_df.reset_index(drop=True)
48 | print('market data was filtered')
49 | cleaned_market_df.to_csv(market_continuous_path)
50 | print('filtered market data was saved')
51 |
52 | c = 0
53 | for news_chunk in pd.read_csv(news_data_path, chunksize=100000):
54 | print('news chunk_number ' + str(c))
55 | news_cleaned = clean_news_data(news_chunk)
56 | news_cleaned.to_csv(news_clean_chunk_path + str(c) + '.csv')
57 | print('news chunk number ' + str(c) + ' saved')
58 | c += 1
59 |
60 | news_files = glob.glob(news_clean_chunk_path + "*")
61 | cleaned_news_df = pd.concat((pd.read_csv(f, header=0) for f in news_files))
62 | print('cleaned news data concatenated')
63 | cleaned_news_df.to_csv(news_clean_path)
64 | print('cleaned news data was saved')
65 |
66 | assetcodes = cleaned_market_df['assetCode'].tolist()
67 | news_continuous_df = cleaned_news_df[cleaned_news_df['assetCode'].isin(assetcodes)]
68 | news_continuous_df.loc[:, 'time'] = pd.to_datetime(news_continuous_df.time).dt.normalize()
69 | news_continuous_df.to_csv(news_continuous_path)
70 | print('filtered news data was saved')
71 | df_merged = cleaned_market_df.merge(news_continuous_df.drop_duplicates(subset=['time', 'assetCode']), 'left', ['time', 'assetCode'])
72 |
73 | print('filling missing values and saving the merged dataset')
74 | df_merged = df_merged.fillna(-1)
75 | df_merged.to_csv(merged_path)
76 |
77 | # return the final merged dataset
78 | return df_merged
79 |
80 |
81 | if __name__ == '__main__':
82 |
83 | if os.path.exists(MERGED_PATH):
84 | df_merged = pd.read_csv(MERGED_PATH)
85 | else:
86 | df_merged = get_continuous_df(MARKET_DATA_PATH,
87 | NEWS_DATA_PATH,
88 | MERGED_PATH)
89 |
90 | df_merged = df_merged.sort_values(['time', 'assetCode'], ascending=[True, True])
91 |
92 | # taking 80%, 10%, 10% for train, val, test sets
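# 2488 trading days x 522 continuous assets in df_merged:
# 1990 days for training, 249 for validation and the remaining 249 for testing.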
93 | df_train = df_merged[:522*1990]
94 | df_val = df_merged[522*1990:522*(1990+249)]
95 | df_test = df_merged[522*(1990+249):]
96 |
97 | # create the different data sets
98 | y_train = df_train['returnsOpenNextMktres10']
99 | X_train = df_train.drop(['returnsOpenNextMktres10'], axis=1)
100 |
101 | y_val = df_val['returnsOpenNextMktres10']
102 | X_val = df_val.drop(['returnsOpenNextMktres10'], axis=1)
103 |
104 | y_test = df_test['returnsOpenNextMktres10']
105 | X_test = df_test.drop(['returnsOpenNextMktres10'], axis=1)
106 |
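# Reshape each split into (n_days, 1 timestep, 522 * n_features):
# every sample is one trading day, with the features of all 522 continuous
# assets concatenated into a single input vector for the LSTM.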
107 | X_train_ar = X_train.drop(['Unnamed: 0', 'assetCode', "time"], axis=1).values
108 | X_train_ar = X_train_ar.reshape(int(X_train_ar.shape[0]/522), 1, 522*X_train_ar.shape[1])
109 |
110 | X_val_ar = X_val.drop(['Unnamed: 0', 'assetCode', "time"], axis=1).values
111 | X_val_ar = X_val_ar.reshape(int(X_val_ar.shape[0]/522), 1, 522*X_val_ar.shape[1])
112 |
113 | X_test_ar = X_test.drop(['Unnamed: 0', 'assetCode', "time"], axis=1).values
114 | X_test_ar = X_test_ar.reshape(int(X_test_ar.shape[0]/522), 1, 522*X_test_ar.shape[1])
115 |
116 | y_train_ar = y_train.values.reshape((1990, 522))
117 | y_val_ar = y_val.values.reshape((int(len(y_val)/522), 522))
118 | y_test_ar = y_test.values.reshape((int(len(y_test)/522), 522))
119 |
120 | # 4. Build Keras model
121 | model = Sequential()
122 | model.add(LSTM(50, input_shape=(X_train_ar.shape[1], X_train_ar.shape[2]))) # adds LSTM layer (1 timestep, 522 assets x features)
123 | model.add(Dense(522)) # adds a dense layer
124 | model.compile(loss='mae', optimizer='adam') # TODO: change the loss
125 |
126 | # 5. Fit RNN
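# shuffle=False preserves the chronological order of the daily samples between
# epochs, which is the conventional choice for time series data.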
127 | model.fit(X_train_ar, y_train_ar, epochs=3, batch_size=1,
128 | validation_data=(X_val_ar, y_val_ar), verbose=1, shuffle=False)
129 |
130 | model.save('vanilla_lstm_20181117.hdf5')
131 | print('model saved.')
132 |
--------------------------------------------------------------------------------
/report/Diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/Diagram.png
--------------------------------------------------------------------------------
/report/LSTMAgrid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/LSTMAgrid.png
--------------------------------------------------------------------------------
/report/LSTMgrid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/LSTMgrid.png
--------------------------------------------------------------------------------
/report/Shuffling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/Shuffling.png
--------------------------------------------------------------------------------
/report/Stocks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/Stocks.png
--------------------------------------------------------------------------------
/report/lstm_att_v0_ts_1_drop_04_cells_64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/lstm_att_v0_ts_1_drop_04_cells_64.png
--------------------------------------------------------------------------------
/report/lstm_att_v0_ts_5_drop_0_cells_64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/lstm_att_v0_ts_5_drop_0_cells_64.png
--------------------------------------------------------------------------------
/report/lstm_plot1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/lstm_plot1.png
--------------------------------------------------------------------------------
/report/lstm_plot2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/lstm_plot2.png
--------------------------------------------------------------------------------
/report/main.bbl:
--------------------------------------------------------------------------------
1 | \begin{thebibliography}{10}
2 |
3 | \bibitem{bahdanau2014neural}
4 | Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio.
5 | \newblock Neural machine translation by jointly learning to align and
6 | translate.
7 | \newblock {\em arXiv preprint arXiv:1409.0473}, 2014.
8 |
9 | \bibitem{bao2017deep}
10 | Wei Bao, Jun Yue, and Yulei Rao.
11 | \newblock A deep learning framework for financial time series using stacked
12 | autoencoders and long-short term memory.
13 | \newblock {\em PloS one}, 12(7):e0180944, 2017.
14 |
15 | \bibitem{bergmeir2012use}
16 | Christoph Bergmeir and Jos{\'e}~M Ben{\'\i}tez.
17 | \newblock On the use of cross-validation for time series predictor evaluation.
18 | \newblock {\em Information Sciences}, 191:192--213, 2012.
19 |
20 | \bibitem{bollerslev1986generalized}
21 | Tim Bollerslev.
22 | \newblock Generalized autoregressive conditional heteroskedasticity.
23 | \newblock {\em Journal of econometrics}, 31(3):307--327, 1986.
24 |
25 | \bibitem{burg1968new}
26 | John~Parker Burg.
27 | \newblock A new analysis technique for time series data.
28 | \newblock {\em Paper presented at NATO Advanced Study Institute on Signal
29 | Processing, Enschede, Netherlands, 1968}, 1968.
30 |
31 | \bibitem{chandra2012cooperative}
32 | Rohitash Chandra and Mengjie Zhang.
33 | \newblock Cooperative coevolution of elman recurrent neural networks for
34 | chaotic time series prediction.
35 | \newblock {\em Neurocomputing}, 86:116--123, 2012.
36 |
37 | \bibitem{chen1989representations}
38 | Sheng Chen and Steve~A Billings.
39 | \newblock Representations of non-linear systems: the narmax model.
40 | \newblock {\em International Journal of Control}, 49(3):1013--1032, 1989.
41 |
42 | \bibitem{engle1982autoregressive}
43 | Robert~F Engle.
44 | \newblock Autoregressive conditional heteroscedasticity with estimates of the
45 | variance of united kingdom inflation.
46 | \newblock {\em Econometrica: Journal of the Econometric Society}, pages
47 | 987--1007, 1982.
48 |
49 | \bibitem{engle1993measuring}
50 | Robert~F Engle and Victor~K Ng.
51 | \newblock Measuring and testing the impact of news on volatility.
52 | \newblock {\em The journal of finance}, 48(5):1749--1778, 1993.
53 |
54 | \bibitem{firat2016multi}
55 | Orhan Firat, Kyunghyun Cho, and Yoshua Bengio.
56 | \newblock Multi-way, multilingual neural machine translation with a shared
57 | attention mechanism.
58 | \newblock {\em arXiv preprint arXiv:1601.01073}, 2016.
59 |
60 | \bibitem{graves2014neural}
61 | Alex Graves, Greg Wayne, and Ivo Danihelka.
62 | \newblock Neural turing machines.
63 | \newblock {\em arXiv preprint arXiv:1410.5401}, 2014.
64 |
65 | \bibitem{hamilton1994time}
66 | James~Douglas Hamilton.
67 | \newblock {\em Time series analysis}, volume~2.
68 | \newblock Princeton university press Princeton, NJ, 1994.
69 |
70 | \bibitem{hamzaoui2016glosten}
71 | Nessrine Hamzaoui and Boutheina Regaieg.
72 | \newblock The glosten-jagannathan-runkle-generalized autoregressive conditional
73 | heteroscedastic approach to investigating the foreign exchange forward
74 | premium volatility.
75 | \newblock {\em International Journal of Economics and Financial Issues},
76 | 6(4):1608--1615, 2016.
77 |
78 | \bibitem{hentschel1995all}
79 | Ludger Hentschel et~al.
80 | \newblock All in the family: Nesting symmetric and asymmetric garch models.
81 | \newblock {\em Journal of Financial Economics}, 39(1):71--104, 1995.
82 |
83 | \bibitem{hochreiter1997long}
84 | Sepp Hochreiter and J{\"u}rgen Schmidhuber.
85 | \newblock Long short-term memory.
86 | \newblock {\em Neural computation}, 9(8):1735--1780, 1997.
87 |
88 | \bibitem{hollis2018deep}
89 | Thomas Hollis.
90 | \newblock Deep learning algorithms applied to blockchain-based financial time
91 | series.
92 | \newblock 2018.
93 |
94 | \bibitem{kaggle2017twosigma}
95 | Kaggle.
96 | \newblock Two sigma: Using news to predict stock movements.
97 | \newblock \url{https://www.kaggle.com/c/two-sigma-financial-news}.
98 | \newblock Accessed: 2018-09-30.
99 |
100 | \bibitem{kim2003financial}
101 | Kyoung-jae Kim.
102 | \newblock Financial time series forecasting using support vector machines.
103 | \newblock {\em Neurocomputing}, 55(1-2):307--319, 2003.
104 |
105 | \bibitem{kingma2014adam}
106 | Diederik~P Kingma and Jimmy Ba.
107 | \newblock Adam: A method for stochastic optimization.
108 | \newblock {\em arXiv preprint arXiv:1412.6980}, 2014.
109 |
110 | \bibitem{kohonen1982self}
111 | Teuvo Kohonen.
112 | \newblock Self-organized formation of topologically correct feature maps.
113 | \newblock {\em Biological cybernetics}, 43(1):59--69, 1982.
114 |
115 | \bibitem{koskela1998time}
116 | Timo Koskela, Markus Varsta, Jukka Heikkonen, and Kimmo Kaski.
117 | \newblock Time series prediction using recurrent som with local linear models.
118 | \newblock {\em Int. J. of Knowledge-Based Intelligent Engineering Systems},
119 | 2(1):60--68, 1998.
120 |
121 | \bibitem{krizhevsky2012imagenet}
122 | Alex Krizhevsky, Ilya Sutskever, and Geoffrey~E Hinton.
123 | \newblock Imagenet classification with deep convolutional neural networks.
124 | \newblock In {\em Advances in neural information processing systems}, pages
125 | 1097--1105, 2012.
126 |
127 | \bibitem{kuremoto2014time}
128 | Takashi Kuremoto, Shinsuke Kimura, Kunikazu Kobayashi, and Masanao Obayashi.
129 | \newblock Time series forecasting using a deep belief network with restricted
130 | boltzmann machines.
131 | \newblock {\em Neurocomputing}, 137:47--56, 2014.
132 |
133 | \bibitem{lin2009short}
134 | Xiaowei Lin, Zehong Yang, and Yixu Song.
135 | \newblock Short-term stock price prediction based on echo state networks.
136 | \newblock {\em Expert systems with applications}, 36(3):7313--7317, 2009.
137 |
138 | \bibitem{malkiel1970efficient}
139 | Burton~G Malkiel and Eugene~F Fama.
140 | \newblock Efficient capital markets: A review of theory and empirical work.
141 | \newblock {\em The journal of Finance}, 25(2):383--417, 1970.
142 |
143 | \bibitem{murphy1999technical}
144 | John~J Murphy.
145 | \newblock {\em Technical analysis of the financial markets: A comprehensive
146 | guide to trading methods and applications}.
147 | \newblock Penguin, 1999.
148 |
149 | \bibitem{pierre1998estimating}
150 | Eileen F~St Pierre.
151 | \newblock Estimating egarch-m models: Science or art?
152 | \newblock {\em The Quarterly Review of Economics and Finance}, 38(2):167--180,
153 | 1998.
154 |
155 | \bibitem{vaswani2017attention}
156 | Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
157 | Aidan~N Gomez, {\L}ukasz Kaiser, and Illia Polosukhin.
158 | \newblock Attention is all you need.
159 | \newblock In {\em Advances in Neural Information Processing Systems}, pages
160 | 5998--6008, 2017.
161 |
162 | \bibitem{walker1931periodicity}
163 | Gilbert~Thomas Walker.
164 | \newblock On periodicity in series of related terms.
165 | \newblock {\em Proceedings of the Royal Society of London. Series A, Containing
166 | Papers of a Mathematical and Physical Character}, 131(818):518--532, 1931.
167 |
168 | \bibitem{xu2015show}
169 | Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan
170 | Salakhudinov, Rich Zemel, and Yoshua Bengio.
171 | \newblock Show, attend and tell: Neural image caption generation with visual
172 | attention.
173 | \newblock In {\em International conference on machine learning}, pages
174 | 2048--2057, 2015.
175 |
176 | \bibitem{zhang2003time}
177 | G~Peter Zhang.
178 | \newblock Time series forecasting using a hybrid arima and neural network
179 | model.
180 | \newblock {\em Neurocomputing}, 50:159--175, 2003.
181 |
182 | \end{thebibliography}
183 |
--------------------------------------------------------------------------------
/report/main.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PsiPhiTheta/LSTM-Attention/996b541f48b9aa627cd96d5c0e239ffb9f66b7a0/report/main.pdf
--------------------------------------------------------------------------------
/report/main.tex:
--------------------------------------------------------------------------------
1 | \documentclass{article}
2 |
3 | % if you need to pass options to natbib, use, e.g.:
4 | % \PassOptionsToPackage{numbers, compress}{natbib}
5 | % before loading nips_2016
6 | %
7 | % to avoid loading the natbib package, add option nonatbib:
8 | %\usepackage[no]{nips_2016}
9 |
10 | %\usepackage{nips_2016}
11 |
12 | % to compile a camera-ready version, add the [final] option, e.g.:
13 | \usepackage[final]{nips_2016}
14 | \usepackage{natbib}
15 | \setcitestyle{numbers}
16 | \usepackage[utf8]{inputenc} % allow utf-8 input
17 | \usepackage[T1]{fontenc} % use 8-bit T1 fonts
18 | \usepackage{hyperref} % hyperlinks
19 | \usepackage{url} % simple URL typesetting
20 | \usepackage{booktabs} % professional-quality tables
21 | \usepackage{amsfonts} % blackboard math symbols
22 | \usepackage{nicefrac} % compact symbols for 1/2, etc.
23 | \usepackage{microtype} % microtypography
24 | \usepackage{indentfirst}
25 | \usepackage{amsmath}
26 | \usepackage{fancyhdr}
27 | \usepackage{graphicx}
28 | \usepackage{printlen}
29 | \usepackage{color}
30 | %\usepackage{lmodern}
31 |
32 | \fancypagestyle{equalc}{\fancyhf{}\renewcommand{\headrulewidth}{0pt}\fancyfoot[R]{* indicates equal contribution}}
33 |
34 | \title{A Comparison of LSTMs and Attention Mechanisms for Forecasting Financial Time Series}
35 |
36 | \author{
37 | S.E. Yi* \\
38 | Department of Computer Science\\
39 | University of Toronto\\
40 | Toronto, ON M5S 3H7 \\
41 | \texttt{seungeunyi@cs.toronto.edu} \\
42 | \And
43 | A. Viscardi* \\
44 | Department of Computer Science\\
45 | University of Toronto\\
46 | Toronto, ON M5S 3H7 \\
47 | \texttt{avis@cs.toronto.edu} \\
48 | \And
49 | T. Hollis* \\
50 | Department of Computer Science\\
51 | University of Toronto\\
52 | Toronto, ON M5S 3H7 \\
53 | \texttt{thollis@cs.toronto.edu} \\
54 | }
55 |
56 | \begin{document}
57 |
58 | \maketitle
59 |
60 | \begin{abstract}
61 |
62 | While LSTMs show increasingly promising results for forecasting Financial Time Series (FTS), this paper seeks to assess if attention mechanisms can further improve performance. The hypothesis is that attention can help mitigate the long-term dependency problems experienced by LSTM models. To test this hypothesis, the main contribution of this paper is the implementation of an LSTM with attention. The benchmark LSTM and the LSTM with attention were compared and both achieved reasonable up-down accuracies of up to 60\% on five stocks from Kaggle's Two Sigma dataset. This comparative analysis demonstrates that an LSTM with attention can indeed outperform standalone LSTMs but further investigation is required, as issues do arise with such model architectures.
63 |
64 | \end{abstract}
65 |
66 | \thispagestyle{equalc}
67 |
68 | \section{Introduction}
69 |
70 | Financial Time Series (FTS) modelling is a practice with a long history which first revolutionised algorithmic trading in the early 1970s. The analysis of FTS was divided into two categories: fundamental analysis and technical analysis. Fundamental analysis is the study of a stock or currency’s price based on economic factors. These include the overall state of the business or economy, revenue, interest rates and others. On the other hand, technical analysis, as defined by J. Murphy in \cite{murphy1999technical}, is the study of market action to forecast future trends. This is achieved through the analysis of shorter term financial data, primarily price and volume. Both fundamental and technical analysis are put into question by the Efficient Market Hypothesis (EMH). The EMH, highly disputed since its initial publication in 1970, hypothesizes that stock prices are ultimately unpredictable \cite{malkiel1970efficient}. This has not stopped research attempting to model FTS through the use of linear, non-linear and ML-based models, as mentioned hereafter. Together these approaches form the main subcategories of existing solutions in FTS analysis.
71 |
72 | In parallel, attention mechanisms first came to prominence in the field of Computer Vision (CV). These new models were originally loosely inspired by human visual attention mechanisms present in a region of the prefrontal cortex known as the inferior frontal junction. Applications were later leveraged to tackle issues with context-dependent inference in Natural Language Processing (NLP). In both cases, the core principle of attention is achieved by setting attention weights to assign more or less of the algorithm’s finite attention to different subsets of the input feature space. In CV, this corresponds to focussing on particular features of the input image while in NLP this represents focus on particular words in the input sentence. In NLP, this attention mechanism allows inference to be made about an entire sentence while remaining sensitive to its context. This remains to this day a particularly challenging task for long sentences.
73 |
74 | In this paper we propose a novel approach to FTS forecasting by combining these two fields of research. The idea is to leverage the developments in attention mechanisms to improve the performance of promising LSTM RNN architectures currently in use for FTS forecasting. The main contribution in this paper is the implementation of an LSTM that uses attention for parsing both news headlines and financial data. The performance of this model is then compared with that of a regular LSTM without attention. This performance is then evaluated using five stocks from Kaggle's Two Sigma dataset \cite{kaggle2017twosigma} and using various methods of data preprocessing \cite{bergmeir2012use}.
75 |
76 | The ultimate goals of FTS forecasting, among others, are to help solve the problem of volatility in speculative markets and to help foresee large financial events such as the 2009 financial crisis, to ensure better economic preparation.
77 |
78 | \section{Related Work}
79 |
80 | As discussed in \cite{hollis2018deep}, the most rudimentary approach to modelling FTS is by assuming that they follow the random walk model. The random walk model can be simply expressed as a sum of a series of independent random variables \cite{hamilton1994time}. By weighting these, the first Auto-Regressive (AR) model was developed. A set of equations was developed by U. Yule and G. Walker in \cite{walker1931periodicity} to provide quantitative methods for estimating parameters in AR models. This work was subsequently expanded upon by J. P. Burg in \cite{burg1968new}, who provided an alternative approach albeit with different stability properties. These AR models are often accompanied by another type of linear model, the Moving Average (MA) model, which together give the Auto-Regressive Moving Average (ARMA) model. However, a fundamental limitation of AR, MA and ARMA models is that they all assume the process being modelled is stationary. Stationarity is a property of processes whereby the probability distribution remains constant over time, thus variance also remains constant. Indeed, this assumption is significant as FTS are often non-stationary processes. Therefore, this model’s accuracy will suffer, highlighting the need to take this problem of stationarity into consideration. This is done by generalising the ARMA model into the Autoregressive Integrated Moving Average (ARIMA) model \cite{hamilton1994time}. The ARIMA model solves the issue of non-stationarity by exploiting the concept of returns (or degrees of differencing). Non-stationary time series can therefore be made stationary by differencing. The aforementioned linear models all suffer from the assumption that FTS are homoscedastic processes. This is indeed often a poor assumption to make, as shown in \cite{engle1982autoregressive} by R.F. Engle. In \cite{engle1982autoregressive}, Engle states that by using a more sophisticated model such as the Auto-Regressive Conditional Heteroscedasticity (ARCH) model, the homoscedastic assumption can be avoided. This ARCH model was later described by Bollerslev in \cite{bollerslev1986generalized} as a special case of a more generalised model called the Generalised Auto-Regressive Conditional Heteroscedasticity (GARCH) model. Many more variants of the GARCH model have been published since its original publication in 1986. These include NAGARCH (nonlinear asymmetric GARCH) \cite{engle1993measuring}, EGARCH (exponential GARCH) \cite{pierre1998estimating}, GJR-GARCH (Glosten-Jagannathan-Runkle GARCH) \cite{hamzaoui2016glosten} and many others. These GARCH derivatives are often nested under Hentschel’s fGARCH (Family GARCH) model \cite{hentschel1995all} but these all lie outside the scope of this paper. Around the same time as the ARCH and GARCH models, S. Chen and S. A. Billings published an alternative in \cite{chen1989representations} known as the Nonlinear Autoregressive Moving Average model with exogenous inputs (NARMAX). This work, building on their own previous work on ARMAX models, demonstrated that NARMAX models can successfully be applied to model complex time series. More information on these models can be found in \cite{hollis2018deep}, including equations and further explanation.
81 |
82 | These state space and stochastic models were however quickly overwhelmed by advances in Machine Learning. A wave of ML approaches to modelling FTS severely disrupted the field of algorithmic trading via stochastic modelling in the last two decades. One of the earliest approaches to FTS forecasting using ML built on work from Kohonen in \cite{kohonen1982self}. In \cite{kohonen1982self}, Kohonen introduced the idea of Self-Organising Maps (SOM) which were subsequently successfully applied to FTS forecasting \cite{koskela1998time}. In 2003, still in the early days of ML for FTS predictions, SVMs (both linear and non-linear) were shown by Kim in \cite{kim2003financial} to be of significant predictive capabilities for FTS. In parallel, Zhang showed in his 2003 paper \cite{zhang2003time} that, by combining Artificial Neural Networks (ANNs) with the aforementioned ARIMA model, promising FTS forecasting can be achieved.
83 |
84 | Nevertheless, it was only by benefitting from the Neural Network boom of 2012 brought on by the AlexNet work of Krizhevsky, Sutskever and Hinton on the ImageNet competition \cite{krizhevsky2012imagenet}, that ANNs became some of the most mainstream methods for FTS forecasting, in particular with the rise of RNNs \cite{chandra2012cooperative}. However, another type of neural network that has also been widely lauded for its performance in FTS forecasting is the Echo State Network (ESN). Indeed, Lin et. al showed in \cite{lin2009short} that ESNs combined with Principal Component Analysis (PCA) can sometimes exceed or at least match the performance of conventional RNNs while decreasing the computational costs. This is due to the techniques of Reservoir Computing introduced in ESNs. In short, ESNs can bypass the issue of the vanishing gradient problem and long computational times, present in conventional RNNs, by creating a large `reservoir' of sparsely connected neurons. The connections are assigned at random and weights within the reservoir do not get conventionally trained, reducing computational time and allowing the network to echo past states (emulating the `memory' of RNNs).
85 |
86 | Another alternative to RNNs is the Deep Belief Network (DBN). Hinton and Salakhutdinov's DBNs are a type of probabilistic generative neural network composed of layers of Restricted Boltzmann Machines (RBMs). These have also successfully been leveraged for accurate time series prediction \cite{kuremoto2014time}.
87 |
88 | In the modern day however, LSTM RNNs remain amongst some of the most popular and promising models for predicting FTS. While LSTMs originally came to light in the seminal 1997 paper \cite{hochreiter1997long} by Hochreiter and Schmidhuber, they only recently rose to prominence for FTS forecasting. Amongst the most successful LSTM implementations, the pioneering paper by Bao et al. in \cite{bao2017deep} implements a variety of the most modern LSTM architectures coupled with autoencoders and applies them to stock data. The work presented here extends and builds on the insight of this paper by exploring the impact of leveraging attention models for sentiment analysis built on top of LSTMs.
89 |
90 | However, a word of caution is worth mentioning here. It is true that academic publications in the field of FTS forecasting are often misleading. Indeed, many of the most performant models are developed by private companies and kept away from the public, with the utmost secrecy for competitive reasons. Academia seems to be struggling to shed light on the most modern techniques, which is one of the prime motivations for the investigation presented hereafter. In addition, many FTS forecasting papers tend to inflate their performance for recognition and overfit their models due to the heavy use of simulators. Many of the performances claimed in these papers are difficult to replicate as they fail to generalise for future changes in the particular FTS being forecast.
91 |
92 | Having reviewed the field of FTS forecasting, in order to better situate our paper amongst existing literature, it is important to now cover a brief history of attention mechanisms.
93 |
94 | Many early prominent papers using attention mechanisms in NLP initially used the term “alignment” to refer to attention in the context of neural machine translation. One of the most foundational of these is the 2014 Bahdanau, Cho and Bengio collaboration in \cite{bahdanau2014neural}. In \cite{bahdanau2014neural}, Bahdanau shows that a single neural network can be jointly tuned to maximise translation performance using attention-based encoder-decoders. In parallel, Graves, Wayne and Danihelka showed in \cite{graves2014neural} that neural networks (in particular LSTMs) can be improved by coupling them to attention processes. While the application in \cite{graves2014neural} was helping Neural Turing Machines infer copying, sorting and recall, subsequent CV and NLP applications of the very same concept were greatly inspired by techniques presented in this paper \cite{xu2015show}. Building on the work above as well as on their own work, in 2015 Firat, Cho and Bengio in \cite{firat2016multi} had a breakthrough in multilingual Neural Machine Translation (NMT). They used a single attention mechanism shared across language pairs to outperform existing NMT benchmarks.
95 |
96 | Since then, a whole flurry of different types of attention have been developed including but not limited to self-attention, shared attention, local attention, hybrid attention and multi-headed attention. Indeed, the multi-headed attention model presented in the Transformer \cite{vaswani2017attention} by Google Brain and UofT alumni has been widely lauded as a promising novel architecture for dealing with long range dependencies without the issues of LSTMs in NMT. For this reason, that paper will serve as inspiration for our investigation on the potential of attention in FTS forecasting.
97 |
98 | From the history of attention mechanisms presented here we can indeed see that attention mechanisms are currently accepted as a very promising approach to many problems, especially in the field of machine translation. However, we have yet to see if they present any promise in sentiment analysis for FTS forecasting, which is the purpose of the work presented here. Intuitively, since attention mechanisms are designed to help address the issue of context dependency, this suggests promising potential for sentiment analysis in FTS forecasting. Indeed, most sentiment analysis algorithms would fail to consider the impact of such context over long sequences. A naive example of this, such as “I am a compulsive liar but this company is fantastic, the stock is destined to rise and I don’t understand why more people have not invested yet”, would be wrongly detected as a positive sentiment by most algorithms. The following investigation aims to confirm or refute the hypothesis that LSTM performance can be improved with attention.
99 |
100 | \section{Model Architecture}
101 |
102 | The two models investigated in this paper are a vanilla LSTM (as a benchmark) and an LSTM with an attention mechanism. A diagram of these models is shown in figure 1 below.
103 |
104 | \begin{figure}[!h]
105 | \includegraphics[width=375pt]{Diagram.png}
106 | \caption{System diagram of the LSTM and LSTM with attention}
107 | \end{figure}
108 |
109 | Both the LSTM model and the LSTM with attention model used in this paper are implemented with mean squared error loss using an Adam optimiser \cite{kingma2014adam}. It is nonetheless important to cover the mathematical foundations of these models before comparing their performances.
110 |
111 | In encoder-decoder RNNs, the encoder reads an input sequence $\textbf{x} = (x_1, \dots , x_{T_x})$ into a context vector $\textbf{c}$, as shown in figure 1. A common approach, and the one we will be following in this paper, is to use an RNN such that \cite{bahdanau2014neural}:
112 | \begin{align}
113 | h_t = f(x_t, h_{t-1})
114 | \end{align}
115 | and
116 | \begin{align}
117 | \textbf{c} = q(\{h_0, \dots , h_{T_x}\})
118 | \end{align}
119 | where $h_t$ is a hidden state at time $t$, $\textbf{c}$ is the context vector generated by the hidden states and $f$ and $q$ are non-linear functions.
120 |
121 | Subsequent to equations 1 and 2, prediction is done at the decoder by defining a probability over the output sequence $\textbf{y}$ through the following decomposition \cite{bahdanau2014neural}:
122 | \begin{align}
123 | p(\textbf{y}) = \prod_{t=1}^T p(y_t | \{y_1, \dots , y_{t-1}\}, c)
124 | \end{align}
125 | where $\textbf{y} = (y_1, \dots , y_{T_y})$. For RNNs, each conditional probability is modelled as:
126 | \begin{align}
127 | p(y_t | \{y_1, \dots , y_{t-1}\}, c) = g(y_{t-1},s_t,c)
128 | \end{align}
129 | where $s_t$ is the RNN's hidden state and $g$ is a nonlinear function that outputs the probability of $y_t$.
130 |
131 | In attention, a particular context $c_i$ depends on a sequence of annotations ($h_1, \dots , h_{T_x}$) to which an encoder maps the input sequence. While each annotation $h_i$ contains information about the input sequence, we want to focus attention on a particular part of the input. Thus the context vector is computed as a weighted sum as follows \cite{bahdanau2014neural}:
132 | \begin{align}
133 | c_i = \sum_{j=1}^{T_x} \alpha_{ij}h_j
134 | \end{align}
135 | where each context weight $\alpha_{ij}$ for each annotation $h_j$ is calculated as follows:
136 | \begin{align}
137 | \alpha_{ij} = \frac{\exp(e_{ij})}{\sum_{k=1}^{T_x} \exp(e_{ik})}
138 | \end{align}
139 | where
140 | \begin{align}
141 | e_{ij} = a(s_{i-1},h_j)
142 | \end{align}
143 | is known as the alignment model. Alignment models score how well matched the input at position $j$ and output at position $i$ are.
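
As a toy numerical example (the values are chosen purely for illustration and are not taken from the dataset), suppose $T_x = 3$ and the alignment scores for output position $i$ are $e_{i1} = 1.0$, $e_{i2} = 2.0$ and $e_{i3} = 0.5$. Equation 6 then gives
\begin{align*}
\alpha_{i1} \approx 0.23, \quad \alpha_{i2} \approx 0.63, \quad \alpha_{i3} \approx 0.14,
\end{align*}
so the context vector $c_i$ of equation 5 is dominated by the second annotation $h_2$, i.e. the model attends mostly to that part of the input.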
144 |
145 | Equations 1 through 7 are used for the LSTM and attention mechanisms while Adam is used as the optimiser and Mean Squared Error (MSE) is used as the loss function. Mathematically, denoting by $y_t$ the observed value and by $\hat{y}_t$ the model's prediction at time step $t$, the MSE loss is as follows:
146 | \begin{align}
147 | MSE = \frac{1}{n} \sum_{t=1}^{n} (y_t-\hat{y}_t)^2
148 | \end{align}
149 |
150 | \section{Comparison of LSTM and LSTM with attention}
151 |
152 | We can now move on to the implementation of the baseline LSTM. Both this baseline LSTM and the LSTM with attention were implemented within the Kaggle kernel environment in Python using the Keras library. It is worth noting here that special care was taken to perform sensible cross-checks. For example, the Kaggle kernel environment was set up to block the use of `future' data when training, preventing look-ahead bias. Indeed, look-ahead bias is a significant source of malpractice in the field of FTS, so it was worthy of extra consideration here. In addition, all implementations closely followed existing literature from the related work discussed in section 2.
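
For illustration, a minimal sketch of this kind of Keras setup is given below. It is not the exact implementation used for the experiments (which lives in the accompanying code); the feature count in particular is a placeholder, while the cell size, lag and dropout simply echo the tuned values reported later in this section.

\begin{verbatim}
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

lag, n_features = 15, 7    # lag window and per-day feature count (illustrative)

model = Sequential()
model.add(LSTM(64, input_shape=(lag, n_features)))  # LSTM with 64 cells
model.add(Dropout(0.1))                             # dropout regularisation
model.add(Dense(1))                                 # next-period return
model.compile(loss='mse', optimizer='adam')         # MSE loss, Adam optimiser
\end{verbatim}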
153 |
154 | To gain a good benchmark for the performance of an LSTM on forecasting stock prices from the Two Sigma dataset, we first consider a subset of stocks. This subset of stocks contains three very large companies (Intel, Wells Fargo, Amazon), one SME (Agilent Technologies) and one smaller company (Benchmark Electronics). These stocks were carefully chosen to have a wide variety of market cap, volatility and overall trend. Larger market cap stocks tend to be less volatile compared to smaller stocks. In addition, tech companies (like Amazon) tend to be less affected by the 2009 crash than finance companies (like Wells Fargo). These stocks and their volatility are shown in figure 2 as follows.
155 | \begin{figure}[!h]
156 | \includegraphics[width=395pt]{Stocks.png}
157 | \caption{Price and volatility of small stocks (Agilent, Benchmark Electronics - top) and large stocks (Intel, Wells Fargo, Amazon - bottom)}
158 | \end{figure}
159 |
160 | Figure 2 clearly shows one of the main challenges of FTS forecasting, which is the change of volatility over time. Indeed, looking at the volatility of Amazon, one notices a significant increase in volatility as the company grows. This is the main justification for picking a diverse set of stocks that are governed by different statistical properties.
161 |
162 | It is worth showing some loss curves for the benchmark LSTM in order to examine how well the model generalises to the validation data with increasing numbers of epochs. This is shown in figure 3 as follows.
163 |
164 | \begin{figure}[!h]
165 | \includegraphics[width=195pt]{lstm_plot1.png}
166 | \includegraphics[width=195pt]{lstm_plot2.png}
167 | \caption{Typical LSTM loss for training and validation data per epoch}
168 | \end{figure}
169 |
170 | Figure 3 shows the typical LSTM training and validation losses through epochs. It can be noted that the loss decreases as expected and that the model begins to overfit after a certain number of epochs. It is worth noting one particularity of this plot, which is that it shows a training loss greater than the validation loss for early epochs. While this may seem unusual, it is a documented artifact of using Keras. Indeed, the training loss reported by Keras is the average loss over all batches of the epoch, including the early, higher-loss batches, while the validation loss is computed only once at the end of the epoch with the updated weights. It is therefore expected that the first few epochs show a higher reported training loss than validation loss.
171 |
172 | In order to tune this benchmark model, a grid search is undertaken. The LSTM performance during this grid search hyperparameter tuning, evaluated on the validation set, can be seen in the following 3D loss plots.
173 |
174 | \begin{figure}[!h]
175 | \includegraphics[width=395pt]{LSTMgrid.png}
176 | \caption{LSTM grid search loss plots for 3 different sizes}
177 | \end{figure}
178 |
179 | These plots in figure 4 show the performance impact of dropout and lag for three different LSTM sizes. Indeed, it seems the LSTM with size 64 performs best on average. In addition, dropout is clearly a useful regulariser for all three LSTM sizes, particularly for values of 0.2 and higher, as it leads to lower losses. From this hyperparameter tuning, the best configuration chosen was $size=64$, $lag=15$ and $dropout=0.1$. This resulted in a loss of 0.000805 and an up-down accuracy (the fraction of days for which the predicted direction of movement matches the actual direction) of 0.572 (or 57\%). Indeed, this consistently outperforms random guessing, which would have a performance of 50\%, and is in line with top of the range algorithms, which usually achieve around 60\% up-down accuracy.
180 |
181 | Once the hyperparameters are tuned, a common FTS investigation worth pursuing is that of dataset shuffling techniques. In FTS, the choice of which piece of data to use as the validation set is not trivial. Indeed, there exist a myriad of ways of doing this which must be carefully considered before comparing this LSTM to an LSTM with attention. The three methods investigated in this paper are visualised in figure 5.
182 |
183 | \begin{figure}[!h]
184 | \begin{center}
185 | \includegraphics[width=395pt]{Shuffling.png}
186 | \end{center}
187 | \caption{Shuffling techniques visualised}
188 | \end{figure}
189 |
190 | The fixed origin method is the most naive and common method used. Given a certain split size, the start of the data is the training set and the end is the validation set. However, this is a particularly rudimentary method to choose, especially for a high-growth stock like Amazon. The reason is that Amazon's stock price starts off with low volatility and, as the stock grows, experiences increasingly volatile behaviour. We would therefore be training a model on low volatility dynamics and expecting it to deal with unseen high volatility dynamics for its predictions. This has indeed shown itself to be difficult and to come at a cost in performance for these types of stocks, as we will see in table 1. Therefore our benchmark for validation loss and performance may be misleading if we only consider this method. However, for stocks like Intel that are more constant in their volatility, this method is reasonable.
191 |
192 | The rolling origin recalibration method is slightly less vulnerable than fixed origin, as it computes the validation loss as an average over several different splits of the data, which avoids unrepresentative results caused by high volatility timeframes.
193 |
194 | Finally, the rolling window method is usually one of the most useful methods, as it is particularly suited to FTS algorithms run over long timeframes. This method outputs the average validation error over multiple rolling windows of data. The final values we obtain are therefore more representative of recent model performance, as we are less biased by strong or poor performance in the distant past. A sketch of this procedure is given below.
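
The following short sketch illustrates the rolling window idea (a hypothetical helper written here for clarity, not taken from our code base): the data is split into consecutive windows, a fresh model is fit on each training window, and the reported metric is the average validation error across windows.

\begin{verbatim}
def rolling_window_eval(X, y, make_model, train_size, val_size):
    """Average validation loss over consecutive rolling windows."""
    losses = []
    start = 0
    while start + train_size + val_size <= len(X):
        tr = slice(start, start + train_size)
        va = slice(start + train_size, start + train_size + val_size)
        model = make_model()                      # fresh model per window
        model.fit(X[tr], y[tr], shuffle=False, verbose=0)
        losses.append(model.evaluate(X[va], y[va], verbose=0))
        start += val_size                         # slide the window forward
    return sum(losses) / len(losses)
\end{verbatim}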
195 |
196 | It is now important to show the model performance of our tuned benchmark LSTM using these different shuffling techniques. Using regular fixed origin, our tuned LSTM achieved a loss of 0.000805 and an up-down accuracy of 57\%. Let's compare these values with the top six values of rolling window (RW) and rolling origin recalibration (ROR) in table 1.
197 |
198 | \begin{center}
199 | \begin{tabular}{||c c c c||}
200 | \hline
201 | Loss (RW) & Accuracy (RW) & Loss (ROR) & Accuracy (ROR) \\ [0.5ex]
202 | \hline\hline
203 | 0.000692 & 0.538 & 0.000810 & 0.555 \\
204 | \hline
205 | 0.000693 & 0.530 & 0.000825 & 0.571 \\
206 | \hline
207 | 0.000725 & 0.575 & 0.000978 & 0.607 \\
208 | \hline
209 | 0.000755 & 0.551 & 0.000989 & 0.563 \\
210 | \hline
211 | 0.000780 & 0.514 & 0.001001 & 0.579 \\
212 | \hline
213 | 0.000788 & 0.583 & 0.001014 & 0.538 \\
214 | \hline
215 | \end{tabular}
216 | \end{center}
217 | \begin{center}
218 | Table 1: Performance comparison of shuffling methods
219 | \end{center}
220 |
221 | Table 1 shows that both RW and ROR yield slightly better performances (58\% and 60\% respectively) than the simple fixed origin method. This suggests that for stocks like Amazon, using these shuffling methods would be unavoidable.
222 |
223 | Now that reasonable benchmarks have been ascertained, we can compare them to the performance of our LSTM with attention. The LSTM models with attention generated here had a strong tendency to overfit the training data for multiple epochs as shown in figure 6.
224 |
225 | \begin{figure}[!h]
226 | \includegraphics[width=195pt]{lstm_att_v0_ts_1_drop_04_cells_64.png}
227 | \includegraphics[width=195pt]{lstm_att_v0_ts_5_drop_0_cells_64.png}
228 | \caption{Typical LSTM+A loss for training and validation data per epoch}
229 | \end{figure}
230 |
231 | The loss curves of figure 6 show that the training loss continually decreases while the validation loss rises, suggesting overfitting. Due to the high number of weights used in our implementation of the attention mechanism (with 16 cells, there are around 10 000 parameters), the model has more parameters than there are data points, which inevitably leads to overfitting. Because of this large number of parameters relative to the amount of data, the loss values are very low and the training loss fluctuates little, which explains why the validation loss appears to increase immediately after one or two epochs in the graphs. Given this behaviour, we limited the number of epochs for the LSTMs with attention (grid search over 25 epochs, compared to 100 epochs for the LSTMs alone). To help alleviate the overfitting and lack of generalisation, a hyperparameter grid search was undertaken; in particular, dropout and the amount of lag were varied for given LSTM sizes, as was done for the LSTM without attention. Loss plots are shown in figure 7 below.
232 |
233 | \begin{figure}[!h]
234 | \includegraphics[width=395pt]{LSTMAgrid.png}
235 | \caption{LSTM+A grid search loss plots for 3 different sizes}
236 | \end{figure}
237 |
238 | The LSTM with attention achieves a globally lower loss than the LSTM without attention but is sensitive to different hyperparameters. In particular, the amount of lag is a significant factor in decreasing loss for the LSTM with attention: as figure 7 shows, lags larger than 30 drastically improve the loss. It could be hypothesised that this is due to the ability of attention to focus on relevant information from the distant past, whereas the LSTM without attention struggles to exploit such long-term dependencies. This seems to support our original hypothesis on the impact of adding attention to LSTMs for FTS. From this hyperparameter tuning, the best configuration was $size=16$, $lag=60$ and $dropout=0.05$, which resulted in a loss of 0.001511 and an up-down accuracy of 0.588 (or 59\%). Again, this is in line with state-of-the-art algorithms and is slightly higher than that of the LSTM without attention (58\% accuracy). A sketch of this type of restricted grid search is given below.
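
In this sketch, \texttt{make\_windows} and \texttt{build\_attention\_lstm} are hypothetical placeholders standing in for our data preparation and model construction code, and the grids shown are only indicative.

\begin{verbatim}
from itertools import product

best = None
for cells, lag, dropout in product([16, 32, 64],       # LSTM size
                                   [5, 30, 60],        # lag (time steps)
                                   [0.0, 0.05, 0.4]):  # dropout rate
    x_tr, y_tr, x_va, y_va = make_windows(series, lag)  # placeholder
    model = build_attention_lstm(cells, lag, dropout)   # placeholder
    hist = model.fit(x_tr, y_tr, validation_data=(x_va, y_va),
                     epochs=25, verbose=0)  # capped at 25 epochs
    val_loss = min(hist.history['val_loss'])
    if best is None or val_loss < best[0]:
        best = (val_loss, cells, lag, dropout)
print(best)
\end{verbatim}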
239 |
240 | It is now worth comparing the performance (loss and accuracy) of both the tuned LSTM and the tuned LSTM with attention across all five stocks. The optimal hyperparameters for both models were set as detailed previously and the following performances were observed.
241 |
242 | \begin{center}
243 | \begin{tabular}{||c c c c c c||}
244 | \hline
245 | & Intel & Wells Fargo & Amazon & Agilent & BE \\ [0.5ex]
246 | \hline\hline
247 | LSTM (loss) & 0.000805 & 0.001200 & 0.002804 & 0.001073 & 0.002300 \\
248 | \hline
249 | LSTM (accuracy) & 0.573 & 0.457 & 0.490 & 0.457 & 0.470 \\
250 | \hline
251 | LSTM+A (loss) & 0.001511 & 0.000357 & 0.003168 & 0.000955 & 0.001787\\
252 | \hline
253 | LSTM+A (accuracy) & 0.588 & 0.603 & 0.328 & 0.443 & 0.493\\
254 | \hline
255 | \end{tabular}
256 | \end{center}
257 | \begin{center}
258 | Table 2: LSTM and LSTM with attention performance comparison
259 | \end{center}
260 |
261 | From table 2 above, we can observe multiple interesting dynamics at play. The first is that the optimal hyperparameters chosen by grid search on the Intel stock fail to generalise to the other stocks, for both the LSTM and the LSTM with attention. While this was somewhat expected, it is interesting nonetheless to observe this limitation. More time would allow hyperparameters to be tuned for each stock, but this is outside the scope of this paper and is discussed in the following section. In addition, the LSTM with attention shows higher variability in its performance, which makes sense given how large the model is in terms of parameter count relative to the amount of input data. Overall, in this comparison the LSTM with attention has outperformed the regular LSTM, so we can tentatively confirm our original hypothesis, although further work is certainly required. Moreover, the attention architecture used in this paper has certain limitations and comes with the caveat of inherent complexity.
262 |
263 | \section{Limitations and Future Work}
264 |
265 | One particular limitation of this work is that only the first prediction was considered in the accuracy score. This means we are effectively doing single-step-ahead prediction even though the model is inherently capable of multi-step-ahead prediction. Multi-step-ahead forecasting is historically a much harder problem than single-step-ahead forecasting, as can be seen from the naive way of converting one into the other: the iterative method. The iterative method simply takes the model's predicted output for time $t+1$ and forecasts $t+2$ from that existing forecast, without any feedback from the real world. The model error therefore cascades through the iterations and grows out of control after more than a handful of time steps (a sketch of this feedback loop is given below). This is a fundamental issue on which most models fail in FTS forecasting. However, the model described in this paper is capable of seq2seq forecasting, which allows an entire sequence to be forecast without the uncontrollable error growth experienced by iterative methods. This potential for seq2seq is a key advantage of the models used and constitutes a major avenue for future work and research.
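
The sketch below assumes a Keras-style model trained on windows of length \texttt{lag}; the function itself is illustrative and not part of our codebase.

\begin{verbatim}
import numpy as np

def iterative_forecast(model, history, n_steps, lag):
    """Naive multi-step forecast: each prediction is fed back as input."""
    window = list(history[-lag:])
    preds = []
    for _ in range(n_steps):
        x = np.array(window[-lag:]).reshape(1, lag, 1)
        y_hat = model.predict(x, verbose=0).ravel()[0]
        preds.append(y_hat)
        window.append(y_hat)  # the forecast re-enters the input window,
                              # so any error compounds at every step
    return preds
\end{verbatim}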
266 |
267 | Another limitation of this investigation is confidence interval encoding. There are various ways to encode confidence; the approach used here was to make the models output a value between -1 and 1 via a $\tanh$ function. However, this confidence output was not taken into account in the up-down accuracy reported above. A simple way to exploit it would be to only execute trades whose confidence exceeds a certain threshold, a strategy that has been used successfully with many FTS forecasting models but remains non-trivial to tune. The higher the threshold for executing trades, the higher the overall up-down accuracy tends to be, but the lower the number of trades actually executed. There is therefore a major trade-off, as perfect accuracy on few trades may be less profitable than excellent accuracy on many trades. A sketch of such a thresholding rule is given below.
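
In this sketch the array names are assumptions, with \texttt{y\_pred} denoting the $\tanh$ confidence outputs and \texttt{y\_true} the realised returns.

\begin{verbatim}
import numpy as np

def thresholded_accuracy(y_pred, y_true, threshold):
    """Up-down accuracy over the trades with |confidence| > threshold."""
    mask = np.abs(y_pred) > threshold        # trades actually executed
    if mask.sum() == 0:
        return np.nan, 0
    hits = np.sign(y_pred[mask]) == np.sign(y_true[mask])
    return hits.mean(), int(mask.sum())

# Raising the threshold tends to raise accuracy but lowers trade count:
# for t in [0.0, 0.2, 0.5, 0.8]:
#     print(t, thresholded_accuracy(y_pred, y_true, t))
\end{verbatim}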
268 |
269 | One major possible extension to the work presented here would be the use of Bayesian optimisation, a technique commonly used in financial time series forecasting for hyperparameter tuning. While this was not done in this report due to time constraints, it is essential for a more detailed comparison of the potential of attention mechanisms in LSTMs for FTS forecasting, since, as shown in this paper, hyperparameters have different sensitivities in the benchmark LSTM and in the LSTM with attention. Future work would consist of choosing an appropriate surrogate model and acquisition function (such as expected improvement) in order to explore the hyperparameter space with a smart explore-exploit trade-off. This is still an open problem in the field of FTS forecasting. A sketch of how such a search could be set up is given below.
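
As an illustration only, such a search could be set up with an off-the-shelf library such as \texttt{scikit-optimize}; this was not done in this work, and \texttt{train\_and\_validate} is a placeholder wrapping model training and returning the validation loss.

\begin{verbatim}
from skopt import gp_minimize
from skopt.space import Integer, Real

space = [Integer(8, 128, name='lstm_size'),
         Integer(1, 60,  name='lag'),
         Real(0.0, 0.5,  name='dropout')]

def objective(params):
    lstm_size, lag, dropout = params
    return train_and_validate(lstm_size, lag, dropout)  # placeholder

result = gp_minimize(objective, space,
                     acq_func='EI',   # expected improvement
                     n_calls=50, random_state=0)
print(result.x, result.fun)           # best hyperparameters and loss
\end{verbatim}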
270 |
271 | Another possible extension would be to investigate how the LSTM with attention performs compared to other benchmark models for FTS forecasting. One particularly interesting comparison would be with cutting-edge Temporal Convolutional Networks (TCNs), which are currently showing promising performance compared to RNNs in NLP applications.
272 |
273 | \section{Conclusions}
274 |
275 | This paper has demonstrated the performance of a benchmark LSTM. To avoid relying on a single number as a benchmark, a wide range of experimentation was conducted, including an investigation of data shuffling methods such as rolling window and rolling origin recalibration, which showed the impact of volatility on estimates of model performance. Ultimately, the benchmark LSTM consistently performed in the 58\% range, in line with the best models currently available, whose performance is usually around 60\%.
276 |
277 | In addition, the LSTM with attention was successfully implemented and leveraged for FTS forecasting. This task, while novel, nonetheless closely followed methods used in other FTS papers and in papers using attention for NLP. The LSTM with attention achieved performances of around 60\% and above, albeit with higher variability than the benchmark LSTM.
278 |
279 | The final comparison does support the investigated hypothesis that attention can improve the performance of existing LSTMs for FTS. A slight improvement was highlighted in the final comparison table, although both models needed to be re-tuned between stocks. A theoretical explanation for why this is the case was suggested, developed and tested.
280 |
281 | Finally, further work on this topic has been suggested and the main limitations of the models were discussed.
282 |
283 | %\pagebreak
284 |
285 | \bibliographystyle{plain}
286 | \bibliography{ref}
287 |
288 | \end{document}
--------------------------------------------------------------------------------
/report/nicefrac.sty:
--------------------------------------------------------------------------------
1 | %%
2 | %% This is file `nicefrac.sty',
3 | %% generated with the docstrip utility.
4 | %%
5 | %% The original source files were:
6 | %%
7 | %% units.dtx (with options: `nicefrac')
8 | %%
9 | %% LaTeX package for typesetting nice fractions
10 | %%
11 | %% Copyright (C) 1998 Axel Reichert
12 | %% See the files README and COPYING.
13 | %%
14 | %% \CharacterTable
15 | %% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z
16 | %% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z
17 | %% Digits \0\1\2\3\4\5\6\7\8\9
18 | %% Exclamation \! Double quote \" Hash (number) \#
19 | %% Dollar \$ Percent \% Ampersand \&
20 | %% Acute accent \' Left paren \( Right paren \)
21 | %% Asterisk \* Plus \+ Comma \,
22 | %% Minus \- Point \. Solidus \/
23 | %% Colon \: Semicolon \; Less than \<
24 | %% Equals \= Greater than \> Question mark \?
25 | %% Commercial at \@ Left bracket \[ Backslash \\
26 | %% Right bracket \] Circumflex \^ Underscore \_
27 | %% Grave accent \` Left brace \{ Vertical bar \|
28 | %% Right brace \} Tilde \~}
29 | \NeedsTeXFormat{LaTeX2e}[1995/12/01]
30 | \ProvidesPackage{nicefrac}[1998/08/04 v0.9b Nice fractions]
31 | \newlength{\L@UnitsRaiseDisplaystyle}
32 | \newlength{\L@UnitsRaiseTextstyle}
33 | \newlength{\L@UnitsRaiseScriptstyle}
34 | \RequirePackage{ifthen}
35 | \DeclareRobustCommand*{\@UnitsNiceFrac}[3][]{%
36 | \ifthenelse{\boolean{mmode}}{%
37 | \settoheight{\L@UnitsRaiseDisplaystyle}{%
38 | \ensuremath{\displaystyle#1{M}}%
39 | }%
40 | \settoheight{\L@UnitsRaiseTextstyle}{%
41 | \ensuremath{\textstyle#1{M}}%
42 | }%
43 | \settoheight{\L@UnitsRaiseScriptstyle}{%
44 | \ensuremath{\scriptstyle#1{M}}%
45 | }%
46 | \settoheight{\@tempdima}{%
47 | \ensuremath{\scriptscriptstyle#1{M}}%
48 | }%
49 | \addtolength{\L@UnitsRaiseDisplaystyle}{%
50 | -\L@UnitsRaiseScriptstyle%
51 | }%
52 | \addtolength{\L@UnitsRaiseTextstyle}{%
53 | -\L@UnitsRaiseScriptstyle%
54 | }%
55 | \addtolength{\L@UnitsRaiseScriptstyle}{-\@tempdima}%
56 | \mathchoice
57 | {%
58 | \raisebox{\L@UnitsRaiseDisplaystyle}{%
59 | \ensuremath{\scriptstyle#1{#2}}%
60 | }%
61 | }%
62 | {%
63 | \raisebox{\L@UnitsRaiseTextstyle}{%
64 | \ensuremath{\scriptstyle#1{#2}}%
65 | }%
66 | }%
67 | {%
68 | \raisebox{\L@UnitsRaiseScriptstyle}{%
69 | \ensuremath{\scriptscriptstyle#1{#2}}%
70 | }%
71 | }%
72 | {%
73 | \raisebox{\L@UnitsRaiseScriptstyle}{%
74 | \ensuremath{\scriptscriptstyle#1{#2}}%
75 | }%
76 | }%
77 | \mkern-2mu/\mkern-1mu%
78 | \bgroup
79 | \mathchoice
80 | {\scriptstyle}%
81 | {\scriptstyle}%
82 | {\scriptscriptstyle}%
83 | {\scriptscriptstyle}%
84 | #1{#3}%
85 | \egroup
86 | }%
87 | {%
88 | \settoheight{\L@UnitsRaiseTextstyle}{#1{M}}%
89 | \settoheight{\@tempdima}{%
90 | \ensuremath{%
91 | \mbox{\fontsize\sf@size\z@\selectfont#1{M}}%
92 | }%
93 | }%
94 | \addtolength{\L@UnitsRaiseTextstyle}{-\@tempdima}%
95 | \raisebox{\L@UnitsRaiseTextstyle}{%
96 | \ensuremath{%
97 | \mbox{\fontsize\sf@size\z@\selectfont#1{#2}}%
98 | }%
99 | }%
100 | \ensuremath{\mkern-2mu}/\ensuremath{\mkern-1mu}%
101 | \ensuremath{%
102 | \mbox{\fontsize\sf@size\z@\selectfont#1{#3}}%
103 | }%
104 | }%
105 | }
106 | \DeclareRobustCommand*{\@UnitsUglyFrac}[3][]{%
107 | \ifthenelse{\boolean{mmode}}{%
108 | \frac{#1{#2}}{#1{#3}}%
109 | }%
110 | {%
111 | #1{#2}/#1{#3}%
112 | \PackageWarning{nicefrac}{%
113 | You used \protect\nicefrac\space or
114 | \protect\unitfrac\space in text mode\MessageBreak
115 | and specified the ``ugly'' option.\MessageBreak
116 | The fraction may be ambiguous or wrong.\MessageBreak
117 | Please make sure the denominator is
118 | correct.\MessageBreak
119 | If it is, you can safely ignore\MessageBreak
120 | this warning
121 | }%
122 | }%
123 | }
124 | \DeclareOption{nice}{%
125 | \DeclareRobustCommand*{\nicefrac}{\@UnitsNiceFrac}%
126 | }
127 | \DeclareOption{ugly}{%
128 | \DeclareRobustCommand*{\nicefrac}{\@UnitsUglyFrac}%
129 | }
130 | \ExecuteOptions{nice}
131 | \ProcessOptions*
132 | \endinput
133 | %%
134 | %% End of file `nicefrac.sty'.
--------------------------------------------------------------------------------
/report/nips_2016.sty:
--------------------------------------------------------------------------------
1 | % partial rewrite of the LaTeX2e package for submissions to the
2 | % Conference on Neural Information Processing Systems (NIPS):
3 | %
4 | % - uses more LaTeX conventions
5 | % - line numbers at submission time replaced with aligned numbers from
6 | % lineno package
7 | % - \nipsfinalcopy replaced with [final] package option
8 | % - automatically loads times package for authors
9 | % - loads natbib automatically; this can be suppressed with the
10 | % [nonatbib] package option
11 | % - adds foot line to first page identifying the conference
12 | %
13 | % Roman Garnett (garnett@wustl.edu) and the many authors of
14 | % nips15submit_e.sty, including MK and drstrip@sandia
15 | %
16 | % last revision: August 2016
17 |
18 | \NeedsTeXFormat{LaTeX2e}
19 | \ProvidesPackage{nips_2016}[2016/08/08 NIPS 2016 submission/camera-ready style file]
20 |
21 | % declare final option, which creates camera-ready copy
22 | \newif\if@nipsfinal\@nipsfinalfalse
23 | \DeclareOption{final}{
24 | \@nipsfinaltrue
25 | }
26 |
27 | % declare nonatbib option, which does not load natbib in case of
28 | % package clash (users can pass options to natbib via
29 | % \PassOptionsToPackage)
30 | \newif\if@natbib\@natbibtrue
31 | \DeclareOption{nonatbib}{
32 | \@natbibfalse
33 | }
34 |
35 | \ProcessOptions\relax
36 |
37 | % fonts
38 | \renewcommand{\rmdefault}{ptm}
39 | \renewcommand{\sfdefault}{phv}
40 |
41 | % change this every year for notice string at bottom
42 | \newcommand{\@nipsordinal}{30th}
43 | \newcommand{\@nipsyear}{2016}
44 | \newcommand{\@nipslocation}{Barcelona, Spain}
45 |
46 | % handle tweaks for camera-ready copy vs. submission copy
47 | \if@nipsfinal
48 | \newcommand{\@noticestring}{%
49 | \@nipsordinal\/ Conference on Neural Information Processing Systems
50 | (NIPS \@nipsyear), \@nipslocation.%
51 | }
52 | \else
53 | \newcommand{\@noticestring}{%
54 | Submitted to \@nipsordinal\/ Conference on Neural Information
55 | Processing Systems (NIPS \@nipsyear). Do not distribute.%
56 | }
57 |
58 | % line numbers for submission
59 | \RequirePackage{lineno}
60 | \linenumbers
61 |
62 | % fix incompatibilities between lineno and amsmath, if required, by
63 | % transparently wrapping linenomath environments around amsmath
64 | % environments
65 | \AtBeginDocument{%
66 | \@ifpackageloaded{amsmath}{%
67 | \newcommand*\patchAmsMathEnvironmentForLineno[1]{%
68 | \expandafter\let\csname old#1\expandafter\endcsname\csname #1\endcsname
69 | \expandafter\let\csname oldend#1\expandafter\endcsname\csname end#1\endcsname
70 | \renewenvironment{#1}%
71 | {\linenomath\csname old#1\endcsname}%
72 | {\csname oldend#1\endcsname\endlinenomath}%
73 | }%
74 | \newcommand*\patchBothAmsMathEnvironmentsForLineno[1]{%
75 | \patchAmsMathEnvironmentForLineno{#1}%
76 | \patchAmsMathEnvironmentForLineno{#1*}%
77 | }%
78 | \patchBothAmsMathEnvironmentsForLineno{equation}%
79 | \patchBothAmsMathEnvironmentsForLineno{align}%
80 | \patchBothAmsMathEnvironmentsForLineno{flalign}%
81 | \patchBothAmsMathEnvironmentsForLineno{alignat}%
82 | \patchBothAmsMathEnvironmentsForLineno{gather}%
83 | \patchBothAmsMathEnvironmentsForLineno{multline}%
84 | }{}
85 | }
86 | \fi
87 |
88 | % load natbib unless told otherwise
89 | \if@natbib
90 | \RequirePackage{natbib}
91 | \fi
92 |
93 | % set page geometry
94 | \usepackage[verbose=true,letterpaper]{geometry}
95 | \AtBeginDocument{
96 | \newgeometry{
97 | textheight=9in,
98 | textwidth=5.5in,
99 | top=1in,
100 | headheight=12pt,
101 | headsep=25pt,
102 | footskip=30pt
103 | }
104 | \@ifpackageloaded{fullpage}
105 | {\PackageWarning{nips_2016}{fullpage package not allowed! Overwriting formatting.}}
106 | {}
107 | }
108 |
109 | \widowpenalty=10000
110 | \clubpenalty=10000
111 | \flushbottom
112 | \sloppy
113 |
114 | % font sizes with reduced leading
115 | \renewcommand{\normalsize}{%
116 | \@setfontsize\normalsize\@xpt\@xipt
117 | \abovedisplayskip 7\p@ \@plus 2\p@ \@minus 5\p@
118 | \abovedisplayshortskip \z@ \@plus 3\p@
119 | \belowdisplayskip \abovedisplayskip
120 | \belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@
121 | }
122 | \normalsize
123 | \renewcommand{\small}{%
124 | \@setfontsize\small\@ixpt\@xpt
125 | \abovedisplayskip 6\p@ \@plus 1.5\p@ \@minus 4\p@
126 | \abovedisplayshortskip \z@ \@plus 2\p@
127 | \belowdisplayskip \abovedisplayskip
128 | \belowdisplayshortskip 3\p@ \@plus 2\p@ \@minus 2\p@
129 | }
130 | \renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt}
131 | \renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt}
132 | \renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt}
133 | \renewcommand{\large}{\@setfontsize\large\@xiipt{14}}
134 | \renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}}
135 | \renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}}
136 | \renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}}
137 | \renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}}
138 |
139 | % sections with less space
140 | \providecommand{\section}{}
141 | \renewcommand{\section}{%
142 | \@startsection{section}{1}{\z@}%
143 | {-2.0ex \@plus -0.5ex \@minus -0.2ex}%
144 | { 1.5ex \@plus 0.3ex \@minus 0.2ex}%
145 | {\large\bf\raggedright}%
146 | }
147 | \providecommand{\subsection}{}
148 | \renewcommand{\subsection}{%
149 | \@startsection{subsection}{2}{\z@}%
150 | {-1.8ex \@plus -0.5ex \@minus -0.2ex}%
151 | { 0.8ex \@plus 0.2ex}%
152 | {\normalsize\bf\raggedright}%
153 | }
154 | \providecommand{\subsubsection}{}
155 | \renewcommand{\subsubsection}{%
156 | \@startsection{subsubsection}{3}{\z@}%
157 | {-1.5ex \@plus -0.5ex \@minus -0.2ex}%
158 | { 0.5ex \@plus 0.2ex}%
159 | {\normalsize\bf\raggedright}%
160 | }
161 | \providecommand{\paragraph}{}
162 | \renewcommand{\paragraph}{%
163 | \@startsection{paragraph}{4}{\z@}%
164 | {1.5ex \@plus 0.5ex \@minus 0.2ex}%
165 | {-1em}%
166 | {\normalsize\bf}%
167 | }
168 | \providecommand{\subparagraph}{}
169 | \renewcommand{\subparagraph}{%
170 | \@startsection{subparagraph}{5}{\z@}%
171 | {1.5ex \@plus 0.5ex \@minus 0.2ex}%
172 | {-1em}%
173 | {\normalsize\bf}%
174 | }
175 | \providecommand{\subsubsubsection}{}
176 | \renewcommand{\subsubsubsection}{%
177 | \vskip5pt{\noindent\normalsize\rm\raggedright}%
178 | }
179 |
180 | % float placement
181 | \renewcommand{\topfraction }{0.85}
182 | \renewcommand{\bottomfraction }{0.4}
183 | \renewcommand{\textfraction }{0.1}
184 | \renewcommand{\floatpagefraction}{0.7}
185 |
186 | \newlength{\@nipsabovecaptionskip}\setlength{\@nipsabovecaptionskip}{7\p@}
187 | \newlength{\@nipsbelowcaptionskip}\setlength{\@nipsbelowcaptionskip}{\z@}
188 |
189 | \setlength{\abovecaptionskip}{\@nipsabovecaptionskip}
190 | \setlength{\belowcaptionskip}{\@nipsbelowcaptionskip}
191 |
192 | % swap above/belowcaptionskip lengths for tables
193 | \renewenvironment{table}
194 | {\setlength{\abovecaptionskip}{\@nipsbelowcaptionskip}%
195 | \setlength{\belowcaptionskip}{\@nipsabovecaptionskip}%
196 | \@float{table}}
197 | {\end@float}
198 |
199 | % footnote formatting
200 | \setlength{\footnotesep }{6.65\p@}
201 | \setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@}
202 | \renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@}
203 | \setcounter{footnote}{0}
204 |
205 | % paragraph formatting
206 | \setlength{\parindent}{\z@}
207 | \setlength{\parskip }{5.5\p@}
208 |
209 | % list formatting
210 | \setlength{\topsep }{4\p@ \@plus 1\p@ \@minus 2\p@}
211 | \setlength{\partopsep }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@}
212 | \setlength{\itemsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@}
213 | \setlength{\parsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@}
214 | \setlength{\leftmargin }{3pc}
215 | \setlength{\leftmargini }{\leftmargin}
216 | \setlength{\leftmarginii }{2em}
217 | \setlength{\leftmarginiii}{1.5em}
218 | \setlength{\leftmarginiv }{1.0em}
219 | \setlength{\leftmarginv }{0.5em}
220 | \def\@listi {\leftmargin\leftmargini}
221 | \def\@listii {\leftmargin\leftmarginii
222 | \labelwidth\leftmarginii
223 | \advance\labelwidth-\labelsep
224 | \topsep 2\p@ \@plus 1\p@ \@minus 0.5\p@
225 | \parsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@
226 | \itemsep \parsep}
227 | \def\@listiii{\leftmargin\leftmarginiii
228 | \labelwidth\leftmarginiii
229 | \advance\labelwidth-\labelsep
230 | \topsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@
231 | \parsep \z@
232 | \partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@
233 | \itemsep \topsep}
234 | \def\@listiv {\leftmargin\leftmarginiv
235 | \labelwidth\leftmarginiv
236 | \advance\labelwidth-\labelsep}
237 | \def\@listv {\leftmargin\leftmarginv
238 | \labelwidth\leftmarginv
239 | \advance\labelwidth-\labelsep}
240 | \def\@listvi {\leftmargin\leftmarginvi
241 | \labelwidth\leftmarginvi
242 | \advance\labelwidth-\labelsep}
243 |
244 | % create title
245 | \providecommand{\maketitle}{}
246 | \renewcommand{\maketitle}{%
247 | \par
248 | \begingroup
249 | \renewcommand{\thefootnote}{\fnsymbol{footnote}}
250 | % for perfect author name centering
251 | \renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}}
252 | % The footnote-mark was overlapping the footnote-text,
253 | % added the following to fix this problem (MK)
254 | \long\def\@makefntext##1{%
255 | \parindent 1em\noindent
256 | \hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1
257 | }
258 | \thispagestyle{empty}
259 | \@maketitle
260 | \@thanks
261 | \@notice
262 | \endgroup
263 | \let\maketitle\relax
264 | \let\thanks\relax
265 | }
266 |
267 | % rules for title box at top of first page
268 | \newcommand{\@toptitlebar}{
269 | \hrule height 4\p@
270 | \vskip 0.25in
271 | \vskip -\parskip%
272 | }
273 | \newcommand{\@bottomtitlebar}{
274 | \vskip 0.29in
275 | \vskip -\parskip
276 | \hrule height 1\p@
277 | \vskip 0.09in%
278 | }
279 |
280 | % create title (includes both anonymized and non-anonymized versions)
281 | \providecommand{\@maketitle}{}
282 | \renewcommand{\@maketitle}{%
283 | \vbox{%
284 | \hsize\textwidth
285 | \linewidth\hsize
286 | \vskip 0.1in
287 | \@toptitlebar
288 | \centering
289 | {\LARGE\bf \@title\par}
290 | \@bottomtitlebar
291 | \if@nipsfinal
292 | \def\And{%
293 | \end{tabular}\hfil\linebreak[0]\hfil%
294 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces%
295 | }
296 | \def\AND{%
297 | \end{tabular}\hfil\linebreak[4]\hfil%
298 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces%
299 | }
300 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}%
301 | \else
302 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}
303 | Anonymous Author(s) \\
304 | Affiliation \\
305 | Address \\
306 | \texttt{email} \\
307 | \end{tabular}%
308 | \fi
309 | \vskip 0.3in \@minus 0.1in
310 | }
311 | }
312 |
313 | % add conference notice to bottom of first page
314 | \newcommand{\ftype@noticebox}{8}
315 | \newcommand{\@notice}{%
316 | % give a bit of extra room back to authors on first page
317 | \enlargethispage{2\baselineskip}%
318 | \@float{noticebox}[b]%
319 | %UNCOMMENT THIS LINE FOR FOOTNOTE \footnotesize\@noticestring%
320 | \end@float%
321 | }
322 |
323 | % abstract styling
324 | \renewenvironment{abstract}%
325 | {%
326 | \vskip 0.075in%
327 | \centerline%
328 | {\large\bf Abstract}%
329 | \vspace{0.5ex}%
330 | \begin{quote}%
331 | }
332 | {
333 | \par%
334 | \end{quote}%
335 | \vskip 1ex%
336 | }
337 |
338 | \endinput
--------------------------------------------------------------------------------
/report/printlen.sty:
--------------------------------------------------------------------------------
1 | % printlen.sty Print lengths in a variety of units
2 | %
3 | % Author: Peter Wilson, Herries Press
4 | % Maintainer: Will Robertson (will dot robertson at latex-project dot org)
5 | % Copyright 2001 Peter R. Wilson
6 | % Released under the LaTeX Project Public License
7 | %
8 | % Extensions courtesy of Harald Harders (h.harders@tu-bs.de)
9 | %
10 | % Usage instructions are at the end of this file.
11 | %
12 | \NeedsTeXFormat{LaTeX2e}
13 | \ProvidesPackage{printlen}[2009/09/03 v1.1a print lengths with units]
14 | %
15 | % \uselengthunit{} sets \l@nunits to the value of
16 | % and \l@nunitperpt to the number of in 1pt.
17 | \newcommand{\uselengthunit}[1]{%
18 | \def\l@nunitperpt{1.0}\def\l@nunits{pt}%
19 | \def\l@nta{#1}\def\l@ntb{pt}%
20 | \ifx \l@nta\l@ntb
21 | \def\l@nunitperpt{1.0}\def\l@nunits{pt}%
22 | \else
23 | \def\l@ntb{pc}%
24 | \ifx \l@nta\l@ntb
25 | \def\l@nunitperpt{0.083333}\def\l@nunits{pc}%
26 | \else
27 | \def\l@ntb{in}%
28 | \ifx \l@nta\l@ntb
29 | \def\l@nunitperpt{0.013837}\def\l@nunits{in}%
30 | \else
31 | \def\l@ntb{mm}%
32 | \ifx \l@nta\l@ntb
33 | \def\l@nunitperpt{0.351459}\def\l@nunits{mm}%
34 | \else
35 | \def\l@ntb{cm}%
36 | \ifx \l@nta\l@ntb
37 | \def\l@nunitperpt{0.0351459}\def\l@nunits{cm}%
38 | \else
39 | \def\l@ntb{bp}%
40 | \ifx \l@nta\l@ntb
41 | \def\l@nunitperpt{0.996264}\def\l@nunits{bp}%
42 | \else
43 | \def\l@ntb{dd}%
44 | \ifx \l@nta\l@ntb
45 | \def\l@nunitperpt{0.9345718}\def\l@nunits{dd}%
46 | \else
47 | \def\l@ntb{cc}%
48 | \ifx \l@nta\l@ntb
49 | \def\l@nunitperpt{0.0778809}\def\l@nunits{cc}%
50 | \else
51 | \def\l@ntb{PT}%
52 | \ifx \l@nta\l@ntb
53 | \def\l@nunitperpt{1.0}\def\l@nunits{PT}%
54 | \fi
55 | \fi
56 | \fi
57 | \fi
58 | \fi
59 | \fi
60 | \fi
61 | \fi
62 | \fi
63 | }
64 | \uselengthunit{pt}
65 |
66 | % \printlength{} prints the value of in the units set
67 | % by \uselengthunit.
68 | \newcommand{\printlength}[1]{%
69 | \def\l@nta{pt}\ifx\l@nta\l@nunits\the#1\else
70 | \def\l@nta{PT}%
71 | \@tempdimc=\l@nunitperpt #1\relax\strip@pt\@tempdimc
72 | \unitspace\relax\ifmmode
73 | \mathrm{\ifx\l@nta\l@nunits pt\else\l@nunits\fi}%
74 | \else
75 | \ifx\l@nta\l@nunits pt\else\l@nunits\fi
76 | \fi\fi}
77 |
78 | % \rndprintlength{} prints the rounded value of in
79 | % the units set by \uselengthunit. Contributed by Harald Harders.
80 | \def\@round#1.#2\@empty{#1}%
81 | \newcommand{\rndprintlength}[1]{%
82 | \def\l@nta{pt}\ifx\l@nta\l@nunits\the#1\else
83 | \def\l@nta{PT}%
84 | \setlength{\@tempdimc}{\l@nunitperpt #1}%
85 | \addtolength{\@tempdimc}{0.5pt}%
86 | \edef\@@round{\strip@pt\@tempdimc}%
87 | \expandafter\@round\@@round.\@empty
88 | \unitspace\relax\ifmmode
89 | \mathrm{\ifx\l@nta\l@nunits pt\else\l@nunits\fi}%
90 | \else
91 | \ifx\l@nta\l@nunits pt\else\l@nunits\fi
92 | \fi\fi}
93 |
94 | % Small space. Contributed by Harald Harders.
95 | \newcommand{\unitspace}{\,}
96 |
97 | \endinput
98 |
99 | % USAGE:
100 | %
101 | % \printlength{} prints the value of a LaTeX length in the
102 | % units specified by \uselengthunit{}, where may be any TeX
103 | % length unit except for scaled point. That is, may be any of:
104 | % pt, pc, in, mm, cm, bp, dd or cc. When pt is set the printed length
105 | % value will include any stretch or shrink values, otherwise these
106 | % are not printed. The argument may also be PT, in which case
107 | % length values will be printed in pt units but without any stretch
108 | % or shrink values. An unknown value for is treated as though it
109 | % had been specified as pt.
110 | % The unit is separated from the number using the command
111 | % \unitspace which is set to \, by default. In math mode the units are
112 | % printed upright.
113 | % \rndprintlength{} prints the rounded value of a LaTeX length.
114 | % Use PT instead of pt for rounded points if there are stretch or
115 | % shrink values.
116 | %
117 | % The initial setting is \uselengthunit{pt}
118 | %
119 | % Example:
120 | %
121 | % The \verb|\textwidth| is \printlength{\textwidth} which is also
122 | % \uselengthunit{in}\printlength{\textwidth} and
123 | % \uselengthunit{mm}\printlength{\textwidth}.
124 | %
125 | %
126 | % CHANGE HISTORY
127 | %
128 | % Version 1.1a (2009/09/03)
129 | % - New maintainer (Will Robertson)
130 | %
131 | % Version 1.1 (2001/12/09)
132 | % - Print rounded values
133 | % - Space between value and units
134 | %
135 | % Version 1.0 (2001/11/03)
136 | % - First public release
137 | %
--------------------------------------------------------------------------------
/report/ref.bib:
--------------------------------------------------------------------------------
1 | @book{murphy1999technical,
2 | title={Technical analysis of the financial markets: A comprehensive guide to trading methods and applications},
3 | author={Murphy, John J},
4 | year={1999},
5 | publisher={Penguin}
6 | }
7 |
8 | @article{hollis2018deep,
9 | title={Deep Learning Algorithms Applied to Blockchain-Based Financial Time Series},
10 | author={Hollis, Thomas},
11 | year={2018}
12 | }
13 |
14 | @misc{kaggle2017twosigma,
15 | author={Kaggle},
16 | title={Two Sigma: Using News to Predict Stock Movements},
17 | howpublished={\url{https://www.kaggle.com/c/two-sigma-financial-news}},
18 | note = {Accessed: 2018-09-30}
19 | }
20 |
21 | @misc{NYSE2007NYSE,
22 | author={NYSE},
23 | title={NYSE Group Equities Streamlining},
24 | howpublished={\url{https://www.nyse.com/publicdocs/nyse/markets/nyse/CCG_Notification_Update1.pdf}},
25 | note = {Accessed: 2018-09-30}
26 | }
27 |
28 | @article{gamble2009british,
29 | title={British politics and the financial crisis},
30 | author={Gamble, Andrew},
31 | journal={British Politics},
32 | volume={4},
33 | number={4},
34 | pages={450--462},
35 | year={2009},
36 | publisher={Springer}
37 | }
38 |
39 | @article{malkiel1970efficient,
40 | title={Efficient capital markets: A review of theory and empirical work},
41 | author={Malkiel, Burton G and Fama, Eugene F},
42 | journal={The journal of Finance},
43 | volume={25},
44 | number={2},
45 | pages={383--417},
46 | year={1970},
47 | publisher={Wiley Online Library}
48 | }
49 |
50 | @book{hamilton1994time,
51 | title={Time series analysis},
52 | author={Hamilton, James Douglas},
53 | volume={2},
54 | year={1994},
55 | publisher={Princeton university press Princeton, NJ}
56 | }
57 |
58 | @article{walker1931periodicity,
59 | title={On periodicity in series of related terms},
60 | author={Walker, Gilbert Thomas},
61 | journal={Proceedings of the Royal Society of London. Series A, Containing Papers of a Mathematical and Physical Character},
62 | volume={131},
63 | number={818},
64 | pages={518--532},
65 | year={1931},
66 | publisher={The Royal Society London}
67 | }
68 |
69 | @article{burg1968new,
70 | title={A new analysis technique for time series data},
71 | author={Burg, John Parker},
72 | journal={Paper presented at NATO Advanced Study Institute on Signal Processing, Enschede, Netherlands, 1968},
73 | year={1968}
74 | }
75 |
76 | @article{engle1982autoregressive,
77 | title={Autoregressive conditional heteroscedasticity with estimates of the variance of United Kingdom inflation},
78 | author={Engle, Robert F},
79 | journal={Econometrica: Journal of the Econometric Society},
80 | pages={987--1007},
81 | year={1982},
82 | publisher={JSTOR}
83 | }
84 |
85 | @article{bollerslev1986generalized,
86 | title={Generalized autoregressive conditional heteroskedasticity},
87 | author={Bollerslev, Tim},
88 | journal={Journal of econometrics},
89 | volume={31},
90 | number={3},
91 | pages={307--327},
92 | year={1986},
93 | publisher={Elsevier}
94 | }
95 |
96 | @article{engle1993measuring,
97 | title={Measuring and testing the impact of news on volatility},
98 | author={Engle, Robert F and Ng, Victor K},
99 | journal={The journal of finance},
100 | volume={48},
101 | number={5},
102 | pages={1749--1778},
103 | year={1993},
104 | publisher={Wiley Online Library}
105 | }
106 |
107 | @article{pierre1998estimating,
108 | title={Estimating EGARCH-M models: Science or art?},
109 | author={Pierre, Eileen F St},
110 | journal={The Quarterly Review of Economics and Finance},
111 | volume={38},
112 | number={2},
113 | pages={167--180},
114 | year={1998},
115 | publisher={Elsevier}
116 | }
117 |
118 | @article{hamzaoui2016glosten,
119 | title={The Glosten-Jagannathan-Runkle-Generalized Autoregressive Conditional Heteroscedastic approach to investigating the foreign exchange forward premium volatility},
120 | author={Hamzaoui, Nessrine and Regaieg, Boutheina},
121 | journal={International Journal of Economics and Financial Issues},
122 | volume={6},
123 | number={4},
124 | pages={1608--1615},
125 | year={2016}
126 | }
127 |
128 | @article{hentschel1995all,
129 | title={All in the family: Nesting symmetric and asymmetric GARCH models},
130 | author={Hentschel, Ludger and others},
131 | journal={Journal of Financial Economics},
132 | volume={39},
133 | number={1},
134 | pages={71--104},
135 | year={1995}
136 | }
137 |
138 | @article{chen1989representations,
139 | title={Representations of non-linear systems: the NARMAX model},
140 | author={Chen, Sheng and Billings, Steve A},
141 | journal={International Journal of Control},
142 | volume={49},
143 | number={3},
144 | pages={1013--1032},
145 | year={1989},
146 | publisher={Taylor \& Francis}
147 | }
148 |
149 | @article{kohonen1982self,
150 | title={Self-organized formation of topologically correct feature maps},
151 | author={Kohonen, Teuvo},
152 | journal={Biological cybernetics},
153 | volume={43},
154 | number={1},
155 | pages={59--69},
156 | year={1982},
157 | publisher={Springer}
158 | }
159 |
160 | @article{koskela1998time,
161 | title={Time series prediction using recurrent SOM with local linear models},
162 | author={Koskela, Timo and Varsta, Markus and Heikkonen, Jukka and Kaski, Kimmo},
163 | journal={Int. J. of Knowledge-Based Intelligent Engineering Systems},
164 | volume={2},
165 | number={1},
166 | pages={60--68},
167 | year={1998}
168 | }
169 |
170 | @article{kim2003financial,
171 | title={Financial time series forecasting using support vector machines},
172 | author={Kim, Kyoung-jae},
173 | journal={Neurocomputing},
174 | volume={55},
175 | number={1-2},
176 | pages={307--319},
177 | year={2003},
178 | publisher={Elsevier}
179 | }
180 |
181 | @article{zhang2003time,
182 | title={Time series forecasting using a hybrid ARIMA and neural network model},
183 | author={Zhang, G Peter},
184 | journal={Neurocomputing},
185 | volume={50},
186 | pages={159--175},
187 | year={2003},
188 | publisher={Elsevier}
189 | }
190 |
191 | @inproceedings{krizhevsky2012imagenet,
192 | title={Imagenet classification with deep convolutional neural networks},
193 | author={Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
194 | booktitle={Advances in neural information processing systems},
195 | pages={1097--1105},
196 | year={2012}
197 | }
198 |
199 | @article{chandra2012cooperative,
200 | title={Cooperative coevolution of Elman recurrent neural networks for chaotic time series prediction},
201 | author={Chandra, Rohitash and Zhang, Mengjie},
202 | journal={Neurocomputing},
203 | volume={86},
204 | pages={116--123},
205 | year={2012},
206 | publisher={Elsevier}
207 | }
208 |
209 | @article{lin2009short,
210 | title={Short-term stock price prediction based on echo state networks},
211 | author={Lin, Xiaowei and Yang, Zehong and Song, Yixu},
212 | journal={Expert systems with applications},
213 | volume={36},
214 | number={3},
215 | pages={7313--7317},
216 | year={2009},
217 | publisher={Elsevier}
218 | }
219 |
220 | @article{kuremoto2014time,
221 | title={Time series forecasting using a deep belief network with restricted Boltzmann machines},
222 | author={Kuremoto, Takashi and Kimura, Shinsuke and Kobayashi, Kunikazu and Obayashi, Masanao},
223 | journal={Neurocomputing},
224 | volume={137},
225 | pages={47--56},
226 | year={2014},
227 | publisher={Elsevier}
228 | }
229 |
230 | @article{hochreiter1997long,
231 | title={Long short-term memory},
232 | author={Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
233 | journal={Neural computation},
234 | volume={9},
235 | number={8},
236 | pages={1735--1780},
237 | year={1997},
238 | publisher={MIT Press}
239 | }
240 |
241 | @article{bao2017deep,
242 | title={A deep learning framework for financial time series using stacked autoencoders and long-short term memory},
243 | author={Bao, Wei and Yue, Jun and Rao, Yulei},
244 | journal={PloS one},
245 | volume={12},
246 | number={7},
247 | pages={e0180944},
248 | year={2017},
249 | publisher={Public Library of Science}
250 | }
251 |
252 | @inproceedings{vaswani2017attention,
253 | title={Attention is all you need},
254 | author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
255 | booktitle={Advances in Neural Information Processing Systems},
256 | pages={5998--6008},
257 | year={2017}
258 | }
259 |
260 | @article{bahdanau2014neural,
261 | title={Neural machine translation by jointly learning to align and translate},
262 | author={Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
263 | journal={arXiv preprint arXiv:1409.0473},
264 | year={2014}
265 | }
266 |
267 | @article{graves2014neural,
268 | title={Neural turing machines},
269 | author={Graves, Alex and Wayne, Greg and Danihelka, Ivo},
270 | journal={arXiv preprint arXiv:1410.5401},
271 | year={2014}
272 | }
273 |
274 | @inproceedings{xu2015show,
275 | title={Show, attend and tell: Neural image caption generation with visual attention},
276 | author={Xu, Kelvin and Ba, Jimmy and Kiros, Ryan and Cho, Kyunghyun and Courville, Aaron and Salakhudinov, Ruslan and Zemel, Rich and Bengio, Yoshua},
277 | booktitle={International conference on machine learning},
278 | pages={2048--2057},
279 | year={2015}
280 | }
281 |
282 | @article{firat2016multi,
283 | title={Multi-way, multilingual neural machine translation with a shared attention mechanism},
284 | author={Firat, Orhan and Cho, Kyunghyun and Bengio, Yoshua},
285 | journal={arXiv preprint arXiv:1601.01073},
286 | year={2016}
287 | }
288 |
289 | @article{peters2018deep,
290 | title={Deep contextualized word representations},
291 | author={Peters, Matthew E and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},
292 | journal={arXiv preprint arXiv:1802.05365},
293 | year={2018}
294 | }
295 |
296 | @article{bergmeir2012use,
297 | title={On the use of cross-validation for time series predictor evaluation},
298 | author={Bergmeir, Christoph and Ben{\'\i}tez, Jos{\'e} M},
299 | journal={Information Sciences},
300 | volume={191},
301 | pages={192--213},
302 | year={2012},
303 | publisher={Elsevier}
304 | }
305 |
306 | @article{kingma2014adam,
307 | title={Adam: A method for stochastic optimization},
308 | author={Kingma, Diederik P and Ba, Jimmy},
309 | journal={arXiv preprint arXiv:1412.6980},
310 | year={2014}
311 | }
--------------------------------------------------------------------------------
/report/temp:
--------------------------------------------------------------------------------
1 | temp
2 |
--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | from os import listdir
3 | from os.path import isfile, join
4 | 
5 | import numpy as np
6 | import pandas as pd
7 | import matplotlib.pyplot as plt
8 | import seaborn as sns
9 |
10 |
11 | def EDA(X_train, y_train):
12 |     '''Prints a brief overview of the training data.
13 | 
14 |     Parameters
15 |     ----------
16 |     X_train : dataframe, training features
17 |     y_train : series or dataframe, training targets
18 | 
19 |     Returns
20 |     -------
21 |     none
22 |     '''
23 |     print(X_train.shape, y_train.shape)
24 |     print(y_train.head())
25 |     print(X_train.head())
26 |     print(X_train.info())
27 |     print(X_train.describe())
28 |
29 |
30 | def plot_asset(market, assetCode):
31 |     '''Plots an asset's price and volatility.
32 |
33 | Parameters
34 | ----------
35 |     market : dataframe
36 | See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.
37 | assetCode : string
38 | The asset code of the instrument you want to plot
39 |
40 | Returns
41 | -------
42 | none
43 |
44 | '''
45 | # Set plot style
46 | plt.style.use('seaborn')
47 |
48 | # Fetch the asset from data
49 | ass_market = market[market['assetCode'] == assetCode]
50 | ass_market.index = ass_market.time
51 |
52 |     # Setup 2 subplots (price and volatility)
53 | f, axs = plt.subplots(2,1, sharex=True, figsize=(12,8))
54 |
55 | # Subplot 1. Close price
56 | ass_market.close.plot(ax=axs[0], color='black')
57 | axs[0].set_ylabel("Price")
58 |
59 |     # Subplot 2. Volatility, proxied by the daily close-open spread
60 |     volat_df = (ass_market.close - ass_market.open)
61 |     volat_df.plot(color='darkred', ax=axs[1])
62 | axs[1].set_ylabel("Volatility")
63 |
64 | # Show all subplots with label
65 | f.suptitle("Asset: %s" % assetCode, fontsize=22)
66 | plt.tight_layout()
67 | plt.subplots_adjust(top=0.93)
68 | plt.show()
69 |
70 |
71 | def plot_chosen_assets(market_train_df):
72 | '''Prints a group of select stocks, their price and their volatility.
73 |
74 |     Parameters
75 |     ----------
76 |     market_train_df : dataframe of market data (see the Kaggle Two Sigma dataset)
77 |
78 | Returns
79 | -------
80 | none
81 |
82 | '''
83 | # Huge stocks (market cap 200BN - 1000BN)
84 | #plot_asset(market_train_df, "GOOGL.O") #nonsense data?
85 | #plot_asset(market_train_df, "AAPL.O") #randomly crashes from 2013-2015?
86 | #plot_asset(market_train_df, "FB.O") #Facebook: correct, verified, unpredictable volatility
87 | plot_asset(market_train_df, "INTC.O") #Intel: correct, verified, fair constant volatility
88 | plot_asset(market_train_df, "WFC.N") #Wells Fargo: correct, verified, crash volatility
89 | plot_asset(market_train_df, "AMZN.O") #Amazon: correct, verified, increasing volatility
90 |
91 | # SMEs (5-20Bn MC)
92 | #plot_asset(market_train_df, "ADI.N") #Analogue Devices (32Bn MC): kinda correct (one weird correction), verified
93 | #plot_asset(market_train_df, "NATI.O") #NI (6Bn MC): kinda correct (one weird correction in middle), verified
94 | plot_asset(market_train_df, "A.N") #Agilent Tech (20Bn MC): kinda correct (one weird correction toward end), verified
95 |
96 | # Small stocks (MC < 1Bn)
97 | #plot_asset(market_train_df, "ANDE.O") #Andersons (900M MC): unverified, high vol
98 | #plot_asset(market_train_df, "ALO.N") #Alio Gold (90M MC): unverified, low vol
99 | plot_asset(market_train_df, "BHE.N") #Benchmark Electronics (1Bn MC): verified, low vol
100 |
101 |
102 | def get_models_list(asset):
103 |
104 | # Import the list of models from the directory into a dataframe
105 | models_path = './data/models'
106 | models = [f for f in listdir(models_path) if isfile(join(models_path, f))]
107 | models = pd.DataFrame(models)
108 |
109 | # Strip the file extension
110 | models = models[0].str[:-5]
111 |
112 | # Split the string in multiple columns
113 | models = models.str.split('-', expand=True)
114 |
115 | # Remove the 'best-lstm' prefix columns
116 |     models = models.drop([0, 1], axis=1)
117 |
118 | # Set column names
119 | models.columns = ['epoch', 'val_loss', 'asset',
120 | 'lstm_size', 'lag', 'dropout']
121 |
122 | # Cast to numeric
123 | models['epoch'] = pd.to_numeric(models['epoch'])
124 | models['val_loss'] = pd.to_numeric(models['val_loss'])
125 | models['lstm_size'] = pd.to_numeric(models['lstm_size'])
126 | models['lag'] = pd.to_numeric(models['lag'])
127 | models['dropout'] = pd.to_numeric(models['dropout'])
128 |
129 | # Filter for the asset
130 | models = models[models['asset'] == asset]
131 |
132 | return models
133 |
134 |
135 | def plot_train_loss(history, ylim=(0, 0.01), xlim=(0, 50)):
136 | plt.ylim(ylim)
137 | plt.xlim(xlim)
138 |
139 | plt.plot(history['loss'])
140 | plt.plot(history['val_loss'])
141 |
142 | plt.xlabel('Epoch')
143 | plt.ylabel('Mean Absolute Error Loss')
144 | plt.title('Training Loss')
145 | plt.legend(['Train','Val'])
146 | plt.show()
147 |
148 |
149 | def get_history_from_file(file):
150 | with open(file, 'rb') as f:
151 | return pickle.load(f)
152 |
153 |
154 | def get_history_from_params(path, asset, lstm_size, lag, dropout):
155 | path = '{}/lstm-{}-{}-{}-{}.pickle'.format(
156 | path, asset, lstm_size, lag, dropout)
157 | with open(path, 'rb') as f:
158 | return pickle.load(f)
159 |
160 | if __name__ == '__main__':
161 | hist = get_history_from_params(
162 | './data/history/fixedpoint',
163 | 'INTC.O', 32, 30, 15)
164 | plot_train_loss(hist, xlim=(20, 100))
165 |
--------------------------------------------------------------------------------