├── README.md
├── data
│   └── S&P500IndexData-Table1.csv
├── models
│   ├── __init__.py
│   ├── lstm.py
│   ├── sparse_autoencoder.py
│   └── wavelet.py
├── run_training.py
├── runs
│   └── checkpoint
│       └── .DS_Store
└── utils
    ├── __init__.py
    └── utilities.py

/README.md:
--------------------------------------------------------------------------------
1 | # DeepLearning_Financial
2 | Attempt to replicate: A deep learning framework for financial time series using stacked autoencoders and long-short term memory
3 | 
4 | The original article can be found here: http://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0180944&type=printable
5 | 
6 | I use the S&P data file provided by the authors here: https://figshare.com/s/acdfb4918c0695405e33
7 | 
8 | My attempts haven't been successful so far. Given the very limited implementation details in the article, it may be that I am missing something important; however, the reported results seem too good to be true, so my assumption is that the authors have a bug in their own implementation. I would of course be happy to be proven wrong about this ;-)
9 | 
10 | To run the code:
11 | 
12 | > python run_training.py
13 | 
14 | This assumes that you have all the required packages installed - Python will tell you if one is missing.
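15 | 
16 | For reference, the imports across the code point to roughly these packages: numpy, pandas, scikit-learn, PyWavelets (`pywt`), statsmodels, matplotlib and PyTorch. Note that the code is written against the old pre-0.4 `Variable` API (e.g. `loss.data[0]`), so a current `torch` may need minor adjustments, e.g.:
17 | 
18 | > pip install numpy pandas scikit-learn pywavelets statsmodels matplotlib torch
19 | 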
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .sparse_autoencoder import Autoencoder
2 | from .lstm import Sequence
3 | from .wavelet import waveletSmooth
--------------------------------------------------------------------------------
/models/lstm.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import sklearn
4 | import torch
5 | import torch.nn as nn
6 | from torch.autograd import Variable
7 | import torch.optim as optim
8 | from torch.utils.data import Dataset, DataLoader
9 | import torch.nn.functional as F
10 | 
11 | 
12 | class Sequence(nn.Module):
13 |     def __init__(self, nb_features=1, hidden_size=100, nb_layers=5, dropout=0.5):
14 |         super(Sequence, self).__init__()
15 |         self.nb_features=nb_features
16 |         self.hidden_size=hidden_size
17 |         self.nb_layers=nb_layers
18 |         self.lstm = nn.LSTM(self.nb_features, self.hidden_size, self.nb_layers, dropout=dropout)
19 |         self.lin = nn.Linear(self.hidden_size,1)
20 | 
21 |     def forward(self, input):
22 |         h0 = Variable(torch.zeros(self.nb_layers, input.size()[1], self.hidden_size))
23 |         #print(type(h0))
24 |         c0 = Variable(torch.zeros(self.nb_layers, input.size()[1], self.hidden_size))
25 |         #print(type(c0))
26 |         output, hn = self.lstm(input, (h0, c0))
27 |         #output = F.relu(self.lin(output))
28 |         out = self.lin(output[-1])
29 |         return out
--------------------------------------------------------------------------------
/models/sparse_autoencoder.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import sklearn
4 | import torch
5 | import torch.nn as nn
6 | from torch.autograd import Variable
7 | import torch.optim as optim
8 | from torch.utils.data import Dataset, DataLoader
9 | import torch.nn.functional as F
10 | 
11 | class Autoencoder(torch.nn.Module):
12 |     def __init__(self, n_in, n_hidden=10, sparsity_target=0.05, sparsity_weight=0.2, lr=0.001, weight_decay=0.0):#lr=0.0001):
13 |         super(Autoencoder, self).__init__()
14 |         self.n_in = n_in
15 |         self.n_hidden = n_hidden
16 |         self.sparsity_target = sparsity_target
17 |         self.sparsity_weight = sparsity_weight
18 |         self.weight_decay = weight_decay
19 |         self.lr = lr
20 |         self.build_model()
21 |     # end constructor
22 | 
23 | 
24 |     def build_model(self):
25 |         self.encoder = torch.nn.Sequential(
26 |             torch.nn.Linear(self.n_in, self.n_hidden),
27 |             torch.nn.Sigmoid())
28 |         self.decoder = torch.nn.Sequential(
29 |             torch.nn.Linear(self.n_hidden, self.n_in))#,
30 |             #torch.nn.Sigmoid())
31 |         self.l1_loss = torch.nn.L1Loss(size_average=False)
32 |         self.optimizer = torch.optim.Adam(self.parameters(), self.lr, weight_decay=self.weight_decay)
33 |     # end method
34 | 
35 | 
36 |     def forward(self, inputs):
37 |         hidden = self.encoder(inputs)
38 |         hidden_mean = torch.mean(hidden, dim=0)
39 |         sparsity_loss = torch.sum(self.kl_divergence(self.sparsity_target, hidden_mean))
40 |         return self.decoder(hidden), sparsity_loss
41 |     # end method
42 | 
43 | 
44 |     def kl_divergence(self, p, q):
45 |         return p * torch.log(p / q) + (1 - p) * torch.log((1 - p) / (1 - q)) # Kullback Leibler divergence
46 |     # end method
47 | 
48 | 
49 |     def fit(self, X, n_epoch=10, batch_size=64, en_shuffle=True):
50 |         for epoch in range(n_epoch):
51 |             if en_shuffle:
52 |                 print("Data Shuffled")
53 |                 X = sklearn.utils.shuffle(X)
54 |             for local_step, X_batch in enumerate(self.gen_batch(X, batch_size)):
55 |                 inputs = torch.autograd.Variable(torch.from_numpy(X_batch.astype(np.float32)))
56 |                 outputs, sparsity_loss = self.forward(inputs)
57 | 
58 |                 l1_loss = self.l1_loss(outputs, inputs)
59 |                 loss = l1_loss + self.sparsity_weight * sparsity_loss
60 |                 self.optimizer.zero_grad()  # clear gradients for this training step
61 |                 loss.backward()             # backpropagation, compute gradients
62 |                 self.optimizer.step()       # apply gradients
63 |                 if local_step % 50 == 0:
64 |                     print ("Epoch %d/%d | Step %d/%d | train loss: %.4f | l1 loss: %.4f | sparsity loss: %.4f"
65 |                            %(epoch+1, n_epoch, local_step, len(X)//batch_size,
66 |                              loss.data[0], l1_loss.data[0], sparsity_loss.data[0]))
67 |     # end method
68 | 
69 | 
70 |     def gen_batch(self, arr, batch_size):
71 |         for i in range(0, len(arr), batch_size):
72 |             yield arr[i : i+batch_size]
--------------------------------------------------------------------------------
/models/wavelet.py:
--------------------------------------------------------------------------------
1 | import pywt
2 | from statsmodels.robust import mad
3 | import numpy as np
4 | 
5 | def waveletSmooth( x, wavelet="db4", level=1, DecLvl=2, title=None):
6 |     # calculate the wavelet coefficients
7 |     coeff = pywt.wavedec( x, wavelet, mode="per", level=DecLvl )
8 |     # calculate a threshold
9 |     sigma = mad( coeff[-level] )
10 |     # changing this threshold also changes the behavior,
11 |     # but I have not played with this very much
12 |     uthresh = sigma * np.sqrt( 2*np.log( len( x ) ) )
13 |     coeff[1:] = ( pywt.threshold( i, value=uthresh, mode="soft" ) for i in coeff[1:] )
14 |     # reconstruct the signal using the thresholded coefficients
15 |     y = pywt.waverec( coeff, wavelet, mode="per" )
16 |     return y
--------------------------------------------------------------------------------
/run_training.py:
--------------------------------------------------------------------------------
1 | ## EXTERNAL
2 | import pandas as pd
3 | import numpy as np
4 | import pickle
5 | import shutil
6 | import torch
7 | import torch.nn as nn
8 | from torch.autograd import Variable
9 | import torch.optim as optim
10 | from torch.utils.data import Dataset, DataLoader
11 | import torch.nn.functional as F
12 | import numpy as 
np 13 | import sklearn 14 | import time 15 | import os 16 | import random 17 | from sklearn.preprocessing import MinMaxScaler 18 | from sklearn.preprocessing import StandardScaler 19 | import matplotlib.pyplot as plt 20 | 21 | ##INTERNAL 22 | from models import Autoencoder 23 | from models import Sequence 24 | from models import waveletSmooth 25 | 26 | from utils import prepare_data_lstm, ExampleDataset, save_checkpoint, evaluate_lstm, backtest 27 | 28 | # --------------------------------------------------------------------------- 29 | # --------------------------- STEP 0: LOAD DATA ----------------------------- 30 | # --------------------------------------------------------------------------- 31 | 32 | path = "./data/S&P500IndexData-Table1.csv" 33 | data_master = pd.read_csv(path, sep=";") 34 | 35 | # 600 is a bit more than 2 years of data 36 | num_datapoints = 600 37 | # roll by approx. 60 days - 3 months of trading days 38 | step_size = int(0.1 * num_datapoints) 39 | # calculate number of iterations we can do over the entire data set 40 | num_iterations = int(np.ceil((len(data_master)-num_datapoints)/step_size))+2 41 | 42 | y_test_lst = [] 43 | preds = [] 44 | ct = 0 45 | 46 | for n in range(num_iterations): 47 | print(n) 48 | data = data_master.iloc[n*step_size:num_datapoints+n*step_size,:] 49 | data.columns = [col.strip() for col in data.columns.tolist()] 50 | print(data.shape) 51 | ct +=1 52 | 53 | feats = data.iloc[:,2:] 54 | 55 | # This is a scaling of the inputs such that they are in an appropriate range 56 | feats["Close Price"].loc[:] = feats["Close Price"].loc[:]/1000 57 | feats["Open Price"].loc[:] = feats["Open Price"].loc[:]/1000 58 | feats["High Price"].loc[:] = feats["High Price"].loc[:]/1000 59 | feats["Low Price"].loc[:] = feats["Low Price"].loc[:]/1000 60 | feats["Volume"].loc[:] = feats["Volume"].loc[:]/1000000 61 | feats["MACD"].loc[:] = feats["MACD"].loc[:]/10 62 | feats["CCI"].loc[:] = feats["CCI"].loc[:]/100 63 | feats["ATR"].loc[:] = feats["ATR"].loc[:]/100 64 | feats["BOLL"].loc[:] = feats["BOLL"].loc[:]/1000 65 | feats["EMA20"].loc[:] = feats["EMA20"].loc[:]/1000 66 | feats["MA10"].loc[:] = feats["MA10"].loc[:]/1000 67 | feats["MTM6"].loc[:] = feats["MTM6"].loc[:]/100 68 | feats["MA5"].loc[:] = feats["MA5"].loc[:]/1000 69 | feats["MTM12"].loc[:] = feats["MTM12"].loc[:]/100 70 | feats["ROC"].loc[:] = feats["ROC"].loc[:]/10 71 | feats["SMI"].loc[:] = feats["SMI"].loc[:] * 10 72 | feats["WVAD"].loc[:] = feats["WVAD"].loc[:]/100000000 73 | feats["US Dollar Index"].loc[:] = feats["US Dollar Index"].loc[:]/100 74 | feats["Federal Fund Rate"].loc[:] = feats["Federal Fund Rate"].loc[:] 75 | 76 | data_close = feats["Close Price"].copy() 77 | data_close_new = data_close 78 | 79 | # Split in train, test and validation set 80 | 81 | test = feats[-step_size:] 82 | validate = feats[-2*step_size:-step_size] 83 | train = feats[:-2*step_size] 84 | 85 | y_test = data_close_new[-step_size:].as_matrix() 86 | y_validate = data_close_new[-2*step_size:-step_size].as_matrix() 87 | y_train = data_close_new[:-2*step_size].as_matrix() 88 | feats_train = train.as_matrix().astype(np.float) 89 | feats_validate = validate.as_matrix().astype(np.float) 90 | feats_test = test.as_matrix().astype(np.float) 91 | 92 | # --------------------------------------------------------------------------- 93 | # ----------------------- STEP 2.0: NORMALIZE DATA -------------------------- 94 | # --------------------------------------------------------------------------- 95 | 96 | # REMOVED THE NORMALIZATION AND 
MANUALLY SCALED TO APPROPRIATE VALUES ABOVE 97 | 98 | """ 99 | scaler = StandardScaler().fit(feats_train) 100 | 101 | feats_norm_train = scaler.transform(feats_train) 102 | feats_norm_validate = scaler.transform(feats_validate) 103 | feats_norm_test = scaler.transform(feats_test) 104 | """ 105 | """ 106 | scaler = MinMaxScaler(feature_range=(0,1)) 107 | scaler.fit(feats_train) 108 | 109 | feats_norm_train = scaler.transform(feats_train) 110 | feats_norm_validate = scaler.transform(feats_validate) 111 | feats_norm_test = scaler.transform(feats_test) 112 | """ 113 | data_close = pd.Series(np.concatenate((y_train, y_validate, y_test))) 114 | 115 | feats_norm_train = feats_train.copy() 116 | feats_norm_validate = feats_validate.copy() 117 | feats_norm_test = feats_test.copy() 118 | 119 | # --------------------------------------------------------------------------- 120 | # ----------------------- STEP 2.1: DENOISE USING DWT ----------------------- 121 | # --------------------------------------------------------------------------- 122 | 123 | for i in range(feats_norm_train.shape[1]): 124 | feats_norm_train[:,i] = waveletSmooth(feats_norm_train[:,i], level=1)[-len(feats_norm_train):] 125 | 126 | # for the validation we have to do the transform using training data + the current and past validation data 127 | # i.e. we CAN'T USE all the validation data because we would then look into the future 128 | temp = np.copy(feats_norm_train) 129 | feats_norm_validate_WT = np.copy(feats_norm_validate) 130 | for j in range(feats_norm_validate.shape[0]): 131 | #first concatenate train with the latest validation sample 132 | temp = np.append(temp, np.expand_dims(feats_norm_validate[j,:], axis=0), axis=0) 133 | for i in range(feats_norm_validate.shape[1]): 134 | feats_norm_validate_WT[j,i] = waveletSmooth(temp[:,i], level=1)[-1] 135 | 136 | # for the test we have to do the transform using training data + validation data + current and past test data 137 | # i.e. 
we CAN'T USE all the test data because we would then look into the future
138 | temp_train = np.copy(feats_norm_train)
139 | temp_val = np.copy(feats_norm_validate)
140 | temp = np.concatenate((temp_train, temp_val))
141 | feats_norm_test_WT = np.copy(feats_norm_test)
142 | for j in range(feats_norm_test.shape[0]):
143 | #first concatenate train and validation data with the latest test sample
144 | temp = np.append(temp, np.expand_dims(feats_norm_test[j,:], axis=0), axis=0)
145 | for i in range(feats_norm_test.shape[1]):
146 | feats_norm_test_WT[j,i] = waveletSmooth(temp[:,i], level=1)[-1]
147 | 
148 | # ---------------------------------------------------------------------------
149 | # ------------- STEP 3: ENCODE FEATURES USING STACKED AUTOENCODER -----------
150 | # ---------------------------------------------------------------------------
151 | 
152 | num_hidden_1 = 10
153 | num_hidden_2 = 10
154 | num_hidden_3 = 10
155 | num_hidden_4 = 10
156 | 
157 | n_epoch=100#20000
158 | 
159 | # ---- train using training data
160 | 
161 | # The n==0 statement is done because we only want to initialize the network once and then keep training
162 | # as we move through time
163 | 
164 | if n == 0:
165 | auto1 = Autoencoder(feats_norm_train.shape[1], num_hidden_1)
166 | auto1.fit(feats_norm_train, n_epoch=n_epoch)
167 | 
168 | inputs = torch.autograd.Variable(torch.from_numpy(feats_norm_train.astype(np.float32)))
169 | 
170 | if n == 0:
171 | auto2 = Autoencoder(num_hidden_1, num_hidden_2)
172 | auto1_out = auto1.encoder(inputs).data.numpy()
173 | auto2.fit(auto1_out, n_epoch=n_epoch)
174 | 
175 | if n == 0:
176 | auto3 = Autoencoder(num_hidden_2, num_hidden_3)
177 | auto1_out = torch.autograd.Variable(torch.from_numpy(auto1_out.astype(np.float32)))
178 | auto2_out = auto2.encoder(auto1_out).data.numpy()
179 | auto3.fit(auto2_out, n_epoch=n_epoch)
180 | 
181 | if n == 0:
182 | auto4 = Autoencoder(num_hidden_3, num_hidden_4)
183 | auto2_out = torch.autograd.Variable(torch.from_numpy(auto2_out.astype(np.float32)))
184 | auto3_out = auto3.encoder(auto2_out).data.numpy()
185 | auto4.fit(auto3_out, n_epoch=n_epoch)
186 | 
187 | 
188 | # Change to evaluation mode, in this mode the network behaves differently, e.g. dropout is switched off and so on
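# NOTE: the Autoencoder in models/sparse_autoencoder.py only contains Linear and Sigmoid
# layers, so eval()/train() do not change its behaviour here; the switch below only
# matters if dropout or batch-norm layers are ever added to that model.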
189 | auto1.eval()
190 | auto2.eval()
191 | auto3.eval()
192 | auto4.eval()
193 | 
194 | X_train = feats_norm_train
195 | X_train = torch.autograd.Variable(torch.from_numpy(X_train.astype(np.float32)))
196 | train_encoded = auto4.encoder(auto3.encoder(auto2.encoder(auto1.encoder(X_train))))
197 | train_encoded = train_encoded.data.numpy()
198 | 
199 | # ---- encode validation and test data using autoencoder trained only on training data
200 | X_validate = feats_norm_validate_WT
201 | X_validate = torch.autograd.Variable(torch.from_numpy(X_validate.astype(np.float32)))
202 | validate_encoded = auto4.encoder(auto3.encoder(auto2.encoder(auto1.encoder(X_validate))))
203 | validate_encoded = validate_encoded.data.numpy()
204 | 
205 | X_test = feats_norm_test_WT
206 | X_test = torch.autograd.Variable(torch.from_numpy(X_test.astype(np.float32)))
207 | test_encoded = auto4.encoder(auto3.encoder(auto2.encoder(auto1.encoder(X_test))))
208 | test_encoded = test_encoded.data.numpy()
209 | 
210 | # switch back to training mode
211 | auto1.train()
212 | auto2.train()
213 | auto3.train()
214 | auto4.train()
215 | 
216 | 
217 | # ---------------------------------------------------------------------------
218 | # -------------------- STEP 4: PREPARE TIME-SERIES --------------------------
219 | # ---------------------------------------------------------------------------
220 | 
221 | # split the entire training time-series into pieces, depending on the number
222 | # of time steps for the LSTM
223 | 
224 | time_steps = 4
225 | 
226 | args = (train_encoded, validate_encoded, test_encoded)
227 | 
228 | x_concat = np.concatenate(args)
229 | 
230 | validate_encoded_extra = np.concatenate((train_encoded[-time_steps:], validate_encoded))
231 | test_encoded_extra = np.concatenate((validate_encoded[-time_steps:], test_encoded))
232 | 
233 | y_train_input = data_close[:-len(validate_encoded)-len(test_encoded)]
234 | y_val_input = data_close[-len(test_encoded)-len(validate_encoded)-1:-len(test_encoded)]
235 | y_test_input = data_close[-len(test_encoded)-1:]
236 | 
237 | x, y = prepare_data_lstm(train_encoded, y_train_input, time_steps, log_return=True, train=True)
238 | x_v, y_v = prepare_data_lstm(validate_encoded_extra, y_val_input, time_steps, log_return=False, train=False)
239 | x_te, y_te = prepare_data_lstm(test_encoded_extra, y_test_input, time_steps, log_return=False, train=False)
240 | 
241 | 
242 | x_test = x_te
243 | x_validate = x_v
244 | x_train = x
245 | 
246 | y_test = y_te
247 | y_validate = y_v
248 | y_train = y
249 | 
250 | y_train = y_train.as_matrix()
251 | 
252 | # ---------------------------------------------------------------------------
253 | # ------------- STEP 5: TIME-SERIES REGRESSION USING LSTM -------------------
254 | # ---------------------------------------------------------------------------
255 | 
256 | batchsize = 60
257 | 
258 | trainloader = ExampleDataset(x_train, y_train, batchsize)
259 | valloader = ExampleDataset(x_validate, y_validate, 1)
260 | testloader = ExampleDataset(x_test, y_test, 1)
261 | 
262 | # set random seed to 0
263 | np.random.seed(0)
264 | torch.manual_seed(0)
265 | 
266 | # build the model
267 | if n == 0:
268 | seq = Sequence(num_hidden_4, hidden_size=100, nb_layers=3)
269 | 
270 | resume = ""
271 | 
272 | # if a path is given in resume, we resume from a checkpoint
273 | if os.path.isfile(resume):
274 | print("=> loading checkpoint '{}'".format(resume))
275 | checkpoint = torch.load(resume)
276 | start_epoch = checkpoint['epoch']
277 | 
seq.load_state_dict(checkpoint['state_dict']) 278 | print("=> loaded checkpoint '{}' (epoch {})" 279 | .format(resume, checkpoint['epoch'])) 280 | else: 281 | print("=> no checkpoint found at '{}'".format(resume)) 282 | 283 | # get the number of model parameters 284 | print('Number of model parameters: {}'.format( 285 | sum([p.data.nelement() for p in seq.parameters()]))) 286 | 287 | # we use the mean squared error loss 288 | criterion = nn.MSELoss() 289 | 290 | optimizer = optim.Adam(params=seq.parameters(), lr=0.0005) 291 | 292 | start_epoch = 0 293 | epochs = 1#5000 294 | 295 | global_loss_val = np.inf 296 | #begin to train 297 | global_profit_val = -np.inf 298 | 299 | for i in range(start_epoch, epochs): 300 | seq.train() 301 | loss_train = 0 302 | 303 | # shuffle ONLY training set 304 | combined = list(zip(x_train, y_train)) 305 | random.shuffle(combined) 306 | x_train=[] 307 | y_train=[] 308 | x_train[:], y_train[:] = zip(*combined) 309 | 310 | # initialize trainloader with newly shuffled training data 311 | trainloader = ExampleDataset(x_train, y_train, batchsize) 312 | 313 | pred_train = [] 314 | target_train = [] 315 | for j in range(len(trainloader)): 316 | sample = trainloader[j] 317 | sample_x = sample["x"] 318 | 319 | if len(sample_x) != 0: 320 | 321 | sample_x = np.stack(sample_x) 322 | input = Variable(torch.FloatTensor(sample_x), requires_grad=False) 323 | input = torch.transpose(input, 0, 1) 324 | target = Variable(torch.FloatTensor([x for x in sample["y"]]), requires_grad=False) 325 | 326 | optimizer.zero_grad() 327 | out = seq(input) 328 | loss = criterion(out, target) 329 | 330 | loss_train += float(loss.data.numpy()) 331 | pred_train.extend(out.data.numpy().flatten().tolist()) 332 | target_train.extend(target.data.numpy().flatten().tolist()) 333 | 334 | loss.backward() 335 | 336 | optimizer.step() 337 | 338 | 339 | if i % 100 == 0: 340 | 341 | plt.plot(pred_train) 342 | plt.plot(target_train) 343 | plt.show() 344 | 345 | loss_val, pred_val, target_val = evaluate_lstm(dataloader=valloader, model=seq, criterion=criterion) 346 | 347 | plt.scatter(range(len(pred_val)), pred_val) 348 | plt.scatter(range(len(pred_val)), target_val) 349 | plt.show() 350 | 351 | index, real = backtest(pred_val, y_validate) 352 | 353 | print(index[-1]) 354 | # save according to profitability 355 | if index[-1]>global_profit_val and i>200: 356 | print("CURRENT BEST") 357 | global_profit_val = index[-1] 358 | save_checkpoint({'epoch': i + 1, 'state_dict': seq.state_dict()}, is_best=True, filename='checkpoint_lstm.pth.tar') 359 | 360 | save_checkpoint({'epoch': i + 1, 'state_dict': seq.state_dict()}, is_best=False, filename='checkpoint_lstm.pth.tar') 361 | 362 | print("LOSS TRAIN: " + str(float(loss_train))) 363 | print("LOSS VAL: " + str(float(loss_val))) 364 | print(i) 365 | 366 | # do the final test 367 | # first load the best checkpoint on the val set 368 | 369 | resume = "./runs/checkpoint/model_best.pth.tar" 370 | #resume = "./runs/HF/checkpoint_lstm.pth.tar" 371 | 372 | if os.path.isfile(resume): 373 | print("=> loading checkpoint '{}'".format(resume)) 374 | checkpoint = torch.load(resume) 375 | start_epoch = checkpoint['epoch'] 376 | seq.load_state_dict(checkpoint['state_dict']) 377 | print("=> loaded checkpoint '{}' (epoch {})" 378 | .format(resume, checkpoint['epoch'])) 379 | else: 380 | print("=> no checkpoint found at '{}'".format(resume)) 381 | 382 | seq.eval() 383 | 384 | loss_test, preds_test, target_test = evaluate_lstm(dataloader=testloader, model=seq, criterion=criterion) 385 | 
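# The remainder of the script prints the test loss, plots predicted vs. realised returns,
# and runs the article-style backtest from utils.backtest: go long when the predicted
# return is positive, short when it is negative, paying 2*0.0001 in transaction costs per trade.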
386 | print("LOSS TEST: " + str(float(loss_test))) 387 | 388 | temp2 = y_test.as_matrix().flatten().tolist() 389 | y_test_lst.extend(temp2) 390 | 391 | plt.plot(preds_test) 392 | plt.plot(y_test_lst) 393 | plt.scatter(range(len(preds_test)), preds_test) 394 | plt.scatter(range(len(y_test_lst)), y_test_lst) 395 | plt.savefig("test_preds.pdf") 396 | 397 | # --------------------------------------------------------------------------- 398 | # ------------------ STEP 6: BACKTEST (ARTICLE WAY) ------------------------- 399 | # --------------------------------------------------------------------------- 400 | 401 | index, real = backtest(preds_test, pd.DataFrame(y_test_lst)) 402 | 403 | plt.close() 404 | plt.plot(index, label="strat") 405 | plt.plot(real, label="bm") 406 | plt.legend() 407 | plt.savefig("performance_article_way.pdf") 408 | plt.close() 409 | 410 | -------------------------------------------------------------------------------- /runs/checkpoint/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlpanda/DeepLearning_Financial/7e846144629d8b49b8fd74a87d5ff047b7af55d1/runs/checkpoint/.DS_Store -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utilities import prepare_data_lstm, ExampleDataset, save_checkpoint, evaluate_lstm, backtest -------------------------------------------------------------------------------- /utils/utilities.py: -------------------------------------------------------------------------------- 1 | ## EXTERNAL 2 | import pandas as pd 3 | import numpy as np 4 | import pickle 5 | import shutil 6 | import torch 7 | import torch.nn as nn 8 | from torch.autograd import Variable 9 | import torch.optim as optim 10 | from torch.utils.data import Dataset, DataLoader 11 | import torch.nn.functional as F 12 | import numpy as np 13 | import sklearn 14 | import time 15 | import os 16 | import random 17 | from sklearn.preprocessing import MinMaxScaler 18 | from sklearn.preprocessing import StandardScaler 19 | import matplotlib.pyplot as plt 20 | 21 | def prepare_data_lstm(x_encoded, y_close, time_steps, log_return=True, train=True): 22 | ct = 0 23 | data = [] 24 | for i in range(len(x_encoded)-time_steps): 25 | ct +=1 26 | if train: 27 | x_train = x_encoded[i:i+time_steps] 28 | else: 29 | x_train = x_encoded[:i+time_steps] 30 | 31 | data.append(x_train) 32 | 33 | if log_return==False: 34 | y_close = y_close.pct_change()[1:] 35 | else: 36 | y_close = (np.log(y_close) - np.log(y_close.shift(1)))[1:] # the log return, i.e. 
ln(y_t/y_(t-1)) 37 | 38 | if train: 39 | y = y_close[time_steps-1:] 40 | else: 41 | y=y_close 42 | 43 | return data, y 44 | 45 | 46 | class ExampleDataset(Dataset): 47 | 48 | def __init__(self, x, y, batchsize): 49 | self.datalist = x 50 | self.target = y 51 | self.batchsize = batchsize 52 | self.length = 0 53 | self.length = len(x) 54 | 55 | def __len__(self): 56 | return int(self.length/self.batchsize+1) 57 | 58 | def __getitem__(self, idx): 59 | x = self.datalist[idx*self.batchsize:(idx+1)*self.batchsize] 60 | y = self.target[idx*self.batchsize:(idx+1)*self.batchsize] 61 | sample = {'x': x, 'y': y} 62 | 63 | return sample 64 | 65 | 66 | def evaluate_lstm(dataloader, model, criterion): 67 | 68 | pred_val = [] 69 | target_val = [] 70 | model.eval() 71 | # do evaluation 72 | loss_val = 0 73 | sample_cum_x = [None] 74 | 75 | for j in range(len(dataloader)): 76 | 77 | sample = dataloader[j] 78 | sample_x = sample["x"] 79 | 80 | if len(sample_x) != 0: 81 | 82 | sample_x = np.stack(sample_x) 83 | input = Variable(torch.FloatTensor(sample_x), requires_grad=False) 84 | input = torch.transpose(input, 0, 1) 85 | target = Variable(torch.FloatTensor(sample["y"].as_matrix()), requires_grad=False) 86 | 87 | out = model(input) 88 | 89 | loss = criterion(out, target) 90 | 91 | loss_val += float(loss.data.numpy()) 92 | pred_val.extend(out.data.numpy().flatten().tolist()) 93 | target_val.extend(target.data.numpy().flatten().tolist()) 94 | 95 | return loss_val, pred_val, target_val 96 | 97 | 98 | def backtest(predictions, y): 99 | 100 | trans_cost = 0.0001 101 | real = [1] 102 | index = [1] 103 | for r in range(len(predictions)): 104 | rets= y.as_matrix().flatten().tolist() 105 | ret = rets[r] 106 | real.append(real[-1]*(1+ret)) 107 | 108 | if predictions[r]>0.0: 109 | # buy 110 | ret = rets[r] - 2*trans_cost 111 | index.append(index[-1]*(1+ret)) 112 | 113 | elif predictions[r]<0.0: 114 | # sell 115 | ret = -rets[r] - 2*trans_cost 116 | index.append(index[-1]*(1+ret)) 117 | else: 118 | #print("no trade") 119 | # don't trade 120 | index.append(index[-1]) 121 | 122 | return index, real 123 | 124 | 125 | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar', name="checkpoint"): 126 | """Saves checkpoint to disk""" 127 | directory = "runs/%s/"%(name) 128 | if not os.path.exists(directory): 129 | os.makedirs(directory) 130 | filename = directory + filename 131 | torch.save(state, filename) 132 | if is_best: 133 | shutil.copyfile(filename, 'runs/%s/'%(name) + 'model_best.pth.tar') 134 | 135 | --------------------------------------------------------------------------------
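
Closing note: the snippet below is a minimal, self-contained sketch (not part of the repository) of how the three exported pieces chain together, mirroring run_training.py but on toy random data instead of the S&P CSV. It stacks only two of the four autoencoders for brevity and assumes the same old, `Variable`-era PyTorch API used by the rest of the code; the array sizes (200 days, 19 features) are made up.

import numpy as np
import torch
from torch.autograd import Variable

from models import Autoencoder, Sequence, waveletSmooth

np.random.seed(0)
raw = np.random.rand(200, 19)  # toy stand-in: 200 days x 19 features, as in the S&P table

# 1) wavelet-denoise each feature column (as in STEP 2.1 of run_training.py)
denoised = raw.copy()
for i in range(denoised.shape[1]):
    denoised[:, i] = waveletSmooth(denoised[:, i], level=1)[-len(denoised):]

# 2) greedily train two stacked sparse autoencoders and encode the features (STEP 3)
auto1 = Autoencoder(denoised.shape[1], 10)
auto1.fit(denoised, n_epoch=1)
h1 = auto1.encoder(Variable(torch.from_numpy(denoised.astype(np.float32)))).data.numpy()
auto2 = Autoencoder(10, 10)
auto2.fit(h1, n_epoch=1)
encoded = auto2.encoder(Variable(torch.from_numpy(h1.astype(np.float32))))

# 3) slice the encoded series into (time_steps, batch, features) windows for the LSTM (STEPS 4-5)
time_steps = 4
windows = torch.stack([encoded[i:i + time_steps]
                       for i in range(encoded.size(0) - time_steps)], dim=1)
seq = Sequence(nb_features=10, hidden_size=100, nb_layers=3)
pred = seq(windows)   # one predicted next-step return per window
print(pred.size())    # -> (196, 1)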