├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── TGC_torch.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── README.md
├── evaluator.py
├── load_data.py
├── model.py
└── train.py
/.gitignore:
--------------------------------------------------------------------------------
.idea
data
*.json
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/workspace.xml
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TGC_torch
A PyTorch implementation of the paper [Temporal Relational Ranking for Stock Prediction](https://arxiv.org/abs/1809.09441).
The official TensorFlow implementation is [here](https://github.com/fulifeng/Temporal_Relational_Stock_Ranking).
# Requirements
Python>=3.6 and torch>=1.3
# Run the code
Download the data provided by the [official repository](https://github.com/fulifeng/Temporal_Relational_Stock_Ranking), put the files into the ```data``` folder, and then run ```train.py``` to train a RelationLSTM with TGC (Temporal Graph Convolution).
--------------------------------------------------------------------------------
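For orientation, ```train.py``` expects the downloaded files to be arranged roughly as below. This layout is inferred from the hard-coded paths in ```train.py``` (```data/2013-01-01```, the ticker list one directory up, and the relation files under ```data/relation/```); it is not spelled out by the original authors, and the ticker file name shown is the NASDAQ one used by default.

```
data/
├── NASDAQ_tickers_qualify_dr-0.98_min-5_smooth.csv
├── relation/
│   └── wikidata/
│       └── NASDAQ_wiki_relation.npy
└── 2013-01-01/
    └── NASDAQ_<ticker>_1.csv   (one end-of-day csv per ticker)
```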
/evaluator.py:
--------------------------------------------------------------------------------
import numpy as np


def evaluate(prediction, ground_truth, mask, report=False):
    assert ground_truth.shape == prediction.shape, 'shape mis-match'
    performance = {}
    # mse over valid (unmasked) entries
    performance['mse'] = np.linalg.norm((prediction - ground_truth) * mask) ** 2 / np.sum(mask)
    mrr_top = 0.0
    all_miss_days_top = 0
    bt_long = 1.0
    bt_long5 = 1.0
    bt_long10 = 1.0

    for i in range(prediction.shape[1]):
        rank_gt = np.argsort(ground_truth[:, i])
        gt_top1 = set()
        gt_top5 = set()
        gt_top10 = set()
        for j in range(1, prediction.shape[0] + 1):
            cur_rank = rank_gt[-1 * j]
            if mask[cur_rank][i] < 0.5:
                continue
            if len(gt_top1) < 1:
                gt_top1.add(cur_rank)
            if len(gt_top5) < 5:
                gt_top5.add(cur_rank)
            if len(gt_top10) < 10:
                gt_top10.add(cur_rank)
        rank_pre = np.argsort(prediction[:, i])
        pre_top1 = set()
        pre_top5 = set()
        pre_top10 = set()
        for j in range(1, prediction.shape[0] + 1):
            cur_rank = rank_pre[-1 * j]
            if mask[cur_rank][i] < 0.5:
                continue
            if len(pre_top1) < 1:
                pre_top1.add(cur_rank)
            if len(pre_top5) < 5:
                pre_top5.add(cur_rank)
            if len(pre_top10) < 10:
                pre_top10.add(cur_rank)

        # calculate MRR of top-1: walk down the ground-truth ranking and
        # record the position at which the predicted top-1 stock appears
        top1_pos_in_gt = 0
        for j in range(1, prediction.shape[0] + 1):
            cur_rank = rank_gt[-1 * j]
            if mask[cur_rank][i] < 0.5:
                continue
            else:
                top1_pos_in_gt += 1
                if cur_rank in pre_top1:
                    break
        if top1_pos_in_gt == 0:
            all_miss_days_top += 1
        else:
            mrr_top += 1.0 / top1_pos_in_gt

        # back-testing on top 1
        real_ret_rat_top = ground_truth[list(pre_top1)[0]][i]
        bt_long += real_ret_rat_top

        # back-testing on top 5
        real_ret_rat_top5 = 0
        for pre in pre_top5:
            real_ret_rat_top5 += ground_truth[pre][i]
        real_ret_rat_top5 /= 5
        bt_long5 += real_ret_rat_top5

        # back-testing on top 10
        real_ret_rat_top10 = 0
        for pre in pre_top10:
            real_ret_rat_top10 += ground_truth[pre][i]
        real_ret_rat_top10 /= 10
        bt_long10 += real_ret_rat_top10

    # mean reciprocal rank of the predicted top-1, over days with a valid top-1
    performance['mrrt'] = mrr_top / (prediction.shape[1] - all_miss_days_top)
    # cumulative return of going long the predicted top-1 stock each day
    performance['btl'] = bt_long
    # top-5 average return
    # performance['btl5'] = bt_long5
    # top-10 average return
    # performance['btl10'] = bt_long10
    return performance
--------------------------------------------------------------------------------
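A minimal usage sketch for ```evaluate()```. The shapes and random inputs below are illustrative only; the real arrays are stocks x trading days, with mask entries of 1 for valid days and 0 for missing ones, and the call assumes you run from the repository root.

```python
import numpy as np
from evaluator import evaluate

num_stocks, num_days = 20, 5                            # toy sizes, not the real dataset
rng = np.random.default_rng(0)
prediction = rng.normal(size=(num_stocks, num_days))    # predicted return ratios
ground_truth = rng.normal(size=(num_stocks, num_days))  # realized return ratios
mask = np.ones((num_stocks, num_days))                  # 1 = valid, 0 = missing
mask[3, 2] = 0.0                                        # mark one missing entry

perf = evaluate(prediction, ground_truth, mask)
print(perf)  # dict with 'mse', 'mrrt', 'btl'
```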
/load_data.py:
--------------------------------------------------------------------------------
import numpy as np
import os
from tqdm import tqdm


def load_EOD_data(data_path, market_name, tickers, steps=1):
    eod_data = []
    masks = []
    ground_truth = []
    base_price = []
    for index, ticker in enumerate(tqdm(tickers)):
        single_EOD = np.genfromtxt(
            os.path.join(data_path, market_name + '_' + ticker + '_1.csv'),
            dtype=np.float32, delimiter=',', skip_header=False
        )
        if market_name == 'NASDAQ':
            # remove the last day since lots of data is missing
            single_EOD = single_EOD[:-1, :]
        if index == 0:
            print('single EOD data shape:', single_EOD.shape)
            eod_data = np.zeros([len(tickers), single_EOD.shape[0],
                                 single_EOD.shape[1] - 1], dtype=np.float32)
            masks = np.ones([len(tickers), single_EOD.shape[0]],
                            dtype=np.float32)
            ground_truth = np.zeros([len(tickers), single_EOD.shape[0]],
                                    dtype=np.float32)
            base_price = np.zeros([len(tickers), single_EOD.shape[0]],
                                  dtype=np.float32)
        for row in range(single_EOD.shape[0]):
            # -1234 is the sentinel value marking a missing trading day
            if abs(single_EOD[row][-1] + 1234) < 1e-8:
                masks[index][row] = 0.0
            elif row > steps - 1 and abs(single_EOD[row - steps][-1] + 1234) \
                    > 1e-8:
                # return ratio over the last `steps` days
                ground_truth[index][row] = \
                    (single_EOD[row][-1] - single_EOD[row - steps][-1]) / \
                    single_EOD[row - steps][-1]
            for col in range(single_EOD.shape[1]):
                if abs(single_EOD[row][col] + 1234) < 1e-8:
                    single_EOD[row][col] = 1.1
        eod_data[index, :, :] = single_EOD[:, 1:]
        base_price[index, :] = single_EOD[:, -1]
    return eod_data, masks, ground_truth, base_price


def load_graph_relation_data(relation_file, lap=False):
    relation_encoding = np.load(relation_file)
    print('relation encoding shape:', relation_encoding.shape)
    rel_shape = [relation_encoding.shape[0], relation_encoding.shape[1]]
    mask_flags = np.equal(np.zeros(rel_shape, dtype=int),
                          np.sum(relation_encoding, axis=2))
    adjacent = np.where(mask_flags, np.zeros(rel_shape, dtype=float),
                        np.ones(rel_shape, dtype=float))
    degree = np.sum(adjacent, axis=0)
    for i in range(len(degree)):
        degree[i] = 1.0 / degree[i]
    np.sqrt(degree, out=degree)
    deg_neg_half_power = np.diag(degree)
    if lap:
        # normalized graph Laplacian: I - D^{-1/2} A D^{-1/2}
        return np.identity(adjacent.shape[0], dtype=float) - np.dot(
            np.dot(deg_neg_half_power, adjacent), deg_neg_half_power)
    else:
        # symmetrically normalized adjacency: D^{-1/2} A D^{-1/2}
        return np.dot(np.dot(deg_neg_half_power, adjacent), deg_neg_half_power)


def load_relation_data(relation_file):
    relation_encoding = np.load(relation_file)
    print('relation encoding shape:', relation_encoding.shape)
    rel_shape = [relation_encoding.shape[0], relation_encoding.shape[1]]
    mask_flags = np.equal(np.zeros(rel_shape, dtype=int),
                          np.sum(relation_encoding, axis=2))
    # -1e9 drives the softmax weight to zero for pairs with no relation
    mask = np.where(mask_flags, np.ones(rel_shape) * -1e9, np.zeros(rel_shape))
    return relation_encoding, mask


def build_SFM_data(data_path, market_name, tickers):
    eod_data = []
    for index, ticker in enumerate(tickers):
        single_EOD = np.genfromtxt(
            os.path.join(data_path, market_name + '_' + ticker + '_1.csv'),
            dtype=np.float32, delimiter=',', skip_header=False
        )
        if index == 0:
            print('single EOD data shape:', single_EOD.shape)
            eod_data = np.zeros([len(tickers), single_EOD.shape[0]],
                                dtype=np.float32)

        for row in range(single_EOD.shape[0]):
            if abs(single_EOD[row][-1] + 1234) < 1e-8:
                # handle missing data
                if row < 3:
                    # fill with the next valid price
                    for i in range(row + 1, single_EOD.shape[0]):
                        if abs(single_EOD[i][-1] + 1234) > 1e-8:
                            eod_data[index][row] = single_EOD[i][-1]
                            break
                else:
                    # fill with the average of the previous three days
                    eod_data[index][row] = np.sum(
                        eod_data[index, row - 3:row]) / 3
            else:
                eod_data[index][row] = single_EOD[row][-1]
    np.save(market_name + '_sfm_data', eod_data)
--------------------------------------------------------------------------------
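As a quick illustration of what ```load_graph_relation_data()``` computes, here is the symmetric normalization D^{-1/2} A D^{-1/2} (and the ```lap=True``` Laplacian variant) applied to a made-up 3-node adjacency matrix; the matrix is purely illustrative, not from the real relation files.

```python
import numpy as np

# toy adjacency with self-loops, as produced from the relation encodings
adjacent = np.array([[1., 1., 0.],
                     [1., 1., 1.],
                     [0., 1., 1.]])
degree = np.sum(adjacent, axis=0)               # node degrees: [2., 3., 2.]
deg_neg_half = np.diag(1.0 / np.sqrt(degree))   # D^{-1/2}
normalized = deg_neg_half @ adjacent @ deg_neg_half  # D^{-1/2} A D^{-1/2}
laplacian = np.eye(3) - normalized                   # the lap=True variant
print(normalized)
print(laplacian)
```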
/model.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F


def get_loss(prediction, ground_truth, base_price, mask, batch_size, alpha):
    device = prediction.device
    all_one = torch.ones(batch_size, 1, dtype=torch.float32).to(device)
    return_ratio = torch.div(torch.sub(prediction, base_price), base_price)
    # MSE loss on the return ratio
    reg_loss = F.mse_loss(return_ratio * mask, ground_truth * mask)
    # pairwise ranking loss, formulas (4-6) in the paper
    pre_pw_dif = torch.sub(
        return_ratio @ all_one.t(),
        all_one @ return_ratio.t()
    )
    gt_pw_dif = torch.sub(
        all_one @ ground_truth.t(),
        ground_truth @ all_one.t()
    )
    mask_pw = mask @ mask.t()
    rank_loss = torch.mean(
        F.relu(pre_pw_dif * gt_pw_dif * mask_pw)
    )
    loss = reg_loss + alpha * rank_loss
    return loss, reg_loss, rank_loss, return_ratio


class GraphModule(nn.Module):
    def __init__(self, batch_size, fea_shape, rel_encoding, rel_mask, inner_prod=False):
        super().__init__()
        self.batch_size = batch_size
        self.input_shape = fea_shape
        self.inner_prod = inner_prod
        self.relation = nn.Parameter(torch.tensor(rel_encoding, dtype=torch.float32), requires_grad=False)
        self.rel_mask = nn.Parameter(torch.tensor(rel_mask, dtype=torch.float32), requires_grad=False)
        self.all_one = nn.Parameter(torch.ones(self.batch_size, 1, dtype=torch.float32), requires_grad=False)
        self.rel_weight = nn.Linear(rel_encoding.shape[-1], 1)
        if self.inner_prod is False:
            self.head_weight = nn.Linear(fea_shape, 1)
            self.tail_weight = nn.Linear(fea_shape, 1)

    def forward(self, inputs):
        rel_weight = self.rel_weight(self.relation)
        if self.inner_prod:
            # similarity weights, scaled element-wise by the relation weight
            # (element-wise multiply, matching tf.multiply in the official code)
            inner_weight = inputs @ inputs.t()
            weight = inner_weight * rel_weight[:, :, -1]
        else:
            all_one = self.all_one
            head_weight = self.head_weight(inputs)
            tail_weight = self.tail_weight(inputs)
            weight = (head_weight @ all_one.t() + all_one @ tail_weight.t()) + rel_weight[:, :, -1]
        weight_masked = F.softmax(self.rel_mask + weight, dim=0)
        outputs = weight_masked @ inputs
        return outputs


class StockLSTM(nn.Module):
    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.lstm = nn.LSTM(5, 64, batch_first=True)
        self.fc = nn.Linear(64, 1)

    def forward(self, inputs):
        x, _ = self.lstm(inputs)
        x = x[:, -1, :]
        prediction = F.leaky_relu(self.fc(x))
        return prediction


class RelationLSTM(nn.Module):
    def __init__(self, batch_size, rel_encoding, rel_mask, inner_prod=False):
        super().__init__()
        self.batch_size = batch_size
        self.lstm = nn.LSTM(5, 64, batch_first=True)
        self.graph_layer = GraphModule(batch_size, 64, rel_encoding, rel_mask, inner_prod)
        self.fc = nn.Linear(64 * 2, 1)

    def forward(self, inputs):
        x, _ = self.lstm(inputs)
        x = x[:, -1, :]
        outputs_graph = self.graph_layer(x)
        outputs_cat = torch.cat([x, outputs_graph], dim=1)
        prediction = F.leaky_relu(self.fc(outputs_cat))
        return prediction
--------------------------------------------------------------------------------
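To see what the pairwise term in ```get_loss()``` (formulas 4-6 of the paper) rewards and penalizes, here is a hand-sized sketch; the three return values are made up for illustration, and the mask is omitted for brevity.

```python
import torch
import torch.nn.functional as F

return_ratio = torch.tensor([[0.05], [0.01], [-0.02]])  # predicted returns, shape (N, 1)
ground_truth = torch.tensor([[0.01], [0.04], [-0.03]])  # realized returns, shape (N, 1)
all_one = torch.ones(3, 1)

# pre_pw_dif[i, j] = r_i - r_j and gt_pw_dif[i, j] = g_j - g_i, so their
# product is positive exactly when the predicted order of pair (i, j) is wrong
pre_pw_dif = return_ratio @ all_one.t() - all_one @ return_ratio.t()
gt_pw_dif = all_one @ ground_truth.t() - ground_truth @ all_one.t()
rank_loss = torch.mean(F.relu(pre_pw_dif * gt_pw_dif))
print(rank_loss)  # only the mis-ordered pair (stock 0 vs stock 1) contributes
```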
/train.py:
--------------------------------------------------------------------------------
import random
import numpy as np
import os
import torch
from load_data import load_relation_data, load_EOD_data
from evaluator import evaluate
from model import get_loss, RelationLSTM


np.random.seed(123456789)
torch.random.manual_seed(12345678)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data_path = 'data/2013-01-01'
market_name = 'NASDAQ'
relation_name = 'wikidata'  # or 'sector_industry'
parameters = {'seq': 16, 'unit': 64, 'alpha': 0.1}
epochs = 50
valid_index = 756
test_index = 1008
fea_dim = 5
steps = 1

tickers_fname = market_name + '_tickers_qualify_dr-0.98_min-5_smooth.csv'
tickers = np.genfromtxt(os.path.join(data_path, '..', tickers_fname), dtype=str, delimiter='\t', skip_header=False)
batch_size = len(tickers)
print('#tickers selected:', len(tickers))
eod_data, mask_data, gt_data, price_data = load_EOD_data(data_path, market_name, tickers, steps)
trade_dates = mask_data.shape[1]
# relation data
rname_tail = {'sector_industry': '_industry_relation.npy', 'wikidata': '_wiki_relation.npy'}
rel_encoding, rel_mask = load_relation_data(
    os.path.join(data_path, '..', 'relation', relation_name, market_name + rname_tail[relation_name])
)
print('relation encoding shape:', rel_encoding.shape)
print('relation mask shape:', rel_mask.shape)

model = RelationLSTM(
    batch_size=batch_size,
    rel_encoding=rel_encoding,
    rel_mask=rel_mask
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
best_valid_loss = np.inf
best_valid_perf = None
best_test_perf = None
batch_offsets = np.arange(start=0, stop=valid_index, dtype=int)


def validate(start_index, end_index):
    """
    Compute loss and ranking metrics on the validation/test range.
    """
    with torch.no_grad():
        cur_valid_pred = np.zeros([len(tickers), end_index - start_index], dtype=float)
        cur_valid_gt = np.zeros([len(tickers), end_index - start_index], dtype=float)
        cur_valid_mask = np.zeros([len(tickers), end_index - start_index], dtype=float)
        loss = 0.
        reg_loss = 0.
        rank_loss = 0.
        for cur_offset in range(start_index - parameters['seq'] - steps + 1, end_index - parameters['seq'] - steps + 1):
            data_batch, mask_batch, price_batch, gt_batch = map(
                lambda x: torch.Tensor(x).to(device),
                get_batch(cur_offset)
            )
            prediction = model(data_batch)
            cur_loss, cur_reg_loss, cur_rank_loss, cur_rr = get_loss(prediction, gt_batch, price_batch, mask_batch,
                                                                     batch_size, parameters['alpha'])
            loss += cur_loss.item()
            reg_loss += cur_reg_loss.item()
            rank_loss += cur_rank_loss.item()
            cur_valid_pred[:, cur_offset - (start_index - parameters['seq'] - steps + 1)] = cur_rr[:, 0].cpu()
            cur_valid_gt[:, cur_offset - (start_index - parameters['seq'] - steps + 1)] = gt_batch[:, 0].cpu()
            cur_valid_mask[:, cur_offset - (start_index - parameters['seq'] - steps + 1)] = mask_batch[:, 0].cpu()
        loss = loss / (end_index - start_index)
        reg_loss = reg_loss / (end_index - start_index)
        rank_loss = rank_loss / (end_index - start_index)
        cur_valid_perf = evaluate(cur_valid_pred, cur_valid_gt, cur_valid_mask)
    return loss, reg_loss, rank_loss, cur_valid_perf


def get_batch(offset=None):
    if offset is None:
        offset = random.randrange(0, valid_index)
    seq_len = parameters['seq']
    mask_batch = mask_data[:, offset: offset + seq_len + steps]
    mask_batch = np.min(mask_batch, axis=1)
    return (
        eod_data[:, offset:offset + seq_len, :],
        np.expand_dims(mask_batch, axis=1),
        np.expand_dims(price_data[:, offset + seq_len - 1], axis=1),
        np.expand_dims(gt_data[:, offset + seq_len + steps - 1], axis=1))


# training loop
for epoch in range(epochs):
    np.random.shuffle(batch_offsets)
    tra_loss = 0.0
    tra_reg_loss = 0.0
    tra_rank_loss = 0.0
    for j in range(valid_index - parameters['seq'] - steps + 1):
        data_batch, mask_batch, price_batch, gt_batch = map(
            lambda x: torch.Tensor(x).to(device),
            get_batch(batch_offsets[j])
        )
        optimizer.zero_grad()
        prediction = model(data_batch)
        cur_loss, cur_reg_loss, cur_rank_loss, _ = get_loss(prediction, gt_batch, price_batch, mask_batch,
                                                            batch_size, parameters['alpha'])
        # update the model
        cur_loss.backward()
        optimizer.step()

        tra_loss += cur_loss.item()
        tra_reg_loss += cur_reg_loss.item()
        tra_rank_loss += cur_rank_loss.item()

    # average train loss: loss = reg_loss (mse) + alpha * rank_loss
    tra_loss = tra_loss / (valid_index - parameters['seq'] - steps + 1)
    tra_reg_loss = tra_reg_loss / (valid_index - parameters['seq'] - steps + 1)
    tra_rank_loss = tra_rank_loss / (valid_index - parameters['seq'] - steps + 1)
    print('\n\nTrain : loss:{} reg_loss:{} rank_loss:{}'.format(tra_loss, tra_reg_loss, tra_rank_loss))

    # performance on the validation set
    val_loss, val_reg_loss, val_rank_loss, val_perf = validate(valid_index, test_index)
    print('Valid : loss:{} reg_loss:{} rank_loss:{}'.format(val_loss, val_reg_loss, val_rank_loss))
    print('\t Valid performance:', val_perf)

    # performance on the test set
    test_loss, test_reg_loss, test_rank_loss, test_perf = validate(test_index, trade_dates)
    print('Test: loss:{} reg_loss:{} rank_loss:{}'.format(test_loss, test_reg_loss, test_rank_loss))
    print('\t Test performance:', test_perf)

    # track the best result by validation loss
    if val_loss < best_valid_loss:
        best_valid_loss = val_loss
        # evaluate() returns a fresh dict each call, so no explicit copy is needed
        best_valid_perf = val_perf
        best_test_perf = test_perf
        print('Better valid loss:', best_valid_loss)
    print('\nBest Valid performance:', best_valid_perf)
    print('Best Test performance:', best_test_perf)
--------------------------------------------------------------------------------
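As a closing note, the window alignment inside ```get_batch()``` is easy to mis-read, so the toy sketch below spells out which days feed the LSTM, which day provides the base price, and which day is being predicted. It uses ```seq=16``` and ```steps=1``` as configured in ```train.py```; the offset is arbitrary.

```python
seq_len, steps, offset = 16, 1, 100  # illustrative values

feature_window = (offset, offset + seq_len)       # days 100..115 feed the LSTM
base_price_day = offset + seq_len - 1             # day 115: last observed close
target_day = offset + seq_len + steps - 1         # day 116: the day being predicted
mask_window = (offset, offset + seq_len + steps)  # a stock counts as valid only if
                                                  # every day in 100..116 is present
print(feature_window, base_price_day, target_day, mask_window)
```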