├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── TGC_torch.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── README.md
├── evaluator.py
├── load_data.py
├── model.py
└── train.py
/.gitignore:
--------------------------------------------------------------------------------
.idea
data
*.json
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/workspace.xml
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TGC_torch
A PyTorch implementation of the paper [Temporal Relational Ranking for Stock Prediction](https://arxiv.org/abs/1809.09441).
The official TensorFlow implementation is [here](https://github.com/fulifeng/Temporal_Relational_Stock_Ranking).
# Requirements
Python>=3.6 and torch>=1.3
# Run the code
Download the data provided by the [official repository](https://github.com/fulifeng/Temporal_Relational_Stock_Ranking), put the files into the ```data``` folder, and then run ```train.py``` to train a RelationLSTM with TGC (Temporal Graph Convolution).
--------------------------------------------------------------------------------
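For orientation, ```train.py``` expects the downloaded files to be arranged roughly as below. This layout is inferred from the hard-coded paths in ```train.py``` (```data/2013-01-01```, the ticker list one directory up, and the relation files under ```data/relation/```); it is not spelled out by the original authors, and the ticker file name shown is the NASDAQ one used by default.

```
data/
├── NASDAQ_tickers_qualify_dr-0.98_min-5_smooth.csv
├── relation/
│   └── wikidata/
│       └── NASDAQ_wiki_relation.npy
└── 2013-01-01/
    └── NASDAQ_<ticker>_1.csv   (one end-of-day csv per ticker)
```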
/evaluator.py:
--------------------------------------------------------------------------------
import numpy as np


def evaluate(prediction, ground_truth, mask, report=False):
    assert ground_truth.shape == prediction.shape, 'shape mis-match'
    performance = {}
    # mse over valid (unmasked) entries
    performance['mse'] = np.linalg.norm((prediction - ground_truth) * mask) ** 2 / np.sum(mask)
    mrr_top = 0.0
    all_miss_days_top = 0
    bt_long = 1.0
    bt_long5 = 1.0
    bt_long10 = 1.0

    for i in range(prediction.shape[1]):
        rank_gt = np.argsort(ground_truth[:, i])
        gt_top1 = set()
        gt_top5 = set()
        gt_top10 = set()
        for j in range(1, prediction.shape[0] + 1):
            cur_rank = rank_gt[-1 * j]
            if mask[cur_rank][i] < 0.5:
                continue
            if len(gt_top1) < 1:
                gt_top1.add(cur_rank)
            if len(gt_top5) < 5:
                gt_top5.add(cur_rank)
            if len(gt_top10) < 10:
                gt_top10.add(cur_rank)
        rank_pre = np.argsort(prediction[:, i])
        pre_top1 = set()
        pre_top5 = set()
        pre_top10 = set()
        for j in range(1, prediction.shape[0] + 1):
            cur_rank = rank_pre[-1 * j]
            if mask[cur_rank][i] < 0.5:
                continue
            if len(pre_top1) < 1:
                pre_top1.add(cur_rank)
            if len(pre_top5) < 5:
                pre_top5.add(cur_rank)
            if len(pre_top10) < 10:
                pre_top10.add(cur_rank)

        # calculate MRR of top-1: walk down the ground-truth ranking and
        # record the position at which the predicted top-1 stock appears
        top1_pos_in_gt = 0
        for j in range(1, prediction.shape[0] + 1):
            cur_rank = rank_gt[-1 * j]
            if mask[cur_rank][i] < 0.5:
                continue
            else:
                top1_pos_in_gt += 1
                if cur_rank in pre_top1:
                    break
        if top1_pos_in_gt == 0:
            all_miss_days_top += 1
        else:
            mrr_top += 1.0 / top1_pos_in_gt

        # back-testing on top 1
        real_ret_rat_top = ground_truth[list(pre_top1)[0]][i]
        bt_long += real_ret_rat_top

        # back-testing on top 5
        real_ret_rat_top5 = 0
        for pre in pre_top5:
            real_ret_rat_top5 += ground_truth[pre][i]
        real_ret_rat_top5 /= 5
        bt_long5 += real_ret_rat_top5

        # back-testing on top 10
        real_ret_rat_top10 = 0
        for pre in pre_top10:
            real_ret_rat_top10 += ground_truth[pre][i]
        real_ret_rat_top10 /= 10
        bt_long10 += real_ret_rat_top10

    # mean reciprocal rank of the predicted top-1, over days with a valid top-1
    performance['mrrt'] = mrr_top / (prediction.shape[1] - all_miss_days_top)
    # cumulative return of going long the predicted top-1 stock each day
    performance['btl'] = bt_long
    # top-5 average return
    # performance['btl5'] = bt_long5
    # top-10 average return
    # performance['btl10'] = bt_long10
    return performance
--------------------------------------------------------------------------------
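A minimal usage sketch for ```evaluate()```. The shapes and random inputs below are illustrative only; the real arrays are stocks x trading days, with mask entries of 1 for valid days and 0 for missing ones, and the call assumes you run from the repository root.

```python
import numpy as np
from evaluator import evaluate

num_stocks, num_days = 20, 5                            # toy sizes, not the real dataset
rng = np.random.default_rng(0)
prediction = rng.normal(size=(num_stocks, num_days))    # predicted return ratios
ground_truth = rng.normal(size=(num_stocks, num_days))  # realized return ratios
mask = np.ones((num_stocks, num_days))                  # 1 = valid, 0 = missing
mask[3, 2] = 0.0                                        # mark one missing entry

perf = evaluate(prediction, ground_truth, mask)
print(perf)  # dict with 'mse', 'mrrt', 'btl'
```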
/load_data.py:
--------------------------------------------------------------------------------
import numpy as np
import os
from tqdm import tqdm


def load_EOD_data(data_path, market_name, tickers, steps=1):
    eod_data = []
    masks = []
    ground_truth = []
    base_price = []
    for index, ticker in enumerate(tqdm(tickers)):
        single_EOD = np.genfromtxt(
            os.path.join(data_path, market_name + '_' + ticker + '_1.csv'),
            dtype=np.float32, delimiter=',', skip_header=False
        )
        if market_name == 'NASDAQ':
            # remove the last day since lots of data is missing
            single_EOD = single_EOD[:-1, :]
        if index == 0:
            print('single EOD data shape:', single_EOD.shape)
            eod_data = np.zeros([len(tickers), single_EOD.shape[0],
                                 single_EOD.shape[1] - 1], dtype=np.float32)
            masks = np.ones([len(tickers), single_EOD.shape[0]],
                            dtype=np.float32)
            ground_truth = np.zeros([len(tickers), single_EOD.shape[0]],
                                    dtype=np.float32)
            base_price = np.zeros([len(tickers), single_EOD.shape[0]],
                                  dtype=np.float32)
        for row in range(single_EOD.shape[0]):
            # -1234 is the sentinel value marking a missing trading day
            if abs(single_EOD[row][-1] + 1234) < 1e-8:
                masks[index][row] = 0.0
            elif row > steps - 1 and abs(single_EOD[row - steps][-1] + 1234) \
                    > 1e-8:
                # return ratio over the last `steps` days
                ground_truth[index][row] = \
                    (single_EOD[row][-1] - single_EOD[row - steps][-1]) / \
                    single_EOD[row - steps][-1]
            for col in range(single_EOD.shape[1]):
                if abs(single_EOD[row][col] + 1234) < 1e-8:
                    single_EOD[row][col] = 1.1
        eod_data[index, :, :] = single_EOD[:, 1:]
        base_price[index, :] = single_EOD[:, -1]
    return eod_data, masks, ground_truth, base_price


def load_graph_relation_data(relation_file, lap=False):
    relation_encoding = np.load(relation_file)
    print('relation encoding shape:', relation_encoding.shape)
    rel_shape = [relation_encoding.shape[0], relation_encoding.shape[1]]
    mask_flags = np.equal(np.zeros(rel_shape, dtype=int),
                          np.sum(relation_encoding, axis=2))
    adjacent = np.where(mask_flags, np.zeros(rel_shape, dtype=float),
                        np.ones(rel_shape, dtype=float))
    degree = np.sum(adjacent, axis=0)
    for i in range(len(degree)):
        degree[i] = 1.0 / degree[i]
    np.sqrt(degree, out=degree)
    deg_neg_half_power = np.diag(degree)
    if lap:
        # normalized graph Laplacian: I - D^{-1/2} A D^{-1/2}
        return np.identity(adjacent.shape[0], dtype=float) - np.dot(
            np.dot(deg_neg_half_power, adjacent), deg_neg_half_power)
    else:
        # symmetrically normalized adjacency: D^{-1/2} A D^{-1/2}
        return np.dot(np.dot(deg_neg_half_power, adjacent), deg_neg_half_power)


def load_relation_data(relation_file):
    relation_encoding = np.load(relation_file)
    print('relation encoding shape:', relation_encoding.shape)
    rel_shape = [relation_encoding.shape[0], relation_encoding.shape[1]]
    mask_flags = np.equal(np.zeros(rel_shape, dtype=int),
                          np.sum(relation_encoding, axis=2))
    # -1e9 drives the softmax weight to zero for pairs with no relation
    mask = np.where(mask_flags, np.ones(rel_shape) * -1e9, np.zeros(rel_shape))
    return relation_encoding, mask


def build_SFM_data(data_path, market_name, tickers):
    eod_data = []
    for index, ticker in enumerate(tickers):
        single_EOD = np.genfromtxt(
            os.path.join(data_path, market_name + '_' + ticker + '_1.csv'),
            dtype=np.float32, delimiter=',', skip_header=False
        )
        if index == 0:
            print('single EOD data shape:', single_EOD.shape)
            eod_data = np.zeros([len(tickers), single_EOD.shape[0]],
                                dtype=np.float32)

        for row in range(single_EOD.shape[0]):
            if abs(single_EOD[row][-1] + 1234) < 1e-8:
                # handle missing data
                if row < 3:
                    # fill with the next valid price
                    for i in range(row + 1, single_EOD.shape[0]):
                        if abs(single_EOD[i][-1] + 1234) > 1e-8:
                            eod_data[index][row] = single_EOD[i][-1]
                            break
                else:
                    # fill with the average of the previous three days
                    eod_data[index][row] = np.sum(
                        eod_data[index, row - 3:row]) / 3
            else:
                eod_data[index][row] = single_EOD[row][-1]
    np.save(market_name + '_sfm_data', eod_data)
--------------------------------------------------------------------------------
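As a quick illustration of what ```load_graph_relation_data()``` computes, here is the symmetric normalization D^{-1/2} A D^{-1/2} (and the ```lap=True``` Laplacian variant) applied to a made-up 3-node adjacency matrix; the matrix is purely illustrative, not from the real relation files.

```python
import numpy as np

# toy adjacency with self-loops, as produced from the relation encodings
adjacent = np.array([[1., 1., 0.],
                     [1., 1., 1.],
                     [0., 1., 1.]])
degree = np.sum(adjacent, axis=0)               # node degrees: [2., 3., 2.]
deg_neg_half = np.diag(1.0 / np.sqrt(degree))   # D^{-1/2}
normalized = deg_neg_half @ adjacent @ deg_neg_half  # D^{-1/2} A D^{-1/2}
laplacian = np.eye(3) - normalized                   # the lap=True variant
print(normalized)
print(laplacian)
```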
/model.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F


def get_loss(prediction, ground_truth, base_price, mask, batch_size, alpha):
    device = prediction.device
    all_one = torch.ones(batch_size, 1, dtype=torch.float32).to(device)
    return_ratio = torch.div(torch.sub(prediction, base_price), base_price)
    # MSE loss on the return ratio
    reg_loss = F.mse_loss(return_ratio * mask, ground_truth * mask)
    # pairwise ranking loss, formulas (4-6) in the paper
    pre_pw_dif = torch.sub(
        return_ratio @ all_one.t(),
        all_one @ return_ratio.t()
    )
    gt_pw_dif = torch.sub(
        all_one @ ground_truth.t(),
        ground_truth @ all_one.t()
    )
    mask_pw = mask @ mask.t()
    rank_loss = torch.mean(
        F.relu(pre_pw_dif * gt_pw_dif * mask_pw)
    )
    loss = reg_loss + alpha * rank_loss
    return loss, reg_loss, rank_loss, return_ratio


class GraphModule(nn.Module):
    def __init__(self, batch_size, fea_shape, rel_encoding, rel_mask, inner_prod=False):
        super().__init__()
        self.batch_size = batch_size
        self.input_shape = fea_shape
        self.inner_prod = inner_prod
        self.relation = nn.Parameter(torch.tensor(rel_encoding, dtype=torch.float32), requires_grad=False)
        self.rel_mask = nn.Parameter(torch.tensor(rel_mask, dtype=torch.float32), requires_grad=False)
        self.all_one = nn.Parameter(torch.ones(self.batch_size, 1, dtype=torch.float32), requires_grad=False)
        self.rel_weight = nn.Linear(rel_encoding.shape[-1], 1)
        if self.inner_prod is False:
            self.head_weight = nn.Linear(fea_shape, 1)
            self.tail_weight = nn.Linear(fea_shape, 1)

    def forward(self, inputs):
        rel_weight = self.rel_weight(self.relation)
        if self.inner_prod:
            # similarity weights, scaled element-wise by the relation weight
            # (element-wise multiply, matching tf.multiply in the official code)
            inner_weight = inputs @ inputs.t()
            weight = inner_weight * rel_weight[:, :, -1]
        else:
            all_one = self.all_one
            head_weight = self.head_weight(inputs)
            tail_weight = self.tail_weight(inputs)
            weight = (head_weight @ all_one.t() + all_one @ tail_weight.t()) + rel_weight[:, :, -1]
        weight_masked = F.softmax(self.rel_mask + weight, dim=0)
        outputs = weight_masked @ inputs
        return outputs


class StockLSTM(nn.Module):
    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.lstm = nn.LSTM(5, 64, batch_first=True)
        self.fc = nn.Linear(64, 1)

    def forward(self, inputs):
        x, _ = self.lstm(inputs)
        x = x[:, -1, :]
        prediction = F.leaky_relu(self.fc(x))
        return prediction


class RelationLSTM(nn.Module):
    def __init__(self, batch_size, rel_encoding, rel_mask, inner_prod=False):
        super().__init__()
        self.batch_size = batch_size
        self.lstm = nn.LSTM(5, 64, batch_first=True)
        self.graph_layer = GraphModule(batch_size, 64, rel_encoding, rel_mask, inner_prod)
        self.fc = nn.Linear(64 * 2, 1)

    def forward(self, inputs):
        x, _ = self.lstm(inputs)
        x = x[:, -1, :]
        outputs_graph = self.graph_layer(x)
        outputs_cat = torch.cat([x, outputs_graph], dim=1)
        prediction = F.leaky_relu(self.fc(outputs_cat))
        return prediction
--------------------------------------------------------------------------------
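To see what the pairwise term in ```get_loss()``` (formulas 4-6 of the paper) rewards and penalizes, here is a hand-sized sketch; the three return values are made up for illustration, and the mask is omitted for brevity.

```python
import torch
import torch.nn.functional as F

return_ratio = torch.tensor([[0.05], [0.01], [-0.02]])  # predicted returns, shape (N, 1)
ground_truth = torch.tensor([[0.01], [0.04], [-0.03]])  # realized returns, shape (N, 1)
all_one = torch.ones(3, 1)

# pre_pw_dif[i, j] = r_i - r_j and gt_pw_dif[i, j] = g_j - g_i, so their
# product is positive exactly when the predicted order of pair (i, j) is wrong
pre_pw_dif = return_ratio @ all_one.t() - all_one @ return_ratio.t()
gt_pw_dif = all_one @ ground_truth.t() - ground_truth @ all_one.t()
rank_loss = torch.mean(F.relu(pre_pw_dif * gt_pw_dif))
print(rank_loss)  # only the mis-ordered pair (stock 0 vs stock 1) contributes
```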
/train.py:
--------------------------------------------------------------------------------
import random
import numpy as np
import os
import torch
from load_data import load_relation_data, load_EOD_data
from evaluator import evaluate
from model import get_loss, RelationLSTM


np.random.seed(123456789)
torch.random.manual_seed(12345678)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data_path = 'data/2013-01-01'
market_name = 'NASDAQ'
relation_name = 'wikidata'  # or 'sector_industry'
parameters = {'seq': 16, 'unit': 64, 'alpha': 0.1}
epochs = 50
valid_index = 756
test_index = 1008
fea_dim = 5
steps = 1

tickers_fname = market_name + '_tickers_qualify_dr-0.98_min-5_smooth.csv'
tickers = np.genfromtxt(os.path.join(data_path, '..', tickers_fname), dtype=str, delimiter='\t', skip_header=False)
batch_size = len(tickers)
print('#tickers selected:', len(tickers))
eod_data, mask_data, gt_data, price_data = load_EOD_data(data_path, market_name, tickers, steps)
trade_dates = mask_data.shape[1]
# relation data
rname_tail = {'sector_industry': '_industry_relation.npy', 'wikidata': '_wiki_relation.npy'}
rel_encoding, rel_mask = load_relation_data(
    os.path.join(data_path, '..', 'relation', relation_name, market_name + rname_tail[relation_name])
)
print('relation encoding shape:', rel_encoding.shape)
print('relation mask shape:', rel_mask.shape)

model = RelationLSTM(
    batch_size=batch_size,
    rel_encoding=rel_encoding,
    rel_mask=rel_mask
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
best_valid_loss = np.inf
best_valid_perf = None
best_test_perf = None
batch_offsets = np.arange(start=0, stop=valid_index, dtype=int)


def validate(start_index, end_index):
    """
    Compute loss and ranking metrics on the validation/test range.
    """
    with torch.no_grad():
        cur_valid_pred = np.zeros([len(tickers), end_index - start_index], dtype=float)
        cur_valid_gt = np.zeros([len(tickers), end_index - start_index], dtype=float)
        cur_valid_mask = np.zeros([len(tickers), end_index - start_index], dtype=float)
        loss = 0.
        reg_loss = 0.
        rank_loss = 0.
        for cur_offset in range(start_index - parameters['seq'] - steps + 1, end_index - parameters['seq'] - steps + 1):
            data_batch, mask_batch, price_batch, gt_batch = map(
                lambda x: torch.Tensor(x).to(device),
                get_batch(cur_offset)
            )
            prediction = model(data_batch)
            cur_loss, cur_reg_loss, cur_rank_loss, cur_rr = get_loss(prediction, gt_batch, price_batch, mask_batch,
                                                                     batch_size, parameters['alpha'])
            loss += cur_loss.item()
            reg_loss += cur_reg_loss.item()
            rank_loss += cur_rank_loss.item()
            cur_valid_pred[:, cur_offset - (start_index - parameters['seq'] - steps + 1)] = cur_rr[:, 0].cpu()
            cur_valid_gt[:, cur_offset - (start_index - parameters['seq'] - steps + 1)] = gt_batch[:, 0].cpu()
            cur_valid_mask[:, cur_offset - (start_index - parameters['seq'] - steps + 1)] = mask_batch[:, 0].cpu()
        loss = loss / (end_index - start_index)
        reg_loss = reg_loss / (end_index - start_index)
        rank_loss = rank_loss / (end_index - start_index)
        cur_valid_perf = evaluate(cur_valid_pred, cur_valid_gt, cur_valid_mask)
    return loss, reg_loss, rank_loss, cur_valid_perf


def get_batch(offset=None):
    if offset is None:
        offset = random.randrange(0, valid_index)
    seq_len = parameters['seq']
    mask_batch = mask_data[:, offset: offset + seq_len + steps]
    mask_batch = np.min(mask_batch, axis=1)
    return (
        eod_data[:, offset:offset + seq_len, :],
        np.expand_dims(mask_batch, axis=1),
        np.expand_dims(price_data[:, offset + seq_len - 1], axis=1),
        np.expand_dims(gt_data[:, offset + seq_len + steps - 1], axis=1))


# training loop
for epoch in range(epochs):
    np.random.shuffle(batch_offsets)
    tra_loss = 0.0
    tra_reg_loss = 0.0
    tra_rank_loss = 0.0
    for j in range(valid_index - parameters['seq'] - steps + 1):
        data_batch, mask_batch, price_batch, gt_batch = map(
            lambda x: torch.Tensor(x).to(device),
            get_batch(batch_offsets[j])
        )
        optimizer.zero_grad()
        prediction = model(data_batch)
        cur_loss, cur_reg_loss, cur_rank_loss, _ = get_loss(prediction, gt_batch, price_batch, mask_batch,
                                                            batch_size, parameters['alpha'])
        # update the model
        cur_loss.backward()
        optimizer.step()

        tra_loss += cur_loss.item()
        tra_reg_loss += cur_reg_loss.item()
        tra_rank_loss += cur_rank_loss.item()

    # average train loss: loss = reg_loss (mse) + alpha * rank_loss
    tra_loss = tra_loss / (valid_index - parameters['seq'] - steps + 1)
    tra_reg_loss = tra_reg_loss / (valid_index - parameters['seq'] - steps + 1)
    tra_rank_loss = tra_rank_loss / (valid_index - parameters['seq'] - steps + 1)
    print('\n\nTrain : loss:{} reg_loss:{} rank_loss:{}'.format(tra_loss, tra_reg_loss, tra_rank_loss))

    # performance on the validation set
    val_loss, val_reg_loss, val_rank_loss, val_perf = validate(valid_index, test_index)
    print('Valid : loss:{} reg_loss:{} rank_loss:{}'.format(val_loss, val_reg_loss, val_rank_loss))
    print('\t Valid performance:', val_perf)

    # performance on the test set
    test_loss, test_reg_loss, test_rank_loss, test_perf = validate(test_index, trade_dates)
    print('Test: loss:{} reg_loss:{} rank_loss:{}'.format(test_loss, test_reg_loss, test_rank_loss))
    print('\t Test performance:', test_perf)

    # track the best result by validation loss
    if val_loss < best_valid_loss:
        best_valid_loss = val_loss
        # evaluate() returns a fresh dict each call, so no explicit copy is needed
        best_valid_perf = val_perf
        best_test_perf = test_perf
        print('Better valid loss:', best_valid_loss)
    print('\nBest Valid performance:', best_valid_perf)
    print('Best Test performance:', best_test_perf)
--------------------------------------------------------------------------------
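As a closing note, the window alignment inside ```get_batch()``` is easy to mis-read, so the toy sketch below spells out which days feed the LSTM, which day provides the base price, and which day is being predicted. It uses ```seq=16``` and ```steps=1``` as configured in ```train.py```; the offset is arbitrary.

```python
seq_len, steps, offset = 16, 1, 100  # illustrative values

feature_window = (offset, offset + seq_len)       # days 100..115 feed the LSTM
base_price_day = offset + seq_len - 1             # day 115: last observed close
target_day = offset + seq_len + steps - 1         # day 116: the day being predicted
mask_window = (offset, offset + seq_len + steps)  # a stock counts as valid only if
                                                  # every day in 100..116 is present
print(feature_window, base_price_day, target_day, mask_window)
```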