├── requirement
├── README.md
├── config
│   └── config.yml
├── dataloader
│   └── dataset.py
├── modules
│   ├── utils.py
│   ├── environment.py
│   ├── model.py
│   └── agent.py
└── train.py
/requirement:
--------------------------------------------------------------------------------
torch==1.1.0
torchvision==0.2.2
PyYAML==5.1
numpy==1.16.3
pandas        # used by dataloader/dataset.py
tensorboardX  # used by train.py (SummaryWriter)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# value-based-deep-reinforcement-learning-trading-model-in-pytorch
This is a repo for deep reinforcement learning in trading. I used a value-based double DQN variant for single-stock trading. The agent learns to choose between selling, holding, and buying a fixed amount of stock, based on the reward returned by the environment.


[YoutubeVideo](https://youtu.be/bZLzHkrAUBU)

If you like it, please subscribe. I will keep uploading and creating more interesting and innovative projects.
--------------------------------------------------------------------------------
/config/config.yml:
--------------------------------------------------------------------------------
device: 'cpu'
resume_checkpoint: 'None'
checkpoint_dir: ''
#------------------------train spec-----------------------
learning_rate: 0.00025 # small for Adam; larger values suit non-momentum SGD variants
epsilon: 1.0
epsilon_min: 0.1
epsilon_reduce: 0.0005
epsilon_reduce_freq: 500
memory_size: 500
train_freq: 10
batch_size: 100
discount_rate: 0.97
update_model_ast_freq: 500
mode: 'ddqn'
save_freq: 10
num_epoch: 100
#------------------------data spec-------------------------
data_name: 'Data/Stocks/googl.us.txt'
start_date: '2010-01-01'
split_date: '2016-01-01'

--------------------------------------------------------------------------------
/dataloader/dataset.py:
--------------------------------------------------------------------------------
import pandas as pd

def read_data(path, start_date, split_date):
    # split the daily price history by date: [start_date, split_date] for training,
    # [split_date, end] for testing
    data = pd.read_csv(path)
    data['Date'] = pd.to_datetime(data['Date'])
    data = data.set_index('Date')

    if start_date:
        train_data = data[start_date:split_date]
    else:
        train_data = data[:split_date]
    test_data = data[split_date:]

    return train_data, test_data

if __name__ == '__main__':
    train_data, test_data = read_data(
        '/Users/jaychan/Desktop/youtube_project/rl_trading_agent/Data/Stocks/a.us.txt',
        start_date='2006-01-01',
        split_date='2016-01-01',
    )

    print('train\n', train_data)
    print('test\n', test_data)
--------------------------------------------------------------------------------
/modules/utils.py:
--------------------------------------------------------------------------------
import torch
import numpy as np
import os

def policy(model, state, epsilon, device):
    # ----------------------------- epsilon greedy policy ----------------------------------------------
    # random policy: explore one of the three actions (0 = hold, 1 = buy, 2 = sell)
    action = np.random.randint(3)

    # greedy policy: exploit the action with the highest predicted Q-value
    if np.random.rand() > epsilon:
        q_values = model(torch.Tensor(np.array(state, dtype=np.float32)).view(1, -1).to(device))
        action = int(torch.argmax(q_values, dim=1).item())

    return action

def shuffle_tensor(size, device):
    shuffle_index = torch.randperm(size).to(device)

    return shuffle_index

def resume(model, cuda, resume_checkpoint):
    print('=> loading checkpoint : "{}"'.format(resume_checkpoint))
    model_dict = model.state_dict()
    if not cuda:
        checkpoint = torch.load(resume_checkpoint, map_location=lambda storage, loc: storage)['state_dict']
        checkpoint = {k: v for k, v in checkpoint.items() if k in model_dict}
    else:
        checkpoint = torch.load(resume_checkpoint)['state_dict']
        checkpoint = {k: v for k, v in checkpoint.items() if k in model_dict}
    model_dict.update(checkpoint)
    model.load_state_dict(model_dict)

    return


if __name__ == "__main__":
    shuffled_tensor = shuffle_tensor(32, 'cpu')
    print(shuffled_tensor)

--------------------------------------------------------------------------------
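
A minimal usage sketch of the epsilon-greedy helper above (illustration only, not part of the repo; it assumes the repo root is on the import path). The 181-dimensional state matches the Dueling_Q_Network(181) built in train.py.

import numpy as np
from modules.model import Dueling_Q_Network
from modules.utils import policy

model = Dueling_Q_Network(181)
model.train()                                    # matches how train.py runs the network
state = np.zeros(181, dtype=np.float32)          # [position_value] + 180-day price history
action = policy(model=model, state=state, epsilon=0.1, device='cpu')
print(action)                                    # 0 = hold, 1 = buy, 2 = sell
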
/modules/environment.py:
--------------------------------------------------------------------------------
class Environment():
    def __init__(self, data, history_length=90):
        self.data = data
        self.history_length = history_length
        self.reset()

    def reset(self):
        self.t = 0
        self.profits = 0
        self.positions = []
        self.position_value = 0
        self.history = [0 for _ in range(self.history_length)]
        # return the state/observation representation
        return [self.position_value] + self.history

    def __call__(self, action):
        reward = 0
        profits = 0
        hold_size = 0
        hold_amount = 0

        if action == 1:
            # buy: open a position at today's close
            self.positions.append(self.data.iloc[self.t, :]['Close'])
        elif action == 2:
            # sell: close every open position
            if len(self.positions) == 0:
                reward = -1
            else:
                for p in self.positions:
                    # per-share profit, minus a fixed cost of 1 per share
                    profits += (self.data.iloc[self.t, :]['Close'] - p) - 1
                # the normalized return could itself be used as a (stochastic) reward signal
                reward_signal = profits / sum(self.positions)

                # here the reward is kept deterministic: +1 for a profitable trade, -1 for a loss
                if reward_signal < 0:
                    reward = -1
                else:
                    reward = 1
                hold_size = len(self.positions)
                hold_amount = sum(self.positions)
                self.profits += profits
                self.positions = []

        # advance one trading day and recompute the unrealized value of open positions
        self.t += 1
        self.position_value = 0
        for p in self.positions:
            self.position_value += (self.data.iloc[self.t, :]['Close'] - p) - 1
        self.history.pop(0)
        self.history.append(self.data.iloc[self.t, :]['Close'])

        return [self.position_value] + self.history, reward, profits, hold_size, hold_amount
--------------------------------------------------------------------------------
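
To make the reward mechanics above concrete, here is a small illustrative driver (a sketch, not part of the repo; it assumes the repo root is on the import path). Buying at 10 and 11 and then selling both at 15 realizes a profit of (15 - 10 - 1) + (15 - 11 - 1) = 7, which the environment maps to a reward of +1.

import pandas as pd
from modules.environment import Environment

prices = pd.DataFrame({'Close': [10.0, 11.0, 15.0, 14.0, 13.0]})
env = Environment(data=prices, history_length=3)

state = env.reset()                              # [position_value] + 3-step price history
state, reward, profit, _, _ = env(1)             # buy one share at Close = 10
state, reward, profit, _, _ = env(1)             # buy one share at Close = 11
state, reward, profit, size, amount = env(2)     # sell both at Close = 15
print(reward, profit, size, amount)              # 1 7.0 2 21.0
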
/train.py:
--------------------------------------------------------------------------------
import torch
from modules.utils import resume
from modules.agent import rl_agent_train
from modules.environment import Environment
from tensorboardX import SummaryWriter
import yaml
import os
from modules.model import Dueling_Q_Network
from dataloader.dataset import read_data


def train(env, model, config):
    model.to(config['device'])
    model_ast = type(model)(model.input_size).to(config['device'])
    cuda = torch.cuda.is_available()

    resume_checkpoint = os.path.join(os.path.dirname(__file__), config['resume_checkpoint'])
    if config['resume_checkpoint'] and config['resume_checkpoint'] != 'None':
        if os.path.exists(resume_checkpoint):
            _ = resume(
                model=model,
                cuda=cuda,
                resume_checkpoint=resume_checkpoint
            )
        else:
            print('checkpoint: "{}" does not exist'.format(config['resume_checkpoint']))
            print('------------------------train from scratch------------------------------------')
    else:
        print('-----------------------------train from scratch------------------------------------')

    if torch.cuda.device_count() > 1:
        print("let's use {} GPUs".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
        model_ast = torch.nn.DataParallel(model_ast)

    model.train()
    model_ast.train(mode=False)  # the target network stays in eval mode
    optimizer = torch.optim.Adam(params=model.parameters(), lr=config['learning_rate'], weight_decay=0.0005)
    criterion = torch.nn.MSELoss()

    global_step = 0
    rewards = []
    losses = []

    checkpoint_dir = os.path.join(os.path.dirname(__file__) + '/checkpoint', config['checkpoint_dir'])
    os.makedirs(checkpoint_dir, exist_ok=True)

    writer = SummaryWriter(log_dir=os.path.join(os.path.dirname(__file__) + '/tensorboard', config['checkpoint_dir']))
    rewards, losses = rl_agent_train(
        model=model,
        model_ast=model_ast,
        env=env,
        step_max=len(env.data) - 1,
        epsilon=config['epsilon'],
        epsilon_min=config['epsilon_min'],
        epsilon_reduce=config['epsilon_reduce'],
        epsilon_reduce_freq=config['epsilon_reduce_freq'],
        device=config['device'],
        memory_size=config['memory_size'],
        global_step=global_step,
        train_freq=config['train_freq'],
        batch_size=config['batch_size'],
        discount_rate=config['discount_rate'],
        criterion=criterion,
        optimizer=optimizer,
        losses=losses,
        rewards=rewards,
        update_model_ast_freq=config['update_model_ast_freq'],
        checkpoint_dir=checkpoint_dir,
        mode=config['mode'],
        writer=writer,
        save_freq=config['save_freq'],
        num_epoch=config['num_epoch']
    )

    return model, losses, rewards


def main(config):
    # state = [position_value] + 180-day close-price history -> 181 inputs
    model = Dueling_Q_Network(181)
    train_data, test_data = read_data(
        path=os.path.join(os.path.dirname(__file__), config['data_name']),
        start_date=config['start_date'],
        split_date=config['split_date']
    )

    train_env = Environment(data=train_data, history_length=180)
    test_env = Environment(data=test_data, history_length=180)  # held-out environment, not used during training

    model, losses, rewards = train(
        env=train_env,
        model=model,
        config=config
    )

    return

if __name__ == "__main__":
    with open(os.path.join(os.path.dirname(__file__), 'config/config.yml'), 'r') as stream:
        config = yaml.safe_load(stream)

    print(config)
    _ = main(config)
--------------------------------------------------------------------------------
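
main() builds a held-out test_env that the training script never touches; a hedged evaluation sketch (illustration only, not part of the repo) would run the greedy policy through it and read the accumulated profit off the environment:

import torch
from modules.environment import Environment
from modules.utils import policy
# assumes `model` is a trained Dueling_Q_Network and `test_data` comes from read_data(...)

def evaluate(model, test_data, device='cpu'):
    env = Environment(data=test_data, history_length=180)
    state = env.reset()
    model.eval()                    # use running BatchNorm statistics and disable dropout
    with torch.no_grad():
        for _ in range(len(env.data) - 1):
            action = policy(model=model, state=state, epsilon=0.0, device=device)  # epsilon = 0 -> greedy
            state, reward, profit, _, _ = env(action)
    return env.profits              # cumulative realized profit over the test period
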
/modules/model.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn

class Q_Network(torch.nn.Module):
    def __init__(self, input_size):
        super(Q_Network, self).__init__()
        self.input_size = input_size
        # stack of dilated 1-d convolutions; each kernel-size-2 conv shortens the sequence
        # by its dilation, so an input of length 181 ends up with length
        # 181 - (1 + 2 + 4 + 8 + 16 + 32 + 64) = 54, hence the 128 * 54 flatten below
        self.backbone = torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=2, kernel_size=2, dilation=1, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(2),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=2, out_channels=4, kernel_size=2, dilation=2, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(4),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=4, out_channels=8, kernel_size=2, dilation=4, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(8),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=8, out_channels=16, kernel_size=2, dilation=8, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(16),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=16, out_channels=32, kernel_size=2, dilation=16, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(32),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=32, out_channels=64, kernel_size=2, dilation=32, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(64),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=64, out_channels=128, kernel_size=2, dilation=64, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(128),
            torch.nn.Dropout(0.3),
        )

        self.affine_layer = torch.nn.Sequential(
            torch.nn.Linear(128*54, 2048),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(2048, 1024),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(1024, 512),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(512, 3),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
        )

    def reset(self):
        self.zero_grad()

    def forward(self, x):
        x = self.backbone(x.view(-1, 1, self.input_size))
        x = self.affine_layer(x.view(-1, 128*54))

        return x

class Dueling_Q_Network(nn.Module):
    def __init__(self, input_size):
        super(Dueling_Q_Network, self).__init__()
        self.input_size = input_size
        # same dilated-convolution backbone as Q_Network (length 181 -> 54)
        self.backbone = torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=2, kernel_size=2, dilation=1, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(2),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=2, out_channels=4, kernel_size=2, dilation=2, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(4),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=4, out_channels=8, kernel_size=2, dilation=4, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(8),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=8, out_channels=16, kernel_size=2, dilation=8, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(16),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=16, out_channels=32, kernel_size=2, dilation=16, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(32),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=32, out_channels=64, kernel_size=2, dilation=32, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(64),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=64, out_channels=128, kernel_size=2, dilation=64, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(128),
            torch.nn.Dropout(0.3),
        )

        # dueling heads: a scalar state value V(s) and one advantage A(s, a) per action
        self.state_value = torch.nn.Sequential(
            torch.nn.Linear(128*54, 2048),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(2048, 1024),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(1024, 512),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(512, 1),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
        )

        self.advantage_value = torch.nn.Sequential(
            torch.nn.Linear(128*54, 2048),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(2048, 1024),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(1024, 512),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(512, 3),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
        )

    def reset(self):
        self.zero_grad()

    def forward(self, x):
        x = self.backbone(x.view(-1, 1, self.input_size))
        state_value = self.state_value(x.view(-1, 128 * 54))
        advantage_value = self.advantage_value(x.view(-1, 128 * 54))
        advantage_mean = torch.mean(advantage_value, dim=1, keepdim=True)
        # dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
        q_value = state_value.expand([-1, 3]) + (advantage_value - advantage_mean.expand([-1, 3]))

        return q_value
--------------------------------------------------------------------------------
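
As a quick sanity check (illustration only, not part of the repo; assumes the repo root is on the import path), a forward pass maps a batch of 181-dimensional states to one Q-value per action:

import torch
from modules.model import Dueling_Q_Network

net = Dueling_Q_Network(181)
net.train()                            # train mode, matching how train.py runs the network
q_values = net(torch.randn(8, 181))    # a batch of 8 states: [position_value] + 180-day history
print(q_values.shape)                  # torch.Size([8, 3]) -> Q(s, hold), Q(s, buy), Q(s, sell)
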
/modules/agent.py:
--------------------------------------------------------------------------------
import torch
import numpy as np
import os
from .utils import shuffle_tensor, policy

def rl_agent_train(model, env, num_epoch, step_max, epsilon, device, memory_size, train_freq, batch_size, mode, model_ast,
                   discount_rate, criterion, optimizer, update_model_ast_freq, epsilon_min, epsilon_reduce_freq, epsilon_reduce,
                   writer, rewards, losses, save_freq, checkpoint_dir, global_step):
    for epoch in range(num_epoch):
        state = env.reset()
        step = 0
        total_loss = 0
        total_reward = 0

        # replay memory buffer for the agent
        state_memory = []
        action_memory = []
        reward_memory = []
        observation_memory = []  # the observation is the next state, used for the bootstrap target
        while step < step_max:
            action = policy(model=model, state=state, epsilon=epsilon, device=device)

            observation, reward, profit, _, __ = env(action)

            # add to the memory buffer, dropping the oldest transition once it is full
            state_memory.append(state)
            action_memory.append(action)
            reward_memory.append(reward)
            observation_memory.append(observation)
            if len(state_memory) > memory_size:
                state_memory.pop(0)
                action_memory.pop(0)
                reward_memory.pop(0)
                observation_memory.pop(0)

            memory = (state_memory, action_memory, reward_memory, observation_memory)

            if len(state_memory) == memory_size:
                # train/update only every train_freq steps
                if global_step % train_freq == 0:
                    state_tensor = torch.Tensor(np.array(memory[0], dtype=np.float32)).to(device)
                    action_tensor = torch.Tensor(np.array(memory[1], dtype=np.int32)).to(device)
                    reward_tensor = torch.Tensor(np.array(memory[2], dtype=np.int32)).to(device)
                    observation_tensor = torch.Tensor(np.array(memory[3], dtype=np.float32)).to(device)

                    shuffle_index = shuffle_tensor(memory_size, device)
                    shuffle_state = torch.index_select(state_tensor, 0, shuffle_index)
                    shuffle_reward = torch.index_select(reward_tensor, 0, shuffle_index)
                    shuffle_action = torch.index_select(action_tensor, 0, shuffle_index)
                    shuffle_observation = torch.index_select(observation_tensor, 0, shuffle_index)

                    for i in range(memory_size)[::batch_size]:
                        batch_state = shuffle_state[i: i+batch_size, :].type('torch.FloatTensor').to(device)
                        batch_action = shuffle_action[i: i+batch_size].type('torch.FloatTensor').to(device)
                        batch_reward = shuffle_reward[i: i+batch_size].type('torch.FloatTensor').to(device)
                        batch_observation = shuffle_observation[i: i+batch_size, :].type('torch.FloatTensor').to(device)

                        # Q(s, a) for the action the agent took, shape [batch_size, 1]
                        q_eval = model(batch_state).gather(1, batch_action.long().unsqueeze(1))
                        q_next = model_ast(batch_observation).detach()

                        if mode == 'dqn':
                            # vanilla DQN target: bootstrap with the target network's own maximum
                            q_target = batch_reward.unsqueeze(1) + discount_rate * q_next.max(1, keepdim=True)[0]

                        elif mode == 'ddqn':
                            # double DQN target: the online network selects the action, the target network evaluates it
                            q_target = batch_reward.unsqueeze(1) + discount_rate * q_next.gather(1, torch.argmax(
                                model(batch_observation), dim=1, keepdim=True))

                        else:
                            raise ValueError('please input a correct mode for the rl agent, either "dqn" or "ddqn"')
                        optimizer.zero_grad()
                        loss = criterion(input=q_eval, target=q_target)
                        total_loss += loss.item()
                        loss.backward()
                        optimizer.step()

                if global_step % update_model_ast_freq == 0:
                    # soft update of the target network toward the online network
                    para = {k: 0.3 * v + 0.7 * model.state_dict()[k] for k, v in model_ast.state_dict().items()}
                    model_ast.load_state_dict(para)

            # epsilon decay
            if epsilon > epsilon_min and global_step % epsilon_reduce_freq == 0:
                epsilon -= epsilon_reduce

            total_reward += reward
            state = observation
            step += 1
            global_step += 1

            # here writer is a tensorboardX SummaryWriter
            if writer and global_step % 500 == 0:
                writer.add_scalar('loss', total_loss, global_step=global_step)
                writer.add_scalar('reward', total_reward, global_step=global_step)

            if global_step % 500 == 0:
                print('-----------------------------------global step {}------------------------------------'.format(global_step))
                print('total_reward : {}'.format(total_reward))
                print('total_loss : {}'.format(total_loss))
                print('-----------------------------------------------------------------------------------------')

        rewards.append(total_reward)
        losses.append(total_loss)

        print('------------------------------------------epoch {}--------------------------------------------'.format(epoch))
        print('total loss : {}'.format(total_loss))
        print('total reward : {}'.format(total_reward))
        print('-----------------------------------------------------------------------------------------------')

        if (epoch + 1) % save_freq == 0:
            checkpoint_state = {'epoch': epoch, 'state_dict': model.state_dict()}
            torch.save(checkpoint_state, os.path.join(checkpoint_dir, '{}_checkpoint.pth.tar'.format(str(epoch) + '_' + str(global_step))))

    return rewards, losses
--------------------------------------------------------------------------------
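
The core of rl_agent_train is the double-DQN target: the online network picks the next action and the target network model_ast evaluates it, which reduces the overestimation bias of vanilla DQN. A standalone shape check with dummy tensors (illustration only, not tied to the repo's data):

import torch

torch.manual_seed(0)
batch_size, n_actions, gamma = 4, 3, 0.97
q_online_next = torch.randn(batch_size, n_actions)              # online network on the next states
q_target_next = torch.randn(batch_size, n_actions)              # target network on the next states
reward = torch.randn(batch_size)

best_action = torch.argmax(q_online_next, dim=1, keepdim=True)  # action selection: online network
bootstrap = q_target_next.gather(1, best_action)                # action evaluation: target network
q_target = reward.unsqueeze(1) + gamma * bootstrap              # shape [4, 1], matching q_eval
print(q_target.shape)                                           # torch.Size([4, 1])
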