├── requirement
├── README.md
├── config
│   └── config.yml
├── dataloader
│   └── dataset.py
├── modules
│   ├── utils.py
│   ├── environment.py
│   ├── model.py
│   └── agent.py
└── train.py
/requirement:
--------------------------------------------------------------------------------
torch==1.1.0
torchvision==0.2.2
PyYAML==5.1
numpy==1.16.3
pandas        # used by dataloader/dataset.py
tensorboardX  # used by train.py (SummaryWriter)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# value-based-deep-reinforcement-learning-trading-model-in-pytorch
This is a repo for deep reinforcement learning in trading. I used a value-based double DQN variant for single-stock trading. The agent learns to choose between selling, holding, and buying a fixed amount of stock, based on the reward returned by the environment.


[YoutubeVideo](https://youtu.be/bZLzHkrAUBU)

If you like it, please subscribe. I will keep uploading and creating more interesting and innovative projects.
--------------------------------------------------------------------------------
/config/config.yml:
--------------------------------------------------------------------------------
device: 'cpu'
resume_checkpoint: 'None'
checkpoint_dir: ''
#------------------------train spec-----------------------
learning_rate: 0.00025 # small for Adam; larger values suit non-momentum SGD variants
epsilon: 1.0
epsilon_min: 0.1
epsilon_reduce: 0.0005
epsilon_reduce_freq: 500
memory_size: 500
train_freq: 10
batch_size: 100
discount_rate: 0.97
update_model_ast_freq: 500
mode: 'ddqn'
save_freq: 10
num_epoch: 100
#------------------------data spec-------------------------
data_name: 'Data/Stocks/googl.us.txt'
start_date: '2010-01-01'
split_date: '2016-01-01'

--------------------------------------------------------------------------------
/dataloader/dataset.py:
--------------------------------------------------------------------------------
import pandas as pd

def read_data(path, start_date, split_date):
    # split the daily price history by date: [start_date, split_date] for training,
    # [split_date, end] for testing
    data = pd.read_csv(path)
    data['Date'] = pd.to_datetime(data['Date'])
    data = data.set_index('Date')

    if start_date:
        train_data = data[start_date:split_date]
    else:
        train_data = data[:split_date]
    test_data = data[split_date:]

    return train_data, test_data

if __name__ == '__main__':
    train_data, test_data = read_data(
        '/Users/jaychan/Desktop/youtube_project/rl_trading_agent/Data/Stocks/a.us.txt',
        start_date='2006-01-01',
        split_date='2016-01-01',
    )

    print('train\n', train_data)
    print('test\n', test_data)
--------------------------------------------------------------------------------
/modules/utils.py:
--------------------------------------------------------------------------------
import torch
import numpy as np
import os

def policy(model, state, epsilon, device):
    # ----------------------------- epsilon greedy policy ----------------------------------------------
    # random policy: explore one of the three actions (0 = hold, 1 = buy, 2 = sell)
    action = np.random.randint(3)

    # greedy policy: exploit the action with the highest predicted Q-value
    if np.random.rand() > epsilon:
        q_values = model(torch.Tensor(np.array(state, dtype=np.float32)).view(1, -1).to(device))
        action = int(torch.argmax(q_values, dim=1).item())

    return action

def shuffle_tensor(size, device):
    shuffle_index = torch.randperm(size).to(device)

    return shuffle_index

def resume(model, cuda, resume_checkpoint):
    print('=> loading checkpoint : "{}"'.format(resume_checkpoint))
    model_dict = model.state_dict()
    if not cuda:
        checkpoint = torch.load(resume_checkpoint, map_location=lambda storage, loc: storage)['state_dict']
        checkpoint = {k: v for k, v in checkpoint.items() if k in model_dict}
    else:
        checkpoint = torch.load(resume_checkpoint)['state_dict']
        checkpoint = {k: v for k, v in checkpoint.items() if k in model_dict}
    model_dict.update(checkpoint)
    model.load_state_dict(model_dict)

    return


if __name__ == "__main__":
    shuffled_tensor = shuffle_tensor(32, 'cpu')
    print(shuffled_tensor)

--------------------------------------------------------------------------------
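
A minimal usage sketch of the epsilon-greedy helper above (illustration only, not part of the repo; it assumes the repo root is on the import path). The 181-dimensional state matches the Dueling_Q_Network(181) built in train.py.

import numpy as np
from modules.model import Dueling_Q_Network
from modules.utils import policy

model = Dueling_Q_Network(181)
model.train()                                    # matches how train.py runs the network
state = np.zeros(181, dtype=np.float32)          # [position_value] + 180-day price history
action = policy(model=model, state=state, epsilon=0.1, device='cpu')
print(action)                                    # 0 = hold, 1 = buy, 2 = sell
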
/modules/environment.py:
--------------------------------------------------------------------------------
class Environment():
    def __init__(self, data, history_length=90):
        self.data = data
        self.history_length = history_length
        self.reset()

    def reset(self):
        self.t = 0
        self.profits = 0
        self.positions = []
        self.position_value = 0
        self.history = [0 for _ in range(self.history_length)]
        # return the state/observation representation
        return [self.position_value] + self.history

    def __call__(self, action):
        reward = 0
        profits = 0
        hold_size = 0
        hold_amount = 0

        if action == 1:
            # buy: open a position at today's close
            self.positions.append(self.data.iloc[self.t, :]['Close'])
        elif action == 2:
            # sell: close every open position
            if len(self.positions) == 0:
                reward = -1
            else:
                for p in self.positions:
                    # per-share profit, minus a fixed cost of 1 per share
                    profits += (self.data.iloc[self.t, :]['Close'] - p) - 1
                # the normalized return could itself be used as a (stochastic) reward signal
                reward_signal = profits / sum(self.positions)

                # here the reward is kept deterministic: +1 for a profitable trade, -1 for a loss
                if reward_signal < 0:
                    reward = -1
                else:
                    reward = 1
                hold_size = len(self.positions)
                hold_amount = sum(self.positions)
                self.profits += profits
                self.positions = []

        # advance one trading day and recompute the unrealized value of open positions
        self.t += 1
        self.position_value = 0
        for p in self.positions:
            self.position_value += (self.data.iloc[self.t, :]['Close'] - p) - 1
        self.history.pop(0)
        self.history.append(self.data.iloc[self.t, :]['Close'])

        return [self.position_value] + self.history, reward, profits, hold_size, hold_amount
--------------------------------------------------------------------------------
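
To make the reward mechanics above concrete, here is a small illustrative driver (a sketch, not part of the repo; it assumes the repo root is on the import path). Buying at 10 and 11 and then selling both at 15 realizes a profit of (15 - 10 - 1) + (15 - 11 - 1) = 7, which the environment maps to a reward of +1.

import pandas as pd
from modules.environment import Environment

prices = pd.DataFrame({'Close': [10.0, 11.0, 15.0, 14.0, 13.0]})
env = Environment(data=prices, history_length=3)

state = env.reset()                              # [position_value] + 3-step price history
state, reward, profit, _, _ = env(1)             # buy one share at Close = 10
state, reward, profit, _, _ = env(1)             # buy one share at Close = 11
state, reward, profit, size, amount = env(2)     # sell both at Close = 15
print(reward, profit, size, amount)              # 1 7.0 2 21.0
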
/train.py:
--------------------------------------------------------------------------------
import torch
from modules.utils import resume
from modules.agent import rl_agent_train
from modules.environment import Environment
from tensorboardX import SummaryWriter
import yaml
import os
from modules.model import Dueling_Q_Network
from dataloader.dataset import read_data


def train(env, model, config):
    model.to(config['device'])
    model_ast = type(model)(model.input_size).to(config['device'])
    cuda = torch.cuda.is_available()

    resume_checkpoint = os.path.join(os.path.dirname(__file__), config['resume_checkpoint'])
    if config['resume_checkpoint'] and config['resume_checkpoint'] != 'None':
        if os.path.exists(resume_checkpoint):
            _ = resume(
                model=model,
                cuda=cuda,
                resume_checkpoint=resume_checkpoint
            )
        else:
            print('checkpoint: "{}" does not exist'.format(config['resume_checkpoint']))
            print('------------------------train from scratch------------------------------------')
    else:
        print('-----------------------------train from scratch------------------------------------')

    if torch.cuda.device_count() > 1:
        print("let's use {} GPUs".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
        model_ast = torch.nn.DataParallel(model_ast)

    model.train()
    model_ast.train(mode=False)  # the target network stays in eval mode
    optimizer = torch.optim.Adam(params=model.parameters(), lr=config['learning_rate'], weight_decay=0.0005)
    criterion = torch.nn.MSELoss()

    global_step = 0
    rewards = []
    losses = []

    checkpoint_dir = os.path.join(os.path.dirname(__file__) + '/checkpoint', config['checkpoint_dir'])
    os.makedirs(checkpoint_dir, exist_ok=True)

    writer = SummaryWriter(log_dir=os.path.join(os.path.dirname(__file__) + '/tensorboard', config['checkpoint_dir']))
    rewards, losses = rl_agent_train(
        model=model,
        model_ast=model_ast,
        env=env,
        step_max=len(env.data) - 1,
        epsilon=config['epsilon'],
        epsilon_min=config['epsilon_min'],
        epsilon_reduce=config['epsilon_reduce'],
        epsilon_reduce_freq=config['epsilon_reduce_freq'],
        device=config['device'],
        memory_size=config['memory_size'],
        global_step=global_step,
        train_freq=config['train_freq'],
        batch_size=config['batch_size'],
        discount_rate=config['discount_rate'],
        criterion=criterion,
        optimizer=optimizer,
        losses=losses,
        rewards=rewards,
        update_model_ast_freq=config['update_model_ast_freq'],
        checkpoint_dir=checkpoint_dir,
        mode=config['mode'],
        writer=writer,
        save_freq=config['save_freq'],
        num_epoch=config['num_epoch']
    )

    return model, losses, rewards


def main(config):
    # state = [position_value] + 180-day close-price history -> 181 inputs
    model = Dueling_Q_Network(181)
    train_data, test_data = read_data(
        path=os.path.join(os.path.dirname(__file__), config['data_name']),
        start_date=config['start_date'],
        split_date=config['split_date']
    )

    train_env = Environment(data=train_data, history_length=180)
    test_env = Environment(data=test_data, history_length=180)  # held-out environment, not used during training

    model, losses, rewards = train(
        env=train_env,
        model=model,
        config=config
    )

    return

if __name__ == "__main__":
    with open(os.path.join(os.path.dirname(__file__), 'config/config.yml'), 'r') as stream:
        config = yaml.safe_load(stream)

    print(config)
    _ = main(config)
--------------------------------------------------------------------------------
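
main() builds a held-out test_env that the training script never touches; a hedged evaluation sketch (illustration only, not part of the repo) would run the greedy policy through it and read the accumulated profit off the environment:

import torch
from modules.environment import Environment
from modules.utils import policy
# assumes `model` is a trained Dueling_Q_Network and `test_data` comes from read_data(...)

def evaluate(model, test_data, device='cpu'):
    env = Environment(data=test_data, history_length=180)
    state = env.reset()
    model.eval()                    # use running BatchNorm statistics and disable dropout
    with torch.no_grad():
        for _ in range(len(env.data) - 1):
            action = policy(model=model, state=state, epsilon=0.0, device=device)  # epsilon = 0 -> greedy
            state, reward, profit, _, _ = env(action)
    return env.profits              # cumulative realized profit over the test period
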
/modules/model.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn

class Q_Network(torch.nn.Module):
    def __init__(self, input_size):
        super(Q_Network, self).__init__()
        self.input_size = input_size
        # stack of dilated 1-d convolutions; each kernel-size-2 conv shortens the sequence
        # by its dilation, so an input of length 181 ends up with length
        # 181 - (1 + 2 + 4 + 8 + 16 + 32 + 64) = 54, hence the 128 * 54 flatten below
        self.backbone = torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=2, kernel_size=2, dilation=1, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(2),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=2, out_channels=4, kernel_size=2, dilation=2, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(4),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=4, out_channels=8, kernel_size=2, dilation=4, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(8),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=8, out_channels=16, kernel_size=2, dilation=8, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(16),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=16, out_channels=32, kernel_size=2, dilation=16, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(32),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=32, out_channels=64, kernel_size=2, dilation=32, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(64),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=64, out_channels=128, kernel_size=2, dilation=64, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(128),
            torch.nn.Dropout(0.3),
        )

        self.affine_layer = torch.nn.Sequential(
            torch.nn.Linear(128*54, 2048),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(2048, 1024),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(1024, 512),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(512, 3),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
        )

    def reset(self):
        self.zero_grad()

    def forward(self, x):
        x = self.backbone(x.view(-1, 1, self.input_size))
        x = self.affine_layer(x.view(-1, 128*54))

        return x

class Dueling_Q_Network(nn.Module):
    def __init__(self, input_size):
        super(Dueling_Q_Network, self).__init__()
        self.input_size = input_size
        # same dilated-convolution backbone as Q_Network (length 181 -> 54)
        self.backbone = torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=2, kernel_size=2, dilation=1, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(2),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=2, out_channels=4, kernel_size=2, dilation=2, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(4),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=4, out_channels=8, kernel_size=2, dilation=4, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(8),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=8, out_channels=16, kernel_size=2, dilation=8, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(16),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=16, out_channels=32, kernel_size=2, dilation=16, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(32),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=32, out_channels=64, kernel_size=2, dilation=32, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(64),
            torch.nn.Dropout(0.3),
            torch.nn.Conv1d(in_channels=64, out_channels=128, kernel_size=2, dilation=64, bias=True),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(128),
            torch.nn.Dropout(0.3),
        )

        # dueling heads: a scalar state value V(s) and one advantage A(s, a) per action
        self.state_value = torch.nn.Sequential(
            torch.nn.Linear(128*54, 2048),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(2048, 1024),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(1024, 512),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(512, 1),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
        )

        self.advantage_value = torch.nn.Sequential(
            torch.nn.Linear(128*54, 2048),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(2048, 1024),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(1024, 512),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
            torch.nn.Linear(512, 3),
            torch.nn.SELU(),
            torch.nn.AlphaDropout(0.3),
        )

    def reset(self):
        self.zero_grad()

    def forward(self, x):
        x = self.backbone(x.view(-1, 1, self.input_size))
        state_value = self.state_value(x.view(-1, 128 * 54))
        advantage_value = self.advantage_value(x.view(-1, 128 * 54))
        advantage_mean = torch.mean(advantage_value, dim=1, keepdim=True)
        # dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
        q_value = state_value.expand([-1, 3]) + (advantage_value - advantage_mean.expand([-1, 3]))

        return q_value
--------------------------------------------------------------------------------
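
As a quick sanity check (illustration only, not part of the repo; assumes the repo root is on the import path), a forward pass maps a batch of 181-dimensional states to one Q-value per action:

import torch
from modules.model import Dueling_Q_Network

net = Dueling_Q_Network(181)
net.train()                            # train mode, matching how train.py runs the network
q_values = net(torch.randn(8, 181))    # a batch of 8 states: [position_value] + 180-day history
print(q_values.shape)                  # torch.Size([8, 3]) -> Q(s, hold), Q(s, buy), Q(s, sell)
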
/modules/agent.py:
--------------------------------------------------------------------------------
import torch
import numpy as np
import os
from .utils import shuffle_tensor, policy

def rl_agent_train(model, env, num_epoch, step_max, epsilon, device, memory_size, train_freq, batch_size, mode, model_ast,
                   discount_rate, criterion, optimizer, update_model_ast_freq, epsilon_min, epsilon_reduce_freq, epsilon_reduce,
                   writer, rewards, losses, save_freq, checkpoint_dir, global_step):
    for epoch in range(num_epoch):
        state = env.reset()
        step = 0
        total_loss = 0
        total_reward = 0

        # replay memory buffer for the agent
        state_memory = []
        action_memory = []
        reward_memory = []
        observation_memory = []  # the observation is the next state, used for the bootstrap target
        while step < step_max:
            action = policy(model=model, state=state, epsilon=epsilon, device=device)

            observation, reward, profit, _, __ = env(action)

            # add to the memory buffer, dropping the oldest transition once it is full
            state_memory.append(state)
            action_memory.append(action)
            reward_memory.append(reward)
            observation_memory.append(observation)
            if len(state_memory) > memory_size:
                state_memory.pop(0)
                action_memory.pop(0)
                reward_memory.pop(0)
                observation_memory.pop(0)

            memory = (state_memory, action_memory, reward_memory, observation_memory)

            if len(state_memory) == memory_size:
                # train/update only every train_freq steps
                if global_step % train_freq == 0:
                    state_tensor = torch.Tensor(np.array(memory[0], dtype=np.float32)).to(device)
                    action_tensor = torch.Tensor(np.array(memory[1], dtype=np.int32)).to(device)
                    reward_tensor = torch.Tensor(np.array(memory[2], dtype=np.int32)).to(device)
                    observation_tensor = torch.Tensor(np.array(memory[3], dtype=np.float32)).to(device)

                    shuffle_index = shuffle_tensor(memory_size, device)
                    shuffle_state = torch.index_select(state_tensor, 0, shuffle_index)
                    shuffle_reward = torch.index_select(reward_tensor, 0, shuffle_index)
                    shuffle_action = torch.index_select(action_tensor, 0, shuffle_index)
                    shuffle_observation = torch.index_select(observation_tensor, 0, shuffle_index)

                    for i in range(memory_size)[::batch_size]:
                        batch_state = shuffle_state[i: i+batch_size, :].type('torch.FloatTensor').to(device)
                        batch_action = shuffle_action[i: i+batch_size].type('torch.FloatTensor').to(device)
                        batch_reward = shuffle_reward[i: i+batch_size].type('torch.FloatTensor').to(device)
                        batch_observation = shuffle_observation[i: i+batch_size, :].type('torch.FloatTensor').to(device)

                        # Q(s, a) for the action the agent took, shape [batch_size, 1]
                        q_eval = model(batch_state).gather(1, batch_action.long().unsqueeze(1))
                        q_next = model_ast(batch_observation).detach()

                        if mode == 'dqn':
                            # vanilla DQN target: bootstrap with the target network's own maximum
                            q_target = batch_reward.unsqueeze(1) + discount_rate * q_next.max(1, keepdim=True)[0]

                        elif mode == 'ddqn':
                            # double DQN target: the online network selects the action, the target network evaluates it
                            q_target = batch_reward.unsqueeze(1) + discount_rate * q_next.gather(1, torch.argmax(
                                model(batch_observation), dim=1, keepdim=True))

                        else:
                            raise ValueError('please input a correct mode for the rl agent, either "dqn" or "ddqn"')
                        optimizer.zero_grad()
                        loss = criterion(input=q_eval, target=q_target)
                        total_loss += loss.item()
                        loss.backward()
                        optimizer.step()

                if global_step % update_model_ast_freq == 0:
                    # soft update of the target network toward the online network
                    para = {k: 0.3 * v + 0.7 * model.state_dict()[k] for k, v in model_ast.state_dict().items()}
                    model_ast.load_state_dict(para)

            # epsilon decay
            if epsilon > epsilon_min and global_step % epsilon_reduce_freq == 0:
                epsilon -= epsilon_reduce

            total_reward += reward
            state = observation
            step += 1
            global_step += 1

            # here writer is a tensorboardX SummaryWriter
            if writer and global_step % 500 == 0:
                writer.add_scalar('loss', total_loss, global_step=global_step)
                writer.add_scalar('reward', total_reward, global_step=global_step)

            if global_step % 500 == 0:
                print('-----------------------------------global step {}------------------------------------'.format(global_step))
                print('total_reward : {}'.format(total_reward))
                print('total_loss : {}'.format(total_loss))
                print('-----------------------------------------------------------------------------------------')

        rewards.append(total_reward)
        losses.append(total_loss)

        print('------------------------------------------epoch {}--------------------------------------------'.format(epoch))
        print('total loss : {}'.format(total_loss))
        print('total reward : {}'.format(total_reward))
        print('-----------------------------------------------------------------------------------------------')

        if (epoch + 1) % save_freq == 0:
            checkpoint_state = {'epoch': epoch, 'state_dict': model.state_dict()}
            torch.save(checkpoint_state, os.path.join(checkpoint_dir, '{}_checkpoint.pth.tar'.format(str(epoch) + '_' + str(global_step))))

    return rewards, losses
--------------------------------------------------------------------------------
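
The core of rl_agent_train is the double-DQN target: the online network picks the next action and the target network model_ast evaluates it, which reduces the overestimation bias of vanilla DQN. A standalone shape check with dummy tensors (illustration only, not tied to the repo's data):

import torch

torch.manual_seed(0)
batch_size, n_actions, gamma = 4, 3, 0.97
q_online_next = torch.randn(batch_size, n_actions)              # online network on the next states
q_target_next = torch.randn(batch_size, n_actions)              # target network on the next states
reward = torch.randn(batch_size)

best_action = torch.argmax(q_online_next, dim=1, keepdim=True)  # action selection: online network
bootstrap = q_target_next.gather(1, best_action)                # action evaluation: target network
q_target = reward.unsqueeze(1) + gamma * bootstrap              # shape [4, 1], matching q_eval
print(q_target.shape)                                           # torch.Size([4, 1])
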