├── ou_noise.py
├── DDPG-agent.py
├── README.md
├── market.py
├── actor_critic.py
└── DDPG.py
/ou_noise.py:
--------------------------------------------------------------------------------
# --------------------------------------
# Ornstein-Uhlenbeck Noise
# Author: Flood Sung
# Date: 2016.5.4
# Reference: https://github.com/rllab/rllab/blob/master/rllab/exploration_strategies/ou_strategy.py
# --------------------------------------

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process generating temporally correlated exploration noise."""

    def __init__(self, action_dimension, mu=0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu
        self.seeds = 0
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu
        self.seeds += 1
        np.random.seed(self.seeds)

    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state

    def std_noise(self, mu, sigma):
        self.seeds += 1
        np.random.seed(self.seeds)
        return np.random.normal(mu, sigma, len(self.state))


if __name__ == '__main__':
    ou = OUNoise(1, sigma=0.1 / 50)
    states = []
    for i in range(1000):
        states.append(ou.noise())
    import matplotlib.pyplot as plt

    # plt.plot(states)
    plt.hist(np.array(states).ravel())
    plt.show()
--------------------------------------------------------------------------------
/DDPG-agent.py:
--------------------------------------------------------------------------------
"""
@File   :DDPG-agent.py
@Author :JohsuaWu1997
@Date   :01/05/2020
"""
import numpy as np
import pandas as pd
import torch

from DDPG import DDPG
from market import MarketEnv

cuda = torch.device('cuda')

raw_amount = pd.read_csv('../sh000016/i_amount.csv', header=0, index_col=0).values
raw_buy = pd.read_csv('../sh000016/o_buy.csv', header=0, index_col=0).values
raw_sell = pd.read_csv('../sh000016/o_sell.csv', header=0, index_col=0).values

START = 10441
END = 13899


def scale(data):
    # column-wise min-max scaling; constant columns are mapped to 0 instead of dividing by zero
    data_min = np.min(data, axis=0)
    data_max = np.max(data, axis=0)
    span = data_max - data_min
    span[span == 0] = 1
    return (data - data_min) / span


def train(Train_Env, Epoch):
    agent = DDPG(Train_Env, lb, node)
    save_iter = [1, 2, 5, 10, 20, 30, 50, 100, 150, 200]
    for t in range(Epoch):
        print('epoch:', t)
        state, done = Train_Env.reset(), False
        while not done:
            action = agent.act(state, Train_Env.portfolio)
            next_state, reward, done, _ = Train_Env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if Train_Env.n_step % 300 == 299:
                print(Train_Env.n_step, ':',
                      int(Train_Env.rewards[Train_Env.n_step]), '\t',
                      int(sum(Train_Env.cost)), '\t',
                      int(Train_Env.available_cash[Train_Env.n_step]), '\t',
                      agent.critic_network.loss.data
                      )
        total_reward = Train_Env.rewards[-1]
        total_cost = sum(Train_Env.cost)
        print('DDPG: Evaluation Average Reward:', total_reward)
        print('DDPG: Average Cost: ', total_cost)

        if t in save_iter:
            torch.save(agent.actor_network.target.state_dict(),
                       'DDPG_model' + str(t) + '.pth')
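            # Illustration only (not part of the original script): a saved checkpoint
            # can be restored later with
            #     agent.actor_network.target.load_state_dict(torch.load('DDPG_model' + str(t) + '.pth'))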
    return agent


if __name__ == '__main__':
    lb, node, epoch = 12, 1024, 201
    buy_train = raw_buy[:START]
    sell_train = raw_sell[:START]
    amount_train = raw_amount[:START]

    train_env = MarketEnv([buy_train, sell_train, amount_train], 0)
    agent = train(train_env, epoch)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PyTorch-DDPG-Stock-Trading
An implementation of DDPG in PyTorch for algorithmic trading on the Chinese SH50 stock market, following [Continuous Control with Deep Reinforcement Learning](https://arxiv.org/pdf/1509.02971.pdf).


## Environment
The reinforcement learning environment simulates high-frequency trading on the Chinese SH50 stock market at an average of 5 s per tick. The environment is built on `gym` and optimised with PyTorch on GPU; only the target device needs to be switched between `cuda` and `cpu`.

The environment has several parameters to set, for example: the initial cash `asset`, the minimum volume that can be bought or sold `unit`, the overall transaction rate `rate`, and the additional charge on short positions `short_rate` (which genuinely exists in the Chinese stock market).

## Model
The Actor-Critic model is defined in `actor_critic.py`, with an acting network and a target network for both the actor and the critic. Following the original DDPG algorithm, the target networks are updated by `soft-copy` (see the sketch at the end of this section).

The train-on-batch step is the same as in the original DDPG algorithm, using transitions (state, action, reward, next state) sampled from the memory buffer.
```
# Calculate y_batch = r + GAMMA * Q'(s', mu'(s'))
next_action_batch = self.actor_network.target_action(next_state_batch)
q_batch = self.critic_network.target_q(next_action_batch, next_state_batch)
y_batch = torch.add(reward_batch, q_batch, alpha=GAMMA).view(-1, 1)

# train actor-critic by target loss
self.actor_network.train(
    self.critic_network.train(
        y_batch, action_batch, state_batch
    )
)

# Update target networks by soft update
self.actor_network.update_target()
self.critic_network.update_target()
```

The policy gradient is taken from the critic's first layer, restricted to the weights acting on the action input, and fed directly into the actor's backward pass.
```
# The mean policy gradient from the critic
return torch.mean(self.critic_weights[0].grad[:, :self.action_dim], dim=0)
```
```
# Train the actor with the policy gradient
self.actor_weights[-1].backward(-loss_grad)
```
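For reference, the `soft-copy` used for the target networks is a standard Polyak update with interpolation factor `w = 0.01`. A minimal equivalent sketch (the `target`/`source` module names here are illustrative; the repository's `soft_copy` operates on parameter lists):
```
# target <- (1 - w) * target + w * source
for t_param, s_param in zip(target.parameters(), source.parameters()):
    t_param.data.copy_((1 - w) * t_param.data + w * s_param.data)
```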

## Agent
`DDPG.py` wraps up the agent, which maintains the memory buffer and runs the train-on-batch step. Only `train_on_batch` and `perceive` are relevant to the algorithm. The random sampling is implemented in a more efficient way directly on CUDA:
```
sample = torch.randint(self.time_dim, self.replay_reward.shape[0], [self.batch_size], device=cuda)

index = torch.stack([sample - i for i in range(self.time_dim, 0, -1)]).t().reshape(-1)
```
```
state_batch = torch.index_select(state_data, 0, index).view(self.batch_size, -1)
next_amount_data = torch.index_select(next_amount_data, 0, sample).view(self.batch_size, -1)
action_batch = torch.index_select(self.replay_action / self.unit, 0, sample)
reward_batch = torch.index_select(self.replay_reward, 0, sample)
```
## OUNoise
The OU-noise is implemented by [Flood Sung](https://github.com/rllab/rllab/blob/master/rllab/exploration_strategies/ou_strategy.py).

## Playground
`DDPG-agent.py` is the playground for interacting with the environment. This repo provides Chinese SH50 stock market data from 17/04/2020 to 13/04/2020, totalling more than 13,000 ticks.
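
After training, results can be written to disk with the environment's helpers (a minimal sketch; `train_env` is the `MarketEnv` instance created in `DDPG-agent.py`):
```
train_env.plot('result')         # result.png: portfolio value against the SH50 index
train_env.render(path='result')  # result-result.csv (portfolio, transaction, cash, success) and result-book.csv (per-step order amounts)
```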
--------------------------------------------------------------------------------
/market.py:
--------------------------------------------------------------------------------
import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from gym import spaces

device = torch.device('cuda')

plt.ion()


class MarketEnv(gym.Env):
    def __init__(self, data, seed, asset=1000000.00, unit=100):
        self.asset = asset
        self.unit = unit
        self.rate = 5e-4
        self.short_rate = 1e-3
        self.rd_seed = seed
        self.sh000016 = data[0][:, 0].ravel()
        self.data = torch.tensor([data[i][:, 1:].tolist() for i in range(3)], device=device).permute(1, 0, 2)

        self.stock_number = data[0].shape[1] - 1
        self.sample_size = data[0].shape[0]
        self.action_space = spaces.Box(low=0, high=1, shape=(self.stock_number,))
        self.observation_space = spaces.Box(low=0, high=1, shape=(3, self.stock_number,))

    def reset(self):
        self.n_step = 0
        self.state = self.data[self.n_step, :, :]
        self.position = torch.zeros(self.stock_number, device=device)
        self.cash = torch.tensor(self.asset, device=device)
        self.portfolio = torch.tensor(self.asset, device=device)
        self.rewards = torch.tensor([self.asset] * self.sample_size, device=device)
        self.cost = torch.zeros(self.sample_size, device=device)
        self.success = []
        self.available_cash = torch.tensor([self.asset] * self.sample_size, device=device)
        self.book = []
        return self.state

    def step(self, position: torch.Tensor):
        self.n_step += 1
        self.state = self.data[self.n_step, :, :]
        amount = position - self.position
        price = self.state[1, :].clone().view(-1)  # price to buy; cloned so the raw data is not overwritten
        price[amount < 0] = self.state[0, :][amount < 0]  # price to sell

        transaction_buy = torch.sum((amount * price)[amount > 0] * self.rate)
        transaction_sell = -torch.sum((amount * price)[amount < 0] * (self.short_rate + self.rate))

        cost_buy = torch.sum((amount * price)[amount > 0])
        cost_sell = torch.sum((amount * price)[amount < 0])
        if self.cash < transaction_buy + cost_buy:
            # not enough cash for the buy orders: only the sell orders are executed
            self.success.append(False)
            self.cost[self.n_step] = transaction_sell
            self.position[amount < 0] = position[amount < 0]
            self.cash -= cost_sell + transaction_sell
        else:
            self.success.append(True)
            self.cost[self.n_step] = transaction_sell + transaction_buy
            self.position = position
            self.cash -= cost_sell + transaction_sell + cost_buy + transaction_buy

        # mark-to-market at the sell price; the one-step change in portfolio value is the reward
        portfolio = self.cash + torch.sum(self.state[0, :] * self.position)
        reward = portfolio - self.portfolio

        self.portfolio = portfolio
        self.rewards[self.n_step] = portfolio
        self.available_cash[self.n_step] = self.cash
        self.book.append(amount.cpu().numpy().ravel().tolist())
        done = self.n_step == self.sample_size - 1
        return self.state, reward, done, {}

    def plot(self, path=None, batch_size=1024):
        sh000016 = self.sh000016[1:] / self.sh000016[0] * self.asset
        plt.figure(figsize=(76.80, 43.20))
        plt.plot(sh000016)
        plt.plot(self.rewards.cpu().numpy().ravel())
        if path is not None:
            plt.savefig(path + '.png')
        plt.close()

    def render(self, mode='human', path=None):
        if path is not None:
            result = np.array([
                self.rewards.cpu().numpy().ravel(),
                self.cost.cpu().numpy().ravel(),
                self.available_cash.cpu().numpy().ravel(),
                self.success]).T
            pd.DataFrame(result, columns=['portfolio', 'transaction', 'cash', 'success']).to_csv(path + '-result.csv')
            pd.DataFrame(self.book).to_csv(path + '-book.csv')

    def close(self):
        pass
--------------------------------------------------------------------------------
/actor_critic.py:
--------------------------------------------------------------------------------
import torch

cuda = torch.device('cuda')


def hard_copy(target, source):
    for weight1, weight2 in zip(target, source):
        weight1.data = weight2.data.clone()


def soft_copy(target, source, w=0.01):
    # Polyak soft update: weight1 <- weight1 + w * (weight2 - weight1)
    for weight1, weight2 in zip(target, source):
        weight1.data = torch.add(
            weight1.data, torch.add(
                weight2.data, weight1.data, alpha=-1
            ), alpha=w)


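# ActorNet maps the flattened state vector to a softmax allocation over the stocks
# (the weights sum to 1); CriticNet first embeds the state with nn1, concatenates
# the action, and regresses the Q-value with nn2.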
class ActorNet(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(ActorNet, self).__init__()
        self.nn = torch.nn.Sequential(
            torch.nn.Linear(input_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, output_dim),
            torch.nn.Softmax(dim=1)
        )

    def forward(self, x):
        out = self.nn(x)
        return out


class CriticNet(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(CriticNet, self).__init__()
        self.nn2 = torch.nn.Sequential(
            torch.nn.Linear(hidden_dim + output_dim, hidden_dim),
            torch.nn.Tanh(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.Tanh(),
            torch.nn.Linear(hidden_dim, 1)
        )
        self.nn1 = torch.nn.Sequential(
            torch.nn.Linear(input_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.Softmax(dim=1),
        )

    def forward(self, a, x):
        x_out = self.nn1(x)
        ax = torch.cat((a, x_out), 1)
        out = self.nn2(ax)
        return out


class Actor:
    def __init__(self, time_dim, state_dim, action_dim, hidden_dim):
        self.actor = ActorNet(state_dim * (time_dim + 1), hidden_dim, action_dim).to(cuda)
        self.target = ActorNet(state_dim * (time_dim + 1), hidden_dim, action_dim).to(cuda)
        self.actor_weights = [params for params in self.actor.parameters()]
        self.target_weights = [params for params in self.target.parameters()]
        self.optimizer = torch.optim.Adam(self.actor.parameters())
        hard_copy(self.target_weights, self.actor_weights)

    def train(self, loss_grad):
        # apply the negated policy gradient from the critic to the actor's final layer
        self.optimizer.zero_grad()
        self.actor_weights[-1].backward(-loss_grad)
        self.optimizer.step()

    def actor_action(self, state):
        self.actor.zero_grad()
        return self.actor(state)

    def target_action(self, state):
        self.target.zero_grad()
        return self.target(state)

    def update_target(self):
        soft_copy(self.target_weights, self.actor_weights)


class Critic:
    def __init__(self, time_dim, state_dim, action_dim, hidden_dim):
        self.action_dim = action_dim
        self.critic = CriticNet(state_dim * (time_dim + 1), hidden_dim, action_dim).to(cuda)
        self.target = CriticNet(state_dim * (time_dim + 1), hidden_dim, action_dim).to(cuda)
        self.critic_weights = [params for params in self.critic.parameters()]
        self.target_weights = [params for params in self.target.parameters()]
        self.optimizer = torch.optim.Adam(self.critic.parameters())
        self.loss = torch.tensor(0, device=cuda)
        hard_copy(self.target_weights, self.critic_weights)

    def train(self, y_batch, action_batch, state_batch):
        criterion = torch.nn.MSELoss()
        y_pred = self.critic(action_batch, state_batch)
        self.loss = criterion(y_pred, y_batch)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
        # mean gradient of the loss w.r.t. the weights on the action input of the critic's first layer
        return torch.mean(self.critic_weights[0].grad[:, :self.action_dim], dim=0)

    def target_q(self, next_action_batch, next_state_batch):
        self.target.zero_grad()
        return self.target(next_action_batch, next_state_batch).view(-1)

    def update_target(self):
        soft_copy(self.target_weights, self.critic_weights)


if __name__ == '__main__':
    critic = CriticNet(50 * (12 + 1), 37, 50).to(cuda)
    for params in critic.parameters():
        print(params.shape)
--------------------------------------------------------------------------------
/DDPG.py:
--------------------------------------------------------------------------------
"""
@File   :DDPG.py
@Author :JohsuaWu1997
@Date   :2020/1/30
"""
import torch

from actor_critic import Actor, Critic
from ou_noise import OUNoise

cuda = torch.device('cuda')

GAMMA = 0.9999999993340943687843739933894


def min_max_scale(data):
    # column-wise min-max scaling; constant columns are mapped to 0 instead of NaN
    data_min = torch.min(data, 0).values.view(1, -1)
    data_max = torch.max(data, 0).values.view(1, -1)
    span = data_max - data_min
    span[span == 0] = 1
    return (data - data_min) / span


class DDPG:
    """DDPG agent: replay buffer, actor-critic training and action selection."""

    def __init__(self, env, time_steps, hidden_dim):
        self.name = 'DDPG'  # name for uploading results
        self.scale = env.asset
        self.unit = env.unit
        self.seed = env.rd_seed

        self.time_dim = time_steps
        self.state_dim = env.observation_space.shape[1]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = 64
        self.memory_size = self.time_dim + self.batch_size * 10
        self.start_size = self.time_dim + self.batch_size * 2

        # Initialise actor & critic networks
        self.actor_network = Actor(self.time_dim, self.state_dim, self.action_dim, hidden_dim)
        self.critic_network = Critic(self.time_dim, self.state_dim, self.action_dim, hidden_dim)
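
        # Replay buffer layout: replay_state / replay_next_state hold the raw
        # (3, state_dim) ticks; train_on_batch later stacks the most recent
        # `time_dim` price rows (channel 0) plus the sampled tick's amount row
        # (channel 2) into one flattened input of size state_dim * (time_dim + 1).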
        # Initialize replay buffer
        self.replay_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros((self.start_size - 1, self.action_dim), device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1,), device=cuda)

        # Initialize a random process (Ornstein-Uhlenbeck) for action exploration
        self.exploration_noise = OUNoise(self.action_dim, sigma=0.01 / self.action_dim)
        self.initial()

    def initial(self):
        self.steps = 0
        self.action = torch.zeros(self.action_dim, device=cuda)
        self.replay_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros((self.start_size - 1, self.action_dim), device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1,), device=cuda)

    def train_on_batch(self):
        # Sample a random minibatch of N transitions from replay buffer
        sample = torch.randint(self.time_dim, self.replay_reward.shape[0], [self.batch_size], device=cuda)
        index = torch.stack([sample - i for i in range(self.time_dim, 0, -1)]).t().reshape(-1)

        state_data = min_max_scale(self.replay_state[:, 0, :])
        amount_data = min_max_scale(self.replay_state[:, 2, :])
        next_state_data = min_max_scale(self.replay_next_state[:, 0, :])
        next_amount_data = min_max_scale(self.replay_next_state[:, 2, :])

        state_batch = torch.index_select(state_data, 0, index).view(self.batch_size, -1)
        amount_data = torch.index_select(amount_data, 0, sample).view(self.batch_size, -1)
        state_batch = torch.cat([state_batch, amount_data], dim=1)
        next_state_batch = torch.index_select(next_state_data, 0, index).view(self.batch_size, -1)
        next_amount_data = torch.index_select(next_amount_data, 0, sample).view(self.batch_size, -1)
        next_state_batch = torch.cat([next_state_batch, next_amount_data], dim=1)
        action_batch = torch.index_select(self.replay_action / self.unit, 0, sample)
        reward_batch = torch.index_select(self.replay_reward, 0, sample)

        # Calculate y_batch = r + GAMMA * Q'(s', mu'(s'))
        next_action_batch = self.actor_network.target_action(next_state_batch)
        q_batch = self.critic_network.target_q(next_action_batch, next_state_batch)
        y_batch = torch.add(reward_batch, q_batch, alpha=GAMMA).view(-1, 1)

        # train actor-critic by target loss
        self.actor_network.train(
            self.critic_network.train(
                y_batch, action_batch, state_batch
            )
        )

        # Update target networks by soft update
        self.actor_network.update_target()
        self.critic_network.update_target()
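
    # perceive() below fills the pre-allocated buffers for the first
    # start_size - 1 steps, then appends new transitions, dropping the
    # oldest entry once memory_size transitions are stored.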
    def perceive(self, state, action, reward, next_state, done):
        if self.steps < self.start_size - 1:
            self.replay_state[self.steps] = state
            self.replay_next_state[self.steps] = next_state
            self.replay_action[self.steps] = action
            self.replay_reward[self.steps] = reward
        else:
            if self.steps >= self.memory_size:
                self.replay_state = self.replay_state[1:]
                self.replay_next_state = self.replay_next_state[1:]
                self.replay_action = self.replay_action[1:]
                self.replay_reward = self.replay_reward[1:]
            self.replay_state = torch.cat((self.replay_state, state.unsqueeze(0)), dim=0)
            self.replay_next_state = torch.cat((self.replay_next_state, next_state.unsqueeze(0)), dim=0)
            self.replay_action = torch.cat((self.replay_action, action.unsqueeze(0)), dim=0)
            self.replay_reward = torch.cat((self.replay_reward, reward.unsqueeze(0)), dim=0)
        self.steps += 1

    def act(self, next_state, portfolio):
        if self.steps > self.start_size:
            next_state_data = min_max_scale(self.replay_next_state[:, 0, :])[-self.time_dim:].view(1, -1)
            next_amount_data = min_max_scale(self.replay_next_state[:, 2, :])[-1].view(1, -1)
            next_state_data = torch.cat([next_state_data, next_amount_data], dim=1)
            self.train_on_batch()
            allocation = self.actor_network.target_action(next_state_data).data.view(-1)
            allocation += torch.tensor(self.exploration_noise.noise().tolist(), device=cuda)
            allocation[allocation < 0] = 0
            allocation /= sum(allocation)
            # convert the portfolio weights into whole lots of `unit` shares at the buy price
            allocation = torch.floor(
                portfolio * allocation / next_state[1, :] / self.unit
            ) * self.unit
            self.action = allocation
        return self.action.clone()
--------------------------------------------------------------------------------