├── ou_noise.py
├── DDPG-agent.py
├── README.md
├── market.py
├── actor_critic.py
└── DDPG.py
/ou_noise.py:
--------------------------------------------------------------------------------
# --------------------------------------
# Ornstein-Uhlenbeck Noise
# Author: Flood Sung
# Date: 2016.5.4
# Reference: https://github.com/rllab/rllab/blob/master/rllab/exploration_strategies/ou_strategy.py
# --------------------------------------

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process generating temporally correlated exploration noise."""

    def __init__(self, action_dimension, mu=0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu
        self.seeds = 0
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu
        self.seeds += 1
        np.random.seed(self.seeds)

    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state

    def std_noise(self, mu, sigma):
        self.seeds += 1
        np.random.seed(self.seeds)
        return np.random.normal(mu, sigma, len(self.state))


if __name__ == '__main__':
    ou = OUNoise(1, sigma=0.1 / 50)
    states = []
    for i in range(1000):
        states.append(ou.noise())
    import matplotlib.pyplot as plt

    # plt.plot(states)
    plt.hist(np.array(states).ravel())
    plt.show()
--------------------------------------------------------------------------------
/DDPG-agent.py:
--------------------------------------------------------------------------------
"""
@File   :DDPG-agent.py
@Author :JohsuaWu1997
@Date   :01/05/2020
"""
import numpy as np
import pandas as pd
import torch

from DDPG import DDPG
from market import MarketEnv

cuda = torch.device('cuda')

raw_amount = pd.read_csv('../sh000016/i_amount.csv', header=0, index_col=0).values
raw_buy = pd.read_csv('../sh000016/o_buy.csv', header=0, index_col=0).values
raw_sell = pd.read_csv('../sh000016/o_sell.csv', header=0, index_col=0).values

START = 10441
END = 13899


def scale(data):
    # column-wise min-max scaling; constant columns are mapped to 0 instead of dividing by zero
    data_min = np.min(data, axis=0)
    data_max = np.max(data, axis=0)
    span = data_max - data_min
    span[span == 0] = 1
    return (data - data_min) / span


def train(Train_Env, Epoch):
    agent = DDPG(Train_Env, lb, node)
    save_iter = [1, 2, 5, 10, 20, 30, 50, 100, 150, 200]
    for t in range(Epoch):
        print('epoch:', t)
        state, done = Train_Env.reset(), False
        while not done:
            action = agent.act(state, Train_Env.portfolio)
            next_state, reward, done, _ = Train_Env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if Train_Env.n_step % 300 == 299:
                print(Train_Env.n_step, ':',
                      int(Train_Env.rewards[Train_Env.n_step]), '\t',
                      int(sum(Train_Env.cost)), '\t',
                      int(Train_Env.available_cash[Train_Env.n_step]), '\t',
                      agent.critic_network.loss.data
                      )
        total_reward = Train_Env.rewards[-1]
        total_cost = sum(Train_Env.cost)
        print('DDPG: Evaluation Average Reward:', total_reward)
        print('DDPG: Average Cost: ', total_cost)

        if t in save_iter:
            torch.save(agent.actor_network.target.state_dict(),
                       'DDPG_model' + str(t) + '.pth')
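            # Illustration only (not part of the original script): a saved checkpoint
            # can be restored later with
            #     agent.actor_network.target.load_state_dict(torch.load('DDPG_model' + str(t) + '.pth'))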
    return agent


if __name__ == '__main__':
    lb, node, epoch = 12, 1024, 201
    buy_train = raw_buy[:START]
    sell_train = raw_sell[:START]
    amount_train = raw_amount[:START]

    train_env = MarketEnv([buy_train, sell_train, amount_train], 0)
    agent = train(train_env, epoch)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PyTorch-DDPG-Stock-Trading
An implementation of DDPG in PyTorch for algorithmic trading on the Chinese SH50 stock market, following [Continuous Control with Deep Reinforcement Learning](https://arxiv.org/pdf/1509.02971.pdf).


## Environment
The reinforcement learning environment simulates high-frequency trading on the Chinese SH50 stock market at an average of 5 s per tick. The environment is built on `gym` and optimised with PyTorch on GPU; only the target device needs to be switched between `cuda` and `cpu`.

The environment has several parameters to set, for example: the initial cash `asset`, the minimum volume that can be bought or sold `unit`, the overall transaction rate `rate`, and the additional charge on short positions `short_rate` (which genuinely exists in the Chinese stock market).

## Model
The Actor-Critic model is defined in `actor_critic.py`, with an acting network and a target network for both the actor and the critic. Following the original DDPG algorithm, the target networks are updated by `soft-copy` (see the sketch at the end of this section).

The train-on-batch step is the same as in the original DDPG algorithm, using transitions (state, action, reward, next state) sampled from the memory buffer.
```
# Calculate y_batch = r + GAMMA * Q'(s', mu'(s'))
next_action_batch = self.actor_network.target_action(next_state_batch)
q_batch = self.critic_network.target_q(next_action_batch, next_state_batch)
y_batch = torch.add(reward_batch, q_batch, alpha=GAMMA).view(-1, 1)

# train actor-critic by target loss
self.actor_network.train(
    self.critic_network.train(
        y_batch, action_batch, state_batch
    )
)

# Update target networks by soft update
self.actor_network.update_target()
self.critic_network.update_target()
```

The policy gradient is taken from the critic's first layer, restricted to the weights acting on the action input, and fed directly into the actor's backward pass.
```
# The mean policy gradient from the critic
return torch.mean(self.critic_weights[0].grad[:, :self.action_dim], dim=0)
```
```
# Train the actor with the policy gradient
self.actor_weights[-1].backward(-loss_grad)
```
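For reference, the `soft-copy` used for the target networks is a standard Polyak update with interpolation factor `w = 0.01`. A minimal equivalent sketch (the `target`/`source` module names here are illustrative; the repository's `soft_copy` operates on parameter lists):
```
# target <- (1 - w) * target + w * source
for t_param, s_param in zip(target.parameters(), source.parameters()):
    t_param.data.copy_((1 - w) * t_param.data + w * s_param.data)
```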

## Agent
`DDPG.py` wraps up the agent, which maintains the memory buffer and runs the train-on-batch step. Only `train_on_batch` and `perceive` are relevant to the algorithm. The random sampling is implemented in a more efficient way directly on CUDA:
```
sample = torch.randint(self.time_dim, self.replay_reward.shape[0], [self.batch_size], device=cuda)

index = torch.stack([sample - i for i in range(self.time_dim, 0, -1)]).t().reshape(-1)
```
```
state_batch = torch.index_select(state_data, 0, index).view(self.batch_size, -1)
next_amount_data = torch.index_select(next_amount_data, 0, sample).view(self.batch_size, -1)
action_batch = torch.index_select(self.replay_action / self.unit, 0, sample)
reward_batch = torch.index_select(self.replay_reward, 0, sample)
```
## OUNoise
The OU-noise is implemented by [Flood Sung](https://github.com/rllab/rllab/blob/master/rllab/exploration_strategies/ou_strategy.py).

## Playground
`DDPG-agent.py` is the playground for interacting with the environment. This repo provides Chinese SH50 stock market data from 17/04/2020 to 13/04/2020, totalling more than 13,000 ticks.
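
After training, results can be written to disk with the environment's helpers (a minimal sketch; `train_env` is the `MarketEnv` instance created in `DDPG-agent.py`):
```
train_env.plot('result')         # result.png: portfolio value against the SH50 index
train_env.render(path='result')  # result-result.csv (portfolio, transaction, cash, success) and result-book.csv (per-step order amounts)
```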
--------------------------------------------------------------------------------
/market.py:
--------------------------------------------------------------------------------
import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from gym import spaces

device = torch.device('cuda')

plt.ion()


class MarketEnv(gym.Env):
    def __init__(self, data, seed, asset=1000000.00, unit=100):
        self.asset = asset
        self.unit = unit
        self.rate = 5e-4
        self.short_rate = 1e-3
        self.rd_seed = seed
        self.sh000016 = data[0][:, 0].ravel()
        self.data = torch.tensor([data[i][:, 1:].tolist() for i in range(3)], device=device).permute(1, 0, 2)

        self.stock_number = data[0].shape[1] - 1
        self.sample_size = data[0].shape[0]
        self.action_space = spaces.Box(low=0, high=1, shape=(self.stock_number,))
        self.observation_space = spaces.Box(low=0, high=1, shape=(3, self.stock_number,))

    def reset(self):
        self.n_step = 0
        self.state = self.data[self.n_step, :, :]
        self.position = torch.zeros(self.stock_number, device=device)
        self.cash = torch.tensor(self.asset, device=device)
        self.portfolio = torch.tensor(self.asset, device=device)
        self.rewards = torch.tensor([self.asset] * self.sample_size, device=device)
        self.cost = torch.zeros(self.sample_size, device=device)
        self.success = []
        self.available_cash = torch.tensor([self.asset] * self.sample_size, device=device)
        self.book = []
        return self.state

    def step(self, position: torch.Tensor):
        self.n_step += 1
        self.state = self.data[self.n_step, :, :]
        amount = position - self.position
        price = self.state[1, :].clone().view(-1)  # price to buy; cloned so the raw data is not overwritten
        price[amount < 0] = self.state[0, :][amount < 0]  # price to sell

        transaction_buy = torch.sum((amount * price)[amount > 0] * self.rate)
        transaction_sell = -torch.sum((amount * price)[amount < 0] * (self.short_rate + self.rate))

        cost_buy = torch.sum((amount * price)[amount > 0])
        cost_sell = torch.sum((amount * price)[amount < 0])
        if self.cash < transaction_buy + cost_buy:
            # not enough cash for the buy orders: only the sell orders are executed
            self.success.append(False)
            self.cost[self.n_step] = transaction_sell
            self.position[amount < 0] = position[amount < 0]
            self.cash -= cost_sell + transaction_sell
        else:
            self.success.append(True)
            self.cost[self.n_step] = transaction_sell + transaction_buy
            self.position = position
            self.cash -= cost_sell + transaction_sell + cost_buy + transaction_buy

        # mark-to-market at the sell price; the one-step change in portfolio value is the reward
        portfolio = self.cash + torch.sum(self.state[0, :] * self.position)
        reward = portfolio - self.portfolio

        self.portfolio = portfolio
        self.rewards[self.n_step] = portfolio
        self.available_cash[self.n_step] = self.cash
        self.book.append(amount.cpu().numpy().ravel().tolist())
        done = self.n_step == self.sample_size - 1
        return self.state, reward, done, {}

    def plot(self, path=None, batch_size=1024):
        sh000016 = self.sh000016[1:] / self.sh000016[0] * self.asset
        plt.figure(figsize=(76.80, 43.20))
        plt.plot(sh000016)
        plt.plot(self.rewards.cpu().numpy().ravel())
        if path is not None:
            plt.savefig(path + '.png')
        plt.close()

    def render(self, mode='human', path=None):
        if path is not None:
            result = np.array([
                self.rewards.cpu().numpy().ravel(),
                self.cost.cpu().numpy().ravel(),
                self.available_cash.cpu().numpy().ravel(),
                self.success]).T
            pd.DataFrame(result, columns=['portfolio', 'transaction', 'cash', 'success']).to_csv(path + '-result.csv')
            pd.DataFrame(self.book).to_csv(path + '-book.csv')

    def close(self):
        pass
--------------------------------------------------------------------------------
/actor_critic.py:
--------------------------------------------------------------------------------
import torch

cuda = torch.device('cuda')


def hard_copy(target, source):
    for weight1, weight2 in zip(target, source):
        weight1.data = weight2.data.clone()


def soft_copy(target, source, w=0.01):
    # Polyak soft update: weight1 <- weight1 + w * (weight2 - weight1)
    for weight1, weight2 in zip(target, source):
        weight1.data = torch.add(
            weight1.data, torch.add(
                weight2.data, weight1.data, alpha=-1
            ), alpha=w)


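# ActorNet maps the flattened state vector to a softmax allocation over the stocks
# (the weights sum to 1); CriticNet first embeds the state with nn1, concatenates
# the action, and regresses the Q-value with nn2.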
class ActorNet(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(ActorNet, self).__init__()
        self.nn = torch.nn.Sequential(
            torch.nn.Linear(input_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, output_dim),
            torch.nn.Softmax(dim=1)
        )

    def forward(self, x):
        out = self.nn(x)
        return out


class CriticNet(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(CriticNet, self).__init__()
        self.nn2 = torch.nn.Sequential(
            torch.nn.Linear(hidden_dim + output_dim, hidden_dim),
            torch.nn.Tanh(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.Tanh(),
            torch.nn.Linear(hidden_dim, 1)
        )
        self.nn1 = torch.nn.Sequential(
            torch.nn.Linear(input_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.Softmax(dim=1),
        )

    def forward(self, a, x):
        x_out = self.nn1(x)
        ax = torch.cat((a, x_out), 1)
        out = self.nn2(ax)
        return out


class Actor:
    def __init__(self, time_dim, state_dim, action_dim, hidden_dim):
        self.actor = ActorNet(state_dim * (time_dim + 1), hidden_dim, action_dim).to(cuda)
        self.target = ActorNet(state_dim * (time_dim + 1), hidden_dim, action_dim).to(cuda)
        self.actor_weights = [params for params in self.actor.parameters()]
        self.target_weights = [params for params in self.target.parameters()]
        self.optimizer = torch.optim.Adam(self.actor.parameters())
        hard_copy(self.target_weights, self.actor_weights)

    def train(self, loss_grad):
        # apply the negated policy gradient from the critic to the actor's final layer
        self.optimizer.zero_grad()
        self.actor_weights[-1].backward(-loss_grad)
        self.optimizer.step()

    def actor_action(self, state):
        self.actor.zero_grad()
        return self.actor(state)

    def target_action(self, state):
        self.target.zero_grad()
        return self.target(state)

    def update_target(self):
        soft_copy(self.target_weights, self.actor_weights)


class Critic:
    def __init__(self, time_dim, state_dim, action_dim, hidden_dim):
        self.action_dim = action_dim
        self.critic = CriticNet(state_dim * (time_dim + 1), hidden_dim, action_dim).to(cuda)
        self.target = CriticNet(state_dim * (time_dim + 1), hidden_dim, action_dim).to(cuda)
        self.critic_weights = [params for params in self.critic.parameters()]
        self.target_weights = [params for params in self.target.parameters()]
        self.optimizer = torch.optim.Adam(self.critic.parameters())
        self.loss = torch.tensor(0, device=cuda)
        hard_copy(self.target_weights, self.critic_weights)

    def train(self, y_batch, action_batch, state_batch):
        criterion = torch.nn.MSELoss()
        y_pred = self.critic(action_batch, state_batch)
        self.loss = criterion(y_pred, y_batch)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
        # mean gradient of the loss w.r.t. the weights on the action input of the critic's first layer
        return torch.mean(self.critic_weights[0].grad[:, :self.action_dim], dim=0)

    def target_q(self, next_action_batch, next_state_batch):
        self.target.zero_grad()
        return self.target(next_action_batch, next_state_batch).view(-1)

    def update_target(self):
        soft_copy(self.target_weights, self.critic_weights)


if __name__ == '__main__':
    critic = CriticNet(50 * (12 + 1), 37, 50).to(cuda)
    for params in critic.parameters():
        print(params.shape)
--------------------------------------------------------------------------------
/DDPG.py:
--------------------------------------------------------------------------------
"""
@File   :DDPG.py
@Author :JohsuaWu1997
@Date   :2020/1/30
"""
import torch

from actor_critic import Actor, Critic
from ou_noise import OUNoise

cuda = torch.device('cuda')

GAMMA = 0.9999999993340943687843739933894


def min_max_scale(data):
    # column-wise min-max scaling; constant columns are mapped to 0 instead of NaN
    data_min = torch.min(data, 0).values.view(1, -1)
    data_max = torch.max(data, 0).values.view(1, -1)
    span = data_max - data_min
    span[span == 0] = 1
    return (data - data_min) / span


class DDPG:
    """DDPG agent: replay buffer, actor-critic training and action selection."""

    def __init__(self, env, time_steps, hidden_dim):
        self.name = 'DDPG'  # name for uploading results
        self.scale = env.asset
        self.unit = env.unit
        self.seed = env.rd_seed

        self.time_dim = time_steps
        self.state_dim = env.observation_space.shape[1]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = 64
        self.memory_size = self.time_dim + self.batch_size * 10
        self.start_size = self.time_dim + self.batch_size * 2

        # Initialise actor & critic networks
        self.actor_network = Actor(self.time_dim, self.state_dim, self.action_dim, hidden_dim)
        self.critic_network = Critic(self.time_dim, self.state_dim, self.action_dim, hidden_dim)
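
        # Replay buffer layout: replay_state / replay_next_state hold the raw
        # (3, state_dim) ticks; train_on_batch later stacks the most recent
        # `time_dim` price rows (channel 0) plus the sampled tick's amount row
        # (channel 2) into one flattened input of size state_dim * (time_dim + 1).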
        # Initialize replay buffer
        self.replay_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros((self.start_size - 1, self.action_dim), device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1,), device=cuda)

        # Initialize a random process (Ornstein-Uhlenbeck) for action exploration
        self.exploration_noise = OUNoise(self.action_dim, sigma=0.01 / self.action_dim)
        self.initial()

    def initial(self):
        self.steps = 0
        self.action = torch.zeros(self.action_dim, device=cuda)
        self.replay_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros((self.start_size - 1, self.action_dim), device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1,), device=cuda)

    def train_on_batch(self):
        # Sample a random minibatch of N transitions from replay buffer
        sample = torch.randint(self.time_dim, self.replay_reward.shape[0], [self.batch_size], device=cuda)
        index = torch.stack([sample - i for i in range(self.time_dim, 0, -1)]).t().reshape(-1)

        state_data = min_max_scale(self.replay_state[:, 0, :])
        amount_data = min_max_scale(self.replay_state[:, 2, :])
        next_state_data = min_max_scale(self.replay_next_state[:, 0, :])
        next_amount_data = min_max_scale(self.replay_next_state[:, 2, :])

        state_batch = torch.index_select(state_data, 0, index).view(self.batch_size, -1)
        amount_data = torch.index_select(amount_data, 0, sample).view(self.batch_size, -1)
        state_batch = torch.cat([state_batch, amount_data], dim=1)
        next_state_batch = torch.index_select(next_state_data, 0, index).view(self.batch_size, -1)
        next_amount_data = torch.index_select(next_amount_data, 0, sample).view(self.batch_size, -1)
        next_state_batch = torch.cat([next_state_batch, next_amount_data], dim=1)
        action_batch = torch.index_select(self.replay_action / self.unit, 0, sample)
        reward_batch = torch.index_select(self.replay_reward, 0, sample)

        # Calculate y_batch = r + GAMMA * Q'(s', mu'(s'))
        next_action_batch = self.actor_network.target_action(next_state_batch)
        q_batch = self.critic_network.target_q(next_action_batch, next_state_batch)
        y_batch = torch.add(reward_batch, q_batch, alpha=GAMMA).view(-1, 1)

        # train actor-critic by target loss
        self.actor_network.train(
            self.critic_network.train(
                y_batch, action_batch, state_batch
            )
        )

        # Update target networks by soft update
        self.actor_network.update_target()
        self.critic_network.update_target()
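
    # perceive() below fills the pre-allocated buffers for the first
    # start_size - 1 steps, then appends new transitions, dropping the
    # oldest entry once memory_size transitions are stored.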
    def perceive(self, state, action, reward, next_state, done):
        if self.steps < self.start_size - 1:
            self.replay_state[self.steps] = state
            self.replay_next_state[self.steps] = next_state
            self.replay_action[self.steps] = action
            self.replay_reward[self.steps] = reward
        else:
            if self.steps >= self.memory_size:
                self.replay_state = self.replay_state[1:]
                self.replay_next_state = self.replay_next_state[1:]
                self.replay_action = self.replay_action[1:]
                self.replay_reward = self.replay_reward[1:]
            self.replay_state = torch.cat((self.replay_state, state.unsqueeze(0)), dim=0)
            self.replay_next_state = torch.cat((self.replay_next_state, next_state.unsqueeze(0)), dim=0)
            self.replay_action = torch.cat((self.replay_action, action.unsqueeze(0)), dim=0)
            self.replay_reward = torch.cat((self.replay_reward, reward.unsqueeze(0)), dim=0)
        self.steps += 1

    def act(self, next_state, portfolio):
        if self.steps > self.start_size:
            next_state_data = min_max_scale(self.replay_next_state[:, 0, :])[-self.time_dim:].view(1, -1)
            next_amount_data = min_max_scale(self.replay_next_state[:, 2, :])[-1].view(1, -1)
            next_state_data = torch.cat([next_state_data, next_amount_data], dim=1)
            self.train_on_batch()
            allocation = self.actor_network.target_action(next_state_data).data.view(-1)
            allocation += torch.tensor(self.exploration_noise.noise().tolist(), device=cuda)
            allocation[allocation < 0] = 0
            allocation /= sum(allocation)
            # convert the portfolio weights into whole lots of `unit` shares at the buy price
            allocation = torch.floor(
                portfolio * allocation / next_state[1, :] / self.unit
            ) * self.unit
            self.action = allocation
        return self.action.clone()
--------------------------------------------------------------------------------